module/zfs/zio.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
  24  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  25  */
  26
  27 #include <sys/sysmacros.h>
  28 #include <sys/zfs_context.h>
  29 #include <sys/fm/fs/zfs.h>
  30 #include <sys/spa.h>
  31 #include <sys/txg.h>
  32 #include <sys/spa_impl.h>
  33 #include <sys/vdev_impl.h>
  34 #include <sys/zio_impl.h>
  35 #include <sys/zio_compress.h>
  36 #include <sys/zio_checksum.h>
  37 #include <sys/dmu_objset.h>
  38 #include <sys/arc.h>
  39 #include <sys/ddt.h>
  40 #include <sys/blkptr.h>
  41 #include <sys/zfeature.h>
  42 #include <sys/metaslab_impl.h>
  43 #include <sys/time.h>
  44 #include <sys/trace_zio.h>
  45 #include <sys/abd.h>
  46 #include <sys/dsl_crypt.h>
  47
  48 /*
  49  * ==========================================================================
  50  * I/O type descriptions
  51  * ==========================================================================
  52  */
  53 const char *zio_type_name[ZIO_TYPES] = {
  54         /*
  55          * Note: Linux kernel thread name length is limited
  56          * so these names will differ from upstream open zfs.
  57          */
  58         "z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl"
  59 };
  60
  61 int zio_dva_throttle_enabled = B_TRUE;
  62
  63 /*
  64  * ==========================================================================
  65  * I/O kmem caches
  66  * ==========================================================================
  67  */
  68 kmem_cache_t *zio_cache;
  69 kmem_cache_t *zio_link_cache;
  70 kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
  71 kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
  72 #if defined(ZFS_DEBUG) && !defined(_KERNEL)
  73 uint64_t zio_buf_cache_allocs[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
  74 uint64_t zio_buf_cache_frees[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
  75 #endif
  76
  77 int zio_delay_max = ZIO_DELAY_MAX;
  78
  79 #define ZIO_PIPELINE_CONTINUE           0x100
  80 #define ZIO_PIPELINE_STOP               0x101
  81
  82 #define BP_SPANB(indblkshift, level) \
  83         (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
  84 #define COMPARE_META_LEVEL      0x80000000ul
  85 /*
  86  * The following actions directly effect the spa's sync-to-convergence logic.
  87  * The values below define the sync pass when we start performing the action.
  88  * Care should be taken when changing these values as they directly impact
  89  * spa_sync() performance. Tuning these values may introduce subtle performance
  90  * pathologies and should only be done in the context of performance analysis.
  91  * These tunables will eventually be removed and replaced with #defines once
  92  * enough analysis has been done to determine optimal values.
  93  *
  94  * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
  95  * regular blocks are not deferred.
  96  */
  97 int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
  98 int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
  99 int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
 100
 101 /*
 102  * An allocating zio is one that either currently has the DVA allocate
 103  * stage set or will have it later in its lifetime.
 104  */
 105 #define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)
 106
 107 int zio_requeue_io_start_cut_in_line = 1;
 108
 109 #ifdef ZFS_DEBUG
 110 int zio_buf_debug_limit = 16384;
 111 #else
 112 int zio_buf_debug_limit = 0;
 113 #endif
 114
 115 static inline void __zio_execute(zio_t *zio);
 116
 117 static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t);
 118
 119 void
 120 zio_init(void)
 121 {
 122         size_t c;
 123         vmem_t *data_alloc_arena = NULL;
 124
 125         zio_cache = kmem_cache_create("zio_cache",
 126             sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 127         zio_link_cache = kmem_cache_create("zio_link_cache",
 128             sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 129
 130         /*
 131          * For small buffers, we want a cache for each multiple of
 132          * SPA_MINBLOCKSIZE.  For larger buffers, we want a cache
 133          * for each quarter-power of 2.
 134          */
 135         for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
 136                 size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
 137                 size_t p2 = size;
 138                 size_t align = 0;
 139                 size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0;
 140
 141 #if defined(_ILP32) && defined(_KERNEL)
 142                 /*
 143                  * Cache size limited to 1M on 32-bit platforms until ARC
 144                  * buffers no longer require virtual address space.
 145                  */
 146                 if (size > zfs_max_recordsize)
 147                         break;
 148 #endif
 149
 150                 while (!ISP2(p2))
 151                         p2 &= p2 - 1;
 152
 153 #ifndef _KERNEL
 154                 /*
 155                  * If we are using watchpoints, put each buffer on its own page,
 156                  * to eliminate the performance overhead of trapping to the
 157                  * kernel when modifying a non-watched buffer that shares the
 158                  * page with a watched buffer.
 159                  */
 160                 if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
 161                         continue;
 162                 /*
 163                  * Here's the problem - on 4K native devices in userland on
 164                  * Linux using O_DIRECT, buffers must be 4K aligned or I/O
 165                  * will fail with EINVAL, causing zdb (and others) to coredump.
 166                  * Since userland probably doesn't need optimized buffer caches,
 167                  * we just force 4K alignment on everything.
 168                  */
 169                 align = 8 * SPA_MINBLOCKSIZE;
 170 #else
 171                 if (size < PAGESIZE) {
 172                         align = SPA_MINBLOCKSIZE;
 173                 } else if (IS_P2ALIGNED(size, p2 >> 2)) {
 174                         align = PAGESIZE;
 175                 }
 176 #endif
 177
 178                 if (align != 0) {
 179                         char name[36];
 180                         (void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
 181                         zio_buf_cache[c] = kmem_cache_create(name, size,
 182                             align, NULL, NULL, NULL, NULL, NULL, cflags);
 183
 184                         (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
 185                         zio_data_buf_cache[c] = kmem_cache_create(name, size,
 186                             align, NULL, NULL, NULL, NULL,
 187                             data_alloc_arena, cflags);
 188                 }
 189         }
 190
 191         while (--c != 0) {
 192                 ASSERT(zio_buf_cache[c] != NULL);
 193                 if (zio_buf_cache[c - 1] == NULL)
 194                         zio_buf_cache[c - 1] = zio_buf_cache[c];
 195
 196                 ASSERT(zio_data_buf_cache[c] != NULL);
 197                 if (zio_data_buf_cache[c - 1] == NULL)
 198                         zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
 199         }
 200
 201         zio_inject_init();
 202
 203         lz4_init();
 204 }
 205
 206 void
 207 zio_fini(void)
 208 {
 209         size_t c;
 210         kmem_cache_t *last_cache = NULL;
 211         kmem_cache_t *last_data_cache = NULL;
 212
 213         for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
 214 #ifdef _ILP32
 215                 /*
 216                  * Cache size limited to 1M on 32-bit platforms until ARC
 217                  * buffers no longer require virtual address space.
 218                  */
 219                 if (((c + 1) << SPA_MINBLOCKSHIFT) > zfs_max_recordsize)
 220                         break;
 221 #endif
 222 #if defined(ZFS_DEBUG) && !defined(_KERNEL)
 223                 if (zio_buf_cache_allocs[c] != zio_buf_cache_frees[c])
 224                         (void) printf("zio_fini: [%d] %llu != %llu\n",
 225                             (int)((c + 1) << SPA_MINBLOCKSHIFT),
 226                             (long long unsigned)zio_buf_cache_allocs[c],
 227                             (long long unsigned)zio_buf_cache_frees[c]);
 228 #endif
 229                 if (zio_buf_cache[c] != last_cache) {
 230                         last_cache = zio_buf_cache[c];
 231                         kmem_cache_destroy(zio_buf_cache[c]);
 232                 }
 233                 zio_buf_cache[c] = NULL;
 234
 235                 if (zio_data_buf_cache[c] != last_data_cache) {
 236                         last_data_cache = zio_data_buf_cache[c];
 237                         kmem_cache_destroy(zio_data_buf_cache[c]);
 238                 }
 239                 zio_data_buf_cache[c] = NULL;
 240         }
 241
 242         kmem_cache_destroy(zio_link_cache);
 243         kmem_cache_destroy(zio_cache);
 244
 245         zio_inject_fini();
 246
 247         lz4_fini();
 248 }
 249
 250 /*
 251  * ==========================================================================
 252  * Allocate and free I/O buffers
 253  * ==========================================================================
 254  */
 255
 256 /*
 257  * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
 258  * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
 259  * useful to inspect ZFS metadata, but if possible, we should avoid keeping
 260  * excess / transient data in-core during a crashdump.
 261  */
 262 void *
 263 zio_buf_alloc(size_t size)
 264 {
 265         size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 266
 267         VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 268 #if defined(ZFS_DEBUG) && !defined(_KERNEL)
 269         atomic_add_64(&zio_buf_cache_allocs[c], 1);
 270 #endif
 271
 272         return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
 273 }
 274
 275 /*
 276  * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
 277  * crashdump if the kernel panics.  This exists so that we will limit the amount
 278  * of ZFS data that shows up in a kernel crashdump.  (Thus reducing the amount
 279  * of kernel heap dumped to disk when the kernel panics)
 280  */
 281 void *
 282 zio_data_buf_alloc(size_t size)
 283 {
 284         size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 285
 286         VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 287
 288         return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
 289 }
 290
 291 void
 292 zio_buf_free(void *buf, size_t size)
 293 {
 294         size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 295
 296         VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 297 #if defined(ZFS_DEBUG) && !defined(_KERNEL)
 298         atomic_add_64(&zio_buf_cache_frees[c], 1);
 299 #endif
 300
 301         kmem_cache_free(zio_buf_cache[c], buf);
 302 }
 303
 304 void
 305 zio_data_buf_free(void *buf, size_t size)
 306 {
 307         size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 308
 309         VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 310
 311         kmem_cache_free(zio_data_buf_cache[c], buf);
 312 }
 313
 314 static void
 315 zio_abd_free(void *abd, size_t size)
 316 {
 317         abd_free((abd_t *)abd);
 318 }
 319
 320 /*
 321  * ==========================================================================
 322  * Push and pop I/O transform buffers
 323  * ==========================================================================
 324  */
 325 void
 326 zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize,
 327     zio_transform_func_t *transform)
 328 {
 329         zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
 330
 331         /*
 332          * Ensure that anyone expecting this zio to contain a linear ABD isn't
 333          * going to get a nasty surprise when they try to access the data.
 334          */
 335         IMPLY(abd_is_linear(zio->io_abd), abd_is_linear(data));
 336
 337         zt->zt_orig_abd = zio->io_abd;
 338         zt->zt_orig_size = zio->io_size;
 339         zt->zt_bufsize = bufsize;
 340         zt->zt_transform = transform;
 341
 342         zt->zt_next = zio->io_transform_stack;
 343         zio->io_transform_stack = zt;
 344
 345         zio->io_abd = data;
 346         zio->io_size = size;
 347 }
 348
 349 void
 350 zio_pop_transforms(zio_t *zio)
 351 {
 352         zio_transform_t *zt;
 353
 354         while ((zt = zio->io_transform_stack) != NULL) {
 355                 if (zt->zt_transform != NULL)
 356                         zt->zt_transform(zio,
 357                             zt->zt_orig_abd, zt->zt_orig_size);
 358
 359                 if (zt->zt_bufsize != 0)
 360                         abd_free(zio->io_abd);
 361
 362                 zio->io_abd = zt->zt_orig_abd;
 363                 zio->io_size = zt->zt_orig_size;
 364                 zio->io_transform_stack = zt->zt_next;
 365
 366                 kmem_free(zt, sizeof (zio_transform_t));
 367         }
 368 }
 369
 370 /*
 371  * ==========================================================================
 372  * I/O transform callbacks for subblocks, decompression, and decryption
 373  * ==========================================================================
 374  */
 375 static void
 376 zio_subblock(zio_t *zio, abd_t *data, uint64_t size)
 377 {
 378         ASSERT(zio->io_size > size);
 379
 380         if (zio->io_type == ZIO_TYPE_READ)
 381                 abd_copy(data, zio->io_abd, size);
 382 }
 383
 384 static void
 385 zio_decompress(zio_t *zio, abd_t *data, uint64_t size)
 386 {
 387         if (zio->io_error == 0) {
 388                 void *tmp = abd_borrow_buf(data, size);
 389                 int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
 390                     zio->io_abd, tmp, zio->io_size, size);
 391                 abd_return_buf_copy(data, tmp, size);
 392
 393                 if (ret != 0)
 394                         zio->io_error = SET_ERROR(EIO);
 395         }
 396 }
 397
 398 static void
 399 zio_decrypt(zio_t *zio, abd_t *data, uint64_t size)
 400 {
 401         int ret;
 402         void *tmp;
 403         blkptr_t *bp = zio->io_bp;
 404         uint64_t lsize = BP_GET_LSIZE(bp);
 405         dmu_object_type_t ot = BP_GET_TYPE(bp);
 406         uint8_t salt[ZIO_DATA_SALT_LEN];
 407         uint8_t iv[ZIO_DATA_IV_LEN];
 408         uint8_t mac[ZIO_DATA_MAC_LEN];
 409         boolean_t no_crypt = B_FALSE;
 410
 411         ASSERT(BP_USES_CRYPT(bp));
 412         ASSERT3U(size, !=, 0);
 413
 414         if (zio->io_error != 0)
 415                 return;
 416
 417         /*
 418          * Verify the cksum of MACs stored in an indirect bp. It will always
 419          * be possible to verify this since it does not require an encryption
 420          * key.
 421          */
 422         if (BP_HAS_INDIRECT_MAC_CKSUM(bp)) {
 423                 zio_crypt_decode_mac_bp(bp, mac);
 424
 425                 if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
 426                         /*
 427                          * We haven't decompressed the data yet, but
 428                          * zio_crypt_do_indirect_mac_checksum() requires
 429                          * decompressed data to be able to parse out the MACs
 430                          * from the indirect block. We decompress it now and
 431                          * throw away the result after we are finished.
 432                          */
 433                         tmp = zio_buf_alloc(lsize);
 434                         ret = zio_decompress_data(BP_GET_COMPRESS(bp),
 435                             zio->io_abd, tmp, zio->io_size, lsize);
 436                         if (ret != 0) {
 437                                 ret = SET_ERROR(EIO);
 438                                 goto error;
 439                         }
 440                         ret = zio_crypt_do_indirect_mac_checksum(B_FALSE,
 441                             tmp, lsize, BP_SHOULD_BYTESWAP(bp), mac);
 442                         zio_buf_free(tmp, lsize);
 443                 } else {
 444                         ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE,
 445                             zio->io_abd, size, BP_SHOULD_BYTESWAP(bp), mac);
 446                 }
 447                 abd_copy(data, zio->io_abd, size);
 448
 449                 if (ret != 0)
 450                         goto error;
 451
 452                 return;
 453         }
 454
 455         /*
 456          * If this is an authenticated block, just check the MAC. It would be
 457          * nice to separate this out into its own flag, but for the moment
 458          * enum zio_flag is out of bits.
 459          */
 460         if (BP_IS_AUTHENTICATED(bp)) {
 461                 if (ot == DMU_OT_OBJSET) {
 462                         ret = spa_do_crypt_objset_mac_abd(B_FALSE, zio->io_spa,
 463                             zio->io_bookmark.zb_objset, zio->io_abd, size,
 464                             BP_SHOULD_BYTESWAP(bp));
 465                 } else {
 466                         zio_crypt_decode_mac_bp(bp, mac);
 467                         ret = spa_do_crypt_mac_abd(B_FALSE, zio->io_spa,
 468                             zio->io_bookmark.zb_objset, zio->io_abd, size, mac);
 469                 }
 470                 abd_copy(data, zio->io_abd, size);
 471
 472                 if (ret != 0)
 473                         goto error;
 474
 475                 return;
 476         }
 477
 478         zio_crypt_decode_params_bp(bp, salt, iv);
 479
 480         if (ot == DMU_OT_INTENT_LOG) {
 481                 tmp = abd_borrow_buf_copy(zio->io_abd, sizeof (zil_chain_t));
 482                 zio_crypt_decode_mac_zil(tmp, mac);
 483                 abd_return_buf(zio->io_abd, tmp, sizeof (zil_chain_t));
 484         } else {
 485                 zio_crypt_decode_mac_bp(bp, mac);
 486         }
 487
 488         ret = spa_do_crypt_abd(B_FALSE, zio->io_spa, zio->io_bookmark.zb_objset,
 489             bp, bp->blk_birth, size, data, zio->io_abd, iv, mac, salt,
 490             &no_crypt);
 491         if (no_crypt)
 492                 abd_copy(data, zio->io_abd, size);
 493
 494         if (ret != 0)
 495                 goto error;
 496
 497         return;
 498
 499 error:
 500         /* assert that the key was found unless this was speculative */
 501         ASSERT(ret != ENOENT || (zio->io_flags & ZIO_FLAG_SPECULATIVE));
 502
 503         /*
 504          * If there was a decryption / authentication error return EIO as
 505          * the io_error. If this was not a speculative zio, create an ereport.
 506          */
 507         if (ret == ECKSUM) {
 508                 ret = SET_ERROR(EIO);
 509                 if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
 510                         zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
 511                             zio->io_spa, NULL, &zio->io_bookmark, zio, 0, 0);
 512                 }
 513         } else {
 514                 zio->io_error = ret;
 515         }
 516 }
 517
 518 /*
 519  * ==========================================================================
 520  * I/O parent/child relationships and pipeline interlocks
 521  * ==========================================================================
 522  */
 523 zio_t *
 524 zio_walk_parents(zio_t *cio, zio_link_t **zl)
 525 {
 526         list_t *pl = &cio->io_parent_list;
 527
 528         *zl = (*zl == NULL) ? list_head(pl) : list_next(pl, *zl);
 529         if (*zl == NULL)
 530                 return (NULL);
 531
 532         ASSERT((*zl)->zl_child == cio);
 533         return ((*zl)->zl_parent);
 534 }
 535
 536 zio_t *
 537 zio_walk_children(zio_t *pio, zio_link_t **zl)
 538 {
 539         list_t *cl = &pio->io_child_list;
 540
 541         *zl = (*zl == NULL) ? list_head(cl) : list_next(cl, *zl);
 542         if (*zl == NULL)
 543                 return (NULL);
 544
 545         ASSERT((*zl)->zl_parent == pio);
 546         return ((*zl)->zl_child);
 547 }
 548
 549 zio_t *
 550 zio_unique_parent(zio_t *cio)
 551 {
 552         zio_link_t *zl = NULL;
 553         zio_t *pio = zio_walk_parents(cio, &zl);
 554
 555         VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL);
 556         return (pio);
 557 }
 558
 559 void
 560 zio_add_child(zio_t *pio, zio_t *cio)
 561 {
 562         zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
 563
 564         /*
 565          * Logical I/Os can have logical, gang, or vdev children.
 566          * Gang I/Os can have gang or vdev children.
 567          * Vdev I/Os can only have vdev children.
 568          * The following ASSERT captures all of these constraints.
 569          */
 570         ASSERT(cio->io_child_type <= pio->io_child_type);
 571
 572         zl->zl_parent = pio;
 573         zl->zl_child = cio;
 574
 575         mutex_enter(&cio->io_lock);
 576         mutex_enter(&pio->io_lock);
 577
 578         ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
 579
 580         for (int w = 0; w < ZIO_WAIT_TYPES; w++)
 581                 pio->io_children[cio->io_child_type][w] += !cio->io_state[w];
 582
 583         list_insert_head(&pio->io_child_list, zl);
 584         list_insert_head(&cio->io_parent_list, zl);
 585
 586         pio->io_child_count++;
 587         cio->io_parent_count++;
 588
 589         mutex_exit(&pio->io_lock);
 590         mutex_exit(&cio->io_lock);
 591 }
 592
 593 static void
 594 zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
 595 {
 596         ASSERT(zl->zl_parent == pio);
 597         ASSERT(zl->zl_child == cio);
 598
 599         mutex_enter(&cio->io_lock);
 600         mutex_enter(&pio->io_lock);
 601
 602         list_remove(&pio->io_child_list, zl);
 603         list_remove(&cio->io_parent_list, zl);
 604
 605         pio->io_child_count--;
 606         cio->io_parent_count--;
 607
 608         mutex_exit(&pio->io_lock);
 609         mutex_exit(&cio->io_lock);
 610         kmem_cache_free(zio_link_cache, zl);
 611 }
 612
 613 static boolean_t
 614 zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
 615 {
 616         uint64_t *countp = &zio->io_children[child][wait];
 617         boolean_t waiting = B_FALSE;
 618
 619         mutex_enter(&zio->io_lock);
 620         ASSERT(zio->io_stall == NULL);
 621         if (*countp != 0) {
 622                 zio->io_stage >>= 1;
 623                 ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN);
 624                 zio->io_stall = countp;
 625                 waiting = B_TRUE;
 626         }
 627         mutex_exit(&zio->io_lock);
 628
 629         return (waiting);
 630 }
 631
 632 __attribute__((always_inline))
 633 static inline void
 634 zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
 635 {
 636         uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
 637         int *errorp = &pio->io_child_error[zio->io_child_type];
 638
 639         mutex_enter(&pio->io_lock);
 640         if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
 641                 *errorp = zio_worst_error(*errorp, zio->io_error);
 642         pio->io_reexecute |= zio->io_reexecute;
 643         ASSERT3U(*countp, >, 0);
 644
 645         (*countp)--;
 646
 647         if (*countp == 0 && pio->io_stall == countp) {
 648                 zio_taskq_type_t type =
 649                     pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE :
 650                     ZIO_TASKQ_INTERRUPT;
 651                 pio->io_stall = NULL;
 652                 mutex_exit(&pio->io_lock);
 653                 /*
 654                  * Dispatch the parent zio in its own taskq so that
 655                  * the child can continue to make progress. This also
 656                  * prevents overflowing the stack when we have deeply nested
 657                  * parent-child relationships.
 658                  */
 659                 zio_taskq_dispatch(pio, type, B_FALSE);
 660         } else {
 661                 mutex_exit(&pio->io_lock);
 662         }
 663 }
 664
 665 static void
 666 zio_inherit_child_errors(zio_t *zio, enum zio_child c)
 667 {
 668         if (zio->io_child_error[c] != 0 && zio->io_error == 0)
 669                 zio->io_error = zio->io_child_error[c];
 670 }
 671
 672 int
 673 zio_bookmark_compare(const void *x1, const void *x2)
 674 {
 675         const zio_t *z1 = x1;
 676         const zio_t *z2 = x2;
 677
 678         if (z1->io_bookmark.zb_objset < z2->io_bookmark.zb_objset)
 679                 return (-1);
 680         if (z1->io_bookmark.zb_objset > z2->io_bookmark.zb_objset)
 681                 return (1);
 682
 683         if (z1->io_bookmark.zb_object < z2->io_bookmark.zb_object)
 684                 return (-1);
 685         if (z1->io_bookmark.zb_object > z2->io_bookmark.zb_object)
 686                 return (1);
 687
 688         if (z1->io_bookmark.zb_level < z2->io_bookmark.zb_level)
 689                 return (-1);
 690         if (z1->io_bookmark.zb_level > z2->io_bookmark.zb_level)
 691                 return (1);
 692
 693         if (z1->io_bookmark.zb_blkid < z2->io_bookmark.zb_blkid)
 694                 return (-1);
 695         if (z1->io_bookmark.zb_blkid > z2->io_bookmark.zb_blkid)
 696                 return (1);
 697
 698         if (z1 < z2)
 699                 return (-1);
 700         if (z1 > z2)
 701                 return (1);
 702
 703         return (0);
 704 }
 705
 706 /*
 707  * ==========================================================================
 708  * Create the various types of I/O (read, write, free, etc)
 709  * ==========================================================================
 710  */
 711 static zio_t *
 712 zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 713     abd_t *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done,
 714     void *private, zio_type_t type, zio_priority_t priority,
 715     enum zio_flag flags, vdev_t *vd, uint64_t offset,
 716     const zbookmark_phys_t *zb, enum zio_stage stage,
 717     enum zio_stage pipeline)
 718 {
 719         zio_t *zio;
 720
 721         ASSERT3U(psize, <=, SPA_MAXBLOCKSIZE);
 722         ASSERT(P2PHASE(psize, SPA_MINBLOCKSIZE) == 0);
 723         ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
 724
 725         ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
 726         ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
 727         ASSERT(vd || stage == ZIO_STAGE_OPEN);
 728
 729         IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW_COMPRESS) != 0);
 730
 731         zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
 732         bzero(zio, sizeof (zio_t));
 733
 734         mutex_init(&zio->io_lock, NULL, MUTEX_NOLOCKDEP, NULL);
 735         cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
 736
 737         list_create(&zio->io_parent_list, sizeof (zio_link_t),
 738             offsetof(zio_link_t, zl_parent_node));
 739         list_create(&zio->io_child_list, sizeof (zio_link_t),
 740             offsetof(zio_link_t, zl_child_node));
 741         metaslab_trace_init(&zio->io_alloc_list);
 742
 743         if (vd != NULL)
 744                 zio->io_child_type = ZIO_CHILD_VDEV;
 745         else if (flags & ZIO_FLAG_GANG_CHILD)
 746                 zio->io_child_type = ZIO_CHILD_GANG;
 747         else if (flags & ZIO_FLAG_DDT_CHILD)
 748                 zio->io_child_type = ZIO_CHILD_DDT;
 749         else
 750                 zio->io_child_type = ZIO_CHILD_LOGICAL;
 751
 752         if (bp != NULL) {
 753                 zio->io_bp = (blkptr_t *)bp;
 754                 zio->io_bp_copy = *bp;
 755                 zio->io_bp_orig = *bp;
 756                 if (type != ZIO_TYPE_WRITE ||
 757                     zio->io_child_type == ZIO_CHILD_DDT)
 758                         zio->io_bp = &zio->io_bp_copy;  /* so caller can free */
 759                 if (zio->io_child_type == ZIO_CHILD_LOGICAL)
 760                         zio->io_logical = zio;
 761                 if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
 762                         pipeline |= ZIO_GANG_STAGES;
 763         }
 764
 765         zio->io_spa = spa;
 766         zio->io_txg = txg;
 767         zio->io_done = done;
 768         zio->io_private = private;
 769         zio->io_type = type;
 770         zio->io_priority = priority;
 771         zio->io_vd = vd;
 772         zio->io_offset = offset;
 773         zio->io_orig_abd = zio->io_abd = data;
 774         zio->io_orig_size = zio->io_size = psize;
 775         zio->io_lsize = lsize;
 776         zio->io_orig_flags = zio->io_flags = flags;
 777         zio->io_orig_stage = zio->io_stage = stage;
 778         zio->io_orig_pipeline = zio->io_pipeline = pipeline;
 779         zio->io_pipeline_trace = ZIO_STAGE_OPEN;
 780
 781         zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
 782         zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);
 783
 784         if (zb != NULL)
 785                 zio->io_bookmark = *zb;
 786
 787         if (pio != NULL) {
 788                 if (zio->io_logical == NULL)
 789                         zio->io_logical = pio->io_logical;
 790                 if (zio->io_child_type == ZIO_CHILD_GANG)
 791                         zio->io_gang_leader = pio->io_gang_leader;
 792                 zio_add_child(pio, zio);
 793         }
 794
 795         taskq_init_ent(&zio->io_tqent);
 796
 797         return (zio);
 798 }
 799
 800 static void
 801 zio_destroy(zio_t *zio)
 802 {
 803         metaslab_trace_fini(&zio->io_alloc_list);
 804         list_destroy(&zio->io_parent_list);
 805         list_destroy(&zio->io_child_list);
 806         mutex_destroy(&zio->io_lock);
 807         cv_destroy(&zio->io_cv);
 808         kmem_cache_free(zio_cache, zio);
 809 }
 810
 811 zio_t *
 812 zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
 813     void *private, enum zio_flag flags)
 814 {
 815         zio_t *zio;
 816
 817         zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private,
 818             ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
 819             ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);
 820
 821         return (zio);
 822 }
 823
 824 zio_t *
 825 zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
 826 {
 827         return (zio_null(NULL, spa, NULL, done, private, flags));
 828 }
 829
 830 void
 831 zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp)
 832 {
 833         if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) {
 834                 zfs_panic_recover("blkptr at %p has invalid TYPE %llu",
 835                     bp, (longlong_t)BP_GET_TYPE(bp));
 836         }
 837         if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS ||
 838             BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) {
 839                 zfs_panic_recover("blkptr at %p has invalid CHECKSUM %llu",
 840                     bp, (longlong_t)BP_GET_CHECKSUM(bp));
 841         }
 842         if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS ||
 843             BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) {
 844                 zfs_panic_recover("blkptr at %p has invalid COMPRESS %llu",
 845                     bp, (longlong_t)BP_GET_COMPRESS(bp));
 846         }
 847         if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) {
 848                 zfs_panic_recover("blkptr at %p has invalid LSIZE %llu",
 849                     bp, (longlong_t)BP_GET_LSIZE(bp));
 850         }
 851         if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) {
 852                 zfs_panic_recover("blkptr at %p has invalid PSIZE %llu",
 853                     bp, (longlong_t)BP_GET_PSIZE(bp));
 854         }
 855
 856         if (BP_IS_EMBEDDED(bp)) {
 857                 if (BPE_GET_ETYPE(bp) > NUM_BP_EMBEDDED_TYPES) {
 858                         zfs_panic_recover("blkptr at %p has invalid ETYPE %llu",
 859                             bp, (longlong_t)BPE_GET_ETYPE(bp));
 860                 }
 861         }
 862
 863         /*
 864          * Pool-specific checks.
 865          *
 866          * Note: it would be nice to verify that the blk_birth and
 867          * BP_PHYSICAL_BIRTH() are not too large.  However, spa_freeze()
 868          * allows the birth time of log blocks (and dmu_sync()-ed blocks
 869          * that are in the log) to be arbitrarily large.
 870          */
 871         for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
 872                 uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]);
 873
 874                 if (vdevid >= spa->spa_root_vdev->vdev_children) {
 875                         zfs_panic_recover("blkptr at %p DVA %u has invalid "
 876                             "VDEV %llu",
 877                             bp, i, (longlong_t)vdevid);
 878                         continue;
 879                 }
 880                 vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid];
 881                 if (vd == NULL) {
 882                         zfs_panic_recover("blkptr at %p DVA %u has invalid "
 883                             "VDEV %llu",
 884                             bp, i, (longlong_t)vdevid);
 885                         continue;
 886                 }
 887                 if (vd->vdev_ops == &vdev_hole_ops) {
 888                         zfs_panic_recover("blkptr at %p DVA %u has hole "
 889                             "VDEV %llu",
 890                             bp, i, (longlong_t)vdevid);
 891                         continue;
 892                 }
 893                 if (vd->vdev_ops == &vdev_missing_ops) {
 894                         /*
 895                          * "missing" vdevs are valid during import, but we
 896                          * don't have their detailed info (e.g. asize), so
 897                          * we can't perform any more checks on them.
 898                          */
 899                         continue;
 900                 }
 901                 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
 902                 uint64_t asize = DVA_GET_ASIZE(&bp->blk_dva[i]);
 903                 if (BP_IS_GANG(bp))
 904                         asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
 905                 if (offset + asize > vd->vdev_asize) {
 906                         zfs_panic_recover("blkptr at %p DVA %u has invalid "
 907                             "OFFSET %llu",
 908                             bp, i, (longlong_t)offset);
 909                 }
 910         }
 911 }
 912
 913 zio_t *
 914 zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
 915     abd_t *data, uint64_t size, zio_done_func_t *done, void *private,
 916     zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
 917 {
 918         zio_t *zio;
 919
 920         zfs_blkptr_verify(spa, bp);
 921
 922         zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
 923             data, size, size, done, private,
 924             ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
 925             ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
 926             ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);
 927
 928         return (zio);
 929 }
 930
 931 zio_t *
 932 zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
 933     abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp,
 934     zio_done_func_t *ready, zio_done_func_t *children_ready,
 935     zio_done_func_t *physdone, zio_done_func_t *done,
 936     void *private, zio_priority_t priority, enum zio_flag flags,
 937     const zbookmark_phys_t *zb)
 938 {
 939         zio_t *zio;
 940
 941         ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
 942             zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
 943             zp->zp_compress >= ZIO_COMPRESS_OFF &&
 944             zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
 945             DMU_OT_IS_VALID(zp->zp_type) &&
 946             zp->zp_level < 32 &&
 947             zp->zp_copies > 0 &&
 948             zp->zp_copies <= spa_max_replication(spa));
 949
 950         zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private,
 951             ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
 952             ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
 953             ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
 954
 955         zio->io_ready = ready;
 956         zio->io_children_ready = children_ready;
 957         zio->io_physdone = physdone;
 958         zio->io_prop = *zp;
 959
 960         /*
 961          * Data can be NULL if we are going to call zio_write_override() to
 962          * provide the already-allocated BP.  But we may need the data to
 963          * verify a dedup hit (if requested).  In this case, don't try to
 964          * dedup (just take the already-allocated BP verbatim). Encrypted
 965          * dedup blocks need data as well so we also disable dedup in this
 966          * case.
 967          */
 968         if (data == NULL &&
 969             (zio->io_prop.zp_dedup_verify || zio->io_prop.zp_encrypt)) {
 970                 zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
 971         }
 972
 973         return (zio);
 974 }
 975
 976 zio_t *
 977 zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data,
 978     uint64_t size, zio_done_func_t *done, void *private,
 979     zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb)
 980 {
 981         zio_t *zio;
 982
 983         zio = zio_create(pio, spa, txg, bp, data, size, size, done, private,
 984             ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb,
 985             ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
 986
 987         return (zio);
 988 }
 989
 990 void
 991 zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
 992 {
 993         ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 994         ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 995         ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
 996         ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
 997
 998         /*
 999          * We must reset the io_prop to match the values that existed
1000          * when the bp was first written by dmu_sync() keeping in mind
1001          * that nopwrite and dedup are mutually exclusive.
1002          */
1003         zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
1004         zio->io_prop.zp_nopwrite = nopwrite;
1005         zio->io_prop.zp_copies = copies;
1006         zio->io_bp_override = bp;
1007 }
1008
1009 void
1010 zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
1011 {
1012
1013         /*
1014          * The check for EMBEDDED is a performance optimization.  We
1015          * process the free here (by ignoring it) rather than
1016          * putting it on the list and then processing it in zio_free_sync().
1017          */
1018         if (BP_IS_EMBEDDED(bp))
1019                 return;
1020         metaslab_check_free(spa, bp);
1021
1022         /*
1023          * Frees that are for the currently-syncing txg, are not going to be
1024          * deferred, and which will not need to do a read (i.e. not GANG or
1025          * DEDUP), can be processed immediately.  Otherwise, put them on the
1026          * in-memory list for later processing.
1027          */
1028         if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
1029             txg != spa->spa_syncing_txg ||
1030             spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) {
1031                 bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
1032         } else {
1033                 VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp, 0)));
1034         }
1035 }
1036
1037 zio_t *
1038 zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
1039     enum zio_flag flags)
1040 {
1041         zio_t *zio;
1042         enum zio_stage stage = ZIO_FREE_PIPELINE;
1043
1044         ASSERT(!BP_IS_HOLE(bp));
1045         ASSERT(spa_syncing_txg(spa) == txg);
1046         ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);
1047
1048         if (BP_IS_EMBEDDED(bp))
1049                 return (zio_null(pio, spa, NULL, NULL, NULL, 0));
1050
1051         metaslab_check_free(spa, bp);
1052         arc_freed(spa, bp);
1053
1054         /*
1055          * GANG and DEDUP blocks can induce a read (for the gang block header,
1056          * or the DDT), so issue them asynchronously so that this thread is
1057          * not tied up.
1058          */
1059         if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
1060                 stage |= ZIO_STAGE_ISSUE_ASYNC;
1061
1062         zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
1063             BP_GET_PSIZE(bp), NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW,
1064             flags, NULL, 0, NULL, ZIO_STAGE_OPEN, stage);
1065
1066         return (zio);
1067 }
1068
1069 zio_t *
1070 zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
1071     zio_done_func_t *done, void *private, enum zio_flag flags)
1072 {
1073         zio_t *zio;
1074
1075         dprintf_bp(bp, "claiming in txg %llu", txg);
1076
1077         if (BP_IS_EMBEDDED(bp))
1078                 return (zio_null(pio, spa, NULL, NULL, NULL, 0));
1079
1080         /*
1081          * A claim is an allocation of a specific block.  Claims are needed
1082          * to support immediate writes in the intent log.  The issue is that
1083          * immediate writes contain committed data, but in a txg that was
1084          * *not* committed.  Upon opening the pool after an unclean shutdown,
1085          * the intent log claims all blocks that contain immediate write data
1086          * so that the SPA knows they're in use.
1087          *
1088          * All claims *must* be resolved in the first txg -- before the SPA
1089          * starts allocating blocks -- so that nothing is allocated twice.
1090          * If txg == 0 we just verify that the block is claimable.
1091          */
1092         ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
1093         ASSERT(txg == spa_first_txg(spa) || txg == 0);
1094         ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));       /* zdb(1M) */
1095
1096         zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
1097             BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW,
1098             flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
1099         ASSERT0(zio->io_queued_timestamp);
1100
1101         return (zio);
1102 }
1103
1104 zio_t *
1105 zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
1106     zio_done_func_t *done, void *private, enum zio_flag flags)
1107 {
1108         zio_t *zio;
1109         int c;
1110
1111         if (vd->vdev_children == 0) {
1112                 zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private,
1113                     ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
1114                     ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
1115
1116                 zio->io_cmd = cmd;
1117         } else {
1118                 zio = zio_null(pio, spa, NULL, NULL, NULL, flags);
1119
1120                 for (c = 0; c < vd->vdev_children; c++)
1121                         zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
1122                             done, private, flags));
1123         }
1124
1125         return (zio);
1126 }
1127
1128 zio_t *
1129 zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
1130     abd_t *data, int checksum, zio_done_func_t *done, void *private,
1131     zio_priority_t priority, enum zio_flag flags, boolean_t labels)
1132 {
1133         zio_t *zio;
1134
1135         ASSERT(vd->vdev_children == 0);
1136         ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
1137             offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
1138         ASSERT3U(offset + size, <=, vd->vdev_psize);
1139
1140         zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done,
1141             private, ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd,
1142             offset, NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
1143
1144         zio->io_prop.zp_checksum = checksum;
1145
1146         return (zio);
1147 }
1148
1149 zio_t *
1150 zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
1151     abd_t *data, int checksum, zio_done_func_t *done, void *private,
1152     zio_priority_t priority, enum zio_flag flags, boolean_t labels)
1153 {
1154         zio_t *zio;
1155
1156         ASSERT(vd->vdev_children == 0);
1157         ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
1158             offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
1159         ASSERT3U(offset + size, <=, vd->vdev_psize);
1160
1161         zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done,
1162             private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd,
1163             offset, NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
1164
1165         zio->io_prop.zp_checksum = checksum;
1166
1167         if (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
1168                 /*
1169                  * zec checksums are necessarily destructive -- they modify
1170                  * the end of the write buffer to hold the verifier/checksum.
1171                  * Therefore, we must make a local copy in case the data is
1172                  * being written to multiple places in parallel.
1173                  */
1174                 abd_t *wbuf = abd_alloc_sametype(data, size);
1175                 abd_copy(wbuf, data, size);
1176
1177                 zio_push_transform(zio, wbuf, size, size, NULL);
1178         }
1179
1180         return (zio);
1181 }
1182
1183 /*
1184  * Create a child I/O to do some work for us.
1185  */
1186 zio_t *
1187 zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
1188     abd_t *data, uint64_t size, int type, zio_priority_t priority,
1189     enum zio_flag flags, zio_done_func_t *done, void *private)
1190 {
1191         enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
1192         zio_t *zio;
1193
1194         ASSERT(vd->vdev_parent ==
1195             (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));
1196
1197         if (type == ZIO_TYPE_READ && bp != NULL) {
1198                 /*
1199                  * If we have the bp, then the child should perform the
1200                  * checksum and the parent need not.  This pushes error
1201                  * detection as close to the leaves as possible and
1202                  * eliminates redundant checksums in the interior nodes.
1203                  */
1204                 pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
1205                 pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
1206         }
1207
1208         if (vd->vdev_children == 0)
1209                 offset += VDEV_LABEL_START_SIZE;
1210
1211         flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;
1212
1213         /*
1214          * If we've decided to do a repair, the write is not speculative --
1215          * even if the original read was.
1216          */
1217         if (flags & ZIO_FLAG_IO_REPAIR)
1218                 flags &= ~ZIO_FLAG_SPECULATIVE;
1219
1220         /*
1221          * If we're creating a child I/O that is not associated with a
1222          * top-level vdev, then the child zio is not an allocating I/O.
1223          * If this is a retried I/O then we ignore it since we will
1224          * have already processed the original allocating I/O.
1225          */
1226         if (flags & ZIO_FLAG_IO_ALLOCATING &&
1227             (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) {
1228                 ASSERTV(metaslab_class_t *mc = spa_normal_class(pio->io_spa));
1229
1230                 ASSERT(mc->mc_alloc_throttle_enabled);
1231                 ASSERT(type == ZIO_TYPE_WRITE);
1232                 ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE);
1233                 ASSERT(!(flags & ZIO_FLAG_IO_REPAIR));
1234                 ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) ||
1235                     pio->io_child_type == ZIO_CHILD_GANG);
1236
1237                 flags &= ~ZIO_FLAG_IO_ALLOCATING;
1238         }
1239
1240
1241         zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size,
1242             done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
1243             ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
1244         ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
1245
1246         zio->io_physdone = pio->io_physdone;
1247         if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
1248                 zio->io_logical->io_phys_children++;
1249
1250         return (zio);
1251 }
1252
1253 zio_t *
1254 zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size,
1255     int type, zio_priority_t priority, enum zio_flag flags,
1256     zio_done_func_t *done, void *private)
1257 {
1258         zio_t *zio;
1259
1260         ASSERT(vd->vdev_ops->vdev_op_leaf);
1261
1262         zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
1263             data, size, size, done, private, type, priority,
1264             flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
1265             vd, offset, NULL,
1266             ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
1267
1268         return (zio);
1269 }
1270
1271 void
1272 zio_flush(zio_t *zio, vdev_t *vd)
1273 {
1274         zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
1275             NULL, NULL,
1276             ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
1277 }
1278
1279 void
1280 zio_shrink(zio_t *zio, uint64_t size)
1281 {
1282         ASSERT(zio->io_executor == NULL);
1283         ASSERT(zio->io_orig_size == zio->io_size);
1284         ASSERT(size <= zio->io_size);
1285
1286         /*
1287          * We don't shrink for raidz because of problems with the
1288          * reconstruction when reading back less than the block size.
1289          * Note, BP_IS_RAIDZ() assumes no compression.
1290          */
1291         ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
1292         if (!BP_IS_RAIDZ(zio->io_bp)) {
1293                 /* we are not doing a raw write */
1294                 ASSERT3U(zio->io_size, ==, zio->io_lsize);
1295                 zio->io_orig_size = zio->io_size = zio->io_lsize = size;
1296         }
1297 }
1298
1299 /*
1300  * ==========================================================================
1301  * Prepare to read and write logical blocks
1302  * ==========================================================================
1303  */
1304
1305 static int
1306 zio_read_bp_init(zio_t *zio)
1307 {
1308         blkptr_t *bp = zio->io_bp;
1309         uint64_t psize =
1310             BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
1311
1312         if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
1313             zio->io_child_type == ZIO_CHILD_LOGICAL &&
1314             !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) {
1315                 zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize),
1316                     psize, psize, zio_decompress);
1317         }
1318
1319         if (((BP_IS_PROTECTED(bp) && !(zio->io_flags & ZIO_FLAG_RAW_ENCRYPT)) ||
1320             BP_HAS_INDIRECT_MAC_CKSUM(bp)) &&
1321             zio->io_child_type == ZIO_CHILD_LOGICAL) {
1322                 zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize),
1323                     psize, psize, zio_decrypt);
1324         }
1325
1326         if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
1327                 int psize = BPE_GET_PSIZE(bp);
1328                 void *data = abd_borrow_buf(zio->io_abd, psize);
1329
1330                 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1331                 decode_embedded_bp_compressed(bp, data);
1332                 abd_return_buf_copy(zio->io_abd, data, psize);
1333         } else {
1334                 ASSERT(!BP_IS_EMBEDDED(bp));
1335         }
1336
1337         if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
1338                 zio->io_flags |= ZIO_FLAG_DONT_CACHE;
1339
1340         if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
1341                 zio->io_flags |= ZIO_FLAG_DONT_CACHE;
1342
1343         if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
1344                 zio->io_pipeline = ZIO_DDT_READ_PIPELINE;
1345
1346         return (ZIO_PIPELINE_CONTINUE);
1347 }
1348
1349 static int
1350 zio_write_bp_init(zio_t *zio)
1351 {
1352         if (!IO_IS_ALLOCATING(zio))
1353                 return (ZIO_PIPELINE_CONTINUE);
1354
1355         ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
1356
1357         if (zio->io_bp_override) {
1358                 blkptr_t *bp = zio->io_bp;
1359                 zio_prop_t *zp = &zio->io_prop;
1360
1361                 ASSERT(bp->blk_birth != zio->io_txg);
1362                 ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
1363
1364                 *bp = *zio->io_bp_override;
1365                 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1366
1367                 if (BP_IS_EMBEDDED(bp))
1368                         return (ZIO_PIPELINE_CONTINUE);
1369
1370                 /*
1371                  * If we've been overridden and nopwrite is set then
1372                  * set the flag accordingly to indicate that a nopwrite
1373                  * has already occurred.
1374                  */
1375                 if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
1376                         ASSERT(!zp->zp_dedup);
1377                         ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum);
1378                         zio->io_flags |= ZIO_FLAG_NOPWRITE;
1379                         return (ZIO_PIPELINE_CONTINUE);
1380                 }
1381
1382                 ASSERT(!zp->zp_nopwrite);
1383
1384                 if (BP_IS_HOLE(bp) || !zp->zp_dedup)
1385                         return (ZIO_PIPELINE_CONTINUE);
1386
1387                 ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags &
1388                     ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify);
1389
1390                 if (BP_GET_CHECKSUM(bp) == zp->zp_checksum &&
1391                     !zp->zp_encrypt) {
1392                         BP_SET_DEDUP(bp, 1);
1393                         zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
1394                         return (ZIO_PIPELINE_CONTINUE);
1395                 }
1396
1397                 /*
1398                  * We were unable to handle this as an override bp, treat
1399                  * it as a regular write I/O.
1400                  */
1401                 zio->io_bp_override = NULL;
1402                 *bp = zio->io_bp_orig;
1403                 zio->io_pipeline = zio->io_orig_pipeline;
1404         }
1405
1406         return (ZIO_PIPELINE_CONTINUE);
1407 }
1408
1409 static int
1410 zio_write_compress(zio_t *zio)
1411 {
1412         spa_t *spa = zio->io_spa;
1413         zio_prop_t *zp = &zio->io_prop;
1414         enum zio_compress compress = zp->zp_compress;
1415         blkptr_t *bp = zio->io_bp;
1416         uint64_t lsize = zio->io_lsize;
1417         uint64_t psize = zio->io_size;
1418         int pass = 1;
1419
1420         /*
1421          * If our children haven't all reached the ready stage,
1422          * wait for them and then repeat this pipeline stage.
1423          */
1424         if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
1425             zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
1426                 return (ZIO_PIPELINE_STOP);
1427
1428         if (!IO_IS_ALLOCATING(zio))
1429                 return (ZIO_PIPELINE_CONTINUE);
1430
1431         if (zio->io_children_ready != NULL) {
1432                 /*
1433                  * Now that all our children are ready, run the callback
1434                  * associated with this zio in case it wants to modify the
1435                  * data to be written.
1436                  */
1437                 ASSERT3U(zp->zp_level, >, 0);
1438                 zio->io_children_ready(zio);
1439         }
1440
1441         ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
1442         ASSERT(zio->io_bp_override == NULL);
1443
1444         if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
1445                 /*
1446                  * We're rewriting an existing block, which means we're
1447                  * working on behalf of spa_sync().  For spa_sync() to
1448                  * converge, it must eventually be the case that we don't
1449                  * have to allocate new blocks.  But compression changes
1450                  * the blocksize, which forces a reallocate, and makes
1451                  * convergence take longer.  Therefore, after the first
1452                  * few passes, stop compressing to ensure convergence.
1453                  */
1454                 pass = spa_sync_pass(spa);
1455
1456                 ASSERT(zio->io_txg == spa_syncing_txg(spa));
1457                 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1458                 ASSERT(!BP_GET_DEDUP(bp));
1459
1460                 if (pass >= zfs_sync_pass_dont_compress)
1461                         compress = ZIO_COMPRESS_OFF;
1462
1463                 /* Make sure someone doesn't change their mind on overwrites */
1464                 ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp),
1465                     spa_max_replication(spa)) == BP_GET_NDVAS(bp));
1466         }
1467
1468         /* If it's a compressed write that is not raw, compress the buffer. */
1469         if (compress != ZIO_COMPRESS_OFF &&
1470             !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) {
1471                 void *cbuf = zio_buf_alloc(lsize);
1472                 psize = zio_compress_data(compress, zio->io_abd, cbuf, lsize);
1473                 if (psize == 0 || psize == lsize) {
1474                         compress = ZIO_COMPRESS_OFF;
1475                         zio_buf_free(cbuf, lsize);
1476                 } else if (!zp->zp_dedup && !zp->zp_encrypt &&
1477                     psize <= BPE_PAYLOAD_SIZE &&
1478                     zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
1479                     spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
1480                         encode_embedded_bp_compressed(bp,
1481                             cbuf, compress, lsize, psize);
1482                         BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
1483                         BP_SET_TYPE(bp, zio->io_prop.zp_type);
1484                         BP_SET_LEVEL(bp, zio->io_prop.zp_level);
1485                         zio_buf_free(cbuf, lsize);
1486                         bp->blk_birth = zio->io_txg;
1487                         zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1488                         ASSERT(spa_feature_is_active(spa,
1489                             SPA_FEATURE_EMBEDDED_DATA));
1490                         return (ZIO_PIPELINE_CONTINUE);
1491                 } else {
1492                         /*
1493                          * Round up compressed size up to the ashift
1494                          * of the smallest-ashift device, and zero the tail.
1495                          * This ensures that the compressed size of the BP
1496                          * (and thus compressratio property) are correct,
1497                          * in that we charge for the padding used to fill out
1498                          * the last sector.
1499                          */
1500                         ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
1501                         size_t rounded = (size_t)P2ROUNDUP(psize,
1502                             1ULL << spa->spa_min_ashift);
1503                         if (rounded >= lsize) {
1504                                 compress = ZIO_COMPRESS_OFF;
1505                                 zio_buf_free(cbuf, lsize);
1506                                 psize = lsize;
1507                         } else {
1508                                 abd_t *cdata = abd_get_from_buf(cbuf, lsize);
1509                                 abd_take_ownership_of_buf(cdata, B_TRUE);
1510                                 abd_zero_off(cdata, psize, rounded - psize);
1511                                 psize = rounded;
1512                                 zio_push_transform(zio, cdata,
1513                                     psize, lsize, NULL);
1514                         }
1515                 }
1516
1517                 /*
1518                  * We were unable to handle this as an override bp, treat
1519                  * it as a regular write I/O.
1520                  */
1521                 zio->io_bp_override = NULL;
1522                 *bp = zio->io_bp_orig;
1523                 zio->io_pipeline = zio->io_orig_pipeline;
1524
1525         } else {
1526                 ASSERT3U(psize, !=, 0);
1527
1528         }
1529
1530         /*
1531          * The final pass of spa_sync() must be all rewrites, but the first
1532          * few passes offer a trade-off: allocating blocks defers convergence,
1533          * but newly allocated blocks are sequential, so they can be written
1534          * to disk faster.  Therefore, we allow the first few passes of
1535          * spa_sync() to allocate new blocks, but force rewrites after that.
1536          * There should only be a handful of blocks after pass 1 in any case.
1537          */
1538         if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
1539             BP_GET_PSIZE(bp) == psize &&
1540             pass >= zfs_sync_pass_rewrite) {
1541                 ASSERT(psize != 0);
1542                 enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
1543                 zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
1544                 zio->io_flags |= ZIO_FLAG_IO_REWRITE;
1545         } else {
1546                 BP_ZERO(bp);
1547                 zio->io_pipeline = ZIO_WRITE_PIPELINE;
1548         }
1549
1550         if (psize == 0) {
1551                 if (zio->io_bp_orig.blk_birth != 0 &&
1552                     spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
1553                         BP_SET_LSIZE(bp, lsize);
1554                         BP_SET_TYPE(bp, zp->zp_type);
1555                         BP_SET_LEVEL(bp, zp->zp_level);
1556                         BP_SET_BIRTH(bp, zio->io_txg, 0);
1557                 }
1558                 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1559         } else {
1560                 ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
1561                 BP_SET_LSIZE(bp, lsize);
1562                 BP_SET_TYPE(bp, zp->zp_type);
1563                 BP_SET_LEVEL(bp, zp->zp_level);
1564                 BP_SET_PSIZE(bp, psize);
1565                 BP_SET_COMPRESS(bp, compress);
1566                 BP_SET_CHECKSUM(bp, zp->zp_checksum);
1567                 BP_SET_DEDUP(bp, zp->zp_dedup);
1568                 BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
1569                 if (zp->zp_dedup) {
1570                         ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1571                         ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
1572                         ASSERT(!zp->zp_encrypt ||
1573                             DMU_OT_IS_ENCRYPTED(zp->zp_type));
1574                         zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
1575                 }
1576                 if (zp->zp_nopwrite) {
1577                         ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1578                         ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
1579                         zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
1580                 }
1581         }
1582         return (ZIO_PIPELINE_CONTINUE);
1583 }
1584
1585 static int
1586 zio_free_bp_init(zio_t *zio)
1587 {
1588         blkptr_t *bp = zio->io_bp;
1589
1590         if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
1591                 if (BP_GET_DEDUP(bp))
1592                         zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
1593         }
1594
1595         return (ZIO_PIPELINE_CONTINUE);
1596 }
1597
1598 /*
1599  * ==========================================================================
1600  * Execute the I/O pipeline
1601  * ==========================================================================
1602  */
1603
1604 static void
1605 zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
1606 {
1607         spa_t *spa = zio->io_spa;
1608         zio_type_t t = zio->io_type;
1609         int flags = (cutinline ? TQ_FRONT : 0);
1610
1611         /*
1612          * If we're a config writer or a probe, the normal issue and
1613          * interrupt threads may all be blocked waiting for the config lock.
1614          * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
1615          */
1616         if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
1617                 t = ZIO_TYPE_NULL;
1618
1619         /*
1620          * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
1621          */
1622         if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
1623                 t = ZIO_TYPE_NULL;
1624
1625         /*
1626          * If this is a high priority I/O, then use the high priority taskq if
1627          * available.
1628          */
1629         if (zio->io_priority == ZIO_PRIORITY_NOW &&
1630             spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
1631                 q++;
1632
1633         ASSERT3U(q, <, ZIO_TASKQ_TYPES);
1634
1635         /*
1636          * NB: We are assuming that the zio can only be dispatched
1637          * to a single taskq at a time.  It would be a grievous error
1638          * to dispatch the zio to another taskq at the same time.
1639          */
1640         ASSERT(taskq_empty_ent(&zio->io_tqent));
1641         spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
1642             flags, &zio->io_tqent);
1643 }
1644
1645 static boolean_t
1646 zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
1647 {
1648         kthread_t *executor = zio->io_executor;
1649         spa_t *spa = zio->io_spa;
1650
1651         for (zio_type_t t = 0; t < ZIO_TYPES; t++) {
1652                 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
1653                 uint_t i;
1654                 for (i = 0; i < tqs->stqs_count; i++) {
1655                         if (taskq_member(tqs->stqs_taskq[i], executor))
1656                                 return (B_TRUE);
1657                 }
1658         }
1659
1660         return (B_FALSE);
1661 }
1662
1663 static int
1664 zio_issue_async(zio_t *zio)
1665 {
1666         zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
1667
1668         return (ZIO_PIPELINE_STOP);
1669 }
1670
1671 void
1672 zio_interrupt(zio_t *zio)
1673 {
1674         zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
1675 }
1676
1677 void
1678 zio_delay_interrupt(zio_t *zio)
1679 {
1680         /*
1681          * The timeout_generic() function isn't defined in userspace, so
1682          * rather than trying to implement the function, the zio delay
1683          * functionality has been disabled for userspace builds.
1684          */
1685
1686 #ifdef _KERNEL
1687         /*
1688          * If io_target_timestamp is zero, then no delay has been registered
1689          * for this IO, thus jump to the end of this function and "skip" the
1690          * delay; issuing it directly to the zio layer.
1691          */
1692         if (zio->io_target_timestamp != 0) {
1693                 hrtime_t now = gethrtime();
1694
1695                 if (now >= zio->io_target_timestamp) {
1696                         /*
1697                          * This IO has already taken longer than the target
1698                          * delay to complete, so we don't want to delay it
1699                          * any longer; we "miss" the delay and issue it
1700                          * directly to the zio layer. This is likely due to
1701                          * the target latency being set to a value less than
1702                          * the underlying hardware can satisfy (e.g. delay
1703                          * set to 1ms, but the disks take 10ms to complete an
1704                          * IO request).
1705                          */
1706
1707                         DTRACE_PROBE2(zio__delay__miss, zio_t *, zio,
1708                             hrtime_t, now);
1709
1710                         zio_interrupt(zio);
1711                 } else {
1712                         taskqid_t tid;
1713                         hrtime_t diff = zio->io_target_timestamp - now;
1714                         clock_t expire_at_tick = ddi_get_lbolt() +
1715                             NSEC_TO_TICK(diff);
1716
1717                         DTRACE_PROBE3(zio__delay__hit, zio_t *, zio,
1718                             hrtime_t, now, hrtime_t, diff);
1719
1720                         if (NSEC_TO_TICK(diff) == 0) {
1721                                 /* Our delay is less than a jiffy - just spin */
1722                                 zfs_sleep_until(zio->io_target_timestamp);
1723                         } else {
1724                                 /*
1725                                  * Use taskq_dispatch_delay() in the place of
1726                                  * OpenZFS's timeout_generic().
1727                                  */
1728                                 tid = taskq_dispatch_delay(system_taskq,
1729                                     (task_func_t *)zio_interrupt,
1730                                     zio, TQ_NOSLEEP, expire_at_tick);
1731                                 if (tid == TASKQID_INVALID) {
1732                                         /*
1733                                          * Couldn't allocate a task.  Just
1734                                          * finish the zio without a delay.
1735                                          */
1736                                         zio_interrupt(zio);
1737                                 }
1738                         }
1739                 }
1740                 return;
1741         }
1742 #endif
1743         DTRACE_PROBE1(zio__delay__skip, zio_t *, zio);
1744         zio_interrupt(zio);
1745 }
1746
1747 /*
1748  * Execute the I/O pipeline until one of the following occurs:
1749  * (1) the I/O completes; (2) the pipeline stalls waiting for
1750  * dependent child I/Os; (3) the I/O issues, so we're waiting
1751  * for an I/O completion interrupt; (4) the I/O is delegated by
1752  * vdev-level caching or aggregation; (5) the I/O is deferred
1753  * due to vdev-level queueing; (6) the I/O is handed off to
1754  * another thread.  In all cases, the pipeline stops whenever
1755  * there's no CPU work; it never burns a thread in cv_wait_io().
1756  *
1757  * There's no locking on io_stage because there's no legitimate way
1758  * for multiple threads to be attempting to process the same I/O.
1759  */
1760 static zio_pipe_stage_t *zio_pipeline[];
1761
1762 /*
1763  * zio_execute() is a wrapper around the static function
1764  * __zio_execute() so that we can force  __zio_execute() to be
1765  * inlined.  This reduces stack overhead which is important
1766  * because __zio_execute() is called recursively in several zio
1767  * code paths.  zio_execute() itself cannot be inlined because
1768  * it is externally visible.
1769  */
1770 void
1771 zio_execute(zio_t *zio)
1772 {
1773         fstrans_cookie_t cookie;
1774
1775         cookie = spl_fstrans_mark();
1776         __zio_execute(zio);
1777         spl_fstrans_unmark(cookie);
1778 }
1779
1780 /*
1781  * Used to determine if in the current context the stack is sized large
1782  * enough to allow zio_execute() to be called recursively.  A minimum
1783  * stack size of 16K is required to avoid needing to re-dispatch the zio.
1784  */
1785 boolean_t
1786 zio_execute_stack_check(zio_t *zio)
1787 {
1788 #if !defined(HAVE_LARGE_STACKS)
1789         dsl_pool_t *dp = spa_get_dsl(zio->io_spa);
1790
1791         /* Executing in txg_sync_thread() context. */
1792         if (dp && curthread == dp->dp_tx.tx_sync_thread)
1793                 return (B_TRUE);
1794
1795         /* Pool initialization outside of zio_taskq context. */
1796         if (dp && spa_is_initializing(dp->dp_spa) &&
1797             !zio_taskq_member(zio, ZIO_TASKQ_ISSUE) &&
1798             !zio_taskq_member(zio, ZIO_TASKQ_ISSUE_HIGH))
1799                 return (B_TRUE);
1800 #endif /* HAVE_LARGE_STACKS */
1801
1802         return (B_FALSE);
1803 }
1804
1805 __attribute__((always_inline))
1806 static inline void
1807 __zio_execute(zio_t *zio)
1808 {
1809         zio->io_executor = curthread;
1810
1811         ASSERT3U(zio->io_queued_timestamp, >, 0);
1812
1813         while (zio->io_stage < ZIO_STAGE_DONE) {
1814                 enum zio_stage pipeline = zio->io_pipeline;
1815                 enum zio_stage stage = zio->io_stage;
1816                 int rv;
1817
1818                 ASSERT(!MUTEX_HELD(&zio->io_lock));
1819                 ASSERT(ISP2(stage));
1820                 ASSERT(zio->io_stall == NULL);
1821
1822                 do {
1823                         stage <<= 1;
1824                 } while ((stage & pipeline) == 0);
1825
1826                 ASSERT(stage <= ZIO_STAGE_DONE);
1827
1828                 /*
1829                  * If we are in interrupt context and this pipeline stage
1830                  * will grab a config lock that is held across I/O,
1831                  * or may wait for an I/O that needs an interrupt thread
1832                  * to complete, issue async to avoid deadlock.
1833                  *
1834                  * For VDEV_IO_START, we cut in line so that the io will
1835                  * be sent to disk promptly.
1836                  */
1837                 if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
1838                     zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
1839                         boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
1840                             zio_requeue_io_start_cut_in_line : B_FALSE;
1841                         zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
1842                         return;
1843                 }
1844
1845                 /*
1846                  * If the current context doesn't have large enough stacks
1847                  * the zio must be issued asynchronously to prevent overflow.
1848                  */
1849                 if (zio_execute_stack_check(zio)) {
1850                         boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
1851                             zio_requeue_io_start_cut_in_line : B_FALSE;
1852                         zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
1853                         return;
1854                 }
1855
1856                 zio->io_stage = stage;
1857                 zio->io_pipeline_trace |= zio->io_stage;
1858                 rv = zio_pipeline[highbit64(stage) - 1](zio);
1859
1860                 if (rv == ZIO_PIPELINE_STOP)
1861                         return;
1862
1863                 ASSERT(rv == ZIO_PIPELINE_CONTINUE);
1864         }
1865 }
1866
1867
1868 /*
1869  * ==========================================================================
1870  * Initiate I/O, either sync or async
1871  * ==========================================================================
1872  */
1873 int
1874 zio_wait(zio_t *zio)
1875 {
1876         int error;
1877
1878         ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
1879         ASSERT(zio->io_executor == NULL);
1880
1881         zio->io_waiter = curthread;
1882         ASSERT0(zio->io_queued_timestamp);
1883         zio->io_queued_timestamp = gethrtime();
1884
1885         __zio_execute(zio);
1886
1887         mutex_enter(&zio->io_lock);
1888         while (zio->io_executor != NULL)
1889                 cv_wait_io(&zio->io_cv, &zio->io_lock);
1890         mutex_exit(&zio->io_lock);
1891
1892         error = zio->io_error;
1893         zio_destroy(zio);
1894
1895         return (error);
1896 }
1897
1898 void
1899 zio_nowait(zio_t *zio)
1900 {
1901         ASSERT(zio->io_executor == NULL);
1902
1903         if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
1904             zio_unique_parent(zio) == NULL) {
1905                 zio_t *pio;
1906
1907                 /*
1908                  * This is a logical async I/O with no parent to wait for it.
1909                  * We add it to the spa_async_root_zio "Godfather" I/O which
1910                  * will ensure they complete prior to unloading the pool.
1911                  */
1912                 spa_t *spa = zio->io_spa;
1913                 kpreempt_disable();
1914                 pio = spa->spa_async_zio_root[CPU_SEQID];
1915                 kpreempt_enable();
1916
1917                 zio_add_child(pio, zio);
1918         }
1919
1920         ASSERT0(zio->io_queued_timestamp);
1921         zio->io_queued_timestamp = gethrtime();
1922         __zio_execute(zio);
1923 }
1924
1925 /*
1926  * ==========================================================================
1927  * Reexecute or suspend/resume failed I/O
1928  * ==========================================================================
1929  */
1930
1931 static void
1932 zio_reexecute(zio_t *pio)
1933 {
1934         zio_t *cio, *cio_next;
1935
1936         ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
1937         ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
1938         ASSERT(pio->io_gang_leader == NULL);
1939         ASSERT(pio->io_gang_tree == NULL);
1940
1941         pio->io_flags = pio->io_orig_flags;
1942         pio->io_stage = pio->io_orig_stage;
1943         pio->io_pipeline = pio->io_orig_pipeline;
1944         pio->io_reexecute = 0;
1945         pio->io_flags |= ZIO_FLAG_REEXECUTED;
1946         pio->io_pipeline_trace = 0;
1947         pio->io_error = 0;
1948         for (int w = 0; w < ZIO_WAIT_TYPES; w++)
1949                 pio->io_state[w] = 0;
1950         for (int c = 0; c < ZIO_CHILD_TYPES; c++)
1951                 pio->io_child_error[c] = 0;
1952
1953         if (IO_IS_ALLOCATING(pio))
1954                 BP_ZERO(pio->io_bp);
1955
1956         /*
1957          * As we reexecute pio's children, new children could be created.
1958          * New children go to the head of pio's io_child_list, however,
1959          * so we will (correctly) not reexecute them.  The key is that
1960          * the remainder of pio's io_child_list, from 'cio_next' onward,
1961          * cannot be affected by any side effects of reexecuting 'cio'.
1962          */
1963         zio_link_t *zl = NULL;
1964         for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
1965                 cio_next = zio_walk_children(pio, &zl);
1966                 mutex_enter(&pio->io_lock);
1967                 for (int w = 0; w < ZIO_WAIT_TYPES; w++)
1968                         pio->io_children[cio->io_child_type][w]++;
1969                 mutex_exit(&pio->io_lock);
1970                 zio_reexecute(cio);
1971         }
1972
1973         /*
1974          * Now that all children have been reexecuted, execute the parent.
1975          * We don't reexecute "The Godfather" I/O here as it's the
1976          * responsibility of the caller to wait on it.
1977          */
1978         if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) {
1979                 pio->io_queued_timestamp = gethrtime();
1980                 __zio_execute(pio);
1981         }
1982 }
1983
1984 void
1985 zio_suspend(spa_t *spa, zio_t *zio)
1986 {
1987         if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
1988                 fm_panic("Pool '%s' has encountered an uncorrectable I/O "
1989                     "failure and the failure mode property for this pool "
1990                     "is set to panic.", spa_name(spa));
1991
1992         cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O "
1993             "failure and has been suspended.\n", spa_name(spa));
1994
1995         zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL,
1996             NULL, NULL, 0, 0);
1997
1998         mutex_enter(&spa->spa_suspend_lock);
1999
2000         if (spa->spa_suspend_zio_root == NULL)
2001                 spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
2002                     ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
2003                     ZIO_FLAG_GODFATHER);
2004
2005         spa->spa_suspended = B_TRUE;
2006
2007         if (zio != NULL) {
2008                 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
2009                 ASSERT(zio != spa->spa_suspend_zio_root);
2010                 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2011                 ASSERT(zio_unique_parent(zio) == NULL);
2012                 ASSERT(zio->io_stage == ZIO_STAGE_DONE);
2013                 zio_add_child(spa->spa_suspend_zio_root, zio);
2014         }
2015
2016         mutex_exit(&spa->spa_suspend_lock);
2017 }
2018
2019 int
2020 zio_resume(spa_t *spa)
2021 {
2022         zio_t *pio;
2023
2024         /*
2025          * Reexecute all previously suspended i/o.
2026          */
2027         mutex_enter(&spa->spa_suspend_lock);
2028         spa->spa_suspended = B_FALSE;
2029         cv_broadcast(&spa->spa_suspend_cv);
2030         pio = spa->spa_suspend_zio_root;
2031         spa->spa_suspend_zio_root = NULL;
2032         mutex_exit(&spa->spa_suspend_lock);
2033
2034         if (pio == NULL)
2035                 return (0);
2036
2037         zio_reexecute(pio);
2038         return (zio_wait(pio));
2039 }
2040
2041 void
2042 zio_resume_wait(spa_t *spa)
2043 {
2044         mutex_enter(&spa->spa_suspend_lock);
2045         while (spa_suspended(spa))
2046                 cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
2047         mutex_exit(&spa->spa_suspend_lock);
2048 }
2049
2050 /*
2051  * ==========================================================================
2052  * Gang blocks.
2053  *
2054  * A gang block is a collection of small blocks that looks to the DMU
2055  * like one large block.  When zio_dva_allocate() cannot find a block
2056  * of the requested size, due to either severe fragmentation or the pool
2057  * being nearly full, it calls zio_write_gang_block() to construct the
2058  * block from smaller fragments.
2059  *
2060  * A gang block consists of a gang header (zio_gbh_phys_t) and up to
2061  * three (SPA_GBH_NBLKPTRS) gang members.  The gang header is just like
2062  * an indirect block: it's an array of block pointers.  It consumes
2063  * only one sector and hence is allocatable regardless of fragmentation.
2064  * The gang header's bps point to its gang members, which hold the data.
2065  *
2066  * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
2067  * as the verifier to ensure uniqueness of the SHA256 checksum.
2068  * Critically, the gang block bp's blk_cksum is the checksum of the data,
2069  * not the gang header.  This ensures that data block signatures (needed for
2070  * deduplication) are independent of how the block is physically stored.
2071  *
2072  * Gang blocks can be nested: a gang member may itself be a gang block.
2073  * Thus every gang block is a tree in which root and all interior nodes are
2074  * gang headers, and the leaves are normal blocks that contain user data.
2075  * The root of the gang tree is called the gang leader.
2076  *
2077  * To perform any operation (read, rewrite, free, claim) on a gang block,
2078  * zio_gang_assemble() first assembles the gang tree (minus data leaves)
2079  * in the io_gang_tree field of the original logical i/o by recursively
2080  * reading the gang leader and all gang headers below it.  This yields
2081  * an in-core tree containing the contents of every gang header and the
2082  * bps for every constituent of the gang block.
2083  *
2084  * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
2085  * and invokes a callback on each bp.  To free a gang block, zio_gang_issue()
2086  * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
2087  * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
2088  * zio_read_gang() is a wrapper around zio_read() that omits reading gang
2089  * headers, since we already have those in io_gang_tree.  zio_rewrite_gang()
2090  * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
2091  * of the gang header plus zio_checksum_compute() of the data to update the
2092  * gang header's blk_cksum as described above.
2093  *
2094  * The two-phase assemble/issue model solves the problem of partial failure --
2095  * what if you'd freed part of a gang block but then couldn't read the
2096  * gang header for another part?  Assembling the entire gang tree first
2097  * ensures that all the necessary gang header I/O has succeeded before
2098  * starting the actual work of free, claim, or write.  Once the gang tree
2099  * is assembled, free and claim are in-memory operations that cannot fail.
2100  *
2101  * In the event that a gang write fails, zio_dva_unallocate() walks the
2102  * gang tree to immediately free (i.e. insert back into the space map)
2103  * everything we've allocated.  This ensures that we don't get ENOSPC
2104  * errors during repeated suspend/resume cycles due to a flaky device.
2105  *
2106  * Gang rewrites only happen during sync-to-convergence.  If we can't assemble
2107  * the gang tree, we won't modify the block, so we can safely defer the free
2108  * (knowing that the block is still intact).  If we *can* assemble the gang
2109  * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
2110  * each constituent bp and we can allocate a new block on the next sync pass.
2111  *
2112  * In all cases, the gang tree allows complete recovery from partial failure.
2113  * ==========================================================================
2114  */
2115
2116 static void
2117 zio_gang_issue_func_done(zio_t *zio)
2118 {
2119         abd_put(zio->io_abd);
2120 }
2121
2122 static zio_t *
2123 zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
2124     uint64_t offset)
2125 {
2126         if (gn != NULL)
2127                 return (pio);
2128
2129         return (zio_read(pio, pio->io_spa, bp, abd_get_offset(data, offset),
2130             BP_GET_PSIZE(bp), zio_gang_issue_func_done,
2131             NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
2132             &pio->io_bookmark));
2133 }
2134
2135 static zio_t *
2136 zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
2137     uint64_t offset)
2138 {
2139         zio_t *zio;
2140
2141         if (gn != NULL) {
2142                 abd_t *gbh_abd =
2143                     abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
2144                 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
2145                     gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL,
2146                     pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
2147                     &pio->io_bookmark);
2148                 /*
2149                  * As we rewrite each gang header, the pipeline will compute
2150                  * a new gang block header checksum for it; but no one will
2151                  * compute a new data checksum, so we do that here.  The one
2152                  * exception is the gang leader: the pipeline already computed
2153                  * its data checksum because that stage precedes gang assembly.
2154                  * (Presently, nothing actually uses interior data checksums;
2155                  * this is just good hygiene.)
2156                  */
2157                 if (gn != pio->io_gang_leader->io_gang_tree) {
2158                         abd_t *buf = abd_get_offset(data, offset);
2159
2160                         zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
2161                             buf, BP_GET_PSIZE(bp));
2162
2163                         abd_put(buf);
2164                 }
2165                 /*
2166                  * If we are here to damage data for testing purposes,
2167                  * leave the GBH alone so that we can detect the damage.
2168                  */
2169                 if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
2170                         zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
2171         } else {
2172                 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
2173                     abd_get_offset(data, offset), BP_GET_PSIZE(bp),
2174                     zio_gang_issue_func_done, NULL, pio->io_priority,
2175                     ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
2176         }
2177
2178         return (zio);
2179 }
2180
2181 /* ARGSUSED */
2182 static zio_t *
2183 zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
2184     uint64_t offset)
2185 {
2186         return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
2187             ZIO_GANG_CHILD_FLAGS(pio)));
2188 }
2189
2190 /* ARGSUSED */
2191 static zio_t *
2192 zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
2193     uint64_t offset)
2194 {
2195         return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
2196             NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
2197 }
2198
2199 static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
2200         NULL,
2201         zio_read_gang,
2202         zio_rewrite_gang,
2203         zio_free_gang,
2204         zio_claim_gang,
2205         NULL
2206 };
2207
2208 static void zio_gang_tree_assemble_done(zio_t *zio);
2209
2210 static zio_gang_node_t *
2211 zio_gang_node_alloc(zio_gang_node_t **gnpp)
2212 {
2213         zio_gang_node_t *gn;
2214
2215         ASSERT(*gnpp == NULL);
2216
2217         gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
2218         gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
2219         *gnpp = gn;
2220
2221         return (gn);
2222 }
2223
2224 static void
2225 zio_gang_node_free(zio_gang_node_t **gnpp)
2226 {
2227         zio_gang_node_t *gn = *gnpp;
2228
2229         for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
2230                 ASSERT(gn->gn_child[g] == NULL);
2231
2232         zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
2233         kmem_free(gn, sizeof (*gn));
2234         *gnpp = NULL;
2235 }
2236
2237 static void
2238 zio_gang_tree_free(zio_gang_node_t **gnpp)
2239 {
2240         zio_gang_node_t *gn = *gnpp;
2241
2242         if (gn == NULL)
2243                 return;
2244
2245         for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
2246                 zio_gang_tree_free(&gn->gn_child[g]);
2247
2248         zio_gang_node_free(gnpp);
2249 }
2250
2251 static void
2252 zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
2253 {
2254         zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
2255         abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
2256
2257         ASSERT(gio->io_gang_leader == gio);
2258         ASSERT(BP_IS_GANG(bp));
2259
2260         zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE,
2261             zio_gang_tree_assemble_done, gn, gio->io_priority,
2262             ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
2263 }
2264
2265 static void
2266 zio_gang_tree_assemble_done(zio_t *zio)
2267 {
2268         zio_t *gio = zio->io_gang_leader;
2269         zio_gang_node_t *gn = zio->io_private;
2270         blkptr_t *bp = zio->io_bp;
2271
2272         ASSERT(gio == zio_unique_parent(zio));
2273         ASSERT(zio->io_child_count == 0);
2274
2275         if (zio->io_error)
2276                 return;
2277
2278         /* this ABD was created from a linear buf in zio_gang_tree_assemble */
2279         if (BP_SHOULD_BYTESWAP(bp))
2280                 byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size);
2281
2282         ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh);
2283         ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
2284         ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
2285
2286         abd_put(zio->io_abd);
2287
2288         for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
2289                 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
2290                 if (!BP_IS_GANG(gbp))
2291                         continue;
2292                 zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
2293         }
2294 }
2295
2296 static void
2297 zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data,
2298     uint64_t offset)
2299 {
2300         zio_t *gio = pio->io_gang_leader;
2301         zio_t *zio;
2302
2303         ASSERT(BP_IS_GANG(bp) == !!gn);
2304         ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
2305         ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);
2306
2307         /*
2308          * If you're a gang header, your data is in gn->gn_gbh.
2309          * If you're a gang member, your data is in 'data' and gn == NULL.
2310          */
2311         zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset);
2312
2313         if (gn != NULL) {
2314                 ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
2315
2316                 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
2317                         blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
2318                         if (BP_IS_HOLE(gbp))
2319                                 continue;
2320                         zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data,
2321                             offset);
2322                         offset += BP_GET_PSIZE(gbp);
2323                 }
2324         }
2325
2326         if (gn == gio->io_gang_tree)
2327                 ASSERT3U(gio->io_size, ==, offset);
2328
2329         if (zio != pio)
2330                 zio_nowait(zio);
2331 }
2332
2333 static int
2334 zio_gang_assemble(zio_t *zio)
2335 {
2336         blkptr_t *bp = zio->io_bp;
2337
2338         ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
2339         ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
2340
2341         zio->io_gang_leader = zio;
2342
2343         zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);
2344
2345         return (ZIO_PIPELINE_CONTINUE);
2346 }
2347
2348 static int
2349 zio_gang_issue(zio_t *zio)
2350 {
2351         blkptr_t *bp = zio->io_bp;
2352
2353         if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
2354                 return (ZIO_PIPELINE_STOP);
2355
2356         ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
2357         ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
2358
2359         if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
2360                 zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_abd,
2361                     0);
2362         else
2363                 zio_gang_tree_free(&zio->io_gang_tree);
2364
2365         zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2366
2367         return (ZIO_PIPELINE_CONTINUE);
2368 }
2369
2370 static void
2371 zio_write_gang_member_ready(zio_t *zio)
2372 {
2373         zio_t *pio = zio_unique_parent(zio);
2374         dva_t *cdva = zio->io_bp->blk_dva;
2375         dva_t *pdva = pio->io_bp->blk_dva;
2376         uint64_t asize;
2377         ASSERTV(zio_t *gio = zio->io_gang_leader);
2378
2379         if (BP_IS_HOLE(zio->io_bp))
2380                 return;
2381
2382         ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
2383
2384         ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
2385         ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
2386         ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
2387         ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
2388         ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
2389
2390         mutex_enter(&pio->io_lock);
2391         for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
2392                 ASSERT(DVA_GET_GANG(&pdva[d]));
2393                 asize = DVA_GET_ASIZE(&pdva[d]);
2394                 asize += DVA_GET_ASIZE(&cdva[d]);
2395                 DVA_SET_ASIZE(&pdva[d], asize);
2396         }
2397         mutex_exit(&pio->io_lock);
2398 }
2399
2400 static void
2401 zio_write_gang_done(zio_t *zio)
2402 {
2403         abd_put(zio->io_abd);
2404 }
2405
2406 static int
2407 zio_write_gang_block(zio_t *pio)
2408 {
2409         spa_t *spa = pio->io_spa;
2410         metaslab_class_t *mc = spa_normal_class(spa);
2411         blkptr_t *bp = pio->io_bp;
2412         zio_t *gio = pio->io_gang_leader;
2413         zio_t *zio;
2414         zio_gang_node_t *gn, **gnpp;
2415         zio_gbh_phys_t *gbh;
2416         abd_t *gbh_abd;
2417         uint64_t txg = pio->io_txg;
2418         uint64_t resid = pio->io_size;
2419         uint64_t lsize;
2420         int copies = gio->io_prop.zp_copies;
2421         int gbh_copies;
2422         zio_prop_t zp;
2423         int error;
2424
2425         /*
2426          * encrypted blocks need DVA[2] free so encrypted gang headers can't
2427          * have a third copy.
2428          */
2429         gbh_copies = MIN(copies + 1, spa_max_replication(spa));
2430         if (gio->io_prop.zp_encrypt && gbh_copies >= SPA_DVAS_PER_BP)
2431                 gbh_copies = SPA_DVAS_PER_BP - 1;
2432
2433         int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER;
2434         if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
2435                 ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
2436                 ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));
2437
2438                 flags |= METASLAB_ASYNC_ALLOC;
2439                 VERIFY(refcount_held(&mc->mc_alloc_slots, pio));
2440
2441                 /*
2442                  * The logical zio has already placed a reservation for
2443                  * 'copies' allocation slots but gang blocks may require
2444                  * additional copies. These additional copies
2445                  * (i.e. gbh_copies - copies) are guaranteed to succeed
2446                  * since metaslab_class_throttle_reserve() always allows
2447                  * additional reservations for gang blocks.
2448                  */
2449                 VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies,
2450                     pio, flags));
2451         }
2452
2453         error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE,
2454             bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags,
2455             &pio->io_alloc_list, pio);
2456         if (error) {
2457                 if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
2458                         ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
2459                         ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));
2460
2461                         /*
2462                          * If we failed to allocate the gang block header then
2463                          * we remove any additional allocation reservations that
2464                          * we placed here. The original reservation will
2465                          * be removed when the logical I/O goes to the ready
2466                          * stage.
2467                          */
2468                         metaslab_class_throttle_unreserve(mc,
2469                             gbh_copies - copies, pio);
2470                 }
2471
2472                 pio->io_error = error;
2473                 return (ZIO_PIPELINE_CONTINUE);
2474         }
2475
2476         if (pio == gio) {
2477                 gnpp = &gio->io_gang_tree;
2478         } else {
2479                 gnpp = pio->io_private;
2480                 ASSERT(pio->io_ready == zio_write_gang_member_ready);
2481         }
2482
2483         gn = zio_gang_node_alloc(gnpp);
2484         gbh = gn->gn_gbh;
2485         bzero(gbh, SPA_GANGBLOCKSIZE);
2486         gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE);
2487
2488         /*
2489          * Create the gang header.
2490          */
2491         zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE,
2492             zio_write_gang_done, NULL, pio->io_priority,
2493             ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
2494
2495         /*
2496          * Create and nowait the gang children.
2497          */
2498         for (int g = 0; resid != 0; resid -= lsize, g++) {
2499                 lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
2500                     SPA_MINBLOCKSIZE);
2501                 ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);
2502
2503                 zp.zp_checksum = gio->io_prop.zp_checksum;
2504                 zp.zp_compress = ZIO_COMPRESS_OFF;
2505                 zp.zp_type = DMU_OT_NONE;
2506                 zp.zp_level = 0;
2507                 zp.zp_copies = gio->io_prop.zp_copies;
2508                 zp.zp_dedup = B_FALSE;
2509                 zp.zp_dedup_verify = B_FALSE;
2510                 zp.zp_nopwrite = B_FALSE;
2511                 zp.zp_encrypt = gio->io_prop.zp_encrypt;
2512                 zp.zp_byteorder = gio->io_prop.zp_byteorder;
2513                 bzero(zp.zp_salt, ZIO_DATA_SALT_LEN);
2514                 bzero(zp.zp_iv, ZIO_DATA_IV_LEN);
2515                 bzero(zp.zp_mac, ZIO_DATA_MAC_LEN);
2516
2517                 zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
2518                     abd_get_offset(pio->io_abd, pio->io_size - resid), lsize,
2519                     lsize, &zp, zio_write_gang_member_ready, NULL, NULL,
2520                     zio_write_gang_done, &gn->gn_child[g], pio->io_priority,
2521                     ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
2522
2523                 if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
2524                         ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
2525                         ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));
2526
2527                         /*
2528                          * Gang children won't throttle but we should
2529                          * account for their work, so reserve an allocation
2530                          * slot for them here.
2531                          */
2532                         VERIFY(metaslab_class_throttle_reserve(mc,
2533                             zp.zp_copies, cio, flags));
2534                 }
2535                 zio_nowait(cio);
2536         }
2537
2538         /*
2539          * Set pio's pipeline to just wait for zio to finish.
2540          */
2541         pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2542
2543         /*
2544          * We didn't allocate this bp, so make sure it doesn't get unmarked.
2545          */
2546         pio->io_flags &= ~ZIO_FLAG_FASTWRITE;
2547
2548         zio_nowait(zio);
2549
2550         return (ZIO_PIPELINE_CONTINUE);
2551 }
2552
2553 /*
2554  * The zio_nop_write stage in the pipeline determines if allocating a
2555  * new bp is necessary.  The nopwrite feature can handle writes in
2556  * either syncing or open context (i.e. zil writes) and as a result is
2557  * mutually exclusive with dedup.
2558  *
2559  * By leveraging a cryptographically secure checksum, such as SHA256, we
2560  * can compare the checksums of the new data and the old to determine if
2561  * allocating a new block is required.  Note that our requirements for
2562  * cryptographic strength are fairly weak: there can't be any accidental
2563  * hash collisions, but we don't need to be secure against intentional
2564  * (malicious) collisions.  To trigger a nopwrite, you have to be able
2565  * to write the file to begin with, and triggering an incorrect (hash
2566  * collision) nopwrite is no worse than simply writing to the file.
2567  * That said, there are no known attacks against the checksum algorithms
2568  * used for nopwrite, assuming that the salt and the checksums
2569  * themselves remain secret.
2570  */
2571 static int
2572 zio_nop_write(zio_t *zio)
2573 {
2574         blkptr_t *bp = zio->io_bp;
2575         blkptr_t *bp_orig = &zio->io_bp_orig;
2576         zio_prop_t *zp = &zio->io_prop;
2577
2578         ASSERT(BP_GET_LEVEL(bp) == 0);
2579         ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
2580         ASSERT(zp->zp_nopwrite);
2581         ASSERT(!zp->zp_dedup);
2582         ASSERT(zio->io_bp_override == NULL);
2583         ASSERT(IO_IS_ALLOCATING(zio));
2584
2585         /*
2586          * Check to see if the original bp and the new bp have matching
2587          * characteristics (i.e. same checksum, compression algorithms, etc).
2588          * If they don't then just continue with the pipeline which will
2589          * allocate a new bp.
2590          */
2591         if (BP_IS_HOLE(bp_orig) ||
2592             !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags &
2593             ZCHECKSUM_FLAG_NOPWRITE) ||
2594             BP_IS_ENCRYPTED(bp) || BP_IS_ENCRYPTED(bp_orig) ||
2595             BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
2596             BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
2597             BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
2598             zp->zp_copies != BP_GET_NDVAS(bp_orig))
2599                 return (ZIO_PIPELINE_CONTINUE);
2600
2601         /*
2602          * If the checksums match then reset the pipeline so that we
2603          * avoid allocating a new bp and issuing any I/O.
2604          */
2605         if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
2606                 ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags &
2607                     ZCHECKSUM_FLAG_NOPWRITE);
2608                 ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
2609                 ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
2610                 ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
2611                 ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop,
2612                     sizeof (uint64_t)) == 0);
2613
2614                 *bp = *bp_orig;
2615                 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2616                 zio->io_flags |= ZIO_FLAG_NOPWRITE;
2617         }
2618
2619         return (ZIO_PIPELINE_CONTINUE);
2620 }
2621
2622 /*
2623  * ==========================================================================
2624  * Dedup
2625  * ==========================================================================
2626  */
2627 static void
2628 zio_ddt_child_read_done(zio_t *zio)
2629 {
2630         blkptr_t *bp = zio->io_bp;
2631         ddt_entry_t *dde = zio->io_private;
2632         ddt_phys_t *ddp;
2633         zio_t *pio = zio_unique_parent(zio);
2634
2635         mutex_enter(&pio->io_lock);
2636         ddp = ddt_phys_select(dde, bp);
2637         if (zio->io_error == 0)
2638                 ddt_phys_clear(ddp);    /* this ddp doesn't need repair */
2639
2640         if (zio->io_error == 0 && dde->dde_repair_abd == NULL)
2641                 dde->dde_repair_abd = zio->io_abd;
2642         else
2643                 abd_free(zio->io_abd);
2644         mutex_exit(&pio->io_lock);
2645 }
2646
2647 static int
2648 zio_ddt_read_start(zio_t *zio)
2649 {
2650         blkptr_t *bp = zio->io_bp;
2651
2652         ASSERT(BP_GET_DEDUP(bp));
2653         ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
2654         ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2655
2656         if (zio->io_child_error[ZIO_CHILD_DDT]) {
2657                 ddt_t *ddt = ddt_select(zio->io_spa, bp);
2658                 ddt_entry_t *dde = ddt_repair_start(ddt, bp);
2659                 ddt_phys_t *ddp = dde->dde_phys;
2660                 ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
2661                 blkptr_t blk;
2662
2663                 ASSERT(zio->io_vsd == NULL);
2664                 zio->io_vsd = dde;
2665
2666                 if (ddp_self == NULL)
2667                         return (ZIO_PIPELINE_CONTINUE);
2668
2669                 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
2670                         if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
2671                                 continue;
2672                         ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
2673                             &blk);
2674                         zio_nowait(zio_read(zio, zio->io_spa, &blk,
2675                             abd_alloc_for_io(zio->io_size, B_TRUE),
2676                             zio->io_size, zio_ddt_child_read_done, dde,
2677                             zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) |
2678                             ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark));
2679                 }
2680                 return (ZIO_PIPELINE_CONTINUE);
2681         }
2682
2683         zio_nowait(zio_read(zio, zio->io_spa, bp,
2684             zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority,
2685             ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
2686
2687         return (ZIO_PIPELINE_CONTINUE);
2688 }
2689
2690 static int
2691 zio_ddt_read_done(zio_t *zio)
2692 {
2693         blkptr_t *bp = zio->io_bp;
2694
2695         if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
2696                 return (ZIO_PIPELINE_STOP);
2697
2698         ASSERT(BP_GET_DEDUP(bp));
2699         ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
2700         ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2701
2702         if (zio->io_child_error[ZIO_CHILD_DDT]) {
2703                 ddt_t *ddt = ddt_select(zio->io_spa, bp);
2704                 ddt_entry_t *dde = zio->io_vsd;
2705                 if (ddt == NULL) {
2706                         ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
2707                         return (ZIO_PIPELINE_CONTINUE);
2708                 }
2709                 if (dde == NULL) {
2710                         zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
2711                         zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
2712                         return (ZIO_PIPELINE_STOP);
2713                 }
2714                 if (dde->dde_repair_abd != NULL) {
2715                         abd_copy(zio->io_abd, dde->dde_repair_abd,
2716                             zio->io_size);
2717                         zio->io_child_error[ZIO_CHILD_DDT] = 0;
2718                 }
2719                 ddt_repair_done(ddt, dde);
2720                 zio->io_vsd = NULL;
2721         }
2722
2723         ASSERT(zio->io_vsd == NULL);
2724
2725         return (ZIO_PIPELINE_CONTINUE);
2726 }
2727
2728 static boolean_t
2729 zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
2730 {
2731         spa_t *spa = zio->io_spa;
2732         boolean_t do_raw = !!(zio->io_flags & ZIO_FLAG_RAW);
2733
2734         ASSERT(!(zio->io_bp_override && do_raw));
2735
2736         /*
2737          * Note: we compare the original data, not the transformed data,
2738          * because when zio->io_bp is an override bp, we will not have
2739          * pushed the I/O transforms.  That's an important optimization
2740          * because otherwise we'd compress/encrypt all dmu_sync() data twice.
2741          * However, we should never get a raw, override zio so in these
2742          * cases we can compare the io_abd directly. This is useful because
2743          * it allows us to do dedup verification even if we don't have access
2744          * to the original data (for instance, if the encryption keys aren't
2745          * loaded).
2746          */
2747
2748         for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
2749                 zio_t *lio = dde->dde_lead_zio[p];
2750
2751                 if (lio != NULL && do_raw) {
2752                         return (lio->io_size != zio->io_size ||
2753                             abd_cmp(zio->io_abd, lio->io_abd) != 0);
2754                 } else if (lio != NULL) {
2755                         return (lio->io_orig_size != zio->io_orig_size ||
2756                             abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0);
2757                 }
2758         }
2759
2760         for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
2761                 ddt_phys_t *ddp = &dde->dde_phys[p];
2762
2763                 if (ddp->ddp_phys_birth != 0 && do_raw) {
2764                         blkptr_t blk = *zio->io_bp;
2765                         uint64_t psize;
2766                         abd_t *tmpabd;
2767                         int error;
2768
2769                         ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
2770                         psize = BP_GET_PSIZE(&blk);
2771
2772                         if (psize != zio->io_size)
2773                                 return (B_TRUE);
2774
2775                         ddt_exit(ddt);
2776
2777                         tmpabd = abd_alloc_for_io(psize, B_TRUE);
2778
2779                         error = zio_wait(zio_read(NULL, spa, &blk, tmpabd,
2780                             psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ,
2781                             ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
2782                             ZIO_FLAG_RAW, &zio->io_bookmark));
2783
2784                         if (error == 0) {
2785                                 if (abd_cmp(tmpabd, zio->io_abd) != 0)
2786                                         error = SET_ERROR(ENOENT);
2787                         }
2788
2789                         abd_free(tmpabd);
2790                         ddt_enter(ddt);
2791                         return (error != 0);
2792                 } else if (ddp->ddp_phys_birth != 0) {
2793                         arc_buf_t *abuf = NULL;
2794                         arc_flags_t aflags = ARC_FLAG_WAIT;
2795                         blkptr_t blk = *zio->io_bp;
2796                         int error;
2797
2798                         ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
2799
2800                         if (BP_GET_LSIZE(&blk) != zio->io_orig_size)
2801                                 return (B_TRUE);
2802
2803                         ddt_exit(ddt);
2804
2805                         error = arc_read(NULL, spa, &blk,
2806                             arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
2807                             ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
2808                             &aflags, &zio->io_bookmark);
2809
2810                         if (error == 0) {
2811                                 if (abd_cmp_buf(zio->io_orig_abd, abuf->b_data,
2812                                     zio->io_orig_size) != 0)
2813                                         error = SET_ERROR(ENOENT);
2814                                 arc_buf_destroy(abuf, &abuf);
2815                         }
2816
2817                         ddt_enter(ddt);
2818                         return (error != 0);
2819                 }
2820         }
2821
2822         return (B_FALSE);
2823 }
2824
2825 static void
2826 zio_ddt_child_write_ready(zio_t *zio)
2827 {
2828         int p = zio->io_prop.zp_copies;
2829         ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
2830         ddt_entry_t *dde = zio->io_private;
2831         ddt_phys_t *ddp = &dde->dde_phys[p];
2832         zio_t *pio;
2833
2834         if (zio->io_error)
2835                 return;
2836
2837         ddt_enter(ddt);
2838
2839         ASSERT(dde->dde_lead_zio[p] == zio);
2840
2841         ddt_phys_fill(ddp, zio->io_bp);
2842
2843         zio_link_t *zl = NULL;
2844         while ((pio = zio_walk_parents(zio, &zl)) != NULL)
2845                 ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
2846
2847         ddt_exit(ddt);
2848 }
2849
2850 static void
2851 zio_ddt_child_write_done(zio_t *zio)
2852 {
2853         int p = zio->io_prop.zp_copies;
2854         ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
2855         ddt_entry_t *dde = zio->io_private;
2856         ddt_phys_t *ddp = &dde->dde_phys[p];
2857
2858         ddt_enter(ddt);
2859
2860         ASSERT(ddp->ddp_refcnt == 0);
2861         ASSERT(dde->dde_lead_zio[p] == zio);
2862         dde->dde_lead_zio[p] = NULL;
2863
2864         if (zio->io_error == 0) {
2865                 zio_link_t *zl = NULL;
2866                 while (zio_walk_parents(zio, &zl) != NULL)
2867                         ddt_phys_addref(ddp);
2868         } else {
2869                 ddt_phys_clear(ddp);
2870         }
2871
2872         ddt_exit(ddt);
2873 }
2874
2875 static void
2876 zio_ddt_ditto_write_done(zio_t *zio)
2877 {
2878         int p = DDT_PHYS_DITTO;
2879         ASSERTV(zio_prop_t *zp = &zio->io_prop);
2880         blkptr_t *bp = zio->io_bp;
2881         ddt_t *ddt = ddt_select(zio->io_spa, bp);
2882         ddt_entry_t *dde = zio->io_private;
2883         ddt_phys_t *ddp = &dde->dde_phys[p];
2884         ddt_key_t *ddk = &dde->dde_key;
2885
2886         ddt_enter(ddt);
2887
2888         ASSERT(ddp->ddp_refcnt == 0);
2889         ASSERT(dde->dde_lead_zio[p] == zio);
2890         dde->dde_lead_zio[p] = NULL;
2891
2892         if (zio->io_error == 0) {
2893                 ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum));
2894                 ASSERT(zp->zp_copies < SPA_DVAS_PER_BP);
2895                 ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp));
2896                 if (ddp->ddp_phys_birth != 0)
2897                         ddt_phys_free(ddt, ddk, ddp, zio->io_txg);
2898                 ddt_phys_fill(ddp, bp);
2899         }
2900
2901         ddt_exit(ddt);
2902 }
2903
2904 static int
2905 zio_ddt_write(zio_t *zio)
2906 {
2907         spa_t *spa = zio->io_spa;
2908         blkptr_t *bp = zio->io_bp;
2909         uint64_t txg = zio->io_txg;
2910         zio_prop_t *zp = &zio->io_prop;
2911         int p = zp->zp_copies;
2912         int ditto_copies;
2913         zio_t *cio = NULL;
2914         zio_t *dio = NULL;
2915         ddt_t *ddt = ddt_select(spa, bp);
2916         ddt_entry_t *dde;
2917         ddt_phys_t *ddp;
2918
2919         ASSERT(BP_GET_DEDUP(bp));
2920         ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
2921         ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
2922         ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW)));
2923
2924         ddt_enter(ddt);
2925         dde = ddt_lookup(ddt, bp, B_TRUE);
2926         ddp = &dde->dde_phys[p];
2927
2928         if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
2929                 /*
2930                  * If we're using a weak checksum, upgrade to a strong checksum
2931                  * and try again.  If we're already using a strong checksum,
2932                  * we can't resolve it, so just convert to an ordinary write.
2933                  * (And automatically e-mail a paper to Nature?)
2934                  */
2935                 if (!(zio_checksum_table[zp->zp_checksum].ci_flags &
2936                     ZCHECKSUM_FLAG_DEDUP)) {
2937                         zp->zp_checksum = spa_dedup_checksum(spa);
2938                         zio_pop_transforms(zio);
2939                         zio->io_stage = ZIO_STAGE_OPEN;
2940                         BP_ZERO(bp);
2941                 } else {
2942                         zp->zp_dedup = B_FALSE;
2943                 }
2944                 zio->io_pipeline = ZIO_WRITE_PIPELINE;
2945                 ddt_exit(ddt);
2946                 return (ZIO_PIPELINE_CONTINUE);
2947         }
2948
2949         ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp);
2950         ASSERT(ditto_copies < SPA_DVAS_PER_BP);
2951
2952         if (ditto_copies > ddt_ditto_copies_present(dde) &&
2953             dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) {
2954                 zio_prop_t czp = *zp;
2955
2956                 czp.zp_copies = ditto_copies;
2957
2958                 /*
2959                  * If we arrived here with an override bp, we won't have run
2960                  * the transform stack, so we won't have the data we need to
2961                  * generate a child i/o.  So, toss the override bp and restart.
2962                  * This is safe, because using the override bp is just an
2963                  * optimization; and it's rare, so the cost doesn't matter.
2964                  */
2965                 if (zio->io_bp_override) {
2966                         zio_pop_transforms(zio);
2967                         zio->io_stage = ZIO_STAGE_OPEN;
2968                         zio->io_pipeline = ZIO_WRITE_PIPELINE;
2969                         zio->io_bp_override = NULL;
2970                         BP_ZERO(bp);
2971                         ddt_exit(ddt);
2972                         return (ZIO_PIPELINE_CONTINUE);
2973                 }
2974
2975                 dio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
2976                     zio->io_orig_size, zio->io_orig_size, &czp, NULL, NULL,
2977                     NULL, zio_ddt_ditto_write_done, dde, zio->io_priority,
2978                     ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
2979
2980                 zio_push_transform(dio, zio->io_abd, zio->io_size, 0, NULL);
2981                 dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
2982         }
2983
2984         if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
2985                 if (ddp->ddp_phys_birth != 0)
2986                         ddt_bp_fill(ddp, bp, txg);
2987                 if (dde->dde_lead_zio[p] != NULL)
2988                         zio_add_child(zio, dde->dde_lead_zio[p]);
2989                 else
2990                         ddt_phys_addref(ddp);
2991         } else if (zio->io_bp_override) {
2992                 ASSERT(bp->blk_birth == txg);
2993                 ASSERT(BP_EQUAL(bp, zio->io_bp_override));
2994                 ddt_phys_fill(ddp, bp);
2995                 ddt_phys_addref(ddp);
2996         } else {
2997                 cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
2998                     zio->io_orig_size, zio->io_orig_size, zp,
2999                     zio_ddt_child_write_ready, NULL, NULL,
3000                     zio_ddt_child_write_done, dde, zio->io_priority,
3001                     ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
3002
3003                 zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL);
3004                 dde->dde_lead_zio[p] = cio;
3005         }
3006
3007         ddt_exit(ddt);
3008
3009         if (cio)
3010                 zio_nowait(cio);
3011         if (dio)
3012                 zio_nowait(dio);
3013
3014         return (ZIO_PIPELINE_CONTINUE);
3015 }
3016
3017 ddt_entry_t *freedde; /* for debugging */
3018
3019 static int
3020 zio_ddt_free(zio_t *zio)
3021 {
3022         spa_t *spa = zio->io_spa;
3023         blkptr_t *bp = zio->io_bp;
3024         ddt_t *ddt = ddt_select(spa, bp);
3025         ddt_entry_t *dde;
3026         ddt_phys_t *ddp;
3027
3028         ASSERT(BP_GET_DEDUP(bp));
3029         ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
3030
3031         ddt_enter(ddt);
3032         freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
3033         if (dde) {
3034                 ddp = ddt_phys_select(dde, bp);
3035                 if (ddp)
3036                         ddt_phys_decref(ddp);
3037         }
3038         ddt_exit(ddt);
3039
3040         return (ZIO_PIPELINE_CONTINUE);
3041 }
3042
3043 /*
3044  * ==========================================================================
3045  * Allocate and free blocks
3046  * ==========================================================================
3047  */
3048
3049 static zio_t *
3050 zio_io_to_allocate(spa_t *spa)
3051 {
3052         zio_t *zio;
3053
3054         ASSERT(MUTEX_HELD(&spa->spa_alloc_lock));
3055
3056         zio = avl_first(&spa->spa_alloc_tree);
3057         if (zio == NULL)
3058                 return (NULL);
3059
3060         ASSERT(IO_IS_ALLOCATING(zio));
3061
3062         /*
3063          * Try to place a reservation for this zio. If we're unable to
3064          * reserve then we throttle.
3065          */
3066         if (!metaslab_class_throttle_reserve(spa_normal_class(spa),
3067             zio->io_prop.zp_copies, zio, 0)) {
3068                 return (NULL);
3069         }
3070
3071         avl_remove(&spa->spa_alloc_tree, zio);
3072         ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE);
3073
3074         return (zio);
3075 }
3076
3077 static int
3078 zio_dva_throttle(zio_t *zio)
3079 {
3080         spa_t *spa = zio->io_spa;
3081         zio_t *nio;
3082
3083         if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE ||
3084             !spa_normal_class(zio->io_spa)->mc_alloc_throttle_enabled ||
3085             zio->io_child_type == ZIO_CHILD_GANG ||
3086             zio->io_flags & ZIO_FLAG_NODATA) {
3087                 return (ZIO_PIPELINE_CONTINUE);
3088         }
3089
3090         ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
3091
3092         ASSERT3U(zio->io_queued_timestamp, >, 0);
3093         ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE);
3094
3095         mutex_enter(&spa->spa_alloc_lock);
3096
3097         ASSERT(zio->io_type == ZIO_TYPE_WRITE);
3098         avl_add(&spa->spa_alloc_tree, zio);
3099
3100         nio = zio_io_to_allocate(zio->io_spa);
3101         mutex_exit(&spa->spa_alloc_lock);
3102
3103         if (nio == zio)
3104                 return (ZIO_PIPELINE_CONTINUE);
3105
3106         if (nio != NULL) {
3107                 ASSERT(nio->io_stage == ZIO_STAGE_DVA_THROTTLE);
3108                 /*
3109                  * We are passing control to a new zio so make sure that
3110                  * it is processed by a different thread. We do this to
3111                  * avoid stack overflows that can occur when parents are
3112                  * throttled and children are making progress. We allow
3113                  * it to go to the head of the taskq since it's already
3114                  * been waiting.
3115                  */
3116                 zio_taskq_dispatch(nio, ZIO_TASKQ_ISSUE, B_TRUE);
3117         }
3118         return (ZIO_PIPELINE_STOP);
3119 }
3120
3121 void
3122 zio_allocate_dispatch(spa_t *spa)
3123 {
3124         zio_t *zio;
3125
3126         mutex_enter(&spa->spa_alloc_lock);
3127         zio = zio_io_to_allocate(spa);
3128         mutex_exit(&spa->spa_alloc_lock);
3129         if (zio == NULL)
3130                 return;
3131
3132         ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE);
3133         ASSERT0(zio->io_error);
3134         zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE);
3135 }
3136
3137 static int
3138 zio_dva_allocate(zio_t *zio)
3139 {
3140         spa_t *spa = zio->io_spa;
3141         metaslab_class_t *mc = spa_normal_class(spa);
3142         blkptr_t *bp = zio->io_bp;
3143         int error;
3144         int flags = 0;
3145
3146         if (zio->io_gang_leader == NULL) {
3147                 ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
3148                 zio->io_gang_leader = zio;
3149         }
3150
3151         ASSERT(BP_IS_HOLE(bp));
3152         ASSERT0(BP_GET_NDVAS(bp));
3153         ASSERT3U(zio->io_prop.zp_copies, >, 0);
3154         ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
3155         ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
3156
3157         flags |= (zio->io_flags & ZIO_FLAG_FASTWRITE) ? METASLAB_FASTWRITE : 0;
3158         if (zio->io_flags & ZIO_FLAG_NODATA)
3159                 flags |= METASLAB_DONT_THROTTLE;
3160         if (zio->io_flags & ZIO_FLAG_GANG_CHILD)
3161                 flags |= METASLAB_GANG_CHILD;
3162         if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE)
3163                 flags |= METASLAB_ASYNC_ALLOC;
3164
3165         error = metaslab_alloc(spa, mc, zio->io_size, bp,
3166             zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
3167             &zio->io_alloc_list, zio);
3168
3169         if (error != 0) {
3170                 spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
3171                     "size %llu, error %d", spa_name(spa), zio, zio->io_size,
3172                     error);
3173                 if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
3174                         return (zio_write_gang_block(zio));
3175                 zio->io_error = error;
3176         }
3177
3178         return (ZIO_PIPELINE_CONTINUE);
3179 }
3180
3181 static int
3182 zio_dva_free(zio_t *zio)
3183 {
3184         metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);
3185
3186         return (ZIO_PIPELINE_CONTINUE);
3187 }
3188
3189 static int
3190 zio_dva_claim(zio_t *zio)
3191 {
3192         int error;
3193
3194         error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
3195         if (error)
3196                 zio->io_error = error;
3197
3198         return (ZIO_PIPELINE_CONTINUE);
3199 }
3200
3201 /*
3202  * Undo an allocation.  This is used by zio_done() when an I/O fails
3203  * and we want to give back the block we just allocated.
3204  * This handles both normal blocks and gang blocks.
3205  */
3206 static void
3207 zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
3208 {
3209         ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
3210         ASSERT(zio->io_bp_override == NULL);
3211
3212         if (!BP_IS_HOLE(bp))
3213                 metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);
3214
3215         if (gn != NULL) {
3216                 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
3217                         zio_dva_unallocate(zio, gn->gn_child[g],
3218                             &gn->gn_gbh->zg_blkptr[g]);
3219                 }
3220         }
3221 }
3222
3223 /*
3224  * Try to allocate an intent log block.  Return 0 on success, errno on failure.
3225  */
3226 int
3227 zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
3228     uint64_t size, boolean_t *slog)
3229 {
3230         int error = 1;
3231         zio_alloc_list_t io_alloc_list;
3232
3233         ASSERT(txg > spa_syncing_txg(spa));
3234
3235         metaslab_trace_init(&io_alloc_list);
3236         error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
3237             txg, NULL, METASLAB_FASTWRITE, &io_alloc_list, NULL);
3238         if (error == 0) {
3239                 *slog = TRUE;
3240         } else {
3241                 error = metaslab_alloc(spa, spa_normal_class(spa), size,
3242                     new_bp, 1, txg, NULL, METASLAB_FASTWRITE,
3243                     &io_alloc_list, NULL);
3244                 if (error == 0)
3245                         *slog = FALSE;
3246         }
3247         metaslab_trace_fini(&io_alloc_list);
3248
3249         if (error == 0) {
3250                 BP_SET_LSIZE(new_bp, size);
3251                 BP_SET_PSIZE(new_bp, size);
3252                 BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
3253                 BP_SET_CHECKSUM(new_bp,
3254                     spa_version(spa) >= SPA_VERSION_SLIM_ZIL
3255                     ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
3256                 BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
3257                 BP_SET_LEVEL(new_bp, 0);
3258                 BP_SET_DEDUP(new_bp, 0);
3259                 BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
3260
3261                 /*
3262                  * encrypted blocks will require an IV and salt. We generate
3263                  * these now since we will not be rewriting the bp at
3264                  * rewrite time.
3265                  */
3266                 if (os->os_encrypted) {
3267                         uint8_t iv[ZIO_DATA_IV_LEN];
3268                         uint8_t salt[ZIO_DATA_SALT_LEN];
3269
3270                         BP_SET_CRYPT(new_bp, B_TRUE);
3271                         VERIFY0(spa_crypt_get_salt(spa,
3272                             dmu_objset_id(os), salt));
3273                         VERIFY0(zio_crypt_generate_iv(iv));
3274
3275                         zio_crypt_encode_params_bp(new_bp, salt, iv);
3276                 }
3277         }
3278
3279         return (error);
3280 }
3281
3282 /*
3283  * Free an intent log block.
3284  */
3285 void
3286 zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp)
3287 {
3288         ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG);
3289         ASSERT(!BP_IS_GANG(bp));
3290
3291         zio_free(spa, txg, bp);
3292 }
3293
3294 /*
3295  * ==========================================================================
3296  * Read and write to physical devices
3297  * ==========================================================================
3298  */
3299
3300
3301 /*
3302  * Issue an I/O to the underlying vdev. Typically the issue pipeline
3303  * stops after this stage and will resume upon I/O completion.
3304  * However, there are instances where the vdev layer may need to
3305  * continue the pipeline when an I/O was not issued. Since the I/O
3306  * that was sent to the vdev layer might be different than the one
3307  * currently active in the pipeline (see vdev_queue_io()), we explicitly
3308  * force the underlying vdev layers to call either zio_execute() or
3309  * zio_interrupt() to ensure that the pipeline continues with the correct I/O.
3310  */
3311 static int
3312 zio_vdev_io_start(zio_t *zio)
3313 {
3314         vdev_t *vd = zio->io_vd;
3315         uint64_t align;
3316         spa_t *spa = zio->io_spa;
3317
3318         zio->io_delay = 0;
3319
3320         ASSERT(zio->io_error == 0);
3321         ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
3322
3323         if (vd == NULL) {
3324                 if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
3325                         spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
3326
3327                 /*
3328                  * The mirror_ops handle multiple DVAs in a single BP.
3329                  */
3330                 vdev_mirror_ops.vdev_op_io_start(zio);
3331                 return (ZIO_PIPELINE_STOP);
3332         }
3333
3334         ASSERT3P(zio->io_logical, !=, zio);
3335
3336         /*
3337          * We keep track of time-sensitive I/Os so that the scan thread
3338          * can quickly react to certain workloads.  In particular, we care
3339          * about non-scrubbing, top-level reads and writes with the following
3340          * characteristics:
3341          *      - synchronous writes of user data to non-slog devices
3342          *      - any reads of user data
3343          * When these conditions are met, adjust the timestamp of spa_last_io
3344          * which allows the scan thread to adjust its workload accordingly.
3345          */
3346         if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
3347             vd == vd->vdev_top && !vd->vdev_islog &&
3348             zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
3349             zio->io_txg != spa_syncing_txg(spa)) {
3350                 uint64_t old = spa->spa_last_io;
3351                 uint64_t new = ddi_get_lbolt64();
3352                 if (old != new)
3353                         (void) atomic_cas_64(&spa->spa_last_io, old, new);
3354         }
3355
3356         align = 1ULL << vd->vdev_top->vdev_ashift;
3357
3358         if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
3359             P2PHASE(zio->io_size, align) != 0) {
3360                 /* Transform logical writes to be a full physical block size. */
3361                 uint64_t asize = P2ROUNDUP(zio->io_size, align);
3362                 abd_t *abuf = abd_alloc_sametype(zio->io_abd, asize);
3363                 ASSERT(vd == vd->vdev_top);
3364                 if (zio->io_type == ZIO_TYPE_WRITE) {
3365                         abd_copy(abuf, zio->io_abd, zio->io_size);
3366                         abd_zero_off(abuf, zio->io_size, asize - zio->io_size);
3367                 }
3368                 zio_push_transform(zio, abuf, asize, asize, zio_subblock);
3369         }
3370
3371         /*
3372          * If this is not a physical io, make sure that it is properly aligned
3373          * before proceeding.
3374          */
3375         if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) {
3376                 ASSERT0(P2PHASE(zio->io_offset, align));
3377                 ASSERT0(P2PHASE(zio->io_size, align));
3378         } else {
3379                 /*
3380                  * For physical writes, we allow 512b aligned writes and assume
3381                  * the device will perform a read-modify-write as necessary.
3382                  */
3383                 ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE));
3384                 ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE));
3385         }
3386
3387         VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
3388
3389         /*
3390          * If this is a repair I/O, and there's no self-healing involved --
3391          * that is, we're just resilvering what we expect to resilver --
3392          * then don't do the I/O unless zio's txg is actually in vd's DTL.
3393          * This prevents spurious resilvering with nested replication.
3394          * For example, given a mirror of mirrors, (A+B)+(C+D), if only
3395          * A is out of date, we'll read from C+D, then use the data to
3396          * resilver A+B -- but we don't actually want to resilver B, just A.
3397          * The top-level mirror has no way to know this, so instead we just
3398          * discard unnecessary repairs as we work our way down the vdev tree.
3399          * The same logic applies to any form of nested replication:
3400          * ditto + mirror, RAID-Z + replacing, etc.  This covers them all.
3401          */
3402         if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
3403             !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
3404             zio->io_txg != 0 && /* not a delegated i/o */
3405             !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
3406                 ASSERT(zio->io_type == ZIO_TYPE_WRITE);
3407                 zio_vdev_io_bypass(zio);
3408                 return (ZIO_PIPELINE_CONTINUE);
3409         }
3410
3411         if (vd->vdev_ops->vdev_op_leaf &&
3412             (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
3413
3414                 if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio))
3415                         return (ZIO_PIPELINE_CONTINUE);
3416
3417                 if ((zio = vdev_queue_io(zio)) == NULL)
3418                         return (ZIO_PIPELINE_STOP);
3419
3420                 if (!vdev_accessible(vd, zio)) {
3421                         zio->io_error = SET_ERROR(ENXIO);
3422                         zio_interrupt(zio);
3423                         return (ZIO_PIPELINE_STOP);
3424                 }
3425                 zio->io_delay = gethrtime();
3426         }
3427
3428         vd->vdev_ops->vdev_op_io_start(zio);
3429         return (ZIO_PIPELINE_STOP);
3430 }
3431
3432 static int
3433 zio_vdev_io_done(zio_t *zio)
3434 {
3435         vdev_t *vd = zio->io_vd;
3436         vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
3437         boolean_t unexpected_error = B_FALSE;
3438
3439         if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
3440                 return (ZIO_PIPELINE_STOP);
3441
3442         ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
3443
3444         if (zio->io_delay)
3445                 zio->io_delay = gethrtime() - zio->io_delay;
3446
3447         if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {
3448
3449                 vdev_queue_io_done(zio);
3450
3451                 if (zio->io_type == ZIO_TYPE_WRITE)
3452                         vdev_cache_write(zio);
3453
3454                 if (zio_injection_enabled && zio->io_error == 0)
3455                         zio->io_error = zio_handle_device_injections(vd, zio,
3456                             EIO, EILSEQ);
3457
3458                 if (zio_injection_enabled && zio->io_error == 0)
3459                         zio->io_error = zio_handle_label_injection(zio, EIO);
3460
3461                 if (zio->io_error) {
3462                         if (!vdev_accessible(vd, zio)) {
3463                                 zio->io_error = SET_ERROR(ENXIO);
3464                         } else {
3465                                 unexpected_error = B_TRUE;
3466                         }
3467                 }
3468         }
3469
3470         ops->vdev_op_io_done(zio);
3471
3472         if (unexpected_error)
3473                 VERIFY(vdev_probe(vd, zio) == NULL);
3474
3475         return (ZIO_PIPELINE_CONTINUE);
3476 }
3477
3478 /*
3479  * For non-raidz ZIOs, we can just copy aside the bad data read from the
3480  * disk, and use that to finish the checksum ereport later.
3481  */
3482 static void
3483 zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
3484     const abd_t *good_buf)
3485 {
3486         /* no processing needed */
3487         zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
3488 }
3489
3490 /*ARGSUSED*/
3491 void
3492 zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
3493 {
3494         void *abd = abd_alloc_sametype(zio->io_abd, zio->io_size);
3495
3496         abd_copy(abd, zio->io_abd, zio->io_size);
3497
3498         zcr->zcr_cbinfo = zio->io_size;
3499         zcr->zcr_cbdata = abd;
3500         zcr->zcr_finish = zio_vsd_default_cksum_finish;
3501         zcr->zcr_free = zio_abd_free;
3502 }
3503
3504 static int
3505 zio_vdev_io_assess(zio_t *zio)
3506 {
3507         vdev_t *vd = zio->io_vd;
3508
3509         if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
3510                 return (ZIO_PIPELINE_STOP);
3511
3512         if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
3513                 spa_config_exit(zio->io_spa, SCL_ZIO, zio);
3514
3515         if (zio->io_vsd != NULL) {
3516                 zio->io_vsd_ops->vsd_free(zio);
3517                 zio->io_vsd = NULL;
3518         }
3519
3520         if (zio_injection_enabled && zio->io_error == 0)
3521                 zio->io_error = zio_handle_fault_injection(zio, EIO);
3522
3523         /*
3524          * If the I/O failed, determine whether we should attempt to retry it.
3525          *
3526          * On retry, we cut in line in the issue queue, since we don't want
3527          * compression/checksumming/etc. work to prevent our (cheap) IO reissue.
3528          */
3529         if (zio->io_error && vd == NULL &&
3530             !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
3531                 ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */
3532                 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS));  /* not a leaf */
3533                 zio->io_error = 0;
3534                 zio->io_flags |= ZIO_FLAG_IO_RETRY |
3535                     ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
3536                 zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
3537                 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
3538                     zio_requeue_io_start_cut_in_line);
3539                 return (ZIO_PIPELINE_STOP);
3540         }
3541
3542         /*
3543          * If we got an error on a leaf device, convert it to ENXIO
3544          * if the device is not accessible at all.
3545          */
3546         if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
3547             !vdev_accessible(vd, zio))
3548                 zio->io_error = SET_ERROR(ENXIO);
3549
3550         /*
3551          * If we can't write to an interior vdev (mirror or RAID-Z),
3552          * set vdev_cant_write so that we stop trying to allocate from it.
3553          */
3554         if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
3555             vd != NULL && !vd->vdev_ops->vdev_op_leaf) {
3556                 vd->vdev_cant_write = B_TRUE;
3557         }
3558
3559         /*
3560          * If a cache flush returns ENOTSUP or ENOTTY, we know that no future
3561          * attempts will ever succeed. In this case we set a persistent bit so
3562          * that we don't bother with it in the future.
3563          */
3564         if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) &&
3565             zio->io_type == ZIO_TYPE_IOCTL &&
3566             zio->io_cmd == DKIOCFLUSHWRITECACHE && vd != NULL)
3567                 vd->vdev_nowritecache = B_TRUE;
3568
3569         if (zio->io_error)
3570                 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
3571
3572         if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
3573             zio->io_physdone != NULL) {
3574                 ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
3575                 ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
3576                 zio->io_physdone(zio->io_logical);
3577         }
3578
3579         return (ZIO_PIPELINE_CONTINUE);
3580 }
3581
3582 void
3583 zio_vdev_io_reissue(zio_t *zio)
3584 {
3585         ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
3586         ASSERT(zio->io_error == 0);
3587
3588         zio->io_stage >>= 1;
3589 }
3590
3591 void
3592 zio_vdev_io_redone(zio_t *zio)
3593 {
3594         ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
3595
3596         zio->io_stage >>= 1;
3597 }
3598
3599 void
3600 zio_vdev_io_bypass(zio_t *zio)
3601 {
3602         ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
3603         ASSERT(zio->io_error == 0);
3604
3605         zio->io_flags |= ZIO_FLAG_IO_BYPASS;
3606         zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
3607 }
3608
3609 /*
3610  * ==========================================================================
3611  * Encrypt and store encryption parameters
3612  * ==========================================================================
3613  */
3614
3615
3616 /*
3617  * This function is used for ZIO_STAGE_ENCRYPT. It is responsible for
3618  * managing the storage of encryption parameters and passing them to the
3619  * lower-level encryption functions.
3620  */
3621 static int
3622 zio_encrypt(zio_t *zio)
3623 {
3624         zio_prop_t *zp = &zio->io_prop;
3625         spa_t *spa = zio->io_spa;
3626         blkptr_t *bp = zio->io_bp;
3627         uint64_t psize = BP_GET_PSIZE(bp);
3628         dmu_object_type_t ot = BP_GET_TYPE(bp);
3629         void *enc_buf = NULL;
3630         abd_t *eabd = NULL;
3631         uint8_t salt[ZIO_DATA_SALT_LEN];
3632         uint8_t iv[ZIO_DATA_IV_LEN];
3633         uint8_t mac[ZIO_DATA_MAC_LEN];
3634         boolean_t no_crypt = B_FALSE;
3635
3636         /* the root zio already encrypted the data */
3637         if (zio->io_child_type == ZIO_CHILD_GANG)
3638                 return (ZIO_PIPELINE_CONTINUE);
3639
3640         /* only ZIL blocks are re-encrypted on rewrite */
3641         if (!IO_IS_ALLOCATING(zio) && ot != DMU_OT_INTENT_LOG)
3642                 return (ZIO_PIPELINE_CONTINUE);
3643
3644         if (!(zp->zp_encrypt || BP_IS_ENCRYPTED(bp))) {
3645                 BP_SET_CRYPT(bp, B_FALSE);
3646                 return (ZIO_PIPELINE_CONTINUE);
3647         }
3648
3649         /* if we are doing raw encryption set the provided encryption params */
3650         if (zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) {
3651                 BP_SET_CRYPT(bp, B_TRUE);
3652                 BP_SET_BYTEORDER(bp, zp->zp_byteorder);
3653                 if (ot != DMU_OT_OBJSET)
3654                         zio_crypt_encode_mac_bp(bp, zp->zp_mac);
3655                 if (DMU_OT_IS_ENCRYPTED(ot))
3656                         zio_crypt_encode_params_bp(bp, zp->zp_salt, zp->zp_iv);
3657                 return (ZIO_PIPELINE_CONTINUE);
3658         }
3659
3660         /* indirect blocks only maintain a cksum of the lower level MACs */
3661         if (BP_GET_LEVEL(bp) > 0) {
3662                 BP_SET_CRYPT(bp, B_TRUE);
3663                 VERIFY0(zio_crypt_do_indirect_mac_checksum_abd(B_TRUE,
3664                     zio->io_orig_abd, BP_GET_LSIZE(bp), BP_SHOULD_BYTESWAP(bp),
3665                     mac));
3666                 zio_crypt_encode_mac_bp(bp, mac);
3667                 return (ZIO_PIPELINE_CONTINUE);
3668         }
3669
3670         /*
3671          * Objset blocks are a special case since they have 2 256-bit MACs
3672          * embedded within them.
3673          */
3674         if (ot == DMU_OT_OBJSET) {
3675                 ASSERT0(DMU_OT_IS_ENCRYPTED(ot));
3676                 ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
3677                 BP_SET_CRYPT(bp, B_TRUE);
3678                 VERIFY0(spa_do_crypt_objset_mac_abd(B_TRUE, spa,
3679                     zio->io_bookmark.zb_objset, zio->io_abd, psize,
3680                     BP_SHOULD_BYTESWAP(bp)));
3681                 return (ZIO_PIPELINE_CONTINUE);
3682         }
3683
3684         /* unencrypted object types are only authenticated with a MAC */
3685         if (!DMU_OT_IS_ENCRYPTED(ot)) {
3686                 BP_SET_CRYPT(bp, B_TRUE);
3687                 VERIFY0(spa_do_crypt_mac_abd(B_TRUE, spa,
3688                     zio->io_bookmark.zb_objset, zio->io_abd, psize, mac));
3689                 zio_crypt_encode_mac_bp(bp, mac);
3690                 return (ZIO_PIPELINE_CONTINUE);
3691         }
3692
3693         /*
3694          * Later passes of sync-to-convergence may decide to rewrite data
3695          * in place to avoid more disk reallocations. This presents a problem
3696          * for encryption because this consitutes rewriting the new data with
3697          * the same encryption key and IV. However, this only applies to blocks
3698          * in the MOS (particularly the spacemaps) and we do not encrypt the
3699          * MOS. We assert that the zio is allocating or an intent log write
3700          * to enforce this.
3701          */
3702         ASSERT(IO_IS_ALLOCATING(zio) || ot == DMU_OT_INTENT_LOG);
3703         ASSERT(BP_GET_LEVEL(bp) == 0 || ot == DMU_OT_INTENT_LOG);
3704         ASSERT(spa_feature_is_active(spa, SPA_FEATURE_ENCRYPTION));
3705         ASSERT3U(psize, !=, 0);
3706
3707         enc_buf = zio_buf_alloc(psize);
3708         eabd = abd_get_from_buf(enc_buf, psize);
3709         abd_take_ownership_of_buf(eabd, B_TRUE);
3710
3711         /*
3712          * For an explanation of what encryption parameters are stored
3713          * where, see the block comment in zio_crypt.c.
3714          */
3715         if (ot == DMU_OT_INTENT_LOG) {
3716                 zio_crypt_decode_params_bp(bp, salt, iv);
3717         } else {
3718                 BP_SET_CRYPT(bp, B_TRUE);
3719         }
3720
3721         /* Perform the encryption. This should not fail */
3722         VERIFY0(spa_do_crypt_abd(B_TRUE, spa, zio->io_bookmark.zb_objset, bp,
3723             zio->io_txg, psize, zio->io_abd, eabd, iv, mac, salt, &no_crypt));
3724
3725         /* encode encryption metadata into the bp */
3726         if (ot == DMU_OT_INTENT_LOG) {
3727                 /*
3728                  * ZIL blocks store the MAC in the embedded checksum, so the
3729                  * transform must always be applied.
3730                  */
3731                 zio_crypt_encode_mac_zil(enc_buf, mac);
3732                 zio_push_transform(zio, eabd, psize, psize, NULL);
3733         } else {
3734                 BP_SET_CRYPT(bp, B_TRUE);
3735                 zio_crypt_encode_params_bp(bp, salt, iv);
3736                 zio_crypt_encode_mac_bp(bp, mac);
3737
3738                 if (no_crypt) {
3739                         ASSERT3U(ot, ==, DMU_OT_DNODE);
3740                         abd_free(eabd);
3741                 } else {
3742                         zio_push_transform(zio, eabd, psize, psize, NULL);
3743                 }
3744         }
3745
3746         return (ZIO_PIPELINE_CONTINUE);
3747 }
3748
3749 /*
3750  * ==========================================================================
3751  * Generate and verify checksums
3752  * ==========================================================================
3753  */
3754 static int
3755 zio_checksum_generate(zio_t *zio)
3756 {
3757         blkptr_t *bp = zio->io_bp;
3758         enum zio_checksum checksum;
3759
3760         if (bp == NULL) {
3761                 /*
3762                  * This is zio_write_phys().
3763                  * We're either generating a label checksum, or none at all.
3764                  */
3765                 checksum = zio->io_prop.zp_checksum;
3766
3767                 if (checksum == ZIO_CHECKSUM_OFF)
3768                         return (ZIO_PIPELINE_CONTINUE);
3769
3770                 ASSERT(checksum == ZIO_CHECKSUM_LABEL);
3771         } else {
3772                 if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
3773                         ASSERT(!IO_IS_ALLOCATING(zio));
3774                         checksum = ZIO_CHECKSUM_GANG_HEADER;
3775                 } else {
3776                         checksum = BP_GET_CHECKSUM(bp);
3777                 }
3778         }
3779
3780         zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size);
3781
3782         return (ZIO_PIPELINE_CONTINUE);
3783 }
3784
3785 static int
3786 zio_checksum_verify(zio_t *zio)
3787 {
3788         zio_bad_cksum_t info;
3789         blkptr_t *bp = zio->io_bp;
3790         int error;
3791
3792         ASSERT(zio->io_vd != NULL);
3793
3794         if (bp == NULL) {
3795                 /*
3796                  * This is zio_read_phys().
3797                  * We're either verifying a label checksum, or nothing at all.
3798                  */
3799                 if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
3800                         return (ZIO_PIPELINE_CONTINUE);
3801
3802                 ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
3803         }
3804
3805         if ((error = zio_checksum_error(zio, &info)) != 0) {
3806                 zio->io_error = error;
3807                 if (error == ECKSUM &&
3808                     !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
3809                         zfs_ereport_start_checksum(zio->io_spa,
3810                             zio->io_vd, &zio->io_bookmark, zio,
3811                             zio->io_offset, zio->io_size, NULL, &info);
3812                 }
3813         }
3814
3815         return (ZIO_PIPELINE_CONTINUE);
3816 }
3817
3818 /*
3819  * Called by RAID-Z to ensure we don't compute the checksum twice.
3820  */
3821 void
3822 zio_checksum_verified(zio_t *zio)
3823 {
3824         zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
3825 }
3826
3827 /*
3828  * ==========================================================================
3829  * Error rank.  Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
3830  * An error of 0 indicates success.  ENXIO indicates whole-device failure,
3831  * which may be transient (e.g. unplugged) or permament.  ECKSUM and EIO
3832  * indicate errors that are specific to one I/O, and most likely permanent.
3833  * Any other error is presumed to be worse because we weren't expecting it.
3834  * ==========================================================================
3835  */
3836 int
3837 zio_worst_error(int e1, int e2)
3838 {
3839         static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
3840         int r1, r2;
3841
3842         for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++)
3843                 if (e1 == zio_error_rank[r1])
3844                         break;
3845
3846         for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++)
3847                 if (e2 == zio_error_rank[r2])
3848                         break;
3849
3850         return (r1 > r2 ? e1 : e2);
3851 }
3852
3853 /*
3854  * ==========================================================================
3855  * I/O completion
3856  * ==========================================================================
3857  */
3858 static int
3859 zio_ready(zio_t *zio)
3860 {
3861         blkptr_t *bp = zio->io_bp;
3862         zio_t *pio, *pio_next;
3863         zio_link_t *zl = NULL;
3864
3865         if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
3866             zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
3867                 return (ZIO_PIPELINE_STOP);
3868
3869         if (zio->io_ready) {
3870                 ASSERT(IO_IS_ALLOCATING(zio));
3871                 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
3872                     (zio->io_flags & ZIO_FLAG_NOPWRITE));
3873                 ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
3874
3875                 zio->io_ready(zio);
3876         }
3877
3878         if (bp != NULL && bp != &zio->io_bp_copy)
3879                 zio->io_bp_copy = *bp;
3880
3881         if (zio->io_error != 0) {
3882                 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
3883
3884                 if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
3885                         ASSERT(IO_IS_ALLOCATING(zio));
3886                         ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
3887                         /*
3888                          * We were unable to allocate anything, unreserve and
3889                          * issue the next I/O to allocate.
3890                          */
3891                         metaslab_class_throttle_unreserve(
3892                             spa_normal_class(zio->io_spa),
3893                             zio->io_prop.zp_copies, zio);
3894                         zio_allocate_dispatch(zio->io_spa);
3895                 }
3896         }
3897
3898         mutex_enter(&zio->io_lock);
3899         zio->io_state[ZIO_WAIT_READY] = 1;
3900         pio = zio_walk_parents(zio, &zl);
3901         mutex_exit(&zio->io_lock);
3902
3903         /*
3904          * As we notify zio's parents, new parents could be added.
3905          * New parents go to the head of zio's io_parent_list, however,
3906          * so we will (correctly) not notify them.  The remainder of zio's
3907          * io_parent_list, from 'pio_next' onward, cannot change because
3908          * all parents must wait for us to be done before they can be done.
3909          */
3910         for (; pio != NULL; pio = pio_next) {
3911                 pio_next = zio_walk_parents(zio, &zl);
3912                 zio_notify_parent(pio, zio, ZIO_WAIT_READY);
3913         }
3914
3915         if (zio->io_flags & ZIO_FLAG_NODATA) {
3916                 if (BP_IS_GANG(bp)) {
3917                         zio->io_flags &= ~ZIO_FLAG_NODATA;
3918                 } else {
3919                         ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE);
3920                         zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
3921                 }
3922         }
3923
3924         if (zio_injection_enabled &&
3925             zio->io_spa->spa_syncing_txg == zio->io_txg)
3926                 zio_handle_ignored_writes(zio);
3927
3928         return (ZIO_PIPELINE_CONTINUE);
3929 }
3930
3931 /*
3932  * Update the allocation throttle accounting.
3933  */
3934 static void
3935 zio_dva_throttle_done(zio_t *zio)
3936 {
3937         ASSERTV(zio_t *lio = zio->io_logical);
3938         zio_t *pio = zio_unique_parent(zio);
3939         vdev_t *vd = zio->io_vd;
3940         int flags = METASLAB_ASYNC_ALLOC;
3941
3942         ASSERT3P(zio->io_bp, !=, NULL);
3943         ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
3944         ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE);
3945         ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
3946         ASSERT(vd != NULL);
3947         ASSERT3P(vd, ==, vd->vdev_top);
3948         ASSERT(zio_injection_enabled || !(zio->io_flags & ZIO_FLAG_IO_RETRY));
3949         ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
3950         ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING);
3951         ASSERT(!(lio->io_flags & ZIO_FLAG_IO_REWRITE));
3952         ASSERT(!(lio->io_orig_flags & ZIO_FLAG_NODATA));
3953
3954         /*
3955          * Parents of gang children can have two flavors -- ones that
3956          * allocated the gang header (will have ZIO_FLAG_IO_REWRITE set)
3957          * and ones that allocated the constituent blocks. The allocation
3958          * throttle needs to know the allocating parent zio so we must find
3959          * it here.
3960          */
3961         if (pio->io_child_type == ZIO_CHILD_GANG) {
3962                 /*
3963                  * If our parent is a rewrite gang child then our grandparent
3964                  * would have been the one that performed the allocation.
3965                  */
3966                 if (pio->io_flags & ZIO_FLAG_IO_REWRITE)
3967                         pio = zio_unique_parent(pio);
3968                 flags |= METASLAB_GANG_CHILD;
3969         }
3970
3971         ASSERT(IO_IS_ALLOCATING(pio));
3972         ASSERT3P(zio, !=, zio->io_logical);
3973         ASSERT(zio->io_logical != NULL);
3974         ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
3975         ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE);
3976
3977         mutex_enter(&pio->io_lock);
3978         metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags);
3979         mutex_exit(&pio->io_lock);
3980
3981         metaslab_class_throttle_unreserve(spa_normal_class(zio->io_spa),
3982             1, pio);
3983
3984         /*
3985          * Call into the pipeline to see if there is more work that
3986          * needs to be done. If there is work to be done it will be
3987          * dispatched to another taskq thread.
3988          */
3989         zio_allocate_dispatch(zio->io_spa);
3990 }
3991
3992 static int
3993 zio_done(zio_t *zio)
3994 {
3995         /*
3996          * Always attempt to keep stack usage minimal here since
3997          * we can be called recurisvely up to 19 levels deep.
3998          */
3999         const uint64_t psize = zio->io_size;
4000         zio_t *pio, *pio_next;
4001         zio_link_t *zl = NULL;
4002
4003         /*
4004          * If our children haven't all completed,
4005          * wait for them and then repeat this pipeline stage.
4006          */
4007         if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
4008             zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
4009             zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) ||
4010             zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
4011                 return (ZIO_PIPELINE_STOP);
4012
4013         /*
4014          * If the allocation throttle is enabled, then update the accounting.
4015          * We only track child I/Os that are part of an allocating async
4016          * write. We must do this since the allocation is performed
4017          * by the logical I/O but the actual write is done by child I/Os.
4018          */
4019         if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING &&
4020             zio->io_child_type == ZIO_CHILD_VDEV) {
4021                 ASSERT(spa_normal_class(
4022                     zio->io_spa)->mc_alloc_throttle_enabled);
4023                 zio_dva_throttle_done(zio);
4024         }
4025
4026         /*
4027          * If the allocation throttle is enabled, verify that
4028          * we have decremented the refcounts for every I/O that was throttled.
4029          */
4030         if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
4031                 ASSERT(zio->io_type == ZIO_TYPE_WRITE);
4032                 ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
4033                 ASSERT(zio->io_bp != NULL);
4034                 metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio);
4035                 VERIFY(refcount_not_held(
4036                     &(spa_normal_class(zio->io_spa)->mc_alloc_slots), zio));
4037         }
4038
4039
4040         for (int c = 0; c < ZIO_CHILD_TYPES; c++)
4041                 for (int w = 0; w < ZIO_WAIT_TYPES; w++)
4042                         ASSERT(zio->io_children[c][w] == 0);
4043
4044         if (zio->io_bp != NULL && !BP_IS_EMBEDDED(zio->io_bp)) {
4045                 ASSERT(zio->io_bp->blk_pad[0] == 0);
4046                 ASSERT(zio->io_bp->blk_pad[1] == 0);
4047                 ASSERT(bcmp(zio->io_bp, &zio->io_bp_copy,
4048                     sizeof (blkptr_t)) == 0 ||
4049                     (zio->io_bp == zio_unique_parent(zio)->io_bp));
4050                 if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(zio->io_bp) &&
4051                     zio->io_bp_override == NULL &&
4052                     !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
4053                         ASSERT(!BP_SHOULD_BYTESWAP(zio->io_bp));
4054                         ASSERT3U(zio->io_prop.zp_copies, <=,
4055                             BP_GET_NDVAS(zio->io_bp));
4056                         ASSERT(BP_COUNT_GANG(zio->io_bp) == 0 ||
4057                             (BP_COUNT_GANG(zio->io_bp) ==
4058                             BP_GET_NDVAS(zio->io_bp)));
4059                 }
4060                 if (zio->io_flags & ZIO_FLAG_NOPWRITE)
4061                         VERIFY(BP_EQUAL(zio->io_bp, &zio->io_bp_orig));
4062         }
4063
4064         /*
4065          * If there were child vdev/gang/ddt errors, they apply to us now.
4066          */
4067         zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
4068         zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
4069         zio_inherit_child_errors(zio, ZIO_CHILD_DDT);
4070
4071         /*
4072          * If the I/O on the transformed data was successful, generate any
4073          * checksum reports now while we still have the transformed data.
4074          */
4075         if (zio->io_error == 0) {
4076                 while (zio->io_cksum_report != NULL) {
4077                         zio_cksum_report_t *zcr = zio->io_cksum_report;
4078                         uint64_t align = zcr->zcr_align;
4079                         uint64_t asize = P2ROUNDUP(psize, align);
4080                         abd_t *adata = zio->io_abd;
4081
4082                         if (asize != psize) {
4083                                 adata = abd_alloc(asize, B_TRUE);
4084                                 abd_copy(adata, zio->io_abd, psize);
4085                                 abd_zero_off(adata, psize, asize - psize);
4086                         }
4087
4088                         zio->io_cksum_report = zcr->zcr_next;
4089                         zcr->zcr_next = NULL;
4090                         zcr->zcr_finish(zcr, adata);
4091                         zfs_ereport_free_checksum(zcr);
4092
4093                         if (asize != psize)
4094                                 abd_free(adata);
4095                 }
4096         }
4097
4098         zio_pop_transforms(zio);        /* note: may set zio->io_error */
4099
4100         vdev_stat_update(zio, psize);
4101
4102         /*
4103          * If this I/O is attached to a particular vdev is slow, exceeding
4104          * 30 seconds to complete, post an error described the I/O delay.
4105          * We ignore these errors if the device is currently unavailable.
4106          */
4107         if (zio->io_delay >= MSEC2NSEC(zio_delay_max)) {
4108                 if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd))
4109                         zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa,
4110                             zio->io_vd, &zio->io_bookmark, zio, 0, 0);
4111         }
4112
4113         if (zio->io_error) {
4114                 /*
4115                  * If this I/O is attached to a particular vdev,
4116                  * generate an error message describing the I/O failure
4117                  * at the block level.  We ignore these errors if the
4118                  * device is currently unavailable.
4119                  */
4120                 if (zio->io_error != ECKSUM && zio->io_vd != NULL &&
4121                     !vdev_is_dead(zio->io_vd))
4122                         zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa,
4123                             zio->io_vd, &zio->io_bookmark, zio, 0, 0);
4124
4125                 if ((zio->io_error == EIO || !(zio->io_flags &
4126                     (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
4127                     zio == zio->io_logical) {
4128                         /*
4129                          * For logical I/O requests, tell the SPA to log the
4130                          * error and generate a logical data ereport.
4131                          */
4132                         spa_log_error(zio->io_spa, &zio->io_bookmark);
4133                         zfs_ereport_post(FM_EREPORT_ZFS_DATA, zio->io_spa,
4134                             NULL, &zio->io_bookmark, zio, 0, 0);
4135                 }
4136         }
4137
4138         if (zio->io_error && zio == zio->io_logical) {
4139                 /*
4140                  * Determine whether zio should be reexecuted.  This will
4141                  * propagate all the way to the root via zio_notify_parent().
4142                  */
4143                 ASSERT(zio->io_vd == NULL && zio->io_bp != NULL);
4144                 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
4145
4146                 if (IO_IS_ALLOCATING(zio) &&
4147                     !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
4148                         if (zio->io_error != ENOSPC)
4149                                 zio->io_reexecute |= ZIO_REEXECUTE_NOW;
4150                         else
4151                                 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
4152                 }
4153
4154                 if ((zio->io_type == ZIO_TYPE_READ ||
4155                     zio->io_type == ZIO_TYPE_FREE) &&
4156                     !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
4157                     zio->io_error == ENXIO &&
4158                     spa_load_state(zio->io_spa) == SPA_LOAD_NONE &&
4159                     spa_get_failmode(zio->io_spa) != ZIO_FAILURE_MODE_CONTINUE)
4160                         zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
4161
4162                 if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
4163                         zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
4164
4165                 /*
4166                  * Here is a possibly good place to attempt to do
4167                  * either combinatorial reconstruction or error correction
4168                  * based on checksums.  It also might be a good place
4169                  * to send out preliminary ereports before we suspend
4170                  * processing.
4171                  */
4172         }
4173
4174         /*
4175          * If there were logical child errors, they apply to us now.
4176          * We defer this until now to avoid conflating logical child
4177          * errors with errors that happened to the zio itself when
4178          * updating vdev stats and reporting FMA events above.
4179          */
4180         zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
4181
4182         if ((zio->io_error || zio->io_reexecute) &&
4183             IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
4184             !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
4185                 zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp);
4186
4187         zio_gang_tree_free(&zio->io_gang_tree);
4188
4189         /*
4190          * Godfather I/Os should never suspend.
4191          */
4192         if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
4193             (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
4194                 zio->io_reexecute &= ~ZIO_REEXECUTE_SUSPEND;
4195
4196         if (zio->io_reexecute) {
4197                 /*
4198                  * This is a logical I/O that wants to reexecute.
4199                  *
4200                  * Reexecute is top-down.  When an i/o fails, if it's not
4201                  * the root, it simply notifies its parent and sticks around.
4202                  * The parent, seeing that it still has children in zio_done(),
4203                  * does the same.  This percolates all the way up to the root.
4204                  * The root i/o will reexecute or suspend the entire tree.
4205                  *
4206                  * This approach ensures that zio_reexecute() honors
4207                  * all the original i/o dependency relationships, e.g.
4208                  * parents not executing until children are ready.
4209                  */
4210                 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
4211
4212                 zio->io_gang_leader = NULL;
4213
4214                 mutex_enter(&zio->io_lock);
4215                 zio->io_state[ZIO_WAIT_DONE] = 1;
4216                 mutex_exit(&zio->io_lock);
4217
4218                 /*
4219                  * "The Godfather" I/O monitors its children but is
4220                  * not a true parent to them. It will track them through
4221                  * the pipeline but severs its ties whenever they get into
4222                  * trouble (e.g. suspended). This allows "The Godfather"
4223                  * I/O to return status without blocking.
4224                  */
4225                 zl = NULL;
4226                 for (pio = zio_walk_parents(zio, &zl); pio != NULL;
4227                     pio = pio_next) {
4228                         zio_link_t *remove_zl = zl;
4229                         pio_next = zio_walk_parents(zio, &zl);
4230
4231                         if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
4232                             (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
4233                                 zio_remove_child(pio, zio, remove_zl);
4234                                 zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
4235                         }
4236                 }
4237
4238                 if ((pio = zio_unique_parent(zio)) != NULL) {
4239                         /*
4240                          * We're not a root i/o, so there's nothing to do
4241                          * but notify our parent.  Don't propagate errors
4242                          * upward since we haven't permanently failed yet.
4243                          */
4244                         ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
4245                         zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
4246                         zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
4247                 } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
4248                         /*
4249                          * We'd fail again if we reexecuted now, so suspend
4250                          * until conditions improve (e.g. device comes online).
4251                          */
4252                         zio_suspend(zio->io_spa, zio);
4253                 } else {
4254                         /*
4255                          * Reexecution is potentially a huge amount of work.
4256                          * Hand it off to the otherwise-unused claim taskq.
4257                          */
4258                         ASSERT(taskq_empty_ent(&zio->io_tqent));
4259                         spa_taskq_dispatch_ent(zio->io_spa,
4260                             ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE,
4261                             (task_func_t *)zio_reexecute, zio, 0,
4262                             &zio->io_tqent);
4263                 }
4264                 return (ZIO_PIPELINE_STOP);
4265         }
4266
4267         ASSERT(zio->io_child_count == 0);
4268         ASSERT(zio->io_reexecute == 0);
4269         ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
4270
4271         /*
4272          * Report any checksum errors, since the I/O is complete.
4273          */
4274         while (zio->io_cksum_report != NULL) {
4275                 zio_cksum_report_t *zcr = zio->io_cksum_report;
4276                 zio->io_cksum_report = zcr->zcr_next;
4277                 zcr->zcr_next = NULL;
4278                 zcr->zcr_finish(zcr, NULL);
4279                 zfs_ereport_free_checksum(zcr);
4280         }
4281
4282         if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp &&
4283             !BP_IS_HOLE(zio->io_bp) && !BP_IS_EMBEDDED(zio->io_bp) &&
4284             !(zio->io_flags & ZIO_FLAG_NOPWRITE)) {
4285                 metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp);
4286         }
4287
4288         /*
4289          * It is the responsibility of the done callback to ensure that this
4290          * particular zio is no longer discoverable for adoption, and as
4291          * such, cannot acquire any new parents.
4292          */
4293         if (zio->io_done)
4294                 zio->io_done(zio);
4295
4296         mutex_enter(&zio->io_lock);
4297         zio->io_state[ZIO_WAIT_DONE] = 1;
4298         mutex_exit(&zio->io_lock);
4299
4300         zl = NULL;
4301         for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) {
4302                 zio_link_t *remove_zl = zl;
4303                 pio_next = zio_walk_parents(zio, &zl);
4304                 zio_remove_child(pio, zio, remove_zl);
4305                 zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
4306         }
4307
4308         if (zio->io_waiter != NULL) {
4309                 mutex_enter(&zio->io_lock);
4310                 zio->io_executor = NULL;
4311                 cv_broadcast(&zio->io_cv);
4312                 mutex_exit(&zio->io_lock);
4313         } else {
4314                 zio_destroy(zio);
4315         }
4316
4317         return (ZIO_PIPELINE_STOP);
4318 }
4319
4320 /*
4321  * ==========================================================================
4322  * I/O pipeline definition
4323  * ==========================================================================
4324  */
4325 static zio_pipe_stage_t *zio_pipeline[] = {
4326         NULL,
4327         zio_read_bp_init,
4328         zio_write_bp_init,
4329         zio_free_bp_init,
4330         zio_issue_async,
4331         zio_write_compress,
4332         zio_encrypt,
4333         zio_checksum_generate,
4334         zio_nop_write,
4335         zio_ddt_read_start,
4336         zio_ddt_read_done,
4337         zio_ddt_write,
4338         zio_ddt_free,
4339         zio_gang_assemble,
4340         zio_gang_issue,
4341         zio_dva_throttle,
4342         zio_dva_allocate,
4343         zio_dva_free,
4344         zio_dva_claim,
4345         zio_ready,
4346         zio_vdev_io_start,
4347         zio_vdev_io_done,
4348         zio_vdev_io_assess,
4349         zio_checksum_verify,
4350         zio_done
4351 };
4352
4353
4354
4355
4356 /*
4357  * Compare two zbookmark_phys_t's to see which we would reach first in a
4358  * pre-order traversal of the object tree.
4359  *
4360  * This is simple in every case aside from the meta-dnode object. For all other
4361  * objects, we traverse them in order (object 1 before object 2, and so on).
4362  * However, all of these objects are traversed while traversing object 0, since
4363  * the data it points to is the list of objects.  Thus, we need to convert to a
4364  * canonical representation so we can compare meta-dnode bookmarks to
4365  * non-meta-dnode bookmarks.
4366  *
4367  * We do this by calculating "equivalents" for each field of the zbookmark.
4368  * zbookmarks outside of the meta-dnode use their own object and level, and
4369  * calculate the level 0 equivalent (the first L0 blkid that is contained in the
4370  * blocks this bookmark refers to) by multiplying their blkid by their span
4371  * (the number of L0 blocks contained within one block at their level).
4372  * zbookmarks inside the meta-dnode calculate their object equivalent
4373  * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use
4374  * level + 1<<31 (any value larger than a level could ever be) for their level.
4375  * This causes them to always compare before a bookmark in their object
4376  * equivalent, compare appropriately to bookmarks in other objects, and to
4377  * compare appropriately to other bookmarks in the meta-dnode.
4378  */
4379 int
4380 zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2,
4381     const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2)
4382 {
4383         /*
4384          * These variables represent the "equivalent" values for the zbookmark,
4385          * after converting zbookmarks inside the meta dnode to their
4386          * normal-object equivalents.
4387          */
4388         uint64_t zb1obj, zb2obj;
4389         uint64_t zb1L0, zb2L0;
4390         uint64_t zb1level, zb2level;
4391
4392         if (zb1->zb_object == zb2->zb_object &&
4393             zb1->zb_level == zb2->zb_level &&
4394             zb1->zb_blkid == zb2->zb_blkid)
4395                 return (0);
4396
4397         /*
4398          * BP_SPANB calculates the span in blocks.
4399          */
4400         zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level);
4401         zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level);
4402
4403         if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
4404                 zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
4405                 zb1L0 = 0;
4406                 zb1level = zb1->zb_level + COMPARE_META_LEVEL;
4407         } else {
4408                 zb1obj = zb1->zb_object;
4409                 zb1level = zb1->zb_level;
4410         }
4411
4412         if (zb2->zb_object == DMU_META_DNODE_OBJECT) {
4413                 zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
4414                 zb2L0 = 0;
4415                 zb2level = zb2->zb_level + COMPARE_META_LEVEL;
4416         } else {
4417                 zb2obj = zb2->zb_object;
4418                 zb2level = zb2->zb_level;
4419         }
4420
4421         /* Now that we have a canonical representation, do the comparison. */
4422         if (zb1obj != zb2obj)
4423                 return (zb1obj < zb2obj ? -1 : 1);
4424         else if (zb1L0 != zb2L0)
4425                 return (zb1L0 < zb2L0 ? -1 : 1);
4426         else if (zb1level != zb2level)
4427                 return (zb1level > zb2level ? -1 : 1);
4428         /*
4429          * This can (theoretically) happen if the bookmarks have the same object
4430          * and level, but different blkids, if the block sizes are not the same.
4431          * There is presently no way to change the indirect block sizes
4432          */
4433         return (0);
4434 }
4435
4436 /*
4437  *  This function checks the following: given that last_block is the place that
4438  *  our traversal stopped last time, does that guarantee that we've visited
4439  *  every node under subtree_root?  Therefore, we can't just use the raw output
4440  *  of zbookmark_compare.  We have to pass in a modified version of
4441  *  subtree_root; by incrementing the block id, and then checking whether
4442  *  last_block is before or equal to that, we can tell whether or not having
4443  *  visited last_block implies that all of subtree_root's children have been
4444  *  visited.
4445  */
4446 boolean_t
4447 zbookmark_subtree_completed(const dnode_phys_t *dnp,
4448     const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block)
4449 {
4450         zbookmark_phys_t mod_zb = *subtree_root;
4451         mod_zb.zb_blkid++;
4452         ASSERT(last_block->zb_level == 0);
4453
4454         /* The objset_phys_t isn't before anything. */
4455         if (dnp == NULL)
4456                 return (B_FALSE);
4457
4458         /*
4459          * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the
4460          * data block size in sectors, because that variable is only used if
4461          * the bookmark refers to a block in the meta-dnode.  Since we don't
4462          * know without examining it what object it refers to, and there's no
4463          * harm in passing in this value in other cases, we always pass it in.
4464          *
4465          * We pass in 0 for the indirect block size shift because zb2 must be
4466          * level 0.  The indirect block size is only used to calculate the span
4467          * of the bookmark, but since the bookmark must be level 0, the span is
4468          * always 1, so the math works out.
4469          *
4470          * If you make changes to how the zbookmark_compare code works, be sure
4471          * to make sure that this code still works afterwards.
4472          */
4473         return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift,
4474             1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb,
4475             last_block) <= 0);
4476 }
4477
4478 #if defined(_KERNEL) && defined(HAVE_SPL)
4479 EXPORT_SYMBOL(zio_type_name);
4480 EXPORT_SYMBOL(zio_buf_alloc);
4481 EXPORT_SYMBOL(zio_data_buf_alloc);
4482 EXPORT_SYMBOL(zio_buf_free);
4483 EXPORT_SYMBOL(zio_data_buf_free);
4484
4485 module_param(zio_delay_max, int, 0644);
4486 MODULE_PARM_DESC(zio_delay_max, "Max zio millisec delay before posting event");
4487
4488 module_param(zio_requeue_io_start_cut_in_line, int, 0644);
4489 MODULE_PARM_DESC(zio_requeue_io_start_cut_in_line, "Prioritize requeued I/O");
4490
4491 module_param(zfs_sync_pass_deferred_free, int, 0644);
4492 MODULE_PARM_DESC(zfs_sync_pass_deferred_free,
4493         "Defer frees starting in this pass");
4494
4495 module_param(zfs_sync_pass_dont_compress, int, 0644);
4496 MODULE_PARM_DESC(zfs_sync_pass_dont_compress,
4497         "Don't compress starting in this pass");
4498
4499 module_param(zfs_sync_pass_rewrite, int, 0644);
4500 MODULE_PARM_DESC(zfs_sync_pass_rewrite,
4501         "Rewrite new bps starting in this pass");
4502
4503 module_param(zio_dva_throttle_enabled, int, 0644);
4504 MODULE_PARM_DESC(zio_dva_throttle_enabled,
4505         "Throttle block allocations in the ZIO pipeline");
4506 #endif