module/zfs/zio.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  24  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  25  */
  26
  27 #include <sys/zfs_context.h>
  28 #include <sys/fm/fs/zfs.h>
  29 #include <sys/spa.h>
  30 #include <sys/txg.h>
  31 #include <sys/spa_impl.h>
  32 #include <sys/vdev_impl.h>
  33 #include <sys/zio_impl.h>
  34 #include <sys/zio_compress.h>
  35 #include <sys/zio_checksum.h>
  36 #include <sys/dmu_objset.h>
  37 #include <sys/arc.h>
  38 #include <sys/ddt.h>
  39 #include <sys/blkptr.h>
  40 #include <sys/zfeature.h>
  41
  42 /*
  43  * ==========================================================================
  44  * I/O type descriptions
  45  * ==========================================================================
  46  */
  /*
   * Short printable name for each zio type; the table has ZIO_TYPES entries.
   * NOTE(review): presumably indexed by zio_type_t -- confirm the order
   * against the enum definition.
   */
  const char *zio_type_name[ZIO_TYPES] = {
          "z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl"
  };
  50
  51 /*
  52  * ==========================================================================
  53  * I/O kmem caches
  54  * ==========================================================================
  55  */
  /* Backs zio_t allocations (created in zio_init()). */
  kmem_cache_t *zio_cache;
  /* Backs zio_link_t allocations, i.e. parent/child link records. */
  kmem_cache_t *zio_link_cache;
  /*
   * Per-size buffer caches.  Slot c serves requests of up to
   * (c + 1) << SPA_MINBLOCKSHIFT bytes; zio_init() lets several slots share
   * one cache.
   */
  kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
  kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
  /* NOTE(review): consumer not visible in this part of the file -- confirm. */
  int zio_delay_max = ZIO_DELAY_MAX;
  61
  /*
   * NOTE(review): presumably the values a pipeline stage function returns to
   * say whether execution should go on to the next stage or stop -- the
   * users are not visible in this part of the file; confirm.
   */
  #define ZIO_PIPELINE_CONTINUE           0x100
  #define ZIO_PIPELINE_STOP               0x101
  64
  65 /*
  66  * The following actions directly affect the spa's sync-to-convergence logic.
  67  * The values below define the sync pass when we start performing the action.
  68  * Care should be taken when changing these values as they directly impact
  69  * spa_sync() performance. Tuning these values may introduce subtle performance
  70  * pathologies and should only be done in the context of performance analysis.
  71  * These tunables will eventually be removed and replaced with #defines once
  72  * enough analysis has been done to determine optimal values.
  73  *
  74  * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
  75  * regular blocks are not deferred.
  76  */
  77 int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
  78 int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
  79 int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
  80
  81 /*
  82  * An allocating zio is one that either currently has the DVA allocate
  83  * stage set or will have it later in its lifetime.
  84  */
  85 #define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)
  86
  /* NOTE(review): consumer not visible in this part of the file -- confirm. */
  int zio_requeue_io_start_cut_in_line = 1;

  /*
   * zio_init() creates buffer caches larger than this many bytes with
   * KMC_NODEBUG.  Without ZFS_DEBUG the limit is 0, so every cache gets
   * KMC_NODEBUG.
   */
  #ifdef ZFS_DEBUG
  int zio_buf_debug_limit = 16384;
  #else
  int zio_buf_debug_limit = 0;
  #endif

  /* Forward declaration: zio_notify_parent() calls this before its definition. */
  static inline void __zio_execute(zio_t *zio);
  96
  97 void
  98 zio_init(void)
  99 {
 100         size_t c;
 101         vmem_t *data_alloc_arena = NULL;
 102
 103         zio_cache = kmem_cache_create("zio_cache",
 104             sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 105         zio_link_cache = kmem_cache_create("zio_link_cache",
 106             sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 107
 108         /*
 109          * For small buffers, we want a cache for each multiple of
 110          * SPA_MINBLOCKSIZE.  For medium-size buffers, we want a cache
 111          * for each quarter-power of 2.  For large buffers, we want
 112          * a cache for each multiple of PAGESIZE.
 113          */
 114         for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
 115                 size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
 116                 size_t p2 = size;
 117                 size_t align = 0;
 118                 size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0;
 119
 120                 while (p2 & (p2 - 1))
 121                         p2 &= p2 - 1;
 122
 123 #ifndef _KERNEL
 124                 /*
 125                  * If we are using watchpoints, put each buffer on its own page,
 126                  * to eliminate the performance overhead of trapping to the
 127                  * kernel when modifying a non-watched buffer that shares the
 128                  * page with a watched buffer.
 129                  */
 130                 if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
 131                         continue;
 132 #endif
 133                 if (size <= 4 * SPA_MINBLOCKSIZE) {
 134                         align = SPA_MINBLOCKSIZE;
 135                 } else if (IS_P2ALIGNED(size, PAGESIZE)) {
 136                         align = PAGESIZE;
 137                 } else if (IS_P2ALIGNED(size, p2 >> 2)) {
 138                         align = p2 >> 2;
 139                 }
 140
 141                 if (align != 0) {
 142                         char name[36];
 143                         (void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
 144                         zio_buf_cache[c] = kmem_cache_create(name, size,
 145                             align, NULL, NULL, NULL, NULL, NULL, cflags);
 146
 147                         (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
 148                         zio_data_buf_cache[c] = kmem_cache_create(name, size,
 149                             align, NULL, NULL, NULL, NULL,
 150                             data_alloc_arena, cflags);
 151                 }
 152         }
 153
 154         while (--c != 0) {
 155                 ASSERT(zio_buf_cache[c] != NULL);
 156                 if (zio_buf_cache[c - 1] == NULL)
 157                         zio_buf_cache[c - 1] = zio_buf_cache[c];
 158
 159                 ASSERT(zio_data_buf_cache[c] != NULL);
 160                 if (zio_data_buf_cache[c - 1] == NULL)
 161                         zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
 162         }
 163
 164         zio_inject_init();
 165
 166         lz4_init();
 167 }
 168
 169 void
 170 zio_fini(void)
 171 {
 172         size_t c;
 173         kmem_cache_t *last_cache = NULL;
 174         kmem_cache_t *last_data_cache = NULL;
 175
 176         for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
 177                 if (zio_buf_cache[c] != last_cache) {
 178                         last_cache = zio_buf_cache[c];
 179                         kmem_cache_destroy(zio_buf_cache[c]);
 180                 }
 181                 zio_buf_cache[c] = NULL;
 182
 183                 if (zio_data_buf_cache[c] != last_data_cache) {
 184                         last_data_cache = zio_data_buf_cache[c];
 185                         kmem_cache_destroy(zio_data_buf_cache[c]);
 186                 }
 187                 zio_data_buf_cache[c] = NULL;
 188         }
 189
 190         kmem_cache_destroy(zio_link_cache);
 191         kmem_cache_destroy(zio_cache);
 192
 193         zio_inject_fini();
 194
 195         lz4_fini();
 196 }
 197
 198 /*
 199  * ==========================================================================
 200  * Allocate and free I/O buffers
 201  * ==========================================================================
 202  */
 203
 204 /*
 205  * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
 206  * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
 207  * useful to inspect ZFS metadata, but if possible, we should avoid keeping
 208  * excess / transient data in-core during a crashdump.
 209  */
 210 void *
 211 zio_buf_alloc(size_t size)
 212 {
 213         size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 214
 215         ASSERT3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 216
 217         return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
 218 }
 219
 220 /*
 221  * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
 222  * crashdump if the kernel panics.  This exists so that we will limit the amount
 223  * of ZFS data that shows up in a kernel crashdump.  (Thus reducing the amount
 224  * of kernel heap dumped to disk when the kernel panics)
 225  */
 226 void *
 227 zio_data_buf_alloc(size_t size)
 228 {
 229         size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 230
 231         ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 232
 233         return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
 234 }
 235
 236 void
 237 zio_buf_free(void *buf, size_t size)
 238 {
 239         size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 240
 241         ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 242
 243         kmem_cache_free(zio_buf_cache[c], buf);
 244 }
 245
 246 void
 247 zio_data_buf_free(void *buf, size_t size)
 248 {
 249         size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 250
 251         ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 252
 253         kmem_cache_free(zio_data_buf_cache[c], buf);
 254 }
 255
 256 /*
 257  * ==========================================================================
 258  * Push and pop I/O transform buffers
 259  * ==========================================================================
 260  */
 261 static void
 262 zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
 263         zio_transform_func_t *transform)
 264 {
 265         zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
 266
 267         zt->zt_orig_data = zio->io_data;
 268         zt->zt_orig_size = zio->io_size;
 269         zt->zt_bufsize = bufsize;
 270         zt->zt_transform = transform;
 271
 272         zt->zt_next = zio->io_transform_stack;
 273         zio->io_transform_stack = zt;
 274
 275         zio->io_data = data;
 276         zio->io_size = size;
 277 }
 278
 279 static void
 280 zio_pop_transforms(zio_t *zio)
 281 {
 282         zio_transform_t *zt;
 283
 284         while ((zt = zio->io_transform_stack) != NULL) {
 285                 if (zt->zt_transform != NULL)
 286                         zt->zt_transform(zio,
 287                             zt->zt_orig_data, zt->zt_orig_size);
 288
 289                 if (zt->zt_bufsize != 0)
 290                         zio_buf_free(zio->io_data, zt->zt_bufsize);
 291
 292                 zio->io_data = zt->zt_orig_data;
 293                 zio->io_size = zt->zt_orig_size;
 294                 zio->io_transform_stack = zt->zt_next;
 295
 296                 kmem_free(zt, sizeof (zio_transform_t));
 297         }
 298 }
 299
 300 /*
 301  * ==========================================================================
 302  * I/O transform callbacks for subblocks and decompression
 303  * ==========================================================================
 304  */
 305 static void
 306 zio_subblock(zio_t *zio, void *data, uint64_t size)
 307 {
 308         ASSERT(zio->io_size > size);
 309
 310         if (zio->io_type == ZIO_TYPE_READ)
 311                 bcopy(zio->io_data, data, size);
 312 }
 313
 314 static void
 315 zio_decompress(zio_t *zio, void *data, uint64_t size)
 316 {
 317         if (zio->io_error == 0 &&
 318             zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
 319             zio->io_data, data, zio->io_size, size) != 0)
 320                 zio->io_error = SET_ERROR(EIO);
 321 }
 322
 323 /*
 324  * ==========================================================================
 325  * I/O parent/child relationships and pipeline interlocks
 326  * ==========================================================================
 327  */
 328 /*
 329  * NOTE - Callers to zio_walk_parents() and zio_walk_children() must
 330  *        continue calling these functions until they return NULL.
 331  *        Otherwise, the next caller will pick up the list walk in
 332  *        some indeterminate state.  (Otherwise every caller would
 333  *        have to pass in a cookie to keep the state represented by
 334  *        io_walk_link, which gets annoying.)
 335  */
 336 zio_t *
 337 zio_walk_parents(zio_t *cio)
 338 {
 339         zio_link_t *zl = cio->io_walk_link;
 340         list_t *pl = &cio->io_parent_list;
 341
 342         zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
 343         cio->io_walk_link = zl;
 344
 345         if (zl == NULL)
 346                 return (NULL);
 347
 348         ASSERT(zl->zl_child == cio);
 349         return (zl->zl_parent);
 350 }
 351
 352 zio_t *
 353 zio_walk_children(zio_t *pio)
 354 {
 355         zio_link_t *zl = pio->io_walk_link;
 356         list_t *cl = &pio->io_child_list;
 357
 358         zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
 359         pio->io_walk_link = zl;
 360
 361         if (zl == NULL)
 362                 return (NULL);
 363
 364         ASSERT(zl->zl_parent == pio);
 365         return (zl->zl_child);
 366 }
 367
 368 zio_t *
 369 zio_unique_parent(zio_t *cio)
 370 {
 371         zio_t *pio = zio_walk_parents(cio);
 372
 373         VERIFY(zio_walk_parents(cio) == NULL);
 374         return (pio);
 375 }
 376
 377 void
 378 zio_add_child(zio_t *pio, zio_t *cio)
 379 {
 380         zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
 381         int w;
 382
 383         /*
 384          * Logical I/Os can have logical, gang, or vdev children.
 385          * Gang I/Os can have gang or vdev children.
 386          * Vdev I/Os can only have vdev children.
 387          * The following ASSERT captures all of these constraints.
 388          */
 389         ASSERT(cio->io_child_type <= pio->io_child_type);
 390
 391         zl->zl_parent = pio;
 392         zl->zl_child = cio;
 393
 394         mutex_enter(&cio->io_lock);
 395         mutex_enter(&pio->io_lock);
 396
 397         ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
 398
 399         for (w = 0; w < ZIO_WAIT_TYPES; w++)
 400                 pio->io_children[cio->io_child_type][w] += !cio->io_state[w];
 401
 402         list_insert_head(&pio->io_child_list, zl);
 403         list_insert_head(&cio->io_parent_list, zl);
 404
 405         pio->io_child_count++;
 406         cio->io_parent_count++;
 407
 408         mutex_exit(&pio->io_lock);
 409         mutex_exit(&cio->io_lock);
 410 }
 411
 412 static void
 413 zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
 414 {
 415         ASSERT(zl->zl_parent == pio);
 416         ASSERT(zl->zl_child == cio);
 417
 418         mutex_enter(&cio->io_lock);
 419         mutex_enter(&pio->io_lock);
 420
 421         list_remove(&pio->io_child_list, zl);
 422         list_remove(&cio->io_parent_list, zl);
 423
 424         pio->io_child_count--;
 425         cio->io_parent_count--;
 426
 427         mutex_exit(&pio->io_lock);
 428         mutex_exit(&cio->io_lock);
 429
 430         kmem_cache_free(zio_link_cache, zl);
 431 }
 432
 433 static boolean_t
 434 zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
 435 {
 436         uint64_t *countp = &zio->io_children[child][wait];
 437         boolean_t waiting = B_FALSE;
 438
 439         mutex_enter(&zio->io_lock);
 440         ASSERT(zio->io_stall == NULL);
 441         if (*countp != 0) {
 442                 zio->io_stage >>= 1;
 443                 zio->io_stall = countp;
 444                 waiting = B_TRUE;
 445         }
 446         mutex_exit(&zio->io_lock);
 447
 448         return (waiting);
 449 }
 450
 451 __attribute__((always_inline))
 452 static inline void
 453 zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
 454 {
 455         uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
 456         int *errorp = &pio->io_child_error[zio->io_child_type];
 457
 458         mutex_enter(&pio->io_lock);
 459         if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
 460                 *errorp = zio_worst_error(*errorp, zio->io_error);
 461         pio->io_reexecute |= zio->io_reexecute;
 462         ASSERT3U(*countp, >, 0);
 463
 464         (*countp)--;
 465
 466         if (*countp == 0 && pio->io_stall == countp) {
 467                 pio->io_stall = NULL;
 468                 mutex_exit(&pio->io_lock);
 469                 __zio_execute(pio);
 470         } else {
 471                 mutex_exit(&pio->io_lock);
 472         }
 473 }
 474
 475 static void
 476 zio_inherit_child_errors(zio_t *zio, enum zio_child c)
 477 {
 478         if (zio->io_child_error[c] != 0 && zio->io_error == 0)
 479                 zio->io_error = zio->io_child_error[c];
 480 }
 481
 482 /*
 483  * ==========================================================================
 484  * Create the various types of I/O (read, write, free, etc)
 485  * ==========================================================================
 486  */
 /*
  * Common constructor behind all the zio_*() creation routines in this
  * file.  Allocates a zeroed zio_t from zio_cache, initialises its lock,
  * condition variable and parent/child lists, fills it in from the
  * arguments and, if `pio` is non-NULL, links it under that parent.
  *
  * pio:            optional parent zio.
  * bp:             optional block pointer; see below for how io_bp is set.
  * data, size:     I/O buffer; size must be a multiple of SPA_MINBLOCKSIZE
  *                 and at most SPA_MAXBLOCKSIZE (asserted).
  * done, private:  stored in io_done / io_private.
  * vd, offset:     optional vdev and offset; a non-NULL vd makes this a
  *                 ZIO_CHILD_VDEV zio.
  * zb:             optional bookmark, copied into the zio.
  * stage/pipeline: initial stage and pipeline mask; both are also saved
  *                 as the io_orig_* values.
  *
  * Returns the new zio.
  */
 static zio_t *
 zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
     void *data, uint64_t size, zio_done_func_t *done, void *private,
     zio_type_t type, zio_priority_t priority, enum zio_flag flags,
     vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb,
     enum zio_stage stage, enum zio_stage pipeline)
 {
         zio_t *zio;

         ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
         ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
         ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);

         ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
         ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
         ASSERT(vd || stage == ZIO_STAGE_OPEN);

         zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
         bzero(zio, sizeof (zio_t));

         mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
         cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);

         list_create(&zio->io_parent_list, sizeof (zio_link_t),
             offsetof(zio_link_t, zl_parent_node));
         list_create(&zio->io_child_list, sizeof (zio_link_t),
             offsetof(zio_link_t, zl_child_node));

         /* Child type: a vdev wins, then the GANG flag, then the DDT flag. */
         if (vd != NULL)
                 zio->io_child_type = ZIO_CHILD_VDEV;
         else if (flags & ZIO_FLAG_GANG_CHILD)
                 zio->io_child_type = ZIO_CHILD_GANG;
         else if (flags & ZIO_FLAG_DDT_CHILD)
                 zio->io_child_type = ZIO_CHILD_DDT;
         else
                 zio->io_child_type = ZIO_CHILD_LOGICAL;

         if (bp != NULL) {
                 /*
                  * Keep two private snapshots of the bp.  Only non-DDT
                  * writes keep io_bp pointing at the caller's bp; every
                  * other zio works on its own copy.
                  */
                 zio->io_bp = (blkptr_t *)bp;
                 zio->io_bp_copy = *bp;
                 zio->io_bp_orig = *bp;
                 if (type != ZIO_TYPE_WRITE ||
                     zio->io_child_type == ZIO_CHILD_DDT)
                         zio->io_bp = &zio->io_bp_copy;  /* so caller can free */
                 /* A logical zio with a bp is its own logical zio. */
                 if (zio->io_child_type == ZIO_CHILD_LOGICAL)
                         zio->io_logical = zio;
                 /* Gang bps above the gang child level get the gang stages. */
                 if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
                         pipeline |= ZIO_GANG_STAGES;
         }

         zio->io_spa = spa;
         zio->io_txg = txg;
         zio->io_done = done;
         zio->io_private = private;
         zio->io_type = type;
         zio->io_priority = priority;
         zio->io_vd = vd;
         zio->io_offset = offset;
         zio->io_orig_data = zio->io_data = data;
         zio->io_orig_size = zio->io_size = size;
         zio->io_orig_flags = zio->io_flags = flags;
         zio->io_orig_stage = zio->io_stage = stage;
         zio->io_orig_pipeline = zio->io_pipeline = pipeline;

         /* A zio created at or past READY/DONE counts as already there. */
         zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
         zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);

         if (zb != NULL)
                 zio->io_bookmark = *zb;

         if (pio != NULL) {
                 /* Inherit what was not set above, then link to the parent. */
                 if (zio->io_logical == NULL)
                         zio->io_logical = pio->io_logical;
                 if (zio->io_child_type == ZIO_CHILD_GANG)
                         zio->io_gang_leader = pio->io_gang_leader;
                 zio_add_child(pio, zio);
         }

         taskq_init_ent(&zio->io_tqent);

         return (zio);
 }
 569
 570 static void
 571 zio_destroy(zio_t *zio)
 572 {
 573         list_destroy(&zio->io_parent_list);
 574         list_destroy(&zio->io_child_list);
 575         mutex_destroy(&zio->io_lock);
 576         cv_destroy(&zio->io_cv);
 577         kmem_cache_free(zio_cache, zio);
 578 }
 579
 580 zio_t *
 581 zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
 582     void *private, enum zio_flag flags)
 583 {
 584         zio_t *zio;
 585
 586         zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
 587             ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
 588             ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);
 589
 590         return (zio);
 591 }
 592
 593 zio_t *
 594 zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
 595 {
 596         return (zio_null(NULL, spa, NULL, done, private, flags));
 597 }
 598
 599 zio_t *
 600 zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
 601     void *data, uint64_t size, zio_done_func_t *done, void *private,
 602     zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
 603 {
 604         zio_t *zio;
 605
 606         zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
 607             data, size, done, private,
 608             ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
 609             ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
 610             ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);
 611
 612         return (zio);
 613 }
 614
 615 zio_t *
 616 zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
 617     void *data, uint64_t size, const zio_prop_t *zp,
 618     zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done,
 619     void *private,
 620     zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
 621 {
 622         zio_t *zio;
 623
 624         ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
 625             zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
 626             zp->zp_compress >= ZIO_COMPRESS_OFF &&
 627             zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
 628             DMU_OT_IS_VALID(zp->zp_type) &&
 629             zp->zp_level < 32 &&
 630             zp->zp_copies > 0 &&
 631             zp->zp_copies <= spa_max_replication(spa));
 632
 633         zio = zio_create(pio, spa, txg, bp, data, size, done, private,
 634             ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
 635             ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
 636             ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
 637
 638         zio->io_ready = ready;
 639         zio->io_physdone = physdone;
 640         zio->io_prop = *zp;
 641
 642         /*
 643          * Data can be NULL if we are going to call zio_write_override() to
 644          * provide the already-allocated BP.  But we may need the data to
 645          * verify a dedup hit (if requested).  In this case, don't try to
 646          * dedup (just take the already-allocated BP verbatim).
 647          */
 648         if (data == NULL && zio->io_prop.zp_dedup_verify) {
 649                 zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
 650         }
 651
 652         return (zio);
 653 }
 654
 655 zio_t *
 656 zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
 657     uint64_t size, zio_done_func_t *done, void *private,
 658     zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb)
 659 {
 660         zio_t *zio;
 661
 662         zio = zio_create(pio, spa, txg, bp, data, size, done, private,
 663             ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
 664             ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
 665
 666         return (zio);
 667 }
 668
 669 void
 670 zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
 671 {
 672         ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 673         ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 674         ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
 675         ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
 676
 677         /*
 678          * We must reset the io_prop to match the values that existed
 679          * when the bp was first written by dmu_sync() keeping in mind
 680          * that nopwrite and dedup are mutually exclusive.
 681          */
 682         zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
 683         zio->io_prop.zp_nopwrite = nopwrite;
 684         zio->io_prop.zp_copies = copies;
 685         zio->io_bp_override = bp;
 686 }
 687
 688 void
 689 zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
 690 {
 691
 692         /*
 693          * The check for EMBEDDED is a performance optimization.  We
 694          * process the free here (by ignoring it) rather than
 695          * putting it on the list and then processing it in zio_free_sync().
 696          */
 697         if (BP_IS_EMBEDDED(bp))
 698                 return;
 699         metaslab_check_free(spa, bp);
 700
 701         /*
 702          * Frees that are for the currently-syncing txg, are not going to be
 703          * deferred, and which will not need to do a read (i.e. not GANG or
 704          * DEDUP), can be processed immediately.  Otherwise, put them on the
 705          * in-memory list for later processing.
 706          */
 707         if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
 708             txg != spa->spa_syncing_txg ||
 709             spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) {
 710                 bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
 711         } else {
 712                 VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp, 0)));
 713         }
 714 }
 715
 716 zio_t *
 717 zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 718     enum zio_flag flags)
 719 {
 720         zio_t *zio;
 721         enum zio_stage stage = ZIO_FREE_PIPELINE;
 722
 723         ASSERT(!BP_IS_HOLE(bp));
 724         ASSERT(spa_syncing_txg(spa) == txg);
 725         ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);
 726
 727         if (BP_IS_EMBEDDED(bp))
 728                 return (zio_null(pio, spa, NULL, NULL, NULL, 0));
 729
 730         metaslab_check_free(spa, bp);
 731         arc_freed(spa, bp);
 732
 733         /*
 734          * GANG and DEDUP blocks can induce a read (for the gang block header,
 735          * or the DDT), so issue them asynchronously so that this thread is
 736          * not tied up.
 737          */
 738         if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
 739                 stage |= ZIO_STAGE_ISSUE_ASYNC;
 740
 741         zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
 742             NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags,
 743             NULL, 0, NULL, ZIO_STAGE_OPEN, stage);
 744
 745         return (zio);
 746 }
 747
 748 zio_t *
 749 zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 750     zio_done_func_t *done, void *private, enum zio_flag flags)
 751 {
 752         zio_t *zio;
 753
 754         dprintf_bp(bp, "claiming in txg %llu", txg);
 755
 756         if (BP_IS_EMBEDDED(bp))
 757                 return (zio_null(pio, spa, NULL, NULL, NULL, 0));
 758
 759         /*
 760          * A claim is an allocation of a specific block.  Claims are needed
 761          * to support immediate writes in the intent log.  The issue is that
 762          * immediate writes contain committed data, but in a txg that was
 763          * *not* committed.  Upon opening the pool after an unclean shutdown,
 764          * the intent log claims all blocks that contain immediate write data
 765          * so that the SPA knows they're in use.
 766          *
 767          * All claims *must* be resolved in the first txg -- before the SPA
 768          * starts allocating blocks -- so that nothing is allocated twice.
 769          * If txg == 0 we just verify that the block is claimable.
 770          */
 771         ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
 772         ASSERT(txg == spa_first_txg(spa) || txg == 0);
 773         ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));       /* zdb(1M) */
 774
 775         zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
 776             done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
 777             NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
 778
 779         return (zio);
 780 }
 781
 782 zio_t *
 783 zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
 784     zio_done_func_t *done, void *private, enum zio_flag flags)
 785 {
 786         zio_t *zio;
 787         int c;
 788
 789         if (vd->vdev_children == 0) {
 790                 zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
 791                     ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
 792                     ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
 793
 794                 zio->io_cmd = cmd;
 795         } else {
 796                 zio = zio_null(pio, spa, NULL, NULL, NULL, flags);
 797
 798                 for (c = 0; c < vd->vdev_children; c++)
 799                         zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
 800                             done, private, flags));
 801         }
 802
 803         return (zio);
 804 }
 805
 806 zio_t *
 807 zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
 808     void *data, int checksum, zio_done_func_t *done, void *private,
 809     zio_priority_t priority, enum zio_flag flags, boolean_t labels)
 810 {
 811         zio_t *zio;
 812
 813         ASSERT(vd->vdev_children == 0);
 814         ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
 815             offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
 816         ASSERT3U(offset + size, <=, vd->vdev_psize);
 817
 818         zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
 819             ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
 820             NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
 821
 822         zio->io_prop.zp_checksum = checksum;
 823
 824         return (zio);
 825 }
 826
 827 zio_t *
 828 zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
 829     void *data, int checksum, zio_done_func_t *done, void *private,
 830     zio_priority_t priority, enum zio_flag flags, boolean_t labels)
 831 {
 832         zio_t *zio;
 833
 834         ASSERT(vd->vdev_children == 0);
 835         ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
 836             offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
 837         ASSERT3U(offset + size, <=, vd->vdev_psize);
 838
 839         zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
 840             ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
 841             NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
 842
 843         zio->io_prop.zp_checksum = checksum;
 844
 845         if (zio_checksum_table[checksum].ci_eck) {
 846                 /*
 847                  * zec checksums are necessarily destructive -- they modify
 848                  * the end of the write buffer to hold the verifier/checksum.
 849                  * Therefore, we must make a local copy in case the data is
 850                  * being written to multiple places in parallel.
 851                  */
 852                 void *wbuf = zio_buf_alloc(size);
 853                 bcopy(data, wbuf, size);
 854                 zio_push_transform(zio, wbuf, size, size, NULL);
 855         }
 856
 857         return (zio);
 858 }
 859
 860 /*
 861  * Create a child I/O to do some work for us.
 862  */
 863 zio_t *
 864 zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
 865         void *data, uint64_t size, int type, zio_priority_t priority,
 866         enum zio_flag flags, zio_done_func_t *done, void *private)
 867 {
 868         enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
 869         zio_t *zio;
 870
 871         ASSERT(vd->vdev_parent ==
 872             (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));
 873
 874         if (type == ZIO_TYPE_READ && bp != NULL) {
 875                 /*
 876                  * If we have the bp, then the child should perform the
 877                  * checksum and the parent need not.  This pushes error
 878                  * detection as close to the leaves as possible and
 879                  * eliminates redundant checksums in the interior nodes.
 880                  */
 881                 pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
 882                 pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
 883         }
 884
 885         if (vd->vdev_children == 0)
 886                 offset += VDEV_LABEL_START_SIZE;
 887
 888         flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;
 889
 890         /*
 891          * If we've decided to do a repair, the write is not speculative --
 892          * even if the original read was.
 893          */
 894         if (flags & ZIO_FLAG_IO_REPAIR)
 895                 flags &= ~ZIO_FLAG_SPECULATIVE;
 896
 897         zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
 898             done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
 899             ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
 900
 901         zio->io_physdone = pio->io_physdone;
 902         if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
 903                 zio->io_logical->io_phys_children++;
 904
 905         return (zio);
 906 }
 907
 908 zio_t *
 909 zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
 910         int type, zio_priority_t priority, enum zio_flag flags,
 911         zio_done_func_t *done, void *private)
 912 {
 913         zio_t *zio;
 914
 915         ASSERT(vd->vdev_ops->vdev_op_leaf);
 916
 917         zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
 918             data, size, done, private, type, priority,
 919             flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
 920             vd, offset, NULL,
 921             ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
 922
 923         return (zio);
 924 }
 925
 926 void
 927 zio_flush(zio_t *zio, vdev_t *vd)
 928 {
 929         zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
 930             NULL, NULL,
 931             ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
 932 }
 933
 934 void
 935 zio_shrink(zio_t *zio, uint64_t size)
 936 {
 937         ASSERT(zio->io_executor == NULL);
 938         ASSERT(zio->io_orig_size == zio->io_size);
 939         ASSERT(size <= zio->io_size);
 940
 941         /*
 942          * We don't shrink for raidz because of problems with the
 943          * reconstruction when reading back less than the block size.
 944          * Note, BP_IS_RAIDZ() assumes no compression.
 945          */
 946         ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
 947         if (!BP_IS_RAIDZ(zio->io_bp))
 948                 zio->io_orig_size = zio->io_size = size;
 949 }
 950
 951 /*
 952  * ==========================================================================
 953  * Prepare to read and write logical blocks
 954  * ==========================================================================
 955  */
 956
 957 static int
 958 zio_read_bp_init(zio_t *zio)
 959 {
 960         blkptr_t *bp = zio->io_bp;
 961
 962         if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
 963             zio->io_child_type == ZIO_CHILD_LOGICAL &&
 964             !(zio->io_flags & ZIO_FLAG_RAW)) {
 965                 uint64_t psize =
 966                     BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
 967                 void *cbuf = zio_buf_alloc(psize);
 968
 969                 zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
 970         }
 971
 972         if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
 973                 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 974                 decode_embedded_bp_compressed(bp, zio->io_data);
 975         } else {
 976                 ASSERT(!BP_IS_EMBEDDED(bp));
 977         }
 978
 979         if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
 980                 zio->io_flags |= ZIO_FLAG_DONT_CACHE;
 981
 982         if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
 983                 zio->io_flags |= ZIO_FLAG_DONT_CACHE;
 984
 985         if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
 986                 zio->io_pipeline = ZIO_DDT_READ_PIPELINE;
 987
 988         return (ZIO_PIPELINE_CONTINUE);
 989 }
 990
 991 static int
 992 zio_write_bp_init(zio_t *zio)
 993 {
 994         spa_t *spa = zio->io_spa;
 995         zio_prop_t *zp = &zio->io_prop;
 996         enum zio_compress compress = zp->zp_compress;
 997         blkptr_t *bp = zio->io_bp;
 998         uint64_t lsize = zio->io_size;
 999         uint64_t psize = lsize;
1000         int pass = 1;
1001
1002         /*
1003          * If our children haven't all reached the ready stage,
1004          * wait for them and then repeat this pipeline stage.
1005          */
1006         if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
1007             zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
1008                 return (ZIO_PIPELINE_STOP);
1009
1010         if (!IO_IS_ALLOCATING(zio))
1011                 return (ZIO_PIPELINE_CONTINUE);
1012
1013         ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
1014
1015         if (zio->io_bp_override) {
1016                 ASSERT(bp->blk_birth != zio->io_txg);
1017                 ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
1018
1019                 *bp = *zio->io_bp_override;
1020                 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1021
1022                 if (BP_IS_EMBEDDED(bp))
1023                         return (ZIO_PIPELINE_CONTINUE);
1024
1025                 /*
1026                  * If we've been overridden and nopwrite is set then
1027                  * set the flag accordingly to indicate that a nopwrite
1028                  * has already occurred.
1029                  */
1030                 if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
1031                         ASSERT(!zp->zp_dedup);
1032                         zio->io_flags |= ZIO_FLAG_NOPWRITE;
1033                         return (ZIO_PIPELINE_CONTINUE);
1034                 }
1035
1036                 ASSERT(!zp->zp_nopwrite);
1037
1038                 if (BP_IS_HOLE(bp) || !zp->zp_dedup)
1039                         return (ZIO_PIPELINE_CONTINUE);
1040
1041                 ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup ||
1042                     zp->zp_dedup_verify);
1043
1044                 if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
1045                         BP_SET_DEDUP(bp, 1);
1046                         zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
1047                         return (ZIO_PIPELINE_CONTINUE);
1048                 }
1049         }
1050
1051         if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
1052                 /*
1053                  * We're rewriting an existing block, which means we're
1054                  * working on behalf of spa_sync().  For spa_sync() to
1055                  * converge, it must eventually be the case that we don't
1056                  * have to allocate new blocks.  But compression changes
1057                  * the blocksize, which forces a reallocate, and makes
1058                  * convergence take longer.  Therefore, after the first
1059                  * few passes, stop compressing to ensure convergence.
1060                  */
1061                 pass = spa_sync_pass(spa);
1062
1063                 ASSERT(zio->io_txg == spa_syncing_txg(spa));
1064                 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1065                 ASSERT(!BP_GET_DEDUP(bp));
1066
1067                 if (pass >= zfs_sync_pass_dont_compress)
1068                         compress = ZIO_COMPRESS_OFF;
1069
1070                 /* Make sure someone doesn't change their mind on overwrites */
1071                 ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp),
1072                     spa_max_replication(spa)) == BP_GET_NDVAS(bp));
1073         }
1074
1075         if (compress != ZIO_COMPRESS_OFF) {
1076                 void *cbuf = zio_buf_alloc(lsize);
1077                 psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
1078                 if (psize == 0 || psize == lsize) {
1079                         compress = ZIO_COMPRESS_OFF;
1080                         zio_buf_free(cbuf, lsize);
1081                 } else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE &&
1082                     zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
1083                     spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
1084                         encode_embedded_bp_compressed(bp,
1085                             cbuf, compress, lsize, psize);
1086                         BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
1087                         BP_SET_TYPE(bp, zio->io_prop.zp_type);
1088                         BP_SET_LEVEL(bp, zio->io_prop.zp_level);
1089                         zio_buf_free(cbuf, lsize);
1090                         bp->blk_birth = zio->io_txg;
1091                         zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1092                         ASSERT(spa_feature_is_active(spa,
1093                             SPA_FEATURE_EMBEDDED_DATA));
1094                         return (ZIO_PIPELINE_CONTINUE);
1095                 } else {
1096                         /*
1097                          * Round up compressed size to MINBLOCKSIZE and
1098                          * zero the tail.
1099                          */
1100                         size_t rounded =
1101                             P2ROUNDUP(psize, (size_t)SPA_MINBLOCKSIZE);
1102                         if (rounded > psize) {
1103                                 bzero((char *)cbuf + psize, rounded - psize);
1104                                 psize = rounded;
1105                         }
1106                         if (psize == lsize) {
1107                                 compress = ZIO_COMPRESS_OFF;
1108                                 zio_buf_free(cbuf, lsize);
1109                         } else {
1110                                 zio_push_transform(zio, cbuf,
1111                                     psize, lsize, NULL);
1112                         }
1113                 }
1114         }
1115
1116         /*
1117          * The final pass of spa_sync() must be all rewrites, but the first
1118          * few passes offer a trade-off: allocating blocks defers convergence,
1119          * but newly allocated blocks are sequential, so they can be written
1120          * to disk faster.  Therefore, we allow the first few passes of
1121          * spa_sync() to allocate new blocks, but force rewrites after that.
1122          * There should only be a handful of blocks after pass 1 in any case.
1123          */
1124         if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
1125             BP_GET_PSIZE(bp) == psize &&
1126             pass >= zfs_sync_pass_rewrite) {
1127                 enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
1128                 ASSERT(psize != 0);
1129                 zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
1130                 zio->io_flags |= ZIO_FLAG_IO_REWRITE;
1131         } else {
1132                 BP_ZERO(bp);
1133                 zio->io_pipeline = ZIO_WRITE_PIPELINE;
1134         }
1135
1136         if (psize == 0) {
1137                 if (zio->io_bp_orig.blk_birth != 0 &&
1138                     spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
1139                         BP_SET_LSIZE(bp, lsize);
1140                         BP_SET_TYPE(bp, zp->zp_type);
1141                         BP_SET_LEVEL(bp, zp->zp_level);
1142                         BP_SET_BIRTH(bp, zio->io_txg, 0);
1143                 }
1144                 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1145         } else {
1146                 ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
1147                 BP_SET_LSIZE(bp, lsize);
1148                 BP_SET_TYPE(bp, zp->zp_type);
1149                 BP_SET_LEVEL(bp, zp->zp_level);
1150                 BP_SET_PSIZE(bp, psize);
1151                 BP_SET_COMPRESS(bp, compress);
1152                 BP_SET_CHECKSUM(bp, zp->zp_checksum);
1153                 BP_SET_DEDUP(bp, zp->zp_dedup);
1154                 BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
1155                 if (zp->zp_dedup) {
1156                         ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1157                         ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
1158                         zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
1159                 }
1160                 if (zp->zp_nopwrite) {
1161                         ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1162                         ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
1163                         zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
1164                 }
1165         }
1166
1167         return (ZIO_PIPELINE_CONTINUE);
1168 }
1169
1170 static int
1171 zio_free_bp_init(zio_t *zio)
1172 {
1173         blkptr_t *bp = zio->io_bp;
1174
1175         if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
1176                 if (BP_GET_DEDUP(bp))
1177                         zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
1178         }
1179
1180         return (ZIO_PIPELINE_CONTINUE);
1181 }
1182
1183 /*
1184  * ==========================================================================
1185  * Execute the I/O pipeline
1186  * ==========================================================================
1187  */
1188
1189 static void
1190 zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
1191 {
1192         spa_t *spa = zio->io_spa;
1193         zio_type_t t = zio->io_type;
1194         int flags = (cutinline ? TQ_FRONT : 0);
1195
1196         /*
1197          * If we're a config writer or a probe, the normal issue and
1198          * interrupt threads may all be blocked waiting for the config lock.
1199          * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
1200          */
1201         if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
1202                 t = ZIO_TYPE_NULL;
1203
1204         /*
1205          * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
1206          */
1207         if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
1208                 t = ZIO_TYPE_NULL;
1209
1210         /*
1211          * If this is a high priority I/O, then use the high priority taskq if
1212          * available.
1213          */
1214         if (zio->io_priority == ZIO_PRIORITY_NOW &&
1215             spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
1216                 q++;
1217
1218         ASSERT3U(q, <, ZIO_TASKQ_TYPES);
1219
1220         /*
1221          * NB: We are assuming that the zio can only be dispatched
1222          * to a single taskq at a time.  It would be a grievous error
1223          * to dispatch the zio to another taskq at the same time.
1224          */
1225         ASSERT(taskq_empty_ent(&zio->io_tqent));
1226         spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
1227             flags, &zio->io_tqent);
1228 }
1229
1230 static boolean_t
1231 zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
1232 {
1233         kthread_t *executor = zio->io_executor;
1234         spa_t *spa = zio->io_spa;
1235         zio_type_t t;
1236
1237         for (t = 0; t < ZIO_TYPES; t++) {
1238                 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
1239                 uint_t i;
1240                 for (i = 0; i < tqs->stqs_count; i++) {
1241                         if (taskq_member(tqs->stqs_taskq[i], executor))
1242                                 return (B_TRUE);
1243                 }
1244         }
1245
1246         return (B_FALSE);
1247 }
1248
1249 static int
1250 zio_issue_async(zio_t *zio)
1251 {
1252         zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
1253
1254         return (ZIO_PIPELINE_STOP);
1255 }
1256
1257 void
1258 zio_interrupt(zio_t *zio)
1259 {
1260         zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
1261 }
1262
1263 /*
1264  * Execute the I/O pipeline until one of the following occurs:
1265  * (1) the I/O completes; (2) the pipeline stalls waiting for
1266  * dependent child I/Os; (3) the I/O issues, so we're waiting
1267  * for an I/O completion interrupt; (4) the I/O is delegated by
1268  * vdev-level caching or aggregation; (5) the I/O is deferred
1269  * due to vdev-level queueing; (6) the I/O is handed off to
1270  * another thread.  In all cases, the pipeline stops whenever
1271  * there's no CPU work; it never burns a thread in cv_wait_io().
1272  *
1273  * There's no locking on io_stage because there's no legitimate way
1274  * for multiple threads to be attempting to process the same I/O.
1275  */
1276 static zio_pipe_stage_t *zio_pipeline[];
1277
1278 /*
1279  * zio_execute() is a wrapper around the static function
1280  * __zio_execute() so that we can force  __zio_execute() to be
1281  * inlined.  This reduces stack overhead which is important
1282  * because __zio_execute() is called recursively in several zio
1283  * code paths.  zio_execute() itself cannot be inlined because
1284  * it is externally visible.
1285  */
1286 void
1287 zio_execute(zio_t *zio)
1288 {
1289         fstrans_cookie_t cookie;
1290
1291         cookie = spl_fstrans_mark();
1292         __zio_execute(zio);
1293         spl_fstrans_unmark(cookie);
1294 }
1295
1296 __attribute__((always_inline))
1297 static inline void
1298 __zio_execute(zio_t *zio)
1299 {
1300         zio->io_executor = curthread;
1301
1302         while (zio->io_stage < ZIO_STAGE_DONE) {
1303                 enum zio_stage pipeline = zio->io_pipeline;
1304                 enum zio_stage stage = zio->io_stage;
1305                 dsl_pool_t *dp;
1306                 boolean_t cut;
1307                 int rv;
1308
1309                 ASSERT(!MUTEX_HELD(&zio->io_lock));
1310                 ASSERT(ISP2(stage));
1311                 ASSERT(zio->io_stall == NULL);
1312
1313                 do {
1314                         stage <<= 1;
1315                 } while ((stage & pipeline) == 0);
1316
1317                 ASSERT(stage <= ZIO_STAGE_DONE);
1318
1319                 dp = spa_get_dsl(zio->io_spa);
1320                 cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
1321                     zio_requeue_io_start_cut_in_line : B_FALSE;
1322
1323                 /*
1324                  * If we are in interrupt context and this pipeline stage
1325                  * will grab a config lock that is held across I/O,
1326                  * or may wait for an I/O that needs an interrupt thread
1327                  * to complete, issue async to avoid deadlock.
1328                  *
1329                  * For VDEV_IO_START, we cut in line so that the io will
1330                  * be sent to disk promptly.
1331                  */
1332                 if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
1333                     zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
1334                         zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
1335                         return;
1336                 }
1337
1338                 /*
1339                  * If we executing in the context of the tx_sync_thread,
1340                  * or we are performing pool initialization outside of a
1341                  * zio_taskq[ZIO_TASKQ_ISSUE|ZIO_TASKQ_ISSUE_HIGH] context.
1342                  * Then issue the zio asynchronously to minimize stack usage
1343                  * for these deep call paths.
1344                  */
1345                 if ((dp && curthread == dp->dp_tx.tx_sync_thread) ||
1346                     (dp && spa_is_initializing(dp->dp_spa) &&
1347                     !zio_taskq_member(zio, ZIO_TASKQ_ISSUE) &&
1348                     !zio_taskq_member(zio, ZIO_TASKQ_ISSUE_HIGH))) {
1349                         zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
1350                         return;
1351                 }
1352
1353                 zio->io_stage = stage;
1354                 rv = zio_pipeline[highbit64(stage) - 1](zio);
1355
1356                 if (rv == ZIO_PIPELINE_STOP)
1357                         return;
1358
1359                 ASSERT(rv == ZIO_PIPELINE_CONTINUE);
1360         }
1361 }
1362
1363
1364 /*
1365  * ==========================================================================
1366  * Initiate I/O, either sync or async
1367  * ==========================================================================
1368  */
1369 int
1370 zio_wait(zio_t *zio)
1371 {
1372         int error;
1373
1374         ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
1375         ASSERT(zio->io_executor == NULL);
1376
1377         zio->io_waiter = curthread;
1378
1379         __zio_execute(zio);
1380
1381         mutex_enter(&zio->io_lock);
1382         while (zio->io_executor != NULL)
1383                 cv_wait_io(&zio->io_cv, &zio->io_lock);
1384         mutex_exit(&zio->io_lock);
1385
1386         error = zio->io_error;
1387         zio_destroy(zio);
1388
1389         return (error);
1390 }
1391
1392 void
1393 zio_nowait(zio_t *zio)
1394 {
1395         ASSERT(zio->io_executor == NULL);
1396
1397         if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
1398             zio_unique_parent(zio) == NULL) {
1399                 zio_t *pio;
1400
1401                 /*
1402                  * This is a logical async I/O with no parent to wait for it.
1403                  * We add it to the spa_async_root_zio "Godfather" I/O which
1404                  * will ensure they complete prior to unloading the pool.
1405                  */
1406                 spa_t *spa = zio->io_spa;
1407                 kpreempt_disable();
1408                 pio = spa->spa_async_zio_root[CPU_SEQID];
1409                 kpreempt_enable();
1410
1411                 zio_add_child(pio, zio);
1412         }
1413
1414         __zio_execute(zio);
1415 }
1416
1417 /*
1418  * ==========================================================================
1419  * Reexecute or suspend/resume failed I/O
1420  * ==========================================================================
1421  */
1422
1423 static void
1424 zio_reexecute(zio_t *pio)
1425 {
1426         zio_t *cio, *cio_next;
1427         int c, w;
1428
1429         ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
1430         ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
1431         ASSERT(pio->io_gang_leader == NULL);
1432         ASSERT(pio->io_gang_tree == NULL);
1433
1434         pio->io_flags = pio->io_orig_flags;
1435         pio->io_stage = pio->io_orig_stage;
1436         pio->io_pipeline = pio->io_orig_pipeline;
1437         pio->io_reexecute = 0;
1438         pio->io_flags |= ZIO_FLAG_REEXECUTED;
1439         pio->io_error = 0;
1440         for (w = 0; w < ZIO_WAIT_TYPES; w++)
1441                 pio->io_state[w] = 0;
1442         for (c = 0; c < ZIO_CHILD_TYPES; c++)
1443                 pio->io_child_error[c] = 0;
1444
1445         if (IO_IS_ALLOCATING(pio))
1446                 BP_ZERO(pio->io_bp);
1447
1448         /*
1449          * As we reexecute pio's children, new children could be created.
1450          * New children go to the head of pio's io_child_list, however,
1451          * so we will (correctly) not reexecute them.  The key is that
1452          * the remainder of pio's io_child_list, from 'cio_next' onward,
1453          * cannot be affected by any side effects of reexecuting 'cio'.
1454          */
1455         for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
1456                 cio_next = zio_walk_children(pio);
1457                 mutex_enter(&pio->io_lock);
1458                 for (w = 0; w < ZIO_WAIT_TYPES; w++)
1459                         pio->io_children[cio->io_child_type][w]++;
1460                 mutex_exit(&pio->io_lock);
1461                 zio_reexecute(cio);
1462         }
1463
1464         /*
1465          * Now that all children have been reexecuted, execute the parent.
1466          * We don't reexecute "The Godfather" I/O here as it's the
1467          * responsibility of the caller to wait on him.
1468          */
1469         if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
1470                 __zio_execute(pio);
1471 }
1472
1473 void
1474 zio_suspend(spa_t *spa, zio_t *zio)
1475 {
1476         if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
1477                 fm_panic("Pool '%s' has encountered an uncorrectable I/O "
1478                     "failure and the failure mode property for this pool "
1479                     "is set to panic.", spa_name(spa));
1480
1481         cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O "
1482             "failure and has been suspended.\n", spa_name(spa));
1483
1484         zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);
1485
1486         mutex_enter(&spa->spa_suspend_lock);
1487
1488         if (spa->spa_suspend_zio_root == NULL)
1489                 spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
1490                     ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
1491                     ZIO_FLAG_GODFATHER);
1492
1493         spa->spa_suspended = B_TRUE;
1494
1495         if (zio != NULL) {
1496                 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
1497                 ASSERT(zio != spa->spa_suspend_zio_root);
1498                 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1499                 ASSERT(zio_unique_parent(zio) == NULL);
1500                 ASSERT(zio->io_stage == ZIO_STAGE_DONE);
1501                 zio_add_child(spa->spa_suspend_zio_root, zio);
1502         }
1503
1504         mutex_exit(&spa->spa_suspend_lock);
1505 }
1506
1507 int
1508 zio_resume(spa_t *spa)
1509 {
1510         zio_t *pio;
1511
1512         /*
1513          * Reexecute all previously suspended i/o.
1514          */
1515         mutex_enter(&spa->spa_suspend_lock);
1516         spa->spa_suspended = B_FALSE;
1517         cv_broadcast(&spa->spa_suspend_cv);
1518         pio = spa->spa_suspend_zio_root;
1519         spa->spa_suspend_zio_root = NULL;
1520         mutex_exit(&spa->spa_suspend_lock);
1521
1522         if (pio == NULL)
1523                 return (0);
1524
1525         zio_reexecute(pio);
1526         return (zio_wait(pio));
1527 }
1528
1529 void
1530 zio_resume_wait(spa_t *spa)
1531 {
1532         mutex_enter(&spa->spa_suspend_lock);
1533         while (spa_suspended(spa))
1534                 cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
1535         mutex_exit(&spa->spa_suspend_lock);
1536 }
1537
1538 /*
1539  * ==========================================================================
1540  * Gang blocks.
1541  *
1542  * A gang block is a collection of small blocks that looks to the DMU
1543  * like one large block.  When zio_dva_allocate() cannot find a block
1544  * of the requested size, due to either severe fragmentation or the pool
1545  * being nearly full, it calls zio_write_gang_block() to construct the
1546  * block from smaller fragments.
1547  *
1548  * A gang block consists of a gang header (zio_gbh_phys_t) and up to
1549  * three (SPA_GBH_NBLKPTRS) gang members.  The gang header is just like
1550  * an indirect block: it's an array of block pointers.  It consumes
1551  * only one sector and hence is allocatable regardless of fragmentation.
1552  * The gang header's bps point to its gang members, which hold the data.
1553  *
1554  * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
1555  * as the verifier to ensure uniqueness of the SHA256 checksum.
1556  * Critically, the gang block bp's blk_cksum is the checksum of the data,
1557  * not the gang header.  This ensures that data block signatures (needed for
1558  * deduplication) are independent of how the block is physically stored.
1559  *
1560  * Gang blocks can be nested: a gang member may itself be a gang block.
1561  * Thus every gang block is a tree in which root and all interior nodes are
1562  * gang headers, and the leaves are normal blocks that contain user data.
1563  * The root of the gang tree is called the gang leader.
1564  *
1565  * To perform any operation (read, rewrite, free, claim) on a gang block,
1566  * zio_gang_assemble() first assembles the gang tree (minus data leaves)
1567  * in the io_gang_tree field of the original logical i/o by recursively
1568  * reading the gang leader and all gang headers below it.  This yields
1569  * an in-core tree containing the contents of every gang header and the
1570  * bps for every constituent of the gang block.
1571  *
1572  * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
1573  * and invokes a callback on each bp.  To free a gang block, zio_gang_issue()
1574  * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
1575  * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
1576  * zio_read_gang() is a wrapper around zio_read() that omits reading gang
1577  * headers, since we already have those in io_gang_tree.  zio_rewrite_gang()
1578  * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
1579  * of the gang header plus zio_checksum_compute() of the data to update the
1580  * gang header's blk_cksum as described above.
1581  *
1582  * The two-phase assemble/issue model solves the problem of partial failure --
1583  * what if you'd freed part of a gang block but then couldn't read the
1584  * gang header for another part?  Assembling the entire gang tree first
1585  * ensures that all the necessary gang header I/O has succeeded before
1586  * starting the actual work of free, claim, or write.  Once the gang tree
1587  * is assembled, free and claim are in-memory operations that cannot fail.
1588  *
1589  * In the event that a gang write fails, zio_dva_unallocate() walks the
1590  * gang tree to immediately free (i.e. insert back into the space map)
1591  * everything we've allocated.  This ensures that we don't get ENOSPC
1592  * errors during repeated suspend/resume cycles due to a flaky device.
1593  *
1594  * Gang rewrites only happen during sync-to-convergence.  If we can't assemble
1595  * the gang tree, we won't modify the block, so we can safely defer the free
1596  * (knowing that the block is still intact).  If we *can* assemble the gang
1597  * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
1598  * each constituent bp and we can allocate a new block on the next sync pass.
1599  *
1600  * In all cases, the gang tree allows complete recovery from partial failure.
1601  * ==========================================================================
1602  */
1603
1604 static zio_t *
1605 zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1606 {
1607         if (gn != NULL)
1608                 return (pio);
1609
1610         return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
1611             NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
1612             &pio->io_bookmark));
1613 }
1614
1615 zio_t *
1616 zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1617 {
1618         zio_t *zio;
1619
1620         if (gn != NULL) {
1621                 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
1622                     gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
1623                     ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1624                 /*
1625                  * As we rewrite each gang header, the pipeline will compute
1626                  * a new gang block header checksum for it; but no one will
1627                  * compute a new data checksum, so we do that here.  The one
1628                  * exception is the gang leader: the pipeline already computed
1629                  * its data checksum because that stage precedes gang assembly.
1630                  * (Presently, nothing actually uses interior data checksums;
1631                  * this is just good hygiene.)
1632                  */
1633                 if (gn != pio->io_gang_leader->io_gang_tree) {
1634                         zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
1635                             data, BP_GET_PSIZE(bp));
1636                 }
1637                 /*
1638                  * If we are here to damage data for testing purposes,
1639                  * leave the GBH alone so that we can detect the damage.
1640                  */
1641                 if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
1642                         zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
1643         } else {
1644                 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
1645                     data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
1646                     ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1647         }
1648
1649         return (zio);
1650 }
1651
1652 /* ARGSUSED */
1653 zio_t *
1654 zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1655 {
1656         return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
1657             ZIO_GANG_CHILD_FLAGS(pio)));
1658 }
1659
1660 /* ARGSUSED */
1661 zio_t *
1662 zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1663 {
1664         return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
1665             NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
1666 }
1667
/*
 * Per-I/O-type callbacks applied to each bp while walking an assembled
 * gang tree.  zio_gang_tree_issue() indexes this table with the gang
 * leader's io_type.  The slots presumably follow the same order as
 * zio_type_name[] near the top of this file (null, read, write, free,
 * claim, ioctl) -- confirm against the zio type enum.  The first and
 * last slots have no gang handler.
 */
static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
        NULL,
        zio_read_gang,
        zio_rewrite_gang,
        zio_free_gang,
        zio_claim_gang,
        NULL
};

/*
 * Forward declaration: zio_gang_tree_assemble() passes this function as
 * the done callback of its header read before it is defined below.
 */
static void zio_gang_tree_assemble_done(zio_t *zio);
1678
1679 static zio_gang_node_t *
1680 zio_gang_node_alloc(zio_gang_node_t **gnpp)
1681 {
1682         zio_gang_node_t *gn;
1683
1684         ASSERT(*gnpp == NULL);
1685
1686         gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
1687         gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
1688         *gnpp = gn;
1689
1690         return (gn);
1691 }
1692
1693 static void
1694 zio_gang_node_free(zio_gang_node_t **gnpp)
1695 {
1696         zio_gang_node_t *gn = *gnpp;
1697         int g;
1698
1699         for (g = 0; g < SPA_GBH_NBLKPTRS; g++)
1700                 ASSERT(gn->gn_child[g] == NULL);
1701
1702         zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
1703         kmem_free(gn, sizeof (*gn));
1704         *gnpp = NULL;
1705 }
1706
1707 static void
1708 zio_gang_tree_free(zio_gang_node_t **gnpp)
1709 {
1710         zio_gang_node_t *gn = *gnpp;
1711         int g;
1712
1713         if (gn == NULL)
1714                 return;
1715
1716         for (g = 0; g < SPA_GBH_NBLKPTRS; g++)
1717                 zio_gang_tree_free(&gn->gn_child[g]);
1718
1719         zio_gang_node_free(gnpp);
1720 }
1721
1722 static void
1723 zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
1724 {
1725         zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
1726
1727         ASSERT(gio->io_gang_leader == gio);
1728         ASSERT(BP_IS_GANG(bp));
1729
1730         zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh,
1731             SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
1732             gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
1733 }
1734
1735 static void
1736 zio_gang_tree_assemble_done(zio_t *zio)
1737 {
1738         zio_t *gio = zio->io_gang_leader;
1739         zio_gang_node_t *gn = zio->io_private;
1740         blkptr_t *bp = zio->io_bp;
1741         int g;
1742
1743         ASSERT(gio == zio_unique_parent(zio));
1744         ASSERT(zio->io_child_count == 0);
1745
1746         if (zio->io_error)
1747                 return;
1748
1749         if (BP_SHOULD_BYTESWAP(bp))
1750                 byteswap_uint64_array(zio->io_data, zio->io_size);
1751
1752         ASSERT(zio->io_data == gn->gn_gbh);
1753         ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
1754         ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
1755
1756         for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
1757                 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
1758                 if (!BP_IS_GANG(gbp))
1759                         continue;
1760                 zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
1761         }
1762 }
1763
1764 static void
1765 zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
1766 {
1767         zio_t *gio = pio->io_gang_leader;
1768         zio_t *zio;
1769         int g;
1770
1771         ASSERT(BP_IS_GANG(bp) == !!gn);
1772         ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
1773         ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);
1774
1775         /*
1776          * If you're a gang header, your data is in gn->gn_gbh.
1777          * If you're a gang member, your data is in 'data' and gn == NULL.
1778          */
1779         zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);
1780
1781         if (gn != NULL) {
1782                 ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
1783
1784                 for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
1785                         blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
1786                         if (BP_IS_HOLE(gbp))
1787                                 continue;
1788                         zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
1789                         data = (char *)data + BP_GET_PSIZE(gbp);
1790                 }
1791         }
1792
1793         if (gn == gio->io_gang_tree)
1794                 ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);
1795
1796         if (zio != pio)
1797                 zio_nowait(zio);
1798 }
1799
/*
 * Pipeline stage: begin assembling the gang tree for a gang block.
 * The zio becomes its own gang leader and the (asynchronous) header reads
 * are started, rooted at zio->io_gang_tree.  Always continues the
 * pipeline; zio_gang_issue() is the stage that waits for the reads.
 */
static int
zio_gang_assemble(zio_t *zio)
{
        blkptr_t *bp = zio->io_bp;

        ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
        ASSERT(zio->io_child_type > ZIO_CHILD_GANG);

        /* Must be set first: zio_gang_tree_assemble() asserts on it. */
        zio->io_gang_leader = zio;

        zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);

        return (ZIO_PIPELINE_CONTINUE);
}
1814
1815 static int
1816 zio_gang_issue(zio_t *zio)
1817 {
1818         blkptr_t *bp = zio->io_bp;
1819
1820         if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
1821                 return (ZIO_PIPELINE_STOP);
1822
1823         ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
1824         ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
1825
1826         if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
1827                 zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data);
1828         else
1829                 zio_gang_tree_free(&zio->io_gang_tree);
1830
1831         zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1832
1833         return (ZIO_PIPELINE_CONTINUE);
1834 }
1835
1836 static void
1837 zio_write_gang_member_ready(zio_t *zio)
1838 {
1839         zio_t *pio = zio_unique_parent(zio);
1840         dva_t *cdva = zio->io_bp->blk_dva;
1841         dva_t *pdva = pio->io_bp->blk_dva;
1842         uint64_t asize;
1843         int d;
1844         ASSERTV(zio_t *gio = zio->io_gang_leader);
1845
1846         if (BP_IS_HOLE(zio->io_bp))
1847                 return;
1848
1849         ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
1850
1851         ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
1852         ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
1853         ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
1854         ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
1855         ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
1856
1857         mutex_enter(&pio->io_lock);
1858         for (d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
1859                 ASSERT(DVA_GET_GANG(&pdva[d]));
1860                 asize = DVA_GET_ASIZE(&pdva[d]);
1861                 asize += DVA_GET_ASIZE(&cdva[d]);
1862                 DVA_SET_ASIZE(&pdva[d], asize);
1863         }
1864         mutex_exit(&pio->io_lock);
1865 }
1866
/*
 * Write pio's data as a gang block.  zio_dva_allocate() calls this when a
 * regular allocation fails with ENOSPC and the I/O is larger than
 * SPA_MINBLOCKSIZE.
 *
 * A gang header is allocated into pio's bp, a gang node is linked into
 * the gang tree, and the data is split across child writes, one per
 * header slot used.  A child may itself end up here, producing a nested
 * gang block.
 *
 * Returns ZIO_PIPELINE_CONTINUE in all cases; if the header cannot be
 * allocated, the error is recorded in pio->io_error.
 */
static int
zio_write_gang_block(zio_t *pio)
{
        spa_t *spa = pio->io_spa;
        blkptr_t *bp = pio->io_bp;
        zio_t *gio = pio->io_gang_leader;
        zio_t *zio;
        zio_gang_node_t *gn, **gnpp;
        zio_gbh_phys_t *gbh;
        uint64_t txg = pio->io_txg;
        uint64_t resid = pio->io_size;
        uint64_t lsize;
        int copies = gio->io_prop.zp_copies;
        /* The header gets one more copy than the data, up to the pool max. */
        int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
        zio_prop_t zp;
        int g, error;

        /*
         * Allocate space for the gang header itself.  For a nested gang
         * block the leader's bp is passed as the allocation hint.
         */
        error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
            bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
            METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
        if (error) {
                pio->io_error = error;
                return (ZIO_PIPELINE_CONTINUE);
        }

        /*
         * Find the slot in the gang tree where this node belongs: the
         * tree root for the leader, otherwise the slot that the parent
         * gang write passed in as this zio's io_private (see the
         * zio_write() call below, which passes &gn->gn_child[g]).
         */
        if (pio == gio) {
                gnpp = &gio->io_gang_tree;
        } else {
                gnpp = pio->io_private;
                ASSERT(pio->io_ready == zio_write_gang_member_ready);
        }

        gn = zio_gang_node_alloc(gnpp);
        gbh = gn->gn_gbh;
        bzero(gbh, SPA_GANGBLOCKSIZE);

        /*
         * Create the gang header.
         */
        zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
            pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);

        /*
         * Create and nowait the gang children.
         */
        for (g = 0; resid != 0; resid -= lsize, g++) {
                /*
                 * Spread the remaining bytes evenly over the remaining
                 * header slots, rounded up to the minimum block size.
                 */
                lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
                    SPA_MINBLOCKSIZE);
                ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);

                /*
                 * Children inherit the leader's checksum and copies, but
                 * are written uncompressed, with dedup and nopwrite off.
                 */
                zp.zp_checksum = gio->io_prop.zp_checksum;
                zp.zp_compress = ZIO_COMPRESS_OFF;
                zp.zp_type = DMU_OT_NONE;
                zp.zp_level = 0;
                zp.zp_copies = gio->io_prop.zp_copies;
                zp.zp_dedup = B_FALSE;
                zp.zp_dedup_verify = B_FALSE;
                zp.zp_nopwrite = B_FALSE;

                /*
                 * The child writes into header slot g, takes its data from
                 * the offset already consumed (io_size - resid), and is
                 * parented to the header rewrite.
                 */
                zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
                    (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
                    zio_write_gang_member_ready, NULL, NULL, &gn->gn_child[g],
                    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
                    &pio->io_bookmark));
        }

        /*
         * Set pio's pipeline to just wait for zio to finish.
         */
        pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

        /*
         * We didn't allocate this bp, so make sure it doesn't get unmarked.
         */
        pio->io_flags &= ~ZIO_FLAG_FASTWRITE;

        zio_nowait(zio);

        return (ZIO_PIPELINE_CONTINUE);
}
1947
1948 /*
1949  * The zio_nop_write stage in the pipeline determines if allocating
1950  * a new bp is necessary.  By leveraging a cryptographically secure checksum,
1951  * such as SHA256, we can compare the checksums of the new data and the old
1952  * to determine if allocating a new block is required.  The nopwrite
1953  * feature can handle writes in either syncing or open context (i.e. zil
1954  * writes) and as a result is mutually exclusive with dedup.
1955  */
1956 static int
1957 zio_nop_write(zio_t *zio)
1958 {
1959         blkptr_t *bp = zio->io_bp;
1960         blkptr_t *bp_orig = &zio->io_bp_orig;
1961         zio_prop_t *zp = &zio->io_prop;
1962
1963         ASSERT(BP_GET_LEVEL(bp) == 0);
1964         ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
1965         ASSERT(zp->zp_nopwrite);
1966         ASSERT(!zp->zp_dedup);
1967         ASSERT(zio->io_bp_override == NULL);
1968         ASSERT(IO_IS_ALLOCATING(zio));
1969
1970         /*
1971          * Check to see if the original bp and the new bp have matching
1972          * characteristics (i.e. same checksum, compression algorithms, etc).
1973          * If they don't then just continue with the pipeline which will
1974          * allocate a new bp.
1975          */
1976         if (BP_IS_HOLE(bp_orig) ||
1977             !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup ||
1978             BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
1979             BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
1980             BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
1981             zp->zp_copies != BP_GET_NDVAS(bp_orig))
1982                 return (ZIO_PIPELINE_CONTINUE);
1983
1984         /*
1985          * If the checksums match then reset the pipeline so that we
1986          * avoid allocating a new bp and issuing any I/O.
1987          */
1988         if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
1989                 ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup);
1990                 ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
1991                 ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
1992                 ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
1993                 ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop,
1994                     sizeof (uint64_t)) == 0);
1995
1996                 *bp = *bp_orig;
1997                 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1998                 zio->io_flags |= ZIO_FLAG_NOPWRITE;
1999         }
2000
2001         return (ZIO_PIPELINE_CONTINUE);
2002 }
2003
2004 /*
2005  * ==========================================================================
2006  * Dedup
2007  * ==========================================================================
2008  */
2009 static void
2010 zio_ddt_child_read_done(zio_t *zio)
2011 {
2012         blkptr_t *bp = zio->io_bp;
2013         ddt_entry_t *dde = zio->io_private;
2014         ddt_phys_t *ddp;
2015         zio_t *pio = zio_unique_parent(zio);
2016
2017         mutex_enter(&pio->io_lock);
2018         ddp = ddt_phys_select(dde, bp);
2019         if (zio->io_error == 0)
2020                 ddt_phys_clear(ddp);    /* this ddp doesn't need repair */
2021         if (zio->io_error == 0 && dde->dde_repair_data == NULL)
2022                 dde->dde_repair_data = zio->io_data;
2023         else
2024                 zio_buf_free(zio->io_data, zio->io_size);
2025         mutex_exit(&pio->io_lock);
2026 }
2027
2028 static int
2029 zio_ddt_read_start(zio_t *zio)
2030 {
2031         blkptr_t *bp = zio->io_bp;
2032         int p;
2033
2034         ASSERT(BP_GET_DEDUP(bp));
2035         ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
2036         ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2037
2038         if (zio->io_child_error[ZIO_CHILD_DDT]) {
2039                 ddt_t *ddt = ddt_select(zio->io_spa, bp);
2040                 ddt_entry_t *dde = ddt_repair_start(ddt, bp);
2041                 ddt_phys_t *ddp = dde->dde_phys;
2042                 ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
2043                 blkptr_t blk;
2044
2045                 ASSERT(zio->io_vsd == NULL);
2046                 zio->io_vsd = dde;
2047
2048                 if (ddp_self == NULL)
2049                         return (ZIO_PIPELINE_CONTINUE);
2050
2051                 for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
2052                         if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
2053                                 continue;
2054                         ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
2055                             &blk);
2056                         zio_nowait(zio_read(zio, zio->io_spa, &blk,
2057                             zio_buf_alloc(zio->io_size), zio->io_size,
2058                             zio_ddt_child_read_done, dde, zio->io_priority,
2059                             ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE,
2060                             &zio->io_bookmark));
2061                 }
2062                 return (ZIO_PIPELINE_CONTINUE);
2063         }
2064
2065         zio_nowait(zio_read(zio, zio->io_spa, bp,
2066             zio->io_data, zio->io_size, NULL, NULL, zio->io_priority,
2067             ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
2068
2069         return (ZIO_PIPELINE_CONTINUE);
2070 }
2071
2072 static int
2073 zio_ddt_read_done(zio_t *zio)
2074 {
2075         blkptr_t *bp = zio->io_bp;
2076
2077         if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
2078                 return (ZIO_PIPELINE_STOP);
2079
2080         ASSERT(BP_GET_DEDUP(bp));
2081         ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
2082         ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2083
2084         if (zio->io_child_error[ZIO_CHILD_DDT]) {
2085                 ddt_t *ddt = ddt_select(zio->io_spa, bp);
2086                 ddt_entry_t *dde = zio->io_vsd;
2087                 if (ddt == NULL) {
2088                         ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
2089                         return (ZIO_PIPELINE_CONTINUE);
2090                 }
2091                 if (dde == NULL) {
2092                         zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
2093                         zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
2094                         return (ZIO_PIPELINE_STOP);
2095                 }
2096                 if (dde->dde_repair_data != NULL) {
2097                         bcopy(dde->dde_repair_data, zio->io_data, zio->io_size);
2098                         zio->io_child_error[ZIO_CHILD_DDT] = 0;
2099                 }
2100                 ddt_repair_done(ddt, dde);
2101                 zio->io_vsd = NULL;
2102         }
2103
2104         ASSERT(zio->io_vsd == NULL);
2105
2106         return (ZIO_PIPELINE_CONTINUE);
2107 }
2108
/*
 * Decide whether the data being written by zio differs from the data
 * already associated with DDT entry dde (i.e. a checksum collision).
 *
 * Returns B_TRUE if the data differs, or if the existing on-disk copy
 * could not be read; B_FALSE if it matches or there is nothing to compare
 * against.  zio_ddt_write() calls this between ddt_enter() and
 * ddt_exit(); the DDT is exited around the blocking read and re-entered
 * before returning.
 */
static boolean_t
zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
{
        spa_t *spa = zio->io_spa;
        int p;

        /*
         * Note: we compare the original data, not the transformed data,
         * because when zio->io_bp is an override bp, we will not have
         * pushed the I/O transforms.  That's an important optimization
         * because otherwise we'd compress/encrypt all dmu_sync() data twice.
         */
        /*
         * First choice: compare against the first in-flight lead write
         * found for this entry; no I/O is needed for that.
         */
        for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
                zio_t *lio = dde->dde_lead_zio[p];

                if (lio != NULL) {
                        return (lio->io_orig_size != zio->io_orig_size ||
                            bcmp(zio->io_orig_data, lio->io_orig_data,
                            zio->io_orig_size) != 0);
                }
        }

        /*
         * Otherwise read back the first copy that has a non-zero birth
         * and compare it with the data being written.
         */
        for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
                ddt_phys_t *ddp = &dde->dde_phys[p];

                if (ddp->ddp_phys_birth != 0) {
                        arc_buf_t *abuf = NULL;
                        uint32_t aflags = ARC_WAIT;
                        blkptr_t blk = *zio->io_bp;
                        int error;

                        ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);

                        /* Leave the DDT while the ARC_WAIT read runs. */
                        ddt_exit(ddt);

                        error = arc_read(NULL, spa, &blk,
                            arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
                            ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
                            &aflags, &zio->io_bookmark);

                        if (error == 0) {
                                if (arc_buf_size(abuf) != zio->io_orig_size ||
                                    bcmp(abuf->b_data, zio->io_orig_data,
                                    zio->io_orig_size) != 0)
                                        error = SET_ERROR(EEXIST);
                                VERIFY(arc_buf_remove_ref(abuf, &abuf));
                        }

                        ddt_enter(ddt);
                        /* A failed read counts as a collision too. */
                        return (error != 0);
                }
        }

        return (B_FALSE);
}
2164
2165 static void
2166 zio_ddt_child_write_ready(zio_t *zio)
2167 {
2168         int p = zio->io_prop.zp_copies;
2169         ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
2170         ddt_entry_t *dde = zio->io_private;
2171         ddt_phys_t *ddp = &dde->dde_phys[p];
2172         zio_t *pio;
2173
2174         if (zio->io_error)
2175                 return;
2176
2177         ddt_enter(ddt);
2178
2179         ASSERT(dde->dde_lead_zio[p] == zio);
2180
2181         ddt_phys_fill(ddp, zio->io_bp);
2182
2183         while ((pio = zio_walk_parents(zio)) != NULL)
2184                 ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
2185
2186         ddt_exit(ddt);
2187 }
2188
2189 static void
2190 zio_ddt_child_write_done(zio_t *zio)
2191 {
2192         int p = zio->io_prop.zp_copies;
2193         ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
2194         ddt_entry_t *dde = zio->io_private;
2195         ddt_phys_t *ddp = &dde->dde_phys[p];
2196
2197         ddt_enter(ddt);
2198
2199         ASSERT(ddp->ddp_refcnt == 0);
2200         ASSERT(dde->dde_lead_zio[p] == zio);
2201         dde->dde_lead_zio[p] = NULL;
2202
2203         if (zio->io_error == 0) {
2204                 while (zio_walk_parents(zio) != NULL)
2205                         ddt_phys_addref(ddp);
2206         } else {
2207                 ddt_phys_clear(ddp);
2208         }
2209
2210         ddt_exit(ddt);
2211 }
2212
2213 static void
2214 zio_ddt_ditto_write_done(zio_t *zio)
2215 {
2216         int p = DDT_PHYS_DITTO;
2217         blkptr_t *bp = zio->io_bp;
2218         ddt_t *ddt = ddt_select(zio->io_spa, bp);
2219         ddt_entry_t *dde = zio->io_private;
2220         ddt_phys_t *ddp = &dde->dde_phys[p];
2221         ddt_key_t *ddk = &dde->dde_key;
2222         ASSERTV(zio_prop_t *zp = &zio->io_prop);
2223
2224         ddt_enter(ddt);
2225
2226         ASSERT(ddp->ddp_refcnt == 0);
2227         ASSERT(dde->dde_lead_zio[p] == zio);
2228         dde->dde_lead_zio[p] = NULL;
2229
2230         if (zio->io_error == 0) {
2231                 ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum));
2232                 ASSERT(zp->zp_copies < SPA_DVAS_PER_BP);
2233                 ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp));
2234                 if (ddp->ddp_phys_birth != 0)
2235                         ddt_phys_free(ddt, ddk, ddp, zio->io_txg);
2236                 ddt_phys_fill(ddp, bp);
2237         }
2238
2239         ddt_exit(ddt);
2240 }
2241
/*
 * Pipeline stage: write a block with dedup enabled.
 *
 * Looks up (creating if needed) the DDT entry for bp and then does one of:
 *  - on a verified collision, restarts the zio as an ordinary write, with
 *    a stronger checksum if the current one is not dedup-capable, or with
 *    dedup turned off otherwise;
 *  - if more ditto copies are needed, issues a ditto child write (after
 *    first restarting without the override bp, if there is one);
 *  - if the entry already has this block on disk or in flight, shares it
 *    instead of writing;
 *  - if the zio carries an override bp, records that bp in the entry;
 *  - otherwise issues the lead child write for this entry.
 *
 * Always returns ZIO_PIPELINE_CONTINUE.
 */
static int
zio_ddt_write(zio_t *zio)
{
        spa_t *spa = zio->io_spa;
        blkptr_t *bp = zio->io_bp;
        uint64_t txg = zio->io_txg;
        zio_prop_t *zp = &zio->io_prop;
        /* The copies count doubles as the index of the physical slot. */
        int p = zp->zp_copies;
        int ditto_copies;
        zio_t *cio = NULL;
        zio_t *dio = NULL;
        ddt_t *ddt = ddt_select(spa, bp);
        ddt_entry_t *dde;
        ddt_phys_t *ddp;

        ASSERT(BP_GET_DEDUP(bp));
        ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
        ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);

        ddt_enter(ddt);
        dde = ddt_lookup(ddt, bp, B_TRUE);
        ddp = &dde->dde_phys[p];

        if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
                /*
                 * If we're using a weak checksum, upgrade to a strong checksum
                 * and try again.  If we're already using a strong checksum,
                 * we can't resolve it, so just convert to an ordinary write.
                 * (And automatically e-mail a paper to Nature?)
                 */
                if (!zio_checksum_table[zp->zp_checksum].ci_dedup) {
                        zp->zp_checksum = spa_dedup_checksum(spa);
                        zio_pop_transforms(zio);
                        zio->io_stage = ZIO_STAGE_OPEN;
                        BP_ZERO(bp);
                } else {
                        zp->zp_dedup = B_FALSE;
                }
                zio->io_pipeline = ZIO_WRITE_PIPELINE;
                ddt_exit(ddt);
                return (ZIO_PIPELINE_CONTINUE);
        }

        ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp);
        ASSERT(ditto_copies < SPA_DVAS_PER_BP);

        /*
         * Issue a ditto write only if more copies are wanted than exist
         * and no ditto write for this entry is already in flight.
         */
        if (ditto_copies > ddt_ditto_copies_present(dde) &&
            dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) {
                zio_prop_t czp = *zp;

                czp.zp_copies = ditto_copies;

                /*
                 * If we arrived here with an override bp, we won't have run
                 * the transform stack, so we won't have the data we need to
                 * generate a child i/o.  So, toss the override bp and restart.
                 * This is safe, because using the override bp is just an
                 * optimization; and it's rare, so the cost doesn't matter.
                 */
                if (zio->io_bp_override) {
                        zio_pop_transforms(zio);
                        zio->io_stage = ZIO_STAGE_OPEN;
                        zio->io_pipeline = ZIO_WRITE_PIPELINE;
                        zio->io_bp_override = NULL;
                        BP_ZERO(bp);
                        ddt_exit(ddt);
                        return (ZIO_PIPELINE_CONTINUE);
                }

                dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
                    zio->io_orig_size, &czp, NULL, NULL,
                    zio_ddt_ditto_write_done, dde, zio->io_priority,
                    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);

                /* Hand the child the parent's current data buffer. */
                zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL);
                dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
        }

        if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
                /*
                 * The block is already on disk and/or being written by a
                 * lead zio: share it.  With a lead zio in flight we become
                 * one of its parents (its done callback adds a reference
                 * per parent); otherwise take the reference here.
                 */
                if (ddp->ddp_phys_birth != 0)
                        ddt_bp_fill(ddp, bp, txg);
                if (dde->dde_lead_zio[p] != NULL)
                        zio_add_child(zio, dde->dde_lead_zio[p]);
                else
                        ddt_phys_addref(ddp);
        } else if (zio->io_bp_override) {
                /* The override bp becomes the entry's physical copy. */
                ASSERT(bp->blk_birth == txg);
                ASSERT(BP_EQUAL(bp, zio->io_bp_override));
                ddt_phys_fill(ddp, bp);
                ddt_phys_addref(ddp);
        } else {
                /* First writer of this block: become the lead write. */
                cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
                    zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL,
                    zio_ddt_child_write_done, dde, zio->io_priority,
                    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);

                zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL);
                dde->dde_lead_zio[p] = cio;
        }

        ddt_exit(ddt);

        /* The children are issued only after the DDT has been exited. */
        if (cio)
                zio_nowait(cio);
        if (dio)
                zio_nowait(dio);

        return (ZIO_PIPELINE_CONTINUE);
}
2351
2352 ddt_entry_t *freedde; /* for debugging */
2353
2354 static int
2355 zio_ddt_free(zio_t *zio)
2356 {
2357         spa_t *spa = zio->io_spa;
2358         blkptr_t *bp = zio->io_bp;
2359         ddt_t *ddt = ddt_select(spa, bp);
2360         ddt_entry_t *dde;
2361         ddt_phys_t *ddp;
2362
2363         ASSERT(BP_GET_DEDUP(bp));
2364         ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2365
2366         ddt_enter(ddt);
2367         freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
2368         if (dde) {
2369                 ddp = ddt_phys_select(dde, bp);
2370                 if (ddp)
2371                         ddt_phys_decref(ddp);
2372         }
2373         ddt_exit(ddt);
2374
2375         return (ZIO_PIPELINE_CONTINUE);
2376 }
2377
2378 /*
2379  * ==========================================================================
2380  * Allocate and free blocks
2381  * ==========================================================================
2382  */
2383 static int
2384 zio_dva_allocate(zio_t *zio)
2385 {
2386         spa_t *spa = zio->io_spa;
2387         metaslab_class_t *mc = spa_normal_class(spa);
2388         blkptr_t *bp = zio->io_bp;
2389         int error;
2390         int flags = 0;
2391
2392         if (zio->io_gang_leader == NULL) {
2393                 ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
2394                 zio->io_gang_leader = zio;
2395         }
2396
2397         ASSERT(BP_IS_HOLE(bp));
2398         ASSERT0(BP_GET_NDVAS(bp));
2399         ASSERT3U(zio->io_prop.zp_copies, >, 0);
2400         ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
2401         ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
2402
2403         /*
2404          * The dump device does not support gang blocks so allocation on
2405          * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid
2406          * the "fast" gang feature.
2407          */
2408         flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0;
2409         flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ?
2410             METASLAB_GANG_CHILD : 0;
2411         flags |= (zio->io_flags & ZIO_FLAG_FASTWRITE) ? METASLAB_FASTWRITE : 0;
2412         error = metaslab_alloc(spa, mc, zio->io_size, bp,
2413             zio->io_prop.zp_copies, zio->io_txg, NULL, flags);
2414
2415         if (error) {
2416                 spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
2417                     "size %llu, error %d", spa_name(spa), zio, zio->io_size,
2418                     error);
2419                 if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
2420                         return (zio_write_gang_block(zio));
2421                 zio->io_error = error;
2422         }
2423
2424         return (ZIO_PIPELINE_CONTINUE);
2425 }
2426
2427 static int
2428 zio_dva_free(zio_t *zio)
2429 {
2430         metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);
2431
2432         return (ZIO_PIPELINE_CONTINUE);
2433 }
2434
2435 static int
2436 zio_dva_claim(zio_t *zio)
2437 {
2438         int error;
2439
2440         error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
2441         if (error)
2442                 zio->io_error = error;
2443
2444         return (ZIO_PIPELINE_CONTINUE);
2445 }
2446
2447 /*
2448  * Undo an allocation.  This is used by zio_done() when an I/O fails
2449  * and we want to give back the block we just allocated.
2450  * This handles both normal blocks and gang blocks.
2451  */
2452 static void
2453 zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
2454 {
2455         int g;
2456
2457         ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
2458         ASSERT(zio->io_bp_override == NULL);
2459
2460         if (!BP_IS_HOLE(bp))
2461                 metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);
2462
2463         if (gn != NULL) {
2464                 for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
2465                         zio_dva_unallocate(zio, gn->gn_child[g],
2466                             &gn->gn_gbh->zg_blkptr[g]);
2467                 }
2468         }
2469 }
2470
2471 /*
2472  * Try to allocate an intent log block.  Return 0 on success, errno on failure.
2473  */
2474 int
2475 zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, uint64_t size,
2476     boolean_t use_slog)
2477 {
2478         int error = 1;
2479
2480         ASSERT(txg > spa_syncing_txg(spa));
2481
2482         /*
2483          * ZIL blocks are always contiguous (i.e. not gang blocks) so we
2484          * set the METASLAB_GANG_AVOID flag so that they don't "fast gang"
2485          * when allocating them.
2486          */
2487         if (use_slog) {
2488                 error = metaslab_alloc(spa, spa_log_class(spa), size,
2489                     new_bp, 1, txg, NULL,
2490                     METASLAB_FASTWRITE | METASLAB_GANG_AVOID);
2491         }
2492
2493         if (error) {
2494                 error = metaslab_alloc(spa, spa_normal_class(spa), size,
2495                     new_bp, 1, txg, NULL,
2496                     METASLAB_FASTWRITE);
2497         }
2498
2499         if (error == 0) {
2500                 BP_SET_LSIZE(new_bp, size);
2501                 BP_SET_PSIZE(new_bp, size);
2502                 BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
2503                 BP_SET_CHECKSUM(new_bp,
2504                     spa_version(spa) >= SPA_VERSION_SLIM_ZIL
2505                     ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
2506                 BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
2507                 BP_SET_LEVEL(new_bp, 0);
2508                 BP_SET_DEDUP(new_bp, 0);
2509                 BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
2510         }
2511
2512         return (error);
2513 }
2514
2515 /*
2516  * Free an intent log block.
2517  */
2518 void
2519 zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp)
2520 {
2521         ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG);
2522         ASSERT(!BP_IS_GANG(bp));
2523
2524         zio_free(spa, txg, bp);
2525 }
2526
2527 /*
2528  * ==========================================================================
2529  * Read and write to physical devices
2530  * ==========================================================================
2531  */
2532
2533
2534 /*
2535  * Issue an I/O to the underlying vdev. Typically the issue pipeline
2536  * stops after this stage and will resume upon I/O completion.
2537  * However, there are instances where the vdev layer may need to
2538  * continue the pipeline when an I/O was not issued. Since the I/O
2539  * that was sent to the vdev layer might be different than the one
2540  * currently active in the pipeline (see vdev_queue_io()), we explicitly
2541  * force the underlying vdev layers to call either zio_execute() or
2542  * zio_interrupt() to ensure that the pipeline continues with the correct I/O.
2543  */
2544 static int
2545 zio_vdev_io_start(zio_t *zio)
2546 {
2547         vdev_t *vd = zio->io_vd;
2548         uint64_t align;
2549         spa_t *spa = zio->io_spa;
2550
2551         ASSERT(zio->io_error == 0);
2552         ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
2553
2554         if (vd == NULL) {
2555                 if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
2556                         spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
2557
2558                 /*
2559                  * The mirror_ops handle multiple DVAs in a single BP.
2560                  */
2561                 vdev_mirror_ops.vdev_op_io_start(zio);
2562                 return (ZIO_PIPELINE_STOP);
2563         }
2564
2565         /*
2566          * We keep track of time-sensitive I/Os so that the scan thread
2567          * can quickly react to certain workloads.  In particular, we care
2568          * about non-scrubbing, top-level reads and writes with the following
2569          * characteristics:
2570          *      - synchronous writes of user data to non-slog devices
2571          *      - any reads of user data
2572          * When these conditions are met, adjust the timestamp of spa_last_io
2573          * which allows the scan thread to adjust its workload accordingly.
2574          */
2575         if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
2576             vd == vd->vdev_top && !vd->vdev_islog &&
2577             zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
2578             zio->io_txg != spa_syncing_txg(spa)) {
2579                 uint64_t old = spa->spa_last_io;
2580                 uint64_t new = ddi_get_lbolt64();
2581                 if (old != new)
2582                         (void) atomic_cas_64(&spa->spa_last_io, old, new);
2583         }
2584
2585         align = 1ULL << vd->vdev_top->vdev_ashift;
2586
2587         if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
2588             P2PHASE(zio->io_size, align) != 0) {
2589                 /* Transform logical writes to be a full physical block size. */
2590                 uint64_t asize = P2ROUNDUP(zio->io_size, align);
2591                 char *abuf = zio_buf_alloc(asize);
2592                 ASSERT(vd == vd->vdev_top);
2593                 if (zio->io_type == ZIO_TYPE_WRITE) {
2594                         bcopy(zio->io_data, abuf, zio->io_size);
2595                         bzero(abuf + zio->io_size, asize - zio->io_size);
2596                 }
2597                 zio_push_transform(zio, abuf, asize, asize, zio_subblock);
2598         }
2599
2600         /*
2601          * If this is not a physical io, make sure that it is properly aligned
2602          * before proceeding.
2603          */
2604         if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) {
2605                 ASSERT0(P2PHASE(zio->io_offset, align));
2606                 ASSERT0(P2PHASE(zio->io_size, align));
2607         } else {
2608                 /*
2609                  * For physical writes, we allow 512b aligned writes and assume
2610                  * the device will perform a read-modify-write as necessary.
2611                  */
2612                 ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE));
2613                 ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE));
2614         }
2615
2616         VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
2617
2618         /*
2619          * If this is a repair I/O, and there's no self-healing involved --
2620          * that is, we're just resilvering what we expect to resilver --
2621          * then don't do the I/O unless zio's txg is actually in vd's DTL.
2622          * This prevents spurious resilvering with nested replication.
2623          * For example, given a mirror of mirrors, (A+B)+(C+D), if only
2624          * A is out of date, we'll read from C+D, then use the data to
2625          * resilver A+B -- but we don't actually want to resilver B, just A.
2626          * The top-level mirror has no way to know this, so instead we just
2627          * discard unnecessary repairs as we work our way down the vdev tree.
2628          * The same logic applies to any form of nested replication:
2629          * ditto + mirror, RAID-Z + replacing, etc.  This covers them all.
2630          */
2631         if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
2632             !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
2633             zio->io_txg != 0 && /* not a delegated i/o */
2634             !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
2635                 ASSERT(zio->io_type == ZIO_TYPE_WRITE);
2636                 zio_vdev_io_bypass(zio);
2637                 return (ZIO_PIPELINE_CONTINUE);
2638         }
2639
2640         if (vd->vdev_ops->vdev_op_leaf &&
2641             (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
2642
2643                 if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio))
2644                         return (ZIO_PIPELINE_CONTINUE);
2645
2646                 if ((zio = vdev_queue_io(zio)) == NULL)
2647                         return (ZIO_PIPELINE_STOP);
2648
2649                 if (!vdev_accessible(vd, zio)) {
2650                         zio->io_error = SET_ERROR(ENXIO);
2651                         zio_interrupt(zio);
2652                         return (ZIO_PIPELINE_STOP);
2653                 }
2654         }
2655
2656         vd->vdev_ops->vdev_op_io_start(zio);
2657         return (ZIO_PIPELINE_STOP);
2658 }
2659
2660 static int
2661 zio_vdev_io_done(zio_t *zio)
2662 {
2663         vdev_t *vd = zio->io_vd;
2664         vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
2665         boolean_t unexpected_error = B_FALSE;
2666
2667         if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
2668                 return (ZIO_PIPELINE_STOP);
2669
2670         ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
2671
2672         if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {
2673
2674                 vdev_queue_io_done(zio);
2675
2676                 if (zio->io_type == ZIO_TYPE_WRITE)
2677                         vdev_cache_write(zio);
2678
2679                 if (zio_injection_enabled && zio->io_error == 0)
2680                         zio->io_error = zio_handle_device_injection(vd,
2681                             zio, EIO);
2682
2683                 if (zio_injection_enabled && zio->io_error == 0)
2684                         zio->io_error = zio_handle_label_injection(zio, EIO);
2685
2686                 if (zio->io_error) {
2687                         if (!vdev_accessible(vd, zio)) {
2688                                 zio->io_error = SET_ERROR(ENXIO);
2689                         } else {
2690                                 unexpected_error = B_TRUE;
2691                         }
2692                 }
2693         }
2694
2695         ops->vdev_op_io_done(zio);
2696
2697         if (unexpected_error)
2698                 VERIFY(vdev_probe(vd, zio) == NULL);
2699
2700         return (ZIO_PIPELINE_CONTINUE);
2701 }
2702
2703 /*
2704  * For non-raidz ZIOs, we can just copy aside the bad data read from the
2705  * disk, and use that to finish the checksum ereport later.
2706  */
2707 static void
2708 zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
2709     const void *good_buf)
2710 {
2711         /* no processing needed */
2712         zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
2713 }
2714
2715 /*ARGSUSED*/
2716 void
2717 zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
2718 {
2719         void *buf = zio_buf_alloc(zio->io_size);
2720
2721         bcopy(zio->io_data, buf, zio->io_size);
2722
2723         zcr->zcr_cbinfo = zio->io_size;
2724         zcr->zcr_cbdata = buf;
2725         zcr->zcr_finish = zio_vsd_default_cksum_finish;
2726         zcr->zcr_free = zio_buf_free;
2727 }
2728
2729 static int
2730 zio_vdev_io_assess(zio_t *zio)
2731 {
2732         vdev_t *vd = zio->io_vd;
2733
2734         if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
2735                 return (ZIO_PIPELINE_STOP);
2736
2737         if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
2738                 spa_config_exit(zio->io_spa, SCL_ZIO, zio);
2739
2740         if (zio->io_vsd != NULL) {
2741                 zio->io_vsd_ops->vsd_free(zio);
2742                 zio->io_vsd = NULL;
2743         }
2744
2745         if (zio_injection_enabled && zio->io_error == 0)
2746                 zio->io_error = zio_handle_fault_injection(zio, EIO);
2747
2748         /*
2749          * If the I/O failed, determine whether we should attempt to retry it.
2750          *
2751          * On retry, we cut in line in the issue queue, since we don't want
2752          * compression/checksumming/etc. work to prevent our (cheap) IO reissue.
2753          */
2754         if (zio->io_error && vd == NULL &&
2755             !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
2756                 ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */
2757                 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS));  /* not a leaf */
2758                 zio->io_error = 0;
2759                 zio->io_flags |= ZIO_FLAG_IO_RETRY |
2760                     ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
2761                 zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
2762                 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
2763                     zio_requeue_io_start_cut_in_line);
2764                 return (ZIO_PIPELINE_STOP);
2765         }
2766
2767         /*
2768          * If we got an error on a leaf device, convert it to ENXIO
2769          * if the device is not accessible at all.
2770          */
2771         if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
2772             !vdev_accessible(vd, zio))
2773                 zio->io_error = SET_ERROR(ENXIO);
2774
2775         /*
2776          * If we can't write to an interior vdev (mirror or RAID-Z),
2777          * set vdev_cant_write so that we stop trying to allocate from it.
2778          */
2779         if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
2780             vd != NULL && !vd->vdev_ops->vdev_op_leaf) {
2781                 vd->vdev_cant_write = B_TRUE;
2782         }
2783
2784         if (zio->io_error)
2785                 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2786
2787         if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
2788             zio->io_physdone != NULL) {
2789                 ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
2790                 ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
2791                 zio->io_physdone(zio->io_logical);
2792         }
2793
2794         return (ZIO_PIPELINE_CONTINUE);
2795 }
2796
2797 void
2798 zio_vdev_io_reissue(zio_t *zio)
2799 {
2800         ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
2801         ASSERT(zio->io_error == 0);
2802
2803         zio->io_stage >>= 1;
2804 }
2805
2806 void
2807 zio_vdev_io_redone(zio_t *zio)
2808 {
2809         ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
2810
2811         zio->io_stage >>= 1;
2812 }
2813
2814 void
2815 zio_vdev_io_bypass(zio_t *zio)
2816 {
2817         ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
2818         ASSERT(zio->io_error == 0);
2819
2820         zio->io_flags |= ZIO_FLAG_IO_BYPASS;
2821         zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
2822 }
2823
2824 /*
2825  * ==========================================================================
2826  * Generate and verify checksums
2827  * ==========================================================================
2828  */
2829 static int
2830 zio_checksum_generate(zio_t *zio)
2831 {
2832         blkptr_t *bp = zio->io_bp;
2833         enum zio_checksum checksum;
2834
2835         if (bp == NULL) {
2836                 /*
2837                  * This is zio_write_phys().
2838                  * We're either generating a label checksum, or none at all.
2839                  */
2840                 checksum = zio->io_prop.zp_checksum;
2841
2842                 if (checksum == ZIO_CHECKSUM_OFF)
2843                         return (ZIO_PIPELINE_CONTINUE);
2844
2845                 ASSERT(checksum == ZIO_CHECKSUM_LABEL);
2846         } else {
2847                 if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
2848                         ASSERT(!IO_IS_ALLOCATING(zio));
2849                         checksum = ZIO_CHECKSUM_GANG_HEADER;
2850                 } else {
2851                         checksum = BP_GET_CHECKSUM(bp);
2852                 }
2853         }
2854
2855         zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size);
2856
2857         return (ZIO_PIPELINE_CONTINUE);
2858 }
2859
2860 static int
2861 zio_checksum_verify(zio_t *zio)
2862 {
2863         zio_bad_cksum_t info;
2864         blkptr_t *bp = zio->io_bp;
2865         int error;
2866
2867         ASSERT(zio->io_vd != NULL);
2868
2869         if (bp == NULL) {
2870                 /*
2871                  * This is zio_read_phys().
2872                  * We're either verifying a label checksum, or nothing at all.
2873                  */
2874                 if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
2875                         return (ZIO_PIPELINE_CONTINUE);
2876
2877                 ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
2878         }
2879
2880         if ((error = zio_checksum_error(zio, &info)) != 0) {
2881                 zio->io_error = error;
2882                 if (error == ECKSUM &&
2883                     !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2884                         zfs_ereport_start_checksum(zio->io_spa,
2885                             zio->io_vd, zio, zio->io_offset,
2886                             zio->io_size, NULL, &info);
2887                 }
2888         }
2889
2890         return (ZIO_PIPELINE_CONTINUE);
2891 }
2892
2893 /*
2894  * Called by RAID-Z to ensure we don't compute the checksum twice.
2895  */
2896 void
2897 zio_checksum_verified(zio_t *zio)
2898 {
2899         zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
2900 }
2901
/*
 * ==========================================================================
 * Error rank.  Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
 * An error of 0 indicates success.  ENXIO indicates whole-device failure,
 * which may be transient (e.g. unplugged) or permanent.  ECKSUM and EIO
 * indicate errors that are specific to one I/O, and most likely permanent.
 * Any other error is presumed to be worse because we weren't expecting it.
 * ==========================================================================
 */

/*
 * Map an error code to its position in the ranking above.  Codes that are
 * not in the table all share the highest rank (the table length).
 */
static int
zio_error_to_rank(int error)
{
        static const int rank_order[] = { 0, ENXIO, ECKSUM, EIO };
        const int nranks = sizeof (rank_order) / sizeof (rank_order[0]);
        int rank = 0;

        while (rank < nranks && rank_order[rank] != error)
                rank++;

        return (rank);
}

/*
 * Return whichever of e1 and e2 ranks worse.  When both rank equally,
 * e2 is returned.
 */
int
zio_worst_error(int e1, int e2)
{
        int r1 = zio_error_to_rank(e1);
        int r2 = zio_error_to_rank(e2);

        return (r1 > r2 ? e1 : e2);
}
2927
2928 /*
2929  * ==========================================================================
2930  * I/O completion
2931  * ==========================================================================
2932  */
2933 static int
2934 zio_ready(zio_t *zio)
2935 {
2936         blkptr_t *bp = zio->io_bp;
2937         zio_t *pio, *pio_next;
2938
2939         if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
2940             zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
2941                 return (ZIO_PIPELINE_STOP);
2942
2943         if (zio->io_ready) {
2944                 ASSERT(IO_IS_ALLOCATING(zio));
2945                 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
2946                     (zio->io_flags & ZIO_FLAG_NOPWRITE));
2947                 ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
2948
2949                 zio->io_ready(zio);
2950         }
2951
2952         if (bp != NULL && bp != &zio->io_bp_copy)
2953                 zio->io_bp_copy = *bp;
2954
2955         if (zio->io_error)
2956                 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2957
2958         mutex_enter(&zio->io_lock);
2959         zio->io_state[ZIO_WAIT_READY] = 1;
2960         pio = zio_walk_parents(zio);
2961         mutex_exit(&zio->io_lock);
2962
2963         /*
2964          * As we notify zio's parents, new parents could be added.
2965          * New parents go to the head of zio's io_parent_list, however,
2966          * so we will (correctly) not notify them.  The remainder of zio's
2967          * io_parent_list, from 'pio_next' onward, cannot change because
2968          * all parents must wait for us to be done before they can be done.
2969          */
2970         for (; pio != NULL; pio = pio_next) {
2971                 pio_next = zio_walk_parents(zio);
2972                 zio_notify_parent(pio, zio, ZIO_WAIT_READY);
2973         }
2974
2975         if (zio->io_flags & ZIO_FLAG_NODATA) {
2976                 if (BP_IS_GANG(bp)) {
2977                         zio->io_flags &= ~ZIO_FLAG_NODATA;
2978                 } else {
2979                         ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE);
2980                         zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
2981                 }
2982         }
2983
2984         if (zio_injection_enabled &&
2985             zio->io_spa->spa_syncing_txg == zio->io_txg)
2986                 zio_handle_ignored_writes(zio);
2987
2988         return (ZIO_PIPELINE_CONTINUE);
2989 }
2990
2991 static int
2992 zio_done(zio_t *zio)
2993 {
2994         zio_t *pio, *pio_next;
2995         int c, w;
2996
2997         /*
2998          * If our children haven't all completed,
2999          * wait for them and then repeat this pipeline stage.
3000          */
3001         if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
3002             zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
3003             zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) ||
3004             zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
3005                 return (ZIO_PIPELINE_STOP);
3006
3007         for (c = 0; c < ZIO_CHILD_TYPES; c++)
3008                 for (w = 0; w < ZIO_WAIT_TYPES; w++)
3009                         ASSERT(zio->io_children[c][w] == 0);
3010
3011         if (zio->io_bp != NULL && !BP_IS_EMBEDDED(zio->io_bp)) {
3012                 ASSERT(zio->io_bp->blk_pad[0] == 0);
3013                 ASSERT(zio->io_bp->blk_pad[1] == 0);
3014                 ASSERT(bcmp(zio->io_bp, &zio->io_bp_copy,
3015                     sizeof (blkptr_t)) == 0 ||
3016                     (zio->io_bp == zio_unique_parent(zio)->io_bp));
3017                 if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(zio->io_bp) &&
3018                     zio->io_bp_override == NULL &&
3019                     !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
3020                         ASSERT(!BP_SHOULD_BYTESWAP(zio->io_bp));
3021                         ASSERT3U(zio->io_prop.zp_copies, <=,
3022                             BP_GET_NDVAS(zio->io_bp));
3023                         ASSERT(BP_COUNT_GANG(zio->io_bp) == 0 ||
3024                             (BP_COUNT_GANG(zio->io_bp) ==
3025                             BP_GET_NDVAS(zio->io_bp)));
3026                 }
3027                 if (zio->io_flags & ZIO_FLAG_NOPWRITE)
3028                         VERIFY(BP_EQUAL(zio->io_bp, &zio->io_bp_orig));
3029         }
3030
3031         /*
3032          * If there were child vdev/gang/ddt errors, they apply to us now.
3033          */
3034         zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
3035         zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
3036         zio_inherit_child_errors(zio, ZIO_CHILD_DDT);
3037
3038         /*
3039          * If the I/O on the transformed data was successful, generate any
3040          * checksum reports now while we still have the transformed data.
3041          */
3042         if (zio->io_error == 0) {
3043                 while (zio->io_cksum_report != NULL) {
3044                         zio_cksum_report_t *zcr = zio->io_cksum_report;
3045                         uint64_t align = zcr->zcr_align;
3046                         uint64_t asize = P2ROUNDUP(zio->io_size, align);
3047                         char *abuf = zio->io_data;
3048
3049                         if (asize != zio->io_size) {
3050                                 abuf = zio_buf_alloc(asize);
3051                                 bcopy(zio->io_data, abuf, zio->io_size);
3052                                 bzero(abuf+zio->io_size, asize-zio->io_size);
3053                         }
3054
3055                         zio->io_cksum_report = zcr->zcr_next;
3056                         zcr->zcr_next = NULL;
3057                         zcr->zcr_finish(zcr, abuf);
3058                         zfs_ereport_free_checksum(zcr);
3059
3060                         if (asize != zio->io_size)
3061                                 zio_buf_free(abuf, asize);
3062                 }
3063         }
3064
3065         zio_pop_transforms(zio);        /* note: may set zio->io_error */
3066
3067         vdev_stat_update(zio, zio->io_size);
3068
3069         /*
3070          * If this I/O is attached to a particular vdev is slow, exceeding
3071          * 30 seconds to complete, post an error described the I/O delay.
3072          * We ignore these errors if the device is currently unavailable.
3073          */
3074         if (zio->io_delay >= MSEC_TO_TICK(zio_delay_max)) {
3075                 if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd))
3076                         zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa,
3077                             zio->io_vd, zio, 0, 0);
3078         }
3079
3080         if (zio->io_error) {
3081                 /*
3082                  * If this I/O is attached to a particular vdev,
3083                  * generate an error message describing the I/O failure
3084                  * at the block level.  We ignore these errors if the
3085                  * device is currently unavailable.
3086                  */
3087                 if (zio->io_error != ECKSUM && zio->io_vd != NULL &&
3088                         !vdev_is_dead(zio->io_vd))
3089                         zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa,
3090                                                 zio->io_vd, zio, 0, 0);
3091
3092                 if ((zio->io_error == EIO || !(zio->io_flags &
3093                     (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
3094                     zio == zio->io_logical) {
3095                         /*
3096                          * For logical I/O requests, tell the SPA to log the
3097                          * error and generate a logical data ereport.
3098                          */
3099                         spa_log_error(zio->io_spa, zio);
3100                         zfs_ereport_post(FM_EREPORT_ZFS_DATA, zio->io_spa,
3101                             NULL, zio, 0, 0);
3102                 }
3103         }
3104
3105         if (zio->io_error && zio == zio->io_logical) {
3106                 /*
3107                  * Determine whether zio should be reexecuted.  This will
3108                  * propagate all the way to the root via zio_notify_parent().
3109                  */
3110                 ASSERT(zio->io_vd == NULL && zio->io_bp != NULL);
3111                 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
3112
3113                 if (IO_IS_ALLOCATING(zio) &&
3114                     !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
3115                         if (zio->io_error != ENOSPC)
3116                                 zio->io_reexecute |= ZIO_REEXECUTE_NOW;
3117                         else
3118                                 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
3119                 }
3120
3121                 if ((zio->io_type == ZIO_TYPE_READ ||
3122                     zio->io_type == ZIO_TYPE_FREE) &&
3123                     !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
3124                     zio->io_error == ENXIO &&
3125                     spa_load_state(zio->io_spa) == SPA_LOAD_NONE &&
3126                     spa_get_failmode(zio->io_spa) != ZIO_FAILURE_MODE_CONTINUE)
3127                         zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
3128
3129                 if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
3130                         zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
3131
3132                 /*
3133                  * Here is a possibly good place to attempt to do
3134                  * either combinatorial reconstruction or error correction
3135                  * based on checksums.  It also might be a good place
3136                  * to send out preliminary ereports before we suspend
3137                  * processing.
3138                  */
3139         }
3140
3141         /*
3142          * If there were logical child errors, they apply to us now.
3143          * We defer this until now to avoid conflating logical child
3144          * errors with errors that happened to the zio itself when
3145          * updating vdev stats and reporting FMA events above.
3146          */
3147         zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
3148
3149         if ((zio->io_error || zio->io_reexecute) &&
3150             IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
3151             !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
3152                 zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp);
3153
3154         zio_gang_tree_free(&zio->io_gang_tree);
3155
3156         /*
3157          * Godfather I/Os should never suspend.
3158          */
3159         if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
3160             (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
3161                 zio->io_reexecute = 0;
3162
3163         if (zio->io_reexecute) {
3164                 /*
3165                  * This is a logical I/O that wants to reexecute.
3166                  *
3167                  * Reexecute is top-down.  When an i/o fails, if it's not
3168                  * the root, it simply notifies its parent and sticks around.
3169                  * The parent, seeing that it still has children in zio_done(),
3170                  * does the same.  This percolates all the way up to the root.
3171                  * The root i/o will reexecute or suspend the entire tree.
3172                  *
3173                  * This approach ensures that zio_reexecute() honors
3174                  * all the original i/o dependency relationships, e.g.
3175                  * parents not executing until children are ready.
3176                  */
3177                 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
3178
3179                 zio->io_gang_leader = NULL;
3180
3181                 mutex_enter(&zio->io_lock);
3182                 zio->io_state[ZIO_WAIT_DONE] = 1;
3183                 mutex_exit(&zio->io_lock);
3184
3185                 /*
3186                  * "The Godfather" I/O monitors its children but is
3187                  * not a true parent to them. It will track them through
3188                  * the pipeline but severs its ties whenever they get into
3189                  * trouble (e.g. suspended). This allows "The Godfather"
3190                  * I/O to return status without blocking.
3191                  */
3192                 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
3193                         zio_link_t *zl = zio->io_walk_link;
3194                         pio_next = zio_walk_parents(zio);
3195
3196                         if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
3197                             (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
3198                                 zio_remove_child(pio, zio, zl);
3199                                 zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
3200                         }
3201                 }
3202
3203                 if ((pio = zio_unique_parent(zio)) != NULL) {
3204                         /*
3205                          * We're not a root i/o, so there's nothing to do
3206                          * but notify our parent.  Don't propagate errors
3207                          * upward since we haven't permanently failed yet.
3208                          */
3209                         ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
3210                         zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
3211                         zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
3212                 } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
3213                         /*
3214                          * We'd fail again if we reexecuted now, so suspend
3215                          * until conditions improve (e.g. device comes online).
3216                          */
3217                         zio_suspend(zio->io_spa, zio);
3218                 } else {
3219                         /*
3220                          * Reexecution is potentially a huge amount of work.
3221                          * Hand it off to the otherwise-unused claim taskq.
3222                          */
3223                         ASSERT(taskq_empty_ent(&zio->io_tqent));
3224                         spa_taskq_dispatch_ent(zio->io_spa,
3225                             ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE,
3226                             (task_func_t *)zio_reexecute, zio, 0,
3227                             &zio->io_tqent);
3228                 }
3229                 return (ZIO_PIPELINE_STOP);
3230         }
3231
3232         ASSERT(zio->io_child_count == 0);
3233         ASSERT(zio->io_reexecute == 0);
3234         ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
3235
3236         /*
3237          * Report any checksum errors, since the I/O is complete.
3238          */
3239         while (zio->io_cksum_report != NULL) {
3240                 zio_cksum_report_t *zcr = zio->io_cksum_report;
3241                 zio->io_cksum_report = zcr->zcr_next;
3242                 zcr->zcr_next = NULL;
3243                 zcr->zcr_finish(zcr, NULL);
3244                 zfs_ereport_free_checksum(zcr);
3245         }
3246
3247         if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp &&
3248             !BP_IS_HOLE(zio->io_bp) && !BP_IS_EMBEDDED(zio->io_bp) &&
3249             !(zio->io_flags & ZIO_FLAG_NOPWRITE)) {
3250                 metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp);
3251         }
3252
3253         /*
3254          * It is the responsibility of the done callback to ensure that this
3255          * particular zio is no longer discoverable for adoption, and as
3256          * such, cannot acquire any new parents.
3257          */
3258         if (zio->io_done)
3259                 zio->io_done(zio);
3260
3261         mutex_enter(&zio->io_lock);
3262         zio->io_state[ZIO_WAIT_DONE] = 1;
3263         mutex_exit(&zio->io_lock);
3264
3265         for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
3266                 zio_link_t *zl = zio->io_walk_link;
3267                 pio_next = zio_walk_parents(zio);
3268                 zio_remove_child(pio, zio, zl);
3269                 zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
3270         }
3271
3272         if (zio->io_waiter != NULL) {
3273                 mutex_enter(&zio->io_lock);
3274                 zio->io_executor = NULL;
3275                 cv_broadcast(&zio->io_cv);
3276                 mutex_exit(&zio->io_lock);
3277         } else {
3278                 zio_destroy(zio);
3279         }
3280
3281         return (ZIO_PIPELINE_STOP);
3282 }
3283
3284 /*
3285  * ==========================================================================
3286  * I/O pipeline definition
3287  * ==========================================================================
3288  */
3289 static zio_pipe_stage_t *zio_pipeline[] = {
3290         NULL,
3291         zio_read_bp_init,
3292         zio_free_bp_init,
3293         zio_issue_async,
3294         zio_write_bp_init,
3295         zio_checksum_generate,
3296         zio_nop_write,
3297         zio_ddt_read_start,
3298         zio_ddt_read_done,
3299         zio_ddt_write,
3300         zio_ddt_free,
3301         zio_gang_assemble,
3302         zio_gang_issue,
3303         zio_dva_allocate,
3304         zio_dva_free,
3305         zio_dva_claim,
3306         zio_ready,
3307         zio_vdev_io_start,
3308         zio_vdev_io_done,
3309         zio_vdev_io_assess,
3310         zio_checksum_verify,
3311         zio_done
3312 };
3313
3314 /* dnp is the dnode for zb1->zb_object */
3315 boolean_t
3316 zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_phys_t *zb1,
3317     const zbookmark_phys_t *zb2)
3318 {
3319         uint64_t zb1nextL0, zb2thisobj;
3320
3321         ASSERT(zb1->zb_objset == zb2->zb_objset);
3322         ASSERT(zb2->zb_level == 0);
3323
3324         /* The objset_phys_t isn't before anything. */
3325         if (dnp == NULL)
3326                 return (B_FALSE);
3327
3328         zb1nextL0 = (zb1->zb_blkid + 1) <<
3329             ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
3330
3331         zb2thisobj = zb2->zb_object ? zb2->zb_object :
3332             zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
3333
3334         if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
3335                 uint64_t nextobj = zb1nextL0 *
3336                     (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
3337                 return (nextobj <= zb2thisobj);
3338         }
3339
3340         if (zb1->zb_object < zb2thisobj)
3341                 return (B_TRUE);
3342         if (zb1->zb_object > zb2thisobj)
3343                 return (B_FALSE);
3344         if (zb2->zb_object == DMU_META_DNODE_OBJECT)
3345                 return (B_FALSE);
3346         return (zb1nextL0 <= zb2->zb_blkid);
3347 }
3348
#if defined(_KERNEL) && defined(HAVE_SPL)
/* Symbols exported from this file. */
EXPORT_SYMBOL(zio_type_name);
EXPORT_SYMBOL(zio_buf_alloc);
EXPORT_SYMBOL(zio_data_buf_alloc);
EXPORT_SYMBOL(zio_buf_free);
EXPORT_SYMBOL(zio_data_buf_free);

/* Integer module parameters, each registered with mode 0644. */
module_param(zio_delay_max, int, 0644);
MODULE_PARM_DESC(zio_delay_max, "Max zio millisec delay before posting event");

module_param(zio_requeue_io_start_cut_in_line, int, 0644);
MODULE_PARM_DESC(zio_requeue_io_start_cut_in_line, "Prioritize requeued I/O");

module_param(zfs_sync_pass_deferred_free, int, 0644);
MODULE_PARM_DESC(zfs_sync_pass_deferred_free,
    "Defer frees starting in this pass");

module_param(zfs_sync_pass_dont_compress, int, 0644);
MODULE_PARM_DESC(zfs_sync_pass_dont_compress,
    "Don't compress starting in this pass");

module_param(zfs_sync_pass_rewrite, int, 0644);
MODULE_PARM_DESC(zfs_sync_pass_rewrite,
    "Rewrite new bps starting in this pass");
#endif /* _KERNEL && HAVE_SPL */