module/zfs/zio.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  24  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  25  */
  26
  27 #include <sys/sysmacros.h>
  28 #include <sys/zfs_context.h>
  29 #include <sys/fm/fs/zfs.h>
  30 #include <sys/spa.h>
  31 #include <sys/txg.h>
  32 #include <sys/spa_impl.h>
  33 #include <sys/vdev_impl.h>
  34 #include <sys/zio_impl.h>
  35 #include <sys/zio_compress.h>
  36 #include <sys/zio_checksum.h>
  37 #include <sys/dmu_objset.h>
  38 #include <sys/arc.h>
  39 #include <sys/ddt.h>
  40 #include <sys/blkptr.h>
  41 #include <sys/zfeature.h>
  42
  43 /*
  44  * ==========================================================================
  45  * I/O type descriptions
  46  * ==========================================================================
  47  */
  48 const char *zio_type_name[ZIO_TYPES] = {
  49         "z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl"
  50 };
  51
  52 /*
  53  * ==========================================================================
  54  * I/O kmem caches
  55  * ==========================================================================
  56  */
  57 kmem_cache_t *zio_cache;
  58 kmem_cache_t *zio_link_cache;
  59 kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
  60 kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
  61 int zio_delay_max = ZIO_DELAY_MAX;
  62
  63 #define ZIO_PIPELINE_CONTINUE           0x100
  64 #define ZIO_PIPELINE_STOP               0x101
  65
  66 /*
  67  * The following actions directly effect the spa's sync-to-convergence logic.
  68  * The values below define the sync pass when we start performing the action.
  69  * Care should be taken when changing these values as they directly impact
  70  * spa_sync() performance. Tuning these values may introduce subtle performance
  71  * pathologies and should only be done in the context of performance analysis.
  72  * These tunables will eventually be removed and replaced with #defines once
  73  * enough analysis has been done to determine optimal values.
  74  *
  75  * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
  76  * regular blocks are not deferred.
  77  */
  78 int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
  79 int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
  80 int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
  81
  82 /*
  83  * An allocating zio is one that either currently has the DVA allocate
  84  * stage set or will have it later in its lifetime.
  85  */
  86 #define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)
  87
  88 int zio_requeue_io_start_cut_in_line = 1;
  89
  90 #ifdef ZFS_DEBUG
  91 int zio_buf_debug_limit = 16384;
  92 #else
  93 int zio_buf_debug_limit = 0;
  94 #endif
  95
  96 static inline void __zio_execute(zio_t *zio);
  97
  98 void
  99 zio_init(void)
 100 {
 101         size_t c;
 102         vmem_t *data_alloc_arena = NULL;
 103
 104         zio_cache = kmem_cache_create("zio_cache",
 105             sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 106         zio_link_cache = kmem_cache_create("zio_link_cache",
 107             sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 108
 109         /*
 110          * For small buffers, we want a cache for each multiple of
 111          * SPA_MINBLOCKSIZE.  For larger buffers, we want a cache
 112          * for each quarter-power of 2.
 113          */
 114         for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
 115                 size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
 116                 size_t p2 = size;
 117                 size_t align = 0;
 118                 size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0;
 119
 120 #ifdef _ILP32
 121                 /*
 122                  * Cache size limited to 1M on 32-bit platforms until ARC
 123                  * buffers no longer require virtual address space.
 124                  */
 125                 if (size > zfs_max_recordsize)
 126                         break;
 127 #endif
 128
 129                 while (!ISP2(p2))
 130                         p2 &= p2 - 1;
 131
 132 #ifndef _KERNEL
 133                 /*
 134                  * If we are using watchpoints, put each buffer on its own page,
 135                  * to eliminate the performance overhead of trapping to the
 136                  * kernel when modifying a non-watched buffer that shares the
 137                  * page with a watched buffer.
 138                  */
 139                 if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
 140                         continue;
 141 #endif
 142                 if (size <= 4 * SPA_MINBLOCKSIZE) {
 143                         align = SPA_MINBLOCKSIZE;
 144                 } else if (IS_P2ALIGNED(size, p2 >> 2)) {
 145                         align = MIN(p2 >> 2, PAGESIZE);
 146                 }
 147
 148                 if (align != 0) {
 149                         char name[36];
 150                         (void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
 151                         zio_buf_cache[c] = kmem_cache_create(name, size,
 152                             align, NULL, NULL, NULL, NULL, NULL, cflags);
 153
 154                         (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
 155                         zio_data_buf_cache[c] = kmem_cache_create(name, size,
 156                             align, NULL, NULL, NULL, NULL,
 157                             data_alloc_arena, cflags);
 158                 }
 159         }
 160
 161         while (--c != 0) {
 162                 ASSERT(zio_buf_cache[c] != NULL);
 163                 if (zio_buf_cache[c - 1] == NULL)
 164                         zio_buf_cache[c - 1] = zio_buf_cache[c];
 165
 166                 ASSERT(zio_data_buf_cache[c] != NULL);
 167                 if (zio_data_buf_cache[c - 1] == NULL)
 168                         zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
 169         }
 170
 171         zio_inject_init();
 172
 173         lz4_init();
 174 }
 175
 176 void
 177 zio_fini(void)
 178 {
 179         size_t c;
 180         kmem_cache_t *last_cache = NULL;
 181         kmem_cache_t *last_data_cache = NULL;
 182
 183         for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
 184 #ifdef _ILP32
 185                 /*
 186                  * Cache size limited to 1M on 32-bit platforms until ARC
 187                  * buffers no longer require virtual address space.
 188                  */
 189                 if (((c + 1) << SPA_MINBLOCKSHIFT) > zfs_max_recordsize)
 190                         break;
 191 #endif
 192                 if (zio_buf_cache[c] != last_cache) {
 193                         last_cache = zio_buf_cache[c];
 194                         kmem_cache_destroy(zio_buf_cache[c]);
 195                 }
 196                 zio_buf_cache[c] = NULL;
 197
 198                 if (zio_data_buf_cache[c] != last_data_cache) {
 199                         last_data_cache = zio_data_buf_cache[c];
 200                         kmem_cache_destroy(zio_data_buf_cache[c]);
 201                 }
 202                 zio_data_buf_cache[c] = NULL;
 203         }
 204
 205         kmem_cache_destroy(zio_link_cache);
 206         kmem_cache_destroy(zio_cache);
 207
 208         zio_inject_fini();
 209
 210         lz4_fini();
 211 }
 212
 213 /*
 214  * ==========================================================================
 215  * Allocate and free I/O buffers
 216  * ==========================================================================
 217  */
 218
 219 /*
 220  * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
 221  * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
 222  * useful to inspect ZFS metadata, but if possible, we should avoid keeping
 223  * excess / transient data in-core during a crashdump.
 224  */
 225 void *
 226 zio_buf_alloc(size_t size)
 227 {
 228         size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 229
 230         VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 231
 232         return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
 233 }
 234
 235 /*
 236  * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
 237  * crashdump if the kernel panics.  This exists so that we will limit the amount
 238  * of ZFS data that shows up in a kernel crashdump.  (Thus reducing the amount
 239  * of kernel heap dumped to disk when the kernel panics)
 240  */
 241 void *
 242 zio_data_buf_alloc(size_t size)
 243 {
 244         size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 245
 246         VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 247
 248         return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
 249 }
 250
 251 void
 252 zio_buf_free(void *buf, size_t size)
 253 {
 254         size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 255
 256         VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 257
 258         kmem_cache_free(zio_buf_cache[c], buf);
 259 }
 260
 261 void
 262 zio_data_buf_free(void *buf, size_t size)
 263 {
 264         size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 265
 266         VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 267
 268         kmem_cache_free(zio_data_buf_cache[c], buf);
 269 }
 270
 271 /*
 272  * ==========================================================================
 273  * Push and pop I/O transform buffers
 274  * ==========================================================================
 275  */
 276 static void
 277 zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
 278         zio_transform_func_t *transform)
 279 {
 280         zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
 281
 282         zt->zt_orig_data = zio->io_data;
 283         zt->zt_orig_size = zio->io_size;
 284         zt->zt_bufsize = bufsize;
 285         zt->zt_transform = transform;
 286
 287         zt->zt_next = zio->io_transform_stack;
 288         zio->io_transform_stack = zt;
 289
 290         zio->io_data = data;
 291         zio->io_size = size;
 292 }
 293
 294 static void
 295 zio_pop_transforms(zio_t *zio)
 296 {
 297         zio_transform_t *zt;
 298
 299         while ((zt = zio->io_transform_stack) != NULL) {
 300                 if (zt->zt_transform != NULL)
 301                         zt->zt_transform(zio,
 302                             zt->zt_orig_data, zt->zt_orig_size);
 303
 304                 if (zt->zt_bufsize != 0)
 305                         zio_buf_free(zio->io_data, zt->zt_bufsize);
 306
 307                 zio->io_data = zt->zt_orig_data;
 308                 zio->io_size = zt->zt_orig_size;
 309                 zio->io_transform_stack = zt->zt_next;
 310
 311                 kmem_free(zt, sizeof (zio_transform_t));
 312         }
 313 }
 314
 315 /*
 316  * ==========================================================================
 317  * I/O transform callbacks for subblocks and decompression
 318  * ==========================================================================
 319  */
 320 static void
 321 zio_subblock(zio_t *zio, void *data, uint64_t size)
 322 {
 323         ASSERT(zio->io_size > size);
 324
 325         if (zio->io_type == ZIO_TYPE_READ)
 326                 bcopy(zio->io_data, data, size);
 327 }
 328
 329 static void
 330 zio_decompress(zio_t *zio, void *data, uint64_t size)
 331 {
 332         if (zio->io_error == 0 &&
 333             zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
 334             zio->io_data, data, zio->io_size, size) != 0)
 335                 zio->io_error = SET_ERROR(EIO);
 336 }
 337
 338 /*
 339  * ==========================================================================
 340  * I/O parent/child relationships and pipeline interlocks
 341  * ==========================================================================
 342  */
 343 /*
 344  * NOTE - Callers to zio_walk_parents() and zio_walk_children must
 345  *        continue calling these functions until they return NULL.
 346  *        Otherwise, the next caller will pick up the list walk in
 347  *        some indeterminate state.  (Otherwise every caller would
 348  *        have to pass in a cookie to keep the state represented by
 349  *        io_walk_link, which gets annoying.)
 350  */
 351 zio_t *
 352 zio_walk_parents(zio_t *cio)
 353 {
 354         zio_link_t *zl = cio->io_walk_link;
 355         list_t *pl = &cio->io_parent_list;
 356
 357         zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
 358         cio->io_walk_link = zl;
 359
 360         if (zl == NULL)
 361                 return (NULL);
 362
 363         ASSERT(zl->zl_child == cio);
 364         return (zl->zl_parent);
 365 }
 366
 367 zio_t *
 368 zio_walk_children(zio_t *pio)
 369 {
 370         zio_link_t *zl = pio->io_walk_link;
 371         list_t *cl = &pio->io_child_list;
 372
 373         zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
 374         pio->io_walk_link = zl;
 375
 376         if (zl == NULL)
 377                 return (NULL);
 378
 379         ASSERT(zl->zl_parent == pio);
 380         return (zl->zl_child);
 381 }
 382
 383 zio_t *
 384 zio_unique_parent(zio_t *cio)
 385 {
 386         zio_t *pio = zio_walk_parents(cio);
 387
 388         VERIFY(zio_walk_parents(cio) == NULL);
 389         return (pio);
 390 }
 391
 392 void
 393 zio_add_child(zio_t *pio, zio_t *cio)
 394 {
 395         zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
 396         int w;
 397
 398         /*
 399          * Logical I/Os can have logical, gang, or vdev children.
 400          * Gang I/Os can have gang or vdev children.
 401          * Vdev I/Os can only have vdev children.
 402          * The following ASSERT captures all of these constraints.
 403          */
 404         ASSERT(cio->io_child_type <= pio->io_child_type);
 405
 406         zl->zl_parent = pio;
 407         zl->zl_child = cio;
 408
 409         mutex_enter(&cio->io_lock);
 410         mutex_enter(&pio->io_lock);
 411
 412         ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
 413
 414         for (w = 0; w < ZIO_WAIT_TYPES; w++)
 415                 pio->io_children[cio->io_child_type][w] += !cio->io_state[w];
 416
 417         list_insert_head(&pio->io_child_list, zl);
 418         list_insert_head(&cio->io_parent_list, zl);
 419
 420         pio->io_child_count++;
 421         cio->io_parent_count++;
 422
 423         mutex_exit(&pio->io_lock);
 424         mutex_exit(&cio->io_lock);
 425 }
 426
 427 static void
 428 zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
 429 {
 430         ASSERT(zl->zl_parent == pio);
 431         ASSERT(zl->zl_child == cio);
 432
 433         mutex_enter(&cio->io_lock);
 434         mutex_enter(&pio->io_lock);
 435
 436         list_remove(&pio->io_child_list, zl);
 437         list_remove(&cio->io_parent_list, zl);
 438
 439         pio->io_child_count--;
 440         cio->io_parent_count--;
 441
 442         mutex_exit(&pio->io_lock);
 443         mutex_exit(&cio->io_lock);
 444
 445         kmem_cache_free(zio_link_cache, zl);
 446 }
 447
 448 static boolean_t
 449 zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
 450 {
 451         uint64_t *countp = &zio->io_children[child][wait];
 452         boolean_t waiting = B_FALSE;
 453
 454         mutex_enter(&zio->io_lock);
 455         ASSERT(zio->io_stall == NULL);
 456         if (*countp != 0) {
 457                 zio->io_stage >>= 1;
 458                 zio->io_stall = countp;
 459                 waiting = B_TRUE;
 460         }
 461         mutex_exit(&zio->io_lock);
 462
 463         return (waiting);
 464 }
 465
 466 __attribute__((always_inline))
 467 static inline void
 468 zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
 469 {
 470         uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
 471         int *errorp = &pio->io_child_error[zio->io_child_type];
 472
 473         mutex_enter(&pio->io_lock);
 474         if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
 475                 *errorp = zio_worst_error(*errorp, zio->io_error);
 476         pio->io_reexecute |= zio->io_reexecute;
 477         ASSERT3U(*countp, >, 0);
 478
 479         (*countp)--;
 480
 481         if (*countp == 0 && pio->io_stall == countp) {
 482                 pio->io_stall = NULL;
 483                 mutex_exit(&pio->io_lock);
 484                 __zio_execute(pio);
 485         } else {
 486                 mutex_exit(&pio->io_lock);
 487         }
 488 }
 489
 490 static void
 491 zio_inherit_child_errors(zio_t *zio, enum zio_child c)
 492 {
 493         if (zio->io_child_error[c] != 0 && zio->io_error == 0)
 494                 zio->io_error = zio->io_child_error[c];
 495 }
 496
 497 /*
 498  * ==========================================================================
 499  * Create the various types of I/O (read, write, free, etc)
 500  * ==========================================================================
 501  */
 502 static zio_t *
 503 zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 504     void *data, uint64_t size, zio_done_func_t *done, void *private,
 505     zio_type_t type, zio_priority_t priority, enum zio_flag flags,
 506     vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb,
 507     enum zio_stage stage, enum zio_stage pipeline)
 508 {
 509         zio_t *zio;
 510
 511         ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
 512         ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
 513         ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
 514
 515         ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
 516         ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
 517         ASSERT(vd || stage == ZIO_STAGE_OPEN);
 518
 519         zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
 520         bzero(zio, sizeof (zio_t));
 521
 522         mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
 523         cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
 524
 525         list_create(&zio->io_parent_list, sizeof (zio_link_t),
 526             offsetof(zio_link_t, zl_parent_node));
 527         list_create(&zio->io_child_list, sizeof (zio_link_t),
 528             offsetof(zio_link_t, zl_child_node));
 529
 530         if (vd != NULL)
 531                 zio->io_child_type = ZIO_CHILD_VDEV;
 532         else if (flags & ZIO_FLAG_GANG_CHILD)
 533                 zio->io_child_type = ZIO_CHILD_GANG;
 534         else if (flags & ZIO_FLAG_DDT_CHILD)
 535                 zio->io_child_type = ZIO_CHILD_DDT;
 536         else
 537                 zio->io_child_type = ZIO_CHILD_LOGICAL;
 538
 539         if (bp != NULL) {
 540                 zio->io_bp = (blkptr_t *)bp;
 541                 zio->io_bp_copy = *bp;
 542                 zio->io_bp_orig = *bp;
 543                 if (type != ZIO_TYPE_WRITE ||
 544                     zio->io_child_type == ZIO_CHILD_DDT)
 545                         zio->io_bp = &zio->io_bp_copy;  /* so caller can free */
 546                 if (zio->io_child_type == ZIO_CHILD_LOGICAL)
 547                         zio->io_logical = zio;
 548                 if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
 549                         pipeline |= ZIO_GANG_STAGES;
 550         }
 551
 552         zio->io_spa = spa;
 553         zio->io_txg = txg;
 554         zio->io_done = done;
 555         zio->io_private = private;
 556         zio->io_type = type;
 557         zio->io_priority = priority;
 558         zio->io_vd = vd;
 559         zio->io_offset = offset;
 560         zio->io_orig_data = zio->io_data = data;
 561         zio->io_orig_size = zio->io_size = size;
 562         zio->io_orig_flags = zio->io_flags = flags;
 563         zio->io_orig_stage = zio->io_stage = stage;
 564         zio->io_orig_pipeline = zio->io_pipeline = pipeline;
 565
 566         zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
 567         zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);
 568
 569         if (zb != NULL)
 570                 zio->io_bookmark = *zb;
 571
 572         if (pio != NULL) {
 573                 if (zio->io_logical == NULL)
 574                         zio->io_logical = pio->io_logical;
 575                 if (zio->io_child_type == ZIO_CHILD_GANG)
 576                         zio->io_gang_leader = pio->io_gang_leader;
 577                 zio_add_child(pio, zio);
 578         }
 579
 580         taskq_init_ent(&zio->io_tqent);
 581
 582         return (zio);
 583 }
 584
 585 static void
 586 zio_destroy(zio_t *zio)
 587 {
 588         list_destroy(&zio->io_parent_list);
 589         list_destroy(&zio->io_child_list);
 590         mutex_destroy(&zio->io_lock);
 591         cv_destroy(&zio->io_cv);
 592         kmem_cache_free(zio_cache, zio);
 593 }
 594
 595 zio_t *
 596 zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
 597     void *private, enum zio_flag flags)
 598 {
 599         zio_t *zio;
 600
 601         zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
 602             ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
 603             ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);
 604
 605         return (zio);
 606 }
 607
 608 zio_t *
 609 zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
 610 {
 611         return (zio_null(NULL, spa, NULL, done, private, flags));
 612 }
 613
 614 void
 615 zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp)
 616 {
 617         int i;
 618
 619         if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) {
 620                 zfs_panic_recover("blkptr at %p has invalid TYPE %llu",
 621                     bp, (longlong_t)BP_GET_TYPE(bp));
 622         }
 623         if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS ||
 624             BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) {
 625                 zfs_panic_recover("blkptr at %p has invalid CHECKSUM %llu",
 626                     bp, (longlong_t)BP_GET_CHECKSUM(bp));
 627         }
 628         if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS ||
 629             BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) {
 630                 zfs_panic_recover("blkptr at %p has invalid COMPRESS %llu",
 631                     bp, (longlong_t)BP_GET_COMPRESS(bp));
 632         }
 633         if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) {
 634                 zfs_panic_recover("blkptr at %p has invalid LSIZE %llu",
 635                     bp, (longlong_t)BP_GET_LSIZE(bp));
 636         }
 637         if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) {
 638                 zfs_panic_recover("blkptr at %p has invalid PSIZE %llu",
 639                     bp, (longlong_t)BP_GET_PSIZE(bp));
 640         }
 641
 642         if (BP_IS_EMBEDDED(bp)) {
 643                 if (BPE_GET_ETYPE(bp) > NUM_BP_EMBEDDED_TYPES) {
 644                         zfs_panic_recover("blkptr at %p has invalid ETYPE %llu",
 645                             bp, (longlong_t)BPE_GET_ETYPE(bp));
 646                 }
 647         }
 648
 649         /*
 650          * Pool-specific checks.
 651          *
 652          * Note: it would be nice to verify that the blk_birth and
 653          * BP_PHYSICAL_BIRTH() are not too large.  However, spa_freeze()
 654          * allows the birth time of log blocks (and dmu_sync()-ed blocks
 655          * that are in the log) to be arbitrarily large.
 656          */
 657         for (i = 0; i < BP_GET_NDVAS(bp); i++) {
 658                 uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]);
 659                 vdev_t *vd;
 660                 uint64_t offset, asize;
 661                 if (vdevid >= spa->spa_root_vdev->vdev_children) {
 662                         zfs_panic_recover("blkptr at %p DVA %u has invalid "
 663                             "VDEV %llu",
 664                             bp, i, (longlong_t)vdevid);
 665                 }
 666                 vd = spa->spa_root_vdev->vdev_child[vdevid];
 667                 if (vd == NULL) {
 668                         zfs_panic_recover("blkptr at %p DVA %u has invalid "
 669                             "VDEV %llu",
 670                             bp, i, (longlong_t)vdevid);
 671                 }
 672                 if (vd->vdev_ops == &vdev_hole_ops) {
 673                         zfs_panic_recover("blkptr at %p DVA %u has hole "
 674                             "VDEV %llu",
 675                             bp, i, (longlong_t)vdevid);
 676
 677                 }
 678                 if (vd->vdev_ops == &vdev_missing_ops) {
 679                         /*
 680                          * "missing" vdevs are valid during import, but we
 681                          * don't have their detailed info (e.g. asize), so
 682                          * we can't perform any more checks on them.
 683                          */
 684                         continue;
 685                 }
 686                 offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
 687                 asize = DVA_GET_ASIZE(&bp->blk_dva[i]);
 688                 if (BP_IS_GANG(bp))
 689                         asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
 690                 if (offset + asize > vd->vdev_asize) {
 691                         zfs_panic_recover("blkptr at %p DVA %u has invalid "
 692                             "OFFSET %llu",
 693                             bp, i, (longlong_t)offset);
 694                 }
 695         }
 696 }
 697
 698 zio_t *
 699 zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
 700     void *data, uint64_t size, zio_done_func_t *done, void *private,
 701     zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
 702 {
 703         zio_t *zio;
 704
 705         zfs_blkptr_verify(spa, bp);
 706
 707         zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
 708             data, size, done, private,
 709             ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
 710             ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
 711             ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);
 712
 713         return (zio);
 714 }
 715
 716 zio_t *
 717 zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
 718     void *data, uint64_t size, const zio_prop_t *zp,
 719     zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done,
 720     void *private,
 721     zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
 722 {
 723         zio_t *zio;
 724
 725         ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
 726             zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
 727             zp->zp_compress >= ZIO_COMPRESS_OFF &&
 728             zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
 729             DMU_OT_IS_VALID(zp->zp_type) &&
 730             zp->zp_level < 32 &&
 731             zp->zp_copies > 0 &&
 732             zp->zp_copies <= spa_max_replication(spa));
 733
 734         zio = zio_create(pio, spa, txg, bp, data, size, done, private,
 735             ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
 736             ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
 737             ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
 738
 739         zio->io_ready = ready;
 740         zio->io_physdone = physdone;
 741         zio->io_prop = *zp;
 742
 743         /*
 744          * Data can be NULL if we are going to call zio_write_override() to
 745          * provide the already-allocated BP.  But we may need the data to
 746          * verify a dedup hit (if requested).  In this case, don't try to
 747          * dedup (just take the already-allocated BP verbatim).
 748          */
 749         if (data == NULL && zio->io_prop.zp_dedup_verify) {
 750                 zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
 751         }
 752
 753         return (zio);
 754 }
 755
 756 zio_t *
 757 zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
 758     uint64_t size, zio_done_func_t *done, void *private,
 759     zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb)
 760 {
 761         zio_t *zio;
 762
 763         zio = zio_create(pio, spa, txg, bp, data, size, done, private,
 764             ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
 765             ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
 766
 767         return (zio);
 768 }
 769
 770 void
 771 zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
 772 {
 773         ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 774         ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 775         ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
 776         ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
 777
 778         /*
 779          * We must reset the io_prop to match the values that existed
 780          * when the bp was first written by dmu_sync() keeping in mind
 781          * that nopwrite and dedup are mutually exclusive.
 782          */
 783         zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
 784         zio->io_prop.zp_nopwrite = nopwrite;
 785         zio->io_prop.zp_copies = copies;
 786         zio->io_bp_override = bp;
 787 }
 788
 789 void
 790 zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
 791 {
 792
 793         /*
 794          * The check for EMBEDDED is a performance optimization.  We
 795          * process the free here (by ignoring it) rather than
 796          * putting it on the list and then processing it in zio_free_sync().
 797          */
 798         if (BP_IS_EMBEDDED(bp))
 799                 return;
 800         metaslab_check_free(spa, bp);
 801
 802         /*
 803          * Frees that are for the currently-syncing txg, are not going to be
 804          * deferred, and which will not need to do a read (i.e. not GANG or
 805          * DEDUP), can be processed immediately.  Otherwise, put them on the
 806          * in-memory list for later processing.
 807          */
 808         if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
 809             txg != spa->spa_syncing_txg ||
 810             spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) {
 811                 bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
 812         } else {
 813                 VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp, 0)));
 814         }
 815 }
 816
 817 zio_t *
 818 zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 819     enum zio_flag flags)
 820 {
 821         zio_t *zio;
 822         enum zio_stage stage = ZIO_FREE_PIPELINE;
 823
 824         ASSERT(!BP_IS_HOLE(bp));
 825         ASSERT(spa_syncing_txg(spa) == txg);
 826         ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);
 827
 828         if (BP_IS_EMBEDDED(bp))
 829                 return (zio_null(pio, spa, NULL, NULL, NULL, 0));
 830
 831         metaslab_check_free(spa, bp);
 832         arc_freed(spa, bp);
 833
 834         /*
 835          * GANG and DEDUP blocks can induce a read (for the gang block header,
 836          * or the DDT), so issue them asynchronously so that this thread is
 837          * not tied up.
 838          */
 839         if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
 840                 stage |= ZIO_STAGE_ISSUE_ASYNC;
 841
 842         zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
 843             NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags,
 844             NULL, 0, NULL, ZIO_STAGE_OPEN, stage);
 845
 846         return (zio);
 847 }
 848
 849 zio_t *
 850 zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 851     zio_done_func_t *done, void *private, enum zio_flag flags)
 852 {
 853         zio_t *zio;
 854
 855         dprintf_bp(bp, "claiming in txg %llu", txg);
 856
 857         if (BP_IS_EMBEDDED(bp))
 858                 return (zio_null(pio, spa, NULL, NULL, NULL, 0));
 859
 860         /*
 861          * A claim is an allocation of a specific block.  Claims are needed
 862          * to support immediate writes in the intent log.  The issue is that
 863          * immediate writes contain committed data, but in a txg that was
 864          * *not* committed.  Upon opening the pool after an unclean shutdown,
 865          * the intent log claims all blocks that contain immediate write data
 866          * so that the SPA knows they're in use.
 867          *
 868          * All claims *must* be resolved in the first txg -- before the SPA
 869          * starts allocating blocks -- so that nothing is allocated twice.
 870          * If txg == 0 we just verify that the block is claimable.
 871          */
 872         ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
 873         ASSERT(txg == spa_first_txg(spa) || txg == 0);
 874         ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));       /* zdb(1M) */
 875
 876         zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
 877             done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
 878             NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
 879
 880         return (zio);
 881 }
 882
 883 zio_t *
 884 zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
 885     zio_done_func_t *done, void *private, enum zio_flag flags)
 886 {
 887         zio_t *zio;
 888         int c;
 889
 890         if (vd->vdev_children == 0) {
 891                 zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
 892                     ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
 893                     ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
 894
 895                 zio->io_cmd = cmd;
 896         } else {
 897                 zio = zio_null(pio, spa, NULL, NULL, NULL, flags);
 898
 899                 for (c = 0; c < vd->vdev_children; c++)
 900                         zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
 901                             done, private, flags));
 902         }
 903
 904         return (zio);
 905 }
 906
 907 zio_t *
 908 zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
 909     void *data, int checksum, zio_done_func_t *done, void *private,
 910     zio_priority_t priority, enum zio_flag flags, boolean_t labels)
 911 {
 912         zio_t *zio;
 913
 914         ASSERT(vd->vdev_children == 0);
 915         ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
 916             offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
 917         ASSERT3U(offset + size, <=, vd->vdev_psize);
 918
 919         zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
 920             ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
 921             NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
 922
 923         zio->io_prop.zp_checksum = checksum;
 924
 925         return (zio);
 926 }
 927
 928 zio_t *
 929 zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
 930     void *data, int checksum, zio_done_func_t *done, void *private,
 931     zio_priority_t priority, enum zio_flag flags, boolean_t labels)
 932 {
 933         zio_t *zio;
 934
 935         ASSERT(vd->vdev_children == 0);
 936         ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
 937             offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
 938         ASSERT3U(offset + size, <=, vd->vdev_psize);
 939
 940         zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
 941             ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
 942             NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
 943
 944         zio->io_prop.zp_checksum = checksum;
 945
 946         if (zio_checksum_table[checksum].ci_eck) {
 947                 /*
 948                  * zec checksums are necessarily destructive -- they modify
 949                  * the end of the write buffer to hold the verifier/checksum.
 950                  * Therefore, we must make a local copy in case the data is
 951                  * being written to multiple places in parallel.
 952                  */
 953                 void *wbuf = zio_buf_alloc(size);
 954                 bcopy(data, wbuf, size);
 955                 zio_push_transform(zio, wbuf, size, size, NULL);
 956         }
 957
 958         return (zio);
 959 }
 960
 961 /*
 962  * Create a child I/O to do some work for us.
 963  */
 964 zio_t *
 965 zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
 966         void *data, uint64_t size, int type, zio_priority_t priority,
 967         enum zio_flag flags, zio_done_func_t *done, void *private)
 968 {
 969         enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
 970         zio_t *zio;
 971
 972         ASSERT(vd->vdev_parent ==
 973             (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));
 974
 975         if (type == ZIO_TYPE_READ && bp != NULL) {
 976                 /*
 977                  * If we have the bp, then the child should perform the
 978                  * checksum and the parent need not.  This pushes error
 979                  * detection as close to the leaves as possible and
 980                  * eliminates redundant checksums in the interior nodes.
 981                  */
 982                 pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
 983                 pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
 984         }
 985
 986         if (vd->vdev_children == 0)
 987                 offset += VDEV_LABEL_START_SIZE;
 988
 989         flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;
 990
 991         /*
 992          * If we've decided to do a repair, the write is not speculative --
 993          * even if the original read was.
 994          */
 995         if (flags & ZIO_FLAG_IO_REPAIR)
 996                 flags &= ~ZIO_FLAG_SPECULATIVE;
 997
 998         zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
 999             done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
1000             ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
1001
1002         zio->io_physdone = pio->io_physdone;
1003         if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
1004                 zio->io_logical->io_phys_children++;
1005
1006         return (zio);
1007 }
1008
1009 zio_t *
1010 zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
1011         int type, zio_priority_t priority, enum zio_flag flags,
1012         zio_done_func_t *done, void *private)
1013 {
1014         zio_t *zio;
1015
1016         ASSERT(vd->vdev_ops->vdev_op_leaf);
1017
1018         zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
1019             data, size, done, private, type, priority,
1020             flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
1021             vd, offset, NULL,
1022             ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
1023
1024         return (zio);
1025 }
1026
1027 void
1028 zio_flush(zio_t *zio, vdev_t *vd)
1029 {
1030         zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
1031             NULL, NULL,
1032             ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
1033 }
1034
1035 void
1036 zio_shrink(zio_t *zio, uint64_t size)
1037 {
1038         ASSERT(zio->io_executor == NULL);
1039         ASSERT(zio->io_orig_size == zio->io_size);
1040         ASSERT(size <= zio->io_size);
1041
1042         /*
1043          * We don't shrink for raidz because of problems with the
1044          * reconstruction when reading back less than the block size.
1045          * Note, BP_IS_RAIDZ() assumes no compression.
1046          */
1047         ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
1048         if (!BP_IS_RAIDZ(zio->io_bp))
1049                 zio->io_orig_size = zio->io_size = size;
1050 }
1051
1052 /*
1053  * ==========================================================================
1054  * Prepare to read and write logical blocks
1055  * ==========================================================================
1056  */
1057
1058 static int
1059 zio_read_bp_init(zio_t *zio)
1060 {
1061         blkptr_t *bp = zio->io_bp;
1062
1063         if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
1064             zio->io_child_type == ZIO_CHILD_LOGICAL &&
1065             !(zio->io_flags & ZIO_FLAG_RAW)) {
1066                 uint64_t psize =
1067                     BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
1068                 void *cbuf = zio_buf_alloc(psize);
1069
1070                 zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
1071         }
1072
1073         if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
1074                 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1075                 decode_embedded_bp_compressed(bp, zio->io_data);
1076         } else {
1077                 ASSERT(!BP_IS_EMBEDDED(bp));
1078         }
1079
1080         if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
1081                 zio->io_flags |= ZIO_FLAG_DONT_CACHE;
1082
1083         if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
1084                 zio->io_flags |= ZIO_FLAG_DONT_CACHE;
1085
1086         if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
1087                 zio->io_pipeline = ZIO_DDT_READ_PIPELINE;
1088
1089         return (ZIO_PIPELINE_CONTINUE);
1090 }
1091
1092 static int
1093 zio_write_bp_init(zio_t *zio)
1094 {
1095         spa_t *spa = zio->io_spa;
1096         zio_prop_t *zp = &zio->io_prop;
1097         enum zio_compress compress = zp->zp_compress;
1098         blkptr_t *bp = zio->io_bp;
1099         uint64_t lsize = zio->io_size;
1100         uint64_t psize = lsize;
1101         int pass = 1;
1102
1103         /*
1104          * If our children haven't all reached the ready stage,
1105          * wait for them and then repeat this pipeline stage.
1106          */
1107         if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
1108             zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
1109                 return (ZIO_PIPELINE_STOP);
1110
1111         if (!IO_IS_ALLOCATING(zio))
1112                 return (ZIO_PIPELINE_CONTINUE);
1113
1114         ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
1115
1116         if (zio->io_bp_override) {
1117                 ASSERT(bp->blk_birth != zio->io_txg);
1118                 ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
1119
1120                 *bp = *zio->io_bp_override;
1121                 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1122
1123                 if (BP_IS_EMBEDDED(bp))
1124                         return (ZIO_PIPELINE_CONTINUE);
1125
1126                 /*
1127                  * If we've been overridden and nopwrite is set then
1128                  * set the flag accordingly to indicate that a nopwrite
1129                  * has already occurred.
1130                  */
1131                 if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
1132                         ASSERT(!zp->zp_dedup);
1133                         zio->io_flags |= ZIO_FLAG_NOPWRITE;
1134                         return (ZIO_PIPELINE_CONTINUE);
1135                 }
1136
1137                 ASSERT(!zp->zp_nopwrite);
1138
1139                 if (BP_IS_HOLE(bp) || !zp->zp_dedup)
1140                         return (ZIO_PIPELINE_CONTINUE);
1141
1142                 ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup ||
1143                     zp->zp_dedup_verify);
1144
1145                 if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
1146                         BP_SET_DEDUP(bp, 1);
1147                         zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
1148                         return (ZIO_PIPELINE_CONTINUE);
1149                 }
1150         }
1151
1152         if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
1153                 /*
1154                  * We're rewriting an existing block, which means we're
1155                  * working on behalf of spa_sync().  For spa_sync() to
1156                  * converge, it must eventually be the case that we don't
1157                  * have to allocate new blocks.  But compression changes
1158                  * the blocksize, which forces a reallocate, and makes
1159                  * convergence take longer.  Therefore, after the first
1160                  * few passes, stop compressing to ensure convergence.
1161                  */
1162                 pass = spa_sync_pass(spa);
1163
1164                 ASSERT(zio->io_txg == spa_syncing_txg(spa));
1165                 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1166                 ASSERT(!BP_GET_DEDUP(bp));
1167
1168                 if (pass >= zfs_sync_pass_dont_compress)
1169                         compress = ZIO_COMPRESS_OFF;
1170
1171                 /* Make sure someone doesn't change their mind on overwrites */
1172                 ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp),
1173                     spa_max_replication(spa)) == BP_GET_NDVAS(bp));
1174         }
1175
1176         if (compress != ZIO_COMPRESS_OFF) {
1177                 void *cbuf = zio_buf_alloc(lsize);
1178                 psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
1179                 if (psize == 0 || psize == lsize) {
1180                         compress = ZIO_COMPRESS_OFF;
1181                         zio_buf_free(cbuf, lsize);
1182                 } else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE &&
1183                     zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
1184                     spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
1185                         encode_embedded_bp_compressed(bp,
1186                             cbuf, compress, lsize, psize);
1187                         BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
1188                         BP_SET_TYPE(bp, zio->io_prop.zp_type);
1189                         BP_SET_LEVEL(bp, zio->io_prop.zp_level);
1190                         zio_buf_free(cbuf, lsize);
1191                         bp->blk_birth = zio->io_txg;
1192                         zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1193                         ASSERT(spa_feature_is_active(spa,
1194                             SPA_FEATURE_EMBEDDED_DATA));
1195                         return (ZIO_PIPELINE_CONTINUE);
1196                 } else {
1197                         /*
1198                          * Round up compressed size up to the ashift
1199                          * of the smallest-ashift device, and zero the tail.
1200                          * This ensures that the compressed size of the BP
1201                          * (and thus compressratio property) are correct,
1202                          * in that we charge for the padding used to fill out
1203                          * the last sector.
1204                          */
1205                         size_t rounded;
1206
1207                         ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
1208
1209                         rounded = (size_t)P2ROUNDUP(psize,
1210                             1ULL << spa->spa_min_ashift);
1211                         if (rounded >= lsize) {
1212                                 compress = ZIO_COMPRESS_OFF;
1213                                 zio_buf_free(cbuf, lsize);
1214                                 psize = lsize;
1215                         } else {
1216                                 bzero((char *)cbuf + psize, rounded - psize);
1217                                 psize = rounded;
1218                                 zio_push_transform(zio, cbuf,
1219                                     psize, lsize, NULL);
1220                         }
1221                 }
1222         }
1223
1224         /*
1225          * The final pass of spa_sync() must be all rewrites, but the first
1226          * few passes offer a trade-off: allocating blocks defers convergence,
1227          * but newly allocated blocks are sequential, so they can be written
1228          * to disk faster.  Therefore, we allow the first few passes of
1229          * spa_sync() to allocate new blocks, but force rewrites after that.
1230          * There should only be a handful of blocks after pass 1 in any case.
1231          */
1232         if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
1233             BP_GET_PSIZE(bp) == psize &&
1234             pass >= zfs_sync_pass_rewrite) {
1235                 enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
1236                 ASSERT(psize != 0);
1237                 zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
1238                 zio->io_flags |= ZIO_FLAG_IO_REWRITE;
1239         } else {
1240                 BP_ZERO(bp);
1241                 zio->io_pipeline = ZIO_WRITE_PIPELINE;
1242         }
1243
1244         if (psize == 0) {
1245                 if (zio->io_bp_orig.blk_birth != 0 &&
1246                     spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
1247                         BP_SET_LSIZE(bp, lsize);
1248                         BP_SET_TYPE(bp, zp->zp_type);
1249                         BP_SET_LEVEL(bp, zp->zp_level);
1250                         BP_SET_BIRTH(bp, zio->io_txg, 0);
1251                 }
1252                 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1253         } else {
1254                 ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
1255                 BP_SET_LSIZE(bp, lsize);
1256                 BP_SET_TYPE(bp, zp->zp_type);
1257                 BP_SET_LEVEL(bp, zp->zp_level);
1258                 BP_SET_PSIZE(bp, psize);
1259                 BP_SET_COMPRESS(bp, compress);
1260                 BP_SET_CHECKSUM(bp, zp->zp_checksum);
1261                 BP_SET_DEDUP(bp, zp->zp_dedup);
1262                 BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
1263                 if (zp->zp_dedup) {
1264                         ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1265                         ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
1266                         zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
1267                 }
1268                 if (zp->zp_nopwrite) {
1269                         ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1270                         ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
1271                         zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
1272                 }
1273         }
1274
1275         return (ZIO_PIPELINE_CONTINUE);
1276 }
1277
1278 static int
1279 zio_free_bp_init(zio_t *zio)
1280 {
1281         blkptr_t *bp = zio->io_bp;
1282
1283         if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
1284                 if (BP_GET_DEDUP(bp))
1285                         zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
1286         }
1287
1288         return (ZIO_PIPELINE_CONTINUE);
1289 }
1290
1291 /*
1292  * ==========================================================================
1293  * Execute the I/O pipeline
1294  * ==========================================================================
1295  */
1296
1297 static void
1298 zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
1299 {
1300         spa_t *spa = zio->io_spa;
1301         zio_type_t t = zio->io_type;
1302         int flags = (cutinline ? TQ_FRONT : 0);
1303
1304         /*
1305          * If we're a config writer or a probe, the normal issue and
1306          * interrupt threads may all be blocked waiting for the config lock.
1307          * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
1308          */
1309         if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
1310                 t = ZIO_TYPE_NULL;
1311
1312         /*
1313          * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
1314          */
1315         if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
1316                 t = ZIO_TYPE_NULL;
1317
1318         /*
1319          * If this is a high priority I/O, then use the high priority taskq if
1320          * available.
1321          */
1322         if (zio->io_priority == ZIO_PRIORITY_NOW &&
1323             spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
1324                 q++;
1325
1326         ASSERT3U(q, <, ZIO_TASKQ_TYPES);
1327
1328         /*
1329          * NB: We are assuming that the zio can only be dispatched
1330          * to a single taskq at a time.  It would be a grievous error
1331          * to dispatch the zio to another taskq at the same time.
1332          */
1333         ASSERT(taskq_empty_ent(&zio->io_tqent));
1334         spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
1335             flags, &zio->io_tqent);
1336 }
1337
1338 static boolean_t
1339 zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
1340 {
1341         kthread_t *executor = zio->io_executor;
1342         spa_t *spa = zio->io_spa;
1343         zio_type_t t;
1344
1345         for (t = 0; t < ZIO_TYPES; t++) {
1346                 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
1347                 uint_t i;
1348                 for (i = 0; i < tqs->stqs_count; i++) {
1349                         if (taskq_member(tqs->stqs_taskq[i], executor))
1350                                 return (B_TRUE);
1351                 }
1352         }
1353
1354         return (B_FALSE);
1355 }
1356
1357 static int
1358 zio_issue_async(zio_t *zio)
1359 {
1360         zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
1361
1362         return (ZIO_PIPELINE_STOP);
1363 }
1364
1365 void
1366 zio_interrupt(zio_t *zio)
1367 {
1368         zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
1369 }
1370
1371 /*
1372  * Execute the I/O pipeline until one of the following occurs:
1373  * (1) the I/O completes; (2) the pipeline stalls waiting for
1374  * dependent child I/Os; (3) the I/O issues, so we're waiting
1375  * for an I/O completion interrupt; (4) the I/O is delegated by
1376  * vdev-level caching or aggregation; (5) the I/O is deferred
1377  * due to vdev-level queueing; (6) the I/O is handed off to
1378  * another thread.  In all cases, the pipeline stops whenever
1379  * there's no CPU work; it never burns a thread in cv_wait_io().
1380  *
1381  * There's no locking on io_stage because there's no legitimate way
1382  * for multiple threads to be attempting to process the same I/O.
1383  */
1384 static zio_pipe_stage_t *zio_pipeline[];
1385
1386 /*
1387  * zio_execute() is a wrapper around the static function
1388  * __zio_execute() so that we can force  __zio_execute() to be
1389  * inlined.  This reduces stack overhead which is important
1390  * because __zio_execute() is called recursively in several zio
1391  * code paths.  zio_execute() itself cannot be inlined because
1392  * it is externally visible.
1393  */
1394 void
1395 zio_execute(zio_t *zio)
1396 {
1397         fstrans_cookie_t cookie;
1398
1399         cookie = spl_fstrans_mark();
1400         __zio_execute(zio);
1401         spl_fstrans_unmark(cookie);
1402 }
1403
1404 __attribute__((always_inline))
1405 static inline void
1406 __zio_execute(zio_t *zio)
1407 {
1408         zio->io_executor = curthread;
1409
1410         while (zio->io_stage < ZIO_STAGE_DONE) {
1411                 enum zio_stage pipeline = zio->io_pipeline;
1412                 enum zio_stage stage = zio->io_stage;
1413                 dsl_pool_t *dp;
1414                 boolean_t cut;
1415                 int rv;
1416
1417                 ASSERT(!MUTEX_HELD(&zio->io_lock));
1418                 ASSERT(ISP2(stage));
1419                 ASSERT(zio->io_stall == NULL);
1420
1421                 do {
1422                         stage <<= 1;
1423                 } while ((stage & pipeline) == 0);
1424
1425                 ASSERT(stage <= ZIO_STAGE_DONE);
1426
1427                 dp = spa_get_dsl(zio->io_spa);
1428                 cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
1429                     zio_requeue_io_start_cut_in_line : B_FALSE;
1430
1431                 /*
1432                  * If we are in interrupt context and this pipeline stage
1433                  * will grab a config lock that is held across I/O,
1434                  * or may wait for an I/O that needs an interrupt thread
1435                  * to complete, issue async to avoid deadlock.
1436                  *
1437                  * For VDEV_IO_START, we cut in line so that the io will
1438                  * be sent to disk promptly.
1439                  */
1440                 if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
1441                     zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
1442                         zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
1443                         return;
1444                 }
1445
1446                 /*
1447                  * If we executing in the context of the tx_sync_thread,
1448                  * or we are performing pool initialization outside of a
1449                  * zio_taskq[ZIO_TASKQ_ISSUE|ZIO_TASKQ_ISSUE_HIGH] context.
1450                  * Then issue the zio asynchronously to minimize stack usage
1451                  * for these deep call paths.
1452                  */
1453                 if ((dp && curthread == dp->dp_tx.tx_sync_thread) ||
1454                     (dp && spa_is_initializing(dp->dp_spa) &&
1455                     !zio_taskq_member(zio, ZIO_TASKQ_ISSUE) &&
1456                     !zio_taskq_member(zio, ZIO_TASKQ_ISSUE_HIGH))) {
1457                         zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
1458                         return;
1459                 }
1460
1461                 zio->io_stage = stage;
1462                 rv = zio_pipeline[highbit64(stage) - 1](zio);
1463
1464                 if (rv == ZIO_PIPELINE_STOP)
1465                         return;
1466
1467                 ASSERT(rv == ZIO_PIPELINE_CONTINUE);
1468         }
1469 }
1470
1471
1472 /*
1473  * ==========================================================================
1474  * Initiate I/O, either sync or async
1475  * ==========================================================================
1476  */
1477 int
1478 zio_wait(zio_t *zio)
1479 {
1480         int error;
1481
1482         ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
1483         ASSERT(zio->io_executor == NULL);
1484
1485         zio->io_waiter = curthread;
1486
1487         __zio_execute(zio);
1488
1489         mutex_enter(&zio->io_lock);
1490         while (zio->io_executor != NULL)
1491                 cv_wait_io(&zio->io_cv, &zio->io_lock);
1492         mutex_exit(&zio->io_lock);
1493
1494         error = zio->io_error;
1495         zio_destroy(zio);
1496
1497         return (error);
1498 }
1499
1500 void
1501 zio_nowait(zio_t *zio)
1502 {
1503         ASSERT(zio->io_executor == NULL);
1504
1505         if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
1506             zio_unique_parent(zio) == NULL) {
1507                 zio_t *pio;
1508
1509                 /*
1510                  * This is a logical async I/O with no parent to wait for it.
1511                  * We add it to the spa_async_root_zio "Godfather" I/O which
1512                  * will ensure they complete prior to unloading the pool.
1513                  */
1514                 spa_t *spa = zio->io_spa;
1515                 kpreempt_disable();
1516                 pio = spa->spa_async_zio_root[CPU_SEQID];
1517                 kpreempt_enable();
1518
1519                 zio_add_child(pio, zio);
1520         }
1521
1522         __zio_execute(zio);
1523 }
1524
1525 /*
1526  * ==========================================================================
1527  * Reexecute or suspend/resume failed I/O
1528  * ==========================================================================
1529  */
1530
1531 static void
1532 zio_reexecute(zio_t *pio)
1533 {
1534         zio_t *cio, *cio_next;
1535         int c, w;
1536
1537         ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
1538         ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
1539         ASSERT(pio->io_gang_leader == NULL);
1540         ASSERT(pio->io_gang_tree == NULL);
1541
1542         pio->io_flags = pio->io_orig_flags;
1543         pio->io_stage = pio->io_orig_stage;
1544         pio->io_pipeline = pio->io_orig_pipeline;
1545         pio->io_reexecute = 0;
1546         pio->io_flags |= ZIO_FLAG_REEXECUTED;
1547         pio->io_error = 0;
1548         for (w = 0; w < ZIO_WAIT_TYPES; w++)
1549                 pio->io_state[w] = 0;
1550         for (c = 0; c < ZIO_CHILD_TYPES; c++)
1551                 pio->io_child_error[c] = 0;
1552
1553         if (IO_IS_ALLOCATING(pio))
1554                 BP_ZERO(pio->io_bp);
1555
1556         /*
1557          * As we reexecute pio's children, new children could be created.
1558          * New children go to the head of pio's io_child_list, however,
1559          * so we will (correctly) not reexecute them.  The key is that
1560          * the remainder of pio's io_child_list, from 'cio_next' onward,
1561          * cannot be affected by any side effects of reexecuting 'cio'.
1562          */
1563         for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
1564                 cio_next = zio_walk_children(pio);
1565                 mutex_enter(&pio->io_lock);
1566                 for (w = 0; w < ZIO_WAIT_TYPES; w++)
1567                         pio->io_children[cio->io_child_type][w]++;
1568                 mutex_exit(&pio->io_lock);
1569                 zio_reexecute(cio);
1570         }
1571
1572         /*
1573          * Now that all children have been reexecuted, execute the parent.
1574          * We don't reexecute "The Godfather" I/O here as it's the
1575          * responsibility of the caller to wait on him.
1576          */
1577         if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
1578                 __zio_execute(pio);
1579 }
1580
1581 void
1582 zio_suspend(spa_t *spa, zio_t *zio)
1583 {
1584         if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
1585                 fm_panic("Pool '%s' has encountered an uncorrectable I/O "
1586                     "failure and the failure mode property for this pool "
1587                     "is set to panic.", spa_name(spa));
1588
1589         cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O "
1590             "failure and has been suspended.\n", spa_name(spa));
1591
1592         zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);
1593
1594         mutex_enter(&spa->spa_suspend_lock);
1595
1596         if (spa->spa_suspend_zio_root == NULL)
1597                 spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
1598                     ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
1599                     ZIO_FLAG_GODFATHER);
1600
1601         spa->spa_suspended = B_TRUE;
1602
1603         if (zio != NULL) {
1604                 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
1605                 ASSERT(zio != spa->spa_suspend_zio_root);
1606                 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1607                 ASSERT(zio_unique_parent(zio) == NULL);
1608                 ASSERT(zio->io_stage == ZIO_STAGE_DONE);
1609                 zio_add_child(spa->spa_suspend_zio_root, zio);
1610         }
1611
1612         mutex_exit(&spa->spa_suspend_lock);
1613 }
1614
1615 int
1616 zio_resume(spa_t *spa)
1617 {
1618         zio_t *pio;
1619
1620         /*
1621          * Reexecute all previously suspended i/o.
1622          */
1623         mutex_enter(&spa->spa_suspend_lock);
1624         spa->spa_suspended = B_FALSE;
1625         cv_broadcast(&spa->spa_suspend_cv);
1626         pio = spa->spa_suspend_zio_root;
1627         spa->spa_suspend_zio_root = NULL;
1628         mutex_exit(&spa->spa_suspend_lock);
1629
1630         if (pio == NULL)
1631                 return (0);
1632
1633         zio_reexecute(pio);
1634         return (zio_wait(pio));
1635 }
1636
1637 void
1638 zio_resume_wait(spa_t *spa)
1639 {
1640         mutex_enter(&spa->spa_suspend_lock);
1641         while (spa_suspended(spa))
1642                 cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
1643         mutex_exit(&spa->spa_suspend_lock);
1644 }
1645
1646 /*
1647  * ==========================================================================
1648  * Gang blocks.
1649  *
1650  * A gang block is a collection of small blocks that looks to the DMU
1651  * like one large block.  When zio_dva_allocate() cannot find a block
1652  * of the requested size, due to either severe fragmentation or the pool
1653  * being nearly full, it calls zio_write_gang_block() to construct the
1654  * block from smaller fragments.
1655  *
1656  * A gang block consists of a gang header (zio_gbh_phys_t) and up to
1657  * three (SPA_GBH_NBLKPTRS) gang members.  The gang header is just like
1658  * an indirect block: it's an array of block pointers.  It consumes
1659  * only one sector and hence is allocatable regardless of fragmentation.
1660  * The gang header's bps point to its gang members, which hold the data.
1661  *
1662  * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
1663  * as the verifier to ensure uniqueness of the SHA256 checksum.
1664  * Critically, the gang block bp's blk_cksum is the checksum of the data,
1665  * not the gang header.  This ensures that data block signatures (needed for
1666  * deduplication) are independent of how the block is physically stored.
1667  *
1668  * Gang blocks can be nested: a gang member may itself be a gang block.
1669  * Thus every gang block is a tree in which root and all interior nodes are
1670  * gang headers, and the leaves are normal blocks that contain user data.
1671  * The root of the gang tree is called the gang leader.
1672  *
1673  * To perform any operation (read, rewrite, free, claim) on a gang block,
1674  * zio_gang_assemble() first assembles the gang tree (minus data leaves)
1675  * in the io_gang_tree field of the original logical i/o by recursively
1676  * reading the gang leader and all gang headers below it.  This yields
1677  * an in-core tree containing the contents of every gang header and the
1678  * bps for every constituent of the gang block.
1679  *
1680  * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
1681  * and invokes a callback on each bp.  To free a gang block, zio_gang_issue()
1682  * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
1683  * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
1684  * zio_read_gang() is a wrapper around zio_read() that omits reading gang
1685  * headers, since we already have those in io_gang_tree.  zio_rewrite_gang()
1686  * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
1687  * of the gang header plus zio_checksum_compute() of the data to update the
1688  * gang header's blk_cksum as described above.
1689  *
1690  * The two-phase assemble/issue model solves the problem of partial failure --
1691  * what if you'd freed part of a gang block but then couldn't read the
1692  * gang header for another part?  Assembling the entire gang tree first
1693  * ensures that all the necessary gang header I/O has succeeded before
1694  * starting the actual work of free, claim, or write.  Once the gang tree
1695  * is assembled, free and claim are in-memory operations that cannot fail.
1696  *
1697  * In the event that a gang write fails, zio_dva_unallocate() walks the
1698  * gang tree to immediately free (i.e. insert back into the space map)
1699  * everything we've allocated.  This ensures that we don't get ENOSPC
1700  * errors during repeated suspend/resume cycles due to a flaky device.
1701  *
1702  * Gang rewrites only happen during sync-to-convergence.  If we can't assemble
1703  * the gang tree, we won't modify the block, so we can safely defer the free
1704  * (knowing that the block is still intact).  If we *can* assemble the gang
1705  * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
1706  * each constituent bp and we can allocate a new block on the next sync pass.
1707  *
1708  * In all cases, the gang tree allows complete recovery from partial failure.
1709  * ==========================================================================
1710  */
1711
1712 static zio_t *
1713 zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1714 {
1715         if (gn != NULL)
1716                 return (pio);
1717
1718         return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
1719             NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
1720             &pio->io_bookmark));
1721 }
1722
1723 zio_t *
1724 zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1725 {
1726         zio_t *zio;
1727
1728         if (gn != NULL) {
1729                 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
1730                     gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
1731                     ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1732                 /*
1733                  * As we rewrite each gang header, the pipeline will compute
1734                  * a new gang block header checksum for it; but no one will
1735                  * compute a new data checksum, so we do that here.  The one
1736                  * exception is the gang leader: the pipeline already computed
1737                  * its data checksum because that stage precedes gang assembly.
1738                  * (Presently, nothing actually uses interior data checksums;
1739                  * this is just good hygiene.)
1740                  */
1741                 if (gn != pio->io_gang_leader->io_gang_tree) {
1742                         zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
1743                             data, BP_GET_PSIZE(bp));
1744                 }
1745                 /*
1746                  * If we are here to damage data for testing purposes,
1747                  * leave the GBH alone so that we can detect the damage.
1748                  */
1749                 if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
1750                         zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
1751         } else {
1752                 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
1753                     data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
1754                     ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1755         }
1756
1757         return (zio);
1758 }
1759
1760 /* ARGSUSED */
1761 zio_t *
1762 zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1763 {
1764         return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
1765             ZIO_GANG_CHILD_FLAGS(pio)));
1766 }
1767
1768 /* ARGSUSED */
1769 zio_t *
1770 zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1771 {
1772         return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
1773             NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
1774 }
1775
1776 static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
1777         NULL,
1778         zio_read_gang,
1779         zio_rewrite_gang,
1780         zio_free_gang,
1781         zio_claim_gang,
1782         NULL
1783 };
1784
1785 static void zio_gang_tree_assemble_done(zio_t *zio);
1786
1787 static zio_gang_node_t *
1788 zio_gang_node_alloc(zio_gang_node_t **gnpp)
1789 {
1790         zio_gang_node_t *gn;
1791
1792         ASSERT(*gnpp == NULL);
1793
1794         gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
1795         gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
1796         *gnpp = gn;
1797
1798         return (gn);
1799 }
1800
1801 static void
1802 zio_gang_node_free(zio_gang_node_t **gnpp)
1803 {
1804         zio_gang_node_t *gn = *gnpp;
1805         int g;
1806
1807         for (g = 0; g < SPA_GBH_NBLKPTRS; g++)
1808                 ASSERT(gn->gn_child[g] == NULL);
1809
1810         zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
1811         kmem_free(gn, sizeof (*gn));
1812         *gnpp = NULL;
1813 }
1814
1815 static void
1816 zio_gang_tree_free(zio_gang_node_t **gnpp)
1817 {
1818         zio_gang_node_t *gn = *gnpp;
1819         int g;
1820
1821         if (gn == NULL)
1822                 return;
1823
1824         for (g = 0; g < SPA_GBH_NBLKPTRS; g++)
1825                 zio_gang_tree_free(&gn->gn_child[g]);
1826
1827         zio_gang_node_free(gnpp);
1828 }
1829
1830 static void
1831 zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
1832 {
1833         zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
1834
1835         ASSERT(gio->io_gang_leader == gio);
1836         ASSERT(BP_IS_GANG(bp));
1837
1838         zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh,
1839             SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
1840             gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
1841 }
1842
1843 static void
1844 zio_gang_tree_assemble_done(zio_t *zio)
1845 {
1846         zio_t *gio = zio->io_gang_leader;
1847         zio_gang_node_t *gn = zio->io_private;
1848         blkptr_t *bp = zio->io_bp;
1849         int g;
1850
1851         ASSERT(gio == zio_unique_parent(zio));
1852         ASSERT(zio->io_child_count == 0);
1853
1854         if (zio->io_error)
1855                 return;
1856
1857         if (BP_SHOULD_BYTESWAP(bp))
1858                 byteswap_uint64_array(zio->io_data, zio->io_size);
1859
1860         ASSERT(zio->io_data == gn->gn_gbh);
1861         ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
1862         ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
1863
1864         for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
1865                 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
1866                 if (!BP_IS_GANG(gbp))
1867                         continue;
1868                 zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
1869         }
1870 }
1871
1872 static void
1873 zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
1874 {
1875         zio_t *gio = pio->io_gang_leader;
1876         zio_t *zio;
1877         int g;
1878
1879         ASSERT(BP_IS_GANG(bp) == !!gn);
1880         ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
1881         ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);
1882
1883         /*
1884          * If you're a gang header, your data is in gn->gn_gbh.
1885          * If you're a gang member, your data is in 'data' and gn == NULL.
1886          */
1887         zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);
1888
1889         if (gn != NULL) {
1890                 ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
1891
1892                 for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
1893                         blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
1894                         if (BP_IS_HOLE(gbp))
1895                                 continue;
1896                         zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
1897                         data = (char *)data + BP_GET_PSIZE(gbp);
1898                 }
1899         }
1900
1901         if (gn == gio->io_gang_tree)
1902                 ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);
1903
1904         if (zio != pio)
1905                 zio_nowait(zio);
1906 }
1907
1908 static int
1909 zio_gang_assemble(zio_t *zio)
1910 {
1911         blkptr_t *bp = zio->io_bp;
1912
1913         ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
1914         ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
1915
1916         zio->io_gang_leader = zio;
1917
1918         zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);
1919
1920         return (ZIO_PIPELINE_CONTINUE);
1921 }
1922
1923 static int
1924 zio_gang_issue(zio_t *zio)
1925 {
1926         blkptr_t *bp = zio->io_bp;
1927
1928         if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
1929                 return (ZIO_PIPELINE_STOP);
1930
1931         ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
1932         ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
1933
1934         if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
1935                 zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data);
1936         else
1937                 zio_gang_tree_free(&zio->io_gang_tree);
1938
1939         zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1940
1941         return (ZIO_PIPELINE_CONTINUE);
1942 }
1943
1944 static void
1945 zio_write_gang_member_ready(zio_t *zio)
1946 {
1947         zio_t *pio = zio_unique_parent(zio);
1948         dva_t *cdva = zio->io_bp->blk_dva;
1949         dva_t *pdva = pio->io_bp->blk_dva;
1950         uint64_t asize;
1951         int d;
1952         ASSERTV(zio_t *gio = zio->io_gang_leader);
1953
1954         if (BP_IS_HOLE(zio->io_bp))
1955                 return;
1956
1957         ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
1958
1959         ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
1960         ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
1961         ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
1962         ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
1963         ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
1964
1965         mutex_enter(&pio->io_lock);
1966         for (d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
1967                 ASSERT(DVA_GET_GANG(&pdva[d]));
1968                 asize = DVA_GET_ASIZE(&pdva[d]);
1969                 asize += DVA_GET_ASIZE(&cdva[d]);
1970                 DVA_SET_ASIZE(&pdva[d], asize);
1971         }
1972         mutex_exit(&pio->io_lock);
1973 }
1974
1975 static int
1976 zio_write_gang_block(zio_t *pio)
1977 {
1978         spa_t *spa = pio->io_spa;
1979         blkptr_t *bp = pio->io_bp;
1980         zio_t *gio = pio->io_gang_leader;
1981         zio_t *zio;
1982         zio_gang_node_t *gn, **gnpp;
1983         zio_gbh_phys_t *gbh;
1984         uint64_t txg = pio->io_txg;
1985         uint64_t resid = pio->io_size;
1986         uint64_t lsize;
1987         int copies = gio->io_prop.zp_copies;
1988         int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
1989         zio_prop_t zp;
1990         int g, error;
1991
1992         error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
1993             bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
1994             METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
1995         if (error) {
1996                 pio->io_error = error;
1997                 return (ZIO_PIPELINE_CONTINUE);
1998         }
1999
2000         if (pio == gio) {
2001                 gnpp = &gio->io_gang_tree;
2002         } else {
2003                 gnpp = pio->io_private;
2004                 ASSERT(pio->io_ready == zio_write_gang_member_ready);
2005         }
2006
2007         gn = zio_gang_node_alloc(gnpp);
2008         gbh = gn->gn_gbh;
2009         bzero(gbh, SPA_GANGBLOCKSIZE);
2010
2011         /*
2012          * Create the gang header.
2013          */
2014         zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
2015             pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
2016
2017         /*
2018          * Create and nowait the gang children.
2019          */
2020         for (g = 0; resid != 0; resid -= lsize, g++) {
2021                 lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
2022                     SPA_MINBLOCKSIZE);
2023                 ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);
2024
2025                 zp.zp_checksum = gio->io_prop.zp_checksum;
2026                 zp.zp_compress = ZIO_COMPRESS_OFF;
2027                 zp.zp_type = DMU_OT_NONE;
2028                 zp.zp_level = 0;
2029                 zp.zp_copies = gio->io_prop.zp_copies;
2030                 zp.zp_dedup = B_FALSE;
2031                 zp.zp_dedup_verify = B_FALSE;
2032                 zp.zp_nopwrite = B_FALSE;
2033
2034                 zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
2035                     (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
2036                     zio_write_gang_member_ready, NULL, NULL, &gn->gn_child[g],
2037                     pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
2038                     &pio->io_bookmark));
2039         }
2040
2041         /*
2042          * Set pio's pipeline to just wait for zio to finish.
2043          */
2044         pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2045
2046         /*
2047          * We didn't allocate this bp, so make sure it doesn't get unmarked.
2048          */
2049         pio->io_flags &= ~ZIO_FLAG_FASTWRITE;
2050
2051         zio_nowait(zio);
2052
2053         return (ZIO_PIPELINE_CONTINUE);
2054 }
2055
2056 /*
2057  * The zio_nop_write stage in the pipeline determines if allocating
2058  * a new bp is necessary.  By leveraging a cryptographically secure checksum,
2059  * such as SHA256, we can compare the checksums of the new data and the old
2060  * to determine if allocating a new block is required.  The nopwrite
2061  * feature can handle writes in either syncing or open context (i.e. zil
2062  * writes) and as a result is mutually exclusive with dedup.
2063  */
2064 static int
2065 zio_nop_write(zio_t *zio)
2066 {
2067         blkptr_t *bp = zio->io_bp;
2068         blkptr_t *bp_orig = &zio->io_bp_orig;
2069         zio_prop_t *zp = &zio->io_prop;
2070
2071         ASSERT(BP_GET_LEVEL(bp) == 0);
2072         ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
2073         ASSERT(zp->zp_nopwrite);
2074         ASSERT(!zp->zp_dedup);
2075         ASSERT(zio->io_bp_override == NULL);
2076         ASSERT(IO_IS_ALLOCATING(zio));
2077
2078         /*
2079          * Check to see if the original bp and the new bp have matching
2080          * characteristics (i.e. same checksum, compression algorithms, etc).
2081          * If they don't then just continue with the pipeline which will
2082          * allocate a new bp.
2083          */
2084         if (BP_IS_HOLE(bp_orig) ||
2085             !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup ||
2086             BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
2087             BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
2088             BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
2089             zp->zp_copies != BP_GET_NDVAS(bp_orig))
2090                 return (ZIO_PIPELINE_CONTINUE);
2091
2092         /*
2093          * If the checksums match then reset the pipeline so that we
2094          * avoid allocating a new bp and issuing any I/O.
2095          */
2096         if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
2097                 ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup);
2098                 ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
2099                 ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
2100                 ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
2101                 ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop,
2102                     sizeof (uint64_t)) == 0);
2103
2104                 *bp = *bp_orig;
2105                 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2106                 zio->io_flags |= ZIO_FLAG_NOPWRITE;
2107         }
2108
2109         return (ZIO_PIPELINE_CONTINUE);
2110 }
2111
2112 /*
2113  * ==========================================================================
2114  * Dedup
2115  * ==========================================================================
2116  */
2117 static void
2118 zio_ddt_child_read_done(zio_t *zio)
2119 {
2120         blkptr_t *bp = zio->io_bp;
2121         ddt_entry_t *dde = zio->io_private;
2122         ddt_phys_t *ddp;
2123         zio_t *pio = zio_unique_parent(zio);
2124
2125         mutex_enter(&pio->io_lock);
2126         ddp = ddt_phys_select(dde, bp);
2127         if (zio->io_error == 0)
2128                 ddt_phys_clear(ddp);    /* this ddp doesn't need repair */
2129         if (zio->io_error == 0 && dde->dde_repair_data == NULL)
2130                 dde->dde_repair_data = zio->io_data;
2131         else
2132                 zio_buf_free(zio->io_data, zio->io_size);
2133         mutex_exit(&pio->io_lock);
2134 }
2135
2136 static int
2137 zio_ddt_read_start(zio_t *zio)
2138 {
2139         blkptr_t *bp = zio->io_bp;
2140         int p;
2141
2142         ASSERT(BP_GET_DEDUP(bp));
2143         ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
2144         ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2145
2146         if (zio->io_child_error[ZIO_CHILD_DDT]) {
2147                 ddt_t *ddt = ddt_select(zio->io_spa, bp);
2148                 ddt_entry_t *dde = ddt_repair_start(ddt, bp);
2149                 ddt_phys_t *ddp = dde->dde_phys;
2150                 ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
2151                 blkptr_t blk;
2152
2153                 ASSERT(zio->io_vsd == NULL);
2154                 zio->io_vsd = dde;
2155
2156                 if (ddp_self == NULL)
2157                         return (ZIO_PIPELINE_CONTINUE);
2158
2159                 for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
2160                         if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
2161                                 continue;
2162                         ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
2163                             &blk);
2164                         zio_nowait(zio_read(zio, zio->io_spa, &blk,
2165                             zio_buf_alloc(zio->io_size), zio->io_size,
2166                             zio_ddt_child_read_done, dde, zio->io_priority,
2167                             ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE,
2168                             &zio->io_bookmark));
2169                 }
2170                 return (ZIO_PIPELINE_CONTINUE);
2171         }
2172
2173         zio_nowait(zio_read(zio, zio->io_spa, bp,
2174             zio->io_data, zio->io_size, NULL, NULL, zio->io_priority,
2175             ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
2176
2177         return (ZIO_PIPELINE_CONTINUE);
2178 }
2179
2180 static int
2181 zio_ddt_read_done(zio_t *zio)
2182 {
2183         blkptr_t *bp = zio->io_bp;
2184
2185         if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
2186                 return (ZIO_PIPELINE_STOP);
2187
2188         ASSERT(BP_GET_DEDUP(bp));
2189         ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
2190         ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2191
2192         if (zio->io_child_error[ZIO_CHILD_DDT]) {
2193                 ddt_t *ddt = ddt_select(zio->io_spa, bp);
2194                 ddt_entry_t *dde = zio->io_vsd;
2195                 if (ddt == NULL) {
2196                         ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
2197                         return (ZIO_PIPELINE_CONTINUE);
2198                 }
2199                 if (dde == NULL) {
2200                         zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
2201                         zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
2202                         return (ZIO_PIPELINE_STOP);
2203                 }
2204                 if (dde->dde_repair_data != NULL) {
2205                         bcopy(dde->dde_repair_data, zio->io_data, zio->io_size);
2206                         zio->io_child_error[ZIO_CHILD_DDT] = 0;
2207                 }
2208                 ddt_repair_done(ddt, dde);
2209                 zio->io_vsd = NULL;
2210         }
2211
2212         ASSERT(zio->io_vsd == NULL);
2213
2214         return (ZIO_PIPELINE_CONTINUE);
2215 }
2216
2217 static boolean_t
2218 zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
2219 {
2220         spa_t *spa = zio->io_spa;
2221         int p;
2222
2223         /*
2224          * Note: we compare the original data, not the transformed data,
2225          * because when zio->io_bp is an override bp, we will not have
2226          * pushed the I/O transforms.  That's an important optimization
2227          * because otherwise we'd compress/encrypt all dmu_sync() data twice.
2228          */
2229         for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
2230                 zio_t *lio = dde->dde_lead_zio[p];
2231
2232                 if (lio != NULL) {
2233                         return (lio->io_orig_size != zio->io_orig_size ||
2234                             bcmp(zio->io_orig_data, lio->io_orig_data,
2235                             zio->io_orig_size) != 0);
2236                 }
2237         }
2238
2239         for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
2240                 ddt_phys_t *ddp = &dde->dde_phys[p];
2241
2242                 if (ddp->ddp_phys_birth != 0) {
2243                         arc_buf_t *abuf = NULL;
2244                         arc_flags_t aflags = ARC_FLAG_WAIT;
2245                         blkptr_t blk = *zio->io_bp;
2246                         int error;
2247
2248                         ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
2249
2250                         ddt_exit(ddt);
2251
2252                         error = arc_read(NULL, spa, &blk,
2253                             arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
2254                             ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
2255                             &aflags, &zio->io_bookmark);
2256
2257                         if (error == 0) {
2258                                 if (arc_buf_size(abuf) != zio->io_orig_size ||
2259                                     bcmp(abuf->b_data, zio->io_orig_data,
2260                                     zio->io_orig_size) != 0)
2261                                         error = SET_ERROR(EEXIST);
2262                                 VERIFY(arc_buf_remove_ref(abuf, &abuf));
2263                         }
2264
2265                         ddt_enter(ddt);
2266                         return (error != 0);
2267                 }
2268         }
2269
2270         return (B_FALSE);
2271 }
2272
2273 static void
2274 zio_ddt_child_write_ready(zio_t *zio)
2275 {
2276         int p = zio->io_prop.zp_copies;
2277         ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
2278         ddt_entry_t *dde = zio->io_private;
2279         ddt_phys_t *ddp = &dde->dde_phys[p];
2280         zio_t *pio;
2281
2282         if (zio->io_error)
2283                 return;
2284
2285         ddt_enter(ddt);
2286
2287         ASSERT(dde->dde_lead_zio[p] == zio);
2288
2289         ddt_phys_fill(ddp, zio->io_bp);
2290
2291         while ((pio = zio_walk_parents(zio)) != NULL)
2292                 ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
2293
2294         ddt_exit(ddt);
2295 }
2296
2297 static void
2298 zio_ddt_child_write_done(zio_t *zio)
2299 {
2300         int p = zio->io_prop.zp_copies;
2301         ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
2302         ddt_entry_t *dde = zio->io_private;
2303         ddt_phys_t *ddp = &dde->dde_phys[p];
2304
2305         ddt_enter(ddt);
2306
2307         ASSERT(ddp->ddp_refcnt == 0);
2308         ASSERT(dde->dde_lead_zio[p] == zio);
2309         dde->dde_lead_zio[p] = NULL;
2310
2311         if (zio->io_error == 0) {
2312                 while (zio_walk_parents(zio) != NULL)
2313                         ddt_phys_addref(ddp);
2314         } else {
2315                 ddt_phys_clear(ddp);
2316         }
2317
2318         ddt_exit(ddt);
2319 }
2320
2321 static void
2322 zio_ddt_ditto_write_done(zio_t *zio)
2323 {
2324         int p = DDT_PHYS_DITTO;
2325         blkptr_t *bp = zio->io_bp;
2326         ddt_t *ddt = ddt_select(zio->io_spa, bp);
2327         ddt_entry_t *dde = zio->io_private;
2328         ddt_phys_t *ddp = &dde->dde_phys[p];
2329         ddt_key_t *ddk = &dde->dde_key;
2330         ASSERTV(zio_prop_t *zp = &zio->io_prop);
2331
2332         ddt_enter(ddt);
2333
2334         ASSERT(ddp->ddp_refcnt == 0);
2335         ASSERT(dde->dde_lead_zio[p] == zio);
2336         dde->dde_lead_zio[p] = NULL;
2337
2338         if (zio->io_error == 0) {
2339                 ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum));
2340                 ASSERT(zp->zp_copies < SPA_DVAS_PER_BP);
2341                 ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp));
2342                 if (ddp->ddp_phys_birth != 0)
2343                         ddt_phys_free(ddt, ddk, ddp, zio->io_txg);
2344                 ddt_phys_fill(ddp, bp);
2345         }
2346
2347         ddt_exit(ddt);
2348 }
2349
2350 static int
2351 zio_ddt_write(zio_t *zio)
2352 {
2353         spa_t *spa = zio->io_spa;
2354         blkptr_t *bp = zio->io_bp;
2355         uint64_t txg = zio->io_txg;
2356         zio_prop_t *zp = &zio->io_prop;
2357         int p = zp->zp_copies;
2358         int ditto_copies;
2359         zio_t *cio = NULL;
2360         zio_t *dio = NULL;
2361         ddt_t *ddt = ddt_select(spa, bp);
2362         ddt_entry_t *dde;
2363         ddt_phys_t *ddp;
2364
2365         ASSERT(BP_GET_DEDUP(bp));
2366         ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
2367         ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
2368
2369         ddt_enter(ddt);
2370         dde = ddt_lookup(ddt, bp, B_TRUE);
2371         ddp = &dde->dde_phys[p];
2372
2373         if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
2374                 /*
2375                  * If we're using a weak checksum, upgrade to a strong checksum
2376                  * and try again.  If we're already using a strong checksum,
2377                  * we can't resolve it, so just convert to an ordinary write.
2378                  * (And automatically e-mail a paper to Nature?)
2379                  */
2380                 if (!zio_checksum_table[zp->zp_checksum].ci_dedup) {
2381                         zp->zp_checksum = spa_dedup_checksum(spa);
2382                         zio_pop_transforms(zio);
2383                         zio->io_stage = ZIO_STAGE_OPEN;
2384                         BP_ZERO(bp);
2385                 } else {
2386                         zp->zp_dedup = B_FALSE;
2387                 }
2388                 zio->io_pipeline = ZIO_WRITE_PIPELINE;
2389                 ddt_exit(ddt);
2390                 return (ZIO_PIPELINE_CONTINUE);
2391         }
2392
2393         ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp);
2394         ASSERT(ditto_copies < SPA_DVAS_PER_BP);
2395
2396         if (ditto_copies > ddt_ditto_copies_present(dde) &&
2397             dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) {
2398                 zio_prop_t czp = *zp;
2399
2400                 czp.zp_copies = ditto_copies;
2401
2402                 /*
2403                  * If we arrived here with an override bp, we won't have run
2404                  * the transform stack, so we won't have the data we need to
2405                  * generate a child i/o.  So, toss the override bp and restart.
2406                  * This is safe, because using the override bp is just an
2407                  * optimization; and it's rare, so the cost doesn't matter.
2408                  */
2409                 if (zio->io_bp_override) {
2410                         zio_pop_transforms(zio);
2411                         zio->io_stage = ZIO_STAGE_OPEN;
2412                         zio->io_pipeline = ZIO_WRITE_PIPELINE;
2413                         zio->io_bp_override = NULL;
2414                         BP_ZERO(bp);
2415                         ddt_exit(ddt);
2416                         return (ZIO_PIPELINE_CONTINUE);
2417                 }
2418
2419                 dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
2420                     zio->io_orig_size, &czp, NULL, NULL,
2421                     zio_ddt_ditto_write_done, dde, zio->io_priority,
2422                     ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
2423
2424                 zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL);
2425                 dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
2426         }
2427
2428         if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
2429                 if (ddp->ddp_phys_birth != 0)
2430                         ddt_bp_fill(ddp, bp, txg);
2431                 if (dde->dde_lead_zio[p] != NULL)
2432                         zio_add_child(zio, dde->dde_lead_zio[p]);
2433                 else
2434                         ddt_phys_addref(ddp);
2435         } else if (zio->io_bp_override) {
2436                 ASSERT(bp->blk_birth == txg);
2437                 ASSERT(BP_EQUAL(bp, zio->io_bp_override));
2438                 ddt_phys_fill(ddp, bp);
2439                 ddt_phys_addref(ddp);
2440         } else {
2441                 cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
2442                     zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL,
2443                     zio_ddt_child_write_done, dde, zio->io_priority,
2444                     ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
2445
2446                 zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL);
2447                 dde->dde_lead_zio[p] = cio;
2448         }
2449
2450         ddt_exit(ddt);
2451
2452         if (cio)
2453                 zio_nowait(cio);
2454         if (dio)
2455                 zio_nowait(dio);
2456
2457         return (ZIO_PIPELINE_CONTINUE);
2458 }
2459
2460 ddt_entry_t *freedde; /* for debugging */
2461
2462 static int
2463 zio_ddt_free(zio_t *zio)
2464 {
2465         spa_t *spa = zio->io_spa;
2466         blkptr_t *bp = zio->io_bp;
2467         ddt_t *ddt = ddt_select(spa, bp);
2468         ddt_entry_t *dde;
2469         ddt_phys_t *ddp;
2470
2471         ASSERT(BP_GET_DEDUP(bp));
2472         ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2473
2474         ddt_enter(ddt);
2475         freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
2476         if (dde) {
2477                 ddp = ddt_phys_select(dde, bp);
2478                 if (ddp)
2479                         ddt_phys_decref(ddp);
2480         }
2481         ddt_exit(ddt);
2482
2483         return (ZIO_PIPELINE_CONTINUE);
2484 }
2485
2486 /*
2487  * ==========================================================================
2488  * Allocate and free blocks
2489  * ==========================================================================
2490  */
2491 static int
2492 zio_dva_allocate(zio_t *zio)
2493 {
2494         spa_t *spa = zio->io_spa;
2495         metaslab_class_t *mc = spa_normal_class(spa);
2496         blkptr_t *bp = zio->io_bp;
2497         int error;
2498         int flags = 0;
2499
2500         if (zio->io_gang_leader == NULL) {
2501                 ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
2502                 zio->io_gang_leader = zio;
2503         }
2504
2505         ASSERT(BP_IS_HOLE(bp));
2506         ASSERT0(BP_GET_NDVAS(bp));
2507         ASSERT3U(zio->io_prop.zp_copies, >, 0);
2508         ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
2509         ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
2510
2511         /*
2512          * The dump device does not support gang blocks so allocation on
2513          * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid
2514          * the "fast" gang feature.
2515          */
2516         flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0;
2517         flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ?
2518             METASLAB_GANG_CHILD : 0;
2519         flags |= (zio->io_flags & ZIO_FLAG_FASTWRITE) ? METASLAB_FASTWRITE : 0;
2520         error = metaslab_alloc(spa, mc, zio->io_size, bp,
2521             zio->io_prop.zp_copies, zio->io_txg, NULL, flags);
2522
2523         if (error) {
2524                 spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
2525                     "size %llu, error %d", spa_name(spa), zio, zio->io_size,
2526                     error);
2527                 if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
2528                         return (zio_write_gang_block(zio));
2529                 zio->io_error = error;
2530         }
2531
2532         return (ZIO_PIPELINE_CONTINUE);
2533 }
2534
2535 static int
2536 zio_dva_free(zio_t *zio)
2537 {
2538         metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);
2539
2540         return (ZIO_PIPELINE_CONTINUE);
2541 }
2542
2543 static int
2544 zio_dva_claim(zio_t *zio)
2545 {
2546         int error;
2547
2548         error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
2549         if (error)
2550                 zio->io_error = error;
2551
2552         return (ZIO_PIPELINE_CONTINUE);
2553 }
2554
2555 /*
2556  * Undo an allocation.  This is used by zio_done() when an I/O fails
2557  * and we want to give back the block we just allocated.
2558  * This handles both normal blocks and gang blocks.
2559  */
2560 static void
2561 zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
2562 {
2563         int g;
2564
2565         ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
2566         ASSERT(zio->io_bp_override == NULL);
2567
2568         if (!BP_IS_HOLE(bp))
2569                 metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);
2570
2571         if (gn != NULL) {
2572                 for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
2573                         zio_dva_unallocate(zio, gn->gn_child[g],
2574                             &gn->gn_gbh->zg_blkptr[g]);
2575                 }
2576         }
2577 }
2578
2579 /*
2580  * Try to allocate an intent log block.  Return 0 on success, errno on failure.
2581  */
2582 int
2583 zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, uint64_t size,
2584     boolean_t use_slog)
2585 {
2586         int error = 1;
2587
2588         ASSERT(txg > spa_syncing_txg(spa));
2589
2590         /*
2591          * ZIL blocks are always contiguous (i.e. not gang blocks) so we
2592          * set the METASLAB_GANG_AVOID flag so that they don't "fast gang"
2593          * when allocating them.
2594          */
2595         if (use_slog) {
2596                 error = metaslab_alloc(spa, spa_log_class(spa), size,
2597                     new_bp, 1, txg, NULL,
2598                     METASLAB_FASTWRITE | METASLAB_GANG_AVOID);
2599         }
2600
2601         if (error) {
2602                 error = metaslab_alloc(spa, spa_normal_class(spa), size,
2603                     new_bp, 1, txg, NULL,
2604                     METASLAB_FASTWRITE);
2605         }
2606
2607         if (error == 0) {
2608                 BP_SET_LSIZE(new_bp, size);
2609                 BP_SET_PSIZE(new_bp, size);
2610                 BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
2611                 BP_SET_CHECKSUM(new_bp,
2612                     spa_version(spa) >= SPA_VERSION_SLIM_ZIL
2613                     ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
2614                 BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
2615                 BP_SET_LEVEL(new_bp, 0);
2616                 BP_SET_DEDUP(new_bp, 0);
2617                 BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
2618         }
2619
2620         return (error);
2621 }
2622
2623 /*
2624  * Free an intent log block.
2625  */
2626 void
2627 zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp)
2628 {
2629         ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG);
2630         ASSERT(!BP_IS_GANG(bp));
2631
2632         zio_free(spa, txg, bp);
2633 }
2634
2635 /*
2636  * ==========================================================================
2637  * Read and write to physical devices
2638  * ==========================================================================
2639  */
2640
2641
2642 /*
2643  * Issue an I/O to the underlying vdev. Typically the issue pipeline
2644  * stops after this stage and will resume upon I/O completion.
2645  * However, there are instances where the vdev layer may need to
2646  * continue the pipeline when an I/O was not issued. Since the I/O
2647  * that was sent to the vdev layer might be different than the one
2648  * currently active in the pipeline (see vdev_queue_io()), we explicitly
2649  * force the underlying vdev layers to call either zio_execute() or
2650  * zio_interrupt() to ensure that the pipeline continues with the correct I/O.
2651  */
2652 static int
2653 zio_vdev_io_start(zio_t *zio)
2654 {
2655         vdev_t *vd = zio->io_vd;
2656         uint64_t align;
2657         spa_t *spa = zio->io_spa;
2658
2659         ASSERT(zio->io_error == 0);
2660         ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
2661
2662         if (vd == NULL) {
2663                 if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
2664                         spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
2665
2666                 /*
2667                  * The mirror_ops handle multiple DVAs in a single BP.
2668                  */
2669                 vdev_mirror_ops.vdev_op_io_start(zio);
2670                 return (ZIO_PIPELINE_STOP);
2671         }
2672
2673         /*
2674          * We keep track of time-sensitive I/Os so that the scan thread
2675          * can quickly react to certain workloads.  In particular, we care
2676          * about non-scrubbing, top-level reads and writes with the following
2677          * characteristics:
2678          *      - synchronous writes of user data to non-slog devices
2679          *      - any reads of user data
2680          * When these conditions are met, adjust the timestamp of spa_last_io
2681          * which allows the scan thread to adjust its workload accordingly.
2682          */
2683         if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
2684             vd == vd->vdev_top && !vd->vdev_islog &&
2685             zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
2686             zio->io_txg != spa_syncing_txg(spa)) {
2687                 uint64_t old = spa->spa_last_io;
2688                 uint64_t new = ddi_get_lbolt64();
2689                 if (old != new)
2690                         (void) atomic_cas_64(&spa->spa_last_io, old, new);
2691         }
2692
2693         align = 1ULL << vd->vdev_top->vdev_ashift;
2694
2695         if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
2696             P2PHASE(zio->io_size, align) != 0) {
2697                 /* Transform logical writes to be a full physical block size. */
2698                 uint64_t asize = P2ROUNDUP(zio->io_size, align);
2699                 char *abuf = zio_buf_alloc(asize);
2700                 ASSERT(vd == vd->vdev_top);
2701                 if (zio->io_type == ZIO_TYPE_WRITE) {
2702                         bcopy(zio->io_data, abuf, zio->io_size);
2703                         bzero(abuf + zio->io_size, asize - zio->io_size);
2704                 }
2705                 zio_push_transform(zio, abuf, asize, asize, zio_subblock);
2706         }
2707
2708         /*
2709          * If this is not a physical io, make sure that it is properly aligned
2710          * before proceeding.
2711          */
2712         if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) {
2713                 ASSERT0(P2PHASE(zio->io_offset, align));
2714                 ASSERT0(P2PHASE(zio->io_size, align));
2715         } else {
2716                 /*
2717                  * For physical writes, we allow 512b aligned writes and assume
2718                  * the device will perform a read-modify-write as necessary.
2719                  */
2720                 ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE));
2721                 ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE));
2722         }
2723
2724         VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
2725
2726         /*
2727          * If this is a repair I/O, and there's no self-healing involved --
2728          * that is, we're just resilvering what we expect to resilver --
2729          * then don't do the I/O unless zio's txg is actually in vd's DTL.
2730          * This prevents spurious resilvering with nested replication.
2731          * For example, given a mirror of mirrors, (A+B)+(C+D), if only
2732          * A is out of date, we'll read from C+D, then use the data to
2733          * resilver A+B -- but we don't actually want to resilver B, just A.
2734          * The top-level mirror has no way to know this, so instead we just
2735          * discard unnecessary repairs as we work our way down the vdev tree.
2736          * The same logic applies to any form of nested replication:
2737          * ditto + mirror, RAID-Z + replacing, etc.  This covers them all.
2738          */
2739         if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
2740             !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
2741             zio->io_txg != 0 && /* not a delegated i/o */
2742             !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
2743                 ASSERT(zio->io_type == ZIO_TYPE_WRITE);
2744                 zio_vdev_io_bypass(zio);
2745                 return (ZIO_PIPELINE_CONTINUE);
2746         }
2747
2748         if (vd->vdev_ops->vdev_op_leaf &&
2749             (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
2750
2751                 if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio))
2752                         return (ZIO_PIPELINE_CONTINUE);
2753
2754                 if ((zio = vdev_queue_io(zio)) == NULL)
2755                         return (ZIO_PIPELINE_STOP);
2756
2757                 if (!vdev_accessible(vd, zio)) {
2758                         zio->io_error = SET_ERROR(ENXIO);
2759                         zio_interrupt(zio);
2760                         return (ZIO_PIPELINE_STOP);
2761                 }
2762         }
2763
2764         vd->vdev_ops->vdev_op_io_start(zio);
2765         return (ZIO_PIPELINE_STOP);
2766 }
2767
2768 static int
2769 zio_vdev_io_done(zio_t *zio)
2770 {
2771         vdev_t *vd = zio->io_vd;
2772         vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
2773         boolean_t unexpected_error = B_FALSE;
2774
2775         if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
2776                 return (ZIO_PIPELINE_STOP);
2777
2778         ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
2779
2780         if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {
2781
2782                 vdev_queue_io_done(zio);
2783
2784                 if (zio->io_type == ZIO_TYPE_WRITE)
2785                         vdev_cache_write(zio);
2786
2787                 if (zio_injection_enabled && zio->io_error == 0)
2788                         zio->io_error = zio_handle_device_injection(vd,
2789                             zio, EIO);
2790
2791                 if (zio_injection_enabled && zio->io_error == 0)
2792                         zio->io_error = zio_handle_label_injection(zio, EIO);
2793
2794                 if (zio->io_error) {
2795                         if (!vdev_accessible(vd, zio)) {
2796                                 zio->io_error = SET_ERROR(ENXIO);
2797                         } else {
2798                                 unexpected_error = B_TRUE;
2799                         }
2800                 }
2801         }
2802
2803         ops->vdev_op_io_done(zio);
2804
2805         if (unexpected_error)
2806                 VERIFY(vdev_probe(vd, zio) == NULL);
2807
2808         return (ZIO_PIPELINE_CONTINUE);
2809 }
2810
2811 /*
2812  * For non-raidz ZIOs, we can just copy aside the bad data read from the
2813  * disk, and use that to finish the checksum ereport later.
2814  */
2815 static void
2816 zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
2817     const void *good_buf)
2818 {
2819         /* no processing needed */
2820         zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
2821 }
2822
2823 /*ARGSUSED*/
2824 void
2825 zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
2826 {
2827         void *buf = zio_buf_alloc(zio->io_size);
2828
2829         bcopy(zio->io_data, buf, zio->io_size);
2830
2831         zcr->zcr_cbinfo = zio->io_size;
2832         zcr->zcr_cbdata = buf;
2833         zcr->zcr_finish = zio_vsd_default_cksum_finish;
2834         zcr->zcr_free = zio_buf_free;
2835 }
2836
2837 static int
2838 zio_vdev_io_assess(zio_t *zio)
2839 {
2840         vdev_t *vd = zio->io_vd;
2841
2842         if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
2843                 return (ZIO_PIPELINE_STOP);
2844
2845         if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
2846                 spa_config_exit(zio->io_spa, SCL_ZIO, zio);
2847
2848         if (zio->io_vsd != NULL) {
2849                 zio->io_vsd_ops->vsd_free(zio);
2850                 zio->io_vsd = NULL;
2851         }
2852
2853         if (zio_injection_enabled && zio->io_error == 0)
2854                 zio->io_error = zio_handle_fault_injection(zio, EIO);
2855
2856         /*
2857          * If the I/O failed, determine whether we should attempt to retry it.
2858          *
2859          * On retry, we cut in line in the issue queue, since we don't want
2860          * compression/checksumming/etc. work to prevent our (cheap) IO reissue.
2861          */
2862         if (zio->io_error && vd == NULL &&
2863             !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
2864                 ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */
2865                 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS));  /* not a leaf */
2866                 zio->io_error = 0;
2867                 zio->io_flags |= ZIO_FLAG_IO_RETRY |
2868                     ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
2869                 zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
2870                 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
2871                     zio_requeue_io_start_cut_in_line);
2872                 return (ZIO_PIPELINE_STOP);
2873         }
2874
2875         /*
2876          * If we got an error on a leaf device, convert it to ENXIO
2877          * if the device is not accessible at all.
2878          */
2879         if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
2880             !vdev_accessible(vd, zio))
2881                 zio->io_error = SET_ERROR(ENXIO);
2882
2883         /*
2884          * If we can't write to an interior vdev (mirror or RAID-Z),
2885          * set vdev_cant_write so that we stop trying to allocate from it.
2886          */
2887         if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
2888             vd != NULL && !vd->vdev_ops->vdev_op_leaf) {
2889                 vd->vdev_cant_write = B_TRUE;
2890         }
2891
2892         if (zio->io_error)
2893                 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2894
2895         if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
2896             zio->io_physdone != NULL) {
2897                 ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
2898                 ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
2899                 zio->io_physdone(zio->io_logical);
2900         }
2901
2902         return (ZIO_PIPELINE_CONTINUE);
2903 }
2904
2905 void
2906 zio_vdev_io_reissue(zio_t *zio)
2907 {
2908         ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
2909         ASSERT(zio->io_error == 0);
2910
2911         zio->io_stage >>= 1;
2912 }
2913
2914 void
2915 zio_vdev_io_redone(zio_t *zio)
2916 {
2917         ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
2918
2919         zio->io_stage >>= 1;
2920 }
2921
2922 void
2923 zio_vdev_io_bypass(zio_t *zio)
2924 {
2925         ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
2926         ASSERT(zio->io_error == 0);
2927
2928         zio->io_flags |= ZIO_FLAG_IO_BYPASS;
2929         zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
2930 }
2931
2932 /*
2933  * ==========================================================================
2934  * Generate and verify checksums
2935  * ==========================================================================
2936  */
2937 static int
2938 zio_checksum_generate(zio_t *zio)
2939 {
2940         blkptr_t *bp = zio->io_bp;
2941         enum zio_checksum checksum;
2942
2943         if (bp == NULL) {
2944                 /*
2945                  * This is zio_write_phys().
2946                  * We're either generating a label checksum, or none at all.
2947                  */
2948                 checksum = zio->io_prop.zp_checksum;
2949
2950                 if (checksum == ZIO_CHECKSUM_OFF)
2951                         return (ZIO_PIPELINE_CONTINUE);
2952
2953                 ASSERT(checksum == ZIO_CHECKSUM_LABEL);
2954         } else {
2955                 if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
2956                         ASSERT(!IO_IS_ALLOCATING(zio));
2957                         checksum = ZIO_CHECKSUM_GANG_HEADER;
2958                 } else {
2959                         checksum = BP_GET_CHECKSUM(bp);
2960                 }
2961         }
2962
2963         zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size);
2964
2965         return (ZIO_PIPELINE_CONTINUE);
2966 }
2967
2968 static int
2969 zio_checksum_verify(zio_t *zio)
2970 {
2971         zio_bad_cksum_t info;
2972         blkptr_t *bp = zio->io_bp;
2973         int error;
2974
2975         ASSERT(zio->io_vd != NULL);
2976
2977         if (bp == NULL) {
2978                 /*
2979                  * This is zio_read_phys().
2980                  * We're either verifying a label checksum, or nothing at all.
2981                  */
2982                 if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
2983                         return (ZIO_PIPELINE_CONTINUE);
2984
2985                 ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
2986         }
2987
2988         if ((error = zio_checksum_error(zio, &info)) != 0) {
2989                 zio->io_error = error;
2990                 if (error == ECKSUM &&
2991                     !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2992                         zfs_ereport_start_checksum(zio->io_spa,
2993                             zio->io_vd, zio, zio->io_offset,
2994                             zio->io_size, NULL, &info);
2995                 }
2996         }
2997
2998         return (ZIO_PIPELINE_CONTINUE);
2999 }
3000
3001 /*
3002  * Called by RAID-Z to ensure we don't compute the checksum twice.
3003  */
3004 void
3005 zio_checksum_verified(zio_t *zio)
3006 {
3007         zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
3008 }
3009
3010 /*
3011  * ==========================================================================
3012  * Error rank.  Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
3013  * An error of 0 indicates success.  ENXIO indicates whole-device failure,
3014  * which may be transient (e.g. unplugged) or permament.  ECKSUM and EIO
3015  * indicate errors that are specific to one I/O, and most likely permanent.
3016  * Any other error is presumed to be worse because we weren't expecting it.
3017  * ==========================================================================
3018  */
3019 int
3020 zio_worst_error(int e1, int e2)
3021 {
3022         static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
3023         int r1, r2;
3024
3025         for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++)
3026                 if (e1 == zio_error_rank[r1])
3027                         break;
3028
3029         for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++)
3030                 if (e2 == zio_error_rank[r2])
3031                         break;
3032
3033         return (r1 > r2 ? e1 : e2);
3034 }
3035
3036 /*
3037  * ==========================================================================
3038  * I/O completion
3039  * ==========================================================================
3040  */
3041 static int
3042 zio_ready(zio_t *zio)
3043 {
3044         blkptr_t *bp = zio->io_bp;
3045         zio_t *pio, *pio_next;
3046
3047         if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
3048             zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
3049                 return (ZIO_PIPELINE_STOP);
3050
3051         if (zio->io_ready) {
3052                 ASSERT(IO_IS_ALLOCATING(zio));
3053                 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
3054                     (zio->io_flags & ZIO_FLAG_NOPWRITE));
3055                 ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
3056
3057                 zio->io_ready(zio);
3058         }
3059
3060         if (bp != NULL && bp != &zio->io_bp_copy)
3061                 zio->io_bp_copy = *bp;
3062
3063         if (zio->io_error)
3064                 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
3065
3066         mutex_enter(&zio->io_lock);
3067         zio->io_state[ZIO_WAIT_READY] = 1;
3068         pio = zio_walk_parents(zio);
3069         mutex_exit(&zio->io_lock);
3070
3071         /*
3072          * As we notify zio's parents, new parents could be added.
3073          * New parents go to the head of zio's io_parent_list, however,
3074          * so we will (correctly) not notify them.  The remainder of zio's
3075          * io_parent_list, from 'pio_next' onward, cannot change because
3076          * all parents must wait for us to be done before they can be done.
3077          */
3078         for (; pio != NULL; pio = pio_next) {
3079                 pio_next = zio_walk_parents(zio);
3080                 zio_notify_parent(pio, zio, ZIO_WAIT_READY);
3081         }
3082
3083         if (zio->io_flags & ZIO_FLAG_NODATA) {
3084                 if (BP_IS_GANG(bp)) {
3085                         zio->io_flags &= ~ZIO_FLAG_NODATA;
3086                 } else {
3087                         ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE);
3088                         zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
3089                 }
3090         }
3091
3092         if (zio_injection_enabled &&
3093             zio->io_spa->spa_syncing_txg == zio->io_txg)
3094                 zio_handle_ignored_writes(zio);
3095
3096         return (ZIO_PIPELINE_CONTINUE);
3097 }
3098
3099 static int
3100 zio_done(zio_t *zio)
3101 {
3102         zio_t *pio, *pio_next;
3103         int c, w;
3104
3105         /*
3106          * If our children haven't all completed,
3107          * wait for them and then repeat this pipeline stage.
3108          */
3109         if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
3110             zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
3111             zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) ||
3112             zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
3113                 return (ZIO_PIPELINE_STOP);
3114
3115         for (c = 0; c < ZIO_CHILD_TYPES; c++)
3116                 for (w = 0; w < ZIO_WAIT_TYPES; w++)
3117                         ASSERT(zio->io_children[c][w] == 0);
3118
3119         if (zio->io_bp != NULL && !BP_IS_EMBEDDED(zio->io_bp)) {
3120                 ASSERT(zio->io_bp->blk_pad[0] == 0);
3121                 ASSERT(zio->io_bp->blk_pad[1] == 0);
3122                 ASSERT(bcmp(zio->io_bp, &zio->io_bp_copy,
3123                     sizeof (blkptr_t)) == 0 ||
3124                     (zio->io_bp == zio_unique_parent(zio)->io_bp));
3125                 if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(zio->io_bp) &&
3126                     zio->io_bp_override == NULL &&
3127                     !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
3128                         ASSERT(!BP_SHOULD_BYTESWAP(zio->io_bp));
3129                         ASSERT3U(zio->io_prop.zp_copies, <=,
3130                             BP_GET_NDVAS(zio->io_bp));
3131                         ASSERT(BP_COUNT_GANG(zio->io_bp) == 0 ||
3132                             (BP_COUNT_GANG(zio->io_bp) ==
3133                             BP_GET_NDVAS(zio->io_bp)));
3134                 }
3135                 if (zio->io_flags & ZIO_FLAG_NOPWRITE)
3136                         VERIFY(BP_EQUAL(zio->io_bp, &zio->io_bp_orig));
3137         }
3138
3139         /*
3140          * If there were child vdev/gang/ddt errors, they apply to us now.
3141          */
3142         zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
3143         zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
3144         zio_inherit_child_errors(zio, ZIO_CHILD_DDT);
3145
3146         /*
3147          * If the I/O on the transformed data was successful, generate any
3148          * checksum reports now while we still have the transformed data.
3149          */
3150         if (zio->io_error == 0) {
3151                 while (zio->io_cksum_report != NULL) {
3152                         zio_cksum_report_t *zcr = zio->io_cksum_report;
3153                         uint64_t align = zcr->zcr_align;
3154                         uint64_t asize = P2ROUNDUP(zio->io_size, align);
3155                         char *abuf = zio->io_data;
3156
3157                         if (asize != zio->io_size) {
3158                                 abuf = zio_buf_alloc(asize);
3159                                 bcopy(zio->io_data, abuf, zio->io_size);
3160                                 bzero(abuf+zio->io_size, asize-zio->io_size);
3161                         }
3162
3163                         zio->io_cksum_report = zcr->zcr_next;
3164                         zcr->zcr_next = NULL;
3165                         zcr->zcr_finish(zcr, abuf);
3166                         zfs_ereport_free_checksum(zcr);
3167
3168                         if (asize != zio->io_size)
3169                                 zio_buf_free(abuf, asize);
3170                 }
3171         }
3172
3173         zio_pop_transforms(zio);        /* note: may set zio->io_error */
3174
3175         vdev_stat_update(zio, zio->io_size);
3176
3177         /*
3178          * If this I/O is attached to a particular vdev is slow, exceeding
3179          * 30 seconds to complete, post an error described the I/O delay.
3180          * We ignore these errors if the device is currently unavailable.
3181          */
3182         if (zio->io_delay >= MSEC_TO_TICK(zio_delay_max)) {
3183                 if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd))
3184                         zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa,
3185                             zio->io_vd, zio, 0, 0);
3186         }
3187
3188         if (zio->io_error) {
3189                 /*
3190                  * If this I/O is attached to a particular vdev,
3191                  * generate an error message describing the I/O failure
3192                  * at the block level.  We ignore these errors if the
3193                  * device is currently unavailable.
3194                  */
3195                 if (zio->io_error != ECKSUM && zio->io_vd != NULL &&
3196                         !vdev_is_dead(zio->io_vd))
3197                         zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa,
3198                                                 zio->io_vd, zio, 0, 0);
3199
3200                 if ((zio->io_error == EIO || !(zio->io_flags &
3201                     (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
3202                     zio == zio->io_logical) {
3203                         /*
3204                          * For logical I/O requests, tell the SPA to log the
3205                          * error and generate a logical data ereport.
3206                          */
3207                         spa_log_error(zio->io_spa, zio);
3208                         zfs_ereport_post(FM_EREPORT_ZFS_DATA, zio->io_spa,
3209                             NULL, zio, 0, 0);
3210                 }
3211         }
3212
3213         if (zio->io_error && zio == zio->io_logical) {
3214                 /*
3215                  * Determine whether zio should be reexecuted.  This will
3216                  * propagate all the way to the root via zio_notify_parent().
3217                  */
3218                 ASSERT(zio->io_vd == NULL && zio->io_bp != NULL);
3219                 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
3220
3221                 if (IO_IS_ALLOCATING(zio) &&
3222                     !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
3223                         if (zio->io_error != ENOSPC)
3224                                 zio->io_reexecute |= ZIO_REEXECUTE_NOW;
3225                         else
3226                                 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
3227                 }
3228
3229                 if ((zio->io_type == ZIO_TYPE_READ ||
3230                     zio->io_type == ZIO_TYPE_FREE) &&
3231                     !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
3232                     zio->io_error == ENXIO &&
3233                     spa_load_state(zio->io_spa) == SPA_LOAD_NONE &&
3234                     spa_get_failmode(zio->io_spa) != ZIO_FAILURE_MODE_CONTINUE)
3235                         zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
3236
3237                 if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
3238                         zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
3239
3240                 /*
3241                  * Here is a possibly good place to attempt to do
3242                  * either combinatorial reconstruction or error correction
3243                  * based on checksums.  It also might be a good place
3244                  * to send out preliminary ereports before we suspend
3245                  * processing.
3246                  */
3247         }
3248
3249         /*
3250          * If there were logical child errors, they apply to us now.
3251          * We defer this until now to avoid conflating logical child
3252          * errors with errors that happened to the zio itself when
3253          * updating vdev stats and reporting FMA events above.
3254          */
3255         zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
3256
3257         if ((zio->io_error || zio->io_reexecute) &&
3258             IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
3259             !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
3260                 zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp);
3261
3262         zio_gang_tree_free(&zio->io_gang_tree);
3263
3264         /*
3265          * Godfather I/Os should never suspend.
3266          */
3267         if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
3268             (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
3269                 zio->io_reexecute = 0;
3270
3271         if (zio->io_reexecute) {
3272                 /*
3273                  * This is a logical I/O that wants to reexecute.
3274                  *
3275                  * Reexecute is top-down.  When an i/o fails, if it's not
3276                  * the root, it simply notifies its parent and sticks around.
3277                  * The parent, seeing that it still has children in zio_done(),
3278                  * does the same.  This percolates all the way up to the root.
3279                  * The root i/o will reexecute or suspend the entire tree.
3280                  *
3281                  * This approach ensures that zio_reexecute() honors
3282                  * all the original i/o dependency relationships, e.g.
3283                  * parents not executing until children are ready.
3284                  */
3285                 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
3286
3287                 zio->io_gang_leader = NULL;
3288
3289                 mutex_enter(&zio->io_lock);
3290                 zio->io_state[ZIO_WAIT_DONE] = 1;
3291                 mutex_exit(&zio->io_lock);
3292
3293                 /*
3294                  * "The Godfather" I/O monitors its children but is
3295                  * not a true parent to them. It will track them through
3296                  * the pipeline but severs its ties whenever they get into
3297                  * trouble (e.g. suspended). This allows "The Godfather"
3298                  * I/O to return status without blocking.
3299                  */
3300                 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
3301                         zio_link_t *zl = zio->io_walk_link;
3302                         pio_next = zio_walk_parents(zio);
3303
3304                         if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
3305                             (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
3306                                 zio_remove_child(pio, zio, zl);
3307                                 zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
3308                         }
3309                 }
3310
3311                 if ((pio = zio_unique_parent(zio)) != NULL) {
3312                         /*
3313                          * We're not a root i/o, so there's nothing to do
3314                          * but notify our parent.  Don't propagate errors
3315                          * upward since we haven't permanently failed yet.
3316                          */
3317                         ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
3318                         zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
3319                         zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
3320                 } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
3321                         /*
3322                          * We'd fail again if we reexecuted now, so suspend
3323                          * until conditions improve (e.g. device comes online).
3324                          */
3325                         zio_suspend(zio->io_spa, zio);
3326                 } else {
3327                         /*
3328                          * Reexecution is potentially a huge amount of work.
3329                          * Hand it off to the otherwise-unused claim taskq.
3330                          */
3331                         ASSERT(taskq_empty_ent(&zio->io_tqent));
3332                         spa_taskq_dispatch_ent(zio->io_spa,
3333                             ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE,
3334                             (task_func_t *)zio_reexecute, zio, 0,
3335                             &zio->io_tqent);
3336                 }
3337                 return (ZIO_PIPELINE_STOP);
3338         }
3339
3340         ASSERT(zio->io_child_count == 0);
3341         ASSERT(zio->io_reexecute == 0);
3342         ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
3343
3344         /*
3345          * Report any checksum errors, since the I/O is complete.
3346          */
3347         while (zio->io_cksum_report != NULL) {
3348                 zio_cksum_report_t *zcr = zio->io_cksum_report;
3349                 zio->io_cksum_report = zcr->zcr_next;
3350                 zcr->zcr_next = NULL;
3351                 zcr->zcr_finish(zcr, NULL);
3352                 zfs_ereport_free_checksum(zcr);
3353         }
3354
3355         if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp &&
3356             !BP_IS_HOLE(zio->io_bp) && !BP_IS_EMBEDDED(zio->io_bp) &&
3357             !(zio->io_flags & ZIO_FLAG_NOPWRITE)) {
3358                 metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp);
3359         }
3360
3361         /*
3362          * It is the responsibility of the done callback to ensure that this
3363          * particular zio is no longer discoverable for adoption, and as
3364          * such, cannot acquire any new parents.
3365          */
3366         if (zio->io_done)
3367                 zio->io_done(zio);
3368
3369         mutex_enter(&zio->io_lock);
3370         zio->io_state[ZIO_WAIT_DONE] = 1;
3371         mutex_exit(&zio->io_lock);
3372
3373         for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
3374                 zio_link_t *zl = zio->io_walk_link;
3375                 pio_next = zio_walk_parents(zio);
3376                 zio_remove_child(pio, zio, zl);
3377                 zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
3378         }
3379
3380         if (zio->io_waiter != NULL) {
3381                 mutex_enter(&zio->io_lock);
3382                 zio->io_executor = NULL;
3383                 cv_broadcast(&zio->io_cv);
3384                 mutex_exit(&zio->io_lock);
3385         } else {
3386                 zio_destroy(zio);
3387         }
3388
3389         return (ZIO_PIPELINE_STOP);
3390 }
3391
3392 /*
3393  * ==========================================================================
3394  * I/O pipeline definition
3395  * ==========================================================================
3396  */
3397 static zio_pipe_stage_t *zio_pipeline[] = {
3398         NULL,
3399         zio_read_bp_init,
3400         zio_free_bp_init,
3401         zio_issue_async,
3402         zio_write_bp_init,
3403         zio_checksum_generate,
3404         zio_nop_write,
3405         zio_ddt_read_start,
3406         zio_ddt_read_done,
3407         zio_ddt_write,
3408         zio_ddt_free,
3409         zio_gang_assemble,
3410         zio_gang_issue,
3411         zio_dva_allocate,
3412         zio_dva_free,
3413         zio_dva_claim,
3414         zio_ready,
3415         zio_vdev_io_start,
3416         zio_vdev_io_done,
3417         zio_vdev_io_assess,
3418         zio_checksum_verify,
3419         zio_done
3420 };
3421
3422 /* dnp is the dnode for zb1->zb_object */
3423 boolean_t
3424 zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_phys_t *zb1,
3425     const zbookmark_phys_t *zb2)
3426 {
3427         uint64_t zb1nextL0, zb2thisobj;
3428
3429         ASSERT(zb1->zb_objset == zb2->zb_objset);
3430         ASSERT(zb2->zb_level == 0);
3431
3432         /* The objset_phys_t isn't before anything. */
3433         if (dnp == NULL)
3434                 return (B_FALSE);
3435
3436         zb1nextL0 = (zb1->zb_blkid + 1) <<
3437             ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
3438
3439         zb2thisobj = zb2->zb_object ? zb2->zb_object :
3440             zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
3441
3442         if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
3443                 uint64_t nextobj = zb1nextL0 *
3444                     (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
3445                 return (nextobj <= zb2thisobj);
3446         }
3447
3448         if (zb1->zb_object < zb2thisobj)
3449                 return (B_TRUE);
3450         if (zb1->zb_object > zb2thisobj)
3451                 return (B_FALSE);
3452         if (zb2->zb_object == DMU_META_DNODE_OBJECT)
3453                 return (B_FALSE);
3454         return (zb1nextL0 <= zb2->zb_blkid);
3455 }
3456
3457 #if defined(_KERNEL) && defined(HAVE_SPL)
3458 EXPORT_SYMBOL(zio_type_name);
3459 EXPORT_SYMBOL(zio_buf_alloc);
3460 EXPORT_SYMBOL(zio_data_buf_alloc);
3461 EXPORT_SYMBOL(zio_buf_free);
3462 EXPORT_SYMBOL(zio_data_buf_free);
3463
3464 module_param(zio_delay_max, int, 0644);
3465 MODULE_PARM_DESC(zio_delay_max, "Max zio millisec delay before posting event");
3466
3467 module_param(zio_requeue_io_start_cut_in_line, int, 0644);
3468 MODULE_PARM_DESC(zio_requeue_io_start_cut_in_line, "Prioritize requeued I/O");
3469
3470 module_param(zfs_sync_pass_deferred_free, int, 0644);
3471 MODULE_PARM_DESC(zfs_sync_pass_deferred_free,
3472         "Defer frees starting in this pass");
3473
3474 module_param(zfs_sync_pass_dont_compress, int, 0644);
3475 MODULE_PARM_DESC(zfs_sync_pass_dont_compress,
3476         "Don't compress starting in this pass");
3477
3478 module_param(zfs_sync_pass_rewrite, int, 0644);
3479 MODULE_PARM_DESC(zfs_sync_pass_rewrite,
3480         "Rewrite new bps starting in this pass");
3481 #endif