module/zfs/zio.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  24  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  25  */
  26
  27 #include <sys/sysmacros.h>
  28 #include <sys/zfs_context.h>
  29 #include <sys/fm/fs/zfs.h>
  30 #include <sys/spa.h>
  31 #include <sys/txg.h>
  32 #include <sys/spa_impl.h>
  33 #include <sys/vdev_impl.h>
  34 #include <sys/zio_impl.h>
  35 #include <sys/zio_compress.h>
  36 #include <sys/zio_checksum.h>
  37 #include <sys/dmu_objset.h>
  38 #include <sys/arc.h>
  39 #include <sys/ddt.h>
  40 #include <sys/blkptr.h>
  41 #include <sys/zfeature.h>
  42
  43 /*
  44  * ==========================================================================
  45  * I/O type descriptions
  46  * ==========================================================================
  47  */
/*
 * Short printable name for each I/O type.  The table has ZIO_TYPES
 * entries; presumably it is indexed by zio_type_t and the order of the
 * strings must track that enum -- confirm against its declaration.
 */
const char *zio_type_name[ZIO_TYPES] = {
        "z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl"
};
  51
  52 /*
  53  * ==========================================================================
  54  * I/O kmem caches
  55  * ==========================================================================
  56  */
/* Cache of zio_t structures; created in zio_init(). */
kmem_cache_t *zio_cache;
/* Cache of zio_link_t structures (parent/child links); see zio_add_child(). */
kmem_cache_t *zio_link_cache;
/*
 * Buffer caches, one slot per multiple of SPA_MINBLOCKSIZE up to
 * SPA_MAXBLOCKSIZE.  Slot c serves sizes up to (c + 1) << SPA_MINBLOCKSHIFT.
 * zio_init() does not create a distinct cache for every slot; slots without
 * their own cache share the next larger one.
 */
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
/* Tunable, defaults to ZIO_DELAY_MAX; not consumed in this part of the file. */
int zio_delay_max = ZIO_DELAY_MAX;
  62
/*
 * NOTE(review): presumably the values pipeline stage functions return to
 * tell the executor whether to keep going -- not used in this part of the
 * file, confirm against the stage implementations.
 */
#define ZIO_PIPELINE_CONTINUE           0x100
#define ZIO_PIPELINE_STOP               0x101

/*
 * Evaluates to 2^(level * (indblkshift - SPA_BLKPTRSHIFT)) as a uint64_t.
 * Presumably the number of blocks spanned by one block pointer at the given
 * indirection level -- confirm against callers.
 */
#define BP_SPANB(indblkshift, level) \
        (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
/* NOTE(review): sentinel level value; its consumer is not visible here. */
#define COMPARE_META_LEVEL      0x80000000ul
  69 /*
 * The following actions directly affect the spa's sync-to-convergence logic.
  71  * The values below define the sync pass when we start performing the action.
  72  * Care should be taken when changing these values as they directly impact
  73  * spa_sync() performance. Tuning these values may introduce subtle performance
  74  * pathologies and should only be done in the context of performance analysis.
  75  * These tunables will eventually be removed and replaced with #defines once
  76  * enough analysis has been done to determine optimal values.
  77  *
  78  * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
  79  * regular blocks are not deferred.
  80  */
/*
 * zfs_sync_pass_deferred_free is compared against spa_sync_pass() in
 * zio_free(): once the current pass reaches this value, frees are appended
 * to the per-txg free bplist instead of being issued immediately.
 */
int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
  84
  85 /*
  86  * An allocating zio is one that either currently has the DVA allocate
  87  * stage set or will have it later in its lifetime.
  88  */
#define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)

/* Tunable; its consumer is not in this part of the file. */
int zio_requeue_io_start_cut_in_line = 1;

/*
 * Buffer caches for sizes larger than this limit are created with
 * KMC_NODEBUG in zio_init().  With the non-debug default of 0 that applies
 * to every buffer cache.
 */
#ifdef ZFS_DEBUG
int zio_buf_debug_limit = 16384;
#else
int zio_buf_debug_limit = 0;
#endif

/* Forward declaration: zio_notify_parent() calls this to resume a parent. */
static inline void __zio_execute(zio_t *zio);
 100
 101 void
 102 zio_init(void)
 103 {
 104         size_t c;
 105         vmem_t *data_alloc_arena = NULL;
 106
 107         zio_cache = kmem_cache_create("zio_cache",
 108             sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 109         zio_link_cache = kmem_cache_create("zio_link_cache",
 110             sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 111
 112         /*
 113          * For small buffers, we want a cache for each multiple of
 114          * SPA_MINBLOCKSIZE.  For larger buffers, we want a cache
 115          * for each quarter-power of 2.
 116          */
 117         for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
 118                 size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
 119                 size_t p2 = size;
 120                 size_t align = 0;
 121                 size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0;
 122
 123 #ifdef _ILP32
 124                 /*
 125                  * Cache size limited to 1M on 32-bit platforms until ARC
 126                  * buffers no longer require virtual address space.
 127                  */
 128                 if (size > zfs_max_recordsize)
 129                         break;
 130 #endif
 131
 132                 while (!ISP2(p2))
 133                         p2 &= p2 - 1;
 134
 135 #ifndef _KERNEL
 136                 /*
 137                  * If we are using watchpoints, put each buffer on its own page,
 138                  * to eliminate the performance overhead of trapping to the
 139                  * kernel when modifying a non-watched buffer that shares the
 140                  * page with a watched buffer.
 141                  */
 142                 if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
 143                         continue;
 144 #endif
 145                 if (size <= 4 * SPA_MINBLOCKSIZE) {
 146                         align = SPA_MINBLOCKSIZE;
 147                 } else if (IS_P2ALIGNED(size, p2 >> 2)) {
 148                         align = MIN(p2 >> 2, PAGESIZE);
 149                 }
 150
 151                 if (align != 0) {
 152                         char name[36];
 153                         (void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
 154                         zio_buf_cache[c] = kmem_cache_create(name, size,
 155                             align, NULL, NULL, NULL, NULL, NULL, cflags);
 156
 157                         (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
 158                         zio_data_buf_cache[c] = kmem_cache_create(name, size,
 159                             align, NULL, NULL, NULL, NULL,
 160                             data_alloc_arena, cflags);
 161                 }
 162         }
 163
 164         while (--c != 0) {
 165                 ASSERT(zio_buf_cache[c] != NULL);
 166                 if (zio_buf_cache[c - 1] == NULL)
 167                         zio_buf_cache[c - 1] = zio_buf_cache[c];
 168
 169                 ASSERT(zio_data_buf_cache[c] != NULL);
 170                 if (zio_data_buf_cache[c - 1] == NULL)
 171                         zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
 172         }
 173
 174         zio_inject_init();
 175
 176         lz4_init();
 177 }
 178
 179 void
 180 zio_fini(void)
 181 {
 182         size_t c;
 183         kmem_cache_t *last_cache = NULL;
 184         kmem_cache_t *last_data_cache = NULL;
 185
 186         for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
 187 #ifdef _ILP32
 188                 /*
 189                  * Cache size limited to 1M on 32-bit platforms until ARC
 190                  * buffers no longer require virtual address space.
 191                  */
 192                 if (((c + 1) << SPA_MINBLOCKSHIFT) > zfs_max_recordsize)
 193                         break;
 194 #endif
 195                 if (zio_buf_cache[c] != last_cache) {
 196                         last_cache = zio_buf_cache[c];
 197                         kmem_cache_destroy(zio_buf_cache[c]);
 198                 }
 199                 zio_buf_cache[c] = NULL;
 200
 201                 if (zio_data_buf_cache[c] != last_data_cache) {
 202                         last_data_cache = zio_data_buf_cache[c];
 203                         kmem_cache_destroy(zio_data_buf_cache[c]);
 204                 }
 205                 zio_data_buf_cache[c] = NULL;
 206         }
 207
 208         kmem_cache_destroy(zio_link_cache);
 209         kmem_cache_destroy(zio_cache);
 210
 211         zio_inject_fini();
 212
 213         lz4_fini();
 214 }
 215
 216 /*
 217  * ==========================================================================
 218  * Allocate and free I/O buffers
 219  * ==========================================================================
 220  */
 221
 222 /*
 223  * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
 224  * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
 225  * useful to inspect ZFS metadata, but if possible, we should avoid keeping
 226  * excess / transient data in-core during a crashdump.
 227  */
 228 void *
 229 zio_buf_alloc(size_t size)
 230 {
 231         size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 232
 233         VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 234
 235         return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
 236 }
 237
 238 /*
 239  * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
 240  * crashdump if the kernel panics.  This exists so that we will limit the amount
 241  * of ZFS data that shows up in a kernel crashdump.  (Thus reducing the amount
 242  * of kernel heap dumped to disk when the kernel panics)
 243  */
 244 void *
 245 zio_data_buf_alloc(size_t size)
 246 {
 247         size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 248
 249         VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 250
 251         return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
 252 }
 253
 254 /*
 255  * Use zio_buf_alloc_flags when specific allocation flags are needed.  e.g.
 256  * passing KM_NOSLEEP when it is acceptable for an allocation to fail.
 257  */
 258 void *
 259 zio_buf_alloc_flags(size_t size, int flags)
 260 {
 261         size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 262
 263         VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 264
 265         return (kmem_cache_alloc(zio_buf_cache[c], flags));
 266 }
 267
 268 void
 269 zio_buf_free(void *buf, size_t size)
 270 {
 271         size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 272
 273         VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 274
 275         kmem_cache_free(zio_buf_cache[c], buf);
 276 }
 277
 278 void
 279 zio_data_buf_free(void *buf, size_t size)
 280 {
 281         size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 282
 283         VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 284
 285         kmem_cache_free(zio_data_buf_cache[c], buf);
 286 }
 287
 288 /*
 289  * ==========================================================================
 290  * Push and pop I/O transform buffers
 291  * ==========================================================================
 292  */
 293 static void
 294 zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
 295         zio_transform_func_t *transform)
 296 {
 297         zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
 298
 299         zt->zt_orig_data = zio->io_data;
 300         zt->zt_orig_size = zio->io_size;
 301         zt->zt_bufsize = bufsize;
 302         zt->zt_transform = transform;
 303
 304         zt->zt_next = zio->io_transform_stack;
 305         zio->io_transform_stack = zt;
 306
 307         zio->io_data = data;
 308         zio->io_size = size;
 309 }
 310
 311 static void
 312 zio_pop_transforms(zio_t *zio)
 313 {
 314         zio_transform_t *zt;
 315
 316         while ((zt = zio->io_transform_stack) != NULL) {
 317                 if (zt->zt_transform != NULL)
 318                         zt->zt_transform(zio,
 319                             zt->zt_orig_data, zt->zt_orig_size);
 320
 321                 if (zt->zt_bufsize != 0)
 322                         zio_buf_free(zio->io_data, zt->zt_bufsize);
 323
 324                 zio->io_data = zt->zt_orig_data;
 325                 zio->io_size = zt->zt_orig_size;
 326                 zio->io_transform_stack = zt->zt_next;
 327
 328                 kmem_free(zt, sizeof (zio_transform_t));
 329         }
 330 }
 331
 332 /*
 333  * ==========================================================================
 334  * I/O transform callbacks for subblocks and decompression
 335  * ==========================================================================
 336  */
 337 static void
 338 zio_subblock(zio_t *zio, void *data, uint64_t size)
 339 {
 340         ASSERT(zio->io_size > size);
 341
 342         if (zio->io_type == ZIO_TYPE_READ)
 343                 bcopy(zio->io_data, data, size);
 344 }
 345
 346 static void
 347 zio_decompress(zio_t *zio, void *data, uint64_t size)
 348 {
 349         if (zio->io_error == 0 &&
 350             zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
 351             zio->io_data, data, zio->io_size, size) != 0)
 352                 zio->io_error = SET_ERROR(EIO);
 353 }
 354
 355 /*
 356  * ==========================================================================
 357  * I/O parent/child relationships and pipeline interlocks
 358  * ==========================================================================
 359  */
 360 /*
 361  * NOTE - Callers to zio_walk_parents() and zio_walk_children must
 362  *        continue calling these functions until they return NULL.
 363  *        Otherwise, the next caller will pick up the list walk in
 364  *        some indeterminate state.  (Otherwise every caller would
 365  *        have to pass in a cookie to keep the state represented by
 366  *        io_walk_link, which gets annoying.)
 367  */
 368 zio_t *
 369 zio_walk_parents(zio_t *cio)
 370 {
 371         zio_link_t *zl = cio->io_walk_link;
 372         list_t *pl = &cio->io_parent_list;
 373
 374         zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
 375         cio->io_walk_link = zl;
 376
 377         if (zl == NULL)
 378                 return (NULL);
 379
 380         ASSERT(zl->zl_child == cio);
 381         return (zl->zl_parent);
 382 }
 383
 384 zio_t *
 385 zio_walk_children(zio_t *pio)
 386 {
 387         zio_link_t *zl = pio->io_walk_link;
 388         list_t *cl = &pio->io_child_list;
 389
 390         zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
 391         pio->io_walk_link = zl;
 392
 393         if (zl == NULL)
 394                 return (NULL);
 395
 396         ASSERT(zl->zl_parent == pio);
 397         return (zl->zl_child);
 398 }
 399
 400 zio_t *
 401 zio_unique_parent(zio_t *cio)
 402 {
 403         zio_t *pio = zio_walk_parents(cio);
 404
 405         VERIFY(zio_walk_parents(cio) == NULL);
 406         return (pio);
 407 }
 408
 409 void
 410 zio_add_child(zio_t *pio, zio_t *cio)
 411 {
 412         zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
 413         int w;
 414
 415         /*
 416          * Logical I/Os can have logical, gang, or vdev children.
 417          * Gang I/Os can have gang or vdev children.
 418          * Vdev I/Os can only have vdev children.
 419          * The following ASSERT captures all of these constraints.
 420          */
 421         ASSERT(cio->io_child_type <= pio->io_child_type);
 422
 423         zl->zl_parent = pio;
 424         zl->zl_child = cio;
 425
 426         mutex_enter(&cio->io_lock);
 427         mutex_enter(&pio->io_lock);
 428
 429         ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
 430
 431         for (w = 0; w < ZIO_WAIT_TYPES; w++)
 432                 pio->io_children[cio->io_child_type][w] += !cio->io_state[w];
 433
 434         list_insert_head(&pio->io_child_list, zl);
 435         list_insert_head(&cio->io_parent_list, zl);
 436
 437         pio->io_child_count++;
 438         cio->io_parent_count++;
 439
 440         mutex_exit(&pio->io_lock);
 441         mutex_exit(&cio->io_lock);
 442 }
 443
 444 static void
 445 zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
 446 {
 447         ASSERT(zl->zl_parent == pio);
 448         ASSERT(zl->zl_child == cio);
 449
 450         mutex_enter(&cio->io_lock);
 451         mutex_enter(&pio->io_lock);
 452
 453         list_remove(&pio->io_child_list, zl);
 454         list_remove(&cio->io_parent_list, zl);
 455
 456         pio->io_child_count--;
 457         cio->io_parent_count--;
 458
 459         mutex_exit(&pio->io_lock);
 460         mutex_exit(&cio->io_lock);
 461
 462         kmem_cache_free(zio_link_cache, zl);
 463 }
 464
 465 static boolean_t
 466 zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
 467 {
 468         uint64_t *countp = &zio->io_children[child][wait];
 469         boolean_t waiting = B_FALSE;
 470
 471         mutex_enter(&zio->io_lock);
 472         ASSERT(zio->io_stall == NULL);
 473         if (*countp != 0) {
 474                 zio->io_stage >>= 1;
 475                 zio->io_stall = countp;
 476                 waiting = B_TRUE;
 477         }
 478         mutex_exit(&zio->io_lock);
 479
 480         return (waiting);
 481 }
 482
 483 __attribute__((always_inline))
 484 static inline void
 485 zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
 486 {
 487         uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
 488         int *errorp = &pio->io_child_error[zio->io_child_type];
 489
 490         mutex_enter(&pio->io_lock);
 491         if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
 492                 *errorp = zio_worst_error(*errorp, zio->io_error);
 493         pio->io_reexecute |= zio->io_reexecute;
 494         ASSERT3U(*countp, >, 0);
 495
 496         (*countp)--;
 497
 498         if (*countp == 0 && pio->io_stall == countp) {
 499                 pio->io_stall = NULL;
 500                 mutex_exit(&pio->io_lock);
 501                 __zio_execute(pio);
 502         } else {
 503                 mutex_exit(&pio->io_lock);
 504         }
 505 }
 506
 507 static void
 508 zio_inherit_child_errors(zio_t *zio, enum zio_child c)
 509 {
 510         if (zio->io_child_error[c] != 0 && zio->io_error == 0)
 511                 zio->io_error = zio->io_child_error[c];
 512 }
 513
 514 /*
 515  * ==========================================================================
 516  * Create the various types of I/O (read, write, free, etc)
 517  * ==========================================================================
 518  */
/*
 * Common constructor behind every zio_*() creation routine in this file.
 *
 * Allocates a zeroed zio_t from zio_cache, initializes its lock, condition
 * variable and parent/child lists, derives its child type from vd/flags,
 * records the caller's parameters (keeping "orig" copies of data, size,
 * flags, stage and pipeline), and, when pio is non-NULL, links the new zio
 * under that parent.  The zio is returned without being executed.
 *
 * size and offset must be multiples of SPA_MINBLOCKSIZE and size must not
 * exceed SPA_MAXBLOCKSIZE (asserted).  A zio without a vdev must start at
 * ZIO_STAGE_OPEN (asserted).
 */
static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_type_t type, zio_priority_t priority, enum zio_flag flags,
    vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb,
    enum zio_stage stage, enum zio_stage pipeline)
{
        zio_t *zio;

        ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
        ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
        ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);

        ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
        ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
        ASSERT(vd || stage == ZIO_STAGE_OPEN);

        zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
        bzero(zio, sizeof (zio_t));

        mutex_init(&zio->io_lock, NULL, MUTEX_NOLOCKDEP, NULL);
        cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);

        list_create(&zio->io_parent_list, sizeof (zio_link_t),
            offsetof(zio_link_t, zl_parent_node));
        list_create(&zio->io_child_list, sizeof (zio_link_t),
            offsetof(zio_link_t, zl_child_node));

        /* A vdev takes precedence over the GANG/DDT child flags. */
        if (vd != NULL)
                zio->io_child_type = ZIO_CHILD_VDEV;
        else if (flags & ZIO_FLAG_GANG_CHILD)
                zio->io_child_type = ZIO_CHILD_GANG;
        else if (flags & ZIO_FLAG_DDT_CHILD)
                zio->io_child_type = ZIO_CHILD_DDT;
        else
                zio->io_child_type = ZIO_CHILD_LOGICAL;

        if (bp != NULL) {
                /*
                 * Snapshot the bp twice (working copy and pristine
                 * original).  Only non-DDT writes keep pointing at the
                 * caller's bp; everything else uses the private copy.
                 */
                zio->io_bp = (blkptr_t *)bp;
                zio->io_bp_copy = *bp;
                zio->io_bp_orig = *bp;
                if (type != ZIO_TYPE_WRITE ||
                    zio->io_child_type == ZIO_CHILD_DDT)
                        zio->io_bp = &zio->io_bp_copy;  /* so caller can free */
                /* A logical zio with a bp is its own logical I/O. */
                if (zio->io_child_type == ZIO_CHILD_LOGICAL)
                        zio->io_logical = zio;
                /* Gang bps on child types above GANG get the gang stages. */
                if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
                        pipeline |= ZIO_GANG_STAGES;
        }

        zio->io_spa = spa;
        zio->io_txg = txg;
        zio->io_done = done;
        zio->io_private = private;
        zio->io_type = type;
        zio->io_priority = priority;
        zio->io_vd = vd;
        zio->io_offset = offset;
        zio->io_orig_data = zio->io_data = data;
        zio->io_orig_size = zio->io_size = size;
        zio->io_orig_flags = zio->io_flags = flags;
        zio->io_orig_stage = zio->io_stage = stage;
        zio->io_orig_pipeline = zio->io_pipeline = pipeline;

        /* A zio created at or past READY/DONE starts with that state set. */
        zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
        zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);

        if (zb != NULL)
                zio->io_bookmark = *zb;

        if (pio != NULL) {
                /* Inherit the logical I/O and gang leader from the parent. */
                if (zio->io_logical == NULL)
                        zio->io_logical = pio->io_logical;
                if (zio->io_child_type == ZIO_CHILD_GANG)
                        zio->io_gang_leader = pio->io_gang_leader;
                /*
                 * Link last: zio_add_child() reads io_child_type and
                 * io_state[], which are only valid once set above.
                 */
                zio_add_child(pio, zio);
        }

        taskq_init_ent(&zio->io_tqent);

        return (zio);
}
 601
 602 static void
 603 zio_destroy(zio_t *zio)
 604 {
 605         list_destroy(&zio->io_parent_list);
 606         list_destroy(&zio->io_child_list);
 607         mutex_destroy(&zio->io_lock);
 608         cv_destroy(&zio->io_cv);
 609         kmem_cache_free(zio_cache, zio);
 610 }
 611
 612 zio_t *
 613 zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
 614     void *private, enum zio_flag flags)
 615 {
 616         zio_t *zio;
 617
 618         zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
 619             ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
 620             ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);
 621
 622         return (zio);
 623 }
 624
 625 zio_t *
 626 zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
 627 {
 628         return (zio_null(NULL, spa, NULL, done, private, flags));
 629 }
 630
 631 void
 632 zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp)
 633 {
 634         int i;
 635
 636         if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) {
 637                 zfs_panic_recover("blkptr at %p has invalid TYPE %llu",
 638                     bp, (longlong_t)BP_GET_TYPE(bp));
 639         }
 640         if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS ||
 641             BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) {
 642                 zfs_panic_recover("blkptr at %p has invalid CHECKSUM %llu",
 643                     bp, (longlong_t)BP_GET_CHECKSUM(bp));
 644         }
 645         if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS ||
 646             BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) {
 647                 zfs_panic_recover("blkptr at %p has invalid COMPRESS %llu",
 648                     bp, (longlong_t)BP_GET_COMPRESS(bp));
 649         }
 650         if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) {
 651                 zfs_panic_recover("blkptr at %p has invalid LSIZE %llu",
 652                     bp, (longlong_t)BP_GET_LSIZE(bp));
 653         }
 654         if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) {
 655                 zfs_panic_recover("blkptr at %p has invalid PSIZE %llu",
 656                     bp, (longlong_t)BP_GET_PSIZE(bp));
 657         }
 658
 659         if (BP_IS_EMBEDDED(bp)) {
 660                 if (BPE_GET_ETYPE(bp) > NUM_BP_EMBEDDED_TYPES) {
 661                         zfs_panic_recover("blkptr at %p has invalid ETYPE %llu",
 662                             bp, (longlong_t)BPE_GET_ETYPE(bp));
 663                 }
 664         }
 665
 666         /*
 667          * Pool-specific checks.
 668          *
 669          * Note: it would be nice to verify that the blk_birth and
 670          * BP_PHYSICAL_BIRTH() are not too large.  However, spa_freeze()
 671          * allows the birth time of log blocks (and dmu_sync()-ed blocks
 672          * that are in the log) to be arbitrarily large.
 673          */
 674         for (i = 0; i < BP_GET_NDVAS(bp); i++) {
 675                 uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]);
 676                 vdev_t *vd;
 677                 uint64_t offset, asize;
 678                 if (vdevid >= spa->spa_root_vdev->vdev_children) {
 679                         zfs_panic_recover("blkptr at %p DVA %u has invalid "
 680                             "VDEV %llu",
 681                             bp, i, (longlong_t)vdevid);
 682                 }
 683                 vd = spa->spa_root_vdev->vdev_child[vdevid];
 684                 if (vd == NULL) {
 685                         zfs_panic_recover("blkptr at %p DVA %u has invalid "
 686                             "VDEV %llu",
 687                             bp, i, (longlong_t)vdevid);
 688                 }
 689                 if (vd->vdev_ops == &vdev_hole_ops) {
 690                         zfs_panic_recover("blkptr at %p DVA %u has hole "
 691                             "VDEV %llu",
 692                             bp, i, (longlong_t)vdevid);
 693
 694                 }
 695                 if (vd->vdev_ops == &vdev_missing_ops) {
 696                         /*
 697                          * "missing" vdevs are valid during import, but we
 698                          * don't have their detailed info (e.g. asize), so
 699                          * we can't perform any more checks on them.
 700                          */
 701                         continue;
 702                 }
 703                 offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
 704                 asize = DVA_GET_ASIZE(&bp->blk_dva[i]);
 705                 if (BP_IS_GANG(bp))
 706                         asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
 707                 if (offset + asize > vd->vdev_asize) {
 708                         zfs_panic_recover("blkptr at %p DVA %u has invalid "
 709                             "OFFSET %llu",
 710                             bp, i, (longlong_t)offset);
 711                 }
 712         }
 713 }
 714
 715 zio_t *
 716 zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
 717     void *data, uint64_t size, zio_done_func_t *done, void *private,
 718     zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
 719 {
 720         zio_t *zio;
 721
 722         zfs_blkptr_verify(spa, bp);
 723
 724         zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
 725             data, size, done, private,
 726             ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
 727             ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
 728             ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);
 729
 730         return (zio);
 731 }
 732
 733 zio_t *
 734 zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
 735     void *data, uint64_t size, const zio_prop_t *zp,
 736     zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done,
 737     void *private,
 738     zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
 739 {
 740         zio_t *zio;
 741
 742         ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
 743             zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
 744             zp->zp_compress >= ZIO_COMPRESS_OFF &&
 745             zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
 746             DMU_OT_IS_VALID(zp->zp_type) &&
 747             zp->zp_level < 32 &&
 748             zp->zp_copies > 0 &&
 749             zp->zp_copies <= spa_max_replication(spa));
 750
 751         zio = zio_create(pio, spa, txg, bp, data, size, done, private,
 752             ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
 753             ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
 754             ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
 755
 756         zio->io_ready = ready;
 757         zio->io_physdone = physdone;
 758         zio->io_prop = *zp;
 759
 760         /*
 761          * Data can be NULL if we are going to call zio_write_override() to
 762          * provide the already-allocated BP.  But we may need the data to
 763          * verify a dedup hit (if requested).  In this case, don't try to
 764          * dedup (just take the already-allocated BP verbatim).
 765          */
 766         if (data == NULL && zio->io_prop.zp_dedup_verify) {
 767                 zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
 768         }
 769
 770         return (zio);
 771 }
 772
 773 zio_t *
 774 zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
 775     uint64_t size, zio_done_func_t *done, void *private,
 776     zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb)
 777 {
 778         zio_t *zio;
 779
 780         zio = zio_create(pio, spa, txg, bp, data, size, done, private,
 781             ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
 782             ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
 783
 784         return (zio);
 785 }
 786
 787 void
 788 zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
 789 {
 790         ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 791         ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 792         ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
 793         ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
 794
 795         /*
 796          * We must reset the io_prop to match the values that existed
 797          * when the bp was first written by dmu_sync() keeping in mind
 798          * that nopwrite and dedup are mutually exclusive.
 799          */
 800         zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
 801         zio->io_prop.zp_nopwrite = nopwrite;
 802         zio->io_prop.zp_copies = copies;
 803         zio->io_bp_override = bp;
 804 }
 805
 806 void
 807 zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
 808 {
 809
 810         /*
 811          * The check for EMBEDDED is a performance optimization.  We
 812          * process the free here (by ignoring it) rather than
 813          * putting it on the list and then processing it in zio_free_sync().
 814          */
 815         if (BP_IS_EMBEDDED(bp))
 816                 return;
 817         metaslab_check_free(spa, bp);
 818
 819         /*
 820          * Frees that are for the currently-syncing txg, are not going to be
 821          * deferred, and which will not need to do a read (i.e. not GANG or
 822          * DEDUP), can be processed immediately.  Otherwise, put them on the
 823          * in-memory list for later processing.
 824          */
 825         if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
 826             txg != spa->spa_syncing_txg ||
 827             spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) {
 828                 bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
 829         } else {
 830                 VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp, 0)));
 831         }
 832 }
 833
 834 zio_t *
 835 zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 836     enum zio_flag flags)
 837 {
 838         zio_t *zio;
 839         enum zio_stage stage = ZIO_FREE_PIPELINE;
 840
 841         ASSERT(!BP_IS_HOLE(bp));
 842         ASSERT(spa_syncing_txg(spa) == txg);
 843         ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);
 844
 845         if (BP_IS_EMBEDDED(bp))
 846                 return (zio_null(pio, spa, NULL, NULL, NULL, 0));
 847
 848         metaslab_check_free(spa, bp);
 849         arc_freed(spa, bp);
 850
 851         /*
 852          * GANG and DEDUP blocks can induce a read (for the gang block header,
 853          * or the DDT), so issue them asynchronously so that this thread is
 854          * not tied up.
 855          */
 856         if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
 857                 stage |= ZIO_STAGE_ISSUE_ASYNC;
 858
 859         zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
 860             NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags,
 861             NULL, 0, NULL, ZIO_STAGE_OPEN, stage);
 862
 863         return (zio);
 864 }
 865
 866 zio_t *
 867 zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 868     zio_done_func_t *done, void *private, enum zio_flag flags)
 869 {
 870         zio_t *zio;
 871
 872         dprintf_bp(bp, "claiming in txg %llu", txg);
 873
 874         if (BP_IS_EMBEDDED(bp))
 875                 return (zio_null(pio, spa, NULL, NULL, NULL, 0));
 876
 877         /*
 878          * A claim is an allocation of a specific block.  Claims are needed
 879          * to support immediate writes in the intent log.  The issue is that
 880          * immediate writes contain committed data, but in a txg that was
 881          * *not* committed.  Upon opening the pool after an unclean shutdown,
 882          * the intent log claims all blocks that contain immediate write data
 883          * so that the SPA knows they're in use.
 884          *
 885          * All claims *must* be resolved in the first txg -- before the SPA
 886          * starts allocating blocks -- so that nothing is allocated twice.
 887          * If txg == 0 we just verify that the block is claimable.
 888          */
 889         ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
 890         ASSERT(txg == spa_first_txg(spa) || txg == 0);
 891         ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));       /* zdb(1M) */
 892
 893         zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
 894             done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
 895             NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
 896
 897         return (zio);
 898 }
 899
 900 zio_t *
 901 zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
 902     zio_done_func_t *done, void *private, enum zio_flag flags)
 903 {
 904         zio_t *zio;
 905         int c;
 906
 907         if (vd->vdev_children == 0) {
 908                 zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
 909                     ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
 910                     ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
 911
 912                 zio->io_cmd = cmd;
 913         } else {
 914                 zio = zio_null(pio, spa, NULL, NULL, NULL, flags);
 915
 916                 for (c = 0; c < vd->vdev_children; c++)
 917                         zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
 918                             done, private, flags));
 919         }
 920
 921         return (zio);
 922 }
 923
 924 zio_t *
 925 zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
 926     void *data, int checksum, zio_done_func_t *done, void *private,
 927     zio_priority_t priority, enum zio_flag flags, boolean_t labels)
 928 {
 929         zio_t *zio;
 930
 931         ASSERT(vd->vdev_children == 0);
 932         ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
 933             offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
 934         ASSERT3U(offset + size, <=, vd->vdev_psize);
 935
 936         zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
 937             ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
 938             NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
 939
 940         zio->io_prop.zp_checksum = checksum;
 941
 942         return (zio);
 943 }
 944
 945 zio_t *
 946 zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
 947     void *data, int checksum, zio_done_func_t *done, void *private,
 948     zio_priority_t priority, enum zio_flag flags, boolean_t labels)
 949 {
 950         zio_t *zio;
 951
 952         ASSERT(vd->vdev_children == 0);
 953         ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
 954             offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
 955         ASSERT3U(offset + size, <=, vd->vdev_psize);
 956
 957         zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
 958             ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
 959             NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
 960
 961         zio->io_prop.zp_checksum = checksum;
 962
 963         if (zio_checksum_table[checksum].ci_eck) {
 964                 /*
 965                  * zec checksums are necessarily destructive -- they modify
 966                  * the end of the write buffer to hold the verifier/checksum.
 967                  * Therefore, we must make a local copy in case the data is
 968                  * being written to multiple places in parallel.
 969                  */
 970                 void *wbuf = zio_buf_alloc(size);
 971                 bcopy(data, wbuf, size);
 972                 zio_push_transform(zio, wbuf, size, size, NULL);
 973         }
 974
 975         return (zio);
 976 }
 977
 978 /*
 979  * Create a child I/O to do some work for us.
 980  */
 981 zio_t *
 982 zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
 983         void *data, uint64_t size, int type, zio_priority_t priority,
 984         enum zio_flag flags, zio_done_func_t *done, void *private)
 985 {
 986         enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
 987         zio_t *zio;
 988
 989         ASSERT(vd->vdev_parent ==
 990             (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));
 991
 992         if (type == ZIO_TYPE_READ && bp != NULL) {
 993                 /*
 994                  * If we have the bp, then the child should perform the
 995                  * checksum and the parent need not.  This pushes error
 996                  * detection as close to the leaves as possible and
 997                  * eliminates redundant checksums in the interior nodes.
 998                  */
 999                 pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
1000                 pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
1001         }
1002
1003         if (vd->vdev_children == 0)
1004                 offset += VDEV_LABEL_START_SIZE;
1005
1006         flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;
1007
1008         /*
1009          * If we've decided to do a repair, the write is not speculative --
1010          * even if the original read was.
1011          */
1012         if (flags & ZIO_FLAG_IO_REPAIR)
1013                 flags &= ~ZIO_FLAG_SPECULATIVE;
1014
1015         zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
1016             done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
1017             ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
1018
1019         zio->io_physdone = pio->io_physdone;
1020         if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
1021                 zio->io_logical->io_phys_children++;
1022
1023         return (zio);
1024 }
1025
1026 zio_t *
1027 zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
1028         int type, zio_priority_t priority, enum zio_flag flags,
1029         zio_done_func_t *done, void *private)
1030 {
1031         zio_t *zio;
1032
1033         ASSERT(vd->vdev_ops->vdev_op_leaf);
1034
1035         zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
1036             data, size, done, private, type, priority,
1037             flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
1038             vd, offset, NULL,
1039             ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
1040
1041         return (zio);
1042 }
1043
1044 void
1045 zio_flush(zio_t *zio, vdev_t *vd)
1046 {
1047         zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
1048             NULL, NULL,
1049             ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
1050 }
1051
1052 void
1053 zio_shrink(zio_t *zio, uint64_t size)
1054 {
1055         ASSERT(zio->io_executor == NULL);
1056         ASSERT(zio->io_orig_size == zio->io_size);
1057         ASSERT(size <= zio->io_size);
1058
1059         /*
1060          * We don't shrink for raidz because of problems with the
1061          * reconstruction when reading back less than the block size.
1062          * Note, BP_IS_RAIDZ() assumes no compression.
1063          */
1064         ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
1065         if (!BP_IS_RAIDZ(zio->io_bp))
1066                 zio->io_orig_size = zio->io_size = size;
1067 }
1068
1069 /*
1070  * ==========================================================================
1071  * Prepare to read and write logical blocks
1072  * ==========================================================================
1073  */
1074
1075 static int
1076 zio_read_bp_init(zio_t *zio)
1077 {
1078         blkptr_t *bp = zio->io_bp;
1079
1080         if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
1081             zio->io_child_type == ZIO_CHILD_LOGICAL &&
1082             !(zio->io_flags & ZIO_FLAG_RAW)) {
1083                 uint64_t psize =
1084                     BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
1085                 void *cbuf = zio_buf_alloc(psize);
1086
1087                 zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
1088         }
1089
1090         if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
1091                 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1092                 decode_embedded_bp_compressed(bp, zio->io_data);
1093         } else {
1094                 ASSERT(!BP_IS_EMBEDDED(bp));
1095         }
1096
1097         if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
1098                 zio->io_flags |= ZIO_FLAG_DONT_CACHE;
1099
1100         if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
1101                 zio->io_flags |= ZIO_FLAG_DONT_CACHE;
1102
1103         if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
1104                 zio->io_pipeline = ZIO_DDT_READ_PIPELINE;
1105
1106         return (ZIO_PIPELINE_CONTINUE);
1107 }
1108
1109 static int
1110 zio_write_bp_init(zio_t *zio)
1111 {
1112         spa_t *spa = zio->io_spa;
1113         zio_prop_t *zp = &zio->io_prop;
1114         enum zio_compress compress = zp->zp_compress;
1115         blkptr_t *bp = zio->io_bp;
1116         uint64_t lsize = zio->io_size;
1117         uint64_t psize = lsize;
1118         int pass = 1;
1119
1120         /*
1121          * If our children haven't all reached the ready stage,
1122          * wait for them and then repeat this pipeline stage.
1123          */
1124         if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
1125             zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
1126                 return (ZIO_PIPELINE_STOP);
1127
1128         if (!IO_IS_ALLOCATING(zio))
1129                 return (ZIO_PIPELINE_CONTINUE);
1130
1131         ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
1132
1133         if (zio->io_bp_override) {
1134                 ASSERT(bp->blk_birth != zio->io_txg);
1135                 ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
1136
1137                 *bp = *zio->io_bp_override;
1138                 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1139
1140                 if (BP_IS_EMBEDDED(bp))
1141                         return (ZIO_PIPELINE_CONTINUE);
1142
1143                 /*
1144                  * If we've been overridden and nopwrite is set then
1145                  * set the flag accordingly to indicate that a nopwrite
1146                  * has already occurred.
1147                  */
1148                 if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
1149                         ASSERT(!zp->zp_dedup);
1150                         zio->io_flags |= ZIO_FLAG_NOPWRITE;
1151                         return (ZIO_PIPELINE_CONTINUE);
1152                 }
1153
1154                 ASSERT(!zp->zp_nopwrite);
1155
1156                 if (BP_IS_HOLE(bp) || !zp->zp_dedup)
1157                         return (ZIO_PIPELINE_CONTINUE);
1158
1159                 ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup ||
1160                     zp->zp_dedup_verify);
1161
1162                 if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
1163                         BP_SET_DEDUP(bp, 1);
1164                         zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
1165                         return (ZIO_PIPELINE_CONTINUE);
1166                 }
1167                 zio->io_bp_override = NULL;
1168                 BP_ZERO(bp);
1169         }
1170
1171         if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
1172                 /*
1173                  * We're rewriting an existing block, which means we're
1174                  * working on behalf of spa_sync().  For spa_sync() to
1175                  * converge, it must eventually be the case that we don't
1176                  * have to allocate new blocks.  But compression changes
1177                  * the blocksize, which forces a reallocate, and makes
1178                  * convergence take longer.  Therefore, after the first
1179                  * few passes, stop compressing to ensure convergence.
1180                  */
1181                 pass = spa_sync_pass(spa);
1182
1183                 ASSERT(zio->io_txg == spa_syncing_txg(spa));
1184                 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1185                 ASSERT(!BP_GET_DEDUP(bp));
1186
1187                 if (pass >= zfs_sync_pass_dont_compress)
1188                         compress = ZIO_COMPRESS_OFF;
1189
1190                 /* Make sure someone doesn't change their mind on overwrites */
1191                 ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp),
1192                     spa_max_replication(spa)) == BP_GET_NDVAS(bp));
1193         }
1194
1195         if (compress != ZIO_COMPRESS_OFF) {
1196                 void *cbuf = zio_buf_alloc(lsize);
1197                 psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
1198                 if (psize == 0 || psize == lsize) {
1199                         compress = ZIO_COMPRESS_OFF;
1200                         zio_buf_free(cbuf, lsize);
1201                 } else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE &&
1202                     zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
1203                     spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
1204                         encode_embedded_bp_compressed(bp,
1205                             cbuf, compress, lsize, psize);
1206                         BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
1207                         BP_SET_TYPE(bp, zio->io_prop.zp_type);
1208                         BP_SET_LEVEL(bp, zio->io_prop.zp_level);
1209                         zio_buf_free(cbuf, lsize);
1210                         bp->blk_birth = zio->io_txg;
1211                         zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1212                         ASSERT(spa_feature_is_active(spa,
1213                             SPA_FEATURE_EMBEDDED_DATA));
1214                         return (ZIO_PIPELINE_CONTINUE);
1215                 } else {
1216                         /*
1217                          * Round up compressed size up to the ashift
1218                          * of the smallest-ashift device, and zero the tail.
1219                          * This ensures that the compressed size of the BP
1220                          * (and thus compressratio property) are correct,
1221                          * in that we charge for the padding used to fill out
1222                          * the last sector.
1223                          */
1224                         size_t rounded;
1225
1226                         ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
1227
1228                         rounded = (size_t)P2ROUNDUP(psize,
1229                             1ULL << spa->spa_min_ashift);
1230                         if (rounded >= lsize) {
1231                                 compress = ZIO_COMPRESS_OFF;
1232                                 zio_buf_free(cbuf, lsize);
1233                                 psize = lsize;
1234                         } else {
1235                                 bzero((char *)cbuf + psize, rounded - psize);
1236                                 psize = rounded;
1237                                 zio_push_transform(zio, cbuf,
1238                                     psize, lsize, NULL);
1239                         }
1240                 }
1241         }
1242
1243         /*
1244          * The final pass of spa_sync() must be all rewrites, but the first
1245          * few passes offer a trade-off: allocating blocks defers convergence,
1246          * but newly allocated blocks are sequential, so they can be written
1247          * to disk faster.  Therefore, we allow the first few passes of
1248          * spa_sync() to allocate new blocks, but force rewrites after that.
1249          * There should only be a handful of blocks after pass 1 in any case.
1250          */
1251         if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
1252             BP_GET_PSIZE(bp) == psize &&
1253             pass >= zfs_sync_pass_rewrite) {
1254                 enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
1255                 ASSERT(psize != 0);
1256                 zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
1257                 zio->io_flags |= ZIO_FLAG_IO_REWRITE;
1258         } else {
1259                 BP_ZERO(bp);
1260                 zio->io_pipeline = ZIO_WRITE_PIPELINE;
1261         }
1262
1263         if (psize == 0) {
1264                 if (zio->io_bp_orig.blk_birth != 0 &&
1265                     spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
1266                         BP_SET_LSIZE(bp, lsize);
1267                         BP_SET_TYPE(bp, zp->zp_type);
1268                         BP_SET_LEVEL(bp, zp->zp_level);
1269                         BP_SET_BIRTH(bp, zio->io_txg, 0);
1270                 }
1271                 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1272         } else {
1273                 ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
1274                 BP_SET_LSIZE(bp, lsize);
1275                 BP_SET_TYPE(bp, zp->zp_type);
1276                 BP_SET_LEVEL(bp, zp->zp_level);
1277                 BP_SET_PSIZE(bp, psize);
1278                 BP_SET_COMPRESS(bp, compress);
1279                 BP_SET_CHECKSUM(bp, zp->zp_checksum);
1280                 BP_SET_DEDUP(bp, zp->zp_dedup);
1281                 BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
1282                 if (zp->zp_dedup) {
1283                         ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1284                         ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
1285                         zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
1286                 }
1287                 if (zp->zp_nopwrite) {
1288                         ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1289                         ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
1290                         zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
1291                 }
1292         }
1293
1294         return (ZIO_PIPELINE_CONTINUE);
1295 }
1296
1297 static int
1298 zio_free_bp_init(zio_t *zio)
1299 {
1300         blkptr_t *bp = zio->io_bp;
1301
1302         if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
1303                 if (BP_GET_DEDUP(bp))
1304                         zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
1305         }
1306
1307         return (ZIO_PIPELINE_CONTINUE);
1308 }
1309
1310 /*
1311  * ==========================================================================
1312  * Execute the I/O pipeline
1313  * ==========================================================================
1314  */
1315
1316 static void
1317 zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
1318 {
1319         spa_t *spa = zio->io_spa;
1320         zio_type_t t = zio->io_type;
1321         int flags = (cutinline ? TQ_FRONT : 0);
1322
1323         /*
1324          * If we're a config writer or a probe, the normal issue and
1325          * interrupt threads may all be blocked waiting for the config lock.
1326          * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
1327          */
1328         if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
1329                 t = ZIO_TYPE_NULL;
1330
1331         /*
1332          * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
1333          */
1334         if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
1335                 t = ZIO_TYPE_NULL;
1336
1337         /*
1338          * If this is a high priority I/O, then use the high priority taskq if
1339          * available.
1340          */
1341         if (zio->io_priority == ZIO_PRIORITY_NOW &&
1342             spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
1343                 q++;
1344
1345         ASSERT3U(q, <, ZIO_TASKQ_TYPES);
1346
1347         /*
1348          * NB: We are assuming that the zio can only be dispatched
1349          * to a single taskq at a time.  It would be a grievous error
1350          * to dispatch the zio to another taskq at the same time.
1351          */
1352         ASSERT(taskq_empty_ent(&zio->io_tqent));
1353         spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
1354             flags, &zio->io_tqent);
1355 }
1356
1357 static boolean_t
1358 zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
1359 {
1360         kthread_t *executor = zio->io_executor;
1361         spa_t *spa = zio->io_spa;
1362         zio_type_t t;
1363
1364         for (t = 0; t < ZIO_TYPES; t++) {
1365                 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
1366                 uint_t i;
1367                 for (i = 0; i < tqs->stqs_count; i++) {
1368                         if (taskq_member(tqs->stqs_taskq[i], executor))
1369                                 return (B_TRUE);
1370                 }
1371         }
1372
1373         return (B_FALSE);
1374 }
1375
1376 static int
1377 zio_issue_async(zio_t *zio)
1378 {
1379         zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
1380
1381         return (ZIO_PIPELINE_STOP);
1382 }
1383
1384 void
1385 zio_interrupt(zio_t *zio)
1386 {
1387         zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
1388 }
1389
1390 /*
1391  * Execute the I/O pipeline until one of the following occurs:
1392  * (1) the I/O completes; (2) the pipeline stalls waiting for
1393  * dependent child I/Os; (3) the I/O issues, so we're waiting
1394  * for an I/O completion interrupt; (4) the I/O is delegated by
1395  * vdev-level caching or aggregation; (5) the I/O is deferred
1396  * due to vdev-level queueing; (6) the I/O is handed off to
1397  * another thread.  In all cases, the pipeline stops whenever
1398  * there's no CPU work; it never burns a thread in cv_wait_io().
1399  *
1400  * There's no locking on io_stage because there's no legitimate way
1401  * for multiple threads to be attempting to process the same I/O.
1402  */
1403 static zio_pipe_stage_t *zio_pipeline[];
1404
1405 /*
1406  * zio_execute() is a wrapper around the static function
1407  * __zio_execute() so that we can force  __zio_execute() to be
1408  * inlined.  This reduces stack overhead which is important
1409  * because __zio_execute() is called recursively in several zio
1410  * code paths.  zio_execute() itself cannot be inlined because
1411  * it is externally visible.
1412  */
1413 void
1414 zio_execute(zio_t *zio)
1415 {
1416         fstrans_cookie_t cookie;
1417
1418         cookie = spl_fstrans_mark();
1419         __zio_execute(zio);
1420         spl_fstrans_unmark(cookie);
1421 }
1422
1423 /*
1424  * Used to determine if in the current context the stack is sized large
1425  * enough to allow zio_execute() to be called recursively.  A minimum
1426  * stack size of 16K is required to avoid needing to re-dispatch the zio.
1427  */
1428 boolean_t
1429 zio_execute_stack_check(zio_t *zio)
1430 {
1431 #if !defined(HAVE_LARGE_STACKS)
1432         dsl_pool_t *dp = spa_get_dsl(zio->io_spa);
1433
1434         /* Executing in txg_sync_thread() context. */
1435         if (dp && curthread == dp->dp_tx.tx_sync_thread)
1436                 return (B_TRUE);
1437
1438         /* Pool initialization outside of zio_taskq context. */
1439         if (dp && spa_is_initializing(dp->dp_spa) &&
1440             !zio_taskq_member(zio, ZIO_TASKQ_ISSUE) &&
1441             !zio_taskq_member(zio, ZIO_TASKQ_ISSUE_HIGH))
1442                 return (B_TRUE);
1443 #endif /* HAVE_LARGE_STACKS */
1444
1445         return (B_FALSE);
1446 }
1447
1448 __attribute__((always_inline))
1449 static inline void
1450 __zio_execute(zio_t *zio)
1451 {
1452         zio->io_executor = curthread;
1453
1454         while (zio->io_stage < ZIO_STAGE_DONE) {
1455                 enum zio_stage pipeline = zio->io_pipeline;
1456                 enum zio_stage stage = zio->io_stage;
1457                 int rv;
1458
1459                 ASSERT(!MUTEX_HELD(&zio->io_lock));
1460                 ASSERT(ISP2(stage));
1461                 ASSERT(zio->io_stall == NULL);
1462
1463                 do {
1464                         stage <<= 1;
1465                 } while ((stage & pipeline) == 0);
1466
1467                 ASSERT(stage <= ZIO_STAGE_DONE);
1468
1469                 /*
1470                  * If we are in interrupt context and this pipeline stage
1471                  * will grab a config lock that is held across I/O,
1472                  * or may wait for an I/O that needs an interrupt thread
1473                  * to complete, issue async to avoid deadlock.
1474                  *
1475                  * For VDEV_IO_START, we cut in line so that the io will
1476                  * be sent to disk promptly.
1477                  */
1478                 if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
1479                     zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
1480                         boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
1481                             zio_requeue_io_start_cut_in_line : B_FALSE;
1482                         zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
1483                         return;
1484                 }
1485
1486                 /*
1487                  * If the current context doesn't have large enough stacks
1488                  * the zio must be issued asynchronously to prevent overflow.
1489                  */
1490                 if (zio_execute_stack_check(zio)) {
1491                         boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
1492                             zio_requeue_io_start_cut_in_line : B_FALSE;
1493                         zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
1494                         return;
1495                 }
1496
1497                 zio->io_stage = stage;
1498                 rv = zio_pipeline[highbit64(stage) - 1](zio);
1499
1500                 if (rv == ZIO_PIPELINE_STOP)
1501                         return;
1502
1503                 ASSERT(rv == ZIO_PIPELINE_CONTINUE);
1504         }
1505 }
1506
1507
1508 /*
1509  * ==========================================================================
1510  * Initiate I/O, either sync or async
1511  * ==========================================================================
1512  */
1513 int
1514 zio_wait(zio_t *zio)
1515 {
1516         int error;
1517
1518         ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
1519         ASSERT(zio->io_executor == NULL);
1520
1521         zio->io_waiter = curthread;
1522
1523         __zio_execute(zio);
1524
1525         mutex_enter(&zio->io_lock);
1526         while (zio->io_executor != NULL)
1527                 cv_wait_io(&zio->io_cv, &zio->io_lock);
1528         mutex_exit(&zio->io_lock);
1529
1530         error = zio->io_error;
1531         zio_destroy(zio);
1532
1533         return (error);
1534 }
1535
1536 void
1537 zio_nowait(zio_t *zio)
1538 {
1539         ASSERT(zio->io_executor == NULL);
1540
1541         if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
1542             zio_unique_parent(zio) == NULL) {
1543                 zio_t *pio;
1544
1545                 /*
1546                  * This is a logical async I/O with no parent to wait for it.
1547                  * We add it to the spa_async_root_zio "Godfather" I/O which
1548                  * will ensure they complete prior to unloading the pool.
1549                  */
1550                 spa_t *spa = zio->io_spa;
1551                 kpreempt_disable();
1552                 pio = spa->spa_async_zio_root[CPU_SEQID];
1553                 kpreempt_enable();
1554
1555                 zio_add_child(pio, zio);
1556         }
1557
1558         __zio_execute(zio);
1559 }
1560
1561 /*
1562  * ==========================================================================
1563  * Reexecute or suspend/resume failed I/O
1564  * ==========================================================================
1565  */
1566
1567 static void
1568 zio_reexecute(zio_t *pio)
1569 {
1570         zio_t *cio, *cio_next;
1571         int c, w;
1572
1573         ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
1574         ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
1575         ASSERT(pio->io_gang_leader == NULL);
1576         ASSERT(pio->io_gang_tree == NULL);
1577
1578         pio->io_flags = pio->io_orig_flags;
1579         pio->io_stage = pio->io_orig_stage;
1580         pio->io_pipeline = pio->io_orig_pipeline;
1581         pio->io_reexecute = 0;
1582         pio->io_flags |= ZIO_FLAG_REEXECUTED;
1583         pio->io_error = 0;
1584         for (w = 0; w < ZIO_WAIT_TYPES; w++)
1585                 pio->io_state[w] = 0;
1586         for (c = 0; c < ZIO_CHILD_TYPES; c++)
1587                 pio->io_child_error[c] = 0;
1588
1589         if (IO_IS_ALLOCATING(pio))
1590                 BP_ZERO(pio->io_bp);
1591
1592         /*
1593          * As we reexecute pio's children, new children could be created.
1594          * New children go to the head of pio's io_child_list, however,
1595          * so we will (correctly) not reexecute them.  The key is that
1596          * the remainder of pio's io_child_list, from 'cio_next' onward,
1597          * cannot be affected by any side effects of reexecuting 'cio'.
1598          */
1599         for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
1600                 cio_next = zio_walk_children(pio);
1601                 mutex_enter(&pio->io_lock);
1602                 for (w = 0; w < ZIO_WAIT_TYPES; w++)
1603                         pio->io_children[cio->io_child_type][w]++;
1604                 mutex_exit(&pio->io_lock);
1605                 zio_reexecute(cio);
1606         }
1607
1608         /*
1609          * Now that all children have been reexecuted, execute the parent.
1610          * We don't reexecute "The Godfather" I/O here as it's the
1611          * responsibility of the caller to wait on him.
1612          */
1613         if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
1614                 __zio_execute(pio);
1615 }
1616
1617 void
1618 zio_suspend(spa_t *spa, zio_t *zio)
1619 {
1620         if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
1621                 fm_panic("Pool '%s' has encountered an uncorrectable I/O "
1622                     "failure and the failure mode property for this pool "
1623                     "is set to panic.", spa_name(spa));
1624
1625         cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O "
1626             "failure and has been suspended.\n", spa_name(spa));
1627
1628         zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);
1629
1630         mutex_enter(&spa->spa_suspend_lock);
1631
1632         if (spa->spa_suspend_zio_root == NULL)
1633                 spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
1634                     ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
1635                     ZIO_FLAG_GODFATHER);
1636
1637         spa->spa_suspended = B_TRUE;
1638
1639         if (zio != NULL) {
1640                 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
1641                 ASSERT(zio != spa->spa_suspend_zio_root);
1642                 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1643                 ASSERT(zio_unique_parent(zio) == NULL);
1644                 ASSERT(zio->io_stage == ZIO_STAGE_DONE);
1645                 zio_add_child(spa->spa_suspend_zio_root, zio);
1646         }
1647
1648         mutex_exit(&spa->spa_suspend_lock);
1649 }
1650
1651 int
1652 zio_resume(spa_t *spa)
1653 {
1654         zio_t *pio;
1655
1656         /*
1657          * Reexecute all previously suspended i/o.
1658          */
1659         mutex_enter(&spa->spa_suspend_lock);
1660         spa->spa_suspended = B_FALSE;
1661         cv_broadcast(&spa->spa_suspend_cv);
1662         pio = spa->spa_suspend_zio_root;
1663         spa->spa_suspend_zio_root = NULL;
1664         mutex_exit(&spa->spa_suspend_lock);
1665
1666         if (pio == NULL)
1667                 return (0);
1668
1669         zio_reexecute(pio);
1670         return (zio_wait(pio));
1671 }
1672
1673 void
1674 zio_resume_wait(spa_t *spa)
1675 {
1676         mutex_enter(&spa->spa_suspend_lock);
1677         while (spa_suspended(spa))
1678                 cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
1679         mutex_exit(&spa->spa_suspend_lock);
1680 }
1681
1682 /*
1683  * ==========================================================================
1684  * Gang blocks.
1685  *
1686  * A gang block is a collection of small blocks that looks to the DMU
1687  * like one large block.  When zio_dva_allocate() cannot find a block
1688  * of the requested size, due to either severe fragmentation or the pool
1689  * being nearly full, it calls zio_write_gang_block() to construct the
1690  * block from smaller fragments.
1691  *
1692  * A gang block consists of a gang header (zio_gbh_phys_t) and up to
1693  * three (SPA_GBH_NBLKPTRS) gang members.  The gang header is just like
1694  * an indirect block: it's an array of block pointers.  It consumes
1695  * only one sector and hence is allocatable regardless of fragmentation.
1696  * The gang header's bps point to its gang members, which hold the data.
1697  *
1698  * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
1699  * as the verifier to ensure uniqueness of the SHA256 checksum.
1700  * Critically, the gang block bp's blk_cksum is the checksum of the data,
1701  * not the gang header.  This ensures that data block signatures (needed for
1702  * deduplication) are independent of how the block is physically stored.
1703  *
1704  * Gang blocks can be nested: a gang member may itself be a gang block.
1705  * Thus every gang block is a tree in which root and all interior nodes are
1706  * gang headers, and the leaves are normal blocks that contain user data.
1707  * The root of the gang tree is called the gang leader.
1708  *
1709  * To perform any operation (read, rewrite, free, claim) on a gang block,
1710  * zio_gang_assemble() first assembles the gang tree (minus data leaves)
1711  * in the io_gang_tree field of the original logical i/o by recursively
1712  * reading the gang leader and all gang headers below it.  This yields
1713  * an in-core tree containing the contents of every gang header and the
1714  * bps for every constituent of the gang block.
1715  *
1716  * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
1717  * and invokes a callback on each bp.  To free a gang block, zio_gang_issue()
1718  * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
1719  * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
1720  * zio_read_gang() is a wrapper around zio_read() that omits reading gang
1721  * headers, since we already have those in io_gang_tree.  zio_rewrite_gang()
1722  * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
1723  * of the gang header plus zio_checksum_compute() of the data to update the
1724  * gang header's blk_cksum as described above.
1725  *
1726  * The two-phase assemble/issue model solves the problem of partial failure --
1727  * what if you'd freed part of a gang block but then couldn't read the
1728  * gang header for another part?  Assembling the entire gang tree first
1729  * ensures that all the necessary gang header I/O has succeeded before
1730  * starting the actual work of free, claim, or write.  Once the gang tree
1731  * is assembled, free and claim are in-memory operations that cannot fail.
1732  *
1733  * In the event that a gang write fails, zio_dva_unallocate() walks the
1734  * gang tree to immediately free (i.e. insert back into the space map)
1735  * everything we've allocated.  This ensures that we don't get ENOSPC
1736  * errors during repeated suspend/resume cycles due to a flaky device.
1737  *
1738  * Gang rewrites only happen during sync-to-convergence.  If we can't assemble
1739  * the gang tree, we won't modify the block, so we can safely defer the free
1740  * (knowing that the block is still intact).  If we *can* assemble the gang
1741  * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
1742  * each constituent bp and we can allocate a new block on the next sync pass.
1743  *
1744  * In all cases, the gang tree allows complete recovery from partial failure.
1745  * ==========================================================================
1746  */
1747
1748 static zio_t *
1749 zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1750 {
1751         if (gn != NULL)
1752                 return (pio);
1753
1754         return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
1755             NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
1756             &pio->io_bookmark));
1757 }
1758
1759 zio_t *
1760 zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1761 {
1762         zio_t *zio;
1763
1764         if (gn != NULL) {
1765                 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
1766                     gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
1767                     ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1768                 /*
1769                  * As we rewrite each gang header, the pipeline will compute
1770                  * a new gang block header checksum for it; but no one will
1771                  * compute a new data checksum, so we do that here.  The one
1772                  * exception is the gang leader: the pipeline already computed
1773                  * its data checksum because that stage precedes gang assembly.
1774                  * (Presently, nothing actually uses interior data checksums;
1775                  * this is just good hygiene.)
1776                  */
1777                 if (gn != pio->io_gang_leader->io_gang_tree) {
1778                         zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
1779                             data, BP_GET_PSIZE(bp));
1780                 }
1781                 /*
1782                  * If we are here to damage data for testing purposes,
1783                  * leave the GBH alone so that we can detect the damage.
1784                  */
1785                 if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
1786                         zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
1787         } else {
1788                 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
1789                     data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
1790                     ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1791         }
1792
1793         return (zio);
1794 }
1795
1796 /* ARGSUSED */
1797 zio_t *
1798 zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1799 {
1800         return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
1801             ZIO_GANG_CHILD_FLAGS(pio)));
1802 }
1803
1804 /* ARGSUSED */
1805 zio_t *
1806 zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1807 {
1808         return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
1809             NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
1810 }
1811
1812 static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
1813         NULL,
1814         zio_read_gang,
1815         zio_rewrite_gang,
1816         zio_free_gang,
1817         zio_claim_gang,
1818         NULL
1819 };
1820
1821 static void zio_gang_tree_assemble_done(zio_t *zio);
1822
1823 static zio_gang_node_t *
1824 zio_gang_node_alloc(zio_gang_node_t **gnpp)
1825 {
1826         zio_gang_node_t *gn;
1827
1828         ASSERT(*gnpp == NULL);
1829
1830         gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
1831         gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
1832         *gnpp = gn;
1833
1834         return (gn);
1835 }
1836
1837 static void
1838 zio_gang_node_free(zio_gang_node_t **gnpp)
1839 {
1840         zio_gang_node_t *gn = *gnpp;
1841         int g;
1842
1843         for (g = 0; g < SPA_GBH_NBLKPTRS; g++)
1844                 ASSERT(gn->gn_child[g] == NULL);
1845
1846         zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
1847         kmem_free(gn, sizeof (*gn));
1848         *gnpp = NULL;
1849 }
1850
1851 static void
1852 zio_gang_tree_free(zio_gang_node_t **gnpp)
1853 {
1854         zio_gang_node_t *gn = *gnpp;
1855         int g;
1856
1857         if (gn == NULL)
1858                 return;
1859
1860         for (g = 0; g < SPA_GBH_NBLKPTRS; g++)
1861                 zio_gang_tree_free(&gn->gn_child[g]);
1862
1863         zio_gang_node_free(gnpp);
1864 }
1865
1866 static void
1867 zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
1868 {
1869         zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
1870
1871         ASSERT(gio->io_gang_leader == gio);
1872         ASSERT(BP_IS_GANG(bp));
1873
1874         zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh,
1875             SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
1876             gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
1877 }
1878
1879 static void
1880 zio_gang_tree_assemble_done(zio_t *zio)
1881 {
1882         zio_t *gio = zio->io_gang_leader;
1883         zio_gang_node_t *gn = zio->io_private;
1884         blkptr_t *bp = zio->io_bp;
1885         int g;
1886
1887         ASSERT(gio == zio_unique_parent(zio));
1888         ASSERT(zio->io_child_count == 0);
1889
1890         if (zio->io_error)
1891                 return;
1892
1893         if (BP_SHOULD_BYTESWAP(bp))
1894                 byteswap_uint64_array(zio->io_data, zio->io_size);
1895
1896         ASSERT(zio->io_data == gn->gn_gbh);
1897         ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
1898         ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
1899
1900         for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
1901                 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
1902                 if (!BP_IS_GANG(gbp))
1903                         continue;
1904                 zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
1905         }
1906 }
1907
1908 static void
1909 zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
1910 {
1911         zio_t *gio = pio->io_gang_leader;
1912         zio_t *zio;
1913         int g;
1914
1915         ASSERT(BP_IS_GANG(bp) == !!gn);
1916         ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
1917         ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);
1918
1919         /*
1920          * If you're a gang header, your data is in gn->gn_gbh.
1921          * If you're a gang member, your data is in 'data' and gn == NULL.
1922          */
1923         zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);
1924
1925         if (gn != NULL) {
1926                 ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
1927
1928                 for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
1929                         blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
1930                         if (BP_IS_HOLE(gbp))
1931                                 continue;
1932                         zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
1933                         data = (char *)data + BP_GET_PSIZE(gbp);
1934                 }
1935         }
1936
1937         if (gn == gio->io_gang_tree)
1938                 ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);
1939
1940         if (zio != pio)
1941                 zio_nowait(zio);
1942 }
1943
1944 static int
1945 zio_gang_assemble(zio_t *zio)
1946 {
1947         blkptr_t *bp = zio->io_bp;
1948
1949         ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
1950         ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
1951
1952         zio->io_gang_leader = zio;
1953
1954         zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);
1955
1956         return (ZIO_PIPELINE_CONTINUE);
1957 }
1958
1959 static int
1960 zio_gang_issue(zio_t *zio)
1961 {
1962         blkptr_t *bp = zio->io_bp;
1963
1964         if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
1965                 return (ZIO_PIPELINE_STOP);
1966
1967         ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
1968         ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
1969
1970         if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
1971                 zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data);
1972         else
1973                 zio_gang_tree_free(&zio->io_gang_tree);
1974
1975         zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1976
1977         return (ZIO_PIPELINE_CONTINUE);
1978 }
1979
1980 static void
1981 zio_write_gang_member_ready(zio_t *zio)
1982 {
1983         zio_t *pio = zio_unique_parent(zio);
1984         dva_t *cdva = zio->io_bp->blk_dva;
1985         dva_t *pdva = pio->io_bp->blk_dva;
1986         uint64_t asize;
1987         int d;
1988         ASSERTV(zio_t *gio = zio->io_gang_leader);
1989
1990         if (BP_IS_HOLE(zio->io_bp))
1991                 return;
1992
1993         ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
1994
1995         ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
1996         ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
1997         ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
1998         ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
1999         ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
2000
2001         mutex_enter(&pio->io_lock);
2002         for (d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
2003                 ASSERT(DVA_GET_GANG(&pdva[d]));
2004                 asize = DVA_GET_ASIZE(&pdva[d]);
2005                 asize += DVA_GET_ASIZE(&cdva[d]);
2006                 DVA_SET_ASIZE(&pdva[d], asize);
2007         }
2008         mutex_exit(&pio->io_lock);
2009 }
2010
2011 static int
2012 zio_write_gang_block(zio_t *pio)
2013 {
2014         spa_t *spa = pio->io_spa;
2015         blkptr_t *bp = pio->io_bp;
2016         zio_t *gio = pio->io_gang_leader;
2017         zio_t *zio;
2018         zio_gang_node_t *gn, **gnpp;
2019         zio_gbh_phys_t *gbh;
2020         uint64_t txg = pio->io_txg;
2021         uint64_t resid = pio->io_size;
2022         uint64_t lsize;
2023         int copies = gio->io_prop.zp_copies;
2024         int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
2025         zio_prop_t zp;
2026         int g, error;
2027
2028         error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
2029             bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
2030             METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
2031         if (error) {
2032                 pio->io_error = error;
2033                 return (ZIO_PIPELINE_CONTINUE);
2034         }
2035
2036         if (pio == gio) {
2037                 gnpp = &gio->io_gang_tree;
2038         } else {
2039                 gnpp = pio->io_private;
2040                 ASSERT(pio->io_ready == zio_write_gang_member_ready);
2041         }
2042
2043         gn = zio_gang_node_alloc(gnpp);
2044         gbh = gn->gn_gbh;
2045         bzero(gbh, SPA_GANGBLOCKSIZE);
2046
2047         /*
2048          * Create the gang header.
2049          */
2050         zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
2051             pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
2052
2053         /*
2054          * Create and nowait the gang children.
2055          */
2056         for (g = 0; resid != 0; resid -= lsize, g++) {
2057                 lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
2058                     SPA_MINBLOCKSIZE);
2059                 ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);
2060
2061                 zp.zp_checksum = gio->io_prop.zp_checksum;
2062                 zp.zp_compress = ZIO_COMPRESS_OFF;
2063                 zp.zp_type = DMU_OT_NONE;
2064                 zp.zp_level = 0;
2065                 zp.zp_copies = gio->io_prop.zp_copies;
2066                 zp.zp_dedup = B_FALSE;
2067                 zp.zp_dedup_verify = B_FALSE;
2068                 zp.zp_nopwrite = B_FALSE;
2069
2070                 zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
2071                     (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
2072                     zio_write_gang_member_ready, NULL, NULL, &gn->gn_child[g],
2073                     pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
2074                     &pio->io_bookmark));
2075         }
2076
2077         /*
2078          * Set pio's pipeline to just wait for zio to finish.
2079          */
2080         pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2081
2082         /*
2083          * We didn't allocate this bp, so make sure it doesn't get unmarked.
2084          */
2085         pio->io_flags &= ~ZIO_FLAG_FASTWRITE;
2086
2087         zio_nowait(zio);
2088
2089         return (ZIO_PIPELINE_CONTINUE);
2090 }
2091
2092 /*
2093  * The zio_nop_write stage in the pipeline determines if allocating
2094  * a new bp is necessary.  By leveraging a cryptographically secure checksum,
2095  * such as SHA256, we can compare the checksums of the new data and the old
2096  * to determine if allocating a new block is required.  The nopwrite
2097  * feature can handle writes in either syncing or open context (i.e. zil
2098  * writes) and as a result is mutually exclusive with dedup.
2099  */
2100 static int
2101 zio_nop_write(zio_t *zio)
2102 {
2103         blkptr_t *bp = zio->io_bp;
2104         blkptr_t *bp_orig = &zio->io_bp_orig;
2105         zio_prop_t *zp = &zio->io_prop;
2106
2107         ASSERT(BP_GET_LEVEL(bp) == 0);
2108         ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
2109         ASSERT(zp->zp_nopwrite);
2110         ASSERT(!zp->zp_dedup);
2111         ASSERT(zio->io_bp_override == NULL);
2112         ASSERT(IO_IS_ALLOCATING(zio));
2113
2114         /*
2115          * Check to see if the original bp and the new bp have matching
2116          * characteristics (i.e. same checksum, compression algorithms, etc).
2117          * If they don't then just continue with the pipeline which will
2118          * allocate a new bp.
2119          */
2120         if (BP_IS_HOLE(bp_orig) ||
2121             !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup ||
2122             BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
2123             BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
2124             BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
2125             zp->zp_copies != BP_GET_NDVAS(bp_orig))
2126                 return (ZIO_PIPELINE_CONTINUE);
2127
2128         /*
2129          * If the checksums match then reset the pipeline so that we
2130          * avoid allocating a new bp and issuing any I/O.
2131          */
2132         if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
2133                 ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup);
2134                 ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
2135                 ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
2136                 ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
2137                 ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop,
2138                     sizeof (uint64_t)) == 0);
2139
2140                 *bp = *bp_orig;
2141                 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2142                 zio->io_flags |= ZIO_FLAG_NOPWRITE;
2143         }
2144
2145         return (ZIO_PIPELINE_CONTINUE);
2146 }
2147
2148 /*
2149  * ==========================================================================
2150  * Dedup
2151  * ==========================================================================
2152  */
2153 static void
2154 zio_ddt_child_read_done(zio_t *zio)
2155 {
2156         blkptr_t *bp = zio->io_bp;
2157         ddt_entry_t *dde = zio->io_private;
2158         ddt_phys_t *ddp;
2159         zio_t *pio = zio_unique_parent(zio);
2160
2161         mutex_enter(&pio->io_lock);
2162         ddp = ddt_phys_select(dde, bp);
2163         if (zio->io_error == 0)
2164                 ddt_phys_clear(ddp);    /* this ddp doesn't need repair */
2165         if (zio->io_error == 0 && dde->dde_repair_data == NULL)
2166                 dde->dde_repair_data = zio->io_data;
2167         else
2168                 zio_buf_free(zio->io_data, zio->io_size);
2169         mutex_exit(&pio->io_lock);
2170 }
2171
2172 static int
2173 zio_ddt_read_start(zio_t *zio)
2174 {
2175         blkptr_t *bp = zio->io_bp;
2176         int p;
2177
2178         ASSERT(BP_GET_DEDUP(bp));
2179         ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
2180         ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2181
2182         if (zio->io_child_error[ZIO_CHILD_DDT]) {
2183                 ddt_t *ddt = ddt_select(zio->io_spa, bp);
2184                 ddt_entry_t *dde = ddt_repair_start(ddt, bp);
2185                 ddt_phys_t *ddp = dde->dde_phys;
2186                 ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
2187                 blkptr_t blk;
2188
2189                 ASSERT(zio->io_vsd == NULL);
2190                 zio->io_vsd = dde;
2191
2192                 if (ddp_self == NULL)
2193                         return (ZIO_PIPELINE_CONTINUE);
2194
2195                 for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
2196                         if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
2197                                 continue;
2198                         ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
2199                             &blk);
2200                         zio_nowait(zio_read(zio, zio->io_spa, &blk,
2201                             zio_buf_alloc(zio->io_size), zio->io_size,
2202                             zio_ddt_child_read_done, dde, zio->io_priority,
2203                             ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE,
2204                             &zio->io_bookmark));
2205                 }
2206                 return (ZIO_PIPELINE_CONTINUE);
2207         }
2208
2209         zio_nowait(zio_read(zio, zio->io_spa, bp,
2210             zio->io_data, zio->io_size, NULL, NULL, zio->io_priority,
2211             ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
2212
2213         return (ZIO_PIPELINE_CONTINUE);
2214 }
2215
2216 static int
2217 zio_ddt_read_done(zio_t *zio)
2218 {
2219         blkptr_t *bp = zio->io_bp;
2220
2221         if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
2222                 return (ZIO_PIPELINE_STOP);
2223
2224         ASSERT(BP_GET_DEDUP(bp));
2225         ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
2226         ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2227
2228         if (zio->io_child_error[ZIO_CHILD_DDT]) {
2229                 ddt_t *ddt = ddt_select(zio->io_spa, bp);
2230                 ddt_entry_t *dde = zio->io_vsd;
2231                 if (ddt == NULL) {
2232                         ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
2233                         return (ZIO_PIPELINE_CONTINUE);
2234                 }
2235                 if (dde == NULL) {
2236                         zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
2237                         zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
2238                         return (ZIO_PIPELINE_STOP);
2239                 }
2240                 if (dde->dde_repair_data != NULL) {
2241                         bcopy(dde->dde_repair_data, zio->io_data, zio->io_size);
2242                         zio->io_child_error[ZIO_CHILD_DDT] = 0;
2243                 }
2244                 ddt_repair_done(ddt, dde);
2245                 zio->io_vsd = NULL;
2246         }
2247
2248         ASSERT(zio->io_vsd == NULL);
2249
2250         return (ZIO_PIPELINE_CONTINUE);
2251 }
2252
2253 static boolean_t
2254 zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
2255 {
2256         spa_t *spa = zio->io_spa;
2257         int p;
2258
2259         /*
2260          * Note: we compare the original data, not the transformed data,
2261          * because when zio->io_bp is an override bp, we will not have
2262          * pushed the I/O transforms.  That's an important optimization
2263          * because otherwise we'd compress/encrypt all dmu_sync() data twice.
2264          */
2265         for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
2266                 zio_t *lio = dde->dde_lead_zio[p];
2267
2268                 if (lio != NULL) {
2269                         return (lio->io_orig_size != zio->io_orig_size ||
2270                             bcmp(zio->io_orig_data, lio->io_orig_data,
2271                             zio->io_orig_size) != 0);
2272                 }
2273         }
2274
2275         for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
2276                 ddt_phys_t *ddp = &dde->dde_phys[p];
2277
2278                 if (ddp->ddp_phys_birth != 0) {
2279                         arc_buf_t *abuf = NULL;
2280                         arc_flags_t aflags = ARC_FLAG_WAIT;
2281                         blkptr_t blk = *zio->io_bp;
2282                         int error;
2283
2284                         ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
2285
2286                         ddt_exit(ddt);
2287
2288                         error = arc_read(NULL, spa, &blk,
2289                             arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
2290                             ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
2291                             &aflags, &zio->io_bookmark);
2292
2293                         if (error == 0) {
2294                                 if (arc_buf_size(abuf) != zio->io_orig_size ||
2295                                     bcmp(abuf->b_data, zio->io_orig_data,
2296                                     zio->io_orig_size) != 0)
2297                                         error = SET_ERROR(EEXIST);
2298                                 VERIFY(arc_buf_remove_ref(abuf, &abuf));
2299                         }
2300
2301                         ddt_enter(ddt);
2302                         return (error != 0);
2303                 }
2304         }
2305
2306         return (B_FALSE);
2307 }
2308
2309 static void
2310 zio_ddt_child_write_ready(zio_t *zio)
2311 {
2312         int p = zio->io_prop.zp_copies;
2313         ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
2314         ddt_entry_t *dde = zio->io_private;
2315         ddt_phys_t *ddp = &dde->dde_phys[p];
2316         zio_t *pio;
2317
2318         if (zio->io_error)
2319                 return;
2320
2321         ddt_enter(ddt);
2322
2323         ASSERT(dde->dde_lead_zio[p] == zio);
2324
2325         ddt_phys_fill(ddp, zio->io_bp);
2326
2327         while ((pio = zio_walk_parents(zio)) != NULL)
2328                 ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
2329
2330         ddt_exit(ddt);
2331 }
2332
2333 static void
2334 zio_ddt_child_write_done(zio_t *zio)
2335 {
2336         int p = zio->io_prop.zp_copies;
2337         ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
2338         ddt_entry_t *dde = zio->io_private;
2339         ddt_phys_t *ddp = &dde->dde_phys[p];
2340
2341         ddt_enter(ddt);
2342
2343         ASSERT(ddp->ddp_refcnt == 0);
2344         ASSERT(dde->dde_lead_zio[p] == zio);
2345         dde->dde_lead_zio[p] = NULL;
2346
2347         if (zio->io_error == 0) {
2348                 while (zio_walk_parents(zio) != NULL)
2349                         ddt_phys_addref(ddp);
2350         } else {
2351                 ddt_phys_clear(ddp);
2352         }
2353
2354         ddt_exit(ddt);
2355 }
2356
2357 static void
2358 zio_ddt_ditto_write_done(zio_t *zio)
2359 {
2360         int p = DDT_PHYS_DITTO;
2361         blkptr_t *bp = zio->io_bp;
2362         ddt_t *ddt = ddt_select(zio->io_spa, bp);
2363         ddt_entry_t *dde = zio->io_private;
2364         ddt_phys_t *ddp = &dde->dde_phys[p];
2365         ddt_key_t *ddk = &dde->dde_key;
2366         ASSERTV(zio_prop_t *zp = &zio->io_prop);
2367
2368         ddt_enter(ddt);
2369
2370         ASSERT(ddp->ddp_refcnt == 0);
2371         ASSERT(dde->dde_lead_zio[p] == zio);
2372         dde->dde_lead_zio[p] = NULL;
2373
2374         if (zio->io_error == 0) {
2375                 ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum));
2376                 ASSERT(zp->zp_copies < SPA_DVAS_PER_BP);
2377                 ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp));
2378                 if (ddp->ddp_phys_birth != 0)
2379                         ddt_phys_free(ddt, ddk, ddp, zio->io_txg);
2380                 ddt_phys_fill(ddp, bp);
2381         }
2382
2383         ddt_exit(ddt);
2384 }
2385
2386 static int
2387 zio_ddt_write(zio_t *zio)
2388 {
2389         spa_t *spa = zio->io_spa;
2390         blkptr_t *bp = zio->io_bp;
2391         uint64_t txg = zio->io_txg;
2392         zio_prop_t *zp = &zio->io_prop;
2393         int p = zp->zp_copies;
2394         int ditto_copies;
2395         zio_t *cio = NULL;
2396         zio_t *dio = NULL;
2397         ddt_t *ddt = ddt_select(spa, bp);
2398         ddt_entry_t *dde;
2399         ddt_phys_t *ddp;
2400
2401         ASSERT(BP_GET_DEDUP(bp));
2402         ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
2403         ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
2404
2405         ddt_enter(ddt);
2406         dde = ddt_lookup(ddt, bp, B_TRUE);
2407         ddp = &dde->dde_phys[p];
2408
2409         if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
2410                 /*
2411                  * If we're using a weak checksum, upgrade to a strong checksum
2412                  * and try again.  If we're already using a strong checksum,
2413                  * we can't resolve it, so just convert to an ordinary write.
2414                  * (And automatically e-mail a paper to Nature?)
2415                  */
2416                 if (!zio_checksum_table[zp->zp_checksum].ci_dedup) {
2417                         zp->zp_checksum = spa_dedup_checksum(spa);
2418                         zio_pop_transforms(zio);
2419                         zio->io_stage = ZIO_STAGE_OPEN;
2420                         BP_ZERO(bp);
2421                 } else {
2422                         zp->zp_dedup = B_FALSE;
2423                 }
2424                 zio->io_pipeline = ZIO_WRITE_PIPELINE;
2425                 ddt_exit(ddt);
2426                 return (ZIO_PIPELINE_CONTINUE);
2427         }
2428
2429         ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp);
2430         ASSERT(ditto_copies < SPA_DVAS_PER_BP);
2431
2432         if (ditto_copies > ddt_ditto_copies_present(dde) &&
2433             dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) {
2434                 zio_prop_t czp = *zp;
2435
2436                 czp.zp_copies = ditto_copies;
2437
2438                 /*
2439                  * If we arrived here with an override bp, we won't have run
2440                  * the transform stack, so we won't have the data we need to
2441                  * generate a child i/o.  So, toss the override bp and restart.
2442                  * This is safe, because using the override bp is just an
2443                  * optimization; and it's rare, so the cost doesn't matter.
2444                  */
2445                 if (zio->io_bp_override) {
2446                         zio_pop_transforms(zio);
2447                         zio->io_stage = ZIO_STAGE_OPEN;
2448                         zio->io_pipeline = ZIO_WRITE_PIPELINE;
2449                         zio->io_bp_override = NULL;
2450                         BP_ZERO(bp);
2451                         ddt_exit(ddt);
2452                         return (ZIO_PIPELINE_CONTINUE);
2453                 }
2454
2455                 dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
2456                     zio->io_orig_size, &czp, NULL, NULL,
2457                     zio_ddt_ditto_write_done, dde, zio->io_priority,
2458                     ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
2459
2460                 zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL);
2461                 dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
2462         }
2463
2464         if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
2465                 if (ddp->ddp_phys_birth != 0)
2466                         ddt_bp_fill(ddp, bp, txg);
2467                 if (dde->dde_lead_zio[p] != NULL)
2468                         zio_add_child(zio, dde->dde_lead_zio[p]);
2469                 else
2470                         ddt_phys_addref(ddp);
2471         } else if (zio->io_bp_override) {
2472                 ASSERT(bp->blk_birth == txg);
2473                 ASSERT(BP_EQUAL(bp, zio->io_bp_override));
2474                 ddt_phys_fill(ddp, bp);
2475                 ddt_phys_addref(ddp);
2476         } else {
2477                 cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
2478                     zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL,
2479                     zio_ddt_child_write_done, dde, zio->io_priority,
2480                     ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
2481
2482                 zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL);
2483                 dde->dde_lead_zio[p] = cio;
2484         }
2485
2486         ddt_exit(ddt);
2487
2488         if (cio)
2489                 zio_nowait(cio);
2490         if (dio)
2491                 zio_nowait(dio);
2492
2493         return (ZIO_PIPELINE_CONTINUE);
2494 }
2495
2496 ddt_entry_t *freedde; /* for debugging */
2497
2498 static int
2499 zio_ddt_free(zio_t *zio)
2500 {
2501         spa_t *spa = zio->io_spa;
2502         blkptr_t *bp = zio->io_bp;
2503         ddt_t *ddt = ddt_select(spa, bp);
2504         ddt_entry_t *dde;
2505         ddt_phys_t *ddp;
2506
2507         ASSERT(BP_GET_DEDUP(bp));
2508         ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2509
2510         ddt_enter(ddt);
2511         freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
2512         if (dde) {
2513                 ddp = ddt_phys_select(dde, bp);
2514                 if (ddp)
2515                         ddt_phys_decref(ddp);
2516         }
2517         ddt_exit(ddt);
2518
2519         return (ZIO_PIPELINE_CONTINUE);
2520 }
2521
2522 /*
2523  * ==========================================================================
2524  * Allocate and free blocks
2525  * ==========================================================================
2526  */
2527 static int
2528 zio_dva_allocate(zio_t *zio)
2529 {
2530         spa_t *spa = zio->io_spa;
2531         metaslab_class_t *mc = spa_normal_class(spa);
2532         blkptr_t *bp = zio->io_bp;
2533         int error;
2534         int flags = 0;
2535
2536         if (zio->io_gang_leader == NULL) {
2537                 ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
2538                 zio->io_gang_leader = zio;
2539         }
2540
2541         ASSERT(BP_IS_HOLE(bp));
2542         ASSERT0(BP_GET_NDVAS(bp));
2543         ASSERT3U(zio->io_prop.zp_copies, >, 0);
2544         ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
2545         ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
2546
2547         /*
2548          * The dump device does not support gang blocks so allocation on
2549          * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid
2550          * the "fast" gang feature.
2551          */
2552         flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0;
2553         flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ?
2554             METASLAB_GANG_CHILD : 0;
2555         flags |= (zio->io_flags & ZIO_FLAG_FASTWRITE) ? METASLAB_FASTWRITE : 0;
2556         error = metaslab_alloc(spa, mc, zio->io_size, bp,
2557             zio->io_prop.zp_copies, zio->io_txg, NULL, flags);
2558
2559         if (error) {
2560                 spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
2561                     "size %llu, error %d", spa_name(spa), zio, zio->io_size,
2562                     error);
2563                 if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
2564                         return (zio_write_gang_block(zio));
2565                 zio->io_error = error;
2566         }
2567
2568         return (ZIO_PIPELINE_CONTINUE);
2569 }
2570
2571 static int
2572 zio_dva_free(zio_t *zio)
2573 {
2574         metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);
2575
2576         return (ZIO_PIPELINE_CONTINUE);
2577 }
2578
2579 static int
2580 zio_dva_claim(zio_t *zio)
2581 {
2582         int error;
2583
2584         error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
2585         if (error)
2586                 zio->io_error = error;
2587
2588         return (ZIO_PIPELINE_CONTINUE);
2589 }
2590
2591 /*
2592  * Undo an allocation.  This is used by zio_done() when an I/O fails
2593  * and we want to give back the block we just allocated.
2594  * This handles both normal blocks and gang blocks.
2595  */
2596 static void
2597 zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
2598 {
2599         int g;
2600
2601         ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
2602         ASSERT(zio->io_bp_override == NULL);
2603
2604         if (!BP_IS_HOLE(bp))
2605                 metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);
2606
2607         if (gn != NULL) {
2608                 for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
2609                         zio_dva_unallocate(zio, gn->gn_child[g],
2610                             &gn->gn_gbh->zg_blkptr[g]);
2611                 }
2612         }
2613 }
2614
2615 /*
2616  * Try to allocate an intent log block.  Return 0 on success, errno on failure.
2617  */
2618 int
2619 zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, uint64_t size,
2620     boolean_t use_slog)
2621 {
2622         int error = 1;
2623
2624         ASSERT(txg > spa_syncing_txg(spa));
2625
2626         /*
2627          * ZIL blocks are always contiguous (i.e. not gang blocks) so we
2628          * set the METASLAB_GANG_AVOID flag so that they don't "fast gang"
2629          * when allocating them.
2630          */
2631         if (use_slog) {
2632                 error = metaslab_alloc(spa, spa_log_class(spa), size,
2633                     new_bp, 1, txg, NULL,
2634                     METASLAB_FASTWRITE | METASLAB_GANG_AVOID);
2635         }
2636
2637         if (error) {
2638                 error = metaslab_alloc(spa, spa_normal_class(spa), size,
2639                     new_bp, 1, txg, NULL,
2640                     METASLAB_FASTWRITE);
2641         }
2642
2643         if (error == 0) {
2644                 BP_SET_LSIZE(new_bp, size);
2645                 BP_SET_PSIZE(new_bp, size);
2646                 BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
2647                 BP_SET_CHECKSUM(new_bp,
2648                     spa_version(spa) >= SPA_VERSION_SLIM_ZIL
2649                     ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
2650                 BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
2651                 BP_SET_LEVEL(new_bp, 0);
2652                 BP_SET_DEDUP(new_bp, 0);
2653                 BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
2654         }
2655
2656         return (error);
2657 }
2658
2659 /*
2660  * Free an intent log block.
2661  */
2662 void
2663 zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp)
2664 {
2665         ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG);
2666         ASSERT(!BP_IS_GANG(bp));
2667
2668         zio_free(spa, txg, bp);
2669 }
2670
2671 /*
2672  * ==========================================================================
2673  * Read and write to physical devices
2674  * ==========================================================================
2675  */
2676
2677
2678 /*
2679  * Issue an I/O to the underlying vdev. Typically the issue pipeline
2680  * stops after this stage and will resume upon I/O completion.
2681  * However, there are instances where the vdev layer may need to
2682  * continue the pipeline when an I/O was not issued. Since the I/O
2683  * that was sent to the vdev layer might be different than the one
2684  * currently active in the pipeline (see vdev_queue_io()), we explicitly
2685  * force the underlying vdev layers to call either zio_execute() or
2686  * zio_interrupt() to ensure that the pipeline continues with the correct I/O.
2687  */
2688 static int
2689 zio_vdev_io_start(zio_t *zio)
2690 {
2691         vdev_t *vd = zio->io_vd;
2692         uint64_t align;
2693         spa_t *spa = zio->io_spa;
2694
2695         ASSERT(zio->io_error == 0);
2696         ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
2697
2698         if (vd == NULL) {
2699                 if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
2700                         spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
2701
2702                 /*
2703                  * The mirror_ops handle multiple DVAs in a single BP.
2704                  */
2705                 vdev_mirror_ops.vdev_op_io_start(zio);
2706                 return (ZIO_PIPELINE_STOP);
2707         }
2708
2709         /*
2710          * We keep track of time-sensitive I/Os so that the scan thread
2711          * can quickly react to certain workloads.  In particular, we care
2712          * about non-scrubbing, top-level reads and writes with the following
2713          * characteristics:
2714          *      - synchronous writes of user data to non-slog devices
2715          *      - any reads of user data
2716          * When these conditions are met, adjust the timestamp of spa_last_io
2717          * which allows the scan thread to adjust its workload accordingly.
2718          */
2719         if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
2720             vd == vd->vdev_top && !vd->vdev_islog &&
2721             zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
2722             zio->io_txg != spa_syncing_txg(spa)) {
2723                 uint64_t old = spa->spa_last_io;
2724                 uint64_t new = ddi_get_lbolt64();
2725                 if (old != new)
2726                         (void) atomic_cas_64(&spa->spa_last_io, old, new);
2727         }
2728
2729         align = 1ULL << vd->vdev_top->vdev_ashift;
2730
2731         if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
2732             P2PHASE(zio->io_size, align) != 0) {
2733                 /* Transform logical writes to be a full physical block size. */
2734                 uint64_t asize = P2ROUNDUP(zio->io_size, align);
2735                 char *abuf = zio_buf_alloc(asize);
2736                 ASSERT(vd == vd->vdev_top);
2737                 if (zio->io_type == ZIO_TYPE_WRITE) {
2738                         bcopy(zio->io_data, abuf, zio->io_size);
2739                         bzero(abuf + zio->io_size, asize - zio->io_size);
2740                 }
2741                 zio_push_transform(zio, abuf, asize, asize, zio_subblock);
2742         }
2743
2744         /*
2745          * If this is not a physical io, make sure that it is properly aligned
2746          * before proceeding.
2747          */
2748         if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) {
2749                 ASSERT0(P2PHASE(zio->io_offset, align));
2750                 ASSERT0(P2PHASE(zio->io_size, align));
2751         } else {
2752                 /*
2753                  * For physical writes, we allow 512b aligned writes and assume
2754                  * the device will perform a read-modify-write as necessary.
2755                  */
2756                 ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE));
2757                 ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE));
2758         }
2759
2760         VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
2761
2762         /*
2763          * If this is a repair I/O, and there's no self-healing involved --
2764          * that is, we're just resilvering what we expect to resilver --
2765          * then don't do the I/O unless zio's txg is actually in vd's DTL.
2766          * This prevents spurious resilvering with nested replication.
2767          * For example, given a mirror of mirrors, (A+B)+(C+D), if only
2768          * A is out of date, we'll read from C+D, then use the data to
2769          * resilver A+B -- but we don't actually want to resilver B, just A.
2770          * The top-level mirror has no way to know this, so instead we just
2771          * discard unnecessary repairs as we work our way down the vdev tree.
2772          * The same logic applies to any form of nested replication:
2773          * ditto + mirror, RAID-Z + replacing, etc.  This covers them all.
2774          */
2775         if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
2776             !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
2777             zio->io_txg != 0 && /* not a delegated i/o */
2778             !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
2779                 ASSERT(zio->io_type == ZIO_TYPE_WRITE);
2780                 zio_vdev_io_bypass(zio);
2781                 return (ZIO_PIPELINE_CONTINUE);
2782         }
2783
2784         if (vd->vdev_ops->vdev_op_leaf &&
2785             (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
2786
2787                 if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio))
2788                         return (ZIO_PIPELINE_CONTINUE);
2789
2790                 if ((zio = vdev_queue_io(zio)) == NULL)
2791                         return (ZIO_PIPELINE_STOP);
2792
2793                 if (!vdev_accessible(vd, zio)) {
2794                         zio->io_error = SET_ERROR(ENXIO);
2795                         zio_interrupt(zio);
2796                         return (ZIO_PIPELINE_STOP);
2797                 }
2798         }
2799
2800         vd->vdev_ops->vdev_op_io_start(zio);
2801         return (ZIO_PIPELINE_STOP);
2802 }
2803
2804 static int
2805 zio_vdev_io_done(zio_t *zio)
2806 {
2807         vdev_t *vd = zio->io_vd;
2808         vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
2809         boolean_t unexpected_error = B_FALSE;
2810
2811         if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
2812                 return (ZIO_PIPELINE_STOP);
2813
2814         ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
2815
2816         if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {
2817
2818                 vdev_queue_io_done(zio);
2819
2820                 if (zio->io_type == ZIO_TYPE_WRITE)
2821                         vdev_cache_write(zio);
2822
2823                 if (zio_injection_enabled && zio->io_error == 0)
2824                         zio->io_error = zio_handle_device_injection(vd,
2825                             zio, EIO);
2826
2827                 if (zio_injection_enabled && zio->io_error == 0)
2828                         zio->io_error = zio_handle_label_injection(zio, EIO);
2829
2830                 if (zio->io_error) {
2831                         if (!vdev_accessible(vd, zio)) {
2832                                 zio->io_error = SET_ERROR(ENXIO);
2833                         } else {
2834                                 unexpected_error = B_TRUE;
2835                         }
2836                 }
2837         }
2838
2839         ops->vdev_op_io_done(zio);
2840
2841         if (unexpected_error)
2842                 VERIFY(vdev_probe(vd, zio) == NULL);
2843
2844         return (ZIO_PIPELINE_CONTINUE);
2845 }
2846
2847 /*
2848  * For non-raidz ZIOs, we can just copy aside the bad data read from the
2849  * disk, and use that to finish the checksum ereport later.
2850  */
2851 static void
2852 zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
2853     const void *good_buf)
2854 {
2855         /* no processing needed */
2856         zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
2857 }
2858
2859 /*ARGSUSED*/
2860 void
2861 zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
2862 {
2863         void *buf = zio_buf_alloc(zio->io_size);
2864
2865         bcopy(zio->io_data, buf, zio->io_size);
2866
2867         zcr->zcr_cbinfo = zio->io_size;
2868         zcr->zcr_cbdata = buf;
2869         zcr->zcr_finish = zio_vsd_default_cksum_finish;
2870         zcr->zcr_free = zio_buf_free;
2871 }
2872
2873 static int
2874 zio_vdev_io_assess(zio_t *zio)
2875 {
2876         vdev_t *vd = zio->io_vd;
2877
2878         if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
2879                 return (ZIO_PIPELINE_STOP);
2880
2881         if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
2882                 spa_config_exit(zio->io_spa, SCL_ZIO, zio);
2883
2884         if (zio->io_vsd != NULL) {
2885                 zio->io_vsd_ops->vsd_free(zio);
2886                 zio->io_vsd = NULL;
2887         }
2888
2889         if (zio_injection_enabled && zio->io_error == 0)
2890                 zio->io_error = zio_handle_fault_injection(zio, EIO);
2891
2892         /*
2893          * If the I/O failed, determine whether we should attempt to retry it.
2894          *
2895          * On retry, we cut in line in the issue queue, since we don't want
2896          * compression/checksumming/etc. work to prevent our (cheap) IO reissue.
2897          */
2898         if (zio->io_error && vd == NULL &&
2899             !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
2900                 ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */
2901                 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS));  /* not a leaf */
2902                 zio->io_error = 0;
2903                 zio->io_flags |= ZIO_FLAG_IO_RETRY |
2904                     ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
2905                 zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
2906                 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
2907                     zio_requeue_io_start_cut_in_line);
2908                 return (ZIO_PIPELINE_STOP);
2909         }
2910
2911         /*
2912          * If we got an error on a leaf device, convert it to ENXIO
2913          * if the device is not accessible at all.
2914          */
2915         if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
2916             !vdev_accessible(vd, zio))
2917                 zio->io_error = SET_ERROR(ENXIO);
2918
2919         /*
2920          * If we can't write to an interior vdev (mirror or RAID-Z),
2921          * set vdev_cant_write so that we stop trying to allocate from it.
2922          */
2923         if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
2924             vd != NULL && !vd->vdev_ops->vdev_op_leaf) {
2925                 vd->vdev_cant_write = B_TRUE;
2926         }
2927
2928         if (zio->io_error)
2929                 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2930
2931         if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
2932             zio->io_physdone != NULL) {
2933                 ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
2934                 ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
2935                 zio->io_physdone(zio->io_logical);
2936         }
2937
2938         return (ZIO_PIPELINE_CONTINUE);
2939 }
2940
2941 void
2942 zio_vdev_io_reissue(zio_t *zio)
2943 {
2944         ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
2945         ASSERT(zio->io_error == 0);
2946
2947         zio->io_stage >>= 1;
2948 }
2949
2950 void
2951 zio_vdev_io_redone(zio_t *zio)
2952 {
2953         ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
2954
2955         zio->io_stage >>= 1;
2956 }
2957
2958 void
2959 zio_vdev_io_bypass(zio_t *zio)
2960 {
2961         ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
2962         ASSERT(zio->io_error == 0);
2963
2964         zio->io_flags |= ZIO_FLAG_IO_BYPASS;
2965         zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
2966 }
2967
2968 /*
2969  * ==========================================================================
2970  * Generate and verify checksums
2971  * ==========================================================================
2972  */
2973 static int
2974 zio_checksum_generate(zio_t *zio)
2975 {
2976         blkptr_t *bp = zio->io_bp;
2977         enum zio_checksum checksum;
2978
2979         if (bp == NULL) {
2980                 /*
2981                  * This is zio_write_phys().
2982                  * We're either generating a label checksum, or none at all.
2983                  */
2984                 checksum = zio->io_prop.zp_checksum;
2985
2986                 if (checksum == ZIO_CHECKSUM_OFF)
2987                         return (ZIO_PIPELINE_CONTINUE);
2988
2989                 ASSERT(checksum == ZIO_CHECKSUM_LABEL);
2990         } else {
2991                 if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
2992                         ASSERT(!IO_IS_ALLOCATING(zio));
2993                         checksum = ZIO_CHECKSUM_GANG_HEADER;
2994                 } else {
2995                         checksum = BP_GET_CHECKSUM(bp);
2996                 }
2997         }
2998
2999         zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size);
3000
3001         return (ZIO_PIPELINE_CONTINUE);
3002 }
3003
3004 static int
3005 zio_checksum_verify(zio_t *zio)
3006 {
3007         zio_bad_cksum_t info;
3008         blkptr_t *bp = zio->io_bp;
3009         int error;
3010
3011         ASSERT(zio->io_vd != NULL);
3012
3013         if (bp == NULL) {
3014                 /*
3015                  * This is zio_read_phys().
3016                  * We're either verifying a label checksum, or nothing at all.
3017                  */
3018                 if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
3019                         return (ZIO_PIPELINE_CONTINUE);
3020
3021                 ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
3022         }
3023
3024         if ((error = zio_checksum_error(zio, &info)) != 0) {
3025                 zio->io_error = error;
3026                 if (error == ECKSUM &&
3027                     !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
3028                         zfs_ereport_start_checksum(zio->io_spa,
3029                             zio->io_vd, zio, zio->io_offset,
3030                             zio->io_size, NULL, &info);
3031                 }
3032         }
3033
3034         return (ZIO_PIPELINE_CONTINUE);
3035 }
3036
3037 /*
3038  * Called by RAID-Z to ensure we don't compute the checksum twice.
3039  */
3040 void
3041 zio_checksum_verified(zio_t *zio)
3042 {
3043         zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
3044 }
3045
3046 /*
3047  * ==========================================================================
 * Error rank.  Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
 * An error of 0 indicates success.  ENXIO indicates whole-device failure,
 * which may be transient (e.g. unplugged) or permanent.  ECKSUM and EIO
3051  * indicate errors that are specific to one I/O, and most likely permanent.
3052  * Any other error is presumed to be worse because we weren't expecting it.
3053  * ==========================================================================
3054  */
3055 int
3056 zio_worst_error(int e1, int e2)
3057 {
3058         static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
3059         int r1, r2;
3060
3061         for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++)
3062                 if (e1 == zio_error_rank[r1])
3063                         break;
3064
3065         for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++)
3066                 if (e2 == zio_error_rank[r2])
3067                         break;
3068
3069         return (r1 > r2 ? e1 : e2);
3070 }
3071
3072 /*
3073  * ==========================================================================
3074  * I/O completion
3075  * ==========================================================================
3076  */
3077 static int
3078 zio_ready(zio_t *zio)
3079 {
3080         blkptr_t *bp = zio->io_bp;
3081         zio_t *pio, *pio_next;
3082
3083         if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
3084             zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
3085                 return (ZIO_PIPELINE_STOP);
3086
3087         if (zio->io_ready) {
3088                 ASSERT(IO_IS_ALLOCATING(zio));
3089                 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
3090                     (zio->io_flags & ZIO_FLAG_NOPWRITE));
3091                 ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
3092
3093                 zio->io_ready(zio);
3094         }
3095
3096         if (bp != NULL && bp != &zio->io_bp_copy)
3097                 zio->io_bp_copy = *bp;
3098
3099         if (zio->io_error)
3100                 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
3101
3102         mutex_enter(&zio->io_lock);
3103         zio->io_state[ZIO_WAIT_READY] = 1;
3104         pio = zio_walk_parents(zio);
3105         mutex_exit(&zio->io_lock);
3106
3107         /*
3108          * As we notify zio's parents, new parents could be added.
3109          * New parents go to the head of zio's io_parent_list, however,
3110          * so we will (correctly) not notify them.  The remainder of zio's
3111          * io_parent_list, from 'pio_next' onward, cannot change because
3112          * all parents must wait for us to be done before they can be done.
3113          */
3114         for (; pio != NULL; pio = pio_next) {
3115                 pio_next = zio_walk_parents(zio);
3116                 zio_notify_parent(pio, zio, ZIO_WAIT_READY);
3117         }
3118
3119         if (zio->io_flags & ZIO_FLAG_NODATA) {
3120                 if (BP_IS_GANG(bp)) {
3121                         zio->io_flags &= ~ZIO_FLAG_NODATA;
3122                 } else {
3123                         ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE);
3124                         zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
3125                 }
3126         }
3127
3128         if (zio_injection_enabled &&
3129             zio->io_spa->spa_syncing_txg == zio->io_txg)
3130                 zio_handle_ignored_writes(zio);
3131
3132         return (ZIO_PIPELINE_CONTINUE);
3133 }
3134
3135 static int
3136 zio_done(zio_t *zio)
3137 {
3138         zio_t *pio, *pio_next;
3139         int c, w;
3140
3141         /*
3142          * If our children haven't all completed,
3143          * wait for them and then repeat this pipeline stage.
3144          */
3145         if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
3146             zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
3147             zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) ||
3148             zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
3149                 return (ZIO_PIPELINE_STOP);
3150
3151         for (c = 0; c < ZIO_CHILD_TYPES; c++)
3152                 for (w = 0; w < ZIO_WAIT_TYPES; w++)
3153                         ASSERT(zio->io_children[c][w] == 0);
3154
3155         if (zio->io_bp != NULL && !BP_IS_EMBEDDED(zio->io_bp)) {
3156                 ASSERT(zio->io_bp->blk_pad[0] == 0);
3157                 ASSERT(zio->io_bp->blk_pad[1] == 0);
3158                 ASSERT(bcmp(zio->io_bp, &zio->io_bp_copy,
3159                     sizeof (blkptr_t)) == 0 ||
3160                     (zio->io_bp == zio_unique_parent(zio)->io_bp));
3161                 if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(zio->io_bp) &&
3162                     zio->io_bp_override == NULL &&
3163                     !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
3164                         ASSERT(!BP_SHOULD_BYTESWAP(zio->io_bp));
3165                         ASSERT3U(zio->io_prop.zp_copies, <=,
3166                             BP_GET_NDVAS(zio->io_bp));
3167                         ASSERT(BP_COUNT_GANG(zio->io_bp) == 0 ||
3168                             (BP_COUNT_GANG(zio->io_bp) ==
3169                             BP_GET_NDVAS(zio->io_bp)));
3170                 }
3171                 if (zio->io_flags & ZIO_FLAG_NOPWRITE)
3172                         VERIFY(BP_EQUAL(zio->io_bp, &zio->io_bp_orig));
3173         }
3174
3175         /*
3176          * If there were child vdev/gang/ddt errors, they apply to us now.
3177          */
3178         zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
3179         zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
3180         zio_inherit_child_errors(zio, ZIO_CHILD_DDT);
3181
3182         /*
3183          * If the I/O on the transformed data was successful, generate any
3184          * checksum reports now while we still have the transformed data.
3185          */
3186         if (zio->io_error == 0) {
3187                 while (zio->io_cksum_report != NULL) {
3188                         zio_cksum_report_t *zcr = zio->io_cksum_report;
3189                         uint64_t align = zcr->zcr_align;
3190                         uint64_t asize = P2ROUNDUP(zio->io_size, align);
3191                         char *abuf = zio->io_data;
3192
3193                         if (asize != zio->io_size) {
3194                                 abuf = zio_buf_alloc(asize);
3195                                 bcopy(zio->io_data, abuf, zio->io_size);
3196                                 bzero(abuf+zio->io_size, asize-zio->io_size);
3197                         }
3198
3199                         zio->io_cksum_report = zcr->zcr_next;
3200                         zcr->zcr_next = NULL;
3201                         zcr->zcr_finish(zcr, abuf);
3202                         zfs_ereport_free_checksum(zcr);
3203
3204                         if (asize != zio->io_size)
3205                                 zio_buf_free(abuf, asize);
3206                 }
3207         }
3208
3209         zio_pop_transforms(zio);        /* note: may set zio->io_error */
3210
3211         vdev_stat_update(zio, zio->io_size);
3212
3213         /*
3214          * If this I/O is attached to a particular vdev and is slow, exceeding
3215          * 30 seconds to complete, post an error describing the I/O delay.
3216          * We ignore these errors if the device is currently unavailable.
3217          */
3218         if (zio->io_delay >= MSEC_TO_TICK(zio_delay_max)) {
3219                 if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd))
3220                         zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa,
3221                             zio->io_vd, zio, 0, 0);
3222         }
3223
3224         if (zio->io_error) {
3225                 /*
3226                  * If this I/O is attached to a particular vdev,
3227                  * generate an error message describing the I/O failure
3228                  * at the block level.  We ignore these errors if the
3229                  * device is currently unavailable.
3230                  */
3231                 if (zio->io_error != ECKSUM && zio->io_vd != NULL &&
3232                         !vdev_is_dead(zio->io_vd))
3233                         zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa,
3234                                                 zio->io_vd, zio, 0, 0);
3235
3236                 if ((zio->io_error == EIO || !(zio->io_flags &
3237                     (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
3238                     zio == zio->io_logical) {
3239                         /*
3240                          * For logical I/O requests, tell the SPA to log the
3241                          * error and generate a logical data ereport.
3242                          */
3243                         spa_log_error(zio->io_spa, zio);
3244                         zfs_ereport_post(FM_EREPORT_ZFS_DATA, zio->io_spa,
3245                             NULL, zio, 0, 0);
3246                 }
3247         }
3248
3249         if (zio->io_error && zio == zio->io_logical) {
3250                 /*
3251                  * Determine whether zio should be reexecuted.  This will
3252                  * propagate all the way to the root via zio_notify_parent().
3253                  */
3254                 ASSERT(zio->io_vd == NULL && zio->io_bp != NULL);
3255                 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
3256
3257                 if (IO_IS_ALLOCATING(zio) &&
3258                     !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
3259                         if (zio->io_error != ENOSPC)
3260                                 zio->io_reexecute |= ZIO_REEXECUTE_NOW;
3261                         else
3262                                 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
3263                 }
3264
3265                 if ((zio->io_type == ZIO_TYPE_READ ||
3266                     zio->io_type == ZIO_TYPE_FREE) &&
3267                     !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
3268                     zio->io_error == ENXIO &&
3269                     spa_load_state(zio->io_spa) == SPA_LOAD_NONE &&
3270                     spa_get_failmode(zio->io_spa) != ZIO_FAILURE_MODE_CONTINUE)
3271                         zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
3272
3273                 if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
3274                         zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
3275
3276                 /*
3277                  * Here is a possibly good place to attempt to do
3278                  * either combinatorial reconstruction or error correction
3279                  * based on checksums.  It also might be a good place
3280                  * to send out preliminary ereports before we suspend
3281                  * processing.
3282                  */
3283         }
3284
3285         /*
3286          * If there were logical child errors, they apply to us now.
3287          * We defer this until now to avoid conflating logical child
3288          * errors with errors that happened to the zio itself when
3289          * updating vdev stats and reporting FMA events above.
3290          */
3291         zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
3292
3293         if ((zio->io_error || zio->io_reexecute) &&
3294             IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
3295             !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
3296                 zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp);
3297
3298         zio_gang_tree_free(&zio->io_gang_tree);
3299
3300         /*
3301          * Godfather I/Os should never suspend.
3302          */
3303         if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
3304             (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
3305                 zio->io_reexecute = 0;
3306
3307         if (zio->io_reexecute) {
3308                 /*
3309                  * This is a logical I/O that wants to reexecute.
3310                  *
3311                  * Reexecute is top-down.  When an i/o fails, if it's not
3312                  * the root, it simply notifies its parent and sticks around.
3313                  * The parent, seeing that it still has children in zio_done(),
3314                  * does the same.  This percolates all the way up to the root.
3315                  * The root i/o will reexecute or suspend the entire tree.
3316                  *
3317                  * This approach ensures that zio_reexecute() honors
3318                  * all the original i/o dependency relationships, e.g.
3319                  * parents not executing until children are ready.
3320                  */
3321                 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
3322
3323                 zio->io_gang_leader = NULL;
3324
3325                 mutex_enter(&zio->io_lock);
3326                 zio->io_state[ZIO_WAIT_DONE] = 1;
3327                 mutex_exit(&zio->io_lock);
3328
3329                 /*
3330                  * "The Godfather" I/O monitors its children but is
3331                  * not a true parent to them. It will track them through
3332                  * the pipeline but severs its ties whenever they get into
3333                  * trouble (e.g. suspended). This allows "The Godfather"
3334                  * I/O to return status without blocking.
3335                  */
3336                 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
3337                         zio_link_t *zl = zio->io_walk_link;
3338                         pio_next = zio_walk_parents(zio);
3339
3340                         if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
3341                             (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
3342                                 zio_remove_child(pio, zio, zl);
3343                                 zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
3344                         }
3345                 }
3346
3347                 if ((pio = zio_unique_parent(zio)) != NULL) {
3348                         /*
3349                          * We're not a root i/o, so there's nothing to do
3350                          * but notify our parent.  Don't propagate errors
3351                          * upward since we haven't permanently failed yet.
3352                          */
3353                         ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
3354                         zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
3355                         zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
3356                 } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
3357                         /*
3358                          * We'd fail again if we reexecuted now, so suspend
3359                          * until conditions improve (e.g. device comes online).
3360                          */
3361                         zio_suspend(zio->io_spa, zio);
3362                 } else {
3363                         /*
3364                          * Reexecution is potentially a huge amount of work.
3365                          * Hand it off to the otherwise-unused claim taskq.
3366                          */
3367                         ASSERT(taskq_empty_ent(&zio->io_tqent));
3368                         spa_taskq_dispatch_ent(zio->io_spa,
3369                             ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE,
3370                             (task_func_t *)zio_reexecute, zio, 0,
3371                             &zio->io_tqent);
3372                 }
3373                 return (ZIO_PIPELINE_STOP);
3374         }
3375
3376         ASSERT(zio->io_child_count == 0);
3377         ASSERT(zio->io_reexecute == 0);
3378         ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
3379
3380         /*
3381          * Report any checksum errors, since the I/O is complete.
3382          */
3383         while (zio->io_cksum_report != NULL) {
3384                 zio_cksum_report_t *zcr = zio->io_cksum_report;
3385                 zio->io_cksum_report = zcr->zcr_next;
3386                 zcr->zcr_next = NULL;
3387                 zcr->zcr_finish(zcr, NULL);
3388                 zfs_ereport_free_checksum(zcr);
3389         }
3390
3391         if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp &&
3392             !BP_IS_HOLE(zio->io_bp) && !BP_IS_EMBEDDED(zio->io_bp) &&
3393             !(zio->io_flags & ZIO_FLAG_NOPWRITE)) {
3394                 metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp);
3395         }
3396
3397         /*
3398          * It is the responsibility of the done callback to ensure that this
3399          * particular zio is no longer discoverable for adoption, and as
3400          * such, cannot acquire any new parents.
3401          */
3402         if (zio->io_done)
3403                 zio->io_done(zio);
3404
3405         mutex_enter(&zio->io_lock);
3406         zio->io_state[ZIO_WAIT_DONE] = 1;
3407         mutex_exit(&zio->io_lock);
3408
3409         for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
3410                 zio_link_t *zl = zio->io_walk_link;
3411                 pio_next = zio_walk_parents(zio);
3412                 zio_remove_child(pio, zio, zl);
3413                 zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
3414         }
3415
3416         if (zio->io_waiter != NULL) {
3417                 mutex_enter(&zio->io_lock);
3418                 zio->io_executor = NULL;
3419                 cv_broadcast(&zio->io_cv);
3420                 mutex_exit(&zio->io_lock);
3421         } else {
3422                 zio_destroy(zio);
3423         }
3424
3425         return (ZIO_PIPELINE_STOP);
3426 }
3427
3428 /*
3429  * ==========================================================================
3430  * I/O pipeline definition
3431  * ==========================================================================
3432  */
/*
 * Table of pipeline stage handlers, one function pointer per slot.  The
 * first slot is NULL (no handler) and the last slot is zio_done.
 *
 * NOTE(review): presumably this table is indexed by the position of the
 * corresponding ZIO_STAGE_* value, in which case the order of the entries
 * below must match that enumeration exactly -- confirm against the code
 * that walks the pipeline before adding, removing or reordering entries.
 */
static zio_pipe_stage_t *zio_pipeline[] = {
        NULL,
        zio_read_bp_init,
        zio_free_bp_init,
        zio_issue_async,
        zio_write_bp_init,
        zio_checksum_generate,
        zio_nop_write,
        zio_ddt_read_start,
        zio_ddt_read_done,
        zio_ddt_write,
        zio_ddt_free,
        zio_gang_assemble,
        zio_gang_issue,
        zio_dva_allocate,
        zio_dva_free,
        zio_dva_claim,
        zio_ready,
        zio_vdev_io_start,
        zio_vdev_io_done,
        zio_vdev_io_assess,
        zio_checksum_verify,
        zio_done
};
3457
3458
3459
3460
3461 /*
3462  * Compare two zbookmark_phys_t's to see which we would reach first in a
3463  * pre-order traversal of the object tree.
3464  *
3465  * This is simple in every case aside from the meta-dnode object. For all other
3466  * objects, we traverse them in order (object 1 before object 2, and so on).
3467  * However, all of these objects are traversed while traversing object 0, since
3468  * the data it points to is the list of objects.  Thus, we need to convert to a
3469  * canonical representation so we can compare meta-dnode bookmarks to
3470  * non-meta-dnode bookmarks.
3471  *
3472  * We do this by calculating "equivalents" for each field of the zbookmark.
3473  * zbookmarks outside of the meta-dnode use their own object and level, and
3474  * calculate the level 0 equivalent (the first L0 blkid that is contained in the
3475  * blocks this bookmark refers to) by multiplying their blkid by their span
3476  * (the number of L0 blocks contained within one block at their level).
3477  * zbookmarks inside the meta-dnode calculate their object equivalent
3478  * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use
3479  * level + 1<<31 (any value larger than a level could ever be) for their level.
3480  * This causes them to always compare before a bookmark in their object
3481  * equivalent, compare appropriately to bookmarks in other objects, and to
3482  * compare appropriately to other bookmarks in the meta-dnode.
3483  */
3484 int
3485 zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2,
3486     const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2)
3487 {
3488         /*
3489          * These variables represent the "equivalent" values for the zbookmark,
3490          * after converting zbookmarks inside the meta dnode to their
3491          * normal-object equivalents.
3492          */
3493         uint64_t zb1obj, zb2obj;
3494         uint64_t zb1L0, zb2L0;
3495         uint64_t zb1level, zb2level;
3496
3497         if (zb1->zb_object == zb2->zb_object &&
3498             zb1->zb_level == zb2->zb_level &&
3499             zb1->zb_blkid == zb2->zb_blkid)
3500                 return (0);
3501
3502         /*
3503          * BP_SPANB calculates the span in blocks.
3504          */
3505         zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level);
3506         zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level);
3507
3508         if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
3509                 zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
3510                 zb1L0 = 0;
3511                 zb1level = zb1->zb_level + COMPARE_META_LEVEL;
3512         } else {
3513                 zb1obj = zb1->zb_object;
3514                 zb1level = zb1->zb_level;
3515         }
3516
3517         if (zb2->zb_object == DMU_META_DNODE_OBJECT) {
3518                 zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
3519                 zb2L0 = 0;
3520                 zb2level = zb2->zb_level + COMPARE_META_LEVEL;
3521         } else {
3522                 zb2obj = zb2->zb_object;
3523                 zb2level = zb2->zb_level;
3524         }
3525
3526         /* Now that we have a canonical representation, do the comparison. */
3527         if (zb1obj != zb2obj)
3528                 return (zb1obj < zb2obj ? -1 : 1);
3529         else if (zb1L0 != zb2L0)
3530                 return (zb1L0 < zb2L0 ? -1 : 1);
3531         else if (zb1level != zb2level)
3532                 return (zb1level > zb2level ? -1 : 1);
3533         /*
3534          * This can (theoretically) happen if the bookmarks have the same object
3535          * and level, but different blkids, if the block sizes are not the same.
3536          * There is presently no way to change the indirect block sizes
3537          */
3538         return (0);
3539 }
3540
3541 /*
3542  *  This function checks the following: given that last_block is the place that
3543  *  our traversal stopped last time, does that guarantee that we've visited
3544  *  every node under subtree_root?  Therefore, we can't just use the raw output
3545  *  of zbookmark_compare.  We have to pass in a modified version of
3546  *  subtree_root; by incrementing the block id, and then checking whether
3547  *  last_block is before or equal to that, we can tell whether or not having
3548  *  visited last_block implies that all of subtree_root's children have been
3549  *  visited.
3550  */
3551 boolean_t
3552 zbookmark_subtree_completed(const dnode_phys_t *dnp,
3553     const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block)
3554 {
3555         zbookmark_phys_t mod_zb = *subtree_root;
3556         mod_zb.zb_blkid++;
3557         ASSERT(last_block->zb_level == 0);
3558
3559         /* The objset_phys_t isn't before anything. */
3560         if (dnp == NULL)
3561                 return (B_FALSE);
3562
3563         /*
3564          * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the
3565          * data block size in sectors, because that variable is only used if
3566          * the bookmark refers to a block in the meta-dnode.  Since we don't
3567          * know without examining it what object it refers to, and there's no
3568          * harm in passing in this value in other cases, we always pass it in.
3569          *
3570          * We pass in 0 for the indirect block size shift because zb2 must be
3571          * level 0.  The indirect block size is only used to calculate the span
3572          * of the bookmark, but since the bookmark must be level 0, the span is
3573          * always 1, so the math works out.
3574          *
3575          * If you make changes to how the zbookmark_compare code works, be sure
3576          * to make sure that this code still works afterwards.
3577          */
3578         return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift,
3579             1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb,
3580             last_block) <= 0);
3581 }
3582
/*
 * Kernel-module glue: compiled only when both _KERNEL and HAVE_SPL are
 * defined.
 */
#if defined(_KERNEL) && defined(HAVE_SPL)
/* Symbols from this file exported for use by other kernel modules. */
EXPORT_SYMBOL(zio_type_name);
EXPORT_SYMBOL(zio_buf_alloc);
EXPORT_SYMBOL(zio_data_buf_alloc);
EXPORT_SYMBOL(zio_buf_alloc_flags);
EXPORT_SYMBOL(zio_buf_free);
EXPORT_SYMBOL(zio_data_buf_free);

/*
 * Tunables exposed as module parameters.  Each one is registered as an int
 * with mode 0644 (writable by owner, readable by everyone) and paired with
 * a one-line description.
 */
module_param(zio_delay_max, int, 0644);
MODULE_PARM_DESC(zio_delay_max, "Max zio millisec delay before posting event");

module_param(zio_requeue_io_start_cut_in_line, int, 0644);
MODULE_PARM_DESC(zio_requeue_io_start_cut_in_line, "Prioritize requeued I/O");

module_param(zfs_sync_pass_deferred_free, int, 0644);
MODULE_PARM_DESC(zfs_sync_pass_deferred_free,
        "Defer frees starting in this pass");

module_param(zfs_sync_pass_dont_compress, int, 0644);
MODULE_PARM_DESC(zfs_sync_pass_dont_compress,
        "Don't compress starting in this pass");

module_param(zfs_sync_pass_rewrite, int, 0644);
MODULE_PARM_DESC(zfs_sync_pass_rewrite,
        "Rewrite new bps starting in this pass");
#endif /* _KERNEL && HAVE_SPL */