module/zfs/zio.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  24  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  25  */
  26
  27 #include <sys/sysmacros.h>
  28 #include <sys/zfs_context.h>
  29 #include <sys/fm/fs/zfs.h>
  30 #include <sys/spa.h>
  31 #include <sys/txg.h>
  32 #include <sys/spa_impl.h>
  33 #include <sys/vdev_impl.h>
  34 #include <sys/zio_impl.h>
  35 #include <sys/zio_compress.h>
  36 #include <sys/zio_checksum.h>
  37 #include <sys/dmu_objset.h>
  38 #include <sys/arc.h>
  39 #include <sys/ddt.h>
  40 #include <sys/blkptr.h>
  41 #include <sys/zfeature.h>
  42
  43 /*
  44  * ==========================================================================
  45  * I/O type descriptions
  46  * ==========================================================================
  47  */
  48 const char *zio_type_name[ZIO_TYPES] = {
  49         "z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl"
  50 };
  51
  52 /*
  53  * ==========================================================================
  54  * I/O kmem caches
  55  * ==========================================================================
  56  */
  57 kmem_cache_t *zio_cache;
  58 kmem_cache_t *zio_link_cache;
  59 kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
  60 kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
  61 int zio_delay_max = ZIO_DELAY_MAX;
  62
  63 #define ZIO_PIPELINE_CONTINUE           0x100
  64 #define ZIO_PIPELINE_STOP               0x101
  65
  66 /*
  67  * The following actions directly effect the spa's sync-to-convergence logic.
  68  * The values below define the sync pass when we start performing the action.
  69  * Care should be taken when changing these values as they directly impact
  70  * spa_sync() performance. Tuning these values may introduce subtle performance
  71  * pathologies and should only be done in the context of performance analysis.
  72  * These tunables will eventually be removed and replaced with #defines once
  73  * enough analysis has been done to determine optimal values.
  74  *
  75  * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
  76  * regular blocks are not deferred.
  77  */
  78 int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
  79 int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
  80 int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
  81
  82 /*
  83  * An allocating zio is one that either currently has the DVA allocate
  84  * stage set or will have it later in its lifetime.
  85  */
  86 #define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)
  87
  88 int zio_requeue_io_start_cut_in_line = 1;
  89
  90 #ifdef ZFS_DEBUG
  91 int zio_buf_debug_limit = 16384;
  92 #else
  93 int zio_buf_debug_limit = 0;
  94 #endif
  95
  96 static inline void __zio_execute(zio_t *zio);
  97
  98 void
  99 zio_init(void)
 100 {
 101         size_t c;
 102         vmem_t *data_alloc_arena = NULL;
 103
 104         zio_cache = kmem_cache_create("zio_cache",
 105             sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 106         zio_link_cache = kmem_cache_create("zio_link_cache",
 107             sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 108
 109         /*
 110          * For small buffers, we want a cache for each multiple of
 111          * SPA_MINBLOCKSIZE.  For larger buffers, we want a cache
 112          * for each quarter-power of 2.
 113          */
 114         for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
 115                 size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
 116                 size_t p2 = size;
 117                 size_t align = 0;
 118                 size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0;
 119
 120 #ifdef _ILP32
 121                 /*
 122                  * Cache size limited to 1M on 32-bit platforms until ARC
 123                  * buffers no longer require virtual address space.
 124                  */
 125                 if (size > zfs_max_recordsize)
 126                         break;
 127 #endif
 128
 129                 while (!ISP2(p2))
 130                         p2 &= p2 - 1;
 131
 132 #ifndef _KERNEL
 133                 /*
 134                  * If we are using watchpoints, put each buffer on its own page,
 135                  * to eliminate the performance overhead of trapping to the
 136                  * kernel when modifying a non-watched buffer that shares the
 137                  * page with a watched buffer.
 138                  */
 139                 if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
 140                         continue;
 141 #endif
 142                 if (size <= 4 * SPA_MINBLOCKSIZE) {
 143                         align = SPA_MINBLOCKSIZE;
 144                 } else if (IS_P2ALIGNED(size, p2 >> 2)) {
 145                         align = MIN(p2 >> 2, PAGESIZE);
 146                 }
 147
 148                 if (align != 0) {
 149                         char name[36];
 150                         (void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
 151                         zio_buf_cache[c] = kmem_cache_create(name, size,
 152                             align, NULL, NULL, NULL, NULL, NULL, cflags);
 153
 154                         (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
 155                         zio_data_buf_cache[c] = kmem_cache_create(name, size,
 156                             align, NULL, NULL, NULL, NULL,
 157                             data_alloc_arena, cflags);
 158                 }
 159         }
 160
 161         while (--c != 0) {
 162                 ASSERT(zio_buf_cache[c] != NULL);
 163                 if (zio_buf_cache[c - 1] == NULL)
 164                         zio_buf_cache[c - 1] = zio_buf_cache[c];
 165
 166                 ASSERT(zio_data_buf_cache[c] != NULL);
 167                 if (zio_data_buf_cache[c - 1] == NULL)
 168                         zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
 169         }
 170
 171         zio_inject_init();
 172
 173         lz4_init();
 174 }
 175
 176 void
 177 zio_fini(void)
 178 {
 179         size_t c;
 180         kmem_cache_t *last_cache = NULL;
 181         kmem_cache_t *last_data_cache = NULL;
 182
 183         for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
 184 #ifdef _ILP32
 185                 /*
 186                  * Cache size limited to 1M on 32-bit platforms until ARC
 187                  * buffers no longer require virtual address space.
 188                  */
 189                 if (((c + 1) << SPA_MINBLOCKSHIFT) > zfs_max_recordsize)
 190                         break;
 191 #endif
 192                 if (zio_buf_cache[c] != last_cache) {
 193                         last_cache = zio_buf_cache[c];
 194                         kmem_cache_destroy(zio_buf_cache[c]);
 195                 }
 196                 zio_buf_cache[c] = NULL;
 197
 198                 if (zio_data_buf_cache[c] != last_data_cache) {
 199                         last_data_cache = zio_data_buf_cache[c];
 200                         kmem_cache_destroy(zio_data_buf_cache[c]);
 201                 }
 202                 zio_data_buf_cache[c] = NULL;
 203         }
 204
 205         kmem_cache_destroy(zio_link_cache);
 206         kmem_cache_destroy(zio_cache);
 207
 208         zio_inject_fini();
 209
 210         lz4_fini();
 211 }
 212
 213 /*
 214  * ==========================================================================
 215  * Allocate and free I/O buffers
 216  * ==========================================================================
 217  */
 218
 219 /*
 220  * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
 221  * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
 222  * useful to inspect ZFS metadata, but if possible, we should avoid keeping
 223  * excess / transient data in-core during a crashdump.
 224  */
 225 void *
 226 zio_buf_alloc(size_t size)
 227 {
 228         size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 229
 230         VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 231
 232         return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
 233 }
 234
 235 /*
 236  * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
 237  * crashdump if the kernel panics.  This exists so that we will limit the amount
 238  * of ZFS data that shows up in a kernel crashdump.  (Thus reducing the amount
 239  * of kernel heap dumped to disk when the kernel panics)
 240  */
 241 void *
 242 zio_data_buf_alloc(size_t size)
 243 {
 244         size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 245
 246         VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 247
 248         return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
 249 }
 250
 251 /*
 252  * Use zio_buf_alloc_flags when specific allocation flags are needed.  e.g.
 253  * passing KM_NOSLEEP when it is acceptable for an allocation to fail.
 254  */
 255 void *
 256 zio_buf_alloc_flags(size_t size, int flags)
 257 {
 258         size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 259
 260         VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 261
 262         return (kmem_cache_alloc(zio_buf_cache[c], flags));
 263 }
 264
 265 void
 266 zio_buf_free(void *buf, size_t size)
 267 {
 268         size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 269
 270         VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 271
 272         kmem_cache_free(zio_buf_cache[c], buf);
 273 }
 274
 275 void
 276 zio_data_buf_free(void *buf, size_t size)
 277 {
 278         size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 279
 280         VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 281
 282         kmem_cache_free(zio_data_buf_cache[c], buf);
 283 }
 284
 285 /*
 286  * ==========================================================================
 287  * Push and pop I/O transform buffers
 288  * ==========================================================================
 289  */
 290 static void
 291 zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
 292         zio_transform_func_t *transform)
 293 {
 294         zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
 295
 296         zt->zt_orig_data = zio->io_data;
 297         zt->zt_orig_size = zio->io_size;
 298         zt->zt_bufsize = bufsize;
 299         zt->zt_transform = transform;
 300
 301         zt->zt_next = zio->io_transform_stack;
 302         zio->io_transform_stack = zt;
 303
 304         zio->io_data = data;
 305         zio->io_size = size;
 306 }
 307
 308 static void
 309 zio_pop_transforms(zio_t *zio)
 310 {
 311         zio_transform_t *zt;
 312
 313         while ((zt = zio->io_transform_stack) != NULL) {
 314                 if (zt->zt_transform != NULL)
 315                         zt->zt_transform(zio,
 316                             zt->zt_orig_data, zt->zt_orig_size);
 317
 318                 if (zt->zt_bufsize != 0)
 319                         zio_buf_free(zio->io_data, zt->zt_bufsize);
 320
 321                 zio->io_data = zt->zt_orig_data;
 322                 zio->io_size = zt->zt_orig_size;
 323                 zio->io_transform_stack = zt->zt_next;
 324
 325                 kmem_free(zt, sizeof (zio_transform_t));
 326         }
 327 }
 328
 329 /*
 330  * ==========================================================================
 331  * I/O transform callbacks for subblocks and decompression
 332  * ==========================================================================
 333  */
 334 static void
 335 zio_subblock(zio_t *zio, void *data, uint64_t size)
 336 {
 337         ASSERT(zio->io_size > size);
 338
 339         if (zio->io_type == ZIO_TYPE_READ)
 340                 bcopy(zio->io_data, data, size);
 341 }
 342
 343 static void
 344 zio_decompress(zio_t *zio, void *data, uint64_t size)
 345 {
 346         if (zio->io_error == 0 &&
 347             zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
 348             zio->io_data, data, zio->io_size, size) != 0)
 349                 zio->io_error = SET_ERROR(EIO);
 350 }
 351
 352 /*
 353  * ==========================================================================
 354  * I/O parent/child relationships and pipeline interlocks
 355  * ==========================================================================
 356  */
 357 /*
 358  * NOTE - Callers to zio_walk_parents() and zio_walk_children must
 359  *        continue calling these functions until they return NULL.
 360  *        Otherwise, the next caller will pick up the list walk in
 361  *        some indeterminate state.  (Otherwise every caller would
 362  *        have to pass in a cookie to keep the state represented by
 363  *        io_walk_link, which gets annoying.)
 364  */
 365 zio_t *
 366 zio_walk_parents(zio_t *cio)
 367 {
 368         zio_link_t *zl = cio->io_walk_link;
 369         list_t *pl = &cio->io_parent_list;
 370
 371         zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
 372         cio->io_walk_link = zl;
 373
 374         if (zl == NULL)
 375                 return (NULL);
 376
 377         ASSERT(zl->zl_child == cio);
 378         return (zl->zl_parent);
 379 }
 380
 381 zio_t *
 382 zio_walk_children(zio_t *pio)
 383 {
 384         zio_link_t *zl = pio->io_walk_link;
 385         list_t *cl = &pio->io_child_list;
 386
 387         zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
 388         pio->io_walk_link = zl;
 389
 390         if (zl == NULL)
 391                 return (NULL);
 392
 393         ASSERT(zl->zl_parent == pio);
 394         return (zl->zl_child);
 395 }
 396
 397 zio_t *
 398 zio_unique_parent(zio_t *cio)
 399 {
 400         zio_t *pio = zio_walk_parents(cio);
 401
 402         VERIFY(zio_walk_parents(cio) == NULL);
 403         return (pio);
 404 }
 405
 406 void
 407 zio_add_child(zio_t *pio, zio_t *cio)
 408 {
 409         zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
 410         int w;
 411
 412         /*
 413          * Logical I/Os can have logical, gang, or vdev children.
 414          * Gang I/Os can have gang or vdev children.
 415          * Vdev I/Os can only have vdev children.
 416          * The following ASSERT captures all of these constraints.
 417          */
 418         ASSERT(cio->io_child_type <= pio->io_child_type);
 419
 420         zl->zl_parent = pio;
 421         zl->zl_child = cio;
 422
 423         mutex_enter(&cio->io_lock);
 424         mutex_enter(&pio->io_lock);
 425
 426         ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
 427
 428         for (w = 0; w < ZIO_WAIT_TYPES; w++)
 429                 pio->io_children[cio->io_child_type][w] += !cio->io_state[w];
 430
 431         list_insert_head(&pio->io_child_list, zl);
 432         list_insert_head(&cio->io_parent_list, zl);
 433
 434         pio->io_child_count++;
 435         cio->io_parent_count++;
 436
 437         mutex_exit(&pio->io_lock);
 438         mutex_exit(&cio->io_lock);
 439 }
 440
 441 static void
 442 zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
 443 {
 444         ASSERT(zl->zl_parent == pio);
 445         ASSERT(zl->zl_child == cio);
 446
 447         mutex_enter(&cio->io_lock);
 448         mutex_enter(&pio->io_lock);
 449
 450         list_remove(&pio->io_child_list, zl);
 451         list_remove(&cio->io_parent_list, zl);
 452
 453         pio->io_child_count--;
 454         cio->io_parent_count--;
 455
 456         mutex_exit(&pio->io_lock);
 457         mutex_exit(&cio->io_lock);
 458
 459         kmem_cache_free(zio_link_cache, zl);
 460 }
 461
 462 static boolean_t
 463 zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
 464 {
 465         uint64_t *countp = &zio->io_children[child][wait];
 466         boolean_t waiting = B_FALSE;
 467
 468         mutex_enter(&zio->io_lock);
 469         ASSERT(zio->io_stall == NULL);
 470         if (*countp != 0) {
 471                 zio->io_stage >>= 1;
 472                 zio->io_stall = countp;
 473                 waiting = B_TRUE;
 474         }
 475         mutex_exit(&zio->io_lock);
 476
 477         return (waiting);
 478 }
 479
 480 __attribute__((always_inline))
 481 static inline void
 482 zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
 483 {
 484         uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
 485         int *errorp = &pio->io_child_error[zio->io_child_type];
 486
 487         mutex_enter(&pio->io_lock);
 488         if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
 489                 *errorp = zio_worst_error(*errorp, zio->io_error);
 490         pio->io_reexecute |= zio->io_reexecute;
 491         ASSERT3U(*countp, >, 0);
 492
 493         (*countp)--;
 494
 495         if (*countp == 0 && pio->io_stall == countp) {
 496                 pio->io_stall = NULL;
 497                 mutex_exit(&pio->io_lock);
 498                 __zio_execute(pio);
 499         } else {
 500                 mutex_exit(&pio->io_lock);
 501         }
 502 }
 503
 504 static void
 505 zio_inherit_child_errors(zio_t *zio, enum zio_child c)
 506 {
 507         if (zio->io_child_error[c] != 0 && zio->io_error == 0)
 508                 zio->io_error = zio->io_child_error[c];
 509 }
 510
 511 /*
 512  * ==========================================================================
 513  * Create the various types of I/O (read, write, free, etc)
 514  * ==========================================================================
 515  */
 516 static zio_t *
 517 zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 518     void *data, uint64_t size, zio_done_func_t *done, void *private,
 519     zio_type_t type, zio_priority_t priority, enum zio_flag flags,
 520     vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb,
 521     enum zio_stage stage, enum zio_stage pipeline)
 522 {
 523         zio_t *zio;
 524
 525         ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
 526         ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
 527         ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
 528
 529         ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
 530         ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
 531         ASSERT(vd || stage == ZIO_STAGE_OPEN);
 532
 533         zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
 534         bzero(zio, sizeof (zio_t));
 535
 536         mutex_init(&zio->io_lock, NULL, MUTEX_NOLOCKDEP, NULL);
 537         cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
 538
 539         list_create(&zio->io_parent_list, sizeof (zio_link_t),
 540             offsetof(zio_link_t, zl_parent_node));
 541         list_create(&zio->io_child_list, sizeof (zio_link_t),
 542             offsetof(zio_link_t, zl_child_node));
 543
 544         if (vd != NULL)
 545                 zio->io_child_type = ZIO_CHILD_VDEV;
 546         else if (flags & ZIO_FLAG_GANG_CHILD)
 547                 zio->io_child_type = ZIO_CHILD_GANG;
 548         else if (flags & ZIO_FLAG_DDT_CHILD)
 549                 zio->io_child_type = ZIO_CHILD_DDT;
 550         else
 551                 zio->io_child_type = ZIO_CHILD_LOGICAL;
 552
 553         if (bp != NULL) {
 554                 zio->io_bp = (blkptr_t *)bp;
 555                 zio->io_bp_copy = *bp;
 556                 zio->io_bp_orig = *bp;
 557                 if (type != ZIO_TYPE_WRITE ||
 558                     zio->io_child_type == ZIO_CHILD_DDT)
 559                         zio->io_bp = &zio->io_bp_copy;  /* so caller can free */
 560                 if (zio->io_child_type == ZIO_CHILD_LOGICAL)
 561                         zio->io_logical = zio;
 562                 if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
 563                         pipeline |= ZIO_GANG_STAGES;
 564         }
 565
 566         zio->io_spa = spa;
 567         zio->io_txg = txg;
 568         zio->io_done = done;
 569         zio->io_private = private;
 570         zio->io_type = type;
 571         zio->io_priority = priority;
 572         zio->io_vd = vd;
 573         zio->io_offset = offset;
 574         zio->io_orig_data = zio->io_data = data;
 575         zio->io_orig_size = zio->io_size = size;
 576         zio->io_orig_flags = zio->io_flags = flags;
 577         zio->io_orig_stage = zio->io_stage = stage;
 578         zio->io_orig_pipeline = zio->io_pipeline = pipeline;
 579
 580         zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
 581         zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);
 582
 583         if (zb != NULL)
 584                 zio->io_bookmark = *zb;
 585
 586         if (pio != NULL) {
 587                 if (zio->io_logical == NULL)
 588                         zio->io_logical = pio->io_logical;
 589                 if (zio->io_child_type == ZIO_CHILD_GANG)
 590                         zio->io_gang_leader = pio->io_gang_leader;
 591                 zio_add_child(pio, zio);
 592         }
 593
 594         taskq_init_ent(&zio->io_tqent);
 595
 596         return (zio);
 597 }
 598
 599 static void
 600 zio_destroy(zio_t *zio)
 601 {
 602         list_destroy(&zio->io_parent_list);
 603         list_destroy(&zio->io_child_list);
 604         mutex_destroy(&zio->io_lock);
 605         cv_destroy(&zio->io_cv);
 606         kmem_cache_free(zio_cache, zio);
 607 }
 608
 609 zio_t *
 610 zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
 611     void *private, enum zio_flag flags)
 612 {
 613         zio_t *zio;
 614
 615         zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
 616             ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
 617             ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);
 618
 619         return (zio);
 620 }
 621
 622 zio_t *
 623 zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
 624 {
 625         return (zio_null(NULL, spa, NULL, done, private, flags));
 626 }
 627
 628 void
 629 zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp)
 630 {
 631         int i;
 632
 633         if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) {
 634                 zfs_panic_recover("blkptr at %p has invalid TYPE %llu",
 635                     bp, (longlong_t)BP_GET_TYPE(bp));
 636         }
 637         if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS ||
 638             BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) {
 639                 zfs_panic_recover("blkptr at %p has invalid CHECKSUM %llu",
 640                     bp, (longlong_t)BP_GET_CHECKSUM(bp));
 641         }
 642         if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS ||
 643             BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) {
 644                 zfs_panic_recover("blkptr at %p has invalid COMPRESS %llu",
 645                     bp, (longlong_t)BP_GET_COMPRESS(bp));
 646         }
 647         if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) {
 648                 zfs_panic_recover("blkptr at %p has invalid LSIZE %llu",
 649                     bp, (longlong_t)BP_GET_LSIZE(bp));
 650         }
 651         if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) {
 652                 zfs_panic_recover("blkptr at %p has invalid PSIZE %llu",
 653                     bp, (longlong_t)BP_GET_PSIZE(bp));
 654         }
 655
 656         if (BP_IS_EMBEDDED(bp)) {
 657                 if (BPE_GET_ETYPE(bp) > NUM_BP_EMBEDDED_TYPES) {
 658                         zfs_panic_recover("blkptr at %p has invalid ETYPE %llu",
 659                             bp, (longlong_t)BPE_GET_ETYPE(bp));
 660                 }
 661         }
 662
 663         /*
 664          * Pool-specific checks.
 665          *
 666          * Note: it would be nice to verify that the blk_birth and
 667          * BP_PHYSICAL_BIRTH() are not too large.  However, spa_freeze()
 668          * allows the birth time of log blocks (and dmu_sync()-ed blocks
 669          * that are in the log) to be arbitrarily large.
 670          */
 671         for (i = 0; i < BP_GET_NDVAS(bp); i++) {
 672                 uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]);
 673                 vdev_t *vd;
 674                 uint64_t offset, asize;
 675                 if (vdevid >= spa->spa_root_vdev->vdev_children) {
 676                         zfs_panic_recover("blkptr at %p DVA %u has invalid "
 677                             "VDEV %llu",
 678                             bp, i, (longlong_t)vdevid);
 679                 }
 680                 vd = spa->spa_root_vdev->vdev_child[vdevid];
 681                 if (vd == NULL) {
 682                         zfs_panic_recover("blkptr at %p DVA %u has invalid "
 683                             "VDEV %llu",
 684                             bp, i, (longlong_t)vdevid);
 685                 }
 686                 if (vd->vdev_ops == &vdev_hole_ops) {
 687                         zfs_panic_recover("blkptr at %p DVA %u has hole "
 688                             "VDEV %llu",
 689                             bp, i, (longlong_t)vdevid);
 690
 691                 }
 692                 if (vd->vdev_ops == &vdev_missing_ops) {
 693                         /*
 694                          * "missing" vdevs are valid during import, but we
 695                          * don't have their detailed info (e.g. asize), so
 696                          * we can't perform any more checks on them.
 697                          */
 698                         continue;
 699                 }
 700                 offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
 701                 asize = DVA_GET_ASIZE(&bp->blk_dva[i]);
 702                 if (BP_IS_GANG(bp))
 703                         asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
 704                 if (offset + asize > vd->vdev_asize) {
 705                         zfs_panic_recover("blkptr at %p DVA %u has invalid "
 706                             "OFFSET %llu",
 707                             bp, i, (longlong_t)offset);
 708                 }
 709         }
 710 }
 711
 712 zio_t *
 713 zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
 714     void *data, uint64_t size, zio_done_func_t *done, void *private,
 715     zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
 716 {
 717         zio_t *zio;
 718
 719         zfs_blkptr_verify(spa, bp);
 720
 721         zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
 722             data, size, done, private,
 723             ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
 724             ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
 725             ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);
 726
 727         return (zio);
 728 }
 729
 730 zio_t *
 731 zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
 732     void *data, uint64_t size, const zio_prop_t *zp,
 733     zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done,
 734     void *private,
 735     zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
 736 {
 737         zio_t *zio;
 738
 739         ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
 740             zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
 741             zp->zp_compress >= ZIO_COMPRESS_OFF &&
 742             zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
 743             DMU_OT_IS_VALID(zp->zp_type) &&
 744             zp->zp_level < 32 &&
 745             zp->zp_copies > 0 &&
 746             zp->zp_copies <= spa_max_replication(spa));
 747
 748         zio = zio_create(pio, spa, txg, bp, data, size, done, private,
 749             ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
 750             ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
 751             ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
 752
 753         zio->io_ready = ready;
 754         zio->io_physdone = physdone;
 755         zio->io_prop = *zp;
 756
 757         /*
 758          * Data can be NULL if we are going to call zio_write_override() to
 759          * provide the already-allocated BP.  But we may need the data to
 760          * verify a dedup hit (if requested).  In this case, don't try to
 761          * dedup (just take the already-allocated BP verbatim).
 762          */
 763         if (data == NULL && zio->io_prop.zp_dedup_verify) {
 764                 zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
 765         }
 766
 767         return (zio);
 768 }
 769
 770 zio_t *
 771 zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
 772     uint64_t size, zio_done_func_t *done, void *private,
 773     zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb)
 774 {
 775         zio_t *zio;
 776
 777         zio = zio_create(pio, spa, txg, bp, data, size, done, private,
 778             ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
 779             ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
 780
 781         return (zio);
 782 }
 783
 784 void
 785 zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
 786 {
 787         ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 788         ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 789         ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
 790         ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
 791
 792         /*
 793          * We must reset the io_prop to match the values that existed
 794          * when the bp was first written by dmu_sync() keeping in mind
 795          * that nopwrite and dedup are mutually exclusive.
 796          */
 797         zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
 798         zio->io_prop.zp_nopwrite = nopwrite;
 799         zio->io_prop.zp_copies = copies;
 800         zio->io_bp_override = bp;
 801 }
 802
 803 void
 804 zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
 805 {
 806
 807         /*
 808          * The check for EMBEDDED is a performance optimization.  We
 809          * process the free here (by ignoring it) rather than
 810          * putting it on the list and then processing it in zio_free_sync().
 811          */
 812         if (BP_IS_EMBEDDED(bp))
 813                 return;
 814         metaslab_check_free(spa, bp);
 815
 816         /*
 817          * Frees that are for the currently-syncing txg, are not going to be
 818          * deferred, and which will not need to do a read (i.e. not GANG or
 819          * DEDUP), can be processed immediately.  Otherwise, put them on the
 820          * in-memory list for later processing.
 821          */
 822         if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
 823             txg != spa->spa_syncing_txg ||
 824             spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) {
 825                 bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
 826         } else {
 827                 VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp, 0)));
 828         }
 829 }
 830
 831 zio_t *
 832 zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 833     enum zio_flag flags)
 834 {
 835         zio_t *zio;
 836         enum zio_stage stage = ZIO_FREE_PIPELINE;
 837
 838         ASSERT(!BP_IS_HOLE(bp));
 839         ASSERT(spa_syncing_txg(spa) == txg);
 840         ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);
 841
 842         if (BP_IS_EMBEDDED(bp))
 843                 return (zio_null(pio, spa, NULL, NULL, NULL, 0));
 844
 845         metaslab_check_free(spa, bp);
 846         arc_freed(spa, bp);
 847
 848         /*
 849          * GANG and DEDUP blocks can induce a read (for the gang block header,
 850          * or the DDT), so issue them asynchronously so that this thread is
 851          * not tied up.
 852          */
 853         if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
 854                 stage |= ZIO_STAGE_ISSUE_ASYNC;
 855
 856         zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
 857             NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags,
 858             NULL, 0, NULL, ZIO_STAGE_OPEN, stage);
 859
 860         return (zio);
 861 }
 862
 863 zio_t *
 864 zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 865     zio_done_func_t *done, void *private, enum zio_flag flags)
 866 {
 867         zio_t *zio;
 868
 869         dprintf_bp(bp, "claiming in txg %llu", txg);
 870
 871         if (BP_IS_EMBEDDED(bp))
 872                 return (zio_null(pio, spa, NULL, NULL, NULL, 0));
 873
 874         /*
 875          * A claim is an allocation of a specific block.  Claims are needed
 876          * to support immediate writes in the intent log.  The issue is that
 877          * immediate writes contain committed data, but in a txg that was
 878          * *not* committed.  Upon opening the pool after an unclean shutdown,
 879          * the intent log claims all blocks that contain immediate write data
 880          * so that the SPA knows they're in use.
 881          *
 882          * All claims *must* be resolved in the first txg -- before the SPA
 883          * starts allocating blocks -- so that nothing is allocated twice.
 884          * If txg == 0 we just verify that the block is claimable.
 885          */
 886         ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
 887         ASSERT(txg == spa_first_txg(spa) || txg == 0);
 888         ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));       /* zdb(1M) */
 889
 890         zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
 891             done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
 892             NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
 893
 894         return (zio);
 895 }
 896
 897 zio_t *
 898 zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
 899     zio_done_func_t *done, void *private, enum zio_flag flags)
 900 {
 901         zio_t *zio;
 902         int c;
 903
 904         if (vd->vdev_children == 0) {
 905                 zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
 906                     ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
 907                     ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
 908
 909                 zio->io_cmd = cmd;
 910         } else {
 911                 zio = zio_null(pio, spa, NULL, NULL, NULL, flags);
 912
 913                 for (c = 0; c < vd->vdev_children; c++)
 914                         zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
 915                             done, private, flags));
 916         }
 917
 918         return (zio);
 919 }
 920
 921 zio_t *
 922 zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
 923     void *data, int checksum, zio_done_func_t *done, void *private,
 924     zio_priority_t priority, enum zio_flag flags, boolean_t labels)
 925 {
 926         zio_t *zio;
 927
 928         ASSERT(vd->vdev_children == 0);
 929         ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
 930             offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
 931         ASSERT3U(offset + size, <=, vd->vdev_psize);
 932
 933         zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
 934             ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
 935             NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
 936
 937         zio->io_prop.zp_checksum = checksum;
 938
 939         return (zio);
 940 }
 941
 942 zio_t *
 943 zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
 944     void *data, int checksum, zio_done_func_t *done, void *private,
 945     zio_priority_t priority, enum zio_flag flags, boolean_t labels)
 946 {
 947         zio_t *zio;
 948
 949         ASSERT(vd->vdev_children == 0);
 950         ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
 951             offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
 952         ASSERT3U(offset + size, <=, vd->vdev_psize);
 953
 954         zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
 955             ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
 956             NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
 957
 958         zio->io_prop.zp_checksum = checksum;
 959
 960         if (zio_checksum_table[checksum].ci_eck) {
 961                 /*
 962                  * zec checksums are necessarily destructive -- they modify
 963                  * the end of the write buffer to hold the verifier/checksum.
 964                  * Therefore, we must make a local copy in case the data is
 965                  * being written to multiple places in parallel.
 966                  */
 967                 void *wbuf = zio_buf_alloc(size);
 968                 bcopy(data, wbuf, size);
 969                 zio_push_transform(zio, wbuf, size, size, NULL);
 970         }
 971
 972         return (zio);
 973 }
 974
 975 /*
 976  * Create a child I/O to do some work for us.
 977  */
 978 zio_t *
 979 zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
 980         void *data, uint64_t size, int type, zio_priority_t priority,
 981         enum zio_flag flags, zio_done_func_t *done, void *private)
 982 {
 983         enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
 984         zio_t *zio;
 985
 986         ASSERT(vd->vdev_parent ==
 987             (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));
 988
 989         if (type == ZIO_TYPE_READ && bp != NULL) {
 990                 /*
 991                  * If we have the bp, then the child should perform the
 992                  * checksum and the parent need not.  This pushes error
 993                  * detection as close to the leaves as possible and
 994                  * eliminates redundant checksums in the interior nodes.
 995                  */
 996                 pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
 997                 pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
 998         }
 999
1000         if (vd->vdev_children == 0)
1001                 offset += VDEV_LABEL_START_SIZE;
1002
1003         flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;
1004
1005         /*
1006          * If we've decided to do a repair, the write is not speculative --
1007          * even if the original read was.
1008          */
1009         if (flags & ZIO_FLAG_IO_REPAIR)
1010                 flags &= ~ZIO_FLAG_SPECULATIVE;
1011
1012         zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
1013             done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
1014             ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
1015
1016         zio->io_physdone = pio->io_physdone;
1017         if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
1018                 zio->io_logical->io_phys_children++;
1019
1020         return (zio);
1021 }
1022
1023 zio_t *
1024 zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
1025         int type, zio_priority_t priority, enum zio_flag flags,
1026         zio_done_func_t *done, void *private)
1027 {
1028         zio_t *zio;
1029
1030         ASSERT(vd->vdev_ops->vdev_op_leaf);
1031
1032         zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
1033             data, size, done, private, type, priority,
1034             flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
1035             vd, offset, NULL,
1036             ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
1037
1038         return (zio);
1039 }
1040
1041 void
1042 zio_flush(zio_t *zio, vdev_t *vd)
1043 {
1044         zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
1045             NULL, NULL,
1046             ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
1047 }
1048
1049 void
1050 zio_shrink(zio_t *zio, uint64_t size)
1051 {
1052         ASSERT(zio->io_executor == NULL);
1053         ASSERT(zio->io_orig_size == zio->io_size);
1054         ASSERT(size <= zio->io_size);
1055
1056         /*
1057          * We don't shrink for raidz because of problems with the
1058          * reconstruction when reading back less than the block size.
1059          * Note, BP_IS_RAIDZ() assumes no compression.
1060          */
1061         ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
1062         if (!BP_IS_RAIDZ(zio->io_bp))
1063                 zio->io_orig_size = zio->io_size = size;
1064 }
1065
1066 /*
1067  * ==========================================================================
1068  * Prepare to read and write logical blocks
1069  * ==========================================================================
1070  */
1071
1072 static int
1073 zio_read_bp_init(zio_t *zio)
1074 {
1075         blkptr_t *bp = zio->io_bp;
1076
1077         if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
1078             zio->io_child_type == ZIO_CHILD_LOGICAL &&
1079             !(zio->io_flags & ZIO_FLAG_RAW)) {
1080                 uint64_t psize =
1081                     BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
1082                 void *cbuf = zio_buf_alloc(psize);
1083
1084                 zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
1085         }
1086
1087         if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
1088                 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1089                 decode_embedded_bp_compressed(bp, zio->io_data);
1090         } else {
1091                 ASSERT(!BP_IS_EMBEDDED(bp));
1092         }
1093
1094         if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
1095                 zio->io_flags |= ZIO_FLAG_DONT_CACHE;
1096
1097         if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
1098                 zio->io_flags |= ZIO_FLAG_DONT_CACHE;
1099
1100         if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
1101                 zio->io_pipeline = ZIO_DDT_READ_PIPELINE;
1102
1103         return (ZIO_PIPELINE_CONTINUE);
1104 }
1105
1106 static int
1107 zio_write_bp_init(zio_t *zio)
1108 {
1109         spa_t *spa = zio->io_spa;
1110         zio_prop_t *zp = &zio->io_prop;
1111         enum zio_compress compress = zp->zp_compress;
1112         blkptr_t *bp = zio->io_bp;
1113         uint64_t lsize = zio->io_size;
1114         uint64_t psize = lsize;
1115         int pass = 1;
1116
1117         /*
1118          * If our children haven't all reached the ready stage,
1119          * wait for them and then repeat this pipeline stage.
1120          */
1121         if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
1122             zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
1123                 return (ZIO_PIPELINE_STOP);
1124
1125         if (!IO_IS_ALLOCATING(zio))
1126                 return (ZIO_PIPELINE_CONTINUE);
1127
1128         ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
1129
1130         if (zio->io_bp_override) {
1131                 ASSERT(bp->blk_birth != zio->io_txg);
1132                 ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
1133
1134                 *bp = *zio->io_bp_override;
1135                 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1136
1137                 if (BP_IS_EMBEDDED(bp))
1138                         return (ZIO_PIPELINE_CONTINUE);
1139
1140                 /*
1141                  * If we've been overridden and nopwrite is set then
1142                  * set the flag accordingly to indicate that a nopwrite
1143                  * has already occurred.
1144                  */
1145                 if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
1146                         ASSERT(!zp->zp_dedup);
1147                         zio->io_flags |= ZIO_FLAG_NOPWRITE;
1148                         return (ZIO_PIPELINE_CONTINUE);
1149                 }
1150
1151                 ASSERT(!zp->zp_nopwrite);
1152
1153                 if (BP_IS_HOLE(bp) || !zp->zp_dedup)
1154                         return (ZIO_PIPELINE_CONTINUE);
1155
1156                 ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup ||
1157                     zp->zp_dedup_verify);
1158
1159                 if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
1160                         BP_SET_DEDUP(bp, 1);
1161                         zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
1162                         return (ZIO_PIPELINE_CONTINUE);
1163                 }
1164         }
1165
1166         if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
1167                 /*
1168                  * We're rewriting an existing block, which means we're
1169                  * working on behalf of spa_sync().  For spa_sync() to
1170                  * converge, it must eventually be the case that we don't
1171                  * have to allocate new blocks.  But compression changes
1172                  * the blocksize, which forces a reallocate, and makes
1173                  * convergence take longer.  Therefore, after the first
1174                  * few passes, stop compressing to ensure convergence.
1175                  */
1176                 pass = spa_sync_pass(spa);
1177
1178                 ASSERT(zio->io_txg == spa_syncing_txg(spa));
1179                 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1180                 ASSERT(!BP_GET_DEDUP(bp));
1181
1182                 if (pass >= zfs_sync_pass_dont_compress)
1183                         compress = ZIO_COMPRESS_OFF;
1184
1185                 /* Make sure someone doesn't change their mind on overwrites */
1186                 ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp),
1187                     spa_max_replication(spa)) == BP_GET_NDVAS(bp));
1188         }
1189
1190         if (compress != ZIO_COMPRESS_OFF) {
1191                 void *cbuf = zio_buf_alloc(lsize);
1192                 psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
1193                 if (psize == 0 || psize == lsize) {
1194                         compress = ZIO_COMPRESS_OFF;
1195                         zio_buf_free(cbuf, lsize);
1196                 } else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE &&
1197                     zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
1198                     spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
1199                         encode_embedded_bp_compressed(bp,
1200                             cbuf, compress, lsize, psize);
1201                         BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
1202                         BP_SET_TYPE(bp, zio->io_prop.zp_type);
1203                         BP_SET_LEVEL(bp, zio->io_prop.zp_level);
1204                         zio_buf_free(cbuf, lsize);
1205                         bp->blk_birth = zio->io_txg;
1206                         zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1207                         ASSERT(spa_feature_is_active(spa,
1208                             SPA_FEATURE_EMBEDDED_DATA));
1209                         return (ZIO_PIPELINE_CONTINUE);
1210                 } else {
1211                         /*
1212                          * Round up compressed size up to the ashift
1213                          * of the smallest-ashift device, and zero the tail.
1214                          * This ensures that the compressed size of the BP
1215                          * (and thus compressratio property) are correct,
1216                          * in that we charge for the padding used to fill out
1217                          * the last sector.
1218                          */
1219                         size_t rounded;
1220
1221                         ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
1222
1223                         rounded = (size_t)P2ROUNDUP(psize,
1224                             1ULL << spa->spa_min_ashift);
1225                         if (rounded >= lsize) {
1226                                 compress = ZIO_COMPRESS_OFF;
1227                                 zio_buf_free(cbuf, lsize);
1228                                 psize = lsize;
1229                         } else {
1230                                 bzero((char *)cbuf + psize, rounded - psize);
1231                                 psize = rounded;
1232                                 zio_push_transform(zio, cbuf,
1233                                     psize, lsize, NULL);
1234                         }
1235                 }
1236         }
1237
1238         /*
1239          * The final pass of spa_sync() must be all rewrites, but the first
1240          * few passes offer a trade-off: allocating blocks defers convergence,
1241          * but newly allocated blocks are sequential, so they can be written
1242          * to disk faster.  Therefore, we allow the first few passes of
1243          * spa_sync() to allocate new blocks, but force rewrites after that.
1244          * There should only be a handful of blocks after pass 1 in any case.
1245          */
1246         if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
1247             BP_GET_PSIZE(bp) == psize &&
1248             pass >= zfs_sync_pass_rewrite) {
1249                 enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
1250                 ASSERT(psize != 0);
1251                 zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
1252                 zio->io_flags |= ZIO_FLAG_IO_REWRITE;
1253         } else {
1254                 BP_ZERO(bp);
1255                 zio->io_pipeline = ZIO_WRITE_PIPELINE;
1256         }
1257
1258         if (psize == 0) {
1259                 if (zio->io_bp_orig.blk_birth != 0 &&
1260                     spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
1261                         BP_SET_LSIZE(bp, lsize);
1262                         BP_SET_TYPE(bp, zp->zp_type);
1263                         BP_SET_LEVEL(bp, zp->zp_level);
1264                         BP_SET_BIRTH(bp, zio->io_txg, 0);
1265                 }
1266                 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1267         } else {
1268                 ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
1269                 BP_SET_LSIZE(bp, lsize);
1270                 BP_SET_TYPE(bp, zp->zp_type);
1271                 BP_SET_LEVEL(bp, zp->zp_level);
1272                 BP_SET_PSIZE(bp, psize);
1273                 BP_SET_COMPRESS(bp, compress);
1274                 BP_SET_CHECKSUM(bp, zp->zp_checksum);
1275                 BP_SET_DEDUP(bp, zp->zp_dedup);
1276                 BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
1277                 if (zp->zp_dedup) {
1278                         ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1279                         ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
1280                         zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
1281                 }
1282                 if (zp->zp_nopwrite) {
1283                         ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1284                         ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
1285                         zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
1286                 }
1287         }
1288
1289         return (ZIO_PIPELINE_CONTINUE);
1290 }
1291
1292 static int
1293 zio_free_bp_init(zio_t *zio)
1294 {
1295         blkptr_t *bp = zio->io_bp;
1296
1297         if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
1298                 if (BP_GET_DEDUP(bp))
1299                         zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
1300         }
1301
1302         return (ZIO_PIPELINE_CONTINUE);
1303 }
1304
1305 /*
1306  * ==========================================================================
1307  * Execute the I/O pipeline
1308  * ==========================================================================
1309  */
1310
1311 static void
1312 zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
1313 {
1314         spa_t *spa = zio->io_spa;
1315         zio_type_t t = zio->io_type;
1316         int flags = (cutinline ? TQ_FRONT : 0);
1317
1318         /*
1319          * If we're a config writer or a probe, the normal issue and
1320          * interrupt threads may all be blocked waiting for the config lock.
1321          * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
1322          */
1323         if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
1324                 t = ZIO_TYPE_NULL;
1325
1326         /*
1327          * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
1328          */
1329         if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
1330                 t = ZIO_TYPE_NULL;
1331
1332         /*
1333          * If this is a high priority I/O, then use the high priority taskq if
1334          * available.
1335          */
1336         if (zio->io_priority == ZIO_PRIORITY_NOW &&
1337             spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
1338                 q++;
1339
1340         ASSERT3U(q, <, ZIO_TASKQ_TYPES);
1341
1342         /*
1343          * NB: We are assuming that the zio can only be dispatched
1344          * to a single taskq at a time.  It would be a grievous error
1345          * to dispatch the zio to another taskq at the same time.
1346          */
1347         ASSERT(taskq_empty_ent(&zio->io_tqent));
1348         spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
1349             flags, &zio->io_tqent);
1350 }
1351
1352 static boolean_t
1353 zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
1354 {
1355         kthread_t *executor = zio->io_executor;
1356         spa_t *spa = zio->io_spa;
1357         zio_type_t t;
1358
1359         for (t = 0; t < ZIO_TYPES; t++) {
1360                 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
1361                 uint_t i;
1362                 for (i = 0; i < tqs->stqs_count; i++) {
1363                         if (taskq_member(tqs->stqs_taskq[i], executor))
1364                                 return (B_TRUE);
1365                 }
1366         }
1367
1368         return (B_FALSE);
1369 }
1370
1371 static int
1372 zio_issue_async(zio_t *zio)
1373 {
1374         zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
1375
1376         return (ZIO_PIPELINE_STOP);
1377 }
1378
1379 void
1380 zio_interrupt(zio_t *zio)
1381 {
1382         zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
1383 }
1384
1385 /*
1386  * Execute the I/O pipeline until one of the following occurs:
1387  * (1) the I/O completes; (2) the pipeline stalls waiting for
1388  * dependent child I/Os; (3) the I/O issues, so we're waiting
1389  * for an I/O completion interrupt; (4) the I/O is delegated by
1390  * vdev-level caching or aggregation; (5) the I/O is deferred
1391  * due to vdev-level queueing; (6) the I/O is handed off to
1392  * another thread.  In all cases, the pipeline stops whenever
1393  * there's no CPU work; it never burns a thread in cv_wait_io().
1394  *
1395  * There's no locking on io_stage because there's no legitimate way
1396  * for multiple threads to be attempting to process the same I/O.
1397  */
1398 static zio_pipe_stage_t *zio_pipeline[];
1399
1400 /*
1401  * zio_execute() is a wrapper around the static function
1402  * __zio_execute() so that we can force  __zio_execute() to be
1403  * inlined.  This reduces stack overhead which is important
1404  * because __zio_execute() is called recursively in several zio
1405  * code paths.  zio_execute() itself cannot be inlined because
1406  * it is externally visible.
1407  */
1408 void
1409 zio_execute(zio_t *zio)
1410 {
1411         fstrans_cookie_t cookie;
1412
1413         cookie = spl_fstrans_mark();
1414         __zio_execute(zio);
1415         spl_fstrans_unmark(cookie);
1416 }
1417
1418 /*
1419  * Used to determine if in the current context the stack is sized large
1420  * enough to allow zio_execute() to be called recursively.  A minimum
1421  * stack size of 16K is required to avoid needing to re-dispatch the zio.
1422  */
1423 boolean_t
1424 zio_execute_stack_check(zio_t *zio)
1425 {
1426 #if !defined(HAVE_LARGE_STACKS)
1427         dsl_pool_t *dp = spa_get_dsl(zio->io_spa);
1428
1429         /* Executing in txg_sync_thread() context. */
1430         if (dp && curthread == dp->dp_tx.tx_sync_thread)
1431                 return (B_TRUE);
1432
1433         /* Pool initialization outside of zio_taskq context. */
1434         if (dp && spa_is_initializing(dp->dp_spa) &&
1435             !zio_taskq_member(zio, ZIO_TASKQ_ISSUE) &&
1436             !zio_taskq_member(zio, ZIO_TASKQ_ISSUE_HIGH))
1437                 return (B_TRUE);
1438 #endif /* HAVE_LARGE_STACKS */
1439
1440         return (B_FALSE);
1441 }
1442
1443 __attribute__((always_inline))
1444 static inline void
1445 __zio_execute(zio_t *zio)
1446 {
1447         zio->io_executor = curthread;
1448
1449         while (zio->io_stage < ZIO_STAGE_DONE) {
1450                 enum zio_stage pipeline = zio->io_pipeline;
1451                 enum zio_stage stage = zio->io_stage;
1452                 int rv;
1453
1454                 ASSERT(!MUTEX_HELD(&zio->io_lock));
1455                 ASSERT(ISP2(stage));
1456                 ASSERT(zio->io_stall == NULL);
1457
1458                 do {
1459                         stage <<= 1;
1460                 } while ((stage & pipeline) == 0);
1461
1462                 ASSERT(stage <= ZIO_STAGE_DONE);
1463
1464                 /*
1465                  * If we are in interrupt context and this pipeline stage
1466                  * will grab a config lock that is held across I/O,
1467                  * or may wait for an I/O that needs an interrupt thread
1468                  * to complete, issue async to avoid deadlock.
1469                  *
1470                  * For VDEV_IO_START, we cut in line so that the io will
1471                  * be sent to disk promptly.
1472                  */
1473                 if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
1474                     zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
1475                         boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
1476                             zio_requeue_io_start_cut_in_line : B_FALSE;
1477                         zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
1478                         return;
1479                 }
1480
1481                 /*
1482                  * If the current context doesn't have large enough stacks
1483                  * the zio must be issued asynchronously to prevent overflow.
1484                  */
1485                 if (zio_execute_stack_check(zio)) {
1486                         boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
1487                             zio_requeue_io_start_cut_in_line : B_FALSE;
1488                         zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
1489                         return;
1490                 }
1491
1492                 zio->io_stage = stage;
1493                 rv = zio_pipeline[highbit64(stage) - 1](zio);
1494
1495                 if (rv == ZIO_PIPELINE_STOP)
1496                         return;
1497
1498                 ASSERT(rv == ZIO_PIPELINE_CONTINUE);
1499         }
1500 }
1501
1502
1503 /*
1504  * ==========================================================================
1505  * Initiate I/O, either sync or async
1506  * ==========================================================================
1507  */
1508 int
1509 zio_wait(zio_t *zio)
1510 {
1511         int error;
1512
1513         ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
1514         ASSERT(zio->io_executor == NULL);
1515
1516         zio->io_waiter = curthread;
1517
1518         __zio_execute(zio);
1519
1520         mutex_enter(&zio->io_lock);
1521         while (zio->io_executor != NULL)
1522                 cv_wait_io(&zio->io_cv, &zio->io_lock);
1523         mutex_exit(&zio->io_lock);
1524
1525         error = zio->io_error;
1526         zio_destroy(zio);
1527
1528         return (error);
1529 }
1530
1531 void
1532 zio_nowait(zio_t *zio)
1533 {
1534         ASSERT(zio->io_executor == NULL);
1535
1536         if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
1537             zio_unique_parent(zio) == NULL) {
1538                 zio_t *pio;
1539
1540                 /*
1541                  * This is a logical async I/O with no parent to wait for it.
1542                  * We add it to the spa_async_root_zio "Godfather" I/O which
1543                  * will ensure they complete prior to unloading the pool.
1544                  */
1545                 spa_t *spa = zio->io_spa;
1546                 kpreempt_disable();
1547                 pio = spa->spa_async_zio_root[CPU_SEQID];
1548                 kpreempt_enable();
1549
1550                 zio_add_child(pio, zio);
1551         }
1552
1553         __zio_execute(zio);
1554 }
1555
1556 /*
1557  * ==========================================================================
1558  * Reexecute or suspend/resume failed I/O
1559  * ==========================================================================
1560  */
1561
1562 static void
1563 zio_reexecute(zio_t *pio)
1564 {
1565         zio_t *cio, *cio_next;
1566         int c, w;
1567
1568         ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
1569         ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
1570         ASSERT(pio->io_gang_leader == NULL);
1571         ASSERT(pio->io_gang_tree == NULL);
1572
1573         pio->io_flags = pio->io_orig_flags;
1574         pio->io_stage = pio->io_orig_stage;
1575         pio->io_pipeline = pio->io_orig_pipeline;
1576         pio->io_reexecute = 0;
1577         pio->io_flags |= ZIO_FLAG_REEXECUTED;
1578         pio->io_error = 0;
1579         for (w = 0; w < ZIO_WAIT_TYPES; w++)
1580                 pio->io_state[w] = 0;
1581         for (c = 0; c < ZIO_CHILD_TYPES; c++)
1582                 pio->io_child_error[c] = 0;
1583
1584         if (IO_IS_ALLOCATING(pio))
1585                 BP_ZERO(pio->io_bp);
1586
1587         /*
1588          * As we reexecute pio's children, new children could be created.
1589          * New children go to the head of pio's io_child_list, however,
1590          * so we will (correctly) not reexecute them.  The key is that
1591          * the remainder of pio's io_child_list, from 'cio_next' onward,
1592          * cannot be affected by any side effects of reexecuting 'cio'.
1593          */
1594         for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
1595                 cio_next = zio_walk_children(pio);
1596                 mutex_enter(&pio->io_lock);
1597                 for (w = 0; w < ZIO_WAIT_TYPES; w++)
1598                         pio->io_children[cio->io_child_type][w]++;
1599                 mutex_exit(&pio->io_lock);
1600                 zio_reexecute(cio);
1601         }
1602
1603         /*
1604          * Now that all children have been reexecuted, execute the parent.
1605          * We don't reexecute "The Godfather" I/O here as it's the
1606          * responsibility of the caller to wait on him.
1607          */
1608         if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
1609                 __zio_execute(pio);
1610 }
1611
1612 void
1613 zio_suspend(spa_t *spa, zio_t *zio)
1614 {
1615         if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
1616                 fm_panic("Pool '%s' has encountered an uncorrectable I/O "
1617                     "failure and the failure mode property for this pool "
1618                     "is set to panic.", spa_name(spa));
1619
1620         cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O "
1621             "failure and has been suspended.\n", spa_name(spa));
1622
1623         zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);
1624
1625         mutex_enter(&spa->spa_suspend_lock);
1626
1627         if (spa->spa_suspend_zio_root == NULL)
1628                 spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
1629                     ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
1630                     ZIO_FLAG_GODFATHER);
1631
1632         spa->spa_suspended = B_TRUE;
1633
1634         if (zio != NULL) {
1635                 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
1636                 ASSERT(zio != spa->spa_suspend_zio_root);
1637                 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1638                 ASSERT(zio_unique_parent(zio) == NULL);
1639                 ASSERT(zio->io_stage == ZIO_STAGE_DONE);
1640                 zio_add_child(spa->spa_suspend_zio_root, zio);
1641         }
1642
1643         mutex_exit(&spa->spa_suspend_lock);
1644 }
1645
1646 int
1647 zio_resume(spa_t *spa)
1648 {
1649         zio_t *pio;
1650
1651         /*
1652          * Reexecute all previously suspended i/o.
1653          */
1654         mutex_enter(&spa->spa_suspend_lock);
1655         spa->spa_suspended = B_FALSE;
1656         cv_broadcast(&spa->spa_suspend_cv);
1657         pio = spa->spa_suspend_zio_root;
1658         spa->spa_suspend_zio_root = NULL;
1659         mutex_exit(&spa->spa_suspend_lock);
1660
1661         if (pio == NULL)
1662                 return (0);
1663
1664         zio_reexecute(pio);
1665         return (zio_wait(pio));
1666 }
1667
1668 void
1669 zio_resume_wait(spa_t *spa)
1670 {
1671         mutex_enter(&spa->spa_suspend_lock);
1672         while (spa_suspended(spa))
1673                 cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
1674         mutex_exit(&spa->spa_suspend_lock);
1675 }
1676
1677 /*
1678  * ==========================================================================
1679  * Gang blocks.
1680  *
1681  * A gang block is a collection of small blocks that looks to the DMU
1682  * like one large block.  When zio_dva_allocate() cannot find a block
1683  * of the requested size, due to either severe fragmentation or the pool
1684  * being nearly full, it calls zio_write_gang_block() to construct the
1685  * block from smaller fragments.
1686  *
1687  * A gang block consists of a gang header (zio_gbh_phys_t) and up to
1688  * three (SPA_GBH_NBLKPTRS) gang members.  The gang header is just like
1689  * an indirect block: it's an array of block pointers.  It consumes
1690  * only one sector and hence is allocatable regardless of fragmentation.
1691  * The gang header's bps point to its gang members, which hold the data.
1692  *
1693  * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
1694  * as the verifier to ensure uniqueness of the SHA256 checksum.
1695  * Critically, the gang block bp's blk_cksum is the checksum of the data,
1696  * not the gang header.  This ensures that data block signatures (needed for
1697  * deduplication) are independent of how the block is physically stored.
1698  *
1699  * Gang blocks can be nested: a gang member may itself be a gang block.
1700  * Thus every gang block is a tree in which root and all interior nodes are
1701  * gang headers, and the leaves are normal blocks that contain user data.
1702  * The root of the gang tree is called the gang leader.
1703  *
1704  * To perform any operation (read, rewrite, free, claim) on a gang block,
1705  * zio_gang_assemble() first assembles the gang tree (minus data leaves)
1706  * in the io_gang_tree field of the original logical i/o by recursively
1707  * reading the gang leader and all gang headers below it.  This yields
1708  * an in-core tree containing the contents of every gang header and the
1709  * bps for every constituent of the gang block.
1710  *
1711  * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
1712  * and invokes a callback on each bp.  To free a gang block, zio_gang_issue()
1713  * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
1714  * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
1715  * zio_read_gang() is a wrapper around zio_read() that omits reading gang
1716  * headers, since we already have those in io_gang_tree.  zio_rewrite_gang()
1717  * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
1718  * of the gang header plus zio_checksum_compute() of the data to update the
1719  * gang header's blk_cksum as described above.
1720  *
1721  * The two-phase assemble/issue model solves the problem of partial failure --
1722  * what if you'd freed part of a gang block but then couldn't read the
1723  * gang header for another part?  Assembling the entire gang tree first
1724  * ensures that all the necessary gang header I/O has succeeded before
1725  * starting the actual work of free, claim, or write.  Once the gang tree
1726  * is assembled, free and claim are in-memory operations that cannot fail.
1727  *
1728  * In the event that a gang write fails, zio_dva_unallocate() walks the
1729  * gang tree to immediately free (i.e. insert back into the space map)
1730  * everything we've allocated.  This ensures that we don't get ENOSPC
1731  * errors during repeated suspend/resume cycles due to a flaky device.
1732  *
1733  * Gang rewrites only happen during sync-to-convergence.  If we can't assemble
1734  * the gang tree, we won't modify the block, so we can safely defer the free
1735  * (knowing that the block is still intact).  If we *can* assemble the gang
1736  * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
1737  * each constituent bp and we can allocate a new block on the next sync pass.
1738  *
1739  * In all cases, the gang tree allows complete recovery from partial failure.
1740  * ==========================================================================
1741  */
1742
1743 static zio_t *
1744 zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1745 {
1746         if (gn != NULL)
1747                 return (pio);
1748
1749         return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
1750             NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
1751             &pio->io_bookmark));
1752 }
1753
1754 zio_t *
1755 zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1756 {
1757         zio_t *zio;
1758
1759         if (gn != NULL) {
1760                 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
1761                     gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
1762                     ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1763                 /*
1764                  * As we rewrite each gang header, the pipeline will compute
1765                  * a new gang block header checksum for it; but no one will
1766                  * compute a new data checksum, so we do that here.  The one
1767                  * exception is the gang leader: the pipeline already computed
1768                  * its data checksum because that stage precedes gang assembly.
1769                  * (Presently, nothing actually uses interior data checksums;
1770                  * this is just good hygiene.)
1771                  */
1772                 if (gn != pio->io_gang_leader->io_gang_tree) {
1773                         zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
1774                             data, BP_GET_PSIZE(bp));
1775                 }
1776                 /*
1777                  * If we are here to damage data for testing purposes,
1778                  * leave the GBH alone so that we can detect the damage.
1779                  */
1780                 if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
1781                         zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
1782         } else {
1783                 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
1784                     data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
1785                     ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1786         }
1787
1788         return (zio);
1789 }
1790
1791 /* ARGSUSED */
1792 zio_t *
1793 zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1794 {
1795         return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
1796             ZIO_GANG_CHILD_FLAGS(pio)));
1797 }
1798
1799 /* ARGSUSED */
1800 zio_t *
1801 zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1802 {
1803         return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
1804             NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
1805 }
1806
1807 static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
1808         NULL,
1809         zio_read_gang,
1810         zio_rewrite_gang,
1811         zio_free_gang,
1812         zio_claim_gang,
1813         NULL
1814 };
1815
1816 static void zio_gang_tree_assemble_done(zio_t *zio);
1817
1818 static zio_gang_node_t *
1819 zio_gang_node_alloc(zio_gang_node_t **gnpp)
1820 {
1821         zio_gang_node_t *gn;
1822
1823         ASSERT(*gnpp == NULL);
1824
1825         gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
1826         gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
1827         *gnpp = gn;
1828
1829         return (gn);
1830 }
1831
1832 static void
1833 zio_gang_node_free(zio_gang_node_t **gnpp)
1834 {
1835         zio_gang_node_t *gn = *gnpp;
1836         int g;
1837
1838         for (g = 0; g < SPA_GBH_NBLKPTRS; g++)
1839                 ASSERT(gn->gn_child[g] == NULL);
1840
1841         zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
1842         kmem_free(gn, sizeof (*gn));
1843         *gnpp = NULL;
1844 }
1845
1846 static void
1847 zio_gang_tree_free(zio_gang_node_t **gnpp)
1848 {
1849         zio_gang_node_t *gn = *gnpp;
1850         int g;
1851
1852         if (gn == NULL)
1853                 return;
1854
1855         for (g = 0; g < SPA_GBH_NBLKPTRS; g++)
1856                 zio_gang_tree_free(&gn->gn_child[g]);
1857
1858         zio_gang_node_free(gnpp);
1859 }
1860
1861 static void
1862 zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
1863 {
1864         zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
1865
1866         ASSERT(gio->io_gang_leader == gio);
1867         ASSERT(BP_IS_GANG(bp));
1868
1869         zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh,
1870             SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
1871             gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
1872 }
1873
1874 static void
1875 zio_gang_tree_assemble_done(zio_t *zio)
1876 {
1877         zio_t *gio = zio->io_gang_leader;
1878         zio_gang_node_t *gn = zio->io_private;
1879         blkptr_t *bp = zio->io_bp;
1880         int g;
1881
1882         ASSERT(gio == zio_unique_parent(zio));
1883         ASSERT(zio->io_child_count == 0);
1884
1885         if (zio->io_error)
1886                 return;
1887
1888         if (BP_SHOULD_BYTESWAP(bp))
1889                 byteswap_uint64_array(zio->io_data, zio->io_size);
1890
1891         ASSERT(zio->io_data == gn->gn_gbh);
1892         ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
1893         ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
1894
1895         for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
1896                 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
1897                 if (!BP_IS_GANG(gbp))
1898                         continue;
1899                 zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
1900         }
1901 }
1902
1903 static void
1904 zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
1905 {
1906         zio_t *gio = pio->io_gang_leader;
1907         zio_t *zio;
1908         int g;
1909
1910         ASSERT(BP_IS_GANG(bp) == !!gn);
1911         ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
1912         ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);
1913
1914         /*
1915          * If you're a gang header, your data is in gn->gn_gbh.
1916          * If you're a gang member, your data is in 'data' and gn == NULL.
1917          */
1918         zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);
1919
1920         if (gn != NULL) {
1921                 ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
1922
1923                 for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
1924                         blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
1925                         if (BP_IS_HOLE(gbp))
1926                                 continue;
1927                         zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
1928                         data = (char *)data + BP_GET_PSIZE(gbp);
1929                 }
1930         }
1931
1932         if (gn == gio->io_gang_tree)
1933                 ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);
1934
1935         if (zio != pio)
1936                 zio_nowait(zio);
1937 }
1938
1939 static int
1940 zio_gang_assemble(zio_t *zio)
1941 {
1942         blkptr_t *bp = zio->io_bp;
1943
1944         ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
1945         ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
1946
1947         zio->io_gang_leader = zio;
1948
1949         zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);
1950
1951         return (ZIO_PIPELINE_CONTINUE);
1952 }
1953
1954 static int
1955 zio_gang_issue(zio_t *zio)
1956 {
1957         blkptr_t *bp = zio->io_bp;
1958
1959         if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
1960                 return (ZIO_PIPELINE_STOP);
1961
1962         ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
1963         ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
1964
1965         if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
1966                 zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data);
1967         else
1968                 zio_gang_tree_free(&zio->io_gang_tree);
1969
1970         zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1971
1972         return (ZIO_PIPELINE_CONTINUE);
1973 }
1974
1975 static void
1976 zio_write_gang_member_ready(zio_t *zio)
1977 {
1978         zio_t *pio = zio_unique_parent(zio);
1979         dva_t *cdva = zio->io_bp->blk_dva;
1980         dva_t *pdva = pio->io_bp->blk_dva;
1981         uint64_t asize;
1982         int d;
1983         ASSERTV(zio_t *gio = zio->io_gang_leader);
1984
1985         if (BP_IS_HOLE(zio->io_bp))
1986                 return;
1987
1988         ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
1989
1990         ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
1991         ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
1992         ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
1993         ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
1994         ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
1995
1996         mutex_enter(&pio->io_lock);
1997         for (d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
1998                 ASSERT(DVA_GET_GANG(&pdva[d]));
1999                 asize = DVA_GET_ASIZE(&pdva[d]);
2000                 asize += DVA_GET_ASIZE(&cdva[d]);
2001                 DVA_SET_ASIZE(&pdva[d], asize);
2002         }
2003         mutex_exit(&pio->io_lock);
2004 }
2005
2006 static int
2007 zio_write_gang_block(zio_t *pio)
2008 {
2009         spa_t *spa = pio->io_spa;
2010         blkptr_t *bp = pio->io_bp;
2011         zio_t *gio = pio->io_gang_leader;
2012         zio_t *zio;
2013         zio_gang_node_t *gn, **gnpp;
2014         zio_gbh_phys_t *gbh;
2015         uint64_t txg = pio->io_txg;
2016         uint64_t resid = pio->io_size;
2017         uint64_t lsize;
2018         int copies = gio->io_prop.zp_copies;
2019         int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
2020         zio_prop_t zp;
2021         int g, error;
2022
2023         error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
2024             bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
2025             METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
2026         if (error) {
2027                 pio->io_error = error;
2028                 return (ZIO_PIPELINE_CONTINUE);
2029         }
2030
2031         if (pio == gio) {
2032                 gnpp = &gio->io_gang_tree;
2033         } else {
2034                 gnpp = pio->io_private;
2035                 ASSERT(pio->io_ready == zio_write_gang_member_ready);
2036         }
2037
2038         gn = zio_gang_node_alloc(gnpp);
2039         gbh = gn->gn_gbh;
2040         bzero(gbh, SPA_GANGBLOCKSIZE);
2041
2042         /*
2043          * Create the gang header.
2044          */
2045         zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
2046             pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
2047
2048         /*
2049          * Create and nowait the gang children.
2050          */
2051         for (g = 0; resid != 0; resid -= lsize, g++) {
2052                 lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
2053                     SPA_MINBLOCKSIZE);
2054                 ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);
2055
2056                 zp.zp_checksum = gio->io_prop.zp_checksum;
2057                 zp.zp_compress = ZIO_COMPRESS_OFF;
2058                 zp.zp_type = DMU_OT_NONE;
2059                 zp.zp_level = 0;
2060                 zp.zp_copies = gio->io_prop.zp_copies;
2061                 zp.zp_dedup = B_FALSE;
2062                 zp.zp_dedup_verify = B_FALSE;
2063                 zp.zp_nopwrite = B_FALSE;
2064
2065                 zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
2066                     (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
2067                     zio_write_gang_member_ready, NULL, NULL, &gn->gn_child[g],
2068                     pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
2069                     &pio->io_bookmark));
2070         }
2071
2072         /*
2073          * Set pio's pipeline to just wait for zio to finish.
2074          */
2075         pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2076
2077         /*
2078          * We didn't allocate this bp, so make sure it doesn't get unmarked.
2079          */
2080         pio->io_flags &= ~ZIO_FLAG_FASTWRITE;
2081
2082         zio_nowait(zio);
2083
2084         return (ZIO_PIPELINE_CONTINUE);
2085 }
2086
2087 /*
2088  * The zio_nop_write stage in the pipeline determines if allocating
2089  * a new bp is necessary.  By leveraging a cryptographically secure checksum,
2090  * such as SHA256, we can compare the checksums of the new data and the old
2091  * to determine if allocating a new block is required.  The nopwrite
2092  * feature can handle writes in either syncing or open context (i.e. zil
2093  * writes) and as a result is mutually exclusive with dedup.
2094  */
2095 static int
2096 zio_nop_write(zio_t *zio)
2097 {
2098         blkptr_t *bp = zio->io_bp;
2099         blkptr_t *bp_orig = &zio->io_bp_orig;
2100         zio_prop_t *zp = &zio->io_prop;
2101
2102         ASSERT(BP_GET_LEVEL(bp) == 0);
2103         ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
2104         ASSERT(zp->zp_nopwrite);
2105         ASSERT(!zp->zp_dedup);
2106         ASSERT(zio->io_bp_override == NULL);
2107         ASSERT(IO_IS_ALLOCATING(zio));
2108
2109         /*
2110          * Check to see if the original bp and the new bp have matching
2111          * characteristics (i.e. same checksum, compression algorithms, etc).
2112          * If they don't then just continue with the pipeline which will
2113          * allocate a new bp.
2114          */
2115         if (BP_IS_HOLE(bp_orig) ||
2116             !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup ||
2117             BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
2118             BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
2119             BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
2120             zp->zp_copies != BP_GET_NDVAS(bp_orig))
2121                 return (ZIO_PIPELINE_CONTINUE);
2122
2123         /*
2124          * If the checksums match then reset the pipeline so that we
2125          * avoid allocating a new bp and issuing any I/O.
2126          */
2127         if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
2128                 ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup);
2129                 ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
2130                 ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
2131                 ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
2132                 ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop,
2133                     sizeof (uint64_t)) == 0);
2134
2135                 *bp = *bp_orig;
2136                 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2137                 zio->io_flags |= ZIO_FLAG_NOPWRITE;
2138         }
2139
2140         return (ZIO_PIPELINE_CONTINUE);
2141 }
2142
2143 /*
2144  * ==========================================================================
2145  * Dedup
2146  * ==========================================================================
2147  */
2148 static void
2149 zio_ddt_child_read_done(zio_t *zio)
2150 {
2151         blkptr_t *bp = zio->io_bp;
2152         ddt_entry_t *dde = zio->io_private;
2153         ddt_phys_t *ddp;
2154         zio_t *pio = zio_unique_parent(zio);
2155
2156         mutex_enter(&pio->io_lock);
2157         ddp = ddt_phys_select(dde, bp);
2158         if (zio->io_error == 0)
2159                 ddt_phys_clear(ddp);    /* this ddp doesn't need repair */
2160         if (zio->io_error == 0 && dde->dde_repair_data == NULL)
2161                 dde->dde_repair_data = zio->io_data;
2162         else
2163                 zio_buf_free(zio->io_data, zio->io_size);
2164         mutex_exit(&pio->io_lock);
2165 }
2166
2167 static int
2168 zio_ddt_read_start(zio_t *zio)
2169 {
2170         blkptr_t *bp = zio->io_bp;
2171         int p;
2172
2173         ASSERT(BP_GET_DEDUP(bp));
2174         ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
2175         ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2176
2177         if (zio->io_child_error[ZIO_CHILD_DDT]) {
2178                 ddt_t *ddt = ddt_select(zio->io_spa, bp);
2179                 ddt_entry_t *dde = ddt_repair_start(ddt, bp);
2180                 ddt_phys_t *ddp = dde->dde_phys;
2181                 ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
2182                 blkptr_t blk;
2183
2184                 ASSERT(zio->io_vsd == NULL);
2185                 zio->io_vsd = dde;
2186
2187                 if (ddp_self == NULL)
2188                         return (ZIO_PIPELINE_CONTINUE);
2189
2190                 for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
2191                         if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
2192                                 continue;
2193                         ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
2194                             &blk);
2195                         zio_nowait(zio_read(zio, zio->io_spa, &blk,
2196                             zio_buf_alloc(zio->io_size), zio->io_size,
2197                             zio_ddt_child_read_done, dde, zio->io_priority,
2198                             ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE,
2199                             &zio->io_bookmark));
2200                 }
2201                 return (ZIO_PIPELINE_CONTINUE);
2202         }
2203
2204         zio_nowait(zio_read(zio, zio->io_spa, bp,
2205             zio->io_data, zio->io_size, NULL, NULL, zio->io_priority,
2206             ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
2207
2208         return (ZIO_PIPELINE_CONTINUE);
2209 }
2210
2211 static int
2212 zio_ddt_read_done(zio_t *zio)
2213 {
2214         blkptr_t *bp = zio->io_bp;
2215
2216         if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
2217                 return (ZIO_PIPELINE_STOP);
2218
2219         ASSERT(BP_GET_DEDUP(bp));
2220         ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
2221         ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2222
2223         if (zio->io_child_error[ZIO_CHILD_DDT]) {
2224                 ddt_t *ddt = ddt_select(zio->io_spa, bp);
2225                 ddt_entry_t *dde = zio->io_vsd;
2226                 if (ddt == NULL) {
2227                         ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
2228                         return (ZIO_PIPELINE_CONTINUE);
2229                 }
2230                 if (dde == NULL) {
2231                         zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
2232                         zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
2233                         return (ZIO_PIPELINE_STOP);
2234                 }
2235                 if (dde->dde_repair_data != NULL) {
2236                         bcopy(dde->dde_repair_data, zio->io_data, zio->io_size);
2237                         zio->io_child_error[ZIO_CHILD_DDT] = 0;
2238                 }
2239                 ddt_repair_done(ddt, dde);
2240                 zio->io_vsd = NULL;
2241         }
2242
2243         ASSERT(zio->io_vsd == NULL);
2244
2245         return (ZIO_PIPELINE_CONTINUE);
2246 }
2247
2248 static boolean_t
2249 zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
2250 {
2251         spa_t *spa = zio->io_spa;
2252         int p;
2253
2254         /*
2255          * Note: we compare the original data, not the transformed data,
2256          * because when zio->io_bp is an override bp, we will not have
2257          * pushed the I/O transforms.  That's an important optimization
2258          * because otherwise we'd compress/encrypt all dmu_sync() data twice.
2259          */
2260         for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
2261                 zio_t *lio = dde->dde_lead_zio[p];
2262
2263                 if (lio != NULL) {
2264                         return (lio->io_orig_size != zio->io_orig_size ||
2265                             bcmp(zio->io_orig_data, lio->io_orig_data,
2266                             zio->io_orig_size) != 0);
2267                 }
2268         }
2269
2270         for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
2271                 ddt_phys_t *ddp = &dde->dde_phys[p];
2272
2273                 if (ddp->ddp_phys_birth != 0) {
2274                         arc_buf_t *abuf = NULL;
2275                         arc_flags_t aflags = ARC_FLAG_WAIT;
2276                         blkptr_t blk = *zio->io_bp;
2277                         int error;
2278
2279                         ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
2280
2281                         ddt_exit(ddt);
2282
2283                         error = arc_read(NULL, spa, &blk,
2284                             arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
2285                             ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
2286                             &aflags, &zio->io_bookmark);
2287
2288                         if (error == 0) {
2289                                 if (arc_buf_size(abuf) != zio->io_orig_size ||
2290                                     bcmp(abuf->b_data, zio->io_orig_data,
2291                                     zio->io_orig_size) != 0)
2292                                         error = SET_ERROR(EEXIST);
2293                                 VERIFY(arc_buf_remove_ref(abuf, &abuf));
2294                         }
2295
2296                         ddt_enter(ddt);
2297                         return (error != 0);
2298                 }
2299         }
2300
2301         return (B_FALSE);
2302 }
2303
2304 static void
2305 zio_ddt_child_write_ready(zio_t *zio)
2306 {
2307         int p = zio->io_prop.zp_copies;
2308         ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
2309         ddt_entry_t *dde = zio->io_private;
2310         ddt_phys_t *ddp = &dde->dde_phys[p];
2311         zio_t *pio;
2312
2313         if (zio->io_error)
2314                 return;
2315
2316         ddt_enter(ddt);
2317
2318         ASSERT(dde->dde_lead_zio[p] == zio);
2319
2320         ddt_phys_fill(ddp, zio->io_bp);
2321
2322         while ((pio = zio_walk_parents(zio)) != NULL)
2323                 ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
2324
2325         ddt_exit(ddt);
2326 }
2327
2328 static void
2329 zio_ddt_child_write_done(zio_t *zio)
2330 {
2331         int p = zio->io_prop.zp_copies;
2332         ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
2333         ddt_entry_t *dde = zio->io_private;
2334         ddt_phys_t *ddp = &dde->dde_phys[p];
2335
2336         ddt_enter(ddt);
2337
2338         ASSERT(ddp->ddp_refcnt == 0);
2339         ASSERT(dde->dde_lead_zio[p] == zio);
2340         dde->dde_lead_zio[p] = NULL;
2341
2342         if (zio->io_error == 0) {
2343                 while (zio_walk_parents(zio) != NULL)
2344                         ddt_phys_addref(ddp);
2345         } else {
2346                 ddt_phys_clear(ddp);
2347         }
2348
2349         ddt_exit(ddt);
2350 }
2351
2352 static void
2353 zio_ddt_ditto_write_done(zio_t *zio)
2354 {
2355         int p = DDT_PHYS_DITTO;
2356         blkptr_t *bp = zio->io_bp;
2357         ddt_t *ddt = ddt_select(zio->io_spa, bp);
2358         ddt_entry_t *dde = zio->io_private;
2359         ddt_phys_t *ddp = &dde->dde_phys[p];
2360         ddt_key_t *ddk = &dde->dde_key;
2361         ASSERTV(zio_prop_t *zp = &zio->io_prop);
2362
2363         ddt_enter(ddt);
2364
2365         ASSERT(ddp->ddp_refcnt == 0);
2366         ASSERT(dde->dde_lead_zio[p] == zio);
2367         dde->dde_lead_zio[p] = NULL;
2368
2369         if (zio->io_error == 0) {
2370                 ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum));
2371                 ASSERT(zp->zp_copies < SPA_DVAS_PER_BP);
2372                 ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp));
2373                 if (ddp->ddp_phys_birth != 0)
2374                         ddt_phys_free(ddt, ddk, ddp, zio->io_txg);
2375                 ddt_phys_fill(ddp, bp);
2376         }
2377
2378         ddt_exit(ddt);
2379 }
2380
2381 static int
2382 zio_ddt_write(zio_t *zio)
2383 {
2384         spa_t *spa = zio->io_spa;
2385         blkptr_t *bp = zio->io_bp;
2386         uint64_t txg = zio->io_txg;
2387         zio_prop_t *zp = &zio->io_prop;
2388         int p = zp->zp_copies;
2389         int ditto_copies;
2390         zio_t *cio = NULL;
2391         zio_t *dio = NULL;
2392         ddt_t *ddt = ddt_select(spa, bp);
2393         ddt_entry_t *dde;
2394         ddt_phys_t *ddp;
2395
2396         ASSERT(BP_GET_DEDUP(bp));
2397         ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
2398         ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
2399
2400         ddt_enter(ddt);
2401         dde = ddt_lookup(ddt, bp, B_TRUE);
2402         ddp = &dde->dde_phys[p];
2403
2404         if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
2405                 /*
2406                  * If we're using a weak checksum, upgrade to a strong checksum
2407                  * and try again.  If we're already using a strong checksum,
2408                  * we can't resolve it, so just convert to an ordinary write.
2409                  * (And automatically e-mail a paper to Nature?)
2410                  */
2411                 if (!zio_checksum_table[zp->zp_checksum].ci_dedup) {
2412                         zp->zp_checksum = spa_dedup_checksum(spa);
2413                         zio_pop_transforms(zio);
2414                         zio->io_stage = ZIO_STAGE_OPEN;
2415                         BP_ZERO(bp);
2416                 } else {
2417                         zp->zp_dedup = B_FALSE;
2418                 }
2419                 zio->io_pipeline = ZIO_WRITE_PIPELINE;
2420                 ddt_exit(ddt);
2421                 return (ZIO_PIPELINE_CONTINUE);
2422         }
2423
2424         ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp);
2425         ASSERT(ditto_copies < SPA_DVAS_PER_BP);
2426
2427         if (ditto_copies > ddt_ditto_copies_present(dde) &&
2428             dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) {
2429                 zio_prop_t czp = *zp;
2430
2431                 czp.zp_copies = ditto_copies;
2432
2433                 /*
2434                  * If we arrived here with an override bp, we won't have run
2435                  * the transform stack, so we won't have the data we need to
2436                  * generate a child i/o.  So, toss the override bp and restart.
2437                  * This is safe, because using the override bp is just an
2438                  * optimization; and it's rare, so the cost doesn't matter.
2439                  */
2440                 if (zio->io_bp_override) {
2441                         zio_pop_transforms(zio);
2442                         zio->io_stage = ZIO_STAGE_OPEN;
2443                         zio->io_pipeline = ZIO_WRITE_PIPELINE;
2444                         zio->io_bp_override = NULL;
2445                         BP_ZERO(bp);
2446                         ddt_exit(ddt);
2447                         return (ZIO_PIPELINE_CONTINUE);
2448                 }
2449
2450                 dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
2451                     zio->io_orig_size, &czp, NULL, NULL,
2452                     zio_ddt_ditto_write_done, dde, zio->io_priority,
2453                     ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
2454
2455                 zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL);
2456                 dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
2457         }
2458
2459         if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
2460                 if (ddp->ddp_phys_birth != 0)
2461                         ddt_bp_fill(ddp, bp, txg);
2462                 if (dde->dde_lead_zio[p] != NULL)
2463                         zio_add_child(zio, dde->dde_lead_zio[p]);
2464                 else
2465                         ddt_phys_addref(ddp);
2466         } else if (zio->io_bp_override) {
2467                 ASSERT(bp->blk_birth == txg);
2468                 ASSERT(BP_EQUAL(bp, zio->io_bp_override));
2469                 ddt_phys_fill(ddp, bp);
2470                 ddt_phys_addref(ddp);
2471         } else {
2472                 cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
2473                     zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL,
2474                     zio_ddt_child_write_done, dde, zio->io_priority,
2475                     ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
2476
2477                 zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL);
2478                 dde->dde_lead_zio[p] = cio;
2479         }
2480
2481         ddt_exit(ddt);
2482
2483         if (cio)
2484                 zio_nowait(cio);
2485         if (dio)
2486                 zio_nowait(dio);
2487
2488         return (ZIO_PIPELINE_CONTINUE);
2489 }
2490
2491 ddt_entry_t *freedde; /* for debugging */
2492
2493 static int
2494 zio_ddt_free(zio_t *zio)
2495 {
2496         spa_t *spa = zio->io_spa;
2497         blkptr_t *bp = zio->io_bp;
2498         ddt_t *ddt = ddt_select(spa, bp);
2499         ddt_entry_t *dde;
2500         ddt_phys_t *ddp;
2501
2502         ASSERT(BP_GET_DEDUP(bp));
2503         ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2504
2505         ddt_enter(ddt);
2506         freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
2507         if (dde) {
2508                 ddp = ddt_phys_select(dde, bp);
2509                 if (ddp)
2510                         ddt_phys_decref(ddp);
2511         }
2512         ddt_exit(ddt);
2513
2514         return (ZIO_PIPELINE_CONTINUE);
2515 }
2516
2517 /*
2518  * ==========================================================================
2519  * Allocate and free blocks
2520  * ==========================================================================
2521  */
2522 static int
2523 zio_dva_allocate(zio_t *zio)
2524 {
2525         spa_t *spa = zio->io_spa;
2526         metaslab_class_t *mc = spa_normal_class(spa);
2527         blkptr_t *bp = zio->io_bp;
2528         int error;
2529         int flags = 0;
2530
2531         if (zio->io_gang_leader == NULL) {
2532                 ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
2533                 zio->io_gang_leader = zio;
2534         }
2535
2536         ASSERT(BP_IS_HOLE(bp));
2537         ASSERT0(BP_GET_NDVAS(bp));
2538         ASSERT3U(zio->io_prop.zp_copies, >, 0);
2539         ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
2540         ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
2541
2542         /*
2543          * The dump device does not support gang blocks so allocation on
2544          * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid
2545          * the "fast" gang feature.
2546          */
2547         flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0;
2548         flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ?
2549             METASLAB_GANG_CHILD : 0;
2550         flags |= (zio->io_flags & ZIO_FLAG_FASTWRITE) ? METASLAB_FASTWRITE : 0;
2551         error = metaslab_alloc(spa, mc, zio->io_size, bp,
2552             zio->io_prop.zp_copies, zio->io_txg, NULL, flags);
2553
2554         if (error) {
2555                 spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
2556                     "size %llu, error %d", spa_name(spa), zio, zio->io_size,
2557                     error);
2558                 if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
2559                         return (zio_write_gang_block(zio));
2560                 zio->io_error = error;
2561         }
2562
2563         return (ZIO_PIPELINE_CONTINUE);
2564 }
2565
2566 static int
2567 zio_dva_free(zio_t *zio)
2568 {
2569         metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);
2570
2571         return (ZIO_PIPELINE_CONTINUE);
2572 }
2573
2574 static int
2575 zio_dva_claim(zio_t *zio)
2576 {
2577         int error;
2578
2579         error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
2580         if (error)
2581                 zio->io_error = error;
2582
2583         return (ZIO_PIPELINE_CONTINUE);
2584 }
2585
2586 /*
2587  * Undo an allocation.  This is used by zio_done() when an I/O fails
2588  * and we want to give back the block we just allocated.
2589  * This handles both normal blocks and gang blocks.
2590  */
2591 static void
2592 zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
2593 {
2594         int g;
2595
2596         ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
2597         ASSERT(zio->io_bp_override == NULL);
2598
2599         if (!BP_IS_HOLE(bp))
2600                 metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);
2601
2602         if (gn != NULL) {
2603                 for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
2604                         zio_dva_unallocate(zio, gn->gn_child[g],
2605                             &gn->gn_gbh->zg_blkptr[g]);
2606                 }
2607         }
2608 }
2609
2610 /*
2611  * Try to allocate an intent log block.  Return 0 on success, errno on failure.
2612  */
2613 int
2614 zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, uint64_t size,
2615     boolean_t use_slog)
2616 {
2617         int error = 1;
2618
2619         ASSERT(txg > spa_syncing_txg(spa));
2620
2621         /*
2622          * ZIL blocks are always contiguous (i.e. not gang blocks) so we
2623          * set the METASLAB_GANG_AVOID flag so that they don't "fast gang"
2624          * when allocating them.
2625          */
2626         if (use_slog) {
2627                 error = metaslab_alloc(spa, spa_log_class(spa), size,
2628                     new_bp, 1, txg, NULL,
2629                     METASLAB_FASTWRITE | METASLAB_GANG_AVOID);
2630         }
2631
2632         if (error) {
2633                 error = metaslab_alloc(spa, spa_normal_class(spa), size,
2634                     new_bp, 1, txg, NULL,
2635                     METASLAB_FASTWRITE);
2636         }
2637
2638         if (error == 0) {
2639                 BP_SET_LSIZE(new_bp, size);
2640                 BP_SET_PSIZE(new_bp, size);
2641                 BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
2642                 BP_SET_CHECKSUM(new_bp,
2643                     spa_version(spa) >= SPA_VERSION_SLIM_ZIL
2644                     ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
2645                 BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
2646                 BP_SET_LEVEL(new_bp, 0);
2647                 BP_SET_DEDUP(new_bp, 0);
2648                 BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
2649         }
2650
2651         return (error);
2652 }
2653
2654 /*
2655  * Free an intent log block.
2656  */
2657 void
2658 zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp)
2659 {
2660         ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG);
2661         ASSERT(!BP_IS_GANG(bp));
2662
2663         zio_free(spa, txg, bp);
2664 }
2665
2666 /*
2667  * ==========================================================================
2668  * Read and write to physical devices
2669  * ==========================================================================
2670  */
2671
2672
2673 /*
2674  * Issue an I/O to the underlying vdev. Typically the issue pipeline
2675  * stops after this stage and will resume upon I/O completion.
2676  * However, there are instances where the vdev layer may need to
2677  * continue the pipeline when an I/O was not issued. Since the I/O
2678  * that was sent to the vdev layer might be different than the one
2679  * currently active in the pipeline (see vdev_queue_io()), we explicitly
2680  * force the underlying vdev layers to call either zio_execute() or
2681  * zio_interrupt() to ensure that the pipeline continues with the correct I/O.
2682  */
2683 static int
2684 zio_vdev_io_start(zio_t *zio)
2685 {
2686         vdev_t *vd = zio->io_vd;
2687         uint64_t align;
2688         spa_t *spa = zio->io_spa;
2689
2690         ASSERT(zio->io_error == 0);
2691         ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
2692
2693         if (vd == NULL) {
2694                 if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
2695                         spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
2696
2697                 /*
2698                  * The mirror_ops handle multiple DVAs in a single BP.
2699                  */
2700                 vdev_mirror_ops.vdev_op_io_start(zio);
2701                 return (ZIO_PIPELINE_STOP);
2702         }
2703
2704         /*
2705          * We keep track of time-sensitive I/Os so that the scan thread
2706          * can quickly react to certain workloads.  In particular, we care
2707          * about non-scrubbing, top-level reads and writes with the following
2708          * characteristics:
2709          *      - synchronous writes of user data to non-slog devices
2710          *      - any reads of user data
2711          * When these conditions are met, adjust the timestamp of spa_last_io
2712          * which allows the scan thread to adjust its workload accordingly.
2713          */
2714         if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
2715             vd == vd->vdev_top && !vd->vdev_islog &&
2716             zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
2717             zio->io_txg != spa_syncing_txg(spa)) {
2718                 uint64_t old = spa->spa_last_io;
2719                 uint64_t new = ddi_get_lbolt64();
2720                 if (old != new)
2721                         (void) atomic_cas_64(&spa->spa_last_io, old, new);
2722         }
2723
2724         align = 1ULL << vd->vdev_top->vdev_ashift;
2725
2726         if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
2727             P2PHASE(zio->io_size, align) != 0) {
2728                 /* Transform logical writes to be a full physical block size. */
2729                 uint64_t asize = P2ROUNDUP(zio->io_size, align);
2730                 char *abuf = zio_buf_alloc(asize);
2731                 ASSERT(vd == vd->vdev_top);
2732                 if (zio->io_type == ZIO_TYPE_WRITE) {
2733                         bcopy(zio->io_data, abuf, zio->io_size);
2734                         bzero(abuf + zio->io_size, asize - zio->io_size);
2735                 }
2736                 zio_push_transform(zio, abuf, asize, asize, zio_subblock);
2737         }
2738
2739         /*
2740          * If this is not a physical io, make sure that it is properly aligned
2741          * before proceeding.
2742          */
2743         if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) {
2744                 ASSERT0(P2PHASE(zio->io_offset, align));
2745                 ASSERT0(P2PHASE(zio->io_size, align));
2746         } else {
2747                 /*
2748                  * For physical writes, we allow 512b aligned writes and assume
2749                  * the device will perform a read-modify-write as necessary.
2750                  */
2751                 ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE));
2752                 ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE));
2753         }
2754
2755         VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
2756
2757         /*
2758          * If this is a repair I/O, and there's no self-healing involved --
2759          * that is, we're just resilvering what we expect to resilver --
2760          * then don't do the I/O unless zio's txg is actually in vd's DTL.
2761          * This prevents spurious resilvering with nested replication.
2762          * For example, given a mirror of mirrors, (A+B)+(C+D), if only
2763          * A is out of date, we'll read from C+D, then use the data to
2764          * resilver A+B -- but we don't actually want to resilver B, just A.
2765          * The top-level mirror has no way to know this, so instead we just
2766          * discard unnecessary repairs as we work our way down the vdev tree.
2767          * The same logic applies to any form of nested replication:
2768          * ditto + mirror, RAID-Z + replacing, etc.  This covers them all.
2769          */
2770         if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
2771             !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
2772             zio->io_txg != 0 && /* not a delegated i/o */
2773             !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
2774                 ASSERT(zio->io_type == ZIO_TYPE_WRITE);
2775                 zio_vdev_io_bypass(zio);
2776                 return (ZIO_PIPELINE_CONTINUE);
2777         }
2778
2779         if (vd->vdev_ops->vdev_op_leaf &&
2780             (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
2781
2782                 if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio))
2783                         return (ZIO_PIPELINE_CONTINUE);
2784
2785                 if ((zio = vdev_queue_io(zio)) == NULL)
2786                         return (ZIO_PIPELINE_STOP);
2787
2788                 if (!vdev_accessible(vd, zio)) {
2789                         zio->io_error = SET_ERROR(ENXIO);
2790                         zio_interrupt(zio);
2791                         return (ZIO_PIPELINE_STOP);
2792                 }
2793         }
2794
2795         vd->vdev_ops->vdev_op_io_start(zio);
2796         return (ZIO_PIPELINE_STOP);
2797 }
2798
2799 static int
2800 zio_vdev_io_done(zio_t *zio)
2801 {
2802         vdev_t *vd = zio->io_vd;
2803         vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
2804         boolean_t unexpected_error = B_FALSE;
2805
2806         if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
2807                 return (ZIO_PIPELINE_STOP);
2808
2809         ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
2810
2811         if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {
2812
2813                 vdev_queue_io_done(zio);
2814
2815                 if (zio->io_type == ZIO_TYPE_WRITE)
2816                         vdev_cache_write(zio);
2817
2818                 if (zio_injection_enabled && zio->io_error == 0)
2819                         zio->io_error = zio_handle_device_injection(vd,
2820                             zio, EIO);
2821
2822                 if (zio_injection_enabled && zio->io_error == 0)
2823                         zio->io_error = zio_handle_label_injection(zio, EIO);
2824
2825                 if (zio->io_error) {
2826                         if (!vdev_accessible(vd, zio)) {
2827                                 zio->io_error = SET_ERROR(ENXIO);
2828                         } else {
2829                                 unexpected_error = B_TRUE;
2830                         }
2831                 }
2832         }
2833
2834         ops->vdev_op_io_done(zio);
2835
2836         if (unexpected_error)
2837                 VERIFY(vdev_probe(vd, zio) == NULL);
2838
2839         return (ZIO_PIPELINE_CONTINUE);
2840 }
2841
2842 /*
2843  * For non-raidz ZIOs, we can just copy aside the bad data read from the
2844  * disk, and use that to finish the checksum ereport later.
2845  */
2846 static void
2847 zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
2848     const void *good_buf)
2849 {
2850         /* no processing needed */
2851         zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
2852 }
2853
2854 /*ARGSUSED*/
2855 void
2856 zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
2857 {
2858         void *buf = zio_buf_alloc(zio->io_size);
2859
2860         bcopy(zio->io_data, buf, zio->io_size);
2861
2862         zcr->zcr_cbinfo = zio->io_size;
2863         zcr->zcr_cbdata = buf;
2864         zcr->zcr_finish = zio_vsd_default_cksum_finish;
2865         zcr->zcr_free = zio_buf_free;
2866 }
2867
2868 static int
2869 zio_vdev_io_assess(zio_t *zio)
2870 {
2871         vdev_t *vd = zio->io_vd;
2872
2873         if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
2874                 return (ZIO_PIPELINE_STOP);
2875
2876         if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
2877                 spa_config_exit(zio->io_spa, SCL_ZIO, zio);
2878
2879         if (zio->io_vsd != NULL) {
2880                 zio->io_vsd_ops->vsd_free(zio);
2881                 zio->io_vsd = NULL;
2882         }
2883
2884         if (zio_injection_enabled && zio->io_error == 0)
2885                 zio->io_error = zio_handle_fault_injection(zio, EIO);
2886
2887         /*
2888          * If the I/O failed, determine whether we should attempt to retry it.
2889          *
2890          * On retry, we cut in line in the issue queue, since we don't want
2891          * compression/checksumming/etc. work to prevent our (cheap) IO reissue.
2892          */
2893         if (zio->io_error && vd == NULL &&
2894             !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
2895                 ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */
2896                 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS));  /* not a leaf */
2897                 zio->io_error = 0;
2898                 zio->io_flags |= ZIO_FLAG_IO_RETRY |
2899                     ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
2900                 zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
2901                 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
2902                     zio_requeue_io_start_cut_in_line);
2903                 return (ZIO_PIPELINE_STOP);
2904         }
2905
2906         /*
2907          * If we got an error on a leaf device, convert it to ENXIO
2908          * if the device is not accessible at all.
2909          */
2910         if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
2911             !vdev_accessible(vd, zio))
2912                 zio->io_error = SET_ERROR(ENXIO);
2913
2914         /*
2915          * If we can't write to an interior vdev (mirror or RAID-Z),
2916          * set vdev_cant_write so that we stop trying to allocate from it.
2917          */
2918         if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
2919             vd != NULL && !vd->vdev_ops->vdev_op_leaf) {
2920                 vd->vdev_cant_write = B_TRUE;
2921         }
2922
2923         if (zio->io_error)
2924                 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2925
2926         if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
2927             zio->io_physdone != NULL) {
2928                 ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
2929                 ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
2930                 zio->io_physdone(zio->io_logical);
2931         }
2932
2933         return (ZIO_PIPELINE_CONTINUE);
2934 }
2935
2936 void
2937 zio_vdev_io_reissue(zio_t *zio)
2938 {
2939         ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
2940         ASSERT(zio->io_error == 0);
2941
2942         zio->io_stage >>= 1;
2943 }
2944
2945 void
2946 zio_vdev_io_redone(zio_t *zio)
2947 {
2948         ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
2949
2950         zio->io_stage >>= 1;
2951 }
2952
2953 void
2954 zio_vdev_io_bypass(zio_t *zio)
2955 {
2956         ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
2957         ASSERT(zio->io_error == 0);
2958
2959         zio->io_flags |= ZIO_FLAG_IO_BYPASS;
2960         zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
2961 }
2962
2963 /*
2964  * ==========================================================================
2965  * Generate and verify checksums
2966  * ==========================================================================
2967  */
2968 static int
2969 zio_checksum_generate(zio_t *zio)
2970 {
2971         blkptr_t *bp = zio->io_bp;
2972         enum zio_checksum checksum;
2973
2974         if (bp == NULL) {
2975                 /*
2976                  * This is zio_write_phys().
2977                  * We're either generating a label checksum, or none at all.
2978                  */
2979                 checksum = zio->io_prop.zp_checksum;
2980
2981                 if (checksum == ZIO_CHECKSUM_OFF)
2982                         return (ZIO_PIPELINE_CONTINUE);
2983
2984                 ASSERT(checksum == ZIO_CHECKSUM_LABEL);
2985         } else {
2986                 if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
2987                         ASSERT(!IO_IS_ALLOCATING(zio));
2988                         checksum = ZIO_CHECKSUM_GANG_HEADER;
2989                 } else {
2990                         checksum = BP_GET_CHECKSUM(bp);
2991                 }
2992         }
2993
2994         zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size);
2995
2996         return (ZIO_PIPELINE_CONTINUE);
2997 }
2998
2999 static int
3000 zio_checksum_verify(zio_t *zio)
3001 {
3002         zio_bad_cksum_t info;
3003         blkptr_t *bp = zio->io_bp;
3004         int error;
3005
3006         ASSERT(zio->io_vd != NULL);
3007
3008         if (bp == NULL) {
3009                 /*
3010                  * This is zio_read_phys().
3011                  * We're either verifying a label checksum, or nothing at all.
3012                  */
3013                 if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
3014                         return (ZIO_PIPELINE_CONTINUE);
3015
3016                 ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
3017         }
3018
3019         if ((error = zio_checksum_error(zio, &info)) != 0) {
3020                 zio->io_error = error;
3021                 if (error == ECKSUM &&
3022                     !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
3023                         zfs_ereport_start_checksum(zio->io_spa,
3024                             zio->io_vd, zio, zio->io_offset,
3025                             zio->io_size, NULL, &info);
3026                 }
3027         }
3028
3029         return (ZIO_PIPELINE_CONTINUE);
3030 }
3031
3032 /*
3033  * Called by RAID-Z to ensure we don't compute the checksum twice.
3034  */
3035 void
3036 zio_checksum_verified(zio_t *zio)
3037 {
3038         zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
3039 }
3040
3041 /*
3042  * ==========================================================================
3043  * Error rank.  Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
3044  * An error of 0 indicates success.  ENXIO indicates whole-device failure,
3045  * which may be transient (e.g. unplugged) or permament.  ECKSUM and EIO
3046  * indicate errors that are specific to one I/O, and most likely permanent.
3047  * Any other error is presumed to be worse because we weren't expecting it.
3048  * ==========================================================================
3049  */
3050 int
3051 zio_worst_error(int e1, int e2)
3052 {
3053         static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
3054         int r1, r2;
3055
3056         for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++)
3057                 if (e1 == zio_error_rank[r1])
3058                         break;
3059
3060         for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++)
3061                 if (e2 == zio_error_rank[r2])
3062                         break;
3063
3064         return (r1 > r2 ? e1 : e2);
3065 }
3066
3067 /*
3068  * ==========================================================================
3069  * I/O completion
3070  * ==========================================================================
3071  */
3072 static int
3073 zio_ready(zio_t *zio)
3074 {
3075         blkptr_t *bp = zio->io_bp;
3076         zio_t *pio, *pio_next;
3077
3078         if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
3079             zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
3080                 return (ZIO_PIPELINE_STOP);
3081
3082         if (zio->io_ready) {
3083                 ASSERT(IO_IS_ALLOCATING(zio));
3084                 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
3085                     (zio->io_flags & ZIO_FLAG_NOPWRITE));
3086                 ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
3087
3088                 zio->io_ready(zio);
3089         }
3090
3091         if (bp != NULL && bp != &zio->io_bp_copy)
3092                 zio->io_bp_copy = *bp;
3093
3094         if (zio->io_error)
3095                 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
3096
3097         mutex_enter(&zio->io_lock);
3098         zio->io_state[ZIO_WAIT_READY] = 1;
3099         pio = zio_walk_parents(zio);
3100         mutex_exit(&zio->io_lock);
3101
3102         /*
3103          * As we notify zio's parents, new parents could be added.
3104          * New parents go to the head of zio's io_parent_list, however,
3105          * so we will (correctly) not notify them.  The remainder of zio's
3106          * io_parent_list, from 'pio_next' onward, cannot change because
3107          * all parents must wait for us to be done before they can be done.
3108          */
3109         for (; pio != NULL; pio = pio_next) {
3110                 pio_next = zio_walk_parents(zio);
3111                 zio_notify_parent(pio, zio, ZIO_WAIT_READY);
3112         }
3113
3114         if (zio->io_flags & ZIO_FLAG_NODATA) {
3115                 if (BP_IS_GANG(bp)) {
3116                         zio->io_flags &= ~ZIO_FLAG_NODATA;
3117                 } else {
3118                         ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE);
3119                         zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
3120                 }
3121         }
3122
3123         if (zio_injection_enabled &&
3124             zio->io_spa->spa_syncing_txg == zio->io_txg)
3125                 zio_handle_ignored_writes(zio);
3126
3127         return (ZIO_PIPELINE_CONTINUE);
3128 }
3129
3130 static int
3131 zio_done(zio_t *zio)
3132 {
3133         zio_t *pio, *pio_next;
3134         int c, w;
3135
3136         /*
3137          * If our children haven't all completed,
3138          * wait for them and then repeat this pipeline stage.
3139          */
3140         if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
3141             zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
3142             zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) ||
3143             zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
3144                 return (ZIO_PIPELINE_STOP);
3145
3146         for (c = 0; c < ZIO_CHILD_TYPES; c++)
3147                 for (w = 0; w < ZIO_WAIT_TYPES; w++)
3148                         ASSERT(zio->io_children[c][w] == 0);
3149
3150         if (zio->io_bp != NULL && !BP_IS_EMBEDDED(zio->io_bp)) {
3151                 ASSERT(zio->io_bp->blk_pad[0] == 0);
3152                 ASSERT(zio->io_bp->blk_pad[1] == 0);
3153                 ASSERT(bcmp(zio->io_bp, &zio->io_bp_copy,
3154                     sizeof (blkptr_t)) == 0 ||
3155                     (zio->io_bp == zio_unique_parent(zio)->io_bp));
3156                 if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(zio->io_bp) &&
3157                     zio->io_bp_override == NULL &&
3158                     !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
3159                         ASSERT(!BP_SHOULD_BYTESWAP(zio->io_bp));
3160                         ASSERT3U(zio->io_prop.zp_copies, <=,
3161                             BP_GET_NDVAS(zio->io_bp));
3162                         ASSERT(BP_COUNT_GANG(zio->io_bp) == 0 ||
3163                             (BP_COUNT_GANG(zio->io_bp) ==
3164                             BP_GET_NDVAS(zio->io_bp)));
3165                 }
3166                 if (zio->io_flags & ZIO_FLAG_NOPWRITE)
3167                         VERIFY(BP_EQUAL(zio->io_bp, &zio->io_bp_orig));
3168         }
3169
3170         /*
3171          * If there were child vdev/gang/ddt errors, they apply to us now.
3172          */
3173         zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
3174         zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
3175         zio_inherit_child_errors(zio, ZIO_CHILD_DDT);
3176
3177         /*
3178          * If the I/O on the transformed data was successful, generate any
3179          * checksum reports now while we still have the transformed data.
3180          */
3181         if (zio->io_error == 0) {
3182                 while (zio->io_cksum_report != NULL) {
3183                         zio_cksum_report_t *zcr = zio->io_cksum_report;
3184                         uint64_t align = zcr->zcr_align;
3185                         uint64_t asize = P2ROUNDUP(zio->io_size, align);
3186                         char *abuf = zio->io_data;
3187
3188                         if (asize != zio->io_size) {
3189                                 abuf = zio_buf_alloc(asize);
3190                                 bcopy(zio->io_data, abuf, zio->io_size);
3191                                 bzero(abuf+zio->io_size, asize-zio->io_size);
3192                         }
3193
3194                         zio->io_cksum_report = zcr->zcr_next;
3195                         zcr->zcr_next = NULL;
3196                         zcr->zcr_finish(zcr, abuf);
3197                         zfs_ereport_free_checksum(zcr);
3198
3199                         if (asize != zio->io_size)
3200                                 zio_buf_free(abuf, asize);
3201                 }
3202         }
3203
3204         zio_pop_transforms(zio);        /* note: may set zio->io_error */
3205
3206         vdev_stat_update(zio, zio->io_size);
3207
3208         /*
3209          * If this I/O is attached to a particular vdev is slow, exceeding
3210          * 30 seconds to complete, post an error described the I/O delay.
3211          * We ignore these errors if the device is currently unavailable.
3212          */
3213         if (zio->io_delay >= MSEC_TO_TICK(zio_delay_max)) {
3214                 if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd))
3215                         zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa,
3216                             zio->io_vd, zio, 0, 0);
3217         }
3218
3219         if (zio->io_error) {
3220                 /*
3221                  * If this I/O is attached to a particular vdev,
3222                  * generate an error message describing the I/O failure
3223                  * at the block level.  We ignore these errors if the
3224                  * device is currently unavailable.
3225                  */
3226                 if (zio->io_error != ECKSUM && zio->io_vd != NULL &&
3227                         !vdev_is_dead(zio->io_vd))
3228                         zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa,
3229                                                 zio->io_vd, zio, 0, 0);
3230
3231                 if ((zio->io_error == EIO || !(zio->io_flags &
3232                     (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
3233                     zio == zio->io_logical) {
3234                         /*
3235                          * For logical I/O requests, tell the SPA to log the
3236                          * error and generate a logical data ereport.
3237                          */
3238                         spa_log_error(zio->io_spa, zio);
3239                         zfs_ereport_post(FM_EREPORT_ZFS_DATA, zio->io_spa,
3240                             NULL, zio, 0, 0);
3241                 }
3242         }
3243
3244         if (zio->io_error && zio == zio->io_logical) {
3245                 /*
3246                  * Determine whether zio should be reexecuted.  This will
3247                  * propagate all the way to the root via zio_notify_parent().
3248                  */
3249                 ASSERT(zio->io_vd == NULL && zio->io_bp != NULL);
3250                 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
3251
3252                 if (IO_IS_ALLOCATING(zio) &&
3253                     !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
3254                         if (zio->io_error != ENOSPC)
3255                                 zio->io_reexecute |= ZIO_REEXECUTE_NOW;
3256                         else
3257                                 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
3258                 }
3259
3260                 if ((zio->io_type == ZIO_TYPE_READ ||
3261                     zio->io_type == ZIO_TYPE_FREE) &&
3262                     !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
3263                     zio->io_error == ENXIO &&
3264                     spa_load_state(zio->io_spa) == SPA_LOAD_NONE &&
3265                     spa_get_failmode(zio->io_spa) != ZIO_FAILURE_MODE_CONTINUE)
3266                         zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
3267
3268                 if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
3269                         zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
3270
3271                 /*
3272                  * Here is a possibly good place to attempt to do
3273                  * either combinatorial reconstruction or error correction
3274                  * based on checksums.  It also might be a good place
3275                  * to send out preliminary ereports before we suspend
3276                  * processing.
3277                  */
3278         }
3279
3280         /*
3281          * If there were logical child errors, they apply to us now.
3282          * We defer this until now to avoid conflating logical child
3283          * errors with errors that happened to the zio itself when
3284          * updating vdev stats and reporting FMA events above.
3285          */
3286         zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
3287
3288         if ((zio->io_error || zio->io_reexecute) &&
3289             IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
3290             !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
3291                 zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp);
3292
3293         zio_gang_tree_free(&zio->io_gang_tree);
3294
3295         /*
3296          * Godfather I/Os should never suspend.
3297          */
3298         if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
3299             (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
3300                 zio->io_reexecute = 0;
3301
3302         if (zio->io_reexecute) {
3303                 /*
3304                  * This is a logical I/O that wants to reexecute.
3305                  *
3306                  * Reexecute is top-down.  When an i/o fails, if it's not
3307                  * the root, it simply notifies its parent and sticks around.
3308                  * The parent, seeing that it still has children in zio_done(),
3309                  * does the same.  This percolates all the way up to the root.
3310                  * The root i/o will reexecute or suspend the entire tree.
3311                  *
3312                  * This approach ensures that zio_reexecute() honors
3313                  * all the original i/o dependency relationships, e.g.
3314                  * parents not executing until children are ready.
3315                  */
3316                 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
3317
3318                 zio->io_gang_leader = NULL;
3319
3320                 mutex_enter(&zio->io_lock);
3321                 zio->io_state[ZIO_WAIT_DONE] = 1;
3322                 mutex_exit(&zio->io_lock);
3323
3324                 /*
3325                  * "The Godfather" I/O monitors its children but is
3326                  * not a true parent to them. It will track them through
3327                  * the pipeline but severs its ties whenever they get into
3328                  * trouble (e.g. suspended). This allows "The Godfather"
3329                  * I/O to return status without blocking.
3330                  */
3331                 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
3332                         zio_link_t *zl = zio->io_walk_link;
3333                         pio_next = zio_walk_parents(zio);
3334
3335                         if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
3336                             (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
3337                                 zio_remove_child(pio, zio, zl);
3338                                 zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
3339                         }
3340                 }
3341
3342                 if ((pio = zio_unique_parent(zio)) != NULL) {
3343                         /*
3344                          * We're not a root i/o, so there's nothing to do
3345                          * but notify our parent.  Don't propagate errors
3346                          * upward since we haven't permanently failed yet.
3347                          */
3348                         ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
3349                         zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
3350                         zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
3351                 } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
3352                         /*
3353                          * We'd fail again if we reexecuted now, so suspend
3354                          * until conditions improve (e.g. device comes online).
3355                          */
3356                         zio_suspend(zio->io_spa, zio);
3357                 } else {
3358                         /*
3359                          * Reexecution is potentially a huge amount of work.
3360                          * Hand it off to the otherwise-unused claim taskq.
3361                          */
3362                         ASSERT(taskq_empty_ent(&zio->io_tqent));
3363                         spa_taskq_dispatch_ent(zio->io_spa,
3364                             ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE,
3365                             (task_func_t *)zio_reexecute, zio, 0,
3366                             &zio->io_tqent);
3367                 }
3368                 return (ZIO_PIPELINE_STOP);
3369         }
3370
3371         ASSERT(zio->io_child_count == 0);
3372         ASSERT(zio->io_reexecute == 0);
3373         ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
3374
3375         /*
3376          * Report any checksum errors, since the I/O is complete.
3377          */
3378         while (zio->io_cksum_report != NULL) {
3379                 zio_cksum_report_t *zcr = zio->io_cksum_report;
3380                 zio->io_cksum_report = zcr->zcr_next;
3381                 zcr->zcr_next = NULL;
3382                 zcr->zcr_finish(zcr, NULL);
3383                 zfs_ereport_free_checksum(zcr);
3384         }
3385
3386         if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp &&
3387             !BP_IS_HOLE(zio->io_bp) && !BP_IS_EMBEDDED(zio->io_bp) &&
3388             !(zio->io_flags & ZIO_FLAG_NOPWRITE)) {
3389                 metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp);
3390         }
3391
3392         /*
3393          * It is the responsibility of the done callback to ensure that this
3394          * particular zio is no longer discoverable for adoption, and as
3395          * such, cannot acquire any new parents.
3396          */
3397         if (zio->io_done)
3398                 zio->io_done(zio);
3399
3400         mutex_enter(&zio->io_lock);
3401         zio->io_state[ZIO_WAIT_DONE] = 1;
3402         mutex_exit(&zio->io_lock);
3403
3404         for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
3405                 zio_link_t *zl = zio->io_walk_link;
3406                 pio_next = zio_walk_parents(zio);
3407                 zio_remove_child(pio, zio, zl);
3408                 zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
3409         }
3410
3411         if (zio->io_waiter != NULL) {
3412                 mutex_enter(&zio->io_lock);
3413                 zio->io_executor = NULL;
3414                 cv_broadcast(&zio->io_cv);
3415                 mutex_exit(&zio->io_lock);
3416         } else {
3417                 zio_destroy(zio);
3418         }
3419
3420         return (ZIO_PIPELINE_STOP);
3421 }
3422
3423 /*
3424  * ==========================================================================
3425  * I/O pipeline definition
3426  * ==========================================================================
3427  */
3428 static zio_pipe_stage_t *zio_pipeline[] = {
3429         NULL,
3430         zio_read_bp_init,
3431         zio_free_bp_init,
3432         zio_issue_async,
3433         zio_write_bp_init,
3434         zio_checksum_generate,
3435         zio_nop_write,
3436         zio_ddt_read_start,
3437         zio_ddt_read_done,
3438         zio_ddt_write,
3439         zio_ddt_free,
3440         zio_gang_assemble,
3441         zio_gang_issue,
3442         zio_dva_allocate,
3443         zio_dva_free,
3444         zio_dva_claim,
3445         zio_ready,
3446         zio_vdev_io_start,
3447         zio_vdev_io_done,
3448         zio_vdev_io_assess,
3449         zio_checksum_verify,
3450         zio_done
3451 };
3452
3453 /* dnp is the dnode for zb1->zb_object */
3454 boolean_t
3455 zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_phys_t *zb1,
3456     const zbookmark_phys_t *zb2)
3457 {
3458         uint64_t zb1nextL0, zb2thisobj;
3459
3460         ASSERT(zb1->zb_objset == zb2->zb_objset);
3461         ASSERT(zb2->zb_level == 0);
3462
3463         /* The objset_phys_t isn't before anything. */
3464         if (dnp == NULL)
3465                 return (B_FALSE);
3466
3467         zb1nextL0 = (zb1->zb_blkid + 1) <<
3468             ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
3469
3470         zb2thisobj = zb2->zb_object ? zb2->zb_object :
3471             zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
3472
3473         if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
3474                 uint64_t nextobj = zb1nextL0 *
3475                     (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
3476                 return (nextobj <= zb2thisobj);
3477         }
3478
3479         if (zb1->zb_object < zb2thisobj)
3480                 return (B_TRUE);
3481         if (zb1->zb_object > zb2thisobj)
3482                 return (B_FALSE);
3483         if (zb2->zb_object == DMU_META_DNODE_OBJECT)
3484                 return (B_FALSE);
3485         return (zb1nextL0 <= zb2->zb_blkid);
3486 }
3487
3488 #if defined(_KERNEL) && defined(HAVE_SPL)
3489 EXPORT_SYMBOL(zio_type_name);
3490 EXPORT_SYMBOL(zio_buf_alloc);
3491 EXPORT_SYMBOL(zio_data_buf_alloc);
3492 EXPORT_SYMBOL(zio_buf_alloc_flags);
3493 EXPORT_SYMBOL(zio_buf_free);
3494 EXPORT_SYMBOL(zio_data_buf_free);
3495
3496 module_param(zio_delay_max, int, 0644);
3497 MODULE_PARM_DESC(zio_delay_max, "Max zio millisec delay before posting event");
3498
3499 module_param(zio_requeue_io_start_cut_in_line, int, 0644);
3500 MODULE_PARM_DESC(zio_requeue_io_start_cut_in_line, "Prioritize requeued I/O");
3501
3502 module_param(zfs_sync_pass_deferred_free, int, 0644);
3503 MODULE_PARM_DESC(zfs_sync_pass_deferred_free,
3504         "Defer frees starting in this pass");
3505
3506 module_param(zfs_sync_pass_dont_compress, int, 0644);
3507 MODULE_PARM_DESC(zfs_sync_pass_dont_compress,
3508         "Don't compress starting in this pass");
3509
3510 module_param(zfs_sync_pass_rewrite, int, 0644);
3511 MODULE_PARM_DESC(zfs_sync_pass_rewrite,
3512         "Rewrite new bps starting in this pass");
3513 #endif