module/zfs/zio.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2013 by Delphix. All rights reserved.
  24  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  25  */
  26
  27 #include <sys/zfs_context.h>
  28 #include <sys/fm/fs/zfs.h>
  29 #include <sys/spa.h>
  30 #include <sys/txg.h>
  31 #include <sys/spa_impl.h>
  32 #include <sys/vdev_impl.h>
  33 #include <sys/zio_impl.h>
  34 #include <sys/zio_compress.h>
  35 #include <sys/zio_checksum.h>
  36 #include <sys/dmu_objset.h>
  37 #include <sys/arc.h>
  38 #include <sys/ddt.h>
  39 #include <sys/zfeature.h>
  40
  41 /*
  42  * ==========================================================================
  43  * I/O type descriptions
  44  * ==========================================================================
  45  */
/*
 * Short printable name for each I/O type; the table has ZIO_TYPES slots.
 * NOTE(review): presumably indexed by zio_type_t in enum order -- confirm
 * against the enum definition before adding or reordering entries.
 */
const char *zio_type_name[ZIO_TYPES] = {
	"z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl"
};
  49
  50 /*
  51  * ==========================================================================
  52  * I/O kmem caches
  53  * ==========================================================================
  54  */
/* Cache of zio_t objects; created in zio_init() with zio_cons/zio_dest. */
kmem_cache_t *zio_cache;
/* Cache of zio_link_t records that tie a parent zio to a child zio. */
kmem_cache_t *zio_link_cache;
/* Cache of PAGESIZE-aligned vdev_io_t buffers used by zio_vdev_alloc(). */
kmem_cache_t *zio_vdev_cache;
/*
 * Per-size buffer caches, indexed by (size - 1) >> SPA_MINBLOCKSHIFT.
 * zio_buf_cache backs zio_buf_alloc()/zio_buf_free(); zio_data_buf_cache
 * backs zio_data_buf_alloc()/zio_data_buf_free().
 */
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
/* kmem cache flags passed for every buffer cache created in zio_init(). */
int zio_bulk_flags = 0;
/*
 * NOTE(review): not referenced in this part of the file; presumably an
 * upper bound on acceptable I/O delay -- confirm units and consumer.
 */
int zio_delay_max = ZIO_DELAY_MAX;

/* Defined elsewhere; zio_init() installs a default while it is still 0. */
extern int zfs_mg_alloc_failures;
  64
/*
 * The following actions directly affect the spa's sync-to-convergence logic.
 * The values below define the sync pass when we start performing the action.
 * Care should be taken when changing these values as they directly impact
 * spa_sync() performance. Tuning these values may introduce subtle performance
 * pathologies and should only be done in the context of performance analysis.
 * These tunables will eventually be removed and replaced with #defines once
 * enough analysis has been done to determine optimal values.
 *
 * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
 * regular blocks are not deferred.
 */
int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
  80
/*
 * An allocating zio is one that either currently has the DVA allocate
 * stage set or will have it later in its lifetime.  The test looks at
 * io_orig_pipeline, i.e. the pipeline the zio was created with.
 */
#define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)

/*
 * NOTE(review): not referenced in this part of the file; presumably makes
 * a requeued zio jump to the front of its queue -- confirm at the consumer.
 */
int zio_requeue_io_start_cut_in_line = 1;

/*
 * Debug-only size limit: 16384 when ZFS_DEBUG is defined, otherwise 0.
 * NOTE(review): the consumer is outside this part of the file -- confirm
 * what a value of 0 means there.
 */
#ifdef ZFS_DEBUG
int zio_buf_debug_limit = 16384;
#else
int zio_buf_debug_limit = 0;
#endif

/* Forward declaration; zio_notify_parent() calls this to resume a parent. */
static inline void __zio_execute(zio_t *zio);
  96
  97 static int
  98 zio_cons(void *arg, void *unused, int kmflag)
  99 {
 100         zio_t *zio = arg;
 101
 102         bzero(zio, sizeof (zio_t));
 103
 104         mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
 105         cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
 106
 107         list_create(&zio->io_parent_list, sizeof (zio_link_t),
 108             offsetof(zio_link_t, zl_parent_node));
 109         list_create(&zio->io_child_list, sizeof (zio_link_t),
 110             offsetof(zio_link_t, zl_child_node));
 111
 112         return (0);
 113 }
 114
 115 static void
 116 zio_dest(void *arg, void *unused)
 117 {
 118         zio_t *zio = arg;
 119
 120         mutex_destroy(&zio->io_lock);
 121         cv_destroy(&zio->io_cv);
 122         list_destroy(&zio->io_parent_list);
 123         list_destroy(&zio->io_child_list);
 124 }
 125
 126 void
 127 zio_init(void)
 128 {
 129         size_t c;
 130         vmem_t *data_alloc_arena = NULL;
 131
 132         zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0,
 133             zio_cons, zio_dest, NULL, NULL, NULL, 0);
 134         zio_link_cache = kmem_cache_create("zio_link_cache",
 135             sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 136         zio_vdev_cache = kmem_cache_create("zio_vdev_cache", sizeof (vdev_io_t),
 137             PAGESIZE, NULL, NULL, NULL, NULL, NULL, 0);
 138
 139         /*
 140          * For small buffers, we want a cache for each multiple of
 141          * SPA_MINBLOCKSIZE.  For medium-size buffers, we want a cache
 142          * for each quarter-power of 2.  For large buffers, we want
 143          * a cache for each multiple of PAGESIZE.
 144          */
 145         for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
 146                 size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
 147                 size_t p2 = size;
 148                 size_t align = 0;
 149
 150                 while (p2 & (p2 - 1))
 151                         p2 &= p2 - 1;
 152
 153 #ifndef _KERNEL
 154                 /*
 155                  * If we are using watchpoints, put each buffer on its own page,
 156                  * to eliminate the performance overhead of trapping to the
 157                  * kernel when modifying a non-watched buffer that shares the
 158                  * page with a watched buffer.
 159                  */
 160                 if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
 161                         continue;
 162 #endif
 163                 if (size <= 4 * SPA_MINBLOCKSIZE) {
 164                         align = SPA_MINBLOCKSIZE;
 165                 } else if (IS_P2ALIGNED(size, PAGESIZE)) {
 166                         align = PAGESIZE;
 167                 } else if (IS_P2ALIGNED(size, p2 >> 2)) {
 168                         align = p2 >> 2;
 169                 }
 170
 171                 if (align != 0) {
 172                         char name[36];
 173                         int flags = zio_bulk_flags;
 174
 175                         (void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
 176                         zio_buf_cache[c] = kmem_cache_create(name, size,
 177                             align, NULL, NULL, NULL, NULL, NULL, flags);
 178
 179                         (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
 180                         zio_data_buf_cache[c] = kmem_cache_create(name, size,
 181                             align, NULL, NULL, NULL, NULL,
 182                             data_alloc_arena, flags);
 183                 }
 184         }
 185
 186         while (--c != 0) {
 187                 ASSERT(zio_buf_cache[c] != NULL);
 188                 if (zio_buf_cache[c - 1] == NULL)
 189                         zio_buf_cache[c - 1] = zio_buf_cache[c];
 190
 191                 ASSERT(zio_data_buf_cache[c] != NULL);
 192                 if (zio_data_buf_cache[c - 1] == NULL)
 193                         zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
 194         }
 195
 196         /*
 197          * The zio write taskqs have 1 thread per cpu, allow 1/2 of the taskqs
 198          * to fail 3 times per txg or 8 failures, whichever is greater.
 199          */
 200         if (zfs_mg_alloc_failures == 0)
 201                 zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8);
 202
 203         zio_inject_init();
 204
 205         lz4_init();
 206 }
 207
 208 void
 209 zio_fini(void)
 210 {
 211         size_t c;
 212         kmem_cache_t *last_cache = NULL;
 213         kmem_cache_t *last_data_cache = NULL;
 214
 215         for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
 216                 if (zio_buf_cache[c] != last_cache) {
 217                         last_cache = zio_buf_cache[c];
 218                         kmem_cache_destroy(zio_buf_cache[c]);
 219                 }
 220                 zio_buf_cache[c] = NULL;
 221
 222                 if (zio_data_buf_cache[c] != last_data_cache) {
 223                         last_data_cache = zio_data_buf_cache[c];
 224                         kmem_cache_destroy(zio_data_buf_cache[c]);
 225                 }
 226                 zio_data_buf_cache[c] = NULL;
 227         }
 228
 229         kmem_cache_destroy(zio_vdev_cache);
 230         kmem_cache_destroy(zio_link_cache);
 231         kmem_cache_destroy(zio_cache);
 232
 233         zio_inject_fini();
 234
 235         lz4_fini();
 236 }
 237
 238 /*
 239  * ==========================================================================
 240  * Allocate and free I/O buffers
 241  * ==========================================================================
 242  */
 243
 244 /*
 245  * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
 246  * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
 247  * useful to inspect ZFS metadata, but if possible, we should avoid keeping
 248  * excess / transient data in-core during a crashdump.
 249  */
 250 void *
 251 zio_buf_alloc(size_t size)
 252 {
 253         size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 254
 255         ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 256
 257         return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE | KM_NODEBUG));
 258 }
 259
 260 /*
 261  * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
 262  * crashdump if the kernel panics.  This exists so that we will limit the amount
 263  * of ZFS data that shows up in a kernel crashdump.  (Thus reducing the amount
 264  * of kernel heap dumped to disk when the kernel panics)
 265  */
 266 void *
 267 zio_data_buf_alloc(size_t size)
 268 {
 269         size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 270
 271         ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 272
 273         return (kmem_cache_alloc(zio_data_buf_cache[c],
 274             KM_PUSHPAGE | KM_NODEBUG));
 275 }
 276
 277 void
 278 zio_buf_free(void *buf, size_t size)
 279 {
 280         size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 281
 282         ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 283
 284         kmem_cache_free(zio_buf_cache[c], buf);
 285 }
 286
 287 void
 288 zio_data_buf_free(void *buf, size_t size)
 289 {
 290         size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 291
 292         ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 293
 294         kmem_cache_free(zio_data_buf_cache[c], buf);
 295 }
 296
 297 /*
 298  * Dedicated I/O buffers to ensure that memory fragmentation never prevents
 299  * or significantly delays the issuing of a zio.   These buffers are used
 300  * to aggregate I/O and could be used for raidz stripes.
 301  */
 302 void *
 303 zio_vdev_alloc(void)
 304 {
 305         return (kmem_cache_alloc(zio_vdev_cache, KM_PUSHPAGE));
 306 }
 307
 308 void
 309 zio_vdev_free(void *buf)
 310 {
 311         kmem_cache_free(zio_vdev_cache, buf);
 312
 313 }
 314
 315 /*
 316  * ==========================================================================
 317  * Push and pop I/O transform buffers
 318  * ==========================================================================
 319  */
 320 static void
 321 zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
 322         zio_transform_func_t *transform)
 323 {
 324         zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_PUSHPAGE);
 325
 326         zt->zt_orig_data = zio->io_data;
 327         zt->zt_orig_size = zio->io_size;
 328         zt->zt_bufsize = bufsize;
 329         zt->zt_transform = transform;
 330
 331         zt->zt_next = zio->io_transform_stack;
 332         zio->io_transform_stack = zt;
 333
 334         zio->io_data = data;
 335         zio->io_size = size;
 336 }
 337
 338 static void
 339 zio_pop_transforms(zio_t *zio)
 340 {
 341         zio_transform_t *zt;
 342
 343         while ((zt = zio->io_transform_stack) != NULL) {
 344                 if (zt->zt_transform != NULL)
 345                         zt->zt_transform(zio,
 346                             zt->zt_orig_data, zt->zt_orig_size);
 347
 348                 if (zt->zt_bufsize != 0)
 349                         zio_buf_free(zio->io_data, zt->zt_bufsize);
 350
 351                 zio->io_data = zt->zt_orig_data;
 352                 zio->io_size = zt->zt_orig_size;
 353                 zio->io_transform_stack = zt->zt_next;
 354
 355                 kmem_free(zt, sizeof (zio_transform_t));
 356         }
 357 }
 358
 359 /*
 360  * ==========================================================================
 361  * I/O transform callbacks for subblocks and decompression
 362  * ==========================================================================
 363  */
 364 static void
 365 zio_subblock(zio_t *zio, void *data, uint64_t size)
 366 {
 367         ASSERT(zio->io_size > size);
 368
 369         if (zio->io_type == ZIO_TYPE_READ)
 370                 bcopy(zio->io_data, data, size);
 371 }
 372
 373 static void
 374 zio_decompress(zio_t *zio, void *data, uint64_t size)
 375 {
 376         if (zio->io_error == 0 &&
 377             zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
 378             zio->io_data, data, zio->io_size, size) != 0)
 379                 zio->io_error = SET_ERROR(EIO);
 380 }
 381
 382 /*
 383  * ==========================================================================
 384  * I/O parent/child relationships and pipeline interlocks
 385  * ==========================================================================
 386  */
 387 /*
 388  * NOTE - Callers to zio_walk_parents() and zio_walk_children must
 389  *        continue calling these functions until they return NULL.
 390  *        Otherwise, the next caller will pick up the list walk in
 391  *        some indeterminate state.  (Otherwise every caller would
 392  *        have to pass in a cookie to keep the state represented by
 393  *        io_walk_link, which gets annoying.)
 394  */
 395 zio_t *
 396 zio_walk_parents(zio_t *cio)
 397 {
 398         zio_link_t *zl = cio->io_walk_link;
 399         list_t *pl = &cio->io_parent_list;
 400
 401         zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
 402         cio->io_walk_link = zl;
 403
 404         if (zl == NULL)
 405                 return (NULL);
 406
 407         ASSERT(zl->zl_child == cio);
 408         return (zl->zl_parent);
 409 }
 410
 411 zio_t *
 412 zio_walk_children(zio_t *pio)
 413 {
 414         zio_link_t *zl = pio->io_walk_link;
 415         list_t *cl = &pio->io_child_list;
 416
 417         zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
 418         pio->io_walk_link = zl;
 419
 420         if (zl == NULL)
 421                 return (NULL);
 422
 423         ASSERT(zl->zl_parent == pio);
 424         return (zl->zl_child);
 425 }
 426
 427 zio_t *
 428 zio_unique_parent(zio_t *cio)
 429 {
 430         zio_t *pio = zio_walk_parents(cio);
 431
 432         VERIFY(zio_walk_parents(cio) == NULL);
 433         return (pio);
 434 }
 435
 436 void
 437 zio_add_child(zio_t *pio, zio_t *cio)
 438 {
 439         zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_PUSHPAGE);
 440         int w;
 441
 442         /*
 443          * Logical I/Os can have logical, gang, or vdev children.
 444          * Gang I/Os can have gang or vdev children.
 445          * Vdev I/Os can only have vdev children.
 446          * The following ASSERT captures all of these constraints.
 447          */
 448         ASSERT(cio->io_child_type <= pio->io_child_type);
 449
 450         zl->zl_parent = pio;
 451         zl->zl_child = cio;
 452
 453         mutex_enter(&cio->io_lock);
 454         mutex_enter(&pio->io_lock);
 455
 456         ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
 457
 458         for (w = 0; w < ZIO_WAIT_TYPES; w++)
 459                 pio->io_children[cio->io_child_type][w] += !cio->io_state[w];
 460
 461         list_insert_head(&pio->io_child_list, zl);
 462         list_insert_head(&cio->io_parent_list, zl);
 463
 464         pio->io_child_count++;
 465         cio->io_parent_count++;
 466
 467         mutex_exit(&pio->io_lock);
 468         mutex_exit(&cio->io_lock);
 469 }
 470
 471 static void
 472 zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
 473 {
 474         ASSERT(zl->zl_parent == pio);
 475         ASSERT(zl->zl_child == cio);
 476
 477         mutex_enter(&cio->io_lock);
 478         mutex_enter(&pio->io_lock);
 479
 480         list_remove(&pio->io_child_list, zl);
 481         list_remove(&cio->io_parent_list, zl);
 482
 483         pio->io_child_count--;
 484         cio->io_parent_count--;
 485
 486         mutex_exit(&pio->io_lock);
 487         mutex_exit(&cio->io_lock);
 488
 489         kmem_cache_free(zio_link_cache, zl);
 490 }
 491
 492 static boolean_t
 493 zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
 494 {
 495         uint64_t *countp = &zio->io_children[child][wait];
 496         boolean_t waiting = B_FALSE;
 497
 498         mutex_enter(&zio->io_lock);
 499         ASSERT(zio->io_stall == NULL);
 500         if (*countp != 0) {
 501                 zio->io_stage >>= 1;
 502                 zio->io_stall = countp;
 503                 waiting = B_TRUE;
 504         }
 505         mutex_exit(&zio->io_lock);
 506
 507         return (waiting);
 508 }
 509
/*
 * Tell parent 'pio' that child 'zio' has reached the 'wait' state.
 *
 * Under the parent's lock: fold the child's error into the parent's
 * per-child-type error slot (unless the child is flagged
 * ZIO_FLAG_DONT_PROPAGATE), merge the child's reexecute bits, and
 * decrement the parent's outstanding counter for this child type and
 * wait type.  If that counter reaches zero and it is the one the parent
 * is stalled on, the stall is cleared and the parent is executed.
 */
__attribute__((always_inline))
static inline void
zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
{
	uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
	int *errorp = &pio->io_child_error[zio->io_child_type];

	mutex_enter(&pio->io_lock);
	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
		*errorp = zio_worst_error(*errorp, zio->io_error);
	pio->io_reexecute |= zio->io_reexecute;
	ASSERT3U(*countp, >, 0);

	(*countp)--;

	if (*countp == 0 && pio->io_stall == countp) {
		pio->io_stall = NULL;
		/* Drop the lock before running the parent. */
		mutex_exit(&pio->io_lock);
		__zio_execute(pio);
	} else {
		mutex_exit(&pio->io_lock);
	}
}
 533
 534 static void
 535 zio_inherit_child_errors(zio_t *zio, enum zio_child c)
 536 {
 537         if (zio->io_child_error[c] != 0 && zio->io_error == 0)
 538                 zio->io_error = zio->io_child_error[c];
 539 }
 540
 541 /*
 542  * ==========================================================================
 543  * Create the various types of I/O (read, write, free, etc)
 544  * ==========================================================================
 545  */
/*
 * Common constructor behind all of the zio_*() creation routines.
 *
 * Takes a zio_t from zio_cache, (re)initializes every per-I/O field from
 * the arguments, and, when 'pio' is non-NULL, links the new zio under it
 * with zio_add_child().  'stage' and 'pipeline' become both the current
 * and the original stage/pipeline.  Returns the new zio.
 */
static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_type_t type, zio_priority_t priority, enum zio_flag flags,
    vdev_t *vd, uint64_t offset, const zbookmark_t *zb,
    enum zio_stage stage, enum zio_stage pipeline)
{
	zio_t *zio;

	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);

	ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
	ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
	ASSERT(vd || stage == ZIO_STAGE_OPEN);

	zio = kmem_cache_alloc(zio_cache, KM_PUSHPAGE);

	/* Classify the zio: a vdev wins, then the GANG/DDT child flags. */
	if (vd != NULL)
		zio->io_child_type = ZIO_CHILD_VDEV;
	else if (flags & ZIO_FLAG_GANG_CHILD)
		zio->io_child_type = ZIO_CHILD_GANG;
	else if (flags & ZIO_FLAG_DDT_CHILD)
		zio->io_child_type = ZIO_CHILD_DDT;
	else
		zio->io_child_type = ZIO_CHILD_LOGICAL;

	if (bp != NULL) {
		zio->io_logical = NULL;
		zio->io_bp = (blkptr_t *)bp;
		zio->io_bp_copy = *bp;
		zio->io_bp_orig = *bp;
		/*
		 * Only non-DDT writes keep pointing at the caller's bp;
		 * everything else works on the private copy.
		 */
		if (type != ZIO_TYPE_WRITE ||
		    zio->io_child_type == ZIO_CHILD_DDT)
			zio->io_bp = &zio->io_bp_copy;	/* so caller can free */
		if (zio->io_child_type == ZIO_CHILD_LOGICAL)
			zio->io_logical = zio;
		if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
			pipeline |= ZIO_GANG_STAGES;
	} else {
		zio->io_logical = NULL;
		zio->io_bp = NULL;
		bzero(&zio->io_bp_copy, sizeof (blkptr_t));
		bzero(&zio->io_bp_orig, sizeof (blkptr_t));
	}

	/*
	 * Every field is assigned explicitly: the object comes from a kmem
	 * cache and nothing visible here clears it when it is freed.
	 */
	zio->io_spa = spa;
	zio->io_txg = txg;
	zio->io_ready = NULL;
	zio->io_physdone = NULL;
	zio->io_done = done;
	zio->io_private = private;
	zio->io_prev_space_delta = 0;
	zio->io_type = type;
	zio->io_priority = priority;
	zio->io_vd = vd;
	zio->io_vsd = NULL;
	zio->io_vsd_ops = NULL;
	zio->io_offset = offset;
	zio->io_timestamp = 0;
	zio->io_delta = 0;
	zio->io_delay = 0;
	zio->io_orig_data = zio->io_data = data;
	zio->io_orig_size = zio->io_size = size;
	zio->io_orig_flags = zio->io_flags = flags;
	zio->io_orig_stage = zio->io_stage = stage;
	zio->io_orig_pipeline = zio->io_pipeline = pipeline;
	bzero(&zio->io_prop, sizeof (zio_prop_t));
	zio->io_cmd = 0;
	zio->io_reexecute = 0;
	zio->io_bp_override = NULL;
	zio->io_walk_link = NULL;
	zio->io_transform_stack = NULL;
	zio->io_error = 0;
	zio->io_child_count = 0;
	zio->io_phys_children = 0;
	zio->io_parent_count = 0;
	zio->io_stall = NULL;
	zio->io_gang_leader = NULL;
	zio->io_gang_tree = NULL;
	zio->io_executor = NULL;
	zio->io_waiter = NULL;
	zio->io_cksum_report = NULL;
	zio->io_ena = 0;
	bzero(zio->io_child_error, sizeof (int) * ZIO_CHILD_TYPES);
	bzero(zio->io_children,
	    sizeof (uint64_t) * ZIO_CHILD_TYPES * ZIO_WAIT_TYPES);
	bzero(&zio->io_bookmark, sizeof (zbookmark_t));

	/* A zio created at or past READY/DONE counts as already there. */
	zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
	zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);

	if (zb != NULL)
		zio->io_bookmark = *zb;

	if (pio != NULL) {
		/* Inherit the logical zio and gang leader from the parent. */
		if (zio->io_logical == NULL)
			zio->io_logical = pio->io_logical;
		if (zio->io_child_type == ZIO_CHILD_GANG)
			zio->io_gang_leader = pio->io_gang_leader;
		zio_add_child(pio, zio);
	}

	taskq_init_ent(&zio->io_tqent);

	return (zio);
}
 654
 655 static void
 656 zio_destroy(zio_t *zio)
 657 {
 658         kmem_cache_free(zio_cache, zio);
 659 }
 660
 661 zio_t *
 662 zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
 663     void *private, enum zio_flag flags)
 664 {
 665         zio_t *zio;
 666
 667         zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
 668             ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
 669             ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);
 670
 671         return (zio);
 672 }
 673
 674 zio_t *
 675 zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
 676 {
 677         return (zio_null(NULL, spa, NULL, done, private, flags));
 678 }
 679
 680 zio_t *
 681 zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
 682     void *data, uint64_t size, zio_done_func_t *done, void *private,
 683     zio_priority_t priority, enum zio_flag flags, const zbookmark_t *zb)
 684 {
 685         zio_t *zio;
 686
 687         zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
 688             data, size, done, private,
 689             ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
 690             ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
 691             ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);
 692
 693         return (zio);
 694 }
 695
 696 zio_t *
 697 zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
 698     void *data, uint64_t size, const zio_prop_t *zp,
 699     zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done,
 700     void *private,
 701     zio_priority_t priority, enum zio_flag flags, const zbookmark_t *zb)
 702 {
 703         zio_t *zio;
 704
 705         ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
 706             zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
 707             zp->zp_compress >= ZIO_COMPRESS_OFF &&
 708             zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
 709             DMU_OT_IS_VALID(zp->zp_type) &&
 710             zp->zp_level < 32 &&
 711             zp->zp_copies > 0 &&
 712             zp->zp_copies <= spa_max_replication(spa));
 713
 714         zio = zio_create(pio, spa, txg, bp, data, size, done, private,
 715             ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
 716             ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
 717             ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
 718
 719         zio->io_ready = ready;
 720         zio->io_physdone = physdone;
 721         zio->io_prop = *zp;
 722
 723         return (zio);
 724 }
 725
 726 zio_t *
 727 zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
 728     uint64_t size, zio_done_func_t *done, void *private,
 729     zio_priority_t priority, enum zio_flag flags, zbookmark_t *zb)
 730 {
 731         zio_t *zio;
 732
 733         zio = zio_create(pio, spa, txg, bp, data, size, done, private,
 734             ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
 735             ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
 736
 737         return (zio);
 738 }
 739
 740 void
 741 zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
 742 {
 743         ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 744         ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 745         ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
 746         ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
 747
 748         /*
 749          * We must reset the io_prop to match the values that existed
 750          * when the bp was first written by dmu_sync() keeping in mind
 751          * that nopwrite and dedup are mutually exclusive.
 752          */
 753         zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
 754         zio->io_prop.zp_nopwrite = nopwrite;
 755         zio->io_prop.zp_copies = copies;
 756         zio->io_bp_override = bp;
 757 }
 758
 759 void
 760 zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
 761 {
 762         metaslab_check_free(spa, bp);
 763
 764         /*
 765          * Frees that are for the currently-syncing txg, are not going to be
 766          * deferred, and which will not need to do a read (i.e. not GANG or
 767          * DEDUP), can be processed immediately.  Otherwise, put them on the
 768          * in-memory list for later processing.
 769          */
 770         if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
 771             txg != spa->spa_syncing_txg ||
 772             spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) {
 773                 bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
 774         } else {
 775                 VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp, 0)));
 776         }
 777 }
 778
 779 zio_t *
 780 zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 781     enum zio_flag flags)
 782 {
 783         zio_t *zio;
 784         enum zio_stage stage = ZIO_FREE_PIPELINE;
 785
 786         dprintf_bp(bp, "freeing in txg %llu, pass %u",
 787             (longlong_t)txg, spa->spa_sync_pass);
 788
 789         ASSERT(!BP_IS_HOLE(bp));
 790         ASSERT(spa_syncing_txg(spa) == txg);
 791         ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);
 792
 793         metaslab_check_free(spa, bp);
 794         arc_freed(spa, bp);
 795
 796         /*
 797          * GANG and DEDUP blocks can induce a read (for the gang block header,
 798          * or the DDT), so issue them asynchronously so that this thread is
 799          * not tied up.
 800          */
 801         if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
 802                 stage |= ZIO_STAGE_ISSUE_ASYNC;
 803
 804         zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
 805             NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags,
 806             NULL, 0, NULL, ZIO_STAGE_OPEN, stage);
 807
 808         return (zio);
 809 }
 810
 811 zio_t *
 812 zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 813     zio_done_func_t *done, void *private, enum zio_flag flags)
 814 {
 815         zio_t *zio;
 816
 817         /*
 818          * A claim is an allocation of a specific block.  Claims are needed
 819          * to support immediate writes in the intent log.  The issue is that
 820          * immediate writes contain committed data, but in a txg that was
 821          * *not* committed.  Upon opening the pool after an unclean shutdown,
 822          * the intent log claims all blocks that contain immediate write data
 823          * so that the SPA knows they're in use.
 824          *
 825          * All claims *must* be resolved in the first txg -- before the SPA
 826          * starts allocating blocks -- so that nothing is allocated twice.
 827          * If txg == 0 we just verify that the block is claimable.
 828          */
 829         ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
 830         ASSERT(txg == spa_first_txg(spa) || txg == 0);
 831         ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));       /* zdb(1M) */
 832
 833         zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
 834             done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
 835             NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
 836
 837         return (zio);
 838 }
 839
 840 zio_t *
 841 zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
 842     zio_done_func_t *done, void *private, enum zio_flag flags)
 843 {
 844         zio_t *zio;
 845         int c;
 846
 847         if (vd->vdev_children == 0) {
 848                 zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
 849                     ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
 850                     ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
 851
 852                 zio->io_cmd = cmd;
 853         } else {
 854                 zio = zio_null(pio, spa, NULL, NULL, NULL, flags);
 855
 856                 for (c = 0; c < vd->vdev_children; c++)
 857                         zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
 858                             done, private, flags));
 859         }
 860
 861         return (zio);
 862 }
 863
 864 zio_t *
 865 zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
 866     void *data, int checksum, zio_done_func_t *done, void *private,
 867     zio_priority_t priority, enum zio_flag flags, boolean_t labels)
 868 {
 869         zio_t *zio;
 870
 871         ASSERT(vd->vdev_children == 0);
 872         ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
 873             offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
 874         ASSERT3U(offset + size, <=, vd->vdev_psize);
 875
 876         zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
 877             ZIO_TYPE_READ, priority, flags, vd, offset, NULL,
 878             ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
 879
 880         zio->io_prop.zp_checksum = checksum;
 881
 882         return (zio);
 883 }
 884
 885 zio_t *
 886 zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
 887     void *data, int checksum, zio_done_func_t *done, void *private,
 888     zio_priority_t priority, enum zio_flag flags, boolean_t labels)
 889 {
 890         zio_t *zio;
 891
 892         ASSERT(vd->vdev_children == 0);
 893         ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
 894             offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
 895         ASSERT3U(offset + size, <=, vd->vdev_psize);
 896
 897         zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
 898             ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL,
 899             ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
 900
 901         zio->io_prop.zp_checksum = checksum;
 902
 903         if (zio_checksum_table[checksum].ci_eck) {
 904                 /*
 905                  * zec checksums are necessarily destructive -- they modify
 906                  * the end of the write buffer to hold the verifier/checksum.
 907                  * Therefore, we must make a local copy in case the data is
 908                  * being written to multiple places in parallel.
 909                  */
 910                 void *wbuf = zio_buf_alloc(size);
 911                 bcopy(data, wbuf, size);
 912                 zio_push_transform(zio, wbuf, size, size, NULL);
 913         }
 914
 915         return (zio);
 916 }
 917
 918 /*
 919  * Create a child I/O to do some work for us.
 920  */
 921 zio_t *
 922 zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
 923         void *data, uint64_t size, int type, zio_priority_t priority,
 924         enum zio_flag flags, zio_done_func_t *done, void *private)
 925 {
 926         enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
 927         zio_t *zio;
 928
 929         ASSERT(vd->vdev_parent ==
 930             (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));
 931
 932         if (type == ZIO_TYPE_READ && bp != NULL) {
 933                 /*
 934                  * If we have the bp, then the child should perform the
 935                  * checksum and the parent need not.  This pushes error
 936                  * detection as close to the leaves as possible and
 937                  * eliminates redundant checksums in the interior nodes.
 938                  */
 939                 pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
 940                 pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
 941         }
 942
 943         if (vd->vdev_children == 0)
 944                 offset += VDEV_LABEL_START_SIZE;
 945
 946         flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;
 947
 948         /*
 949          * If we've decided to do a repair, the write is not speculative --
 950          * even if the original read was.
 951          */
 952         if (flags & ZIO_FLAG_IO_REPAIR)
 953                 flags &= ~ZIO_FLAG_SPECULATIVE;
 954
 955         zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
 956             done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
 957             ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
 958
 959         zio->io_physdone = pio->io_physdone;
 960         if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
 961                 zio->io_logical->io_phys_children++;
 962
 963         return (zio);
 964 }
 965
 966 zio_t *
 967 zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
 968         int type, zio_priority_t priority, enum zio_flag flags,
 969         zio_done_func_t *done, void *private)
 970 {
 971         zio_t *zio;
 972
 973         ASSERT(vd->vdev_ops->vdev_op_leaf);
 974
 975         zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
 976             data, size, done, private, type, priority,
 977             flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
 978             vd, offset, NULL,
 979             ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
 980
 981         return (zio);
 982 }
 983
 984 void
 985 zio_flush(zio_t *zio, vdev_t *vd)
 986 {
 987         zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
 988             NULL, NULL,
 989             ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
 990 }
 991
 992 void
 993 zio_shrink(zio_t *zio, uint64_t size)
 994 {
 995         ASSERT(zio->io_executor == NULL);
 996         ASSERT(zio->io_orig_size == zio->io_size);
 997         ASSERT(size <= zio->io_size);
 998
 999         /*
1000          * We don't shrink for raidz because of problems with the
1001          * reconstruction when reading back less than the block size.
1002          * Note, BP_IS_RAIDZ() assumes no compression.
1003          */
1004         ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
1005         if (!BP_IS_RAIDZ(zio->io_bp))
1006                 zio->io_orig_size = zio->io_size = size;
1007 }
1008
1009 /*
1010  * ==========================================================================
1011  * Prepare to read and write logical blocks
1012  * ==========================================================================
1013  */
1014
1015 static int
1016 zio_read_bp_init(zio_t *zio)
1017 {
1018         blkptr_t *bp = zio->io_bp;
1019
1020         if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
1021             zio->io_child_type == ZIO_CHILD_LOGICAL &&
1022             !(zio->io_flags & ZIO_FLAG_RAW)) {
1023                 uint64_t psize = BP_GET_PSIZE(bp);
1024                 void *cbuf = zio_buf_alloc(psize);
1025
1026                 zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
1027         }
1028
1029         if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
1030                 zio->io_flags |= ZIO_FLAG_DONT_CACHE;
1031
1032         if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
1033                 zio->io_flags |= ZIO_FLAG_DONT_CACHE;
1034
1035         if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
1036                 zio->io_pipeline = ZIO_DDT_READ_PIPELINE;
1037
1038         return (ZIO_PIPELINE_CONTINUE);
1039 }
1040
1041 static int
1042 zio_write_bp_init(zio_t *zio)
1043 {
1044         spa_t *spa = zio->io_spa;
1045         zio_prop_t *zp = &zio->io_prop;
1046         enum zio_compress compress = zp->zp_compress;
1047         blkptr_t *bp = zio->io_bp;
1048         uint64_t lsize = zio->io_size;
1049         uint64_t psize = lsize;
1050         int pass = 1;
1051
1052         /*
1053          * If our children haven't all reached the ready stage,
1054          * wait for them and then repeat this pipeline stage.
1055          */
1056         if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
1057             zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
1058                 return (ZIO_PIPELINE_STOP);
1059
1060         if (!IO_IS_ALLOCATING(zio))
1061                 return (ZIO_PIPELINE_CONTINUE);
1062
1063         ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
1064
1065         if (zio->io_bp_override) {
1066                 ASSERT(bp->blk_birth != zio->io_txg);
1067                 ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
1068
1069                 *bp = *zio->io_bp_override;
1070                 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1071
1072                 /*
1073                  * If we've been overridden and nopwrite is set then
1074                  * set the flag accordingly to indicate that a nopwrite
1075                  * has already occurred.
1076                  */
1077                 if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
1078                         ASSERT(!zp->zp_dedup);
1079                         zio->io_flags |= ZIO_FLAG_NOPWRITE;
1080                         return (ZIO_PIPELINE_CONTINUE);
1081                 }
1082
1083                 ASSERT(!zp->zp_nopwrite);
1084
1085                 if (BP_IS_HOLE(bp) || !zp->zp_dedup)
1086                         return (ZIO_PIPELINE_CONTINUE);
1087
1088                 ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup ||
1089                     zp->zp_dedup_verify);
1090
1091                 if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
1092                         BP_SET_DEDUP(bp, 1);
1093                         zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
1094                         return (ZIO_PIPELINE_CONTINUE);
1095                 }
1096                 zio->io_bp_override = NULL;
1097                 BP_ZERO(bp);
1098         }
1099
1100         if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
1101                 /*
1102                  * We're rewriting an existing block, which means we're
1103                  * working on behalf of spa_sync().  For spa_sync() to
1104                  * converge, it must eventually be the case that we don't
1105                  * have to allocate new blocks.  But compression changes
1106                  * the blocksize, which forces a reallocate, and makes
1107                  * convergence take longer.  Therefore, after the first
1108                  * few passes, stop compressing to ensure convergence.
1109                  */
1110                 pass = spa_sync_pass(spa);
1111
1112                 ASSERT(zio->io_txg == spa_syncing_txg(spa));
1113                 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1114                 ASSERT(!BP_GET_DEDUP(bp));
1115
1116                 if (pass >= zfs_sync_pass_dont_compress)
1117                         compress = ZIO_COMPRESS_OFF;
1118
1119                 /* Make sure someone doesn't change their mind on overwrites */
1120                 ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp),
1121                     spa_max_replication(spa)) == BP_GET_NDVAS(bp));
1122         }
1123
1124         if (compress != ZIO_COMPRESS_OFF) {
1125                 void *cbuf = zio_buf_alloc(lsize);
1126                 psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
1127                 if (psize == 0 || psize == lsize) {
1128                         compress = ZIO_COMPRESS_OFF;
1129                         zio_buf_free(cbuf, lsize);
1130                 } else {
1131                         ASSERT(psize < lsize);
1132                         zio_push_transform(zio, cbuf, psize, lsize, NULL);
1133                 }
1134         }
1135
1136         /*
1137          * The final pass of spa_sync() must be all rewrites, but the first
1138          * few passes offer a trade-off: allocating blocks defers convergence,
1139          * but newly allocated blocks are sequential, so they can be written
1140          * to disk faster.  Therefore, we allow the first few passes of
1141          * spa_sync() to allocate new blocks, but force rewrites after that.
1142          * There should only be a handful of blocks after pass 1 in any case.
1143          */
1144         if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
1145             BP_GET_PSIZE(bp) == psize &&
1146             pass >= zfs_sync_pass_rewrite) {
1147                 enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
1148                 ASSERT(psize != 0);
1149                 zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
1150                 zio->io_flags |= ZIO_FLAG_IO_REWRITE;
1151         } else {
1152                 BP_ZERO(bp);
1153                 zio->io_pipeline = ZIO_WRITE_PIPELINE;
1154         }
1155
1156         if (psize == 0) {
1157                 if (zio->io_bp_orig.blk_birth != 0 &&
1158                     spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
1159                         BP_SET_LSIZE(bp, lsize);
1160                         BP_SET_TYPE(bp, zp->zp_type);
1161                         BP_SET_LEVEL(bp, zp->zp_level);
1162                         BP_SET_BIRTH(bp, zio->io_txg, 0);
1163                 }
1164                 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1165         } else {
1166                 ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
1167                 BP_SET_LSIZE(bp, lsize);
1168                 BP_SET_TYPE(bp, zp->zp_type);
1169                 BP_SET_LEVEL(bp, zp->zp_level);
1170                 BP_SET_PSIZE(bp, psize);
1171                 BP_SET_COMPRESS(bp, compress);
1172                 BP_SET_CHECKSUM(bp, zp->zp_checksum);
1173                 BP_SET_DEDUP(bp, zp->zp_dedup);
1174                 BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
1175                 if (zp->zp_dedup) {
1176                         ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1177                         ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
1178                         zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
1179                 }
1180                 if (zp->zp_nopwrite) {
1181                         ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1182                         ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
1183                         zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
1184                 }
1185         }
1186
1187         return (ZIO_PIPELINE_CONTINUE);
1188 }
1189
1190 static int
1191 zio_free_bp_init(zio_t *zio)
1192 {
1193         blkptr_t *bp = zio->io_bp;
1194
1195         if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
1196                 if (BP_GET_DEDUP(bp))
1197                         zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
1198         }
1199
1200         return (ZIO_PIPELINE_CONTINUE);
1201 }
1202
1203 /*
1204  * ==========================================================================
1205  * Execute the I/O pipeline
1206  * ==========================================================================
1207  */
1208
1209 static void
1210 zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
1211 {
1212         spa_t *spa = zio->io_spa;
1213         zio_type_t t = zio->io_type;
1214         int flags = (cutinline ? TQ_FRONT : 0);
1215
1216         /*
1217          * If we're a config writer or a probe, the normal issue and
1218          * interrupt threads may all be blocked waiting for the config lock.
1219          * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
1220          */
1221         if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
1222                 t = ZIO_TYPE_NULL;
1223
1224         /*
1225          * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
1226          */
1227         if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
1228                 t = ZIO_TYPE_NULL;
1229
1230         /*
1231          * If this is a high priority I/O, then use the high priority taskq if
1232          * available.
1233          */
1234         if (zio->io_priority == ZIO_PRIORITY_NOW &&
1235             spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
1236                 q++;
1237
1238         ASSERT3U(q, <, ZIO_TASKQ_TYPES);
1239
1240         /*
1241          * NB: We are assuming that the zio can only be dispatched
1242          * to a single taskq at a time.  It would be a grievous error
1243          * to dispatch the zio to another taskq at the same time.
1244          */
1245         ASSERT(taskq_empty_ent(&zio->io_tqent));
1246         spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
1247             flags, &zio->io_tqent);
1248 }
1249
1250 static boolean_t
1251 zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
1252 {
1253         kthread_t *executor = zio->io_executor;
1254         spa_t *spa = zio->io_spa;
1255         zio_type_t t;
1256
1257         for (t = 0; t < ZIO_TYPES; t++) {
1258                 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
1259                 uint_t i;
1260                 for (i = 0; i < tqs->stqs_count; i++) {
1261                         if (taskq_member(tqs->stqs_taskq[i], executor))
1262                                 return (B_TRUE);
1263                 }
1264         }
1265
1266         return (B_FALSE);
1267 }
1268
1269 static int
1270 zio_issue_async(zio_t *zio)
1271 {
1272         zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
1273
1274         return (ZIO_PIPELINE_STOP);
1275 }
1276
1277 void
1278 zio_interrupt(zio_t *zio)
1279 {
1280         zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
1281 }
1282
/*
 * Execute the I/O pipeline until one of the following occurs:
 * (1) the I/O completes; (2) the pipeline stalls waiting for
 * dependent child I/Os; (3) the I/O issues, so we're waiting
 * for an I/O completion interrupt; (4) the I/O is delegated by
 * vdev-level caching or aggregation; (5) the I/O is deferred
 * due to vdev-level queueing; (6) the I/O is handed off to
 * another thread.  In all cases, the pipeline stops whenever
 * there's no CPU work; it never burns a thread in cv_wait_io().
 *
 * There's no locking on io_stage because there's no legitimate way
 * for multiple threads to be attempting to process the same I/O.
 *
 * The declaration below is a forward reference to the table of stage
 * handlers; __zio_execute() indexes it with highbit(stage) - 1.  The
 * table itself is presumably defined later in this file.
 */
static zio_pipe_stage_t *zio_pipeline[];
1297
1298 /*
1299  * zio_execute() is a wrapper around the static function
1300  * __zio_execute() so that we can force  __zio_execute() to be
1301  * inlined.  This reduces stack overhead which is important
1302  * because __zio_execute() is called recursively in several zio
1303  * code paths.  zio_execute() itself cannot be inlined because
1304  * it is externally visible.
1305  */
1306 void
1307 zio_execute(zio_t *zio)
1308 {
1309         __zio_execute(zio);
1310 }
1311
1312 __attribute__((always_inline))
1313 static inline void
1314 __zio_execute(zio_t *zio)
1315 {
1316         zio->io_executor = curthread;
1317
1318         while (zio->io_stage < ZIO_STAGE_DONE) {
1319                 enum zio_stage pipeline = zio->io_pipeline;
1320                 enum zio_stage stage = zio->io_stage;
1321                 dsl_pool_t *dp;
1322                 boolean_t cut;
1323                 int rv;
1324
1325                 ASSERT(!MUTEX_HELD(&zio->io_lock));
1326                 ASSERT(ISP2(stage));
1327                 ASSERT(zio->io_stall == NULL);
1328
1329                 do {
1330                         stage <<= 1;
1331                 } while ((stage & pipeline) == 0);
1332
1333                 ASSERT(stage <= ZIO_STAGE_DONE);
1334
1335                 dp = spa_get_dsl(zio->io_spa);
1336                 cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
1337                     zio_requeue_io_start_cut_in_line : B_FALSE;
1338
1339                 /*
1340                  * If we are in interrupt context and this pipeline stage
1341                  * will grab a config lock that is held across I/O,
1342                  * or may wait for an I/O that needs an interrupt thread
1343                  * to complete, issue async to avoid deadlock.
1344                  *
1345                  * For VDEV_IO_START, we cut in line so that the io will
1346                  * be sent to disk promptly.
1347                  */
1348                 if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
1349                     zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
1350                         zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
1351                         return;
1352                 }
1353
1354                 /*
1355                  * If we executing in the context of the tx_sync_thread,
1356                  * or we are performing pool initialization outside of a
1357                  * zio_taskq[ZIO_TASKQ_ISSUE|ZIO_TASKQ_ISSUE_HIGH] context.
1358                  * Then issue the zio asynchronously to minimize stack usage
1359                  * for these deep call paths.
1360                  */
1361                 if ((dp && curthread == dp->dp_tx.tx_sync_thread) ||
1362                     (dp && spa_is_initializing(dp->dp_spa) &&
1363                     !zio_taskq_member(zio, ZIO_TASKQ_ISSUE) &&
1364                     !zio_taskq_member(zio, ZIO_TASKQ_ISSUE_HIGH))) {
1365                         zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
1366                         return;
1367                 }
1368
1369                 zio->io_stage = stage;
1370                 rv = zio_pipeline[highbit(stage) - 1](zio);
1371
1372                 if (rv == ZIO_PIPELINE_STOP)
1373                         return;
1374
1375                 ASSERT(rv == ZIO_PIPELINE_CONTINUE);
1376         }
1377 }
1378
1379
1380 /*
1381  * ==========================================================================
1382  * Initiate I/O, either sync or async
1383  * ==========================================================================
1384  */
1385 int
1386 zio_wait(zio_t *zio)
1387 {
1388         int error;
1389
1390         ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
1391         ASSERT(zio->io_executor == NULL);
1392
1393         zio->io_waiter = curthread;
1394
1395         __zio_execute(zio);
1396
1397         mutex_enter(&zio->io_lock);
1398         while (zio->io_executor != NULL)
1399                 cv_wait_io(&zio->io_cv, &zio->io_lock);
1400         mutex_exit(&zio->io_lock);
1401
1402         error = zio->io_error;
1403         zio_destroy(zio);
1404
1405         return (error);
1406 }
1407
1408 void
1409 zio_nowait(zio_t *zio)
1410 {
1411         ASSERT(zio->io_executor == NULL);
1412
1413         if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
1414             zio_unique_parent(zio) == NULL) {
1415                 /*
1416                  * This is a logical async I/O with no parent to wait for it.
1417                  * We add it to the spa_async_root_zio "Godfather" I/O which
1418                  * will ensure they complete prior to unloading the pool.
1419                  */
1420                 spa_t *spa = zio->io_spa;
1421
1422                 zio_add_child(spa->spa_async_zio_root, zio);
1423         }
1424
1425         __zio_execute(zio);
1426 }
1427
1428 /*
1429  * ==========================================================================
1430  * Reexecute or suspend/resume failed I/O
1431  * ==========================================================================
1432  */
1433
1434 static void
1435 zio_reexecute(zio_t *pio)
1436 {
1437         zio_t *cio, *cio_next;
1438         int c, w;
1439
1440         ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
1441         ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
1442         ASSERT(pio->io_gang_leader == NULL);
1443         ASSERT(pio->io_gang_tree == NULL);
1444
1445         pio->io_flags = pio->io_orig_flags;
1446         pio->io_stage = pio->io_orig_stage;
1447         pio->io_pipeline = pio->io_orig_pipeline;
1448         pio->io_reexecute = 0;
1449         pio->io_flags |= ZIO_FLAG_REEXECUTED;
1450         pio->io_error = 0;
1451         for (w = 0; w < ZIO_WAIT_TYPES; w++)
1452                 pio->io_state[w] = 0;
1453         for (c = 0; c < ZIO_CHILD_TYPES; c++)
1454                 pio->io_child_error[c] = 0;
1455
1456         if (IO_IS_ALLOCATING(pio))
1457                 BP_ZERO(pio->io_bp);
1458
1459         /*
1460          * As we reexecute pio's children, new children could be created.
1461          * New children go to the head of pio's io_child_list, however,
1462          * so we will (correctly) not reexecute them.  The key is that
1463          * the remainder of pio's io_child_list, from 'cio_next' onward,
1464          * cannot be affected by any side effects of reexecuting 'cio'.
1465          */
1466         for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
1467                 cio_next = zio_walk_children(pio);
1468                 mutex_enter(&pio->io_lock);
1469                 for (w = 0; w < ZIO_WAIT_TYPES; w++)
1470                         pio->io_children[cio->io_child_type][w]++;
1471                 mutex_exit(&pio->io_lock);
1472                 zio_reexecute(cio);
1473         }
1474
1475         /*
1476          * Now that all children have been reexecuted, execute the parent.
1477          * We don't reexecute "The Godfather" I/O here as it's the
1478          * responsibility of the caller to wait on him.
1479          */
1480         if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
1481                 __zio_execute(pio);
1482 }
1483
1484 void
1485 zio_suspend(spa_t *spa, zio_t *zio)
1486 {
1487         if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
1488                 fm_panic("Pool '%s' has encountered an uncorrectable I/O "
1489                     "failure and the failure mode property for this pool "
1490                     "is set to panic.", spa_name(spa));
1491
1492         cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O "
1493             "failure and has been suspended.\n", spa_name(spa));
1494
1495         zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);
1496
1497         mutex_enter(&spa->spa_suspend_lock);
1498
1499         if (spa->spa_suspend_zio_root == NULL)
1500                 spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
1501                     ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
1502                     ZIO_FLAG_GODFATHER);
1503
1504         spa->spa_suspended = B_TRUE;
1505
1506         if (zio != NULL) {
1507                 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
1508                 ASSERT(zio != spa->spa_suspend_zio_root);
1509                 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1510                 ASSERT(zio_unique_parent(zio) == NULL);
1511                 ASSERT(zio->io_stage == ZIO_STAGE_DONE);
1512                 zio_add_child(spa->spa_suspend_zio_root, zio);
1513         }
1514
1515         mutex_exit(&spa->spa_suspend_lock);
1516 }
1517
1518 int
1519 zio_resume(spa_t *spa)
1520 {
1521         zio_t *pio;
1522
1523         /*
1524          * Reexecute all previously suspended i/o.
1525          */
1526         mutex_enter(&spa->spa_suspend_lock);
1527         spa->spa_suspended = B_FALSE;
1528         cv_broadcast(&spa->spa_suspend_cv);
1529         pio = spa->spa_suspend_zio_root;
1530         spa->spa_suspend_zio_root = NULL;
1531         mutex_exit(&spa->spa_suspend_lock);
1532
1533         if (pio == NULL)
1534                 return (0);
1535
1536         zio_reexecute(pio);
1537         return (zio_wait(pio));
1538 }
1539
1540 void
1541 zio_resume_wait(spa_t *spa)
1542 {
1543         mutex_enter(&spa->spa_suspend_lock);
1544         while (spa_suspended(spa))
1545                 cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
1546         mutex_exit(&spa->spa_suspend_lock);
1547 }
1548
1549 /*
1550  * ==========================================================================
1551  * Gang blocks.
1552  *
1553  * A gang block is a collection of small blocks that looks to the DMU
1554  * like one large block.  When zio_dva_allocate() cannot find a block
1555  * of the requested size, due to either severe fragmentation or the pool
1556  * being nearly full, it calls zio_write_gang_block() to construct the
1557  * block from smaller fragments.
1558  *
1559  * A gang block consists of a gang header (zio_gbh_phys_t) and up to
1560  * three (SPA_GBH_NBLKPTRS) gang members.  The gang header is just like
1561  * an indirect block: it's an array of block pointers.  It consumes
1562  * only one sector and hence is allocatable regardless of fragmentation.
1563  * The gang header's bps point to its gang members, which hold the data.
1564  *
1565  * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
1566  * as the verifier to ensure uniqueness of the SHA256 checksum.
1567  * Critically, the gang block bp's blk_cksum is the checksum of the data,
1568  * not the gang header.  This ensures that data block signatures (needed for
1569  * deduplication) are independent of how the block is physically stored.
1570  *
1571  * Gang blocks can be nested: a gang member may itself be a gang block.
1572  * Thus every gang block is a tree in which root and all interior nodes are
1573  * gang headers, and the leaves are normal blocks that contain user data.
1574  * The root of the gang tree is called the gang leader.
1575  *
1576  * To perform any operation (read, rewrite, free, claim) on a gang block,
1577  * zio_gang_assemble() first assembles the gang tree (minus data leaves)
1578  * in the io_gang_tree field of the original logical i/o by recursively
1579  * reading the gang leader and all gang headers below it.  This yields
1580  * an in-core tree containing the contents of every gang header and the
1581  * bps for every constituent of the gang block.
1582  *
1583  * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
1584  * and invokes a callback on each bp.  To free a gang block, zio_gang_issue()
1585  * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
1586  * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
1587  * zio_read_gang() is a wrapper around zio_read() that omits reading gang
1588  * headers, since we already have those in io_gang_tree.  zio_rewrite_gang()
1589  * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
1590  * of the gang header plus zio_checksum_compute() of the data to update the
1591  * gang header's blk_cksum as described above.
1592  *
1593  * The two-phase assemble/issue model solves the problem of partial failure --
1594  * what if you'd freed part of a gang block but then couldn't read the
1595  * gang header for another part?  Assembling the entire gang tree first
1596  * ensures that all the necessary gang header I/O has succeeded before
1597  * starting the actual work of free, claim, or write.  Once the gang tree
1598  * is assembled, free and claim are in-memory operations that cannot fail.
1599  *
1600  * In the event that a gang write fails, zio_dva_unallocate() walks the
1601  * gang tree to immediately free (i.e. insert back into the space map)
1602  * everything we've allocated.  This ensures that we don't get ENOSPC
1603  * errors during repeated suspend/resume cycles due to a flaky device.
1604  *
1605  * Gang rewrites only happen during sync-to-convergence.  If we can't assemble
1606  * the gang tree, we won't modify the block, so we can safely defer the free
1607  * (knowing that the block is still intact).  If we *can* assemble the gang
1608  * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
1609  * each constituent bp and we can allocate a new block on the next sync pass.
1610  *
1611  * In all cases, the gang tree allows complete recovery from partial failure.
1612  * ==========================================================================
1613  */
1614
1615 static zio_t *
1616 zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1617 {
1618         if (gn != NULL)
1619                 return (pio);
1620
1621         return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
1622             NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
1623             &pio->io_bookmark));
1624 }
1625
1626 zio_t *
1627 zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1628 {
1629         zio_t *zio;
1630
1631         if (gn != NULL) {
1632                 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
1633                     gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
1634                     ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1635                 /*
1636                  * As we rewrite each gang header, the pipeline will compute
1637                  * a new gang block header checksum for it; but no one will
1638                  * compute a new data checksum, so we do that here.  The one
1639                  * exception is the gang leader: the pipeline already computed
1640                  * its data checksum because that stage precedes gang assembly.
1641                  * (Presently, nothing actually uses interior data checksums;
1642                  * this is just good hygiene.)
1643                  */
1644                 if (gn != pio->io_gang_leader->io_gang_tree) {
1645                         zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
1646                             data, BP_GET_PSIZE(bp));
1647                 }
1648                 /*
1649                  * If we are here to damage data for testing purposes,
1650                  * leave the GBH alone so that we can detect the damage.
1651                  */
1652                 if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
1653                         zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
1654         } else {
1655                 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
1656                     data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
1657                     ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1658         }
1659
1660         return (zio);
1661 }
1662
1663 /* ARGSUSED */
1664 zio_t *
1665 zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1666 {
1667         return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
1668             ZIO_GANG_CHILD_FLAGS(pio)));
1669 }
1670
1671 /* ARGSUSED */
1672 zio_t *
1673 zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1674 {
1675         return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
1676             NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
1677 }
1678
/*
 * Gang issue callbacks, one slot per I/O type.  zio_gang_tree_issue()
 * indexes this table with the gang leader's io_type.  The slots appear
 * to follow the same type order as zio_type_name[] (null, read, write,
 * free, claim, ioctl) -- TODO confirm against the zio type enum.  The
 * first and last slots have no gang issue function.
 */
static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
	NULL,
	zio_read_gang,
	zio_rewrite_gang,
	zio_free_gang,
	zio_claim_gang,
	NULL
};

/*
 * Forward declaration: zio_gang_tree_assemble() registers this as the
 * done callback of its header read, and the callback in turn calls
 * zio_gang_tree_assemble() for nested gang blocks.
 */
static void zio_gang_tree_assemble_done(zio_t *zio);
1689
1690 static zio_gang_node_t *
1691 zio_gang_node_alloc(zio_gang_node_t **gnpp)
1692 {
1693         zio_gang_node_t *gn;
1694
1695         ASSERT(*gnpp == NULL);
1696
1697         gn = kmem_zalloc(sizeof (*gn), KM_PUSHPAGE);
1698         gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
1699         *gnpp = gn;
1700
1701         return (gn);
1702 }
1703
1704 static void
1705 zio_gang_node_free(zio_gang_node_t **gnpp)
1706 {
1707         zio_gang_node_t *gn = *gnpp;
1708         int g;
1709
1710         for (g = 0; g < SPA_GBH_NBLKPTRS; g++)
1711                 ASSERT(gn->gn_child[g] == NULL);
1712
1713         zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
1714         kmem_free(gn, sizeof (*gn));
1715         *gnpp = NULL;
1716 }
1717
1718 static void
1719 zio_gang_tree_free(zio_gang_node_t **gnpp)
1720 {
1721         zio_gang_node_t *gn = *gnpp;
1722         int g;
1723
1724         if (gn == NULL)
1725                 return;
1726
1727         for (g = 0; g < SPA_GBH_NBLKPTRS; g++)
1728                 zio_gang_tree_free(&gn->gn_child[g]);
1729
1730         zio_gang_node_free(gnpp);
1731 }
1732
1733 static void
1734 zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
1735 {
1736         zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
1737
1738         ASSERT(gio->io_gang_leader == gio);
1739         ASSERT(BP_IS_GANG(bp));
1740
1741         zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh,
1742             SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
1743             gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
1744 }
1745
1746 static void
1747 zio_gang_tree_assemble_done(zio_t *zio)
1748 {
1749         zio_t *gio = zio->io_gang_leader;
1750         zio_gang_node_t *gn = zio->io_private;
1751         blkptr_t *bp = zio->io_bp;
1752         int g;
1753
1754         ASSERT(gio == zio_unique_parent(zio));
1755         ASSERT(zio->io_child_count == 0);
1756
1757         if (zio->io_error)
1758                 return;
1759
1760         if (BP_SHOULD_BYTESWAP(bp))
1761                 byteswap_uint64_array(zio->io_data, zio->io_size);
1762
1763         ASSERT(zio->io_data == gn->gn_gbh);
1764         ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
1765         ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
1766
1767         for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
1768                 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
1769                 if (!BP_IS_GANG(gbp))
1770                         continue;
1771                 zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
1772         }
1773 }
1774
/*
 * Issue the gang leader's operation for bp and, when bp is a gang
 * block, recursively for every non-hole block pointer in its header.
 *
 *	pio	parent zio for the I/O created at this level
 *	gn	gang tree node for bp; NULL when bp is not a gang block
 *	bp	block pointer to operate on
 *	data	position in the leader's buffer that corresponds to bp
 *
 * The per-type callback is taken from zio_gang_issue_func[], indexed
 * by the gang leader's io_type.
 */
static void
zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
{
	zio_t *gio = pio->io_gang_leader;
	zio_t *zio;
	int g;

	ASSERT(BP_IS_GANG(bp) == !!gn);
	ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
	ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);

	/*
	 * If you're a gang header, your data is in gn->gn_gbh.
	 * If you're a gang member, your data is in 'data' and gn == NULL.
	 */
	zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);

	if (gn != NULL) {
		ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);

		/*
		 * Children are issued under the zio returned above.
		 * 'data' is advanced by each non-hole child's physical
		 * size so the next child gets its own slice.
		 */
		for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
			blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
			if (BP_IS_HOLE(gbp))
				continue;
			zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
			data = (char *)data + BP_GET_PSIZE(gbp);
		}
	}

	/*
	 * After walking the root node, 'data' must sit exactly at the
	 * end of the gang leader's buffer.
	 */
	if (gn == gio->io_gang_tree)
		ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);

	/*
	 * The callback may hand back pio itself (zio_read_gang() does so
	 * for gang headers); only a different zio is started here.
	 */
	if (zio != pio)
		zio_nowait(zio);
}
1810
1811 static int
1812 zio_gang_assemble(zio_t *zio)
1813 {
1814         blkptr_t *bp = zio->io_bp;
1815
1816         ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
1817         ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
1818
1819         zio->io_gang_leader = zio;
1820
1821         zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);
1822
1823         return (ZIO_PIPELINE_CONTINUE);
1824 }
1825
1826 static int
1827 zio_gang_issue(zio_t *zio)
1828 {
1829         blkptr_t *bp = zio->io_bp;
1830
1831         if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
1832                 return (ZIO_PIPELINE_STOP);
1833
1834         ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
1835         ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
1836
1837         if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
1838                 zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data);
1839         else
1840                 zio_gang_tree_free(&zio->io_gang_tree);
1841
1842         zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1843
1844         return (ZIO_PIPELINE_CONTINUE);
1845 }
1846
1847 static void
1848 zio_write_gang_member_ready(zio_t *zio)
1849 {
1850         zio_t *pio = zio_unique_parent(zio);
1851         dva_t *cdva = zio->io_bp->blk_dva;
1852         dva_t *pdva = pio->io_bp->blk_dva;
1853         uint64_t asize;
1854         int d;
1855         ASSERTV(zio_t *gio = zio->io_gang_leader);
1856
1857         if (BP_IS_HOLE(zio->io_bp))
1858                 return;
1859
1860         ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
1861
1862         ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
1863         ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
1864         ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
1865         ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
1866         ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
1867
1868         mutex_enter(&pio->io_lock);
1869         for (d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
1870                 ASSERT(DVA_GET_GANG(&pdva[d]));
1871                 asize = DVA_GET_ASIZE(&pdva[d]);
1872                 asize += DVA_GET_ASIZE(&cdva[d]);
1873                 DVA_SET_ASIZE(&pdva[d], asize);
1874         }
1875         mutex_exit(&pio->io_lock);
1876 }
1877
/*
 * Write pio's data as a gang block.  Called from zio_dva_allocate()
 * when a regular allocation fails with ENOSPC.
 *
 * A gang header of SPA_GANGBLOCKSIZE bytes is allocated into pio's bp,
 * a gang node is attached to the gang tree, and the data is split into
 * child writes whose block pointers live in the gang header.  If the
 * header allocation fails, the error is stored in pio->io_error.
 *
 * Always returns ZIO_PIPELINE_CONTINUE.
 */
static int
zio_write_gang_block(zio_t *pio)
{
	spa_t *spa = pio->io_spa;
	blkptr_t *bp = pio->io_bp;
	zio_t *gio = pio->io_gang_leader;
	zio_t *zio;
	zio_gang_node_t *gn, **gnpp;
	zio_gbh_phys_t *gbh;
	uint64_t txg = pio->io_txg;
	uint64_t resid = pio->io_size;
	uint64_t lsize;
	int copies = gio->io_prop.zp_copies;
	/* Ask for one more header copy than data copies, capped by the pool. */
	int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
	zio_prop_t zp;
	int g, error;

	/*
	 * Allocate the gang header.  The gang leader's bp is passed as
	 * the hint bp unless pio is itself the leader.
	 */
	error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
	    bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
	    METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
	if (error) {
		pio->io_error = error;
		return (ZIO_PIPELINE_CONTINUE);
	}

	/*
	 * Find the gang tree slot for this node: the tree root for the
	 * leader, otherwise the slot stashed in io_private when this zio
	 * was created as a gang child (see the zio_write() call below).
	 */
	if (pio == gio) {
		gnpp = &gio->io_gang_tree;
	} else {
		gnpp = pio->io_private;
		ASSERT(pio->io_ready == zio_write_gang_member_ready);
	}

	gn = zio_gang_node_alloc(gnpp);
	gbh = gn->gn_gbh;
	bzero(gbh, SPA_GANGBLOCKSIZE);

	/*
	 * Create the gang header.
	 */
	zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
	    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);

	/*
	 * Create and nowait the gang children.
	 *
	 * Each child takes an even share of what is left, spread over the
	 * remaining header slots and rounded up to SPA_MINBLOCKSIZE.
	 */
	for (g = 0; resid != 0; resid -= lsize, g++) {
		lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
		    SPA_MINBLOCKSIZE);
		ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);

		/*
		 * Children inherit the leader's checksum and copies, and
		 * are written with compression, dedup and nopwrite off.
		 */
		zp.zp_checksum = gio->io_prop.zp_checksum;
		zp.zp_compress = ZIO_COMPRESS_OFF;
		zp.zp_type = DMU_OT_NONE;
		zp.zp_level = 0;
		zp.zp_copies = gio->io_prop.zp_copies;
		zp.zp_dedup = B_FALSE;
		zp.zp_dedup_verify = B_FALSE;
		zp.zp_nopwrite = B_FALSE;

		/*
		 * The child is parented to the header zio and writes the
		 * slice of pio's data at offset (io_size - resid).  Its
		 * private pointer is the gang tree slot for this child.
		 */
		zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
		    (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
		    zio_write_gang_member_ready, NULL, NULL, &gn->gn_child[g],
		    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
		    &pio->io_bookmark));
	}

	/*
	 * Set pio's pipeline to just wait for zio to finish.
	 */
	pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	/*
	 * We didn't allocate this bp, so make sure it doesn't get unmarked.
	 */
	pio->io_flags &= ~ZIO_FLAG_FASTWRITE;

	/* Start the header write only after all children were created. */
	zio_nowait(zio);

	return (ZIO_PIPELINE_CONTINUE);
}
1958
1959 /*
1960  * The zio_nop_write stage in the pipeline determines if allocating
1961  * a new bp is necessary.  By leveraging a cryptographically secure checksum,
1962  * such as SHA256, we can compare the checksums of the new data and the old
1963  * to determine if allocating a new block is required.  The nopwrite
1964  * feature can handle writes in either syncing or open context (i.e. zil
1965  * writes) and as a result is mutually exclusive with dedup.
1966  */
1967 static int
1968 zio_nop_write(zio_t *zio)
1969 {
1970         blkptr_t *bp = zio->io_bp;
1971         blkptr_t *bp_orig = &zio->io_bp_orig;
1972         zio_prop_t *zp = &zio->io_prop;
1973
1974         ASSERT(BP_GET_LEVEL(bp) == 0);
1975         ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
1976         ASSERT(zp->zp_nopwrite);
1977         ASSERT(!zp->zp_dedup);
1978         ASSERT(zio->io_bp_override == NULL);
1979         ASSERT(IO_IS_ALLOCATING(zio));
1980
1981         /*
1982          * Check to see if the original bp and the new bp have matching
1983          * characteristics (i.e. same checksum, compression algorithms, etc).
1984          * If they don't then just continue with the pipeline which will
1985          * allocate a new bp.
1986          */
1987         if (BP_IS_HOLE(bp_orig) ||
1988             !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup ||
1989             BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
1990             BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
1991             BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
1992             zp->zp_copies != BP_GET_NDVAS(bp_orig))
1993                 return (ZIO_PIPELINE_CONTINUE);
1994
1995         /*
1996          * If the checksums match then reset the pipeline so that we
1997          * avoid allocating a new bp and issuing any I/O.
1998          */
1999         if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
2000                 ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup);
2001                 ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
2002                 ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
2003                 ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
2004                 ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop,
2005                     sizeof (uint64_t)) == 0);
2006
2007                 *bp = *bp_orig;
2008                 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2009                 zio->io_flags |= ZIO_FLAG_NOPWRITE;
2010         }
2011
2012         return (ZIO_PIPELINE_CONTINUE);
2013 }
2014
2015 /*
2016  * ==========================================================================
2017  * Dedup
2018  * ==========================================================================
2019  */
2020 static void
2021 zio_ddt_child_read_done(zio_t *zio)
2022 {
2023         blkptr_t *bp = zio->io_bp;
2024         ddt_entry_t *dde = zio->io_private;
2025         ddt_phys_t *ddp;
2026         zio_t *pio = zio_unique_parent(zio);
2027
2028         mutex_enter(&pio->io_lock);
2029         ddp = ddt_phys_select(dde, bp);
2030         if (zio->io_error == 0)
2031                 ddt_phys_clear(ddp);    /* this ddp doesn't need repair */
2032         if (zio->io_error == 0 && dde->dde_repair_data == NULL)
2033                 dde->dde_repair_data = zio->io_data;
2034         else
2035                 zio_buf_free(zio->io_data, zio->io_size);
2036         mutex_exit(&pio->io_lock);
2037 }
2038
/*
 * Pipeline stage that starts the read of a dedup'd block.
 *
 * Normally a single child read of bp into the zio's own buffer is
 * issued.  When a DDT child error is already recorded (the stage is
 * re-run from zio_ddt_read_done()), a repair is started instead: the
 * other phys variants of the dedup entry are read into newly allocated
 * buffers, and zio_ddt_child_read_done() collects the results.
 */
static int
zio_ddt_read_start(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	int p;

	ASSERT(BP_GET_DEDUP(bp));
	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

	if (zio->io_child_error[ZIO_CHILD_DDT]) {
		ddt_t *ddt = ddt_select(zio->io_spa, bp);
		ddt_entry_t *dde = ddt_repair_start(ddt, bp);
		ddt_phys_t *ddp = dde->dde_phys;
		ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
		blkptr_t blk;

		/* Stash the entry for zio_ddt_read_done(). */
		ASSERT(zio->io_vsd == NULL);
		zio->io_vsd = dde;

		/* No phys entry matches bp, so no alternate reads. */
		if (ddp_self == NULL)
			return (ZIO_PIPELINE_CONTINUE);

		/*
		 * Read every other phys variant with a non-zero birth.
		 * Errors from these reads are not propagated to the
		 * parent (ZIO_FLAG_DONT_PROPAGATE).
		 */
		for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
			if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
				continue;
			ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
			    &blk);
			zio_nowait(zio_read(zio, zio->io_spa, &blk,
			    zio_buf_alloc(zio->io_size), zio->io_size,
			    zio_ddt_child_read_done, dde, zio->io_priority,
			    ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE,
			    &zio->io_bookmark));
		}
		return (ZIO_PIPELINE_CONTINUE);
	}

	/* First pass: read the block itself as a DDT child. */
	zio_nowait(zio_read(zio, zio->io_spa, bp,
	    zio->io_data, zio->io_size, NULL, NULL, zio->io_priority,
	    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));

	return (ZIO_PIPELINE_CONTINUE);
}
2082
/*
 * Pipeline stage that completes a dedup'd read once all DDT children
 * are done.
 *
 * If a DDT child failed and no repair has been started yet (io_vsd is
 * NULL), the zio is rewound to run zio_ddt_read_start() again, which
 * starts the repair.  If a repair was run and produced data, that data
 * is copied into the zio's buffer and the DDT child error is cleared.
 */
static int
zio_ddt_read_done(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	ASSERT(BP_GET_DEDUP(bp));
	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

	if (zio->io_child_error[ZIO_CHILD_DDT]) {
		ddt_t *ddt = ddt_select(zio->io_spa, bp);
		ddt_entry_t *dde = zio->io_vsd;
		/* Without a DDT no repair is possible; the error stands. */
		if (ddt == NULL) {
			ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
			return (ZIO_PIPELINE_CONTINUE);
		}
		/*
		 * No repair attempted yet: set the stage to just before
		 * ZIO_STAGE_DDT_READ_START and redispatch, so that the
		 * start stage runs again with the child error set.
		 */
		if (dde == NULL) {
			zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
			return (ZIO_PIPELINE_STOP);
		}
		/* A repair read succeeded: use its data, drop the error. */
		if (dde->dde_repair_data != NULL) {
			bcopy(dde->dde_repair_data, zio->io_data, zio->io_size);
			zio->io_child_error[ZIO_CHILD_DDT] = 0;
		}
		ddt_repair_done(ddt, dde);
		zio->io_vsd = NULL;
	}

	ASSERT(zio->io_vsd == NULL);

	return (ZIO_PIPELINE_CONTINUE);
}
2119
/*
 * Decide whether zio's data differs from the data already associated
 * with dedup entry dde.
 *
 * If a lead write is in flight for any of the single..triple phys
 * slots, the comparison is made against that zio's original data.
 * Otherwise the first phys entry with a non-zero birth is read through
 * the ARC and compared.  A failed read is also reported as a collision
 * (the result is 'error != 0').
 *
 * The DDT lock is dropped around arc_read() and re-taken before
 * returning; callers invoke this with the lock held.
 *
 * Returns B_FALSE when the data matches or there is nothing to compare
 * against, non-zero otherwise.
 */
static boolean_t
zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
{
	spa_t *spa = zio->io_spa;
	int p;

	/*
	 * Note: we compare the original data, not the transformed data,
	 * because when zio->io_bp is an override bp, we will not have
	 * pushed the I/O transforms.  That's an important optimization
	 * because otherwise we'd compress/encrypt all dmu_sync() data twice.
	 */
	for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
		zio_t *lio = dde->dde_lead_zio[p];

		/* Only the first in-flight lead zio found is compared. */
		if (lio != NULL) {
			return (lio->io_orig_size != zio->io_orig_size ||
			    bcmp(zio->io_orig_data, lio->io_orig_data,
			    zio->io_orig_size) != 0);
		}
	}

	for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
		ddt_phys_t *ddp = &dde->dde_phys[p];

		/* Only the first phys with a non-zero birth is compared. */
		if (ddp->ddp_phys_birth != 0) {
			arc_buf_t *abuf = NULL;
			uint32_t aflags = ARC_WAIT;
			blkptr_t blk = *zio->io_bp;
			int error;

			/* Build a bp for the stored copy from zio's bp. */
			ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);

			/* The blocking read is done without the DDT lock. */
			ddt_exit(ddt);

			error = arc_read(NULL, spa, &blk,
			    arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
			    &aflags, &zio->io_bookmark);

			if (error == 0) {
				if (arc_buf_size(abuf) != zio->io_orig_size ||
				    bcmp(abuf->b_data, zio->io_orig_data,
				    zio->io_orig_size) != 0)
					error = SET_ERROR(EEXIST);
				VERIFY(arc_buf_remove_ref(abuf, &abuf));
			}

			ddt_enter(ddt);
			return (error != 0);
		}
	}

	return (B_FALSE);
}
2175
2176 static void
2177 zio_ddt_child_write_ready(zio_t *zio)
2178 {
2179         int p = zio->io_prop.zp_copies;
2180         ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
2181         ddt_entry_t *dde = zio->io_private;
2182         ddt_phys_t *ddp = &dde->dde_phys[p];
2183         zio_t *pio;
2184
2185         if (zio->io_error)
2186                 return;
2187
2188         ddt_enter(ddt);
2189
2190         ASSERT(dde->dde_lead_zio[p] == zio);
2191
2192         ddt_phys_fill(ddp, zio->io_bp);
2193
2194         while ((pio = zio_walk_parents(zio)) != NULL)
2195                 ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
2196
2197         ddt_exit(ddt);
2198 }
2199
2200 static void
2201 zio_ddt_child_write_done(zio_t *zio)
2202 {
2203         int p = zio->io_prop.zp_copies;
2204         ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
2205         ddt_entry_t *dde = zio->io_private;
2206         ddt_phys_t *ddp = &dde->dde_phys[p];
2207
2208         ddt_enter(ddt);
2209
2210         ASSERT(ddp->ddp_refcnt == 0);
2211         ASSERT(dde->dde_lead_zio[p] == zio);
2212         dde->dde_lead_zio[p] = NULL;
2213
2214         if (zio->io_error == 0) {
2215                 while (zio_walk_parents(zio) != NULL)
2216                         ddt_phys_addref(ddp);
2217         } else {
2218                 ddt_phys_clear(ddp);
2219         }
2220
2221         ddt_exit(ddt);
2222 }
2223
2224 static void
2225 zio_ddt_ditto_write_done(zio_t *zio)
2226 {
2227         int p = DDT_PHYS_DITTO;
2228         blkptr_t *bp = zio->io_bp;
2229         ddt_t *ddt = ddt_select(zio->io_spa, bp);
2230         ddt_entry_t *dde = zio->io_private;
2231         ddt_phys_t *ddp = &dde->dde_phys[p];
2232         ddt_key_t *ddk = &dde->dde_key;
2233         ASSERTV(zio_prop_t *zp = &zio->io_prop);
2234
2235         ddt_enter(ddt);
2236
2237         ASSERT(ddp->ddp_refcnt == 0);
2238         ASSERT(dde->dde_lead_zio[p] == zio);
2239         dde->dde_lead_zio[p] = NULL;
2240
2241         if (zio->io_error == 0) {
2242                 ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum));
2243                 ASSERT(zp->zp_copies < SPA_DVAS_PER_BP);
2244                 ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp));
2245                 if (ddp->ddp_phys_birth != 0)
2246                         ddt_phys_free(ddt, ddk, ddp, zio->io_txg);
2247                 ddt_phys_fill(ddp, bp);
2248         }
2249
2250         ddt_exit(ddt);
2251 }
2252
/*
 * Pipeline stage for a dedup'd write.
 *
 * With the DDT lock held, the entry for bp is looked up and then:
 *   1. if dedup verification is requested and the data collides with
 *      the entry, the write is restarted with a stronger checksum or
 *      turned into an ordinary (non-dedup) write;
 *   2. if more ditto copies are needed than are present and no ditto
 *      write is in flight, a ditto child write is created;
 *   3. the bp is satisfied from the existing phys entry or an in-flight
 *      lead write, from the override bp, or by creating a new lead
 *      child write.
 *
 * Child writes are started only after the DDT lock has been dropped.
 * Always returns ZIO_PIPELINE_CONTINUE.
 */
static int
zio_ddt_write(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	blkptr_t *bp = zio->io_bp;
	uint64_t txg = zio->io_txg;
	zio_prop_t *zp = &zio->io_prop;
	int p = zp->zp_copies;
	int ditto_copies;
	zio_t *cio = NULL;
	zio_t *dio = NULL;
	ddt_t *ddt = ddt_select(spa, bp);
	ddt_entry_t *dde;
	ddt_phys_t *ddp;

	ASSERT(BP_GET_DEDUP(bp));
	ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
	ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);

	ddt_enter(ddt);
	/* NOTE(review): B_TRUE presumably asks for entry creation -- confirm */
	dde = ddt_lookup(ddt, bp, B_TRUE);
	/* The phys slot is selected by the requested number of copies. */
	ddp = &dde->dde_phys[p];

	if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
		/*
		 * If we're using a weak checksum, upgrade to a strong checksum
		 * and try again.  If we're already using a strong checksum,
		 * we can't resolve it, so just convert to an ordinary write.
		 * (And automatically e-mail a paper to Nature?)
		 */
		if (!zio_checksum_table[zp->zp_checksum].ci_dedup) {
			zp->zp_checksum = spa_dedup_checksum(spa);
			zio_pop_transforms(zio);
			zio->io_stage = ZIO_STAGE_OPEN;
			BP_ZERO(bp);
		} else {
			zp->zp_dedup = B_FALSE;
		}
		zio->io_pipeline = ZIO_WRITE_PIPELINE;
		ddt_exit(ddt);
		return (ZIO_PIPELINE_CONTINUE);
	}

	ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp);
	ASSERT(ditto_copies < SPA_DVAS_PER_BP);

	/* Create a ditto write unless one is already in flight. */
	if (ditto_copies > ddt_ditto_copies_present(dde) &&
	    dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) {
		zio_prop_t czp = *zp;

		czp.zp_copies = ditto_copies;

		/*
		 * If we arrived here with an override bp, we won't have run
		 * the transform stack, so we won't have the data we need to
		 * generate a child i/o.  So, toss the override bp and restart.
		 * This is safe, because using the override bp is just an
		 * optimization; and it's rare, so the cost doesn't matter.
		 */
		if (zio->io_bp_override) {
			zio_pop_transforms(zio);
			zio->io_stage = ZIO_STAGE_OPEN;
			zio->io_pipeline = ZIO_WRITE_PIPELINE;
			zio->io_bp_override = NULL;
			BP_ZERO(bp);
			ddt_exit(ddt);
			return (ZIO_PIPELINE_CONTINUE);
		}

		dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
		    zio->io_orig_size, &czp, NULL, NULL,
		    zio_ddt_ditto_write_done, dde, zio->io_priority,
		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);

		/* Hand the parent's current data and size to the child. */
		zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL);
		dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
	}

	if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
		/*
		 * The block already exists or is being written: fill bp
		 * from the phys entry if it has a birth, then either
		 * attach to the in-flight lead write or take a reference.
		 */
		if (ddp->ddp_phys_birth != 0)
			ddt_bp_fill(ddp, bp, txg);
		if (dde->dde_lead_zio[p] != NULL)
			zio_add_child(zio, dde->dde_lead_zio[p]);
		else
			ddt_phys_addref(ddp);
	} else if (zio->io_bp_override) {
		/* Adopt the override bp as the phys entry. */
		ASSERT(bp->blk_birth == txg);
		ASSERT(BP_EQUAL(bp, zio->io_bp_override));
		ddt_phys_fill(ddp, bp);
		ddt_phys_addref(ddp);
	} else {
		/* First writer: create the lead child write and record it. */
		cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
		    zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL,
		    zio_ddt_child_write_done, dde, zio->io_priority,
		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);

		zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL);
		dde->dde_lead_zio[p] = cio;
	}

	ddt_exit(ddt);

	/* Start the children only after the DDT lock has been released. */
	if (cio)
		zio_nowait(cio);
	if (dio)
		zio_nowait(dio);

	return (ZIO_PIPELINE_CONTINUE);
}
2362
2363 ddt_entry_t *freedde; /* for debugging */
2364
2365 static int
2366 zio_ddt_free(zio_t *zio)
2367 {
2368         spa_t *spa = zio->io_spa;
2369         blkptr_t *bp = zio->io_bp;
2370         ddt_t *ddt = ddt_select(spa, bp);
2371         ddt_entry_t *dde;
2372         ddt_phys_t *ddp;
2373
2374         ASSERT(BP_GET_DEDUP(bp));
2375         ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2376
2377         ddt_enter(ddt);
2378         freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
2379         if (dde) {
2380                 ddp = ddt_phys_select(dde, bp);
2381                 if (ddp)
2382                         ddt_phys_decref(ddp);
2383         }
2384         ddt_exit(ddt);
2385
2386         return (ZIO_PIPELINE_CONTINUE);
2387 }
2388
2389 /*
2390  * ==========================================================================
2391  * Allocate and free blocks
2392  * ==========================================================================
2393  */
2394 static int
2395 zio_dva_allocate(zio_t *zio)
2396 {
2397         spa_t *spa = zio->io_spa;
2398         metaslab_class_t *mc = spa_normal_class(spa);
2399         blkptr_t *bp = zio->io_bp;
2400         int error;
2401         int flags = 0;
2402
2403         if (zio->io_gang_leader == NULL) {
2404                 ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
2405                 zio->io_gang_leader = zio;
2406         }
2407
2408         ASSERT(BP_IS_HOLE(bp));
2409         ASSERT0(BP_GET_NDVAS(bp));
2410         ASSERT3U(zio->io_prop.zp_copies, >, 0);
2411         ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
2412         ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
2413
2414         /*
2415          * The dump device does not support gang blocks so allocation on
2416          * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid
2417          * the "fast" gang feature.
2418          */
2419         flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0;
2420         flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ?
2421             METASLAB_GANG_CHILD : 0;
2422         flags |= (zio->io_flags & ZIO_FLAG_FASTWRITE) ? METASLAB_FASTWRITE : 0;
2423         error = metaslab_alloc(spa, mc, zio->io_size, bp,
2424             zio->io_prop.zp_copies, zio->io_txg, NULL, flags);
2425
2426         if (error) {
2427                 spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
2428                     "size %llu, error %d", spa_name(spa), zio, zio->io_size,
2429                     error);
2430                 if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
2431                         return (zio_write_gang_block(zio));
2432                 zio->io_error = error;
2433         }
2434
2435         return (ZIO_PIPELINE_CONTINUE);
2436 }
2437
2438 static int
2439 zio_dva_free(zio_t *zio)
2440 {
2441         metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);
2442
2443         return (ZIO_PIPELINE_CONTINUE);
2444 }
2445
2446 static int
2447 zio_dva_claim(zio_t *zio)
2448 {
2449         int error;
2450
2451         error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
2452         if (error)
2453                 zio->io_error = error;
2454
2455         return (ZIO_PIPELINE_CONTINUE);
2456 }
2457
2458 /*
2459  * Undo an allocation.  This is used by zio_done() when an I/O fails
2460  * and we want to give back the block we just allocated.
2461  * This handles both normal blocks and gang blocks.
2462  */
2463 static void
2464 zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
2465 {
2466         int g;
2467
2468         ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
2469         ASSERT(zio->io_bp_override == NULL);
2470
2471         if (!BP_IS_HOLE(bp))
2472                 metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);
2473
2474         if (gn != NULL) {
2475                 for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
2476                         zio_dva_unallocate(zio, gn->gn_child[g],
2477                             &gn->gn_gbh->zg_blkptr[g]);
2478                 }
2479         }
2480 }
2481
2482 /*
2483  * Try to allocate an intent log block.  Return 0 on success, errno on failure.
2484  */
2485 int
2486 zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, uint64_t size,
2487     boolean_t use_slog)
2488 {
2489         int error = 1;
2490
2491         ASSERT(txg > spa_syncing_txg(spa));
2492
2493         /*
2494          * ZIL blocks are always contiguous (i.e. not gang blocks) so we
2495          * set the METASLAB_GANG_AVOID flag so that they don't "fast gang"
2496          * when allocating them.
2497          */
2498         if (use_slog) {
2499                 error = metaslab_alloc(spa, spa_log_class(spa), size,
2500                     new_bp, 1, txg, NULL,
2501                     METASLAB_FASTWRITE | METASLAB_GANG_AVOID);
2502         }
2503
2504         if (error) {
2505                 error = metaslab_alloc(spa, spa_normal_class(spa), size,
2506                     new_bp, 1, txg, NULL,
2507                     METASLAB_FASTWRITE);
2508         }
2509
2510         if (error == 0) {
2511                 BP_SET_LSIZE(new_bp, size);
2512                 BP_SET_PSIZE(new_bp, size);
2513                 BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
2514                 BP_SET_CHECKSUM(new_bp,
2515                     spa_version(spa) >= SPA_VERSION_SLIM_ZIL
2516                     ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
2517                 BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
2518                 BP_SET_LEVEL(new_bp, 0);
2519                 BP_SET_DEDUP(new_bp, 0);
2520                 BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
2521         }
2522
2523         return (error);
2524 }
2525
2526 /*
2527  * Free an intent log block.
2528  */
2529 void
2530 zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp)
2531 {
2532         ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG);
2533         ASSERT(!BP_IS_GANG(bp));
2534
2535         zio_free(spa, txg, bp);
2536 }
2537
2538 /*
2539  * ==========================================================================
2540  * Read and write to physical devices
2541  * ==========================================================================
2542  */
2543 static int
2544 zio_vdev_io_start(zio_t *zio)
2545 {
2546         vdev_t *vd = zio->io_vd;
2547         uint64_t align;
2548         spa_t *spa = zio->io_spa;
2549
2550         ASSERT(zio->io_error == 0);
2551         ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
2552
2553         if (vd == NULL) {
2554                 if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
2555                         spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
2556
2557                 /*
2558                  * The mirror_ops handle multiple DVAs in a single BP.
2559                  */
2560                 return (vdev_mirror_ops.vdev_op_io_start(zio));
2561         }
2562
2563         /*
2564          * We keep track of time-sensitive I/Os so that the scan thread
2565          * can quickly react to certain workloads.  In particular, we care
2566          * about non-scrubbing, top-level reads and writes with the following
2567          * characteristics:
2568          *      - synchronous writes of user data to non-slog devices
2569          *      - any reads of user data
2570          * When these conditions are met, adjust the timestamp of spa_last_io
2571          * which allows the scan thread to adjust its workload accordingly.
2572          */
2573         if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
2574             vd == vd->vdev_top && !vd->vdev_islog &&
2575             zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
2576             zio->io_txg != spa_syncing_txg(spa)) {
2577                 uint64_t old = spa->spa_last_io;
2578                 uint64_t new = ddi_get_lbolt64();
2579                 if (old != new)
2580                         (void) atomic_cas_64(&spa->spa_last_io, old, new);
2581         }
2582
2583         align = 1ULL << vd->vdev_top->vdev_ashift;
2584
2585         if (P2PHASE(zio->io_size, align) != 0) {
2586                 uint64_t asize = P2ROUNDUP(zio->io_size, align);
2587                 char *abuf = zio_buf_alloc(asize);
2588                 ASSERT(vd == vd->vdev_top);
2589                 if (zio->io_type == ZIO_TYPE_WRITE) {
2590                         bcopy(zio->io_data, abuf, zio->io_size);
2591                         bzero(abuf + zio->io_size, asize - zio->io_size);
2592                 }
2593                 zio_push_transform(zio, abuf, asize, asize, zio_subblock);
2594         }
2595
2596         ASSERT(P2PHASE(zio->io_offset, align) == 0);
2597         ASSERT(P2PHASE(zio->io_size, align) == 0);
2598         VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
2599
2600         /*
2601          * If this is a repair I/O, and there's no self-healing involved --
2602          * that is, we're just resilvering what we expect to resilver --
2603          * then don't do the I/O unless zio's txg is actually in vd's DTL.
2604          * This prevents spurious resilvering with nested replication.
2605          * For example, given a mirror of mirrors, (A+B)+(C+D), if only
2606          * A is out of date, we'll read from C+D, then use the data to
2607          * resilver A+B -- but we don't actually want to resilver B, just A.
2608          * The top-level mirror has no way to know this, so instead we just
2609          * discard unnecessary repairs as we work our way down the vdev tree.
2610          * The same logic applies to any form of nested replication:
2611          * ditto + mirror, RAID-Z + replacing, etc.  This covers them all.
2612          */
2613         if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
2614             !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
2615             zio->io_txg != 0 && /* not a delegated i/o */
2616             !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
2617                 ASSERT(zio->io_type == ZIO_TYPE_WRITE);
2618                 zio_vdev_io_bypass(zio);
2619                 return (ZIO_PIPELINE_CONTINUE);
2620         }
2621
2622         if (vd->vdev_ops->vdev_op_leaf &&
2623             (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
2624
2625                 if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio))
2626                         return (ZIO_PIPELINE_CONTINUE);
2627
2628                 if ((zio = vdev_queue_io(zio)) == NULL)
2629                         return (ZIO_PIPELINE_STOP);
2630
2631                 if (!vdev_accessible(vd, zio)) {
2632                         zio->io_error = SET_ERROR(ENXIO);
2633                         zio_interrupt(zio);
2634                         return (ZIO_PIPELINE_STOP);
2635                 }
2636         }
2637
2638         return (vd->vdev_ops->vdev_op_io_start(zio));
2639 }
2640
2641 static int
2642 zio_vdev_io_done(zio_t *zio)
2643 {
2644         vdev_t *vd = zio->io_vd;
2645         vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
2646         boolean_t unexpected_error = B_FALSE;
2647
2648         if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
2649                 return (ZIO_PIPELINE_STOP);
2650
2651         ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
2652
2653         if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {
2654
2655                 vdev_queue_io_done(zio);
2656
2657                 if (zio->io_type == ZIO_TYPE_WRITE)
2658                         vdev_cache_write(zio);
2659
2660                 if (zio_injection_enabled && zio->io_error == 0)
2661                         zio->io_error = zio_handle_device_injection(vd,
2662                             zio, EIO);
2663
2664                 if (zio_injection_enabled && zio->io_error == 0)
2665                         zio->io_error = zio_handle_label_injection(zio, EIO);
2666
2667                 if (zio->io_error) {
2668                         if (!vdev_accessible(vd, zio)) {
2669                                 zio->io_error = SET_ERROR(ENXIO);
2670                         } else {
2671                                 unexpected_error = B_TRUE;
2672                         }
2673                 }
2674         }
2675
2676         ops->vdev_op_io_done(zio);
2677
2678         if (unexpected_error)
2679                 VERIFY(vdev_probe(vd, zio) == NULL);
2680
2681         return (ZIO_PIPELINE_CONTINUE);
2682 }
2683
2684 /*
2685  * For non-raidz ZIOs, we can just copy aside the bad data read from the
2686  * disk, and use that to finish the checksum ereport later.
2687  */
2688 static void
2689 zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
2690     const void *good_buf)
2691 {
2692         /* no processing needed */
2693         zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
2694 }
2695
2696 /*ARGSUSED*/
2697 void
2698 zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
2699 {
2700         void *buf = zio_buf_alloc(zio->io_size);
2701
2702         bcopy(zio->io_data, buf, zio->io_size);
2703
2704         zcr->zcr_cbinfo = zio->io_size;
2705         zcr->zcr_cbdata = buf;
2706         zcr->zcr_finish = zio_vsd_default_cksum_finish;
2707         zcr->zcr_free = zio_buf_free;
2708 }
2709
2710 static int
2711 zio_vdev_io_assess(zio_t *zio)
2712 {
2713         vdev_t *vd = zio->io_vd;
2714
2715         if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
2716                 return (ZIO_PIPELINE_STOP);
2717
2718         if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
2719                 spa_config_exit(zio->io_spa, SCL_ZIO, zio);
2720
2721         if (zio->io_vsd != NULL) {
2722                 zio->io_vsd_ops->vsd_free(zio);
2723                 zio->io_vsd = NULL;
2724         }
2725
2726         if (zio_injection_enabled && zio->io_error == 0)
2727                 zio->io_error = zio_handle_fault_injection(zio, EIO);
2728
2729         /*
2730          * If the I/O failed, determine whether we should attempt to retry it.
2731          *
2732          * On retry, we cut in line in the issue queue, since we don't want
2733          * compression/checksumming/etc. work to prevent our (cheap) IO reissue.
2734          */
2735         if (zio->io_error && vd == NULL &&
2736             !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
2737                 ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */
2738                 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS));  /* not a leaf */
2739                 zio->io_error = 0;
2740                 zio->io_flags |= ZIO_FLAG_IO_RETRY |
2741                     ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
2742                 zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
2743                 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
2744                     zio_requeue_io_start_cut_in_line);
2745                 return (ZIO_PIPELINE_STOP);
2746         }
2747
2748         /*
2749          * If we got an error on a leaf device, convert it to ENXIO
2750          * if the device is not accessible at all.
2751          */
2752         if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
2753             !vdev_accessible(vd, zio))
2754                 zio->io_error = SET_ERROR(ENXIO);
2755
2756         /*
2757          * If we can't write to an interior vdev (mirror or RAID-Z),
2758          * set vdev_cant_write so that we stop trying to allocate from it.
2759          */
2760         if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
2761             vd != NULL && !vd->vdev_ops->vdev_op_leaf) {
2762                 vd->vdev_cant_write = B_TRUE;
2763         }
2764
2765         if (zio->io_error)
2766                 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2767
2768         if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
2769             zio->io_physdone != NULL) {
2770                 ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
2771                 ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
2772                 zio->io_physdone(zio->io_logical);
2773         }
2774
2775         return (ZIO_PIPELINE_CONTINUE);
2776 }
2777
2778 void
2779 zio_vdev_io_reissue(zio_t *zio)
2780 {
2781         ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
2782         ASSERT(zio->io_error == 0);
2783
2784         zio->io_stage >>= 1;
2785 }
2786
2787 void
2788 zio_vdev_io_redone(zio_t *zio)
2789 {
2790         ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
2791
2792         zio->io_stage >>= 1;
2793 }
2794
2795 void
2796 zio_vdev_io_bypass(zio_t *zio)
2797 {
2798         ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
2799         ASSERT(zio->io_error == 0);
2800
2801         zio->io_flags |= ZIO_FLAG_IO_BYPASS;
2802         zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
2803 }
2804
2805 /*
2806  * ==========================================================================
2807  * Generate and verify checksums
2808  * ==========================================================================
2809  */
2810 static int
2811 zio_checksum_generate(zio_t *zio)
2812 {
2813         blkptr_t *bp = zio->io_bp;
2814         enum zio_checksum checksum;
2815
2816         if (bp == NULL) {
2817                 /*
2818                  * This is zio_write_phys().
2819                  * We're either generating a label checksum, or none at all.
2820                  */
2821                 checksum = zio->io_prop.zp_checksum;
2822
2823                 if (checksum == ZIO_CHECKSUM_OFF)
2824                         return (ZIO_PIPELINE_CONTINUE);
2825
2826                 ASSERT(checksum == ZIO_CHECKSUM_LABEL);
2827         } else {
2828                 if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
2829                         ASSERT(!IO_IS_ALLOCATING(zio));
2830                         checksum = ZIO_CHECKSUM_GANG_HEADER;
2831                 } else {
2832                         checksum = BP_GET_CHECKSUM(bp);
2833                 }
2834         }
2835
2836         zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size);
2837
2838         return (ZIO_PIPELINE_CONTINUE);
2839 }
2840
2841 static int
2842 zio_checksum_verify(zio_t *zio)
2843 {
2844         zio_bad_cksum_t info;
2845         blkptr_t *bp = zio->io_bp;
2846         int error;
2847
2848         ASSERT(zio->io_vd != NULL);
2849
2850         if (bp == NULL) {
2851                 /*
2852                  * This is zio_read_phys().
2853                  * We're either verifying a label checksum, or nothing at all.
2854                  */
2855                 if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
2856                         return (ZIO_PIPELINE_CONTINUE);
2857
2858                 ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
2859         }
2860
2861         if ((error = zio_checksum_error(zio, &info)) != 0) {
2862                 zio->io_error = error;
2863                 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2864                         zfs_ereport_start_checksum(zio->io_spa,
2865                             zio->io_vd, zio, zio->io_offset,
2866                             zio->io_size, NULL, &info);
2867                 }
2868         }
2869
2870         return (ZIO_PIPELINE_CONTINUE);
2871 }
2872
2873 /*
2874  * Called by RAID-Z to ensure we don't compute the checksum twice.
2875  */
2876 void
2877 zio_checksum_verified(zio_t *zio)
2878 {
2879         zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
2880 }
2881
2882 /*
2883  * ==========================================================================
2884  * Error rank.  Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
2885  * An error of 0 indicates success.  ENXIO indicates whole-device failure,
2886  * which may be transient (e.g. unplugged) or permanent.  ECKSUM and EIO
2887  * indicate errors that are specific to one I/O, and most likely permanent.
2888  * Any other error is presumed to be worse because we weren't expecting it.
2889  * ==========================================================================
2890  */
/*
 * Return whichever of e1/e2 ranks as the worse error.  Rank is the index
 * in { 0, ENXIO, ECKSUM, EIO }; any error not in the table ranks past the
 * end, i.e. worst.  On a tie e2 is returned.
 */
int
zio_worst_error(int e1, int e2)
{
        static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
        const int nranks =
            sizeof (zio_error_rank) / sizeof (zio_error_rank[0]);
        int rank1 = 0;
        int rank2 = 0;

        /* Walk the table until the error is found or the table ends. */
        while (rank1 < nranks && zio_error_rank[rank1] != e1)
                rank1++;

        while (rank2 < nranks && zio_error_rank[rank2] != e2)
                rank2++;

        if (rank1 > rank2)
                return (e1);

        return (e2);
}
2907
2908 /*
2909  * ==========================================================================
2910  * I/O completion
2911  * ==========================================================================
2912  */
2913 static int
2914 zio_ready(zio_t *zio)
2915 {
2916         blkptr_t *bp = zio->io_bp;
2917         zio_t *pio, *pio_next;
2918
2919         if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
2920             zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
2921                 return (ZIO_PIPELINE_STOP);
2922
2923         if (zio->io_ready) {
2924                 ASSERT(IO_IS_ALLOCATING(zio));
2925                 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
2926                     (zio->io_flags & ZIO_FLAG_NOPWRITE));
2927                 ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
2928
2929                 zio->io_ready(zio);
2930         }
2931
2932         if (bp != NULL && bp != &zio->io_bp_copy)
2933                 zio->io_bp_copy = *bp;
2934
2935         if (zio->io_error)
2936                 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2937
2938         mutex_enter(&zio->io_lock);
2939         zio->io_state[ZIO_WAIT_READY] = 1;
2940         pio = zio_walk_parents(zio);
2941         mutex_exit(&zio->io_lock);
2942
2943         /*
2944          * As we notify zio's parents, new parents could be added.
2945          * New parents go to the head of zio's io_parent_list, however,
2946          * so we will (correctly) not notify them.  The remainder of zio's
2947          * io_parent_list, from 'pio_next' onward, cannot change because
2948          * all parents must wait for us to be done before they can be done.
2949          */
2950         for (; pio != NULL; pio = pio_next) {
2951                 pio_next = zio_walk_parents(zio);
2952                 zio_notify_parent(pio, zio, ZIO_WAIT_READY);
2953         }
2954
2955         if (zio->io_flags & ZIO_FLAG_NODATA) {
2956                 if (BP_IS_GANG(bp)) {
2957                         zio->io_flags &= ~ZIO_FLAG_NODATA;
2958                 } else {
2959                         ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE);
2960                         zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
2961                 }
2962         }
2963
2964         if (zio_injection_enabled &&
2965             zio->io_spa->spa_syncing_txg == zio->io_txg)
2966                 zio_handle_ignored_writes(zio);
2967
2968         return (ZIO_PIPELINE_CONTINUE);
2969 }
2970
2971 static int
2972 zio_done(zio_t *zio)
2973 {
2974         zio_t *pio, *pio_next;
2975         int c, w;
2976
2977         /*
2978          * If our children haven't all completed,
2979          * wait for them and then repeat this pipeline stage.
2980          */
2981         if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
2982             zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
2983             zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) ||
2984             zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
2985                 return (ZIO_PIPELINE_STOP);
2986
2987         for (c = 0; c < ZIO_CHILD_TYPES; c++)
2988                 for (w = 0; w < ZIO_WAIT_TYPES; w++)
2989                         ASSERT(zio->io_children[c][w] == 0);
2990
2991         if (zio->io_bp != NULL) {
2992                 ASSERT(zio->io_bp->blk_pad[0] == 0);
2993                 ASSERT(zio->io_bp->blk_pad[1] == 0);
2994                 ASSERT(bcmp(zio->io_bp, &zio->io_bp_copy,
2995                     sizeof (blkptr_t)) == 0 ||
2996                     (zio->io_bp == zio_unique_parent(zio)->io_bp));
2997                 if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(zio->io_bp) &&
2998                     zio->io_bp_override == NULL &&
2999                     !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
3000                         ASSERT(!BP_SHOULD_BYTESWAP(zio->io_bp));
3001                         ASSERT3U(zio->io_prop.zp_copies, <=,
3002                             BP_GET_NDVAS(zio->io_bp));
3003                         ASSERT(BP_COUNT_GANG(zio->io_bp) == 0 ||
3004                             (BP_COUNT_GANG(zio->io_bp) ==
3005                             BP_GET_NDVAS(zio->io_bp)));
3006                 }
3007                 if (zio->io_flags & ZIO_FLAG_NOPWRITE)
3008                         VERIFY(BP_EQUAL(zio->io_bp, &zio->io_bp_orig));
3009         }
3010
3011         /*
3012          * If there were child vdev/gang/ddt errors, they apply to us now.
3013          */
3014         zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
3015         zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
3016         zio_inherit_child_errors(zio, ZIO_CHILD_DDT);
3017
3018         /*
3019          * If the I/O on the transformed data was successful, generate any
3020          * checksum reports now while we still have the transformed data.
3021          */
3022         if (zio->io_error == 0) {
3023                 while (zio->io_cksum_report != NULL) {
3024                         zio_cksum_report_t *zcr = zio->io_cksum_report;
3025                         uint64_t align = zcr->zcr_align;
3026                         uint64_t asize = P2ROUNDUP(zio->io_size, align);
3027                         char *abuf = zio->io_data;
3028
3029                         if (asize != zio->io_size) {
3030                                 abuf = zio_buf_alloc(asize);
3031                                 bcopy(zio->io_data, abuf, zio->io_size);
3032                                 bzero(abuf+zio->io_size, asize-zio->io_size);
3033                         }
3034
3035                         zio->io_cksum_report = zcr->zcr_next;
3036                         zcr->zcr_next = NULL;
3037                         zcr->zcr_finish(zcr, abuf);
3038                         zfs_ereport_free_checksum(zcr);
3039
3040                         if (asize != zio->io_size)
3041                                 zio_buf_free(abuf, asize);
3042                 }
3043         }
3044
3045         zio_pop_transforms(zio);        /* note: may set zio->io_error */
3046
3047         vdev_stat_update(zio, zio->io_size);
3048
3049         /*
3050          * If this I/O is attached to a particular vdev and is slow, exceeding
3051          * 30 seconds to complete, post an error describing the I/O delay.
3052          * We ignore these errors if the device is currently unavailable.
3053          */
3054         if (zio->io_delay >= MSEC_TO_TICK(zio_delay_max)) {
3055                 if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd))
3056                         zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa,
3057                             zio->io_vd, zio, 0, 0);
3058         }
3059
3060         if (zio->io_error) {
3061                 /*
3062                  * If this I/O is attached to a particular vdev,
3063                  * generate an error message describing the I/O failure
3064                  * at the block level.  We ignore these errors if the
3065                  * device is currently unavailable.
3066                  */
3067                 if (zio->io_error != ECKSUM && zio->io_vd != NULL &&
3068                         !vdev_is_dead(zio->io_vd))
3069                         zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa,
3070                                                 zio->io_vd, zio, 0, 0);
3071
3072                 if ((zio->io_error == EIO || !(zio->io_flags &
3073                     (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
3074                     zio == zio->io_logical) {
3075                         /*
3076                          * For logical I/O requests, tell the SPA to log the
3077                          * error and generate a logical data ereport.
3078                          */
3079                         spa_log_error(zio->io_spa, zio);
3080                         zfs_ereport_post(FM_EREPORT_ZFS_DATA, zio->io_spa,
3081                             NULL, zio, 0, 0);
3082                 }
3083         }
3084
3085         if (zio->io_error && zio == zio->io_logical) {
3086                 /*
3087                  * Determine whether zio should be reexecuted.  This will
3088                  * propagate all the way to the root via zio_notify_parent().
3089                  */
3090                 ASSERT(zio->io_vd == NULL && zio->io_bp != NULL);
3091                 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
3092
3093                 if (IO_IS_ALLOCATING(zio) &&
3094                     !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
3095                         if (zio->io_error != ENOSPC)
3096                                 zio->io_reexecute |= ZIO_REEXECUTE_NOW;
3097                         else
3098                                 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
3099                 }
3100
3101                 if ((zio->io_type == ZIO_TYPE_READ ||
3102                     zio->io_type == ZIO_TYPE_FREE) &&
3103                     !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
3104                     zio->io_error == ENXIO &&
3105                     spa_load_state(zio->io_spa) == SPA_LOAD_NONE &&
3106                     spa_get_failmode(zio->io_spa) != ZIO_FAILURE_MODE_CONTINUE)
3107                         zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
3108
3109                 if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
3110                         zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
3111
3112                 /*
3113                  * Here is a possibly good place to attempt to do
3114                  * either combinatorial reconstruction or error correction
3115                  * based on checksums.  It also might be a good place
3116                  * to send out preliminary ereports before we suspend
3117                  * processing.
3118                  */
3119         }
3120
3121         /*
3122          * If there were logical child errors, they apply to us now.
3123          * We defer this until now to avoid conflating logical child
3124          * errors with errors that happened to the zio itself when
3125          * updating vdev stats and reporting FMA events above.
3126          */
3127         zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
3128
3129         if ((zio->io_error || zio->io_reexecute) &&
3130             IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
3131             !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
3132                 zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp);
3133
3134         zio_gang_tree_free(&zio->io_gang_tree);
3135
3136         /*
3137          * Godfather I/Os should never suspend.
3138          */
3139         if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
3140             (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
3141                 zio->io_reexecute = 0;
3142
3143         if (zio->io_reexecute) {
3144                 /*
3145                  * This is a logical I/O that wants to reexecute.
3146                  *
3147                  * Reexecute is top-down.  When an i/o fails, if it's not
3148                  * the root, it simply notifies its parent and sticks around.
3149                  * The parent, seeing that it still has children in zio_done(),
3150                  * does the same.  This percolates all the way up to the root.
3151                  * The root i/o will reexecute or suspend the entire tree.
3152                  *
3153                  * This approach ensures that zio_reexecute() honors
3154                  * all the original i/o dependency relationships, e.g.
3155                  * parents not executing until children are ready.
3156                  */
3157                 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
3158
3159                 zio->io_gang_leader = NULL;
3160
3161                 mutex_enter(&zio->io_lock);
3162                 zio->io_state[ZIO_WAIT_DONE] = 1;
3163                 mutex_exit(&zio->io_lock);
3164
3165                 /*
3166                  * "The Godfather" I/O monitors its children but is
3167                  * not a true parent to them. It will track them through
3168                  * the pipeline but severs its ties whenever they get into
3169                  * trouble (e.g. suspended). This allows "The Godfather"
3170                  * I/O to return status without blocking.
3171                  */
3172                 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
3173                         zio_link_t *zl = zio->io_walk_link;
3174                         pio_next = zio_walk_parents(zio);
3175
3176                         if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
3177                             (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
3178                                 zio_remove_child(pio, zio, zl);
3179                                 zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
3180                         }
3181                 }
3182
3183                 if ((pio = zio_unique_parent(zio)) != NULL) {
3184                         /*
3185                          * We're not a root i/o, so there's nothing to do
3186                          * but notify our parent.  Don't propagate errors
3187                          * upward since we haven't permanently failed yet.
3188                          */
3189                         ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
3190                         zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
3191                         zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
3192                 } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
3193                         /*
3194                          * We'd fail again if we reexecuted now, so suspend
3195                          * until conditions improve (e.g. device comes online).
3196                          */
3197                         zio_suspend(zio->io_spa, zio);
3198                 } else {
3199                         /*
3200                          * Reexecution is potentially a huge amount of work.
3201                          * Hand it off to the otherwise-unused claim taskq.
3202                          */
3203                         ASSERT(taskq_empty_ent(&zio->io_tqent));
3204                         spa_taskq_dispatch_ent(zio->io_spa,
3205                             ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE,
3206                             (task_func_t *)zio_reexecute, zio, 0,
3207                             &zio->io_tqent);
3208                 }
3209                 return (ZIO_PIPELINE_STOP);
3210         }
3211
3212         ASSERT(zio->io_child_count == 0);
3213         ASSERT(zio->io_reexecute == 0);
3214         ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
3215
3216         /*
3217          * Report any checksum errors, since the I/O is complete.
3218          */
3219         while (zio->io_cksum_report != NULL) {
3220                 zio_cksum_report_t *zcr = zio->io_cksum_report;
3221                 zio->io_cksum_report = zcr->zcr_next;
3222                 zcr->zcr_next = NULL;
3223                 zcr->zcr_finish(zcr, NULL);
3224                 zfs_ereport_free_checksum(zcr);
3225         }
3226
3227         if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp &&
3228             !BP_IS_HOLE(zio->io_bp) && !(zio->io_flags & ZIO_FLAG_NOPWRITE)) {
3229                 metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp);
3230         }
3231
3232         /*
3233          * It is the responsibility of the done callback to ensure that this
3234          * particular zio is no longer discoverable for adoption, and as
3235          * such, cannot acquire any new parents.
3236          */
3237         if (zio->io_done)
3238                 zio->io_done(zio);
3239
3240         mutex_enter(&zio->io_lock);
3241         zio->io_state[ZIO_WAIT_DONE] = 1;
3242         mutex_exit(&zio->io_lock);
3243
3244         for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
3245                 zio_link_t *zl = zio->io_walk_link;
3246                 pio_next = zio_walk_parents(zio);
3247                 zio_remove_child(pio, zio, zl);
3248                 zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
3249         }
3250
3251         if (zio->io_waiter != NULL) {
3252                 mutex_enter(&zio->io_lock);
3253                 zio->io_executor = NULL;
3254                 cv_broadcast(&zio->io_cv);
3255                 mutex_exit(&zio->io_lock);
3256         } else {
3257                 zio_destroy(zio);
3258         }
3259
3260         return (ZIO_PIPELINE_STOP);
3261 }
3262
3263 /*
3264  * ==========================================================================
3265  * I/O pipeline definition
3266  * ==========================================================================
3267  */
/*
 * Handler table for the I/O pipeline: one zio_pipe_stage_t function
 * pointer per slot.  Slot 0 is NULL (it has no handler) and the final
 * slot is zio_done.
 *
 * NOTE(review): the position of each entry presumably has to line up with
 * the stage numbering used by the code that indexes this table -- confirm
 * against the stage definitions before adding, removing or reordering
 * entries.
 */
static zio_pipe_stage_t *zio_pipeline[] = {
        NULL,
        zio_read_bp_init,
        zio_free_bp_init,
        zio_issue_async,
        zio_write_bp_init,
        zio_checksum_generate,
        zio_nop_write,
        zio_ddt_read_start,
        zio_ddt_read_done,
        zio_ddt_write,
        zio_ddt_free,
        zio_gang_assemble,
        zio_gang_issue,
        zio_dva_allocate,
        zio_dva_free,
        zio_dva_claim,
        zio_ready,
        zio_vdev_io_start,
        zio_vdev_io_done,
        zio_vdev_io_assess,
        zio_checksum_verify,
        zio_done
};
3292
3293 /* dnp is the dnode for zb1->zb_object */
3294 boolean_t
3295 zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1,
3296     const zbookmark_t *zb2)
3297 {
3298         uint64_t zb1nextL0, zb2thisobj;
3299
3300         ASSERT(zb1->zb_objset == zb2->zb_objset);
3301         ASSERT(zb2->zb_level == 0);
3302
3303         /*
3304          * A bookmark in the deadlist is considered to be after
3305          * everything else.
3306          */
3307         if (zb2->zb_object == DMU_DEADLIST_OBJECT)
3308                 return (B_TRUE);
3309
3310         /* The objset_phys_t isn't before anything. */
3311         if (dnp == NULL)
3312                 return (B_FALSE);
3313
3314         zb1nextL0 = (zb1->zb_blkid + 1) <<
3315             ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
3316
3317         zb2thisobj = zb2->zb_object ? zb2->zb_object :
3318             zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
3319
3320         if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
3321                 uint64_t nextobj = zb1nextL0 *
3322                     (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
3323                 return (nextobj <= zb2thisobj);
3324         }
3325
3326         if (zb1->zb_object < zb2thisobj)
3327                 return (B_TRUE);
3328         if (zb1->zb_object > zb2thisobj)
3329                 return (B_FALSE);
3330         if (zb2->zb_object == DMU_META_DNODE_OBJECT)
3331                 return (B_FALSE);
3332         return (zb1nextL0 <= zb2->zb_blkid);
3333 }
3334
/*
 * Everything below is compiled only when both _KERNEL and HAVE_SPL are
 * defined: it exports zio symbols and registers tunables as module
 * parameters.
 */
#if defined(_KERNEL) && defined(HAVE_SPL)
/* Fault injection */
EXPORT_SYMBOL(zio_injection_enabled);
EXPORT_SYMBOL(zio_inject_fault);
EXPORT_SYMBOL(zio_inject_list_next);
EXPORT_SYMBOL(zio_clear_fault);
EXPORT_SYMBOL(zio_handle_fault_injection);
EXPORT_SYMBOL(zio_handle_device_injection);
EXPORT_SYMBOL(zio_handle_label_injection);
/* Table of I/O type names (defined at the top of this file). */
EXPORT_SYMBOL(zio_type_name);

/*
 * Tunables.  Each one is registered as an int with permission argument
 * 0644; the accompanying MODULE_PARM_DESC string is its description.
 */
module_param(zio_bulk_flags, int, 0644);
MODULE_PARM_DESC(zio_bulk_flags, "Additional flags to pass to bulk buffers");

module_param(zio_delay_max, int, 0644);
MODULE_PARM_DESC(zio_delay_max, "Max zio millisec delay before posting event");

module_param(zio_requeue_io_start_cut_in_line, int, 0644);
MODULE_PARM_DESC(zio_requeue_io_start_cut_in_line, "Prioritize requeued I/O");

module_param(zfs_sync_pass_deferred_free, int, 0644);
MODULE_PARM_DESC(zfs_sync_pass_deferred_free,
        "Defer frees starting in this pass");

module_param(zfs_sync_pass_dont_compress, int, 0644);
MODULE_PARM_DESC(zfs_sync_pass_dont_compress,
        "Don't compress starting in this pass");

module_param(zfs_sync_pass_rewrite, int, 0644);
MODULE_PARM_DESC(zfs_sync_pass_rewrite,
        "Rewrite new bps starting in this pass");
#endif /* _KERNEL && HAVE_SPL */