]> git.proxmox.com Git - mirror_zfs.git/blame - module/zfs/zio.c
Restrict kstats and print real pointers
[mirror_zfs.git] / module / zfs / zio.c
CommitLineData
34dc7c2f
BB
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
428870ff 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
492f64e9 23 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
a38718a6 24 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
cc99f275 25 * Copyright (c) 2017, Intel Corporation.
34dc7c2f
BB
26 */
27
f1512ee6 28#include <sys/sysmacros.h>
34dc7c2f
BB
29#include <sys/zfs_context.h>
30#include <sys/fm/fs/zfs.h>
31#include <sys/spa.h>
32#include <sys/txg.h>
33#include <sys/spa_impl.h>
34#include <sys/vdev_impl.h>
1b939560 35#include <sys/vdev_trim.h>
34dc7c2f
BB
36#include <sys/zio_impl.h>
37#include <sys/zio_compress.h>
38#include <sys/zio_checksum.h>
428870ff
BB
39#include <sys/dmu_objset.h>
40#include <sys/arc.h>
41#include <sys/ddt.h>
9b67f605 42#include <sys/blkptr.h>
b0bc7a84 43#include <sys/zfeature.h>
d4a72f23 44#include <sys/dsl_scan.h>
3dfb57a3 45#include <sys/metaslab_impl.h>
193a37cb 46#include <sys/time.h>
26ef0cc7 47#include <sys/trace_zio.h>
a6255b7f 48#include <sys/abd.h>
b5256303 49#include <sys/dsl_crypt.h>
492f64e9 50#include <sys/cityhash.h>
34dc7c2f 51
34dc7c2f
BB
52/*
53 * ==========================================================================
54 * I/O type descriptions
55 * ==========================================================================
56 */
e8b96c60 57const char *zio_type_name[ZIO_TYPES] = {
3dfb57a3
DB
58 /*
59 * Note: Linux kernel thread name length is limited
60 * so these names will differ from upstream open zfs.
61 */
1b939560 62 "z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl", "z_trim"
428870ff 63};
34dc7c2f 64
27f2b90d 65int zio_dva_throttle_enabled = B_TRUE;
638dd5f4 66int zio_deadman_log_all = B_FALSE;
3dfb57a3 67
34dc7c2f
BB
68/*
69 * ==========================================================================
70 * I/O kmem caches
71 * ==========================================================================
72 */
73kmem_cache_t *zio_cache;
d164b209 74kmem_cache_t *zio_link_cache;
34dc7c2f
BB
75kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
76kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
a6255b7f
DQ
77#if defined(ZFS_DEBUG) && !defined(_KERNEL)
78uint64_t zio_buf_cache_allocs[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
79uint64_t zio_buf_cache_frees[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
80#endif
81
ad796b8a
TH
82/* Mark IOs as "slow" if they take longer than 30 seconds */
83int zio_slow_io_ms = (30 * MILLISEC);
34dc7c2f 84
fcff0f35
PD
85#define BP_SPANB(indblkshift, level) \
86 (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
87#define COMPARE_META_LEVEL 0x80000000ul
55d85d5a
GW
88/*
89 * The following actions directly effect the spa's sync-to-convergence logic.
90 * The values below define the sync pass when we start performing the action.
91 * Care should be taken when changing these values as they directly impact
92 * spa_sync() performance. Tuning these values may introduce subtle performance
93 * pathologies and should only be done in the context of performance analysis.
94 * These tunables will eventually be removed and replaced with #defines once
95 * enough analysis has been done to determine optimal values.
96 *
97 * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
98 * regular blocks are not deferred.
99 */
100int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
101int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
102int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
103
34dc7c2f 104/*
b128c09f
BB
105 * An allocating zio is one that either currently has the DVA allocate
106 * stage set or will have it later in its lifetime.
34dc7c2f 107 */
428870ff
BB
108#define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)
109
c409e464 110int zio_requeue_io_start_cut_in_line = 1;
428870ff
BB
111
112#ifdef ZFS_DEBUG
113int zio_buf_debug_limit = 16384;
114#else
115int zio_buf_debug_limit = 0;
116#endif
34dc7c2f 117
da6b4005
NB
118static inline void __zio_execute(zio_t *zio);
119
3dfb57a3
DB
120static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t);
121
34dc7c2f
BB
122void
123zio_init(void)
124{
125 size_t c;
126 vmem_t *data_alloc_arena = NULL;
127
3941503c
BB
128 zio_cache = kmem_cache_create("zio_cache",
129 sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
d164b209 130 zio_link_cache = kmem_cache_create("zio_link_cache",
6795a698 131 sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
34dc7c2f
BB
132
133 /*
134 * For small buffers, we want a cache for each multiple of
f1512ee6
MA
135 * SPA_MINBLOCKSIZE. For larger buffers, we want a cache
136 * for each quarter-power of 2.
34dc7c2f
BB
137 */
138 for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
139 size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
140 size_t p2 = size;
141 size_t align = 0;
6442f3cf 142 size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0;
34dc7c2f 143
34328f3c 144#if defined(_ILP32) && defined(_KERNEL)
f1512ee6
MA
145 /*
146 * Cache size limited to 1M on 32-bit platforms until ARC
147 * buffers no longer require virtual address space.
148 */
149 if (size > zfs_max_recordsize)
150 break;
151#endif
152
153 while (!ISP2(p2))
34dc7c2f
BB
154 p2 &= p2 - 1;
155
498877ba
MA
156#ifndef _KERNEL
157 /*
158 * If we are using watchpoints, put each buffer on its own page,
159 * to eliminate the performance overhead of trapping to the
160 * kernel when modifying a non-watched buffer that shares the
161 * page with a watched buffer.
162 */
163 if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
164 continue;
fcf64f45
BB
165 /*
166 * Here's the problem - on 4K native devices in userland on
167 * Linux using O_DIRECT, buffers must be 4K aligned or I/O
168 * will fail with EINVAL, causing zdb (and others) to coredump.
169 * Since userland probably doesn't need optimized buffer caches,
170 * we just force 4K alignment on everything.
171 */
172 align = 8 * SPA_MINBLOCKSIZE;
173#else
24fa2034 174 if (size < PAGESIZE) {
34dc7c2f 175 align = SPA_MINBLOCKSIZE;
498877ba 176 } else if (IS_P2ALIGNED(size, p2 >> 2)) {
24fa2034 177 align = PAGESIZE;
34dc7c2f 178 }
fcf64f45 179#endif
34dc7c2f
BB
180
181 if (align != 0) {
182 char name[36];
183 (void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
184 zio_buf_cache[c] = kmem_cache_create(name, size,
6442f3cf 185 align, NULL, NULL, NULL, NULL, NULL, cflags);
34dc7c2f
BB
186
187 (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
188 zio_data_buf_cache[c] = kmem_cache_create(name, size,
ae6ba3db 189 align, NULL, NULL, NULL, NULL,
6442f3cf 190 data_alloc_arena, cflags);
34dc7c2f
BB
191 }
192 }
193
194 while (--c != 0) {
195 ASSERT(zio_buf_cache[c] != NULL);
196 if (zio_buf_cache[c - 1] == NULL)
197 zio_buf_cache[c - 1] = zio_buf_cache[c];
198
199 ASSERT(zio_data_buf_cache[c] != NULL);
200 if (zio_data_buf_cache[c - 1] == NULL)
201 zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
202 }
203
34dc7c2f 204 zio_inject_init();
9759c60f
ED
205
206 lz4_init();
34dc7c2f
BB
207}
208
209void
210zio_fini(void)
211{
212 size_t c;
213 kmem_cache_t *last_cache = NULL;
214 kmem_cache_t *last_data_cache = NULL;
215
216 for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
f1512ee6
MA
217#ifdef _ILP32
218 /*
219 * Cache size limited to 1M on 32-bit platforms until ARC
220 * buffers no longer require virtual address space.
221 */
222 if (((c + 1) << SPA_MINBLOCKSHIFT) > zfs_max_recordsize)
223 break;
a6255b7f
DQ
224#endif
225#if defined(ZFS_DEBUG) && !defined(_KERNEL)
226 if (zio_buf_cache_allocs[c] != zio_buf_cache_frees[c])
227 (void) printf("zio_fini: [%d] %llu != %llu\n",
228 (int)((c + 1) << SPA_MINBLOCKSHIFT),
229 (long long unsigned)zio_buf_cache_allocs[c],
230 (long long unsigned)zio_buf_cache_frees[c]);
f1512ee6 231#endif
34dc7c2f
BB
232 if (zio_buf_cache[c] != last_cache) {
233 last_cache = zio_buf_cache[c];
234 kmem_cache_destroy(zio_buf_cache[c]);
235 }
236 zio_buf_cache[c] = NULL;
237
238 if (zio_data_buf_cache[c] != last_data_cache) {
239 last_data_cache = zio_data_buf_cache[c];
240 kmem_cache_destroy(zio_data_buf_cache[c]);
241 }
242 zio_data_buf_cache[c] = NULL;
243 }
244
d164b209 245 kmem_cache_destroy(zio_link_cache);
34dc7c2f
BB
246 kmem_cache_destroy(zio_cache);
247
248 zio_inject_fini();
9759c60f
ED
249
250 lz4_fini();
34dc7c2f
BB
251}
252
253/*
254 * ==========================================================================
255 * Allocate and free I/O buffers
256 * ==========================================================================
257 */
258
259/*
260 * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a
261 * crashdump if the kernel panics, so use it judiciously. Obviously, it's
262 * useful to inspect ZFS metadata, but if possible, we should avoid keeping
263 * excess / transient data in-core during a crashdump.
264 */
265void *
266zio_buf_alloc(size_t size)
267{
268 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
269
63e3a861 270 VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
a6255b7f
DQ
271#if defined(ZFS_DEBUG) && !defined(_KERNEL)
272 atomic_add_64(&zio_buf_cache_allocs[c], 1);
273#endif
34dc7c2f 274
efcd79a8 275 return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
34dc7c2f
BB
276}
277
278/*
279 * Use zio_data_buf_alloc to allocate data. The data will not appear in a
280 * crashdump if the kernel panics. This exists so that we will limit the amount
281 * of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount
282 * of kernel heap dumped to disk when the kernel panics)
283 */
284void *
285zio_data_buf_alloc(size_t size)
286{
287 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
288
63e3a861 289 VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
34dc7c2f 290
efcd79a8 291 return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
34dc7c2f
BB
292}
293
294void
295zio_buf_free(void *buf, size_t size)
296{
297 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
298
63e3a861 299 VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
a6255b7f
DQ
300#if defined(ZFS_DEBUG) && !defined(_KERNEL)
301 atomic_add_64(&zio_buf_cache_frees[c], 1);
302#endif
34dc7c2f
BB
303
304 kmem_cache_free(zio_buf_cache[c], buf);
305}
306
307void
308zio_data_buf_free(void *buf, size_t size)
309{
310 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
311
63e3a861 312 VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
34dc7c2f
BB
313
314 kmem_cache_free(zio_data_buf_cache[c], buf);
315}
316
84c07ada
GN
317static void
318zio_abd_free(void *abd, size_t size)
319{
320 abd_free((abd_t *)abd);
321}
322
34dc7c2f
BB
323/*
324 * ==========================================================================
325 * Push and pop I/O transform buffers
326 * ==========================================================================
327 */
d3c2ae1c 328void
a6255b7f 329zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize,
e9aa730c 330 zio_transform_func_t *transform)
34dc7c2f 331{
79c76d5b 332 zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
34dc7c2f 333
a6255b7f
DQ
334 /*
335 * Ensure that anyone expecting this zio to contain a linear ABD isn't
336 * going to get a nasty surprise when they try to access the data.
337 */
338 IMPLY(abd_is_linear(zio->io_abd), abd_is_linear(data));
339
340 zt->zt_orig_abd = zio->io_abd;
b128c09f 341 zt->zt_orig_size = zio->io_size;
34dc7c2f 342 zt->zt_bufsize = bufsize;
b128c09f 343 zt->zt_transform = transform;
34dc7c2f
BB
344
345 zt->zt_next = zio->io_transform_stack;
346 zio->io_transform_stack = zt;
347
a6255b7f 348 zio->io_abd = data;
34dc7c2f
BB
349 zio->io_size = size;
350}
351
d3c2ae1c 352void
b128c09f 353zio_pop_transforms(zio_t *zio)
34dc7c2f 354{
b128c09f
BB
355 zio_transform_t *zt;
356
357 while ((zt = zio->io_transform_stack) != NULL) {
358 if (zt->zt_transform != NULL)
359 zt->zt_transform(zio,
a6255b7f 360 zt->zt_orig_abd, zt->zt_orig_size);
34dc7c2f 361
428870ff 362 if (zt->zt_bufsize != 0)
a6255b7f 363 abd_free(zio->io_abd);
34dc7c2f 364
a6255b7f 365 zio->io_abd = zt->zt_orig_abd;
b128c09f
BB
366 zio->io_size = zt->zt_orig_size;
367 zio->io_transform_stack = zt->zt_next;
34dc7c2f 368
b128c09f 369 kmem_free(zt, sizeof (zio_transform_t));
34dc7c2f
BB
370 }
371}
372
b128c09f
BB
373/*
374 * ==========================================================================
b5256303 375 * I/O transform callbacks for subblocks, decompression, and decryption
b128c09f
BB
376 * ==========================================================================
377 */
378static void
a6255b7f 379zio_subblock(zio_t *zio, abd_t *data, uint64_t size)
b128c09f
BB
380{
381 ASSERT(zio->io_size > size);
382
383 if (zio->io_type == ZIO_TYPE_READ)
a6255b7f 384 abd_copy(data, zio->io_abd, size);
b128c09f
BB
385}
386
387static void
a6255b7f 388zio_decompress(zio_t *zio, abd_t *data, uint64_t size)
b128c09f 389{
a6255b7f
DQ
390 if (zio->io_error == 0) {
391 void *tmp = abd_borrow_buf(data, size);
392 int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
393 zio->io_abd, tmp, zio->io_size, size);
394 abd_return_buf_copy(data, tmp, size);
395
c3bd3fb4
TC
396 if (zio_injection_enabled && ret == 0)
397 ret = zio_handle_fault_injection(zio, EINVAL);
398
a6255b7f
DQ
399 if (ret != 0)
400 zio->io_error = SET_ERROR(EIO);
401 }
b128c09f
BB
402}
403
b5256303
TC
404static void
405zio_decrypt(zio_t *zio, abd_t *data, uint64_t size)
406{
407 int ret;
408 void *tmp;
409 blkptr_t *bp = zio->io_bp;
ae76f45c
TC
410 spa_t *spa = zio->io_spa;
411 uint64_t dsobj = zio->io_bookmark.zb_objset;
b5256303
TC
412 uint64_t lsize = BP_GET_LSIZE(bp);
413 dmu_object_type_t ot = BP_GET_TYPE(bp);
414 uint8_t salt[ZIO_DATA_SALT_LEN];
415 uint8_t iv[ZIO_DATA_IV_LEN];
416 uint8_t mac[ZIO_DATA_MAC_LEN];
417 boolean_t no_crypt = B_FALSE;
418
419 ASSERT(BP_USES_CRYPT(bp));
420 ASSERT3U(size, !=, 0);
421
422 if (zio->io_error != 0)
423 return;
424
425 /*
426 * Verify the cksum of MACs stored in an indirect bp. It will always
427 * be possible to verify this since it does not require an encryption
428 * key.
429 */
430 if (BP_HAS_INDIRECT_MAC_CKSUM(bp)) {
431 zio_crypt_decode_mac_bp(bp, mac);
432
433 if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
434 /*
435 * We haven't decompressed the data yet, but
436 * zio_crypt_do_indirect_mac_checksum() requires
437 * decompressed data to be able to parse out the MACs
438 * from the indirect block. We decompress it now and
439 * throw away the result after we are finished.
440 */
441 tmp = zio_buf_alloc(lsize);
442 ret = zio_decompress_data(BP_GET_COMPRESS(bp),
443 zio->io_abd, tmp, zio->io_size, lsize);
444 if (ret != 0) {
445 ret = SET_ERROR(EIO);
446 goto error;
447 }
448 ret = zio_crypt_do_indirect_mac_checksum(B_FALSE,
449 tmp, lsize, BP_SHOULD_BYTESWAP(bp), mac);
450 zio_buf_free(tmp, lsize);
451 } else {
452 ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE,
453 zio->io_abd, size, BP_SHOULD_BYTESWAP(bp), mac);
454 }
455 abd_copy(data, zio->io_abd, size);
456
be9a5c35
TC
457 if (zio_injection_enabled && ot != DMU_OT_DNODE && ret == 0) {
458 ret = zio_handle_decrypt_injection(spa,
459 &zio->io_bookmark, ot, ECKSUM);
460 }
b5256303
TC
461 if (ret != 0)
462 goto error;
463
464 return;
465 }
466
467 /*
468 * If this is an authenticated block, just check the MAC. It would be
469 * nice to separate this out into its own flag, but for the moment
470 * enum zio_flag is out of bits.
471 */
472 if (BP_IS_AUTHENTICATED(bp)) {
473 if (ot == DMU_OT_OBJSET) {
ae76f45c
TC
474 ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa,
475 dsobj, zio->io_abd, size, BP_SHOULD_BYTESWAP(bp));
b5256303
TC
476 } else {
477 zio_crypt_decode_mac_bp(bp, mac);
ae76f45c
TC
478 ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj,
479 zio->io_abd, size, mac);
be9a5c35
TC
480 if (zio_injection_enabled && ret == 0) {
481 ret = zio_handle_decrypt_injection(spa,
482 &zio->io_bookmark, ot, ECKSUM);
483 }
b5256303
TC
484 }
485 abd_copy(data, zio->io_abd, size);
486
487 if (ret != 0)
488 goto error;
489
490 return;
491 }
492
493 zio_crypt_decode_params_bp(bp, salt, iv);
494
495 if (ot == DMU_OT_INTENT_LOG) {
496 tmp = abd_borrow_buf_copy(zio->io_abd, sizeof (zil_chain_t));
497 zio_crypt_decode_mac_zil(tmp, mac);
498 abd_return_buf(zio->io_abd, tmp, sizeof (zil_chain_t));
499 } else {
500 zio_crypt_decode_mac_bp(bp, mac);
501 }
502
be9a5c35
TC
503 ret = spa_do_crypt_abd(B_FALSE, spa, &zio->io_bookmark, BP_GET_TYPE(bp),
504 BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, mac, size, data,
505 zio->io_abd, &no_crypt);
b5256303
TC
506 if (no_crypt)
507 abd_copy(data, zio->io_abd, size);
508
509 if (ret != 0)
510 goto error;
511
512 return;
513
514error:
515 /* assert that the key was found unless this was speculative */
be9a5c35 516 ASSERT(ret != EACCES || (zio->io_flags & ZIO_FLAG_SPECULATIVE));
b5256303
TC
517
518 /*
519 * If there was a decryption / authentication error return EIO as
520 * the io_error. If this was not a speculative zio, create an ereport.
521 */
522 if (ret == ECKSUM) {
a2c2ed1b 523 zio->io_error = SET_ERROR(EIO);
b5256303 524 if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
be9a5c35 525 spa_log_error(spa, &zio->io_bookmark);
b5256303 526 zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
ae76f45c 527 spa, NULL, &zio->io_bookmark, zio, 0, 0);
b5256303
TC
528 }
529 } else {
530 zio->io_error = ret;
531 }
532}
533
b128c09f
BB
534/*
535 * ==========================================================================
536 * I/O parent/child relationships and pipeline interlocks
537 * ==========================================================================
538 */
d164b209 539zio_t *
3dfb57a3 540zio_walk_parents(zio_t *cio, zio_link_t **zl)
d164b209 541{
d164b209 542 list_t *pl = &cio->io_parent_list;
b128c09f 543
3dfb57a3
DB
544 *zl = (*zl == NULL) ? list_head(pl) : list_next(pl, *zl);
545 if (*zl == NULL)
d164b209
BB
546 return (NULL);
547
3dfb57a3
DB
548 ASSERT((*zl)->zl_child == cio);
549 return ((*zl)->zl_parent);
d164b209
BB
550}
551
552zio_t *
3dfb57a3 553zio_walk_children(zio_t *pio, zio_link_t **zl)
d164b209 554{
d164b209
BB
555 list_t *cl = &pio->io_child_list;
556
a8b2e306
TC
557 ASSERT(MUTEX_HELD(&pio->io_lock));
558
3dfb57a3
DB
559 *zl = (*zl == NULL) ? list_head(cl) : list_next(cl, *zl);
560 if (*zl == NULL)
d164b209
BB
561 return (NULL);
562
3dfb57a3
DB
563 ASSERT((*zl)->zl_parent == pio);
564 return ((*zl)->zl_child);
d164b209
BB
565}
566
567zio_t *
568zio_unique_parent(zio_t *cio)
569{
3dfb57a3
DB
570 zio_link_t *zl = NULL;
571 zio_t *pio = zio_walk_parents(cio, &zl);
d164b209 572
3dfb57a3 573 VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL);
d164b209
BB
574 return (pio);
575}
576
577void
578zio_add_child(zio_t *pio, zio_t *cio)
b128c09f 579{
79c76d5b 580 zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
d164b209
BB
581
582 /*
583 * Logical I/Os can have logical, gang, or vdev children.
584 * Gang I/Os can have gang or vdev children.
585 * Vdev I/Os can only have vdev children.
586 * The following ASSERT captures all of these constraints.
587 */
1ce23dca 588 ASSERT3S(cio->io_child_type, <=, pio->io_child_type);
d164b209
BB
589
590 zl->zl_parent = pio;
591 zl->zl_child = cio;
592
b128c09f 593 mutex_enter(&pio->io_lock);
a8b2e306 594 mutex_enter(&cio->io_lock);
d164b209
BB
595
596 ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
597
1c27024e 598 for (int w = 0; w < ZIO_WAIT_TYPES; w++)
d164b209
BB
599 pio->io_children[cio->io_child_type][w] += !cio->io_state[w];
600
601 list_insert_head(&pio->io_child_list, zl);
602 list_insert_head(&cio->io_parent_list, zl);
603
428870ff
BB
604 pio->io_child_count++;
605 cio->io_parent_count++;
606
d164b209 607 mutex_exit(&cio->io_lock);
a8b2e306 608 mutex_exit(&pio->io_lock);
b128c09f
BB
609}
610
34dc7c2f 611static void
d164b209 612zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
b128c09f 613{
d164b209
BB
614 ASSERT(zl->zl_parent == pio);
615 ASSERT(zl->zl_child == cio);
b128c09f
BB
616
617 mutex_enter(&pio->io_lock);
a8b2e306 618 mutex_enter(&cio->io_lock);
d164b209
BB
619
620 list_remove(&pio->io_child_list, zl);
621 list_remove(&cio->io_parent_list, zl);
622
428870ff
BB
623 pio->io_child_count--;
624 cio->io_parent_count--;
625
d164b209 626 mutex_exit(&cio->io_lock);
a8b2e306 627 mutex_exit(&pio->io_lock);
d164b209 628 kmem_cache_free(zio_link_cache, zl);
b128c09f
BB
629}
630
631static boolean_t
ddc751d5 632zio_wait_for_children(zio_t *zio, uint8_t childbits, enum zio_wait_type wait)
34dc7c2f 633{
b128c09f
BB
634 boolean_t waiting = B_FALSE;
635
636 mutex_enter(&zio->io_lock);
637 ASSERT(zio->io_stall == NULL);
ddc751d5
GW
638 for (int c = 0; c < ZIO_CHILD_TYPES; c++) {
639 if (!(ZIO_CHILD_BIT_IS_SET(childbits, c)))
640 continue;
641
642 uint64_t *countp = &zio->io_children[c][wait];
643 if (*countp != 0) {
644 zio->io_stage >>= 1;
645 ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN);
646 zio->io_stall = countp;
647 waiting = B_TRUE;
648 break;
649 }
b128c09f
BB
650 }
651 mutex_exit(&zio->io_lock);
b128c09f
BB
652 return (waiting);
653}
34dc7c2f 654
bf701a83
BB
655__attribute__((always_inline))
656static inline void
62840030
MA
657zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait,
658 zio_t **next_to_executep)
b128c09f
BB
659{
660 uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
661 int *errorp = &pio->io_child_error[zio->io_child_type];
34dc7c2f 662
b128c09f
BB
663 mutex_enter(&pio->io_lock);
664 if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
665 *errorp = zio_worst_error(*errorp, zio->io_error);
666 pio->io_reexecute |= zio->io_reexecute;
667 ASSERT3U(*countp, >, 0);
e8b96c60
MA
668
669 (*countp)--;
670
671 if (*countp == 0 && pio->io_stall == countp) {
3dfb57a3
DB
672 zio_taskq_type_t type =
673 pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE :
674 ZIO_TASKQ_INTERRUPT;
b128c09f
BB
675 pio->io_stall = NULL;
676 mutex_exit(&pio->io_lock);
62840030 677
3dfb57a3 678 /*
62840030
MA
679 * If we can tell the caller to execute this parent next, do
680 * so. Otherwise dispatch the parent zio as its own task.
681 *
682 * Having the caller execute the parent when possible reduces
683 * locking on the zio taskq's, reduces context switch
684 * overhead, and has no recursion penalty. Note that one
685 * read from disk typically causes at least 3 zio's: a
686 * zio_null(), the logical zio_read(), and then a physical
687 * zio. When the physical ZIO completes, we are able to call
688 * zio_done() on all 3 of these zio's from one invocation of
689 * zio_execute() by returning the parent back to
690 * zio_execute(). Since the parent isn't executed until this
691 * thread returns back to zio_execute(), the caller should do
692 * so promptly.
693 *
694 * In other cases, dispatching the parent prevents
695 * overflowing the stack when we have deeply nested
696 * parent-child relationships, as we do with the "mega zio"
697 * of writes for spa_sync(), and the chain of ZIL blocks.
3dfb57a3 698 */
62840030
MA
699 if (next_to_executep != NULL && *next_to_executep == NULL) {
700 *next_to_executep = pio;
701 } else {
702 zio_taskq_dispatch(pio, type, B_FALSE);
703 }
b128c09f
BB
704 } else {
705 mutex_exit(&pio->io_lock);
34dc7c2f
BB
706 }
707}
708
b128c09f
BB
709static void
710zio_inherit_child_errors(zio_t *zio, enum zio_child c)
711{
712 if (zio->io_child_error[c] != 0 && zio->io_error == 0)
713 zio->io_error = zio->io_child_error[c];
714}
715
3dfb57a3 716int
64fc7762 717zio_bookmark_compare(const void *x1, const void *x2)
3dfb57a3
DB
718{
719 const zio_t *z1 = x1;
720 const zio_t *z2 = x2;
3dfb57a3 721
64fc7762
MA
722 if (z1->io_bookmark.zb_objset < z2->io_bookmark.zb_objset)
723 return (-1);
724 if (z1->io_bookmark.zb_objset > z2->io_bookmark.zb_objset)
725 return (1);
3dfb57a3 726
64fc7762
MA
727 if (z1->io_bookmark.zb_object < z2->io_bookmark.zb_object)
728 return (-1);
729 if (z1->io_bookmark.zb_object > z2->io_bookmark.zb_object)
730 return (1);
3dfb57a3 731
64fc7762
MA
732 if (z1->io_bookmark.zb_level < z2->io_bookmark.zb_level)
733 return (-1);
734 if (z1->io_bookmark.zb_level > z2->io_bookmark.zb_level)
735 return (1);
736
737 if (z1->io_bookmark.zb_blkid < z2->io_bookmark.zb_blkid)
738 return (-1);
739 if (z1->io_bookmark.zb_blkid > z2->io_bookmark.zb_blkid)
740 return (1);
741
742 if (z1 < z2)
743 return (-1);
744 if (z1 > z2)
745 return (1);
746
747 return (0);
3dfb57a3
DB
748}
749
34dc7c2f
BB
750/*
751 * ==========================================================================
b128c09f 752 * Create the various types of I/O (read, write, free, etc)
34dc7c2f
BB
753 * ==========================================================================
754 */
755static zio_t *
428870ff 756zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
a6255b7f 757 abd_t *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done,
2aa34383
DK
758 void *private, zio_type_t type, zio_priority_t priority,
759 enum zio_flag flags, vdev_t *vd, uint64_t offset,
760 const zbookmark_phys_t *zb, enum zio_stage stage,
761 enum zio_stage pipeline)
34dc7c2f
BB
762{
763 zio_t *zio;
764
1b939560 765 IMPLY(type != ZIO_TYPE_TRIM, psize <= SPA_MAXBLOCKSIZE);
2aa34383 766 ASSERT(P2PHASE(psize, SPA_MINBLOCKSIZE) == 0);
b128c09f
BB
767 ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
768
769 ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
770 ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
771 ASSERT(vd || stage == ZIO_STAGE_OPEN);
34dc7c2f 772
b5256303 773 IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW_COMPRESS) != 0);
2aa34383 774
79c76d5b 775 zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
3941503c
BB
776 bzero(zio, sizeof (zio_t));
777
448d7aaa 778 mutex_init(&zio->io_lock, NULL, MUTEX_NOLOCKDEP, NULL);
3941503c
BB
779 cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
780
781 list_create(&zio->io_parent_list, sizeof (zio_link_t),
782 offsetof(zio_link_t, zl_parent_node));
783 list_create(&zio->io_child_list, sizeof (zio_link_t),
784 offsetof(zio_link_t, zl_child_node));
4e21fd06 785 metaslab_trace_init(&zio->io_alloc_list);
d164b209 786
b128c09f
BB
787 if (vd != NULL)
788 zio->io_child_type = ZIO_CHILD_VDEV;
789 else if (flags & ZIO_FLAG_GANG_CHILD)
790 zio->io_child_type = ZIO_CHILD_GANG;
428870ff
BB
791 else if (flags & ZIO_FLAG_DDT_CHILD)
792 zio->io_child_type = ZIO_CHILD_DDT;
b128c09f
BB
793 else
794 zio->io_child_type = ZIO_CHILD_LOGICAL;
795
34dc7c2f 796 if (bp != NULL) {
428870ff 797 zio->io_bp = (blkptr_t *)bp;
34dc7c2f
BB
798 zio->io_bp_copy = *bp;
799 zio->io_bp_orig = *bp;
428870ff
BB
800 if (type != ZIO_TYPE_WRITE ||
801 zio->io_child_type == ZIO_CHILD_DDT)
b128c09f 802 zio->io_bp = &zio->io_bp_copy; /* so caller can free */
9babb374 803 if (zio->io_child_type == ZIO_CHILD_LOGICAL)
b128c09f 804 zio->io_logical = zio;
9babb374
BB
805 if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
806 pipeline |= ZIO_GANG_STAGES;
34dc7c2f 807 }
b128c09f
BB
808
809 zio->io_spa = spa;
810 zio->io_txg = txg;
34dc7c2f
BB
811 zio->io_done = done;
812 zio->io_private = private;
813 zio->io_type = type;
814 zio->io_priority = priority;
b128c09f
BB
815 zio->io_vd = vd;
816 zio->io_offset = offset;
a6255b7f 817 zio->io_orig_abd = zio->io_abd = data;
2aa34383
DK
818 zio->io_orig_size = zio->io_size = psize;
819 zio->io_lsize = lsize;
b128c09f
BB
820 zio->io_orig_flags = zio->io_flags = flags;
821 zio->io_orig_stage = zio->io_stage = stage;
822 zio->io_orig_pipeline = zio->io_pipeline = pipeline;
3dfb57a3 823 zio->io_pipeline_trace = ZIO_STAGE_OPEN;
34dc7c2f 824
d164b209
BB
825 zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
826 zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);
827
b128c09f
BB
828 if (zb != NULL)
829 zio->io_bookmark = *zb;
830
831 if (pio != NULL) {
cc99f275
DB
832 if (zio->io_metaslab_class == NULL)
833 zio->io_metaslab_class = pio->io_metaslab_class;
b128c09f 834 if (zio->io_logical == NULL)
34dc7c2f 835 zio->io_logical = pio->io_logical;
9babb374
BB
836 if (zio->io_child_type == ZIO_CHILD_GANG)
837 zio->io_gang_leader = pio->io_gang_leader;
b128c09f 838 zio_add_child(pio, zio);
34dc7c2f
BB
839 }
840
a38718a6
GA
841 taskq_init_ent(&zio->io_tqent);
842
34dc7c2f
BB
843 return (zio);
844}
845
846static void
b128c09f 847zio_destroy(zio_t *zio)
34dc7c2f 848{
4e21fd06 849 metaslab_trace_fini(&zio->io_alloc_list);
3941503c
BB
850 list_destroy(&zio->io_parent_list);
851 list_destroy(&zio->io_child_list);
852 mutex_destroy(&zio->io_lock);
853 cv_destroy(&zio->io_cv);
b128c09f 854 kmem_cache_free(zio_cache, zio);
34dc7c2f
BB
855}
856
857zio_t *
d164b209 858zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
428870ff 859 void *private, enum zio_flag flags)
34dc7c2f
BB
860{
861 zio_t *zio;
862
2aa34383 863 zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private,
d164b209 864 ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
b128c09f 865 ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);
34dc7c2f
BB
866
867 return (zio);
868}
869
870zio_t *
428870ff 871zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
34dc7c2f 872{
d164b209 873 return (zio_null(NULL, spa, NULL, done, private, flags));
34dc7c2f
BB
874}
875
63e3a861
MA
876void
877zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp)
878{
63e3a861
MA
879 if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) {
880 zfs_panic_recover("blkptr at %p has invalid TYPE %llu",
881 bp, (longlong_t)BP_GET_TYPE(bp));
882 }
883 if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS ||
884 BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) {
885 zfs_panic_recover("blkptr at %p has invalid CHECKSUM %llu",
886 bp, (longlong_t)BP_GET_CHECKSUM(bp));
887 }
888 if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS ||
889 BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) {
890 zfs_panic_recover("blkptr at %p has invalid COMPRESS %llu",
891 bp, (longlong_t)BP_GET_COMPRESS(bp));
892 }
893 if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) {
894 zfs_panic_recover("blkptr at %p has invalid LSIZE %llu",
895 bp, (longlong_t)BP_GET_LSIZE(bp));
896 }
897 if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) {
898 zfs_panic_recover("blkptr at %p has invalid PSIZE %llu",
899 bp, (longlong_t)BP_GET_PSIZE(bp));
900 }
901
902 if (BP_IS_EMBEDDED(bp)) {
903 if (BPE_GET_ETYPE(bp) > NUM_BP_EMBEDDED_TYPES) {
904 zfs_panic_recover("blkptr at %p has invalid ETYPE %llu",
905 bp, (longlong_t)BPE_GET_ETYPE(bp));
906 }
907 }
908
6cb8e530
PZ
909 /*
910 * Do not verify individual DVAs if the config is not trusted. This
911 * will be done once the zio is executed in vdev_mirror_map_alloc.
912 */
913 if (!spa->spa_trust_config)
914 return;
915
63e3a861
MA
916 /*
917 * Pool-specific checks.
918 *
919 * Note: it would be nice to verify that the blk_birth and
920 * BP_PHYSICAL_BIRTH() are not too large. However, spa_freeze()
921 * allows the birth time of log blocks (and dmu_sync()-ed blocks
922 * that are in the log) to be arbitrarily large.
923 */
1c27024e 924 for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
63e3a861 925 uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]);
1c27024e 926
63e3a861
MA
927 if (vdevid >= spa->spa_root_vdev->vdev_children) {
928 zfs_panic_recover("blkptr at %p DVA %u has invalid "
929 "VDEV %llu",
930 bp, i, (longlong_t)vdevid);
ee3a23b8 931 continue;
63e3a861 932 }
1c27024e 933 vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid];
63e3a861
MA
934 if (vd == NULL) {
935 zfs_panic_recover("blkptr at %p DVA %u has invalid "
936 "VDEV %llu",
937 bp, i, (longlong_t)vdevid);
ee3a23b8 938 continue;
63e3a861
MA
939 }
940 if (vd->vdev_ops == &vdev_hole_ops) {
941 zfs_panic_recover("blkptr at %p DVA %u has hole "
942 "VDEV %llu",
943 bp, i, (longlong_t)vdevid);
ee3a23b8 944 continue;
63e3a861
MA
945 }
946 if (vd->vdev_ops == &vdev_missing_ops) {
947 /*
948 * "missing" vdevs are valid during import, but we
949 * don't have their detailed info (e.g. asize), so
950 * we can't perform any more checks on them.
951 */
952 continue;
953 }
1c27024e
DB
954 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
955 uint64_t asize = DVA_GET_ASIZE(&bp->blk_dva[i]);
63e3a861
MA
956 if (BP_IS_GANG(bp))
957 asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
958 if (offset + asize > vd->vdev_asize) {
959 zfs_panic_recover("blkptr at %p DVA %u has invalid "
960 "OFFSET %llu",
961 bp, i, (longlong_t)offset);
962 }
963 }
964}
965
6cb8e530
PZ
966boolean_t
967zfs_dva_valid(spa_t *spa, const dva_t *dva, const blkptr_t *bp)
968{
969 uint64_t vdevid = DVA_GET_VDEV(dva);
970
971 if (vdevid >= spa->spa_root_vdev->vdev_children)
972 return (B_FALSE);
973
974 vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid];
975 if (vd == NULL)
976 return (B_FALSE);
977
978 if (vd->vdev_ops == &vdev_hole_ops)
979 return (B_FALSE);
980
981 if (vd->vdev_ops == &vdev_missing_ops) {
982 return (B_FALSE);
983 }
984
985 uint64_t offset = DVA_GET_OFFSET(dva);
986 uint64_t asize = DVA_GET_ASIZE(dva);
987
988 if (BP_IS_GANG(bp))
989 asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
990 if (offset + asize > vd->vdev_asize)
991 return (B_FALSE);
992
993 return (B_TRUE);
994}
995
34dc7c2f 996zio_t *
b128c09f 997zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
a6255b7f 998 abd_t *data, uint64_t size, zio_done_func_t *done, void *private,
5dbd68a3 999 zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
34dc7c2f
BB
1000{
1001 zio_t *zio;
1002
63e3a861
MA
1003 zfs_blkptr_verify(spa, bp);
1004
428870ff 1005 zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
2aa34383 1006 data, size, size, done, private,
b128c09f 1007 ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
428870ff
BB
1008 ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
1009 ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);
34dc7c2f 1010
b128c09f
BB
1011 return (zio);
1012}
34dc7c2f 1013
34dc7c2f 1014zio_t *
b128c09f 1015zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
a6255b7f 1016 abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp,
bc77ba73
PD
1017 zio_done_func_t *ready, zio_done_func_t *children_ready,
1018 zio_done_func_t *physdone, zio_done_func_t *done,
1019 void *private, zio_priority_t priority, enum zio_flag flags,
1020 const zbookmark_phys_t *zb)
34dc7c2f
BB
1021{
1022 zio_t *zio;
1023
b128c09f
BB
1024 ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
1025 zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
1026 zp->zp_compress >= ZIO_COMPRESS_OFF &&
1027 zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
9ae529ec 1028 DMU_OT_IS_VALID(zp->zp_type) &&
b128c09f 1029 zp->zp_level < 32 &&
428870ff 1030 zp->zp_copies > 0 &&
03c6040b 1031 zp->zp_copies <= spa_max_replication(spa));
34dc7c2f 1032
2aa34383 1033 zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private,
b128c09f 1034 ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
428870ff
BB
1035 ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
1036 ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
34dc7c2f
BB
1037
1038 zio->io_ready = ready;
bc77ba73 1039 zio->io_children_ready = children_ready;
e8b96c60 1040 zio->io_physdone = physdone;
b128c09f 1041 zio->io_prop = *zp;
34dc7c2f 1042
9b67f605
MA
1043 /*
1044 * Data can be NULL if we are going to call zio_write_override() to
1045 * provide the already-allocated BP. But we may need the data to
1046 * verify a dedup hit (if requested). In this case, don't try to
b5256303
TC
1047 * dedup (just take the already-allocated BP verbatim). Encrypted
1048 * dedup blocks need data as well so we also disable dedup in this
1049 * case.
9b67f605 1050 */
b5256303
TC
1051 if (data == NULL &&
1052 (zio->io_prop.zp_dedup_verify || zio->io_prop.zp_encrypt)) {
9b67f605
MA
1053 zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
1054 }
1055
34dc7c2f
BB
1056 return (zio);
1057}
1058
1059zio_t *
a6255b7f 1060zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data,
e8b96c60 1061 uint64_t size, zio_done_func_t *done, void *private,
5dbd68a3 1062 zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb)
34dc7c2f
BB
1063{
1064 zio_t *zio;
1065
2aa34383 1066 zio = zio_create(pio, spa, txg, bp, data, size, size, done, private,
3dfb57a3 1067 ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb,
b128c09f 1068 ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
34dc7c2f
BB
1069
1070 return (zio);
1071}
1072
428870ff 1073void
03c6040b 1074zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
428870ff
BB
1075{
1076 ASSERT(zio->io_type == ZIO_TYPE_WRITE);
1077 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1078 ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
1079 ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
1080
03c6040b
GW
1081 /*
1082 * We must reset the io_prop to match the values that existed
1083 * when the bp was first written by dmu_sync() keeping in mind
1084 * that nopwrite and dedup are mutually exclusive.
1085 */
1086 zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
1087 zio->io_prop.zp_nopwrite = nopwrite;
428870ff
BB
1088 zio->io_prop.zp_copies = copies;
1089 zio->io_bp_override = bp;
1090}
1091
1092void
1093zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
1094{
9b67f605 1095
a1d477c2
MA
1096 zfs_blkptr_verify(spa, bp);
1097
9b67f605
MA
1098 /*
1099 * The check for EMBEDDED is a performance optimization. We
1100 * process the free here (by ignoring it) rather than
1101 * putting it on the list and then processing it in zio_free_sync().
1102 */
1103 if (BP_IS_EMBEDDED(bp))
1104 return;
13fe0198 1105 metaslab_check_free(spa, bp);
2883cad5
MA
1106
1107 /*
1108 * Frees that are for the currently-syncing txg, are not going to be
1109 * deferred, and which will not need to do a read (i.e. not GANG or
1110 * DEDUP), can be processed immediately. Otherwise, put them on the
1111 * in-memory list for later processing.
1112 */
1113 if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
1114 txg != spa->spa_syncing_txg ||
1115 spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) {
1116 bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
1117 } else {
1118 VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp, 0)));
1119 }
428870ff
BB
1120}
1121
34dc7c2f 1122zio_t *
428870ff
BB
1123zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
1124 enum zio_flag flags)
34dc7c2f
BB
1125{
1126 zio_t *zio;
2883cad5 1127 enum zio_stage stage = ZIO_FREE_PIPELINE;
34dc7c2f 1128
428870ff
BB
1129 ASSERT(!BP_IS_HOLE(bp));
1130 ASSERT(spa_syncing_txg(spa) == txg);
55d85d5a 1131 ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);
34dc7c2f 1132
9b67f605
MA
1133 if (BP_IS_EMBEDDED(bp))
1134 return (zio_null(pio, spa, NULL, NULL, NULL, 0));
1135
13fe0198 1136 metaslab_check_free(spa, bp);
8c841793 1137 arc_freed(spa, bp);
d4a72f23 1138 dsl_scan_freed(spa, bp);
13fe0198 1139
2883cad5
MA
1140 /*
1141 * GANG and DEDUP blocks can induce a read (for the gang block header,
1142 * or the DDT), so issue them asynchronously so that this thread is
1143 * not tied up.
1144 */
1145 if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
1146 stage |= ZIO_STAGE_ISSUE_ASYNC;
1147
b128c09f 1148 zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
2aa34383
DK
1149 BP_GET_PSIZE(bp), NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW,
1150 flags, NULL, 0, NULL, ZIO_STAGE_OPEN, stage);
2883cad5 1151
34dc7c2f
BB
1152 return (zio);
1153}
1154
1155zio_t *
428870ff
BB
1156zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
1157 zio_done_func_t *done, void *private, enum zio_flag flags)
34dc7c2f
BB
1158{
1159 zio_t *zio;
1160
a1d477c2 1161 zfs_blkptr_verify(spa, bp);
9b67f605
MA
1162
1163 if (BP_IS_EMBEDDED(bp))
1164 return (zio_null(pio, spa, NULL, NULL, NULL, 0));
1165
34dc7c2f
BB
1166 /*
1167 * A claim is an allocation of a specific block. Claims are needed
1168 * to support immediate writes in the intent log. The issue is that
1169 * immediate writes contain committed data, but in a txg that was
1170 * *not* committed. Upon opening the pool after an unclean shutdown,
1171 * the intent log claims all blocks that contain immediate write data
1172 * so that the SPA knows they're in use.
1173 *
1174 * All claims *must* be resolved in the first txg -- before the SPA
1175 * starts allocating blocks -- so that nothing is allocated twice.
428870ff 1176 * If txg == 0 we just verify that the block is claimable.
34dc7c2f 1177 */
d2734cce
SD
1178 ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <,
1179 spa_min_claim_txg(spa));
1180 ASSERT(txg == spa_min_claim_txg(spa) || txg == 0);
428870ff 1181 ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */
34dc7c2f 1182
b128c09f 1183 zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
2aa34383
DK
1184 BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW,
1185 flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
3dfb57a3 1186 ASSERT0(zio->io_queued_timestamp);
34dc7c2f
BB
1187
1188 return (zio);
1189}
1190
1191zio_t *
1192zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
e8b96c60 1193 zio_done_func_t *done, void *private, enum zio_flag flags)
34dc7c2f
BB
1194{
1195 zio_t *zio;
1196 int c;
1197
1198 if (vd->vdev_children == 0) {
2aa34383 1199 zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private,
e8b96c60 1200 ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
34dc7c2f
BB
1201 ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
1202
34dc7c2f
BB
1203 zio->io_cmd = cmd;
1204 } else {
d164b209 1205 zio = zio_null(pio, spa, NULL, NULL, NULL, flags);
34dc7c2f
BB
1206
1207 for (c = 0; c < vd->vdev_children; c++)
1208 zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
e8b96c60 1209 done, private, flags));
34dc7c2f
BB
1210 }
1211
1212 return (zio);
1213}
1214
1b939560
BB
1215zio_t *
1216zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
1217 zio_done_func_t *done, void *private, zio_priority_t priority,
1218 enum zio_flag flags, enum trim_flag trim_flags)
1219{
1220 zio_t *zio;
1221
1222 ASSERT0(vd->vdev_children);
1223 ASSERT0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
1224 ASSERT0(P2PHASE(size, 1ULL << vd->vdev_ashift));
1225 ASSERT3U(size, !=, 0);
1226
1227 zio = zio_create(pio, vd->vdev_spa, 0, NULL, NULL, size, size, done,
1228 private, ZIO_TYPE_TRIM, priority, flags | ZIO_FLAG_PHYSICAL,
1229 vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_TRIM_PIPELINE);
1230 zio->io_trim_flags = trim_flags;
1231
1232 return (zio);
1233}
1234
34dc7c2f
BB
1235zio_t *
1236zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
a6255b7f 1237 abd_t *data, int checksum, zio_done_func_t *done, void *private,
e8b96c60 1238 zio_priority_t priority, enum zio_flag flags, boolean_t labels)
34dc7c2f
BB
1239{
1240 zio_t *zio;
34dc7c2f 1241
b128c09f
BB
1242 ASSERT(vd->vdev_children == 0);
1243 ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
1244 offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
1245 ASSERT3U(offset + size, <=, vd->vdev_psize);
34dc7c2f 1246
2aa34383
DK
1247 zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done,
1248 private, ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd,
1249 offset, NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
34dc7c2f 1250
b128c09f 1251 zio->io_prop.zp_checksum = checksum;
34dc7c2f
BB
1252
1253 return (zio);
1254}
1255
1256zio_t *
1257zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
a6255b7f 1258 abd_t *data, int checksum, zio_done_func_t *done, void *private,
e8b96c60 1259 zio_priority_t priority, enum zio_flag flags, boolean_t labels)
34dc7c2f 1260{
34dc7c2f 1261 zio_t *zio;
34dc7c2f 1262
b128c09f
BB
1263 ASSERT(vd->vdev_children == 0);
1264 ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
1265 offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
1266 ASSERT3U(offset + size, <=, vd->vdev_psize);
34dc7c2f 1267
2aa34383
DK
1268 zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done,
1269 private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd,
1270 offset, NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
34dc7c2f 1271
b128c09f 1272 zio->io_prop.zp_checksum = checksum;
34dc7c2f 1273
3c67d83a 1274 if (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
34dc7c2f 1275 /*
428870ff 1276 * zec checksums are necessarily destructive -- they modify
b128c09f 1277 * the end of the write buffer to hold the verifier/checksum.
34dc7c2f 1278 * Therefore, we must make a local copy in case the data is
b128c09f 1279 * being written to multiple places in parallel.
34dc7c2f 1280 */
a6255b7f
DQ
1281 abd_t *wbuf = abd_alloc_sametype(data, size);
1282 abd_copy(wbuf, data, size);
1283
b128c09f 1284 zio_push_transform(zio, wbuf, size, size, NULL);
34dc7c2f
BB
1285 }
1286
1287 return (zio);
1288}
1289
1290/*
b128c09f 1291 * Create a child I/O to do some work for us.
34dc7c2f
BB
1292 */
1293zio_t *
b128c09f 1294zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
4ea3f864
GM
1295 abd_t *data, uint64_t size, int type, zio_priority_t priority,
1296 enum zio_flag flags, zio_done_func_t *done, void *private)
34dc7c2f 1297{
428870ff 1298 enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
b128c09f
BB
1299 zio_t *zio;
1300
a1d477c2
MA
1301 /*
1302 * vdev child I/Os do not propagate their error to the parent.
1303 * Therefore, for correct operation the caller *must* check for
1304 * and handle the error in the child i/o's done callback.
1305 * The only exceptions are i/os that we don't care about
1306 * (OPTIONAL or REPAIR).
1307 */
1308 ASSERT((flags & ZIO_FLAG_OPTIONAL) || (flags & ZIO_FLAG_IO_REPAIR) ||
1309 done != NULL);
1310
34dc7c2f
BB
1311 if (type == ZIO_TYPE_READ && bp != NULL) {
1312 /*
1313 * If we have the bp, then the child should perform the
1314 * checksum and the parent need not. This pushes error
1315 * detection as close to the leaves as possible and
1316 * eliminates redundant checksums in the interior nodes.
1317 */
428870ff
BB
1318 pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
1319 pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
34dc7c2f
BB
1320 }
1321
a1d477c2
MA
1322 if (vd->vdev_ops->vdev_op_leaf) {
1323 ASSERT0(vd->vdev_children);
b128c09f 1324 offset += VDEV_LABEL_START_SIZE;
a1d477c2 1325 }
b128c09f 1326
a1d477c2 1327 flags |= ZIO_VDEV_CHILD_FLAGS(pio);
428870ff
BB
1328
1329 /*
1330 * If we've decided to do a repair, the write is not speculative --
1331 * even if the original read was.
1332 */
1333 if (flags & ZIO_FLAG_IO_REPAIR)
1334 flags &= ~ZIO_FLAG_SPECULATIVE;
1335
3dfb57a3
DB
1336 /*
1337 * If we're creating a child I/O that is not associated with a
1338 * top-level vdev, then the child zio is not an allocating I/O.
1339 * If this is a retried I/O then we ignore it since we will
1340 * have already processed the original allocating I/O.
1341 */
1342 if (flags & ZIO_FLAG_IO_ALLOCATING &&
1343 (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) {
cc99f275
DB
1344 ASSERT(pio->io_metaslab_class != NULL);
1345 ASSERT(pio->io_metaslab_class->mc_alloc_throttle_enabled);
3dfb57a3
DB
1346 ASSERT(type == ZIO_TYPE_WRITE);
1347 ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE);
1348 ASSERT(!(flags & ZIO_FLAG_IO_REPAIR));
1349 ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) ||
1350 pio->io_child_type == ZIO_CHILD_GANG);
1351
1352 flags &= ~ZIO_FLAG_IO_ALLOCATING;
1353 }
1354
1355
2aa34383 1356 zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size,
428870ff
BB
1357 done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
1358 ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
3dfb57a3 1359 ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
34dc7c2f 1360
e8b96c60
MA
1361 zio->io_physdone = pio->io_physdone;
1362 if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
1363 zio->io_logical->io_phys_children++;
1364
b128c09f 1365 return (zio);
34dc7c2f
BB
1366}
1367
b128c09f 1368zio_t *
a6255b7f 1369zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size,
9e052db4 1370 zio_type_t type, zio_priority_t priority, enum zio_flag flags,
e9aa730c 1371 zio_done_func_t *done, void *private)
34dc7c2f 1372{
b128c09f 1373 zio_t *zio;
34dc7c2f 1374
b128c09f 1375 ASSERT(vd->vdev_ops->vdev_op_leaf);
34dc7c2f 1376
b128c09f 1377 zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
2aa34383 1378 data, size, size, done, private, type, priority,
e8b96c60 1379 flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
b128c09f 1380 vd, offset, NULL,
428870ff 1381 ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
34dc7c2f 1382
b128c09f 1383 return (zio);
34dc7c2f
BB
1384}
1385
1386void
b128c09f 1387zio_flush(zio_t *zio, vdev_t *vd)
34dc7c2f 1388{
b128c09f 1389 zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
e8b96c60 1390 NULL, NULL,
b128c09f 1391 ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
34dc7c2f
BB
1392}
1393
428870ff
BB
1394void
1395zio_shrink(zio_t *zio, uint64_t size)
1396{
1ce23dca
PS
1397 ASSERT3P(zio->io_executor, ==, NULL);
1398 ASSERT3U(zio->io_orig_size, ==, zio->io_size);
1399 ASSERT3U(size, <=, zio->io_size);
428870ff
BB
1400
1401 /*
1402 * We don't shrink for raidz because of problems with the
1403 * reconstruction when reading back less than the block size.
1404 * Note, BP_IS_RAIDZ() assumes no compression.
1405 */
1406 ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
2aa34383
DK
1407 if (!BP_IS_RAIDZ(zio->io_bp)) {
1408 /* we are not doing a raw write */
1409 ASSERT3U(zio->io_size, ==, zio->io_lsize);
1410 zio->io_orig_size = zio->io_size = zio->io_lsize = size;
1411 }
428870ff
BB
1412}
1413
34dc7c2f
BB
1414/*
1415 * ==========================================================================
b128c09f 1416 * Prepare to read and write logical blocks
34dc7c2f
BB
1417 * ==========================================================================
1418 */
b128c09f 1419
62840030 1420static zio_t *
b128c09f 1421zio_read_bp_init(zio_t *zio)
34dc7c2f 1422{
b128c09f 1423 blkptr_t *bp = zio->io_bp;
b5256303
TC
1424 uint64_t psize =
1425 BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
34dc7c2f 1426
a1d477c2
MA
1427 ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy);
1428
fb5f0bc8 1429 if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
9babb374 1430 zio->io_child_type == ZIO_CHILD_LOGICAL &&
b5256303 1431 !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) {
a6255b7f
DQ
1432 zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize),
1433 psize, psize, zio_decompress);
34dc7c2f 1434 }
34dc7c2f 1435
b5256303
TC
1436 if (((BP_IS_PROTECTED(bp) && !(zio->io_flags & ZIO_FLAG_RAW_ENCRYPT)) ||
1437 BP_HAS_INDIRECT_MAC_CKSUM(bp)) &&
1438 zio->io_child_type == ZIO_CHILD_LOGICAL) {
1439 zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize),
1440 psize, psize, zio_decrypt);
1441 }
1442
9b67f605 1443 if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
a6255b7f
DQ
1444 int psize = BPE_GET_PSIZE(bp);
1445 void *data = abd_borrow_buf(zio->io_abd, psize);
1446
9b67f605 1447 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
a6255b7f
DQ
1448 decode_embedded_bp_compressed(bp, data);
1449 abd_return_buf_copy(zio->io_abd, data, psize);
9b67f605
MA
1450 } else {
1451 ASSERT(!BP_IS_EMBEDDED(bp));
a1d477c2 1452 ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy);
9b67f605
MA
1453 }
1454
9ae529ec 1455 if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
b128c09f
BB
1456 zio->io_flags |= ZIO_FLAG_DONT_CACHE;
1457
428870ff
BB
1458 if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
1459 zio->io_flags |= ZIO_FLAG_DONT_CACHE;
1460
1461 if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
1462 zio->io_pipeline = ZIO_DDT_READ_PIPELINE;
1463
62840030 1464 return (zio);
34dc7c2f
BB
1465}
1466
62840030 1467static zio_t *
b128c09f 1468zio_write_bp_init(zio_t *zio)
34dc7c2f 1469{
b128c09f 1470 if (!IO_IS_ALLOCATING(zio))
62840030 1471 return (zio);
34dc7c2f 1472
428870ff
BB
1473 ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
1474
1475 if (zio->io_bp_override) {
3dfb57a3
DB
1476 blkptr_t *bp = zio->io_bp;
1477 zio_prop_t *zp = &zio->io_prop;
1478
428870ff
BB
1479 ASSERT(bp->blk_birth != zio->io_txg);
1480 ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
1481
1482 *bp = *zio->io_bp_override;
1483 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1484
9b67f605 1485 if (BP_IS_EMBEDDED(bp))
62840030 1486 return (zio);
9b67f605 1487
03c6040b
GW
1488 /*
1489 * If we've been overridden and nopwrite is set then
1490 * set the flag accordingly to indicate that a nopwrite
1491 * has already occurred.
1492 */
1493 if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
1494 ASSERT(!zp->zp_dedup);
3dfb57a3 1495 ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum);
03c6040b 1496 zio->io_flags |= ZIO_FLAG_NOPWRITE;
62840030 1497 return (zio);
03c6040b
GW
1498 }
1499
1500 ASSERT(!zp->zp_nopwrite);
1501
428870ff 1502 if (BP_IS_HOLE(bp) || !zp->zp_dedup)
62840030 1503 return (zio);
428870ff 1504
3c67d83a
TH
1505 ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags &
1506 ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify);
428870ff 1507
b5256303
TC
1508 if (BP_GET_CHECKSUM(bp) == zp->zp_checksum &&
1509 !zp->zp_encrypt) {
428870ff
BB
1510 BP_SET_DEDUP(bp, 1);
1511 zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
62840030 1512 return (zio);
428870ff 1513 }
3dfb57a3
DB
1514
1515 /*
1516 * We were unable to handle this as an override bp, treat
1517 * it as a regular write I/O.
1518 */
5511754b 1519 zio->io_bp_override = NULL;
3dfb57a3
DB
1520 *bp = zio->io_bp_orig;
1521 zio->io_pipeline = zio->io_orig_pipeline;
1522 }
1523
62840030 1524 return (zio);
3dfb57a3
DB
1525}
1526
62840030 1527static zio_t *
3dfb57a3
DB
1528zio_write_compress(zio_t *zio)
1529{
1530 spa_t *spa = zio->io_spa;
1531 zio_prop_t *zp = &zio->io_prop;
1532 enum zio_compress compress = zp->zp_compress;
1533 blkptr_t *bp = zio->io_bp;
1534 uint64_t lsize = zio->io_lsize;
1535 uint64_t psize = zio->io_size;
1536 int pass = 1;
1537
3dfb57a3
DB
1538 /*
1539 * If our children haven't all reached the ready stage,
1540 * wait for them and then repeat this pipeline stage.
1541 */
ddc751d5
GW
1542 if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT |
1543 ZIO_CHILD_GANG_BIT, ZIO_WAIT_READY)) {
62840030 1544 return (NULL);
ddc751d5 1545 }
3dfb57a3
DB
1546
1547 if (!IO_IS_ALLOCATING(zio))
62840030 1548 return (zio);
3dfb57a3
DB
1549
1550 if (zio->io_children_ready != NULL) {
1551 /*
1552 * Now that all our children are ready, run the callback
1553 * associated with this zio in case it wants to modify the
1554 * data to be written.
1555 */
1556 ASSERT3U(zp->zp_level, >, 0);
1557 zio->io_children_ready(zio);
428870ff 1558 }
34dc7c2f 1559
3dfb57a3
DB
1560 ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
1561 ASSERT(zio->io_bp_override == NULL);
1562
b0bc7a84 1563 if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
b128c09f
BB
1564 /*
1565 * We're rewriting an existing block, which means we're
1566 * working on behalf of spa_sync(). For spa_sync() to
1567 * converge, it must eventually be the case that we don't
1568 * have to allocate new blocks. But compression changes
1569 * the blocksize, which forces a reallocate, and makes
1570 * convergence take longer. Therefore, after the first
1571 * few passes, stop compressing to ensure convergence.
1572 */
428870ff
BB
1573 pass = spa_sync_pass(spa);
1574
1575 ASSERT(zio->io_txg == spa_syncing_txg(spa));
1576 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1577 ASSERT(!BP_GET_DEDUP(bp));
34dc7c2f 1578
55d85d5a 1579 if (pass >= zfs_sync_pass_dont_compress)
b128c09f 1580 compress = ZIO_COMPRESS_OFF;
34dc7c2f 1581
b128c09f 1582 /* Make sure someone doesn't change their mind on overwrites */
9b67f605 1583 ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp),
428870ff 1584 spa_max_replication(spa)) == BP_GET_NDVAS(bp));
b128c09f 1585 }
34dc7c2f 1586
2aa34383 1587 /* If it's a compressed write that is not raw, compress the buffer. */
b5256303
TC
1588 if (compress != ZIO_COMPRESS_OFF &&
1589 !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) {
428870ff 1590 void *cbuf = zio_buf_alloc(lsize);
a6255b7f 1591 psize = zio_compress_data(compress, zio->io_abd, cbuf, lsize);
428870ff 1592 if (psize == 0 || psize == lsize) {
b128c09f 1593 compress = ZIO_COMPRESS_OFF;
428870ff 1594 zio_buf_free(cbuf, lsize);
b5256303
TC
1595 } else if (!zp->zp_dedup && !zp->zp_encrypt &&
1596 psize <= BPE_PAYLOAD_SIZE &&
9b67f605
MA
1597 zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
1598 spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
1599 encode_embedded_bp_compressed(bp,
1600 cbuf, compress, lsize, psize);
1601 BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
1602 BP_SET_TYPE(bp, zio->io_prop.zp_type);
1603 BP_SET_LEVEL(bp, zio->io_prop.zp_level);
1604 zio_buf_free(cbuf, lsize);
1605 bp->blk_birth = zio->io_txg;
1606 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1607 ASSERT(spa_feature_is_active(spa,
1608 SPA_FEATURE_EMBEDDED_DATA));
62840030 1609 return (zio);
428870ff 1610 } else {
9b67f605 1611 /*
c3520e7f
MA
1612 * Round up compressed size up to the ashift
1613 * of the smallest-ashift device, and zero the tail.
1614 * This ensures that the compressed size of the BP
1615 * (and thus compressratio property) are correct,
1616 * in that we charge for the padding used to fill out
1617 * the last sector.
9b67f605 1618 */
c3520e7f 1619 ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
1c27024e 1620 size_t rounded = (size_t)P2ROUNDUP(psize,
c3520e7f
MA
1621 1ULL << spa->spa_min_ashift);
1622 if (rounded >= lsize) {
9b67f605
MA
1623 compress = ZIO_COMPRESS_OFF;
1624 zio_buf_free(cbuf, lsize);
c3520e7f 1625 psize = lsize;
9b67f605 1626 } else {
a6255b7f
DQ
1627 abd_t *cdata = abd_get_from_buf(cbuf, lsize);
1628 abd_take_ownership_of_buf(cdata, B_TRUE);
1629 abd_zero_off(cdata, psize, rounded - psize);
c3520e7f 1630 psize = rounded;
a6255b7f 1631 zio_push_transform(zio, cdata,
9b67f605
MA
1632 psize, lsize, NULL);
1633 }
b128c09f 1634 }
3dfb57a3
DB
1635
1636 /*
1637 * We were unable to handle this as an override bp, treat
1638 * it as a regular write I/O.
1639 */
1640 zio->io_bp_override = NULL;
1641 *bp = zio->io_bp_orig;
1642 zio->io_pipeline = zio->io_orig_pipeline;
1643
b1d21733
TC
1644 } else if ((zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) != 0 &&
1645 zp->zp_type == DMU_OT_DNODE) {
1646 /*
1647 * The DMU actually relies on the zio layer's compression
1648 * to free metadnode blocks that have had all contained
1649 * dnodes freed. As a result, even when doing a raw
1650 * receive, we must check whether the block can be compressed
1651 * to a hole.
1652 */
1653 psize = zio_compress_data(ZIO_COMPRESS_EMPTY,
1654 zio->io_abd, NULL, lsize);
1655 if (psize == 0)
1656 compress = ZIO_COMPRESS_OFF;
2aa34383
DK
1657 } else {
1658 ASSERT3U(psize, !=, 0);
b128c09f 1659 }
34dc7c2f 1660
b128c09f
BB
1661 /*
1662 * The final pass of spa_sync() must be all rewrites, but the first
1663 * few passes offer a trade-off: allocating blocks defers convergence,
1664 * but newly allocated blocks are sequential, so they can be written
1665 * to disk faster. Therefore, we allow the first few passes of
1666 * spa_sync() to allocate new blocks, but force rewrites after that.
1667 * There should only be a handful of blocks after pass 1 in any case.
1668 */
b0bc7a84
MG
1669 if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
1670 BP_GET_PSIZE(bp) == psize &&
55d85d5a 1671 pass >= zfs_sync_pass_rewrite) {
cc99f275 1672 VERIFY3U(psize, !=, 0);
1c27024e 1673 enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
cc99f275 1674
b128c09f
BB
1675 zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
1676 zio->io_flags |= ZIO_FLAG_IO_REWRITE;
1677 } else {
1678 BP_ZERO(bp);
1679 zio->io_pipeline = ZIO_WRITE_PIPELINE;
1680 }
34dc7c2f 1681
428870ff 1682 if (psize == 0) {
b0bc7a84
MG
1683 if (zio->io_bp_orig.blk_birth != 0 &&
1684 spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
1685 BP_SET_LSIZE(bp, lsize);
1686 BP_SET_TYPE(bp, zp->zp_type);
1687 BP_SET_LEVEL(bp, zp->zp_level);
1688 BP_SET_BIRTH(bp, zio->io_txg, 0);
1689 }
b128c09f
BB
1690 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1691 } else {
1692 ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
1693 BP_SET_LSIZE(bp, lsize);
b0bc7a84
MG
1694 BP_SET_TYPE(bp, zp->zp_type);
1695 BP_SET_LEVEL(bp, zp->zp_level);
428870ff 1696 BP_SET_PSIZE(bp, psize);
b128c09f
BB
1697 BP_SET_COMPRESS(bp, compress);
1698 BP_SET_CHECKSUM(bp, zp->zp_checksum);
428870ff 1699 BP_SET_DEDUP(bp, zp->zp_dedup);
b128c09f 1700 BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
428870ff
BB
1701 if (zp->zp_dedup) {
1702 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1703 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
b5256303
TC
1704 ASSERT(!zp->zp_encrypt ||
1705 DMU_OT_IS_ENCRYPTED(zp->zp_type));
428870ff
BB
1706 zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
1707 }
03c6040b
GW
1708 if (zp->zp_nopwrite) {
1709 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1710 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
1711 zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
1712 }
428870ff 1713 }
62840030 1714 return (zio);
428870ff
BB
1715}
1716
62840030 1717static zio_t *
428870ff
BB
1718zio_free_bp_init(zio_t *zio)
1719{
1720 blkptr_t *bp = zio->io_bp;
1721
1722 if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
1723 if (BP_GET_DEDUP(bp))
1724 zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
b128c09f 1725 }
34dc7c2f 1726
a1d477c2
MA
1727 ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy);
1728
62840030 1729 return (zio);
34dc7c2f
BB
1730}
1731
b128c09f
BB
1732/*
1733 * ==========================================================================
1734 * Execute the I/O pipeline
1735 * ==========================================================================
1736 */
1737
1738static void
7ef5e54e 1739zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
34dc7c2f 1740{
428870ff 1741 spa_t *spa = zio->io_spa;
b128c09f 1742 zio_type_t t = zio->io_type;
a38718a6 1743 int flags = (cutinline ? TQ_FRONT : 0);
34dc7c2f
BB
1744
1745 /*
9babb374
BB
1746 * If we're a config writer or a probe, the normal issue and
1747 * interrupt threads may all be blocked waiting for the config lock.
1748 * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
34dc7c2f 1749 */
9babb374 1750 if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
b128c09f 1751 t = ZIO_TYPE_NULL;
34dc7c2f
BB
1752
1753 /*
b128c09f 1754 * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
34dc7c2f 1755 */
b128c09f
BB
1756 if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
1757 t = ZIO_TYPE_NULL;
34dc7c2f 1758
428870ff 1759 /*
7ef5e54e
AL
1760 * If this is a high priority I/O, then use the high priority taskq if
1761 * available.
428870ff 1762 */
18b14b17
GW
1763 if ((zio->io_priority == ZIO_PRIORITY_NOW ||
1764 zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) &&
7ef5e54e 1765 spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
428870ff
BB
1766 q++;
1767
1768 ASSERT3U(q, <, ZIO_TASKQ_TYPES);
5cc556b4 1769
a38718a6
GA
1770 /*
1771 * NB: We are assuming that the zio can only be dispatched
1772 * to a single taskq at a time. It would be a grievous error
1773 * to dispatch the zio to another taskq at the same time.
1774 */
1775 ASSERT(taskq_empty_ent(&zio->io_tqent));
7ef5e54e
AL
1776 spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
1777 flags, &zio->io_tqent);
b128c09f 1778}
34dc7c2f 1779
b128c09f 1780static boolean_t
7ef5e54e 1781zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
b128c09f
BB
1782{
1783 kthread_t *executor = zio->io_executor;
1784 spa_t *spa = zio->io_spa;
34dc7c2f 1785
1c27024e 1786 for (zio_type_t t = 0; t < ZIO_TYPES; t++) {
7ef5e54e
AL
1787 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
1788 uint_t i;
1789 for (i = 0; i < tqs->stqs_count; i++) {
1790 if (taskq_member(tqs->stqs_taskq[i], executor))
1791 return (B_TRUE);
1792 }
1793 }
34dc7c2f 1794
b128c09f
BB
1795 return (B_FALSE);
1796}
34dc7c2f 1797
62840030 1798static zio_t *
b128c09f
BB
1799zio_issue_async(zio_t *zio)
1800{
428870ff 1801 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
b128c09f 1802
62840030 1803 return (NULL);
34dc7c2f
BB
1804}
1805
b128c09f
BB
1806void
1807zio_interrupt(zio_t *zio)
34dc7c2f 1808{
428870ff 1809 zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
b128c09f 1810}
34dc7c2f 1811
26ef0cc7
TH
1812void
1813zio_delay_interrupt(zio_t *zio)
1814{
1815 /*
1816 * The timeout_generic() function isn't defined in userspace, so
1817 * rather than trying to implement the function, the zio delay
1818 * functionality has been disabled for userspace builds.
1819 */
1820
1821#ifdef _KERNEL
1822 /*
1823 * If io_target_timestamp is zero, then no delay has been registered
1824 * for this IO, thus jump to the end of this function and "skip" the
1825 * delay; issuing it directly to the zio layer.
1826 */
1827 if (zio->io_target_timestamp != 0) {
1828 hrtime_t now = gethrtime();
1829
1830 if (now >= zio->io_target_timestamp) {
1831 /*
1832 * This IO has already taken longer than the target
1833 * delay to complete, so we don't want to delay it
1834 * any longer; we "miss" the delay and issue it
1835 * directly to the zio layer. This is likely due to
1836 * the target latency being set to a value less than
1837 * the underlying hardware can satisfy (e.g. delay
1838 * set to 1ms, but the disks take 10ms to complete an
1839 * IO request).
1840 */
1841
1842 DTRACE_PROBE2(zio__delay__miss, zio_t *, zio,
1843 hrtime_t, now);
1844
1845 zio_interrupt(zio);
1846 } else {
1847 taskqid_t tid;
1848 hrtime_t diff = zio->io_target_timestamp - now;
1849 clock_t expire_at_tick = ddi_get_lbolt() +
1850 NSEC_TO_TICK(diff);
1851
1852 DTRACE_PROBE3(zio__delay__hit, zio_t *, zio,
1853 hrtime_t, now, hrtime_t, diff);
1854
1855 if (NSEC_TO_TICK(diff) == 0) {
1856 /* Our delay is less than a jiffy - just spin */
1857 zfs_sleep_until(zio->io_target_timestamp);
f545b6ae 1858 zio_interrupt(zio);
26ef0cc7
TH
1859 } else {
1860 /*
1861 * Use taskq_dispatch_delay() in the place of
1862 * OpenZFS's timeout_generic().
1863 */
1864 tid = taskq_dispatch_delay(system_taskq,
02730c33 1865 (task_func_t *)zio_interrupt,
26ef0cc7 1866 zio, TQ_NOSLEEP, expire_at_tick);
48d3eb40 1867 if (tid == TASKQID_INVALID) {
26ef0cc7
TH
1868 /*
1869 * Couldn't allocate a task. Just
1870 * finish the zio without a delay.
1871 */
1872 zio_interrupt(zio);
1873 }
1874 }
1875 }
1876 return;
1877 }
1878#endif
1879 DTRACE_PROBE1(zio__delay__skip, zio_t *, zio);
1880 zio_interrupt(zio);
1881}
1882
8fb1ede1 1883static void
638dd5f4 1884zio_deadman_impl(zio_t *pio, int ziodepth)
8fb1ede1
BB
1885{
1886 zio_t *cio, *cio_next;
1887 zio_link_t *zl = NULL;
1888 vdev_t *vd = pio->io_vd;
1889
638dd5f4
TC
1890 if (zio_deadman_log_all || (vd != NULL && vd->vdev_ops->vdev_op_leaf)) {
1891 vdev_queue_t *vq = vd ? &vd->vdev_queue : NULL;
8fb1ede1
BB
1892 zbookmark_phys_t *zb = &pio->io_bookmark;
1893 uint64_t delta = gethrtime() - pio->io_timestamp;
1894 uint64_t failmode = spa_get_deadman_failmode(pio->io_spa);
1895
a887d653 1896 zfs_dbgmsg("slow zio[%d]: zio=%px timestamp=%llu "
8fb1ede1
BB
1897 "delta=%llu queued=%llu io=%llu "
1898 "path=%s last=%llu "
1899 "type=%d priority=%d flags=0x%x "
1900 "stage=0x%x pipeline=0x%x pipeline-trace=0x%x "
1901 "objset=%llu object=%llu level=%llu blkid=%llu "
1902 "offset=%llu size=%llu error=%d",
638dd5f4 1903 ziodepth, pio, pio->io_timestamp,
8fb1ede1 1904 delta, pio->io_delta, pio->io_delay,
638dd5f4 1905 vd ? vd->vdev_path : "NULL", vq ? vq->vq_io_complete_ts : 0,
8fb1ede1 1906 pio->io_type, pio->io_priority, pio->io_flags,
638dd5f4 1907 pio->io_stage, pio->io_pipeline, pio->io_pipeline_trace,
8fb1ede1
BB
1908 zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
1909 pio->io_offset, pio->io_size, pio->io_error);
1910 zfs_ereport_post(FM_EREPORT_ZFS_DEADMAN,
1911 pio->io_spa, vd, zb, pio, 0, 0);
1912
1913 if (failmode == ZIO_FAILURE_MODE_CONTINUE &&
1914 taskq_empty_ent(&pio->io_tqent)) {
1915 zio_interrupt(pio);
1916 }
1917 }
1918
1919 mutex_enter(&pio->io_lock);
1920 for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
1921 cio_next = zio_walk_children(pio, &zl);
638dd5f4 1922 zio_deadman_impl(cio, ziodepth + 1);
8fb1ede1
BB
1923 }
1924 mutex_exit(&pio->io_lock);
1925}
1926
1927/*
1928 * Log the critical information describing this zio and all of its children
1929 * using the zfs_dbgmsg() interface then post deadman event for the ZED.
1930 */
1931void
1932zio_deadman(zio_t *pio, char *tag)
1933{
1934 spa_t *spa = pio->io_spa;
1935 char *name = spa_name(spa);
1936
1937 if (!zfs_deadman_enabled || spa_suspended(spa))
1938 return;
1939
638dd5f4 1940 zio_deadman_impl(pio, 0);
8fb1ede1
BB
1941
1942 switch (spa_get_deadman_failmode(spa)) {
1943 case ZIO_FAILURE_MODE_WAIT:
1944 zfs_dbgmsg("%s waiting for hung I/O to pool '%s'", tag, name);
1945 break;
1946
1947 case ZIO_FAILURE_MODE_CONTINUE:
1948 zfs_dbgmsg("%s restarting hung I/O for pool '%s'", tag, name);
1949 break;
1950
1951 case ZIO_FAILURE_MODE_PANIC:
1952 fm_panic("%s determined I/O to pool '%s' is hung.", tag, name);
1953 break;
1954 }
1955}
1956
b128c09f
BB
1957/*
1958 * Execute the I/O pipeline until one of the following occurs:
1959 * (1) the I/O completes; (2) the pipeline stalls waiting for
1960 * dependent child I/Os; (3) the I/O issues, so we're waiting
1961 * for an I/O completion interrupt; (4) the I/O is delegated by
1962 * vdev-level caching or aggregation; (5) the I/O is deferred
1963 * due to vdev-level queueing; (6) the I/O is handed off to
1964 * another thread. In all cases, the pipeline stops whenever
8e07b99b 1965 * there's no CPU work; it never burns a thread in cv_wait_io().
b128c09f
BB
1966 *
1967 * There's no locking on io_stage because there's no legitimate way
1968 * for multiple threads to be attempting to process the same I/O.
1969 */
428870ff 1970static zio_pipe_stage_t *zio_pipeline[];
34dc7c2f 1971
da6b4005
NB
1972/*
1973 * zio_execute() is a wrapper around the static function
1974 * __zio_execute() so that we can force __zio_execute() to be
1975 * inlined. This reduces stack overhead which is important
1976 * because __zio_execute() is called recursively in several zio
1977 * code paths. zio_execute() itself cannot be inlined because
1978 * it is externally visible.
1979 */
b128c09f
BB
1980void
1981zio_execute(zio_t *zio)
da6b4005 1982{
92119cc2
BB
1983 fstrans_cookie_t cookie;
1984
1985 cookie = spl_fstrans_mark();
da6b4005 1986 __zio_execute(zio);
92119cc2 1987 spl_fstrans_unmark(cookie);
da6b4005
NB
1988}
1989
b58986ee
BB
1990/*
1991 * Used to determine if in the current context the stack is sized large
1992 * enough to allow zio_execute() to be called recursively. A minimum
1993 * stack size of 16K is required to avoid needing to re-dispatch the zio.
1994 */
1995boolean_t
1996zio_execute_stack_check(zio_t *zio)
1997{
1998#if !defined(HAVE_LARGE_STACKS)
1999 dsl_pool_t *dp = spa_get_dsl(zio->io_spa);
2000
2001 /* Executing in txg_sync_thread() context. */
2002 if (dp && curthread == dp->dp_tx.tx_sync_thread)
2003 return (B_TRUE);
2004
2005 /* Pool initialization outside of zio_taskq context. */
2006 if (dp && spa_is_initializing(dp->dp_spa) &&
2007 !zio_taskq_member(zio, ZIO_TASKQ_ISSUE) &&
2008 !zio_taskq_member(zio, ZIO_TASKQ_ISSUE_HIGH))
2009 return (B_TRUE);
2010#endif /* HAVE_LARGE_STACKS */
2011
2012 return (B_FALSE);
2013}
2014
da6b4005
NB
2015__attribute__((always_inline))
2016static inline void
2017__zio_execute(zio_t *zio)
b128c09f 2018{
3dfb57a3
DB
2019 ASSERT3U(zio->io_queued_timestamp, >, 0);
2020
b128c09f 2021 while (zio->io_stage < ZIO_STAGE_DONE) {
428870ff
BB
2022 enum zio_stage pipeline = zio->io_pipeline;
2023 enum zio_stage stage = zio->io_stage;
62840030
MA
2024
2025 zio->io_executor = curthread;
34dc7c2f 2026
b128c09f 2027 ASSERT(!MUTEX_HELD(&zio->io_lock));
428870ff
BB
2028 ASSERT(ISP2(stage));
2029 ASSERT(zio->io_stall == NULL);
34dc7c2f 2030
428870ff
BB
2031 do {
2032 stage <<= 1;
2033 } while ((stage & pipeline) == 0);
b128c09f
BB
2034
2035 ASSERT(stage <= ZIO_STAGE_DONE);
34dc7c2f
BB
2036
2037 /*
b128c09f
BB
2038 * If we are in interrupt context and this pipeline stage
2039 * will grab a config lock that is held across I/O,
428870ff
BB
2040 * or may wait for an I/O that needs an interrupt thread
2041 * to complete, issue async to avoid deadlock.
2042 *
2043 * For VDEV_IO_START, we cut in line so that the io will
2044 * be sent to disk promptly.
34dc7c2f 2045 */
91579709
BB
2046 if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
2047 zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
b58986ee
BB
2048 boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
2049 zio_requeue_io_start_cut_in_line : B_FALSE;
91579709
BB
2050 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
2051 return;
2052 }
2053
2054 /*
b58986ee
BB
2055 * If the current context doesn't have large enough stacks
2056 * the zio must be issued asynchronously to prevent overflow.
91579709 2057 */
b58986ee
BB
2058 if (zio_execute_stack_check(zio)) {
2059 boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
2060 zio_requeue_io_start_cut_in_line : B_FALSE;
428870ff 2061 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
b128c09f 2062 return;
34dc7c2f
BB
2063 }
2064
b128c09f 2065 zio->io_stage = stage;
3dfb57a3 2066 zio->io_pipeline_trace |= zio->io_stage;
34dc7c2f 2067
62840030
MA
2068 /*
2069 * The zio pipeline stage returns the next zio to execute
2070 * (typically the same as this one), or NULL if we should
2071 * stop.
2072 */
2073 zio = zio_pipeline[highbit64(stage) - 1](zio);
34dc7c2f 2074
62840030
MA
2075 if (zio == NULL)
2076 return;
b128c09f 2077 }
34dc7c2f
BB
2078}
2079
da6b4005 2080
b128c09f
BB
2081/*
2082 * ==========================================================================
2083 * Initiate I/O, either sync or async
2084 * ==========================================================================
2085 */
2086int
2087zio_wait(zio_t *zio)
34dc7c2f 2088{
8fb1ede1 2089 long timeout = MSEC_TO_TICK(zfs_deadman_ziotime_ms);
b128c09f 2090 int error;
34dc7c2f 2091
1ce23dca
PS
2092 ASSERT3S(zio->io_stage, ==, ZIO_STAGE_OPEN);
2093 ASSERT3P(zio->io_executor, ==, NULL);
34dc7c2f 2094
b128c09f 2095 zio->io_waiter = curthread;
3dfb57a3
DB
2096 ASSERT0(zio->io_queued_timestamp);
2097 zio->io_queued_timestamp = gethrtime();
34dc7c2f 2098
da6b4005 2099 __zio_execute(zio);
34dc7c2f 2100
b128c09f 2101 mutex_enter(&zio->io_lock);
8fb1ede1
BB
2102 while (zio->io_executor != NULL) {
2103 error = cv_timedwait_io(&zio->io_cv, &zio->io_lock,
2104 ddi_get_lbolt() + timeout);
2105
2106 if (zfs_deadman_enabled && error == -1 &&
2107 gethrtime() - zio->io_queued_timestamp >
2108 spa_deadman_ziotime(zio->io_spa)) {
2109 mutex_exit(&zio->io_lock);
2110 timeout = MSEC_TO_TICK(zfs_deadman_checktime_ms);
2111 zio_deadman(zio, FTAG);
2112 mutex_enter(&zio->io_lock);
2113 }
2114 }
b128c09f 2115 mutex_exit(&zio->io_lock);
34dc7c2f 2116
b128c09f
BB
2117 error = zio->io_error;
2118 zio_destroy(zio);
34dc7c2f 2119
b128c09f
BB
2120 return (error);
2121}
34dc7c2f 2122
b128c09f
BB
2123void
2124zio_nowait(zio_t *zio)
2125{
1ce23dca 2126 ASSERT3P(zio->io_executor, ==, NULL);
34dc7c2f 2127
d164b209
BB
2128 if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
2129 zio_unique_parent(zio) == NULL) {
8878261f
BB
2130 zio_t *pio;
2131
34dc7c2f 2132 /*
b128c09f 2133 * This is a logical async I/O with no parent to wait for it.
9babb374
BB
2134 * We add it to the spa_async_root_zio "Godfather" I/O which
2135 * will ensure they complete prior to unloading the pool.
34dc7c2f 2136 */
b128c09f 2137 spa_t *spa = zio->io_spa;
8878261f
BB
2138 kpreempt_disable();
2139 pio = spa->spa_async_zio_root[CPU_SEQID];
2140 kpreempt_enable();
9babb374 2141
8878261f 2142 zio_add_child(pio, zio);
b128c09f 2143 }
34dc7c2f 2144
3dfb57a3
DB
2145 ASSERT0(zio->io_queued_timestamp);
2146 zio->io_queued_timestamp = gethrtime();
da6b4005 2147 __zio_execute(zio);
b128c09f 2148}
34dc7c2f 2149
b128c09f
BB
2150/*
2151 * ==========================================================================
1ce23dca 2152 * Reexecute, cancel, or suspend/resume failed I/O
b128c09f
BB
2153 * ==========================================================================
2154 */
34dc7c2f 2155
b128c09f
BB
2156static void
2157zio_reexecute(zio_t *pio)
2158{
d164b209
BB
2159 zio_t *cio, *cio_next;
2160
2161 ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
2162 ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
9babb374
BB
2163 ASSERT(pio->io_gang_leader == NULL);
2164 ASSERT(pio->io_gang_tree == NULL);
34dc7c2f 2165
b128c09f
BB
2166 pio->io_flags = pio->io_orig_flags;
2167 pio->io_stage = pio->io_orig_stage;
2168 pio->io_pipeline = pio->io_orig_pipeline;
2169 pio->io_reexecute = 0;
03c6040b 2170 pio->io_flags |= ZIO_FLAG_REEXECUTED;
3dfb57a3 2171 pio->io_pipeline_trace = 0;
b128c09f 2172 pio->io_error = 0;
1c27024e 2173 for (int w = 0; w < ZIO_WAIT_TYPES; w++)
d164b209 2174 pio->io_state[w] = 0;
1c27024e 2175 for (int c = 0; c < ZIO_CHILD_TYPES; c++)
b128c09f 2176 pio->io_child_error[c] = 0;
34dc7c2f 2177
428870ff
BB
2178 if (IO_IS_ALLOCATING(pio))
2179 BP_ZERO(pio->io_bp);
34dc7c2f 2180
b128c09f
BB
2181 /*
2182 * As we reexecute pio's children, new children could be created.
d164b209 2183 * New children go to the head of pio's io_child_list, however,
b128c09f 2184 * so we will (correctly) not reexecute them. The key is that
d164b209
BB
2185 * the remainder of pio's io_child_list, from 'cio_next' onward,
2186 * cannot be affected by any side effects of reexecuting 'cio'.
b128c09f 2187 */
1c27024e 2188 zio_link_t *zl = NULL;
a8b2e306 2189 mutex_enter(&pio->io_lock);
3dfb57a3
DB
2190 for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
2191 cio_next = zio_walk_children(pio, &zl);
1c27024e 2192 for (int w = 0; w < ZIO_WAIT_TYPES; w++)
d164b209 2193 pio->io_children[cio->io_child_type][w]++;
b128c09f 2194 mutex_exit(&pio->io_lock);
d164b209 2195 zio_reexecute(cio);
a8b2e306 2196 mutex_enter(&pio->io_lock);
34dc7c2f 2197 }
a8b2e306 2198 mutex_exit(&pio->io_lock);
34dc7c2f 2199
b128c09f
BB
2200 /*
2201 * Now that all children have been reexecuted, execute the parent.
9babb374 2202 * We don't reexecute "The Godfather" I/O here as it's the
9e2c3bb4 2203 * responsibility of the caller to wait on it.
b128c09f 2204 */
3dfb57a3
DB
2205 if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) {
2206 pio->io_queued_timestamp = gethrtime();
da6b4005 2207 __zio_execute(pio);
3dfb57a3 2208 }
34dc7c2f
BB
2209}
2210
b128c09f 2211void
cec3a0a1 2212zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason)
34dc7c2f 2213{
b128c09f
BB
2214 if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
2215 fm_panic("Pool '%s' has encountered an uncorrectable I/O "
2216 "failure and the failure mode property for this pool "
2217 "is set to panic.", spa_name(spa));
34dc7c2f 2218
bf89c199
BB
2219 cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O "
2220 "failure and has been suspended.\n", spa_name(spa));
2221
b5256303
TC
2222 zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL,
2223 NULL, NULL, 0, 0);
34dc7c2f 2224
b128c09f 2225 mutex_enter(&spa->spa_suspend_lock);
34dc7c2f 2226
b128c09f 2227 if (spa->spa_suspend_zio_root == NULL)
9babb374
BB
2228 spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
2229 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
2230 ZIO_FLAG_GODFATHER);
34dc7c2f 2231
cec3a0a1 2232 spa->spa_suspended = reason;
34dc7c2f 2233
b128c09f 2234 if (zio != NULL) {
9babb374 2235 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
b128c09f
BB
2236 ASSERT(zio != spa->spa_suspend_zio_root);
2237 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
d164b209 2238 ASSERT(zio_unique_parent(zio) == NULL);
b128c09f
BB
2239 ASSERT(zio->io_stage == ZIO_STAGE_DONE);
2240 zio_add_child(spa->spa_suspend_zio_root, zio);
2241 }
34dc7c2f 2242
b128c09f
BB
2243 mutex_exit(&spa->spa_suspend_lock);
2244}
34dc7c2f 2245
9babb374 2246int
b128c09f
BB
2247zio_resume(spa_t *spa)
2248{
9babb374 2249 zio_t *pio;
34dc7c2f
BB
2250
2251 /*
b128c09f 2252 * Reexecute all previously suspended i/o.
34dc7c2f 2253 */
b128c09f 2254 mutex_enter(&spa->spa_suspend_lock);
cec3a0a1 2255 spa->spa_suspended = ZIO_SUSPEND_NONE;
b128c09f
BB
2256 cv_broadcast(&spa->spa_suspend_cv);
2257 pio = spa->spa_suspend_zio_root;
2258 spa->spa_suspend_zio_root = NULL;
2259 mutex_exit(&spa->spa_suspend_lock);
2260
2261 if (pio == NULL)
9babb374 2262 return (0);
34dc7c2f 2263
9babb374
BB
2264 zio_reexecute(pio);
2265 return (zio_wait(pio));
b128c09f
BB
2266}
2267
2268void
2269zio_resume_wait(spa_t *spa)
2270{
2271 mutex_enter(&spa->spa_suspend_lock);
2272 while (spa_suspended(spa))
2273 cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
2274 mutex_exit(&spa->spa_suspend_lock);
34dc7c2f
BB
2275}
2276
2277/*
2278 * ==========================================================================
b128c09f
BB
2279 * Gang blocks.
2280 *
2281 * A gang block is a collection of small blocks that looks to the DMU
2282 * like one large block. When zio_dva_allocate() cannot find a block
2283 * of the requested size, due to either severe fragmentation or the pool
2284 * being nearly full, it calls zio_write_gang_block() to construct the
2285 * block from smaller fragments.
2286 *
2287 * A gang block consists of a gang header (zio_gbh_phys_t) and up to
2288 * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like
2289 * an indirect block: it's an array of block pointers. It consumes
2290 * only one sector and hence is allocatable regardless of fragmentation.
2291 * The gang header's bps point to its gang members, which hold the data.
2292 *
2293 * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
2294 * as the verifier to ensure uniqueness of the SHA256 checksum.
2295 * Critically, the gang block bp's blk_cksum is the checksum of the data,
2296 * not the gang header. This ensures that data block signatures (needed for
2297 * deduplication) are independent of how the block is physically stored.
2298 *
2299 * Gang blocks can be nested: a gang member may itself be a gang block.
2300 * Thus every gang block is a tree in which root and all interior nodes are
2301 * gang headers, and the leaves are normal blocks that contain user data.
2302 * The root of the gang tree is called the gang leader.
2303 *
2304 * To perform any operation (read, rewrite, free, claim) on a gang block,
2305 * zio_gang_assemble() first assembles the gang tree (minus data leaves)
2306 * in the io_gang_tree field of the original logical i/o by recursively
2307 * reading the gang leader and all gang headers below it. This yields
2308 * an in-core tree containing the contents of every gang header and the
2309 * bps for every constituent of the gang block.
2310 *
2311 * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
2312 * and invokes a callback on each bp. To free a gang block, zio_gang_issue()
2313 * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
2314 * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
2315 * zio_read_gang() is a wrapper around zio_read() that omits reading gang
2316 * headers, since we already have those in io_gang_tree. zio_rewrite_gang()
2317 * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
2318 * of the gang header plus zio_checksum_compute() of the data to update the
2319 * gang header's blk_cksum as described above.
2320 *
2321 * The two-phase assemble/issue model solves the problem of partial failure --
2322 * what if you'd freed part of a gang block but then couldn't read the
2323 * gang header for another part? Assembling the entire gang tree first
2324 * ensures that all the necessary gang header I/O has succeeded before
2325 * starting the actual work of free, claim, or write. Once the gang tree
2326 * is assembled, free and claim are in-memory operations that cannot fail.
2327 *
2328 * In the event that a gang write fails, zio_dva_unallocate() walks the
2329 * gang tree to immediately free (i.e. insert back into the space map)
2330 * everything we've allocated. This ensures that we don't get ENOSPC
2331 * errors during repeated suspend/resume cycles due to a flaky device.
2332 *
2333 * Gang rewrites only happen during sync-to-convergence. If we can't assemble
2334 * the gang tree, we won't modify the block, so we can safely defer the free
2335 * (knowing that the block is still intact). If we *can* assemble the gang
2336 * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
2337 * each constituent bp and we can allocate a new block on the next sync pass.
2338 *
2339 * In all cases, the gang tree allows complete recovery from partial failure.
34dc7c2f
BB
2340 * ==========================================================================
2341 */
b128c09f 2342
a6255b7f
DQ
2343static void
2344zio_gang_issue_func_done(zio_t *zio)
2345{
2346 abd_put(zio->io_abd);
2347}
2348
b128c09f 2349static zio_t *
a6255b7f
DQ
2350zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
2351 uint64_t offset)
34dc7c2f 2352{
b128c09f
BB
2353 if (gn != NULL)
2354 return (pio);
34dc7c2f 2355
a6255b7f
DQ
2356 return (zio_read(pio, pio->io_spa, bp, abd_get_offset(data, offset),
2357 BP_GET_PSIZE(bp), zio_gang_issue_func_done,
2358 NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
b128c09f
BB
2359 &pio->io_bookmark));
2360}
2361
a6255b7f
DQ
2362static zio_t *
2363zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
2364 uint64_t offset)
b128c09f
BB
2365{
2366 zio_t *zio;
2367
2368 if (gn != NULL) {
a6255b7f
DQ
2369 abd_t *gbh_abd =
2370 abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
b128c09f 2371 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
a6255b7f
DQ
2372 gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL,
2373 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
2374 &pio->io_bookmark);
34dc7c2f 2375 /*
b128c09f
BB
2376 * As we rewrite each gang header, the pipeline will compute
2377 * a new gang block header checksum for it; but no one will
2378 * compute a new data checksum, so we do that here. The one
2379 * exception is the gang leader: the pipeline already computed
2380 * its data checksum because that stage precedes gang assembly.
2381 * (Presently, nothing actually uses interior data checksums;
2382 * this is just good hygiene.)
34dc7c2f 2383 */
9babb374 2384 if (gn != pio->io_gang_leader->io_gang_tree) {
a6255b7f
DQ
2385 abd_t *buf = abd_get_offset(data, offset);
2386
b128c09f 2387 zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
a6255b7f
DQ
2388 buf, BP_GET_PSIZE(bp));
2389
2390 abd_put(buf);
b128c09f 2391 }
428870ff
BB
2392 /*
2393 * If we are here to damage data for testing purposes,
2394 * leave the GBH alone so that we can detect the damage.
2395 */
2396 if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
2397 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
34dc7c2f 2398 } else {
b128c09f 2399 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
a6255b7f
DQ
2400 abd_get_offset(data, offset), BP_GET_PSIZE(bp),
2401 zio_gang_issue_func_done, NULL, pio->io_priority,
b128c09f 2402 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
34dc7c2f
BB
2403 }
2404
b128c09f
BB
2405 return (zio);
2406}
34dc7c2f 2407
b128c09f 2408/* ARGSUSED */
a6255b7f
DQ
2409static zio_t *
2410zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
2411 uint64_t offset)
b128c09f 2412{
428870ff
BB
2413 return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
2414 ZIO_GANG_CHILD_FLAGS(pio)));
34dc7c2f
BB
2415}
2416
b128c09f 2417/* ARGSUSED */
a6255b7f
DQ
2418static zio_t *
2419zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
2420 uint64_t offset)
34dc7c2f 2421{
b128c09f
BB
2422 return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
2423 NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
2424}
2425
2426static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
2427 NULL,
2428 zio_read_gang,
2429 zio_rewrite_gang,
2430 zio_free_gang,
2431 zio_claim_gang,
2432 NULL
2433};
34dc7c2f 2434
b128c09f 2435static void zio_gang_tree_assemble_done(zio_t *zio);
34dc7c2f 2436
b128c09f
BB
2437static zio_gang_node_t *
2438zio_gang_node_alloc(zio_gang_node_t **gnpp)
2439{
2440 zio_gang_node_t *gn;
34dc7c2f 2441
b128c09f 2442 ASSERT(*gnpp == NULL);
34dc7c2f 2443
79c76d5b 2444 gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
b128c09f
BB
2445 gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
2446 *gnpp = gn;
34dc7c2f 2447
b128c09f 2448 return (gn);
34dc7c2f
BB
2449}
2450
34dc7c2f 2451static void
b128c09f 2452zio_gang_node_free(zio_gang_node_t **gnpp)
34dc7c2f 2453{
b128c09f 2454 zio_gang_node_t *gn = *gnpp;
34dc7c2f 2455
1c27024e 2456 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
b128c09f
BB
2457 ASSERT(gn->gn_child[g] == NULL);
2458
2459 zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
2460 kmem_free(gn, sizeof (*gn));
2461 *gnpp = NULL;
34dc7c2f
BB
2462}
2463
b128c09f
BB
2464static void
2465zio_gang_tree_free(zio_gang_node_t **gnpp)
34dc7c2f 2466{
b128c09f 2467 zio_gang_node_t *gn = *gnpp;
34dc7c2f 2468
b128c09f
BB
2469 if (gn == NULL)
2470 return;
34dc7c2f 2471
1c27024e 2472 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
b128c09f 2473 zio_gang_tree_free(&gn->gn_child[g]);
34dc7c2f 2474
b128c09f 2475 zio_gang_node_free(gnpp);
34dc7c2f
BB
2476}
2477
b128c09f 2478static void
9babb374 2479zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
34dc7c2f 2480{
b128c09f 2481 zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
a6255b7f 2482 abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
b128c09f 2483
9babb374 2484 ASSERT(gio->io_gang_leader == gio);
b128c09f 2485 ASSERT(BP_IS_GANG(bp));
34dc7c2f 2486
a6255b7f
DQ
2487 zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE,
2488 zio_gang_tree_assemble_done, gn, gio->io_priority,
2489 ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
b128c09f 2490}
34dc7c2f 2491
b128c09f
BB
2492static void
2493zio_gang_tree_assemble_done(zio_t *zio)
2494{
9babb374 2495 zio_t *gio = zio->io_gang_leader;
b128c09f
BB
2496 zio_gang_node_t *gn = zio->io_private;
2497 blkptr_t *bp = zio->io_bp;
34dc7c2f 2498
9babb374 2499 ASSERT(gio == zio_unique_parent(zio));
428870ff 2500 ASSERT(zio->io_child_count == 0);
34dc7c2f 2501
b128c09f
BB
2502 if (zio->io_error)
2503 return;
34dc7c2f 2504
a6255b7f 2505 /* this ABD was created from a linear buf in zio_gang_tree_assemble */
b128c09f 2506 if (BP_SHOULD_BYTESWAP(bp))
a6255b7f 2507 byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size);
34dc7c2f 2508
a6255b7f 2509 ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh);
b128c09f 2510 ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
428870ff 2511 ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
34dc7c2f 2512
a6255b7f
DQ
2513 abd_put(zio->io_abd);
2514
1c27024e 2515 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
b128c09f
BB
2516 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
2517 if (!BP_IS_GANG(gbp))
2518 continue;
9babb374 2519 zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
b128c09f 2520 }
34dc7c2f
BB
2521}
2522
b128c09f 2523static void
a6255b7f
DQ
2524zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data,
2525 uint64_t offset)
34dc7c2f 2526{
9babb374 2527 zio_t *gio = pio->io_gang_leader;
b128c09f 2528 zio_t *zio;
34dc7c2f 2529
b128c09f 2530 ASSERT(BP_IS_GANG(bp) == !!gn);
9babb374
BB
2531 ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
2532 ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);
34dc7c2f 2533
b128c09f
BB
2534 /*
2535 * If you're a gang header, your data is in gn->gn_gbh.
2536 * If you're a gang member, your data is in 'data' and gn == NULL.
2537 */
a6255b7f 2538 zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset);
34dc7c2f 2539
b128c09f 2540 if (gn != NULL) {
428870ff 2541 ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
34dc7c2f 2542
1c27024e 2543 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
b128c09f
BB
2544 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
2545 if (BP_IS_HOLE(gbp))
2546 continue;
a6255b7f
DQ
2547 zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data,
2548 offset);
2549 offset += BP_GET_PSIZE(gbp);
b128c09f 2550 }
34dc7c2f
BB
2551 }
2552
9babb374 2553 if (gn == gio->io_gang_tree)
a6255b7f 2554 ASSERT3U(gio->io_size, ==, offset);
34dc7c2f 2555
b128c09f
BB
2556 if (zio != pio)
2557 zio_nowait(zio);
34dc7c2f
BB
2558}
2559
62840030 2560static zio_t *
b128c09f 2561zio_gang_assemble(zio_t *zio)
34dc7c2f 2562{
b128c09f 2563 blkptr_t *bp = zio->io_bp;
34dc7c2f 2564
9babb374
BB
2565 ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
2566 ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
2567
2568 zio->io_gang_leader = zio;
34dc7c2f 2569
b128c09f 2570 zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);
34dc7c2f 2571
62840030 2572 return (zio);
34dc7c2f
BB
2573}
2574
62840030 2575static zio_t *
b128c09f 2576zio_gang_issue(zio_t *zio)
34dc7c2f 2577{
b128c09f 2578 blkptr_t *bp = zio->io_bp;
34dc7c2f 2579
ddc751d5 2580 if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT, ZIO_WAIT_DONE)) {
62840030 2581 return (NULL);
ddc751d5 2582 }
34dc7c2f 2583
9babb374
BB
2584 ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
2585 ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
34dc7c2f 2586
b128c09f 2587 if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
a6255b7f
DQ
2588 zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_abd,
2589 0);
b128c09f 2590 else
9babb374 2591 zio_gang_tree_free(&zio->io_gang_tree);
34dc7c2f 2592
b128c09f 2593 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
34dc7c2f 2594
62840030 2595 return (zio);
34dc7c2f
BB
2596}
2597
2598static void
b128c09f 2599zio_write_gang_member_ready(zio_t *zio)
34dc7c2f 2600{
d164b209 2601 zio_t *pio = zio_unique_parent(zio);
34dc7c2f
BB
2602 dva_t *cdva = zio->io_bp->blk_dva;
2603 dva_t *pdva = pio->io_bp->blk_dva;
2604 uint64_t asize;
d1d7e268 2605 ASSERTV(zio_t *gio = zio->io_gang_leader);
34dc7c2f 2606
b128c09f
BB
2607 if (BP_IS_HOLE(zio->io_bp))
2608 return;
2609
2610 ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
2611
2612 ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
428870ff
BB
2613 ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
2614 ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
2615 ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
34dc7c2f 2616 ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
34dc7c2f
BB
2617
2618 mutex_enter(&pio->io_lock);
1c27024e 2619 for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
34dc7c2f
BB
2620 ASSERT(DVA_GET_GANG(&pdva[d]));
2621 asize = DVA_GET_ASIZE(&pdva[d]);
2622 asize += DVA_GET_ASIZE(&cdva[d]);
2623 DVA_SET_ASIZE(&pdva[d], asize);
2624 }
2625 mutex_exit(&pio->io_lock);
2626}
2627
a6255b7f
DQ
2628static void
2629zio_write_gang_done(zio_t *zio)
2630{
c955398b
BL
2631 /*
2632 * The io_abd field will be NULL for a zio with no data. The io_flags
2633 * will initially have the ZIO_FLAG_NODATA bit flag set, but we can't
2634 * check for it here as it is cleared in zio_ready.
2635 */
2636 if (zio->io_abd != NULL)
2637 abd_put(zio->io_abd);
a6255b7f
DQ
2638}
2639
62840030 2640static zio_t *
b128c09f 2641zio_write_gang_block(zio_t *pio)
34dc7c2f 2642{
b128c09f 2643 spa_t *spa = pio->io_spa;
3dfb57a3 2644 metaslab_class_t *mc = spa_normal_class(spa);
b128c09f 2645 blkptr_t *bp = pio->io_bp;
9babb374 2646 zio_t *gio = pio->io_gang_leader;
b128c09f
BB
2647 zio_t *zio;
2648 zio_gang_node_t *gn, **gnpp;
34dc7c2f 2649 zio_gbh_phys_t *gbh;
a6255b7f 2650 abd_t *gbh_abd;
b128c09f
BB
2651 uint64_t txg = pio->io_txg;
2652 uint64_t resid = pio->io_size;
2653 uint64_t lsize;
428870ff 2654 int copies = gio->io_prop.zp_copies;
b5256303 2655 int gbh_copies;
b128c09f 2656 zio_prop_t zp;
1c27024e 2657 int error;
c955398b 2658 boolean_t has_data = !(pio->io_flags & ZIO_FLAG_NODATA);
b5256303
TC
2659
2660 /*
2661 * encrypted blocks need DVA[2] free so encrypted gang headers can't
2662 * have a third copy.
2663 */
2664 gbh_copies = MIN(copies + 1, spa_max_replication(spa));
2665 if (gio->io_prop.zp_encrypt && gbh_copies >= SPA_DVAS_PER_BP)
2666 gbh_copies = SPA_DVAS_PER_BP - 1;
2667
1c27024e 2668 int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER;
3dfb57a3
DB
2669 if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
2670 ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
c955398b 2671 ASSERT(has_data);
3dfb57a3
DB
2672
2673 flags |= METASLAB_ASYNC_ALLOC;
424fd7c3 2674 VERIFY(zfs_refcount_held(&mc->mc_alloc_slots[pio->io_allocator],
492f64e9 2675 pio));
3dfb57a3
DB
2676
2677 /*
2678 * The logical zio has already placed a reservation for
2679 * 'copies' allocation slots but gang blocks may require
2680 * additional copies. These additional copies
2681 * (i.e. gbh_copies - copies) are guaranteed to succeed
2682 * since metaslab_class_throttle_reserve() always allows
2683 * additional reservations for gang blocks.
2684 */
2685 VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies,
492f64e9 2686 pio->io_allocator, pio, flags));
3dfb57a3
DB
2687 }
2688
2689 error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE,
4e21fd06 2690 bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags,
492f64e9 2691 &pio->io_alloc_list, pio, pio->io_allocator);
34dc7c2f 2692 if (error) {
3dfb57a3
DB
2693 if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
2694 ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
c955398b 2695 ASSERT(has_data);
3dfb57a3
DB
2696
2697 /*
2698 * If we failed to allocate the gang block header then
2699 * we remove any additional allocation reservations that
2700 * we placed here. The original reservation will
2701 * be removed when the logical I/O goes to the ready
2702 * stage.
2703 */
2704 metaslab_class_throttle_unreserve(mc,
492f64e9 2705 gbh_copies - copies, pio->io_allocator, pio);
3dfb57a3
DB
2706 }
2707
b128c09f 2708 pio->io_error = error;
62840030 2709 return (pio);
34dc7c2f
BB
2710 }
2711
9babb374
BB
2712 if (pio == gio) {
2713 gnpp = &gio->io_gang_tree;
b128c09f
BB
2714 } else {
2715 gnpp = pio->io_private;
2716 ASSERT(pio->io_ready == zio_write_gang_member_ready);
34dc7c2f
BB
2717 }
2718
b128c09f
BB
2719 gn = zio_gang_node_alloc(gnpp);
2720 gbh = gn->gn_gbh;
2721 bzero(gbh, SPA_GANGBLOCKSIZE);
a6255b7f 2722 gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE);
34dc7c2f 2723
b128c09f
BB
2724 /*
2725 * Create the gang header.
2726 */
a6255b7f
DQ
2727 zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE,
2728 zio_write_gang_done, NULL, pio->io_priority,
2729 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
34dc7c2f 2730
b128c09f
BB
2731 /*
2732 * Create and nowait the gang children.
2733 */
1c27024e 2734 for (int g = 0; resid != 0; resid -= lsize, g++) {
b128c09f
BB
2735 lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
2736 SPA_MINBLOCKSIZE);
2737 ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);
2738
9babb374 2739 zp.zp_checksum = gio->io_prop.zp_checksum;
b128c09f
BB
2740 zp.zp_compress = ZIO_COMPRESS_OFF;
2741 zp.zp_type = DMU_OT_NONE;
2742 zp.zp_level = 0;
428870ff 2743 zp.zp_copies = gio->io_prop.zp_copies;
03c6040b
GW
2744 zp.zp_dedup = B_FALSE;
2745 zp.zp_dedup_verify = B_FALSE;
2746 zp.zp_nopwrite = B_FALSE;
4807c0ba
TC
2747 zp.zp_encrypt = gio->io_prop.zp_encrypt;
2748 zp.zp_byteorder = gio->io_prop.zp_byteorder;
b5256303
TC
2749 bzero(zp.zp_salt, ZIO_DATA_SALT_LEN);
2750 bzero(zp.zp_iv, ZIO_DATA_IV_LEN);
2751 bzero(zp.zp_mac, ZIO_DATA_MAC_LEN);
b128c09f 2752
1c27024e 2753 zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
c955398b
BL
2754 has_data ? abd_get_offset(pio->io_abd, pio->io_size -
2755 resid) : NULL, lsize, lsize, &zp,
2756 zio_write_gang_member_ready, NULL, NULL,
a6255b7f 2757 zio_write_gang_done, &gn->gn_child[g], pio->io_priority,
3dfb57a3
DB
2758 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
2759
2760 if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
2761 ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
c955398b 2762 ASSERT(has_data);
3dfb57a3
DB
2763
2764 /*
2765 * Gang children won't throttle but we should
2766 * account for their work, so reserve an allocation
2767 * slot for them here.
2768 */
2769 VERIFY(metaslab_class_throttle_reserve(mc,
492f64e9 2770 zp.zp_copies, cio->io_allocator, cio, flags));
3dfb57a3
DB
2771 }
2772 zio_nowait(cio);
b128c09f 2773 }
34dc7c2f
BB
2774
2775 /*
b128c09f 2776 * Set pio's pipeline to just wait for zio to finish.
34dc7c2f 2777 */
b128c09f
BB
2778 pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2779
920dd524
ED
2780 /*
2781 * We didn't allocate this bp, so make sure it doesn't get unmarked.
2782 */
2783 pio->io_flags &= ~ZIO_FLAG_FASTWRITE;
2784
b128c09f
BB
2785 zio_nowait(zio);
2786
62840030 2787 return (pio);
34dc7c2f
BB
2788}
2789
03c6040b 2790/*
3c67d83a
TH
2791 * The zio_nop_write stage in the pipeline determines if allocating a
2792 * new bp is necessary. The nopwrite feature can handle writes in
2793 * either syncing or open context (i.e. zil writes) and as a result is
2794 * mutually exclusive with dedup.
2795 *
2796 * By leveraging a cryptographically secure checksum, such as SHA256, we
2797 * can compare the checksums of the new data and the old to determine if
2798 * allocating a new block is required. Note that our requirements for
2799 * cryptographic strength are fairly weak: there can't be any accidental
2800 * hash collisions, but we don't need to be secure against intentional
2801 * (malicious) collisions. To trigger a nopwrite, you have to be able
2802 * to write the file to begin with, and triggering an incorrect (hash
2803 * collision) nopwrite is no worse than simply writing to the file.
2804 * That said, there are no known attacks against the checksum algorithms
2805 * used for nopwrite, assuming that the salt and the checksums
2806 * themselves remain secret.
03c6040b 2807 */
62840030 2808static zio_t *
03c6040b
GW
2809zio_nop_write(zio_t *zio)
2810{
2811 blkptr_t *bp = zio->io_bp;
2812 blkptr_t *bp_orig = &zio->io_bp_orig;
2813 zio_prop_t *zp = &zio->io_prop;
2814
2815 ASSERT(BP_GET_LEVEL(bp) == 0);
2816 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
2817 ASSERT(zp->zp_nopwrite);
2818 ASSERT(!zp->zp_dedup);
2819 ASSERT(zio->io_bp_override == NULL);
2820 ASSERT(IO_IS_ALLOCATING(zio));
2821
2822 /*
2823 * Check to see if the original bp and the new bp have matching
2824 * characteristics (i.e. same checksum, compression algorithms, etc).
2825 * If they don't then just continue with the pipeline which will
2826 * allocate a new bp.
2827 */
2828 if (BP_IS_HOLE(bp_orig) ||
3c67d83a
TH
2829 !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags &
2830 ZCHECKSUM_FLAG_NOPWRITE) ||
b5256303 2831 BP_IS_ENCRYPTED(bp) || BP_IS_ENCRYPTED(bp_orig) ||
03c6040b
GW
2832 BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
2833 BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
2834 BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
2835 zp->zp_copies != BP_GET_NDVAS(bp_orig))
62840030 2836 return (zio);
03c6040b
GW
2837
2838 /*
2839 * If the checksums match then reset the pipeline so that we
2840 * avoid allocating a new bp and issuing any I/O.
2841 */
2842 if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
3c67d83a
TH
2843 ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags &
2844 ZCHECKSUM_FLAG_NOPWRITE);
03c6040b
GW
2845 ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
2846 ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
2847 ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
2848 ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop,
2849 sizeof (uint64_t)) == 0);
2850
2851 *bp = *bp_orig;
2852 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2853 zio->io_flags |= ZIO_FLAG_NOPWRITE;
2854 }
2855
62840030 2856 return (zio);
03c6040b
GW
2857}
2858
34dc7c2f
BB
2859/*
2860 * ==========================================================================
428870ff 2861 * Dedup
34dc7c2f
BB
2862 * ==========================================================================
2863 */
428870ff
BB
2864static void
2865zio_ddt_child_read_done(zio_t *zio)
2866{
2867 blkptr_t *bp = zio->io_bp;
2868 ddt_entry_t *dde = zio->io_private;
2869 ddt_phys_t *ddp;
2870 zio_t *pio = zio_unique_parent(zio);
2871
2872 mutex_enter(&pio->io_lock);
2873 ddp = ddt_phys_select(dde, bp);
2874 if (zio->io_error == 0)
2875 ddt_phys_clear(ddp); /* this ddp doesn't need repair */
a6255b7f
DQ
2876
2877 if (zio->io_error == 0 && dde->dde_repair_abd == NULL)
2878 dde->dde_repair_abd = zio->io_abd;
428870ff 2879 else
a6255b7f 2880 abd_free(zio->io_abd);
428870ff
BB
2881 mutex_exit(&pio->io_lock);
2882}
2883
62840030 2884static zio_t *
428870ff
BB
2885zio_ddt_read_start(zio_t *zio)
2886{
2887 blkptr_t *bp = zio->io_bp;
2888
2889 ASSERT(BP_GET_DEDUP(bp));
2890 ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
2891 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2892
2893 if (zio->io_child_error[ZIO_CHILD_DDT]) {
2894 ddt_t *ddt = ddt_select(zio->io_spa, bp);
2895 ddt_entry_t *dde = ddt_repair_start(ddt, bp);
2896 ddt_phys_t *ddp = dde->dde_phys;
2897 ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
2898 blkptr_t blk;
2899
2900 ASSERT(zio->io_vsd == NULL);
2901 zio->io_vsd = dde;
2902
2903 if (ddp_self == NULL)
62840030 2904 return (zio);
428870ff 2905
1c27024e 2906 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
428870ff
BB
2907 if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
2908 continue;
2909 ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
2910 &blk);
2911 zio_nowait(zio_read(zio, zio->io_spa, &blk,
a6255b7f
DQ
2912 abd_alloc_for_io(zio->io_size, B_TRUE),
2913 zio->io_size, zio_ddt_child_read_done, dde,
2914 zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) |
2915 ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark));
428870ff 2916 }
62840030 2917 return (zio);
428870ff
BB
2918 }
2919
2920 zio_nowait(zio_read(zio, zio->io_spa, bp,
a6255b7f 2921 zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority,
428870ff
BB
2922 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
2923
62840030 2924 return (zio);
428870ff
BB
2925}
2926
62840030 2927static zio_t *
428870ff
BB
2928zio_ddt_read_done(zio_t *zio)
2929{
2930 blkptr_t *bp = zio->io_bp;
2931
ddc751d5 2932 if (zio_wait_for_children(zio, ZIO_CHILD_DDT_BIT, ZIO_WAIT_DONE)) {
62840030 2933 return (NULL);
ddc751d5 2934 }
428870ff
BB
2935
2936 ASSERT(BP_GET_DEDUP(bp));
2937 ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
2938 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2939
2940 if (zio->io_child_error[ZIO_CHILD_DDT]) {
2941 ddt_t *ddt = ddt_select(zio->io_spa, bp);
2942 ddt_entry_t *dde = zio->io_vsd;
2943 if (ddt == NULL) {
2944 ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
62840030 2945 return (zio);
428870ff
BB
2946 }
2947 if (dde == NULL) {
2948 zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
2949 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
62840030 2950 return (NULL);
428870ff 2951 }
a6255b7f
DQ
2952 if (dde->dde_repair_abd != NULL) {
2953 abd_copy(zio->io_abd, dde->dde_repair_abd,
2954 zio->io_size);
428870ff
BB
2955 zio->io_child_error[ZIO_CHILD_DDT] = 0;
2956 }
2957 ddt_repair_done(ddt, dde);
2958 zio->io_vsd = NULL;
2959 }
2960
2961 ASSERT(zio->io_vsd == NULL);
2962
62840030 2963 return (zio);
428870ff
BB
2964}
2965
2966static boolean_t
2967zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
2968{
2969 spa_t *spa = zio->io_spa;
c17bcf83 2970 boolean_t do_raw = !!(zio->io_flags & ZIO_FLAG_RAW);
428870ff 2971
c17bcf83 2972 ASSERT(!(zio->io_bp_override && do_raw));
2aa34383 2973
428870ff
BB
2974 /*
2975 * Note: we compare the original data, not the transformed data,
2976 * because when zio->io_bp is an override bp, we will not have
2977 * pushed the I/O transforms. That's an important optimization
2978 * because otherwise we'd compress/encrypt all dmu_sync() data twice.
c17bcf83 2979 * However, we should never get a raw, override zio so in these
b5256303 2980 * cases we can compare the io_abd directly. This is useful because
c17bcf83
TC
2981 * it allows us to do dedup verification even if we don't have access
2982 * to the original data (for instance, if the encryption keys aren't
2983 * loaded).
428870ff 2984 */
c17bcf83 2985
1c27024e 2986 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
428870ff
BB
2987 zio_t *lio = dde->dde_lead_zio[p];
2988
c17bcf83
TC
2989 if (lio != NULL && do_raw) {
2990 return (lio->io_size != zio->io_size ||
a6255b7f 2991 abd_cmp(zio->io_abd, lio->io_abd) != 0);
c17bcf83 2992 } else if (lio != NULL) {
428870ff 2993 return (lio->io_orig_size != zio->io_orig_size ||
a6255b7f 2994 abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0);
428870ff
BB
2995 }
2996 }
2997
1c27024e 2998 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
428870ff
BB
2999 ddt_phys_t *ddp = &dde->dde_phys[p];
3000
c17bcf83
TC
3001 if (ddp->ddp_phys_birth != 0 && do_raw) {
3002 blkptr_t blk = *zio->io_bp;
3003 uint64_t psize;
a6255b7f 3004 abd_t *tmpabd;
c17bcf83
TC
3005 int error;
3006
3007 ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
3008 psize = BP_GET_PSIZE(&blk);
3009
3010 if (psize != zio->io_size)
3011 return (B_TRUE);
3012
3013 ddt_exit(ddt);
3014
a6255b7f 3015 tmpabd = abd_alloc_for_io(psize, B_TRUE);
c17bcf83 3016
a6255b7f 3017 error = zio_wait(zio_read(NULL, spa, &blk, tmpabd,
c17bcf83
TC
3018 psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ,
3019 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
3020 ZIO_FLAG_RAW, &zio->io_bookmark));
3021
3022 if (error == 0) {
a6255b7f 3023 if (abd_cmp(tmpabd, zio->io_abd) != 0)
c17bcf83
TC
3024 error = SET_ERROR(ENOENT);
3025 }
3026
a6255b7f 3027 abd_free(tmpabd);
c17bcf83
TC
3028 ddt_enter(ddt);
3029 return (error != 0);
3030 } else if (ddp->ddp_phys_birth != 0) {
428870ff 3031 arc_buf_t *abuf = NULL;
2a432414 3032 arc_flags_t aflags = ARC_FLAG_WAIT;
428870ff
BB
3033 blkptr_t blk = *zio->io_bp;
3034 int error;
3035
3036 ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
3037
c17bcf83
TC
3038 if (BP_GET_LSIZE(&blk) != zio->io_orig_size)
3039 return (B_TRUE);
3040
428870ff
BB
3041 ddt_exit(ddt);
3042
294f6806 3043 error = arc_read(NULL, spa, &blk,
428870ff
BB
3044 arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
3045 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
3046 &aflags, &zio->io_bookmark);
3047
3048 if (error == 0) {
a6255b7f 3049 if (abd_cmp_buf(zio->io_orig_abd, abuf->b_data,
428870ff 3050 zio->io_orig_size) != 0)
c17bcf83 3051 error = SET_ERROR(ENOENT);
d3c2ae1c 3052 arc_buf_destroy(abuf, &abuf);
428870ff
BB
3053 }
3054
3055 ddt_enter(ddt);
3056 return (error != 0);
3057 }
3058 }
3059
3060 return (B_FALSE);
3061}
3062
3063static void
3064zio_ddt_child_write_ready(zio_t *zio)
3065{
3066 int p = zio->io_prop.zp_copies;
3067 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
3068 ddt_entry_t *dde = zio->io_private;
3069 ddt_phys_t *ddp = &dde->dde_phys[p];
3070 zio_t *pio;
3071
3072 if (zio->io_error)
3073 return;
3074
3075 ddt_enter(ddt);
3076
3077 ASSERT(dde->dde_lead_zio[p] == zio);
3078
3079 ddt_phys_fill(ddp, zio->io_bp);
3080
1c27024e 3081 zio_link_t *zl = NULL;
3dfb57a3 3082 while ((pio = zio_walk_parents(zio, &zl)) != NULL)
428870ff
BB
3083 ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
3084
3085 ddt_exit(ddt);
3086}
3087
3088static void
3089zio_ddt_child_write_done(zio_t *zio)
3090{
3091 int p = zio->io_prop.zp_copies;
3092 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
3093 ddt_entry_t *dde = zio->io_private;
3094 ddt_phys_t *ddp = &dde->dde_phys[p];
3095
3096 ddt_enter(ddt);
3097
3098 ASSERT(ddp->ddp_refcnt == 0);
3099 ASSERT(dde->dde_lead_zio[p] == zio);
3100 dde->dde_lead_zio[p] = NULL;
3101
3102 if (zio->io_error == 0) {
3dfb57a3
DB
3103 zio_link_t *zl = NULL;
3104 while (zio_walk_parents(zio, &zl) != NULL)
428870ff
BB
3105 ddt_phys_addref(ddp);
3106 } else {
3107 ddt_phys_clear(ddp);
3108 }
3109
3110 ddt_exit(ddt);
3111}
3112
3113static void
3114zio_ddt_ditto_write_done(zio_t *zio)
3115{
3116 int p = DDT_PHYS_DITTO;
1c27024e 3117 ASSERTV(zio_prop_t *zp = &zio->io_prop);
428870ff
BB
3118 blkptr_t *bp = zio->io_bp;
3119 ddt_t *ddt = ddt_select(zio->io_spa, bp);
3120 ddt_entry_t *dde = zio->io_private;
3121 ddt_phys_t *ddp = &dde->dde_phys[p];
3122 ddt_key_t *ddk = &dde->dde_key;
3123
3124 ddt_enter(ddt);
3125
3126 ASSERT(ddp->ddp_refcnt == 0);
3127 ASSERT(dde->dde_lead_zio[p] == zio);
3128 dde->dde_lead_zio[p] = NULL;
3129
3130 if (zio->io_error == 0) {
3131 ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum));
3132 ASSERT(zp->zp_copies < SPA_DVAS_PER_BP);
3133 ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp));
3134 if (ddp->ddp_phys_birth != 0)
3135 ddt_phys_free(ddt, ddk, ddp, zio->io_txg);
3136 ddt_phys_fill(ddp, bp);
3137 }
3138
3139 ddt_exit(ddt);
3140}
3141
62840030 3142static zio_t *
428870ff
BB
3143zio_ddt_write(zio_t *zio)
3144{
3145 spa_t *spa = zio->io_spa;
3146 blkptr_t *bp = zio->io_bp;
3147 uint64_t txg = zio->io_txg;
3148 zio_prop_t *zp = &zio->io_prop;
3149 int p = zp->zp_copies;
3150 int ditto_copies;
3151 zio_t *cio = NULL;
3152 zio_t *dio = NULL;
3153 ddt_t *ddt = ddt_select(spa, bp);
3154 ddt_entry_t *dde;
3155 ddt_phys_t *ddp;
3156
3157 ASSERT(BP_GET_DEDUP(bp));
3158 ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
3159 ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
c17bcf83 3160 ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW)));
428870ff
BB
3161
3162 ddt_enter(ddt);
3163 dde = ddt_lookup(ddt, bp, B_TRUE);
3164 ddp = &dde->dde_phys[p];
3165
3166 if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
3167 /*
3168 * If we're using a weak checksum, upgrade to a strong checksum
3169 * and try again. If we're already using a strong checksum,
3170 * we can't resolve it, so just convert to an ordinary write.
3171 * (And automatically e-mail a paper to Nature?)
3172 */
3c67d83a
TH
3173 if (!(zio_checksum_table[zp->zp_checksum].ci_flags &
3174 ZCHECKSUM_FLAG_DEDUP)) {
428870ff
BB
3175 zp->zp_checksum = spa_dedup_checksum(spa);
3176 zio_pop_transforms(zio);
3177 zio->io_stage = ZIO_STAGE_OPEN;
3178 BP_ZERO(bp);
3179 } else {
03c6040b 3180 zp->zp_dedup = B_FALSE;
428870ff
BB
3181 }
3182 zio->io_pipeline = ZIO_WRITE_PIPELINE;
3183 ddt_exit(ddt);
62840030 3184 return (zio);
428870ff
BB
3185 }
3186
3187 ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp);
3188 ASSERT(ditto_copies < SPA_DVAS_PER_BP);
3189
3190 if (ditto_copies > ddt_ditto_copies_present(dde) &&
3191 dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) {
3192 zio_prop_t czp = *zp;
3193
3194 czp.zp_copies = ditto_copies;
3195
3196 /*
3197 * If we arrived here with an override bp, we won't have run
3198 * the transform stack, so we won't have the data we need to
3199 * generate a child i/o. So, toss the override bp and restart.
3200 * This is safe, because using the override bp is just an
3201 * optimization; and it's rare, so the cost doesn't matter.
3202 */
3203 if (zio->io_bp_override) {
3204 zio_pop_transforms(zio);
3205 zio->io_stage = ZIO_STAGE_OPEN;
3206 zio->io_pipeline = ZIO_WRITE_PIPELINE;
3207 zio->io_bp_override = NULL;
3208 BP_ZERO(bp);
3209 ddt_exit(ddt);
62840030 3210 return (zio);
428870ff
BB
3211 }
3212
a6255b7f 3213 dio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
2aa34383 3214 zio->io_orig_size, zio->io_orig_size, &czp, NULL, NULL,
bc77ba73 3215 NULL, zio_ddt_ditto_write_done, dde, zio->io_priority,
428870ff
BB
3216 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
3217
a6255b7f 3218 zio_push_transform(dio, zio->io_abd, zio->io_size, 0, NULL);
428870ff
BB
3219 dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
3220 }
3221
3222 if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
3223 if (ddp->ddp_phys_birth != 0)
3224 ddt_bp_fill(ddp, bp, txg);
3225 if (dde->dde_lead_zio[p] != NULL)
3226 zio_add_child(zio, dde->dde_lead_zio[p]);
3227 else
3228 ddt_phys_addref(ddp);
3229 } else if (zio->io_bp_override) {
3230 ASSERT(bp->blk_birth == txg);
3231 ASSERT(BP_EQUAL(bp, zio->io_bp_override));
3232 ddt_phys_fill(ddp, bp);
3233 ddt_phys_addref(ddp);
3234 } else {
a6255b7f 3235 cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
2aa34383 3236 zio->io_orig_size, zio->io_orig_size, zp,
bc77ba73 3237 zio_ddt_child_write_ready, NULL, NULL,
428870ff
BB
3238 zio_ddt_child_write_done, dde, zio->io_priority,
3239 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
3240
a6255b7f 3241 zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL);
428870ff
BB
3242 dde->dde_lead_zio[p] = cio;
3243 }
3244
3245 ddt_exit(ddt);
3246
3247 if (cio)
3248 zio_nowait(cio);
3249 if (dio)
3250 zio_nowait(dio);
3251
62840030 3252 return (zio);
428870ff
BB
3253}
3254
3255ddt_entry_t *freedde; /* for debugging */
b128c09f 3256
62840030 3257static zio_t *
428870ff
BB
3258zio_ddt_free(zio_t *zio)
3259{
3260 spa_t *spa = zio->io_spa;
3261 blkptr_t *bp = zio->io_bp;
3262 ddt_t *ddt = ddt_select(spa, bp);
3263 ddt_entry_t *dde;
3264 ddt_phys_t *ddp;
3265
3266 ASSERT(BP_GET_DEDUP(bp));
3267 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
3268
3269 ddt_enter(ddt);
3270 freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
5dc6af0e
BB
3271 if (dde) {
3272 ddp = ddt_phys_select(dde, bp);
3273 if (ddp)
3274 ddt_phys_decref(ddp);
3275 }
428870ff
BB
3276 ddt_exit(ddt);
3277
62840030 3278 return (zio);
428870ff
BB
3279}
3280
3281/*
3282 * ==========================================================================
3283 * Allocate and free blocks
3284 * ==========================================================================
3285 */
3dfb57a3
DB
3286
3287static zio_t *
492f64e9 3288zio_io_to_allocate(spa_t *spa, int allocator)
3dfb57a3
DB
3289{
3290 zio_t *zio;
3291
492f64e9 3292 ASSERT(MUTEX_HELD(&spa->spa_alloc_locks[allocator]));
3dfb57a3 3293
492f64e9 3294 zio = avl_first(&spa->spa_alloc_trees[allocator]);
3dfb57a3
DB
3295 if (zio == NULL)
3296 return (NULL);
3297
3298 ASSERT(IO_IS_ALLOCATING(zio));
3299
3300 /*
3301 * Try to place a reservation for this zio. If we're unable to
3302 * reserve then we throttle.
3303 */
492f64e9 3304 ASSERT3U(zio->io_allocator, ==, allocator);
cc99f275 3305 if (!metaslab_class_throttle_reserve(zio->io_metaslab_class,
492f64e9 3306 zio->io_prop.zp_copies, zio->io_allocator, zio, 0)) {
3dfb57a3
DB
3307 return (NULL);
3308 }
3309
492f64e9 3310 avl_remove(&spa->spa_alloc_trees[allocator], zio);
3dfb57a3
DB
3311 ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE);
3312
3313 return (zio);
3314}
3315
62840030 3316static zio_t *
3dfb57a3
DB
3317zio_dva_throttle(zio_t *zio)
3318{
3319 spa_t *spa = zio->io_spa;
3320 zio_t *nio;
cc99f275
DB
3321 metaslab_class_t *mc;
3322
3323 /* locate an appropriate allocation class */
3324 mc = spa_preferred_class(spa, zio->io_size, zio->io_prop.zp_type,
3325 zio->io_prop.zp_level, zio->io_prop.zp_zpl_smallblk);
3dfb57a3
DB
3326
3327 if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE ||
cc99f275 3328 !mc->mc_alloc_throttle_enabled ||
3dfb57a3
DB
3329 zio->io_child_type == ZIO_CHILD_GANG ||
3330 zio->io_flags & ZIO_FLAG_NODATA) {
62840030 3331 return (zio);
3dfb57a3
DB
3332 }
3333
3334 ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
3335
3336 ASSERT3U(zio->io_queued_timestamp, >, 0);
3337 ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE);
3338
492f64e9
PD
3339 zbookmark_phys_t *bm = &zio->io_bookmark;
3340 /*
3341 * We want to try to use as many allocators as possible to help improve
3342 * performance, but we also want logically adjacent IOs to be physically
3343 * adjacent to improve sequential read performance. We chunk each object
3344 * into 2^20 block regions, and then hash based on the objset, object,
3345 * level, and region to accomplish both of these goals.
3346 */
3347 zio->io_allocator = cityhash4(bm->zb_objset, bm->zb_object,
3348 bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count;
3349 mutex_enter(&spa->spa_alloc_locks[zio->io_allocator]);
3dfb57a3 3350 ASSERT(zio->io_type == ZIO_TYPE_WRITE);
cc99f275 3351 zio->io_metaslab_class = mc;
492f64e9 3352 avl_add(&spa->spa_alloc_trees[zio->io_allocator], zio);
cc99f275 3353 nio = zio_io_to_allocate(spa, zio->io_allocator);
492f64e9 3354 mutex_exit(&spa->spa_alloc_locks[zio->io_allocator]);
62840030 3355 return (nio);
3dfb57a3
DB
3356}
3357
cc99f275 3358static void
492f64e9 3359zio_allocate_dispatch(spa_t *spa, int allocator)
3dfb57a3
DB
3360{
3361 zio_t *zio;
3362
492f64e9
PD
3363 mutex_enter(&spa->spa_alloc_locks[allocator]);
3364 zio = zio_io_to_allocate(spa, allocator);
3365 mutex_exit(&spa->spa_alloc_locks[allocator]);
3dfb57a3
DB
3366 if (zio == NULL)
3367 return;
3368
3369 ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE);
3370 ASSERT0(zio->io_error);
3371 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE);
3372}
3373
62840030 3374static zio_t *
34dc7c2f
BB
3375zio_dva_allocate(zio_t *zio)
3376{
3377 spa_t *spa = zio->io_spa;
cc99f275 3378 metaslab_class_t *mc;
34dc7c2f
BB
3379 blkptr_t *bp = zio->io_bp;
3380 int error;
6d974228 3381 int flags = 0;
34dc7c2f 3382
9babb374
BB
3383 if (zio->io_gang_leader == NULL) {
3384 ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
3385 zio->io_gang_leader = zio;
3386 }
3387
34dc7c2f 3388 ASSERT(BP_IS_HOLE(bp));
c99c9001 3389 ASSERT0(BP_GET_NDVAS(bp));
428870ff
BB
3390 ASSERT3U(zio->io_prop.zp_copies, >, 0);
3391 ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
34dc7c2f
BB
3392 ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
3393
920dd524 3394 flags |= (zio->io_flags & ZIO_FLAG_FASTWRITE) ? METASLAB_FASTWRITE : 0;
3dfb57a3
DB
3395 if (zio->io_flags & ZIO_FLAG_NODATA)
3396 flags |= METASLAB_DONT_THROTTLE;
3397 if (zio->io_flags & ZIO_FLAG_GANG_CHILD)
3398 flags |= METASLAB_GANG_CHILD;
3399 if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE)
3400 flags |= METASLAB_ASYNC_ALLOC;
3401
cc99f275
DB
3402 /*
3403 * if not already chosen, locate an appropriate allocation class
3404 */
3405 mc = zio->io_metaslab_class;
3406 if (mc == NULL) {
3407 mc = spa_preferred_class(spa, zio->io_size,
3408 zio->io_prop.zp_type, zio->io_prop.zp_level,
3409 zio->io_prop.zp_zpl_smallblk);
3410 zio->io_metaslab_class = mc;
3411 }
3412
b128c09f 3413 error = metaslab_alloc(spa, mc, zio->io_size, bp,
4e21fd06 3414 zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
492f64e9 3415 &zio->io_alloc_list, zio, zio->io_allocator);
34dc7c2f 3416
cc99f275
DB
3417 /*
3418 * Fallback to normal class when an alloc class is full
3419 */
3420 if (error == ENOSPC && mc != spa_normal_class(spa)) {
3421 /*
3422 * If throttling, transfer reservation over to normal class.
3423 * The io_allocator slot can remain the same even though we
3424 * are switching classes.
3425 */
3426 if (mc->mc_alloc_throttle_enabled &&
3427 (zio->io_flags & ZIO_FLAG_IO_ALLOCATING)) {
3428 metaslab_class_throttle_unreserve(mc,
3429 zio->io_prop.zp_copies, zio->io_allocator, zio);
3430 zio->io_flags &= ~ZIO_FLAG_IO_ALLOCATING;
3431
3432 mc = spa_normal_class(spa);
3433 VERIFY(metaslab_class_throttle_reserve(mc,
3434 zio->io_prop.zp_copies, zio->io_allocator, zio,
3435 flags | METASLAB_MUST_RESERVE));
3436 } else {
3437 mc = spa_normal_class(spa);
3438 }
3439 zio->io_metaslab_class = mc;
3440
3441 error = metaslab_alloc(spa, mc, zio->io_size, bp,
3442 zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
3443 &zio->io_alloc_list, zio, zio->io_allocator);
3444 }
3445
3dfb57a3 3446 if (error != 0) {
a887d653 3447 zfs_dbgmsg("%s: metaslab allocation failure: zio %px, "
6d974228
GW
3448 "size %llu, error %d", spa_name(spa), zio, zio->io_size,
3449 error);
b128c09f
BB
3450 if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
3451 return (zio_write_gang_block(zio));
34dc7c2f
BB
3452 zio->io_error = error;
3453 }
3454
62840030 3455 return (zio);
34dc7c2f
BB
3456}
3457
62840030 3458static zio_t *
34dc7c2f
BB
3459zio_dva_free(zio_t *zio)
3460{
b128c09f 3461 metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);
34dc7c2f 3462
62840030 3463 return (zio);
34dc7c2f
BB
3464}
3465
62840030 3466static zio_t *
34dc7c2f
BB
3467zio_dva_claim(zio_t *zio)
3468{
b128c09f
BB
3469 int error;
3470
3471 error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
3472 if (error)
3473 zio->io_error = error;
34dc7c2f 3474
62840030 3475 return (zio);
34dc7c2f
BB
3476}
3477
b128c09f
BB
3478/*
3479 * Undo an allocation. This is used by zio_done() when an I/O fails
3480 * and we want to give back the block we just allocated.
3481 * This handles both normal blocks and gang blocks.
3482 */
3483static void
3484zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
3485{
b128c09f 3486 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
428870ff 3487 ASSERT(zio->io_bp_override == NULL);
b128c09f
BB
3488
3489 if (!BP_IS_HOLE(bp))
428870ff 3490 metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);
b128c09f
BB
3491
3492 if (gn != NULL) {
1c27024e 3493 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
b128c09f
BB
3494 zio_dva_unallocate(zio, gn->gn_child[g],
3495 &gn->gn_gbh->zg_blkptr[g]);
3496 }
3497 }
3498}
3499
3500/*
3501 * Try to allocate an intent log block. Return 0 on success, errno on failure.
3502 */
3503int
b5256303
TC
3504zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
3505 uint64_t size, boolean_t *slog)
b128c09f 3506{
428870ff 3507 int error = 1;
4e21fd06 3508 zio_alloc_list_t io_alloc_list;
b128c09f 3509
428870ff
BB
3510 ASSERT(txg > spa_syncing_txg(spa));
3511
4e21fd06 3512 metaslab_trace_init(&io_alloc_list);
cc99f275
DB
3513
3514 /*
3515 * Block pointer fields are useful to metaslabs for stats and debugging.
3516 * Fill in the obvious ones before calling into metaslab_alloc().
3517 */
3518 BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
3519 BP_SET_PSIZE(new_bp, size);
3520 BP_SET_LEVEL(new_bp, 0);
3521
492f64e9
PD
3522 /*
3523 * When allocating a zil block, we don't have information about
3524 * the final destination of the block except the objset it's part
3525 * of, so we just hash the objset ID to pick the allocator to get
3526 * some parallelism.
3527 */
1b7c1e5c 3528 error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
492f64e9
PD
3529 txg, NULL, METASLAB_FASTWRITE, &io_alloc_list, NULL,
3530 cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) %
3531 spa->spa_alloc_count);
1b7c1e5c
GDN
3532 if (error == 0) {
3533 *slog = TRUE;
3534 } else {
428870ff 3535 error = metaslab_alloc(spa, spa_normal_class(spa), size,
4e21fd06 3536 new_bp, 1, txg, NULL, METASLAB_FASTWRITE,
492f64e9
PD
3537 &io_alloc_list, NULL, cityhash4(0, 0, 0,
3538 os->os_dsl_dataset->ds_object) % spa->spa_alloc_count);
1b7c1e5c
GDN
3539 if (error == 0)
3540 *slog = FALSE;
ebf8e3a2 3541 }
4e21fd06 3542 metaslab_trace_fini(&io_alloc_list);
b128c09f
BB
3543
3544 if (error == 0) {
3545 BP_SET_LSIZE(new_bp, size);
3546 BP_SET_PSIZE(new_bp, size);
3547 BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
428870ff
BB
3548 BP_SET_CHECKSUM(new_bp,
3549 spa_version(spa) >= SPA_VERSION_SLIM_ZIL
3550 ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
b128c09f
BB
3551 BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
3552 BP_SET_LEVEL(new_bp, 0);
428870ff 3553 BP_SET_DEDUP(new_bp, 0);
b128c09f 3554 BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
b5256303
TC
3555
3556 /*
3557 * encrypted blocks will require an IV and salt. We generate
3558 * these now since we will not be rewriting the bp at
3559 * rewrite time.
3560 */
3561 if (os->os_encrypted) {
3562 uint8_t iv[ZIO_DATA_IV_LEN];
3563 uint8_t salt[ZIO_DATA_SALT_LEN];
3564
3565 BP_SET_CRYPT(new_bp, B_TRUE);
3566 VERIFY0(spa_crypt_get_salt(spa,
3567 dmu_objset_id(os), salt));
3568 VERIFY0(zio_crypt_generate_iv(iv));
3569
3570 zio_crypt_encode_params_bp(new_bp, salt, iv);
3571 }
1ce23dca
PS
3572 } else {
3573 zfs_dbgmsg("%s: zil block allocation failure: "
3574 "size %llu, error %d", spa_name(spa), size, error);
b128c09f
BB
3575 }
3576
3577 return (error);
3578}
3579
34dc7c2f
BB
3580/*
3581 * ==========================================================================
3582 * Read and write to physical devices
3583 * ==========================================================================
3584 */
98b25418 3585
98b25418
GW
3586/*
3587 * Issue an I/O to the underlying vdev. Typically the issue pipeline
3588 * stops after this stage and will resume upon I/O completion.
3589 * However, there are instances where the vdev layer may need to
3590 * continue the pipeline when an I/O was not issued. Since the I/O
3591 * that was sent to the vdev layer might be different than the one
3592 * currently active in the pipeline (see vdev_queue_io()), we explicitly
3593 * force the underlying vdev layers to call either zio_execute() or
3594 * zio_interrupt() to ensure that the pipeline continues with the correct I/O.
3595 */
62840030 3596static zio_t *
34dc7c2f
BB
3597zio_vdev_io_start(zio_t *zio)
3598{
3599 vdev_t *vd = zio->io_vd;
34dc7c2f
BB
3600 uint64_t align;
3601 spa_t *spa = zio->io_spa;
3602
193a37cb
TH
3603 zio->io_delay = 0;
3604
b128c09f
BB
3605 ASSERT(zio->io_error == 0);
3606 ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
34dc7c2f 3607
b128c09f
BB
3608 if (vd == NULL) {
3609 if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
3610 spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
34dc7c2f 3611
b128c09f
BB
3612 /*
3613 * The mirror_ops handle multiple DVAs in a single BP.
3614 */
98b25418 3615 vdev_mirror_ops.vdev_op_io_start(zio);
62840030 3616 return (NULL);
34dc7c2f
BB
3617 }
3618
3dfb57a3 3619 ASSERT3P(zio->io_logical, !=, zio);
6cb8e530
PZ
3620 if (zio->io_type == ZIO_TYPE_WRITE) {
3621 ASSERT(spa->spa_trust_config);
3622
a1d477c2
MA
3623 /*
3624 * Note: the code can handle other kinds of writes,
3625 * but we don't expect them.
3626 */
6cb8e530
PZ
3627 if (zio->io_vd->vdev_removing) {
3628 ASSERT(zio->io_flags &
3629 (ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL |
3630 ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE));
3631 }
a1d477c2 3632 }
3dfb57a3 3633
b128c09f
BB
3634 align = 1ULL << vd->vdev_top->vdev_ashift;
3635
b02fe35d
AR
3636 if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
3637 P2PHASE(zio->io_size, align) != 0) {
3638 /* Transform logical writes to be a full physical block size. */
34dc7c2f 3639 uint64_t asize = P2ROUNDUP(zio->io_size, align);
a6255b7f 3640 abd_t *abuf = abd_alloc_sametype(zio->io_abd, asize);
178e73b3 3641 ASSERT(vd == vd->vdev_top);
34dc7c2f 3642 if (zio->io_type == ZIO_TYPE_WRITE) {
a6255b7f
DQ
3643 abd_copy(abuf, zio->io_abd, zio->io_size);
3644 abd_zero_off(abuf, zio->io_size, asize - zio->io_size);
34dc7c2f 3645 }
b128c09f 3646 zio_push_transform(zio, abuf, asize, asize, zio_subblock);
34dc7c2f
BB
3647 }
3648
b02fe35d
AR
3649 /*
3650 * If this is not a physical io, make sure that it is properly aligned
3651 * before proceeding.
3652 */
3653 if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) {
3654 ASSERT0(P2PHASE(zio->io_offset, align));
3655 ASSERT0(P2PHASE(zio->io_size, align));
3656 } else {
3657 /*
3658 * For physical writes, we allow 512b aligned writes and assume
3659 * the device will perform a read-modify-write as necessary.
3660 */
3661 ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE));
3662 ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE));
3663 }
3664
572e2857 3665 VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
fb5f0bc8
BB
3666
3667 /*
3668 * If this is a repair I/O, and there's no self-healing involved --
3669 * that is, we're just resilvering what we expect to resilver --
3670 * then don't do the I/O unless zio's txg is actually in vd's DTL.
9e052db4
MA
3671 * This prevents spurious resilvering.
3672 *
3673 * There are a few ways that we can end up creating these spurious
3674 * resilver i/os:
3675 *
3676 * 1. A resilver i/o will be issued if any DVA in the BP has a
3677 * dirty DTL. The mirror code will issue resilver writes to
3678 * each DVA, including the one(s) that are not on vdevs with dirty
3679 * DTLs.
3680 *
3681 * 2. With nested replication, which happens when we have a
3682 * "replacing" or "spare" vdev that's a child of a mirror or raidz.
3683 * For example, given mirror(replacing(A+B), C), it's likely that
3684 * only A is out of date (it's the new device). In this case, we'll
3685 * read from C, then use the data to resilver A+B -- but we don't
3686 * actually want to resilver B, just A. The top-level mirror has no
3687 * way to know this, so instead we just discard unnecessary repairs
3688 * as we work our way down the vdev tree.
3689 *
3690 * 3. ZTEST also creates mirrors of mirrors, mirrors of raidz, etc.
3691 * The same logic applies to any form of nested replication: ditto
3692 * + mirror, RAID-Z + replacing, etc.
3693 *
3694 * However, indirect vdevs point off to other vdevs which may have
3695 * DTL's, so we never bypass them. The child i/os on concrete vdevs
3696 * will be properly bypassed instead.
fb5f0bc8
BB
3697 */
3698 if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
3699 !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
3700 zio->io_txg != 0 && /* not a delegated i/o */
9e052db4 3701 vd->vdev_ops != &vdev_indirect_ops &&
fb5f0bc8
BB
3702 !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
3703 ASSERT(zio->io_type == ZIO_TYPE_WRITE);
fb5f0bc8 3704 zio_vdev_io_bypass(zio);
62840030 3705 return (zio);
fb5f0bc8 3706 }
34dc7c2f 3707
1b939560
BB
3708 if (vd->vdev_ops->vdev_op_leaf && (zio->io_type == ZIO_TYPE_READ ||
3709 zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM)) {
b128c09f 3710
b0bc7a84 3711 if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio))
62840030 3712 return (zio);
b128c09f
BB
3713
3714 if ((zio = vdev_queue_io(zio)) == NULL)
62840030 3715 return (NULL);
b128c09f
BB
3716
3717 if (!vdev_accessible(vd, zio)) {
2e528b49 3718 zio->io_error = SET_ERROR(ENXIO);
b128c09f 3719 zio_interrupt(zio);
62840030 3720 return (NULL);
b128c09f 3721 }
67103816 3722 zio->io_delay = gethrtime();
b128c09f
BB
3723 }
3724
98b25418 3725 vd->vdev_ops->vdev_op_io_start(zio);
62840030 3726 return (NULL);
34dc7c2f
BB
3727}
3728
62840030 3729static zio_t *
34dc7c2f
BB
3730zio_vdev_io_done(zio_t *zio)
3731{
b128c09f
BB
3732 vdev_t *vd = zio->io_vd;
3733 vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
3734 boolean_t unexpected_error = B_FALSE;
34dc7c2f 3735
ddc751d5 3736 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) {
62840030 3737 return (NULL);
ddc751d5 3738 }
34dc7c2f 3739
1b939560
BB
3740 ASSERT(zio->io_type == ZIO_TYPE_READ ||
3741 zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM);
b128c09f 3742
193a37cb
TH
3743 if (zio->io_delay)
3744 zio->io_delay = gethrtime() - zio->io_delay;
3745
b128c09f
BB
3746 if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {
3747
3748 vdev_queue_io_done(zio);
3749
3750 if (zio->io_type == ZIO_TYPE_WRITE)
3751 vdev_cache_write(zio);
3752
3753 if (zio_injection_enabled && zio->io_error == 0)
d977122d
DB
3754 zio->io_error = zio_handle_device_injections(vd, zio,
3755 EIO, EILSEQ);
b128c09f
BB
3756
3757 if (zio_injection_enabled && zio->io_error == 0)
3758 zio->io_error = zio_handle_label_injection(zio, EIO);
3759
1b939560 3760 if (zio->io_error && zio->io_type != ZIO_TYPE_TRIM) {
b128c09f 3761 if (!vdev_accessible(vd, zio)) {
2e528b49 3762 zio->io_error = SET_ERROR(ENXIO);
b128c09f
BB
3763 } else {
3764 unexpected_error = B_TRUE;
3765 }
3766 }
3767 }
3768
3769 ops->vdev_op_io_done(zio);
34dc7c2f 3770
f43615d0 3771 if (unexpected_error)
d164b209 3772 VERIFY(vdev_probe(vd, zio) == NULL);
34dc7c2f 3773
62840030 3774 return (zio);
34dc7c2f
BB
3775}
3776
a8b2e306
TC
3777/*
3778 * This function is used to change the priority of an existing zio that is
3779 * currently in-flight. This is used by the arc to upgrade priority in the
3780 * event that a demand read is made for a block that is currently queued
3781 * as a scrub or async read IO. Otherwise, the high priority read request
3782 * would end up having to wait for the lower priority IO.
3783 */
3784void
3785zio_change_priority(zio_t *pio, zio_priority_t priority)
3786{
3787 zio_t *cio, *cio_next;
3788 zio_link_t *zl = NULL;
3789
3790 ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
3791
3792 if (pio->io_vd != NULL && pio->io_vd->vdev_ops->vdev_op_leaf) {
3793 vdev_queue_change_io_priority(pio, priority);
3794 } else {
3795 pio->io_priority = priority;
3796 }
3797
3798 mutex_enter(&pio->io_lock);
3799 for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
3800 cio_next = zio_walk_children(pio, &zl);
3801 zio_change_priority(cio, priority);
3802 }
3803 mutex_exit(&pio->io_lock);
3804}
3805
428870ff
BB
3806/*
3807 * For non-raidz ZIOs, we can just copy aside the bad data read from the
3808 * disk, and use that to finish the checksum ereport later.
3809 */
3810static void
3811zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
84c07ada 3812 const abd_t *good_buf)
428870ff
BB
3813{
3814 /* no processing needed */
3815 zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
3816}
3817
3818/*ARGSUSED*/
3819void
3820zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
3821{
84c07ada 3822 void *abd = abd_alloc_sametype(zio->io_abd, zio->io_size);
428870ff 3823
84c07ada 3824 abd_copy(abd, zio->io_abd, zio->io_size);
428870ff
BB
3825
3826 zcr->zcr_cbinfo = zio->io_size;
84c07ada 3827 zcr->zcr_cbdata = abd;
428870ff 3828 zcr->zcr_finish = zio_vsd_default_cksum_finish;
84c07ada 3829 zcr->zcr_free = zio_abd_free;
428870ff
BB
3830}
3831
62840030 3832static zio_t *
34dc7c2f
BB
3833zio_vdev_io_assess(zio_t *zio)
3834{
3835 vdev_t *vd = zio->io_vd;
b128c09f 3836
ddc751d5 3837 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) {
62840030 3838 return (NULL);
ddc751d5 3839 }
b128c09f
BB
3840
3841 if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
3842 spa_config_exit(zio->io_spa, SCL_ZIO, zio);
3843
3844 if (zio->io_vsd != NULL) {
428870ff 3845 zio->io_vsd_ops->vsd_free(zio);
b128c09f 3846 zio->io_vsd = NULL;
34dc7c2f
BB
3847 }
3848
b128c09f 3849 if (zio_injection_enabled && zio->io_error == 0)
34dc7c2f
BB
3850 zio->io_error = zio_handle_fault_injection(zio, EIO);
3851
3852 /*
3853 * If the I/O failed, determine whether we should attempt to retry it.
428870ff
BB
3854 *
3855 * On retry, we cut in line in the issue queue, since we don't want
3856 * compression/checksumming/etc. work to prevent our (cheap) IO reissue.
34dc7c2f 3857 */
b128c09f
BB
3858 if (zio->io_error && vd == NULL &&
3859 !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
3860 ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */
3861 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */
34dc7c2f 3862 zio->io_error = 0;
b128c09f
BB
3863 zio->io_flags |= ZIO_FLAG_IO_RETRY |
3864 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
428870ff
BB
3865 zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
3866 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
3867 zio_requeue_io_start_cut_in_line);
62840030 3868 return (NULL);
34dc7c2f
BB
3869 }
3870
b128c09f
BB
3871 /*
3872 * If we got an error on a leaf device, convert it to ENXIO
3873 * if the device is not accessible at all.
3874 */
3875 if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
3876 !vdev_accessible(vd, zio))
2e528b49 3877 zio->io_error = SET_ERROR(ENXIO);
b128c09f
BB
3878
3879 /*
3880 * If we can't write to an interior vdev (mirror or RAID-Z),
3881 * set vdev_cant_write so that we stop trying to allocate from it.
3882 */
3883 if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
13fe0198 3884 vd != NULL && !vd->vdev_ops->vdev_op_leaf) {
b128c09f 3885 vd->vdev_cant_write = B_TRUE;
13fe0198 3886 }
b128c09f 3887
298ec40b
GM
3888 /*
3889 * If a cache flush returns ENOTSUP or ENOTTY, we know that no future
1b939560
BB
3890 * attempts will ever succeed. In this case we set a persistent
3891 * boolean flag so that we don't bother with it in the future.
298ec40b
GM
3892 */
3893 if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) &&
3894 zio->io_type == ZIO_TYPE_IOCTL &&
3895 zio->io_cmd == DKIOCFLUSHWRITECACHE && vd != NULL)
3896 vd->vdev_nowritecache = B_TRUE;
3897
b128c09f
BB
3898 if (zio->io_error)
3899 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
3900
e8b96c60
MA
3901 if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
3902 zio->io_physdone != NULL) {
3903 ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
3904 ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
3905 zio->io_physdone(zio->io_logical);
3906 }
3907
62840030 3908 return (zio);
34dc7c2f
BB
3909}
3910
3911void
3912zio_vdev_io_reissue(zio_t *zio)
3913{
3914 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
3915 ASSERT(zio->io_error == 0);
3916
428870ff 3917 zio->io_stage >>= 1;
34dc7c2f
BB
3918}
3919
3920void
3921zio_vdev_io_redone(zio_t *zio)
3922{
3923 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
3924
428870ff 3925 zio->io_stage >>= 1;
34dc7c2f
BB
3926}
3927
3928void
3929zio_vdev_io_bypass(zio_t *zio)
3930{
3931 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
3932 ASSERT(zio->io_error == 0);
3933
3934 zio->io_flags |= ZIO_FLAG_IO_BYPASS;
428870ff 3935 zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
34dc7c2f
BB
3936}
3937
b5256303
TC
3938/*
3939 * ==========================================================================
3940 * Encrypt and store encryption parameters
3941 * ==========================================================================
3942 */
3943
3944
3945/*
3946 * This function is used for ZIO_STAGE_ENCRYPT. It is responsible for
3947 * managing the storage of encryption parameters and passing them to the
3948 * lower-level encryption functions.
3949 */
62840030 3950static zio_t *
b5256303
TC
3951zio_encrypt(zio_t *zio)
3952{
3953 zio_prop_t *zp = &zio->io_prop;
3954 spa_t *spa = zio->io_spa;
3955 blkptr_t *bp = zio->io_bp;
3956 uint64_t psize = BP_GET_PSIZE(bp);
ae76f45c 3957 uint64_t dsobj = zio->io_bookmark.zb_objset;
b5256303
TC
3958 dmu_object_type_t ot = BP_GET_TYPE(bp);
3959 void *enc_buf = NULL;
3960 abd_t *eabd = NULL;
3961 uint8_t salt[ZIO_DATA_SALT_LEN];
3962 uint8_t iv[ZIO_DATA_IV_LEN];
3963 uint8_t mac[ZIO_DATA_MAC_LEN];
3964 boolean_t no_crypt = B_FALSE;
3965
3966 /* the root zio already encrypted the data */
3967 if (zio->io_child_type == ZIO_CHILD_GANG)
62840030 3968 return (zio);
b5256303
TC
3969
3970 /* only ZIL blocks are re-encrypted on rewrite */
3971 if (!IO_IS_ALLOCATING(zio) && ot != DMU_OT_INTENT_LOG)
62840030 3972 return (zio);
b5256303
TC
3973
3974 if (!(zp->zp_encrypt || BP_IS_ENCRYPTED(bp))) {
3975 BP_SET_CRYPT(bp, B_FALSE);
62840030 3976 return (zio);
b5256303
TC
3977 }
3978
3979 /* if we are doing raw encryption set the provided encryption params */
3980 if (zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) {
ae76f45c 3981 ASSERT0(BP_GET_LEVEL(bp));
b5256303
TC
3982 BP_SET_CRYPT(bp, B_TRUE);
3983 BP_SET_BYTEORDER(bp, zp->zp_byteorder);
3984 if (ot != DMU_OT_OBJSET)
3985 zio_crypt_encode_mac_bp(bp, zp->zp_mac);
ae76f45c
TC
3986
3987 /* dnode blocks must be written out in the provided byteorder */
3988 if (zp->zp_byteorder != ZFS_HOST_BYTEORDER &&
3989 ot == DMU_OT_DNODE) {
3990 void *bswap_buf = zio_buf_alloc(psize);
3991 abd_t *babd = abd_get_from_buf(bswap_buf, psize);
3992
3993 ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
3994 abd_copy_to_buf(bswap_buf, zio->io_abd, psize);
3995 dmu_ot_byteswap[DMU_OT_BYTESWAP(ot)].ob_func(bswap_buf,
3996 psize);
3997
3998 abd_take_ownership_of_buf(babd, B_TRUE);
3999 zio_push_transform(zio, babd, psize, psize, NULL);
4000 }
4001
b5256303
TC
4002 if (DMU_OT_IS_ENCRYPTED(ot))
4003 zio_crypt_encode_params_bp(bp, zp->zp_salt, zp->zp_iv);
62840030 4004 return (zio);
b5256303
TC
4005 }
4006
4007 /* indirect blocks only maintain a cksum of the lower level MACs */
4008 if (BP_GET_LEVEL(bp) > 0) {
4009 BP_SET_CRYPT(bp, B_TRUE);
4010 VERIFY0(zio_crypt_do_indirect_mac_checksum_abd(B_TRUE,
4011 zio->io_orig_abd, BP_GET_LSIZE(bp), BP_SHOULD_BYTESWAP(bp),
4012 mac));
4013 zio_crypt_encode_mac_bp(bp, mac);
62840030 4014 return (zio);
b5256303
TC
4015 }
4016
4017 /*
4018 * Objset blocks are a special case since they have 2 256-bit MACs
4019 * embedded within them.
4020 */
4021 if (ot == DMU_OT_OBJSET) {
4022 ASSERT0(DMU_OT_IS_ENCRYPTED(ot));
4023 ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
4024 BP_SET_CRYPT(bp, B_TRUE);
ae76f45c
TC
4025 VERIFY0(spa_do_crypt_objset_mac_abd(B_TRUE, spa, dsobj,
4026 zio->io_abd, psize, BP_SHOULD_BYTESWAP(bp)));
62840030 4027 return (zio);
b5256303
TC
4028 }
4029
4030 /* unencrypted object types are only authenticated with a MAC */
4031 if (!DMU_OT_IS_ENCRYPTED(ot)) {
4032 BP_SET_CRYPT(bp, B_TRUE);
ae76f45c
TC
4033 VERIFY0(spa_do_crypt_mac_abd(B_TRUE, spa, dsobj,
4034 zio->io_abd, psize, mac));
b5256303 4035 zio_crypt_encode_mac_bp(bp, mac);
62840030 4036 return (zio);
b5256303
TC
4037 }
4038
4039 /*
4040 * Later passes of sync-to-convergence may decide to rewrite data
4041 * in place to avoid more disk reallocations. This presents a problem
d611989f 4042 * for encryption because this constitutes rewriting the new data with
b5256303
TC
4043 * the same encryption key and IV. However, this only applies to blocks
4044 * in the MOS (particularly the spacemaps) and we do not encrypt the
4045 * MOS. We assert that the zio is allocating or an intent log write
4046 * to enforce this.
4047 */
4048 ASSERT(IO_IS_ALLOCATING(zio) || ot == DMU_OT_INTENT_LOG);
4049 ASSERT(BP_GET_LEVEL(bp) == 0 || ot == DMU_OT_INTENT_LOG);
4050 ASSERT(spa_feature_is_active(spa, SPA_FEATURE_ENCRYPTION));
4051 ASSERT3U(psize, !=, 0);
4052
4053 enc_buf = zio_buf_alloc(psize);
4054 eabd = abd_get_from_buf(enc_buf, psize);
4055 abd_take_ownership_of_buf(eabd, B_TRUE);
4056
4057 /*
4058 * For an explanation of what encryption parameters are stored
4059 * where, see the block comment in zio_crypt.c.
4060 */
4061 if (ot == DMU_OT_INTENT_LOG) {
4062 zio_crypt_decode_params_bp(bp, salt, iv);
4063 } else {
4064 BP_SET_CRYPT(bp, B_TRUE);
4065 }
4066
4067 /* Perform the encryption. This should not fail */
be9a5c35
TC
4068 VERIFY0(spa_do_crypt_abd(B_TRUE, spa, &zio->io_bookmark,
4069 BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp),
4070 salt, iv, mac, psize, zio->io_abd, eabd, &no_crypt));
b5256303
TC
4071
4072 /* encode encryption metadata into the bp */
4073 if (ot == DMU_OT_INTENT_LOG) {
4074 /*
4075 * ZIL blocks store the MAC in the embedded checksum, so the
4076 * transform must always be applied.
4077 */
4078 zio_crypt_encode_mac_zil(enc_buf, mac);
4079 zio_push_transform(zio, eabd, psize, psize, NULL);
4080 } else {
4081 BP_SET_CRYPT(bp, B_TRUE);
4082 zio_crypt_encode_params_bp(bp, salt, iv);
4083 zio_crypt_encode_mac_bp(bp, mac);
4084
4085 if (no_crypt) {
4086 ASSERT3U(ot, ==, DMU_OT_DNODE);
4087 abd_free(eabd);
4088 } else {
4089 zio_push_transform(zio, eabd, psize, psize, NULL);
4090 }
4091 }
4092
62840030 4093 return (zio);
b5256303
TC
4094}
4095
34dc7c2f
BB
4096/*
4097 * ==========================================================================
4098 * Generate and verify checksums
4099 * ==========================================================================
4100 */
62840030 4101static zio_t *
34dc7c2f
BB
4102zio_checksum_generate(zio_t *zio)
4103{
34dc7c2f 4104 blkptr_t *bp = zio->io_bp;
b128c09f 4105 enum zio_checksum checksum;
34dc7c2f 4106
b128c09f
BB
4107 if (bp == NULL) {
4108 /*
4109 * This is zio_write_phys().
4110 * We're either generating a label checksum, or none at all.
4111 */
4112 checksum = zio->io_prop.zp_checksum;
34dc7c2f 4113
b128c09f 4114 if (checksum == ZIO_CHECKSUM_OFF)
62840030 4115 return (zio);
b128c09f
BB
4116
4117 ASSERT(checksum == ZIO_CHECKSUM_LABEL);
4118 } else {
4119 if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
4120 ASSERT(!IO_IS_ALLOCATING(zio));
4121 checksum = ZIO_CHECKSUM_GANG_HEADER;
4122 } else {
4123 checksum = BP_GET_CHECKSUM(bp);
4124 }
4125 }
34dc7c2f 4126
a6255b7f 4127 zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size);
34dc7c2f 4128
62840030 4129 return (zio);
34dc7c2f
BB
4130}
4131
62840030 4132static zio_t *
b128c09f 4133zio_checksum_verify(zio_t *zio)
34dc7c2f 4134{
428870ff 4135 zio_bad_cksum_t info;
b128c09f
BB
4136 blkptr_t *bp = zio->io_bp;
4137 int error;
34dc7c2f 4138
428870ff
BB
4139 ASSERT(zio->io_vd != NULL);
4140
b128c09f
BB
4141 if (bp == NULL) {
4142 /*
4143 * This is zio_read_phys().
4144 * We're either verifying a label checksum, or nothing at all.
4145 */
4146 if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
62840030 4147 return (zio);
34dc7c2f 4148
b128c09f
BB
4149 ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
4150 }
34dc7c2f 4151
428870ff 4152 if ((error = zio_checksum_error(zio, &info)) != 0) {
b128c09f 4153 zio->io_error = error;
7a3066ff
MA
4154 if (error == ECKSUM &&
4155 !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2bbec1c9
TH
4156 mutex_enter(&zio->io_vd->vdev_stat_lock);
4157 zio->io_vd->vdev_stat.vs_checksum_errors++;
4158 mutex_exit(&zio->io_vd->vdev_stat_lock);
4159
428870ff 4160 zfs_ereport_start_checksum(zio->io_spa,
b5256303
TC
4161 zio->io_vd, &zio->io_bookmark, zio,
4162 zio->io_offset, zio->io_size, NULL, &info);
b128c09f 4163 }
34dc7c2f
BB
4164 }
4165
62840030 4166 return (zio);
34dc7c2f
BB
4167}
4168
4169/*
4170 * Called by RAID-Z to ensure we don't compute the checksum twice.
4171 */
4172void
4173zio_checksum_verified(zio_t *zio)
4174{
428870ff 4175 zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
34dc7c2f
BB
4176}
4177
4178/*
b128c09f
BB
4179 * ==========================================================================
4180 * Error rank. Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
9b67f605 4181 * An error of 0 indicates success. ENXIO indicates whole-device failure,
d611989f 4182 * which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO
b128c09f
BB
4183 * indicate errors that are specific to one I/O, and most likely permanent.
4184 * Any other error is presumed to be worse because we weren't expecting it.
4185 * ==========================================================================
34dc7c2f 4186 */
b128c09f
BB
4187int
4188zio_worst_error(int e1, int e2)
34dc7c2f 4189{
b128c09f
BB
4190 static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
4191 int r1, r2;
4192
4193 for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++)
4194 if (e1 == zio_error_rank[r1])
4195 break;
34dc7c2f 4196
b128c09f
BB
4197 for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++)
4198 if (e2 == zio_error_rank[r2])
4199 break;
4200
4201 return (r1 > r2 ? e1 : e2);
34dc7c2f
BB
4202}
4203
4204/*
4205 * ==========================================================================
b128c09f 4206 * I/O completion
34dc7c2f
BB
4207 * ==========================================================================
4208 */
62840030 4209static zio_t *
b128c09f 4210zio_ready(zio_t *zio)
34dc7c2f 4211{
b128c09f 4212 blkptr_t *bp = zio->io_bp;
d164b209 4213 zio_t *pio, *pio_next;
3dfb57a3 4214 zio_link_t *zl = NULL;
34dc7c2f 4215
ddc751d5
GW
4216 if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT,
4217 ZIO_WAIT_READY)) {
62840030 4218 return (NULL);
ddc751d5 4219 }
34dc7c2f 4220
9babb374 4221 if (zio->io_ready) {
b128c09f 4222 ASSERT(IO_IS_ALLOCATING(zio));
03c6040b
GW
4223 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
4224 (zio->io_flags & ZIO_FLAG_NOPWRITE));
b128c09f 4225 ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
34dc7c2f 4226
b128c09f
BB
4227 zio->io_ready(zio);
4228 }
34dc7c2f 4229
b128c09f
BB
4230 if (bp != NULL && bp != &zio->io_bp_copy)
4231 zio->io_bp_copy = *bp;
34dc7c2f 4232
3dfb57a3 4233 if (zio->io_error != 0) {
b128c09f 4234 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
34dc7c2f 4235
3dfb57a3
DB
4236 if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
4237 ASSERT(IO_IS_ALLOCATING(zio));
4238 ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
cc99f275
DB
4239 ASSERT(zio->io_metaslab_class != NULL);
4240
3dfb57a3
DB
4241 /*
4242 * We were unable to allocate anything, unreserve and
4243 * issue the next I/O to allocate.
4244 */
4245 metaslab_class_throttle_unreserve(
cc99f275
DB
4246 zio->io_metaslab_class, zio->io_prop.zp_copies,
4247 zio->io_allocator, zio);
492f64e9 4248 zio_allocate_dispatch(zio->io_spa, zio->io_allocator);
3dfb57a3
DB
4249 }
4250 }
4251
d164b209
BB
4252 mutex_enter(&zio->io_lock);
4253 zio->io_state[ZIO_WAIT_READY] = 1;
3dfb57a3 4254 pio = zio_walk_parents(zio, &zl);
d164b209
BB
4255 mutex_exit(&zio->io_lock);
4256
4257 /*
4258 * As we notify zio's parents, new parents could be added.
4259 * New parents go to the head of zio's io_parent_list, however,
4260 * so we will (correctly) not notify them. The remainder of zio's
4261 * io_parent_list, from 'pio_next' onward, cannot change because
4262 * all parents must wait for us to be done before they can be done.
4263 */
4264 for (; pio != NULL; pio = pio_next) {
3dfb57a3 4265 pio_next = zio_walk_parents(zio, &zl);
62840030 4266 zio_notify_parent(pio, zio, ZIO_WAIT_READY, NULL);
d164b209 4267 }
34dc7c2f 4268
428870ff
BB
4269 if (zio->io_flags & ZIO_FLAG_NODATA) {
4270 if (BP_IS_GANG(bp)) {
4271 zio->io_flags &= ~ZIO_FLAG_NODATA;
4272 } else {
a6255b7f 4273 ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE);
428870ff
BB
4274 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
4275 }
4276 }
4277
4278 if (zio_injection_enabled &&
4279 zio->io_spa->spa_syncing_txg == zio->io_txg)
4280 zio_handle_ignored_writes(zio);
4281
62840030 4282 return (zio);
34dc7c2f
BB
4283}
4284
3dfb57a3
DB
4285/*
4286 * Update the allocation throttle accounting.
4287 */
4288static void
4289zio_dva_throttle_done(zio_t *zio)
4290{
1c27024e 4291 ASSERTV(zio_t *lio = zio->io_logical);
3dfb57a3
DB
4292 zio_t *pio = zio_unique_parent(zio);
4293 vdev_t *vd = zio->io_vd;
4294 int flags = METASLAB_ASYNC_ALLOC;
4295
4296 ASSERT3P(zio->io_bp, !=, NULL);
4297 ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
4298 ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE);
4299 ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
4300 ASSERT(vd != NULL);
4301 ASSERT3P(vd, ==, vd->vdev_top);
21df134f
SB
4302 ASSERT(zio_injection_enabled || !(zio->io_flags & ZIO_FLAG_IO_RETRY));
4303 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
3dfb57a3
DB
4304 ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING);
4305 ASSERT(!(lio->io_flags & ZIO_FLAG_IO_REWRITE));
4306 ASSERT(!(lio->io_orig_flags & ZIO_FLAG_NODATA));
4307
4308 /*
4309 * Parents of gang children can have two flavors -- ones that
4310 * allocated the gang header (will have ZIO_FLAG_IO_REWRITE set)
4311 * and ones that allocated the constituent blocks. The allocation
4312 * throttle needs to know the allocating parent zio so we must find
4313 * it here.
4314 */
4315 if (pio->io_child_type == ZIO_CHILD_GANG) {
4316 /*
4317 * If our parent is a rewrite gang child then our grandparent
4318 * would have been the one that performed the allocation.
4319 */
4320 if (pio->io_flags & ZIO_FLAG_IO_REWRITE)
4321 pio = zio_unique_parent(pio);
4322 flags |= METASLAB_GANG_CHILD;
4323 }
4324
4325 ASSERT(IO_IS_ALLOCATING(pio));
4326 ASSERT3P(zio, !=, zio->io_logical);
4327 ASSERT(zio->io_logical != NULL);
4328 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
4329 ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE);
cc99f275 4330 ASSERT(zio->io_metaslab_class != NULL);
3dfb57a3
DB
4331
4332 mutex_enter(&pio->io_lock);
492f64e9
PD
4333 metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags,
4334 pio->io_allocator, B_TRUE);
3dfb57a3
DB
4335 mutex_exit(&pio->io_lock);
4336
cc99f275
DB
4337 metaslab_class_throttle_unreserve(zio->io_metaslab_class, 1,
4338 pio->io_allocator, pio);
3dfb57a3
DB
4339
4340 /*
4341 * Call into the pipeline to see if there is more work that
4342 * needs to be done. If there is work to be done it will be
4343 * dispatched to another taskq thread.
4344 */
492f64e9 4345 zio_allocate_dispatch(zio->io_spa, pio->io_allocator);
3dfb57a3
DB
4346}
4347
62840030 4348static zio_t *
b128c09f 4349zio_done(zio_t *zio)
34dc7c2f 4350{
3dfb57a3
DB
4351 /*
4352 * Always attempt to keep stack usage minimal here since
d611989f 4353 * we can be called recursively up to 19 levels deep.
3dfb57a3 4354 */
84c07ada 4355 const uint64_t psize = zio->io_size;
d164b209 4356 zio_t *pio, *pio_next;
3dfb57a3 4357 zio_link_t *zl = NULL;
34dc7c2f 4358
b128c09f 4359 /*
9babb374 4360 * If our children haven't all completed,
b128c09f
BB
4361 * wait for them and then repeat this pipeline stage.
4362 */
ddc751d5 4363 if (zio_wait_for_children(zio, ZIO_CHILD_ALL_BITS, ZIO_WAIT_DONE)) {
62840030 4364 return (NULL);
ddc751d5 4365 }
34dc7c2f 4366
3dfb57a3
DB
4367 /*
4368 * If the allocation throttle is enabled, then update the accounting.
4369 * We only track child I/Os that are part of an allocating async
4370 * write. We must do this since the allocation is performed
4371 * by the logical I/O but the actual write is done by child I/Os.
4372 */
4373 if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING &&
4374 zio->io_child_type == ZIO_CHILD_VDEV) {
cc99f275
DB
4375 ASSERT(zio->io_metaslab_class != NULL);
4376 ASSERT(zio->io_metaslab_class->mc_alloc_throttle_enabled);
3dfb57a3
DB
4377 zio_dva_throttle_done(zio);
4378 }
4379
4380 /*
4381 * If the allocation throttle is enabled, verify that
4382 * we have decremented the refcounts for every I/O that was throttled.
4383 */
4384 if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
4385 ASSERT(zio->io_type == ZIO_TYPE_WRITE);
4386 ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
4387 ASSERT(zio->io_bp != NULL);
cc99f275 4388
492f64e9
PD
4389 metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio,
4390 zio->io_allocator);
424fd7c3 4391 VERIFY(zfs_refcount_not_held(
cc99f275 4392 &zio->io_metaslab_class->mc_alloc_slots[zio->io_allocator],
492f64e9 4393 zio));
3dfb57a3
DB
4394 }
4395
4396
1c27024e
DB
4397 for (int c = 0; c < ZIO_CHILD_TYPES; c++)
4398 for (int w = 0; w < ZIO_WAIT_TYPES; w++)
b128c09f
BB
4399 ASSERT(zio->io_children[c][w] == 0);
4400
9b67f605 4401 if (zio->io_bp != NULL && !BP_IS_EMBEDDED(zio->io_bp)) {
c776b317
BB
4402 ASSERT(zio->io_bp->blk_pad[0] == 0);
4403 ASSERT(zio->io_bp->blk_pad[1] == 0);
d1d7e268
MK
4404 ASSERT(bcmp(zio->io_bp, &zio->io_bp_copy,
4405 sizeof (blkptr_t)) == 0 ||
c776b317
BB
4406 (zio->io_bp == zio_unique_parent(zio)->io_bp));
4407 if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(zio->io_bp) &&
428870ff 4408 zio->io_bp_override == NULL &&
b128c09f 4409 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
d1d7e268
MK
4410 ASSERT3U(zio->io_prop.zp_copies, <=,
4411 BP_GET_NDVAS(zio->io_bp));
c776b317 4412 ASSERT(BP_COUNT_GANG(zio->io_bp) == 0 ||
d1d7e268
MK
4413 (BP_COUNT_GANG(zio->io_bp) ==
4414 BP_GET_NDVAS(zio->io_bp)));
b128c09f 4415 }
03c6040b
GW
4416 if (zio->io_flags & ZIO_FLAG_NOPWRITE)
4417 VERIFY(BP_EQUAL(zio->io_bp, &zio->io_bp_orig));
b128c09f
BB
4418 }
4419
4420 /*
428870ff 4421 * If there were child vdev/gang/ddt errors, they apply to us now.
b128c09f
BB
4422 */
4423 zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
4424 zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
428870ff
BB
4425 zio_inherit_child_errors(zio, ZIO_CHILD_DDT);
4426
4427 /*
4428 * If the I/O on the transformed data was successful, generate any
4429 * checksum reports now while we still have the transformed data.
4430 */
4431 if (zio->io_error == 0) {
4432 while (zio->io_cksum_report != NULL) {
4433 zio_cksum_report_t *zcr = zio->io_cksum_report;
4434 uint64_t align = zcr->zcr_align;
a6255b7f 4435 uint64_t asize = P2ROUNDUP(psize, align);
a6255b7f
DQ
4436 abd_t *adata = zio->io_abd;
4437
4438 if (asize != psize) {
84c07ada 4439 adata = abd_alloc(asize, B_TRUE);
a6255b7f
DQ
4440 abd_copy(adata, zio->io_abd, psize);
4441 abd_zero_off(adata, psize, asize - psize);
428870ff
BB
4442 }
4443
4444 zio->io_cksum_report = zcr->zcr_next;
4445 zcr->zcr_next = NULL;
84c07ada 4446 zcr->zcr_finish(zcr, adata);
428870ff
BB
4447 zfs_ereport_free_checksum(zcr);
4448
a6255b7f
DQ
4449 if (asize != psize)
4450 abd_free(adata);
428870ff
BB
4451 }
4452 }
b128c09f
BB
4453
4454 zio_pop_transforms(zio); /* note: may set zio->io_error */
4455
a6255b7f 4456 vdev_stat_update(zio, psize);
b128c09f 4457
a69052be 4458 /*
cc92e9d0 4459 * If this I/O is attached to a particular vdev is slow, exceeding
72f53c56
MJ
4460 * 30 seconds to complete, post an error described the I/O delay.
4461 * We ignore these errors if the device is currently unavailable.
a69052be 4462 */
ad796b8a
TH
4463 if (zio->io_delay >= MSEC2NSEC(zio_slow_io_ms)) {
4464 if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) {
4465 /*
4466 * We want to only increment our slow IO counters if
4467 * the IO is valid (i.e. not if the drive is removed).
4468 *
4469 * zfs_ereport_post() will also do these checks, but
4470 * it can also ratelimit and have other failures, so we
4471 * need to increment the slow_io counters independent
4472 * of it.
4473 */
4474 if (zfs_ereport_is_valid(FM_EREPORT_ZFS_DELAY,
4475 zio->io_spa, zio->io_vd, zio)) {
4476 mutex_enter(&zio->io_vd->vdev_stat_lock);
4477 zio->io_vd->vdev_stat.vs_slow_ios++;
4478 mutex_exit(&zio->io_vd->vdev_stat_lock);
4479
4480 zfs_ereport_post(FM_EREPORT_ZFS_DELAY,
4481 zio->io_spa, zio->io_vd, &zio->io_bookmark,
4482 zio, 0, 0);
4483 }
4484 }
72f53c56 4485 }
a69052be 4486
b128c09f
BB
4487 if (zio->io_error) {
4488 /*
4489 * If this I/O is attached to a particular vdev,
4490 * generate an error message describing the I/O failure
4491 * at the block level. We ignore these errors if the
4492 * device is currently unavailable.
4493 */
c776b317 4494 if (zio->io_error != ECKSUM && zio->io_vd != NULL &&
2bbec1c9
TH
4495 !vdev_is_dead(zio->io_vd)) {
4496 mutex_enter(&zio->io_vd->vdev_stat_lock);
4497 if (zio->io_type == ZIO_TYPE_READ) {
4498 zio->io_vd->vdev_stat.vs_read_errors++;
4499 } else if (zio->io_type == ZIO_TYPE_WRITE) {
4500 zio->io_vd->vdev_stat.vs_write_errors++;
4501 }
4502 mutex_exit(&zio->io_vd->vdev_stat_lock);
4503
c776b317 4504 zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa,
b5256303 4505 zio->io_vd, &zio->io_bookmark, zio, 0, 0);
2bbec1c9 4506 }
34dc7c2f 4507
428870ff
BB
4508 if ((zio->io_error == EIO || !(zio->io_flags &
4509 (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
c776b317 4510 zio == zio->io_logical) {
b128c09f
BB
4511 /*
4512 * For logical I/O requests, tell the SPA to log the
4513 * error and generate a logical data ereport.
4514 */
b5256303 4515 spa_log_error(zio->io_spa, &zio->io_bookmark);
d1d7e268 4516 zfs_ereport_post(FM_EREPORT_ZFS_DATA, zio->io_spa,
b5256303 4517 NULL, &zio->io_bookmark, zio, 0, 0);
b128c09f
BB
4518 }
4519 }
34dc7c2f 4520
c776b317 4521 if (zio->io_error && zio == zio->io_logical) {
b128c09f
BB
4522 /*
4523 * Determine whether zio should be reexecuted. This will
4524 * propagate all the way to the root via zio_notify_parent().
4525 */
c776b317 4526 ASSERT(zio->io_vd == NULL && zio->io_bp != NULL);
428870ff 4527 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
b128c09f 4528
428870ff
BB
4529 if (IO_IS_ALLOCATING(zio) &&
4530 !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
b128c09f
BB
4531 if (zio->io_error != ENOSPC)
4532 zio->io_reexecute |= ZIO_REEXECUTE_NOW;
4533 else
4534 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
428870ff 4535 }
b128c09f
BB
4536
4537 if ((zio->io_type == ZIO_TYPE_READ ||
4538 zio->io_type == ZIO_TYPE_FREE) &&
572e2857 4539 !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
b128c09f 4540 zio->io_error == ENXIO &&
c776b317
BB
4541 spa_load_state(zio->io_spa) == SPA_LOAD_NONE &&
4542 spa_get_failmode(zio->io_spa) != ZIO_FAILURE_MODE_CONTINUE)
b128c09f
BB
4543 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
4544
4545 if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
4546 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
428870ff
BB
4547
4548 /*
4549 * Here is a possibly good place to attempt to do
4550 * either combinatorial reconstruction or error correction
4551 * based on checksums. It also might be a good place
4552 * to send out preliminary ereports before we suspend
4553 * processing.
4554 */
34dc7c2f
BB
4555 }
4556
4557 /*
b128c09f
BB
4558 * If there were logical child errors, they apply to us now.
4559 * We defer this until now to avoid conflating logical child
4560 * errors with errors that happened to the zio itself when
4561 * updating vdev stats and reporting FMA events above.
34dc7c2f 4562 */
b128c09f 4563 zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
34dc7c2f 4564
428870ff
BB
4565 if ((zio->io_error || zio->io_reexecute) &&
4566 IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
03c6040b 4567 !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
c776b317 4568 zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp);
9babb374
BB
4569
4570 zio_gang_tree_free(&zio->io_gang_tree);
4571
4572 /*
4573 * Godfather I/Os should never suspend.
4574 */
4575 if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
4576 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
a32494d2 4577 zio->io_reexecute &= ~ZIO_REEXECUTE_SUSPEND;
9babb374 4578
b128c09f
BB
4579 if (zio->io_reexecute) {
4580 /*
4581 * This is a logical I/O that wants to reexecute.
4582 *
4583 * Reexecute is top-down. When an i/o fails, if it's not
4584 * the root, it simply notifies its parent and sticks around.
4585 * The parent, seeing that it still has children in zio_done(),
4586 * does the same. This percolates all the way up to the root.
4587 * The root i/o will reexecute or suspend the entire tree.
4588 *
4589 * This approach ensures that zio_reexecute() honors
4590 * all the original i/o dependency relationships, e.g.
4591 * parents not executing until children are ready.
4592 */
4593 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
34dc7c2f 4594
9babb374 4595 zio->io_gang_leader = NULL;
b128c09f 4596
d164b209
BB
4597 mutex_enter(&zio->io_lock);
4598 zio->io_state[ZIO_WAIT_DONE] = 1;
4599 mutex_exit(&zio->io_lock);
4600
9babb374
BB
4601 /*
4602 * "The Godfather" I/O monitors its children but is
4603 * not a true parent to them. It will track them through
4604 * the pipeline but severs its ties whenever they get into
4605 * trouble (e.g. suspended). This allows "The Godfather"
4606 * I/O to return status without blocking.
4607 */
3dfb57a3
DB
4608 zl = NULL;
4609 for (pio = zio_walk_parents(zio, &zl); pio != NULL;
4610 pio = pio_next) {
4611 zio_link_t *remove_zl = zl;
4612 pio_next = zio_walk_parents(zio, &zl);
9babb374
BB
4613
4614 if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
4615 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
3dfb57a3 4616 zio_remove_child(pio, zio, remove_zl);
62840030
MA
4617 /*
4618 * This is a rare code path, so we don't
4619 * bother with "next_to_execute".
4620 */
4621 zio_notify_parent(pio, zio, ZIO_WAIT_DONE,
4622 NULL);
9babb374
BB
4623 }
4624 }
4625
d164b209 4626 if ((pio = zio_unique_parent(zio)) != NULL) {
b128c09f
BB
4627 /*
4628 * We're not a root i/o, so there's nothing to do
4629 * but notify our parent. Don't propagate errors
4630 * upward since we haven't permanently failed yet.
4631 */
9babb374 4632 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
b128c09f 4633 zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
62840030
MA
4634 /*
4635 * This is a rare code path, so we don't bother with
4636 * "next_to_execute".
4637 */
4638 zio_notify_parent(pio, zio, ZIO_WAIT_DONE, NULL);
b128c09f
BB
4639 } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
4640 /*
4641 * We'd fail again if we reexecuted now, so suspend
4642 * until conditions improve (e.g. device comes online).
4643 */
cec3a0a1 4644 zio_suspend(zio->io_spa, zio, ZIO_SUSPEND_IOERR);
b128c09f
BB
4645 } else {
4646 /*
4647 * Reexecution is potentially a huge amount of work.
4648 * Hand it off to the otherwise-unused claim taskq.
4649 */
a38718a6 4650 ASSERT(taskq_empty_ent(&zio->io_tqent));
7ef5e54e
AL
4651 spa_taskq_dispatch_ent(zio->io_spa,
4652 ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE,
a38718a6
GA
4653 (task_func_t *)zio_reexecute, zio, 0,
4654 &zio->io_tqent);
b128c09f 4655 }
62840030 4656 return (NULL);
34dc7c2f
BB
4657 }
4658
428870ff 4659 ASSERT(zio->io_child_count == 0);
b128c09f
BB
4660 ASSERT(zio->io_reexecute == 0);
4661 ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
34dc7c2f 4662
428870ff
BB
4663 /*
4664 * Report any checksum errors, since the I/O is complete.
4665 */
4666 while (zio->io_cksum_report != NULL) {
4667 zio_cksum_report_t *zcr = zio->io_cksum_report;
4668 zio->io_cksum_report = zcr->zcr_next;
4669 zcr->zcr_next = NULL;
4670 zcr->zcr_finish(zcr, NULL);
4671 zfs_ereport_free_checksum(zcr);
4672 }
4673
920dd524 4674 if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp &&
9b67f605
MA
4675 !BP_IS_HOLE(zio->io_bp) && !BP_IS_EMBEDDED(zio->io_bp) &&
4676 !(zio->io_flags & ZIO_FLAG_NOPWRITE)) {
920dd524
ED
4677 metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp);
4678 }
4679
d164b209
BB
4680 /*
4681 * It is the responsibility of the done callback to ensure that this
4682 * particular zio is no longer discoverable for adoption, and as
4683 * such, cannot acquire any new parents.
4684 */
b128c09f
BB
4685 if (zio->io_done)
4686 zio->io_done(zio);
34dc7c2f 4687
d164b209
BB
4688 mutex_enter(&zio->io_lock);
4689 zio->io_state[ZIO_WAIT_DONE] = 1;
4690 mutex_exit(&zio->io_lock);
34dc7c2f 4691
62840030
MA
4692 /*
4693 * We are done executing this zio. We may want to execute a parent
4694 * next. See the comment in zio_notify_parent().
4695 */
4696 zio_t *next_to_execute = NULL;
3dfb57a3
DB
4697 zl = NULL;
4698 for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) {
4699 zio_link_t *remove_zl = zl;
4700 pio_next = zio_walk_parents(zio, &zl);
4701 zio_remove_child(pio, zio, remove_zl);
62840030 4702 zio_notify_parent(pio, zio, ZIO_WAIT_DONE, &next_to_execute);
b128c09f 4703 }
34dc7c2f 4704
b128c09f
BB
4705 if (zio->io_waiter != NULL) {
4706 mutex_enter(&zio->io_lock);
4707 zio->io_executor = NULL;
4708 cv_broadcast(&zio->io_cv);
4709 mutex_exit(&zio->io_lock);
4710 } else {
4711 zio_destroy(zio);
4712 }
34dc7c2f 4713
62840030 4714 return (next_to_execute);
34dc7c2f
BB
4715}
4716
4717/*
b128c09f
BB
4718 * ==========================================================================
4719 * I/O pipeline definition
4720 * ==========================================================================
34dc7c2f 4721 */
428870ff 4722static zio_pipe_stage_t *zio_pipeline[] = {
b128c09f 4723 NULL,
b128c09f 4724 zio_read_bp_init,
3dfb57a3 4725 zio_write_bp_init,
428870ff
BB
4726 zio_free_bp_init,
4727 zio_issue_async,
3dfb57a3 4728 zio_write_compress,
b5256303 4729 zio_encrypt,
b128c09f 4730 zio_checksum_generate,
03c6040b 4731 zio_nop_write,
428870ff
BB
4732 zio_ddt_read_start,
4733 zio_ddt_read_done,
4734 zio_ddt_write,
4735 zio_ddt_free,
b128c09f
BB
4736 zio_gang_assemble,
4737 zio_gang_issue,
3dfb57a3 4738 zio_dva_throttle,
b128c09f
BB
4739 zio_dva_allocate,
4740 zio_dva_free,
4741 zio_dva_claim,
4742 zio_ready,
4743 zio_vdev_io_start,
4744 zio_vdev_io_done,
4745 zio_vdev_io_assess,
4746 zio_checksum_verify,
4747 zio_done
4748};
c28b2279 4749
9ae529ec 4750
9ae529ec 4751
9ae529ec 4752
fcff0f35
PD
4753/*
4754 * Compare two zbookmark_phys_t's to see which we would reach first in a
4755 * pre-order traversal of the object tree.
4756 *
4757 * This is simple in every case aside from the meta-dnode object. For all other
4758 * objects, we traverse them in order (object 1 before object 2, and so on).
4759 * However, all of these objects are traversed while traversing object 0, since
4760 * the data it points to is the list of objects. Thus, we need to convert to a
4761 * canonical representation so we can compare meta-dnode bookmarks to
4762 * non-meta-dnode bookmarks.
4763 *
4764 * We do this by calculating "equivalents" for each field of the zbookmark.
4765 * zbookmarks outside of the meta-dnode use their own object and level, and
4766 * calculate the level 0 equivalent (the first L0 blkid that is contained in the
4767 * blocks this bookmark refers to) by multiplying their blkid by their span
4768 * (the number of L0 blocks contained within one block at their level).
4769 * zbookmarks inside the meta-dnode calculate their object equivalent
4770 * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use
4771 * level + 1<<31 (any value larger than a level could ever be) for their level.
4772 * This causes them to always compare before a bookmark in their object
4773 * equivalent, compare appropriately to bookmarks in other objects, and to
4774 * compare appropriately to other bookmarks in the meta-dnode.
4775 */
4776int
4777zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2,
4778 const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2)
4779{
4780 /*
4781 * These variables represent the "equivalent" values for the zbookmark,
4782 * after converting zbookmarks inside the meta dnode to their
4783 * normal-object equivalents.
4784 */
4785 uint64_t zb1obj, zb2obj;
4786 uint64_t zb1L0, zb2L0;
4787 uint64_t zb1level, zb2level;
4788
4789 if (zb1->zb_object == zb2->zb_object &&
4790 zb1->zb_level == zb2->zb_level &&
4791 zb1->zb_blkid == zb2->zb_blkid)
4792 return (0);
9ae529ec 4793
fcff0f35
PD
4794 /*
4795 * BP_SPANB calculates the span in blocks.
4796 */
4797 zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level);
4798 zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level);
9ae529ec
CS
4799
4800 if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
fcff0f35
PD
4801 zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
4802 zb1L0 = 0;
4803 zb1level = zb1->zb_level + COMPARE_META_LEVEL;
4804 } else {
4805 zb1obj = zb1->zb_object;
4806 zb1level = zb1->zb_level;
9ae529ec
CS
4807 }
4808
fcff0f35
PD
4809 if (zb2->zb_object == DMU_META_DNODE_OBJECT) {
4810 zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
4811 zb2L0 = 0;
4812 zb2level = zb2->zb_level + COMPARE_META_LEVEL;
4813 } else {
4814 zb2obj = zb2->zb_object;
4815 zb2level = zb2->zb_level;
4816 }
4817
4818 /* Now that we have a canonical representation, do the comparison. */
4819 if (zb1obj != zb2obj)
4820 return (zb1obj < zb2obj ? -1 : 1);
4821 else if (zb1L0 != zb2L0)
4822 return (zb1L0 < zb2L0 ? -1 : 1);
4823 else if (zb1level != zb2level)
4824 return (zb1level > zb2level ? -1 : 1);
4825 /*
4826 * This can (theoretically) happen if the bookmarks have the same object
4827 * and level, but different blkids, if the block sizes are not the same.
4828 * There is presently no way to change the indirect block sizes
4829 */
4830 return (0);
4831}
4832
4833/*
4834 * This function checks the following: given that last_block is the place that
4835 * our traversal stopped last time, does that guarantee that we've visited
4836 * every node under subtree_root? Therefore, we can't just use the raw output
4837 * of zbookmark_compare. We have to pass in a modified version of
4838 * subtree_root; by incrementing the block id, and then checking whether
4839 * last_block is before or equal to that, we can tell whether or not having
4840 * visited last_block implies that all of subtree_root's children have been
4841 * visited.
4842 */
4843boolean_t
4844zbookmark_subtree_completed(const dnode_phys_t *dnp,
4845 const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block)
4846{
4847 zbookmark_phys_t mod_zb = *subtree_root;
4848 mod_zb.zb_blkid++;
4849 ASSERT(last_block->zb_level == 0);
4850
4851 /* The objset_phys_t isn't before anything. */
4852 if (dnp == NULL)
9ae529ec 4853 return (B_FALSE);
fcff0f35
PD
4854
4855 /*
4856 * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the
4857 * data block size in sectors, because that variable is only used if
4858 * the bookmark refers to a block in the meta-dnode. Since we don't
4859 * know without examining it what object it refers to, and there's no
4860 * harm in passing in this value in other cases, we always pass it in.
4861 *
4862 * We pass in 0 for the indirect block size shift because zb2 must be
4863 * level 0. The indirect block size is only used to calculate the span
4864 * of the bookmark, but since the bookmark must be level 0, the span is
4865 * always 1, so the math works out.
4866 *
4867 * If you make changes to how the zbookmark_compare code works, be sure
4868 * to make sure that this code still works afterwards.
4869 */
4870 return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift,
4871 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb,
4872 last_block) <= 0);
9ae529ec
CS
4873}
4874
93ce2b4c 4875#if defined(_KERNEL)
c28b2279 4876EXPORT_SYMBOL(zio_type_name);
81971b13
BB
4877EXPORT_SYMBOL(zio_buf_alloc);
4878EXPORT_SYMBOL(zio_data_buf_alloc);
4879EXPORT_SYMBOL(zio_buf_free);
4880EXPORT_SYMBOL(zio_data_buf_free);
c28b2279 4881
ad796b8a
TH
4882module_param(zio_slow_io_ms, int, 0644);
4883MODULE_PARM_DESC(zio_slow_io_ms,
4884 "Max I/O completion time (milliseconds) before marking it as slow");
c409e464
BB
4885
4886module_param(zio_requeue_io_start_cut_in_line, int, 0644);
4887MODULE_PARM_DESC(zio_requeue_io_start_cut_in_line, "Prioritize requeued I/O");
29dee3ee
CP
4888
4889module_param(zfs_sync_pass_deferred_free, int, 0644);
4890MODULE_PARM_DESC(zfs_sync_pass_deferred_free,
d1d7e268 4891 "Defer frees starting in this pass");
29dee3ee
CP
4892
4893module_param(zfs_sync_pass_dont_compress, int, 0644);
4894MODULE_PARM_DESC(zfs_sync_pass_dont_compress,
d1d7e268 4895 "Don't compress starting in this pass");
29dee3ee
CP
4896
4897module_param(zfs_sync_pass_rewrite, int, 0644);
4898MODULE_PARM_DESC(zfs_sync_pass_rewrite,
d1d7e268 4899 "Rewrite new bps starting in this pass");
3dfb57a3
DB
4900
4901module_param(zio_dva_throttle_enabled, int, 0644);
4902MODULE_PARM_DESC(zio_dva_throttle_enabled,
4903 "Throttle block allocations in the ZIO pipeline");
638dd5f4
TC
4904
4905module_param(zio_deadman_log_all, int, 0644);
4906MODULE_PARM_DESC(zio_deadman_log_all,
4907 "Log all slow ZIOs, not just those with vdevs");
c28b2279 4908#endif