git.proxmox.com Git - mirror_zfs.git/blame - module/zfs/zio.c
Illumos 5818 - zfs {ref}compressratio is incorrect with 4k sector size
34dc7c2f
BB
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
428870ff 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
c3520e7f 23 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
a38718a6 24 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
34dc7c2f
BB
25 */
26
f1512ee6 27#include <sys/sysmacros.h>
34dc7c2f
BB
28#include <sys/zfs_context.h>
29#include <sys/fm/fs/zfs.h>
30#include <sys/spa.h>
31#include <sys/txg.h>
32#include <sys/spa_impl.h>
33#include <sys/vdev_impl.h>
34#include <sys/zio_impl.h>
35#include <sys/zio_compress.h>
36#include <sys/zio_checksum.h>
428870ff
BB
37#include <sys/dmu_objset.h>
38#include <sys/arc.h>
39#include <sys/ddt.h>
9b67f605 40#include <sys/blkptr.h>
b0bc7a84 41#include <sys/zfeature.h>
34dc7c2f 42
34dc7c2f
BB
43/*
44 * ==========================================================================
45 * I/O type descriptions
46 * ==========================================================================
47 */
e8b96c60 48const char *zio_type_name[ZIO_TYPES] = {
451041db 49 "z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl"
428870ff 50};
34dc7c2f
BB
51
52/*
53 * ==========================================================================
54 * I/O kmem caches
55 * ==========================================================================
56 */
57kmem_cache_t *zio_cache;
d164b209 58kmem_cache_t *zio_link_cache;
34dc7c2f
BB
59kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
60kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
a69052be 61int zio_delay_max = ZIO_DELAY_MAX;
34dc7c2f 62
98b25418
GW
63#define ZIO_PIPELINE_CONTINUE 0x100
64#define ZIO_PIPELINE_STOP 0x101
65
55d85d5a
GW
66/*
67 * The following actions directly affect the spa's sync-to-convergence logic.
68 * The values below define the sync pass when we start performing the action.
69 * Care should be taken when changing these values as they directly impact
70 * spa_sync() performance. Tuning these values may introduce subtle performance
71 * pathologies and should only be done in the context of performance analysis.
72 * These tunables will eventually be removed and replaced with #defines once
73 * enough analysis has been done to determine optimal values.
74 *
75 * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
76 * regular blocks are not deferred.
77 */
78int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
79int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
80int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
81
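/*
 * Illustrative sketch (editor's addition, not part of the upstream file):
 * how a caller consults the current sync pass against these tunables.
 * This mirrors the test used by zio_free() below; the helper name is
 * hypothetical.
 */
static boolean_t
zio_example_defer_free(spa_t *spa)
{
        /* Frees are deferred once spa_sync() reaches the configured pass. */
        return (spa_sync_pass(spa) >= zfs_sync_pass_deferred_free);
}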
34dc7c2f 82/*
b128c09f
BB
83 * An allocating zio is one that either currently has the DVA allocate
84 * stage set or will have it later in its lifetime.
34dc7c2f 85 */
428870ff
BB
86#define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)
87
c409e464 88int zio_requeue_io_start_cut_in_line = 1;
428870ff
BB
89
90#ifdef ZFS_DEBUG
91int zio_buf_debug_limit = 16384;
92#else
93int zio_buf_debug_limit = 0;
94#endif
34dc7c2f 95
da6b4005
NB
96static inline void __zio_execute(zio_t *zio);
97
34dc7c2f
BB
98void
99zio_init(void)
100{
101 size_t c;
102 vmem_t *data_alloc_arena = NULL;
103
3941503c
BB
104 zio_cache = kmem_cache_create("zio_cache",
105 sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
d164b209 106 zio_link_cache = kmem_cache_create("zio_link_cache",
6795a698 107 sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
34dc7c2f
BB
108
109 /*
110 * For small buffers, we want a cache for each multiple of
f1512ee6
MA
111 * SPA_MINBLOCKSIZE. For larger buffers, we want a cache
112 * for each quarter-power of 2.
34dc7c2f
BB
113 */
114 for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
115 size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
116 size_t p2 = size;
117 size_t align = 0;
6442f3cf 118 size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0;
34dc7c2f 119
f1512ee6
MA
120#ifdef _ILP32
121 /*
122 * Cache size limited to 1M on 32-bit platforms until ARC
123 * buffers no longer require virtual address space.
124 */
125 if (size > zfs_max_recordsize)
126 break;
127#endif
128
129 while (!ISP2(p2))
34dc7c2f
BB
130 p2 &= p2 - 1;
131
498877ba
MA
132#ifndef _KERNEL
133 /*
134 * If we are using watchpoints, put each buffer on its own page,
135 * to eliminate the performance overhead of trapping to the
136 * kernel when modifying a non-watched buffer that shares the
137 * page with a watched buffer.
138 */
139 if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
140 continue;
141#endif
34dc7c2f
BB
142 if (size <= 4 * SPA_MINBLOCKSIZE) {
143 align = SPA_MINBLOCKSIZE;
498877ba 144 } else if (IS_P2ALIGNED(size, p2 >> 2)) {
f1512ee6 145 align = MIN(p2 >> 2, PAGESIZE);
34dc7c2f
BB
146 }
147
148 if (align != 0) {
149 char name[36];
150 (void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
151 zio_buf_cache[c] = kmem_cache_create(name, size,
6442f3cf 152 align, NULL, NULL, NULL, NULL, NULL, cflags);
34dc7c2f
BB
153
154 (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
155 zio_data_buf_cache[c] = kmem_cache_create(name, size,
ae6ba3db 156 align, NULL, NULL, NULL, NULL,
6442f3cf 157 data_alloc_arena, cflags);
34dc7c2f
BB
158 }
159 }
160
161 while (--c != 0) {
162 ASSERT(zio_buf_cache[c] != NULL);
163 if (zio_buf_cache[c - 1] == NULL)
164 zio_buf_cache[c - 1] = zio_buf_cache[c];
165
166 ASSERT(zio_data_buf_cache[c] != NULL);
167 if (zio_data_buf_cache[c - 1] == NULL)
168 zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
169 }
170
34dc7c2f 171 zio_inject_init();
9759c60f
ED
172
173 lz4_init();
34dc7c2f
BB
174}
175
176void
177zio_fini(void)
178{
179 size_t c;
180 kmem_cache_t *last_cache = NULL;
181 kmem_cache_t *last_data_cache = NULL;
182
183 for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
f1512ee6
MA
184#ifdef _ILP32
185 /*
186 * Cache size limited to 1M on 32-bit platforms until ARC
187 * buffers no longer require virtual address space.
188 */
189 if (((c + 1) << SPA_MINBLOCKSHIFT) > zfs_max_recordsize)
190 break;
191#endif
34dc7c2f
BB
192 if (zio_buf_cache[c] != last_cache) {
193 last_cache = zio_buf_cache[c];
194 kmem_cache_destroy(zio_buf_cache[c]);
195 }
196 zio_buf_cache[c] = NULL;
197
198 if (zio_data_buf_cache[c] != last_data_cache) {
199 last_data_cache = zio_data_buf_cache[c];
200 kmem_cache_destroy(zio_data_buf_cache[c]);
201 }
202 zio_data_buf_cache[c] = NULL;
203 }
204
d164b209 205 kmem_cache_destroy(zio_link_cache);
34dc7c2f
BB
206 kmem_cache_destroy(zio_cache);
207
208 zio_inject_fini();
9759c60f
ED
209
210 lz4_fini();
34dc7c2f
BB
211}
212
213/*
214 * ==========================================================================
215 * Allocate and free I/O buffers
216 * ==========================================================================
217 */
218
219/*
220 * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a
221 * crashdump if the kernel panics, so use it judiciously. Obviously, it's
222 * useful to inspect ZFS metadata, but if possible, we should avoid keeping
223 * excess / transient data in-core during a crashdump.
224 */
225void *
226zio_buf_alloc(size_t size)
227{
228 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
229
63e3a861 230 VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
34dc7c2f 231
efcd79a8 232 return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
34dc7c2f
BB
233}
234
235/*
236 * Use zio_data_buf_alloc to allocate data. The data will not appear in a
237 * crashdump if the kernel panics. This exists so that we will limit the amount
238 * of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount
239 * of kernel heap dumped to disk when the kernel panics)
240 */
241void *
242zio_data_buf_alloc(size_t size)
243{
244 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
245
63e3a861 246 VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
34dc7c2f 247
efcd79a8 248 return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
34dc7c2f
BB
249}
250
251void
252zio_buf_free(void *buf, size_t size)
253{
254 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
255
63e3a861 256 VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
34dc7c2f
BB
257
258 kmem_cache_free(zio_buf_cache[c], buf);
259}
260
261void
262zio_data_buf_free(void *buf, size_t size)
263{
264 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
265
63e3a861 266 VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
34dc7c2f
BB
267
268 kmem_cache_free(zio_data_buf_cache[c], buf);
269}
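/*
 * Illustrative sketch (editor's addition, not part of the upstream file):
 * a buffer of 'size' bytes maps to cache index (size - 1) >> SPA_MINBLOCKSHIFT,
 * so every allocation must be freed with the same size.  The function name
 * is hypothetical.
 */
static void
zio_example_buf_roundtrip(void)
{
        size_t size = 3 * SPA_MINBLOCKSIZE;     /* maps to zio_buf_cache[2] */
        void *buf = zio_buf_alloc(size);

        /* ... use buf for transient ZFS metadata ... */

        zio_buf_free(buf, size);                /* must pass the same size */
}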
270
271/*
272 * ==========================================================================
273 * Push and pop I/O transform buffers
274 * ==========================================================================
275 */
276static void
b128c09f
BB
277zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
278 zio_transform_func_t *transform)
34dc7c2f 279{
79c76d5b 280 zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
34dc7c2f 281
b128c09f
BB
282 zt->zt_orig_data = zio->io_data;
283 zt->zt_orig_size = zio->io_size;
34dc7c2f 284 zt->zt_bufsize = bufsize;
b128c09f 285 zt->zt_transform = transform;
34dc7c2f
BB
286
287 zt->zt_next = zio->io_transform_stack;
288 zio->io_transform_stack = zt;
289
290 zio->io_data = data;
291 zio->io_size = size;
292}
293
294static void
b128c09f 295zio_pop_transforms(zio_t *zio)
34dc7c2f 296{
b128c09f
BB
297 zio_transform_t *zt;
298
299 while ((zt = zio->io_transform_stack) != NULL) {
300 if (zt->zt_transform != NULL)
301 zt->zt_transform(zio,
302 zt->zt_orig_data, zt->zt_orig_size);
34dc7c2f 303
428870ff
BB
304 if (zt->zt_bufsize != 0)
305 zio_buf_free(zio->io_data, zt->zt_bufsize);
34dc7c2f 306
b128c09f
BB
307 zio->io_data = zt->zt_orig_data;
308 zio->io_size = zt->zt_orig_size;
309 zio->io_transform_stack = zt->zt_next;
34dc7c2f 310
b128c09f 311 kmem_free(zt, sizeof (zio_transform_t));
34dc7c2f
BB
312 }
313}
314
b128c09f
BB
315/*
316 * ==========================================================================
317 * I/O transform callbacks for subblocks and decompression
318 * ==========================================================================
319 */
320static void
321zio_subblock(zio_t *zio, void *data, uint64_t size)
322{
323 ASSERT(zio->io_size > size);
324
325 if (zio->io_type == ZIO_TYPE_READ)
326 bcopy(zio->io_data, data, size);
327}
328
329static void
330zio_decompress(zio_t *zio, void *data, uint64_t size)
331{
332 if (zio->io_error == 0 &&
333 zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
428870ff 334 zio->io_data, data, zio->io_size, size) != 0)
2e528b49 335 zio->io_error = SET_ERROR(EIO);
b128c09f
BB
336}
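/*
 * Illustrative sketch (editor's addition, not part of the upstream file):
 * a transform is pushed before the I/O is issued and undone by
 * zio_pop_transforms() when it completes.  This mirrors how
 * zio_read_bp_init() below stages decompression; the function name is
 * hypothetical.
 */
static void
zio_example_push_decompress(zio_t *zio, uint64_t psize)
{
        /* Read into a scratch buffer of the on-disk (compressed) size... */
        void *cbuf = zio_buf_alloc(psize);

        /*
         * ...and register zio_decompress() so that, on completion, the
         * data is expanded back into the caller's original buffer.
         */
        zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
}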
337
338/*
339 * ==========================================================================
340 * I/O parent/child relationships and pipeline interlocks
341 * ==========================================================================
342 */
d164b209
BB
343/*
344 * NOTE - Callers to zio_walk_parents() and zio_walk_children() must
345 * continue calling these functions until they return NULL.
346 * Otherwise, the next caller will pick up the list walk in
347 * some indeterminate state. (Otherwise every caller would
348 * have to pass in a cookie to keep the state represented by
349 * io_walk_link, which gets annoying.)
350 */
351zio_t *
352zio_walk_parents(zio_t *cio)
353{
354 zio_link_t *zl = cio->io_walk_link;
355 list_t *pl = &cio->io_parent_list;
b128c09f 356
d164b209
BB
357 zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
358 cio->io_walk_link = zl;
359
360 if (zl == NULL)
361 return (NULL);
362
363 ASSERT(zl->zl_child == cio);
364 return (zl->zl_parent);
365}
366
367zio_t *
368zio_walk_children(zio_t *pio)
369{
370 zio_link_t *zl = pio->io_walk_link;
371 list_t *cl = &pio->io_child_list;
372
373 zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
374 pio->io_walk_link = zl;
375
376 if (zl == NULL)
377 return (NULL);
378
379 ASSERT(zl->zl_parent == pio);
380 return (zl->zl_child);
381}
382
383zio_t *
384zio_unique_parent(zio_t *cio)
385{
386 zio_t *pio = zio_walk_parents(cio);
387
388 VERIFY(zio_walk_parents(cio) == NULL);
389 return (pio);
390}
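/*
 * Illustrative sketch (editor's addition, not part of the upstream file):
 * per the NOTE above, a walk must be run until it returns NULL so that
 * io_walk_link is left in a clean state.  This is the pattern used by
 * zio_reexecute() below; the function name is hypothetical.
 */
static void
zio_example_walk_children(zio_t *pio)
{
        zio_t *cio, *cio_next;

        for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
                /* Advance the walk before operating on 'cio'. */
                cio_next = zio_walk_children(pio);
                /* ... operate on cio ... */
        }
}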
391
392void
393zio_add_child(zio_t *pio, zio_t *cio)
b128c09f 394{
79c76d5b 395 zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
d6320ddb 396 int w;
d164b209
BB
397
398 /*
399 * Logical I/Os can have logical, gang, or vdev children.
400 * Gang I/Os can have gang or vdev children.
401 * Vdev I/Os can only have vdev children.
402 * The following ASSERT captures all of these constraints.
403 */
404 ASSERT(cio->io_child_type <= pio->io_child_type);
405
406 zl->zl_parent = pio;
407 zl->zl_child = cio;
408
409 mutex_enter(&cio->io_lock);
b128c09f 410 mutex_enter(&pio->io_lock);
d164b209
BB
411
412 ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
413
d6320ddb 414 for (w = 0; w < ZIO_WAIT_TYPES; w++)
d164b209
BB
415 pio->io_children[cio->io_child_type][w] += !cio->io_state[w];
416
417 list_insert_head(&pio->io_child_list, zl);
418 list_insert_head(&cio->io_parent_list, zl);
419
428870ff
BB
420 pio->io_child_count++;
421 cio->io_parent_count++;
422
b128c09f 423 mutex_exit(&pio->io_lock);
d164b209 424 mutex_exit(&cio->io_lock);
b128c09f
BB
425}
426
34dc7c2f 427static void
d164b209 428zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
b128c09f 429{
d164b209
BB
430 ASSERT(zl->zl_parent == pio);
431 ASSERT(zl->zl_child == cio);
b128c09f 432
d164b209 433 mutex_enter(&cio->io_lock);
b128c09f 434 mutex_enter(&pio->io_lock);
d164b209
BB
435
436 list_remove(&pio->io_child_list, zl);
437 list_remove(&cio->io_parent_list, zl);
438
428870ff
BB
439 pio->io_child_count--;
440 cio->io_parent_count--;
441
b128c09f 442 mutex_exit(&pio->io_lock);
d164b209
BB
443 mutex_exit(&cio->io_lock);
444
445 kmem_cache_free(zio_link_cache, zl);
b128c09f
BB
446}
447
448static boolean_t
449zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
34dc7c2f 450{
b128c09f
BB
451 uint64_t *countp = &zio->io_children[child][wait];
452 boolean_t waiting = B_FALSE;
453
454 mutex_enter(&zio->io_lock);
455 ASSERT(zio->io_stall == NULL);
456 if (*countp != 0) {
428870ff 457 zio->io_stage >>= 1;
b128c09f
BB
458 zio->io_stall = countp;
459 waiting = B_TRUE;
460 }
461 mutex_exit(&zio->io_lock);
462
463 return (waiting);
464}
34dc7c2f 465
bf701a83
BB
466__attribute__((always_inline))
467static inline void
b128c09f
BB
468zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
469{
470 uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
471 int *errorp = &pio->io_child_error[zio->io_child_type];
34dc7c2f 472
b128c09f
BB
473 mutex_enter(&pio->io_lock);
474 if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
475 *errorp = zio_worst_error(*errorp, zio->io_error);
476 pio->io_reexecute |= zio->io_reexecute;
477 ASSERT3U(*countp, >, 0);
e8b96c60
MA
478
479 (*countp)--;
480
481 if (*countp == 0 && pio->io_stall == countp) {
b128c09f
BB
482 pio->io_stall = NULL;
483 mutex_exit(&pio->io_lock);
da6b4005 484 __zio_execute(pio);
b128c09f
BB
485 } else {
486 mutex_exit(&pio->io_lock);
34dc7c2f
BB
487 }
488}
489
b128c09f
BB
490static void
491zio_inherit_child_errors(zio_t *zio, enum zio_child c)
492{
493 if (zio->io_child_error[c] != 0 && zio->io_error == 0)
494 zio->io_error = zio->io_child_error[c];
495}
496
34dc7c2f
BB
497/*
498 * ==========================================================================
b128c09f 499 * Create the various types of I/O (read, write, free, etc)
34dc7c2f
BB
500 * ==========================================================================
501 */
502static zio_t *
428870ff 503zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
34dc7c2f 504 void *data, uint64_t size, zio_done_func_t *done, void *private,
e8b96c60 505 zio_type_t type, zio_priority_t priority, enum zio_flag flags,
5dbd68a3 506 vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb,
428870ff 507 enum zio_stage stage, enum zio_stage pipeline)
34dc7c2f
BB
508{
509 zio_t *zio;
510
511 ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
512 ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
b128c09f
BB
513 ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
514
515 ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
516 ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
517 ASSERT(vd || stage == ZIO_STAGE_OPEN);
34dc7c2f 518
79c76d5b 519 zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
3941503c
BB
520 bzero(zio, sizeof (zio_t));
521
522 mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
523 cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
524
525 list_create(&zio->io_parent_list, sizeof (zio_link_t),
526 offsetof(zio_link_t, zl_parent_node));
527 list_create(&zio->io_child_list, sizeof (zio_link_t),
528 offsetof(zio_link_t, zl_child_node));
d164b209 529
b128c09f
BB
530 if (vd != NULL)
531 zio->io_child_type = ZIO_CHILD_VDEV;
532 else if (flags & ZIO_FLAG_GANG_CHILD)
533 zio->io_child_type = ZIO_CHILD_GANG;
428870ff
BB
534 else if (flags & ZIO_FLAG_DDT_CHILD)
535 zio->io_child_type = ZIO_CHILD_DDT;
b128c09f
BB
536 else
537 zio->io_child_type = ZIO_CHILD_LOGICAL;
538
34dc7c2f 539 if (bp != NULL) {
428870ff 540 zio->io_bp = (blkptr_t *)bp;
34dc7c2f
BB
541 zio->io_bp_copy = *bp;
542 zio->io_bp_orig = *bp;
428870ff
BB
543 if (type != ZIO_TYPE_WRITE ||
544 zio->io_child_type == ZIO_CHILD_DDT)
b128c09f 545 zio->io_bp = &zio->io_bp_copy; /* so caller can free */
9babb374 546 if (zio->io_child_type == ZIO_CHILD_LOGICAL)
b128c09f 547 zio->io_logical = zio;
9babb374
BB
548 if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
549 pipeline |= ZIO_GANG_STAGES;
34dc7c2f 550 }
b128c09f
BB
551
552 zio->io_spa = spa;
553 zio->io_txg = txg;
34dc7c2f
BB
554 zio->io_done = done;
555 zio->io_private = private;
556 zio->io_type = type;
557 zio->io_priority = priority;
b128c09f
BB
558 zio->io_vd = vd;
559 zio->io_offset = offset;
428870ff
BB
560 zio->io_orig_data = zio->io_data = data;
561 zio->io_orig_size = zio->io_size = size;
b128c09f
BB
562 zio->io_orig_flags = zio->io_flags = flags;
563 zio->io_orig_stage = zio->io_stage = stage;
564 zio->io_orig_pipeline = zio->io_pipeline = pipeline;
34dc7c2f 565
d164b209
BB
566 zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
567 zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);
568
b128c09f
BB
569 if (zb != NULL)
570 zio->io_bookmark = *zb;
571
572 if (pio != NULL) {
b128c09f 573 if (zio->io_logical == NULL)
34dc7c2f 574 zio->io_logical = pio->io_logical;
9babb374
BB
575 if (zio->io_child_type == ZIO_CHILD_GANG)
576 zio->io_gang_leader = pio->io_gang_leader;
b128c09f 577 zio_add_child(pio, zio);
34dc7c2f
BB
578 }
579
a38718a6
GA
580 taskq_init_ent(&zio->io_tqent);
581
34dc7c2f
BB
582 return (zio);
583}
584
585static void
b128c09f 586zio_destroy(zio_t *zio)
34dc7c2f 587{
3941503c
BB
588 list_destroy(&zio->io_parent_list);
589 list_destroy(&zio->io_child_list);
590 mutex_destroy(&zio->io_lock);
591 cv_destroy(&zio->io_cv);
b128c09f 592 kmem_cache_free(zio_cache, zio);
34dc7c2f
BB
593}
594
595zio_t *
d164b209 596zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
428870ff 597 void *private, enum zio_flag flags)
34dc7c2f
BB
598{
599 zio_t *zio;
600
601 zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
d164b209 602 ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
b128c09f 603 ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);
34dc7c2f
BB
604
605 return (zio);
606}
607
608zio_t *
428870ff 609zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
34dc7c2f 610{
d164b209 611 return (zio_null(NULL, spa, NULL, done, private, flags));
34dc7c2f
BB
612}
613
63e3a861
MA
614void
615zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp)
616{
617 int i;
618
619 if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) {
620 zfs_panic_recover("blkptr at %p has invalid TYPE %llu",
621 bp, (longlong_t)BP_GET_TYPE(bp));
622 }
623 if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS ||
624 BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) {
625 zfs_panic_recover("blkptr at %p has invalid CHECKSUM %llu",
626 bp, (longlong_t)BP_GET_CHECKSUM(bp));
627 }
628 if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS ||
629 BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) {
630 zfs_panic_recover("blkptr at %p has invalid COMPRESS %llu",
631 bp, (longlong_t)BP_GET_COMPRESS(bp));
632 }
633 if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) {
634 zfs_panic_recover("blkptr at %p has invalid LSIZE %llu",
635 bp, (longlong_t)BP_GET_LSIZE(bp));
636 }
637 if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) {
638 zfs_panic_recover("blkptr at %p has invalid PSIZE %llu",
639 bp, (longlong_t)BP_GET_PSIZE(bp));
640 }
641
642 if (BP_IS_EMBEDDED(bp)) {
643 if (BPE_GET_ETYPE(bp) > NUM_BP_EMBEDDED_TYPES) {
644 zfs_panic_recover("blkptr at %p has invalid ETYPE %llu",
645 bp, (longlong_t)BPE_GET_ETYPE(bp));
646 }
647 }
648
649 /*
650 * Pool-specific checks.
651 *
652 * Note: it would be nice to verify that the blk_birth and
653 * BP_PHYSICAL_BIRTH() are not too large. However, spa_freeze()
654 * allows the birth time of log blocks (and dmu_sync()-ed blocks
655 * that are in the log) to be arbitrarily large.
656 */
657 for (i = 0; i < BP_GET_NDVAS(bp); i++) {
658 uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]);
659 vdev_t *vd;
660 uint64_t offset, asize;
661 if (vdevid >= spa->spa_root_vdev->vdev_children) {
662 zfs_panic_recover("blkptr at %p DVA %u has invalid "
663 "VDEV %llu",
664 bp, i, (longlong_t)vdevid);
665 }
666 vd = spa->spa_root_vdev->vdev_child[vdevid];
667 if (vd == NULL) {
668 zfs_panic_recover("blkptr at %p DVA %u has invalid "
669 "VDEV %llu",
670 bp, i, (longlong_t)vdevid);
671 }
672 if (vd->vdev_ops == &vdev_hole_ops) {
673 zfs_panic_recover("blkptr at %p DVA %u has hole "
674 "VDEV %llu",
675 bp, i, (longlong_t)vdevid);
676
677 }
678 if (vd->vdev_ops == &vdev_missing_ops) {
679 /*
680 * "missing" vdevs are valid during import, but we
681 * don't have their detailed info (e.g. asize), so
682 * we can't perform any more checks on them.
683 */
684 continue;
685 }
686 offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
687 asize = DVA_GET_ASIZE(&bp->blk_dva[i]);
688 if (BP_IS_GANG(bp))
689 asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
690 if (offset + asize > vd->vdev_asize) {
691 zfs_panic_recover("blkptr at %p DVA %u has invalid "
692 "OFFSET %llu",
693 bp, i, (longlong_t)offset);
694 }
695 }
696}
697
34dc7c2f 698zio_t *
b128c09f
BB
699zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
700 void *data, uint64_t size, zio_done_func_t *done, void *private,
5dbd68a3 701 zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
34dc7c2f
BB
702{
703 zio_t *zio;
704
63e3a861
MA
705 zfs_blkptr_verify(spa, bp);
706
428870ff 707 zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
b128c09f
BB
708 data, size, done, private,
709 ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
428870ff
BB
710 ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
711 ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);
34dc7c2f 712
b128c09f
BB
713 return (zio);
714}
34dc7c2f 715
34dc7c2f 716zio_t *
b128c09f 717zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
428870ff 718 void *data, uint64_t size, const zio_prop_t *zp,
e8b96c60
MA
719 zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done,
720 void *private,
5dbd68a3 721 zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
34dc7c2f
BB
722{
723 zio_t *zio;
724
b128c09f
BB
725 ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
726 zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
727 zp->zp_compress >= ZIO_COMPRESS_OFF &&
728 zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
9ae529ec 729 DMU_OT_IS_VALID(zp->zp_type) &&
b128c09f 730 zp->zp_level < 32 &&
428870ff 731 zp->zp_copies > 0 &&
03c6040b 732 zp->zp_copies <= spa_max_replication(spa));
34dc7c2f
BB
733
734 zio = zio_create(pio, spa, txg, bp, data, size, done, private,
b128c09f 735 ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
428870ff
BB
736 ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
737 ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
34dc7c2f
BB
738
739 zio->io_ready = ready;
e8b96c60 740 zio->io_physdone = physdone;
b128c09f 741 zio->io_prop = *zp;
34dc7c2f 742
9b67f605
MA
743 /*
744 * Data can be NULL if we are going to call zio_write_override() to
745 * provide the already-allocated BP. But we may need the data to
746 * verify a dedup hit (if requested). In this case, don't try to
747 * dedup (just take the already-allocated BP verbatim).
748 */
749 if (data == NULL && zio->io_prop.zp_dedup_verify) {
750 zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
751 }
752
34dc7c2f
BB
753 return (zio);
754}
755
756zio_t *
b128c09f 757zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
e8b96c60 758 uint64_t size, zio_done_func_t *done, void *private,
5dbd68a3 759 zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb)
34dc7c2f
BB
760{
761 zio_t *zio;
762
34dc7c2f 763 zio = zio_create(pio, spa, txg, bp, data, size, done, private,
b128c09f
BB
764 ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
765 ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
34dc7c2f
BB
766
767 return (zio);
768}
769
428870ff 770void
03c6040b 771zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
428870ff
BB
772{
773 ASSERT(zio->io_type == ZIO_TYPE_WRITE);
774 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
775 ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
776 ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
777
03c6040b
GW
778 /*
779 * We must reset the io_prop to match the values that existed
780 * when the bp was first written by dmu_sync(), keeping in mind
781 * that nopwrite and dedup are mutually exclusive.
782 */
783 zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
784 zio->io_prop.zp_nopwrite = nopwrite;
428870ff
BB
785 zio->io_prop.zp_copies = copies;
786 zio->io_bp_override = bp;
787}
788
789void
790zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
791{
9b67f605
MA
792
793 /*
794 * The check for EMBEDDED is a performance optimization. We
795 * process the free here (by ignoring it) rather than
796 * putting it on the list and then processing it in zio_free_sync().
797 */
798 if (BP_IS_EMBEDDED(bp))
799 return;
13fe0198 800 metaslab_check_free(spa, bp);
2883cad5
MA
801
802 /*
803 * Frees that are for the currently-syncing txg, are not going to be
804 * deferred, and which will not need to do a read (i.e. not GANG or
805 * DEDUP), can be processed immediately. Otherwise, put them on the
806 * in-memory list for later processing.
807 */
808 if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
809 txg != spa->spa_syncing_txg ||
810 spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) {
811 bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
812 } else {
813 VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp, 0)));
814 }
428870ff
BB
815}
816
34dc7c2f 817zio_t *
428870ff
BB
818zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
819 enum zio_flag flags)
34dc7c2f
BB
820{
821 zio_t *zio;
2883cad5 822 enum zio_stage stage = ZIO_FREE_PIPELINE;
34dc7c2f 823
428870ff
BB
824 ASSERT(!BP_IS_HOLE(bp));
825 ASSERT(spa_syncing_txg(spa) == txg);
55d85d5a 826 ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);
34dc7c2f 827
9b67f605
MA
828 if (BP_IS_EMBEDDED(bp))
829 return (zio_null(pio, spa, NULL, NULL, NULL, 0));
830
13fe0198 831 metaslab_check_free(spa, bp);
8c841793 832 arc_freed(spa, bp);
13fe0198 833
2883cad5
MA
834 /*
835 * GANG and DEDUP blocks can induce a read (for the gang block header,
836 * or the DDT), so issue them asynchronously so that this thread is
837 * not tied up.
838 */
839 if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
840 stage |= ZIO_STAGE_ISSUE_ASYNC;
841
b128c09f 842 zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
2883cad5
MA
843 NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags,
844 NULL, 0, NULL, ZIO_STAGE_OPEN, stage);
845
34dc7c2f
BB
846 return (zio);
847}
848
849zio_t *
428870ff
BB
850zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
851 zio_done_func_t *done, void *private, enum zio_flag flags)
34dc7c2f
BB
852{
853 zio_t *zio;
854
9b67f605
MA
855 dprintf_bp(bp, "claiming in txg %llu", txg);
856
857 if (BP_IS_EMBEDDED(bp))
858 return (zio_null(pio, spa, NULL, NULL, NULL, 0));
859
34dc7c2f
BB
860 /*
861 * A claim is an allocation of a specific block. Claims are needed
862 * to support immediate writes in the intent log. The issue is that
863 * immediate writes contain committed data, but in a txg that was
864 * *not* committed. Upon opening the pool after an unclean shutdown,
865 * the intent log claims all blocks that contain immediate write data
866 * so that the SPA knows they're in use.
867 *
868 * All claims *must* be resolved in the first txg -- before the SPA
869 * starts allocating blocks -- so that nothing is allocated twice.
428870ff 870 * If txg == 0 we just verify that the block is claimable.
34dc7c2f
BB
871 */
872 ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
428870ff
BB
873 ASSERT(txg == spa_first_txg(spa) || txg == 0);
874 ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */
34dc7c2f 875
b128c09f
BB
876 zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
877 done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
878 NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
34dc7c2f
BB
879
880 return (zio);
881}
882
883zio_t *
884zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
e8b96c60 885 zio_done_func_t *done, void *private, enum zio_flag flags)
34dc7c2f
BB
886{
887 zio_t *zio;
888 int c;
889
890 if (vd->vdev_children == 0) {
891 zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
e8b96c60 892 ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
34dc7c2f
BB
893 ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
894
34dc7c2f
BB
895 zio->io_cmd = cmd;
896 } else {
d164b209 897 zio = zio_null(pio, spa, NULL, NULL, NULL, flags);
34dc7c2f
BB
898
899 for (c = 0; c < vd->vdev_children; c++)
900 zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
e8b96c60 901 done, private, flags));
34dc7c2f
BB
902 }
903
904 return (zio);
905}
906
34dc7c2f
BB
907zio_t *
908zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
909 void *data, int checksum, zio_done_func_t *done, void *private,
e8b96c60 910 zio_priority_t priority, enum zio_flag flags, boolean_t labels)
34dc7c2f
BB
911{
912 zio_t *zio;
34dc7c2f 913
b128c09f
BB
914 ASSERT(vd->vdev_children == 0);
915 ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
916 offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
917 ASSERT3U(offset + size, <=, vd->vdev_psize);
34dc7c2f 918
b128c09f 919 zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
b02fe35d
AR
920 ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
921 NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
34dc7c2f 922
b128c09f 923 zio->io_prop.zp_checksum = checksum;
34dc7c2f
BB
924
925 return (zio);
926}
927
928zio_t *
929zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
930 void *data, int checksum, zio_done_func_t *done, void *private,
e8b96c60 931 zio_priority_t priority, enum zio_flag flags, boolean_t labels)
34dc7c2f 932{
34dc7c2f 933 zio_t *zio;
34dc7c2f 934
b128c09f
BB
935 ASSERT(vd->vdev_children == 0);
936 ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
937 offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
938 ASSERT3U(offset + size, <=, vd->vdev_psize);
34dc7c2f 939
b128c09f 940 zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
b02fe35d
AR
941 ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
942 NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
34dc7c2f 943
b128c09f 944 zio->io_prop.zp_checksum = checksum;
34dc7c2f 945
428870ff 946 if (zio_checksum_table[checksum].ci_eck) {
34dc7c2f 947 /*
428870ff 948 * zec checksums are necessarily destructive -- they modify
b128c09f 949 * the end of the write buffer to hold the verifier/checksum.
34dc7c2f 950 * Therefore, we must make a local copy in case the data is
b128c09f 951 * being written to multiple places in parallel.
34dc7c2f 952 */
b128c09f 953 void *wbuf = zio_buf_alloc(size);
34dc7c2f 954 bcopy(data, wbuf, size);
b128c09f 955 zio_push_transform(zio, wbuf, size, size, NULL);
34dc7c2f
BB
956 }
957
958 return (zio);
959}
960
961/*
b128c09f 962 * Create a child I/O to do some work for us.
34dc7c2f
BB
963 */
964zio_t *
b128c09f 965zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
e8b96c60
MA
966 void *data, uint64_t size, int type, zio_priority_t priority,
967 enum zio_flag flags, zio_done_func_t *done, void *private)
34dc7c2f 968{
428870ff 969 enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
b128c09f
BB
970 zio_t *zio;
971
972 ASSERT(vd->vdev_parent ==
973 (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));
34dc7c2f
BB
974
975 if (type == ZIO_TYPE_READ && bp != NULL) {
976 /*
977 * If we have the bp, then the child should perform the
978 * checksum and the parent need not. This pushes error
979 * detection as close to the leaves as possible and
980 * eliminates redundant checksums in the interior nodes.
981 */
428870ff
BB
982 pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
983 pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
34dc7c2f
BB
984 }
985
b128c09f
BB
986 if (vd->vdev_children == 0)
987 offset += VDEV_LABEL_START_SIZE;
988
428870ff
BB
989 flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;
990
991 /*
992 * If we've decided to do a repair, the write is not speculative --
993 * even if the original read was.
994 */
995 if (flags & ZIO_FLAG_IO_REPAIR)
996 flags &= ~ZIO_FLAG_SPECULATIVE;
997
b128c09f 998 zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
428870ff
BB
999 done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
1000 ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
34dc7c2f 1001
e8b96c60
MA
1002 zio->io_physdone = pio->io_physdone;
1003 if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
1004 zio->io_logical->io_phys_children++;
1005
b128c09f 1006 return (zio);
34dc7c2f
BB
1007}
1008
b128c09f
BB
1009zio_t *
1010zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
e8b96c60 1011 int type, zio_priority_t priority, enum zio_flag flags,
428870ff 1012 zio_done_func_t *done, void *private)
34dc7c2f 1013{
b128c09f 1014 zio_t *zio;
34dc7c2f 1015
b128c09f 1016 ASSERT(vd->vdev_ops->vdev_op_leaf);
34dc7c2f 1017
b128c09f
BB
1018 zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
1019 data, size, done, private, type, priority,
e8b96c60 1020 flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
b128c09f 1021 vd, offset, NULL,
428870ff 1022 ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
34dc7c2f 1023
b128c09f 1024 return (zio);
34dc7c2f
BB
1025}
1026
1027void
b128c09f 1028zio_flush(zio_t *zio, vdev_t *vd)
34dc7c2f 1029{
b128c09f 1030 zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
e8b96c60 1031 NULL, NULL,
b128c09f 1032 ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
34dc7c2f
BB
1033}
1034
428870ff
BB
1035void
1036zio_shrink(zio_t *zio, uint64_t size)
1037{
1038 ASSERT(zio->io_executor == NULL);
1039 ASSERT(zio->io_orig_size == zio->io_size);
1040 ASSERT(size <= zio->io_size);
1041
1042 /*
1043 * We don't shrink for raidz because of problems with the
1044 * reconstruction when reading back less than the block size.
1045 * Note, BP_IS_RAIDZ() assumes no compression.
1046 */
1047 ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
1048 if (!BP_IS_RAIDZ(zio->io_bp))
1049 zio->io_orig_size = zio->io_size = size;
1050}
1051
34dc7c2f
BB
1052/*
1053 * ==========================================================================
b128c09f 1054 * Prepare to read and write logical blocks
34dc7c2f
BB
1055 * ==========================================================================
1056 */
b128c09f 1057
34dc7c2f 1058static int
b128c09f 1059zio_read_bp_init(zio_t *zio)
34dc7c2f 1060{
b128c09f 1061 blkptr_t *bp = zio->io_bp;
34dc7c2f 1062
fb5f0bc8 1063 if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
9babb374
BB
1064 zio->io_child_type == ZIO_CHILD_LOGICAL &&
1065 !(zio->io_flags & ZIO_FLAG_RAW)) {
9b67f605
MA
1066 uint64_t psize =
1067 BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
428870ff 1068 void *cbuf = zio_buf_alloc(psize);
b128c09f 1069
428870ff 1070 zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
34dc7c2f 1071 }
34dc7c2f 1072
9b67f605
MA
1073 if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
1074 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1075 decode_embedded_bp_compressed(bp, zio->io_data);
1076 } else {
1077 ASSERT(!BP_IS_EMBEDDED(bp));
1078 }
1079
9ae529ec 1080 if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
b128c09f
BB
1081 zio->io_flags |= ZIO_FLAG_DONT_CACHE;
1082
428870ff
BB
1083 if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
1084 zio->io_flags |= ZIO_FLAG_DONT_CACHE;
1085
1086 if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
1087 zio->io_pipeline = ZIO_DDT_READ_PIPELINE;
1088
b128c09f 1089 return (ZIO_PIPELINE_CONTINUE);
34dc7c2f
BB
1090}
1091
b128c09f
BB
1092static int
1093zio_write_bp_init(zio_t *zio)
34dc7c2f 1094{
428870ff 1095 spa_t *spa = zio->io_spa;
b128c09f 1096 zio_prop_t *zp = &zio->io_prop;
428870ff 1097 enum zio_compress compress = zp->zp_compress;
34dc7c2f 1098 blkptr_t *bp = zio->io_bp;
b128c09f 1099 uint64_t lsize = zio->io_size;
428870ff 1100 uint64_t psize = lsize;
b128c09f 1101 int pass = 1;
34dc7c2f 1102
b128c09f
BB
1103 /*
1104 * If our children haven't all reached the ready stage,
1105 * wait for them and then repeat this pipeline stage.
1106 */
1107 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
1108 zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
1109 return (ZIO_PIPELINE_STOP);
34dc7c2f 1110
b128c09f
BB
1111 if (!IO_IS_ALLOCATING(zio))
1112 return (ZIO_PIPELINE_CONTINUE);
34dc7c2f 1113
428870ff
BB
1114 ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
1115
1116 if (zio->io_bp_override) {
1117 ASSERT(bp->blk_birth != zio->io_txg);
1118 ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
1119
1120 *bp = *zio->io_bp_override;
1121 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1122
9b67f605
MA
1123 if (BP_IS_EMBEDDED(bp))
1124 return (ZIO_PIPELINE_CONTINUE);
1125
03c6040b
GW
1126 /*
1127 * If we've been overridden and nopwrite is set then
1128 * set the flag accordingly to indicate that a nopwrite
1129 * has already occurred.
1130 */
1131 if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
1132 ASSERT(!zp->zp_dedup);
1133 zio->io_flags |= ZIO_FLAG_NOPWRITE;
1134 return (ZIO_PIPELINE_CONTINUE);
1135 }
1136
1137 ASSERT(!zp->zp_nopwrite);
1138
428870ff
BB
1139 if (BP_IS_HOLE(bp) || !zp->zp_dedup)
1140 return (ZIO_PIPELINE_CONTINUE);
1141
1142 ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup ||
1143 zp->zp_dedup_verify);
1144
1145 if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
1146 BP_SET_DEDUP(bp, 1);
1147 zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
1148 return (ZIO_PIPELINE_CONTINUE);
1149 }
428870ff 1150 }
34dc7c2f 1151
b0bc7a84 1152 if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
b128c09f
BB
1153 /*
1154 * We're rewriting an existing block, which means we're
1155 * working on behalf of spa_sync(). For spa_sync() to
1156 * converge, it must eventually be the case that we don't
1157 * have to allocate new blocks. But compression changes
1158 * the blocksize, which forces a reallocate, and makes
1159 * convergence take longer. Therefore, after the first
1160 * few passes, stop compressing to ensure convergence.
1161 */
428870ff
BB
1162 pass = spa_sync_pass(spa);
1163
1164 ASSERT(zio->io_txg == spa_syncing_txg(spa));
1165 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1166 ASSERT(!BP_GET_DEDUP(bp));
34dc7c2f 1167
55d85d5a 1168 if (pass >= zfs_sync_pass_dont_compress)
b128c09f 1169 compress = ZIO_COMPRESS_OFF;
34dc7c2f 1170
b128c09f 1171 /* Make sure someone doesn't change their mind on overwrites */
9b67f605 1172 ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp),
428870ff 1173 spa_max_replication(spa)) == BP_GET_NDVAS(bp));
b128c09f 1174 }
34dc7c2f 1175
b128c09f 1176 if (compress != ZIO_COMPRESS_OFF) {
428870ff
BB
1177 void *cbuf = zio_buf_alloc(lsize);
1178 psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
1179 if (psize == 0 || psize == lsize) {
b128c09f 1180 compress = ZIO_COMPRESS_OFF;
428870ff 1181 zio_buf_free(cbuf, lsize);
9b67f605
MA
1182 } else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE &&
1183 zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
1184 spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
1185 encode_embedded_bp_compressed(bp,
1186 cbuf, compress, lsize, psize);
1187 BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
1188 BP_SET_TYPE(bp, zio->io_prop.zp_type);
1189 BP_SET_LEVEL(bp, zio->io_prop.zp_level);
1190 zio_buf_free(cbuf, lsize);
1191 bp->blk_birth = zio->io_txg;
1192 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1193 ASSERT(spa_feature_is_active(spa,
1194 SPA_FEATURE_EMBEDDED_DATA));
1195 return (ZIO_PIPELINE_CONTINUE);
428870ff 1196 } else {
9b67f605 1197 /*
c3520e7f
MA
1198 * Round the compressed size up to the ashift
1199 * of the smallest-ashift device, and zero the tail.
1200 * This ensures that the compressed size of the BP
1201 * (and thus compressratio property) are correct,
1202 * in that we charge for the padding used to fill out
1203 * the last sector.
9b67f605 1204 */
c3520e7f
MA
1205 size_t rounded;
1206
1207 ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
1208
1209 rounded = (size_t)P2ROUNDUP(psize,
1210 1ULL << spa->spa_min_ashift);
1211 if (rounded >= lsize) {
9b67f605
MA
1212 compress = ZIO_COMPRESS_OFF;
1213 zio_buf_free(cbuf, lsize);
c3520e7f 1214 psize = lsize;
9b67f605 1215 } else {
c3520e7f
MA
1216 bzero((char *)cbuf + psize, rounded - psize);
1217 psize = rounded;
9b67f605
MA
1218 zio_push_transform(zio, cbuf,
1219 psize, lsize, NULL);
1220 }
b128c09f
BB
1221 }
1222 }
34dc7c2f 1223
b128c09f
BB
1224 /*
1225 * The final pass of spa_sync() must be all rewrites, but the first
1226 * few passes offer a trade-off: allocating blocks defers convergence,
1227 * but newly allocated blocks are sequential, so they can be written
1228 * to disk faster. Therefore, we allow the first few passes of
1229 * spa_sync() to allocate new blocks, but force rewrites after that.
1230 * There should only be a handful of blocks after pass 1 in any case.
1231 */
b0bc7a84
MG
1232 if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
1233 BP_GET_PSIZE(bp) == psize &&
55d85d5a 1234 pass >= zfs_sync_pass_rewrite) {
428870ff 1235 enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
d6320ddb 1236 ASSERT(psize != 0);
b128c09f
BB
1237 zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
1238 zio->io_flags |= ZIO_FLAG_IO_REWRITE;
1239 } else {
1240 BP_ZERO(bp);
1241 zio->io_pipeline = ZIO_WRITE_PIPELINE;
1242 }
34dc7c2f 1243
428870ff 1244 if (psize == 0) {
b0bc7a84
MG
1245 if (zio->io_bp_orig.blk_birth != 0 &&
1246 spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
1247 BP_SET_LSIZE(bp, lsize);
1248 BP_SET_TYPE(bp, zp->zp_type);
1249 BP_SET_LEVEL(bp, zp->zp_level);
1250 BP_SET_BIRTH(bp, zio->io_txg, 0);
1251 }
b128c09f
BB
1252 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1253 } else {
1254 ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
1255 BP_SET_LSIZE(bp, lsize);
b0bc7a84
MG
1256 BP_SET_TYPE(bp, zp->zp_type);
1257 BP_SET_LEVEL(bp, zp->zp_level);
428870ff 1258 BP_SET_PSIZE(bp, psize);
b128c09f
BB
1259 BP_SET_COMPRESS(bp, compress);
1260 BP_SET_CHECKSUM(bp, zp->zp_checksum);
428870ff 1261 BP_SET_DEDUP(bp, zp->zp_dedup);
b128c09f 1262 BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
428870ff
BB
1263 if (zp->zp_dedup) {
1264 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1265 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
1266 zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
1267 }
03c6040b
GW
1268 if (zp->zp_nopwrite) {
1269 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1270 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
1271 zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
1272 }
428870ff
BB
1273 }
1274
1275 return (ZIO_PIPELINE_CONTINUE);
1276}
1277
1278static int
1279zio_free_bp_init(zio_t *zio)
1280{
1281 blkptr_t *bp = zio->io_bp;
1282
1283 if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
1284 if (BP_GET_DEDUP(bp))
1285 zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
b128c09f 1286 }
34dc7c2f
BB
1287
1288 return (ZIO_PIPELINE_CONTINUE);
1289}
1290
b128c09f
BB
1291/*
1292 * ==========================================================================
1293 * Execute the I/O pipeline
1294 * ==========================================================================
1295 */
1296
1297static void
7ef5e54e 1298zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
34dc7c2f 1299{
428870ff 1300 spa_t *spa = zio->io_spa;
b128c09f 1301 zio_type_t t = zio->io_type;
a38718a6 1302 int flags = (cutinline ? TQ_FRONT : 0);
34dc7c2f
BB
1303
1304 /*
9babb374
BB
1305 * If we're a config writer or a probe, the normal issue and
1306 * interrupt threads may all be blocked waiting for the config lock.
1307 * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
34dc7c2f 1308 */
9babb374 1309 if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
b128c09f 1310 t = ZIO_TYPE_NULL;
34dc7c2f
BB
1311
1312 /*
b128c09f 1313 * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
34dc7c2f 1314 */
b128c09f
BB
1315 if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
1316 t = ZIO_TYPE_NULL;
34dc7c2f 1317
428870ff 1318 /*
7ef5e54e
AL
1319 * If this is a high priority I/O, then use the high priority taskq if
1320 * available.
428870ff
BB
1321 */
1322 if (zio->io_priority == ZIO_PRIORITY_NOW &&
7ef5e54e 1323 spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
428870ff
BB
1324 q++;
1325
1326 ASSERT3U(q, <, ZIO_TASKQ_TYPES);
5cc556b4 1327
a38718a6
GA
1328 /*
1329 * NB: We are assuming that the zio can only be dispatched
1330 * to a single taskq at a time. It would be a grievous error
1331 * to dispatch the zio to another taskq at the same time.
1332 */
1333 ASSERT(taskq_empty_ent(&zio->io_tqent));
7ef5e54e
AL
1334 spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
1335 flags, &zio->io_tqent);
b128c09f 1336}
34dc7c2f 1337
b128c09f 1338static boolean_t
7ef5e54e 1339zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
b128c09f
BB
1340{
1341 kthread_t *executor = zio->io_executor;
1342 spa_t *spa = zio->io_spa;
d6320ddb 1343 zio_type_t t;
34dc7c2f 1344
7ef5e54e
AL
1345 for (t = 0; t < ZIO_TYPES; t++) {
1346 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
1347 uint_t i;
1348 for (i = 0; i < tqs->stqs_count; i++) {
1349 if (taskq_member(tqs->stqs_taskq[i], executor))
1350 return (B_TRUE);
1351 }
1352 }
34dc7c2f 1353
b128c09f
BB
1354 return (B_FALSE);
1355}
34dc7c2f 1356
b128c09f
BB
1357static int
1358zio_issue_async(zio_t *zio)
1359{
428870ff 1360 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
b128c09f
BB
1361
1362 return (ZIO_PIPELINE_STOP);
34dc7c2f
BB
1363}
1364
b128c09f
BB
1365void
1366zio_interrupt(zio_t *zio)
34dc7c2f 1367{
428870ff 1368 zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
b128c09f 1369}
34dc7c2f 1370
b128c09f
BB
1371/*
1372 * Execute the I/O pipeline until one of the following occurs:
1373 * (1) the I/O completes; (2) the pipeline stalls waiting for
1374 * dependent child I/Os; (3) the I/O issues, so we're waiting
1375 * for an I/O completion interrupt; (4) the I/O is delegated by
1376 * vdev-level caching or aggregation; (5) the I/O is deferred
1377 * due to vdev-level queueing; (6) the I/O is handed off to
1378 * another thread. In all cases, the pipeline stops whenever
8e07b99b 1379 * there's no CPU work; it never burns a thread in cv_wait_io().
b128c09f
BB
1380 *
1381 * There's no locking on io_stage because there's no legitimate way
1382 * for multiple threads to be attempting to process the same I/O.
1383 */
428870ff 1384static zio_pipe_stage_t *zio_pipeline[];
34dc7c2f 1385
da6b4005
NB
1386/*
1387 * zio_execute() is a wrapper around the static function
1388 * __zio_execute() so that we can force __zio_execute() to be
1389 * inlined. This reduces stack overhead which is important
1390 * because __zio_execute() is called recursively in several zio
1391 * code paths. zio_execute() itself cannot be inlined because
1392 * it is externally visible.
1393 */
b128c09f
BB
1394void
1395zio_execute(zio_t *zio)
da6b4005 1396{
92119cc2
BB
1397 fstrans_cookie_t cookie;
1398
1399 cookie = spl_fstrans_mark();
da6b4005 1400 __zio_execute(zio);
92119cc2 1401 spl_fstrans_unmark(cookie);
da6b4005
NB
1402}
1403
1404__attribute__((always_inline))
1405static inline void
1406__zio_execute(zio_t *zio)
b128c09f
BB
1407{
1408 zio->io_executor = curthread;
34dc7c2f 1409
b128c09f 1410 while (zio->io_stage < ZIO_STAGE_DONE) {
428870ff
BB
1411 enum zio_stage pipeline = zio->io_pipeline;
1412 enum zio_stage stage = zio->io_stage;
91579709 1413 dsl_pool_t *dp;
2fac4c2a 1414 boolean_t cut;
b128c09f 1415 int rv;
34dc7c2f 1416
b128c09f 1417 ASSERT(!MUTEX_HELD(&zio->io_lock));
428870ff
BB
1418 ASSERT(ISP2(stage));
1419 ASSERT(zio->io_stall == NULL);
34dc7c2f 1420
428870ff
BB
1421 do {
1422 stage <<= 1;
1423 } while ((stage & pipeline) == 0);
b128c09f
BB
1424
1425 ASSERT(stage <= ZIO_STAGE_DONE);
34dc7c2f 1426
91579709 1427 dp = spa_get_dsl(zio->io_spa);
2fac4c2a
BB
1428 cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
1429 zio_requeue_io_start_cut_in_line : B_FALSE;
1430
34dc7c2f 1431 /*
b128c09f
BB
1432 * If we are in interrupt context and this pipeline stage
1433 * will grab a config lock that is held across I/O,
428870ff
BB
1434 * or may wait for an I/O that needs an interrupt thread
1435 * to complete, issue async to avoid deadlock.
1436 *
1437 * For VDEV_IO_START, we cut in line so that the io will
1438 * be sent to disk promptly.
34dc7c2f 1439 */
91579709
BB
1440 if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
1441 zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
1442 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
1443 return;
1444 }
1445
1446 /*
1447 * If we are executing in the context of the tx_sync_thread,
1448 * or we are performing pool initialization outside of a
34e14332
BB
1449 * zio_taskq[ZIO_TASKQ_ISSUE|ZIO_TASKQ_ISSUE_HIGH] context,
1450 * then issue the zio asynchronously to minimize stack usage
1451 * for these deep call paths.
91579709
BB
1452 */
1453 if ((dp && curthread == dp->dp_tx.tx_sync_thread) ||
1454 (dp && spa_is_initializing(dp->dp_spa) &&
34e14332
BB
1455 !zio_taskq_member(zio, ZIO_TASKQ_ISSUE) &&
1456 !zio_taskq_member(zio, ZIO_TASKQ_ISSUE_HIGH))) {
428870ff 1457 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
b128c09f 1458 return;
34dc7c2f
BB
1459 }
1460
b128c09f 1461 zio->io_stage = stage;
9bd274dd 1462 rv = zio_pipeline[highbit64(stage) - 1](zio);
34dc7c2f 1463
b128c09f
BB
1464 if (rv == ZIO_PIPELINE_STOP)
1465 return;
34dc7c2f 1466
b128c09f
BB
1467 ASSERT(rv == ZIO_PIPELINE_CONTINUE);
1468 }
34dc7c2f
BB
1469}
1470
da6b4005 1471
b128c09f
BB
1472/*
1473 * ==========================================================================
1474 * Initiate I/O, either sync or async
1475 * ==========================================================================
1476 */
1477int
1478zio_wait(zio_t *zio)
34dc7c2f 1479{
b128c09f 1480 int error;
34dc7c2f 1481
b128c09f
BB
1482 ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
1483 ASSERT(zio->io_executor == NULL);
34dc7c2f 1484
b128c09f 1485 zio->io_waiter = curthread;
34dc7c2f 1486
da6b4005 1487 __zio_execute(zio);
34dc7c2f 1488
b128c09f 1489 mutex_enter(&zio->io_lock);
72f53c56 1490 while (zio->io_executor != NULL)
72938d69 1491 cv_wait_io(&zio->io_cv, &zio->io_lock);
b128c09f 1492 mutex_exit(&zio->io_lock);
34dc7c2f 1493
b128c09f
BB
1494 error = zio->io_error;
1495 zio_destroy(zio);
34dc7c2f 1496
b128c09f
BB
1497 return (error);
1498}
34dc7c2f 1499
b128c09f
BB
1500void
1501zio_nowait(zio_t *zio)
1502{
1503 ASSERT(zio->io_executor == NULL);
34dc7c2f 1504
d164b209
BB
1505 if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
1506 zio_unique_parent(zio) == NULL) {
8878261f
BB
1507 zio_t *pio;
1508
34dc7c2f 1509 /*
b128c09f 1510 * This is a logical async I/O with no parent to wait for it.
9babb374
BB
1511 * We add it to the spa_async_zio_root "Godfather" I/O, which
1512 * will ensure it completes prior to unloading the pool.
34dc7c2f 1513 */
b128c09f 1514 spa_t *spa = zio->io_spa;
8878261f
BB
1515 kpreempt_disable();
1516 pio = spa->spa_async_zio_root[CPU_SEQID];
1517 kpreempt_enable();
9babb374 1518
8878261f 1519 zio_add_child(pio, zio);
b128c09f 1520 }
34dc7c2f 1521
da6b4005 1522 __zio_execute(zio);
b128c09f 1523}
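/*
 * Illustrative sketch (editor's addition, not part of the upstream file):
 * typical synchronous use of the interfaces above.  A caller creates a
 * root zio, hangs child I/Os off it with zio_nowait(), and then blocks in
 * zio_wait() on the root.  The function name is hypothetical.
 */
static int
zio_example_sync_read(spa_t *spa, const blkptr_t *bp, void *data,
    const zbookmark_phys_t *zb)
{
        zio_t *rio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);

        zio_nowait(zio_read(rio, spa, bp, data, BP_GET_LSIZE(bp),
            NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, zb));

        return (zio_wait(rio));         /* returns the worst child error */
}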
34dc7c2f 1524
b128c09f
BB
1525/*
1526 * ==========================================================================
1527 * Reexecute or suspend/resume failed I/O
1528 * ==========================================================================
1529 */
34dc7c2f 1530
b128c09f
BB
1531static void
1532zio_reexecute(zio_t *pio)
1533{
d164b209 1534 zio_t *cio, *cio_next;
d6320ddb 1535 int c, w;
d164b209
BB
1536
1537 ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
1538 ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
9babb374
BB
1539 ASSERT(pio->io_gang_leader == NULL);
1540 ASSERT(pio->io_gang_tree == NULL);
34dc7c2f 1541
b128c09f
BB
1542 pio->io_flags = pio->io_orig_flags;
1543 pio->io_stage = pio->io_orig_stage;
1544 pio->io_pipeline = pio->io_orig_pipeline;
1545 pio->io_reexecute = 0;
03c6040b 1546 pio->io_flags |= ZIO_FLAG_REEXECUTED;
b128c09f 1547 pio->io_error = 0;
d6320ddb 1548 for (w = 0; w < ZIO_WAIT_TYPES; w++)
d164b209 1549 pio->io_state[w] = 0;
d6320ddb 1550 for (c = 0; c < ZIO_CHILD_TYPES; c++)
b128c09f 1551 pio->io_child_error[c] = 0;
34dc7c2f 1552
428870ff
BB
1553 if (IO_IS_ALLOCATING(pio))
1554 BP_ZERO(pio->io_bp);
34dc7c2f 1555
b128c09f
BB
1556 /*
1557 * As we reexecute pio's children, new children could be created.
d164b209 1558 * New children go to the head of pio's io_child_list, however,
b128c09f 1559 * so we will (correctly) not reexecute them. The key is that
d164b209
BB
1560 * the remainder of pio's io_child_list, from 'cio_next' onward,
1561 * cannot be affected by any side effects of reexecuting 'cio'.
b128c09f 1562 */
d164b209
BB
1563 for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
1564 cio_next = zio_walk_children(pio);
b128c09f 1565 mutex_enter(&pio->io_lock);
d6320ddb 1566 for (w = 0; w < ZIO_WAIT_TYPES; w++)
d164b209 1567 pio->io_children[cio->io_child_type][w]++;
b128c09f 1568 mutex_exit(&pio->io_lock);
d164b209 1569 zio_reexecute(cio);
34dc7c2f 1570 }
34dc7c2f 1571
b128c09f
BB
1572 /*
1573 * Now that all children have been reexecuted, execute the parent.
9babb374
BB
1574 * We don't reexecute "The Godfather" I/O here as it's the
1575 * responsibility of the caller to wait on him.
b128c09f 1576 */
9babb374 1577 if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
da6b4005 1578 __zio_execute(pio);
34dc7c2f
BB
1579}
1580
b128c09f
BB
1581void
1582zio_suspend(spa_t *spa, zio_t *zio)
34dc7c2f 1583{
b128c09f
BB
1584 if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
1585 fm_panic("Pool '%s' has encountered an uncorrectable I/O "
1586 "failure and the failure mode property for this pool "
1587 "is set to panic.", spa_name(spa));
34dc7c2f 1588
bf89c199
BB
1589 cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O "
1590 "failure and has been suspended.\n", spa_name(spa));
1591
b128c09f 1592 zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);
34dc7c2f 1593
b128c09f 1594 mutex_enter(&spa->spa_suspend_lock);
34dc7c2f 1595
b128c09f 1596 if (spa->spa_suspend_zio_root == NULL)
9babb374
BB
1597 spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
1598 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
1599 ZIO_FLAG_GODFATHER);
34dc7c2f 1600
b128c09f 1601 spa->spa_suspended = B_TRUE;
34dc7c2f 1602
b128c09f 1603 if (zio != NULL) {
9babb374 1604 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
b128c09f
BB
1605 ASSERT(zio != spa->spa_suspend_zio_root);
1606 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
d164b209 1607 ASSERT(zio_unique_parent(zio) == NULL);
b128c09f
BB
1608 ASSERT(zio->io_stage == ZIO_STAGE_DONE);
1609 zio_add_child(spa->spa_suspend_zio_root, zio);
1610 }
34dc7c2f 1611
b128c09f
BB
1612 mutex_exit(&spa->spa_suspend_lock);
1613}
34dc7c2f 1614
9babb374 1615int
b128c09f
BB
1616zio_resume(spa_t *spa)
1617{
9babb374 1618 zio_t *pio;
34dc7c2f
BB
1619
1620 /*
b128c09f 1621 * Reexecute all previously suspended i/o.
34dc7c2f 1622 */
b128c09f
BB
1623 mutex_enter(&spa->spa_suspend_lock);
1624 spa->spa_suspended = B_FALSE;
1625 cv_broadcast(&spa->spa_suspend_cv);
1626 pio = spa->spa_suspend_zio_root;
1627 spa->spa_suspend_zio_root = NULL;
1628 mutex_exit(&spa->spa_suspend_lock);
1629
1630 if (pio == NULL)
9babb374 1631 return (0);
34dc7c2f 1632
9babb374
BB
1633 zio_reexecute(pio);
1634 return (zio_wait(pio));
b128c09f
BB
1635}
1636
1637void
1638zio_resume_wait(spa_t *spa)
1639{
1640 mutex_enter(&spa->spa_suspend_lock);
1641 while (spa_suspended(spa))
1642 cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
1643 mutex_exit(&spa->spa_suspend_lock);
34dc7c2f
BB
1644}
1645
1646/*
1647 * ==========================================================================
b128c09f
BB
1648 * Gang blocks.
1649 *
1650 * A gang block is a collection of small blocks that looks to the DMU
1651 * like one large block. When zio_dva_allocate() cannot find a block
1652 * of the requested size, due to either severe fragmentation or the pool
1653 * being nearly full, it calls zio_write_gang_block() to construct the
1654 * block from smaller fragments.
1655 *
1656 * A gang block consists of a gang header (zio_gbh_phys_t) and up to
1657 * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like
1658 * an indirect block: it's an array of block pointers. It consumes
1659 * only one sector and hence is allocatable regardless of fragmentation.
1660 * The gang header's bps point to its gang members, which hold the data.
1661 *
1662 * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
1663 * as the verifier to ensure uniqueness of the SHA256 checksum.
1664 * Critically, the gang block bp's blk_cksum is the checksum of the data,
1665 * not the gang header. This ensures that data block signatures (needed for
1666 * deduplication) are independent of how the block is physically stored.
1667 *
1668 * Gang blocks can be nested: a gang member may itself be a gang block.
1669 * Thus every gang block is a tree in which root and all interior nodes are
1670 * gang headers, and the leaves are normal blocks that contain user data.
1671 * The root of the gang tree is called the gang leader.
1672 *
1673 * To perform any operation (read, rewrite, free, claim) on a gang block,
1674 * zio_gang_assemble() first assembles the gang tree (minus data leaves)
1675 * in the io_gang_tree field of the original logical i/o by recursively
1676 * reading the gang leader and all gang headers below it. This yields
1677 * an in-core tree containing the contents of every gang header and the
1678 * bps for every constituent of the gang block.
1679 *
1680 * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
1681 * and invokes a callback on each bp. To free a gang block, zio_gang_issue()
1682 * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
1683 * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
1684 * zio_read_gang() is a wrapper around zio_read() that omits reading gang
1685 * headers, since we already have those in io_gang_tree. zio_rewrite_gang()
1686 * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
1687 * of the gang header plus zio_checksum_compute() of the data to update the
1688 * gang header's blk_cksum as described above.
1689 *
1690 * The two-phase assemble/issue model solves the problem of partial failure --
1691 * what if you'd freed part of a gang block but then couldn't read the
1692 * gang header for another part? Assembling the entire gang tree first
1693 * ensures that all the necessary gang header I/O has succeeded before
1694 * starting the actual work of free, claim, or write. Once the gang tree
1695 * is assembled, free and claim are in-memory operations that cannot fail.
1696 *
1697 * In the event that a gang write fails, zio_dva_unallocate() walks the
1698 * gang tree to immediately free (i.e. insert back into the space map)
1699 * everything we've allocated. This ensures that we don't get ENOSPC
1700 * errors during repeated suspend/resume cycles due to a flaky device.
1701 *
1702 * Gang rewrites only happen during sync-to-convergence. If we can't assemble
1703 * the gang tree, we won't modify the block, so we can safely defer the free
1704 * (knowing that the block is still intact). If we *can* assemble the gang
1705 * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
1706 * each constituent bp and we can allocate a new block on the next sync pass.
1707 *
1708 * In all cases, the gang tree allows complete recovery from partial failure.
34dc7c2f
BB
1709 * ==========================================================================
1710 */
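/*
 * Illustrative sketch only (shapes and depth are hypothetical, not part of
 * the on-disk format): a logical block that could not be allocated
 * contiguously might end up stored as
 *
 *	gang leader bp
 *	  gang header (one sector, zio_gbh_phys_t)
 *	    bp[0] -> data leaf
 *	    bp[1] -> data leaf
 *	    bp[2] -> gang header (nested)
 *	               bp[0] -> data leaf
 *	               ...
 *
 * zio_gang_assemble() reads just the header nodes of this tree into
 * io_gang_tree; zio_gang_issue() then walks the tree and applies the
 * per-type callback (read, rewrite, free or claim) to every bp in it.
 */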
b128c09f
BB
1711
1712static zio_t *
1713zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
34dc7c2f 1714{
b128c09f
BB
1715 if (gn != NULL)
1716 return (pio);
34dc7c2f 1717
b128c09f
BB
1718 return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
1719 NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
1720 &pio->io_bookmark));
1721}
1722
1723zio_t *
1724zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1725{
1726 zio_t *zio;
1727
1728 if (gn != NULL) {
1729 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
1730 gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
1731 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
34dc7c2f 1732 /*
b128c09f
BB
1733 * As we rewrite each gang header, the pipeline will compute
1734 * a new gang block header checksum for it; but no one will
1735 * compute a new data checksum, so we do that here. The one
1736 * exception is the gang leader: the pipeline already computed
1737 * its data checksum because that stage precedes gang assembly.
1738 * (Presently, nothing actually uses interior data checksums;
1739 * this is just good hygiene.)
34dc7c2f 1740 */
9babb374 1741 if (gn != pio->io_gang_leader->io_gang_tree) {
b128c09f
BB
1742 zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
1743 data, BP_GET_PSIZE(bp));
1744 }
428870ff
BB
1745 /*
1746 * If we are here to damage data for testing purposes,
1747 * leave the GBH alone so that we can detect the damage.
1748 */
1749 if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
1750 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
34dc7c2f 1751 } else {
b128c09f
BB
1752 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
1753 data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
1754 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
34dc7c2f
BB
1755 }
1756
b128c09f
BB
1757 return (zio);
1758}
34dc7c2f 1759
b128c09f
BB
1760/* ARGSUSED */
1761zio_t *
1762zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1763{
428870ff
BB
1764 return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
1765 ZIO_GANG_CHILD_FLAGS(pio)));
34dc7c2f
BB
1766}
1767
b128c09f
BB
1768/* ARGSUSED */
1769zio_t *
1770zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
34dc7c2f 1771{
b128c09f
BB
1772 return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
1773 NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
1774}
1775
1776static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
1777 NULL,
1778 zio_read_gang,
1779 zio_rewrite_gang,
1780 zio_free_gang,
1781 zio_claim_gang,
1782 NULL
1783};
34dc7c2f 1784
b128c09f 1785static void zio_gang_tree_assemble_done(zio_t *zio);
34dc7c2f 1786
b128c09f
BB
1787static zio_gang_node_t *
1788zio_gang_node_alloc(zio_gang_node_t **gnpp)
1789{
1790 zio_gang_node_t *gn;
34dc7c2f 1791
b128c09f 1792 ASSERT(*gnpp == NULL);
34dc7c2f 1793
79c76d5b 1794 gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
b128c09f
BB
1795 gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
1796 *gnpp = gn;
34dc7c2f 1797
b128c09f 1798 return (gn);
34dc7c2f
BB
1799}
1800
34dc7c2f 1801static void
b128c09f 1802zio_gang_node_free(zio_gang_node_t **gnpp)
34dc7c2f 1803{
b128c09f 1804 zio_gang_node_t *gn = *gnpp;
d6320ddb 1805 int g;
34dc7c2f 1806
d6320ddb 1807 for (g = 0; g < SPA_GBH_NBLKPTRS; g++)
b128c09f
BB
1808 ASSERT(gn->gn_child[g] == NULL);
1809
1810 zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
1811 kmem_free(gn, sizeof (*gn));
1812 *gnpp = NULL;
34dc7c2f
BB
1813}
1814
b128c09f
BB
1815static void
1816zio_gang_tree_free(zio_gang_node_t **gnpp)
34dc7c2f 1817{
b128c09f 1818 zio_gang_node_t *gn = *gnpp;
d6320ddb 1819 int g;
34dc7c2f 1820
b128c09f
BB
1821 if (gn == NULL)
1822 return;
34dc7c2f 1823
d6320ddb 1824 for (g = 0; g < SPA_GBH_NBLKPTRS; g++)
b128c09f 1825 zio_gang_tree_free(&gn->gn_child[g]);
34dc7c2f 1826
b128c09f 1827 zio_gang_node_free(gnpp);
34dc7c2f
BB
1828}
1829
b128c09f 1830static void
9babb374 1831zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
34dc7c2f 1832{
b128c09f
BB
1833 zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
1834
9babb374 1835 ASSERT(gio->io_gang_leader == gio);
b128c09f 1836 ASSERT(BP_IS_GANG(bp));
34dc7c2f 1837
9babb374 1838 zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh,
b128c09f 1839 SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
9babb374 1840 gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
b128c09f 1841}
34dc7c2f 1842
b128c09f
BB
1843static void
1844zio_gang_tree_assemble_done(zio_t *zio)
1845{
9babb374 1846 zio_t *gio = zio->io_gang_leader;
b128c09f
BB
1847 zio_gang_node_t *gn = zio->io_private;
1848 blkptr_t *bp = zio->io_bp;
d6320ddb 1849 int g;
34dc7c2f 1850
9babb374 1851 ASSERT(gio == zio_unique_parent(zio));
428870ff 1852 ASSERT(zio->io_child_count == 0);
34dc7c2f 1853
b128c09f
BB
1854 if (zio->io_error)
1855 return;
34dc7c2f 1856
b128c09f
BB
1857 if (BP_SHOULD_BYTESWAP(bp))
1858 byteswap_uint64_array(zio->io_data, zio->io_size);
34dc7c2f 1859
b128c09f
BB
1860 ASSERT(zio->io_data == gn->gn_gbh);
1861 ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
428870ff 1862 ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
34dc7c2f 1863
d6320ddb 1864 for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
b128c09f
BB
1865 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
1866 if (!BP_IS_GANG(gbp))
1867 continue;
9babb374 1868 zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
b128c09f 1869 }
34dc7c2f
BB
1870}
1871
b128c09f
BB
1872static void
1873zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
34dc7c2f 1874{
9babb374 1875 zio_t *gio = pio->io_gang_leader;
b128c09f 1876 zio_t *zio;
d6320ddb 1877 int g;
34dc7c2f 1878
b128c09f 1879 ASSERT(BP_IS_GANG(bp) == !!gn);
9babb374
BB
1880 ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
1881 ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);
34dc7c2f 1882
b128c09f
BB
1883 /*
1884 * If you're a gang header, your data is in gn->gn_gbh.
1885 * If you're a gang member, your data is in 'data' and gn == NULL.
1886 */
9babb374 1887 zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);
34dc7c2f 1888
b128c09f 1889 if (gn != NULL) {
428870ff 1890 ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
34dc7c2f 1891
d6320ddb 1892 for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
b128c09f
BB
1893 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
1894 if (BP_IS_HOLE(gbp))
1895 continue;
1896 zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
1897 data = (char *)data + BP_GET_PSIZE(gbp);
1898 }
34dc7c2f
BB
1899 }
1900
9babb374
BB
1901 if (gn == gio->io_gang_tree)
1902 ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);
34dc7c2f 1903
b128c09f
BB
1904 if (zio != pio)
1905 zio_nowait(zio);
34dc7c2f
BB
1906}
1907
1908static int
b128c09f 1909zio_gang_assemble(zio_t *zio)
34dc7c2f 1910{
b128c09f 1911 blkptr_t *bp = zio->io_bp;
34dc7c2f 1912
9babb374
BB
1913 ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
1914 ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
1915
1916 zio->io_gang_leader = zio;
34dc7c2f 1917
b128c09f 1918 zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);
34dc7c2f
BB
1919
1920 return (ZIO_PIPELINE_CONTINUE);
1921}
1922
1923static int
b128c09f 1924zio_gang_issue(zio_t *zio)
34dc7c2f 1925{
b128c09f 1926 blkptr_t *bp = zio->io_bp;
34dc7c2f 1927
b128c09f
BB
1928 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
1929 return (ZIO_PIPELINE_STOP);
34dc7c2f 1930
9babb374
BB
1931 ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
1932 ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
34dc7c2f 1933
b128c09f 1934 if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
9babb374 1935 zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data);
b128c09f 1936 else
9babb374 1937 zio_gang_tree_free(&zio->io_gang_tree);
34dc7c2f 1938
b128c09f 1939 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
34dc7c2f
BB
1940
1941 return (ZIO_PIPELINE_CONTINUE);
1942}
1943
1944static void
b128c09f 1945zio_write_gang_member_ready(zio_t *zio)
34dc7c2f 1946{
d164b209 1947 zio_t *pio = zio_unique_parent(zio);
34dc7c2f
BB
1948 dva_t *cdva = zio->io_bp->blk_dva;
1949 dva_t *pdva = pio->io_bp->blk_dva;
1950 uint64_t asize;
d6320ddb 1951 int d;
d1d7e268 1952 ASSERTV(zio_t *gio = zio->io_gang_leader);
34dc7c2f 1953
b128c09f
BB
1954 if (BP_IS_HOLE(zio->io_bp))
1955 return;
1956
1957 ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
1958
1959 ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
428870ff
BB
1960 ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
1961 ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
1962 ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
34dc7c2f 1963 ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
34dc7c2f
BB
1964
1965 mutex_enter(&pio->io_lock);
d6320ddb 1966 for (d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
34dc7c2f
BB
1967 ASSERT(DVA_GET_GANG(&pdva[d]));
1968 asize = DVA_GET_ASIZE(&pdva[d]);
1969 asize += DVA_GET_ASIZE(&cdva[d]);
1970 DVA_SET_ASIZE(&pdva[d], asize);
1971 }
1972 mutex_exit(&pio->io_lock);
1973}
1974
1975static int
b128c09f 1976zio_write_gang_block(zio_t *pio)
34dc7c2f 1977{
b128c09f
BB
1978 spa_t *spa = pio->io_spa;
1979 blkptr_t *bp = pio->io_bp;
9babb374 1980 zio_t *gio = pio->io_gang_leader;
b128c09f
BB
1981 zio_t *zio;
1982 zio_gang_node_t *gn, **gnpp;
34dc7c2f 1983 zio_gbh_phys_t *gbh;
b128c09f
BB
1984 uint64_t txg = pio->io_txg;
1985 uint64_t resid = pio->io_size;
1986 uint64_t lsize;
428870ff
BB
1987 int copies = gio->io_prop.zp_copies;
1988 int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
b128c09f 1989 zio_prop_t zp;
d6320ddb 1990 int g, error;
34dc7c2f 1991
428870ff
BB
1992 error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
1993 bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
b128c09f 1994 METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
34dc7c2f 1995 if (error) {
b128c09f 1996 pio->io_error = error;
34dc7c2f
BB
1997 return (ZIO_PIPELINE_CONTINUE);
1998 }
1999
9babb374
BB
2000 if (pio == gio) {
2001 gnpp = &gio->io_gang_tree;
b128c09f
BB
2002 } else {
2003 gnpp = pio->io_private;
2004 ASSERT(pio->io_ready == zio_write_gang_member_ready);
34dc7c2f
BB
2005 }
2006
b128c09f
BB
2007 gn = zio_gang_node_alloc(gnpp);
2008 gbh = gn->gn_gbh;
2009 bzero(gbh, SPA_GANGBLOCKSIZE);
34dc7c2f 2010
b128c09f
BB
2011 /*
2012 * Create the gang header.
2013 */
2014 zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
2015 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
34dc7c2f 2016
b128c09f
BB
2017 /*
2018 * Create and nowait the gang children.
2019 */
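	/*
	 * Worked example of the sizing below (illustrative numbers, assuming
	 * SPA_MINBLOCKSIZE == 512 and SPA_GBH_NBLKPTRS == 3): for a 128K
	 * (131072-byte) parent write,
	 *
	 *	g = 0: lsize = P2ROUNDUP(131072 / 3, 512) = 44032, resid = 87040
	 *	g = 1: lsize = P2ROUNDUP(87040 / 2, 512)  = 43520, resid = 43520
	 *	g = 2: lsize = P2ROUNDUP(43520 / 1, 512)  = 43520, resid = 0
	 *
	 * i.e. the remaining bytes are spread as evenly as possible across
	 * the remaining gang header slots, with each member rounded up to a
	 * whole sector.
	 */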
d6320ddb 2020 for (g = 0; resid != 0; resid -= lsize, g++) {
b128c09f
BB
2021 lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
2022 SPA_MINBLOCKSIZE);
2023 ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);
2024
9babb374 2025 zp.zp_checksum = gio->io_prop.zp_checksum;
b128c09f
BB
2026 zp.zp_compress = ZIO_COMPRESS_OFF;
2027 zp.zp_type = DMU_OT_NONE;
2028 zp.zp_level = 0;
428870ff 2029 zp.zp_copies = gio->io_prop.zp_copies;
03c6040b
GW
2030 zp.zp_dedup = B_FALSE;
2031 zp.zp_dedup_verify = B_FALSE;
2032 zp.zp_nopwrite = B_FALSE;
b128c09f
BB
2033
2034 zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
2035 (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
e8b96c60 2036 zio_write_gang_member_ready, NULL, NULL, &gn->gn_child[g],
b128c09f
BB
2037 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
2038 &pio->io_bookmark));
2039 }
34dc7c2f
BB
2040
2041 /*
b128c09f 2042 * Set pio's pipeline to just wait for zio to finish.
34dc7c2f 2043 */
b128c09f
BB
2044 pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2045
920dd524
ED
2046 /*
2047 * We didn't allocate this bp, so make sure it doesn't get unmarked.
2048 */
2049 pio->io_flags &= ~ZIO_FLAG_FASTWRITE;
2050
b128c09f
BB
2051 zio_nowait(zio);
2052
2053 return (ZIO_PIPELINE_CONTINUE);
34dc7c2f
BB
2054}
2055
03c6040b
GW
2056/*
2057 * The zio_nop_write stage in the pipeline determines if allocating
2058 * a new bp is necessary. By leveraging a cryptographically secure checksum,
2059 * such as SHA256, we can compare the checksums of the new data and the old
2060 * to determine if allocating a new block is required. The nopwrite
2061 * feature can handle writes in either syncing or open context (i.e. zil
2062 * writes) and as a result is mutually exclusive with dedup.
2063 */
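/*
 * A minimal sketch of the case this optimizes (hypothetical example): a
 * block is rewritten with byte-for-byte identical contents on a dataset
 * with checksum=sha256 and compression enabled.  The write pipeline
 * computes the new bp's checksum, zio_nop_write() finds it equal to
 * bp_orig's, and the zio falls through to the interlock stages only --
 * no new DVA is allocated and nothing is issued to the vdevs; the
 * existing block pointer is simply reused.
 */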
2064static int
2065zio_nop_write(zio_t *zio)
2066{
2067 blkptr_t *bp = zio->io_bp;
2068 blkptr_t *bp_orig = &zio->io_bp_orig;
2069 zio_prop_t *zp = &zio->io_prop;
2070
2071 ASSERT(BP_GET_LEVEL(bp) == 0);
2072 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
2073 ASSERT(zp->zp_nopwrite);
2074 ASSERT(!zp->zp_dedup);
2075 ASSERT(zio->io_bp_override == NULL);
2076 ASSERT(IO_IS_ALLOCATING(zio));
2077
2078 /*
2079 * Check to see if the original bp and the new bp have matching
2080 * characteristics (i.e. same checksum, compression algorithms, etc).
2081 * If they don't then just continue with the pipeline which will
2082 * allocate a new bp.
2083 */
2084 if (BP_IS_HOLE(bp_orig) ||
2085 !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup ||
2086 BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
2087 BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
2088 BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
2089 zp->zp_copies != BP_GET_NDVAS(bp_orig))
2090 return (ZIO_PIPELINE_CONTINUE);
2091
2092 /*
2093 * If the checksums match then reset the pipeline so that we
2094 * avoid allocating a new bp and issuing any I/O.
2095 */
2096 if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
2097 ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup);
2098 ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
2099 ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
2100 ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
2101 ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop,
2102 sizeof (uint64_t)) == 0);
2103
2104 *bp = *bp_orig;
2105 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2106 zio->io_flags |= ZIO_FLAG_NOPWRITE;
2107 }
2108
2109 return (ZIO_PIPELINE_CONTINUE);
2110}
2111
34dc7c2f
BB
2112/*
2113 * ==========================================================================
428870ff 2114 * Dedup
34dc7c2f
BB
2115 * ==========================================================================
2116 */
428870ff
BB
2117static void
2118zio_ddt_child_read_done(zio_t *zio)
2119{
2120 blkptr_t *bp = zio->io_bp;
2121 ddt_entry_t *dde = zio->io_private;
2122 ddt_phys_t *ddp;
2123 zio_t *pio = zio_unique_parent(zio);
2124
2125 mutex_enter(&pio->io_lock);
2126 ddp = ddt_phys_select(dde, bp);
2127 if (zio->io_error == 0)
2128 ddt_phys_clear(ddp); /* this ddp doesn't need repair */
2129 if (zio->io_error == 0 && dde->dde_repair_data == NULL)
2130 dde->dde_repair_data = zio->io_data;
2131 else
2132 zio_buf_free(zio->io_data, zio->io_size);
2133 mutex_exit(&pio->io_lock);
2134}
2135
2136static int
2137zio_ddt_read_start(zio_t *zio)
2138{
2139 blkptr_t *bp = zio->io_bp;
d6320ddb 2140 int p;
428870ff
BB
2141
2142 ASSERT(BP_GET_DEDUP(bp));
2143 ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
2144 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2145
2146 if (zio->io_child_error[ZIO_CHILD_DDT]) {
2147 ddt_t *ddt = ddt_select(zio->io_spa, bp);
2148 ddt_entry_t *dde = ddt_repair_start(ddt, bp);
2149 ddt_phys_t *ddp = dde->dde_phys;
2150 ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
2151 blkptr_t blk;
2152
2153 ASSERT(zio->io_vsd == NULL);
2154 zio->io_vsd = dde;
2155
2156 if (ddp_self == NULL)
2157 return (ZIO_PIPELINE_CONTINUE);
2158
d6320ddb 2159 for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
428870ff
BB
2160 if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
2161 continue;
2162 ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
2163 &blk);
2164 zio_nowait(zio_read(zio, zio->io_spa, &blk,
2165 zio_buf_alloc(zio->io_size), zio->io_size,
2166 zio_ddt_child_read_done, dde, zio->io_priority,
2167 ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE,
2168 &zio->io_bookmark));
2169 }
2170 return (ZIO_PIPELINE_CONTINUE);
2171 }
2172
2173 zio_nowait(zio_read(zio, zio->io_spa, bp,
2174 zio->io_data, zio->io_size, NULL, NULL, zio->io_priority,
2175 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
2176
2177 return (ZIO_PIPELINE_CONTINUE);
2178}
2179
2180static int
2181zio_ddt_read_done(zio_t *zio)
2182{
2183 blkptr_t *bp = zio->io_bp;
2184
2185 if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
2186 return (ZIO_PIPELINE_STOP);
2187
2188 ASSERT(BP_GET_DEDUP(bp));
2189 ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
2190 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2191
2192 if (zio->io_child_error[ZIO_CHILD_DDT]) {
2193 ddt_t *ddt = ddt_select(zio->io_spa, bp);
2194 ddt_entry_t *dde = zio->io_vsd;
2195 if (ddt == NULL) {
2196 ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
2197 return (ZIO_PIPELINE_CONTINUE);
2198 }
2199 if (dde == NULL) {
2200 zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
2201 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
2202 return (ZIO_PIPELINE_STOP);
2203 }
2204 if (dde->dde_repair_data != NULL) {
2205 bcopy(dde->dde_repair_data, zio->io_data, zio->io_size);
2206 zio->io_child_error[ZIO_CHILD_DDT] = 0;
2207 }
2208 ddt_repair_done(ddt, dde);
2209 zio->io_vsd = NULL;
2210 }
2211
2212 ASSERT(zio->io_vsd == NULL);
2213
2214 return (ZIO_PIPELINE_CONTINUE);
2215}
2216
2217static boolean_t
2218zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
2219{
2220 spa_t *spa = zio->io_spa;
d6320ddb 2221 int p;
428870ff
BB
2222
2223 /*
2224 * Note: we compare the original data, not the transformed data,
2225 * because when zio->io_bp is an override bp, we will not have
2226 * pushed the I/O transforms. That's an important optimization
2227 * because otherwise we'd compress/encrypt all dmu_sync() data twice.
2228 */
d6320ddb 2229 for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
428870ff
BB
2230 zio_t *lio = dde->dde_lead_zio[p];
2231
2232 if (lio != NULL) {
2233 return (lio->io_orig_size != zio->io_orig_size ||
2234 bcmp(zio->io_orig_data, lio->io_orig_data,
2235 zio->io_orig_size) != 0);
2236 }
2237 }
2238
d6320ddb 2239 for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
428870ff
BB
2240 ddt_phys_t *ddp = &dde->dde_phys[p];
2241
2242 if (ddp->ddp_phys_birth != 0) {
2243 arc_buf_t *abuf = NULL;
2244 uint32_t aflags = ARC_WAIT;
2245 blkptr_t blk = *zio->io_bp;
2246 int error;
2247
2248 ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
2249
2250 ddt_exit(ddt);
2251
294f6806 2252 error = arc_read(NULL, spa, &blk,
428870ff
BB
2253 arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
2254 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
2255 &aflags, &zio->io_bookmark);
2256
2257 if (error == 0) {
2258 if (arc_buf_size(abuf) != zio->io_orig_size ||
2259 bcmp(abuf->b_data, zio->io_orig_data,
2260 zio->io_orig_size) != 0)
2e528b49 2261 error = SET_ERROR(EEXIST);
13fe0198 2262 VERIFY(arc_buf_remove_ref(abuf, &abuf));
428870ff
BB
2263 }
2264
2265 ddt_enter(ddt);
2266 return (error != 0);
2267 }
2268 }
2269
2270 return (B_FALSE);
2271}
2272
2273static void
2274zio_ddt_child_write_ready(zio_t *zio)
2275{
2276 int p = zio->io_prop.zp_copies;
2277 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
2278 ddt_entry_t *dde = zio->io_private;
2279 ddt_phys_t *ddp = &dde->dde_phys[p];
2280 zio_t *pio;
2281
2282 if (zio->io_error)
2283 return;
2284
2285 ddt_enter(ddt);
2286
2287 ASSERT(dde->dde_lead_zio[p] == zio);
2288
2289 ddt_phys_fill(ddp, zio->io_bp);
2290
2291 while ((pio = zio_walk_parents(zio)) != NULL)
2292 ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
2293
2294 ddt_exit(ddt);
2295}
2296
2297static void
2298zio_ddt_child_write_done(zio_t *zio)
2299{
2300 int p = zio->io_prop.zp_copies;
2301 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
2302 ddt_entry_t *dde = zio->io_private;
2303 ddt_phys_t *ddp = &dde->dde_phys[p];
2304
2305 ddt_enter(ddt);
2306
2307 ASSERT(ddp->ddp_refcnt == 0);
2308 ASSERT(dde->dde_lead_zio[p] == zio);
2309 dde->dde_lead_zio[p] = NULL;
2310
2311 if (zio->io_error == 0) {
2312 while (zio_walk_parents(zio) != NULL)
2313 ddt_phys_addref(ddp);
2314 } else {
2315 ddt_phys_clear(ddp);
2316 }
2317
2318 ddt_exit(ddt);
2319}
2320
2321static void
2322zio_ddt_ditto_write_done(zio_t *zio)
2323{
2324 int p = DDT_PHYS_DITTO;
428870ff
BB
2325 blkptr_t *bp = zio->io_bp;
2326 ddt_t *ddt = ddt_select(zio->io_spa, bp);
2327 ddt_entry_t *dde = zio->io_private;
2328 ddt_phys_t *ddp = &dde->dde_phys[p];
2329 ddt_key_t *ddk = &dde->dde_key;
1fde1e37 2330 ASSERTV(zio_prop_t *zp = &zio->io_prop);
428870ff
BB
2331
2332 ddt_enter(ddt);
2333
2334 ASSERT(ddp->ddp_refcnt == 0);
2335 ASSERT(dde->dde_lead_zio[p] == zio);
2336 dde->dde_lead_zio[p] = NULL;
2337
2338 if (zio->io_error == 0) {
2339 ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum));
2340 ASSERT(zp->zp_copies < SPA_DVAS_PER_BP);
2341 ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp));
2342 if (ddp->ddp_phys_birth != 0)
2343 ddt_phys_free(ddt, ddk, ddp, zio->io_txg);
2344 ddt_phys_fill(ddp, bp);
2345 }
2346
2347 ddt_exit(ddt);
2348}
2349
2350static int
2351zio_ddt_write(zio_t *zio)
2352{
2353 spa_t *spa = zio->io_spa;
2354 blkptr_t *bp = zio->io_bp;
2355 uint64_t txg = zio->io_txg;
2356 zio_prop_t *zp = &zio->io_prop;
2357 int p = zp->zp_copies;
2358 int ditto_copies;
2359 zio_t *cio = NULL;
2360 zio_t *dio = NULL;
2361 ddt_t *ddt = ddt_select(spa, bp);
2362 ddt_entry_t *dde;
2363 ddt_phys_t *ddp;
2364
2365 ASSERT(BP_GET_DEDUP(bp));
2366 ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
2367 ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
2368
2369 ddt_enter(ddt);
2370 dde = ddt_lookup(ddt, bp, B_TRUE);
2371 ddp = &dde->dde_phys[p];
2372
2373 if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
2374 /*
2375 * If we're using a weak checksum, upgrade to a strong checksum
2376 * and try again. If we're already using a strong checksum,
2377 * we can't resolve it, so just convert to an ordinary write.
2378 * (And automatically e-mail a paper to Nature?)
2379 */
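		/*
		 * For example (illustrative): with dedup=verify on a weak
		 * checksum such as fletcher4, a detected collision causes
		 * the write to be retried using the pool's dedup checksum
		 * (spa_dedup_checksum(), typically sha256); if the collision
		 * is seen under sha256 itself, the block is simply written
		 * as an ordinary, non-dedup'd block.
		 */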
2380 if (!zio_checksum_table[zp->zp_checksum].ci_dedup) {
2381 zp->zp_checksum = spa_dedup_checksum(spa);
2382 zio_pop_transforms(zio);
2383 zio->io_stage = ZIO_STAGE_OPEN;
2384 BP_ZERO(bp);
2385 } else {
03c6040b 2386 zp->zp_dedup = B_FALSE;
428870ff
BB
2387 }
2388 zio->io_pipeline = ZIO_WRITE_PIPELINE;
2389 ddt_exit(ddt);
2390 return (ZIO_PIPELINE_CONTINUE);
2391 }
2392
2393 ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp);
2394 ASSERT(ditto_copies < SPA_DVAS_PER_BP);
2395
2396 if (ditto_copies > ddt_ditto_copies_present(dde) &&
2397 dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) {
2398 zio_prop_t czp = *zp;
2399
2400 czp.zp_copies = ditto_copies;
2401
2402 /*
2403 * If we arrived here with an override bp, we won't have run
2404 * the transform stack, so we won't have the data we need to
2405 * generate a child i/o. So, toss the override bp and restart.
2406 * This is safe, because using the override bp is just an
2407 * optimization; and it's rare, so the cost doesn't matter.
2408 */
2409 if (zio->io_bp_override) {
2410 zio_pop_transforms(zio);
2411 zio->io_stage = ZIO_STAGE_OPEN;
2412 zio->io_pipeline = ZIO_WRITE_PIPELINE;
2413 zio->io_bp_override = NULL;
2414 BP_ZERO(bp);
2415 ddt_exit(ddt);
2416 return (ZIO_PIPELINE_CONTINUE);
2417 }
2418
2419 dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
e8b96c60 2420 zio->io_orig_size, &czp, NULL, NULL,
428870ff
BB
2421 zio_ddt_ditto_write_done, dde, zio->io_priority,
2422 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
2423
2424 zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL);
2425 dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
2426 }
2427
2428 if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
2429 if (ddp->ddp_phys_birth != 0)
2430 ddt_bp_fill(ddp, bp, txg);
2431 if (dde->dde_lead_zio[p] != NULL)
2432 zio_add_child(zio, dde->dde_lead_zio[p]);
2433 else
2434 ddt_phys_addref(ddp);
2435 } else if (zio->io_bp_override) {
2436 ASSERT(bp->blk_birth == txg);
2437 ASSERT(BP_EQUAL(bp, zio->io_bp_override));
2438 ddt_phys_fill(ddp, bp);
2439 ddt_phys_addref(ddp);
2440 } else {
2441 cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
e8b96c60 2442 zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL,
428870ff
BB
2443 zio_ddt_child_write_done, dde, zio->io_priority,
2444 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
2445
2446 zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL);
2447 dde->dde_lead_zio[p] = cio;
2448 }
2449
2450 ddt_exit(ddt);
2451
2452 if (cio)
2453 zio_nowait(cio);
2454 if (dio)
2455 zio_nowait(dio);
2456
2457 return (ZIO_PIPELINE_CONTINUE);
2458}
2459
2460ddt_entry_t *freedde; /* for debugging */
b128c09f 2461
428870ff
BB
2462static int
2463zio_ddt_free(zio_t *zio)
2464{
2465 spa_t *spa = zio->io_spa;
2466 blkptr_t *bp = zio->io_bp;
2467 ddt_t *ddt = ddt_select(spa, bp);
2468 ddt_entry_t *dde;
2469 ddt_phys_t *ddp;
2470
2471 ASSERT(BP_GET_DEDUP(bp));
2472 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2473
2474 ddt_enter(ddt);
2475 freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
5dc6af0e
BB
2476 if (dde) {
2477 ddp = ddt_phys_select(dde, bp);
2478 if (ddp)
2479 ddt_phys_decref(ddp);
2480 }
428870ff
BB
2481 ddt_exit(ddt);
2482
2483 return (ZIO_PIPELINE_CONTINUE);
2484}
2485
2486/*
2487 * ==========================================================================
2488 * Allocate and free blocks
2489 * ==========================================================================
2490 */
34dc7c2f
BB
2491static int
2492zio_dva_allocate(zio_t *zio)
2493{
2494 spa_t *spa = zio->io_spa;
428870ff 2495 metaslab_class_t *mc = spa_normal_class(spa);
34dc7c2f
BB
2496 blkptr_t *bp = zio->io_bp;
2497 int error;
6d974228 2498 int flags = 0;
34dc7c2f 2499
9babb374
BB
2500 if (zio->io_gang_leader == NULL) {
2501 ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
2502 zio->io_gang_leader = zio;
2503 }
2504
34dc7c2f 2505 ASSERT(BP_IS_HOLE(bp));
c99c9001 2506 ASSERT0(BP_GET_NDVAS(bp));
428870ff
BB
2507 ASSERT3U(zio->io_prop.zp_copies, >, 0);
2508 ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
34dc7c2f
BB
2509 ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
2510
6d974228
GW
2511 /*
2512 * The dump device does not support gang blocks so allocation on
2513 * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid
2514 * the "fast" gang feature.
2515 */
2516 flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0;
2517 flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ?
2518 METASLAB_GANG_CHILD : 0;
920dd524 2519 flags |= (zio->io_flags & ZIO_FLAG_FASTWRITE) ? METASLAB_FASTWRITE : 0;
b128c09f 2520 error = metaslab_alloc(spa, mc, zio->io_size, bp,
6d974228 2521 zio->io_prop.zp_copies, zio->io_txg, NULL, flags);
34dc7c2f 2522
b128c09f 2523 if (error) {
6d974228
GW
2524 spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
2525 "size %llu, error %d", spa_name(spa), zio, zio->io_size,
2526 error);
b128c09f
BB
2527 if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
2528 return (zio_write_gang_block(zio));
34dc7c2f
BB
2529 zio->io_error = error;
2530 }
2531
2532 return (ZIO_PIPELINE_CONTINUE);
2533}
2534
2535static int
2536zio_dva_free(zio_t *zio)
2537{
b128c09f 2538 metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);
34dc7c2f
BB
2539
2540 return (ZIO_PIPELINE_CONTINUE);
2541}
2542
2543static int
2544zio_dva_claim(zio_t *zio)
2545{
b128c09f
BB
2546 int error;
2547
2548 error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
2549 if (error)
2550 zio->io_error = error;
34dc7c2f
BB
2551
2552 return (ZIO_PIPELINE_CONTINUE);
2553}
2554
b128c09f
BB
2555/*
2556 * Undo an allocation. This is used by zio_done() when an I/O fails
2557 * and we want to give back the block we just allocated.
2558 * This handles both normal blocks and gang blocks.
2559 */
2560static void
2561zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
2562{
d6320ddb
BB
2563 int g;
2564
b128c09f 2565 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
428870ff 2566 ASSERT(zio->io_bp_override == NULL);
b128c09f
BB
2567
2568 if (!BP_IS_HOLE(bp))
428870ff 2569 metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);
b128c09f
BB
2570
2571 if (gn != NULL) {
d6320ddb 2572 for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
b128c09f
BB
2573 zio_dva_unallocate(zio, gn->gn_child[g],
2574 &gn->gn_gbh->zg_blkptr[g]);
2575 }
2576 }
2577}
2578
2579/*
2580 * Try to allocate an intent log block. Return 0 on success, errno on failure.
2581 */
2582int
920dd524
ED
2583zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, uint64_t size,
2584 boolean_t use_slog)
b128c09f 2585{
428870ff 2586 int error = 1;
b128c09f 2587
428870ff
BB
2588 ASSERT(txg > spa_syncing_txg(spa));
2589
ebf8e3a2
BB
2590 /*
2591 * ZIL blocks are always contiguous (i.e. not gang blocks) so we
2592 * set the METASLAB_GANG_AVOID flag so that they don't "fast gang"
2593 * when allocating them.
2594 */
2595 if (use_slog) {
428870ff 2596 error = metaslab_alloc(spa, spa_log_class(spa), size,
920dd524
ED
2597 new_bp, 1, txg, NULL,
2598 METASLAB_FASTWRITE | METASLAB_GANG_AVOID);
ebf8e3a2 2599 }
b128c09f 2600
ebf8e3a2 2601 if (error) {
428870ff 2602 error = metaslab_alloc(spa, spa_normal_class(spa), size,
920dd524 2603 new_bp, 1, txg, NULL,
ac72fac3 2604 METASLAB_FASTWRITE);
ebf8e3a2 2605 }
b128c09f
BB
2606
2607 if (error == 0) {
2608 BP_SET_LSIZE(new_bp, size);
2609 BP_SET_PSIZE(new_bp, size);
2610 BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
428870ff
BB
2611 BP_SET_CHECKSUM(new_bp,
2612 spa_version(spa) >= SPA_VERSION_SLIM_ZIL
2613 ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
b128c09f
BB
2614 BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
2615 BP_SET_LEVEL(new_bp, 0);
428870ff 2616 BP_SET_DEDUP(new_bp, 0);
b128c09f
BB
2617 BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
2618 }
2619
2620 return (error);
2621}
2622
2623/*
428870ff 2624 * Free an intent log block.
b128c09f
BB
2625 */
2626void
428870ff 2627zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp)
b128c09f 2628{
428870ff 2629 ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG);
b128c09f
BB
2630 ASSERT(!BP_IS_GANG(bp));
2631
428870ff 2632 zio_free(spa, txg, bp);
b128c09f
BB
2633}
2634
34dc7c2f
BB
2635/*
2636 * ==========================================================================
2637 * Read and write to physical devices
2638 * ==========================================================================
2639 */
98b25418
GW
2640
2641
2642/*
2643 * Issue an I/O to the underlying vdev. Typically the issue pipeline
2644 * stops after this stage and will resume upon I/O completion.
2645 * However, there are instances where the vdev layer may need to
2646 * continue the pipeline when an I/O was not issued. Since the I/O
2647 * that was sent to the vdev layer might be different than the one
2648 * currently active in the pipeline (see vdev_queue_io()), we explicitly
2649 * force the underlying vdev layers to call either zio_execute() or
2650 * zio_interrupt() to ensure that the pipeline continues with the correct I/O.
2651 */
34dc7c2f
BB
2652static int
2653zio_vdev_io_start(zio_t *zio)
2654{
2655 vdev_t *vd = zio->io_vd;
34dc7c2f
BB
2656 uint64_t align;
2657 spa_t *spa = zio->io_spa;
2658
b128c09f
BB
2659 ASSERT(zio->io_error == 0);
2660 ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
34dc7c2f 2661
b128c09f
BB
2662 if (vd == NULL) {
2663 if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
2664 spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
34dc7c2f 2665
b128c09f
BB
2666 /*
2667 * The mirror_ops handle multiple DVAs in a single BP.
2668 */
98b25418
GW
2669 vdev_mirror_ops.vdev_op_io_start(zio);
2670 return (ZIO_PIPELINE_STOP);
34dc7c2f
BB
2671 }
2672
572e2857
BB
2673 /*
2674 * We keep track of time-sensitive I/Os so that the scan thread
2675 * can quickly react to certain workloads. In particular, we care
2676 * about non-scrubbing, top-level reads and writes with the following
2677 * characteristics:
98b25418 2678 * - synchronous writes of user data to non-slog devices
572e2857
BB
2679 * - any reads of user data
2680 * When these conditions are met, adjust the timestamp of spa_last_io
2681 * which allows the scan thread to adjust its workload accordingly.
2682 */
2683 if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
2684 vd == vd->vdev_top && !vd->vdev_islog &&
2685 zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
2686 zio->io_txg != spa_syncing_txg(spa)) {
2687 uint64_t old = spa->spa_last_io;
2688 uint64_t new = ddi_get_lbolt64();
2689 if (old != new)
2690 (void) atomic_cas_64(&spa->spa_last_io, old, new);
2691 }
2692
b128c09f
BB
2693 align = 1ULL << vd->vdev_top->vdev_ashift;
2694
b02fe35d
AR
2695 if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
2696 P2PHASE(zio->io_size, align) != 0) {
2697 /* Transform logical writes to be a full physical block size. */
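		/*
		 * Illustrative arithmetic (hypothetical numbers): on a
		 * top-level vdev with ashift == 12 (align == 4096), a
		 * 1024-byte logical write gets asize = P2ROUNDUP(1024, 4096)
		 * = 4096; the payload is copied into a 4K buffer and the
		 * remaining 3072 bytes are zeroed before the I/O is passed
		 * down to the vdev.
		 */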
34dc7c2f
BB
2698 uint64_t asize = P2ROUNDUP(zio->io_size, align);
2699 char *abuf = zio_buf_alloc(asize);
178e73b3 2700 ASSERT(vd == vd->vdev_top);
34dc7c2f
BB
2701 if (zio->io_type == ZIO_TYPE_WRITE) {
2702 bcopy(zio->io_data, abuf, zio->io_size);
2703 bzero(abuf + zio->io_size, asize - zio->io_size);
2704 }
b128c09f 2705 zio_push_transform(zio, abuf, asize, asize, zio_subblock);
34dc7c2f
BB
2706 }
2707
b02fe35d
AR
2708 /*
2709 * If this is not a physical io, make sure that it is properly aligned
2710 * before proceeding.
2711 */
2712 if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) {
2713 ASSERT0(P2PHASE(zio->io_offset, align));
2714 ASSERT0(P2PHASE(zio->io_size, align));
2715 } else {
2716 /*
2717 * For physical writes, we allow 512b aligned writes and assume
2718 * the device will perform a read-modify-write as necessary.
2719 */
2720 ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE));
2721 ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE));
2722 }
2723
572e2857 2724 VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
fb5f0bc8
BB
2725
2726 /*
2727 * If this is a repair I/O, and there's no self-healing involved --
2728 * that is, we're just resilvering what we expect to resilver --
2729 * then don't do the I/O unless zio's txg is actually in vd's DTL.
2730 * This prevents spurious resilvering with nested replication.
2731 * For example, given a mirror of mirrors, (A+B)+(C+D), if only
2732 * A is out of date, we'll read from C+D, then use the data to
2733 * resilver A+B -- but we don't actually want to resilver B, just A.
2734 * The top-level mirror has no way to know this, so instead we just
2735 * discard unnecessary repairs as we work our way down the vdev tree.
2736 * The same logic applies to any form of nested replication:
2737 * ditto + mirror, RAID-Z + replacing, etc. This covers them all.
2738 */
2739 if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
2740 !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
2741 zio->io_txg != 0 && /* not a delegated i/o */
2742 !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
2743 ASSERT(zio->io_type == ZIO_TYPE_WRITE);
fb5f0bc8
BB
2744 zio_vdev_io_bypass(zio);
2745 return (ZIO_PIPELINE_CONTINUE);
2746 }
34dc7c2f 2747
b128c09f
BB
2748 if (vd->vdev_ops->vdev_op_leaf &&
2749 (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
2750
b0bc7a84 2751 if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio))
d164b209 2752 return (ZIO_PIPELINE_CONTINUE);
b128c09f
BB
2753
2754 if ((zio = vdev_queue_io(zio)) == NULL)
2755 return (ZIO_PIPELINE_STOP);
2756
2757 if (!vdev_accessible(vd, zio)) {
2e528b49 2758 zio->io_error = SET_ERROR(ENXIO);
b128c09f
BB
2759 zio_interrupt(zio);
2760 return (ZIO_PIPELINE_STOP);
2761 }
b128c09f
BB
2762 }
2763
98b25418
GW
2764 vd->vdev_ops->vdev_op_io_start(zio);
2765 return (ZIO_PIPELINE_STOP);
34dc7c2f
BB
2766}
2767
2768static int
2769zio_vdev_io_done(zio_t *zio)
2770{
b128c09f
BB
2771 vdev_t *vd = zio->io_vd;
2772 vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
2773 boolean_t unexpected_error = B_FALSE;
34dc7c2f 2774
b128c09f
BB
2775 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
2776 return (ZIO_PIPELINE_STOP);
34dc7c2f 2777
b128c09f
BB
2778 ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
2779
2780 if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {
2781
2782 vdev_queue_io_done(zio);
2783
2784 if (zio->io_type == ZIO_TYPE_WRITE)
2785 vdev_cache_write(zio);
2786
2787 if (zio_injection_enabled && zio->io_error == 0)
9babb374
BB
2788 zio->io_error = zio_handle_device_injection(vd,
2789 zio, EIO);
b128c09f
BB
2790
2791 if (zio_injection_enabled && zio->io_error == 0)
2792 zio->io_error = zio_handle_label_injection(zio, EIO);
2793
2794 if (zio->io_error) {
2795 if (!vdev_accessible(vd, zio)) {
2e528b49 2796 zio->io_error = SET_ERROR(ENXIO);
b128c09f
BB
2797 } else {
2798 unexpected_error = B_TRUE;
2799 }
2800 }
2801 }
2802
2803 ops->vdev_op_io_done(zio);
34dc7c2f 2804
b128c09f 2805 if (unexpected_error)
d164b209 2806 VERIFY(vdev_probe(vd, zio) == NULL);
34dc7c2f 2807
b128c09f 2808 return (ZIO_PIPELINE_CONTINUE);
34dc7c2f
BB
2809}
2810
428870ff
BB
2811/*
2812 * For non-raidz ZIOs, we can just copy aside the bad data read from the
2813 * disk, and use that to finish the checksum ereport later.
2814 */
2815static void
2816zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
2817 const void *good_buf)
2818{
2819 /* no processing needed */
2820 zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
2821}
2822
2823/*ARGSUSED*/
2824void
2825zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
2826{
2827 void *buf = zio_buf_alloc(zio->io_size);
2828
2829 bcopy(zio->io_data, buf, zio->io_size);
2830
2831 zcr->zcr_cbinfo = zio->io_size;
2832 zcr->zcr_cbdata = buf;
2833 zcr->zcr_finish = zio_vsd_default_cksum_finish;
2834 zcr->zcr_free = zio_buf_free;
2835}
2836
34dc7c2f
BB
2837static int
2838zio_vdev_io_assess(zio_t *zio)
2839{
2840 vdev_t *vd = zio->io_vd;
b128c09f
BB
2841
2842 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
2843 return (ZIO_PIPELINE_STOP);
2844
2845 if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
2846 spa_config_exit(zio->io_spa, SCL_ZIO, zio);
2847
2848 if (zio->io_vsd != NULL) {
428870ff 2849 zio->io_vsd_ops->vsd_free(zio);
b128c09f 2850 zio->io_vsd = NULL;
34dc7c2f
BB
2851 }
2852
b128c09f 2853 if (zio_injection_enabled && zio->io_error == 0)
34dc7c2f
BB
2854 zio->io_error = zio_handle_fault_injection(zio, EIO);
2855
2856 /*
2857 * If the I/O failed, determine whether we should attempt to retry it.
428870ff
BB
2858 *
2859 * On retry, we cut in line in the issue queue, since we don't want
2860 * compression/checksumming/etc. work to prevent our (cheap) IO reissue.
34dc7c2f 2861 */
b128c09f
BB
2862 if (zio->io_error && vd == NULL &&
2863 !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
2864 ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */
2865 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */
34dc7c2f 2866 zio->io_error = 0;
b128c09f
BB
2867 zio->io_flags |= ZIO_FLAG_IO_RETRY |
2868 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
428870ff
BB
2869 zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
2870 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
2871 zio_requeue_io_start_cut_in_line);
b128c09f 2872 return (ZIO_PIPELINE_STOP);
34dc7c2f
BB
2873 }
2874
b128c09f
BB
2875 /*
2876 * If we got an error on a leaf device, convert it to ENXIO
2877 * if the device is not accessible at all.
2878 */
2879 if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
2880 !vdev_accessible(vd, zio))
2e528b49 2881 zio->io_error = SET_ERROR(ENXIO);
b128c09f
BB
2882
2883 /*
2884 * If we can't write to an interior vdev (mirror or RAID-Z),
2885 * set vdev_cant_write so that we stop trying to allocate from it.
2886 */
2887 if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
13fe0198 2888 vd != NULL && !vd->vdev_ops->vdev_op_leaf) {
b128c09f 2889 vd->vdev_cant_write = B_TRUE;
13fe0198 2890 }
b128c09f
BB
2891
2892 if (zio->io_error)
2893 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2894
e8b96c60
MA
2895 if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
2896 zio->io_physdone != NULL) {
2897 ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
2898 ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
2899 zio->io_physdone(zio->io_logical);
2900 }
2901
34dc7c2f
BB
2902 return (ZIO_PIPELINE_CONTINUE);
2903}
2904
2905void
2906zio_vdev_io_reissue(zio_t *zio)
2907{
2908 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
2909 ASSERT(zio->io_error == 0);
2910
428870ff 2911 zio->io_stage >>= 1;
34dc7c2f
BB
2912}
2913
2914void
2915zio_vdev_io_redone(zio_t *zio)
2916{
2917 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
2918
428870ff 2919 zio->io_stage >>= 1;
34dc7c2f
BB
2920}
2921
2922void
2923zio_vdev_io_bypass(zio_t *zio)
2924{
2925 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
2926 ASSERT(zio->io_error == 0);
2927
2928 zio->io_flags |= ZIO_FLAG_IO_BYPASS;
428870ff 2929 zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
34dc7c2f
BB
2930}
2931
2932/*
2933 * ==========================================================================
2934 * Generate and verify checksums
2935 * ==========================================================================
2936 */
2937static int
2938zio_checksum_generate(zio_t *zio)
2939{
34dc7c2f 2940 blkptr_t *bp = zio->io_bp;
b128c09f 2941 enum zio_checksum checksum;
34dc7c2f 2942
b128c09f
BB
2943 if (bp == NULL) {
2944 /*
2945 * This is zio_write_phys().
2946 * We're either generating a label checksum, or none at all.
2947 */
2948 checksum = zio->io_prop.zp_checksum;
34dc7c2f 2949
b128c09f
BB
2950 if (checksum == ZIO_CHECKSUM_OFF)
2951 return (ZIO_PIPELINE_CONTINUE);
2952
2953 ASSERT(checksum == ZIO_CHECKSUM_LABEL);
2954 } else {
2955 if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
2956 ASSERT(!IO_IS_ALLOCATING(zio));
2957 checksum = ZIO_CHECKSUM_GANG_HEADER;
2958 } else {
2959 checksum = BP_GET_CHECKSUM(bp);
2960 }
2961 }
34dc7c2f 2962
b128c09f 2963 zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size);
34dc7c2f
BB
2964
2965 return (ZIO_PIPELINE_CONTINUE);
2966}
2967
2968static int
b128c09f 2969zio_checksum_verify(zio_t *zio)
34dc7c2f 2970{
428870ff 2971 zio_bad_cksum_t info;
b128c09f
BB
2972 blkptr_t *bp = zio->io_bp;
2973 int error;
34dc7c2f 2974
428870ff
BB
2975 ASSERT(zio->io_vd != NULL);
2976
b128c09f
BB
2977 if (bp == NULL) {
2978 /*
2979 * This is zio_read_phys().
2980 * We're either verifying a label checksum, or nothing at all.
2981 */
2982 if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
2983 return (ZIO_PIPELINE_CONTINUE);
34dc7c2f 2984
b128c09f
BB
2985 ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
2986 }
34dc7c2f 2987
428870ff 2988 if ((error = zio_checksum_error(zio, &info)) != 0) {
b128c09f 2989 zio->io_error = error;
7a3066ff
MA
2990 if (error == ECKSUM &&
2991 !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
428870ff
BB
2992 zfs_ereport_start_checksum(zio->io_spa,
2993 zio->io_vd, zio, zio->io_offset,
2994 zio->io_size, NULL, &info);
b128c09f 2995 }
34dc7c2f
BB
2996 }
2997
2998 return (ZIO_PIPELINE_CONTINUE);
2999}
3000
3001/*
3002 * Called by RAID-Z to ensure we don't compute the checksum twice.
3003 */
3004void
3005zio_checksum_verified(zio_t *zio)
3006{
428870ff 3007 zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
34dc7c2f
BB
3008}
3009
3010/*
b128c09f
BB
3011 * ==========================================================================
3012 * Error rank. Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
9b67f605 3013 * An error of 0 indicates success. ENXIO indicates whole-device failure,
b128c09f
BB
3014 * which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO
3015 * indicate errors that are specific to one I/O, and most likely permanent.
3016 * Any other error is presumed to be worse because we weren't expecting it.
3017 * ==========================================================================
34dc7c2f 3018 */
b128c09f
BB
3019int
3020zio_worst_error(int e1, int e2)
34dc7c2f 3021{
b128c09f
BB
3022 static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
3023 int r1, r2;
3024
3025 for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++)
3026 if (e1 == zio_error_rank[r1])
3027 break;
34dc7c2f 3028
b128c09f
BB
3029 for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++)
3030 if (e2 == zio_error_rank[r2])
3031 break;
3032
3033 return (r1 > r2 ? e1 : e2);
34dc7c2f
BB
3034}
3035
3036/*
3037 * ==========================================================================
b128c09f 3038 * I/O completion
34dc7c2f
BB
3039 * ==========================================================================
3040 */
b128c09f
BB
3041static int
3042zio_ready(zio_t *zio)
34dc7c2f 3043{
b128c09f 3044 blkptr_t *bp = zio->io_bp;
d164b209 3045 zio_t *pio, *pio_next;
34dc7c2f 3046
428870ff
BB
3047 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
3048 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
9babb374 3049 return (ZIO_PIPELINE_STOP);
34dc7c2f 3050
9babb374 3051 if (zio->io_ready) {
b128c09f 3052 ASSERT(IO_IS_ALLOCATING(zio));
03c6040b
GW
3053 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
3054 (zio->io_flags & ZIO_FLAG_NOPWRITE));
b128c09f 3055 ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
34dc7c2f 3056
b128c09f
BB
3057 zio->io_ready(zio);
3058 }
34dc7c2f 3059
b128c09f
BB
3060 if (bp != NULL && bp != &zio->io_bp_copy)
3061 zio->io_bp_copy = *bp;
34dc7c2f 3062
b128c09f
BB
3063 if (zio->io_error)
3064 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
34dc7c2f 3065
d164b209
BB
3066 mutex_enter(&zio->io_lock);
3067 zio->io_state[ZIO_WAIT_READY] = 1;
3068 pio = zio_walk_parents(zio);
3069 mutex_exit(&zio->io_lock);
3070
3071 /*
3072 * As we notify zio's parents, new parents could be added.
3073 * New parents go to the head of zio's io_parent_list, however,
3074 * so we will (correctly) not notify them. The remainder of zio's
3075 * io_parent_list, from 'pio_next' onward, cannot change because
3076 * all parents must wait for us to be done before they can be done.
3077 */
3078 for (; pio != NULL; pio = pio_next) {
3079 pio_next = zio_walk_parents(zio);
b128c09f 3080 zio_notify_parent(pio, zio, ZIO_WAIT_READY);
d164b209 3081 }
34dc7c2f 3082
428870ff
BB
3083 if (zio->io_flags & ZIO_FLAG_NODATA) {
3084 if (BP_IS_GANG(bp)) {
3085 zio->io_flags &= ~ZIO_FLAG_NODATA;
3086 } else {
3087 ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE);
3088 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
3089 }
3090 }
3091
3092 if (zio_injection_enabled &&
3093 zio->io_spa->spa_syncing_txg == zio->io_txg)
3094 zio_handle_ignored_writes(zio);
3095
b128c09f 3096 return (ZIO_PIPELINE_CONTINUE);
34dc7c2f
BB
3097}
3098
b128c09f
BB
3099static int
3100zio_done(zio_t *zio)
34dc7c2f 3101{
d164b209 3102 zio_t *pio, *pio_next;
d6320ddb 3103 int c, w;
34dc7c2f 3104
b128c09f 3105 /*
9babb374 3106 * If our children haven't all completed,
b128c09f
BB
3107 * wait for them and then repeat this pipeline stage.
3108 */
3109 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
3110 zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
428870ff 3111 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) ||
b128c09f
BB
3112 zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
3113 return (ZIO_PIPELINE_STOP);
34dc7c2f 3114
d6320ddb
BB
3115 for (c = 0; c < ZIO_CHILD_TYPES; c++)
3116 for (w = 0; w < ZIO_WAIT_TYPES; w++)
b128c09f
BB
3117 ASSERT(zio->io_children[c][w] == 0);
3118
9b67f605 3119 if (zio->io_bp != NULL && !BP_IS_EMBEDDED(zio->io_bp)) {
c776b317
BB
3120 ASSERT(zio->io_bp->blk_pad[0] == 0);
3121 ASSERT(zio->io_bp->blk_pad[1] == 0);
d1d7e268
MK
3122 ASSERT(bcmp(zio->io_bp, &zio->io_bp_copy,
3123 sizeof (blkptr_t)) == 0 ||
c776b317
BB
3124 (zio->io_bp == zio_unique_parent(zio)->io_bp));
3125 if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(zio->io_bp) &&
428870ff 3126 zio->io_bp_override == NULL &&
b128c09f 3127 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
c776b317 3128 ASSERT(!BP_SHOULD_BYTESWAP(zio->io_bp));
d1d7e268
MK
3129 ASSERT3U(zio->io_prop.zp_copies, <=,
3130 BP_GET_NDVAS(zio->io_bp));
c776b317 3131 ASSERT(BP_COUNT_GANG(zio->io_bp) == 0 ||
d1d7e268
MK
3132 (BP_COUNT_GANG(zio->io_bp) ==
3133 BP_GET_NDVAS(zio->io_bp)));
b128c09f 3134 }
03c6040b
GW
3135 if (zio->io_flags & ZIO_FLAG_NOPWRITE)
3136 VERIFY(BP_EQUAL(zio->io_bp, &zio->io_bp_orig));
b128c09f
BB
3137 }
3138
3139 /*
428870ff 3140 * If there were child vdev/gang/ddt errors, they apply to us now.
b128c09f
BB
3141 */
3142 zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
3143 zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
428870ff
BB
3144 zio_inherit_child_errors(zio, ZIO_CHILD_DDT);
3145
3146 /*
3147 * If the I/O on the transformed data was successful, generate any
3148 * checksum reports now while we still have the transformed data.
3149 */
3150 if (zio->io_error == 0) {
3151 while (zio->io_cksum_report != NULL) {
3152 zio_cksum_report_t *zcr = zio->io_cksum_report;
3153 uint64_t align = zcr->zcr_align;
c776b317 3154 uint64_t asize = P2ROUNDUP(zio->io_size, align);
428870ff
BB
3155 char *abuf = zio->io_data;
3156
c776b317 3157 if (asize != zio->io_size) {
428870ff 3158 abuf = zio_buf_alloc(asize);
c776b317 3159 bcopy(zio->io_data, abuf, zio->io_size);
d1d7e268 3160 bzero(abuf+zio->io_size, asize-zio->io_size);
428870ff
BB
3161 }
3162
3163 zio->io_cksum_report = zcr->zcr_next;
3164 zcr->zcr_next = NULL;
3165 zcr->zcr_finish(zcr, abuf);
3166 zfs_ereport_free_checksum(zcr);
3167
c776b317 3168 if (asize != zio->io_size)
428870ff
BB
3169 zio_buf_free(abuf, asize);
3170 }
3171 }
b128c09f
BB
3172
3173 zio_pop_transforms(zio); /* note: may set zio->io_error */
3174
c776b317 3175 vdev_stat_update(zio, zio->io_size);
b128c09f 3176
a69052be 3177 /*
cc92e9d0 3178 * If this I/O is attached to a particular vdev and is slow, exceeding
72f53c56
MJ
3179 * zio_delay_max (30 seconds by default) to complete, post an error describing the I/O delay.
3180 * We ignore these errors if the device is currently unavailable.
a69052be 3181 */
cc92e9d0 3182 if (zio->io_delay >= MSEC_TO_TICK(zio_delay_max)) {
72f53c56
MJ
3183 if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd))
3184 zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa,
d1d7e268 3185 zio->io_vd, zio, 0, 0);
72f53c56 3186 }
a69052be 3187
b128c09f
BB
3188 if (zio->io_error) {
3189 /*
3190 * If this I/O is attached to a particular vdev,
3191 * generate an error message describing the I/O failure
3192 * at the block level. We ignore these errors if the
3193 * device is currently unavailable.
3194 */
c776b317
BB
3195 if (zio->io_error != ECKSUM && zio->io_vd != NULL &&
3196 !vdev_is_dead(zio->io_vd))
3197 zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa,
3198 zio->io_vd, zio, 0, 0);
34dc7c2f 3199
428870ff
BB
3200 if ((zio->io_error == EIO || !(zio->io_flags &
3201 (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
c776b317 3202 zio == zio->io_logical) {
b128c09f
BB
3203 /*
3204 * For logical I/O requests, tell the SPA to log the
3205 * error and generate a logical data ereport.
3206 */
c776b317 3207 spa_log_error(zio->io_spa, zio);
d1d7e268
MK
3208 zfs_ereport_post(FM_EREPORT_ZFS_DATA, zio->io_spa,
3209 NULL, zio, 0, 0);
b128c09f
BB
3210 }
3211 }
34dc7c2f 3212
c776b317 3213 if (zio->io_error && zio == zio->io_logical) {
b128c09f
BB
3214 /*
3215 * Determine whether zio should be reexecuted. This will
3216 * propagate all the way to the root via zio_notify_parent().
3217 */
c776b317 3218 ASSERT(zio->io_vd == NULL && zio->io_bp != NULL);
428870ff 3219 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
b128c09f 3220
428870ff
BB
3221 if (IO_IS_ALLOCATING(zio) &&
3222 !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
b128c09f
BB
3223 if (zio->io_error != ENOSPC)
3224 zio->io_reexecute |= ZIO_REEXECUTE_NOW;
3225 else
3226 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
428870ff 3227 }
b128c09f
BB
3228
3229 if ((zio->io_type == ZIO_TYPE_READ ||
3230 zio->io_type == ZIO_TYPE_FREE) &&
572e2857 3231 !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
b128c09f 3232 zio->io_error == ENXIO &&
c776b317
BB
3233 spa_load_state(zio->io_spa) == SPA_LOAD_NONE &&
3234 spa_get_failmode(zio->io_spa) != ZIO_FAILURE_MODE_CONTINUE)
b128c09f
BB
3235 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
3236
3237 if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
3238 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
428870ff
BB
3239
3240 /*
3241 * Here is a possibly good place to attempt to do
3242 * either combinatorial reconstruction or error correction
3243 * based on checksums. It also might be a good place
3244 * to send out preliminary ereports before we suspend
3245 * processing.
3246 */
34dc7c2f
BB
3247 }
3248
3249 /*
b128c09f
BB
3250 * If there were logical child errors, they apply to us now.
3251 * We defer this until now to avoid conflating logical child
3252 * errors with errors that happened to the zio itself when
3253 * updating vdev stats and reporting FMA events above.
34dc7c2f 3254 */
b128c09f 3255 zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
34dc7c2f 3256
428870ff
BB
3257 if ((zio->io_error || zio->io_reexecute) &&
3258 IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
03c6040b 3259 !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
c776b317 3260 zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp);
9babb374
BB
3261
3262 zio_gang_tree_free(&zio->io_gang_tree);
3263
3264 /*
3265 * Godfather I/Os should never suspend.
3266 */
3267 if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
3268 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
3269 zio->io_reexecute = 0;
3270
b128c09f
BB
3271 if (zio->io_reexecute) {
3272 /*
3273 * This is a logical I/O that wants to reexecute.
3274 *
3275 * Reexecute is top-down. When an i/o fails, if it's not
3276 * the root, it simply notifies its parent and sticks around.
3277 * The parent, seeing that it still has children in zio_done(),
3278 * does the same. This percolates all the way up to the root.
3279 * The root i/o will reexecute or suspend the entire tree.
3280 *
3281 * This approach ensures that zio_reexecute() honors
3282 * all the original i/o dependency relationships, e.g.
3283 * parents not executing until children are ready.
3284 */
3285 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
34dc7c2f 3286
9babb374 3287 zio->io_gang_leader = NULL;
b128c09f 3288
d164b209
BB
3289 mutex_enter(&zio->io_lock);
3290 zio->io_state[ZIO_WAIT_DONE] = 1;
3291 mutex_exit(&zio->io_lock);
3292
9babb374
BB
3293 /*
3294 * "The Godfather" I/O monitors its children but is
 3295 * not a true parent to them. It tracks them through
3296 * the pipeline but severs its ties whenever they get into
3297 * trouble (e.g. suspended). This allows "The Godfather"
3298 * I/O to return status without blocking.
3299 */
3300 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
3301 zio_link_t *zl = zio->io_walk_link;
3302 pio_next = zio_walk_parents(zio);
3303
3304 if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
3305 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
3306 zio_remove_child(pio, zio, zl);
3307 zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
3308 }
3309 }
3310
d164b209 3311 if ((pio = zio_unique_parent(zio)) != NULL) {
b128c09f
BB
3312 /*
3313 * We're not a root i/o, so there's nothing to do
3314 * but notify our parent. Don't propagate errors
3315 * upward since we haven't permanently failed yet.
3316 */
9babb374 3317 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
b128c09f
BB
3318 zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
3319 zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
3320 } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
3321 /*
3322 * We'd fail again if we reexecuted now, so suspend
3323 * until conditions improve (e.g. device comes online).
3324 */
c776b317 3325 zio_suspend(zio->io_spa, zio);
b128c09f
BB
3326 } else {
3327 /*
3328 * Reexecution is potentially a huge amount of work.
3329 * Hand it off to the otherwise-unused claim taskq.
3330 */
a38718a6 3331 ASSERT(taskq_empty_ent(&zio->io_tqent));
7ef5e54e
AL
3332 spa_taskq_dispatch_ent(zio->io_spa,
3333 ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE,
a38718a6
GA
3334 (task_func_t *)zio_reexecute, zio, 0,
3335 &zio->io_tqent);
b128c09f
BB
3336 }
3337 return (ZIO_PIPELINE_STOP);
34dc7c2f
BB
3338 }
3339
428870ff 3340 ASSERT(zio->io_child_count == 0);
b128c09f
BB
3341 ASSERT(zio->io_reexecute == 0);
3342 ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
34dc7c2f 3343
428870ff
BB
3344 /*
3345 * Report any checksum errors, since the I/O is complete.
3346 */
3347 while (zio->io_cksum_report != NULL) {
3348 zio_cksum_report_t *zcr = zio->io_cksum_report;
3349 zio->io_cksum_report = zcr->zcr_next;
3350 zcr->zcr_next = NULL;
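		/*
		 * Reports still pending here were not handled above, so no
		 * verified data buffer is available; pass NULL.
		 */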
3351 zcr->zcr_finish(zcr, NULL);
3352 zfs_ereport_free_checksum(zcr);
3353 }
3354
920dd524 3355 if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp &&
9b67f605
MA
3356 !BP_IS_HOLE(zio->io_bp) && !BP_IS_EMBEDDED(zio->io_bp) &&
3357 !(zio->io_flags & ZIO_FLAG_NOPWRITE)) {
920dd524
ED
3358 metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp);
3359 }
3360
d164b209
BB
3361 /*
3362 * It is the responsibility of the done callback to ensure that this
3363 * particular zio is no longer discoverable for adoption, and as
3364 * such, cannot acquire any new parents.
3365 */
b128c09f
BB
3366 if (zio->io_done)
3367 zio->io_done(zio);
34dc7c2f 3368
d164b209
BB
3369 mutex_enter(&zio->io_lock);
3370 zio->io_state[ZIO_WAIT_DONE] = 1;
3371 mutex_exit(&zio->io_lock);
34dc7c2f 3372
d164b209
BB
3373 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
3374 zio_link_t *zl = zio->io_walk_link;
3375 pio_next = zio_walk_parents(zio);
3376 zio_remove_child(pio, zio, zl);
b128c09f
BB
3377 zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
3378 }
34dc7c2f 3379
b128c09f
BB
3380 if (zio->io_waiter != NULL) {
3381 mutex_enter(&zio->io_lock);
3382 zio->io_executor = NULL;
3383 cv_broadcast(&zio->io_cv);
3384 mutex_exit(&zio->io_lock);
3385 } else {
3386 zio_destroy(zio);
3387 }
34dc7c2f 3388
b128c09f 3389 return (ZIO_PIPELINE_STOP);
34dc7c2f
BB
3390}
3391
3392/*
b128c09f
BB
3393 * ==========================================================================
3394 * I/O pipeline definition
3395 * ==========================================================================
34dc7c2f 3396 */
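/*
 * Each entry below corresponds, in order, to a ZIO_STAGE_* bit; the leading
 * NULL is the placeholder for ZIO_STAGE_OPEN, which needs no stage function.
 */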
428870ff 3397static zio_pipe_stage_t *zio_pipeline[] = {
b128c09f 3398 NULL,
b128c09f 3399 zio_read_bp_init,
428870ff
BB
3400 zio_free_bp_init,
3401 zio_issue_async,
b128c09f
BB
3402 zio_write_bp_init,
3403 zio_checksum_generate,
03c6040b 3404 zio_nop_write,
428870ff
BB
3405 zio_ddt_read_start,
3406 zio_ddt_read_done,
3407 zio_ddt_write,
3408 zio_ddt_free,
b128c09f
BB
3409 zio_gang_assemble,
3410 zio_gang_issue,
3411 zio_dva_allocate,
3412 zio_dva_free,
3413 zio_dva_claim,
3414 zio_ready,
3415 zio_vdev_io_start,
3416 zio_vdev_io_done,
3417 zio_vdev_io_assess,
3418 zio_checksum_verify,
3419 zio_done
3420};
c28b2279 3421
9ae529ec
CS
3422/* dnp is the dnode for zb1->zb_object */
3423boolean_t
5dbd68a3
MA
3424zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_phys_t *zb1,
3425 const zbookmark_phys_t *zb2)
9ae529ec
CS
3426{
3427 uint64_t zb1nextL0, zb2thisobj;
3428
3429 ASSERT(zb1->zb_objset == zb2->zb_objset);
3430 ASSERT(zb2->zb_level == 0);
3431
9ae529ec
CS
3432 /* The objset_phys_t isn't before anything. */
3433 if (dnp == NULL)
3434 return (B_FALSE);
3435
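	/*
	 * First level-0 blkid that lies beyond the subtree rooted at zb1.
	 */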
3436 zb1nextL0 = (zb1->zb_blkid + 1) <<
3437 ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
3438
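	/*
	 * Object that zb2 refers to; a meta-dnode bookmark (zb_object == 0)
	 * maps to the first object covered by that dnode block.
	 */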
3439 zb2thisobj = zb2->zb_object ? zb2->zb_object :
3440 zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
3441
3442 if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
3443 uint64_t nextobj = zb1nextL0 *
3444 (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
3445 return (nextobj <= zb2thisobj);
3446 }
3447
3448 if (zb1->zb_object < zb2thisobj)
3449 return (B_TRUE);
3450 if (zb1->zb_object > zb2thisobj)
3451 return (B_FALSE);
3452 if (zb2->zb_object == DMU_META_DNODE_OBJECT)
3453 return (B_FALSE);
3454 return (zb1nextL0 <= zb2->zb_blkid);
3455}
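
/*
 * Illustrative sketch (hypothetical helper): a traversal that has saved a
 * level-0 resume bookmark can use zbookmark_is_before() to decide whether a
 * block was already visited.  Here "dnp" is the dnode for zb's object and
 * "resume" must be a level-0 bookmark, matching the contract of
 * zbookmark_is_before() above.
 */
static boolean_t
zio_example_block_already_visited(const dnode_phys_t *dnp,
    const zbookmark_phys_t *zb, const zbookmark_phys_t *resume)
{
	/* A block that sorts before the resume point was already covered. */
	return (zbookmark_is_before(dnp, zb, resume));
}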
3456
c28b2279 3457#if defined(_KERNEL) && defined(HAVE_SPL)
c28b2279 3458EXPORT_SYMBOL(zio_type_name);
81971b13
BB
3459EXPORT_SYMBOL(zio_buf_alloc);
3460EXPORT_SYMBOL(zio_data_buf_alloc);
3461EXPORT_SYMBOL(zio_buf_free);
3462EXPORT_SYMBOL(zio_data_buf_free);
c28b2279 3463
a69052be 3464module_param(zio_delay_max, int, 0644);
c409e464
BB
3465MODULE_PARM_DESC(zio_delay_max, "Max zio delay in milliseconds before posting an event");
3466
3467module_param(zio_requeue_io_start_cut_in_line, int, 0644);
3468MODULE_PARM_DESC(zio_requeue_io_start_cut_in_line, "Prioritize requeued I/O");
29dee3ee
CP
3469
3470module_param(zfs_sync_pass_deferred_free, int, 0644);
3471MODULE_PARM_DESC(zfs_sync_pass_deferred_free,
d1d7e268 3472 "Defer frees starting in this pass");
29dee3ee
CP
3473
3474module_param(zfs_sync_pass_dont_compress, int, 0644);
3475MODULE_PARM_DESC(zfs_sync_pass_dont_compress,
d1d7e268 3476 "Don't compress starting in this pass");
29dee3ee
CP
3477
3478module_param(zfs_sync_pass_rewrite, int, 0644);
3479MODULE_PARM_DESC(zfs_sync_pass_rewrite,
d1d7e268 3480 "Rewrite new bps starting in this pass");
c28b2279 3481#endif
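
/*
 * Illustrative usage (standard Linux module-parameter handling is assumed):
 * the tunables above can be set at module load time, e.g.
 *
 *     modprobe zfs zio_delay_max=10000
 *
 * or, since they are registered with mode 0644, adjusted at runtime via
 *
 *     echo 10000 > /sys/module/zfs/parameters/zio_delay_max
 */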