/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */
#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/ddt.h>
/*
 * ==========================================================================
 * I/O priority table
 * ==========================================================================
 */
uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
    0,  /* ZIO_PRIORITY_NOW */
    0,  /* ZIO_PRIORITY_SYNC_READ */
    0,  /* ZIO_PRIORITY_SYNC_WRITE */
    0,  /* ZIO_PRIORITY_LOG_WRITE */
    1,  /* ZIO_PRIORITY_CACHE_FILL */
    1,  /* ZIO_PRIORITY_AGG */
    4,  /* ZIO_PRIORITY_FREE */
    4,  /* ZIO_PRIORITY_ASYNC_WRITE */
    6,  /* ZIO_PRIORITY_ASYNC_READ */
    10, /* ZIO_PRIORITY_RESILVER */
    20, /* ZIO_PRIORITY_SCRUB */
    2,  /* ZIO_PRIORITY_DDT_PREFETCH */
};
/*
 * ==========================================================================
 * I/O type descriptions
 * ==========================================================================
 */
char *zio_type_name[ZIO_TYPES] = {
    "zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
    "zio_ioctl"
};
/*
 * ==========================================================================
 * I/O kmem caches
 * ==========================================================================
 */
kmem_cache_t *zio_cache;
kmem_cache_t *zio_link_cache;
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];

#ifdef _KERNEL
extern vmem_t *zio_alloc_arena;
#endif
/*
 * An allocating zio is one that either currently has the DVA allocate
 * stage set or will have it later in its lifetime.
 */
#define	IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)

boolean_t zio_requeue_io_start_cut_in_line = B_TRUE;

#ifdef ZFS_DEBUG
int zio_buf_debug_limit = 16384;
#else
int zio_buf_debug_limit = 0;
#endif
void
zio_init(void)
{
    size_t c;
    vmem_t *data_alloc_arena = NULL;

#ifdef _KERNEL
    data_alloc_arena = zio_alloc_arena;
#endif
    zio_cache = kmem_cache_create("zio_cache",
        sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
    zio_link_cache = kmem_cache_create("zio_link_cache",
        sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);

    /*
     * For small buffers, we want a cache for each multiple of
     * SPA_MINBLOCKSIZE.  For medium-size buffers, we want a cache
     * for each quarter-power of 2.  For large buffers, we want
     * a cache for each multiple of PAGESIZE.
     */
    for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
        size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
        size_t p2 = size;
        size_t align = 0;

        while (p2 & (p2 - 1))
            p2 &= p2 - 1;

        if (size <= 4 * SPA_MINBLOCKSIZE) {
            align = SPA_MINBLOCKSIZE;
        } else if (P2PHASE(size, PAGESIZE) == 0) {
            align = PAGESIZE;
        } else if (P2PHASE(size, p2 >> 2) == 0) {
            align = p2 >> 2;
        }

        if (align != 0) {
            char name[36];
            (void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
            zio_buf_cache[c] = kmem_cache_create(name, size,
                align, NULL, NULL, NULL, NULL, NULL,
                size > zio_buf_debug_limit ? KMC_NODEBUG : 0);

            (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
            zio_data_buf_cache[c] = kmem_cache_create(name, size,
                align, NULL, NULL, NULL, NULL, data_alloc_arena,
                size > zio_buf_debug_limit ? KMC_NODEBUG : 0);
        }
    }

    while (--c != 0) {
        ASSERT(zio_buf_cache[c] != NULL);
        if (zio_buf_cache[c - 1] == NULL)
            zio_buf_cache[c - 1] = zio_buf_cache[c];

        ASSERT(zio_data_buf_cache[c] != NULL);
        if (zio_data_buf_cache[c - 1] == NULL)
            zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
    }

    zio_inject_init();
}
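
/*
 * Illustrative sketch, not part of the original file: a hypothetical helper
 * that replays the alignment rules above for one buffer size, assuming the
 * usual SPA_MINBLOCKSIZE == 512 and PAGESIZE == 4096.  For example, 14336
 * (14K) is not page-aligned, but its largest power of two is 8192 and 14336
 * is a multiple of 8192 >> 2 == 2048, so the "zio_buf_14336" cache would be
 * created with 2K alignment.
 */
#ifdef ZIO_SIZE_CLASS_EXAMPLE
static size_t
zio_cache_align_example(size_t size)
{
    size_t p2 = size;

    while (p2 & (p2 - 1))               /* clear low bits: power of 2 <= size */
        p2 &= p2 - 1;

    if (size <= 4 * SPA_MINBLOCKSIZE)
        return (SPA_MINBLOCKSIZE);      /* small: every 512-byte multiple */
    if (P2PHASE(size, PAGESIZE) == 0)
        return (PAGESIZE);              /* large: page multiples */
    if (P2PHASE(size, p2 >> 2) == 0)
        return (p2 >> 2);               /* medium: quarter-power of 2 */
    return (0);                         /* no dedicated cache for this size */
}
#endif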
void
zio_fini(void)
{
    size_t c;
    kmem_cache_t *last_cache = NULL;
    kmem_cache_t *last_data_cache = NULL;

    for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
        if (zio_buf_cache[c] != last_cache) {
            last_cache = zio_buf_cache[c];
            kmem_cache_destroy(zio_buf_cache[c]);
        }
        zio_buf_cache[c] = NULL;

        if (zio_data_buf_cache[c] != last_data_cache) {
            last_data_cache = zio_data_buf_cache[c];
            kmem_cache_destroy(zio_data_buf_cache[c]);
        }
        zio_data_buf_cache[c] = NULL;
    }

    kmem_cache_destroy(zio_link_cache);
    kmem_cache_destroy(zio_cache);

    zio_inject_fini();
}
/*
 * ==========================================================================
 * Allocate and free I/O buffers
 * ==========================================================================
 */

/*
 * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
 * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
 * useful to inspect ZFS metadata, but if possible, we should avoid keeping
 * excess / transient data in-core during a crashdump.
 */
void *
zio_buf_alloc(size_t size)
{
    size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

    ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

    return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
}

/*
 * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
 * crashdump if the kernel panics.  This exists so that we will limit the
 * amount of ZFS data that shows up in a kernel crashdump.  (Thus reducing
 * the amount of kernel heap dumped to disk when the kernel panics.)
 */
void *
zio_data_buf_alloc(size_t size)
{
    size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

    ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

    return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
}

void
zio_buf_free(void *buf, size_t size)
{
    size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

    ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

    kmem_cache_free(zio_buf_cache[c], buf);
}

void
zio_data_buf_free(void *buf, size_t size)
{
    size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

    ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

    kmem_cache_free(zio_data_buf_cache[c], buf);
}
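
/*
 * Illustrative usage, not part of the original file: callers must pair each
 * alloc with a free of the same size, since the size selects the kmem cache.
 * The buffers below are hypothetical.
 */
#ifdef ZIO_BUF_EXAMPLE
static void
zio_buf_example(void)
{
    void *mdbuf = zio_buf_alloc(SPA_MAXBLOCKSIZE);      /* metadata: dumped */
    void *dbuf = zio_data_buf_alloc(SPA_MAXBLOCKSIZE);  /* data: not dumped */

    /* ... use the buffers ... */

    zio_buf_free(mdbuf, SPA_MAXBLOCKSIZE);
    zio_data_buf_free(dbuf, SPA_MAXBLOCKSIZE);
}
#endif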
/*
 * ==========================================================================
 * Push and pop I/O transform buffers
 * ==========================================================================
 */
static void
zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
    zio_transform_func_t *transform)
{
    zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);

    zt->zt_orig_data = zio->io_data;
    zt->zt_orig_size = zio->io_size;
    zt->zt_bufsize = bufsize;
    zt->zt_transform = transform;

    zt->zt_next = zio->io_transform_stack;
    zio->io_transform_stack = zt;

    zio->io_data = data;
    zio->io_size = size;
}
static void
zio_pop_transforms(zio_t *zio)
{
    zio_transform_t *zt;

    while ((zt = zio->io_transform_stack) != NULL) {
        if (zt->zt_transform != NULL)
            zt->zt_transform(zio,
                zt->zt_orig_data, zt->zt_orig_size);

        if (zt->zt_bufsize != 0)
            zio_buf_free(zio->io_data, zt->zt_bufsize);

        zio->io_data = zt->zt_orig_data;
        zio->io_size = zt->zt_orig_size;
        zio->io_transform_stack = zt->zt_next;

        kmem_free(zt, sizeof (zio_transform_t));
    }
}
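
/*
 * Illustrative sketch, not part of the original file: the transform stack is
 * LIFO.  A stage that replaces the in-flight buffer pushes the new buffer
 * together with a callback that knows how to fill the original one, and
 * zio_pop_transforms() unwinds in reverse order.  The no-op copy transform
 * below is hypothetical.
 */
#ifdef ZIO_TRANSFORM_EXAMPLE
/* A trivial transform: copy the staged bytes back into the original buffer. */
static void
zio_copy_transform(zio_t *zio, void *data, uint64_t size)
{
    bcopy(zio->io_data, data, size);
}

static void
zio_transform_example(zio_t *zio, uint64_t size)
{
    void *scratch = zio_buf_alloc(size);

    /* Stage 'scratch'; zio_pop_transforms() will copy back and free it. */
    zio_push_transform(zio, scratch, size, size, zio_copy_transform);
}
#endif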
/*
 * ==========================================================================
 * I/O transform callbacks for subblocks and decompression
 * ==========================================================================
 */
static void
zio_subblock(zio_t *zio, void *data, uint64_t size)
{
    ASSERT(zio->io_size > size);

    if (zio->io_type == ZIO_TYPE_READ)
        bcopy(zio->io_data, data, size);
}

static void
zio_decompress(zio_t *zio, void *data, uint64_t size)
{
    if (zio->io_error == 0 &&
        zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
        zio->io_data, data, zio->io_size, size) != 0)
        zio->io_error = EIO;
}
/*
 * ==========================================================================
 * I/O parent/child relationships and pipeline interlocks
 * ==========================================================================
 *
 * NOTE - Callers to zio_walk_parents() and zio_walk_children() must
 * continue calling these functions until they return NULL.
 * Otherwise, the next caller will pick up the list walk in
 * some indeterminate state.  (Otherwise every caller would
 * have to pass in a cookie to keep the state represented by
 * io_walk_link, which gets annoying.)
 */
zio_t *
zio_walk_parents(zio_t *cio)
{
    zio_link_t *zl = cio->io_walk_link;
    list_t *pl = &cio->io_parent_list;

    zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
    cio->io_walk_link = zl;

    if (zl == NULL)
        return (NULL);

    ASSERT(zl->zl_child == cio);
    return (zl->zl_parent);
}

zio_t *
zio_walk_children(zio_t *pio)
{
    zio_link_t *zl = pio->io_walk_link;
    list_t *cl = &pio->io_child_list;

    zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
    pio->io_walk_link = zl;

    if (zl == NULL)
        return (NULL);

    ASSERT(zl->zl_parent == pio);
    return (zl->zl_child);
}
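
/*
 * Illustrative sketch, not part of the original file: the walker pattern the
 * NOTE above requires.  The cursor lives in io_walk_link, so every walk must
 * be driven to NULL or the next caller resumes mid-list.
 */
#ifdef ZIO_WALK_EXAMPLE
static int
zio_count_parents_example(zio_t *cio)
{
    zio_t *pio;
    int n = 0;

    /* Always run the walk to completion so io_walk_link is reset. */
    while ((pio = zio_walk_parents(cio)) != NULL)
        n++;

    return (n);
}
#endif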
zio_t *
zio_unique_parent(zio_t *cio)
{
    zio_t *pio = zio_walk_parents(cio);

    VERIFY(zio_walk_parents(cio) == NULL);
    return (pio);
}
void
zio_add_child(zio_t *pio, zio_t *cio)
{
    zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
    int w;

    /*
     * Logical I/Os can have logical, gang, or vdev children.
     * Gang I/Os can have gang or vdev children.
     * Vdev I/Os can only have vdev children.
     * The following ASSERT captures all of these constraints.
     */
    ASSERT(cio->io_child_type <= pio->io_child_type);

    zl->zl_parent = pio;
    zl->zl_child = cio;

    mutex_enter(&cio->io_lock);
    mutex_enter(&pio->io_lock);

    ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);

    for (w = 0; w < ZIO_WAIT_TYPES; w++)
        pio->io_children[cio->io_child_type][w] += !cio->io_state[w];

    list_insert_head(&pio->io_child_list, zl);
    list_insert_head(&cio->io_parent_list, zl);

    pio->io_child_count++;
    cio->io_parent_count++;

    mutex_exit(&pio->io_lock);
    mutex_exit(&cio->io_lock);
}
static void
zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
{
    ASSERT(zl->zl_parent == pio);
    ASSERT(zl->zl_child == cio);

    mutex_enter(&cio->io_lock);
    mutex_enter(&pio->io_lock);

    list_remove(&pio->io_child_list, zl);
    list_remove(&cio->io_parent_list, zl);

    pio->io_child_count--;
    cio->io_parent_count--;

    mutex_exit(&pio->io_lock);
    mutex_exit(&cio->io_lock);

    kmem_cache_free(zio_link_cache, zl);
}
static boolean_t
zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
{
    uint64_t *countp = &zio->io_children[child][wait];
    boolean_t waiting = B_FALSE;

    mutex_enter(&zio->io_lock);
    ASSERT(zio->io_stall == NULL);
    if (*countp != 0) {
        zio->io_stage >>= 1;
        zio->io_stall = countp;
        waiting = B_TRUE;
    }
    mutex_exit(&zio->io_lock);

    return (waiting);
}
static void
zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
{
    uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
    int *errorp = &pio->io_child_error[zio->io_child_type];

    mutex_enter(&pio->io_lock);
    if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
        *errorp = zio_worst_error(*errorp, zio->io_error);
    pio->io_reexecute |= zio->io_reexecute;
    ASSERT3U(*countp, >, 0);
    if (--*countp == 0 && pio->io_stall == countp) {
        pio->io_stall = NULL;
        mutex_exit(&pio->io_lock);
        zio_execute(pio);
    } else {
        mutex_exit(&pio->io_lock);
    }
}
static void
zio_inherit_child_errors(zio_t *zio, enum zio_child c)
{
    if (zio->io_child_error[c] != 0 && zio->io_error == 0)
        zio->io_error = zio->io_child_error[c];
}
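
/*
 * Illustrative sketch, not part of the original file: how a pipeline stage
 * uses the interlocks above.  If any child of the given type has not reached
 * the given wait point, the zio parks on io_stall and the stage returns
 * ZIO_PIPELINE_STOP; the last child's zio_notify_parent() then re-dispatches
 * the parent, which re-runs the stage.  This pattern appears in several
 * stages below (e.g. zio_write_bp_init(), zio_gang_issue()).
 */
#ifdef ZIO_INTERLOCK_EXAMPLE
static int
zio_example_stage(zio_t *zio)
{
    if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
        return (ZIO_PIPELINE_STOP);     /* a child will restart us */

    /* ... all gang children are done; do the stage's real work ... */

    return (ZIO_PIPELINE_CONTINUE);
}
#endif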
/*
 * ==========================================================================
 * Create the various types of I/O (read, write, free, etc)
 * ==========================================================================
 */
static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_type_t type, int priority, enum zio_flag flags,
    vdev_t *vd, uint64_t offset, const zbookmark_t *zb,
    enum zio_stage stage, enum zio_stage pipeline)
{
    zio_t *zio;

    ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
    ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
    ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);

    ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
    ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
    ASSERT(vd || stage == ZIO_STAGE_OPEN);

    zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
    bzero(zio, sizeof (zio_t));

    mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
    cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);

    list_create(&zio->io_parent_list, sizeof (zio_link_t),
        offsetof(zio_link_t, zl_parent_node));
    list_create(&zio->io_child_list, sizeof (zio_link_t),
        offsetof(zio_link_t, zl_child_node));

    if (vd != NULL)
        zio->io_child_type = ZIO_CHILD_VDEV;
    else if (flags & ZIO_FLAG_GANG_CHILD)
        zio->io_child_type = ZIO_CHILD_GANG;
    else if (flags & ZIO_FLAG_DDT_CHILD)
        zio->io_child_type = ZIO_CHILD_DDT;
    else
        zio->io_child_type = ZIO_CHILD_LOGICAL;

    if (bp != NULL) {
        zio->io_bp = (blkptr_t *)bp;
        zio->io_bp_copy = *bp;
        zio->io_bp_orig = *bp;
        if (type != ZIO_TYPE_WRITE ||
            zio->io_child_type == ZIO_CHILD_DDT)
            zio->io_bp = &zio->io_bp_copy;  /* so caller can free */
        if (zio->io_child_type == ZIO_CHILD_LOGICAL)
            zio->io_logical = zio;
        if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
            pipeline |= ZIO_GANG_STAGES;
    }

    zio->io_spa = spa;
    zio->io_txg = txg;
    zio->io_done = done;
    zio->io_private = private;
    zio->io_type = type;
    zio->io_priority = priority;
    zio->io_vd = vd;
    zio->io_offset = offset;
    zio->io_orig_data = zio->io_data = data;
    zio->io_orig_size = zio->io_size = size;
    zio->io_orig_flags = zio->io_flags = flags;
    zio->io_orig_stage = zio->io_stage = stage;
    zio->io_orig_pipeline = zio->io_pipeline = pipeline;

    zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
    zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);

    if (zb != NULL)
        zio->io_bookmark = *zb;

    if (pio != NULL) {
        if (zio->io_logical == NULL)
            zio->io_logical = pio->io_logical;
        if (zio->io_child_type == ZIO_CHILD_GANG)
            zio->io_gang_leader = pio->io_gang_leader;
        zio_add_child(pio, zio);
    }

    return (zio);
}
static void
zio_destroy(zio_t *zio)
{
    list_destroy(&zio->io_parent_list);
    list_destroy(&zio->io_child_list);
    mutex_destroy(&zio->io_lock);
    cv_destroy(&zio->io_cv);
    kmem_cache_free(zio_cache, zio);
}
zio_t *
zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
    void *private, enum zio_flag flags)
{
    zio_t *zio;

    zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
        ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
        ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);

    return (zio);
}
zio_t *
zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
{
    return (zio_null(NULL, spa, NULL, done, private, flags));
}
zio_t *
zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    int priority, enum zio_flag flags, const zbookmark_t *zb)
{
    zio_t *zio;

    zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
        data, size, done, private,
        ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
        ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
        ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);

    return (zio);
}
zio_t *
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    void *data, uint64_t size, const zio_prop_t *zp,
    zio_done_func_t *ready, zio_done_func_t *done, void *private,
    int priority, enum zio_flag flags, const zbookmark_t *zb)
{
    zio_t *zio;

    ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
        zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
        zp->zp_compress >= ZIO_COMPRESS_OFF &&
        zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
        zp->zp_type < DMU_OT_NUMTYPES &&
        zp->zp_level < 32 &&
        zp->zp_copies > 0 &&
        zp->zp_copies <= spa_max_replication(spa) &&
        zp->zp_dedup <= 1 &&
        zp->zp_dedup_verify <= 1);

    zio = zio_create(pio, spa, txg, bp, data, size, done, private,
        ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
        ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
        ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);

    zio->io_ready = ready;
    zio->io_prop = *zp;

    return (zio);
}
zio_t *
zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
    uint64_t size, zio_done_func_t *done, void *private, int priority,
    enum zio_flag flags, zbookmark_t *zb)
{
    zio_t *zio;

    zio = zio_create(pio, spa, txg, bp, data, size, done, private,
        ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
        ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);

    return (zio);
}
void
zio_write_override(zio_t *zio, blkptr_t *bp, int copies)
{
    ASSERT(zio->io_type == ZIO_TYPE_WRITE);
    ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
    ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
    ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));

    zio->io_prop.zp_copies = copies;
    zio->io_bp_override = bp;
}
void
zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
{
    bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
}
zio_t *
zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    enum zio_flag flags)
{
    zio_t *zio;

    dprintf_bp(bp, "freeing in txg %llu, pass %u",
        (longlong_t)txg, spa->spa_sync_pass);

    ASSERT(!BP_IS_HOLE(bp));
    ASSERT(spa_syncing_txg(spa) == txg);
    ASSERT(spa_sync_pass(spa) <= SYNC_PASS_DEFERRED_FREE);

    zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
        NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags,
        NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);

    return (zio);
}
zio_t *
zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    zio_done_func_t *done, void *private, enum zio_flag flags)
{
    zio_t *zio;

    /*
     * A claim is an allocation of a specific block.  Claims are needed
     * to support immediate writes in the intent log.  The issue is that
     * immediate writes contain committed data, but in a txg that was
     * *not* committed.  Upon opening the pool after an unclean shutdown,
     * the intent log claims all blocks that contain immediate write data
     * so that the SPA knows they're in use.
     *
     * All claims *must* be resolved in the first txg -- before the SPA
     * starts allocating blocks -- so that nothing is allocated twice.
     * If txg == 0 we just verify that the block is claimable.
     */
    ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
    ASSERT(txg == spa_first_txg(spa) || txg == 0);
    ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));   /* zdb(1M) */

    zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
        done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
        NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);

    return (zio);
}
zio_t *
zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
    zio_done_func_t *done, void *private, int priority, enum zio_flag flags)
{
    zio_t *zio;
    int c;

    if (vd->vdev_children == 0) {
        zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
            ZIO_TYPE_IOCTL, priority, flags, vd, 0, NULL,
            ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);

        zio->io_cmd = cmd;
    } else {
        zio = zio_null(pio, spa, NULL, NULL, NULL, flags);

        for (c = 0; c < vd->vdev_children; c++)
            zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
                done, private, priority, flags));
    }

    return (zio);
}
zio_t *
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    int priority, enum zio_flag flags, boolean_t labels)
{
    zio_t *zio;

    ASSERT(vd->vdev_children == 0);
    ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
        offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
    ASSERT3U(offset + size, <=, vd->vdev_psize);

    zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
        ZIO_TYPE_READ, priority, flags, vd, offset, NULL,
        ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);

    zio->io_prop.zp_checksum = checksum;

    return (zio);
}
zio_t *
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    int priority, enum zio_flag flags, boolean_t labels)
{
    zio_t *zio;

    ASSERT(vd->vdev_children == 0);
    ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
        offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
    ASSERT3U(offset + size, <=, vd->vdev_psize);

    zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
        ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL,
        ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);

    zio->io_prop.zp_checksum = checksum;

    if (zio_checksum_table[checksum].ci_eck) {
        /*
         * zec checksums are necessarily destructive -- they modify
         * the end of the write buffer to hold the verifier/checksum.
         * Therefore, we must make a local copy in case the data is
         * being written to multiple places in parallel.
         */
        void *wbuf = zio_buf_alloc(size);
        bcopy(data, wbuf, size);
        zio_push_transform(zio, wbuf, size, size, NULL);
    }

    return (zio);
}
/*
 * Create a child I/O to do some work for us.
 */
zio_t *
zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
    void *data, uint64_t size, int type, int priority, enum zio_flag flags,
    zio_done_func_t *done, void *private)
{
    enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
    zio_t *zio;

    ASSERT(vd->vdev_parent ==
        (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));

    if (type == ZIO_TYPE_READ && bp != NULL) {
        /*
         * If we have the bp, then the child should perform the
         * checksum and the parent need not.  This pushes error
         * detection as close to the leaves as possible and
         * eliminates redundant checksums in the interior nodes.
         */
        pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
        pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
    }

    if (vd->vdev_children == 0)
        offset += VDEV_LABEL_START_SIZE;

    flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;

    /*
     * If we've decided to do a repair, the write is not speculative --
     * even if the original read was.
     */
    if (flags & ZIO_FLAG_IO_REPAIR)
        flags &= ~ZIO_FLAG_SPECULATIVE;

    zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
        done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
        ZIO_STAGE_VDEV_IO_START >> 1, pipeline);

    return (zio);
}
zio_t *
zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
    int type, int priority, enum zio_flag flags,
    zio_done_func_t *done, void *private)
{
    zio_t *zio;

    ASSERT(vd->vdev_ops->vdev_op_leaf);

    zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
        data, size, done, private, type, priority,
        flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY,
        vd, offset, NULL,
        ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);

    return (zio);
}
void
zio_flush(zio_t *zio, vdev_t *vd)
{
    zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
        NULL, NULL, ZIO_PRIORITY_NOW,
        ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
}
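
/*
 * Illustrative sketch, not part of the original file: a typical flush
 * pattern.  Issue write-cache flushes to every top-level vdev under one
 * root zio, then wait once; errors are tolerable due to ZIO_FLAG_CANFAIL.
 */
#ifdef ZIO_FLUSH_EXAMPLE
static void
zio_flush_example(spa_t *spa)
{
    zio_t *root = zio_root(spa, NULL, NULL, 0);
    vdev_t *rvd = spa->spa_root_vdev;
    int c;

    for (c = 0; c < rvd->vdev_children; c++)
        zio_flush(root, rvd->vdev_child[c]);

    (void) zio_wait(root);
}
#endif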
void
zio_shrink(zio_t *zio, uint64_t size)
{
    ASSERT(zio->io_executor == NULL);
    ASSERT(zio->io_orig_size == zio->io_size);
    ASSERT(size <= zio->io_size);

    /*
     * We don't shrink for raidz because of problems with the
     * reconstruction when reading back less than the block size.
     * Note, BP_IS_RAIDZ() assumes no compression.
     */
    ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
    if (!BP_IS_RAIDZ(zio->io_bp))
        zio->io_orig_size = zio->io_size = size;
}
/*
 * ==========================================================================
 * Prepare to read and write logical blocks
 * ==========================================================================
 */
static int
zio_read_bp_init(zio_t *zio)
{
    blkptr_t *bp = zio->io_bp;

    if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
        zio->io_child_type == ZIO_CHILD_LOGICAL &&
        !(zio->io_flags & ZIO_FLAG_RAW)) {
        uint64_t psize = BP_GET_PSIZE(bp);
        void *cbuf = zio_buf_alloc(psize);

        zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
    }

    if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0)
        zio->io_flags |= ZIO_FLAG_DONT_CACHE;

    if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
        zio->io_flags |= ZIO_FLAG_DONT_CACHE;

    if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
        zio->io_pipeline = ZIO_DDT_READ_PIPELINE;

    return (ZIO_PIPELINE_CONTINUE);
}
static int
zio_write_bp_init(zio_t *zio)
{
    spa_t *spa = zio->io_spa;
    zio_prop_t *zp = &zio->io_prop;
    enum zio_compress compress = zp->zp_compress;
    blkptr_t *bp = zio->io_bp;
    uint64_t lsize = zio->io_size;
    uint64_t psize = lsize;
    int pass = 1;

    /*
     * If our children haven't all reached the ready stage,
     * wait for them and then repeat this pipeline stage.
     */
    if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
        zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
        return (ZIO_PIPELINE_STOP);

    if (!IO_IS_ALLOCATING(zio))
        return (ZIO_PIPELINE_CONTINUE);

    ASSERT(zio->io_child_type != ZIO_CHILD_DDT);

    if (zio->io_bp_override) {
        ASSERT(bp->blk_birth != zio->io_txg);
        ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);

        *bp = *zio->io_bp_override;
        zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

        if (BP_IS_HOLE(bp) || !zp->zp_dedup)
            return (ZIO_PIPELINE_CONTINUE);

        ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup ||
            zp->zp_dedup_verify);

        if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
            BP_SET_DEDUP(bp, 1);
            zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
            return (ZIO_PIPELINE_CONTINUE);
        }
        zio->io_bp_override = NULL;
        BP_ZERO(bp);
    }

    if (bp->blk_birth == zio->io_txg) {
        /*
         * We're rewriting an existing block, which means we're
         * working on behalf of spa_sync().  For spa_sync() to
         * converge, it must eventually be the case that we don't
         * have to allocate new blocks.  But compression changes
         * the blocksize, which forces a reallocate, and makes
         * convergence take longer.  Therefore, after the first
         * few passes, stop compressing to ensure convergence.
         */
        pass = spa_sync_pass(spa);

        ASSERT(zio->io_txg == spa_syncing_txg(spa));
        ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
        ASSERT(!BP_GET_DEDUP(bp));

        if (pass > SYNC_PASS_DONT_COMPRESS)
            compress = ZIO_COMPRESS_OFF;

        /* Make sure someone doesn't change their mind on overwrites */
        ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp),
            spa_max_replication(spa)) == BP_GET_NDVAS(bp));
    }

    if (compress != ZIO_COMPRESS_OFF) {
        void *cbuf = zio_buf_alloc(lsize);
        psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
        if (psize == 0 || psize == lsize) {
            compress = ZIO_COMPRESS_OFF;
            zio_buf_free(cbuf, lsize);
        } else {
            ASSERT(psize < lsize);
            zio_push_transform(zio, cbuf, psize, lsize, NULL);
        }
    }

    /*
     * The final pass of spa_sync() must be all rewrites, but the first
     * few passes offer a trade-off: allocating blocks defers convergence,
     * but newly allocated blocks are sequential, so they can be written
     * to disk faster.  Therefore, we allow the first few passes of
     * spa_sync() to allocate new blocks, but force rewrites after that.
     * There should only be a handful of blocks after pass 1 in any case.
     */
    if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize &&
        pass > SYNC_PASS_REWRITE) {
        enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
        ASSERT(psize != 0);
        zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
        zio->io_flags |= ZIO_FLAG_IO_REWRITE;
    } else {
        BP_ZERO(bp);
        zio->io_pipeline = ZIO_WRITE_PIPELINE;
    }

    if (psize == 0) {
        zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
    } else {
        ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
        BP_SET_LSIZE(bp, lsize);
        BP_SET_PSIZE(bp, psize);
        BP_SET_COMPRESS(bp, compress);
        BP_SET_CHECKSUM(bp, zp->zp_checksum);
        BP_SET_TYPE(bp, zp->zp_type);
        BP_SET_LEVEL(bp, zp->zp_level);
        BP_SET_DEDUP(bp, zp->zp_dedup);
        BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
        if (zp->zp_dedup) {
            ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
            ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
            zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
        }
    }

    return (ZIO_PIPELINE_CONTINUE);
}
static int
zio_free_bp_init(zio_t *zio)
{
    blkptr_t *bp = zio->io_bp;

    if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
        if (BP_GET_DEDUP(bp))
            zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
    }

    return (ZIO_PIPELINE_CONTINUE);
}
/*
 * ==========================================================================
 * Execute the I/O pipeline
 * ==========================================================================
 */
static void
zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline)
{
    spa_t *spa = zio->io_spa;
    zio_type_t t = zio->io_type;
    int flags = TQ_SLEEP | (cutinline ? TQ_FRONT : 0);

    /*
     * If we're a config writer or a probe, the normal issue and
     * interrupt threads may all be blocked waiting for the config lock.
     * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
     */
    if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
        t = ZIO_TYPE_NULL;

    /*
     * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
     */
    if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
        t = ZIO_TYPE_NULL;

    /*
     * If this is a high priority I/O, then use the high priority taskq.
     */
    if (zio->io_priority == ZIO_PRIORITY_NOW &&
        spa->spa_zio_taskq[t][q + 1] != NULL)
        q++;

    ASSERT3U(q, <, ZIO_TASKQ_TYPES);
    (void) taskq_dispatch(spa->spa_zio_taskq[t][q],
        (task_func_t *)zio_execute, zio, flags);
}
static boolean_t
zio_taskq_member(zio_t *zio, enum zio_taskq_type q)
{
    kthread_t *executor = zio->io_executor;
    spa_t *spa = zio->io_spa;
    zio_type_t t;

    for (t = 0; t < ZIO_TYPES; t++)
        if (taskq_member(spa->spa_zio_taskq[t][q], executor))
            return (B_TRUE);

    return (B_FALSE);
}
static int
zio_issue_async(zio_t *zio)
{
    zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);

    return (ZIO_PIPELINE_STOP);
}
void
zio_interrupt(zio_t *zio)
{
    zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
}
/*
 * Execute the I/O pipeline until one of the following occurs:
 * (1) the I/O completes; (2) the pipeline stalls waiting for
 * dependent child I/Os; (3) the I/O issues, so we're waiting
 * for an I/O completion interrupt; (4) the I/O is delegated by
 * vdev-level caching or aggregation; (5) the I/O is deferred
 * due to vdev-level queueing; (6) the I/O is handed off to
 * another thread.  In all cases, the pipeline stops whenever
 * there's no CPU work; it never burns a thread in cv_wait().
 *
 * There's no locking on io_stage because there's no legitimate way
 * for multiple threads to be attempting to process the same I/O.
 */
static zio_pipe_stage_t *zio_pipeline[];
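
/*
 * Illustrative sketch, not part of the original file: io_pipeline is a
 * bitmask of one-hot stage bits and io_stage is the current one-hot bit.
 * The executor advances by shifting left until it reaches the next bit
 * present in the pipeline, then indexes zio_pipeline[] by bit position,
 * exactly as zio_execute() does below.
 */
#ifdef ZIO_STAGE_EXAMPLE
static enum zio_stage
zio_next_stage_example(enum zio_stage stage, enum zio_stage pipeline)
{
    do {
        stage <<= 1;                    /* advance the one-hot stage bit */
    } while ((stage & pipeline) == 0);  /* skip stages not in this pipeline */

    /* The handler would be zio_pipeline[highbit(stage) - 1]. */
    return (stage);
}
#endif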
void
zio_execute(zio_t *zio)
{
    zio->io_executor = curthread;

    while (zio->io_stage < ZIO_STAGE_DONE) {
        enum zio_stage pipeline = zio->io_pipeline;
        enum zio_stage stage = zio->io_stage;
        int rv;

        ASSERT(!MUTEX_HELD(&zio->io_lock));
        ASSERT(ISP2(stage));
        ASSERT(zio->io_stall == NULL);

        do {
            stage <<= 1;
        } while ((stage & pipeline) == 0);

        ASSERT(stage <= ZIO_STAGE_DONE);

        /*
         * If we are in interrupt context and this pipeline stage
         * will grab a config lock that is held across I/O,
         * or may wait for an I/O that needs an interrupt thread
         * to complete, issue async to avoid deadlock.
         *
         * For VDEV_IO_START, we cut in line so that the io will
         * be sent to disk promptly.
         */
        if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
            zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
            boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
                zio_requeue_io_start_cut_in_line : B_FALSE;
            zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
            return;
        }

        zio->io_stage = stage;
        rv = zio_pipeline[highbit(stage) - 1](zio);

        if (rv == ZIO_PIPELINE_STOP)
            return;

        ASSERT(rv == ZIO_PIPELINE_CONTINUE);
    }
}
/*
 * ==========================================================================
 * Initiate I/O, either sync or async
 * ==========================================================================
 */
int
zio_wait(zio_t *zio)
{
    int error;

    ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
    ASSERT(zio->io_executor == NULL);

    zio->io_waiter = curthread;

    zio_execute(zio);

    mutex_enter(&zio->io_lock);
    while (zio->io_executor != NULL)
        cv_wait(&zio->io_cv, &zio->io_lock);
    mutex_exit(&zio->io_lock);

    error = zio->io_error;
    zio_destroy(zio);

    return (error);
}
void
zio_nowait(zio_t *zio)
{
    ASSERT(zio->io_executor == NULL);

    if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
        zio_unique_parent(zio) == NULL) {
        /*
         * This is a logical async I/O with no parent to wait for it.
         * We add it to the spa_async_root_zio "Godfather" I/O which
         * will ensure they complete prior to unloading the pool.
         */
        spa_t *spa = zio->io_spa;

        zio_add_child(spa->spa_async_zio_root, zio);
    }

    zio_execute(zio);
}
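
/*
 * Illustrative usage, not part of the original file: the common consumer
 * pattern.  Hang several async children off one root zio, then wait once;
 * zio_wait() returns the worst child error.  The helper name and parameters
 * are hypothetical.
 */
#ifdef ZIO_ROOT_EXAMPLE
static int
zio_read_many_example(spa_t *spa, const blkptr_t *bps, void **bufs,
    uint64_t size, int count, const zbookmark_t *zb)
{
    zio_t *root = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
    int i;

    for (i = 0; i < count; i++)
        zio_nowait(zio_read(root, spa, &bps[i], bufs[i], size,
            NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, zb));

    return (zio_wait(root));
}
#endif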
/*
 * ==========================================================================
 * Reexecute or suspend/resume failed I/O
 * ==========================================================================
 */
static void
zio_reexecute(zio_t *pio)
{
    zio_t *cio, *cio_next;
    int c, w;

    ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
    ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
    ASSERT(pio->io_gang_leader == NULL);
    ASSERT(pio->io_gang_tree == NULL);

    pio->io_flags = pio->io_orig_flags;
    pio->io_stage = pio->io_orig_stage;
    pio->io_pipeline = pio->io_orig_pipeline;
    pio->io_reexecute = 0;
    pio->io_error = 0;
    for (w = 0; w < ZIO_WAIT_TYPES; w++)
        pio->io_state[w] = 0;
    for (c = 0; c < ZIO_CHILD_TYPES; c++)
        pio->io_child_error[c] = 0;

    if (IO_IS_ALLOCATING(pio))
        BP_ZERO(pio->io_bp);

    /*
     * As we reexecute pio's children, new children could be created.
     * New children go to the head of pio's io_child_list, however,
     * so we will (correctly) not reexecute them.  The key is that
     * the remainder of pio's io_child_list, from 'cio_next' onward,
     * cannot be affected by any side effects of reexecuting 'cio'.
     */
    for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
        cio_next = zio_walk_children(pio);
        mutex_enter(&pio->io_lock);
        for (w = 0; w < ZIO_WAIT_TYPES; w++)
            pio->io_children[cio->io_child_type][w]++;
        mutex_exit(&pio->io_lock);
        zio_reexecute(cio);
    }

    /*
     * Now that all children have been reexecuted, execute the parent.
     * We don't reexecute "The Godfather" I/O here as it's the
     * responsibility of the caller to wait on him.
     */
    if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
        zio_execute(pio);
}
void
zio_suspend(spa_t *spa, zio_t *zio)
{
    if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
        fm_panic("Pool '%s' has encountered an uncorrectable I/O "
            "failure and the failure mode property for this pool "
            "is set to panic.", spa_name(spa));

    zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);

    mutex_enter(&spa->spa_suspend_lock);

    if (spa->spa_suspend_zio_root == NULL)
        spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
            ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
            ZIO_FLAG_GODFATHER);

    spa->spa_suspended = B_TRUE;

    if (zio != NULL) {
        ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
        ASSERT(zio != spa->spa_suspend_zio_root);
        ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
        ASSERT(zio_unique_parent(zio) == NULL);
        ASSERT(zio->io_stage == ZIO_STAGE_DONE);
        zio_add_child(spa->spa_suspend_zio_root, zio);
    }

    mutex_exit(&spa->spa_suspend_lock);
}
int
zio_resume(spa_t *spa)
{
    zio_t *pio;

    /*
     * Reexecute all previously suspended i/o.
     */
    mutex_enter(&spa->spa_suspend_lock);
    spa->spa_suspended = B_FALSE;
    cv_broadcast(&spa->spa_suspend_cv);
    pio = spa->spa_suspend_zio_root;
    spa->spa_suspend_zio_root = NULL;
    mutex_exit(&spa->spa_suspend_lock);

    if (pio == NULL)
        return (0);

    zio_reexecute(pio);
    return (zio_wait(pio));
}
void
zio_resume_wait(spa_t *spa)
{
    mutex_enter(&spa->spa_suspend_lock);
    while (spa_suspended(spa))
        cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
    mutex_exit(&spa->spa_suspend_lock);
}
/*
 * ==========================================================================
 * Gang blocks.
 *
 * A gang block is a collection of small blocks that looks to the DMU
 * like one large block.  When zio_dva_allocate() cannot find a block
 * of the requested size, due to either severe fragmentation or the pool
 * being nearly full, it calls zio_write_gang_block() to construct the
 * block from smaller fragments.
 *
 * A gang block consists of a gang header (zio_gbh_phys_t) and up to
 * three (SPA_GBH_NBLKPTRS) gang members.  The gang header is just like
 * an indirect block: it's an array of block pointers.  It consumes
 * only one sector and hence is allocatable regardless of fragmentation.
 * The gang header's bps point to its gang members, which hold the data.
 *
 * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
 * as the verifier to ensure uniqueness of the SHA256 checksum.
 * Critically, the gang block bp's blk_cksum is the checksum of the data,
 * not the gang header.  This ensures that data block signatures (needed for
 * deduplication) are independent of how the block is physically stored.
 *
 * Gang blocks can be nested: a gang member may itself be a gang block.
 * Thus every gang block is a tree in which root and all interior nodes are
 * gang headers, and the leaves are normal blocks that contain user data.
 * The root of the gang tree is called the gang leader.
 *
 * To perform any operation (read, rewrite, free, claim) on a gang block,
 * zio_gang_assemble() first assembles the gang tree (minus data leaves)
 * in the io_gang_tree field of the original logical i/o by recursively
 * reading the gang leader and all gang headers below it.  This yields
 * an in-core tree containing the contents of every gang header and the
 * bps for every constituent of the gang block.
 *
 * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
 * and invokes a callback on each bp.  To free a gang block, zio_gang_issue()
 * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
 * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
 * zio_read_gang() is a wrapper around zio_read() that omits reading gang
 * headers, since we already have those in io_gang_tree.  zio_rewrite_gang()
 * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
 * of the gang header plus zio_checksum_compute() of the data to update the
 * gang header's blk_cksum as described above.
 *
 * The two-phase assemble/issue model solves the problem of partial failure --
 * what if you'd freed part of a gang block but then couldn't read the
 * gang header for another part?  Assembling the entire gang tree first
 * ensures that all the necessary gang header I/O has succeeded before
 * starting the actual work of free, claim, or write.  Once the gang tree
 * is assembled, free and claim are in-memory operations that cannot fail.
 *
 * In the event that a gang write fails, zio_dva_unallocate() walks the
 * gang tree to immediately free (i.e. insert back into the space map)
 * everything we've allocated.  This ensures that we don't get ENOSPC
 * errors during repeated suspend/resume cycles due to a flaky device.
 *
 * Gang rewrites only happen during sync-to-convergence.  If we can't assemble
 * the gang tree, we won't modify the block, so we can safely defer the free
 * (knowing that the block is still intact).  If we *can* assemble the gang
 * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
 * each constituent bp and we can allocate a new block on the next sync pass.
 *
 * In all cases, the gang tree allows complete recovery from partial failure.
 * ==========================================================================
 */
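
/*
 * Illustrative sketch, not part of the original file: how
 * zio_write_gang_block() below carves a block into at most SPA_GBH_NBLKPTRS
 * members.  Each pass takes roughly an even share of what's left, rounded up
 * to a multiple of SPA_MINBLOCKSIZE.  E.g. with three pointers and
 * resid = 100K, the members come out 33.5K, 33.5K, and 33K.
 */
#ifdef ZIO_GANG_SPLIT_EXAMPLE
static void
zio_gang_split_example(uint64_t resid)
{
    int g;

    for (g = 0; resid != 0; g++) {
        uint64_t lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
            SPA_MINBLOCKSIZE);
        ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);
        resid -= lsize;         /* loop ends within SPA_GBH_NBLKPTRS passes */
    }
}
#endif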
static zio_t *
zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
    if (gn != NULL)
        return (NULL);

    return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
        NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
        &pio->io_bookmark));
}
static zio_t *
zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
    zio_t *zio;

    if (gn != NULL) {
        zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
            gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
            ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
        /*
         * As we rewrite each gang header, the pipeline will compute
         * a new gang block header checksum for it; but no one will
         * compute a new data checksum, so we do that here.  The one
         * exception is the gang leader: the pipeline already computed
         * its data checksum because that stage precedes gang assembly.
         * (Presently, nothing actually uses interior data checksums;
         * this is just good hygiene.)
         */
        if (gn != pio->io_gang_leader->io_gang_tree) {
            zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
                data, BP_GET_PSIZE(bp));
        }
        /*
         * If we are here to damage data for testing purposes,
         * leave the GBH alone so that we can detect the damage.
         */
        if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
            zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
    } else {
        zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
            data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
            ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
    }

    return (zio);
}
/* ARGSUSED */
static zio_t *
zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
    return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
        ZIO_GANG_CHILD_FLAGS(pio)));
}
/* ARGSUSED */
static zio_t *
zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
    return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
        NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
}
static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
    NULL,
    zio_read_gang,
    zio_rewrite_gang,
    zio_free_gang,
    zio_claim_gang,
    NULL
};

static void zio_gang_tree_assemble_done(zio_t *zio);
static zio_gang_node_t *
zio_gang_node_alloc(zio_gang_node_t **gnpp)
{
    zio_gang_node_t *gn;

    ASSERT(*gnpp == NULL);

    gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
    gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
    *gnpp = gn;

    return (gn);
}
static void
zio_gang_node_free(zio_gang_node_t **gnpp)
{
    zio_gang_node_t *gn = *gnpp;
    int g;

    for (g = 0; g < SPA_GBH_NBLKPTRS; g++)
        ASSERT(gn->gn_child[g] == NULL);

    zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
    kmem_free(gn, sizeof (*gn));
    *gnpp = NULL;
}
static void
zio_gang_tree_free(zio_gang_node_t **gnpp)
{
    zio_gang_node_t *gn = *gnpp;
    int g;

    if (gn == NULL)
        return;

    for (g = 0; g < SPA_GBH_NBLKPTRS; g++)
        zio_gang_tree_free(&gn->gn_child[g]);

    zio_gang_node_free(gnpp);
}
static void
zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
{
    zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);

    ASSERT(gio->io_gang_leader == gio);
    ASSERT(BP_IS_GANG(bp));

    zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh,
        SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
        gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
}
static void
zio_gang_tree_assemble_done(zio_t *zio)
{
    zio_t *gio = zio->io_gang_leader;
    zio_gang_node_t *gn = zio->io_private;
    blkptr_t *bp = zio->io_bp;
    int g;

    ASSERT(gio == zio_unique_parent(zio));
    ASSERT(zio->io_child_count == 0);

    if (zio->io_error)
        return;

    if (BP_SHOULD_BYTESWAP(bp))
        byteswap_uint64_array(zio->io_data, zio->io_size);

    ASSERT(zio->io_data == gn->gn_gbh);
    ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
    ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);

    for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
        blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
        if (!BP_IS_GANG(gbp))
            continue;
        zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
    }
}
static void
zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
{
    zio_t *gio = pio->io_gang_leader;
    zio_t *zio;
    int g;

    ASSERT(BP_IS_GANG(bp) == !!gn);
    ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
    ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);

    /*
     * If you're a gang header, your data is in gn->gn_gbh.
     * If you're a gang member, your data is in 'data' and gn == NULL.
     */
    zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);

    if (gn != NULL) {
        ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);

        for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
            blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
            if (BP_IS_HOLE(gbp))
                continue;
            zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
            data = (char *)data + BP_GET_PSIZE(gbp);
        }
    }

    if (gn == gio->io_gang_tree)
        ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);

    if (zio != pio)
        zio_nowait(zio);
}
static int
zio_gang_assemble(zio_t *zio)
{
    blkptr_t *bp = zio->io_bp;

    ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
    ASSERT(zio->io_child_type > ZIO_CHILD_GANG);

    zio->io_gang_leader = zio;

    zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);

    return (ZIO_PIPELINE_CONTINUE);
}
static int
zio_gang_issue(zio_t *zio)
{
    blkptr_t *bp = zio->io_bp;

    if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
        return (ZIO_PIPELINE_STOP);

    ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
    ASSERT(zio->io_child_type > ZIO_CHILD_GANG);

    if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
        zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data);
    else
        zio_gang_tree_free(&zio->io_gang_tree);

    zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

    return (ZIO_PIPELINE_CONTINUE);
}
static void
zio_write_gang_member_ready(zio_t *zio)
{
    zio_t *pio = zio_unique_parent(zio);
    zio_t *gio = zio->io_gang_leader;
    dva_t *cdva = zio->io_bp->blk_dva;
    dva_t *pdva = pio->io_bp->blk_dva;
    uint64_t asize;
    int d;

    if (BP_IS_HOLE(zio->io_bp))
        return;

    ASSERT(BP_IS_HOLE(&zio->io_bp_orig));

    ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
    ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
    ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
    ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
    ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));

    mutex_enter(&pio->io_lock);
    for (d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
        ASSERT(DVA_GET_GANG(&pdva[d]));
        asize = DVA_GET_ASIZE(&pdva[d]);
        asize += DVA_GET_ASIZE(&cdva[d]);
        DVA_SET_ASIZE(&pdva[d], asize);
    }
    mutex_exit(&pio->io_lock);
}
static int
zio_write_gang_block(zio_t *pio)
{
    spa_t *spa = pio->io_spa;
    blkptr_t *bp = pio->io_bp;
    zio_t *gio = pio->io_gang_leader;
    zio_t *zio;
    zio_gang_node_t *gn, **gnpp;
    zio_gbh_phys_t *gbh;
    uint64_t txg = pio->io_txg;
    uint64_t resid = pio->io_size;
    uint64_t lsize;
    int copies = gio->io_prop.zp_copies;
    int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
    zio_prop_t zp;
    int g, error;

    error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
        bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
        METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
    if (error) {
        pio->io_error = error;
        return (ZIO_PIPELINE_CONTINUE);
    }

    if (pio == gio) {
        gnpp = &gio->io_gang_tree;
    } else {
        gnpp = pio->io_private;
        ASSERT(pio->io_ready == zio_write_gang_member_ready);
    }

    gn = zio_gang_node_alloc(gnpp);
    gbh = gn->gn_gbh;
    bzero(gbh, SPA_GANGBLOCKSIZE);

    /*
     * Create the gang header.
     */
    zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
        pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);

    /*
     * Create and nowait the gang children.
     */
    for (g = 0; resid != 0; resid -= lsize, g++) {
        lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
            SPA_MINBLOCKSIZE);
        ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);

        zp.zp_checksum = gio->io_prop.zp_checksum;
        zp.zp_compress = ZIO_COMPRESS_OFF;
        zp.zp_type = DMU_OT_NONE;
        zp.zp_level = 0;
        zp.zp_copies = gio->io_prop.zp_copies;
        zp.zp_dedup = 0;
        zp.zp_dedup_verify = 0;

        zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
            (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
            zio_write_gang_member_ready, NULL, &gn->gn_child[g],
            pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
            &pio->io_bookmark));
    }

    /*
     * Set pio's pipeline to just wait for zio to finish.
     */
    pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

    zio_nowait(zio);

    return (ZIO_PIPELINE_CONTINUE);
}
/*
 * ==========================================================================
 * Dedup
 * ==========================================================================
 */
static void
zio_ddt_child_read_done(zio_t *zio)
{
    blkptr_t *bp = zio->io_bp;
    ddt_entry_t *dde = zio->io_private;
    ddt_phys_t *ddp;
    zio_t *pio = zio_unique_parent(zio);

    mutex_enter(&pio->io_lock);
    ddp = ddt_phys_select(dde, bp);
    if (zio->io_error == 0)
        ddt_phys_clear(ddp);    /* this ddp doesn't need repair */
    if (zio->io_error == 0 && dde->dde_repair_data == NULL)
        dde->dde_repair_data = zio->io_data;
    else
        zio_buf_free(zio->io_data, zio->io_size);
    mutex_exit(&pio->io_lock);
}
static int
zio_ddt_read_start(zio_t *zio)
{
    blkptr_t *bp = zio->io_bp;
    int p;

    ASSERT(BP_GET_DEDUP(bp));
    ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
    ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

    if (zio->io_child_error[ZIO_CHILD_DDT]) {
        ddt_t *ddt = ddt_select(zio->io_spa, bp);
        ddt_entry_t *dde = ddt_repair_start(ddt, bp);
        ddt_phys_t *ddp = dde->dde_phys;
        ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
        blkptr_t blk;

        ASSERT(zio->io_vsd == NULL);
        zio->io_vsd = dde;

        if (ddp_self == NULL)
            return (ZIO_PIPELINE_CONTINUE);

        for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
            if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
                continue;
            ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
                &blk);
            zio_nowait(zio_read(zio, zio->io_spa, &blk,
                zio_buf_alloc(zio->io_size), zio->io_size,
                zio_ddt_child_read_done, dde, zio->io_priority,
                ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE,
                &zio->io_bookmark));
        }
        return (ZIO_PIPELINE_CONTINUE);
    }

    zio_nowait(zio_read(zio, zio->io_spa, bp,
        zio->io_data, zio->io_size, NULL, NULL, zio->io_priority,
        ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));

    return (ZIO_PIPELINE_CONTINUE);
}
static int
zio_ddt_read_done(zio_t *zio)
{
    blkptr_t *bp = zio->io_bp;

    if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
        return (ZIO_PIPELINE_STOP);

    ASSERT(BP_GET_DEDUP(bp));
    ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
    ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

    if (zio->io_child_error[ZIO_CHILD_DDT]) {
        ddt_t *ddt = ddt_select(zio->io_spa, bp);
        ddt_entry_t *dde = zio->io_vsd;
        if (ddt == NULL) {
            ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
            return (ZIO_PIPELINE_CONTINUE);
        }
        if (dde == NULL) {
            zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
            zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
            return (ZIO_PIPELINE_STOP);
        }
        if (dde->dde_repair_data != NULL) {
            bcopy(dde->dde_repair_data, zio->io_data, zio->io_size);
            zio->io_child_error[ZIO_CHILD_DDT] = 0;
        }
        ddt_repair_done(ddt, dde);
        zio->io_vsd = NULL;
    }

    ASSERT(zio->io_vsd == NULL);

    return (ZIO_PIPELINE_CONTINUE);
}
static boolean_t
zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
{
    spa_t *spa = zio->io_spa;
    int p;

    /*
     * Note: we compare the original data, not the transformed data,
     * because when zio->io_bp is an override bp, we will not have
     * pushed the I/O transforms.  That's an important optimization
     * because otherwise we'd compress/encrypt all dmu_sync() data twice.
     */
    for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
        zio_t *lio = dde->dde_lead_zio[p];

        if (lio != NULL) {
            return (lio->io_orig_size != zio->io_orig_size ||
                bcmp(zio->io_orig_data, lio->io_orig_data,
                zio->io_orig_size) != 0);
        }
    }

    for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
        ddt_phys_t *ddp = &dde->dde_phys[p];

        if (ddp->ddp_phys_birth != 0) {
            arc_buf_t *abuf = NULL;
            uint32_t aflags = ARC_WAIT;
            blkptr_t blk = *zio->io_bp;
            int error;

            ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);

            ddt_exit(ddt);

            error = arc_read_nolock(NULL, spa, &blk,
                arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
                ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
                &aflags, &zio->io_bookmark);

            if (error == 0) {
                if (arc_buf_size(abuf) != zio->io_orig_size ||
                    bcmp(abuf->b_data, zio->io_orig_data,
                    zio->io_orig_size) != 0)
                    error = EEXIST;
                VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
            }

            ddt_enter(ddt);
            return (error != 0);
        }
    }

    return (B_FALSE);
}
static void
zio_ddt_child_write_ready(zio_t *zio)
{
    int p = zio->io_prop.zp_copies;
    ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
    ddt_entry_t *dde = zio->io_private;
    ddt_phys_t *ddp = &dde->dde_phys[p];
    zio_t *pio;

    if (zio->io_error)
        return;

    ddt_enter(ddt);

    ASSERT(dde->dde_lead_zio[p] == zio);

    ddt_phys_fill(ddp, zio->io_bp);

    while ((pio = zio_walk_parents(zio)) != NULL)
        ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);

    ddt_exit(ddt);
}
static void
zio_ddt_child_write_done(zio_t *zio)
{
    int p = zio->io_prop.zp_copies;
    ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
    ddt_entry_t *dde = zio->io_private;
    ddt_phys_t *ddp = &dde->dde_phys[p];

    ddt_enter(ddt);

    ASSERT(ddp->ddp_refcnt == 0);
    ASSERT(dde->dde_lead_zio[p] == zio);
    dde->dde_lead_zio[p] = NULL;

    if (zio->io_error == 0) {
        while (zio_walk_parents(zio) != NULL)
            ddt_phys_addref(ddp);
    } else {
        ddt_phys_clear(ddp);
    }

    ddt_exit(ddt);
}
static void
zio_ddt_ditto_write_done(zio_t *zio)
{
    int p = DDT_PHYS_DITTO;
    zio_prop_t *zp = &zio->io_prop;
    blkptr_t *bp = zio->io_bp;
    ddt_t *ddt = ddt_select(zio->io_spa, bp);
    ddt_entry_t *dde = zio->io_private;
    ddt_phys_t *ddp = &dde->dde_phys[p];
    ddt_key_t *ddk = &dde->dde_key;

    ddt_enter(ddt);

    ASSERT(ddp->ddp_refcnt == 0);
    ASSERT(dde->dde_lead_zio[p] == zio);
    dde->dde_lead_zio[p] = NULL;

    if (zio->io_error == 0) {
        ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum));
        ASSERT(zp->zp_copies < SPA_DVAS_PER_BP);
        ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp));
        if (ddp->ddp_phys_birth != 0)
            ddt_phys_free(ddt, ddk, ddp, zio->io_txg);
        ddt_phys_fill(ddp, bp);
    }

    ddt_exit(ddt);
}
static int
zio_ddt_write(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	blkptr_t *bp = zio->io_bp;
	uint64_t txg = zio->io_txg;
	zio_prop_t *zp = &zio->io_prop;
	int p = zp->zp_copies;
	int ditto_copies;
	zio_t *cio = NULL;
	zio_t *dio = NULL;
	ddt_t *ddt = ddt_select(spa, bp);
	ddt_entry_t *dde;
	ddt_phys_t *ddp;

	ASSERT(BP_GET_DEDUP(bp));
	ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
	ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);

	ddt_enter(ddt);
	dde = ddt_lookup(ddt, bp, B_TRUE);
	ddp = &dde->dde_phys[p];

	if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
		/*
		 * If we're using a weak checksum, upgrade to a strong checksum
		 * and try again.  If we're already using a strong checksum,
		 * we can't resolve it, so just convert to an ordinary write.
		 * (And automatically e-mail a paper to Nature?)
		 */
		if (!zio_checksum_table[zp->zp_checksum].ci_dedup) {
			zp->zp_checksum = spa_dedup_checksum(spa);
			zio_pop_transforms(zio);
			zio->io_stage = ZIO_STAGE_OPEN;
			BP_ZERO(bp);
		} else {
			zp->zp_dedup = B_FALSE;
		}
		zio->io_pipeline = ZIO_WRITE_PIPELINE;
		ddt_exit(ddt);
		return (ZIO_PIPELINE_CONTINUE);
	}

	ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp);
	ASSERT(ditto_copies < SPA_DVAS_PER_BP);

	if (ditto_copies > ddt_ditto_copies_present(dde) &&
	    dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) {
		zio_prop_t czp = *zp;

		czp.zp_copies = ditto_copies;

		/*
		 * If we arrived here with an override bp, we won't have run
		 * the transform stack, so we won't have the data we need to
		 * generate a child i/o.  So, toss the override bp and restart.
		 * This is safe, because using the override bp is just an
		 * optimization; and it's rare, so the cost doesn't matter.
		 */
		if (zio->io_bp_override) {
			zio_pop_transforms(zio);
			zio->io_stage = ZIO_STAGE_OPEN;
			zio->io_pipeline = ZIO_WRITE_PIPELINE;
			zio->io_bp_override = NULL;
			BP_ZERO(bp);
			ddt_exit(ddt);
			return (ZIO_PIPELINE_CONTINUE);
		}

		dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
		    zio->io_orig_size, &czp, NULL,
		    zio_ddt_ditto_write_done, dde, zio->io_priority,
		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);

		zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL);
		dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
	}

	if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
		if (ddp->ddp_phys_birth != 0)
			ddt_bp_fill(ddp, bp, txg);
		if (dde->dde_lead_zio[p] != NULL)
			zio_add_child(zio, dde->dde_lead_zio[p]);
		else
			ddt_phys_addref(ddp);
	} else if (zio->io_bp_override) {
		ASSERT(bp->blk_birth == txg);
		ASSERT(BP_EQUAL(bp, zio->io_bp_override));
		ddt_phys_fill(ddp, bp);
		ddt_phys_addref(ddp);
	} else {
		cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
		    zio->io_orig_size, zp, zio_ddt_child_write_ready,
		    zio_ddt_child_write_done, dde, zio->io_priority,
		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);

		zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL);
		dde->dde_lead_zio[p] = cio;
	}

	ddt_exit(ddt);

	if (cio)
		zio_nowait(cio);
	if (dio)
		zio_nowait(dio);

	return (ZIO_PIPELINE_CONTINUE);
}
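/*
 * Illustrative summary (added for exposition; not part of the original
 * source): for the selected ddt_phys_t slot, zio_ddt_write() resolves a
 * dedup write to one of three outcomes:
 *
 *	if (ddp->ddp_phys_birth != 0)
 *		the block already exists on disk: copy its DVAs out of
 *		the DDT (ddt_bp_fill) and take a reference;
 *	else if (dde->dde_lead_zio[p] != NULL)
 *		an identical write is already in flight: attach to the
 *		lead zio as a child and share its result;
 *	else
 *		first writer: issue a child write (cio) and become the
 *		lead zio that later identical writers will attach to.
 *
 * The ditto branch is independent of all three: when
 * ddt_ditto_copies_needed() says the entry has become popular enough,
 * a second child write (dio) raises the block's replication.
 */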
ddt_entry_t *freedde; /* for debugging */

static int
zio_ddt_free(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	blkptr_t *bp = zio->io_bp;
	ddt_t *ddt = ddt_select(spa, bp);
	ddt_entry_t *dde;
	ddt_phys_t *ddp;

	ASSERT(BP_GET_DEDUP(bp));
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

	ddt_enter(ddt);
	freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
	ddp = ddt_phys_select(dde, bp);
	ddt_phys_decref(ddp);
	ddt_exit(ddt);

	return (ZIO_PIPELINE_CONTINUE);
}
/*
 * ==========================================================================
 * Allocate and free blocks
 * ==========================================================================
 */
static int
zio_dva_allocate(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	metaslab_class_t *mc = spa_normal_class(spa);
	blkptr_t *bp = zio->io_bp;
	int error;

	if (zio->io_gang_leader == NULL) {
		ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
		zio->io_gang_leader = zio;
	}

	ASSERT(BP_IS_HOLE(bp));
	ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
	ASSERT3U(zio->io_prop.zp_copies, >, 0);
	ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));

	error = metaslab_alloc(spa, mc, zio->io_size, bp,
	    zio->io_prop.zp_copies, zio->io_txg, NULL, 0);

	if (error) {
		if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
			return (zio_write_gang_block(zio));
		zio->io_error = error;
	}

	return (ZIO_PIPELINE_CONTINUE);
}
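/*
 * Worked example (added for exposition; not part of the original
 * source): if metaslab_alloc() cannot find one contiguous segment for a
 * 128K write it returns ENOSPC, even though smaller segments may still
 * exist.  zio_write_gang_block() then retries the write as a gang
 * block: a 512-byte gang header pointing at up to SPA_GBH_NBLKPTRS (3)
 * smaller child blocks, each of which may itself gang recursively.
 * Only writes larger than SPA_MINBLOCKSIZE (512 bytes) can be split
 * this way; a 512-byte allocation that fails with ENOSPC simply fails.
 */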
static int
zio_dva_free(zio_t *zio)
{
	metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);

	return (ZIO_PIPELINE_CONTINUE);
}
static int
zio_dva_claim(zio_t *zio)
{
	int error;

	error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
	if (error)
		zio->io_error = error;

	return (ZIO_PIPELINE_CONTINUE);
}
/*
 * Undo an allocation.  This is used by zio_done() when an I/O fails
 * and we want to give back the block we just allocated.
 * This handles both normal blocks and gang blocks.
 */
static void
zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
{
	ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
	ASSERT(zio->io_bp_override == NULL);

	if (!BP_IS_HOLE(bp))
		metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);

	if (gn != NULL) {
		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
			zio_dva_unallocate(zio, gn->gn_child[g],
			    &gn->gn_gbh->zg_blkptr[g]);
		}
	}
}
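/*
 * Illustrative sketch (added for exposition; not part of the original
 * source): for a ganged block the in-core gang tree mirrors the on-disk
 * gang header, so the walk above is a depth-first free, e.g.:
 *
 *	bp (gang)
 *	    zg_blkptr[0]	ordinary child: freed directly
 *	    zg_blkptr[1]	gang child: recurses one level deeper
 *	    zg_blkptr[2]	hole: BP_IS_HOLE() is true, nothing to free
 *
 * Passing B_TRUE to metaslab_free() requests an immediate rather than a
 * deferred free, which is safe here because the block was allocated in
 * this txg by an I/O that failed, so no committed on-disk state can
 * refer to it.
 */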
/*
 * Try to allocate an intent log block.  Return 0 on success, errno on failure.
 */
int
zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
    uint64_t size, boolean_t use_slog)
{
	int error = 1;

	ASSERT(txg > spa_syncing_txg(spa));

	if (use_slog)
		error = metaslab_alloc(spa, spa_log_class(spa), size,
		    new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID);

	if (error)
		error = metaslab_alloc(spa, spa_normal_class(spa), size,
		    new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID);

	if (error == 0) {
		BP_SET_LSIZE(new_bp, size);
		BP_SET_PSIZE(new_bp, size);
		BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
		BP_SET_CHECKSUM(new_bp,
		    spa_version(spa) >= SPA_VERSION_SLIM_ZIL
		    ? ZIO_CHECKSUM_ZILOG2
		    : ZIO_CHECKSUM_ZILOG);
		BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
		BP_SET_LEVEL(new_bp, 0);
		BP_SET_DEDUP(new_bp, 0);
		BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
	}

	return (error);
}
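/*
 * Usage sketch (added for exposition; not part of the original source;
 * the caller's variable names below are hypothetical): the ZIL calls
 * this when it needs its next log block, passing the previous block as
 * 'old_bp' so METASLAB_HINTBP_AVOID can steer the new block away from
 * the same vdev and spread the log across devices:
 *
 *	blkptr_t new_bp = { 0 };
 *	int error = zio_alloc_zil(spa, txg, &new_bp, &prev_lwb_bp,
 *	    lwb_size, use_slog);
 *	if (error == 0)
 *		... chain new_bp into the log record and write it ...
 *
 * The slog class is only tried when use_slog is set; on failure there
 * (or when error starts out as 1) the normal class is the fallback.
 */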
/*
 * Free an intent log block.
 */
void
zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp)
{
	ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG);
	ASSERT(!BP_IS_GANG(bp));

	zio_free(spa, txg, bp);
}
/*
 * ==========================================================================
 * Read and write to physical devices
 * ==========================================================================
 */
static int
zio_vdev_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	uint64_t align;
	spa_t *spa = zio->io_spa;

	ASSERT(zio->io_error == 0);
	ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);

	if (vd == NULL) {
		if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
			spa_config_enter(spa, SCL_ZIO, zio, RW_READER);

		/*
		 * The mirror_ops handle multiple DVAs in a single BP.
		 */
		return (vdev_mirror_ops.vdev_op_io_start(zio));
	}

	/*
	 * We keep track of time-sensitive I/Os so that the scan thread
	 * can quickly react to certain workloads.  In particular, we care
	 * about non-scrubbing, top-level reads and writes with the following
	 * characteristics:
	 *	- synchronous writes of user data to non-slog devices
	 *	- any reads of user data
	 * When these conditions are met, adjust the timestamp of spa_last_io
	 * which allows the scan thread to adjust its workload accordingly.
	 */
	if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
	    vd == vd->vdev_top && !vd->vdev_islog &&
	    zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
	    zio->io_txg != spa_syncing_txg(spa)) {
		uint64_t old = spa->spa_last_io;
		uint64_t new = ddi_get_lbolt64();
		if (old != new)
			(void) atomic_cas_64(&spa->spa_last_io, old, new);
	}

	align = 1ULL << vd->vdev_top->vdev_ashift;

	if (P2PHASE(zio->io_size, align) != 0) {
		uint64_t asize = P2ROUNDUP(zio->io_size, align);
		char *abuf = zio_buf_alloc(asize);
		ASSERT(vd == vd->vdev_top);
		if (zio->io_type == ZIO_TYPE_WRITE) {
			bcopy(zio->io_data, abuf, zio->io_size);
			bzero(abuf + zio->io_size, asize - zio->io_size);
		}
		zio_push_transform(zio, abuf, asize, asize, zio_subblock);
	}

	ASSERT(P2PHASE(zio->io_offset, align) == 0);
	ASSERT(P2PHASE(zio->io_size, align) == 0);
	VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));

	/*
	 * If this is a repair I/O, and there's no self-healing involved --
	 * that is, we're just resilvering what we expect to resilver --
	 * then don't do the I/O unless zio's txg is actually in vd's DTL.
	 * This prevents spurious resilvering with nested replication.
	 * For example, given a mirror of mirrors, (A+B)+(C+D), if only
	 * A is out of date, we'll read from C+D, then use the data to
	 * resilver A+B -- but we don't actually want to resilver B, just A.
	 * The top-level mirror has no way to know this, so instead we just
	 * discard unnecessary repairs as we work our way down the vdev tree.
	 * The same logic applies to any form of nested replication:
	 * ditto + mirror, RAID-Z + replacing, etc.  This covers them all.
	 */
	if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
	    !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
	    zio->io_txg != 0 &&	/* not a delegated i/o */
	    !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
		zio_vdev_io_bypass(zio);
		return (ZIO_PIPELINE_CONTINUE);
	}

	if (vd->vdev_ops->vdev_op_leaf &&
	    (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {

		if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
			return (ZIO_PIPELINE_CONTINUE);

		if ((zio = vdev_queue_io(zio)) == NULL)
			return (ZIO_PIPELINE_STOP);

		if (!vdev_accessible(vd, zio)) {
			zio->io_error = ENXIO;
			zio_interrupt(zio);
			return (ZIO_PIPELINE_STOP);
		}
	}

	return (vd->vdev_ops->vdev_op_io_start(zio));
}
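/*
 * Worked example (added for exposition; not part of the original
 * source) of the alignment transform above, for a device with 4K
 * sectors (vdev_ashift == 12):
 *
 *	align = 1ULL << 12;			= 4096
 *	io_size = 6144;
 *	P2PHASE(6144, 4096) == 2048		=> needs padding
 *	asize = P2ROUNDUP(6144, 4096)		= 8192
 *
 * A write is copied into the 8K buffer with the tail zero-filled; a
 * read is issued into the 8K buffer, and the zio_subblock() transform
 * callback later copies the 6K the caller asked for back out of it.
 */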
static int
zio_vdev_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
	boolean_t unexpected_error = B_FALSE;

	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);

	if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {

		vdev_queue_io_done(zio);

		if (zio->io_type == ZIO_TYPE_WRITE)
			vdev_cache_write(zio);

		if (zio_injection_enabled && zio->io_error == 0)
			zio->io_error = zio_handle_device_injection(vd,
			    zio, EIO);

		if (zio_injection_enabled && zio->io_error == 0)
			zio->io_error = zio_handle_label_injection(zio, EIO);

		if (zio->io_error) {
			if (!vdev_accessible(vd, zio)) {
				zio->io_error = ENXIO;
			} else {
				unexpected_error = B_TRUE;
			}
		}
	}

	ops->vdev_op_io_done(zio);

	if (unexpected_error)
		VERIFY(vdev_probe(vd, zio) == NULL);

	return (ZIO_PIPELINE_CONTINUE);
}
/*
 * For non-raidz ZIOs, we can just copy aside the bad data read from the
 * disk, and use that to finish the checksum ereport later.
 */
static void
zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
    const void *good_buf)
{
	/* no processing needed */
	zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
}

/*ARGSUSED*/
void
zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
{
	void *buf = zio_buf_alloc(zio->io_size);

	bcopy(zio->io_data, buf, zio->io_size);

	zcr->zcr_cbinfo = zio->io_size;
	zcr->zcr_cbdata = buf;
	zcr->zcr_finish = zio_vsd_default_cksum_finish;
	zcr->zcr_free = zio_buf_free;
}
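/*
 * Flow sketch (added for exposition; not part of the original source):
 * the report prepared here is completed later in zio_done(), in one of
 * two ways:
 *
 *	zcr->zcr_finish(zcr, abuf);	the data was eventually read
 *					correctly, so the ereport can
 *					include a bad/good comparison
 *	zcr->zcr_finish(zcr, NULL);	no good copy was ever obtained
 *
 * In both cases zfs_ereport_free_checksum() then uses zcr->zcr_free
 * (zio_buf_free here) to release the bad-data copy stashed in
 * zcr_cbdata above.
 */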
static int
zio_vdev_io_assess(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;

	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
		spa_config_exit(zio->io_spa, SCL_ZIO, zio);

	if (zio->io_vsd != NULL) {
		zio->io_vsd_ops->vsd_free(zio);
		zio->io_vsd = NULL;
	}

	if (zio_injection_enabled && zio->io_error == 0)
		zio->io_error = zio_handle_fault_injection(zio, EIO);

	/*
	 * If the I/O failed, determine whether we should attempt to retry it.
	 *
	 * On retry, we cut in line in the issue queue, since we don't want
	 * compression/checksumming/etc. work to prevent our (cheap) IO reissue.
	 */
	if (zio->io_error && vd == NULL &&
	    !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
		ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE));	/* not a leaf */
		ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS));	/* not a leaf */
		zio->io_error = 0;
		zio->io_flags |= ZIO_FLAG_IO_RETRY |
		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
		zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
		zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
		    zio_requeue_io_start_cut_in_line);
		return (ZIO_PIPELINE_STOP);
	}

	/*
	 * If we got an error on a leaf device, convert it to ENXIO
	 * if the device is not accessible at all.
	 */
	if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
	    !vdev_accessible(vd, zio))
		zio->io_error = ENXIO;

	/*
	 * If we can't write to an interior vdev (mirror or RAID-Z),
	 * set vdev_cant_write so that we stop trying to allocate from it.
	 */
	if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
	    vd != NULL && !vd->vdev_ops->vdev_op_leaf)
		vd->vdev_cant_write = B_TRUE;

	if (zio->io_error)
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	return (ZIO_PIPELINE_CONTINUE);
}
void
zio_vdev_io_reissue(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
	ASSERT(zio->io_error == 0);

	zio->io_stage >>= 1;
}

void
zio_vdev_io_redone(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);

	zio->io_stage >>= 1;
}

void
zio_vdev_io_bypass(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
	ASSERT(zio->io_error == 0);

	zio->io_flags |= ZIO_FLAG_IO_BYPASS;
	zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
}
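/*
 * Note on the ">> 1" idiom above (added for exposition; not part of the
 * original source): pipeline stages are one-hot bits, and zio_execute()
 * advances by shifting io_stage left until it hits the next bit present
 * in io_pipeline.  Setting io_stage to (stage >> 1) therefore means
 * "just before <stage>", so <stage> is the next to run:
 *
 *	zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
 *		=> zio_execute() runs ZIO_STAGE_VDEV_IO_START next
 *	zio->io_stage >>= 1;
 *		=> re-runs whatever stage the zio is currently in
 */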
/*
 * ==========================================================================
 * Generate and verify checksums
 * ==========================================================================
 */
static int
zio_checksum_generate(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	enum zio_checksum checksum;

	if (bp == NULL) {
		/*
		 * This is zio_write_phys().
		 * We're either generating a label checksum, or none at all.
		 */
		checksum = zio->io_prop.zp_checksum;

		if (checksum == ZIO_CHECKSUM_OFF)
			return (ZIO_PIPELINE_CONTINUE);

		ASSERT(checksum == ZIO_CHECKSUM_LABEL);
	} else {
		if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
			ASSERT(!IO_IS_ALLOCATING(zio));
			checksum = ZIO_CHECKSUM_GANG_HEADER;
		} else {
			checksum = BP_GET_CHECKSUM(bp);
		}
	}

	zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size);

	return (ZIO_PIPELINE_CONTINUE);
}
static int
zio_checksum_verify(zio_t *zio)
{
	zio_bad_cksum_t info;
	blkptr_t *bp = zio->io_bp;
	int error;

	ASSERT(zio->io_vd != NULL);

	if (bp == NULL) {
		/*
		 * This is zio_read_phys().
		 * We're either verifying a label checksum, or nothing at all.
		 */
		if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
			return (ZIO_PIPELINE_CONTINUE);

		ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
	}

	if ((error = zio_checksum_error(zio, &info)) != 0) {
		zio->io_error = error;
		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			zfs_ereport_start_checksum(zio->io_spa,
			    zio->io_vd, zio, zio->io_offset,
			    zio->io_size, NULL, &info);
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Called by RAID-Z to ensure we don't compute the checksum twice.
 */
void
zio_checksum_verified(zio_t *zio)
{
	zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
}
/*
 * ==========================================================================
 * Error rank.  Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
 * An error of 0 indicates success.  ENXIO indicates whole-device failure,
 * which may be transient (e.g. unplugged) or permanent.  ECKSUM and EIO
 * indicate errors that are specific to one I/O, and most likely permanent.
 * Any other error is presumed to be worse because we weren't expecting it.
 * ==========================================================================
 */
int
zio_worst_error(int e1, int e2)
{
	static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
	int r1, r2;

	for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++)
		if (e1 == zio_error_rank[r1])
			break;

	for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++)
		if (e2 == zio_error_rank[r2])
			break;

	return (r1 > r2 ? e1 : e2);
}
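/*
 * Worked example (added for exposition; not part of the original
 * source): folding child errors into a parent with zio_worst_error().
 * Ranks follow the table above; errors not in the table rank worst.
 *
 *	int e = 0;
 *	e = zio_worst_error(e, ENXIO);		e is now ENXIO
 *	e = zio_worst_error(e, ECKSUM);		e is now ECKSUM
 *	e = zio_worst_error(e, EIO);		e is now EIO
 *	e = zio_worst_error(e, EDOM);		e is now EDOM (unexpected,
 *						so presumed worst of all)
 */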
/*
 * ==========================================================================
 * I/O completion
 * ==========================================================================
 */
static int
zio_ready(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	zio_t *pio, *pio_next;

	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
	    zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
		return (ZIO_PIPELINE_STOP);

	if (zio->io_ready) {
		ASSERT(IO_IS_ALLOCATING(zio));
		ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
		ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);

		zio->io_ready(zio);
	}

	if (bp != NULL && bp != &zio->io_bp_copy)
		zio->io_bp_copy = *bp;

	if (zio->io_error)
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	mutex_enter(&zio->io_lock);
	zio->io_state[ZIO_WAIT_READY] = 1;
	pio = zio_walk_parents(zio);
	mutex_exit(&zio->io_lock);

	/*
	 * As we notify zio's parents, new parents could be added.
	 * New parents go to the head of zio's io_parent_list, however,
	 * so we will (correctly) not notify them.  The remainder of zio's
	 * io_parent_list, from 'pio_next' onward, cannot change because
	 * all parents must wait for us to be done before they can be done.
	 */
	for (; pio != NULL; pio = pio_next) {
		pio_next = zio_walk_parents(zio);
		zio_notify_parent(pio, zio, ZIO_WAIT_READY);
	}

	if (zio->io_flags & ZIO_FLAG_NODATA) {
		if (BP_IS_GANG(bp)) {
			zio->io_flags &= ~ZIO_FLAG_NODATA;
		} else {
			ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE);
			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
		}
	}

	if (zio_injection_enabled &&
	    zio->io_spa->spa_syncing_txg == zio->io_txg)
		zio_handle_ignored_writes(zio);

	return (ZIO_PIPELINE_CONTINUE);
}
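/*
 * Illustrative sketch (added for exposition; not part of the original
 * source) of why snapshotting 'pio' under io_lock above is safe.  A
 * parent that adopts this zio after io_state[ZIO_WAIT_READY] is set is
 * inserted at the head of io_parent_list:
 *
 *	head -> [late parent] -> [pio] -> [older parents...]
 *
 * Resuming the walk from 'pio' skips the late parent, which is correct:
 * zio_add_child() saw io_state and never counted us as a pending READY
 * child for that parent, so it must not receive a READY notification.
 */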
static int
zio_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	zio_t *lio = zio->io_logical;
	blkptr_t *bp = zio->io_bp;
	vdev_t *vd = zio->io_vd;
	uint64_t psize = zio->io_size;
	zio_t *pio, *pio_next;

	/*
	 * If our children haven't all completed,
	 * wait for them and then repeat this pipeline stage.
	 */
	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
	    zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
	    zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) ||
	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
			ASSERT(zio->io_children[c][w] == 0);

	if (bp != NULL) {
		ASSERT(bp->blk_pad[0] == 0);
		ASSERT(bp->blk_pad[1] == 0);
		ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
		    (bp == zio_unique_parent(zio)->io_bp));
		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
		    zio->io_bp_override == NULL &&
		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
			ASSERT(!BP_SHOULD_BYTESWAP(bp));
			ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp));
			ASSERT(BP_COUNT_GANG(bp) == 0 ||
			    (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
		}
	}

	/*
	 * If there were child vdev/gang/ddt errors, they apply to us now.
	 */
	zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
	zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
	zio_inherit_child_errors(zio, ZIO_CHILD_DDT);

	/*
	 * If the I/O on the transformed data was successful, generate any
	 * checksum reports now while we still have the transformed data.
	 */
	if (zio->io_error == 0) {
		while (zio->io_cksum_report != NULL) {
			zio_cksum_report_t *zcr = zio->io_cksum_report;
			uint64_t align = zcr->zcr_align;
			uint64_t asize = P2ROUNDUP(psize, align);
			char *abuf = zio->io_data;

			if (asize != psize) {
				abuf = zio_buf_alloc(asize);
				bcopy(zio->io_data, abuf, psize);
				bzero(abuf + psize, asize - psize);
			}

			zio->io_cksum_report = zcr->zcr_next;
			zcr->zcr_next = NULL;
			zcr->zcr_finish(zcr, abuf);
			zfs_ereport_free_checksum(zcr);

			if (asize != psize)
				zio_buf_free(abuf, asize);
		}
	}

	zio_pop_transforms(zio);	/* note: may set zio->io_error */

	vdev_stat_update(zio, psize);

	if (zio->io_error) {
		/*
		 * If this I/O is attached to a particular vdev,
		 * generate an error message describing the I/O failure
		 * at the block level.  We ignore these errors if the
		 * device is currently unavailable.
		 */
		if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
			zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);

		if ((zio->io_error == EIO || !(zio->io_flags &
		    (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
		    zio == lio) {
			/*
			 * For logical I/O requests, tell the SPA to log the
			 * error and generate a logical data ereport.
			 */
			spa_log_error(spa, zio);
			zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio,
			    0, 0);
		}
	}

	if (zio->io_error && zio == lio) {
		/*
		 * Determine whether zio should be reexecuted.  This will
		 * propagate all the way to the root via zio_notify_parent().
		 */
		ASSERT(vd == NULL && bp != NULL);
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

		if (IO_IS_ALLOCATING(zio) &&
		    !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
			if (zio->io_error != ENOSPC)
				zio->io_reexecute |= ZIO_REEXECUTE_NOW;
			else
				zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
		}

		if ((zio->io_type == ZIO_TYPE_READ ||
		    zio->io_type == ZIO_TYPE_FREE) &&
		    !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
		    zio->io_error == ENXIO &&
		    spa_load_state(spa) == SPA_LOAD_NONE &&
		    spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;

		if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;

		/*
		 * Here is a possibly good place to attempt to do
		 * either combinatorial reconstruction or error correction
		 * based on checksums.  It also might be a good place
		 * to send out preliminary ereports before we suspend
		 * processing.
		 */
	}

	/*
	 * If there were logical child errors, they apply to us now.
	 * We defer this until now to avoid conflating logical child
	 * errors with errors that happened to the zio itself when
	 * updating vdev stats and reporting FMA events above.
	 */
	zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);

	if ((zio->io_error || zio->io_reexecute) &&
	    IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
	    !(zio->io_flags & ZIO_FLAG_IO_REWRITE))
		zio_dva_unallocate(zio, zio->io_gang_tree, bp);

	zio_gang_tree_free(&zio->io_gang_tree);

	/*
	 * Godfather I/Os should never suspend.
	 */
	if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
	    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
		zio->io_reexecute = 0;

	if (zio->io_reexecute) {
		/*
		 * This is a logical I/O that wants to reexecute.
		 *
		 * Reexecute is top-down.  When an i/o fails, if it's not
		 * the root, it simply notifies its parent and sticks around.
		 * The parent, seeing that it still has children in zio_done(),
		 * does the same.  This percolates all the way up to the root.
		 * The root i/o will reexecute or suspend the entire tree.
		 *
		 * This approach ensures that zio_reexecute() honors
		 * all the original i/o dependency relationships, e.g.
		 * parents not executing until children are ready.
		 */
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

		zio->io_gang_leader = NULL;

		mutex_enter(&zio->io_lock);
		zio->io_state[ZIO_WAIT_DONE] = 1;
		mutex_exit(&zio->io_lock);

		/*
		 * "The Godfather" I/O monitors its children but is
		 * not a true parent to them. It will track them through
		 * the pipeline but severs its ties whenever they get into
		 * trouble (e.g. suspended). This allows "The Godfather"
		 * I/O to return status without blocking.
		 */
		for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
			zio_link_t *zl = zio->io_walk_link;
			pio_next = zio_walk_parents(zio);

			if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
			    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
				zio_remove_child(pio, zio, zl);
				zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
			}
		}

		if ((pio = zio_unique_parent(zio)) != NULL) {
			/*
			 * We're not a root i/o, so there's nothing to do
			 * but notify our parent.  Don't propagate errors
			 * upward since we haven't permanently failed yet.
			 */
			ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
			zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
			zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
		} else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
			/*
			 * We'd fail again if we reexecuted now, so suspend
			 * until conditions improve (e.g. device comes online).
			 */
			zio_suspend(spa, zio);
		} else {
			/*
			 * Reexecution is potentially a huge amount of work.
			 * Hand it off to the otherwise-unused claim taskq.
			 */
			(void) taskq_dispatch(
			    spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE],
			    (task_func_t *)zio_reexecute, zio, TQ_SLEEP);
		}
		return (ZIO_PIPELINE_STOP);
	}

	ASSERT(zio->io_child_count == 0);
	ASSERT(zio->io_reexecute == 0);
	ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));

	/*
	 * Report any checksum errors, since the I/O is complete.
	 */
	while (zio->io_cksum_report != NULL) {
		zio_cksum_report_t *zcr = zio->io_cksum_report;
		zio->io_cksum_report = zcr->zcr_next;
		zcr->zcr_next = NULL;
		zcr->zcr_finish(zcr, NULL);
		zfs_ereport_free_checksum(zcr);
	}

	/*
	 * It is the responsibility of the done callback to ensure that this
	 * particular zio is no longer discoverable for adoption, and as
	 * such, cannot acquire any new parents.
	 */
	if (zio->io_done)
		zio->io_done(zio);

	mutex_enter(&zio->io_lock);
	zio->io_state[ZIO_WAIT_DONE] = 1;
	mutex_exit(&zio->io_lock);

	for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
		zio_link_t *zl = zio->io_walk_link;
		pio_next = zio_walk_parents(zio);
		zio_remove_child(pio, zio, zl);
		zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
	}

	if (zio->io_waiter != NULL) {
		mutex_enter(&zio->io_lock);
		zio->io_executor = NULL;
		cv_broadcast(&zio->io_cv);
		mutex_exit(&zio->io_lock);
	} else {
		zio_destroy(zio);
	}

	return (ZIO_PIPELINE_STOP);
}
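/*
 * Illustrative sketch (added for exposition; not part of the original
 * source) of the reexecute percolation described above, for a tree in
 * which a write fails badly enough to suspend:
 *
 *	godfather zio
 *	    root zio .......... last to see io_reexecute; suspends tree
 *	        write A ....... failed; notified root, sticks around
 *	        write B ....... succeeded; already done
 *
 * At suspend time the godfather severs its link to the suspending root
 * (zio_remove_child above), so a zio_wait() on the godfather returns
 * with status instead of hanging, while the suspended tree is parked
 * under spa_suspend_zio_root until zio_resume() reexecutes it.
 */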
/*
 * ==========================================================================
 * I/O pipeline definition
 * ==========================================================================
 */
static zio_pipe_stage_t *zio_pipeline[] = {
	NULL,
	zio_read_bp_init,
	zio_free_bp_init,
	zio_issue_async,
	zio_write_bp_init,
	zio_checksum_generate,
	zio_ddt_read_start,
	zio_ddt_read_done,
	zio_ddt_write,
	zio_ddt_free,
	zio_gang_assemble,
	zio_gang_issue,
	zio_dva_allocate,
	zio_dva_free,
	zio_dva_claim,
	zio_ready,
	zio_vdev_io_start,
	zio_vdev_io_done,
	zio_vdev_io_assess,
	zio_checksum_verify,
	zio_done
};