mirror_zfs-debian.git / module/zfs/zio.c (Imported Upstream version 0.6.5.3)
1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
24 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
25 */
26
27 #include <sys/sysmacros.h>
28 #include <sys/zfs_context.h>
29 #include <sys/fm/fs/zfs.h>
30 #include <sys/spa.h>
31 #include <sys/txg.h>
32 #include <sys/spa_impl.h>
33 #include <sys/vdev_impl.h>
34 #include <sys/zio_impl.h>
35 #include <sys/zio_compress.h>
36 #include <sys/zio_checksum.h>
37 #include <sys/dmu_objset.h>
38 #include <sys/arc.h>
39 #include <sys/ddt.h>
40 #include <sys/blkptr.h>
41 #include <sys/zfeature.h>
42
43 /*
44 * ==========================================================================
45 * I/O type descriptions
46 * ==========================================================================
47 */
48 const char *zio_type_name[ZIO_TYPES] = {
49 "z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl"
50 };
51
52 /*
53 * ==========================================================================
54 * I/O kmem caches
55 * ==========================================================================
56 */
57 kmem_cache_t *zio_cache;
58 kmem_cache_t *zio_link_cache;
59 kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
60 kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
61 int zio_delay_max = ZIO_DELAY_MAX;
62
63 #define ZIO_PIPELINE_CONTINUE 0x100
64 #define ZIO_PIPELINE_STOP 0x101
65
66 /*
 67  * The following actions directly affect the spa's sync-to-convergence logic.
68 * The values below define the sync pass when we start performing the action.
69 * Care should be taken when changing these values as they directly impact
70 * spa_sync() performance. Tuning these values may introduce subtle performance
71 * pathologies and should only be done in the context of performance analysis.
72 * These tunables will eventually be removed and replaced with #defines once
73 * enough analysis has been done to determine optimal values.
74 *
75 * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
76 * regular blocks are not deferred.
77 */
78 int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
79 int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
80 int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
81
82 /*
83 * An allocating zio is one that either currently has the DVA allocate
84 * stage set or will have it later in its lifetime.
85 */
86 #define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)
87
88 int zio_requeue_io_start_cut_in_line = 1;
89
90 #ifdef ZFS_DEBUG
91 int zio_buf_debug_limit = 16384;
92 #else
93 int zio_buf_debug_limit = 0;
94 #endif
95
96 static inline void __zio_execute(zio_t *zio);
97
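/*
 * Create the kmem caches used by the ZIO subsystem: one cache for zio_t,
 * one for zio_link_t, and a pair of buffer caches (metadata and data) for
 * each supported block size up to SPA_MAXBLOCKSIZE.  Sizes that do not get
 * a cache of their own are aliased to the next larger cache by the second
 * loop below.
 */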
98 void
99 zio_init(void)
100 {
101 size_t c;
102 vmem_t *data_alloc_arena = NULL;
103
104 zio_cache = kmem_cache_create("zio_cache",
105 sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
106 zio_link_cache = kmem_cache_create("zio_link_cache",
107 sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
108
109 /*
110 * For small buffers, we want a cache for each multiple of
111 * SPA_MINBLOCKSIZE. For larger buffers, we want a cache
112 * for each quarter-power of 2.
113 */
114 for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
115 size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
116 size_t p2 = size;
117 size_t align = 0;
118 size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0;
119
120 #ifdef _ILP32
121 /*
122 * Cache size limited to 1M on 32-bit platforms until ARC
123 * buffers no longer require virtual address space.
124 */
125 if (size > zfs_max_recordsize)
126 break;
127 #endif
128
129 while (!ISP2(p2))
130 p2 &= p2 - 1;
131
132 #ifndef _KERNEL
133 /*
134 * If we are using watchpoints, put each buffer on its own page,
135 * to eliminate the performance overhead of trapping to the
136 * kernel when modifying a non-watched buffer that shares the
137 * page with a watched buffer.
138 */
139 if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
140 continue;
141 #endif
142 if (size <= 4 * SPA_MINBLOCKSIZE) {
143 align = SPA_MINBLOCKSIZE;
144 } else if (IS_P2ALIGNED(size, p2 >> 2)) {
145 align = MIN(p2 >> 2, PAGESIZE);
146 }
147
148 if (align != 0) {
149 char name[36];
150 (void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
151 zio_buf_cache[c] = kmem_cache_create(name, size,
152 align, NULL, NULL, NULL, NULL, NULL, cflags);
153
154 (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
155 zio_data_buf_cache[c] = kmem_cache_create(name, size,
156 align, NULL, NULL, NULL, NULL,
157 data_alloc_arena, cflags);
158 }
159 }
160
161 while (--c != 0) {
162 ASSERT(zio_buf_cache[c] != NULL);
163 if (zio_buf_cache[c - 1] == NULL)
164 zio_buf_cache[c - 1] = zio_buf_cache[c];
165
166 ASSERT(zio_data_buf_cache[c] != NULL);
167 if (zio_data_buf_cache[c - 1] == NULL)
168 zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
169 }
170
171 zio_inject_init();
172
173 lz4_init();
174 }
175
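/*
 * Tear down the caches created by zio_init().  Aliased slots (sizes that
 * share a cache with a larger size) are only destroyed once, and every
 * slot is cleared so a later zio_init() starts from a clean state.
 */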
176 void
177 zio_fini(void)
178 {
179 size_t c;
180 kmem_cache_t *last_cache = NULL;
181 kmem_cache_t *last_data_cache = NULL;
182
183 for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
184 #ifdef _ILP32
185 /*
186 * Cache size limited to 1M on 32-bit platforms until ARC
187 * buffers no longer require virtual address space.
188 */
189 if (((c + 1) << SPA_MINBLOCKSHIFT) > zfs_max_recordsize)
190 break;
191 #endif
192 if (zio_buf_cache[c] != last_cache) {
193 last_cache = zio_buf_cache[c];
194 kmem_cache_destroy(zio_buf_cache[c]);
195 }
196 zio_buf_cache[c] = NULL;
197
198 if (zio_data_buf_cache[c] != last_data_cache) {
199 last_data_cache = zio_data_buf_cache[c];
200 kmem_cache_destroy(zio_data_buf_cache[c]);
201 }
202 zio_data_buf_cache[c] = NULL;
203 }
204
205 kmem_cache_destroy(zio_link_cache);
206 kmem_cache_destroy(zio_cache);
207
208 zio_inject_fini();
209
210 lz4_fini();
211 }
212
213 /*
214 * ==========================================================================
215 * Allocate and free I/O buffers
216 * ==========================================================================
217 */
218
219 /*
220 * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a
221 * crashdump if the kernel panics, so use it judiciously. Obviously, it's
222 * useful to inspect ZFS metadata, but if possible, we should avoid keeping
223 * excess / transient data in-core during a crashdump.
224 */
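/*
 * Buffers must be returned with the matching free function and the same
 * size that was passed to the allocator, since the size selects the
 * underlying kmem cache.  For example (illustrative only):
 *
 *	void *buf = zio_buf_alloc(SPA_MINBLOCKSIZE);
 *	...
 *	zio_buf_free(buf, SPA_MINBLOCKSIZE);
 */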
225 void *
226 zio_buf_alloc(size_t size)
227 {
228 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
229
230 VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
231
232 return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
233 }
234
235 /*
236 * Use zio_data_buf_alloc to allocate data. The data will not appear in a
237 * crashdump if the kernel panics. This exists so that we will limit the amount
238  * of ZFS data that shows up in a kernel crashdump. (This reduces the amount
239  * of kernel heap dumped to disk when the kernel panics.)
240 */
241 void *
242 zio_data_buf_alloc(size_t size)
243 {
244 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
245
246 VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
247
248 return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
249 }
250
251 void
252 zio_buf_free(void *buf, size_t size)
253 {
254 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
255
256 VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
257
258 kmem_cache_free(zio_buf_cache[c], buf);
259 }
260
261 void
262 zio_data_buf_free(void *buf, size_t size)
263 {
264 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
265
266 VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
267
268 kmem_cache_free(zio_data_buf_cache[c], buf);
269 }
270
271 /*
272 * ==========================================================================
273 * Push and pop I/O transform buffers
274 * ==========================================================================
275 */
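/*
 * Push a transform onto the zio's transform stack.  The zio's current
 * data and size are saved in a zio_transform_t and replaced with the new
 * buffer; zio_pop_transforms() later applies the optional callback, frees
 * the pushed buffer (when bufsize != 0), and restores the original data
 * and size.
 */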
276 static void
277 zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
278 zio_transform_func_t *transform)
279 {
280 zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
281
282 zt->zt_orig_data = zio->io_data;
283 zt->zt_orig_size = zio->io_size;
284 zt->zt_bufsize = bufsize;
285 zt->zt_transform = transform;
286
287 zt->zt_next = zio->io_transform_stack;
288 zio->io_transform_stack = zt;
289
290 zio->io_data = data;
291 zio->io_size = size;
292 }
293
294 static void
295 zio_pop_transforms(zio_t *zio)
296 {
297 zio_transform_t *zt;
298
299 while ((zt = zio->io_transform_stack) != NULL) {
300 if (zt->zt_transform != NULL)
301 zt->zt_transform(zio,
302 zt->zt_orig_data, zt->zt_orig_size);
303
304 if (zt->zt_bufsize != 0)
305 zio_buf_free(zio->io_data, zt->zt_bufsize);
306
307 zio->io_data = zt->zt_orig_data;
308 zio->io_size = zt->zt_orig_size;
309 zio->io_transform_stack = zt->zt_next;
310
311 kmem_free(zt, sizeof (zio_transform_t));
312 }
313 }
314
315 /*
316 * ==========================================================================
317 * I/O transform callbacks for subblocks and decompression
318 * ==========================================================================
319 */
320 static void
321 zio_subblock(zio_t *zio, void *data, uint64_t size)
322 {
323 ASSERT(zio->io_size > size);
324
325 if (zio->io_type == ZIO_TYPE_READ)
326 bcopy(zio->io_data, data, size);
327 }
328
329 static void
330 zio_decompress(zio_t *zio, void *data, uint64_t size)
331 {
332 if (zio->io_error == 0 &&
333 zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
334 zio->io_data, data, zio->io_size, size) != 0)
335 zio->io_error = SET_ERROR(EIO);
336 }
337
338 /*
339 * ==========================================================================
340 * I/O parent/child relationships and pipeline interlocks
341 * ==========================================================================
342 */
343 /*
344  * NOTE - Callers to zio_walk_parents() and zio_walk_children() must
345  * continue calling these functions until they return NULL.
346  * Otherwise, the next caller will pick up the list walk in
347  * some indeterminate state. (Keeping the walk state in
348  * io_walk_link spares every caller from having to pass in
349  * a cookie, which would get annoying.)
350 */
351 zio_t *
352 zio_walk_parents(zio_t *cio)
353 {
354 zio_link_t *zl = cio->io_walk_link;
355 list_t *pl = &cio->io_parent_list;
356
357 zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
358 cio->io_walk_link = zl;
359
360 if (zl == NULL)
361 return (NULL);
362
363 ASSERT(zl->zl_child == cio);
364 return (zl->zl_parent);
365 }
366
367 zio_t *
368 zio_walk_children(zio_t *pio)
369 {
370 zio_link_t *zl = pio->io_walk_link;
371 list_t *cl = &pio->io_child_list;
372
373 zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
374 pio->io_walk_link = zl;
375
376 if (zl == NULL)
377 return (NULL);
378
379 ASSERT(zl->zl_parent == pio);
380 return (zl->zl_child);
381 }
382
383 zio_t *
384 zio_unique_parent(zio_t *cio)
385 {
386 zio_t *pio = zio_walk_parents(cio);
387
388 VERIFY(zio_walk_parents(cio) == NULL);
389 return (pio);
390 }
391
392 void
393 zio_add_child(zio_t *pio, zio_t *cio)
394 {
395 zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
396 int w;
397
398 /*
399 * Logical I/Os can have logical, gang, or vdev children.
400 * Gang I/Os can have gang or vdev children.
401 * Vdev I/Os can only have vdev children.
402 * The following ASSERT captures all of these constraints.
403 */
404 ASSERT(cio->io_child_type <= pio->io_child_type);
405
406 zl->zl_parent = pio;
407 zl->zl_child = cio;
408
409 mutex_enter(&cio->io_lock);
410 mutex_enter(&pio->io_lock);
411
412 ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
413
414 for (w = 0; w < ZIO_WAIT_TYPES; w++)
415 pio->io_children[cio->io_child_type][w] += !cio->io_state[w];
416
417 list_insert_head(&pio->io_child_list, zl);
418 list_insert_head(&cio->io_parent_list, zl);
419
420 pio->io_child_count++;
421 cio->io_parent_count++;
422
423 mutex_exit(&pio->io_lock);
424 mutex_exit(&cio->io_lock);
425 }
426
427 static void
428 zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
429 {
430 ASSERT(zl->zl_parent == pio);
431 ASSERT(zl->zl_child == cio);
432
433 mutex_enter(&cio->io_lock);
434 mutex_enter(&pio->io_lock);
435
436 list_remove(&pio->io_child_list, zl);
437 list_remove(&cio->io_parent_list, zl);
438
439 pio->io_child_count--;
440 cio->io_parent_count--;
441
442 mutex_exit(&pio->io_lock);
443 mutex_exit(&cio->io_lock);
444
445 kmem_cache_free(zio_link_cache, zl);
446 }
447
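/*
 * Check whether this zio still has outstanding children of the given type
 * for the given wait condition.  If so, rewind io_stage by one so the
 * current stage is retried, record the counter in io_stall, and return
 * B_TRUE so the pipeline stops until the last child calls
 * zio_notify_parent().
 */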
448 static boolean_t
449 zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
450 {
451 uint64_t *countp = &zio->io_children[child][wait];
452 boolean_t waiting = B_FALSE;
453
454 mutex_enter(&zio->io_lock);
455 ASSERT(zio->io_stall == NULL);
456 if (*countp != 0) {
457 zio->io_stage >>= 1;
458 zio->io_stall = countp;
459 waiting = B_TRUE;
460 }
461 mutex_exit(&zio->io_lock);
462
463 return (waiting);
464 }
465
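/*
 * Called as a child completes a wait stage: propagate the child's error
 * and reexecute flags to the parent, decrement the parent's outstanding
 * child count, and if the parent was stalled on exactly this counter,
 * clear the stall and resume the parent's pipeline.
 */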
466 __attribute__((always_inline))
467 static inline void
468 zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
469 {
470 uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
471 int *errorp = &pio->io_child_error[zio->io_child_type];
472
473 mutex_enter(&pio->io_lock);
474 if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
475 *errorp = zio_worst_error(*errorp, zio->io_error);
476 pio->io_reexecute |= zio->io_reexecute;
477 ASSERT3U(*countp, >, 0);
478
479 (*countp)--;
480
481 if (*countp == 0 && pio->io_stall == countp) {
482 pio->io_stall = NULL;
483 mutex_exit(&pio->io_lock);
484 __zio_execute(pio);
485 } else {
486 mutex_exit(&pio->io_lock);
487 }
488 }
489
490 static void
491 zio_inherit_child_errors(zio_t *zio, enum zio_child c)
492 {
493 if (zio->io_child_error[c] != 0 && zio->io_error == 0)
494 zio->io_error = zio->io_child_error[c];
495 }
496
497 /*
498 * ==========================================================================
499 * Create the various types of I/O (read, write, free, etc)
500 * ==========================================================================
501 */
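/*
 * Common constructor for all zio types: allocate and zero a zio_t, set up
 * its lock, condvar, and parent/child lists, derive the child type from
 * the vdev and flags, snapshot the block pointer (if any), record the
 * original data/size/flags/stage/pipeline, and link the new zio under its
 * parent.
 */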
502 static zio_t *
503 zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
504 void *data, uint64_t size, zio_done_func_t *done, void *private,
505 zio_type_t type, zio_priority_t priority, enum zio_flag flags,
506 vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb,
507 enum zio_stage stage, enum zio_stage pipeline)
508 {
509 zio_t *zio;
510
511 ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
512 ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
513 ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
514
515 ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
516 ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
517 ASSERT(vd || stage == ZIO_STAGE_OPEN);
518
519 zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
520 bzero(zio, sizeof (zio_t));
521
522 mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
523 cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
524
525 list_create(&zio->io_parent_list, sizeof (zio_link_t),
526 offsetof(zio_link_t, zl_parent_node));
527 list_create(&zio->io_child_list, sizeof (zio_link_t),
528 offsetof(zio_link_t, zl_child_node));
529
530 if (vd != NULL)
531 zio->io_child_type = ZIO_CHILD_VDEV;
532 else if (flags & ZIO_FLAG_GANG_CHILD)
533 zio->io_child_type = ZIO_CHILD_GANG;
534 else if (flags & ZIO_FLAG_DDT_CHILD)
535 zio->io_child_type = ZIO_CHILD_DDT;
536 else
537 zio->io_child_type = ZIO_CHILD_LOGICAL;
538
539 if (bp != NULL) {
540 zio->io_bp = (blkptr_t *)bp;
541 zio->io_bp_copy = *bp;
542 zio->io_bp_orig = *bp;
543 if (type != ZIO_TYPE_WRITE ||
544 zio->io_child_type == ZIO_CHILD_DDT)
545 zio->io_bp = &zio->io_bp_copy; /* so caller can free */
546 if (zio->io_child_type == ZIO_CHILD_LOGICAL)
547 zio->io_logical = zio;
548 if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
549 pipeline |= ZIO_GANG_STAGES;
550 }
551
552 zio->io_spa = spa;
553 zio->io_txg = txg;
554 zio->io_done = done;
555 zio->io_private = private;
556 zio->io_type = type;
557 zio->io_priority = priority;
558 zio->io_vd = vd;
559 zio->io_offset = offset;
560 zio->io_orig_data = zio->io_data = data;
561 zio->io_orig_size = zio->io_size = size;
562 zio->io_orig_flags = zio->io_flags = flags;
563 zio->io_orig_stage = zio->io_stage = stage;
564 zio->io_orig_pipeline = zio->io_pipeline = pipeline;
565
566 zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
567 zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);
568
569 if (zb != NULL)
570 zio->io_bookmark = *zb;
571
572 if (pio != NULL) {
573 if (zio->io_logical == NULL)
574 zio->io_logical = pio->io_logical;
575 if (zio->io_child_type == ZIO_CHILD_GANG)
576 zio->io_gang_leader = pio->io_gang_leader;
577 zio_add_child(pio, zio);
578 }
579
580 taskq_init_ent(&zio->io_tqent);
581
582 return (zio);
583 }
584
585 static void
586 zio_destroy(zio_t *zio)
587 {
588 list_destroy(&zio->io_parent_list);
589 list_destroy(&zio->io_child_list);
590 mutex_destroy(&zio->io_lock);
591 cv_destroy(&zio->io_cv);
592 kmem_cache_free(zio_cache, zio);
593 }
594
595 zio_t *
596 zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
597 void *private, enum zio_flag flags)
598 {
599 zio_t *zio;
600
601 zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
602 ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
603 ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);
604
605 return (zio);
606 }
607
608 zio_t *
609 zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
610 {
611 return (zio_null(NULL, spa, NULL, done, private, flags));
612 }
613
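/*
 * Sanity-check the fields of a block pointer (type, checksum, compression,
 * sizes, embedded type, and each DVA's vdev and offset) and call
 * zfs_panic_recover() on anything that looks corrupt.
 */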
614 void
615 zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp)
616 {
617 int i;
618
619 if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) {
620 zfs_panic_recover("blkptr at %p has invalid TYPE %llu",
621 bp, (longlong_t)BP_GET_TYPE(bp));
622 }
623 if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS ||
624 BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) {
625 zfs_panic_recover("blkptr at %p has invalid CHECKSUM %llu",
626 bp, (longlong_t)BP_GET_CHECKSUM(bp));
627 }
628 if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS ||
629 BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) {
630 zfs_panic_recover("blkptr at %p has invalid COMPRESS %llu",
631 bp, (longlong_t)BP_GET_COMPRESS(bp));
632 }
633 if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) {
634 zfs_panic_recover("blkptr at %p has invalid LSIZE %llu",
635 bp, (longlong_t)BP_GET_LSIZE(bp));
636 }
637 if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) {
638 zfs_panic_recover("blkptr at %p has invalid PSIZE %llu",
639 bp, (longlong_t)BP_GET_PSIZE(bp));
640 }
641
642 if (BP_IS_EMBEDDED(bp)) {
643 if (BPE_GET_ETYPE(bp) > NUM_BP_EMBEDDED_TYPES) {
644 zfs_panic_recover("blkptr at %p has invalid ETYPE %llu",
645 bp, (longlong_t)BPE_GET_ETYPE(bp));
646 }
647 }
648
649 /*
650 * Pool-specific checks.
651 *
652 * Note: it would be nice to verify that the blk_birth and
653 * BP_PHYSICAL_BIRTH() are not too large. However, spa_freeze()
654 * allows the birth time of log blocks (and dmu_sync()-ed blocks
655 * that are in the log) to be arbitrarily large.
656 */
657 for (i = 0; i < BP_GET_NDVAS(bp); i++) {
658 uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]);
659 vdev_t *vd;
660 uint64_t offset, asize;
661 if (vdevid >= spa->spa_root_vdev->vdev_children) {
662 zfs_panic_recover("blkptr at %p DVA %u has invalid "
663 "VDEV %llu",
664 bp, i, (longlong_t)vdevid);
665 }
666 vd = spa->spa_root_vdev->vdev_child[vdevid];
667 if (vd == NULL) {
668 zfs_panic_recover("blkptr at %p DVA %u has invalid "
669 "VDEV %llu",
670 bp, i, (longlong_t)vdevid);
671 }
672 if (vd->vdev_ops == &vdev_hole_ops) {
673 zfs_panic_recover("blkptr at %p DVA %u has hole "
674 "VDEV %llu",
675 bp, i, (longlong_t)vdevid);
676
677 }
678 if (vd->vdev_ops == &vdev_missing_ops) {
679 /*
680 * "missing" vdevs are valid during import, but we
681 * don't have their detailed info (e.g. asize), so
682 * we can't perform any more checks on them.
683 */
684 continue;
685 }
686 offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
687 asize = DVA_GET_ASIZE(&bp->blk_dva[i]);
688 if (BP_IS_GANG(bp))
689 asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
690 if (offset + asize > vd->vdev_asize) {
691 zfs_panic_recover("blkptr at %p DVA %u has invalid "
692 "OFFSET %llu",
693 bp, i, (longlong_t)offset);
694 }
695 }
696 }
697
698 zio_t *
699 zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
700 void *data, uint64_t size, zio_done_func_t *done, void *private,
701 zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
702 {
703 zio_t *zio;
704
705 zfs_blkptr_verify(spa, bp);
706
707 zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
708 data, size, done, private,
709 ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
710 ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
711 ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);
712
713 return (zio);
714 }
715
716 zio_t *
717 zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
718 void *data, uint64_t size, const zio_prop_t *zp,
719 zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done,
720 void *private,
721 zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
722 {
723 zio_t *zio;
724
725 ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
726 zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
727 zp->zp_compress >= ZIO_COMPRESS_OFF &&
728 zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
729 DMU_OT_IS_VALID(zp->zp_type) &&
730 zp->zp_level < 32 &&
731 zp->zp_copies > 0 &&
732 zp->zp_copies <= spa_max_replication(spa));
733
734 zio = zio_create(pio, spa, txg, bp, data, size, done, private,
735 ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
736 ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
737 ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
738
739 zio->io_ready = ready;
740 zio->io_physdone = physdone;
741 zio->io_prop = *zp;
742
743 /*
744 * Data can be NULL if we are going to call zio_write_override() to
745 * provide the already-allocated BP. But we may need the data to
746 * verify a dedup hit (if requested). In this case, don't try to
747 * dedup (just take the already-allocated BP verbatim).
748 */
749 if (data == NULL && zio->io_prop.zp_dedup_verify) {
750 zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
751 }
752
753 return (zio);
754 }
755
756 zio_t *
757 zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
758 uint64_t size, zio_done_func_t *done, void *private,
759 zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb)
760 {
761 zio_t *zio;
762
763 zio = zio_create(pio, spa, txg, bp, data, size, done, private,
764 ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
765 ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
766
767 return (zio);
768 }
769
770 void
771 zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
772 {
773 ASSERT(zio->io_type == ZIO_TYPE_WRITE);
774 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
775 ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
776 ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
777
778 /*
779 * We must reset the io_prop to match the values that existed
780 * when the bp was first written by dmu_sync() keeping in mind
781 * that nopwrite and dedup are mutually exclusive.
782 */
783 zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
784 zio->io_prop.zp_nopwrite = nopwrite;
785 zio->io_prop.zp_copies = copies;
786 zio->io_bp_override = bp;
787 }
788
789 void
790 zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
791 {
792
793 /*
794 * The check for EMBEDDED is a performance optimization. We
795 * process the free here (by ignoring it) rather than
796 * putting it on the list and then processing it in zio_free_sync().
797 */
798 if (BP_IS_EMBEDDED(bp))
799 return;
800 metaslab_check_free(spa, bp);
801
802 /*
803 * Frees that are for the currently-syncing txg, are not going to be
804  * deferred, and will not need to do a read (i.e. are not GANG or
805  * DEDUP) can be processed immediately. Otherwise, put them on the
806 * in-memory list for later processing.
807 */
808 if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
809 txg != spa->spa_syncing_txg ||
810 spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) {
811 bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
812 } else {
813 VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp, 0)));
814 }
815 }
816
817 zio_t *
818 zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
819 enum zio_flag flags)
820 {
821 zio_t *zio;
822 enum zio_stage stage = ZIO_FREE_PIPELINE;
823
824 ASSERT(!BP_IS_HOLE(bp));
825 ASSERT(spa_syncing_txg(spa) == txg);
826 ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);
827
828 if (BP_IS_EMBEDDED(bp))
829 return (zio_null(pio, spa, NULL, NULL, NULL, 0));
830
831 metaslab_check_free(spa, bp);
832 arc_freed(spa, bp);
833
834 /*
835 * GANG and DEDUP blocks can induce a read (for the gang block header,
836 * or the DDT), so issue them asynchronously so that this thread is
837 * not tied up.
838 */
839 if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
840 stage |= ZIO_STAGE_ISSUE_ASYNC;
841
842 zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
843 NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags,
844 NULL, 0, NULL, ZIO_STAGE_OPEN, stage);
845
846 return (zio);
847 }
848
849 zio_t *
850 zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
851 zio_done_func_t *done, void *private, enum zio_flag flags)
852 {
853 zio_t *zio;
854
855 dprintf_bp(bp, "claiming in txg %llu", txg);
856
857 if (BP_IS_EMBEDDED(bp))
858 return (zio_null(pio, spa, NULL, NULL, NULL, 0));
859
860 /*
861 * A claim is an allocation of a specific block. Claims are needed
862 * to support immediate writes in the intent log. The issue is that
863 * immediate writes contain committed data, but in a txg that was
864 * *not* committed. Upon opening the pool after an unclean shutdown,
865 * the intent log claims all blocks that contain immediate write data
866 * so that the SPA knows they're in use.
867 *
868 * All claims *must* be resolved in the first txg -- before the SPA
869 * starts allocating blocks -- so that nothing is allocated twice.
870 * If txg == 0 we just verify that the block is claimable.
871 */
872 ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
873 ASSERT(txg == spa_first_txg(spa) || txg == 0);
874 ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */
875
876 zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
877 done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
878 NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
879
880 return (zio);
881 }
882
883 zio_t *
884 zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
885 zio_done_func_t *done, void *private, enum zio_flag flags)
886 {
887 zio_t *zio;
888 int c;
889
890 if (vd->vdev_children == 0) {
891 zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
892 ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
893 ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
894
895 zio->io_cmd = cmd;
896 } else {
897 zio = zio_null(pio, spa, NULL, NULL, NULL, flags);
898
899 for (c = 0; c < vd->vdev_children; c++)
900 zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
901 done, private, flags));
902 }
903
904 return (zio);
905 }
906
907 zio_t *
908 zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
909 void *data, int checksum, zio_done_func_t *done, void *private,
910 zio_priority_t priority, enum zio_flag flags, boolean_t labels)
911 {
912 zio_t *zio;
913
914 ASSERT(vd->vdev_children == 0);
915 ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
916 offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
917 ASSERT3U(offset + size, <=, vd->vdev_psize);
918
919 zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
920 ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
921 NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
922
923 zio->io_prop.zp_checksum = checksum;
924
925 return (zio);
926 }
927
928 zio_t *
929 zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
930 void *data, int checksum, zio_done_func_t *done, void *private,
931 zio_priority_t priority, enum zio_flag flags, boolean_t labels)
932 {
933 zio_t *zio;
934
935 ASSERT(vd->vdev_children == 0);
936 ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
937 offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
938 ASSERT3U(offset + size, <=, vd->vdev_psize);
939
940 zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
941 ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
942 NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
943
944 zio->io_prop.zp_checksum = checksum;
945
946 if (zio_checksum_table[checksum].ci_eck) {
947 /*
948 * zec checksums are necessarily destructive -- they modify
949 * the end of the write buffer to hold the verifier/checksum.
950 * Therefore, we must make a local copy in case the data is
951 * being written to multiple places in parallel.
952 */
953 void *wbuf = zio_buf_alloc(size);
954 bcopy(data, wbuf, size);
955 zio_push_transform(zio, wbuf, size, size, NULL);
956 }
957
958 return (zio);
959 }
960
961 /*
962 * Create a child I/O to do some work for us.
963 */
964 zio_t *
965 zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
966 void *data, uint64_t size, int type, zio_priority_t priority,
967 enum zio_flag flags, zio_done_func_t *done, void *private)
968 {
969 enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
970 zio_t *zio;
971
972 ASSERT(vd->vdev_parent ==
973 (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));
974
975 if (type == ZIO_TYPE_READ && bp != NULL) {
976 /*
977 * If we have the bp, then the child should perform the
978 * checksum and the parent need not. This pushes error
979 * detection as close to the leaves as possible and
980 * eliminates redundant checksums in the interior nodes.
981 */
982 pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
983 pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
984 }
985
986 if (vd->vdev_children == 0)
987 offset += VDEV_LABEL_START_SIZE;
988
989 flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;
990
991 /*
992 * If we've decided to do a repair, the write is not speculative --
993 * even if the original read was.
994 */
995 if (flags & ZIO_FLAG_IO_REPAIR)
996 flags &= ~ZIO_FLAG_SPECULATIVE;
997
998 zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
999 done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
1000 ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
1001
1002 zio->io_physdone = pio->io_physdone;
1003 if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
1004 zio->io_logical->io_phys_children++;
1005
1006 return (zio);
1007 }
1008
1009 zio_t *
1010 zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
1011 int type, zio_priority_t priority, enum zio_flag flags,
1012 zio_done_func_t *done, void *private)
1013 {
1014 zio_t *zio;
1015
1016 ASSERT(vd->vdev_ops->vdev_op_leaf);
1017
1018 zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
1019 data, size, done, private, type, priority,
1020 flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
1021 vd, offset, NULL,
1022 ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
1023
1024 return (zio);
1025 }
1026
1027 void
1028 zio_flush(zio_t *zio, vdev_t *vd)
1029 {
1030 zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
1031 NULL, NULL,
1032 ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
1033 }
1034
1035 void
1036 zio_shrink(zio_t *zio, uint64_t size)
1037 {
1038 ASSERT(zio->io_executor == NULL);
1039 ASSERT(zio->io_orig_size == zio->io_size);
1040 ASSERT(size <= zio->io_size);
1041
1042 /*
1043 * We don't shrink for raidz because of problems with the
1044 * reconstruction when reading back less than the block size.
1045 * Note, BP_IS_RAIDZ() assumes no compression.
1046 */
1047 ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
1048 if (!BP_IS_RAIDZ(zio->io_bp))
1049 zio->io_orig_size = zio->io_size = size;
1050 }
1051
1052 /*
1053 * ==========================================================================
1054 * Prepare to read and write logical blocks
1055 * ==========================================================================
1056 */
1057
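/*
 * First pipeline stage for logical reads: push a decompression transform
 * when the block is compressed and not being read raw, decode embedded
 * data block pointers in place, mark blocks that should bypass the ARC,
 * and route dedup'd blocks to the DDT read pipeline.
 */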
1058 static int
1059 zio_read_bp_init(zio_t *zio)
1060 {
1061 blkptr_t *bp = zio->io_bp;
1062
1063 if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
1064 zio->io_child_type == ZIO_CHILD_LOGICAL &&
1065 !(zio->io_flags & ZIO_FLAG_RAW)) {
1066 uint64_t psize =
1067 BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
1068 void *cbuf = zio_buf_alloc(psize);
1069
1070 zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
1071 }
1072
1073 if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
1074 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1075 decode_embedded_bp_compressed(bp, zio->io_data);
1076 } else {
1077 ASSERT(!BP_IS_EMBEDDED(bp));
1078 }
1079
1080 if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
1081 zio->io_flags |= ZIO_FLAG_DONT_CACHE;
1082
1083 if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
1084 zio->io_flags |= ZIO_FLAG_DONT_CACHE;
1085
1086 if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
1087 zio->io_pipeline = ZIO_DDT_READ_PIPELINE;
1088
1089 return (ZIO_PIPELINE_CONTINUE);
1090 }
1091
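/*
 * First pipeline stage for logical writes: wait for children to be ready,
 * honor any io_bp_override (including nopwrite and dedup overrides),
 * compress the data (possibly into an embedded block pointer), decide
 * between rewriting the existing block and allocating a new one based on
 * the current sync pass, and fill in the block pointer fields.
 */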
1092 static int
1093 zio_write_bp_init(zio_t *zio)
1094 {
1095 spa_t *spa = zio->io_spa;
1096 zio_prop_t *zp = &zio->io_prop;
1097 enum zio_compress compress = zp->zp_compress;
1098 blkptr_t *bp = zio->io_bp;
1099 uint64_t lsize = zio->io_size;
1100 uint64_t psize = lsize;
1101 int pass = 1;
1102
1103 /*
1104 * If our children haven't all reached the ready stage,
1105 * wait for them and then repeat this pipeline stage.
1106 */
1107 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
1108 zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
1109 return (ZIO_PIPELINE_STOP);
1110
1111 if (!IO_IS_ALLOCATING(zio))
1112 return (ZIO_PIPELINE_CONTINUE);
1113
1114 ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
1115
1116 if (zio->io_bp_override) {
1117 ASSERT(bp->blk_birth != zio->io_txg);
1118 ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
1119
1120 *bp = *zio->io_bp_override;
1121 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1122
1123 if (BP_IS_EMBEDDED(bp))
1124 return (ZIO_PIPELINE_CONTINUE);
1125
1126 /*
1127  * If we've been overridden and nopwrite is set, then
1128 * set the flag accordingly to indicate that a nopwrite
1129 * has already occurred.
1130 */
1131 if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
1132 ASSERT(!zp->zp_dedup);
1133 zio->io_flags |= ZIO_FLAG_NOPWRITE;
1134 return (ZIO_PIPELINE_CONTINUE);
1135 }
1136
1137 ASSERT(!zp->zp_nopwrite);
1138
1139 if (BP_IS_HOLE(bp) || !zp->zp_dedup)
1140 return (ZIO_PIPELINE_CONTINUE);
1141
1142 ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup ||
1143 zp->zp_dedup_verify);
1144
1145 if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
1146 BP_SET_DEDUP(bp, 1);
1147 zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
1148 return (ZIO_PIPELINE_CONTINUE);
1149 }
1150 }
1151
1152 if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
1153 /*
1154 * We're rewriting an existing block, which means we're
1155 * working on behalf of spa_sync(). For spa_sync() to
1156 * converge, it must eventually be the case that we don't
1157 * have to allocate new blocks. But compression changes
1158 * the blocksize, which forces a reallocate, and makes
1159 * convergence take longer. Therefore, after the first
1160 * few passes, stop compressing to ensure convergence.
1161 */
1162 pass = spa_sync_pass(spa);
1163
1164 ASSERT(zio->io_txg == spa_syncing_txg(spa));
1165 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1166 ASSERT(!BP_GET_DEDUP(bp));
1167
1168 if (pass >= zfs_sync_pass_dont_compress)
1169 compress = ZIO_COMPRESS_OFF;
1170
1171 /* Make sure someone doesn't change their mind on overwrites */
1172 ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp),
1173 spa_max_replication(spa)) == BP_GET_NDVAS(bp));
1174 }
1175
1176 if (compress != ZIO_COMPRESS_OFF) {
1177 void *cbuf = zio_buf_alloc(lsize);
1178 psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
1179 if (psize == 0 || psize == lsize) {
1180 compress = ZIO_COMPRESS_OFF;
1181 zio_buf_free(cbuf, lsize);
1182 } else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE &&
1183 zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
1184 spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
1185 encode_embedded_bp_compressed(bp,
1186 cbuf, compress, lsize, psize);
1187 BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
1188 BP_SET_TYPE(bp, zio->io_prop.zp_type);
1189 BP_SET_LEVEL(bp, zio->io_prop.zp_level);
1190 zio_buf_free(cbuf, lsize);
1191 bp->blk_birth = zio->io_txg;
1192 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1193 ASSERT(spa_feature_is_active(spa,
1194 SPA_FEATURE_EMBEDDED_DATA));
1195 return (ZIO_PIPELINE_CONTINUE);
1196 } else {
1197 /*
1198  * Round up the compressed size to the ashift
1199  * of the smallest-ashift device, and zero the tail.
1200  * This ensures that the compressed size of the BP
1201  * (and thus the compressratio property) is correct,
1202 * in that we charge for the padding used to fill out
1203 * the last sector.
1204 */
1205 size_t rounded;
1206
1207 ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
1208
1209 rounded = (size_t)P2ROUNDUP(psize,
1210 1ULL << spa->spa_min_ashift);
1211 if (rounded >= lsize) {
1212 compress = ZIO_COMPRESS_OFF;
1213 zio_buf_free(cbuf, lsize);
1214 psize = lsize;
1215 } else {
1216 bzero((char *)cbuf + psize, rounded - psize);
1217 psize = rounded;
1218 zio_push_transform(zio, cbuf,
1219 psize, lsize, NULL);
1220 }
1221 }
1222 }
1223
1224 /*
1225 * The final pass of spa_sync() must be all rewrites, but the first
1226 * few passes offer a trade-off: allocating blocks defers convergence,
1227 * but newly allocated blocks are sequential, so they can be written
1228 * to disk faster. Therefore, we allow the first few passes of
1229 * spa_sync() to allocate new blocks, but force rewrites after that.
1230 * There should only be a handful of blocks after pass 1 in any case.
1231 */
1232 if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
1233 BP_GET_PSIZE(bp) == psize &&
1234 pass >= zfs_sync_pass_rewrite) {
1235 enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
1236 ASSERT(psize != 0);
1237 zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
1238 zio->io_flags |= ZIO_FLAG_IO_REWRITE;
1239 } else {
1240 BP_ZERO(bp);
1241 zio->io_pipeline = ZIO_WRITE_PIPELINE;
1242 }
1243
1244 if (psize == 0) {
1245 if (zio->io_bp_orig.blk_birth != 0 &&
1246 spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
1247 BP_SET_LSIZE(bp, lsize);
1248 BP_SET_TYPE(bp, zp->zp_type);
1249 BP_SET_LEVEL(bp, zp->zp_level);
1250 BP_SET_BIRTH(bp, zio->io_txg, 0);
1251 }
1252 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1253 } else {
1254 ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
1255 BP_SET_LSIZE(bp, lsize);
1256 BP_SET_TYPE(bp, zp->zp_type);
1257 BP_SET_LEVEL(bp, zp->zp_level);
1258 BP_SET_PSIZE(bp, psize);
1259 BP_SET_COMPRESS(bp, compress);
1260 BP_SET_CHECKSUM(bp, zp->zp_checksum);
1261 BP_SET_DEDUP(bp, zp->zp_dedup);
1262 BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
1263 if (zp->zp_dedup) {
1264 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1265 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
1266 zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
1267 }
1268 if (zp->zp_nopwrite) {
1269 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1270 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
1271 zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
1272 }
1273 }
1274
1275 return (ZIO_PIPELINE_CONTINUE);
1276 }
1277
1278 static int
1279 zio_free_bp_init(zio_t *zio)
1280 {
1281 blkptr_t *bp = zio->io_bp;
1282
1283 if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
1284 if (BP_GET_DEDUP(bp))
1285 zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
1286 }
1287
1288 return (ZIO_PIPELINE_CONTINUE);
1289 }
1290
1291 /*
1292 * ==========================================================================
1293 * Execute the I/O pipeline
1294 * ==========================================================================
1295 */
1296
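/*
 * Hand the zio off to one of the spa's taskqs.  Config writers, probes,
 * and L2ARC writes are routed to the otherwise-idle ZIO_TYPE_NULL taskqs
 * to avoid deadlocking on the config lock, and ZIO_PRIORITY_NOW I/Os are
 * bumped to the high-priority taskq when one exists.
 */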
1297 static void
1298 zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
1299 {
1300 spa_t *spa = zio->io_spa;
1301 zio_type_t t = zio->io_type;
1302 int flags = (cutinline ? TQ_FRONT : 0);
1303
1304 /*
1305 * If we're a config writer or a probe, the normal issue and
1306 * interrupt threads may all be blocked waiting for the config lock.
1307 * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
1308 */
1309 if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
1310 t = ZIO_TYPE_NULL;
1311
1312 /*
1313 * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
1314 */
1315 if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
1316 t = ZIO_TYPE_NULL;
1317
1318 /*
1319 * If this is a high priority I/O, then use the high priority taskq if
1320 * available.
1321 */
1322 if (zio->io_priority == ZIO_PRIORITY_NOW &&
1323 spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
1324 q++;
1325
1326 ASSERT3U(q, <, ZIO_TASKQ_TYPES);
1327
1328 /*
1329 * NB: We are assuming that the zio can only be dispatched
1330 * to a single taskq at a time. It would be a grievous error
1331 * to dispatch the zio to another taskq at the same time.
1332 */
1333 ASSERT(taskq_empty_ent(&zio->io_tqent));
1334 spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
1335 flags, &zio->io_tqent);
1336 }
1337
1338 static boolean_t
1339 zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
1340 {
1341 kthread_t *executor = zio->io_executor;
1342 spa_t *spa = zio->io_spa;
1343 zio_type_t t;
1344
1345 for (t = 0; t < ZIO_TYPES; t++) {
1346 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
1347 uint_t i;
1348 for (i = 0; i < tqs->stqs_count; i++) {
1349 if (taskq_member(tqs->stqs_taskq[i], executor))
1350 return (B_TRUE);
1351 }
1352 }
1353
1354 return (B_FALSE);
1355 }
1356
1357 static int
1358 zio_issue_async(zio_t *zio)
1359 {
1360 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
1361
1362 return (ZIO_PIPELINE_STOP);
1363 }
1364
1365 void
1366 zio_interrupt(zio_t *zio)
1367 {
1368 zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
1369 }
1370
1371 /*
1372 * Execute the I/O pipeline until one of the following occurs:
1373 * (1) the I/O completes; (2) the pipeline stalls waiting for
1374 * dependent child I/Os; (3) the I/O issues, so we're waiting
1375 * for an I/O completion interrupt; (4) the I/O is delegated by
1376 * vdev-level caching or aggregation; (5) the I/O is deferred
1377 * due to vdev-level queueing; (6) the I/O is handed off to
1378 * another thread. In all cases, the pipeline stops whenever
1379 * there's no CPU work; it never burns a thread in cv_wait_io().
1380 *
1381 * There's no locking on io_stage because there's no legitimate way
1382 * for multiple threads to be attempting to process the same I/O.
1383 */
1384 static zio_pipe_stage_t *zio_pipeline[];
1385
1386 /*
1387 * zio_execute() is a wrapper around the static function
1388 * __zio_execute() so that we can force __zio_execute() to be
1389 * inlined. This reduces stack overhead which is important
1390 * because __zio_execute() is called recursively in several zio
1391 * code paths. zio_execute() itself cannot be inlined because
1392 * it is externally visible.
1393 */
1394 void
1395 zio_execute(zio_t *zio)
1396 {
1397 fstrans_cookie_t cookie;
1398
1399 cookie = spl_fstrans_mark();
1400 __zio_execute(zio);
1401 spl_fstrans_unmark(cookie);
1402 }
1403
1404 __attribute__((always_inline))
1405 static inline void
1406 __zio_execute(zio_t *zio)
1407 {
1408 zio->io_executor = curthread;
1409
1410 while (zio->io_stage < ZIO_STAGE_DONE) {
1411 enum zio_stage pipeline = zio->io_pipeline;
1412 enum zio_stage stage = zio->io_stage;
1413 dsl_pool_t *dp;
1414 boolean_t cut;
1415 int rv;
1416
1417 ASSERT(!MUTEX_HELD(&zio->io_lock));
1418 ASSERT(ISP2(stage));
1419 ASSERT(zio->io_stall == NULL);
1420
1421 do {
1422 stage <<= 1;
1423 } while ((stage & pipeline) == 0);
1424
1425 ASSERT(stage <= ZIO_STAGE_DONE);
1426
1427 dp = spa_get_dsl(zio->io_spa);
1428 cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
1429 zio_requeue_io_start_cut_in_line : B_FALSE;
1430
1431 /*
1432 * If we are in interrupt context and this pipeline stage
1433 * will grab a config lock that is held across I/O,
1434 * or may wait for an I/O that needs an interrupt thread
1435 * to complete, issue async to avoid deadlock.
1436 *
1437 * For VDEV_IO_START, we cut in line so that the io will
1438 * be sent to disk promptly.
1439 */
1440 if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
1441 zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
1442 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
1443 return;
1444 }
1445
1446 /*
1447  * If we are executing in the context of the tx_sync_thread,
1448  * or we are performing pool initialization outside of a
1449  * zio_taskq[ZIO_TASKQ_ISSUE|ZIO_TASKQ_ISSUE_HIGH] context,
1450  * then issue the zio asynchronously to minimize stack usage
1451 * for these deep call paths.
1452 */
1453 if ((dp && curthread == dp->dp_tx.tx_sync_thread) ||
1454 (dp && spa_is_initializing(dp->dp_spa) &&
1455 !zio_taskq_member(zio, ZIO_TASKQ_ISSUE) &&
1456 !zio_taskq_member(zio, ZIO_TASKQ_ISSUE_HIGH))) {
1457 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
1458 return;
1459 }
1460
1461 zio->io_stage = stage;
1462 rv = zio_pipeline[highbit64(stage) - 1](zio);
1463
1464 if (rv == ZIO_PIPELINE_STOP)
1465 return;
1466
1467 ASSERT(rv == ZIO_PIPELINE_CONTINUE);
1468 }
1469 }
1470
1471
1472 /*
1473 * ==========================================================================
1474 * Initiate I/O, either sync or async
1475 * ==========================================================================
1476 */
1477 int
1478 zio_wait(zio_t *zio)
1479 {
1480 int error;
1481
1482 ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
1483 ASSERT(zio->io_executor == NULL);
1484
1485 zio->io_waiter = curthread;
1486
1487 __zio_execute(zio);
1488
1489 mutex_enter(&zio->io_lock);
1490 while (zio->io_executor != NULL)
1491 cv_wait_io(&zio->io_cv, &zio->io_lock);
1492 mutex_exit(&zio->io_lock);
1493
1494 error = zio->io_error;
1495 zio_destroy(zio);
1496
1497 return (error);
1498 }
1499
1500 void
1501 zio_nowait(zio_t *zio)
1502 {
1503 ASSERT(zio->io_executor == NULL);
1504
1505 if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
1506 zio_unique_parent(zio) == NULL) {
1507 zio_t *pio;
1508
1509 /*
1510 * This is a logical async I/O with no parent to wait for it.
1511  * We add it to the spa_async_zio_root "Godfather" I/O, which
1512  * will ensure it completes prior to unloading the pool.
1513 */
1514 spa_t *spa = zio->io_spa;
1515 kpreempt_disable();
1516 pio = spa->spa_async_zio_root[CPU_SEQID];
1517 kpreempt_enable();
1518
1519 zio_add_child(pio, zio);
1520 }
1521
1522 __zio_execute(zio);
1523 }
1524
1525 /*
1526 * ==========================================================================
1527 * Reexecute or suspend/resume failed I/O
1528 * ==========================================================================
1529 */
1530
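/*
 * Restore a failed zio (and, recursively, its children) to its original
 * flags, stage, and pipeline, clear its errors and wait state, and run it
 * through the pipeline again.  A "Godfather" zio is not re-executed here;
 * its caller is responsible for waiting on it.
 */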
1531 static void
1532 zio_reexecute(zio_t *pio)
1533 {
1534 zio_t *cio, *cio_next;
1535 int c, w;
1536
1537 ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
1538 ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
1539 ASSERT(pio->io_gang_leader == NULL);
1540 ASSERT(pio->io_gang_tree == NULL);
1541
1542 pio->io_flags = pio->io_orig_flags;
1543 pio->io_stage = pio->io_orig_stage;
1544 pio->io_pipeline = pio->io_orig_pipeline;
1545 pio->io_reexecute = 0;
1546 pio->io_flags |= ZIO_FLAG_REEXECUTED;
1547 pio->io_error = 0;
1548 for (w = 0; w < ZIO_WAIT_TYPES; w++)
1549 pio->io_state[w] = 0;
1550 for (c = 0; c < ZIO_CHILD_TYPES; c++)
1551 pio->io_child_error[c] = 0;
1552
1553 if (IO_IS_ALLOCATING(pio))
1554 BP_ZERO(pio->io_bp);
1555
1556 /*
1557 * As we reexecute pio's children, new children could be created.
1558 * New children go to the head of pio's io_child_list, however,
1559 * so we will (correctly) not reexecute them. The key is that
1560 * the remainder of pio's io_child_list, from 'cio_next' onward,
1561 * cannot be affected by any side effects of reexecuting 'cio'.
1562 */
1563 for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
1564 cio_next = zio_walk_children(pio);
1565 mutex_enter(&pio->io_lock);
1566 for (w = 0; w < ZIO_WAIT_TYPES; w++)
1567 pio->io_children[cio->io_child_type][w]++;
1568 mutex_exit(&pio->io_lock);
1569 zio_reexecute(cio);
1570 }
1571
1572 /*
1573 * Now that all children have been reexecuted, execute the parent.
1574 * We don't reexecute "The Godfather" I/O here as it's the
1575 * responsibility of the caller to wait on him.
1576 */
1577 if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
1578 __zio_execute(pio);
1579 }
1580
1581 void
1582 zio_suspend(spa_t *spa, zio_t *zio)
1583 {
1584 if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
1585 fm_panic("Pool '%s' has encountered an uncorrectable I/O "
1586 "failure and the failure mode property for this pool "
1587 "is set to panic.", spa_name(spa));
1588
1589 cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O "
1590 "failure and has been suspended.\n", spa_name(spa));
1591
1592 zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);
1593
1594 mutex_enter(&spa->spa_suspend_lock);
1595
1596 if (spa->spa_suspend_zio_root == NULL)
1597 spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
1598 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
1599 ZIO_FLAG_GODFATHER);
1600
1601 spa->spa_suspended = B_TRUE;
1602
1603 if (zio != NULL) {
1604 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
1605 ASSERT(zio != spa->spa_suspend_zio_root);
1606 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1607 ASSERT(zio_unique_parent(zio) == NULL);
1608 ASSERT(zio->io_stage == ZIO_STAGE_DONE);
1609 zio_add_child(spa->spa_suspend_zio_root, zio);
1610 }
1611
1612 mutex_exit(&spa->spa_suspend_lock);
1613 }
1614
1615 int
1616 zio_resume(spa_t *spa)
1617 {
1618 zio_t *pio;
1619
1620 /*
1621 * Reexecute all previously suspended i/o.
1622 */
1623 mutex_enter(&spa->spa_suspend_lock);
1624 spa->spa_suspended = B_FALSE;
1625 cv_broadcast(&spa->spa_suspend_cv);
1626 pio = spa->spa_suspend_zio_root;
1627 spa->spa_suspend_zio_root = NULL;
1628 mutex_exit(&spa->spa_suspend_lock);
1629
1630 if (pio == NULL)
1631 return (0);
1632
1633 zio_reexecute(pio);
1634 return (zio_wait(pio));
1635 }
1636
1637 void
1638 zio_resume_wait(spa_t *spa)
1639 {
1640 mutex_enter(&spa->spa_suspend_lock);
1641 while (spa_suspended(spa))
1642 cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
1643 mutex_exit(&spa->spa_suspend_lock);
1644 }
1645
1646 /*
1647 * ==========================================================================
1648 * Gang blocks.
1649 *
1650 * A gang block is a collection of small blocks that looks to the DMU
1651 * like one large block. When zio_dva_allocate() cannot find a block
1652 * of the requested size, due to either severe fragmentation or the pool
1653 * being nearly full, it calls zio_write_gang_block() to construct the
1654 * block from smaller fragments.
1655 *
1656 * A gang block consists of a gang header (zio_gbh_phys_t) and up to
1657 * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like
1658 * an indirect block: it's an array of block pointers. It consumes
1659 * only one sector and hence is allocatable regardless of fragmentation.
1660 * The gang header's bps point to its gang members, which hold the data.
1661 *
1662 * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
1663 * as the verifier to ensure uniqueness of the SHA256 checksum.
1664 * Critically, the gang block bp's blk_cksum is the checksum of the data,
1665 * not the gang header. This ensures that data block signatures (needed for
1666 * deduplication) are independent of how the block is physically stored.
1667 *
1668 * Gang blocks can be nested: a gang member may itself be a gang block.
1669 * Thus every gang block is a tree in which root and all interior nodes are
1670 * gang headers, and the leaves are normal blocks that contain user data.
1671 * The root of the gang tree is called the gang leader.
1672 *
1673 * To perform any operation (read, rewrite, free, claim) on a gang block,
1674 * zio_gang_assemble() first assembles the gang tree (minus data leaves)
1675 * in the io_gang_tree field of the original logical i/o by recursively
1676 * reading the gang leader and all gang headers below it. This yields
1677 * an in-core tree containing the contents of every gang header and the
1678 * bps for every constituent of the gang block.
1679 *
1680 * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
1681 * and invokes a callback on each bp. To free a gang block, zio_gang_issue()
1682 * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
1683 * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
1684 * zio_read_gang() is a wrapper around zio_read() that omits reading gang
1685 * headers, since we already have those in io_gang_tree. zio_rewrite_gang()
1686 * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
1687 * of the gang header plus zio_checksum_compute() of the data to update the
1688 * gang header's blk_cksum as described above.
1689 *
1690 * The two-phase assemble/issue model solves the problem of partial failure --
1691 * what if you'd freed part of a gang block but then couldn't read the
1692 * gang header for another part? Assembling the entire gang tree first
1693 * ensures that all the necessary gang header I/O has succeeded before
1694 * starting the actual work of free, claim, or write. Once the gang tree
1695 * is assembled, free and claim are in-memory operations that cannot fail.
1696 *
1697 * In the event that a gang write fails, zio_dva_unallocate() walks the
1698 * gang tree to immediately free (i.e. insert back into the space map)
1699 * everything we've allocated. This ensures that we don't get ENOSPC
1700 * errors during repeated suspend/resume cycles due to a flaky device.
1701 *
1702 * Gang rewrites only happen during sync-to-convergence. If we can't assemble
1703 * the gang tree, we won't modify the block, so we can safely defer the free
1704 * (knowing that the block is still intact). If we *can* assemble the gang
1705 * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
1706 * each constituent bp and we can allocate a new block on the next sync pass.
1707 *
1708 * In all cases, the gang tree allows complete recovery from partial failure.
1709 * ==========================================================================
1710 */
1711
1712 static zio_t *
1713 zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1714 {
1715 if (gn != NULL)
1716 return (pio);
1717
1718 return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
1719 NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
1720 &pio->io_bookmark));
1721 }
1722
1723 zio_t *
1724 zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1725 {
1726 zio_t *zio;
1727
1728 if (gn != NULL) {
1729 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
1730 gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
1731 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1732 /*
1733 * As we rewrite each gang header, the pipeline will compute
1734 * a new gang block header checksum for it; but no one will
1735 * compute a new data checksum, so we do that here. The one
1736 * exception is the gang leader: the pipeline already computed
1737 * its data checksum because that stage precedes gang assembly.
1738 * (Presently, nothing actually uses interior data checksums;
1739 * this is just good hygiene.)
1740 */
1741 if (gn != pio->io_gang_leader->io_gang_tree) {
1742 zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
1743 data, BP_GET_PSIZE(bp));
1744 }
1745 /*
1746 * If we are here to damage data for testing purposes,
1747 * leave the GBH alone so that we can detect the damage.
1748 */
1749 if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
1750 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
1751 } else {
1752 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
1753 data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
1754 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1755 }
1756
1757 return (zio);
1758 }
1759
1760 /* ARGSUSED */
1761 zio_t *
1762 zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1763 {
1764 return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
1765 ZIO_GANG_CHILD_FLAGS(pio)));
1766 }
1767
1768 /* ARGSUSED */
1769 zio_t *
1770 zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1771 {
1772 return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
1773 NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
1774 }
1775
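/*
 * The table below is indexed by zio->io_type (see the dispatch in
 * zio_gang_tree_issue() below): null, read, write, free, claim, ioctl.
 * The NULL entries correspond to I/O types (null and ioctl) that never
 * operate on gang block data.
 */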
1776 static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
1777 NULL,
1778 zio_read_gang,
1779 zio_rewrite_gang,
1780 zio_free_gang,
1781 zio_claim_gang,
1782 NULL
1783 };
1784
1785 static void zio_gang_tree_assemble_done(zio_t *zio);
1786
1787 static zio_gang_node_t *
1788 zio_gang_node_alloc(zio_gang_node_t **gnpp)
1789 {
1790 zio_gang_node_t *gn;
1791
1792 ASSERT(*gnpp == NULL);
1793
1794 gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
1795 gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
1796 *gnpp = gn;
1797
1798 return (gn);
1799 }
1800
1801 static void
1802 zio_gang_node_free(zio_gang_node_t **gnpp)
1803 {
1804 zio_gang_node_t *gn = *gnpp;
1805 int g;
1806
1807 for (g = 0; g < SPA_GBH_NBLKPTRS; g++)
1808 ASSERT(gn->gn_child[g] == NULL);
1809
1810 zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
1811 kmem_free(gn, sizeof (*gn));
1812 *gnpp = NULL;
1813 }
1814
1815 static void
1816 zio_gang_tree_free(zio_gang_node_t **gnpp)
1817 {
1818 zio_gang_node_t *gn = *gnpp;
1819 int g;
1820
1821 if (gn == NULL)
1822 return;
1823
1824 for (g = 0; g < SPA_GBH_NBLKPTRS; g++)
1825 zio_gang_tree_free(&gn->gn_child[g]);
1826
1827 zio_gang_node_free(gnpp);
1828 }
1829
1830 static void
1831 zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
1832 {
1833 zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
1834
1835 ASSERT(gio->io_gang_leader == gio);
1836 ASSERT(BP_IS_GANG(bp));
1837
1838 zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh,
1839 SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
1840 gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
1841 }
1842
1843 static void
1844 zio_gang_tree_assemble_done(zio_t *zio)
1845 {
1846 zio_t *gio = zio->io_gang_leader;
1847 zio_gang_node_t *gn = zio->io_private;
1848 blkptr_t *bp = zio->io_bp;
1849 int g;
1850
1851 ASSERT(gio == zio_unique_parent(zio));
1852 ASSERT(zio->io_child_count == 0);
1853
1854 if (zio->io_error)
1855 return;
1856
1857 if (BP_SHOULD_BYTESWAP(bp))
1858 byteswap_uint64_array(zio->io_data, zio->io_size);
1859
1860 ASSERT(zio->io_data == gn->gn_gbh);
1861 ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
1862 ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
1863
1864 for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
1865 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
1866 if (!BP_IS_GANG(gbp))
1867 continue;
1868 zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
1869 }
1870 }
1871
1872 static void
1873 zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
1874 {
1875 zio_t *gio = pio->io_gang_leader;
1876 zio_t *zio;
1877 int g;
1878
1879 ASSERT(BP_IS_GANG(bp) == !!gn);
1880 ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
1881 ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);
1882
1883 /*
1884 * If you're a gang header, your data is in gn->gn_gbh.
1885 * If you're a gang member, your data is in 'data' and gn == NULL.
1886 */
1887 zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);
1888
1889 if (gn != NULL) {
1890 ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
1891
1892 for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
1893 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
1894 if (BP_IS_HOLE(gbp))
1895 continue;
1896 zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
1897 data = (char *)data + BP_GET_PSIZE(gbp);
1898 }
1899 }
1900
1901 if (gn == gio->io_gang_tree)
1902 ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);
1903
1904 if (zio != pio)
1905 zio_nowait(zio);
1906 }
1907
1908 static int
1909 zio_gang_assemble(zio_t *zio)
1910 {
1911 blkptr_t *bp = zio->io_bp;
1912
1913 ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
1914 ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
1915
1916 zio->io_gang_leader = zio;
1917
1918 zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);
1919
1920 return (ZIO_PIPELINE_CONTINUE);
1921 }
1922
1923 static int
1924 zio_gang_issue(zio_t *zio)
1925 {
1926 blkptr_t *bp = zio->io_bp;
1927
1928 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
1929 return (ZIO_PIPELINE_STOP);
1930
1931 ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
1932 ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
1933
1934 if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
1935 zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data);
1936 else
1937 zio_gang_tree_free(&zio->io_gang_tree);
1938
1939 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1940
1941 return (ZIO_PIPELINE_CONTINUE);
1942 }
1943
1944 static void
1945 zio_write_gang_member_ready(zio_t *zio)
1946 {
1947 zio_t *pio = zio_unique_parent(zio);
1948 dva_t *cdva = zio->io_bp->blk_dva;
1949 dva_t *pdva = pio->io_bp->blk_dva;
1950 uint64_t asize;
1951 int d;
1952 ASSERTV(zio_t *gio = zio->io_gang_leader);
1953
1954 if (BP_IS_HOLE(zio->io_bp))
1955 return;
1956
1957 ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
1958
1959 ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
1960 ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
1961 ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
1962 ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
1963 ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
1964
1965 mutex_enter(&pio->io_lock);
1966 for (d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
1967 ASSERT(DVA_GET_GANG(&pdva[d]));
1968 asize = DVA_GET_ASIZE(&pdva[d]);
1969 asize += DVA_GET_ASIZE(&cdva[d]);
1970 DVA_SET_ASIZE(&pdva[d], asize);
1971 }
1972 mutex_exit(&pio->io_lock);
1973 }
1974
1975 static int
1976 zio_write_gang_block(zio_t *pio)
1977 {
1978 spa_t *spa = pio->io_spa;
1979 blkptr_t *bp = pio->io_bp;
1980 zio_t *gio = pio->io_gang_leader;
1981 zio_t *zio;
1982 zio_gang_node_t *gn, **gnpp;
1983 zio_gbh_phys_t *gbh;
1984 uint64_t txg = pio->io_txg;
1985 uint64_t resid = pio->io_size;
1986 uint64_t lsize;
1987 int copies = gio->io_prop.zp_copies;
1988 int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
1989 zio_prop_t zp;
1990 int g, error;
1991
1992 error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
1993 bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
1994 METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
1995 if (error) {
1996 pio->io_error = error;
1997 return (ZIO_PIPELINE_CONTINUE);
1998 }
1999
2000 if (pio == gio) {
2001 gnpp = &gio->io_gang_tree;
2002 } else {
2003 gnpp = pio->io_private;
2004 ASSERT(pio->io_ready == zio_write_gang_member_ready);
2005 }
2006
2007 gn = zio_gang_node_alloc(gnpp);
2008 gbh = gn->gn_gbh;
2009 bzero(gbh, SPA_GANGBLOCKSIZE);
2010
2011 /*
2012 * Create the gang header.
2013 */
2014 zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
2015 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
2016
2017 /*
2018 * Create and nowait the gang children.
2019 */
2020 for (g = 0; resid != 0; resid -= lsize, g++) {
2021 lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
2022 SPA_MINBLOCKSIZE);
2023 ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);
2024
2025 zp.zp_checksum = gio->io_prop.zp_checksum;
2026 zp.zp_compress = ZIO_COMPRESS_OFF;
2027 zp.zp_type = DMU_OT_NONE;
2028 zp.zp_level = 0;
2029 zp.zp_copies = gio->io_prop.zp_copies;
2030 zp.zp_dedup = B_FALSE;
2031 zp.zp_dedup_verify = B_FALSE;
2032 zp.zp_nopwrite = B_FALSE;
2033
2034 zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
2035 (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
2036 zio_write_gang_member_ready, NULL, NULL, &gn->gn_child[g],
2037 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
2038 &pio->io_bookmark));
2039 }
2040
2041 /*
2042 * Set pio's pipeline to just wait for zio to finish.
2043 */
2044 pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2045
2046 /*
2047 * We didn't allocate this bp, so make sure it doesn't get unmarked.
2048 */
2049 pio->io_flags &= ~ZIO_FLAG_FASTWRITE;
2050
2051 zio_nowait(zio);
2052
2053 return (ZIO_PIPELINE_CONTINUE);
2054 }
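/*
 * To make the sizing loop in zio_write_gang_block() concrete: assuming
 * SPA_GBH_NBLKPTRS == 3 and SPA_MINBLOCKSIZE == 512 (their historical
 * values), a 96K gang write is split evenly across the children:
 *
 *	g = 0: lsize = P2ROUNDUP(98304 / 3, 512) = 32K, resid -> 64K
 *	g = 1: lsize = P2ROUNDUP(65536 / 2, 512) = 32K, resid -> 32K
 *	g = 2: lsize = P2ROUNDUP(32768 / 1, 512) = 32K, resid -> 0
 *
 * Each child is written uncompressed (ZIO_COMPRESS_OFF) with the gang
 * leader's checksum and copies, and its allocated size is folded back
 * into the parent's DVAs by zio_write_gang_member_ready().
 */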
2055
2056 /*
2057 * The zio_nop_write stage in the pipeline determines if allocating
2058 * a new bp is necessary. By leveraging a cryptographically secure checksum,
2059 * such as SHA256, we can compare the checksums of the new data and the old
2060 * to determine if allocating a new block is required. The nopwrite
2061 * feature can handle writes in either syncing or open context (i.e. zil
2062 * writes) and as a result is mutually exclusive with dedup.
2063 */
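/*
 * The decision in zio_nop_write() below reduces to: if the block's
 * properties are unchanged and a dedup-capable (collision-resistant)
 * checksum of the new data equals the old block's checksum, keep the old
 * bp and skip allocation and device I/O entirely.  Roughly, using the
 * bp/bp_orig names from the function:
 *
 *	if (zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup &&
 *	    ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum))
 *		*bp = *bp_orig;          reuse the existing block
 *
 * (A sketch only -- the function also verifies that compression, dedup
 * and copies match before taking the shortcut.)
 */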
2064 static int
2065 zio_nop_write(zio_t *zio)
2066 {
2067 blkptr_t *bp = zio->io_bp;
2068 blkptr_t *bp_orig = &zio->io_bp_orig;
2069 zio_prop_t *zp = &zio->io_prop;
2070
2071 ASSERT(BP_GET_LEVEL(bp) == 0);
2072 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
2073 ASSERT(zp->zp_nopwrite);
2074 ASSERT(!zp->zp_dedup);
2075 ASSERT(zio->io_bp_override == NULL);
2076 ASSERT(IO_IS_ALLOCATING(zio));
2077
2078 /*
2079 * Check whether the original bp and the new bp have matching
2080 * characteristics (i.e. same checksum and compression algorithms, etc.).
2081 * If they don't, just continue with the pipeline, which will
2082 * allocate a new bp.
2083 */
2084 if (BP_IS_HOLE(bp_orig) ||
2085 !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup ||
2086 BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
2087 BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
2088 BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
2089 zp->zp_copies != BP_GET_NDVAS(bp_orig))
2090 return (ZIO_PIPELINE_CONTINUE);
2091
2092 /*
2093 * If the checksums match then reset the pipeline so that we
2094 * avoid allocating a new bp and issuing any I/O.
2095 */
2096 if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
2097 ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup);
2098 ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
2099 ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
2100 ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
2101 ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop,
2102 sizeof (uint64_t)) == 0);
2103
2104 *bp = *bp_orig;
2105 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2106 zio->io_flags |= ZIO_FLAG_NOPWRITE;
2107 }
2108
2109 return (ZIO_PIPELINE_CONTINUE);
2110 }
2111
2112 /*
2113 * ==========================================================================
2114 * Dedup
2115 * ==========================================================================
2116 */
2117 static void
2118 zio_ddt_child_read_done(zio_t *zio)
2119 {
2120 blkptr_t *bp = zio->io_bp;
2121 ddt_entry_t *dde = zio->io_private;
2122 ddt_phys_t *ddp;
2123 zio_t *pio = zio_unique_parent(zio);
2124
2125 mutex_enter(&pio->io_lock);
2126 ddp = ddt_phys_select(dde, bp);
2127 if (zio->io_error == 0)
2128 ddt_phys_clear(ddp); /* this ddp doesn't need repair */
2129 if (zio->io_error == 0 && dde->dde_repair_data == NULL)
2130 dde->dde_repair_data = zio->io_data;
2131 else
2132 zio_buf_free(zio->io_data, zio->io_size);
2133 mutex_exit(&pio->io_lock);
2134 }
2135
2136 static int
2137 zio_ddt_read_start(zio_t *zio)
2138 {
2139 blkptr_t *bp = zio->io_bp;
2140 int p;
2141
2142 ASSERT(BP_GET_DEDUP(bp));
2143 ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
2144 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2145
2146 if (zio->io_child_error[ZIO_CHILD_DDT]) {
2147 ddt_t *ddt = ddt_select(zio->io_spa, bp);
2148 ddt_entry_t *dde = ddt_repair_start(ddt, bp);
2149 ddt_phys_t *ddp = dde->dde_phys;
2150 ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
2151 blkptr_t blk;
2152
2153 ASSERT(zio->io_vsd == NULL);
2154 zio->io_vsd = dde;
2155
2156 if (ddp_self == NULL)
2157 return (ZIO_PIPELINE_CONTINUE);
2158
2159 for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
2160 if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
2161 continue;
2162 ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
2163 &blk);
2164 zio_nowait(zio_read(zio, zio->io_spa, &blk,
2165 zio_buf_alloc(zio->io_size), zio->io_size,
2166 zio_ddt_child_read_done, dde, zio->io_priority,
2167 ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE,
2168 &zio->io_bookmark));
2169 }
2170 return (ZIO_PIPELINE_CONTINUE);
2171 }
2172
2173 zio_nowait(zio_read(zio, zio->io_spa, bp,
2174 zio->io_data, zio->io_size, NULL, NULL, zio->io_priority,
2175 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
2176
2177 return (ZIO_PIPELINE_CONTINUE);
2178 }
2179
2180 static int
2181 zio_ddt_read_done(zio_t *zio)
2182 {
2183 blkptr_t *bp = zio->io_bp;
2184
2185 if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
2186 return (ZIO_PIPELINE_STOP);
2187
2188 ASSERT(BP_GET_DEDUP(bp));
2189 ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
2190 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2191
2192 if (zio->io_child_error[ZIO_CHILD_DDT]) {
2193 ddt_t *ddt = ddt_select(zio->io_spa, bp);
2194 ddt_entry_t *dde = zio->io_vsd;
2195 if (ddt == NULL) {
2196 ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
2197 return (ZIO_PIPELINE_CONTINUE);
2198 }
2199 if (dde == NULL) {
2200 zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
2201 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
2202 return (ZIO_PIPELINE_STOP);
2203 }
2204 if (dde->dde_repair_data != NULL) {
2205 bcopy(dde->dde_repair_data, zio->io_data, zio->io_size);
2206 zio->io_child_error[ZIO_CHILD_DDT] = 0;
2207 }
2208 ddt_repair_done(ddt, dde);
2209 zio->io_vsd = NULL;
2210 }
2211
2212 ASSERT(zio->io_vsd == NULL);
2213
2214 return (ZIO_PIPELINE_CONTINUE);
2215 }
2216
2217 static boolean_t
2218 zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
2219 {
2220 spa_t *spa = zio->io_spa;
2221 int p;
2222
2223 /*
2224 * Note: we compare the original data, not the transformed data,
2225 * because when zio->io_bp is an override bp, we will not have
2226 * pushed the I/O transforms. That's an important optimization
2227 * because otherwise we'd compress/encrypt all dmu_sync() data twice.
2228 */
2229 for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
2230 zio_t *lio = dde->dde_lead_zio[p];
2231
2232 if (lio != NULL) {
2233 return (lio->io_orig_size != zio->io_orig_size ||
2234 bcmp(zio->io_orig_data, lio->io_orig_data,
2235 zio->io_orig_size) != 0);
2236 }
2237 }
2238
2239 for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
2240 ddt_phys_t *ddp = &dde->dde_phys[p];
2241
2242 if (ddp->ddp_phys_birth != 0) {
2243 arc_buf_t *abuf = NULL;
2244 arc_flags_t aflags = ARC_FLAG_WAIT;
2245 blkptr_t blk = *zio->io_bp;
2246 int error;
2247
2248 ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
2249
2250 ddt_exit(ddt);
2251
2252 error = arc_read(NULL, spa, &blk,
2253 arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
2254 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
2255 &aflags, &zio->io_bookmark);
2256
2257 if (error == 0) {
2258 if (arc_buf_size(abuf) != zio->io_orig_size ||
2259 bcmp(abuf->b_data, zio->io_orig_data,
2260 zio->io_orig_size) != 0)
2261 error = SET_ERROR(EEXIST);
2262 VERIFY(arc_buf_remove_ref(abuf, &abuf));
2263 }
2264
2265 ddt_enter(ddt);
2266 return (error != 0);
2267 }
2268 }
2269
2270 return (B_FALSE);
2271 }
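/*
 * zio_ddt_collision() is only reached from zio_ddt_write() when
 * zp_dedup_verify is set (e.g. a dataset using dedup=verify).  It returns
 * B_TRUE when an existing block or in-flight write with the same checksum
 * does not match the new data byte-for-byte; the caller then falls back
 * to a stronger checksum or to an ordinary (non-dedup) write.
 */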
2272
2273 static void
2274 zio_ddt_child_write_ready(zio_t *zio)
2275 {
2276 int p = zio->io_prop.zp_copies;
2277 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
2278 ddt_entry_t *dde = zio->io_private;
2279 ddt_phys_t *ddp = &dde->dde_phys[p];
2280 zio_t *pio;
2281
2282 if (zio->io_error)
2283 return;
2284
2285 ddt_enter(ddt);
2286
2287 ASSERT(dde->dde_lead_zio[p] == zio);
2288
2289 ddt_phys_fill(ddp, zio->io_bp);
2290
2291 while ((pio = zio_walk_parents(zio)) != NULL)
2292 ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
2293
2294 ddt_exit(ddt);
2295 }
2296
2297 static void
2298 zio_ddt_child_write_done(zio_t *zio)
2299 {
2300 int p = zio->io_prop.zp_copies;
2301 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
2302 ddt_entry_t *dde = zio->io_private;
2303 ddt_phys_t *ddp = &dde->dde_phys[p];
2304
2305 ddt_enter(ddt);
2306
2307 ASSERT(ddp->ddp_refcnt == 0);
2308 ASSERT(dde->dde_lead_zio[p] == zio);
2309 dde->dde_lead_zio[p] = NULL;
2310
2311 if (zio->io_error == 0) {
2312 while (zio_walk_parents(zio) != NULL)
2313 ddt_phys_addref(ddp);
2314 } else {
2315 ddt_phys_clear(ddp);
2316 }
2317
2318 ddt_exit(ddt);
2319 }
2320
2321 static void
2322 zio_ddt_ditto_write_done(zio_t *zio)
2323 {
2324 int p = DDT_PHYS_DITTO;
2325 blkptr_t *bp = zio->io_bp;
2326 ddt_t *ddt = ddt_select(zio->io_spa, bp);
2327 ddt_entry_t *dde = zio->io_private;
2328 ddt_phys_t *ddp = &dde->dde_phys[p];
2329 ddt_key_t *ddk = &dde->dde_key;
2330 ASSERTV(zio_prop_t *zp = &zio->io_prop);
2331
2332 ddt_enter(ddt);
2333
2334 ASSERT(ddp->ddp_refcnt == 0);
2335 ASSERT(dde->dde_lead_zio[p] == zio);
2336 dde->dde_lead_zio[p] = NULL;
2337
2338 if (zio->io_error == 0) {
2339 ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum));
2340 ASSERT(zp->zp_copies < SPA_DVAS_PER_BP);
2341 ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp));
2342 if (ddp->ddp_phys_birth != 0)
2343 ddt_phys_free(ddt, ddk, ddp, zio->io_txg);
2344 ddt_phys_fill(ddp, bp);
2345 }
2346
2347 ddt_exit(ddt);
2348 }
2349
2350 static int
2351 zio_ddt_write(zio_t *zio)
2352 {
2353 spa_t *spa = zio->io_spa;
2354 blkptr_t *bp = zio->io_bp;
2355 uint64_t txg = zio->io_txg;
2356 zio_prop_t *zp = &zio->io_prop;
2357 int p = zp->zp_copies;
2358 int ditto_copies;
2359 zio_t *cio = NULL;
2360 zio_t *dio = NULL;
2361 ddt_t *ddt = ddt_select(spa, bp);
2362 ddt_entry_t *dde;
2363 ddt_phys_t *ddp;
2364
2365 ASSERT(BP_GET_DEDUP(bp));
2366 ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
2367 ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
2368
2369 ddt_enter(ddt);
2370 dde = ddt_lookup(ddt, bp, B_TRUE);
2371 ddp = &dde->dde_phys[p];
2372
2373 if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
2374 /*
2375 * If we're using a weak checksum, upgrade to a strong checksum
2376 * and try again. If we're already using a strong checksum,
2377 * we can't resolve it, so just convert to an ordinary write.
2378 * (And automatically e-mail a paper to Nature?)
2379 */
2380 if (!zio_checksum_table[zp->zp_checksum].ci_dedup) {
2381 zp->zp_checksum = spa_dedup_checksum(spa);
2382 zio_pop_transforms(zio);
2383 zio->io_stage = ZIO_STAGE_OPEN;
2384 BP_ZERO(bp);
2385 } else {
2386 zp->zp_dedup = B_FALSE;
2387 }
2388 zio->io_pipeline = ZIO_WRITE_PIPELINE;
2389 ddt_exit(ddt);
2390 return (ZIO_PIPELINE_CONTINUE);
2391 }
2392
2393 ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp);
2394 ASSERT(ditto_copies < SPA_DVAS_PER_BP);
2395
2396 if (ditto_copies > ddt_ditto_copies_present(dde) &&
2397 dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) {
2398 zio_prop_t czp = *zp;
2399
2400 czp.zp_copies = ditto_copies;
2401
2402 /*
2403 * If we arrived here with an override bp, we won't have run
2404 * the transform stack, so we won't have the data we need to
2405 * generate a child i/o. So, toss the override bp and restart.
2406 * This is safe, because using the override bp is just an
2407 * optimization; and it's rare, so the cost doesn't matter.
2408 */
2409 if (zio->io_bp_override) {
2410 zio_pop_transforms(zio);
2411 zio->io_stage = ZIO_STAGE_OPEN;
2412 zio->io_pipeline = ZIO_WRITE_PIPELINE;
2413 zio->io_bp_override = NULL;
2414 BP_ZERO(bp);
2415 ddt_exit(ddt);
2416 return (ZIO_PIPELINE_CONTINUE);
2417 }
2418
2419 dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
2420 zio->io_orig_size, &czp, NULL, NULL,
2421 zio_ddt_ditto_write_done, dde, zio->io_priority,
2422 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
2423
2424 zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL);
2425 dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
2426 }
2427
2428 if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
2429 if (ddp->ddp_phys_birth != 0)
2430 ddt_bp_fill(ddp, bp, txg);
2431 if (dde->dde_lead_zio[p] != NULL)
2432 zio_add_child(zio, dde->dde_lead_zio[p]);
2433 else
2434 ddt_phys_addref(ddp);
2435 } else if (zio->io_bp_override) {
2436 ASSERT(bp->blk_birth == txg);
2437 ASSERT(BP_EQUAL(bp, zio->io_bp_override));
2438 ddt_phys_fill(ddp, bp);
2439 ddt_phys_addref(ddp);
2440 } else {
2441 cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
2442 zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL,
2443 zio_ddt_child_write_done, dde, zio->io_priority,
2444 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
2445
2446 zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL);
2447 dde->dde_lead_zio[p] = cio;
2448 }
2449
2450 ddt_exit(ddt);
2451
2452 if (cio)
2453 zio_nowait(cio);
2454 if (dio)
2455 zio_nowait(dio);
2456
2457 return (ZIO_PIPELINE_CONTINUE);
2458 }
2459
2460 ddt_entry_t *freedde; /* for debugging */
2461
2462 static int
2463 zio_ddt_free(zio_t *zio)
2464 {
2465 spa_t *spa = zio->io_spa;
2466 blkptr_t *bp = zio->io_bp;
2467 ddt_t *ddt = ddt_select(spa, bp);
2468 ddt_entry_t *dde;
2469 ddt_phys_t *ddp;
2470
2471 ASSERT(BP_GET_DEDUP(bp));
2472 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2473
2474 ddt_enter(ddt);
2475 freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
2476 if (dde) {
2477 ddp = ddt_phys_select(dde, bp);
2478 if (ddp)
2479 ddt_phys_decref(ddp);
2480 }
2481 ddt_exit(ddt);
2482
2483 return (ZIO_PIPELINE_CONTINUE);
2484 }
2485
2486 /*
2487 * ==========================================================================
2488 * Allocate and free blocks
2489 * ==========================================================================
2490 */
2491 static int
2492 zio_dva_allocate(zio_t *zio)
2493 {
2494 spa_t *spa = zio->io_spa;
2495 metaslab_class_t *mc = spa_normal_class(spa);
2496 blkptr_t *bp = zio->io_bp;
2497 int error;
2498 int flags = 0;
2499
2500 if (zio->io_gang_leader == NULL) {
2501 ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
2502 zio->io_gang_leader = zio;
2503 }
2504
2505 ASSERT(BP_IS_HOLE(bp));
2506 ASSERT0(BP_GET_NDVAS(bp));
2507 ASSERT3U(zio->io_prop.zp_copies, >, 0);
2508 ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
2509 ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
2510
2511 /*
2512 * The dump device does not support gang blocks so allocation on
2513 * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid
2514 * the "fast" gang feature.
2515 */
2516 flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0;
2517 flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ?
2518 METASLAB_GANG_CHILD : 0;
2519 flags |= (zio->io_flags & ZIO_FLAG_FASTWRITE) ? METASLAB_FASTWRITE : 0;
2520 error = metaslab_alloc(spa, mc, zio->io_size, bp,
2521 zio->io_prop.zp_copies, zio->io_txg, NULL, flags);
2522
2523 if (error) {
2524 spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
2525 "size %llu, error %d", spa_name(spa), zio, zio->io_size,
2526 error);
2527 if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
2528 return (zio_write_gang_block(zio));
2529 zio->io_error = error;
2530 }
2531
2532 return (ZIO_PIPELINE_CONTINUE);
2533 }
2534
2535 static int
2536 zio_dva_free(zio_t *zio)
2537 {
2538 metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);
2539
2540 return (ZIO_PIPELINE_CONTINUE);
2541 }
2542
2543 static int
2544 zio_dva_claim(zio_t *zio)
2545 {
2546 int error;
2547
2548 error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
2549 if (error)
2550 zio->io_error = error;
2551
2552 return (ZIO_PIPELINE_CONTINUE);
2553 }
2554
2555 /*
2556 * Undo an allocation. This is used by zio_done() when an I/O fails
2557 * and we want to give back the block we just allocated.
2558 * This handles both normal blocks and gang blocks.
2559 */
2560 static void
2561 zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
2562 {
2563 int g;
2564
2565 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
2566 ASSERT(zio->io_bp_override == NULL);
2567
2568 if (!BP_IS_HOLE(bp))
2569 metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);
2570
2571 if (gn != NULL) {
2572 for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
2573 zio_dva_unallocate(zio, gn->gn_child[g],
2574 &gn->gn_gbh->zg_blkptr[g]);
2575 }
2576 }
2577 }
2578
2579 /*
2580 * Try to allocate an intent log block. Return 0 on success, errno on failure.
2581 */
2582 int
2583 zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, uint64_t size,
2584 boolean_t use_slog)
2585 {
2586 int error = 1;
2587
2588 ASSERT(txg > spa_syncing_txg(spa));
2589
2590 /*
2591 * ZIL blocks are always contiguous (i.e. not gang blocks), so we pass
2592 * the METASLAB_GANG_AVOID flag to prevent "fast gang" behavior when
2593 * allocating them.
2594 */
2595 if (use_slog) {
2596 error = metaslab_alloc(spa, spa_log_class(spa), size,
2597 new_bp, 1, txg, NULL,
2598 METASLAB_FASTWRITE | METASLAB_GANG_AVOID);
2599 }
2600
2601 if (error) {
2602 error = metaslab_alloc(spa, spa_normal_class(spa), size,
2603 new_bp, 1, txg, NULL,
2604 METASLAB_FASTWRITE);
2605 }
2606
2607 if (error == 0) {
2608 BP_SET_LSIZE(new_bp, size);
2609 BP_SET_PSIZE(new_bp, size);
2610 BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
2611 BP_SET_CHECKSUM(new_bp,
2612 spa_version(spa) >= SPA_VERSION_SLIM_ZIL
2613 ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
2614 BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
2615 BP_SET_LEVEL(new_bp, 0);
2616 BP_SET_DEDUP(new_bp, 0);
2617 BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
2618 }
2619
2620 return (error);
2621 }
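/*
 * A hypothetical caller (the real ones live in the ZIL code) would use
 * this roughly as follows; 'desired_size' is illustrative only:
 *
 *	blkptr_t new_bp;
 *
 *	BP_ZERO(&new_bp);
 *	if (zio_alloc_zil(spa, txg, &new_bp, desired_size, B_TRUE) != 0) {
 *		... neither the slog nor the normal class could satisfy
 *		... the request; shrink or defer the log write
 *	}
 */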
2622
2623 /*
2624 * Free an intent log block.
2625 */
2626 void
2627 zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp)
2628 {
2629 ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG);
2630 ASSERT(!BP_IS_GANG(bp));
2631
2632 zio_free(spa, txg, bp);
2633 }
2634
2635 /*
2636 * ==========================================================================
2637 * Read and write to physical devices
2638 * ==========================================================================
2639 */
2640
2641
2642 /*
2643 * Issue an I/O to the underlying vdev. Typically the issue pipeline
2644 * stops after this stage and will resume upon I/O completion.
2645 * However, there are instances where the vdev layer may need to
2646 * continue the pipeline when an I/O was not issued. Since the I/O
2647 * that was sent to the vdev layer might be different than the one
2648 * currently active in the pipeline (see vdev_queue_io()), we explicitly
2649 * force the underlying vdev layers to call either zio_execute() or
2650 * zio_interrupt() to ensure that the pipeline continues with the correct I/O.
2651 */
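/*
 * For example, a leaf vdev backend that cannot issue a request sets
 * io_error and hands the zio back itself.  A hypothetical backend, for
 * illustration only:
 *
 *	static void
 *	example_vdev_io_start(zio_t *zio)
 *	{
 *		if (!vdev_accessible(zio->io_vd, zio)) {
 *			zio->io_error = SET_ERROR(ENXIO);
 *			zio_interrupt(zio);        resume the pipeline
 *			return;
 *		}
 *		... submit to the device; the completion handler later
 *		... calls zio_interrupt() to continue the pipeline
 *	}
 */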
2652 static int
2653 zio_vdev_io_start(zio_t *zio)
2654 {
2655 vdev_t *vd = zio->io_vd;
2656 uint64_t align;
2657 spa_t *spa = zio->io_spa;
2658
2659 ASSERT(zio->io_error == 0);
2660 ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
2661
2662 if (vd == NULL) {
2663 if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
2664 spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
2665
2666 /*
2667 * The mirror_ops handle multiple DVAs in a single BP.
2668 */
2669 vdev_mirror_ops.vdev_op_io_start(zio);
2670 return (ZIO_PIPELINE_STOP);
2671 }
2672
2673 /*
2674 * We keep track of time-sensitive I/Os so that the scan thread
2675 * can quickly react to certain workloads. In particular, we care
2676 * about non-scrubbing, top-level reads and writes with the following
2677 * characteristics:
2678 * - synchronous writes of user data to non-slog devices
2679 * - any reads of user data
2680 * When these conditions are met, adjust the timestamp of spa_last_io
2681 * which allows the scan thread to adjust its workload accordingly.
2682 */
2683 if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
2684 vd == vd->vdev_top && !vd->vdev_islog &&
2685 zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
2686 zio->io_txg != spa_syncing_txg(spa)) {
2687 uint64_t old = spa->spa_last_io;
2688 uint64_t new = ddi_get_lbolt64();
2689 if (old != new)
2690 (void) atomic_cas_64(&spa->spa_last_io, old, new);
2691 }
2692
2693 align = 1ULL << vd->vdev_top->vdev_ashift;
2694
2695 if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
2696 P2PHASE(zio->io_size, align) != 0) {
2697 /* Transform logical writes to be a full physical block size. */
2698 uint64_t asize = P2ROUNDUP(zio->io_size, align);
2699 char *abuf = zio_buf_alloc(asize);
2700 ASSERT(vd == vd->vdev_top);
2701 if (zio->io_type == ZIO_TYPE_WRITE) {
2702 bcopy(zio->io_data, abuf, zio->io_size);
2703 bzero(abuf + zio->io_size, asize - zio->io_size);
2704 }
2705 zio_push_transform(zio, abuf, asize, asize, zio_subblock);
2706 }
2707
2708 /*
2709 * If this is not a physical io, make sure that it is properly aligned
2710 * before proceeding.
2711 */
2712 if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) {
2713 ASSERT0(P2PHASE(zio->io_offset, align));
2714 ASSERT0(P2PHASE(zio->io_size, align));
2715 } else {
2716 /*
2717 * For physical writes, we allow 512b aligned writes and assume
2718 * the device will perform a read-modify-write as necessary.
2719 */
2720 ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE));
2721 ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE));
2722 }
2723
2724 VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
2725
2726 /*
2727 * If this is a repair I/O, and there's no self-healing involved --
2728 * that is, we're just resilvering what we expect to resilver --
2729 * then don't do the I/O unless zio's txg is actually in vd's DTL.
2730 * This prevents spurious resilvering with nested replication.
2731 * For example, given a mirror of mirrors, (A+B)+(C+D), if only
2732 * A is out of date, we'll read from C+D, then use the data to
2733 * resilver A+B -- but we don't actually want to resilver B, just A.
2734 * The top-level mirror has no way to know this, so instead we just
2735 * discard unnecessary repairs as we work our way down the vdev tree.
2736 * The same logic applies to any form of nested replication:
2737 * ditto + mirror, RAID-Z + replacing, etc. This covers them all.
2738 */
2739 if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
2740 !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
2741 zio->io_txg != 0 && /* not a delegated i/o */
2742 !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
2743 ASSERT(zio->io_type == ZIO_TYPE_WRITE);
2744 zio_vdev_io_bypass(zio);
2745 return (ZIO_PIPELINE_CONTINUE);
2746 }
2747
2748 if (vd->vdev_ops->vdev_op_leaf &&
2749 (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
2750
2751 if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio))
2752 return (ZIO_PIPELINE_CONTINUE);
2753
2754 if ((zio = vdev_queue_io(zio)) == NULL)
2755 return (ZIO_PIPELINE_STOP);
2756
2757 if (!vdev_accessible(vd, zio)) {
2758 zio->io_error = SET_ERROR(ENXIO);
2759 zio_interrupt(zio);
2760 return (ZIO_PIPELINE_STOP);
2761 }
2762 }
2763
2764 vd->vdev_ops->vdev_op_io_start(zio);
2765 return (ZIO_PIPELINE_STOP);
2766 }
2767
2768 static int
2769 zio_vdev_io_done(zio_t *zio)
2770 {
2771 vdev_t *vd = zio->io_vd;
2772 vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
2773 boolean_t unexpected_error = B_FALSE;
2774
2775 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
2776 return (ZIO_PIPELINE_STOP);
2777
2778 ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
2779
2780 if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {
2781
2782 vdev_queue_io_done(zio);
2783
2784 if (zio->io_type == ZIO_TYPE_WRITE)
2785 vdev_cache_write(zio);
2786
2787 if (zio_injection_enabled && zio->io_error == 0)
2788 zio->io_error = zio_handle_device_injection(vd,
2789 zio, EIO);
2790
2791 if (zio_injection_enabled && zio->io_error == 0)
2792 zio->io_error = zio_handle_label_injection(zio, EIO);
2793
2794 if (zio->io_error) {
2795 if (!vdev_accessible(vd, zio)) {
2796 zio->io_error = SET_ERROR(ENXIO);
2797 } else {
2798 unexpected_error = B_TRUE;
2799 }
2800 }
2801 }
2802
2803 ops->vdev_op_io_done(zio);
2804
2805 if (unexpected_error)
2806 VERIFY(vdev_probe(vd, zio) == NULL);
2807
2808 return (ZIO_PIPELINE_CONTINUE);
2809 }
2810
2811 /*
2812 * For non-raidz ZIOs, we can just copy aside the bad data read from the
2813 * disk, and use that to finish the checksum ereport later.
2814 */
2815 static void
2816 zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
2817 const void *good_buf)
2818 {
2819 /* no processing needed */
2820 zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
2821 }
2822
2823 /*ARGSUSED*/
2824 void
2825 zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
2826 {
2827 void *buf = zio_buf_alloc(zio->io_size);
2828
2829 bcopy(zio->io_data, buf, zio->io_size);
2830
2831 zcr->zcr_cbinfo = zio->io_size;
2832 zcr->zcr_cbdata = buf;
2833 zcr->zcr_finish = zio_vsd_default_cksum_finish;
2834 zcr->zcr_free = zio_buf_free;
2835 }
2836
2837 static int
2838 zio_vdev_io_assess(zio_t *zio)
2839 {
2840 vdev_t *vd = zio->io_vd;
2841
2842 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
2843 return (ZIO_PIPELINE_STOP);
2844
2845 if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
2846 spa_config_exit(zio->io_spa, SCL_ZIO, zio);
2847
2848 if (zio->io_vsd != NULL) {
2849 zio->io_vsd_ops->vsd_free(zio);
2850 zio->io_vsd = NULL;
2851 }
2852
2853 if (zio_injection_enabled && zio->io_error == 0)
2854 zio->io_error = zio_handle_fault_injection(zio, EIO);
2855
2856 /*
2857 * If the I/O failed, determine whether we should attempt to retry it.
2858 *
2859 * On retry, we cut in line in the issue queue, since we don't want
2860 * compression/checksumming/etc. work to prevent our (cheap) IO reissue.
2861 */
2862 if (zio->io_error && vd == NULL &&
2863 !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
2864 ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */
2865 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */
2866 zio->io_error = 0;
2867 zio->io_flags |= ZIO_FLAG_IO_RETRY |
2868 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
2869 zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
2870 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
2871 zio_requeue_io_start_cut_in_line);
2872 return (ZIO_PIPELINE_STOP);
2873 }
2874
2875 /*
2876 * If we got an error on a leaf device, convert it to ENXIO
2877 * if the device is not accessible at all.
2878 */
2879 if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
2880 !vdev_accessible(vd, zio))
2881 zio->io_error = SET_ERROR(ENXIO);
2882
2883 /*
2884 * If we can't write to an interior vdev (mirror or RAID-Z),
2885 * set vdev_cant_write so that we stop trying to allocate from it.
2886 */
2887 if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
2888 vd != NULL && !vd->vdev_ops->vdev_op_leaf) {
2889 vd->vdev_cant_write = B_TRUE;
2890 }
2891
2892 if (zio->io_error)
2893 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2894
2895 if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
2896 zio->io_physdone != NULL) {
2897 ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
2898 ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
2899 zio->io_physdone(zio->io_logical);
2900 }
2901
2902 return (ZIO_PIPELINE_CONTINUE);
2903 }
2904
2905 void
2906 zio_vdev_io_reissue(zio_t *zio)
2907 {
2908 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
2909 ASSERT(zio->io_error == 0);
2910
2911 zio->io_stage >>= 1;
2912 }
2913
2914 void
2915 zio_vdev_io_redone(zio_t *zio)
2916 {
2917 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
2918
2919 zio->io_stage >>= 1;
2920 }
2921
2922 void
2923 zio_vdev_io_bypass(zio_t *zio)
2924 {
2925 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
2926 ASSERT(zio->io_error == 0);
2927
2928 zio->io_flags |= ZIO_FLAG_IO_BYPASS;
2929 zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
2930 }
2931
2932 /*
2933 * ==========================================================================
2934 * Generate and verify checksums
2935 * ==========================================================================
2936 */
2937 static int
2938 zio_checksum_generate(zio_t *zio)
2939 {
2940 blkptr_t *bp = zio->io_bp;
2941 enum zio_checksum checksum;
2942
2943 if (bp == NULL) {
2944 /*
2945 * This is zio_write_phys().
2946 * We're either generating a label checksum, or none at all.
2947 */
2948 checksum = zio->io_prop.zp_checksum;
2949
2950 if (checksum == ZIO_CHECKSUM_OFF)
2951 return (ZIO_PIPELINE_CONTINUE);
2952
2953 ASSERT(checksum == ZIO_CHECKSUM_LABEL);
2954 } else {
2955 if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
2956 ASSERT(!IO_IS_ALLOCATING(zio));
2957 checksum = ZIO_CHECKSUM_GANG_HEADER;
2958 } else {
2959 checksum = BP_GET_CHECKSUM(bp);
2960 }
2961 }
2962
2963 zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size);
2964
2965 return (ZIO_PIPELINE_CONTINUE);
2966 }
2967
2968 static int
2969 zio_checksum_verify(zio_t *zio)
2970 {
2971 zio_bad_cksum_t info;
2972 blkptr_t *bp = zio->io_bp;
2973 int error;
2974
2975 ASSERT(zio->io_vd != NULL);
2976
2977 if (bp == NULL) {
2978 /*
2979 * This is zio_read_phys().
2980 * We're either verifying a label checksum, or nothing at all.
2981 */
2982 if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
2983 return (ZIO_PIPELINE_CONTINUE);
2984
2985 ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
2986 }
2987
2988 if ((error = zio_checksum_error(zio, &info)) != 0) {
2989 zio->io_error = error;
2990 if (error == ECKSUM &&
2991 !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2992 zfs_ereport_start_checksum(zio->io_spa,
2993 zio->io_vd, zio, zio->io_offset,
2994 zio->io_size, NULL, &info);
2995 }
2996 }
2997
2998 return (ZIO_PIPELINE_CONTINUE);
2999 }
3000
3001 /*
3002 * Called by RAID-Z to ensure we don't compute the checksum twice.
3003 */
3004 void
3005 zio_checksum_verified(zio_t *zio)
3006 {
3007 zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
3008 }
3009
3010 /*
3011 * ==========================================================================
3012 * Error rank. Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
3013 * An error of 0 indicates success. ENXIO indicates whole-device failure,
3014 * which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO
3015 * indicate errors that are specific to one I/O, and most likely permanent.
3016 * Any other error is presumed to be worse because we weren't expecting it.
3017 * ==========================================================================
3018 */
3019 int
3020 zio_worst_error(int e1, int e2)
3021 {
3022 static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
3023 int r1, r2;
3024
3025 for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++)
3026 if (e1 == zio_error_rank[r1])
3027 break;
3028
3029 for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++)
3030 if (e2 == zio_error_rank[r2])
3031 break;
3032
3033 return (r1 > r2 ? e1 : e2);
3034 }
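/*
 * For example:
 *
 *	zio_worst_error(0, EIO)            returns EIO
 *	zio_worst_error(ENXIO, ECKSUM)     returns ECKSUM
 *	zio_worst_error(EIO, EOVERFLOW)    returns EOVERFLOW
 *	                                   (unranked errors are presumed worst)
 */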
3035
3036 /*
3037 * ==========================================================================
3038 * I/O completion
3039 * ==========================================================================
3040 */
3041 static int
3042 zio_ready(zio_t *zio)
3043 {
3044 blkptr_t *bp = zio->io_bp;
3045 zio_t *pio, *pio_next;
3046
3047 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
3048 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
3049 return (ZIO_PIPELINE_STOP);
3050
3051 if (zio->io_ready) {
3052 ASSERT(IO_IS_ALLOCATING(zio));
3053 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
3054 (zio->io_flags & ZIO_FLAG_NOPWRITE));
3055 ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
3056
3057 zio->io_ready(zio);
3058 }
3059
3060 if (bp != NULL && bp != &zio->io_bp_copy)
3061 zio->io_bp_copy = *bp;
3062
3063 if (zio->io_error)
3064 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
3065
3066 mutex_enter(&zio->io_lock);
3067 zio->io_state[ZIO_WAIT_READY] = 1;
3068 pio = zio_walk_parents(zio);
3069 mutex_exit(&zio->io_lock);
3070
3071 /*
3072 * As we notify zio's parents, new parents could be added.
3073 * New parents go to the head of zio's io_parent_list, however,
3074 * so we will (correctly) not notify them. The remainder of zio's
3075 * io_parent_list, from 'pio_next' onward, cannot change because
3076 * all parents must wait for us to be done before they can be done.
3077 */
3078 for (; pio != NULL; pio = pio_next) {
3079 pio_next = zio_walk_parents(zio);
3080 zio_notify_parent(pio, zio, ZIO_WAIT_READY);
3081 }
3082
3083 if (zio->io_flags & ZIO_FLAG_NODATA) {
3084 if (BP_IS_GANG(bp)) {
3085 zio->io_flags &= ~ZIO_FLAG_NODATA;
3086 } else {
3087 ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE);
3088 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
3089 }
3090 }
3091
3092 if (zio_injection_enabled &&
3093 zio->io_spa->spa_syncing_txg == zio->io_txg)
3094 zio_handle_ignored_writes(zio);
3095
3096 return (ZIO_PIPELINE_CONTINUE);
3097 }
3098
3099 static int
3100 zio_done(zio_t *zio)
3101 {
3102 zio_t *pio, *pio_next;
3103 int c, w;
3104
3105 /*
3106 * If our children haven't all completed,
3107 * wait for them and then repeat this pipeline stage.
3108 */
3109 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
3110 zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
3111 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) ||
3112 zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
3113 return (ZIO_PIPELINE_STOP);
3114
3115 for (c = 0; c < ZIO_CHILD_TYPES; c++)
3116 for (w = 0; w < ZIO_WAIT_TYPES; w++)
3117 ASSERT(zio->io_children[c][w] == 0);
3118
3119 if (zio->io_bp != NULL && !BP_IS_EMBEDDED(zio->io_bp)) {
3120 ASSERT(zio->io_bp->blk_pad[0] == 0);
3121 ASSERT(zio->io_bp->blk_pad[1] == 0);
3122 ASSERT(bcmp(zio->io_bp, &zio->io_bp_copy,
3123 sizeof (blkptr_t)) == 0 ||
3124 (zio->io_bp == zio_unique_parent(zio)->io_bp));
3125 if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(zio->io_bp) &&
3126 zio->io_bp_override == NULL &&
3127 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
3128 ASSERT(!BP_SHOULD_BYTESWAP(zio->io_bp));
3129 ASSERT3U(zio->io_prop.zp_copies, <=,
3130 BP_GET_NDVAS(zio->io_bp));
3131 ASSERT(BP_COUNT_GANG(zio->io_bp) == 0 ||
3132 (BP_COUNT_GANG(zio->io_bp) ==
3133 BP_GET_NDVAS(zio->io_bp)));
3134 }
3135 if (zio->io_flags & ZIO_FLAG_NOPWRITE)
3136 VERIFY(BP_EQUAL(zio->io_bp, &zio->io_bp_orig));
3137 }
3138
3139 /*
3140 * If there were child vdev/gang/ddt errors, they apply to us now.
3141 */
3142 zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
3143 zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
3144 zio_inherit_child_errors(zio, ZIO_CHILD_DDT);
3145
3146 /*
3147 * If the I/O on the transformed data was successful, generate any
3148 * checksum reports now while we still have the transformed data.
3149 */
3150 if (zio->io_error == 0) {
3151 while (zio->io_cksum_report != NULL) {
3152 zio_cksum_report_t *zcr = zio->io_cksum_report;
3153 uint64_t align = zcr->zcr_align;
3154 uint64_t asize = P2ROUNDUP(zio->io_size, align);
3155 char *abuf = zio->io_data;
3156
3157 if (asize != zio->io_size) {
3158 abuf = zio_buf_alloc(asize);
3159 bcopy(zio->io_data, abuf, zio->io_size);
3160 bzero(abuf+zio->io_size, asize-zio->io_size);
3161 }
3162
3163 zio->io_cksum_report = zcr->zcr_next;
3164 zcr->zcr_next = NULL;
3165 zcr->zcr_finish(zcr, abuf);
3166 zfs_ereport_free_checksum(zcr);
3167
3168 if (asize != zio->io_size)
3169 zio_buf_free(abuf, asize);
3170 }
3171 }
3172
3173 zio_pop_transforms(zio); /* note: may set zio->io_error */
3174
3175 vdev_stat_update(zio, zio->io_size);
3176
3177 /*
3178 * If this I/O is attached to a particular vdev and is slow, exceeding
3179 * zio_delay_max (30 seconds by default) to complete, post an error describing the delay.
3180 * We ignore these errors if the device is currently unavailable.
3181 */
3182 if (zio->io_delay >= MSEC_TO_TICK(zio_delay_max)) {
3183 if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd))
3184 zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa,
3185 zio->io_vd, zio, 0, 0);
3186 }
3187
3188 if (zio->io_error) {
3189 /*
3190 * If this I/O is attached to a particular vdev,
3191 * generate an error message describing the I/O failure
3192 * at the block level. We ignore these errors if the
3193 * device is currently unavailable.
3194 */
3195 if (zio->io_error != ECKSUM && zio->io_vd != NULL &&
3196 !vdev_is_dead(zio->io_vd))
3197 zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa,
3198 zio->io_vd, zio, 0, 0);
3199
3200 if ((zio->io_error == EIO || !(zio->io_flags &
3201 (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
3202 zio == zio->io_logical) {
3203 /*
3204 * For logical I/O requests, tell the SPA to log the
3205 * error and generate a logical data ereport.
3206 */
3207 spa_log_error(zio->io_spa, zio);
3208 zfs_ereport_post(FM_EREPORT_ZFS_DATA, zio->io_spa,
3209 NULL, zio, 0, 0);
3210 }
3211 }
3212
3213 if (zio->io_error && zio == zio->io_logical) {
3214 /*
3215 * Determine whether zio should be reexecuted. This will
3216 * propagate all the way to the root via zio_notify_parent().
3217 */
3218 ASSERT(zio->io_vd == NULL && zio->io_bp != NULL);
3219 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
3220
3221 if (IO_IS_ALLOCATING(zio) &&
3222 !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
3223 if (zio->io_error != ENOSPC)
3224 zio->io_reexecute |= ZIO_REEXECUTE_NOW;
3225 else
3226 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
3227 }
3228
3229 if ((zio->io_type == ZIO_TYPE_READ ||
3230 zio->io_type == ZIO_TYPE_FREE) &&
3231 !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
3232 zio->io_error == ENXIO &&
3233 spa_load_state(zio->io_spa) == SPA_LOAD_NONE &&
3234 spa_get_failmode(zio->io_spa) != ZIO_FAILURE_MODE_CONTINUE)
3235 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
3236
3237 if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
3238 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
3239
3240 /*
3241 * Here is a possibly good place to attempt to do
3242 * either combinatorial reconstruction or error correction
3243 * based on checksums. It also might be a good place
3244 * to send out preliminary ereports before we suspend
3245 * processing.
3246 */
3247 }
3248
3249 /*
3250 * If there were logical child errors, they apply to us now.
3251 * We defer this until now to avoid conflating logical child
3252 * errors with errors that happened to the zio itself when
3253 * updating vdev stats and reporting FMA events above.
3254 */
3255 zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
3256
3257 if ((zio->io_error || zio->io_reexecute) &&
3258 IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
3259 !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
3260 zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp);
3261
3262 zio_gang_tree_free(&zio->io_gang_tree);
3263
3264 /*
3265 * Godfather I/Os should never suspend.
3266 */
3267 if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
3268 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
3269 zio->io_reexecute = 0;
3270
3271 if (zio->io_reexecute) {
3272 /*
3273 * This is a logical I/O that wants to reexecute.
3274 *
3275 * Reexecute is top-down. When an i/o fails, if it's not
3276 * the root, it simply notifies its parent and sticks around.
3277 * The parent, seeing that it still has children in zio_done(),
3278 * does the same. This percolates all the way up to the root.
3279 * The root i/o will reexecute or suspend the entire tree.
3280 *
3281 * This approach ensures that zio_reexecute() honors
3282 * all the original i/o dependency relationships, e.g.
3283 * parents not executing until children are ready.
3284 */
3285 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
3286
3287 zio->io_gang_leader = NULL;
3288
3289 mutex_enter(&zio->io_lock);
3290 zio->io_state[ZIO_WAIT_DONE] = 1;
3291 mutex_exit(&zio->io_lock);
3292
3293 /*
3294 * "The Godfather" I/O monitors its children but is
3295 * not a true parent to them. It will track them through
3296 * the pipeline but severs its ties whenever they get into
3297 * trouble (e.g. suspended). This allows "The Godfather"
3298 * I/O to return status without blocking.
3299 */
3300 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
3301 zio_link_t *zl = zio->io_walk_link;
3302 pio_next = zio_walk_parents(zio);
3303
3304 if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
3305 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
3306 zio_remove_child(pio, zio, zl);
3307 zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
3308 }
3309 }
3310
3311 if ((pio = zio_unique_parent(zio)) != NULL) {
3312 /*
3313 * We're not a root i/o, so there's nothing to do
3314 * but notify our parent. Don't propagate errors
3315 * upward since we haven't permanently failed yet.
3316 */
3317 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
3318 zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
3319 zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
3320 } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
3321 /*
3322 * We'd fail again if we reexecuted now, so suspend
3323 * until conditions improve (e.g. device comes online).
3324 */
3325 zio_suspend(zio->io_spa, zio);
3326 } else {
3327 /*
3328 * Reexecution is potentially a huge amount of work.
3329 * Hand it off to the otherwise-unused claim taskq.
3330 */
3331 ASSERT(taskq_empty_ent(&zio->io_tqent));
3332 spa_taskq_dispatch_ent(zio->io_spa,
3333 ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE,
3334 (task_func_t *)zio_reexecute, zio, 0,
3335 &zio->io_tqent);
3336 }
3337 return (ZIO_PIPELINE_STOP);
3338 }
3339
3340 ASSERT(zio->io_child_count == 0);
3341 ASSERT(zio->io_reexecute == 0);
3342 ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
3343
3344 /*
3345 * Report any checksum errors, since the I/O is complete.
3346 */
3347 while (zio->io_cksum_report != NULL) {
3348 zio_cksum_report_t *zcr = zio->io_cksum_report;
3349 zio->io_cksum_report = zcr->zcr_next;
3350 zcr->zcr_next = NULL;
3351 zcr->zcr_finish(zcr, NULL);
3352 zfs_ereport_free_checksum(zcr);
3353 }
3354
3355 if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp &&
3356 !BP_IS_HOLE(zio->io_bp) && !BP_IS_EMBEDDED(zio->io_bp) &&
3357 !(zio->io_flags & ZIO_FLAG_NOPWRITE)) {
3358 metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp);
3359 }
3360
3361 /*
3362 * It is the responsibility of the done callback to ensure that this
3363 * particular zio is no longer discoverable for adoption, and as
3364 * such, cannot acquire any new parents.
3365 */
3366 if (zio->io_done)
3367 zio->io_done(zio);
3368
3369 mutex_enter(&zio->io_lock);
3370 zio->io_state[ZIO_WAIT_DONE] = 1;
3371 mutex_exit(&zio->io_lock);
3372
3373 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
3374 zio_link_t *zl = zio->io_walk_link;
3375 pio_next = zio_walk_parents(zio);
3376 zio_remove_child(pio, zio, zl);
3377 zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
3378 }
3379
3380 if (zio->io_waiter != NULL) {
3381 mutex_enter(&zio->io_lock);
3382 zio->io_executor = NULL;
3383 cv_broadcast(&zio->io_cv);
3384 mutex_exit(&zio->io_lock);
3385 } else {
3386 zio_destroy(zio);
3387 }
3388
3389 return (ZIO_PIPELINE_STOP);
3390 }
3391
3392 /*
3393 * ==========================================================================
3394 * I/O pipeline definition
3395 * ==========================================================================
3396 */
3397 static zio_pipe_stage_t *zio_pipeline[] = {
3398 NULL,
3399 zio_read_bp_init,
3400 zio_free_bp_init,
3401 zio_issue_async,
3402 zio_write_bp_init,
3403 zio_checksum_generate,
3404 zio_nop_write,
3405 zio_ddt_read_start,
3406 zio_ddt_read_done,
3407 zio_ddt_write,
3408 zio_ddt_free,
3409 zio_gang_assemble,
3410 zio_gang_issue,
3411 zio_dva_allocate,
3412 zio_dva_free,
3413 zio_dva_claim,
3414 zio_ready,
3415 zio_vdev_io_start,
3416 zio_vdev_io_done,
3417 zio_vdev_io_assess,
3418 zio_checksum_verify,
3419 zio_done
3420 };
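/*
 * io_stage and io_pipeline are bitmasks over these stages.  zio_execute()
 * (defined earlier in this file) repeatedly advances io_stage to the next
 * bit set in io_pipeline and dispatches through this table, stopping when
 * a stage returns ZIO_PIPELINE_STOP (e.g. to wait for children or for
 * device completion) and resuming later via zio_interrupt()/taskq
 * dispatch.  Conceptually:
 *
 *	while (zio_pipeline[stage_index(zio->io_stage)](zio) ==
 *	    ZIO_PIPELINE_CONTINUE)
 *		advance io_stage to the next bit set in io_pipeline;
 *
 * where stage_index() stands in for the bit-position arithmetic done by
 * the real dispatcher.
 */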
3421
3422 /* dnp is the dnode for zb1->zb_object */
3423 boolean_t
3424 zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_phys_t *zb1,
3425 const zbookmark_phys_t *zb2)
3426 {
3427 uint64_t zb1nextL0, zb2thisobj;
3428
3429 ASSERT(zb1->zb_objset == zb2->zb_objset);
3430 ASSERT(zb2->zb_level == 0);
3431
3432 /* The objset_phys_t isn't before anything. */
3433 if (dnp == NULL)
3434 return (B_FALSE);
3435
3436 zb1nextL0 = (zb1->zb_blkid + 1) <<
3437 ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
3438
3439 zb2thisobj = zb2->zb_object ? zb2->zb_object :
3440 zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
3441
3442 if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
3443 uint64_t nextobj = zb1nextL0 *
3444 (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
3445 return (nextobj <= zb2thisobj);
3446 }
3447
3448 if (zb1->zb_object < zb2thisobj)
3449 return (B_TRUE);
3450 if (zb1->zb_object > zb2thisobj)
3451 return (B_FALSE);
3452 if (zb2->zb_object == DMU_META_DNODE_OBJECT)
3453 return (B_FALSE);
3454 return (zb1nextL0 <= zb2->zb_blkid);
3455 }
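/*
 * Worked example (assuming 128K indirect blocks, i.e. dn_indblkshift == 17,
 * and SPA_BLKPTRSHIFT == 7): each indirect level fans out
 * 2^(17 - 7) = 1024 block pointers, so for an L1 bookmark with
 * zb_blkid == 3, zb1nextL0 = (3 + 1) << 10 = 4096.  That bookmark is
 * therefore "before" any L0 bookmark in the same (non-meta-dnode) object
 * whose zb_blkid >= 4096.
 */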
3456
3457 #if defined(_KERNEL) && defined(HAVE_SPL)
3458 EXPORT_SYMBOL(zio_type_name);
3459 EXPORT_SYMBOL(zio_buf_alloc);
3460 EXPORT_SYMBOL(zio_data_buf_alloc);
3461 EXPORT_SYMBOL(zio_buf_free);
3462 EXPORT_SYMBOL(zio_data_buf_free);
3463
3464 module_param(zio_delay_max, int, 0644);
3465 MODULE_PARM_DESC(zio_delay_max, "Max zio millisec delay before posting event");
3466
3467 module_param(zio_requeue_io_start_cut_in_line, int, 0644);
3468 MODULE_PARM_DESC(zio_requeue_io_start_cut_in_line, "Prioritize requeued I/O");
3469
3470 module_param(zfs_sync_pass_deferred_free, int, 0644);
3471 MODULE_PARM_DESC(zfs_sync_pass_deferred_free,
3472 "Defer frees starting in this pass");
3473
3474 module_param(zfs_sync_pass_dont_compress, int, 0644);
3475 MODULE_PARM_DESC(zfs_sync_pass_dont_compress,
3476 "Don't compress starting in this pass");
3477
3478 module_param(zfs_sync_pass_rewrite, int, 0644);
3479 MODULE_PARM_DESC(zfs_sync_pass_rewrite,
3480 "Rewrite new bps starting in this pass");
3481 #endif