]> git.proxmox.com Git - zfsonlinux.git/blame - zfs-patches/0032-OpenZFS-8857-zio_remove_child-panic-due-to-already-d.patch
bump version to 0.7.7-pve1~bpo9
[zfsonlinux.git] / zfs-patches / 0032-OpenZFS-8857-zio_remove_child-panic-due-to-already-d.patch
CommitLineData
75b07eca
FG
1From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2From: George Wilson <george.wilson@delphix.com>
3Date: Thu, 8 Feb 2018 15:04:14 -0500
4Subject: [PATCH] OpenZFS 8857 - zio_remove_child() panic due to already
5 destroyed parent zio
6MIME-Version: 1.0
7Content-Type: text/plain; charset=UTF-8
8Content-Transfer-Encoding: 8bit
9
10PROBLEM
11=======
12It's possible for a parent zio to complete even though it has children
13which have not completed. This can result in the following panic:
14 > $C
15 ffffff01809128c0 vpanic()
16 ffffff01809128e0 mutex_panic+0x58(fffffffffb94c904, ffffff597dde7f80)
17 ffffff0180912950 mutex_vector_enter+0x347(ffffff597dde7f80)
18 ffffff01809129b0 zio_remove_child+0x50(ffffff597dde7c58, ffffff32bd901ac0,
19 ffffff3373370908)
20 ffffff0180912a40 zio_done+0x390(ffffff32bd901ac0)
21 ffffff0180912a70 zio_execute+0x78(ffffff32bd901ac0)
22 ffffff0180912b30 taskq_thread+0x2d0(ffffff33bae44140)
23 ffffff0180912b40 thread_start+8()
24 > ::status
25 debugging crash dump vmcore.2 (64-bit) from batfs0390
26 operating system: 5.11 joyent_20170911T171900Z (i86pc)
27 image uuid: (not set)
28 panic message: mutex_enter: bad mutex, lp=ffffff597dde7f80
29 owner=ffffff3c59b39480 thread=ffffff0180912c40
30 dump content: kernel pages only
31The problem is that dbuf_prefetch along with l2arc can create a zio tree
32which confuses the parent zio and allows it to complete while children
33still exist. Here's the scenario:
34 zio tree:
35 pio
36 |--- lio
37The parent zio, pio, has entered the zio_done stage and begins to check its
38children to see if there are still some that have not completed. In zio_done(),
39the children are checked in the following order:
40 zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)
41 zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)
42 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)
43 zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)
44If pio finds any child which has not completed, then it stops executing and
45goes to sleep. Each call to zio_wait_for_children() will grab the io_lock
46while checking the particular child.
47In this scenario, the pio has completed the first call to
48zio_wait_for_children() to check for any ZIO_CHILD_VDEV children. Since
49the only zio in the zio tree right now is the logical zio, lio, then it
50completes that call and prepares to check the next child type.
51In the meantime, the lio completes and in its callback creates a child vdev
52zio, cio. The zio tree looks like this:
53 zio tree:
54 pio
55 |--- lio
56 |--- cio
57The lio then grabs the parent's io_lock and removes itself.
58 zio tree:
59 pio
60 |--- cio
61The pio continues to run but has already completed its check for ZIO_CHILD_VDEV
62and will erroneously complete. When the child zio, cio, completes it will panic
63the system trying to reference the parent zio which has been destroyed.
64SOLUTION
65========
66The fix is to rework the zio_wait_for_children() logic to accept a bitfield
67for all the children types that it's interested in checking. The
68io_lock is held the entire time we check all the children types. Since
69the function now accepts a bitfield, a simple ZIO_CHILD_BIT() macro is provided
70to allow for the conversion between a ZIO_CHILD type and the bitfield used by
71the zio_wait_for_children() logic.
72
73Authored by: George Wilson <george.wilson@delphix.com>
74Reviewed by: Matthew Ahrens <mahrens@delphix.com>
75Reviewed by: Andriy Gapon <avg@FreeBSD.org>
76Reviewed by: Youzhong Yang <youzhong@gmail.com>
77Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
78Approved by: Dan McDonald <danmcd@omniti.com>
79Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>
80
81OpenZFS-issue: https://www.illumos.org/issues/8857
82OpenZFS-commit: https://github.com/openzfs/openzfs/commit/862ff6d99c
83Issue #5918
84Closes #7168
85
86(cherry picked from commit 07ce5d739024cf9c638716c09f9934b4e629678c)
87Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
88---
89 include/sys/zio.h | 11 +++++++++++
90 module/zfs/zio.c | 49 +++++++++++++++++++++++++++++--------------------
91 2 files changed, 40 insertions(+), 20 deletions(-)
92
93diff --git a/include/sys/zio.h b/include/sys/zio.h
94index 4eaabc38c..0d741f8e2 100644
95--- a/include/sys/zio.h
96+++ b/include/sys/zio.h
97@@ -215,6 +215,9 @@ enum zio_flag {
98 (((zio)->io_flags & ZIO_FLAG_VDEV_INHERIT) | \
99 ZIO_FLAG_CANFAIL)
100
101+#define ZIO_CHILD_BIT(x) (1 << (x))
102+#define ZIO_CHILD_BIT_IS_SET(val, x) ((val) & (1 << (x)))
103+
104 enum zio_child {
105 ZIO_CHILD_VDEV = 0,
106 ZIO_CHILD_GANG,
107@@ -223,6 +226,14 @@ enum zio_child {
108 ZIO_CHILD_TYPES
109 };
110
111+#define ZIO_CHILD_VDEV_BIT ZIO_CHILD_BIT(ZIO_CHILD_VDEV)
112+#define ZIO_CHILD_GANG_BIT ZIO_CHILD_BIT(ZIO_CHILD_GANG)
113+#define ZIO_CHILD_DDT_BIT ZIO_CHILD_BIT(ZIO_CHILD_DDT)
114+#define ZIO_CHILD_LOGICAL_BIT ZIO_CHILD_BIT(ZIO_CHILD_LOGICAL)
115+#define ZIO_CHILD_ALL_BITS \
116+ (ZIO_CHILD_VDEV_BIT | ZIO_CHILD_GANG_BIT | \
117+ ZIO_CHILD_DDT_BIT | ZIO_CHILD_LOGICAL_BIT)
118+
119 enum zio_wait_type {
120 ZIO_WAIT_READY = 0,
121 ZIO_WAIT_DONE,
122diff --git a/module/zfs/zio.c b/module/zfs/zio.c
123index 1d69d8d8d..cd0a473e0 100644
124--- a/module/zfs/zio.c
125+++ b/module/zfs/zio.c
126@@ -491,21 +491,26 @@ zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
127 }
128
129 static boolean_t
130-zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
131+zio_wait_for_children(zio_t *zio, uint8_t childbits, enum zio_wait_type wait)
132 {
133- uint64_t *countp = &zio->io_children[child][wait];
134 boolean_t waiting = B_FALSE;
135
136 mutex_enter(&zio->io_lock);
137 ASSERT(zio->io_stall == NULL);
138- if (*countp != 0) {
139- zio->io_stage >>= 1;
140- ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN);
141- zio->io_stall = countp;
142- waiting = B_TRUE;
143+ for (int c = 0; c < ZIO_CHILD_TYPES; c++) {
144+ if (!(ZIO_CHILD_BIT_IS_SET(childbits, c)))
145+ continue;
146+
147+ uint64_t *countp = &zio->io_children[c][wait];
148+ if (*countp != 0) {
149+ zio->io_stage >>= 1;
150+ ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN);
151+ zio->io_stall = countp;
152+ waiting = B_TRUE;
153+ break;
154+ }
155 }
156 mutex_exit(&zio->io_lock);
157-
158 return (waiting);
159 }
160
161@@ -1296,9 +1301,10 @@ zio_write_compress(zio_t *zio)
162 * If our children haven't all reached the ready stage,
163 * wait for them and then repeat this pipeline stage.
164 */
165- if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
166- zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
167+ if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT |
168+ ZIO_CHILD_GANG_BIT, ZIO_WAIT_READY)) {
169 return (ZIO_PIPELINE_STOP);
170+ }
171
172 if (!IO_IS_ALLOCATING(zio))
173 return (ZIO_PIPELINE_CONTINUE);
174@@ -2229,8 +2235,9 @@ zio_gang_issue(zio_t *zio)
175 {
176 blkptr_t *bp = zio->io_bp;
177
178- if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
179+ if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT, ZIO_WAIT_DONE)) {
180 return (ZIO_PIPELINE_STOP);
181+ }
182
183 ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
184 ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
185@@ -2561,8 +2568,9 @@ zio_ddt_read_done(zio_t *zio)
186 {
187 blkptr_t *bp = zio->io_bp;
188
189- if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
190+ if (zio_wait_for_children(zio, ZIO_CHILD_DDT_BIT, ZIO_WAIT_DONE)) {
191 return (ZIO_PIPELINE_STOP);
192+ }
193
194 ASSERT(BP_GET_DEDUP(bp));
195 ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
196@@ -3292,8 +3300,9 @@ zio_vdev_io_done(zio_t *zio)
197 vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
198 boolean_t unexpected_error = B_FALSE;
199
200- if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
201+ if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) {
202 return (ZIO_PIPELINE_STOP);
203+ }
204
205 ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
206
207@@ -3362,8 +3371,9 @@ zio_vdev_io_assess(zio_t *zio)
208 {
209 vdev_t *vd = zio->io_vd;
210
211- if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
212+ if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) {
213 return (ZIO_PIPELINE_STOP);
214+ }
215
216 if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
217 spa_config_exit(zio->io_spa, SCL_ZIO, zio);
218@@ -3578,9 +3588,10 @@ zio_ready(zio_t *zio)
219 zio_t *pio, *pio_next;
220 zio_link_t *zl = NULL;
221
222- if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
223- zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
224+ if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT,
225+ ZIO_WAIT_READY)) {
226 return (ZIO_PIPELINE_STOP);
227+ }
228
229 if (zio->io_ready) {
230 ASSERT(IO_IS_ALLOCATING(zio));
231@@ -3721,11 +3732,9 @@ zio_done(zio_t *zio)
232 * If our children haven't all completed,
233 * wait for them and then repeat this pipeline stage.
234 */
235- if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
236- zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
237- zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) ||
238- zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
239+ if (zio_wait_for_children(zio, ZIO_CHILD_ALL_BITS, ZIO_WAIT_DONE)) {
240 return (ZIO_PIPELINE_STOP);
241+ }
242
243 /*
244 * If the allocation throttle is enabled, then update the accounting.
245--
2462.14.2
247