]>
Commit | Line | Data |
---|---|---|
75b07eca FG |
1 | From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 |
2 | From: George Wilson <george.wilson@delphix.com> | |
3 | Date: Thu, 8 Feb 2018 15:04:14 -0500 | |
4 | Subject: [PATCH] OpenZFS 8857 - zio_remove_child() panic due to already | |
5 | destroyed parent zio | |
6 | MIME-Version: 1.0 | |
7 | Content-Type: text/plain; charset=UTF-8 | |
8 | Content-Transfer-Encoding: 8bit | |
9 | ||
10 | PROBLEM | |
11 | ======= | |
12 | It's possible for a parent zio to complete even though it has children | |
13 | which have not completed. This can result in the following panic: | |
14 | > $C | |
15 | ffffff01809128c0 vpanic() | |
16 | ffffff01809128e0 mutex_panic+0x58(fffffffffb94c904, ffffff597dde7f80) | |
17 | ffffff0180912950 mutex_vector_enter+0x347(ffffff597dde7f80) | |
18 | ffffff01809129b0 zio_remove_child+0x50(ffffff597dde7c58, ffffff32bd901ac0, | |
19 | ffffff3373370908) | |
20 | ffffff0180912a40 zio_done+0x390(ffffff32bd901ac0) | |
21 | ffffff0180912a70 zio_execute+0x78(ffffff32bd901ac0) | |
22 | ffffff0180912b30 taskq_thread+0x2d0(ffffff33bae44140) | |
23 | ffffff0180912b40 thread_start+8() | |
24 | > ::status | |
25 | debugging crash dump vmcore.2 (64-bit) from batfs0390 | |
26 | operating system: 5.11 joyent_20170911T171900Z (i86pc) | |
27 | image uuid: (not set) | |
28 | panic message: mutex_enter: bad mutex, lp=ffffff597dde7f80 | |
29 | owner=ffffff3c59b39480 thread=ffffff0180912c40 | |
30 | dump content: kernel pages only | |
31 | The problem is that dbuf_prefetch along with l2arc can create a zio tree | |
32 | which confuses the parent zio and allows it to complete with while children | |
33 | still exist. Here's the scenario: | |
34 | zio tree: | |
35 | pio | |
36 | |--- lio | |
37 | The parent zio, pio, has entered the zio_done stage and begins to check its | |
38 | children to see there are still some that have not completed. In zio_done(), | |
39 | the children are checked in the following order: | |
40 | zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) | |
41 | zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) | |
42 | zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) | |
43 | zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE) | |
44 | If pio, finds any child which has not completed then it stops executing and | |
45 | goes to sleep. Each call to zio_wait_for_children() will grab the io_lock | |
46 | while checking the particular child. | |
47 | In this scenario, the pio has completed the first call to | |
48 | zio_wait_for_children() to check for any ZIO_CHILD_VDEV children. Since | |
49 | the only zio in the zio tree right now is the logical zio, lio, then it | |
50 | completes that call and prepares to check the next child type. | |
51 | In the meantime, the lio completes and in its callback creates a child vdev | |
52 | zio, cio. The zio tree looks like this: | |
53 | zio tree: | |
54 | pio | |
55 | |--- lio | |
56 | |--- cio | |
57 | The lio then grabs the parent's io_lock and removes itself. | |
58 | zio tree: | |
59 | pio | |
60 | |--- cio | |
61 | The pio continues to run but has already completed its check for ZIO_CHILD_VDEV | |
62 | and will erroneously complete. When the child zio, cio, completes it will panic | |
63 | the system trying to reference the parent zio which has been destroyed. | |
64 | SOLUTION | |
65 | ======== | |
66 | The fix is to rework the zio_wait_for_children() logic to accept a bitfield | |
67 | for all the children types that it's interested in checking. The | |
68 | io_lock will is held the entire time we check all the children types. Since | |
69 | the function now accepts a bitfield, a simple ZIO_CHILD_BIT() macro is provided | |
70 | to allow for the conversion between a ZIO_CHILD type and the bitfield used by | |
71 | the zio_wiat_for_children logic. | |
72 | ||
73 | Authored by: George Wilson <george.wilson@delphix.com> | |
74 | Reviewed by: Matthew Ahrens <mahrens@delphix.com> | |
75 | Reviewed by: Andriy Gapon <avg@FreeBSD.org> | |
76 | Reviewed by: Youzhong Yang <youzhong@gmail.com> | |
77 | Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov> | |
78 | Approved by: Dan McDonald <danmcd@omniti.com> | |
79 | Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov> | |
80 | ||
81 | OpenZFS-issue: https://www.illumos.org/issues/8857 | |
82 | OpenZFS-commit: https://github.com/openzfs/openzfs/commit/862ff6d99c | |
83 | Issue #5918 | |
84 | Closes #7168 | |
85 | ||
86 | (cherry picked from commit 07ce5d739024cf9c638716c09f9934b4e629678c) | |
87 | Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com> | |
88 | --- | |
89 | include/sys/zio.h | 11 +++++++++++ | |
90 | module/zfs/zio.c | 49 +++++++++++++++++++++++++++++-------------------- | |
91 | 2 files changed, 40 insertions(+), 20 deletions(-) | |
92 | ||
93 | diff --git a/include/sys/zio.h b/include/sys/zio.h | |
94 | index 4eaabc38c..0d741f8e2 100644 | |
95 | --- a/include/sys/zio.h | |
96 | +++ b/include/sys/zio.h | |
97 | @@ -215,6 +215,9 @@ enum zio_flag { | |
98 | (((zio)->io_flags & ZIO_FLAG_VDEV_INHERIT) | \ | |
99 | ZIO_FLAG_CANFAIL) | |
100 | ||
101 | +#define ZIO_CHILD_BIT(x) (1 << (x)) | |
102 | +#define ZIO_CHILD_BIT_IS_SET(val, x) ((val) & (1 << (x))) | |
103 | + | |
104 | enum zio_child { | |
105 | ZIO_CHILD_VDEV = 0, | |
106 | ZIO_CHILD_GANG, | |
107 | @@ -223,6 +226,14 @@ enum zio_child { | |
108 | ZIO_CHILD_TYPES | |
109 | }; | |
110 | ||
111 | +#define ZIO_CHILD_VDEV_BIT ZIO_CHILD_BIT(ZIO_CHILD_VDEV) | |
112 | +#define ZIO_CHILD_GANG_BIT ZIO_CHILD_BIT(ZIO_CHILD_GANG) | |
113 | +#define ZIO_CHILD_DDT_BIT ZIO_CHILD_BIT(ZIO_CHILD_DDT) | |
114 | +#define ZIO_CHILD_LOGICAL_BIT ZIO_CHILD_BIT(ZIO_CHILD_LOGICAL) | |
115 | +#define ZIO_CHILD_ALL_BITS \ | |
116 | + (ZIO_CHILD_VDEV_BIT | ZIO_CHILD_GANG_BIT | \ | |
117 | + ZIO_CHILD_DDT_BIT | ZIO_CHILD_LOGICAL_BIT) | |
118 | + | |
119 | enum zio_wait_type { | |
120 | ZIO_WAIT_READY = 0, | |
121 | ZIO_WAIT_DONE, | |
122 | diff --git a/module/zfs/zio.c b/module/zfs/zio.c | |
123 | index 1d69d8d8d..cd0a473e0 100644 | |
124 | --- a/module/zfs/zio.c | |
125 | +++ b/module/zfs/zio.c | |
126 | @@ -491,21 +491,26 @@ zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) | |
127 | } | |
128 | ||
129 | static boolean_t | |
130 | -zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait) | |
131 | +zio_wait_for_children(zio_t *zio, uint8_t childbits, enum zio_wait_type wait) | |
132 | { | |
133 | - uint64_t *countp = &zio->io_children[child][wait]; | |
134 | boolean_t waiting = B_FALSE; | |
135 | ||
136 | mutex_enter(&zio->io_lock); | |
137 | ASSERT(zio->io_stall == NULL); | |
138 | - if (*countp != 0) { | |
139 | - zio->io_stage >>= 1; | |
140 | - ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN); | |
141 | - zio->io_stall = countp; | |
142 | - waiting = B_TRUE; | |
143 | + for (int c = 0; c < ZIO_CHILD_TYPES; c++) { | |
144 | + if (!(ZIO_CHILD_BIT_IS_SET(childbits, c))) | |
145 | + continue; | |
146 | + | |
147 | + uint64_t *countp = &zio->io_children[c][wait]; | |
148 | + if (*countp != 0) { | |
149 | + zio->io_stage >>= 1; | |
150 | + ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN); | |
151 | + zio->io_stall = countp; | |
152 | + waiting = B_TRUE; | |
153 | + break; | |
154 | + } | |
155 | } | |
156 | mutex_exit(&zio->io_lock); | |
157 | - | |
158 | return (waiting); | |
159 | } | |
160 | ||
161 | @@ -1296,9 +1301,10 @@ zio_write_compress(zio_t *zio) | |
162 | * If our children haven't all reached the ready stage, | |
163 | * wait for them and then repeat this pipeline stage. | |
164 | */ | |
165 | - if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || | |
166 | - zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY)) | |
167 | + if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT | | |
168 | + ZIO_CHILD_GANG_BIT, ZIO_WAIT_READY)) { | |
169 | return (ZIO_PIPELINE_STOP); | |
170 | + } | |
171 | ||
172 | if (!IO_IS_ALLOCATING(zio)) | |
173 | return (ZIO_PIPELINE_CONTINUE); | |
174 | @@ -2229,8 +2235,9 @@ zio_gang_issue(zio_t *zio) | |
175 | { | |
176 | blkptr_t *bp = zio->io_bp; | |
177 | ||
178 | - if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)) | |
179 | + if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT, ZIO_WAIT_DONE)) { | |
180 | return (ZIO_PIPELINE_STOP); | |
181 | + } | |
182 | ||
183 | ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); | |
184 | ASSERT(zio->io_child_type > ZIO_CHILD_GANG); | |
185 | @@ -2561,8 +2568,9 @@ zio_ddt_read_done(zio_t *zio) | |
186 | { | |
187 | blkptr_t *bp = zio->io_bp; | |
188 | ||
189 | - if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)) | |
190 | + if (zio_wait_for_children(zio, ZIO_CHILD_DDT_BIT, ZIO_WAIT_DONE)) { | |
191 | return (ZIO_PIPELINE_STOP); | |
192 | + } | |
193 | ||
194 | ASSERT(BP_GET_DEDUP(bp)); | |
195 | ASSERT(BP_GET_PSIZE(bp) == zio->io_size); | |
196 | @@ -3292,8 +3300,9 @@ zio_vdev_io_done(zio_t *zio) | |
197 | vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops; | |
198 | boolean_t unexpected_error = B_FALSE; | |
199 | ||
200 | - if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) | |
201 | + if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) { | |
202 | return (ZIO_PIPELINE_STOP); | |
203 | + } | |
204 | ||
205 | ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); | |
206 | ||
207 | @@ -3362,8 +3371,9 @@ zio_vdev_io_assess(zio_t *zio) | |
208 | { | |
209 | vdev_t *vd = zio->io_vd; | |
210 | ||
211 | - if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) | |
212 | + if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) { | |
213 | return (ZIO_PIPELINE_STOP); | |
214 | + } | |
215 | ||
216 | if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) | |
217 | spa_config_exit(zio->io_spa, SCL_ZIO, zio); | |
218 | @@ -3578,9 +3588,10 @@ zio_ready(zio_t *zio) | |
219 | zio_t *pio, *pio_next; | |
220 | zio_link_t *zl = NULL; | |
221 | ||
222 | - if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || | |
223 | - zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY)) | |
224 | + if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT, | |
225 | + ZIO_WAIT_READY)) { | |
226 | return (ZIO_PIPELINE_STOP); | |
227 | + } | |
228 | ||
229 | if (zio->io_ready) { | |
230 | ASSERT(IO_IS_ALLOCATING(zio)); | |
231 | @@ -3721,11 +3732,9 @@ zio_done(zio_t *zio) | |
232 | * If our children haven't all completed, | |
233 | * wait for them and then repeat this pipeline stage. | |
234 | */ | |
235 | - if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) || | |
236 | - zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) || | |
237 | - zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) || | |
238 | - zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)) | |
239 | + if (zio_wait_for_children(zio, ZIO_CHILD_ALL_BITS, ZIO_WAIT_DONE)) { | |
240 | return (ZIO_PIPELINE_STOP); | |
241 | + } | |
242 | ||
243 | /* | |
244 | * If the allocation throttle is enabled, then update the accounting. | |
245 | -- | |
246 | 2.14.2 | |
247 |