]> git.proxmox.com Git - zfsonlinux.git/blob - zfs-patches/0032-OpenZFS-8857-zio_remove_child-panic-due-to-already-d.patch
bump version to 0.7.7-pve1~bpo9
[zfsonlinux.git] / zfs-patches / 0032-OpenZFS-8857-zio_remove_child-panic-due-to-already-d.patch
1 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2 From: George Wilson <george.wilson@delphix.com>
3 Date: Thu, 8 Feb 2018 15:04:14 -0500
4 Subject: [PATCH] OpenZFS 8857 - zio_remove_child() panic due to already
5 destroyed parent zio
6 MIME-Version: 1.0
7 Content-Type: text/plain; charset=UTF-8
8 Content-Transfer-Encoding: 8bit
9
10 PROBLEM
11 =======
12 It's possible for a parent zio to complete even though it has children
13 which have not completed. This can result in the following panic:
14 > $C
15 ffffff01809128c0 vpanic()
16 ffffff01809128e0 mutex_panic+0x58(fffffffffb94c904, ffffff597dde7f80)
17 ffffff0180912950 mutex_vector_enter+0x347(ffffff597dde7f80)
18 ffffff01809129b0 zio_remove_child+0x50(ffffff597dde7c58, ffffff32bd901ac0,
19 ffffff3373370908)
20 ffffff0180912a40 zio_done+0x390(ffffff32bd901ac0)
21 ffffff0180912a70 zio_execute+0x78(ffffff32bd901ac0)
22 ffffff0180912b30 taskq_thread+0x2d0(ffffff33bae44140)
23 ffffff0180912b40 thread_start+8()
24 > ::status
25 debugging crash dump vmcore.2 (64-bit) from batfs0390
26 operating system: 5.11 joyent_20170911T171900Z (i86pc)
27 image uuid: (not set)
28 panic message: mutex_enter: bad mutex, lp=ffffff597dde7f80
29 owner=ffffff3c59b39480 thread=ffffff0180912c40
30 dump content: kernel pages only
31 The problem is that dbuf_prefetch along with l2arc can create a zio tree
32 which confuses the parent zio and allows it to complete with while children
33 still exist. Here's the scenario:
34 zio tree:
35 pio
36 |--- lio
37 The parent zio, pio, has entered the zio_done stage and begins to check its
38 children to see there are still some that have not completed. In zio_done(),
39 the children are checked in the following order:
40 zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)
41 zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)
42 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)
43 zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)
44 If pio, finds any child which has not completed then it stops executing and
45 goes to sleep. Each call to zio_wait_for_children() will grab the io_lock
46 while checking the particular child.
47 In this scenario, the pio has completed the first call to
48 zio_wait_for_children() to check for any ZIO_CHILD_VDEV children. Since
49 the only zio in the zio tree right now is the logical zio, lio, then it
50 completes that call and prepares to check the next child type.
51 In the meantime, the lio completes and in its callback creates a child vdev
52 zio, cio. The zio tree looks like this:
53 zio tree:
54 pio
55 |--- lio
56 |--- cio
57 The lio then grabs the parent's io_lock and removes itself.
58 zio tree:
59 pio
60 |--- cio
61 The pio continues to run but has already completed its check for ZIO_CHILD_VDEV
62 and will erroneously complete. When the child zio, cio, completes it will panic
63 the system trying to reference the parent zio which has been destroyed.
64 SOLUTION
65 ========
66 The fix is to rework the zio_wait_for_children() logic to accept a bitfield
67 for all the children types that it's interested in checking. The
68 io_lock will is held the entire time we check all the children types. Since
69 the function now accepts a bitfield, a simple ZIO_CHILD_BIT() macro is provided
70 to allow for the conversion between a ZIO_CHILD type and the bitfield used by
71 the zio_wiat_for_children logic.
72
73 Authored by: George Wilson <george.wilson@delphix.com>
74 Reviewed by: Matthew Ahrens <mahrens@delphix.com>
75 Reviewed by: Andriy Gapon <avg@FreeBSD.org>
76 Reviewed by: Youzhong Yang <youzhong@gmail.com>
77 Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
78 Approved by: Dan McDonald <danmcd@omniti.com>
79 Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>
80
81 OpenZFS-issue: https://www.illumos.org/issues/8857
82 OpenZFS-commit: https://github.com/openzfs/openzfs/commit/862ff6d99c
83 Issue #5918
84 Closes #7168
85
86 (cherry picked from commit 07ce5d739024cf9c638716c09f9934b4e629678c)
87 Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
88 ---
89 include/sys/zio.h | 11 +++++++++++
90 module/zfs/zio.c | 49 +++++++++++++++++++++++++++++--------------------
91 2 files changed, 40 insertions(+), 20 deletions(-)
92
93 diff --git a/include/sys/zio.h b/include/sys/zio.h
94 index 4eaabc38c..0d741f8e2 100644
95 --- a/include/sys/zio.h
96 +++ b/include/sys/zio.h
97 @@ -215,6 +215,9 @@ enum zio_flag {
98 (((zio)->io_flags & ZIO_FLAG_VDEV_INHERIT) | \
99 ZIO_FLAG_CANFAIL)
100
101 +#define ZIO_CHILD_BIT(x) (1 << (x))
102 +#define ZIO_CHILD_BIT_IS_SET(val, x) ((val) & (1 << (x)))
103 +
104 enum zio_child {
105 ZIO_CHILD_VDEV = 0,
106 ZIO_CHILD_GANG,
107 @@ -223,6 +226,14 @@ enum zio_child {
108 ZIO_CHILD_TYPES
109 };
110
111 +#define ZIO_CHILD_VDEV_BIT ZIO_CHILD_BIT(ZIO_CHILD_VDEV)
112 +#define ZIO_CHILD_GANG_BIT ZIO_CHILD_BIT(ZIO_CHILD_GANG)
113 +#define ZIO_CHILD_DDT_BIT ZIO_CHILD_BIT(ZIO_CHILD_DDT)
114 +#define ZIO_CHILD_LOGICAL_BIT ZIO_CHILD_BIT(ZIO_CHILD_LOGICAL)
115 +#define ZIO_CHILD_ALL_BITS \
116 + (ZIO_CHILD_VDEV_BIT | ZIO_CHILD_GANG_BIT | \
117 + ZIO_CHILD_DDT_BIT | ZIO_CHILD_LOGICAL_BIT)
118 +
119 enum zio_wait_type {
120 ZIO_WAIT_READY = 0,
121 ZIO_WAIT_DONE,
122 diff --git a/module/zfs/zio.c b/module/zfs/zio.c
123 index 1d69d8d8d..cd0a473e0 100644
124 --- a/module/zfs/zio.c
125 +++ b/module/zfs/zio.c
126 @@ -491,21 +491,26 @@ zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
127 }
128
129 static boolean_t
130 -zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
131 +zio_wait_for_children(zio_t *zio, uint8_t childbits, enum zio_wait_type wait)
132 {
133 - uint64_t *countp = &zio->io_children[child][wait];
134 boolean_t waiting = B_FALSE;
135
136 mutex_enter(&zio->io_lock);
137 ASSERT(zio->io_stall == NULL);
138 - if (*countp != 0) {
139 - zio->io_stage >>= 1;
140 - ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN);
141 - zio->io_stall = countp;
142 - waiting = B_TRUE;
143 + for (int c = 0; c < ZIO_CHILD_TYPES; c++) {
144 + if (!(ZIO_CHILD_BIT_IS_SET(childbits, c)))
145 + continue;
146 +
147 + uint64_t *countp = &zio->io_children[c][wait];
148 + if (*countp != 0) {
149 + zio->io_stage >>= 1;
150 + ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN);
151 + zio->io_stall = countp;
152 + waiting = B_TRUE;
153 + break;
154 + }
155 }
156 mutex_exit(&zio->io_lock);
157 -
158 return (waiting);
159 }
160
161 @@ -1296,9 +1301,10 @@ zio_write_compress(zio_t *zio)
162 * If our children haven't all reached the ready stage,
163 * wait for them and then repeat this pipeline stage.
164 */
165 - if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
166 - zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
167 + if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT |
168 + ZIO_CHILD_GANG_BIT, ZIO_WAIT_READY)) {
169 return (ZIO_PIPELINE_STOP);
170 + }
171
172 if (!IO_IS_ALLOCATING(zio))
173 return (ZIO_PIPELINE_CONTINUE);
174 @@ -2229,8 +2235,9 @@ zio_gang_issue(zio_t *zio)
175 {
176 blkptr_t *bp = zio->io_bp;
177
178 - if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
179 + if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT, ZIO_WAIT_DONE)) {
180 return (ZIO_PIPELINE_STOP);
181 + }
182
183 ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
184 ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
185 @@ -2561,8 +2568,9 @@ zio_ddt_read_done(zio_t *zio)
186 {
187 blkptr_t *bp = zio->io_bp;
188
189 - if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
190 + if (zio_wait_for_children(zio, ZIO_CHILD_DDT_BIT, ZIO_WAIT_DONE)) {
191 return (ZIO_PIPELINE_STOP);
192 + }
193
194 ASSERT(BP_GET_DEDUP(bp));
195 ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
196 @@ -3292,8 +3300,9 @@ zio_vdev_io_done(zio_t *zio)
197 vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
198 boolean_t unexpected_error = B_FALSE;
199
200 - if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
201 + if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) {
202 return (ZIO_PIPELINE_STOP);
203 + }
204
205 ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
206
207 @@ -3362,8 +3371,9 @@ zio_vdev_io_assess(zio_t *zio)
208 {
209 vdev_t *vd = zio->io_vd;
210
211 - if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
212 + if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) {
213 return (ZIO_PIPELINE_STOP);
214 + }
215
216 if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
217 spa_config_exit(zio->io_spa, SCL_ZIO, zio);
218 @@ -3578,9 +3588,10 @@ zio_ready(zio_t *zio)
219 zio_t *pio, *pio_next;
220 zio_link_t *zl = NULL;
221
222 - if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
223 - zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
224 + if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT,
225 + ZIO_WAIT_READY)) {
226 return (ZIO_PIPELINE_STOP);
227 + }
228
229 if (zio->io_ready) {
230 ASSERT(IO_IS_ALLOCATING(zio));
231 @@ -3721,11 +3732,9 @@ zio_done(zio_t *zio)
232 * If our children haven't all completed,
233 * wait for them and then repeat this pipeline stage.
234 */
235 - if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
236 - zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
237 - zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) ||
238 - zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
239 + if (zio_wait_for_children(zio, ZIO_CHILD_ALL_BITS, ZIO_WAIT_DONE)) {
240 return (ZIO_PIPELINE_STOP);
241 + }
242
243 /*
244 * If the allocation throttle is enabled, then update the accounting.
245 --
246 2.14.2
247