]>
Commit | Line | Data |
---|---|---|
3323a8b7 FG |
1 | From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 |
2 | From: Changwei Ge <ge.changwei@h3c.com> | |
3 | Date: Wed, 31 Jan 2018 16:15:06 -0800 | |
4 | Subject: [PATCH] ocfs2: try to reuse extent block in dealloc without | |
5 | meta_alloc | |
6 | MIME-Version: 1.0 | |
7 | Content-Type: text/plain; charset=UTF-8 | |
8 | Content-Transfer-Encoding: 8bit | |
9 | ||
10 | A crash issue was reported by John Lightsey with a call trace as follows: | |
11 | ||
12 | ocfs2_split_extent+0x1ad3/0x1b40 [ocfs2] | |
13 | ocfs2_change_extent_flag+0x33a/0x470 [ocfs2] | |
14 | ocfs2_mark_extent_written+0x172/0x220 [ocfs2] | |
15 | ocfs2_dio_end_io+0x62d/0x910 [ocfs2] | |
16 | dio_complete+0x19a/0x1a0 | |
17 | do_blockdev_direct_IO+0x19dd/0x1eb0 | |
18 | __blockdev_direct_IO+0x43/0x50 | |
19 | ocfs2_direct_IO+0x8f/0xa0 [ocfs2] | |
20 | generic_file_direct_write+0xb2/0x170 | |
21 | __generic_file_write_iter+0xc3/0x1b0 | |
22 | ocfs2_file_write_iter+0x4bb/0xca0 [ocfs2] | |
23 | __vfs_write+0xae/0xf0 | |
24 | vfs_write+0xb8/0x1b0 | |
25 | SyS_write+0x4f/0xb0 | |
26 | system_call_fastpath+0x16/0x75 | |
27 | ||
28 | The BUG code told that extent tree wants to grow but no metadata was | |
29 | reserved ahead of time. From my investigation into this issue, the root | |
30 | cause is that although enough metadata is not reserved, there should be | |
31 | enough for following use. The rightmost extent is merged into its left one | |
32 | after marking extents written a number of times, because marking | |
33 | extents written produces many physically contiguous extents. At | |
34 | last, an empty extent showed up and the rightmost path is removed from | |
35 | the extent tree. | |
36 | ||
37 | Add a new mechanism to reuse extent blocks cached in dealloc which were | |
38 | just unlinked from extent tree to solve this crash issue. | |
39 | ||
40 | The criterion is that during marking extents *written*, if extent rotation | |
41 | and merging result in unlinking an extent and the extent tree later grows | |
42 | without any metadata reserved ahead of time, try to reuse those extents | |
43 | in dealloc, in which deleted extents are cached. | |
44 | ||
45 | Also, this patch addresses the issue John reported that ::dw_zero_count | |
46 | is not calculated properly. | |
47 | ||
48 | After applying this patch, the issue John reported was gone. Thanks for | |
49 | the reproducer provided by John. And this patch has passed | |
50 | ocfs2-test(29 cases) suite run by New H3C Group. | |
51 | ||
52 | [ge.changwei@h3c.com: fix static checker warning] | |
53 | Link: http://lkml.kernel.org/r/63ADC13FD55D6546B7DECE290D39E373F29196AE@H3CMLB12-EX.srv.huawei-3com.com | |
54 | [akpm@linux-foundation.org: brelse(NULL) is legal] | |
55 | Link: http://lkml.kernel.org/r/1515479070-32653-2-git-send-email-ge.changwei@h3c.com | |
56 | Signed-off-by: Changwei Ge <ge.changwei@h3c.com> | |
57 | Reported-by: John Lightsey <john@nixnuts.net> | |
58 | Tested-by: John Lightsey <john@nixnuts.net> | |
59 | Cc: Joel Becker <jlbec@evilplan.org> | |
60 | Cc: Joseph Qi <jiangqi903@gmail.com> | |
61 | Cc: Junxiao Bi <junxiao.bi@oracle.com> | |
62 | Cc: Dan Carpenter <dan.carpenter@oracle.com> | |
63 | Cc: Mark Fasheh <mfasheh@versity.com> | |
64 | Signed-off-by: Andrew Morton <akpm@linux-foundation.org> | |
65 | Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> | |
66 | (cherry picked from commit 71a36944042b7d9dd71f6a5d1c5ea1c2353b5d42) | |
67 | Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com> | |
68 | --- | |
69 | fs/ocfs2/alloc.h | 1 + | |
70 | fs/ocfs2/alloc.c | 206 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- | |
71 | fs/ocfs2/aops.c | 6 ++ | |
72 | 3 files changed, 203 insertions(+), 10 deletions(-) | |
73 | ||
74 | diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h | |
75 | index 4a5152ec88a3..571692171dd1 100644 | |
76 | --- a/fs/ocfs2/alloc.h | |
77 | +++ b/fs/ocfs2/alloc.h | |
78 | @@ -61,6 +61,7 @@ struct ocfs2_extent_tree { | |
79 | ocfs2_journal_access_func et_root_journal_access; | |
80 | void *et_object; | |
81 | unsigned int et_max_leaf_clusters; | |
82 | + struct ocfs2_cached_dealloc_ctxt *et_dealloc; | |
83 | }; | |
84 | ||
85 | /* | |
86 | diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c | |
87 | index 386aecce881d..9b5e7d8ba710 100644 | |
88 | --- a/fs/ocfs2/alloc.c | |
89 | +++ b/fs/ocfs2/alloc.c | |
90 | @@ -165,6 +165,13 @@ static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et, | |
91 | struct ocfs2_extent_rec *rec); | |
92 | static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et); | |
93 | static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et); | |
94 | + | |
95 | +static int ocfs2_reuse_blk_from_dealloc(handle_t *handle, | |
96 | + struct ocfs2_extent_tree *et, | |
97 | + struct buffer_head **new_eb_bh, | |
98 | + int blk_wanted, int *blk_given); | |
99 | +static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et); | |
100 | + | |
101 | static const struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = { | |
102 | .eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk, | |
103 | .eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk, | |
104 | @@ -448,6 +455,7 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, | |
105 | if (!obj) | |
106 | obj = (void *)bh->b_data; | |
107 | et->et_object = obj; | |
108 | + et->et_dealloc = NULL; | |
109 | ||
110 | et->et_ops->eo_fill_root_el(et); | |
111 | if (!et->et_ops->eo_fill_max_leaf_clusters) | |
112 | @@ -1159,7 +1167,7 @@ static int ocfs2_add_branch(handle_t *handle, | |
113 | struct buffer_head **last_eb_bh, | |
114 | struct ocfs2_alloc_context *meta_ac) | |
115 | { | |
116 | - int status, new_blocks, i; | |
117 | + int status, new_blocks, i, block_given = 0; | |
118 | u64 next_blkno, new_last_eb_blk; | |
119 | struct buffer_head *bh; | |
120 | struct buffer_head **new_eb_bhs = NULL; | |
121 | @@ -1214,11 +1222,31 @@ static int ocfs2_add_branch(handle_t *handle, | |
122 | goto bail; | |
123 | } | |
124 | ||
125 | - status = ocfs2_create_new_meta_bhs(handle, et, new_blocks, | |
126 | - meta_ac, new_eb_bhs); | |
127 | - if (status < 0) { | |
128 | - mlog_errno(status); | |
129 | - goto bail; | |
130 | + /* Firstly, try to reuse dealloc since we have already estimated how | |
131 | + * many extent blocks we may use. | |
132 | + */ | |
133 | + if (!ocfs2_is_dealloc_empty(et)) { | |
134 | + status = ocfs2_reuse_blk_from_dealloc(handle, et, | |
135 | + new_eb_bhs, new_blocks, | |
136 | + &block_given); | |
137 | + if (status < 0) { | |
138 | + mlog_errno(status); | |
139 | + goto bail; | |
140 | + } | |
141 | + } | |
142 | + | |
143 | + BUG_ON(block_given > new_blocks); | |
144 | + | |
145 | + if (block_given < new_blocks) { | |
146 | + BUG_ON(!meta_ac); | |
147 | + status = ocfs2_create_new_meta_bhs(handle, et, | |
148 | + new_blocks - block_given, | |
149 | + meta_ac, | |
150 | + &new_eb_bhs[block_given]); | |
151 | + if (status < 0) { | |
152 | + mlog_errno(status); | |
153 | + goto bail; | |
154 | + } | |
155 | } | |
156 | ||
157 | /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be | |
158 | @@ -1341,15 +1369,25 @@ static int ocfs2_shift_tree_depth(handle_t *handle, | |
159 | struct ocfs2_alloc_context *meta_ac, | |
160 | struct buffer_head **ret_new_eb_bh) | |
161 | { | |
162 | - int status, i; | |
163 | + int status, i, block_given = 0; | |
164 | u32 new_clusters; | |
165 | struct buffer_head *new_eb_bh = NULL; | |
166 | struct ocfs2_extent_block *eb; | |
167 | struct ocfs2_extent_list *root_el; | |
168 | struct ocfs2_extent_list *eb_el; | |
169 | ||
170 | - status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac, | |
171 | - &new_eb_bh); | |
172 | + if (!ocfs2_is_dealloc_empty(et)) { | |
173 | + status = ocfs2_reuse_blk_from_dealloc(handle, et, | |
174 | + &new_eb_bh, 1, | |
175 | + &block_given); | |
176 | + } else if (meta_ac) { | |
177 | + status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac, | |
178 | + &new_eb_bh); | |
179 | + | |
180 | + } else { | |
181 | + BUG(); | |
182 | + } | |
183 | + | |
184 | if (status < 0) { | |
185 | mlog_errno(status); | |
186 | goto bail; | |
187 | @@ -1512,7 +1550,7 @@ static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et, | |
188 | int depth = le16_to_cpu(el->l_tree_depth); | |
189 | struct buffer_head *bh = NULL; | |
190 | ||
191 | - BUG_ON(meta_ac == NULL); | |
192 | + BUG_ON(meta_ac == NULL && ocfs2_is_dealloc_empty(et)); | |
193 | ||
194 | shift = ocfs2_find_branch_target(et, &bh); | |
195 | if (shift < 0) { | |
196 | @@ -6593,6 +6631,154 @@ ocfs2_find_per_slot_free_list(int type, | |
197 | return fl; | |
198 | } | |
199 | ||
200 | +static struct ocfs2_per_slot_free_list * | |
201 | +ocfs2_find_preferred_free_list(int type, | |
202 | + int preferred_slot, | |
203 | + int *real_slot, | |
204 | + struct ocfs2_cached_dealloc_ctxt *ctxt) | |
205 | +{ | |
206 | + struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator; | |
207 | + | |
208 | + while (fl) { | |
209 | + if (fl->f_inode_type == type && fl->f_slot == preferred_slot) { | |
210 | + *real_slot = fl->f_slot; | |
211 | + return fl; | |
212 | + } | |
213 | + | |
214 | + fl = fl->f_next_suballocator; | |
215 | + } | |
216 | + | |
217 | + /* If we can't find any free list matching preferred slot, just use | |
218 | + * the first one. | |
219 | + */ | |
220 | + fl = ctxt->c_first_suballocator; | |
221 | + *real_slot = fl->f_slot; | |
222 | + | |
223 | + return fl; | |
224 | +} | |
225 | + | |
226 | +/* Return Value 1 indicates empty */ | |
227 | +static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et) | |
228 | +{ | |
229 | + struct ocfs2_per_slot_free_list *fl = NULL; | |
230 | + | |
231 | + if (!et->et_dealloc) | |
232 | + return 1; | |
233 | + | |
234 | + fl = et->et_dealloc->c_first_suballocator; | |
235 | + if (!fl) | |
236 | + return 1; | |
237 | + | |
238 | + if (!fl->f_first) | |
239 | + return 1; | |
240 | + | |
241 | + return 0; | |
242 | +} | |
243 | + | |
244 | +/* If extent was deleted from tree due to extent rotation and merging, and | |
245 | + * no metadata is reserved ahead of time, try to reuse some extents | |
246 | + * just deleted. This is only used to reuse extent blocks. | |
247 | + * It is supposed to find enough extent blocks in dealloc if our estimation | |
248 | + * on metadata is accurate. | |
249 | + */ | |
250 | +static int ocfs2_reuse_blk_from_dealloc(handle_t *handle, | |
251 | + struct ocfs2_extent_tree *et, | |
252 | + struct buffer_head **new_eb_bh, | |
253 | + int blk_wanted, int *blk_given) | |
254 | +{ | |
255 | + int i, status = 0, real_slot; | |
256 | + struct ocfs2_cached_dealloc_ctxt *dealloc; | |
257 | + struct ocfs2_per_slot_free_list *fl; | |
258 | + struct ocfs2_cached_block_free *bf; | |
259 | + struct ocfs2_extent_block *eb; | |
260 | + struct ocfs2_super *osb = | |
261 | + OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci)); | |
262 | + | |
263 | + *blk_given = 0; | |
264 | + | |
265 | + /* If extent tree doesn't have a dealloc, this is not faulty. Just | |
266 | + * tell upper caller dealloc can't provide any block and it should | |
267 | + * ask for alloc to claim more space. | |
268 | + */ | |
269 | + dealloc = et->et_dealloc; | |
270 | + if (!dealloc) | |
271 | + goto bail; | |
272 | + | |
273 | + for (i = 0; i < blk_wanted; i++) { | |
274 | + /* Prefer to use local slot */ | |
275 | + fl = ocfs2_find_preferred_free_list(EXTENT_ALLOC_SYSTEM_INODE, | |
276 | + osb->slot_num, &real_slot, | |
277 | + dealloc); | |
278 | + /* If no more block can be reused, we should claim more | |
279 | + * from alloc. Just return here normally. | |
280 | + */ | |
281 | + if (!fl) { | |
282 | + status = 0; | |
283 | + break; | |
284 | + } | |
285 | + | |
286 | + bf = fl->f_first; | |
287 | + fl->f_first = bf->free_next; | |
288 | + | |
289 | + new_eb_bh[i] = sb_getblk(osb->sb, bf->free_blk); | |
290 | + if (new_eb_bh[i] == NULL) { | |
291 | + status = -ENOMEM; | |
292 | + mlog_errno(status); | |
293 | + goto bail; | |
294 | + } | |
295 | + | |
296 | + mlog(0, "Reusing block(%llu) from " | |
297 | + "dealloc(local slot:%d, real slot:%d)\n", | |
298 | + bf->free_blk, osb->slot_num, real_slot); | |
299 | + | |
300 | + ocfs2_set_new_buffer_uptodate(et->et_ci, new_eb_bh[i]); | |
301 | + | |
302 | + status = ocfs2_journal_access_eb(handle, et->et_ci, | |
303 | + new_eb_bh[i], | |
304 | + OCFS2_JOURNAL_ACCESS_CREATE); | |
305 | + if (status < 0) { | |
306 | + mlog_errno(status); | |
307 | + goto bail; | |
308 | + } | |
309 | + | |
310 | + memset(new_eb_bh[i]->b_data, 0, osb->sb->s_blocksize); | |
311 | + eb = (struct ocfs2_extent_block *) new_eb_bh[i]->b_data; | |
312 | + | |
313 | + /* We can't guarantee that buffer head is still cached, so | |
314 | + * populate the extent block again. | |
315 | + */ | |
316 | + strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE); | |
317 | + eb->h_blkno = cpu_to_le64(bf->free_blk); | |
318 | + eb->h_fs_generation = cpu_to_le32(osb->fs_generation); | |
319 | + eb->h_suballoc_slot = cpu_to_le16(real_slot); | |
320 | + eb->h_suballoc_loc = cpu_to_le64(bf->free_bg); | |
321 | + eb->h_suballoc_bit = cpu_to_le16(bf->free_bit); | |
322 | + eb->h_list.l_count = | |
323 | + cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); | |
324 | + | |
325 | + /* We'll also be dirtied by the caller, so | |
326 | + * this isn't absolutely necessary. | |
327 | + */ | |
328 | + ocfs2_journal_dirty(handle, new_eb_bh[i]); | |
329 | + | |
330 | + if (!fl->f_first) { | |
331 | + dealloc->c_first_suballocator = fl->f_next_suballocator; | |
332 | + kfree(fl); | |
333 | + } | |
334 | + kfree(bf); | |
335 | + } | |
336 | + | |
337 | + *blk_given = i; | |
338 | + | |
339 | +bail: | |
340 | + if (unlikely(status < 0)) { | |
341 | + for (i = 0; i < blk_wanted; i++) | |
342 | + brelse(new_eb_bh[i]); | |
343 | + } | |
344 | + | |
345 | + return status; | |
346 | +} | |
347 | + | |
348 | int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, | |
349 | int type, int slot, u64 suballoc, | |
350 | u64 blkno, unsigned int bit) | |
351 | diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c | |
352 | index 77ec9b495027..2ff02dda97d8 100644 | |
353 | --- a/fs/ocfs2/aops.c | |
354 | +++ b/fs/ocfs2/aops.c | |
355 | @@ -2322,6 +2322,12 @@ static int ocfs2_dio_end_io_write(struct inode *inode, | |
356 | ||
357 | ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); | |
358 | ||
359 | + /* Attach dealloc with extent tree in case that we may reuse extents | |
360 | + * which are already unlinked from current extent tree due to extent | |
361 | + * rotation and merging. | |
362 | + */ | |
363 | + et.et_dealloc = &dealloc; | |
364 | + | |
365 | ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2, | |
366 | &data_ac, &meta_ac); | |
367 | if (ret) { | |
368 | -- | |
369 | 2.14.2 | |
370 |