]>
Commit | Line | Data |
---|---|---|
3323a8b7 FG |
1 | From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 |
2 | From: Changwei Ge <ge.changwei@h3c.com> | |
3 | Date: Wed, 31 Jan 2018 16:15:06 -0800 | |
4 | Subject: [PATCH] ocfs2: try to reuse extent block in dealloc without | |
5 | meta_alloc | |
6 | MIME-Version: 1.0 | |
7 | Content-Type: text/plain; charset=UTF-8 | |
8 | Content-Transfer-Encoding: 8bit | |
9 | ||
10 | A crash issue was reported by John Lightsey with a call trace as follows: | |
11 | ||
12 | ocfs2_split_extent+0x1ad3/0x1b40 [ocfs2] | |
13 | ocfs2_change_extent_flag+0x33a/0x470 [ocfs2] | |
14 | ocfs2_mark_extent_written+0x172/0x220 [ocfs2] | |
15 | ocfs2_dio_end_io+0x62d/0x910 [ocfs2] | |
16 | dio_complete+0x19a/0x1a0 | |
17 | do_blockdev_direct_IO+0x19dd/0x1eb0 | |
18 | __blockdev_direct_IO+0x43/0x50 | |
19 | ocfs2_direct_IO+0x8f/0xa0 [ocfs2] | |
20 | generic_file_direct_write+0xb2/0x170 | |
21 | __generic_file_write_iter+0xc3/0x1b0 | |
22 | ocfs2_file_write_iter+0x4bb/0xca0 [ocfs2] | |
23 | __vfs_write+0xae/0xf0 | |
24 | vfs_write+0xb8/0x1b0 | |
25 | SyS_write+0x4f/0xb0 | |
26 | system_call_fastpath+0x16/0x75 | |
27 | ||
28 | The BUG code told that extent tree wants to grow but no metadata was | |
29 | reserved ahead of time. From my investigation into this issue, the root | |
30 | cause is that although enough metadata is not reserved, there should be | |
31 | enough for following use. Rightmost extent is merged into its left one | |
32 | due to a certain times of marking extent written. Because during | |
33 | marking extent written, we got many physically continuous extents. At | |
34 | last, an empty extent showed up and the rightmost path is removed from | |
35 | extent tree. | |
36 | ||
37 | Add a new mechanism to reuse extent block cached in dealloc which were | |
38 | just unlinked from extent tree to solve this crash issue. | |
39 | ||
40 | Criteria is that during marking extents *written*, if extent rotation | |
41 | and merging results in unlinking extent with growing extent tree later | |
42 | without any metadata reserved ahead of time, try to reuse those extents | |
43 | in dealloc in which deleted extents are cached. | |
44 | ||
45 | Also, this patch addresses the issue John reported that ::dw_zero_count | |
46 | is not calculated properly. | |
47 | ||
48 | After applying this patch, the issue John reported was gone. Thanks for | |
49 | the reproducer provided by John. And this patch has passed | |
50 | ocfs2-test(29 cases) suite running by New H3C Group. | |
51 | ||
52 | [ge.changwei@h3c.com: fix static checker warning] | |
53 | Link: http://lkml.kernel.org/r/63ADC13FD55D6546B7DECE290D39E373F29196AE@H3CMLB12-EX.srv.huawei-3com.com | |
54 | [akpm@linux-foundation.org: brelse(NULL) is legal] | |
55 | Link: http://lkml.kernel.org/r/1515479070-32653-2-git-send-email-ge.changwei@h3c.com | |
56 | Signed-off-by: Changwei Ge <ge.changwei@h3c.com> | |
57 | Reported-by: John Lightsey <john@nixnuts.net> | |
58 | Tested-by: John Lightsey <john@nixnuts.net> | |
59 | Cc: Joel Becker <jlbec@evilplan.org> | |
60 | Cc: Joseph Qi <jiangqi903@gmail.com> | |
61 | Cc: Junxiao Bi <junxiao.bi@oracle.com> | |
62 | Cc: Dan Carpenter <dan.carpenter@oracle.com> | |
63 | Cc: Mark Fasheh <mfasheh@versity.com> | |
64 | Signed-off-by: Andrew Morton <akpm@linux-foundation.org> | |
65 | Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> | |
66 | (cherry picked from commit 71a36944042b7d9dd71f6a5d1c5ea1c2353b5d42) | |
67 | Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com> | |
68 | --- | |
3323a8b7 | 69 | fs/ocfs2/alloc.c | 206 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- |
c3592848 | 70 | fs/ocfs2/alloc.h | 1 + |
3323a8b7 FG |
71 | fs/ocfs2/aops.c | 6 ++ |
72 | 3 files changed, 203 insertions(+), 10 deletions(-) | |
73 | ||
3323a8b7 | 74 | diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c |
15baf5b4 | 75 | index ab5105f9767e..2f2c76193f54 100644 |
3323a8b7 FG |
76 | --- a/fs/ocfs2/alloc.c |
77 | +++ b/fs/ocfs2/alloc.c | |
78 | @@ -165,6 +165,13 @@ static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et, | |
79 | struct ocfs2_extent_rec *rec); | |
80 | static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et); | |
81 | static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et); | |
82 | + | |
83 | +static int ocfs2_reuse_blk_from_dealloc(handle_t *handle, | |
84 | + struct ocfs2_extent_tree *et, | |
85 | + struct buffer_head **new_eb_bh, | |
86 | + int blk_wanted, int *blk_given); | |
87 | +static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et); | |
88 | + | |
89 | static const struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = { | |
90 | .eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk, | |
91 | .eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk, | |
92 | @@ -448,6 +455,7 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, | |
93 | if (!obj) | |
94 | obj = (void *)bh->b_data; | |
95 | et->et_object = obj; | |
96 | + et->et_dealloc = NULL; | |
97 | ||
98 | et->et_ops->eo_fill_root_el(et); | |
99 | if (!et->et_ops->eo_fill_max_leaf_clusters) | |
15baf5b4 | 100 | @@ -1158,7 +1166,7 @@ static int ocfs2_add_branch(handle_t *handle, |
3323a8b7 FG |
101 | struct buffer_head **last_eb_bh, |
102 | struct ocfs2_alloc_context *meta_ac) | |
103 | { | |
104 | - int status, new_blocks, i; | |
105 | + int status, new_blocks, i, block_given = 0; | |
106 | u64 next_blkno, new_last_eb_blk; | |
107 | struct buffer_head *bh; | |
108 | struct buffer_head **new_eb_bhs = NULL; | |
15baf5b4 | 109 | @@ -1213,11 +1221,31 @@ static int ocfs2_add_branch(handle_t *handle, |
3323a8b7 FG |
110 | goto bail; |
111 | } | |
112 | ||
113 | - status = ocfs2_create_new_meta_bhs(handle, et, new_blocks, | |
114 | - meta_ac, new_eb_bhs); | |
115 | - if (status < 0) { | |
116 | - mlog_errno(status); | |
117 | - goto bail; | |
118 | + /* Firstyly, try to reuse dealloc since we have already estimated how | |
119 | + * many extent blocks we may use. | |
120 | + */ | |
121 | + if (!ocfs2_is_dealloc_empty(et)) { | |
122 | + status = ocfs2_reuse_blk_from_dealloc(handle, et, | |
123 | + new_eb_bhs, new_blocks, | |
124 | + &block_given); | |
125 | + if (status < 0) { | |
126 | + mlog_errno(status); | |
127 | + goto bail; | |
128 | + } | |
129 | + } | |
130 | + | |
131 | + BUG_ON(block_given > new_blocks); | |
132 | + | |
133 | + if (block_given < new_blocks) { | |
134 | + BUG_ON(!meta_ac); | |
135 | + status = ocfs2_create_new_meta_bhs(handle, et, | |
136 | + new_blocks - block_given, | |
137 | + meta_ac, | |
138 | + &new_eb_bhs[block_given]); | |
139 | + if (status < 0) { | |
140 | + mlog_errno(status); | |
141 | + goto bail; | |
142 | + } | |
143 | } | |
144 | ||
145 | /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be | |
15baf5b4 | 146 | @@ -1340,15 +1368,25 @@ static int ocfs2_shift_tree_depth(handle_t *handle, |
3323a8b7 FG |
147 | struct ocfs2_alloc_context *meta_ac, |
148 | struct buffer_head **ret_new_eb_bh) | |
149 | { | |
150 | - int status, i; | |
151 | + int status, i, block_given = 0; | |
152 | u32 new_clusters; | |
153 | struct buffer_head *new_eb_bh = NULL; | |
154 | struct ocfs2_extent_block *eb; | |
155 | struct ocfs2_extent_list *root_el; | |
156 | struct ocfs2_extent_list *eb_el; | |
157 | ||
158 | - status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac, | |
159 | - &new_eb_bh); | |
160 | + if (!ocfs2_is_dealloc_empty(et)) { | |
161 | + status = ocfs2_reuse_blk_from_dealloc(handle, et, | |
162 | + &new_eb_bh, 1, | |
163 | + &block_given); | |
164 | + } else if (meta_ac) { | |
165 | + status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac, | |
166 | + &new_eb_bh); | |
167 | + | |
168 | + } else { | |
169 | + BUG(); | |
170 | + } | |
171 | + | |
172 | if (status < 0) { | |
173 | mlog_errno(status); | |
174 | goto bail; | |
15baf5b4 | 175 | @@ -1511,7 +1549,7 @@ static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et, |
3323a8b7 FG |
176 | int depth = le16_to_cpu(el->l_tree_depth); |
177 | struct buffer_head *bh = NULL; | |
178 | ||
179 | - BUG_ON(meta_ac == NULL); | |
180 | + BUG_ON(meta_ac == NULL && ocfs2_is_dealloc_empty(et)); | |
181 | ||
182 | shift = ocfs2_find_branch_target(et, &bh); | |
183 | if (shift < 0) { | |
15baf5b4 | 184 | @@ -6585,6 +6623,154 @@ ocfs2_find_per_slot_free_list(int type, |
3323a8b7 FG |
185 | return fl; |
186 | } | |
187 | ||
188 | +static struct ocfs2_per_slot_free_list * | |
189 | +ocfs2_find_preferred_free_list(int type, | |
190 | + int preferred_slot, | |
191 | + int *real_slot, | |
192 | + struct ocfs2_cached_dealloc_ctxt *ctxt) | |
193 | +{ | |
194 | + struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator; | |
195 | + | |
196 | + while (fl) { | |
197 | + if (fl->f_inode_type == type && fl->f_slot == preferred_slot) { | |
198 | + *real_slot = fl->f_slot; | |
199 | + return fl; | |
200 | + } | |
201 | + | |
202 | + fl = fl->f_next_suballocator; | |
203 | + } | |
204 | + | |
205 | + /* If we can't find any free list matching preferred slot, just use | |
206 | + * the first one. | |
207 | + */ | |
208 | + fl = ctxt->c_first_suballocator; | |
209 | + *real_slot = fl->f_slot; | |
210 | + | |
211 | + return fl; | |
212 | +} | |
213 | + | |
214 | +/* Return Value 1 indicates empty */ | |
215 | +static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et) | |
216 | +{ | |
217 | + struct ocfs2_per_slot_free_list *fl = NULL; | |
218 | + | |
219 | + if (!et->et_dealloc) | |
220 | + return 1; | |
221 | + | |
222 | + fl = et->et_dealloc->c_first_suballocator; | |
223 | + if (!fl) | |
224 | + return 1; | |
225 | + | |
226 | + if (!fl->f_first) | |
227 | + return 1; | |
228 | + | |
229 | + return 0; | |
230 | +} | |
231 | + | |
232 | +/* If extent was deleted from tree due to extent rotation and merging, and | |
233 | + * no metadata is reserved ahead of time. Try to reuse some extents | |
234 | + * just deleted. This is only used to reuse extent blocks. | |
235 | + * It is supposed to find enough extent blocks in dealloc if our estimation | |
236 | + * on metadata is accurate. | |
237 | + */ | |
238 | +static int ocfs2_reuse_blk_from_dealloc(handle_t *handle, | |
239 | + struct ocfs2_extent_tree *et, | |
240 | + struct buffer_head **new_eb_bh, | |
241 | + int blk_wanted, int *blk_given) | |
242 | +{ | |
243 | + int i, status = 0, real_slot; | |
244 | + struct ocfs2_cached_dealloc_ctxt *dealloc; | |
245 | + struct ocfs2_per_slot_free_list *fl; | |
246 | + struct ocfs2_cached_block_free *bf; | |
247 | + struct ocfs2_extent_block *eb; | |
248 | + struct ocfs2_super *osb = | |
249 | + OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci)); | |
250 | + | |
251 | + *blk_given = 0; | |
252 | + | |
253 | + /* If extent tree doesn't have a dealloc, this is not faulty. Just | |
254 | + * tell upper caller dealloc can't provide any block and it should | |
255 | + * ask for alloc to claim more space. | |
256 | + */ | |
257 | + dealloc = et->et_dealloc; | |
258 | + if (!dealloc) | |
259 | + goto bail; | |
260 | + | |
261 | + for (i = 0; i < blk_wanted; i++) { | |
262 | + /* Prefer to use local slot */ | |
263 | + fl = ocfs2_find_preferred_free_list(EXTENT_ALLOC_SYSTEM_INODE, | |
264 | + osb->slot_num, &real_slot, | |
265 | + dealloc); | |
266 | + /* If no more block can be reused, we should claim more | |
267 | + * from alloc. Just return here normally. | |
268 | + */ | |
269 | + if (!fl) { | |
270 | + status = 0; | |
271 | + break; | |
272 | + } | |
273 | + | |
274 | + bf = fl->f_first; | |
275 | + fl->f_first = bf->free_next; | |
276 | + | |
277 | + new_eb_bh[i] = sb_getblk(osb->sb, bf->free_blk); | |
278 | + if (new_eb_bh[i] == NULL) { | |
279 | + status = -ENOMEM; | |
280 | + mlog_errno(status); | |
281 | + goto bail; | |
282 | + } | |
283 | + | |
284 | + mlog(0, "Reusing block(%llu) from " | |
285 | + "dealloc(local slot:%d, real slot:%d)\n", | |
286 | + bf->free_blk, osb->slot_num, real_slot); | |
287 | + | |
288 | + ocfs2_set_new_buffer_uptodate(et->et_ci, new_eb_bh[i]); | |
289 | + | |
290 | + status = ocfs2_journal_access_eb(handle, et->et_ci, | |
291 | + new_eb_bh[i], | |
292 | + OCFS2_JOURNAL_ACCESS_CREATE); | |
293 | + if (status < 0) { | |
294 | + mlog_errno(status); | |
295 | + goto bail; | |
296 | + } | |
297 | + | |
298 | + memset(new_eb_bh[i]->b_data, 0, osb->sb->s_blocksize); | |
299 | + eb = (struct ocfs2_extent_block *) new_eb_bh[i]->b_data; | |
300 | + | |
301 | + /* We can't guarantee that buffer head is still cached, so | |
302 | + * polutlate the extent block again. | |
303 | + */ | |
304 | + strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE); | |
305 | + eb->h_blkno = cpu_to_le64(bf->free_blk); | |
306 | + eb->h_fs_generation = cpu_to_le32(osb->fs_generation); | |
307 | + eb->h_suballoc_slot = cpu_to_le16(real_slot); | |
308 | + eb->h_suballoc_loc = cpu_to_le64(bf->free_bg); | |
309 | + eb->h_suballoc_bit = cpu_to_le16(bf->free_bit); | |
310 | + eb->h_list.l_count = | |
311 | + cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); | |
312 | + | |
313 | + /* We'll also be dirtied by the caller, so | |
314 | + * this isn't absolutely necessary. | |
315 | + */ | |
316 | + ocfs2_journal_dirty(handle, new_eb_bh[i]); | |
317 | + | |
318 | + if (!fl->f_first) { | |
319 | + dealloc->c_first_suballocator = fl->f_next_suballocator; | |
320 | + kfree(fl); | |
321 | + } | |
322 | + kfree(bf); | |
323 | + } | |
324 | + | |
325 | + *blk_given = i; | |
326 | + | |
327 | +bail: | |
328 | + if (unlikely(status < 0)) { | |
329 | + for (i = 0; i < blk_wanted; i++) | |
330 | + brelse(new_eb_bh[i]); | |
331 | + } | |
332 | + | |
333 | + return status; | |
334 | +} | |
335 | + | |
336 | int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, | |
337 | int type, int slot, u64 suballoc, | |
338 | u64 blkno, unsigned int bit) | |
c3592848 TL |
339 | diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h |
340 | index 27b75cf32cfa..250bcacdf9e9 100644 | |
341 | --- a/fs/ocfs2/alloc.h | |
342 | +++ b/fs/ocfs2/alloc.h | |
343 | @@ -61,6 +61,7 @@ struct ocfs2_extent_tree { | |
344 | ocfs2_journal_access_func et_root_journal_access; | |
345 | void *et_object; | |
346 | unsigned int et_max_leaf_clusters; | |
347 | + struct ocfs2_cached_dealloc_ctxt *et_dealloc; | |
348 | }; | |
349 | ||
350 | /* | |
3323a8b7 | 351 | diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c |
15baf5b4 | 352 | index 256986aca8df..e8e205bf2e41 100644 |
3323a8b7 FG |
353 | --- a/fs/ocfs2/aops.c |
354 | +++ b/fs/ocfs2/aops.c | |
15baf5b4 | 355 | @@ -2332,6 +2332,12 @@ static int ocfs2_dio_end_io_write(struct inode *inode, |
3323a8b7 FG |
356 | |
357 | ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); | |
358 | ||
359 | + /* Attach dealloc with extent tree in case that we may reuse extents | |
360 | + * which are already unlinked from current extent tree due to extent | |
361 | + * rotation and merging. | |
362 | + */ | |
363 | + et.et_dealloc = &dealloc; | |
364 | + | |
365 | ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2, | |
366 | &data_ac, &meta_ac); | |
367 | if (ret) { |