]> git.proxmox.com Git - pve-kernel.git/blame - patches/kernel/0029-ocfs2-try-to-reuse-extent-block-in-dealloc-without-m.patch
add cherry-picks for OCFS2 bug
[pve-kernel.git] / patches / kernel / 0029-ocfs2-try-to-reuse-extent-block-in-dealloc-without-m.patch
CommitLineData
3323a8b7
FG
1From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2From: Changwei Ge <ge.changwei@h3c.com>
3Date: Wed, 31 Jan 2018 16:15:06 -0800
4Subject: [PATCH] ocfs2: try to reuse extent block in dealloc without
5 meta_alloc
6MIME-Version: 1.0
7Content-Type: text/plain; charset=UTF-8
8Content-Transfer-Encoding: 8bit
9
10A crash issue was reported by John Lightsey with a call trace as follows:
11
12 ocfs2_split_extent+0x1ad3/0x1b40 [ocfs2]
13 ocfs2_change_extent_flag+0x33a/0x470 [ocfs2]
14 ocfs2_mark_extent_written+0x172/0x220 [ocfs2]
15 ocfs2_dio_end_io+0x62d/0x910 [ocfs2]
16 dio_complete+0x19a/0x1a0
17 do_blockdev_direct_IO+0x19dd/0x1eb0
18 __blockdev_direct_IO+0x43/0x50
19 ocfs2_direct_IO+0x8f/0xa0 [ocfs2]
20 generic_file_direct_write+0xb2/0x170
21 __generic_file_write_iter+0xc3/0x1b0
22 ocfs2_file_write_iter+0x4bb/0xca0 [ocfs2]
23 __vfs_write+0xae/0xf0
24 vfs_write+0xb8/0x1b0
25 SyS_write+0x4f/0xb0
26 system_call_fastpath+0x16/0x75
27
28The BUG code told that extent tree wants to grow but no metadata was
29reserved ahead of time. From my investigation into this issue, the root
30cause is that although enough metadata is not reserved, there should be
31enough for following use. Rightmost extent is merged into its left one
32after marking extents written a certain number of times. Because during
33marking extent written, we got many physically continuous extents. At
34last, an empty extent showed up and the rightmost path is removed from
35extent tree.
36
37Add a new mechanism to reuse extent block cached in dealloc which were
38just unlinked from extent tree to solve this crash issue.
39
40Criteria is that during marking extents *written*, if extent rotation
41and merging results in unlinking extent with growing extent tree later
42without any metadata reserved ahead of time, try to reuse those extents
43in dealloc in which deleted extents are cached.
44
45Also, this patch addresses the issue John reported that ::dw_zero_count
46is not calculated properly.
47
48After applying this patch, the issue John reported was gone. Thanks for
49the reproducer provided by John. And this patch has passed
50ocfs2-test (29 cases) suite run by New H3C Group.
51
52[ge.changwei@h3c.com: fix static checker warning]
53 Link: http://lkml.kernel.org/r/63ADC13FD55D6546B7DECE290D39E373F29196AE@H3CMLB12-EX.srv.huawei-3com.com
54[akpm@linux-foundation.org: brelse(NULL) is legal]
55Link: http://lkml.kernel.org/r/1515479070-32653-2-git-send-email-ge.changwei@h3c.com
56Signed-off-by: Changwei Ge <ge.changwei@h3c.com>
57Reported-by: John Lightsey <john@nixnuts.net>
58Tested-by: John Lightsey <john@nixnuts.net>
59Cc: Joel Becker <jlbec@evilplan.org>
60Cc: Joseph Qi <jiangqi903@gmail.com>
61Cc: Junxiao Bi <junxiao.bi@oracle.com>
62Cc: Dan Carpenter <dan.carpenter@oracle.com>
63Cc: Mark Fasheh <mfasheh@versity.com>
64Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
65Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
66(cherry picked from commit 71a36944042b7d9dd71f6a5d1c5ea1c2353b5d42)
67Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
68---
69 fs/ocfs2/alloc.h | 1 +
70 fs/ocfs2/alloc.c | 206 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
71 fs/ocfs2/aops.c | 6 ++
72 3 files changed, 203 insertions(+), 10 deletions(-)
73
74diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
75index 4a5152ec88a3..571692171dd1 100644
76--- a/fs/ocfs2/alloc.h
77+++ b/fs/ocfs2/alloc.h
78@@ -61,6 +61,7 @@ struct ocfs2_extent_tree {
79 ocfs2_journal_access_func et_root_journal_access;
80 void *et_object;
81 unsigned int et_max_leaf_clusters;
82+ struct ocfs2_cached_dealloc_ctxt *et_dealloc;
83 };
84
85 /*
86diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
87index 386aecce881d..9b5e7d8ba710 100644
88--- a/fs/ocfs2/alloc.c
89+++ b/fs/ocfs2/alloc.c
90@@ -165,6 +165,13 @@ static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
91 struct ocfs2_extent_rec *rec);
92 static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et);
93 static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
94+
95+static int ocfs2_reuse_blk_from_dealloc(handle_t *handle,
96+ struct ocfs2_extent_tree *et,
97+ struct buffer_head **new_eb_bh,
98+ int blk_wanted, int *blk_given);
99+static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et);
100+
101 static const struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
102 .eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk,
103 .eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk,
104@@ -448,6 +455,7 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
105 if (!obj)
106 obj = (void *)bh->b_data;
107 et->et_object = obj;
108+ et->et_dealloc = NULL;
109
110 et->et_ops->eo_fill_root_el(et);
111 if (!et->et_ops->eo_fill_max_leaf_clusters)
112@@ -1159,7 +1167,7 @@ static int ocfs2_add_branch(handle_t *handle,
113 struct buffer_head **last_eb_bh,
114 struct ocfs2_alloc_context *meta_ac)
115 {
116- int status, new_blocks, i;
117+ int status, new_blocks, i, block_given = 0;
118 u64 next_blkno, new_last_eb_blk;
119 struct buffer_head *bh;
120 struct buffer_head **new_eb_bhs = NULL;
121@@ -1214,11 +1222,31 @@ static int ocfs2_add_branch(handle_t *handle,
122 goto bail;
123 }
124
125- status = ocfs2_create_new_meta_bhs(handle, et, new_blocks,
126- meta_ac, new_eb_bhs);
127- if (status < 0) {
128- mlog_errno(status);
129- goto bail;
130+ /* Firstly, try to reuse dealloc since we have already estimated how
131+ * many extent blocks we may use.
132+ */
133+ if (!ocfs2_is_dealloc_empty(et)) {
134+ status = ocfs2_reuse_blk_from_dealloc(handle, et,
135+ new_eb_bhs, new_blocks,
136+ &block_given);
137+ if (status < 0) {
138+ mlog_errno(status);
139+ goto bail;
140+ }
141+ }
142+
143+ BUG_ON(block_given > new_blocks);
144+
145+ if (block_given < new_blocks) {
146+ BUG_ON(!meta_ac);
147+ status = ocfs2_create_new_meta_bhs(handle, et,
148+ new_blocks - block_given,
149+ meta_ac,
150+ &new_eb_bhs[block_given]);
151+ if (status < 0) {
152+ mlog_errno(status);
153+ goto bail;
154+ }
155 }
156
157 /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
158@@ -1341,15 +1369,25 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
159 struct ocfs2_alloc_context *meta_ac,
160 struct buffer_head **ret_new_eb_bh)
161 {
162- int status, i;
163+ int status, i, block_given = 0;
164 u32 new_clusters;
165 struct buffer_head *new_eb_bh = NULL;
166 struct ocfs2_extent_block *eb;
167 struct ocfs2_extent_list *root_el;
168 struct ocfs2_extent_list *eb_el;
169
170- status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
171- &new_eb_bh);
172+ if (!ocfs2_is_dealloc_empty(et)) {
173+ status = ocfs2_reuse_blk_from_dealloc(handle, et,
174+ &new_eb_bh, 1,
175+ &block_given);
176+ } else if (meta_ac) {
177+ status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
178+ &new_eb_bh);
179+
180+ } else {
181+ BUG();
182+ }
183+
184 if (status < 0) {
185 mlog_errno(status);
186 goto bail;
187@@ -1512,7 +1550,7 @@ static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
188 int depth = le16_to_cpu(el->l_tree_depth);
189 struct buffer_head *bh = NULL;
190
191- BUG_ON(meta_ac == NULL);
192+ BUG_ON(meta_ac == NULL && ocfs2_is_dealloc_empty(et));
193
194 shift = ocfs2_find_branch_target(et, &bh);
195 if (shift < 0) {
196@@ -6593,6 +6631,154 @@ ocfs2_find_per_slot_free_list(int type,
197 return fl;
198 }
199
200+static struct ocfs2_per_slot_free_list *
201+ocfs2_find_preferred_free_list(int type,
202+ int preferred_slot,
203+ int *real_slot,
204+ struct ocfs2_cached_dealloc_ctxt *ctxt)
205+{
206+ struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
207+
208+ while (fl) {
209+ if (fl->f_inode_type == type && fl->f_slot == preferred_slot) {
210+ *real_slot = fl->f_slot;
211+ return fl;
212+ }
213+
214+ fl = fl->f_next_suballocator;
215+ }
216+
217+ /* If we can't find any free list matching preferred slot, just use
218+ * the first one.
219+ */
220+ fl = ctxt->c_first_suballocator;
221+ *real_slot = fl->f_slot;
222+
223+ return fl;
224+}
225+
226+/* Return Value 1 indicates empty */
227+static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et)
228+{
229+ struct ocfs2_per_slot_free_list *fl = NULL;
230+
231+ if (!et->et_dealloc)
232+ return 1;
233+
234+ fl = et->et_dealloc->c_first_suballocator;
235+ if (!fl)
236+ return 1;
237+
238+ if (!fl->f_first)
239+ return 1;
240+
241+ return 0;
242+}
243+
244+/* If extent was deleted from tree due to extent rotation and merging, and
245+ * no metadata is reserved ahead of time. Try to reuse some extents
246+ * just deleted. This is only used to reuse extent blocks.
247+ * It is supposed to find enough extent blocks in dealloc if our estimation
248+ * on metadata is accurate.
249+ */
250+static int ocfs2_reuse_blk_from_dealloc(handle_t *handle,
251+ struct ocfs2_extent_tree *et,
252+ struct buffer_head **new_eb_bh,
253+ int blk_wanted, int *blk_given)
254+{
255+ int i, status = 0, real_slot;
256+ struct ocfs2_cached_dealloc_ctxt *dealloc;
257+ struct ocfs2_per_slot_free_list *fl;
258+ struct ocfs2_cached_block_free *bf;
259+ struct ocfs2_extent_block *eb;
260+ struct ocfs2_super *osb =
261+ OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
262+
263+ *blk_given = 0;
264+
265+ /* If extent tree doesn't have a dealloc, this is not faulty. Just
266+ * tell upper caller dealloc can't provide any block and it should
267+ * ask for alloc to claim more space.
268+ */
269+ dealloc = et->et_dealloc;
270+ if (!dealloc)
271+ goto bail;
272+
273+ for (i = 0; i < blk_wanted; i++) {
274+ /* Prefer to use local slot */
275+ fl = ocfs2_find_preferred_free_list(EXTENT_ALLOC_SYSTEM_INODE,
276+ osb->slot_num, &real_slot,
277+ dealloc);
278+ /* If no more block can be reused, we should claim more
279+ * from alloc. Just return here normally.
280+ */
281+ if (!fl) {
282+ status = 0;
283+ break;
284+ }
285+
286+ bf = fl->f_first;
287+ fl->f_first = bf->free_next;
288+
289+ new_eb_bh[i] = sb_getblk(osb->sb, bf->free_blk);
290+ if (new_eb_bh[i] == NULL) {
291+ status = -ENOMEM;
292+ mlog_errno(status);
293+ goto bail;
294+ }
295+
296+ mlog(0, "Reusing block(%llu) from "
297+ "dealloc(local slot:%d, real slot:%d)\n",
298+ bf->free_blk, osb->slot_num, real_slot);
299+
300+ ocfs2_set_new_buffer_uptodate(et->et_ci, new_eb_bh[i]);
301+
302+ status = ocfs2_journal_access_eb(handle, et->et_ci,
303+ new_eb_bh[i],
304+ OCFS2_JOURNAL_ACCESS_CREATE);
305+ if (status < 0) {
306+ mlog_errno(status);
307+ goto bail;
308+ }
309+
310+ memset(new_eb_bh[i]->b_data, 0, osb->sb->s_blocksize);
311+ eb = (struct ocfs2_extent_block *) new_eb_bh[i]->b_data;
312+
313+ /* We can't guarantee that buffer head is still cached, so
314+ * populate the extent block again.
315+ */
316+ strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
317+ eb->h_blkno = cpu_to_le64(bf->free_blk);
318+ eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
319+ eb->h_suballoc_slot = cpu_to_le16(real_slot);
320+ eb->h_suballoc_loc = cpu_to_le64(bf->free_bg);
321+ eb->h_suballoc_bit = cpu_to_le16(bf->free_bit);
322+ eb->h_list.l_count =
323+ cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
324+
325+ /* We'll also be dirtied by the caller, so
326+ * this isn't absolutely necessary.
327+ */
328+ ocfs2_journal_dirty(handle, new_eb_bh[i]);
329+
330+ if (!fl->f_first) {
331+ dealloc->c_first_suballocator = fl->f_next_suballocator;
332+ kfree(fl);
333+ }
334+ kfree(bf);
335+ }
336+
337+ *blk_given = i;
338+
339+bail:
340+ if (unlikely(status < 0)) {
341+ for (i = 0; i < blk_wanted; i++)
342+ brelse(new_eb_bh[i]);
343+ }
344+
345+ return status;
346+}
347+
348 int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
349 int type, int slot, u64 suballoc,
350 u64 blkno, unsigned int bit)
351diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
352index 77ec9b495027..2ff02dda97d8 100644
353--- a/fs/ocfs2/aops.c
354+++ b/fs/ocfs2/aops.c
355@@ -2322,6 +2322,12 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
356
357 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
358
359+ /* Attach dealloc with extent tree in case that we may reuse extents
360+ * which are already unlinked from current extent tree due to extent
361+ * rotation and merging.
362+ */
363+ et.et_dealloc = &dealloc;
364+
365 ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2,
366 &data_ac, &meta_ac);
367 if (ret) {
368--
3692.14.2
370