]>
Commit | Line | Data |
---|---|---|
028ba5df TY |
1 | /* -*- mode: c; c-basic-offset: 8; -*- |
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | |
3 | * | |
4 | * move_extents.c | |
5 | * | |
6 | * Copyright (C) 2011 Oracle. All rights reserved. | |
7 | * | |
8 | * This program is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU General Public | |
10 | * License version 2 as published by the Free Software Foundation. | |
11 | * | |
12 | * This program is distributed in the hope that it will be useful, | |
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 | * General Public License for more details. | |
16 | */ | |
17 | #include <linux/fs.h> | |
18 | #include <linux/types.h> | |
19 | #include <linux/mount.h> | |
20 | #include <linux/swap.h> | |
21 | ||
22 | #include <cluster/masklog.h> | |
23 | ||
24 | #include "ocfs2.h" | |
25 | #include "ocfs2_ioctl.h" | |
26 | ||
27 | #include "alloc.h" | |
28 | #include "aops.h" | |
29 | #include "dlmglue.h" | |
30 | #include "extent_map.h" | |
31 | #include "inode.h" | |
32 | #include "journal.h" | |
33 | #include "suballoc.h" | |
34 | #include "uptodate.h" | |
35 | #include "super.h" | |
36 | #include "dir.h" | |
37 | #include "buffer_head_io.h" | |
38 | #include "sysfile.h" | |
39 | #include "suballoc.h" | |
40 | #include "refcounttree.h" | |
41 | #include "move_extents.h" | |
42 | ||
43 | struct ocfs2_move_extents_context { | |
44 | struct inode *inode; | |
45 | struct file *file; | |
46 | int auto_defrag; | |
47 | int credits; | |
48 | u32 new_phys_cpos; | |
49 | u32 clusters_moved; | |
50 | u64 refcount_loc; | |
51 | struct ocfs2_move_extents *range; | |
52 | struct ocfs2_extent_tree et; | |
53 | struct ocfs2_alloc_context *meta_ac; | |
54 | struct ocfs2_alloc_context *data_ac; | |
55 | struct ocfs2_cached_dealloc_ctxt dealloc; | |
56 | }; | |
de474ee8 | 57 | |
8f603e56 TY |
58 | static int __ocfs2_move_extent(handle_t *handle, |
59 | struct ocfs2_move_extents_context *context, | |
60 | u32 cpos, u32 len, u32 p_cpos, u32 new_p_cpos, | |
61 | int ext_flags) | |
62 | { | |
63 | int ret = 0, index; | |
64 | struct inode *inode = context->inode; | |
65 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | |
66 | struct ocfs2_extent_rec *rec, replace_rec; | |
67 | struct ocfs2_path *path = NULL; | |
68 | struct ocfs2_extent_list *el; | |
69 | u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci); | |
70 | u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos); | |
71 | ||
72 | ret = ocfs2_duplicate_clusters_by_page(handle, context->file, cpos, | |
73 | p_cpos, new_p_cpos, len); | |
74 | if (ret) { | |
75 | mlog_errno(ret); | |
76 | goto out; | |
77 | } | |
78 | ||
79 | memset(&replace_rec, 0, sizeof(replace_rec)); | |
80 | replace_rec.e_cpos = cpu_to_le32(cpos); | |
81 | replace_rec.e_leaf_clusters = cpu_to_le16(len); | |
82 | replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb, | |
83 | new_p_cpos)); | |
84 | ||
85 | path = ocfs2_new_path_from_et(&context->et); | |
86 | if (!path) { | |
87 | ret = -ENOMEM; | |
88 | mlog_errno(ret); | |
89 | goto out; | |
90 | } | |
91 | ||
92 | ret = ocfs2_find_path(INODE_CACHE(inode), path, cpos); | |
93 | if (ret) { | |
94 | mlog_errno(ret); | |
95 | goto out; | |
96 | } | |
97 | ||
98 | el = path_leaf_el(path); | |
99 | ||
100 | index = ocfs2_search_extent_list(el, cpos); | |
101 | if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { | |
102 | ocfs2_error(inode->i_sb, | |
103 | "Inode %llu has an extent at cpos %u which can no " | |
104 | "longer be found.\n", | |
105 | (unsigned long long)ino, cpos); | |
106 | ret = -EROFS; | |
107 | goto out; | |
108 | } | |
109 | ||
110 | rec = &el->l_recs[index]; | |
111 | ||
112 | BUG_ON(ext_flags != rec->e_flags); | |
113 | /* | |
114 | * after moving/defraging to new location, the extent is not going | |
115 | * to be refcounted anymore. | |
116 | */ | |
117 | replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED; | |
118 | ||
119 | ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), | |
120 | context->et.et_root_bh, | |
121 | OCFS2_JOURNAL_ACCESS_WRITE); | |
122 | if (ret) { | |
123 | mlog_errno(ret); | |
124 | goto out; | |
125 | } | |
126 | ||
127 | ret = ocfs2_split_extent(handle, &context->et, path, index, | |
128 | &replace_rec, context->meta_ac, | |
129 | &context->dealloc); | |
130 | if (ret) { | |
131 | mlog_errno(ret); | |
132 | goto out; | |
133 | } | |
134 | ||
135 | ocfs2_journal_dirty(handle, context->et.et_root_bh); | |
136 | ||
137 | context->new_phys_cpos = new_p_cpos; | |
138 | ||
139 | /* | |
140 | * need I to append truncate log for old clusters? | |
141 | */ | |
142 | if (old_blkno) { | |
143 | if (ext_flags & OCFS2_EXT_REFCOUNTED) | |
144 | ret = ocfs2_decrease_refcount(inode, handle, | |
145 | ocfs2_blocks_to_clusters(osb->sb, | |
146 | old_blkno), | |
147 | len, context->meta_ac, | |
148 | &context->dealloc, 1); | |
149 | else | |
150 | ret = ocfs2_truncate_log_append(osb, handle, | |
151 | old_blkno, len); | |
152 | } | |
153 | ||
154 | out: | |
155 | return ret; | |
156 | } | |
157 | ||
de474ee8 TY |
158 | /* |
159 | * lock allocators, and reserving appropriate number of bits for | |
160 | * meta blocks and data clusters. | |
161 | * | |
162 | * in some cases, we don't need to reserve clusters, just let data_ac | |
163 | * be NULL. | |
164 | */ | |
165 | static int ocfs2_lock_allocators_move_extents(struct inode *inode, | |
166 | struct ocfs2_extent_tree *et, | |
167 | u32 clusters_to_move, | |
168 | u32 extents_to_split, | |
169 | struct ocfs2_alloc_context **meta_ac, | |
170 | struct ocfs2_alloc_context **data_ac, | |
171 | int extra_blocks, | |
172 | int *credits) | |
173 | { | |
174 | int ret, num_free_extents; | |
175 | unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move; | |
176 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | |
177 | ||
178 | num_free_extents = ocfs2_num_free_extents(osb, et); | |
179 | if (num_free_extents < 0) { | |
180 | ret = num_free_extents; | |
181 | mlog_errno(ret); | |
182 | goto out; | |
183 | } | |
184 | ||
185 | if (!num_free_extents || | |
186 | (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) | |
187 | extra_blocks += ocfs2_extend_meta_needed(et->et_root_el); | |
188 | ||
189 | ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, meta_ac); | |
190 | if (ret) { | |
191 | mlog_errno(ret); | |
192 | goto out; | |
193 | } | |
194 | ||
195 | if (data_ac) { | |
196 | ret = ocfs2_reserve_clusters(osb, clusters_to_move, data_ac); | |
197 | if (ret) { | |
198 | mlog_errno(ret); | |
199 | goto out; | |
200 | } | |
201 | } | |
202 | ||
203 | *credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el, | |
204 | clusters_to_move + 2); | |
205 | ||
206 | mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n", | |
207 | extra_blocks, clusters_to_move, *credits); | |
208 | out: | |
209 | if (ret) { | |
210 | if (*meta_ac) { | |
211 | ocfs2_free_alloc_context(*meta_ac); | |
212 | *meta_ac = NULL; | |
213 | } | |
214 | } | |
215 | ||
216 | return ret; | |
217 | } | |
202ee5fa TY |
218 | |
219 | /* | |
220 | * Using one journal handle to guarantee the data consistency in case | |
221 | * crash happens anywhere. | |
222 | */ | |
223 | static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context, | |
224 | u32 cpos, u32 phys_cpos, u32 len, int ext_flags) | |
225 | { | |
226 | int ret, credits = 0, extra_blocks = 0; | |
227 | handle_t *handle; | |
228 | struct inode *inode = context->inode; | |
229 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | |
230 | struct inode *tl_inode = osb->osb_tl_inode; | |
231 | struct ocfs2_refcount_tree *ref_tree = NULL; | |
232 | u32 new_phys_cpos, new_len; | |
233 | u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); | |
234 | ||
235 | if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) { | |
236 | ||
237 | BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & | |
238 | OCFS2_HAS_REFCOUNT_FL)); | |
239 | ||
240 | BUG_ON(!context->refcount_loc); | |
241 | ||
242 | ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1, | |
243 | &ref_tree, NULL); | |
244 | if (ret) { | |
245 | mlog_errno(ret); | |
246 | return ret; | |
247 | } | |
248 | ||
249 | ret = ocfs2_prepare_refcount_change_for_del(inode, | |
250 | context->refcount_loc, | |
251 | phys_blkno, | |
252 | len, | |
253 | &credits, | |
254 | &extra_blocks); | |
255 | if (ret) { | |
256 | mlog_errno(ret); | |
257 | goto out; | |
258 | } | |
259 | } | |
260 | ||
261 | ret = ocfs2_lock_allocators_move_extents(inode, &context->et, len, 1, | |
262 | &context->meta_ac, | |
263 | &context->data_ac, | |
264 | extra_blocks, &credits); | |
265 | if (ret) { | |
266 | mlog_errno(ret); | |
267 | goto out; | |
268 | } | |
269 | ||
270 | /* | |
271 | * should be using allocation reservation strategy there? | |
272 | * | |
273 | * if (context->data_ac) | |
274 | * context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv; | |
275 | */ | |
276 | ||
277 | mutex_lock(&tl_inode->i_mutex); | |
278 | ||
279 | if (ocfs2_truncate_log_needs_flush(osb)) { | |
280 | ret = __ocfs2_flush_truncate_log(osb); | |
281 | if (ret < 0) { | |
282 | mlog_errno(ret); | |
283 | goto out_unlock_mutex; | |
284 | } | |
285 | } | |
286 | ||
287 | handle = ocfs2_start_trans(osb, credits); | |
288 | if (IS_ERR(handle)) { | |
289 | ret = PTR_ERR(handle); | |
290 | mlog_errno(ret); | |
291 | goto out_unlock_mutex; | |
292 | } | |
293 | ||
294 | ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, len, | |
295 | &new_phys_cpos, &new_len); | |
296 | if (ret) { | |
297 | mlog_errno(ret); | |
298 | goto out_commit; | |
299 | } | |
300 | ||
301 | /* | |
302 | * we're not quite patient here to make multiple attempts for claiming | |
303 | * enough clusters, failure to claim clusters per-requested is not a | |
304 | * disaster though, it can only mean partial range of defragmentation | |
305 | * or extent movements gets gone, users anyway is able to have another | |
306 | * try as they wish anytime, since they're going to be returned a | |
307 | * '-ENOSPC' and completed length of this movement. | |
308 | */ | |
309 | if (new_len != len) { | |
310 | mlog(0, "len_claimed: %u, len: %u\n", new_len, len); | |
311 | context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE; | |
312 | ret = -ENOSPC; | |
313 | goto out_commit; | |
314 | } | |
315 | ||
316 | mlog(0, "cpos: %u, phys_cpos: %u, new_phys_cpos: %u\n", cpos, | |
317 | phys_cpos, new_phys_cpos); | |
318 | ||
319 | ret = __ocfs2_move_extent(handle, context, cpos, len, phys_cpos, | |
320 | new_phys_cpos, ext_flags); | |
321 | if (ret) | |
322 | mlog_errno(ret); | |
323 | ||
324 | /* | |
325 | * Here we should write the new page out first if we are | |
326 | * in write-back mode. | |
327 | */ | |
328 | ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, len); | |
329 | if (ret) | |
330 | mlog_errno(ret); | |
331 | ||
332 | out_commit: | |
333 | ocfs2_commit_trans(osb, handle); | |
334 | ||
335 | out_unlock_mutex: | |
336 | mutex_unlock(&tl_inode->i_mutex); | |
337 | ||
338 | if (context->data_ac) { | |
339 | ocfs2_free_alloc_context(context->data_ac); | |
340 | context->data_ac = NULL; | |
341 | } | |
342 | ||
343 | if (context->meta_ac) { | |
344 | ocfs2_free_alloc_context(context->meta_ac); | |
345 | context->meta_ac = NULL; | |
346 | } | |
347 | ||
348 | out: | |
349 | if (ref_tree) | |
350 | ocfs2_unlock_refcount_tree(osb, ref_tree, 1); | |
351 | ||
352 | return ret; | |
353 | } | |
1c06b912 TY |
354 | |
355 | /* | |
356 | * find the victim alloc group, where #blkno fits. | |
357 | */ | |
358 | static int ocfs2_find_victim_alloc_group(struct inode *inode, | |
359 | u64 vict_blkno, | |
360 | int type, int slot, | |
361 | int *vict_bit, | |
362 | struct buffer_head **ret_bh) | |
363 | { | |
364 | int ret, i, blocks_per_unit = 1; | |
365 | u64 blkno; | |
366 | char namebuf[40]; | |
367 | ||
368 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | |
369 | struct buffer_head *ac_bh = NULL, *gd_bh = NULL; | |
370 | struct ocfs2_chain_list *cl; | |
371 | struct ocfs2_chain_rec *rec; | |
372 | struct ocfs2_dinode *ac_dinode; | |
373 | struct ocfs2_group_desc *bg; | |
374 | ||
375 | ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot); | |
376 | ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf, | |
377 | strlen(namebuf), &blkno); | |
378 | if (ret) { | |
379 | ret = -ENOENT; | |
380 | goto out; | |
381 | } | |
382 | ||
383 | ret = ocfs2_read_blocks_sync(osb, blkno, 1, &ac_bh); | |
384 | if (ret) { | |
385 | mlog_errno(ret); | |
386 | goto out; | |
387 | } | |
388 | ||
389 | ac_dinode = (struct ocfs2_dinode *)ac_bh->b_data; | |
390 | cl = &(ac_dinode->id2.i_chain); | |
391 | rec = &(cl->cl_recs[0]); | |
392 | ||
393 | if (type == GLOBAL_BITMAP_SYSTEM_INODE) | |
394 | blocks_per_unit <<= (osb->s_clustersize_bits - | |
395 | inode->i_sb->s_blocksize_bits); | |
396 | /* | |
397 | * 'vict_blkno' was out of the valid range. | |
398 | */ | |
399 | if ((vict_blkno < le64_to_cpu(rec->c_blkno)) || | |
400 | (vict_blkno >= (le32_to_cpu(ac_dinode->id1.bitmap1.i_total) * | |
401 | blocks_per_unit))) { | |
402 | ret = -EINVAL; | |
403 | goto out; | |
404 | } | |
405 | ||
406 | for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) { | |
407 | ||
408 | rec = &(cl->cl_recs[i]); | |
409 | if (!rec) | |
410 | continue; | |
411 | ||
412 | bg = NULL; | |
413 | ||
414 | do { | |
415 | if (!bg) | |
416 | blkno = le64_to_cpu(rec->c_blkno); | |
417 | else | |
418 | blkno = le64_to_cpu(bg->bg_next_group); | |
419 | ||
420 | if (gd_bh) { | |
421 | brelse(gd_bh); | |
422 | gd_bh = NULL; | |
423 | } | |
424 | ||
425 | ret = ocfs2_read_blocks_sync(osb, blkno, 1, &gd_bh); | |
426 | if (ret) { | |
427 | mlog_errno(ret); | |
428 | goto out; | |
429 | } | |
430 | ||
431 | bg = (struct ocfs2_group_desc *)gd_bh->b_data; | |
432 | ||
433 | if (vict_blkno < (le64_to_cpu(bg->bg_blkno) + | |
434 | le16_to_cpu(bg->bg_bits))) { | |
435 | ||
436 | *ret_bh = gd_bh; | |
437 | *vict_bit = (vict_blkno - blkno) / | |
438 | blocks_per_unit; | |
439 | mlog(0, "find the victim group: #%llu, " | |
440 | "total_bits: %u, vict_bit: %u\n", | |
441 | blkno, le16_to_cpu(bg->bg_bits), | |
442 | *vict_bit); | |
443 | goto out; | |
444 | } | |
445 | ||
446 | } while (le64_to_cpu(bg->bg_next_group)); | |
447 | } | |
448 | ||
449 | ret = -EINVAL; | |
450 | out: | |
451 | brelse(ac_bh); | |
452 | ||
453 | /* | |
454 | * caller has to release the gd_bh properly. | |
455 | */ | |
456 | return ret; | |
457 | } |