/*
 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
 * Written by Alex Tomas <alex@clusterfs.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
 */


/*
 * mballoc.c contains the multiblock allocation routines
 */

#include "mballoc.h"
#include <linux/debugfs.h>
#include <linux/slab.h>
#include <trace/events/ext4.h>

/*
 * MUSTDO:
 *   - test ext4_ext_search_left() and ext4_ext_search_right()
 *   - search for metadata in few groups
 *
 * TODO v4:
 *   - normalization should take into account whether file is still open
 *   - discard preallocations if no free space left (policy?)
 *   - don't normalize tails
 *   - quota
 *   - reservation for superuser
 *
 * TODO v3:
 *   - bitmap read-ahead (proposed by Oleg Drokin aka green)
 *   - track min/max extents in each group for better group selection
 *   - mb_mark_used() may allocate chunk right after splitting buddy
 *   - tree of groups sorted by number of free blocks
 *   - error handling
 */

/*
 * The allocation request involves a request for multiple blocks
 * near to the goal (block) value specified.
 *
 * During the initialization phase of the allocator we decide to use the
 * group preallocation or inode preallocation depending on the size of
 * the file. The size of the file could be the resulting file size we
 * would have after allocation, or the current file size, whichever
 * is larger. If the size is less than sbi->s_mb_stream_request we
 * select to use the group preallocation. The default value of
 * s_mb_stream_request is 16 blocks. This can also be tuned via
 * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in
 * terms of number of blocks.
 *
 * The main motivation for having small files use group preallocation is to
 * ensure that we keep small files close together on the disk.
 *
 * In the first stage the allocator looks at the inode prealloc list,
 * ext4_inode_info->i_prealloc_list, which contains the list of prealloc
 * spaces for this particular inode. The inode prealloc space is
 * represented as:
 *
 * pa_lstart -> the logical start block for this prealloc space
 * pa_pstart -> the physical start block for this prealloc space
 * pa_len    -> length for this prealloc space
 * pa_free   -> free space available in this prealloc space
 *
 * The inode preallocation space is used looking at the _logical_ start
 * block. Only if the logical file block falls within the range of the
 * prealloc space do we consume that particular prealloc space. This makes
 * sure that we have contiguous physical blocks representing the file blocks.
 *
 * The important thing to be noted in case of inode prealloc space is that
 * we don't modify the values associated with the inode prealloc space except
 * pa_free.
 *
 * If we are not able to find blocks in the inode prealloc space and if we
 * have the group allocation flag set then we look at the locality group
 * prealloc space. These are per CPU prealloc lists represented as
 *
 * ext4_sb_info.s_locality_groups[smp_processor_id()]
 *
 * The reason for having a per cpu locality group is to reduce the contention
 * between CPUs. It is possible to get scheduled at this point.
 *
 * The locality group prealloc space is used looking at whether we have
 * enough free space (pa_free) within the prealloc space.
 *
 * If we can't allocate blocks via inode prealloc or/and locality group
 * prealloc then we look at the buddy cache. The buddy cache is represented
 * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets
 * mapped to the buddy and bitmap information regarding different
 * groups. The buddy information is attached to the buddy cache inode so that
 * we can access it through the page cache. The information regarding
 * each group is loaded via ext4_mb_load_buddy.  The information involves
 * the block bitmap and buddy information. The information is stored in the
 * inode as:
 *
 *  {                        page                        }
 *  [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
 *
 *
 * one block each for bitmap and buddy information.  So for each group we
 * take up 2 blocks. A page can contain blocks_per_page (PAGE_CACHE_SIZE /
 * blocksize) blocks.  So it can have information regarding groups_per_page
 * which is blocks_per_page/2
 *
 * The buddy cache inode is not stored on disk. The inode is thrown
 * away when the filesystem is unmounted.
 *
 * We look for count number of blocks in the buddy cache. If we were able
 * to locate that many free blocks we return with additional information
 * regarding the rest of the contiguous physical blocks available.
 *
 * Before allocating blocks via the buddy cache we normalize the request
 * blocks. This ensures we ask for more blocks than we need. The extra
 * blocks that we get after allocation are added to the respective prealloc
 * list. In case of inode preallocation we follow a list of heuristics
 * based on file size. This can be found in ext4_mb_normalize_request. If
 * we are doing a group prealloc we try to normalize the request to
 * sbi->s_mb_group_prealloc. The default value of s_mb_group_prealloc is
 * 512 blocks. This can be tuned via
 * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
 * terms of number of blocks. If we have mounted the file system with the
 * -o stripe=<value> option the group prealloc request is normalized to the
 * stripe value (sbi->s_stripe)
 *
 * The regular allocator (using the buddy cache) supports a few tunables.
 *
 * /sys/fs/ext4/<partition>/mb_min_to_scan
 * /sys/fs/ext4/<partition>/mb_max_to_scan
 * /sys/fs/ext4/<partition>/mb_order2_req
 *
 * The regular allocator uses buddy scan only if the request len is a power of
 * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
 * value of s_mb_order2_reqs can be tuned via
 * /sys/fs/ext4/<partition>/mb_order2_req.  If the request len is equal to
 * the stripe size (sbi->s_stripe), we try to search for contiguous blocks in
 * stripe size. This should result in better allocation on RAID setups. If
 * not, we search in the specific group using the bitmap for best extents. The
 * tunables min_to_scan and max_to_scan control the behaviour here.
 * min_to_scan indicates how long mballoc __must__ look for a best
 * extent and max_to_scan indicates how long mballoc __can__ look for a
 * best extent in the found extents. Searching for the blocks starts with
 * the group specified as the goal value in the allocation context via
 * ac_g_ex. Each group is first checked based on the criteria whether it
 * can be used for allocation. ext4_mb_good_group explains how the groups are
 * checked.
 *
 * Both of the prealloc spaces are populated as described above. So for the
 * first request we will hit the buddy cache, which will result in this
 * prealloc space getting filled. The prealloc space is then later used for
 * subsequent requests.
 */
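
/*
 * Illustrative sketch (not compiled): the group-vs-inode preallocation
 * decision described above is, in essence, a size comparison against
 * sbi->s_mb_stream_request.  The helper name and parameters below are
 * hypothetical simplifications for illustration only; see
 * ext4_mb_group_or_file() for the real logic.
 */
#if 0
static int mb_sketch_use_group_prealloc(struct ext4_sb_info *sbi,
					ext4_fsblk_t alloc_end_blk,
					ext4_fsblk_t isize_blk)
{
	/* "file size" is the larger of the size after this allocation
	 * and the current size, both expressed in blocks */
	ext4_fsblk_t size = max(alloc_end_blk, isize_blk);

	/* small files go to the per-CPU locality group preallocation,
	 * large files get their own per-inode preallocation */
	return size < sbi->s_mb_stream_request;
}
#endif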

/*
 * mballoc operates on the following data:
 *  - on-disk bitmap
 *  - in-core buddy (actually includes buddy and bitmap)
 *  - preallocation descriptors (PAs)
 *
 * there are two types of preallocations:
 *  - inode
 *    assigned to a specific inode and can be used for this inode only.
 *    it describes part of the inode's space preallocated to specific
 *    physical blocks. any block from that preallocation can be used
 *    independently. the descriptor just tracks the number of blocks left
 *    unused. so, before taking some block from the descriptor, one must
 *    make sure the corresponding logical block isn't allocated yet. this
 *    also means that freeing any block within the descriptor's range
 *    must discard all preallocated blocks.
 *  - locality group
 *    assigned to a specific locality group which does not translate to
 *    a permanent set of inodes: an inode can join and leave the group. space
 *    from this type of preallocation can be used for any inode. thus
 *    it's consumed from the beginning to the end.
 *
 * the relation between them can be expressed as:
 *    in-core buddy = on-disk bitmap + preallocation descriptors
 *
 * this means the blocks mballoc considers used are:
 *  - allocated blocks (persistent)
 *  - preallocated blocks (non-persistent)
 *
 * consistency in mballoc world means that at any time a block is either
 * free or used in ALL structures. notice: "any time" should not be read
 * literally -- time is discrete and delimited by locks.
 *
 * to keep it simple, we don't use block numbers, instead we count number of
 * blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA.
 *
 * all operations can be expressed as:
 *  - init buddy:			buddy = on-disk + PAs
 *  - new PA:				buddy += N; PA = N
 *  - use inode PA:			on-disk += N; PA -= N
 *  - discard inode PA:			buddy -= on-disk - PA; PA = 0
 *  - use locality group PA:		on-disk += N; PA -= N
 *  - discard locality group PA:	buddy -= PA; PA = 0
 *  note: 'buddy -= on-disk - PA' is used to show that the on-disk bitmap
 *        is used in the real operation because we can't know actual used
 *        bits from PA, only from the on-disk bitmap
 *
 * if we follow this strict logic, then all operations above should be atomic.
 * given some of them can block, we'd have to use something like semaphores
 * killing performance on high-end SMP hardware. let's try to relax it using
 * the following knowledge:
 *  1) if buddy is referenced, it's already initialized
 *  2) while a block is used in buddy and the buddy is referenced,
 *     nobody can re-allocate that block
 *  3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has
 *     a bit set and PA claims the same block, it's OK. IOW, one can set a bit
 *     in the on-disk bitmap if the buddy has the same bit set or/and a PA
 *     covers the corresponding block
 *
 * so, now we're building a concurrency table:
 *  - init buddy vs.
 *    - new PA
 *      blocks for PA are allocated in the buddy, buddy must be referenced
 *      until PA is linked to allocation group to avoid concurrent buddy init
 *    - use inode PA
 *      we need to make sure that either on-disk bitmap or PA has uptodate data
 *      given (3) we care that PA-=N operation doesn't interfere with init
 *    - discard inode PA
 *      the simplest way would be to have buddy initialized by the discard
 *    - use locality group PA
 *      again PA-=N must be serialized with init
 *    - discard locality group PA
 *      the simplest way would be to have buddy initialized by the discard
 *  - new PA vs.
 *    - use inode PA
 *      i_data_sem serializes them
 *    - discard inode PA
 *      discard process must wait until PA isn't used by another process
 *    - use locality group PA
 *      some mutex should serialize them
 *    - discard locality group PA
 *      discard process must wait until PA isn't used by another process
 *  - use inode PA
 *    - use inode PA
 *      i_data_sem or another mutex should serialize them
 *    - discard inode PA
 *      discard process must wait until PA isn't used by another process
 *    - use locality group PA
 *      nothing wrong here -- they're different PAs covering different blocks
 *    - discard locality group PA
 *      discard process must wait until PA isn't used by another process
 *
 * now we're ready to draw a few conclusions:
 *  - while a PA is referenced, no discard of it is possible
 *  - a PA stays referenced until its blocks are marked in the on-disk bitmap
 *  - a PA changes only after the on-disk bitmap does
 *  - discard must not compete with init. either init is done before
 *    any discard or they're serialized somehow
 *  - buddy init as sum of on-disk bitmap and PAs is done atomically
 *
 * a special case is when we've used a PA to emptiness. no need to modify the
 * buddy in this case, but we should care about concurrent init
 *
 */
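
/*
 * Illustrative sketch (not compiled): the accounting identity above,
 * "in-core buddy = on-disk bitmap + preallocation descriptors", expressed
 * with plain counters.  The structure and functions below are hypothetical
 * and exist only to show how each operation keeps the identity intact.
 */
#if 0
struct mb_sketch_counters {
	unsigned int ondisk;	/* blocks marked used in the on-disk bitmap */
	unsigned int pa;	/* blocks still held by preallocations */
	unsigned int buddy;	/* blocks marked used in the in-core buddy */
};

static void mb_sketch_check(struct mb_sketch_counters *c)
{
	/* the invariant every operation must preserve */
	BUG_ON(c->buddy != c->ondisk + c->pa);
}

static void mb_sketch_ops(struct mb_sketch_counters *c,
			  unsigned int n, unsigned int m /* m <= n */)
{
	c->buddy += n; c->pa += n;	/* new PA of n blocks */
	mb_sketch_check(c);

	c->ondisk += m; c->pa -= m;	/* use m of those blocks */
	mb_sketch_check(c);

	c->buddy -= c->pa; c->pa = 0;	/* discard the unused remainder */
	mb_sketch_check(c);
}
#endif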

/*
 * Logic in a few words:
 *
 *  - allocation:
 *    load group
 *    find blocks
 *    mark bits in on-disk bitmap
 *    release group
 *
 *  - use preallocation:
 *    find proper PA (per-inode or group)
 *    load group
 *    mark bits in on-disk bitmap
 *    release group
 *    release PA
 *
 *  - free:
 *    load group
 *    mark bits in on-disk bitmap
 *    release group
 *
 *  - discard preallocations in group:
 *    mark PAs deleted
 *    move them onto local list
 *    load on-disk bitmap
 *    load group
 *    remove PA from object (inode or locality group)
 *    mark free blocks in-core
 *
 *  - discard inode's preallocations:
 */

/*
 * Locking rules
 *
 * Locks:
 *  - bitlock on a group	(group)
 *  - object (inode/locality)	(object)
 *  - per-pa lock		(pa)
 *
 * Paths:
 *  - new pa
 *    object
 *    group
 *
 *  - find and use pa:
 *    pa
 *
 *  - release consumed pa:
 *    pa
 *    group
 *    object
 *
 *  - generate in-core bitmap:
 *    group
 *    pa
 *
 *  - discard all for given object (inode, locality group):
 *    object
 *    pa
 *    group
 *
 *  - discard all for given group:
 *    group
 *    pa
 *    group
 *    object
 *
 */
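
/*
 * Illustrative sketch (not compiled): the "discard all for given object"
 * path above takes the locks in the documented object -> pa -> group order.
 * The field and lock names used here (i_prealloc_lock, i_prealloc_list,
 * pa_inode_list, pa->pa_lock) and the helper mb_sketch_group_of() are
 * assumptions made for the sake of the example; only ext4_lock_group()
 * appears in this file below.
 */
#if 0
static void mb_sketch_discard_object(struct super_block *sb,
				     struct ext4_inode_info *ei)
{
	struct ext4_prealloc_space *pa;
	ext4_group_t group;

	spin_lock(&ei->i_prealloc_lock);		/* object */
	list_for_each_entry(pa, &ei->i_prealloc_list, pa_inode_list) {
		spin_lock(&pa->pa_lock);		/* pa */
		group = mb_sketch_group_of(sb, pa);	/* hypothetical helper */
		spin_unlock(&pa->pa_lock);

		ext4_lock_group(sb, group);		/* group */
		/* ... return the unused part of the PA to the buddy ... */
		ext4_unlock_group(sb, group);
	}
	spin_unlock(&ei->i_prealloc_lock);
}
#endif
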
static struct kmem_cache *ext4_pspace_cachep;
static struct kmem_cache *ext4_ac_cachep;
static struct kmem_cache *ext4_free_ext_cachep;

/* We create slab caches for groupinfo data structures based on the
 * superblock block size.  There will be one per mounted filesystem for
 * each unique s_blocksize_bits */
#define NR_GRPINFO_CACHES 8
static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];

static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
	"ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
	"ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
	"ext4_groupinfo_64k", "ext4_groupinfo_128k"
};

static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
					ext4_group_t group);
static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
						ext4_group_t group);
static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);

static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
{
#if BITS_PER_LONG == 64
	*bit += ((unsigned long) addr & 7UL) << 3;
	addr = (void *) ((unsigned long) addr & ~7UL);
#elif BITS_PER_LONG == 32
	*bit += ((unsigned long) addr & 3UL) << 3;
	addr = (void *) ((unsigned long) addr & ~3UL);
#else
#error "how many bits you are?!"
#endif
	return addr;
}
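
/*
 * Worked example for the helper above, assuming BITS_PER_LONG == 64:
 * for addr == ...0x1003 and *bit == 5, the low three address bits (3) are
 * folded into the bit offset, so *bit becomes 5 + 3 * 8 = 29 and the
 * returned address is rounded down to the aligned ...0x1000.  Testing
 * bit 29 at ...0x1000 touches the same memory bit as bit 5 at ...0x1003,
 * but with the alignment that ext4_test_bit and friends require.
 */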

static inline int mb_test_bit(int bit, void *addr)
{
	/*
	 * ext4_test_bit on architectures like powerpc
	 * needs an unsigned long aligned address
	 */
	addr = mb_correct_addr_and_bit(&bit, addr);
	return ext4_test_bit(bit, addr);
}

static inline void mb_set_bit(int bit, void *addr)
{
	addr = mb_correct_addr_and_bit(&bit, addr);
	ext4_set_bit(bit, addr);
}

static inline void mb_clear_bit(int bit, void *addr)
{
	addr = mb_correct_addr_and_bit(&bit, addr);
	ext4_clear_bit(bit, addr);
}

static inline int mb_find_next_zero_bit(void *addr, int max, int start)
{
	int fix = 0, ret, tmpmax;
	addr = mb_correct_addr_and_bit(&fix, addr);
	tmpmax = max + fix;
	start += fix;

	ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix;
	if (ret > max)
		return max;
	return ret;
}

static inline int mb_find_next_bit(void *addr, int max, int start)
{
	int fix = 0, ret, tmpmax;
	addr = mb_correct_addr_and_bit(&fix, addr);
	tmpmax = max + fix;
	start += fix;

	ret = ext4_find_next_bit(addr, tmpmax, start) - fix;
	if (ret > max)
		return max;
	return ret;
}

static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
{
	char *bb;

	BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b));
	BUG_ON(max == NULL);

	if (order > e4b->bd_blkbits + 1) {
		*max = 0;
		return NULL;
	}

	/* at order 0 we see each particular block */
	if (order == 0) {
		*max = 1 << (e4b->bd_blkbits + 3);
		return EXT4_MB_BITMAP(e4b);
	}

	bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
	*max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];

	return bb;
}
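
/*
 * Worked example for mb_find_buddy(), assuming a 4KiB block size
 * (bd_blkbits == 12): order 0 is the block bitmap itself with
 * 1 << (12 + 3) == 32768 usable bits.  Orders 1 and up live in the buddy
 * block, one bit per aligned pair/quad/... of blocks, so s_mb_maxs[] is
 * expected to hold 16384, 8192, 4096, ... and s_mb_offsets[] the running
 * byte offsets of those per-order bitmaps inside the buddy block.  The
 * exact values are set up in ext4_mb_init(), which is not part of this
 * excerpt.
 */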

#ifdef DOUBLE_CHECK
static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
				   int first, int count)
{
	int i;
	struct super_block *sb = e4b->bd_sb;

	if (unlikely(e4b->bd_info->bb_bitmap == NULL))
		return;
	assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
	for (i = 0; i < count; i++) {
		if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
			ext4_fsblk_t blocknr;

			blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
			blocknr += first + i;
			ext4_grp_locked_error(sb, e4b->bd_group,
					      inode ? inode->i_ino : 0,
					      blocknr,
					      "freeing block already freed "
					      "(bit %u)",
					      first + i);
		}
		mb_clear_bit(first + i, e4b->bd_info->bb_bitmap);
	}
}

static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count)
{
	int i;

	if (unlikely(e4b->bd_info->bb_bitmap == NULL))
		return;
	assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
	for (i = 0; i < count; i++) {
		BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap));
		mb_set_bit(first + i, e4b->bd_info->bb_bitmap);
	}
}

static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
{
	if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) {
		unsigned char *b1, *b2;
		int i;
		b1 = (unsigned char *) e4b->bd_info->bb_bitmap;
		b2 = (unsigned char *) bitmap;
		for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
			if (b1[i] != b2[i]) {
				printk(KERN_ERR "corruption in group %u "
				       "at byte %u(%u): %x in copy != %x "
				       "on disk/prealloc\n",
				       e4b->bd_group, i, i * 8, b1[i], b2[i]);
				BUG();
			}
		}
	}
}

#else
static inline void mb_free_blocks_double(struct inode *inode,
				struct ext4_buddy *e4b, int first, int count)
{
	return;
}
static inline void mb_mark_used_double(struct ext4_buddy *e4b,
						int first, int count)
{
	return;
}
static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
{
	return;
}
#endif

#ifdef AGGRESSIVE_CHECK

#define MB_CHECK_ASSERT(assert)						\
do {									\
	if (!(assert)) {						\
		printk(KERN_EMERG					\
			"Assertion failure in %s() at %s:%d: \"%s\"\n",	\
			function, file, line, # assert);		\
		BUG();							\
	}								\
} while (0)

static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
				const char *function, int line)
{
	struct super_block *sb = e4b->bd_sb;
	int order = e4b->bd_blkbits + 1;
	int max;
	int max2;
	int i;
	int j;
	int k;
	int count;
	struct ext4_group_info *grp;
	int fragments = 0;
	int fstart;
	struct list_head *cur;
	void *buddy;
	void *buddy2;

	{
		static int mb_check_counter;
		if (mb_check_counter++ % 100 != 0)
			return 0;
	}

	while (order > 1) {
		buddy = mb_find_buddy(e4b, order, &max);
		MB_CHECK_ASSERT(buddy);
		buddy2 = mb_find_buddy(e4b, order - 1, &max2);
		MB_CHECK_ASSERT(buddy2);
		MB_CHECK_ASSERT(buddy != buddy2);
		MB_CHECK_ASSERT(max * 2 == max2);

		count = 0;
		for (i = 0; i < max; i++) {

			if (mb_test_bit(i, buddy)) {
				/* only single bit in buddy2 may be 1 */
				if (!mb_test_bit(i << 1, buddy2)) {
					MB_CHECK_ASSERT(
						mb_test_bit((i<<1)+1, buddy2));
				} else if (!mb_test_bit((i << 1) + 1, buddy2)) {
					MB_CHECK_ASSERT(
						mb_test_bit(i << 1, buddy2));
				}
				continue;
			}

			/* both bits in buddy2 must be 0 */
			MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2));
			MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2));

			for (j = 0; j < (1 << order); j++) {
				k = (i * (1 << order)) + j;
				MB_CHECK_ASSERT(
					!mb_test_bit(k, EXT4_MB_BITMAP(e4b)));
			}
			count++;
		}
		MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count);
		order--;
	}

	fstart = -1;
	buddy = mb_find_buddy(e4b, 0, &max);
	for (i = 0; i < max; i++) {
		if (!mb_test_bit(i, buddy)) {
			MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free);
			if (fstart == -1) {
				fragments++;
				fstart = i;
			}
			continue;
		}
		fstart = -1;
		/* check used bits only */
		for (j = 0; j < e4b->bd_blkbits + 1; j++) {
			buddy2 = mb_find_buddy(e4b, j, &max2);
			k = i >> j;
			MB_CHECK_ASSERT(k < max2);
			MB_CHECK_ASSERT(mb_test_bit(k, buddy2));
		}
	}
	MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info));
	MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);

	grp = ext4_get_group_info(sb, e4b->bd_group);
	list_for_each(cur, &grp->bb_prealloc_list) {
		ext4_group_t groupnr;
		struct ext4_prealloc_space *pa;
		pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
		ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k);
		MB_CHECK_ASSERT(groupnr == e4b->bd_group);
		for (i = 0; i < pa->pa_len; i++)
			MB_CHECK_ASSERT(mb_test_bit(k + i, buddy));
	}
	return 0;
}
#undef MB_CHECK_ASSERT
#define mb_check_buddy(e4b) __mb_check_buddy(e4b,	\
					__FILE__, __func__, __LINE__)
#else
#define mb_check_buddy(e4b)
#endif

/* FIXME!! need more doc */
static void ext4_mb_mark_free_simple(struct super_block *sb,
				void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
					struct ext4_group_info *grp)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	ext4_grpblk_t min;
	ext4_grpblk_t max;
	ext4_grpblk_t chunk;
	unsigned short border;

	BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb));

	border = 2 << sb->s_blocksize_bits;

	while (len > 0) {
		/* find how many blocks can be covered since this position */
		max = ffs(first | border) - 1;

		/* find how many blocks of power 2 we need to mark */
		min = fls(len) - 1;

		if (max < min)
			min = max;
		chunk = 1 << min;

		/* mark multiblock chunks only */
		grp->bb_counters[min]++;
		if (min > 0)
			mb_clear_bit(first >> min,
				     buddy + sbi->s_mb_offsets[min]);

		len -= chunk;
		first += chunk;
	}
}
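
/*
 * Worked example for ext4_mb_mark_free_simple(): with a free range starting
 * at group-relative block 6 of length 10, the first pass has
 * max = ffs(6 | border) - 1 = 1 (block 6 is only 2-block aligned) and
 * min = fls(10) - 1 = 3, so min is clamped to 1 and a 2-block chunk is
 * recorded at order 1.  The next pass starts at block 8 with length 8 and
 * records a single order-3 chunk.  The "border" term only caps the chunk
 * order at the largest buddy order (s_blocksize_bits + 1).
 */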

/*
 * Cache the order of the largest free extent we have available in this block
 * group.
 */
static void
mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
{
	int i;
	int bits;

	grp->bb_largest_free_order = -1; /* uninit */

	bits = sb->s_blocksize_bits + 1;
	for (i = bits; i >= 0; i--) {
		if (grp->bb_counters[i] > 0) {
			grp->bb_largest_free_order = i;
			break;
		}
	}
}

static noinline_for_stack
void ext4_mb_generate_buddy(struct super_block *sb,
				void *buddy, void *bitmap, ext4_group_t group)
{
	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
	ext4_grpblk_t max = EXT4_BLOCKS_PER_GROUP(sb);
	ext4_grpblk_t i = 0;
	ext4_grpblk_t first;
	ext4_grpblk_t len;
	unsigned free = 0;
	unsigned fragments = 0;
	unsigned long long period = get_cycles();

	/* initialize buddy from bitmap which is aggregation
	 * of on-disk bitmap and preallocations */
	i = mb_find_next_zero_bit(bitmap, max, 0);
	grp->bb_first_free = i;
	while (i < max) {
		fragments++;
		first = i;
		i = mb_find_next_bit(bitmap, max, i);
		len = i - first;
		free += len;
		if (len > 1)
			ext4_mb_mark_free_simple(sb, buddy, first, len, grp);
		else
			grp->bb_counters[0]++;
		if (i < max)
			i = mb_find_next_zero_bit(bitmap, max, i);
	}
	grp->bb_fragments = fragments;

	if (free != grp->bb_free) {
		ext4_grp_locked_error(sb, group, 0, 0,
				      "%u blocks in bitmap, %u in gd",
				      free, grp->bb_free);
		/*
		 * If we intend to continue, we consider the group descriptor
		 * corrupt and update bb_free using the bitmap value
		 */
		grp->bb_free = free;
	}
	mb_set_largest_free_order(sb, grp);

	clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));

	period = get_cycles() - period;
	spin_lock(&EXT4_SB(sb)->s_bal_lock);
	EXT4_SB(sb)->s_mb_buddies_generated++;
	EXT4_SB(sb)->s_mb_generation_time += period;
	spin_unlock(&EXT4_SB(sb)->s_bal_lock);
}

/* The buddy information is attached to the buddy cache inode
 * for convenience. The information regarding each group
 * is loaded via ext4_mb_load_buddy.  The information involves
 * the block bitmap and buddy information. The information is
 * stored in the inode as
 *
 * {                        page                        }
 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
 *
 *
 * one block each for bitmap and buddy information.
 * So for each group we take up 2 blocks. A page can
 * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize)  blocks.
 * So it can have information regarding groups_per_page which
 * is blocks_per_page/2
 *
 * Locking note: This routine takes the block group lock of all groups
 * for this page; do not hold this lock when calling this routine!
 */
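
/*
 * Worked example for the layout above, assuming PAGE_CACHE_SIZE == 4096 and
 * a 1KiB block size: blocks_per_page == 4 and groups_per_page == 2, so page 0
 * of the buddy cache holds [group 0 bitmap][group 0 buddy][group 1 bitmap]
 * [group 1 buddy].  In general block = group * 2 selects the bitmap block,
 * block + 1 the buddy block, and pnum = block / blocks_per_page together
 * with poff = block % blocks_per_page locate them inside the page, exactly
 * as the helpers below compute.  With a 4KiB block size each page holds just
 * one block, so a group's bitmap and buddy land on two adjacent pages.
 */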

static int ext4_mb_init_cache(struct page *page, char *incore)
{
	ext4_group_t ngroups;
	int blocksize;
	int blocks_per_page;
	int groups_per_page;
	int err = 0;
	int i;
	ext4_group_t first_group;
	int first_block;
	struct super_block *sb;
	struct buffer_head *bhs;
	struct buffer_head **bh;
	struct inode *inode;
	char *data;
	char *bitmap;

	mb_debug(1, "init page %lu\n", page->index);

	inode = page->mapping->host;
	sb = inode->i_sb;
	ngroups = ext4_get_groups_count(sb);
	blocksize = 1 << inode->i_blkbits;
	blocks_per_page = PAGE_CACHE_SIZE / blocksize;

	groups_per_page = blocks_per_page >> 1;
	if (groups_per_page == 0)
		groups_per_page = 1;

	/* allocate buffer_heads to read bitmaps */
	if (groups_per_page > 1) {
		err = -ENOMEM;
		i = sizeof(struct buffer_head *) * groups_per_page;
		bh = kzalloc(i, GFP_NOFS);
		if (bh == NULL)
			goto out;
	} else
		bh = &bhs;

	first_group = page->index * blocks_per_page / 2;

	/* read all groups the page covers into the cache */
	for (i = 0; i < groups_per_page; i++) {
		struct ext4_group_desc *desc;

		if (first_group + i >= ngroups)
			break;

		err = -EIO;
		desc = ext4_get_group_desc(sb, first_group + i, NULL);
		if (desc == NULL)
			goto out;

		err = -ENOMEM;
		bh[i] = sb_getblk(sb, ext4_block_bitmap(sb, desc));
		if (bh[i] == NULL)
			goto out;

		if (bitmap_uptodate(bh[i]))
			continue;

		lock_buffer(bh[i]);
		if (bitmap_uptodate(bh[i])) {
			unlock_buffer(bh[i]);
			continue;
		}
		ext4_lock_group(sb, first_group + i);
		if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
			ext4_init_block_bitmap(sb, bh[i],
						first_group + i, desc);
			set_bitmap_uptodate(bh[i]);
			set_buffer_uptodate(bh[i]);
			ext4_unlock_group(sb, first_group + i);
			unlock_buffer(bh[i]);
			continue;
		}
		ext4_unlock_group(sb, first_group + i);
		if (buffer_uptodate(bh[i])) {
			/*
			 * if not uninit, and bh is uptodate,
			 * the bitmap is also uptodate
			 */
			set_bitmap_uptodate(bh[i]);
			unlock_buffer(bh[i]);
			continue;
		}
		get_bh(bh[i]);
		/*
		 * submit the buffer_head for read. We can
		 * safely mark the bitmap as uptodate now.
		 * We do it here so the bitmap uptodate bit
		 * gets set with the buffer lock held.
		 */
		set_bitmap_uptodate(bh[i]);
		bh[i]->b_end_io = end_buffer_read_sync;
		submit_bh(READ, bh[i]);
		mb_debug(1, "read bitmap for group %u\n", first_group + i);
	}

	/* wait for I/O completion */
	for (i = 0; i < groups_per_page && bh[i]; i++)
		wait_on_buffer(bh[i]);

	err = -EIO;
	for (i = 0; i < groups_per_page && bh[i]; i++)
		if (!buffer_uptodate(bh[i]))
			goto out;

	err = 0;
	first_block = page->index * blocks_per_page;
	/* init the page */
	memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
	for (i = 0; i < blocks_per_page; i++) {
		int group;
		struct ext4_group_info *grinfo;

		group = (first_block + i) >> 1;
		if (group >= ngroups)
			break;

		/*
		 * data carries information regarding this
		 * particular group in the format specified
		 * above
		 *
		 */
		data = page_address(page) + (i * blocksize);
		bitmap = bh[group - first_group]->b_data;

		/*
		 * We place the buddy block and bitmap block
		 * close together
		 */
		if ((first_block + i) & 1) {
			/* this is block of buddy */
			BUG_ON(incore == NULL);
			mb_debug(1, "put buddy for group %u in page %lu/%x\n",
				group, page->index, i * blocksize);
			trace_ext4_mb_buddy_bitmap_load(sb, group);
			grinfo = ext4_get_group_info(sb, group);
			grinfo->bb_fragments = 0;
			memset(grinfo->bb_counters, 0,
			       sizeof(*grinfo->bb_counters) *
				(sb->s_blocksize_bits+2));
			/*
			 * incore got set to the group block bitmap below
			 */
			ext4_lock_group(sb, group);
			ext4_mb_generate_buddy(sb, data, incore, group);
			ext4_unlock_group(sb, group);
			incore = NULL;
		} else {
			/* this is block of bitmap */
			BUG_ON(incore != NULL);
			mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
				group, page->index, i * blocksize);
			trace_ext4_mb_bitmap_load(sb, group);

			/* see comments in ext4_mb_put_pa() */
			ext4_lock_group(sb, group);
			memcpy(data, bitmap, blocksize);

			/* mark all preallocated blks used in in-core bitmap */
			ext4_mb_generate_from_pa(sb, data, group);
			ext4_mb_generate_from_freelist(sb, data, group);
			ext4_unlock_group(sb, group);

			/* set incore so that the buddy information can be
			 * generated using this
			 */
			incore = data;
		}
	}
	SetPageUptodate(page);

out:
	if (bh) {
		for (i = 0; i < groups_per_page && bh[i]; i++)
			brelse(bh[i]);
		if (bh != &bhs)
			kfree(bh);
	}
	return err;
}

/*
 * lock the group_info alloc_sem of all the groups
 * belonging to the same buddy cache page. This
 * makes sure other parallel operations on the buddy
 * cache don't happen while we hold the buddy cache
 * lock
 */
static int ext4_mb_get_buddy_cache_lock(struct super_block *sb,
					ext4_group_t group)
{
	int i;
	int block, pnum;
	int blocks_per_page;
	int groups_per_page;
	ext4_group_t ngroups = ext4_get_groups_count(sb);
	ext4_group_t first_group;
	struct ext4_group_info *grp;

	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
	/*
	 * the buddy cache inode stores the block bitmap
	 * and buddy information in consecutive blocks.
	 * So for each group we need two blocks.
	 */
	block = group * 2;
	pnum = block / blocks_per_page;
	first_group = pnum * blocks_per_page / 2;

	groups_per_page = blocks_per_page >> 1;
	if (groups_per_page == 0)
		groups_per_page = 1;
	/* read all groups the page covers into the cache */
	for (i = 0; i < groups_per_page; i++) {

		if ((first_group + i) >= ngroups)
			break;
		grp = ext4_get_group_info(sb, first_group + i);
		/* take each group's write allocation
		 * semaphore. This makes sure there is
		 * no block allocation going on in any
		 * of those groups
		 */
		down_write_nested(&grp->alloc_sem, i);
	}
	return i;
}

static void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
					ext4_group_t group, int locked_group)
{
	int i;
	int block, pnum;
	int blocks_per_page;
	ext4_group_t first_group;
	struct ext4_group_info *grp;

	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
	/*
	 * the buddy cache inode stores the block bitmap
	 * and buddy information in consecutive blocks.
	 * So for each group we need two blocks.
	 */
	block = group * 2;
	pnum = block / blocks_per_page;
	first_group = pnum * blocks_per_page / 2;
	/* release locks on all the groups */
	for (i = 0; i < locked_group; i++) {

		grp = ext4_get_group_info(sb, first_group + i);
		/* release each group's write allocation
		 * semaphore taken in
		 * ext4_mb_get_buddy_cache_lock()
		 */
		up_write(&grp->alloc_sem);
	}

}

/*
 * Locking note: This routine calls ext4_mb_init_cache(), which takes the
 * block group lock of all groups for this page; do not hold the BG lock when
 * calling this routine!
 */
static noinline_for_stack
int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
{

	int ret = 0;
	void *bitmap;
	int blocks_per_page;
	int block, pnum, poff;
	int num_grp_locked = 0;
	struct ext4_group_info *this_grp;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct inode *inode = sbi->s_buddy_cache;
	struct page *page = NULL, *bitmap_page = NULL;

	mb_debug(1, "init group %u\n", group);
	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
	this_grp = ext4_get_group_info(sb, group);
	/*
	 * This ensures that we don't reinit the buddy cache
	 * page which maps to the group from which we are already
	 * allocating. If we are looking at the buddy cache we would
	 * have taken a reference using ext4_mb_load_buddy and that
	 * would have taken the alloc_sem lock.
	 */
	num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
	if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
		/*
		 * somebody initialized the group
		 * return without doing anything
		 */
		ret = 0;
		goto err;
	}
	/*
	 * the buddy cache inode stores the block bitmap
	 * and buddy information in consecutive blocks.
	 * So for each group we need two blocks.
	 */
	block = group * 2;
	pnum = block / blocks_per_page;
	poff = block % blocks_per_page;
	page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
	if (page) {
		BUG_ON(page->mapping != inode->i_mapping);
		ret = ext4_mb_init_cache(page, NULL);
		if (ret) {
			unlock_page(page);
			goto err;
		}
		unlock_page(page);
	}
	if (page == NULL || !PageUptodate(page)) {
		ret = -EIO;
		goto err;
	}
	mark_page_accessed(page);
	bitmap_page = page;
	bitmap = page_address(page) + (poff * sb->s_blocksize);

	/* init buddy cache */
	block++;
	pnum = block / blocks_per_page;
	poff = block % blocks_per_page;
	page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
	if (page == bitmap_page) {
		/*
		 * If both the bitmap and buddy are in
		 * the same page we don't need to force
		 * init the buddy
		 */
		unlock_page(page);
	} else if (page) {
		BUG_ON(page->mapping != inode->i_mapping);
		ret = ext4_mb_init_cache(page, bitmap);
		if (ret) {
			unlock_page(page);
			goto err;
		}
		unlock_page(page);
	}
	if (page == NULL || !PageUptodate(page)) {
		ret = -EIO;
		goto err;
	}
	mark_page_accessed(page);
err:
	ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
	if (bitmap_page)
		page_cache_release(bitmap_page);
	if (page)
		page_cache_release(page);
	return ret;
}

/*
 * Locking note: This routine calls ext4_mb_init_cache(), which takes the
 * block group lock of all groups for this page; do not hold the BG lock when
 * calling this routine!
 */
static noinline_for_stack int
ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
					struct ext4_buddy *e4b)
{
	int blocks_per_page;
	int block;
	int pnum;
	int poff;
	struct page *page;
	int ret;
	struct ext4_group_info *grp;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct inode *inode = sbi->s_buddy_cache;

	mb_debug(1, "load group %u\n", group);

	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
	grp = ext4_get_group_info(sb, group);

	e4b->bd_blkbits = sb->s_blocksize_bits;
	e4b->bd_info = ext4_get_group_info(sb, group);
	e4b->bd_sb = sb;
	e4b->bd_group = group;
	e4b->bd_buddy_page = NULL;
	e4b->bd_bitmap_page = NULL;
	e4b->alloc_semp = &grp->alloc_sem;

	/* Take the read lock on the group alloc
	 * sem. This would make sure a parallel
	 * ext4_mb_init_group happening on other
	 * groups mapped by the page is blocked
	 * till we are done with allocation
	 */
repeat_load_buddy:
	down_read(e4b->alloc_semp);

	if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
		/* we need to check the group-needs-init flag
		 * with alloc_semp held so that we can be sure
		 * that new blocks didn't get added to the group
		 * while we are loading the buddy cache
		 */
		up_read(e4b->alloc_semp);
		/*
		 * we need full data about the group
		 * to make a good selection
		 */
		ret = ext4_mb_init_group(sb, group);
		if (ret)
			return ret;
		goto repeat_load_buddy;
	}

	/*
	 * the buddy cache inode stores the block bitmap
	 * and buddy information in consecutive blocks.
	 * So for each group we need two blocks.
	 */
	block = group * 2;
	pnum = block / blocks_per_page;
	poff = block % blocks_per_page;

	/* we could use find_or_create_page(), but it locks the page,
	 * which we'd like to avoid in the fast path ... */
	page = find_get_page(inode->i_mapping, pnum);
	if (page == NULL || !PageUptodate(page)) {
		if (page)
			/*
			 * drop the page reference and try
			 * to get the page with lock. If we
			 * are not uptodate that implies
			 * somebody just created the page but
			 * is yet to initialize the same. So
			 * wait for it to initialize.
			 */
			page_cache_release(page);
		page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
		if (page) {
			BUG_ON(page->mapping != inode->i_mapping);
			if (!PageUptodate(page)) {
				ret = ext4_mb_init_cache(page, NULL);
				if (ret) {
					unlock_page(page);
					goto err;
				}
				mb_cmp_bitmaps(e4b, page_address(page) +
					       (poff * sb->s_blocksize));
			}
			unlock_page(page);
		}
	}
	if (page == NULL || !PageUptodate(page)) {
		ret = -EIO;
		goto err;
	}
	e4b->bd_bitmap_page = page;
	e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
	mark_page_accessed(page);

	block++;
	pnum = block / blocks_per_page;
	poff = block % blocks_per_page;

	page = find_get_page(inode->i_mapping, pnum);
	if (page == NULL || !PageUptodate(page)) {
		if (page)
			page_cache_release(page);
		page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
		if (page) {
			BUG_ON(page->mapping != inode->i_mapping);
			if (!PageUptodate(page)) {
				ret = ext4_mb_init_cache(page, e4b->bd_bitmap);
				if (ret) {
					unlock_page(page);
					goto err;
				}
			}
			unlock_page(page);
		}
	}
	if (page == NULL || !PageUptodate(page)) {
		ret = -EIO;
		goto err;
	}
	e4b->bd_buddy_page = page;
	e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
	mark_page_accessed(page);

	BUG_ON(e4b->bd_bitmap_page == NULL);
	BUG_ON(e4b->bd_buddy_page == NULL);

	return 0;

err:
	if (e4b->bd_bitmap_page)
		page_cache_release(e4b->bd_bitmap_page);
	if (e4b->bd_buddy_page)
		page_cache_release(e4b->bd_buddy_page);
	e4b->bd_buddy = NULL;
	e4b->bd_bitmap = NULL;

	/* Done with the buddy cache */
	up_read(e4b->alloc_semp);
	return ret;
}

static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
{
	if (e4b->bd_bitmap_page)
		page_cache_release(e4b->bd_bitmap_page);
	if (e4b->bd_buddy_page)
		page_cache_release(e4b->bd_buddy_page);
	/* Done with the buddy cache */
	if (e4b->alloc_semp)
		up_read(e4b->alloc_semp);
}


static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
{
	int order = 1;
	void *bb;

	BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b));
	BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));

	bb = EXT4_MB_BUDDY(e4b);
	while (order <= e4b->bd_blkbits + 1) {
		block = block >> 1;
		if (!mb_test_bit(block, bb)) {
			/* this block is part of buddy of order 'order' */
			return order;
		}
		bb += 1 << (e4b->bd_blkbits - order);
		order++;
	}
	return 0;
}
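
/*
 * Worked example for mb_find_order_for_block(), assuming 4KiB blocks
 * (bd_blkbits == 12): the order-1 bitmap starts at the beginning of the
 * buddy block and each higher order's bitmap follows at
 * "+ 1 << (bd_blkbits - order)" bytes, i.e. half the size of the previous
 * one.  For block 20 the walk tests bit 10 of the order-1 map, bit 5 of
 * the order-2 map, bit 2 of the order-3 map, and so on, returning the
 * first order whose bit is clear, meaning block 20 is currently part of a
 * free buddy of that order.
 */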

static void mb_clear_bits(void *bm, int cur, int len)
{
	__u32 *addr;

	len = cur + len;
	while (cur < len) {
		if ((cur & 31) == 0 && (len - cur) >= 32) {
			/* fast path: clear whole word at once */
			addr = bm + (cur >> 3);
			*addr = 0;
			cur += 32;
			continue;
		}
		mb_clear_bit(cur, bm);
		cur++;
	}
}

static void mb_set_bits(void *bm, int cur, int len)
{
	__u32 *addr;

	len = cur + len;
	while (cur < len) {
		if ((cur & 31) == 0 && (len - cur) >= 32) {
			/* fast path: set whole word at once */
			addr = bm + (cur >> 3);
			*addr = 0xffffffff;
			cur += 32;
			continue;
		}
		mb_set_bit(cur, bm);
		cur++;
	}
}

static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
			  int first, int count)
{
	int block = 0;
	int max = 0;
	int order;
	void *buddy;
	void *buddy2;
	struct super_block *sb = e4b->bd_sb;

	BUG_ON(first + count > (sb->s_blocksize << 3));
	assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
	mb_check_buddy(e4b);
	mb_free_blocks_double(inode, e4b, first, count);

	e4b->bd_info->bb_free += count;
	if (first < e4b->bd_info->bb_first_free)
		e4b->bd_info->bb_first_free = first;

	/* let's maintain fragments counter */
	if (first != 0)
		block = !mb_test_bit(first - 1, EXT4_MB_BITMAP(e4b));
	if (first + count < EXT4_SB(sb)->s_mb_maxs[0])
		max = !mb_test_bit(first + count, EXT4_MB_BITMAP(e4b));
	if (block && max)
		e4b->bd_info->bb_fragments--;
	else if (!block && !max)
		e4b->bd_info->bb_fragments++;

	/* let's maintain buddy itself */
	while (count-- > 0) {
		block = first++;
		order = 0;

		if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) {
			ext4_fsblk_t blocknr;

			blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
			blocknr += block;
			ext4_grp_locked_error(sb, e4b->bd_group,
					      inode ? inode->i_ino : 0,
					      blocknr,
					      "freeing already freed block "
					      "(bit %u)", block);
		}
		mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
		e4b->bd_info->bb_counters[order]++;

		/* start of the buddy */
		buddy = mb_find_buddy(e4b, order, &max);

		do {
			block &= ~1UL;
			if (mb_test_bit(block, buddy) ||
					mb_test_bit(block + 1, buddy))
				break;

			/* both the buddies are free, try to coalesce them */
			buddy2 = mb_find_buddy(e4b, order + 1, &max);

			if (!buddy2)
				break;

			if (order > 0) {
				/* for special purposes, we don't set
				 * free bits in bitmap */
				mb_set_bit(block, buddy);
				mb_set_bit(block + 1, buddy);
			}
			e4b->bd_info->bb_counters[order]--;
			e4b->bd_info->bb_counters[order]--;

			block = block >> 1;
			order++;
			e4b->bd_info->bb_counters[order]++;

			mb_clear_bit(block, buddy2);
			buddy = buddy2;
		} while (1);
	}
	mb_set_largest_free_order(sb, e4b->bd_info);
	mb_check_buddy(e4b);
}
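
/*
 * Worked example for the coalescing loop above, with order-0 blocks numbered
 * from the start of the group: assume blocks 4..7 already form a free
 * order-2 buddy and block 2 is in use.  Freeing block 3 finds its order-0
 * partner (block 2) busy, so block 3 stays a lone order-0 fragment.  Freeing
 * block 2 afterwards merges 2 and 3 into an order-1 buddy, but its order-1
 * partner (blocks 0..1) is busy, and 2..3 cannot join 4..7 because 2..7 is
 * not an aligned power-of-two chunk; the group ends up with one order-1 and
 * one order-2 free buddy.
 */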

static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
				int needed, struct ext4_free_extent *ex)
{
	int next = block;
	int max;
	int ord;
	void *buddy;

	assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
	BUG_ON(ex == NULL);

	buddy = mb_find_buddy(e4b, order, &max);
	BUG_ON(buddy == NULL);
	BUG_ON(block >= max);
	if (mb_test_bit(block, buddy)) {
		ex->fe_len = 0;
		ex->fe_start = 0;
		ex->fe_group = 0;
		return 0;
	}

	/* FIXME: drop order completely ? */
	if (likely(order == 0)) {
		/* find actual order */
		order = mb_find_order_for_block(e4b, block);
		block = block >> order;
	}

	ex->fe_len = 1 << order;
	ex->fe_start = block << order;
	ex->fe_group = e4b->bd_group;

	/* calc difference from given start */
	next = next - ex->fe_start;
	ex->fe_len -= next;
	ex->fe_start += next;

	while (needed > ex->fe_len &&
	       (buddy = mb_find_buddy(e4b, order, &max))) {

		if (block + 1 >= max)
			break;

		next = (block + 1) * (1 << order);
		if (mb_test_bit(next, EXT4_MB_BITMAP(e4b)))
			break;

		ord = mb_find_order_for_block(e4b, next);

		order = ord;
		block = next >> order;
		ex->fe_len += 1 << order;
	}

	BUG_ON(ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3)));
	return ex->fe_len;
}

static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
{
	int ord;
	int mlen = 0;
	int max = 0;
	int cur;
	int start = ex->fe_start;
	int len = ex->fe_len;
	unsigned ret = 0;
	int len0 = len;
	void *buddy;

	BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
	BUG_ON(e4b->bd_group != ex->fe_group);
	assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
	mb_check_buddy(e4b);
	mb_mark_used_double(e4b, start, len);

	e4b->bd_info->bb_free -= len;
	if (e4b->bd_info->bb_first_free == start)
		e4b->bd_info->bb_first_free += len;

	/* let's maintain fragments counter */
	if (start != 0)
		mlen = !mb_test_bit(start - 1, EXT4_MB_BITMAP(e4b));
	if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0])
		max = !mb_test_bit(start + len, EXT4_MB_BITMAP(e4b));
	if (mlen && max)
		e4b->bd_info->bb_fragments++;
	else if (!mlen && !max)
		e4b->bd_info->bb_fragments--;

	/* let's maintain buddy itself */
	while (len) {
		ord = mb_find_order_for_block(e4b, start);

		if (((start >> ord) << ord) == start && len >= (1 << ord)) {
			/* the whole chunk may be allocated at once! */
			mlen = 1 << ord;
			buddy = mb_find_buddy(e4b, ord, &max);
			BUG_ON((start >> ord) >= max);
			mb_set_bit(start >> ord, buddy);
			e4b->bd_info->bb_counters[ord]--;
			start += mlen;
			len -= mlen;
			BUG_ON(len < 0);
			continue;
		}

		/* store for history */
		if (ret == 0)
			ret = len | (ord << 16);

		/* we have to split large buddy */
		BUG_ON(ord <= 0);
		buddy = mb_find_buddy(e4b, ord, &max);
		mb_set_bit(start >> ord, buddy);
		e4b->bd_info->bb_counters[ord]--;

		ord--;
		cur = (start >> ord) & ~1U;
		buddy = mb_find_buddy(e4b, ord, &max);
		mb_clear_bit(cur, buddy);
		mb_clear_bit(cur + 1, buddy);
		e4b->bd_info->bb_counters[ord]++;
		e4b->bd_info->bb_counters[ord]++;
	}
	mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);

	mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
	mb_check_buddy(e4b);

	return ret;
}
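
/*
 * Worked example for the splitting loop above: marking blocks 4..6 used
 * while 4..7 form a free order-2 buddy.  Block 4 is order-2 aligned but
 * only 3 blocks are wanted, so the order-2 buddy is split into two order-1
 * buddies (4..5 and 6..7); 4..5 is consumed whole, then 6..7 is split into
 * order-0 blocks, block 6 is consumed and block 7 is left as the only free
 * piece.
 */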

/*
 * Must be called under group lock!
 */
static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
					struct ext4_buddy *e4b)
{
	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
	int ret;

	BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group);
	BUG_ON(ac->ac_status == AC_STATUS_FOUND);

	ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
	ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical;
	ret = mb_mark_used(e4b, &ac->ac_b_ex);

	/* preallocation can change ac_b_ex, thus we store actually
	 * allocated blocks for history */
	ac->ac_f_ex = ac->ac_b_ex;

	ac->ac_status = AC_STATUS_FOUND;
	ac->ac_tail = ret & 0xffff;
	ac->ac_buddy = ret >> 16;

	/*
	 * take the page reference. We want the page to be pinned
	 * so that we don't get an ext4_mb_init_cache call for this
	 * group until we update the bitmap. That would mean we
	 * double allocate blocks. The reference is dropped
	 * in ext4_mb_release_context
	 */
	ac->ac_bitmap_page = e4b->bd_bitmap_page;
	get_page(ac->ac_bitmap_page);
	ac->ac_buddy_page = e4b->bd_buddy_page;
	get_page(ac->ac_buddy_page);
	/* on allocation we use ac to track the held semaphore */
	ac->alloc_semp = e4b->alloc_semp;
	e4b->alloc_semp = NULL;
	/* store last allocated for subsequent stream allocation */
	if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
		spin_lock(&sbi->s_md_lock);
		sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
		sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
		spin_unlock(&sbi->s_md_lock);
	}
}

/*
 * regular allocator, for general purpose allocations
 */

static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
					struct ext4_buddy *e4b,
					int finish_group)
{
	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
	struct ext4_free_extent *bex = &ac->ac_b_ex;
	struct ext4_free_extent *gex = &ac->ac_g_ex;
	struct ext4_free_extent ex;
	int max;

	if (ac->ac_status == AC_STATUS_FOUND)
		return;
	/*
	 * We don't want to scan for a whole year
	 */
	if (ac->ac_found > sbi->s_mb_max_to_scan &&
			!(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
		ac->ac_status = AC_STATUS_BREAK;
		return;
	}

	/*
	 * Haven't found a good chunk so far, let's continue
	 */
	if (bex->fe_len < gex->fe_len)
		return;

	if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan)
			&& bex->fe_group == e4b->bd_group) {
		/* recheck chunk's availability - we don't know
		 * when it was found (within this lock-unlock
		 * period or not) */
		max = mb_find_extent(e4b, 0, bex->fe_start, gex->fe_len, &ex);
		if (max >= gex->fe_len) {
			ext4_mb_use_best_found(ac, e4b);
			return;
		}
	}
}

/*
 * The routine checks whether the found extent is good enough. If it is,
 * then the extent gets marked used and a flag is set in the context
 * to stop scanning. Otherwise, the extent is compared with the
 * previously found extent and if the new one is better, it's stored
 * in the context. Later, the best found extent will be used if
 * mballoc can't find a good enough extent.
 *
 * FIXME: real allocation policy is to be designed yet!
 */
static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
					struct ext4_free_extent *ex,
					struct ext4_buddy *e4b)
{
	struct ext4_free_extent *bex = &ac->ac_b_ex;
	struct ext4_free_extent *gex = &ac->ac_g_ex;

	BUG_ON(ex->fe_len <= 0);
	BUG_ON(ex->fe_len > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
	BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
	BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);

	ac->ac_found++;

	/*
	 * The special case - take what you catch first
	 */
	if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
		*bex = *ex;
		ext4_mb_use_best_found(ac, e4b);
		return;
	}

	/*
	 * Let's check whether the chunk is good enough
	 */
	if (ex->fe_len == gex->fe_len) {
		*bex = *ex;
		ext4_mb_use_best_found(ac, e4b);
		return;
	}

	/*
	 * If this is the first found extent, just store it in the context
	 */
	if (bex->fe_len == 0) {
		*bex = *ex;
		return;
	}

	/*
	 * If the newly found extent is better, store it in the context
	 */
	if (bex->fe_len < gex->fe_len) {
		/* if the request isn't satisfied, any found extent
		 * larger than previous best one is better */
		if (ex->fe_len > bex->fe_len)
			*bex = *ex;
	} else if (ex->fe_len > gex->fe_len) {
		/* if the request is satisfied, then we try to find
		 * an extent that still satisfies the request, but is
		 * smaller than the previous one */
		if (ex->fe_len < bex->fe_len)
			*bex = *ex;
	}

	ext4_mb_check_limits(ac, e4b, 0);
}
1728
089ceecc
ES
1729static noinline_for_stack
1730int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
c9de560d
AT
1731 struct ext4_buddy *e4b)
1732{
1733 struct ext4_free_extent ex = ac->ac_b_ex;
1734 ext4_group_t group = ex.fe_group;
1735 int max;
1736 int err;
1737
1738 BUG_ON(ex.fe_len <= 0);
1739 err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
1740 if (err)
1741 return err;
1742
1743 ext4_lock_group(ac->ac_sb, group);
1744 max = mb_find_extent(e4b, 0, ex.fe_start, ex.fe_len, &ex);
1745
1746 if (max > 0) {
1747 ac->ac_b_ex = ex;
1748 ext4_mb_use_best_found(ac, e4b);
1749 }
1750
1751 ext4_unlock_group(ac->ac_sb, group);
e39e07fd 1752 ext4_mb_unload_buddy(e4b);
c9de560d
AT
1753
1754 return 0;
1755}
1756
089ceecc
ES
1757static noinline_for_stack
1758int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
c9de560d
AT
1759 struct ext4_buddy *e4b)
1760{
1761 ext4_group_t group = ac->ac_g_ex.fe_group;
1762 int max;
1763 int err;
1764 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
c9de560d
AT
1765 struct ext4_free_extent ex;
1766
1767 if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
1768 return 0;
1769
1770 err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
1771 if (err)
1772 return err;
1773
1774 ext4_lock_group(ac->ac_sb, group);
1775 max = mb_find_extent(e4b, 0, ac->ac_g_ex.fe_start,
1776 ac->ac_g_ex.fe_len, &ex);
1777
1778 if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
1779 ext4_fsblk_t start;
1780
5661bd68
AM
1781 start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) +
1782 ex.fe_start;
c9de560d
AT
1783 /* use do_div to get remainder (would be 64-bit modulo) */
1784 if (do_div(start, sbi->s_stripe) == 0) {
1785 ac->ac_found++;
1786 ac->ac_b_ex = ex;
1787 ext4_mb_use_best_found(ac, e4b);
1788 }
1789 } else if (max >= ac->ac_g_ex.fe_len) {
1790 BUG_ON(ex.fe_len <= 0);
1791 BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
1792 BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
1793 ac->ac_found++;
1794 ac->ac_b_ex = ex;
1795 ext4_mb_use_best_found(ac, e4b);
1796 } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) {
 1797		/* Sometimes, the caller may want to merge even a small
 1798		 * number of blocks into an existing extent */
1799 BUG_ON(ex.fe_len <= 0);
1800 BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
1801 BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
1802 ac->ac_found++;
1803 ac->ac_b_ex = ex;
1804 ext4_mb_use_best_found(ac, e4b);
1805 }
1806 ext4_unlock_group(ac->ac_sb, group);
e39e07fd 1807 ext4_mb_unload_buddy(e4b);
c9de560d
AT
1808
1809 return 0;
1810}
1811
1812/*
 1813 * The routine scans buddy structures (not the bitmap!) from the given
 1814 * order up to the max order and tries to find a big enough chunk to satisfy the request
1815 */
089ceecc
ES
1816static noinline_for_stack
1817void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
c9de560d
AT
1818 struct ext4_buddy *e4b)
1819{
1820 struct super_block *sb = ac->ac_sb;
1821 struct ext4_group_info *grp = e4b->bd_info;
1822 void *buddy;
1823 int i;
1824 int k;
1825 int max;
1826
1827 BUG_ON(ac->ac_2order <= 0);
1828 for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) {
1829 if (grp->bb_counters[i] == 0)
1830 continue;
1831
1832 buddy = mb_find_buddy(e4b, i, &max);
1833 BUG_ON(buddy == NULL);
1834
ffad0a44 1835 k = mb_find_next_zero_bit(buddy, max, 0);
c9de560d
AT
1836 BUG_ON(k >= max);
1837
1838 ac->ac_found++;
1839
1840 ac->ac_b_ex.fe_len = 1 << i;
1841 ac->ac_b_ex.fe_start = k << i;
1842 ac->ac_b_ex.fe_group = e4b->bd_group;
1843
1844 ext4_mb_use_best_found(ac, e4b);
1845
1846 BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len);
1847
1848 if (EXT4_SB(sb)->s_mb_stats)
1849 atomic_inc(&EXT4_SB(sb)->s_bal_2orders);
1850
1851 break;
1852 }
1853}
1854
1855/*
1856 * The routine scans the group and measures all found extents.
 1857 * In order to optimize scanning, the caller must pass the number of
 1858 * free blocks in the group, so the routine knows the upper limit.
1859 */
089ceecc
ES
1860static noinline_for_stack
1861void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
c9de560d
AT
1862 struct ext4_buddy *e4b)
1863{
1864 struct super_block *sb = ac->ac_sb;
1865 void *bitmap = EXT4_MB_BITMAP(e4b);
1866 struct ext4_free_extent ex;
1867 int i;
1868 int free;
1869
1870 free = e4b->bd_info->bb_free;
1871 BUG_ON(free <= 0);
1872
1873 i = e4b->bd_info->bb_first_free;
1874
1875 while (free && ac->ac_status == AC_STATUS_CONTINUE) {
ffad0a44 1876 i = mb_find_next_zero_bit(bitmap,
c9de560d
AT
1877 EXT4_BLOCKS_PER_GROUP(sb), i);
1878 if (i >= EXT4_BLOCKS_PER_GROUP(sb)) {
26346ff6 1879 /*
e56eb659 1880			 * If we have a corrupt bitmap, we won't find any
26346ff6
AK
 1881			 * free blocks even though the group info says
 1882			 * we have free blocks
1883 */
e29136f8
TT
1884 ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
1885 "%d free blocks as per "
fde4d95a 1886 "group info. But bitmap says 0",
26346ff6 1887 free);
c9de560d
AT
1888 break;
1889 }
1890
1891 mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex);
1892 BUG_ON(ex.fe_len <= 0);
26346ff6 1893 if (free < ex.fe_len) {
e29136f8
TT
1894 ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
1895 "%d free blocks as per "
fde4d95a 1896 "group info. But got %d blocks",
26346ff6 1897 free, ex.fe_len);
e56eb659
AK
1898 /*
 1899			 * The number of free blocks differs. This mostly
 1900			 * indicates that the bitmap is corrupt. So exit
1901 * without claiming the space.
1902 */
1903 break;
26346ff6 1904 }
c9de560d
AT
1905
1906 ext4_mb_measure_extent(ac, &ex, e4b);
1907
1908 i += ex.fe_len;
1909 free -= ex.fe_len;
1910 }
1911
1912 ext4_mb_check_limits(ac, e4b, 1);
1913}
1914
1915/*
 1916 * This is a special case for storage arrays like raid5:
506bf2d8 1917 * we try to find stripe-aligned chunks for stripe-size-multiple requests
c9de560d 1918 */
089ceecc
ES
1919static noinline_for_stack
1920void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
c9de560d
AT
1921 struct ext4_buddy *e4b)
1922{
1923 struct super_block *sb = ac->ac_sb;
1924 struct ext4_sb_info *sbi = EXT4_SB(sb);
1925 void *bitmap = EXT4_MB_BITMAP(e4b);
1926 struct ext4_free_extent ex;
1927 ext4_fsblk_t first_group_block;
1928 ext4_fsblk_t a;
1929 ext4_grpblk_t i;
1930 int max;
1931
1932 BUG_ON(sbi->s_stripe == 0);
1933
1934 /* find first stripe-aligned block in group */
5661bd68
AM
1935 first_group_block = ext4_group_first_block_no(sb, e4b->bd_group);
1936
c9de560d
AT
1937 a = first_group_block + sbi->s_stripe - 1;
1938 do_div(a, sbi->s_stripe);
1939 i = (a * sbi->s_stripe) - first_group_block;
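	/*
	 * Illustrative example: with first_group_block == 1000 and
	 * s_stripe == 16, a = (1000 + 15) / 16 = 63, so
	 * i = 63 * 16 - 1000 = 8, i.e. the first stripe-aligned block in
	 * this group is at offset 8 (physical block 1008).
	 */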
1940
1941 while (i < EXT4_BLOCKS_PER_GROUP(sb)) {
1942 if (!mb_test_bit(i, bitmap)) {
1943 max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex);
1944 if (max >= sbi->s_stripe) {
1945 ac->ac_found++;
1946 ac->ac_b_ex = ex;
1947 ext4_mb_use_best_found(ac, e4b);
1948 break;
1949 }
1950 }
1951 i += sbi->s_stripe;
1952 }
1953}
1954
8a57d9d6 1955/* This is now called BEFORE we load the buddy bitmap. */
c9de560d
AT
1956static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1957 ext4_group_t group, int cr)
1958{
1959 unsigned free, fragments;
a4912123 1960 int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
c9de560d
AT
1961 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
1962
1963 BUG_ON(cr < 0 || cr >= 4);
8a57d9d6
CW
1964
1965 /* We only do this if the grp has never been initialized */
1966 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
1967 int ret = ext4_mb_init_group(ac->ac_sb, group);
1968 if (ret)
1969 return 0;
1970 }
c9de560d
AT
1971
1972 free = grp->bb_free;
1973 fragments = grp->bb_fragments;
1974 if (free == 0)
1975 return 0;
1976 if (fragments == 0)
1977 return 0;
1978
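	/*
	 * Summary of the criteria checked below: cr 0 accepts a group only
	 * if its largest free buddy order can satisfy the power-of-two
	 * request; cr 1 requires the average fragment size (free/fragments)
	 * to reach the goal length; cr 2 only requires enough free blocks
	 * in total; cr 3 accepts any group that passed the free/fragment
	 * checks above.
	 */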
1979 switch (cr) {
1980 case 0:
1981 BUG_ON(ac->ac_2order == 0);
c9de560d 1982
8a57d9d6
CW
1983 if (grp->bb_largest_free_order < ac->ac_2order)
1984 return 0;
1985
a4912123
TT
1986 /* Avoid using the first bg of a flexgroup for data files */
1987 if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
1988 (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
1989 ((group % flex_size) == 0))
1990 return 0;
1991
8a57d9d6 1992 return 1;
c9de560d
AT
1993 case 1:
1994 if ((free / fragments) >= ac->ac_g_ex.fe_len)
1995 return 1;
1996 break;
1997 case 2:
1998 if (free >= ac->ac_g_ex.fe_len)
1999 return 1;
2000 break;
2001 case 3:
2002 return 1;
2003 default:
2004 BUG();
2005 }
2006
2007 return 0;
2008}
2009
4ddfef7b
ES
2010static noinline_for_stack int
2011ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
c9de560d 2012{
8df9675f 2013 ext4_group_t ngroups, group, i;
c9de560d
AT
2014 int cr;
2015 int err = 0;
c9de560d
AT
2016 struct ext4_sb_info *sbi;
2017 struct super_block *sb;
2018 struct ext4_buddy e4b;
c9de560d
AT
2019
2020 sb = ac->ac_sb;
2021 sbi = EXT4_SB(sb);
8df9675f 2022 ngroups = ext4_get_groups_count(sb);
fb0a387d 2023 /* non-extent files are limited to low blocks/groups */
12e9b892 2024 if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
fb0a387d
ES
2025 ngroups = sbi->s_blockfile_groups;
2026
c9de560d
AT
2027 BUG_ON(ac->ac_status == AC_STATUS_FOUND);
2028
2029 /* first, try the goal */
2030 err = ext4_mb_find_by_goal(ac, &e4b);
2031 if (err || ac->ac_status == AC_STATUS_FOUND)
2032 goto out;
2033
2034 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
2035 goto out;
2036
2037 /*
 2038	 * ac->ac_2order is set only if the fe_len is a power of 2;
 2039	 * if ac_2order is set we also set the criteria to 0 so that we
 2040	 * try an exact allocation using the buddy data.
2041 */
2042 i = fls(ac->ac_g_ex.fe_len);
2043 ac->ac_2order = 0;
2044 /*
2045 * We search using buddy data only if the order of the request
 2046	 * is greater than or equal to sbi->s_mb_order2_reqs.
b713a5ec 2047 * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
c9de560d
AT
2048 */
2049 if (i >= sbi->s_mb_order2_reqs) {
2050 /*
 2051		 * This should tell if fe_len is exactly a power of 2
2052 */
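		/*
		 * Illustrative example: fe_len == 8 gives i = fls(8) = 4,
		 * and 8 & ~(1 << 3) == 0, so ac_2order becomes 3 (an
		 * order-3 buddy chunk of 8 blocks).
		 */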
2053 if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0)
2054 ac->ac_2order = i - 1;
2055 }
2056
4ba74d00
TT
2057 /* if stream allocation is enabled, use global goal */
2058 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
c9de560d
AT
2059 /* TBD: may be hot point */
2060 spin_lock(&sbi->s_md_lock);
2061 ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
2062 ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
2063 spin_unlock(&sbi->s_md_lock);
2064 }
4ba74d00 2065
c9de560d
AT
 2066	/* Let's just scan groups to find more or less suitable blocks */
2067 cr = ac->ac_2order ? 0 : 1;
2068 /*
2069 * cr == 0 try to get exact allocation,
2070 * cr == 3 try to get anything
2071 */
2072repeat:
2073 for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
2074 ac->ac_criteria = cr;
ed8f9c75
AK
2075 /*
 2076		 * start searching for the right group
 2077		 * from the goal value specified
2078 */
2079 group = ac->ac_g_ex.fe_group;
2080
8df9675f 2081 for (i = 0; i < ngroups; group++, i++) {
8df9675f 2082 if (group == ngroups)
c9de560d
AT
2083 group = 0;
2084
8a57d9d6
CW
2085 /* This now checks without needing the buddy page */
2086 if (!ext4_mb_good_group(ac, group, cr))
c9de560d
AT
2087 continue;
2088
c9de560d
AT
2089 err = ext4_mb_load_buddy(sb, group, &e4b);
2090 if (err)
2091 goto out;
2092
2093 ext4_lock_group(sb, group);
8a57d9d6
CW
2094
2095 /*
2096 * We need to check again after locking the
2097 * block group
2098 */
c9de560d 2099 if (!ext4_mb_good_group(ac, group, cr)) {
c9de560d 2100 ext4_unlock_group(sb, group);
e39e07fd 2101 ext4_mb_unload_buddy(&e4b);
c9de560d
AT
2102 continue;
2103 }
2104
2105 ac->ac_groups_scanned++;
75507efb 2106 if (cr == 0)
c9de560d 2107 ext4_mb_simple_scan_group(ac, &e4b);
506bf2d8
ES
2108 else if (cr == 1 && sbi->s_stripe &&
2109 !(ac->ac_g_ex.fe_len % sbi->s_stripe))
c9de560d
AT
2110 ext4_mb_scan_aligned(ac, &e4b);
2111 else
2112 ext4_mb_complex_scan_group(ac, &e4b);
2113
2114 ext4_unlock_group(sb, group);
e39e07fd 2115 ext4_mb_unload_buddy(&e4b);
c9de560d
AT
2116
2117 if (ac->ac_status != AC_STATUS_CONTINUE)
2118 break;
2119 }
2120 }
2121
2122 if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
2123 !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
2124 /*
2125 * We've been searching too long. Let's try to allocate
2126 * the best chunk we've found so far
2127 */
2128
2129 ext4_mb_try_best_found(ac, &e4b);
2130 if (ac->ac_status != AC_STATUS_FOUND) {
2131 /*
 2132		 * Someone luckier has already allocated it.
 2133		 * The only thing we can do is just take the first
 2134		 * found block(s)
2135 printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n");
2136 */
2137 ac->ac_b_ex.fe_group = 0;
2138 ac->ac_b_ex.fe_start = 0;
2139 ac->ac_b_ex.fe_len = 0;
2140 ac->ac_status = AC_STATUS_CONTINUE;
2141 ac->ac_flags |= EXT4_MB_HINT_FIRST;
2142 cr = 3;
2143 atomic_inc(&sbi->s_mb_lost_chunks);
2144 goto repeat;
2145 }
2146 }
2147out:
2148 return err;
2149}
2150
c9de560d
AT
2151static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
2152{
2153 struct super_block *sb = seq->private;
c9de560d
AT
2154 ext4_group_t group;
2155
8df9675f 2156 if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
c9de560d 2157 return NULL;
c9de560d 2158 group = *pos + 1;
a9df9a49 2159 return (void *) ((unsigned long) group);
c9de560d
AT
2160}
2161
2162static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
2163{
2164 struct super_block *sb = seq->private;
c9de560d
AT
2165 ext4_group_t group;
2166
2167 ++*pos;
8df9675f 2168 if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
c9de560d
AT
2169 return NULL;
2170 group = *pos + 1;
a9df9a49 2171 return (void *) ((unsigned long) group);
c9de560d
AT
2172}
2173
2174static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2175{
2176 struct super_block *sb = seq->private;
a9df9a49 2177 ext4_group_t group = (ext4_group_t) ((unsigned long) v);
c9de560d
AT
2178 int i;
2179 int err;
2180 struct ext4_buddy e4b;
2181 struct sg {
2182 struct ext4_group_info info;
a36b4498 2183 ext4_grpblk_t counters[16];
c9de560d
AT
2184 } sg;
2185
2186 group--;
2187 if (group == 0)
2188 seq_printf(seq, "#%-5s: %-5s %-5s %-5s "
2189 "[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s "
2190 "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n",
2191 "group", "free", "frags", "first",
2192 "2^0", "2^1", "2^2", "2^3", "2^4", "2^5", "2^6",
2193 "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13");
2194
2195 i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
2196 sizeof(struct ext4_group_info);
2197 err = ext4_mb_load_buddy(sb, group, &e4b);
2198 if (err) {
a9df9a49 2199 seq_printf(seq, "#%-5u: I/O error\n", group);
c9de560d
AT
2200 return 0;
2201 }
2202 ext4_lock_group(sb, group);
2203 memcpy(&sg, ext4_get_group_info(sb, group), i);
2204 ext4_unlock_group(sb, group);
e39e07fd 2205 ext4_mb_unload_buddy(&e4b);
c9de560d 2206
a9df9a49 2207 seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
c9de560d
AT
2208 sg.info.bb_fragments, sg.info.bb_first_free);
2209 for (i = 0; i <= 13; i++)
2210 seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ?
2211 sg.info.bb_counters[i] : 0);
2212 seq_printf(seq, " ]\n");
2213
2214 return 0;
2215}
2216
2217static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v)
2218{
2219}
2220
7f1346a9 2221static const struct seq_operations ext4_mb_seq_groups_ops = {
c9de560d
AT
2222 .start = ext4_mb_seq_groups_start,
2223 .next = ext4_mb_seq_groups_next,
2224 .stop = ext4_mb_seq_groups_stop,
2225 .show = ext4_mb_seq_groups_show,
2226};
2227
2228static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
2229{
2230 struct super_block *sb = PDE(inode)->data;
2231 int rc;
2232
2233 rc = seq_open(file, &ext4_mb_seq_groups_ops);
2234 if (rc == 0) {
a271fe85 2235 struct seq_file *m = file->private_data;
c9de560d
AT
2236 m->private = sb;
2237 }
2238 return rc;
2239
2240}
2241
7f1346a9 2242static const struct file_operations ext4_mb_seq_groups_fops = {
c9de560d
AT
2243 .owner = THIS_MODULE,
2244 .open = ext4_mb_seq_groups_open,
2245 .read = seq_read,
2246 .llseek = seq_lseek,
2247 .release = seq_release,
2248};
2249
fb1813f4
CW
2250static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
2251{
2252 int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
2253 struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index];
2254
2255 BUG_ON(!cachep);
2256 return cachep;
2257}
5f21b0e6
FB
2258
2259/* Create and initialize ext4_group_info data for the given group. */
920313a7 2260int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
5f21b0e6
FB
2261 struct ext4_group_desc *desc)
2262{
fb1813f4 2263 int i;
5f21b0e6
FB
2264 int metalen = 0;
2265 struct ext4_sb_info *sbi = EXT4_SB(sb);
2266 struct ext4_group_info **meta_group_info;
fb1813f4 2267 struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
5f21b0e6
FB
2268
2269 /*
 2270	 * First check if this group is the first of a group descriptor block.
 2271	 * If it is, we have to allocate a new table of pointers
 2272	 * to ext4_group_info structures
2273 */
2274 if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
2275 metalen = sizeof(*meta_group_info) <<
2276 EXT4_DESC_PER_BLOCK_BITS(sb);
2277 meta_group_info = kmalloc(metalen, GFP_KERNEL);
2278 if (meta_group_info == NULL) {
2279 printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
2280 "buddy group\n");
2281 goto exit_meta_group_info;
2282 }
2283 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] =
2284 meta_group_info;
2285 }
2286
5f21b0e6
FB
2287 meta_group_info =
2288 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
2289 i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
2290
fb1813f4 2291 meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL);
5f21b0e6
FB
2292 if (meta_group_info[i] == NULL) {
2293 printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
2294 goto exit_group_info;
2295 }
fb1813f4 2296 memset(meta_group_info[i], 0, kmem_cache_size(cachep));
5f21b0e6
FB
2297 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
2298 &(meta_group_info[i]->bb_state));
2299
2300 /*
2301 * initialize bb_free to be able to skip
2302 * empty groups without initialization
2303 */
2304 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
2305 meta_group_info[i]->bb_free =
2306 ext4_free_blocks_after_init(sb, group, desc);
2307 } else {
2308 meta_group_info[i]->bb_free =
560671a0 2309 ext4_free_blks_count(sb, desc);
5f21b0e6
FB
2310 }
2311
2312 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
920313a7 2313 init_rwsem(&meta_group_info[i]->alloc_sem);
64e290ec 2314 meta_group_info[i]->bb_free_root = RB_ROOT;
8a57d9d6 2315 meta_group_info[i]->bb_largest_free_order = -1; /* uninit */
5f21b0e6
FB
2316
2317#ifdef DOUBLE_CHECK
2318 {
2319 struct buffer_head *bh;
2320 meta_group_info[i]->bb_bitmap =
2321 kmalloc(sb->s_blocksize, GFP_KERNEL);
2322 BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
2323 bh = ext4_read_block_bitmap(sb, group);
2324 BUG_ON(bh == NULL);
2325 memcpy(meta_group_info[i]->bb_bitmap, bh->b_data,
2326 sb->s_blocksize);
2327 put_bh(bh);
2328 }
2329#endif
2330
2331 return 0;
2332
2333exit_group_info:
2334 /* If a meta_group_info table has been allocated, release it now */
2335 if (group % EXT4_DESC_PER_BLOCK(sb) == 0)
2336 kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]);
2337exit_meta_group_info:
2338 return -ENOMEM;
2339} /* ext4_mb_add_groupinfo */
2340
c9de560d
AT
2341static int ext4_mb_init_backend(struct super_block *sb)
2342{
8df9675f 2343 ext4_group_t ngroups = ext4_get_groups_count(sb);
c9de560d 2344 ext4_group_t i;
c9de560d 2345 struct ext4_sb_info *sbi = EXT4_SB(sb);
5f21b0e6
FB
2346 struct ext4_super_block *es = sbi->s_es;
2347 int num_meta_group_infos;
2348 int num_meta_group_infos_max;
2349 int array_size;
5f21b0e6 2350 struct ext4_group_desc *desc;
fb1813f4 2351 struct kmem_cache *cachep;
5f21b0e6
FB
2352
2353 /* This is the number of blocks used by GDT */
8df9675f 2354 num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) -
5f21b0e6
FB
2355 1) >> EXT4_DESC_PER_BLOCK_BITS(sb);
2356
2357 /*
2358 * This is the total number of blocks used by GDT including
2359 * the number of reserved blocks for GDT.
2360 * The s_group_info array is allocated with this value
2361 * to allow a clean online resize without a complex
2362 * manipulation of pointer.
2363 * The drawback is the unused memory when no resize
2364 * occurs but it's very low in terms of pages
2365 * (see comments below)
2366 * Need to handle this properly when META_BG resizing is allowed
2367 */
2368 num_meta_group_infos_max = num_meta_group_infos +
2369 le16_to_cpu(es->s_reserved_gdt_blocks);
c9de560d 2370
5f21b0e6
FB
2371 /*
2372 * array_size is the size of s_group_info array. We round it
2373 * to the next power of two because this approximation is done
2374 * internally by kmalloc so we can have some more memory
2375 * for free here (e.g. may be used for META_BG resize).
2376 */
2377 array_size = 1;
2378 while (array_size < sizeof(*sbi->s_group_info) *
2379 num_meta_group_infos_max)
2380 array_size = array_size << 1;
c9de560d
AT
2381 /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
2382 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
2383 * So a two level scheme suffices for now. */
5f21b0e6 2384 sbi->s_group_info = kmalloc(array_size, GFP_KERNEL);
c9de560d
AT
2385 if (sbi->s_group_info == NULL) {
2386 printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n");
2387 return -ENOMEM;
2388 }
2389 sbi->s_buddy_cache = new_inode(sb);
2390 if (sbi->s_buddy_cache == NULL) {
2391 printk(KERN_ERR "EXT4-fs: can't get new inode\n");
2392 goto err_freesgi;
2393 }
85fe4025 2394 sbi->s_buddy_cache->i_ino = get_next_ino();
c9de560d 2395 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
8df9675f 2396 for (i = 0; i < ngroups; i++) {
c9de560d
AT
2397 desc = ext4_get_group_desc(sb, i, NULL);
2398 if (desc == NULL) {
2399 printk(KERN_ERR
a9df9a49 2400 "EXT4-fs: can't read descriptor %u\n", i);
c9de560d
AT
2401 goto err_freebuddy;
2402 }
5f21b0e6
FB
2403 if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
2404 goto err_freebuddy;
c9de560d
AT
2405 }
2406
2407 return 0;
2408
2409err_freebuddy:
fb1813f4 2410 cachep = get_groupinfo_cache(sb->s_blocksize_bits);
f1fa3342 2411 while (i-- > 0)
fb1813f4 2412 kmem_cache_free(cachep, ext4_get_group_info(sb, i));
c9de560d 2413 i = num_meta_group_infos;
f1fa3342 2414 while (i-- > 0)
c9de560d
AT
2415 kfree(sbi->s_group_info[i]);
2416 iput(sbi->s_buddy_cache);
2417err_freesgi:
2418 kfree(sbi->s_group_info);
2419 return -ENOMEM;
2420}
2421
2892c15d
ES
2422static void ext4_groupinfo_destroy_slabs(void)
2423{
2424 int i;
2425
2426 for (i = 0; i < NR_GRPINFO_CACHES; i++) {
2427 if (ext4_groupinfo_caches[i])
2428 kmem_cache_destroy(ext4_groupinfo_caches[i]);
2429 ext4_groupinfo_caches[i] = NULL;
2430 }
2431}
2432
2433static int ext4_groupinfo_create_slab(size_t size)
2434{
2435 static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex);
2436 int slab_size;
2437 int blocksize_bits = order_base_2(size);
2438 int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
2439 struct kmem_cache *cachep;
2440
2441 if (cache_index >= NR_GRPINFO_CACHES)
2442 return -EINVAL;
2443
2444 if (unlikely(cache_index < 0))
2445 cache_index = 0;
2446
2447 mutex_lock(&ext4_grpinfo_slab_create_mutex);
2448 if (ext4_groupinfo_caches[cache_index]) {
2449 mutex_unlock(&ext4_grpinfo_slab_create_mutex);
2450 return 0; /* Already created */
2451 }
2452
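	/*
	 * bb_counters[] holds one counter per buddy order, 0 through
	 * blocksize_bits + 1; e.g. for a 4KB block size (blocksize_bits
	 * == 12) the slab holds 14 counters per group.
	 */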
2453 slab_size = offsetof(struct ext4_group_info,
2454 bb_counters[blocksize_bits + 2]);
2455
2456 cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index],
2457 slab_size, 0, SLAB_RECLAIM_ACCOUNT,
2458 NULL);
2459
2460 mutex_unlock(&ext4_grpinfo_slab_create_mutex);
2461 if (!cachep) {
2462 printk(KERN_EMERG "EXT4: no memory for groupinfo slab cache\n");
2463 return -ENOMEM;
2464 }
2465
2466 ext4_groupinfo_caches[cache_index] = cachep;
2467
2468 return 0;
2469}
2470
c9de560d
AT
2471int ext4_mb_init(struct super_block *sb, int needs_recovery)
2472{
2473 struct ext4_sb_info *sbi = EXT4_SB(sb);
6be2ded1 2474 unsigned i, j;
c9de560d
AT
2475 unsigned offset;
2476 unsigned max;
74767c5a 2477 int ret;
c9de560d 2478
1927805e 2479 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
c9de560d
AT
2480
2481 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
2482 if (sbi->s_mb_offsets == NULL) {
fb1813f4
CW
2483 ret = -ENOMEM;
2484 goto out;
c9de560d 2485 }
ff7ef329 2486
1927805e 2487 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
c9de560d
AT
2488 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
2489 if (sbi->s_mb_maxs == NULL) {
fb1813f4
CW
2490 ret = -ENOMEM;
2491 goto out;
2492 }
2493
2892c15d
ES
2494 ret = ext4_groupinfo_create_slab(sb->s_blocksize);
2495 if (ret < 0)
2496 goto out;
c9de560d
AT
2497
2498 /* order 0 is regular bitmap */
2499 sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
2500 sbi->s_mb_offsets[0] = 0;
2501
2502 i = 1;
2503 offset = 0;
2504 max = sb->s_blocksize << 2;
2505 do {
2506 sbi->s_mb_offsets[i] = offset;
2507 sbi->s_mb_maxs[i] = max;
2508 offset += 1 << (sb->s_blocksize_bits - i);
2509 max = max >> 1;
2510 i++;
2511 } while (i <= sb->s_blocksize_bits + 1);
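	/*
	 * Illustrative layout for a 4KB block size: maxs[0] = 32768 bits
	 * (the plain bitmap), then the order-1 buddy has 16384 bits at
	 * byte offset 0, order-2 has 8192 bits at offset 2048, order-3 has
	 * 4096 bits at offset 3072, and so on, halving each time.
	 */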
2512
2513 /* init file for buddy data */
74767c5a
SF
2514 ret = ext4_mb_init_backend(sb);
2515 if (ret != 0) {
fb1813f4 2516 goto out;
c9de560d
AT
2517 }
2518
2519 spin_lock_init(&sbi->s_md_lock);
c9de560d
AT
2520 spin_lock_init(&sbi->s_bal_lock);
2521
2522 sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
2523 sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
2524 sbi->s_mb_stats = MB_DEFAULT_STATS;
2525 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
2526 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
c9de560d
AT
2527 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
2528
730c213c 2529 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
c9de560d 2530 if (sbi->s_locality_groups == NULL) {
fb1813f4
CW
2531 ret = -ENOMEM;
2532 goto out;
c9de560d 2533 }
730c213c 2534 for_each_possible_cpu(i) {
c9de560d 2535 struct ext4_locality_group *lg;
730c213c 2536 lg = per_cpu_ptr(sbi->s_locality_groups, i);
c9de560d 2537 mutex_init(&lg->lg_mutex);
6be2ded1
AK
2538 for (j = 0; j < PREALLOC_TB_SIZE; j++)
2539 INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
c9de560d
AT
2540 spin_lock_init(&lg->lg_prealloc_lock);
2541 }
2542
296c355c
TT
2543 if (sbi->s_proc)
2544 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
2545 &ext4_mb_seq_groups_fops, sb);
c9de560d 2546
0390131b
FM
2547 if (sbi->s_journal)
2548 sbi->s_journal->j_commit_callback = release_blocks_on_commit;
fb1813f4
CW
2549out:
2550 if (ret) {
2551 kfree(sbi->s_mb_offsets);
2552 kfree(sbi->s_mb_maxs);
fb1813f4
CW
2553 }
2554 return ret;
c9de560d
AT
2555}
2556
955ce5f5 2557/* needs to be called with the ext4 group lock held */
c9de560d
AT
2558static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
2559{
2560 struct ext4_prealloc_space *pa;
2561 struct list_head *cur, *tmp;
2562 int count = 0;
2563
2564 list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) {
2565 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
2566 list_del(&pa->pa_group_list);
2567 count++;
688f05a0 2568 kmem_cache_free(ext4_pspace_cachep, pa);
c9de560d
AT
2569 }
2570 if (count)
6ba495e9 2571 mb_debug(1, "mballoc: %u PAs left\n", count);
c9de560d
AT
2572
2573}
2574
2575int ext4_mb_release(struct super_block *sb)
2576{
8df9675f 2577 ext4_group_t ngroups = ext4_get_groups_count(sb);
c9de560d
AT
2578 ext4_group_t i;
2579 int num_meta_group_infos;
2580 struct ext4_group_info *grinfo;
2581 struct ext4_sb_info *sbi = EXT4_SB(sb);
fb1813f4 2582 struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
c9de560d 2583
c9de560d 2584 if (sbi->s_group_info) {
8df9675f 2585 for (i = 0; i < ngroups; i++) {
c9de560d
AT
2586 grinfo = ext4_get_group_info(sb, i);
2587#ifdef DOUBLE_CHECK
2588 kfree(grinfo->bb_bitmap);
2589#endif
2590 ext4_lock_group(sb, i);
2591 ext4_mb_cleanup_pa(grinfo);
2592 ext4_unlock_group(sb, i);
fb1813f4 2593 kmem_cache_free(cachep, grinfo);
c9de560d 2594 }
8df9675f 2595 num_meta_group_infos = (ngroups +
c9de560d
AT
2596 EXT4_DESC_PER_BLOCK(sb) - 1) >>
2597 EXT4_DESC_PER_BLOCK_BITS(sb);
2598 for (i = 0; i < num_meta_group_infos; i++)
2599 kfree(sbi->s_group_info[i]);
2600 kfree(sbi->s_group_info);
2601 }
2602 kfree(sbi->s_mb_offsets);
2603 kfree(sbi->s_mb_maxs);
2604 if (sbi->s_buddy_cache)
2605 iput(sbi->s_buddy_cache);
2606 if (sbi->s_mb_stats) {
2607 printk(KERN_INFO
2608 "EXT4-fs: mballoc: %u blocks %u reqs (%u success)\n",
2609 atomic_read(&sbi->s_bal_allocated),
2610 atomic_read(&sbi->s_bal_reqs),
2611 atomic_read(&sbi->s_bal_success));
2612 printk(KERN_INFO
2613 "EXT4-fs: mballoc: %u extents scanned, %u goal hits, "
2614 "%u 2^N hits, %u breaks, %u lost\n",
2615 atomic_read(&sbi->s_bal_ex_scanned),
2616 atomic_read(&sbi->s_bal_goals),
2617 atomic_read(&sbi->s_bal_2orders),
2618 atomic_read(&sbi->s_bal_breaks),
2619 atomic_read(&sbi->s_mb_lost_chunks));
2620 printk(KERN_INFO
2621 "EXT4-fs: mballoc: %lu generated and it took %Lu\n",
2622 sbi->s_mb_buddies_generated++,
2623 sbi->s_mb_generation_time);
2624 printk(KERN_INFO
2625 "EXT4-fs: mballoc: %u preallocated, %u discarded\n",
2626 atomic_read(&sbi->s_mb_preallocated),
2627 atomic_read(&sbi->s_mb_discarded));
2628 }
2629
730c213c 2630 free_percpu(sbi->s_locality_groups);
296c355c
TT
2631 if (sbi->s_proc)
2632 remove_proc_entry("mb_groups", sbi->s_proc);
c9de560d
AT
2633
2634 return 0;
2635}
2636
77ca6cdf 2637static inline int ext4_issue_discard(struct super_block *sb,
5c521830
JZ
2638 ext4_group_t block_group, ext4_grpblk_t block, int count)
2639{
5c521830
JZ
2640 ext4_fsblk_t discard_block;
2641
2642 discard_block = block + ext4_group_first_block_no(sb, block_group);
2643 trace_ext4_discard_blocks(sb,
2644 (unsigned long long) discard_block, count);
93259636 2645 return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
5c521830
JZ
2646}
2647
3e624fc7
TT
2648/*
2649 * This function is called by the jbd2 layer once the commit has finished,
2650 * so we know we can free the blocks that were released with that commit.
2651 */
2652static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
c9de560d 2653{
3e624fc7 2654 struct super_block *sb = journal->j_private;
c9de560d 2655 struct ext4_buddy e4b;
c894058d 2656 struct ext4_group_info *db;
93259636 2657 int err, ret, count = 0, count2 = 0;
c894058d 2658 struct ext4_free_data *entry;
3e624fc7 2659 struct list_head *l, *ltmp;
c9de560d 2660
3e624fc7
TT
2661 list_for_each_safe(l, ltmp, &txn->t_private_list) {
2662 entry = list_entry(l, struct ext4_free_data, list);
c9de560d 2663
6ba495e9 2664 mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
3e624fc7 2665 entry->count, entry->group, entry);
c9de560d 2666
93259636
LC
2667 if (test_opt(sb, DISCARD)) {
2668 ret = ext4_issue_discard(sb, entry->group,
5c521830 2669 entry->start_blk, entry->count);
93259636
LC
2670 if (unlikely(ret == -EOPNOTSUPP)) {
2671 ext4_warning(sb, "discard not supported, "
2672 "disabling");
2673 clear_opt(sb, DISCARD);
2674 }
2675 }
b90f6870 2676
c894058d 2677 err = ext4_mb_load_buddy(sb, entry->group, &e4b);
c9de560d
AT
2678 /* we expect to find existing buddy because it's pinned */
2679 BUG_ON(err != 0);
2680
c894058d 2681 db = e4b.bd_info;
c9de560d 2682 /* there are blocks to put in buddy to make them really free */
c894058d 2683 count += entry->count;
c9de560d 2684 count2++;
c894058d
AK
2685 ext4_lock_group(sb, entry->group);
2686 /* Take it out of per group rb tree */
2687 rb_erase(&entry->node, &(db->bb_free_root));
2688 mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count);
2689
2690 if (!db->bb_free_root.rb_node) {
2691 /* No more items in the per group rb tree
2692 * balance refcounts from ext4_mb_free_metadata()
2693 */
2694 page_cache_release(e4b.bd_buddy_page);
2695 page_cache_release(e4b.bd_bitmap_page);
c9de560d 2696 }
c894058d 2697 ext4_unlock_group(sb, entry->group);
c894058d 2698 kmem_cache_free(ext4_free_ext_cachep, entry);
e39e07fd 2699 ext4_mb_unload_buddy(&e4b);
3e624fc7 2700 }
c9de560d 2701
6ba495e9 2702 mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
c9de560d
AT
2703}
2704
6ba495e9
TT
2705#ifdef CONFIG_EXT4_DEBUG
2706u8 mb_enable_debug __read_mostly;
2707
2708static struct dentry *debugfs_dir;
2709static struct dentry *debugfs_debug;
2710
2711static void __init ext4_create_debugfs_entry(void)
2712{
2713 debugfs_dir = debugfs_create_dir("ext4", NULL);
2714 if (debugfs_dir)
2715 debugfs_debug = debugfs_create_u8("mballoc-debug",
2716 S_IRUGO | S_IWUSR,
2717 debugfs_dir,
2718 &mb_enable_debug);
2719}
2720
2721static void ext4_remove_debugfs_entry(void)
2722{
2723 debugfs_remove(debugfs_debug);
2724 debugfs_remove(debugfs_dir);
2725}
2726
2727#else
2728
2729static void __init ext4_create_debugfs_entry(void)
2730{
2731}
2732
2733static void ext4_remove_debugfs_entry(void)
2734{
2735}
2736
2737#endif
2738
5dabfc78 2739int __init ext4_init_mballoc(void)
c9de560d 2740{
16828088
TT
2741 ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
2742 SLAB_RECLAIM_ACCOUNT);
c9de560d
AT
2743 if (ext4_pspace_cachep == NULL)
2744 return -ENOMEM;
2745
16828088
TT
2746 ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context,
2747 SLAB_RECLAIM_ACCOUNT);
256bdb49
ES
2748 if (ext4_ac_cachep == NULL) {
2749 kmem_cache_destroy(ext4_pspace_cachep);
2750 return -ENOMEM;
2751 }
c894058d 2752
16828088
TT
2753 ext4_free_ext_cachep = KMEM_CACHE(ext4_free_data,
2754 SLAB_RECLAIM_ACCOUNT);
c894058d
AK
2755 if (ext4_free_ext_cachep == NULL) {
2756 kmem_cache_destroy(ext4_pspace_cachep);
2757 kmem_cache_destroy(ext4_ac_cachep);
2758 return -ENOMEM;
2759 }
6ba495e9 2760 ext4_create_debugfs_entry();
c9de560d
AT
2761 return 0;
2762}
2763
5dabfc78 2764void ext4_exit_mballoc(void)
c9de560d 2765{
60e6679e 2766 /*
3e03f9ca
JDB
2767 * Wait for completion of call_rcu()'s on ext4_pspace_cachep
2768 * before destroying the slab cache.
2769 */
2770 rcu_barrier();
c9de560d 2771 kmem_cache_destroy(ext4_pspace_cachep);
256bdb49 2772 kmem_cache_destroy(ext4_ac_cachep);
c894058d 2773 kmem_cache_destroy(ext4_free_ext_cachep);
2892c15d 2774 ext4_groupinfo_destroy_slabs();
6ba495e9 2775 ext4_remove_debugfs_entry();
c9de560d
AT
2776}
2777
2778
2779/*
73b2c716 2780 * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps
c9de560d
AT
 2781 * Returns 0 on success or an error code
2782 */
4ddfef7b
ES
2783static noinline_for_stack int
2784ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
498e5f24 2785 handle_t *handle, unsigned int reserv_blks)
c9de560d
AT
2786{
2787 struct buffer_head *bitmap_bh = NULL;
c9de560d
AT
2788 struct ext4_group_desc *gdp;
2789 struct buffer_head *gdp_bh;
2790 struct ext4_sb_info *sbi;
2791 struct super_block *sb;
2792 ext4_fsblk_t block;
519deca0 2793 int err, len;
c9de560d
AT
2794
2795 BUG_ON(ac->ac_status != AC_STATUS_FOUND);
2796 BUG_ON(ac->ac_b_ex.fe_len <= 0);
2797
2798 sb = ac->ac_sb;
2799 sbi = EXT4_SB(sb);
c9de560d
AT
2800
2801 err = -EIO;
574ca174 2802 bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
c9de560d
AT
2803 if (!bitmap_bh)
2804 goto out_err;
2805
2806 err = ext4_journal_get_write_access(handle, bitmap_bh);
2807 if (err)
2808 goto out_err;
2809
2810 err = -EIO;
2811 gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh);
2812 if (!gdp)
2813 goto out_err;
2814
a9df9a49 2815 ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
9fd9784c 2816 ext4_free_blks_count(sb, gdp));
03cddb80 2817
c9de560d
AT
2818 err = ext4_journal_get_write_access(handle, gdp_bh);
2819 if (err)
2820 goto out_err;
2821
bda00de7 2822 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
c9de560d 2823
519deca0 2824 len = ac->ac_b_ex.fe_len;
6fd058f7 2825 if (!ext4_data_block_valid(sbi, block, len)) {
12062ddd 2826 ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
6fd058f7 2827 "fs metadata\n", block, block+len);
519deca0
AK
2828 /* File system mounted not to panic on error
2829 * Fix the bitmap and repeat the block allocation
2830 * We leak some of the blocks here.
2831 */
955ce5f5
AK
2832 ext4_lock_group(sb, ac->ac_b_ex.fe_group);
2833 mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
2834 ac->ac_b_ex.fe_len);
2835 ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
0390131b 2836 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
519deca0
AK
2837 if (!err)
2838 err = -EAGAIN;
2839 goto out_err;
c9de560d 2840 }
955ce5f5
AK
2841
2842 ext4_lock_group(sb, ac->ac_b_ex.fe_group);
c9de560d
AT
2843#ifdef AGGRESSIVE_CHECK
2844 {
2845 int i;
2846 for (i = 0; i < ac->ac_b_ex.fe_len; i++) {
2847 BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i,
2848 bitmap_bh->b_data));
2849 }
2850 }
2851#endif
955ce5f5 2852 mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,ac->ac_b_ex.fe_len);
c9de560d
AT
2853 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
2854 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
560671a0
AK
2855 ext4_free_blks_set(sb, gdp,
2856 ext4_free_blocks_after_init(sb,
2857 ac->ac_b_ex.fe_group, gdp));
c9de560d 2858 }
560671a0
AK
2859 len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len;
2860 ext4_free_blks_set(sb, gdp, len);
c9de560d 2861 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
955ce5f5
AK
2862
2863 ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
6bc6e63f 2864 percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
d2a17637 2865 /*
6bc6e63f 2866 * Now reduce the dirty block count also. Should not go negative
d2a17637 2867 */
6bc6e63f
AK
2868 if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
2869 /* release all the reserved blocks if non delalloc */
2870 percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks);
c9de560d 2871
772cb7c8
JS
2872 if (sbi->s_log_groups_per_flex) {
2873 ext4_group_t flex_group = ext4_flex_group(sbi,
2874 ac->ac_b_ex.fe_group);
9f24e420
TT
2875 atomic_sub(ac->ac_b_ex.fe_len,
2876 &sbi->s_flex_groups[flex_group].free_blocks);
772cb7c8
JS
2877 }
2878
0390131b 2879 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
c9de560d
AT
2880 if (err)
2881 goto out_err;
0390131b 2882 err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
c9de560d
AT
2883
2884out_err:
a0375156 2885 ext4_mark_super_dirty(sb);
42a10add 2886 brelse(bitmap_bh);
c9de560d
AT
2887 return err;
2888}
2889
2890/*
 2891 * here we normalize the request for the locality group
 2892 * Group requests are normalized to the s_stripe size if it is set via the mount
 2893 * option. If not, we set it to s_mb_group_prealloc, which can be configured via
b713a5ec 2894 * /sys/fs/ext4/<partition>/mb_group_prealloc
c9de560d
AT
2895 *
2896 * XXX: should we try to preallocate more than the group has now?
2897 */
2898static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
2899{
2900 struct super_block *sb = ac->ac_sb;
2901 struct ext4_locality_group *lg = ac->ac_lg;
2902
2903 BUG_ON(lg == NULL);
2904 if (EXT4_SB(sb)->s_stripe)
2905 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe;
2906 else
2907 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
6ba495e9 2908 mb_debug(1, "#%u: goal %u blocks for locality group\n",
c9de560d
AT
2909 current->pid, ac->ac_g_ex.fe_len);
2910}
2911
2912/*
 2913 * Normalization means making the request better in terms of
2914 * size and alignment
2915 */
4ddfef7b
ES
2916static noinline_for_stack void
2917ext4_mb_normalize_request(struct ext4_allocation_context *ac,
c9de560d
AT
2918 struct ext4_allocation_request *ar)
2919{
2920 int bsbits, max;
2921 ext4_lblk_t end;
c9de560d 2922 loff_t size, orig_size, start_off;
5a0790c2 2923 ext4_lblk_t start;
c9de560d 2924 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
9a0762c5 2925 struct ext4_prealloc_space *pa;
c9de560d
AT
2926
 2927	/* only normalize data requests; metadata requests
 2928	   do not need preallocation */
2929 if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
2930 return;
2931
 2932	/* sometimes the caller may want exact blocks */
2933 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
2934 return;
2935
2936 /* caller may indicate that preallocation isn't
2937 * required (it's a tail, for example) */
2938 if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC)
2939 return;
2940
2941 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) {
2942 ext4_mb_normalize_group_request(ac);
2943 return ;
2944 }
2945
2946 bsbits = ac->ac_sb->s_blocksize_bits;
2947
 2948	/* first, let's learn the actual file size
 2949	 * we'd have once the current request is allocated */
2950 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
2951 size = size << bsbits;
2952 if (size < i_size_read(ac->ac_inode))
2953 size = i_size_read(ac->ac_inode);
5a0790c2 2954 orig_size = size;
c9de560d 2955
1930479c
VC
2956 /* max size of free chunks */
2957 max = 2 << bsbits;
c9de560d 2958
1930479c
VC
2959#define NRL_CHECK_SIZE(req, size, max, chunk_size) \
2960 (req <= (size) || max <= (chunk_size))
c9de560d
AT
2961
2962 /* first, try to predict filesize */
2963 /* XXX: should this table be tunable? */
2964 start_off = 0;
2965 if (size <= 16 * 1024) {
2966 size = 16 * 1024;
2967 } else if (size <= 32 * 1024) {
2968 size = 32 * 1024;
2969 } else if (size <= 64 * 1024) {
2970 size = 64 * 1024;
2971 } else if (size <= 128 * 1024) {
2972 size = 128 * 1024;
2973 } else if (size <= 256 * 1024) {
2974 size = 256 * 1024;
2975 } else if (size <= 512 * 1024) {
2976 size = 512 * 1024;
2977 } else if (size <= 1024 * 1024) {
2978 size = 1024 * 1024;
1930479c 2979 } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) {
c9de560d 2980 start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
1930479c
VC
2981 (21 - bsbits)) << 21;
2982 size = 2 * 1024 * 1024;
2983 } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) {
c9de560d
AT
2984 start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
2985 (22 - bsbits)) << 22;
2986 size = 4 * 1024 * 1024;
2987 } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len,
1930479c 2988 (8<<20)>>bsbits, max, 8 * 1024)) {
c9de560d
AT
2989 start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
2990 (23 - bsbits)) << 23;
2991 size = 8 * 1024 * 1024;
2992 } else {
2993 start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits;
2994 size = ac->ac_o_ex.fe_len << bsbits;
2995 }
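	/*
	 * Illustrative example: a request for a file that would end up at
	 * 200KB falls into the "size <= 256 * 1024" case above, so the
	 * goal size is rounded up to 256KB worth of blocks.
	 */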
5a0790c2
AK
2996 size = size >> bsbits;
2997 start = start_off >> bsbits;
c9de560d
AT
2998
2999 /* don't cover already allocated blocks in selected range */
3000 if (ar->pleft && start <= ar->lleft) {
3001 size -= ar->lleft + 1 - start;
3002 start = ar->lleft + 1;
3003 }
3004 if (ar->pright && start + size - 1 >= ar->lright)
3005 size -= start + size - ar->lright;
3006
3007 end = start + size;
3008
3009 /* check we don't cross already preallocated blocks */
3010 rcu_read_lock();
9a0762c5 3011 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
498e5f24 3012 ext4_lblk_t pa_end;
c9de560d 3013
c9de560d
AT
3014 if (pa->pa_deleted)
3015 continue;
3016 spin_lock(&pa->pa_lock);
3017 if (pa->pa_deleted) {
3018 spin_unlock(&pa->pa_lock);
3019 continue;
3020 }
3021
3022 pa_end = pa->pa_lstart + pa->pa_len;
3023
3024 /* PA must not overlap original request */
3025 BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
3026 ac->ac_o_ex.fe_logical < pa->pa_lstart));
3027
38877f4e
ES
3028 /* skip PAs this normalized request doesn't overlap with */
3029 if (pa->pa_lstart >= end || pa_end <= start) {
c9de560d
AT
3030 spin_unlock(&pa->pa_lock);
3031 continue;
3032 }
3033 BUG_ON(pa->pa_lstart <= start && pa_end >= end);
3034
38877f4e 3035 /* adjust start or end to be adjacent to this pa */
c9de560d
AT
3036 if (pa_end <= ac->ac_o_ex.fe_logical) {
3037 BUG_ON(pa_end < start);
3038 start = pa_end;
38877f4e 3039 } else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
c9de560d
AT
3040 BUG_ON(pa->pa_lstart > end);
3041 end = pa->pa_lstart;
3042 }
3043 spin_unlock(&pa->pa_lock);
3044 }
3045 rcu_read_unlock();
3046 size = end - start;
3047
3048 /* XXX: extra loop to check we really don't overlap preallocations */
3049 rcu_read_lock();
9a0762c5 3050 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
498e5f24 3051 ext4_lblk_t pa_end;
c9de560d
AT
3052 spin_lock(&pa->pa_lock);
3053 if (pa->pa_deleted == 0) {
3054 pa_end = pa->pa_lstart + pa->pa_len;
3055 BUG_ON(!(start >= pa_end || end <= pa->pa_lstart));
3056 }
3057 spin_unlock(&pa->pa_lock);
3058 }
3059 rcu_read_unlock();
3060
3061 if (start + size <= ac->ac_o_ex.fe_logical &&
3062 start > ac->ac_o_ex.fe_logical) {
3063 printk(KERN_ERR "start %lu, size %lu, fe_logical %lu\n",
3064 (unsigned long) start, (unsigned long) size,
3065 (unsigned long) ac->ac_o_ex.fe_logical);
3066 }
3067 BUG_ON(start + size <= ac->ac_o_ex.fe_logical &&
3068 start > ac->ac_o_ex.fe_logical);
8d03c7a0 3069 BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
c9de560d
AT
3070
3071 /* now prepare goal request */
3072
 3073	/* XXX: is it better to align blocks with respect to logical
 3074	 * placement or to satisfy a big request as is */
3075 ac->ac_g_ex.fe_logical = start;
3076 ac->ac_g_ex.fe_len = size;
3077
3078 /* define goal start in order to merge */
3079 if (ar->pright && (ar->lright == (start + size))) {
3080 /* merge to the right */
3081 ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size,
3082 &ac->ac_f_ex.fe_group,
3083 &ac->ac_f_ex.fe_start);
3084 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
3085 }
3086 if (ar->pleft && (ar->lleft + 1 == start)) {
3087 /* merge to the left */
3088 ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1,
3089 &ac->ac_f_ex.fe_group,
3090 &ac->ac_f_ex.fe_start);
3091 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
3092 }
3093
6ba495e9 3094 mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size,
c9de560d
AT
3095 (unsigned) orig_size, (unsigned) start);
3096}
3097
3098static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
3099{
3100 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3101
3102 if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) {
3103 atomic_inc(&sbi->s_bal_reqs);
3104 atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
291dae47 3105 if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
c9de560d
AT
3106 atomic_inc(&sbi->s_bal_success);
3107 atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
3108 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
3109 ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
3110 atomic_inc(&sbi->s_bal_goals);
3111 if (ac->ac_found > sbi->s_mb_max_to_scan)
3112 atomic_inc(&sbi->s_bal_breaks);
3113 }
3114
296c355c
TT
3115 if (ac->ac_op == EXT4_MB_HISTORY_ALLOC)
3116 trace_ext4_mballoc_alloc(ac);
3117 else
3118 trace_ext4_mballoc_prealloc(ac);
c9de560d
AT
3119}
3120
b844167e
CW
3121/*
3122 * Called on failure; free up any blocks from the inode PA for this
3123 * context. We don't need this for MB_GROUP_PA because we only change
3124 * pa_free in ext4_mb_release_context(), but on failure, we've already
3125 * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed.
3126 */
3127static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
3128{
3129 struct ext4_prealloc_space *pa = ac->ac_pa;
3130 int len;
3131
3132 if (pa && pa->pa_type == MB_INODE_PA) {
3133 len = ac->ac_b_ex.fe_len;
3134 pa->pa_free += len;
3135 }
3136
3137}
3138
c9de560d
AT
3139/*
3140 * use blocks preallocated to inode
3141 */
3142static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
3143 struct ext4_prealloc_space *pa)
3144{
3145 ext4_fsblk_t start;
3146 ext4_fsblk_t end;
3147 int len;
3148
3149 /* found preallocated blocks, use them */
3150 start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
3151 end = min(pa->pa_pstart + pa->pa_len, start + ac->ac_o_ex.fe_len);
3152 len = end - start;
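	/*
	 * Illustrative example: a PA with pa_lstart == 50, pa_pstart == 1000
	 * and pa_len == 32 serving a request for 8 blocks at logical block 60
	 * gives start = 1000 + (60 - 50) = 1010, end = min(1032, 1018) = 1018,
	 * so len = 8 blocks are taken from the preallocation.
	 */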
3153 ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group,
3154 &ac->ac_b_ex.fe_start);
3155 ac->ac_b_ex.fe_len = len;
3156 ac->ac_status = AC_STATUS_FOUND;
3157 ac->ac_pa = pa;
3158
3159 BUG_ON(start < pa->pa_pstart);
3160 BUG_ON(start + len > pa->pa_pstart + pa->pa_len);
3161 BUG_ON(pa->pa_free < len);
3162 pa->pa_free -= len;
3163
6ba495e9 3164 mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa);
c9de560d
AT
3165}
3166
3167/*
3168 * use blocks preallocated to locality group
3169 */
3170static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
3171 struct ext4_prealloc_space *pa)
3172{
03cddb80 3173 unsigned int len = ac->ac_o_ex.fe_len;
6be2ded1 3174
c9de560d
AT
3175 ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
3176 &ac->ac_b_ex.fe_group,
3177 &ac->ac_b_ex.fe_start);
3178 ac->ac_b_ex.fe_len = len;
3179 ac->ac_status = AC_STATUS_FOUND;
3180 ac->ac_pa = pa;
3181
 3182	/* we don't correct pa_pstart or pa_len here to avoid
26346ff6 3183	 * a possible race when the group is being loaded concurrently;
c9de560d 3184	 * instead we correct the pa later, after blocks are marked
26346ff6
AK
 3185	 * in the on-disk bitmap -- see ext4_mb_release_context()
 3186	 * Other CPUs are prevented from allocating from this pa by lg_mutex
c9de560d 3187 */
6ba495e9 3188 mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa);
c9de560d
AT
3189}
3190
5e745b04
AK
3191/*
3192 * Return the prealloc space that have minimal distance
3193 * from the goal block. @cpa is the prealloc
3194 * space that is having currently known minimal distance
3195 * from the goal block.
3196 */
3197static struct ext4_prealloc_space *
3198ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
3199 struct ext4_prealloc_space *pa,
3200 struct ext4_prealloc_space *cpa)
3201{
3202 ext4_fsblk_t cur_distance, new_distance;
3203
3204 if (cpa == NULL) {
3205 atomic_inc(&pa->pa_count);
3206 return pa;
3207 }
3208 cur_distance = abs(goal_block - cpa->pa_pstart);
3209 new_distance = abs(goal_block - pa->pa_pstart);
3210
3211 if (cur_distance < new_distance)
3212 return cpa;
3213
3214 /* drop the previous reference */
3215 atomic_dec(&cpa->pa_count);
3216 atomic_inc(&pa->pa_count);
3217 return pa;
3218}
3219
c9de560d
AT
3220/*
3221 * search goal blocks in preallocated space
3222 */
4ddfef7b
ES
3223static noinline_for_stack int
3224ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
c9de560d 3225{
6be2ded1 3226 int order, i;
c9de560d
AT
3227 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
3228 struct ext4_locality_group *lg;
5e745b04
AK
3229 struct ext4_prealloc_space *pa, *cpa = NULL;
3230 ext4_fsblk_t goal_block;
c9de560d
AT
3231
3232 /* only data can be preallocated */
3233 if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
3234 return 0;
3235
3236 /* first, try per-file preallocation */
3237 rcu_read_lock();
9a0762c5 3238 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
c9de560d
AT
3239
 3240		/* none of the fields in this condition change,
 3241		 * so we can skip locking for them */
3242 if (ac->ac_o_ex.fe_logical < pa->pa_lstart ||
3243 ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len)
3244 continue;
3245
fb0a387d 3246 /* non-extent files can't have physical blocks past 2^32 */
12e9b892 3247 if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
fb0a387d
ES
3248 pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS)
3249 continue;
3250
c9de560d
AT
3251 /* found preallocated blocks, use them */
3252 spin_lock(&pa->pa_lock);
3253 if (pa->pa_deleted == 0 && pa->pa_free) {
3254 atomic_inc(&pa->pa_count);
3255 ext4_mb_use_inode_pa(ac, pa);
3256 spin_unlock(&pa->pa_lock);
3257 ac->ac_criteria = 10;
3258 rcu_read_unlock();
3259 return 1;
3260 }
3261 spin_unlock(&pa->pa_lock);
3262 }
3263 rcu_read_unlock();
3264
3265 /* can we use group allocation? */
3266 if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC))
3267 return 0;
3268
3269 /* inode may have no locality group for some reason */
3270 lg = ac->ac_lg;
3271 if (lg == NULL)
3272 return 0;
6be2ded1
AK
3273 order = fls(ac->ac_o_ex.fe_len) - 1;
3274 if (order > PREALLOC_TB_SIZE - 1)
3275 /* The max size of hash table is PREALLOC_TB_SIZE */
3276 order = PREALLOC_TB_SIZE - 1;
3277
bda00de7 3278 goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex);
5e745b04
AK
3279 /*
 3280	 * search for the prealloc space that has the
 3281	 * minimal distance from the goal block.
3282 */
6be2ded1
AK
3283 for (i = order; i < PREALLOC_TB_SIZE; i++) {
3284 rcu_read_lock();
3285 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
3286 pa_inode_list) {
3287 spin_lock(&pa->pa_lock);
3288 if (pa->pa_deleted == 0 &&
3289 pa->pa_free >= ac->ac_o_ex.fe_len) {
5e745b04
AK
3290
3291 cpa = ext4_mb_check_group_pa(goal_block,
3292 pa, cpa);
6be2ded1 3293 }
c9de560d 3294 spin_unlock(&pa->pa_lock);
c9de560d 3295 }
6be2ded1 3296 rcu_read_unlock();
c9de560d 3297 }
5e745b04
AK
3298 if (cpa) {
3299 ext4_mb_use_group_pa(ac, cpa);
3300 ac->ac_criteria = 20;
3301 return 1;
3302 }
c9de560d
AT
3303 return 0;
3304}
3305
7a2fcbf7
AK
3306/*
 3307 * the function goes through all blocks freed in the group
 3308 * but not yet committed and marks them used in the in-core bitmap.
 3309 * the buddy must be generated from this bitmap
955ce5f5 3310 * Need to be called with the ext4 group lock held
7a2fcbf7
AK
3311 */
3312static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
3313 ext4_group_t group)
3314{
3315 struct rb_node *n;
3316 struct ext4_group_info *grp;
3317 struct ext4_free_data *entry;
3318
3319 grp = ext4_get_group_info(sb, group);
3320 n = rb_first(&(grp->bb_free_root));
3321
3322 while (n) {
3323 entry = rb_entry(n, struct ext4_free_data, node);
955ce5f5 3324 mb_set_bits(bitmap, entry->start_blk, entry->count);
7a2fcbf7
AK
3325 n = rb_next(n);
3326 }
3327 return;
3328}
3329
c9de560d
AT
3330/*
 3331 * the function goes through all preallocations in this group and marks them
 3332 * used in the in-core bitmap. the buddy must be generated from this bitmap
955ce5f5 3333 * Need to be called with ext4 group lock held
c9de560d 3334 */
089ceecc
ES
3335static noinline_for_stack
3336void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
c9de560d
AT
3337 ext4_group_t group)
3338{
3339 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
3340 struct ext4_prealloc_space *pa;
3341 struct list_head *cur;
3342 ext4_group_t groupnr;
3343 ext4_grpblk_t start;
3344 int preallocated = 0;
3345 int count = 0;
3346 int len;
3347
 3348	/* all forms of preallocation discard first load the group,
 3349	 * so the only competing code is preallocation use.
 3350	 * we don't need any locking here.
 3351	 * note that we do NOT ignore preallocations with pa_deleted set,
 3352	 * otherwise we could leave used blocks available for
 3353	 * allocation in the buddy when a concurrent ext4_mb_put_pa()
 3354	 * is dropping the preallocation
3355 */
3356 list_for_each(cur, &grp->bb_prealloc_list) {
3357 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
3358 spin_lock(&pa->pa_lock);
3359 ext4_get_group_no_and_offset(sb, pa->pa_pstart,
3360 &groupnr, &start);
3361 len = pa->pa_len;
3362 spin_unlock(&pa->pa_lock);
3363 if (unlikely(len == 0))
3364 continue;
3365 BUG_ON(groupnr != group);
955ce5f5 3366 mb_set_bits(bitmap, start, len);
c9de560d
AT
3367 preallocated += len;
3368 count++;
3369 }
6ba495e9 3370	mb_debug(1, "preallocated %u for group %u\n", preallocated, group);
c9de560d
AT
3371}
3372
3373static void ext4_mb_pa_callback(struct rcu_head *head)
3374{
3375 struct ext4_prealloc_space *pa;
3376 pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
3377 kmem_cache_free(ext4_pspace_cachep, pa);
3378}
3379
3380/*
3381 * drops a reference to preallocated space descriptor
3382 * if this was the last reference and the space is consumed
3383 */
3384static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
3385 struct super_block *sb, struct ext4_prealloc_space *pa)
3386{
a9df9a49 3387 ext4_group_t grp;
d33a1976 3388 ext4_fsblk_t grp_blk;
c9de560d
AT
3389
3390 if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0)
3391 return;
3392
3393 /* in this short window concurrent discard can set pa_deleted */
3394 spin_lock(&pa->pa_lock);
3395 if (pa->pa_deleted == 1) {
3396 spin_unlock(&pa->pa_lock);
3397 return;
3398 }
3399
3400 pa->pa_deleted = 1;
3401 spin_unlock(&pa->pa_lock);
3402
d33a1976 3403 grp_blk = pa->pa_pstart;
60e6679e 3404 /*
cc0fb9ad
AK
3405 * If doing group-based preallocation, pa_pstart may be in the
3406 * next group when pa is used up
3407 */
3408 if (pa->pa_type == MB_GROUP_PA)
d33a1976
ES
3409 grp_blk--;
3410
3411 ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL);
c9de560d
AT
3412
3413 /*
3414 * possible race:
3415 *
3416 * P1 (buddy init) P2 (regular allocation)
3417 * find block B in PA
3418 * copy on-disk bitmap to buddy
3419 * mark B in on-disk bitmap
3420 * drop PA from group
3421 * mark all PAs in buddy
3422 *
3423 * thus, P1 initializes buddy with B available. to prevent this
3424 * we make "copy" and "mark all PAs" atomic and serialize "drop PA"
3425 * against that pair
3426 */
3427 ext4_lock_group(sb, grp);
3428 list_del(&pa->pa_group_list);
3429 ext4_unlock_group(sb, grp);
3430
3431 spin_lock(pa->pa_obj_lock);
3432 list_del_rcu(&pa->pa_inode_list);
3433 spin_unlock(pa->pa_obj_lock);
3434
3435 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
3436}
3437
3438/*
 3439 * creates a new preallocated space for the given inode
3440 */
4ddfef7b
ES
3441static noinline_for_stack int
3442ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
c9de560d
AT
3443{
3444 struct super_block *sb = ac->ac_sb;
3445 struct ext4_prealloc_space *pa;
3446 struct ext4_group_info *grp;
3447 struct ext4_inode_info *ei;
3448
 3449	/* preallocate only when the found space is larger than requested */
3450 BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
3451 BUG_ON(ac->ac_status != AC_STATUS_FOUND);
3452 BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
3453
3454 pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
3455 if (pa == NULL)
3456 return -ENOMEM;
3457
3458 if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) {
3459 int winl;
3460 int wins;
3461 int win;
3462 int offs;
3463
 3464		/* we can't allocate as much as the normalizer wants,
 3465		 * so the found space must get a proper lstart
 3466		 * to cover the original request */
3467 BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical);
3468 BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len);
3469
 3470		/* we're limited by the original request in that
 3471		 * the logical block must be covered in any case;
 3472		 * winl is the window within which we can move our chunk */
3473 winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical;
3474
3475 /* also, we should cover whole original request */
3476 wins = ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len;
3477
3478 /* the smallest one defines real window */
3479 win = min(winl, wins);
3480
3481 offs = ac->ac_o_ex.fe_logical % ac->ac_b_ex.fe_len;
3482 if (offs && offs < win)
3483 win = offs;
3484
3485 ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - win;
3486 BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
3487 BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
3488 }
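	/*
	 * Worked example for the window adjustment above (hypothetical
	 * numbers): with ac_o_ex.fe_logical = 100, ac_g_ex.fe_logical = 96,
	 * ac_b_ex.fe_len = 16 and ac_o_ex.fe_len = 8, winl = 4, wins = 8,
	 * so win = 4; offs = 100 % 16 = 4, which is not less than win, so
	 * win stays 4 and the preallocation gets fe_logical = 100 - 4 = 96,
	 * which still covers the original request.
	 */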
3489
3490 /* preallocation can change ac_b_ex, thus we store actually
3491 * allocated blocks for history */
3492 ac->ac_f_ex = ac->ac_b_ex;
3493
3494 pa->pa_lstart = ac->ac_b_ex.fe_logical;
3495 pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
3496 pa->pa_len = ac->ac_b_ex.fe_len;
3497 pa->pa_free = pa->pa_len;
3498 atomic_set(&pa->pa_count, 1);
3499 spin_lock_init(&pa->pa_lock);
d794bf8e
AK
3500 INIT_LIST_HEAD(&pa->pa_inode_list);
3501 INIT_LIST_HEAD(&pa->pa_group_list);
c9de560d 3502 pa->pa_deleted = 0;
cc0fb9ad 3503 pa->pa_type = MB_INODE_PA;
c9de560d 3504
6ba495e9 3505 mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa,
c9de560d 3506 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
9bffad1e 3507 trace_ext4_mb_new_inode_pa(ac, pa);
c9de560d
AT
3508
3509 ext4_mb_use_inode_pa(ac, pa);
3510 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
3511
3512 ei = EXT4_I(ac->ac_inode);
3513 grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
3514
3515 pa->pa_obj_lock = &ei->i_prealloc_lock;
3516 pa->pa_inode = ac->ac_inode;
3517
3518 ext4_lock_group(sb, ac->ac_b_ex.fe_group);
3519 list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
3520 ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
3521
3522 spin_lock(pa->pa_obj_lock);
3523 list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
3524 spin_unlock(pa->pa_obj_lock);
3525
3526 return 0;
3527}
3528
3529/*
3530 * creates new preallocated space for the locality group the inode belongs to
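 *
 * Unlike an inode PA, a group PA is physically addressed: pa_lstart is set
 * equal to pa_pstart, and the PA is linked into the per-CPU locality group
 * list only later, from ext4_mb_release_context(), once pa_free has been
 * updated.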
3531 */
4ddfef7b
ES
3532static noinline_for_stack int
3533ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
c9de560d
AT
3534{
3535 struct super_block *sb = ac->ac_sb;
3536 struct ext4_locality_group *lg;
3537 struct ext4_prealloc_space *pa;
3538 struct ext4_group_info *grp;
3539
3540	/* preallocate only when found space is larger than requested */
3541 BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
3542 BUG_ON(ac->ac_status != AC_STATUS_FOUND);
3543 BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
3544
3545 BUG_ON(ext4_pspace_cachep == NULL);
3546 pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
3547 if (pa == NULL)
3548 return -ENOMEM;
3549
3550 /* preallocation can change ac_b_ex, thus we store actually
3551 * allocated blocks for history */
3552 ac->ac_f_ex = ac->ac_b_ex;
3553
3554 pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
3555 pa->pa_lstart = pa->pa_pstart;
3556 pa->pa_len = ac->ac_b_ex.fe_len;
3557 pa->pa_free = pa->pa_len;
3558 atomic_set(&pa->pa_count, 1);
3559 spin_lock_init(&pa->pa_lock);
6be2ded1 3560 INIT_LIST_HEAD(&pa->pa_inode_list);
d794bf8e 3561 INIT_LIST_HEAD(&pa->pa_group_list);
c9de560d 3562 pa->pa_deleted = 0;
cc0fb9ad 3563 pa->pa_type = MB_GROUP_PA;
c9de560d 3564
6ba495e9 3565 mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa,
9bffad1e
TT
3566 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3567 trace_ext4_mb_new_group_pa(ac, pa);
c9de560d
AT
3568
3569 ext4_mb_use_group_pa(ac, pa);
3570 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
3571
3572 grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
3573 lg = ac->ac_lg;
3574 BUG_ON(lg == NULL);
3575
3576 pa->pa_obj_lock = &lg->lg_prealloc_lock;
3577 pa->pa_inode = NULL;
3578
3579 ext4_lock_group(sb, ac->ac_b_ex.fe_group);
3580 list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
3581 ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
3582
6be2ded1
AK
3583 /*
3584 * We will later add the new pa to the right bucket
3585 * after updating the pa_free in ext4_mb_release_context
3586 */
c9de560d
AT
3587 return 0;
3588}
3589
3590static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
3591{
3592 int err;
3593
3594 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
3595 err = ext4_mb_new_group_pa(ac);
3596 else
3597 err = ext4_mb_new_inode_pa(ac);
3598 return err;
3599}
3600
3601/*
3602 * finds all unused blocks in on-disk bitmap, frees them in
3603 * in-core bitmap and buddy.
3604 * @pa must be unlinked from inode and group lists, so that
3605 * nobody else can find/use it.
3606 * the caller MUST hold group/inode locks.
3607 * TODO: optimize the case when there are no in-core structures yet
3608 */
4ddfef7b
ES
3609static noinline_for_stack int
3610ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3e1e5f50 3611 struct ext4_prealloc_space *pa)
c9de560d 3612{
c9de560d
AT
3613 struct super_block *sb = e4b->bd_sb;
3614 struct ext4_sb_info *sbi = EXT4_SB(sb);
498e5f24
TT
3615 unsigned int end;
3616 unsigned int next;
c9de560d
AT
3617 ext4_group_t group;
3618 ext4_grpblk_t bit;
ba80b101 3619 unsigned long long grp_blk_start;
c9de560d
AT
3620 int err = 0;
3621 int free = 0;
3622
3623 BUG_ON(pa->pa_deleted == 0);
3624 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
ba80b101 3625 grp_blk_start = pa->pa_pstart - bit;
c9de560d
AT
3626 BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
3627 end = bit + pa->pa_len;
3628
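	/*
	 * Scan the on-disk bitmap over [bit, end): every run of clear bits
	 * is preallocated space that was never used, so return it to the
	 * in-core bitmap and buddy via mb_free_blocks() and account it in
	 * "free".
	 */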
c9de560d 3629 while (bit < end) {
ffad0a44 3630 bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit);
c9de560d
AT
3631 if (bit >= end)
3632 break;
ffad0a44 3633 next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
6ba495e9 3634 mb_debug(1, " free preallocated %u/%u in group %u\n",
5a0790c2
AK
3635 (unsigned) ext4_group_first_block_no(sb, group) + bit,
3636 (unsigned) next - bit, (unsigned) group);
c9de560d
AT
3637 free += next - bit;
3638
3e1e5f50
ES
3639 trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
3640 trace_ext4_mb_release_inode_pa(sb, pa->pa_inode, pa,
3641 grp_blk_start + bit, next - bit);
c9de560d
AT
3642 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
3643 bit = next + 1;
3644 }
3645 if (free != pa->pa_free) {
26346ff6 3646		printk(KERN_CRIT "pa %p: logical %lu, physical %lu, len %lu\n",
c9de560d
AT
3647 pa, (unsigned long) pa->pa_lstart,
3648 (unsigned long) pa->pa_pstart,
3649 (unsigned long) pa->pa_len);
e29136f8 3650 ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u",
5d1b1b3f 3651 free, pa->pa_free);
e56eb659
AK
3652 /*
3653 * pa is already deleted so we use the value obtained
3654 * from the bitmap and continue.
3655 */
c9de560d 3656 }
c9de560d
AT
3657 atomic_add(free, &sbi->s_mb_discarded);
3658
3659 return err;
3660}
3661
4ddfef7b
ES
3662static noinline_for_stack int
3663ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3e1e5f50 3664 struct ext4_prealloc_space *pa)
c9de560d 3665{
c9de560d
AT
3666 struct super_block *sb = e4b->bd_sb;
3667 ext4_group_t group;
3668 ext4_grpblk_t bit;
3669
3e1e5f50 3670 trace_ext4_mb_release_group_pa(sb, pa);
c9de560d
AT
3671 BUG_ON(pa->pa_deleted == 0);
3672 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3673 BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
3674 mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
3675 atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
3e1e5f50 3676 trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
c9de560d
AT
3677
3678 return 0;
3679}
3680
3681/*
3682 * releases all preallocations in given group
3683 *
3684 * first, we need to decide discard policy:
3685 * - when do we discard
3686 * 1) ENOSPC
3687 * - how many do we discard
3688 * 1) how many requested
3689 */
4ddfef7b
ES
3690static noinline_for_stack int
3691ext4_mb_discard_group_preallocations(struct super_block *sb,
c9de560d
AT
3692 ext4_group_t group, int needed)
3693{
3694 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
3695 struct buffer_head *bitmap_bh = NULL;
3696 struct ext4_prealloc_space *pa, *tmp;
3697 struct list_head list;
3698 struct ext4_buddy e4b;
3699 int err;
3700 int busy = 0;
3701 int free = 0;
3702
6ba495e9 3703 mb_debug(1, "discard preallocation for group %u\n", group);
c9de560d
AT
3704
3705 if (list_empty(&grp->bb_prealloc_list))
3706 return 0;
3707
574ca174 3708 bitmap_bh = ext4_read_block_bitmap(sb, group);
c9de560d 3709 if (bitmap_bh == NULL) {
12062ddd 3710 ext4_error(sb, "Error reading block bitmap for %u", group);
ce89f46c 3711 return 0;
c9de560d
AT
3712 }
3713
3714 err = ext4_mb_load_buddy(sb, group, &e4b);
ce89f46c 3715 if (err) {
12062ddd 3716 ext4_error(sb, "Error loading buddy information for %u", group);
ce89f46c
AK
3717 put_bh(bitmap_bh);
3718 return 0;
3719 }
c9de560d
AT
3720
3721 if (needed == 0)
3722 needed = EXT4_BLOCKS_PER_GROUP(sb) + 1;
3723
c9de560d 3724 INIT_LIST_HEAD(&list);
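	/*
	 * Two passes under the group lock: first unlink every PA that is
	 * neither in use (pa_count != 0) nor already deleted and collect it
	 * on a local list, then release each collected PA back to the buddy
	 * and free it via RCU.
	 */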
c9de560d
AT
3725repeat:
3726 ext4_lock_group(sb, group);
3727 list_for_each_entry_safe(pa, tmp,
3728 &grp->bb_prealloc_list, pa_group_list) {
3729 spin_lock(&pa->pa_lock);
3730 if (atomic_read(&pa->pa_count)) {
3731 spin_unlock(&pa->pa_lock);
3732 busy = 1;
3733 continue;
3734 }
3735 if (pa->pa_deleted) {
3736 spin_unlock(&pa->pa_lock);
3737 continue;
3738 }
3739
3740 /* seems this one can be freed ... */
3741 pa->pa_deleted = 1;
3742
3743 /* we can trust pa_free ... */
3744 free += pa->pa_free;
3745
3746 spin_unlock(&pa->pa_lock);
3747
3748 list_del(&pa->pa_group_list);
3749 list_add(&pa->u.pa_tmp_list, &list);
3750 }
3751
3752 /* if we still need more blocks and some PAs were used, try again */
3753 if (free < needed && busy) {
3754 busy = 0;
3755 ext4_unlock_group(sb, group);
3756 /*
3757		 * Yield the CPU here so that we don't get a soft lockup
3758		 * in the non-preempt case.
3759 */
3760 yield();
3761 goto repeat;
3762 }
3763
3764 /* found anything to free? */
3765 if (list_empty(&list)) {
3766 BUG_ON(free != 0);
3767 goto out;
3768 }
3769
3770 /* now free all selected PAs */
3771 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
3772
3773 /* remove from object (inode or locality group) */
3774 spin_lock(pa->pa_obj_lock);
3775 list_del_rcu(&pa->pa_inode_list);
3776 spin_unlock(pa->pa_obj_lock);
3777
cc0fb9ad 3778 if (pa->pa_type == MB_GROUP_PA)
3e1e5f50 3779 ext4_mb_release_group_pa(&e4b, pa);
c9de560d 3780 else
3e1e5f50 3781 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
c9de560d
AT
3782
3783 list_del(&pa->u.pa_tmp_list);
3784 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
3785 }
3786
3787out:
3788 ext4_unlock_group(sb, group);
e39e07fd 3789 ext4_mb_unload_buddy(&e4b);
c9de560d
AT
3790 put_bh(bitmap_bh);
3791 return free;
3792}
3793
3794/*
3795 * releases all non-used preallocated blocks for given inode
3796 *
3797 * It's important to discard preallocations under i_data_sem
3798 * We don't want another block to be served from the prealloc
3799 * space when we are discarding the inode prealloc space.
3800 *
3801 * FIXME!! Make sure it is valid at all the call sites
3802 */
c2ea3fde 3803void ext4_discard_preallocations(struct inode *inode)
c9de560d
AT
3804{
3805 struct ext4_inode_info *ei = EXT4_I(inode);
3806 struct super_block *sb = inode->i_sb;
3807 struct buffer_head *bitmap_bh = NULL;
3808 struct ext4_prealloc_space *pa, *tmp;
3809 ext4_group_t group = 0;
3810 struct list_head list;
3811 struct ext4_buddy e4b;
3812 int err;
3813
c2ea3fde 3814 if (!S_ISREG(inode->i_mode)) {
c9de560d
AT
3815 /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/
3816 return;
3817 }
3818
6ba495e9 3819 mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino);
9bffad1e 3820 trace_ext4_discard_preallocations(inode);
c9de560d
AT
3821
3822 INIT_LIST_HEAD(&list);
3823
3824repeat:
3825 /* first, collect all pa's in the inode */
3826 spin_lock(&ei->i_prealloc_lock);
3827 while (!list_empty(&ei->i_prealloc_list)) {
3828 pa = list_entry(ei->i_prealloc_list.next,
3829 struct ext4_prealloc_space, pa_inode_list);
3830 BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
3831 spin_lock(&pa->pa_lock);
3832 if (atomic_read(&pa->pa_count)) {
3833 /* this shouldn't happen often - nobody should
3834 * use preallocation while we're discarding it */
3835 spin_unlock(&pa->pa_lock);
3836 spin_unlock(&ei->i_prealloc_lock);
3837 printk(KERN_ERR "uh-oh! used pa while discarding\n");
3838 WARN_ON(1);
3839 schedule_timeout_uninterruptible(HZ);
3840 goto repeat;
3841
3842 }
3843 if (pa->pa_deleted == 0) {
3844 pa->pa_deleted = 1;
3845 spin_unlock(&pa->pa_lock);
3846 list_del_rcu(&pa->pa_inode_list);
3847 list_add(&pa->u.pa_tmp_list, &list);
3848 continue;
3849 }
3850
3851 /* someone is deleting pa right now */
3852 spin_unlock(&pa->pa_lock);
3853 spin_unlock(&ei->i_prealloc_lock);
3854
3855 /* we have to wait here because pa_deleted
3856 * doesn't mean pa is already unlinked from
3857 * the list. as we might be called from
3858 * ->clear_inode() the inode will get freed
3859 * and concurrent thread which is unlinking
3860 * pa from inode's list may access already
3861 * freed memory, bad-bad-bad */
3862
3863 /* XXX: if this happens too often, we can
3864 * add a flag to force wait only in case
3865 * of ->clear_inode(), but not in case of
3866 * regular truncate */
3867 schedule_timeout_uninterruptible(HZ);
3868 goto repeat;
3869 }
3870 spin_unlock(&ei->i_prealloc_lock);
3871
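	/*
	 * Second pass: for each collected PA, load the group's buddy and
	 * block bitmap, unlink the PA from the group list under the group
	 * lock, return its unused blocks, and finally free the PA via RCU.
	 */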
3872 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
cc0fb9ad 3873 BUG_ON(pa->pa_type != MB_INODE_PA);
c9de560d
AT
3874 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
3875
3876 err = ext4_mb_load_buddy(sb, group, &e4b);
ce89f46c 3877 if (err) {
12062ddd
ES
3878 ext4_error(sb, "Error loading buddy information for %u",
3879 group);
ce89f46c
AK
3880 continue;
3881 }
c9de560d 3882
574ca174 3883 bitmap_bh = ext4_read_block_bitmap(sb, group);
c9de560d 3884 if (bitmap_bh == NULL) {
12062ddd
ES
3885 ext4_error(sb, "Error reading block bitmap for %u",
3886 group);
e39e07fd 3887 ext4_mb_unload_buddy(&e4b);
ce89f46c 3888 continue;
c9de560d
AT
3889 }
3890
3891 ext4_lock_group(sb, group);
3892 list_del(&pa->pa_group_list);
3e1e5f50 3893 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
c9de560d
AT
3894 ext4_unlock_group(sb, group);
3895
e39e07fd 3896 ext4_mb_unload_buddy(&e4b);
c9de560d
AT
3897 put_bh(bitmap_bh);
3898
3899 list_del(&pa->u.pa_tmp_list);
3900 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
3901 }
3902}
3903
6ba495e9 3904#ifdef CONFIG_EXT4_DEBUG
c9de560d
AT
3905static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
3906{
3907 struct super_block *sb = ac->ac_sb;
8df9675f 3908 ext4_group_t ngroups, i;
c9de560d 3909
e3570639
ES
3910 if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
3911 return;
3912
c9de560d
AT
3913 printk(KERN_ERR "EXT4-fs: Can't allocate:"
3914 " Allocation context details:\n");
3915 printk(KERN_ERR "EXT4-fs: status %d flags %d\n",
3916 ac->ac_status, ac->ac_flags);
3917 printk(KERN_ERR "EXT4-fs: orig %lu/%lu/%lu@%lu, goal %lu/%lu/%lu@%lu, "
3918 "best %lu/%lu/%lu@%lu cr %d\n",
3919 (unsigned long)ac->ac_o_ex.fe_group,
3920 (unsigned long)ac->ac_o_ex.fe_start,
3921 (unsigned long)ac->ac_o_ex.fe_len,
3922 (unsigned long)ac->ac_o_ex.fe_logical,
3923 (unsigned long)ac->ac_g_ex.fe_group,
3924 (unsigned long)ac->ac_g_ex.fe_start,
3925 (unsigned long)ac->ac_g_ex.fe_len,
3926 (unsigned long)ac->ac_g_ex.fe_logical,
3927 (unsigned long)ac->ac_b_ex.fe_group,
3928 (unsigned long)ac->ac_b_ex.fe_start,
3929 (unsigned long)ac->ac_b_ex.fe_len,
3930 (unsigned long)ac->ac_b_ex.fe_logical,
3931 (int)ac->ac_criteria);
3932 printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned,
3933 ac->ac_found);
3934 printk(KERN_ERR "EXT4-fs: groups: \n");
8df9675f
TT
3935 ngroups = ext4_get_groups_count(sb);
3936 for (i = 0; i < ngroups; i++) {
c9de560d
AT
3937 struct ext4_group_info *grp = ext4_get_group_info(sb, i);
3938 struct ext4_prealloc_space *pa;
3939 ext4_grpblk_t start;
3940 struct list_head *cur;
3941 ext4_lock_group(sb, i);
3942 list_for_each(cur, &grp->bb_prealloc_list) {
3943 pa = list_entry(cur, struct ext4_prealloc_space,
3944 pa_group_list);
3945 spin_lock(&pa->pa_lock);
3946 ext4_get_group_no_and_offset(sb, pa->pa_pstart,
3947 NULL, &start);
3948 spin_unlock(&pa->pa_lock);
1c718505
AF
3949 printk(KERN_ERR "PA:%u:%d:%u \n", i,
3950 start, pa->pa_len);
c9de560d 3951 }
60bd63d1 3952 ext4_unlock_group(sb, i);
c9de560d
AT
3953
3954 if (grp->bb_free == 0)
3955 continue;
1c718505 3956 printk(KERN_ERR "%u: %d/%d \n",
c9de560d
AT
3957 i, grp->bb_free, grp->bb_fragments);
3958 }
3959 printk(KERN_ERR "\n");
3960}
3961#else
3962static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
3963{
3964 return;
3965}
3966#endif
3967
3968/*
3969 * We use locality group preallocation for small size file. The size of the
3970 * file is determined by the current size or the resulting size after
3971 * allocation which ever is larger
3972 *
b713a5ec 3973 * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req
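 *
 * For example (assuming the default mb_stream_req of 16 blocks): a write
 * that would leave the file at 12 blocks keeps using the per-CPU locality
 * group, while one that grows it beyond 16 blocks switches to stream
 * (inode-based) allocation.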
c9de560d
AT
3974 */
3975static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
3976{
3977 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3978 int bsbits = ac->ac_sb->s_blocksize_bits;
3979 loff_t size, isize;
3980
3981 if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
3982 return;
3983
4ba74d00
TT
3984 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
3985 return;
3986
c9de560d 3987 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
50797481
TT
3988 isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
3989 >> bsbits;
c9de560d 3990
50797481
TT
3991 if ((size == isize) &&
3992 !ext4_fs_is_busy(sbi) &&
3993 (atomic_read(&ac->ac_inode->i_writecount) == 0)) {
3994 ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
3995 return;
3996 }
3997
c9de560d 3998 /* don't use group allocation for large files */
71780577 3999 size = max(size, isize);
cc483f10 4000 if (size > sbi->s_mb_stream_request) {
4ba74d00 4001 ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
c9de560d 4002 return;
4ba74d00 4003 }
c9de560d
AT
4004
4005 BUG_ON(ac->ac_lg != NULL);
4006 /*
4007 * locality group prealloc space are per cpu. The reason for having
4008 * per cpu locality group is to reduce the contention between block
4009 * request from multiple CPUs.
4010 */
ca0c9584 4011 ac->ac_lg = __this_cpu_ptr(sbi->s_locality_groups);
c9de560d
AT
4012
4013 /* we're going to use group allocation */
4014 ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
4015
4016 /* serialize all allocations in the group */
4017 mutex_lock(&ac->ac_lg->lg_mutex);
4018}
4019
4ddfef7b
ES
4020static noinline_for_stack int
4021ext4_mb_initialize_context(struct ext4_allocation_context *ac,
c9de560d
AT
4022 struct ext4_allocation_request *ar)
4023{
4024 struct super_block *sb = ar->inode->i_sb;
4025 struct ext4_sb_info *sbi = EXT4_SB(sb);
4026 struct ext4_super_block *es = sbi->s_es;
4027 ext4_group_t group;
498e5f24
TT
4028 unsigned int len;
4029 ext4_fsblk_t goal;
c9de560d
AT
4030 ext4_grpblk_t block;
4031
4032 /* we can't allocate > group size */
4033 len = ar->len;
4034
4035 /* just a dirty hack to filter too big requests */
4036 if (len >= EXT4_BLOCKS_PER_GROUP(sb) - 10)
4037 len = EXT4_BLOCKS_PER_GROUP(sb) - 10;
4038
4039 /* start searching from the goal */
4040 goal = ar->goal;
4041 if (goal < le32_to_cpu(es->s_first_data_block) ||
4042 goal >= ext4_blocks_count(es))
4043 goal = le32_to_cpu(es->s_first_data_block);
4044 ext4_get_group_no_and_offset(sb, goal, &group, &block);
4045
4046 /* set up allocation goals */
833576b3 4047 memset(ac, 0, sizeof(struct ext4_allocation_context));
c9de560d 4048 ac->ac_b_ex.fe_logical = ar->logical;
c9de560d 4049 ac->ac_status = AC_STATUS_CONTINUE;
c9de560d
AT
4050 ac->ac_sb = sb;
4051 ac->ac_inode = ar->inode;
4052 ac->ac_o_ex.fe_logical = ar->logical;
4053 ac->ac_o_ex.fe_group = group;
4054 ac->ac_o_ex.fe_start = block;
4055 ac->ac_o_ex.fe_len = len;
4056 ac->ac_g_ex.fe_logical = ar->logical;
4057 ac->ac_g_ex.fe_group = group;
4058 ac->ac_g_ex.fe_start = block;
4059 ac->ac_g_ex.fe_len = len;
c9de560d 4060 ac->ac_flags = ar->flags;
c9de560d
AT
4061
4062	/* we have to define context: will we work with a file or
4063 * locality group. this is a policy, actually */
4064 ext4_mb_group_or_file(ac);
4065
6ba495e9 4066 mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, "
c9de560d
AT
4067 "left: %u/%u, right %u/%u to %swritable\n",
4068 (unsigned) ar->len, (unsigned) ar->logical,
4069 (unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
4070 (unsigned) ar->lleft, (unsigned) ar->pleft,
4071 (unsigned) ar->lright, (unsigned) ar->pright,
4072 atomic_read(&ar->inode->i_writecount) ? "" : "non-");
4073 return 0;
4074
4075}
4076
6be2ded1
AK
4077static noinline_for_stack void
4078ext4_mb_discard_lg_preallocations(struct super_block *sb,
4079 struct ext4_locality_group *lg,
4080 int order, int total_entries)
4081{
4082 ext4_group_t group = 0;
4083 struct ext4_buddy e4b;
4084 struct list_head discard_list;
4085 struct ext4_prealloc_space *pa, *tmp;
6be2ded1 4086
6ba495e9 4087 mb_debug(1, "discard locality group preallocation\n");
6be2ded1
AK
4088
4089 INIT_LIST_HEAD(&discard_list);
6be2ded1
AK
4090
4091 spin_lock(&lg->lg_prealloc_lock);
4092 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
4093 pa_inode_list) {
4094 spin_lock(&pa->pa_lock);
4095 if (atomic_read(&pa->pa_count)) {
4096 /*
4097 * This is the pa that we just used
4098 * for block allocation. So don't
4099 * free that
4100 */
4101 spin_unlock(&pa->pa_lock);
4102 continue;
4103 }
4104 if (pa->pa_deleted) {
4105 spin_unlock(&pa->pa_lock);
4106 continue;
4107 }
4108 /* only lg prealloc space */
cc0fb9ad 4109 BUG_ON(pa->pa_type != MB_GROUP_PA);
6be2ded1
AK
4110
4111 /* seems this one can be freed ... */
4112 pa->pa_deleted = 1;
4113 spin_unlock(&pa->pa_lock);
4114
4115 list_del_rcu(&pa->pa_inode_list);
4116 list_add(&pa->u.pa_tmp_list, &discard_list);
4117
4118 total_entries--;
4119 if (total_entries <= 5) {
4120 /*
4121 * we want to keep only 5 entries
4122 * allowing it to grow to 8. This
4123			 * makes sure we don't call discard
4124 * soon for this list.
4125 */
4126 break;
4127 }
4128 }
4129 spin_unlock(&lg->lg_prealloc_lock);
4130
4131 list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
4132
4133 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
4134 if (ext4_mb_load_buddy(sb, group, &e4b)) {
12062ddd
ES
4135 ext4_error(sb, "Error loading buddy information for %u",
4136 group);
6be2ded1
AK
4137 continue;
4138 }
4139 ext4_lock_group(sb, group);
4140 list_del(&pa->pa_group_list);
3e1e5f50 4141 ext4_mb_release_group_pa(&e4b, pa);
6be2ded1
AK
4142 ext4_unlock_group(sb, group);
4143
e39e07fd 4144 ext4_mb_unload_buddy(&e4b);
6be2ded1
AK
4145 list_del(&pa->u.pa_tmp_list);
4146 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
4147 }
6be2ded1
AK
4148}
4149
4150/*
4151 * We have incremented pa_count. So it cannot be freed at this
4152 * point. Also we hold lg_mutex. So no parallel allocation is
4153 * possible from this lg. That means pa_free cannot be updated.
4154 *
4155 * A parallel ext4_mb_discard_group_preallocations is possible,
4156 * which can cause the lg_prealloc_list to be updated.
4157 */
4158
4159static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
4160{
4161 int order, added = 0, lg_prealloc_count = 1;
4162 struct super_block *sb = ac->ac_sb;
4163 struct ext4_locality_group *lg = ac->ac_lg;
4164 struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa;
4165
4166 order = fls(pa->pa_free) - 1;
4167 if (order > PREALLOC_TB_SIZE - 1)
4168 /* The max size of hash table is PREALLOC_TB_SIZE */
4169 order = PREALLOC_TB_SIZE - 1;
4170 /* Add the prealloc space to lg */
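	/*
	 * Walk the per-order RCU list, counting its entries, and insert the
	 * new PA just before the first entry with a larger pa_free so the
	 * list stays ordered by pa_free; if no such entry exists, append it
	 * at the tail.
	 */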
4171 rcu_read_lock();
4172 list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
4173 pa_inode_list) {
4174 spin_lock(&tmp_pa->pa_lock);
4175 if (tmp_pa->pa_deleted) {
e7c9e3e9 4176 spin_unlock(&tmp_pa->pa_lock);
6be2ded1
AK
4177 continue;
4178 }
4179 if (!added && pa->pa_free < tmp_pa->pa_free) {
4180 /* Add to the tail of the previous entry */
4181 list_add_tail_rcu(&pa->pa_inode_list,
4182 &tmp_pa->pa_inode_list);
4183 added = 1;
4184 /*
4185 * we want to count the total
4186 * number of entries in the list
4187 */
4188 }
4189 spin_unlock(&tmp_pa->pa_lock);
4190 lg_prealloc_count++;
4191 }
4192 if (!added)
4193 list_add_tail_rcu(&pa->pa_inode_list,
4194 &lg->lg_prealloc_list[order]);
4195 rcu_read_unlock();
4196
4197 /* Now trim the list to be not more than 8 elements */
4198 if (lg_prealloc_count > 8) {
4199 ext4_mb_discard_lg_preallocations(sb, lg,
4200 order, lg_prealloc_count);
4201 return;
4202 }
4203	return;
4204}
4205
c9de560d
AT
4206/*
4207 * release all resource we used in allocation
4208 */
4209static int ext4_mb_release_context(struct ext4_allocation_context *ac)
4210{
6be2ded1
AK
4211 struct ext4_prealloc_space *pa = ac->ac_pa;
4212 if (pa) {
cc0fb9ad 4213 if (pa->pa_type == MB_GROUP_PA) {
c9de560d 4214 /* see comment in ext4_mb_use_group_pa() */
6be2ded1
AK
4215 spin_lock(&pa->pa_lock);
4216 pa->pa_pstart += ac->ac_b_ex.fe_len;
4217 pa->pa_lstart += ac->ac_b_ex.fe_len;
4218 pa->pa_free -= ac->ac_b_ex.fe_len;
4219 pa->pa_len -= ac->ac_b_ex.fe_len;
4220 spin_unlock(&pa->pa_lock);
c9de560d 4221 }
c9de560d 4222 }
8556e8f3
AK
4223 if (ac->alloc_semp)
4224 up_read(ac->alloc_semp);
ba443916
AK
4225 if (pa) {
4226 /*
4227 * We want to add the pa to the right bucket.
4228 * Remove it from the list and while adding
4229 * make sure the list to which we are adding
4230 * doesn't grow big. We need to release
4231 * alloc_semp before calling ext4_mb_add_n_trim()
4232 */
cc0fb9ad 4233 if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
ba443916
AK
4234 spin_lock(pa->pa_obj_lock);
4235 list_del_rcu(&pa->pa_inode_list);
4236 spin_unlock(pa->pa_obj_lock);
4237 ext4_mb_add_n_trim(ac);
4238 }
4239 ext4_mb_put_pa(ac, ac->ac_sb, pa);
4240 }
c9de560d
AT
4241 if (ac->ac_bitmap_page)
4242 page_cache_release(ac->ac_bitmap_page);
4243 if (ac->ac_buddy_page)
4244 page_cache_release(ac->ac_buddy_page);
4245 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
4246 mutex_unlock(&ac->ac_lg->lg_mutex);
4247 ext4_mb_collect_stats(ac);
4248 return 0;
4249}
4250
4251static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
4252{
8df9675f 4253 ext4_group_t i, ngroups = ext4_get_groups_count(sb);
c9de560d
AT
4254 int ret;
4255 int freed = 0;
4256
9bffad1e 4257 trace_ext4_mb_discard_preallocations(sb, needed);
8df9675f 4258 for (i = 0; i < ngroups && needed > 0; i++) {
c9de560d
AT
4259 ret = ext4_mb_discard_group_preallocations(sb, i, needed);
4260 freed += ret;
4261 needed -= ret;
4262 }
4263
4264 return freed;
4265}
4266
4267/*
4268 * Main entry point into mballoc to allocate blocks
4269 * it tries to use preallocation first, then falls back
4270 * to usual allocation
4271 */
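/*
 * Rough flow: reserve free space and quota up front (unless delayed
 * allocation already did), build the allocation context, try existing
 * preallocations, otherwise normalize the request and run the regular
 * allocator; on success mark the blocks used on disk, on ENOSPC discard
 * other groups' preallocations and retry.
 */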
4272 ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
6c7a120a 4273 struct ext4_allocation_request *ar, int *errp)
c9de560d 4274{
6bc6e63f 4275 int freed;
256bdb49 4276 struct ext4_allocation_context *ac = NULL;
c9de560d
AT
4277 struct ext4_sb_info *sbi;
4278 struct super_block *sb;
4279 ext4_fsblk_t block = 0;
60e58e0f 4280 unsigned int inquota = 0;
498e5f24 4281 unsigned int reserv_blks = 0;
c9de560d
AT
4282
4283 sb = ar->inode->i_sb;
4284 sbi = EXT4_SB(sb);
4285
9bffad1e 4286 trace_ext4_request_blocks(ar);
ba80b101 4287
60e58e0f
MC
4288 /*
4289 * For delayed allocation, we could skip the ENOSPC and
4290 * EDQUOT check, as blocks and quotas have been already
4291 * reserved when data being copied into pagecache.
4292 */
f2321097 4293 if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED))
60e58e0f
MC
4294 ar->flags |= EXT4_MB_DELALLOC_RESERVED;
4295 else {
4296 /* Without delayed allocation we need to verify
4297 * there is enough free blocks to do block allocation
4298 * and verify allocation doesn't exceed the quota limits.
d2a17637 4299 */
030ba6bc
AK
4300 while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) {
4301 /* let others to free the space */
4302 yield();
4303 ar->len = ar->len >> 1;
4304 }
4305 if (!ar->len) {
a30d542a
AK
4306 *errp = -ENOSPC;
4307 return 0;
4308 }
6bc6e63f 4309 reserv_blks = ar->len;
5dd4056d 4310 while (ar->len && dquot_alloc_block(ar->inode, ar->len)) {
60e58e0f
MC
4311 ar->flags |= EXT4_MB_HINT_NOPREALLOC;
4312 ar->len--;
4313 }
4314 inquota = ar->len;
4315 if (ar->len == 0) {
4316 *errp = -EDQUOT;
6c7a120a 4317 goto out;
60e58e0f 4318 }
07031431 4319 }
d2a17637 4320
256bdb49 4321 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
833576b3 4322 if (!ac) {
363d4251 4323 ar->len = 0;
256bdb49 4324 *errp = -ENOMEM;
6c7a120a 4325 goto out;
256bdb49
ES
4326 }
4327
256bdb49 4328 *errp = ext4_mb_initialize_context(ac, ar);
c9de560d
AT
4329 if (*errp) {
4330 ar->len = 0;
6c7a120a 4331 goto out;
c9de560d
AT
4332 }
4333
256bdb49
ES
4334 ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
4335 if (!ext4_mb_use_preallocated(ac)) {
256bdb49
ES
4336 ac->ac_op = EXT4_MB_HISTORY_ALLOC;
4337 ext4_mb_normalize_request(ac, ar);
c9de560d
AT
4338repeat:
4339 /* allocate space in core */
6c7a120a
AK
4340 *errp = ext4_mb_regular_allocator(ac);
4341 if (*errp)
4342 goto errout;
c9de560d
AT
4343
4344		/* as we've just preallocated more space than
4345		 * the user originally requested, we store allocated
4346 * space in a special descriptor */
256bdb49
ES
4347 if (ac->ac_status == AC_STATUS_FOUND &&
4348 ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
4349 ext4_mb_new_preallocation(ac);
c9de560d 4350 }
256bdb49 4351 if (likely(ac->ac_status == AC_STATUS_FOUND)) {
6bc6e63f 4352 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks);
6c7a120a 4353 if (*errp == -EAGAIN) {
8556e8f3
AK
4354 /*
4355 * drop the reference that we took
4356 * in ext4_mb_use_best_found
4357 */
4358 ext4_mb_release_context(ac);
519deca0
AK
4359 ac->ac_b_ex.fe_group = 0;
4360 ac->ac_b_ex.fe_start = 0;
4361 ac->ac_b_ex.fe_len = 0;
4362 ac->ac_status = AC_STATUS_CONTINUE;
4363 goto repeat;
6c7a120a
AK
4364 } else if (*errp)
4365 errout:
b844167e 4366 ext4_discard_allocated_blocks(ac);
6c7a120a 4367 else {
519deca0
AK
4368 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
4369 ar->len = ac->ac_b_ex.fe_len;
4370 }
c9de560d 4371 } else {
256bdb49 4372 freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len);
c9de560d
AT
4373 if (freed)
4374 goto repeat;
4375 *errp = -ENOSPC;
6c7a120a
AK
4376 }
4377
4378 if (*errp) {
256bdb49 4379 ac->ac_b_ex.fe_len = 0;
c9de560d 4380 ar->len = 0;
256bdb49 4381 ext4_mb_show_ac(ac);
c9de560d 4382 }
256bdb49 4383 ext4_mb_release_context(ac);
6c7a120a
AK
4384out:
4385 if (ac)
4386 kmem_cache_free(ext4_ac_cachep, ac);
60e58e0f 4387 if (inquota && ar->len < inquota)
5dd4056d 4388 dquot_free_block(ar->inode, inquota - ar->len);
0087d9fb 4389 if (!ar->len) {
f2321097
TT
4390 if (!ext4_test_inode_state(ar->inode,
4391 EXT4_STATE_DELALLOC_RESERVED))
0087d9fb
AK
4392 /* release all the reserved blocks if non delalloc */
4393 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
4394 reserv_blks);
4395 }
c9de560d 4396
9bffad1e 4397 trace_ext4_allocate_blocks(ar, (unsigned long long)block);
ba80b101 4398
c9de560d
AT
4399 return block;
4400}
c9de560d 4401
c894058d
AK
4402/*
4403 * We can merge two free data extents only if the physical blocks
4404 * are contiguous, AND the extents were freed by the same transaction,
4405 * AND the blocks are associated with the same group.
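 *
 * E.g. (hypothetical numbers) two entries for group 5 covering blocks
 * 100-119 and 120-129, freed under the same transaction, can be merged
 * into a single 100-129 entry.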
4406 */
4407static int can_merge(struct ext4_free_data *entry1,
4408 struct ext4_free_data *entry2)
4409{
4410 if ((entry1->t_tid == entry2->t_tid) &&
4411 (entry1->group == entry2->group) &&
4412 ((entry1->start_blk + entry1->count) == entry2->start_blk))
4413 return 1;
4414 return 0;
4415}
4416
4ddfef7b
ES
4417static noinline_for_stack int
4418ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
7a2fcbf7 4419 struct ext4_free_data *new_entry)
c9de560d 4420{
e29136f8 4421 ext4_group_t group = e4b->bd_group;
7a2fcbf7
AK
4422 ext4_grpblk_t block;
4423 struct ext4_free_data *entry;
c9de560d
AT
4424 struct ext4_group_info *db = e4b->bd_info;
4425 struct super_block *sb = e4b->bd_sb;
4426 struct ext4_sb_info *sbi = EXT4_SB(sb);
c894058d
AK
4427 struct rb_node **n = &db->bb_free_root.rb_node, *node;
4428 struct rb_node *parent = NULL, *new_node;
4429
0390131b 4430 BUG_ON(!ext4_handle_valid(handle));
c9de560d
AT
4431 BUG_ON(e4b->bd_bitmap_page == NULL);
4432 BUG_ON(e4b->bd_buddy_page == NULL);
4433
c894058d 4434 new_node = &new_entry->node;
7a2fcbf7 4435 block = new_entry->start_blk;
c894058d 4436
c894058d
AK
4437 if (!*n) {
4438		/* first free block extent. We need to
4439		 * protect buddy cache from being freed,
4440		 * otherwise we'll refresh it from
4441		 * on-disk bitmap and lose not-yet-available
4442		 * blocks */
4443 page_cache_get(e4b->bd_buddy_page);
4444 page_cache_get(e4b->bd_bitmap_page);
4445 }
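	/*
	 * Descend the rb-tree, which is keyed by the extent's start block, to
	 * find the insertion point; landing inside an existing entry means the
	 * blocks are already queued to be freed.
	 */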
4446 while (*n) {
4447 parent = *n;
4448 entry = rb_entry(parent, struct ext4_free_data, node);
4449 if (block < entry->start_blk)
4450 n = &(*n)->rb_left;
4451 else if (block >= (entry->start_blk + entry->count))
4452 n = &(*n)->rb_right;
4453 else {
e29136f8
TT
4454 ext4_grp_locked_error(sb, group, 0,
4455 ext4_group_first_block_no(sb, group) + block,
4456 "Block already on to-be-freed list");
c894058d 4457 return 0;
c9de560d 4458 }
c894058d 4459 }
c9de560d 4460
c894058d
AK
4461 rb_link_node(new_node, parent, n);
4462 rb_insert_color(new_node, &db->bb_free_root);
4463
4464 /* Now try to see the extent can be merged to left and right */
4465 node = rb_prev(new_node);
4466 if (node) {
4467 entry = rb_entry(node, struct ext4_free_data, node);
4468 if (can_merge(entry, new_entry)) {
4469 new_entry->start_blk = entry->start_blk;
4470 new_entry->count += entry->count;
4471 rb_erase(node, &(db->bb_free_root));
4472 spin_lock(&sbi->s_md_lock);
4473 list_del(&entry->list);
4474 spin_unlock(&sbi->s_md_lock);
4475 kmem_cache_free(ext4_free_ext_cachep, entry);
c9de560d 4476 }
c894058d 4477 }
c9de560d 4478
c894058d
AK
4479 node = rb_next(new_node);
4480 if (node) {
4481 entry = rb_entry(node, struct ext4_free_data, node);
4482 if (can_merge(new_entry, entry)) {
4483 new_entry->count += entry->count;
4484 rb_erase(node, &(db->bb_free_root));
4485 spin_lock(&sbi->s_md_lock);
4486 list_del(&entry->list);
4487 spin_unlock(&sbi->s_md_lock);
4488 kmem_cache_free(ext4_free_ext_cachep, entry);
c9de560d
AT
4489 }
4490 }
3e624fc7 4491 /* Add the extent to transaction's private list */
c894058d 4492 spin_lock(&sbi->s_md_lock);
3e624fc7 4493 list_add(&new_entry->list, &handle->h_transaction->t_private_list);
c894058d 4494 spin_unlock(&sbi->s_md_lock);
c9de560d
AT
4495 return 0;
4496}
4497
44338711
TT
4498/**
4499 * ext4_free_blocks() -- Free given blocks and update quota
4500 * @handle: handle for this transaction
4501 * @inode: inode
4502 * @block: start physical block to free
4503 * @count: number of blocks to free
4504 * @flags: EXT4_FREE_BLOCKS_* flags controlling how the blocks are freed
c9de560d 4505 */
44338711 4506void ext4_free_blocks(handle_t *handle, struct inode *inode,
e6362609
TT
4507 struct buffer_head *bh, ext4_fsblk_t block,
4508 unsigned long count, int flags)
c9de560d 4509{
26346ff6 4510 struct buffer_head *bitmap_bh = NULL;
c9de560d 4511 struct super_block *sb = inode->i_sb;
c9de560d 4512 struct ext4_group_desc *gdp;
44338711 4513 unsigned long freed = 0;
498e5f24 4514 unsigned int overflow;
c9de560d
AT
4515 ext4_grpblk_t bit;
4516 struct buffer_head *gd_bh;
4517 ext4_group_t block_group;
4518 struct ext4_sb_info *sbi;
4519 struct ext4_buddy e4b;
4520 int err = 0;
4521 int ret;
4522
e6362609
TT
4523 if (bh) {
4524 if (block)
4525 BUG_ON(block != bh->b_blocknr);
4526 else
4527 block = bh->b_blocknr;
4528 }
c9de560d 4529
c9de560d 4530 sbi = EXT4_SB(sb);
1f2acb60
TT
4531 if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
4532 !ext4_data_block_valid(sbi, block, count)) {
12062ddd 4533 ext4_error(sb, "Freeing blocks not in datazone - "
1f2acb60 4534 "block = %llu, count = %lu", block, count);
c9de560d
AT
4535 goto error_return;
4536 }
4537
0610b6e9 4538 ext4_debug("freeing block %llu\n", block);
e6362609
TT
4539 trace_ext4_free_blocks(inode, block, count, flags);
4540
4541 if (flags & EXT4_FREE_BLOCKS_FORGET) {
4542 struct buffer_head *tbh = bh;
4543 int i;
4544
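		/*
		 * Forget any buffer_heads covering these blocks so stale
		 * journaled metadata is not written out after the blocks are
		 * reused; if the caller didn't pass a bh, look one up per
		 * block.
		 */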
4545 BUG_ON(bh && (count > 1));
4546
4547 for (i = 0; i < count; i++) {
4548 if (!bh)
4549 tbh = sb_find_get_block(inode->i_sb,
4550 block + i);
87783690
NK
4551 if (unlikely(!tbh))
4552 continue;
60e6679e 4553 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
e6362609
TT
4554 inode, tbh, block + i);
4555 }
4556 }
4557
60e6679e 4558 /*
e6362609
TT
4559 * We need to make sure we don't reuse the freed block until
4560 * after the transaction is committed, which we can do by
4561 * treating the block as metadata, below. We make an
4562 * exception if the inode is to be written in writeback mode
4563 * since writeback mode has weak data consistency guarantees.
4564 */
4565 if (!ext4_should_writeback_data(inode))
4566 flags |= EXT4_FREE_BLOCKS_METADATA;
c9de560d 4567
c9de560d
AT
4568do_more:
4569 overflow = 0;
4570 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
4571
4572 /*
4573 * Check to see if we are freeing blocks across a group
4574 * boundary.
4575 */
4576 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
4577 overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
4578 count -= overflow;
4579 }
574ca174 4580 bitmap_bh = ext4_read_block_bitmap(sb, block_group);
ce89f46c
AK
4581 if (!bitmap_bh) {
4582 err = -EIO;
c9de560d 4583 goto error_return;
ce89f46c 4584 }
c9de560d 4585 gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
ce89f46c
AK
4586 if (!gdp) {
4587 err = -EIO;
c9de560d 4588 goto error_return;
ce89f46c 4589 }
c9de560d
AT
4590
4591 if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
4592 in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
4593 in_range(block, ext4_inode_table(sb, gdp),
4594 EXT4_SB(sb)->s_itb_per_group) ||
4595 in_range(block + count - 1, ext4_inode_table(sb, gdp),
4596 EXT4_SB(sb)->s_itb_per_group)) {
4597
12062ddd 4598 ext4_error(sb, "Freeing blocks in system zone - "
0610b6e9 4599 "Block = %llu, count = %lu", block, count);
519deca0
AK
4600 /* err = 0. ext4_std_error should be a no op */
4601 goto error_return;
c9de560d
AT
4602 }
4603
4604 BUFFER_TRACE(bitmap_bh, "getting write access");
4605 err = ext4_journal_get_write_access(handle, bitmap_bh);
4606 if (err)
4607 goto error_return;
4608
4609 /*
4610 * We are about to modify some metadata. Call the journal APIs
4611 * to unshare ->b_data if a currently-committing transaction is
4612 * using it
4613 */
4614 BUFFER_TRACE(gd_bh, "get_write_access");
4615 err = ext4_journal_get_write_access(handle, gd_bh);
4616 if (err)
4617 goto error_return;
c9de560d
AT
4618#ifdef AGGRESSIVE_CHECK
4619 {
4620 int i;
4621 for (i = 0; i < count; i++)
4622 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
4623 }
4624#endif
3e1e5f50 4625 trace_ext4_mballoc_free(sb, inode, block_group, bit, count);
c9de560d 4626
920313a7
AK
4627 err = ext4_mb_load_buddy(sb, block_group, &e4b);
4628 if (err)
4629 goto error_return;
e6362609
TT
4630
4631 if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) {
7a2fcbf7
AK
4632 struct ext4_free_data *new_entry;
4633 /*
4634 * blocks being freed are metadata. these blocks shouldn't
4635 * be used until this transaction is committed
4636 */
b72143ab
TT
4637 new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
4638 if (!new_entry) {
4639 err = -ENOMEM;
4640 goto error_return;
4641 }
7a2fcbf7
AK
4642 new_entry->start_blk = bit;
4643 new_entry->group = block_group;
4644 new_entry->count = count;
4645 new_entry->t_tid = handle->h_transaction->t_tid;
955ce5f5 4646
7a2fcbf7 4647 ext4_lock_group(sb, block_group);
955ce5f5 4648 mb_clear_bits(bitmap_bh->b_data, bit, count);
7a2fcbf7 4649 ext4_mb_free_metadata(handle, &e4b, new_entry);
c9de560d 4650 } else {
7a2fcbf7
AK
4651 /* need to update group_info->bb_free and bitmap
4652 * with group lock held. generate_buddy look at
4653 * them with group lock_held
4654 */
955ce5f5
AK
4655 ext4_lock_group(sb, block_group);
4656 mb_clear_bits(bitmap_bh->b_data, bit, count);
7e5a8cdd 4657 mb_free_blocks(inode, &e4b, bit, count);
c9de560d
AT
4658 }
4659
560671a0
AK
4660 ret = ext4_free_blks_count(sb, gdp) + count;
4661 ext4_free_blks_set(sb, gdp, ret);
c9de560d 4662 gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
955ce5f5 4663 ext4_unlock_group(sb, block_group);
c9de560d
AT
4664 percpu_counter_add(&sbi->s_freeblocks_counter, count);
4665
772cb7c8
JS
4666 if (sbi->s_log_groups_per_flex) {
4667 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
9f24e420 4668 atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks);
772cb7c8
JS
4669 }
4670
e39e07fd 4671 ext4_mb_unload_buddy(&e4b);
c9de560d 4672
44338711 4673 freed += count;
c9de560d 4674
7a2fcbf7
AK
4675 /* We dirtied the bitmap block */
4676 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
4677 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
4678
c9de560d
AT
4679 /* And the group descriptor block */
4680 BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
0390131b 4681 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
c9de560d
AT
4682 if (!err)
4683 err = ret;
4684
4685 if (overflow && !err) {
4686 block += count;
4687 count = overflow;
4688 put_bh(bitmap_bh);
4689 goto do_more;
4690 }
a0375156 4691 ext4_mark_super_dirty(sb);
c9de560d 4692error_return:
44338711 4693 if (freed)
5dd4056d 4694 dquot_free_block(inode, freed);
c9de560d
AT
4695 brelse(bitmap_bh);
4696 ext4_std_error(sb, err);
4697 return;
4698}
7360d173
LC
4699
4700/**
4701 * ext4_trim_extent -- function to TRIM one single free extent in the group
4702 * @sb: super block for the file system
4703 * @start: starting block of the free extent in the alloc. group
4704 * @count: number of blocks to TRIM
4705 * @group: alloc. group we are working with
4706 * @e4b: ext4 buddy for the group
4707 *
4708 * Trim "count" blocks starting at "start" in the "group". To assure that no
4709 * one will allocate those blocks, mark it as used in buddy bitmap. This must
4710 * be called with under the group lock.
4711 */
4712static int ext4_trim_extent(struct super_block *sb, int start, int count,
4713 ext4_group_t group, struct ext4_buddy *e4b)
4714{
4715 struct ext4_free_extent ex;
4716 int ret = 0;
4717
4718 assert_spin_locked(ext4_group_lock_ptr(sb, group));
4719
4720 ex.fe_start = start;
4721 ex.fe_group = group;
4722 ex.fe_len = count;
4723
4724 /*
4725 * Mark blocks used, so no one can reuse them while
4726 * being trimmed.
4727 */
4728 mb_mark_used(e4b, &ex);
4729 ext4_unlock_group(sb, group);
4730
4731 ret = ext4_issue_discard(sb, group, start, count);
7360d173
LC
4732
4733 ext4_lock_group(sb, group);
4734 mb_free_blocks(NULL, e4b, start, ex.fe_len);
4735 return ret;
4736}
4737
4738/**
4739 * ext4_trim_all_free -- function to trim all free space in alloc. group
4740 * @sb: super block for file system
4741 * @e4b: ext4 buddy
4742 * @start: first group block to examine
4743 * @max: last group block to examine
4744 * @minblocks: minimum extent block count
4745 *
4746 * ext4_trim_all_free walks through the group's buddy bitmap searching for
4747 * free extents. When a free extent of at least minblocks blocks is found,
4748 * ext4_trim_extent is called to TRIM it.
4749 *
4750 * For each such extent, the blocks are first marked as used in the group
4751 * buddy bitmap so nobody can allocate them, a TRIM command is then issued
4752 * on the extent, and finally the extent is freed again in the group buddy
4753 * bitmap. This is done until the whole group is scanned.
4754 *
4755 */
0b75a840
LC
4756static ext4_grpblk_t
4757ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
7360d173
LC
4758 ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks)
4759{
4760 void *bitmap;
4761 ext4_grpblk_t next, count = 0;
4762 ext4_group_t group;
4763 int ret = 0;
4764
4765 BUG_ON(e4b == NULL);
4766
4767 bitmap = e4b->bd_bitmap;
4768 group = e4b->bd_group;
4769 start = (e4b->bd_info->bb_first_free > start) ?
4770 e4b->bd_info->bb_first_free : start;
4771 ext4_lock_group(sb, group);
4772
4773 while (start < max) {
4774 start = mb_find_next_zero_bit(bitmap, max, start);
4775 if (start >= max)
4776 break;
4777 next = mb_find_next_bit(bitmap, max, start);
4778
4779 if ((next - start) >= minblocks) {
4780 ret = ext4_trim_extent(sb, start,
4781 next - start, group, e4b);
4782 if (ret < 0)
4783 break;
4784 count += next - start;
4785 }
4786 start = next + 1;
4787
4788 if (fatal_signal_pending(current)) {
4789 count = -ERESTARTSYS;
4790 break;
4791 }
4792
4793 if (need_resched()) {
4794 ext4_unlock_group(sb, group);
4795 cond_resched();
4796 ext4_lock_group(sb, group);
4797 }
4798
4799 if ((e4b->bd_info->bb_free - count) < minblocks)
4800 break;
4801 }
4802 ext4_unlock_group(sb, group);
4803
4804 ext4_debug("trimmed %d blocks in the group %d\n",
4805 count, group);
4806
4807 if (ret < 0)
4808 count = ret;
4809
4810 return count;
4811}
4812
4813/**
4814 * ext4_trim_fs() -- trim ioctl handle function
4815 * @sb: superblock for filesystem
4816 * @range: fstrim_range structure
4817 *
4818 * start: first byte to trim
4819 * len: number of bytes to trim from start
4820 * minlen: minimum extent length in bytes
4821 * ext4_trim_fs goes through all allocation groups containing bytes from
4822 * start to start+len. For each such group the ext4_trim_all_free function
4823 * is invoked to trim all free space.
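 *
 * E.g. (hypothetical numbers, 4KiB block size): start=0, len=256MiB and
 * minlen=1MiB translate to a 65536-block range starting at block 0 with a
 * minimum trimmable extent of 256 blocks.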
4824 */
4825int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4826{
4827 struct ext4_buddy e4b;
4828 ext4_group_t first_group, last_group;
4829 ext4_group_t group, ngroups = ext4_get_groups_count(sb);
4830 ext4_grpblk_t cnt = 0, first_block, last_block;
4831 uint64_t start, len, minlen, trimmed;
0f0a25bf
JK
4832 ext4_fsblk_t first_data_blk =
4833 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
7360d173
LC
4834 int ret = 0;
4835
4836 start = range->start >> sb->s_blocksize_bits;
4837 len = range->len >> sb->s_blocksize_bits;
4838 minlen = range->minlen >> sb->s_blocksize_bits;
4839 trimmed = 0;
4840
4841 if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb)))
4842 return -EINVAL;
0f0a25bf
JK
4843 if (start < first_data_blk) {
4844 len -= first_data_blk - start;
4845 start = first_data_blk;
4846 }
7360d173
LC
4847
4848 /* Determine first and last group to examine based on start and len */
4849 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
4850 &first_group, &first_block);
4851 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len),
4852 &last_group, &last_block);
4853 last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
4854 last_block = EXT4_BLOCKS_PER_GROUP(sb);
4855
4856 if (first_group > last_group)
4857 return -EINVAL;
4858
4859 for (group = first_group; group <= last_group; group++) {
4860 ret = ext4_mb_load_buddy(sb, group, &e4b);
4861 if (ret) {
4862 ext4_error(sb, "Error in loading buddy "
4863 "information for %u", group);
4864 break;
4865 }
4866
4867 if (len >= EXT4_BLOCKS_PER_GROUP(sb))
4868 len -= (EXT4_BLOCKS_PER_GROUP(sb) - first_block);
4869 else
ca6e909f 4870 last_block = first_block + len;
7360d173
LC
4871
4872 if (e4b.bd_info->bb_free >= minlen) {
4873 cnt = ext4_trim_all_free(sb, &e4b, first_block,
4874 last_block, minlen);
4875 if (cnt < 0) {
4876 ret = cnt;
4877 ext4_mb_unload_buddy(&e4b);
4878 break;
4879 }
4880 }
4881 ext4_mb_unload_buddy(&e4b);
4882 trimmed += cnt;
4883 first_block = 0;
4884 }
4885 range->len = trimmed * sb->s_blocksize;
4886
4887 return ret;
4888}