fs/xfs/xfs_buf.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include <linux/backing-dev.h>

#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_log_recover.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_errortag.h"
#include "xfs_error.h"

static kmem_zone_t *xfs_buf_zone;

#define xb_to_gfp(flags) \
	((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : GFP_NOFS) | __GFP_NOWARN)

/*
 * Locking orders
 *
 * xfs_buf_ioacct_inc:
 * xfs_buf_ioacct_dec:
 *	b_sema (caller holds)
 *	  b_lock
 *
 * xfs_buf_stale:
 *	b_sema (caller holds)
 *	  b_lock
 *	    lru_lock
 *
 * xfs_buf_rele:
 *	b_lock
 *	  pag_buf_lock
 *	    lru_lock
 *
 * xfs_buftarg_wait_rele
 *	lru_lock
 *	  b_lock (trylock due to inversion)
 *
 * xfs_buftarg_isolate
 *	lru_lock
 *	  b_lock (trylock due to inversion)
 */

static inline int
xfs_buf_is_vmapped(
	struct xfs_buf	*bp)
{
	/*
	 * Return true if the buffer is vmapped.
	 *
	 * b_addr is null if the buffer is not mapped, but the code is clever
	 * enough to know it doesn't have to map a single page, so the check has
	 * to be both for b_addr and bp->b_page_count > 1.
	 */
	return bp->b_addr && bp->b_page_count > 1;
}

static inline int
xfs_buf_vmap_len(
	struct xfs_buf	*bp)
{
	return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
}

/*
 * Bump the I/O in flight count on the buftarg if we haven't yet done so for
 * this buffer. The count is incremented once per buffer (per hold cycle)
 * because the corresponding decrement is deferred to buffer release. Buffers
 * can undergo I/O multiple times in a hold-release cycle and per buffer I/O
 * tracking adds unnecessary overhead. This is used for synchronization
 * purposes with unmount (see xfs_wait_buftarg()), so all we really need is a
 * count of in-flight buffers.
 *
 * Buffers that are never released (e.g., superblock, iclog buffers) must set
 * the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count
 * never reaches zero and unmount hangs indefinitely.
 */
static inline void
xfs_buf_ioacct_inc(
	struct xfs_buf	*bp)
{
	if (bp->b_flags & XBF_NO_IOACCT)
		return;

	ASSERT(bp->b_flags & XBF_ASYNC);
	spin_lock(&bp->b_lock);
	if (!(bp->b_state & XFS_BSTATE_IN_FLIGHT)) {
		bp->b_state |= XFS_BSTATE_IN_FLIGHT;
		percpu_counter_inc(&bp->b_target->bt_io_count);
	}
	spin_unlock(&bp->b_lock);
}

/*
 * Clear the in-flight state on a buffer about to be released to the LRU or
 * freed and unaccount from the buftarg.
 */
static inline void
__xfs_buf_ioacct_dec(
	struct xfs_buf	*bp)
{
	lockdep_assert_held(&bp->b_lock);

	if (bp->b_state & XFS_BSTATE_IN_FLIGHT) {
		bp->b_state &= ~XFS_BSTATE_IN_FLIGHT;
		percpu_counter_dec(&bp->b_target->bt_io_count);
	}
}

static inline void
xfs_buf_ioacct_dec(
	struct xfs_buf	*bp)
{
	spin_lock(&bp->b_lock);
	__xfs_buf_ioacct_dec(bp);
	spin_unlock(&bp->b_lock);
}

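/*
 * Illustrative sketch (not part of the original file): the helpers above pair
 * the per-buffer XFS_BSTATE_IN_FLIGHT flag with the per-target percpu counter
 * bt_io_count so repeated submissions within one hold cycle are only counted
 * once. A simplified caller-side view, assuming an async buffer submitted
 * twice before release:
 *
 *	bp->b_flags |= XBF_ASYNC;
 *	xfs_buf_submit(bp);	// xfs_buf_ioacct_inc(): bt_io_count++
 *	xfs_buf_submit(bp);	// IN_FLIGHT already set: no second increment
 *	xfs_buf_rele(bp);	// release path ends up in __xfs_buf_ioacct_dec()
 *
 * Unmount then only needs percpu_counter_sum(&btp->bt_io_count) to reach zero
 * (see xfs_wait_buftarg()) rather than tracking each individual I/O.
 */
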
/*
 * When we mark a buffer stale, we remove the buffer from the LRU and clear the
 * b_lru_ref count so that the buffer is freed immediately when the buffer
 * reference count falls to zero. If the buffer is already on the LRU, we need
 * to remove the reference that LRU holds on the buffer.
 *
 * This prevents build-up of stale buffers on the LRU.
 */
void
xfs_buf_stale(
	struct xfs_buf	*bp)
{
	ASSERT(xfs_buf_islocked(bp));

	bp->b_flags |= XBF_STALE;

	/*
	 * Clear the delwri status so that a delwri queue walker will not
	 * flush this buffer to disk now that it is stale. The delwri queue has
	 * a reference to the buffer, so this is safe to do.
	 */
	bp->b_flags &= ~_XBF_DELWRI_Q;

	/*
	 * Once the buffer is marked stale and unlocked, a subsequent lookup
	 * could reset b_flags. There is no guarantee that the buffer is
	 * unaccounted (released to LRU) before that occurs. Drop in-flight
	 * status now to preserve accounting consistency.
	 */
	spin_lock(&bp->b_lock);
	__xfs_buf_ioacct_dec(bp);

	atomic_set(&bp->b_lru_ref, 0);
	if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
	    (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru)))
		atomic_dec(&bp->b_hold);

	ASSERT(atomic_read(&bp->b_hold) >= 1);
	spin_unlock(&bp->b_lock);
}

static int
xfs_buf_get_maps(
	struct xfs_buf	*bp,
	int		map_count)
{
	ASSERT(bp->b_maps == NULL);
	bp->b_map_count = map_count;

	if (map_count == 1) {
		bp->b_maps = &bp->__b_map;
		return 0;
	}

	bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map),
				 KM_NOFS);
	if (!bp->b_maps)
		return -ENOMEM;
	return 0;
}

/*
 * Frees b_maps if it was allocated.
 */
static void
xfs_buf_free_maps(
	struct xfs_buf	*bp)
{
	if (bp->b_maps != &bp->__b_map) {
		kmem_free(bp->b_maps);
		bp->b_maps = NULL;
	}
}

static int
_xfs_buf_alloc(
	struct xfs_buftarg	*target,
	struct xfs_buf_map	*map,
	int			nmaps,
	xfs_buf_flags_t		flags,
	struct xfs_buf		**bpp)
{
	struct xfs_buf		*bp;
	int			error;
	int			i;

	*bpp = NULL;
	bp = kmem_zone_zalloc(xfs_buf_zone, KM_NOFS);
	if (unlikely(!bp))
		return -ENOMEM;

	/*
	 * We don't want certain flags to appear in b_flags unless they are
	 * specifically set by later operations on the buffer.
	 */
	flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);

	atomic_set(&bp->b_hold, 1);
	atomic_set(&bp->b_lru_ref, 1);
	init_completion(&bp->b_iowait);
	INIT_LIST_HEAD(&bp->b_lru);
	INIT_LIST_HEAD(&bp->b_list);
	INIT_LIST_HEAD(&bp->b_li_list);
	sema_init(&bp->b_sema, 0); /* held, no waiters */
	spin_lock_init(&bp->b_lock);
	bp->b_target = target;
	bp->b_mount = target->bt_mount;
	bp->b_flags = flags;

	/*
	 * Set length and io_length to the same value initially.
	 * I/O routines should use io_length, which will be the same in
	 * most cases but may be reset (e.g. XFS recovery).
	 */
	error = xfs_buf_get_maps(bp, nmaps);
	if (error) {
		kmem_cache_free(xfs_buf_zone, bp);
		return error;
	}

	bp->b_bn = map[0].bm_bn;
	bp->b_length = 0;
	for (i = 0; i < nmaps; i++) {
		bp->b_maps[i].bm_bn = map[i].bm_bn;
		bp->b_maps[i].bm_len = map[i].bm_len;
		bp->b_length += map[i].bm_len;
	}

	atomic_set(&bp->b_pin_count, 0);
	init_waitqueue_head(&bp->b_waiters);

	XFS_STATS_INC(bp->b_mount, xb_create);
	trace_xfs_buf_init(bp, _RET_IP_);

	*bpp = bp;
	return 0;
}

/*
 * Allocate a page array capable of holding a specified number
 * of pages, and point the page buf at it.
 */
STATIC int
_xfs_buf_get_pages(
	xfs_buf_t	*bp,
	int		page_count)
{
	/* Make sure that we have a page list */
	if (bp->b_pages == NULL) {
		bp->b_page_count = page_count;
		if (page_count <= XB_PAGES) {
			bp->b_pages = bp->b_page_array;
		} else {
			bp->b_pages = kmem_alloc(sizeof(struct page *) *
						 page_count, KM_NOFS);
			if (bp->b_pages == NULL)
				return -ENOMEM;
		}
		memset(bp->b_pages, 0, sizeof(struct page *) * page_count);
	}
	return 0;
}

/*
 * Frees b_pages if it was allocated.
 */
STATIC void
_xfs_buf_free_pages(
	xfs_buf_t	*bp)
{
	if (bp->b_pages != bp->b_page_array) {
		kmem_free(bp->b_pages);
		bp->b_pages = NULL;
	}
}

/*
 * Releases the specified buffer.
 *
 * The modification state of any associated pages is left unchanged.
 * The buffer must not be on any hash - use xfs_buf_rele instead for
 * hashed and refcounted buffers
 */
static void
xfs_buf_free(
	xfs_buf_t	*bp)
{
	trace_xfs_buf_free(bp, _RET_IP_);

	ASSERT(list_empty(&bp->b_lru));

	if (bp->b_flags & _XBF_PAGES) {
		uint		i;

		if (xfs_buf_is_vmapped(bp))
			vm_unmap_ram(bp->b_addr - bp->b_offset,
					bp->b_page_count);

		for (i = 0; i < bp->b_page_count; i++) {
			struct page	*page = bp->b_pages[i];

			__free_page(page);
		}
		if (current->reclaim_state)
			current->reclaim_state->reclaimed_slab +=
							bp->b_page_count;
	} else if (bp->b_flags & _XBF_KMEM)
		kmem_free(bp->b_addr);
	_xfs_buf_free_pages(bp);
	xfs_buf_free_maps(bp);
	kmem_cache_free(xfs_buf_zone, bp);
}

/*
 * Allocates all the pages for buffer in question and builds its page list.
 */
STATIC int
xfs_buf_allocate_memory(
	xfs_buf_t	*bp,
	uint		flags)
{
	size_t		size;
	size_t		nbytes, offset;
	gfp_t		gfp_mask = xb_to_gfp(flags);
	unsigned short	page_count, i;
	xfs_off_t	start, end;
	int		error;
	xfs_km_flags_t	kmflag_mask = 0;

	/*
	 * assure zeroed buffer for non-read cases.
	 */
	if (!(flags & XBF_READ)) {
		kmflag_mask |= KM_ZERO;
		gfp_mask |= __GFP_ZERO;
	}

	/*
	 * for buffers that are contained within a single page, just allocate
	 * the memory from the heap - there's no need for the complexity of
	 * page arrays to keep allocation down to order 0.
	 */
	size = BBTOB(bp->b_length);
	if (size < PAGE_SIZE) {
		int align_mask = xfs_buftarg_dma_alignment(bp->b_target);
		bp->b_addr = kmem_alloc_io(size, align_mask,
					   KM_NOFS | kmflag_mask);
		if (!bp->b_addr) {
			/* low memory - use alloc_page loop instead */
			goto use_alloc_page;
		}

		if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) !=
		    ((unsigned long)bp->b_addr & PAGE_MASK)) {
			/* b_addr spans two pages - use alloc_page instead */
			kmem_free(bp->b_addr);
			bp->b_addr = NULL;
			goto use_alloc_page;
		}
		bp->b_offset = offset_in_page(bp->b_addr);
		bp->b_pages = bp->b_page_array;
		bp->b_pages[0] = kmem_to_page(bp->b_addr);
		bp->b_page_count = 1;
		bp->b_flags |= _XBF_KMEM;
		return 0;
	}

use_alloc_page:
	start = BBTOB(bp->b_maps[0].bm_bn) >> PAGE_SHIFT;
	end = (BBTOB(bp->b_maps[0].bm_bn + bp->b_length) + PAGE_SIZE - 1)
								>> PAGE_SHIFT;
	page_count = end - start;
	error = _xfs_buf_get_pages(bp, page_count);
	if (unlikely(error))
		return error;

	offset = bp->b_offset;
	bp->b_flags |= _XBF_PAGES;

	for (i = 0; i < bp->b_page_count; i++) {
		struct page	*page;
		uint		retries = 0;
retry:
		page = alloc_page(gfp_mask);
		if (unlikely(page == NULL)) {
			if (flags & XBF_READ_AHEAD) {
				bp->b_page_count = i;
				error = -ENOMEM;
				goto out_free_pages;
			}

			/*
			 * This could deadlock.
			 *
			 * But until all the XFS lowlevel code is revamped to
			 * handle buffer allocation failures we can't do much.
			 */
			if (!(++retries % 100))
				xfs_err(NULL,
		"%s(%u) possible memory allocation deadlock in %s (mode:0x%x)",
					current->comm, current->pid,
					__func__, gfp_mask);

			XFS_STATS_INC(bp->b_mount, xb_page_retries);
			congestion_wait(BLK_RW_ASYNC, HZ/50);
			goto retry;
		}

		XFS_STATS_INC(bp->b_mount, xb_page_found);

		nbytes = min_t(size_t, size, PAGE_SIZE - offset);
		size -= nbytes;
		bp->b_pages[i] = page;
		offset = 0;
	}
	return 0;

out_free_pages:
	for (i = 0; i < bp->b_page_count; i++)
		__free_page(bp->b_pages[i]);
	bp->b_flags &= ~_XBF_PAGES;
	return error;
}

/*
 * Map buffer into kernel address-space if necessary.
 */
STATIC int
_xfs_buf_map_pages(
	xfs_buf_t	*bp,
	uint		flags)
{
	ASSERT(bp->b_flags & _XBF_PAGES);
	if (bp->b_page_count == 1) {
		/* A single page buffer is always mappable */
		bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
	} else if (flags & XBF_UNMAPPED) {
		bp->b_addr = NULL;
	} else {
		int retried = 0;
		unsigned nofs_flag;

		/*
		 * vm_map_ram() will allocate auxiliary structures (e.g.
		 * pagetables) with GFP_KERNEL, yet we are likely to be under
		 * GFP_NOFS context here. Hence we need to tell memory reclaim
		 * that we are in such a context via PF_MEMALLOC_NOFS to prevent
		 * memory reclaim re-entering the filesystem here and
		 * potentially deadlocking.
		 */
		nofs_flag = memalloc_nofs_save();
		do {
			bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
						-1);
			if (bp->b_addr)
				break;
			vm_unmap_aliases();
		} while (retried++ <= 1);
		memalloc_nofs_restore(nofs_flag);

		if (!bp->b_addr)
			return -ENOMEM;
		bp->b_addr += bp->b_offset;
	}

	return 0;
}

/*
 * Finding and Reading Buffers
 */
static int
_xfs_buf_obj_cmp(
	struct rhashtable_compare_arg	*arg,
	const void			*obj)
{
	const struct xfs_buf_map	*map = arg->key;
	const struct xfs_buf		*bp = obj;

	/*
	 * The key hashing in the lookup path depends on the key being the
	 * first element of the compare_arg, make sure to assert this.
	 */
	BUILD_BUG_ON(offsetof(struct xfs_buf_map, bm_bn) != 0);

	if (bp->b_bn != map->bm_bn)
		return 1;

	if (unlikely(bp->b_length != map->bm_len)) {
		/*
		 * found a block number match. If the range doesn't
		 * match, the only way this is allowed is if the buffer
		 * in the cache is stale and the transaction that made
		 * it stale has not yet committed. i.e. we are
		 * reallocating a busy extent. Skip this buffer and
		 * continue searching for an exact match.
		 */
		ASSERT(bp->b_flags & XBF_STALE);
		return 1;
	}
	return 0;
}

static const struct rhashtable_params xfs_buf_hash_params = {
	.min_size		= 32,	/* empty AGs have minimal footprint */
	.nelem_hint		= 16,
	.key_len		= sizeof(xfs_daddr_t),
	.key_offset		= offsetof(struct xfs_buf, b_bn),
	.head_offset		= offsetof(struct xfs_buf, b_rhash_head),
	.automatic_shrinking	= true,
	.obj_cmpfn		= _xfs_buf_obj_cmp,
};

int
xfs_buf_hash_init(
	struct xfs_perag	*pag)
{
	spin_lock_init(&pag->pag_buf_lock);
	return rhashtable_init(&pag->pag_buf_hash, &xfs_buf_hash_params);
}

void
xfs_buf_hash_destroy(
	struct xfs_perag	*pag)
{
	rhashtable_destroy(&pag->pag_buf_hash);
}

/*
 * Look up a buffer in the buffer cache and return it referenced and locked
 * in @found_bp.
 *
 * If @new_bp is supplied and we have a lookup miss, insert @new_bp into the
 * cache.
 *
 * If XBF_TRYLOCK is set in @flags, only try to lock the buffer and return
 * -EAGAIN if we fail to lock it.
 *
 * Return values are:
 *	-EFSCORRUPTED if we have been supplied with an invalid address
 *	-EAGAIN on trylock failure
 *	-ENOENT if we fail to find a match and @new_bp was NULL
 *	0, with @found_bp:
 *		- @new_bp if we inserted it into the cache
 *		- the buffer we found and locked.
 */
static int
xfs_buf_find(
	struct xfs_buftarg	*btp,
	struct xfs_buf_map	*map,
	int			nmaps,
	xfs_buf_flags_t		flags,
	struct xfs_buf		*new_bp,
	struct xfs_buf		**found_bp)
{
	struct xfs_perag	*pag;
	xfs_buf_t		*bp;
	struct xfs_buf_map	cmap = { .bm_bn = map[0].bm_bn };
	xfs_daddr_t		eofs;
	int			i;

	*found_bp = NULL;

	for (i = 0; i < nmaps; i++)
		cmap.bm_len += map[i].bm_len;

	/* Check for IOs smaller than the sector size / not sector aligned */
	ASSERT(!(BBTOB(cmap.bm_len) < btp->bt_meta_sectorsize));
	ASSERT(!(BBTOB(cmap.bm_bn) & (xfs_off_t)btp->bt_meta_sectormask));

	/*
	 * Corrupted block numbers can get through to here, unfortunately, so we
	 * have to check that the buffer falls within the filesystem bounds.
	 */
	eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
	if (cmap.bm_bn < 0 || cmap.bm_bn >= eofs) {
		xfs_alert(btp->bt_mount,
			  "%s: daddr 0x%llx out of range, EOFS 0x%llx",
			  __func__, cmap.bm_bn, eofs);
		WARN_ON(1);
		return -EFSCORRUPTED;
	}

	pag = xfs_perag_get(btp->bt_mount,
			    xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn));

	spin_lock(&pag->pag_buf_lock);
	bp = rhashtable_lookup_fast(&pag->pag_buf_hash, &cmap,
				    xfs_buf_hash_params);
	if (bp) {
		atomic_inc(&bp->b_hold);
		goto found;
	}

	/* No match found */
	if (!new_bp) {
		XFS_STATS_INC(btp->bt_mount, xb_miss_locked);
		spin_unlock(&pag->pag_buf_lock);
		xfs_perag_put(pag);
		return -ENOENT;
	}

	/* the buffer keeps the perag reference until it is freed */
	new_bp->b_pag = pag;
	rhashtable_insert_fast(&pag->pag_buf_hash, &new_bp->b_rhash_head,
			       xfs_buf_hash_params);
	spin_unlock(&pag->pag_buf_lock);
	*found_bp = new_bp;
	return 0;

found:
	spin_unlock(&pag->pag_buf_lock);
	xfs_perag_put(pag);

	if (!xfs_buf_trylock(bp)) {
		if (flags & XBF_TRYLOCK) {
			xfs_buf_rele(bp);
			XFS_STATS_INC(btp->bt_mount, xb_busy_locked);
			return -EAGAIN;
		}
		xfs_buf_lock(bp);
		XFS_STATS_INC(btp->bt_mount, xb_get_locked_waited);
	}

	/*
	 * if the buffer is stale, clear all the external state associated with
	 * it. We need to keep flags such as how we allocated the buffer memory
	 * intact here.
	 */
	if (bp->b_flags & XBF_STALE) {
		ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
		bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
		bp->b_ops = NULL;
	}

	trace_xfs_buf_find(bp, flags, _RET_IP_);
	XFS_STATS_INC(btp->bt_mount, xb_get_locked);
	*found_bp = bp;
	return 0;
}

struct xfs_buf *
xfs_buf_incore(
	struct xfs_buftarg	*target,
	xfs_daddr_t		blkno,
	size_t			numblks,
	xfs_buf_flags_t		flags)
{
	struct xfs_buf		*bp;
	int			error;
	DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);

	error = xfs_buf_find(target, &map, 1, flags, NULL, &bp);
	if (error)
		return NULL;
	return bp;
}

/*
 * Assembles a buffer covering the specified range. The code is optimised for
 * cache hits, as metadata intensive workloads will see 3 orders of magnitude
 * more hits than misses.
 */
int
xfs_buf_get_map(
	struct xfs_buftarg	*target,
	struct xfs_buf_map	*map,
	int			nmaps,
	xfs_buf_flags_t		flags,
	struct xfs_buf		**bpp)
{
	struct xfs_buf		*bp;
	struct xfs_buf		*new_bp;
	int			error = 0;

	*bpp = NULL;
	error = xfs_buf_find(target, map, nmaps, flags, NULL, &bp);
	if (!error)
		goto found;
	if (error != -ENOENT)
		return error;

	error = _xfs_buf_alloc(target, map, nmaps, flags, &new_bp);
	if (error)
		return error;

	error = xfs_buf_allocate_memory(new_bp, flags);
	if (error) {
		xfs_buf_free(new_bp);
		return error;
	}

	error = xfs_buf_find(target, map, nmaps, flags, new_bp, &bp);
	if (error) {
		xfs_buf_free(new_bp);
		return error;
	}

	if (bp != new_bp)
		xfs_buf_free(new_bp);

found:
	if (!bp->b_addr) {
		error = _xfs_buf_map_pages(bp, flags);
		if (unlikely(error)) {
			xfs_warn_ratelimited(target->bt_mount,
				"%s: failed to map %u pages", __func__,
				bp->b_page_count);
			xfs_buf_relse(bp);
			return error;
		}
	}

	/*
	 * Clear b_error if this is a lookup from a caller that doesn't expect
	 * valid data to be found in the buffer.
	 */
	if (!(flags & XBF_READ))
		xfs_buf_ioerror(bp, 0);

	XFS_STATS_INC(target->bt_mount, xb_get);
	trace_xfs_buf_get(bp, flags, _RET_IP_);
	*bpp = bp;
	return 0;
}

STATIC int
_xfs_buf_read(
	xfs_buf_t		*bp,
	xfs_buf_flags_t		flags)
{
	ASSERT(!(flags & XBF_WRITE));
	ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL);

	bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD);
	bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);

	return xfs_buf_submit(bp);
}

/*
 * Reverify a buffer found in cache without an attached ->b_ops.
 *
 * If the caller passed an ops structure and the buffer doesn't have ops
 * assigned, set the ops and use it to verify the contents. If verification
 * fails, clear XBF_DONE. We assume the buffer has no recorded errors and is
 * already in XBF_DONE state on entry.
 *
 * Under normal operations, every in-core buffer is verified on read I/O
 * completion. There are two scenarios that can lead to in-core buffers without
 * an assigned ->b_ops. The first is during log recovery of buffers on a V4
 * filesystem, though these buffers are purged at the end of recovery. The
 * other is online repair, which intentionally reads with a NULL buffer ops to
 * run several verifiers across an in-core buffer in order to establish buffer
 * type. If repair can't establish that, the buffer will be left in memory
 * with NULL buffer ops.
 */
int
xfs_buf_reverify(
	struct xfs_buf		*bp,
	const struct xfs_buf_ops *ops)
{
	ASSERT(bp->b_flags & XBF_DONE);
	ASSERT(bp->b_error == 0);

	if (!ops || bp->b_ops)
		return 0;

	bp->b_ops = ops;
	bp->b_ops->verify_read(bp);
	if (bp->b_error)
		bp->b_flags &= ~XBF_DONE;
	return bp->b_error;
}

int
xfs_buf_read_map(
	struct xfs_buftarg	*target,
	struct xfs_buf_map	*map,
	int			nmaps,
	xfs_buf_flags_t		flags,
	struct xfs_buf		**bpp,
	const struct xfs_buf_ops *ops,
	xfs_failaddr_t		fa)
{
	struct xfs_buf		*bp;
	int			error;

	flags |= XBF_READ;
	*bpp = NULL;

	error = xfs_buf_get_map(target, map, nmaps, flags, &bp);
	if (error)
		return error;

	trace_xfs_buf_read(bp, flags, _RET_IP_);

	if (!(bp->b_flags & XBF_DONE)) {
		/* Initiate the buffer read and wait. */
		XFS_STATS_INC(target->bt_mount, xb_get_read);
		bp->b_ops = ops;
		error = _xfs_buf_read(bp, flags);

		/* Readahead iodone already dropped the buffer, so exit. */
		if (flags & XBF_ASYNC)
			return 0;
	} else {
		/* Buffer already read; all we need to do is check it. */
		error = xfs_buf_reverify(bp, ops);

		/* Readahead already finished; drop the buffer and exit. */
		if (flags & XBF_ASYNC) {
			xfs_buf_relse(bp);
			return 0;
		}

		/* We do not want read in the flags */
		bp->b_flags &= ~XBF_READ;
		ASSERT(bp->b_ops != NULL || ops == NULL);
	}

	/*
	 * If we've had a read error, then the contents of the buffer are
	 * invalid and should not be used. To ensure that a followup read tries
	 * to pull the buffer from disk again, we clear the XBF_DONE flag and
	 * mark the buffer stale. This ensures that anyone who has a current
	 * reference to the buffer will interpret its contents correctly and
	 * future cache lookups will also treat it as an empty, uninitialised
	 * buffer.
	 */
	if (error) {
		if (!XFS_FORCED_SHUTDOWN(target->bt_mount))
			xfs_buf_ioerror_alert(bp, fa);

		bp->b_flags &= ~XBF_DONE;
		xfs_buf_stale(bp);
		xfs_buf_relse(bp);

		/* bad CRC means corrupted metadata */
		if (error == -EFSBADCRC)
			error = -EFSCORRUPTED;
		return error;
	}

	*bpp = bp;
	return 0;
}

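/*
 * Illustrative sketch (not part of the original file): a typical cached
 * metadata read supplies a verifier and the caller's address for error
 * reporting. The target, block range and ops used here are placeholders;
 * real callers normally go through type-specific wrappers.
 *
 *	struct xfs_buf		*bp;
 *	DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
 *	int			error;
 *
 *	error = xfs_buf_read_map(mp->m_ddev_targp, &map, 1, 0, &bp,
 *				 &xfs_sb_buf_ops, __this_address);
 *	if (error)
 *		return error;	// buffer was staled and released on error
 *	// ... use bp->b_addr ...
 *	xfs_buf_relse(bp);	// unlock and drop the reference
 */
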
/*
 * If we are not low on memory then do the readahead in a deadlock
 * safe manner.
 */
void
xfs_buf_readahead_map(
	struct xfs_buftarg	*target,
	struct xfs_buf_map	*map,
	int			nmaps,
	const struct xfs_buf_ops *ops)
{
	struct xfs_buf		*bp;

	if (bdi_read_congested(target->bt_bdev->bd_bdi))
		return;

	xfs_buf_read_map(target, map, nmaps,
		     XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops,
		     __this_address);
}

/*
 * Read an uncached buffer from disk. Allocates and returns a locked
 * buffer containing the disk contents or nothing.
 */
int
xfs_buf_read_uncached(
	struct xfs_buftarg	*target,
	xfs_daddr_t		daddr,
	size_t			numblks,
	int			flags,
	struct xfs_buf		**bpp,
	const struct xfs_buf_ops *ops)
{
	struct xfs_buf		*bp;
	int			error;

	*bpp = NULL;

	error = xfs_buf_get_uncached(target, numblks, flags, &bp);
	if (error)
		return error;

	/* set up the buffer for a read IO */
	ASSERT(bp->b_map_count == 1);
	bp->b_bn = XFS_BUF_DADDR_NULL;	/* always null for uncached buffers */
	bp->b_maps[0].bm_bn = daddr;
	bp->b_flags |= XBF_READ;
	bp->b_ops = ops;

	xfs_buf_submit(bp);
	if (bp->b_error) {
		error = bp->b_error;
		xfs_buf_relse(bp);
		return error;
	}

	*bpp = bp;
	return 0;
}

int
xfs_buf_get_uncached(
	struct xfs_buftarg	*target,
	size_t			numblks,
	int			flags,
	struct xfs_buf		**bpp)
{
	unsigned long		page_count;
	int			error, i;
	struct xfs_buf		*bp;
	DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);

	*bpp = NULL;

	/* flags might contain irrelevant bits, pass only what we care about */
	error = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT, &bp);
	if (error)
		goto fail;

	page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT;
	error = _xfs_buf_get_pages(bp, page_count);
	if (error)
		goto fail_free_buf;

	for (i = 0; i < page_count; i++) {
		bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
		if (!bp->b_pages[i]) {
			error = -ENOMEM;
			goto fail_free_mem;
		}
	}
	bp->b_flags |= _XBF_PAGES;

	error = _xfs_buf_map_pages(bp, 0);
	if (unlikely(error)) {
		xfs_warn(target->bt_mount,
			"%s: failed to map pages", __func__);
		goto fail_free_mem;
	}

	trace_xfs_buf_get_uncached(bp, _RET_IP_);
	*bpp = bp;
	return 0;

 fail_free_mem:
	while (--i >= 0)
		__free_page(bp->b_pages[i]);
	_xfs_buf_free_pages(bp);
 fail_free_buf:
	xfs_buf_free_maps(bp);
	kmem_cache_free(xfs_buf_zone, bp);
 fail:
	return error;
}

/*
 * Increment reference count on buffer, to hold the buffer concurrently
 * with another thread which may release (free) the buffer asynchronously.
 * Must hold the buffer already to call this function.
 */
void
xfs_buf_hold(
	xfs_buf_t	*bp)
{
	trace_xfs_buf_hold(bp, _RET_IP_);
	atomic_inc(&bp->b_hold);
}

/*
 * Release a hold on the specified buffer. If the hold count is 1, the buffer is
 * placed on LRU or freed (depending on b_lru_ref).
 */
void
xfs_buf_rele(
	xfs_buf_t		*bp)
{
	struct xfs_perag	*pag = bp->b_pag;
	bool			release;
	bool			freebuf = false;

	trace_xfs_buf_rele(bp, _RET_IP_);

	if (!pag) {
		ASSERT(list_empty(&bp->b_lru));
		if (atomic_dec_and_test(&bp->b_hold)) {
			xfs_buf_ioacct_dec(bp);
			xfs_buf_free(bp);
		}
		return;
	}

	ASSERT(atomic_read(&bp->b_hold) > 0);

	/*
	 * We grab the b_lock here first to serialise racing xfs_buf_rele()
	 * calls. The pag_buf_lock being taken on the last reference only
	 * serialises against racing lookups in xfs_buf_find(). IOWs, the second
	 * to last reference we drop here is not serialised against the last
	 * reference until we take bp->b_lock. Hence if we don't grab b_lock
	 * first, the last "release" reference can win the race to the lock and
	 * free the buffer before the second-to-last reference is processed,
	 * leading to a use-after-free scenario.
	 */
	spin_lock(&bp->b_lock);
	release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
	if (!release) {
		/*
		 * Drop the in-flight state if the buffer is already on the LRU
		 * and it holds the only reference. This is racy because we
		 * haven't acquired the pag lock, but the use of
		 * XFS_BSTATE_IN_FLIGHT ensures the decrement occurs only once
		 * per-buf.
		 */
		if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru))
			__xfs_buf_ioacct_dec(bp);
		goto out_unlock;
	}

	/* the last reference has been dropped ... */
	__xfs_buf_ioacct_dec(bp);
	if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
		/*
		 * If the buffer is added to the LRU take a new reference to the
		 * buffer for the LRU and clear the (now stale) dispose list
		 * state flag
		 */
		if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
			bp->b_state &= ~XFS_BSTATE_DISPOSE;
			atomic_inc(&bp->b_hold);
		}
		spin_unlock(&pag->pag_buf_lock);
	} else {
		/*
		 * most of the time buffers will already be removed from the
		 * LRU, so optimise that case by checking for the
		 * XFS_BSTATE_DISPOSE flag indicating the last list the buffer
		 * was on was the disposal list
		 */
		if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
			list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
		} else {
			ASSERT(list_empty(&bp->b_lru));
		}

		ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
		rhashtable_remove_fast(&pag->pag_buf_hash, &bp->b_rhash_head,
				       xfs_buf_hash_params);
		spin_unlock(&pag->pag_buf_lock);
		xfs_perag_put(pag);
		freebuf = true;
	}

out_unlock:
	spin_unlock(&bp->b_lock);

	if (freebuf)
		xfs_buf_free(bp);
}


/*
 * Lock a buffer object, if it is not already locked.
 *
 * If we come across a stale, pinned, locked buffer, we know that we are
 * being asked to lock a buffer that has been reallocated. Because it is
 * pinned, we know that the log has not been pushed to disk and hence it
 * will still be locked. Rather than continuing to have trylock attempts
 * fail until someone else pushes the log, push it ourselves before
 * returning. This means that the xfsaild will not get stuck trying
 * to push on stale inode buffers.
 */
int
xfs_buf_trylock(
	struct xfs_buf		*bp)
{
	int			locked;

	locked = down_trylock(&bp->b_sema) == 0;
	if (locked)
		trace_xfs_buf_trylock(bp, _RET_IP_);
	else
		trace_xfs_buf_trylock_fail(bp, _RET_IP_);
	return locked;
}

/*
 * Lock a buffer object.
 *
 * If we come across a stale, pinned, locked buffer, we know that we
 * are being asked to lock a buffer that has been reallocated. Because
 * it is pinned, we know that the log has not been pushed to disk and
 * hence it will still be locked. Rather than sleeping until someone
 * else pushes the log, push it ourselves before trying to get the lock.
 */
void
xfs_buf_lock(
	struct xfs_buf		*bp)
{
	trace_xfs_buf_lock(bp, _RET_IP_);

	if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
		xfs_log_force(bp->b_mount, 0);
	down(&bp->b_sema);

	trace_xfs_buf_lock_done(bp, _RET_IP_);
}

void
xfs_buf_unlock(
	struct xfs_buf		*bp)
{
	ASSERT(xfs_buf_islocked(bp));

	up(&bp->b_sema);
	trace_xfs_buf_unlock(bp, _RET_IP_);
}

STATIC void
xfs_buf_wait_unpin(
	xfs_buf_t		*bp)
{
	DECLARE_WAITQUEUE	(wait, current);

	if (atomic_read(&bp->b_pin_count) == 0)
		return;

	add_wait_queue(&bp->b_waiters, &wait);
	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (atomic_read(&bp->b_pin_count) == 0)
			break;
		io_schedule();
	}
	remove_wait_queue(&bp->b_waiters, &wait);
	set_current_state(TASK_RUNNING);
}

/*
 * Buffer Utility Routines
 */

void
xfs_buf_ioend(
	struct xfs_buf	*bp)
{
	bool		read = bp->b_flags & XBF_READ;

	trace_xfs_buf_iodone(bp, _RET_IP_);

	bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);

	/*
	 * Pull in IO completion errors now. We are guaranteed to be running
	 * single threaded, so we don't need the lock to read b_io_error.
	 */
	if (!bp->b_error && bp->b_io_error)
		xfs_buf_ioerror(bp, bp->b_io_error);

	if (read) {
		if (!bp->b_error && bp->b_ops)
			bp->b_ops->verify_read(bp);
		if (!bp->b_error)
			bp->b_flags |= XBF_DONE;
		xfs_buf_ioend_finish(bp);
		return;
	}

	if (!bp->b_error) {
		bp->b_flags &= ~XBF_WRITE_FAIL;
		bp->b_flags |= XBF_DONE;
	}

	/*
	 * If this is a log recovery buffer, we aren't doing transactional IO
	 * yet so we need to let it handle IO completions.
	 */
	if (bp->b_flags & _XBF_LOGRECOVERY) {
		xlog_recover_iodone(bp);
		return;
	}

	if (bp->b_flags & _XBF_INODES) {
		xfs_buf_inode_iodone(bp);
		return;
	}

	if (bp->b_flags & _XBF_DQUOTS) {
		xfs_buf_dquot_iodone(bp);
		return;
	}
	xfs_buf_iodone(bp);
}

static void
xfs_buf_ioend_work(
	struct work_struct	*work)
{
	struct xfs_buf		*bp =
		container_of(work, xfs_buf_t, b_ioend_work);

	xfs_buf_ioend(bp);
}

static void
xfs_buf_ioend_async(
	struct xfs_buf	*bp)
{
	INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
	queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work);
}

void
__xfs_buf_ioerror(
	xfs_buf_t		*bp,
	int			error,
	xfs_failaddr_t		failaddr)
{
	ASSERT(error <= 0 && error >= -1000);
	bp->b_error = error;
	trace_xfs_buf_ioerror(bp, error, failaddr);
}

void
xfs_buf_ioerror_alert(
	struct xfs_buf		*bp,
	xfs_failaddr_t		func)
{
	xfs_buf_alert_ratelimited(bp, "XFS: metadata IO error",
		"metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d",
				  func, (uint64_t)XFS_BUF_ADDR(bp),
				  bp->b_length, -bp->b_error);
}

/*
 * To simulate an I/O failure, the buffer must be locked and held with at least
 * three references. The LRU reference is dropped by the stale call. The buf
 * item reference is dropped via ioend processing. The third reference is owned
 * by the caller and is dropped on I/O completion if the buffer is XBF_ASYNC.
 */
void
xfs_buf_ioend_fail(
	struct xfs_buf	*bp)
{
	bp->b_flags &= ~XBF_DONE;
	xfs_buf_stale(bp);
	xfs_buf_ioerror(bp, -EIO);
	xfs_buf_ioend(bp);
}

int
xfs_bwrite(
	struct xfs_buf		*bp)
{
	int			error;

	ASSERT(xfs_buf_islocked(bp));

	bp->b_flags |= XBF_WRITE;
	bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q |
			 XBF_DONE);

	error = xfs_buf_submit(bp);
	if (error)
		xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
	return error;
}

static void
xfs_buf_bio_end_io(
	struct bio		*bio)
{
	struct xfs_buf		*bp = (struct xfs_buf *)bio->bi_private;

	if (!bio->bi_status &&
	    (bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) &&
	    XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR))
		bio->bi_status = BLK_STS_IOERR;

	/*
	 * don't overwrite existing errors - otherwise we can lose errors on
	 * buffers that require multiple bios to complete.
	 */
	if (bio->bi_status) {
		int error = blk_status_to_errno(bio->bi_status);

		cmpxchg(&bp->b_io_error, 0, error);
	}

	if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
		invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));

	if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
		xfs_buf_ioend_async(bp);
	bio_put(bio);
}

static void
xfs_buf_ioapply_map(
	struct xfs_buf	*bp,
	int		map,
	int		*buf_offset,
	int		*count,
	int		op)
{
	int		page_index;
	int		total_nr_pages = bp->b_page_count;
	int		nr_pages;
	struct bio	*bio;
	sector_t	sector = bp->b_maps[map].bm_bn;
	int		size;
	int		offset;

	/* skip the pages in the buffer before the start offset */
	page_index = 0;
	offset = *buf_offset;
	while (offset >= PAGE_SIZE) {
		page_index++;
		offset -= PAGE_SIZE;
	}

	/*
	 * Limit the IO size to the length of the current vector, and update the
	 * remaining IO count for the next time around.
	 */
	size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count);
	*count -= size;
	*buf_offset += size;

next_chunk:
	atomic_inc(&bp->b_io_remaining);
	nr_pages = min(total_nr_pages, BIO_MAX_PAGES);

	bio = bio_alloc(GFP_NOIO, nr_pages);
	bio_set_dev(bio, bp->b_target->bt_bdev);
	bio->bi_iter.bi_sector = sector;
	bio->bi_end_io = xfs_buf_bio_end_io;
	bio->bi_private = bp;
	bio->bi_opf = op;

	for (; size && nr_pages; nr_pages--, page_index++) {
		int	rbytes, nbytes = PAGE_SIZE - offset;

		if (nbytes > size)
			nbytes = size;

		rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes,
				      offset);
		if (rbytes < nbytes)
			break;

		offset = 0;
		sector += BTOBB(nbytes);
		size -= nbytes;
		total_nr_pages--;
	}

	if (likely(bio->bi_iter.bi_size)) {
		if (xfs_buf_is_vmapped(bp)) {
			flush_kernel_vmap_range(bp->b_addr,
						xfs_buf_vmap_len(bp));
		}
		submit_bio(bio);
		if (size)
			goto next_chunk;
	} else {
		/*
		 * This is guaranteed not to be the last io reference count
		 * because the caller (xfs_buf_submit) holds a count itself.
		 */
		atomic_dec(&bp->b_io_remaining);
		xfs_buf_ioerror(bp, -EIO);
		bio_put(bio);
	}

}

STATIC void
_xfs_buf_ioapply(
	struct xfs_buf	*bp)
{
	struct blk_plug	plug;
	int		op;
	int		offset;
	int		size;
	int		i;

	/*
	 * Make sure we capture only current IO errors rather than stale errors
	 * left over from previous use of the buffer (e.g. failed readahead).
	 */
	bp->b_error = 0;

	if (bp->b_flags & XBF_WRITE) {
		op = REQ_OP_WRITE;

		/*
		 * Run the write verifier callback function if it exists. If
		 * this function fails it will mark the buffer with an error and
		 * the IO should not be dispatched.
		 */
		if (bp->b_ops) {
			bp->b_ops->verify_write(bp);
			if (bp->b_error) {
				xfs_force_shutdown(bp->b_mount,
						   SHUTDOWN_CORRUPT_INCORE);
				return;
			}
		} else if (bp->b_bn != XFS_BUF_DADDR_NULL) {
			struct xfs_mount *mp = bp->b_mount;

			/*
			 * non-crc filesystems don't attach verifiers during
			 * log recovery, so don't warn for such filesystems.
			 */
			if (xfs_sb_version_hascrc(&mp->m_sb)) {
				xfs_warn(mp,
					"%s: no buf ops on daddr 0x%llx len %d",
					__func__, bp->b_bn, bp->b_length);
				xfs_hex_dump(bp->b_addr,
						XFS_CORRUPTION_DUMP_LEN);
				dump_stack();
			}
		}
	} else {
		op = REQ_OP_READ;
		if (bp->b_flags & XBF_READ_AHEAD)
			op |= REQ_RAHEAD;
	}

	/* we only use the buffer cache for meta-data */
	op |= REQ_META;

	/*
	 * Walk all the vectors issuing IO on them. Set up the initial offset
	 * into the buffer and the desired IO size before we start -
	 * xfs_buf_ioapply_map() will modify them appropriately for each
	 * subsequent call.
	 */
	offset = bp->b_offset;
	size = BBTOB(bp->b_length);
	blk_start_plug(&plug);
	for (i = 0; i < bp->b_map_count; i++) {
		xfs_buf_ioapply_map(bp, i, &offset, &size, op);
		if (bp->b_error)
			break;
		if (size <= 0)
			break;	/* all done */
	}
	blk_finish_plug(&plug);
}

/*
 * Wait for I/O completion of a sync buffer and return the I/O error code.
 */
static int
xfs_buf_iowait(
	struct xfs_buf	*bp)
{
	ASSERT(!(bp->b_flags & XBF_ASYNC));

	trace_xfs_buf_iowait(bp, _RET_IP_);
	wait_for_completion(&bp->b_iowait);
	trace_xfs_buf_iowait_done(bp, _RET_IP_);

	return bp->b_error;
}

/*
 * Buffer I/O submission path, read or write. Asynchronous submission transfers
 * the buffer lock ownership and the current reference to the IO. It is not
 * safe to reference the buffer after a call to this function unless the caller
 * holds an additional reference itself.
 */
int
__xfs_buf_submit(
	struct xfs_buf	*bp,
	bool		wait)
{
	int		error = 0;

	trace_xfs_buf_submit(bp, _RET_IP_);

	ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));

	/* on shutdown we stale and complete the buffer immediately */
	if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
		xfs_buf_ioend_fail(bp);
		return -EIO;
	}

	/*
	 * Grab a reference so the buffer does not go away underneath us. For
	 * async buffers, I/O completion drops the callers reference, which
	 * could occur before submission returns.
	 */
	xfs_buf_hold(bp);

	if (bp->b_flags & XBF_WRITE)
		xfs_buf_wait_unpin(bp);

	/* clear the internal error state to avoid spurious errors */
	bp->b_io_error = 0;

	/*
	 * Set the count to 1 initially, this will stop an I/O completion
	 * callout which happens before we have started all the I/O from calling
	 * xfs_buf_ioend too early.
	 */
	atomic_set(&bp->b_io_remaining, 1);
	if (bp->b_flags & XBF_ASYNC)
		xfs_buf_ioacct_inc(bp);
	_xfs_buf_ioapply(bp);

	/*
	 * If _xfs_buf_ioapply failed, we can get back here with only the IO
	 * reference we took above. If we drop it to zero, run completion so
	 * that we don't return to the caller with completion still pending.
	 */
	if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
		if (bp->b_error || !(bp->b_flags & XBF_ASYNC))
			xfs_buf_ioend(bp);
		else
			xfs_buf_ioend_async(bp);
	}

	if (wait)
		error = xfs_buf_iowait(bp);

	/*
	 * Release the hold that keeps the buffer referenced for the entire
	 * I/O. Note that if the buffer is async, it is not safe to reference
	 * after this release.
	 */
	xfs_buf_rele(bp);
	return error;
}

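/*
 * Illustrative sketch (not part of the original file): the @wait flag is what
 * separates the synchronous and asynchronous submission paths. The sketch
 * below assumes the xfs_buf_submit() wrapper in xfs_buf.h derives @wait from
 * the XBF_ASYNC flag; the exact wrapper definition is not shown in this file.
 *
 *	// synchronous: caller keeps its reference and the buffer lock
 *	bp->b_flags &= ~XBF_ASYNC;
 *	error = __xfs_buf_submit(bp, true);	// returns xfs_buf_iowait() result
 *
 *	// asynchronous: lock and reference are handed over to I/O completion
 *	bp->b_flags |= XBF_ASYNC;
 *	__xfs_buf_submit(bp, false);
 *	// bp must not be touched here without an extra xfs_buf_hold()
 */
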
void *
xfs_buf_offset(
	struct xfs_buf		*bp,
	size_t			offset)
{
	struct page		*page;

	if (bp->b_addr)
		return bp->b_addr + offset;

	offset += bp->b_offset;
	page = bp->b_pages[offset >> PAGE_SHIFT];
	return page_address(page) + (offset & (PAGE_SIZE-1));
}

void
xfs_buf_zero(
	struct xfs_buf		*bp,
	size_t			boff,
	size_t			bsize)
{
	size_t			bend;

	bend = boff + bsize;
	while (boff < bend) {
		struct page	*page;
		int		page_index, page_offset, csize;

		page_index = (boff + bp->b_offset) >> PAGE_SHIFT;
		page_offset = (boff + bp->b_offset) & ~PAGE_MASK;
		page = bp->b_pages[page_index];
		csize = min_t(size_t, PAGE_SIZE - page_offset,
				      BBTOB(bp->b_length) - boff);

		ASSERT((csize + page_offset) <= PAGE_SIZE);

		memset(page_address(page) + page_offset, 0, csize);

		boff += csize;
	}
}

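/*
 * Worked example (illustrative, assuming a 4096-byte PAGE_SIZE and a b_offset
 * of 512): zeroing from boff = 5000 gives page_index = (5000 + 512) >> 12 = 1
 * and page_offset = 5512 & 4095 = 1416, so the first iteration clears at most
 * 4096 - 1416 = 2680 bytes of page 1 before boff advances and the loop moves
 * on to the next page.
 */
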
/*
 * Log a message about and stale a buffer that a caller has decided is corrupt.
 *
 * This function should be called for the kinds of metadata corruption that
 * cannot be detected by a verifier, such as incorrect inter-block relationship
 * data. Do /not/ call this function from a verifier function.
 *
 * The buffer must be XBF_DONE prior to the call. Afterwards, the buffer will
 * be marked stale, but b_error will not be set. The caller is responsible for
 * releasing the buffer or fixing it.
 */
void
__xfs_buf_mark_corrupt(
	struct xfs_buf		*bp,
	xfs_failaddr_t		fa)
{
	ASSERT(bp->b_flags & XBF_DONE);

	xfs_buf_corruption_error(bp, fa);
	xfs_buf_stale(bp);
}

/*
 * Handling of buffer targets (buftargs).
 */

/*
 * Wait for any bufs with callbacks that have been submitted but have not yet
 * returned. These buffers will have an elevated hold count, so wait on those
 * while freeing all the buffers only held by the LRU.
 */
static enum lru_status
xfs_buftarg_wait_rele(
	struct list_head	*item,
	struct list_lru_one	*lru,
	spinlock_t		*lru_lock,
	void			*arg)

{
	struct xfs_buf		*bp = container_of(item, struct xfs_buf, b_lru);
	struct list_head	*dispose = arg;

	if (atomic_read(&bp->b_hold) > 1) {
		/* need to wait, so skip it this pass */
		trace_xfs_buf_wait_buftarg(bp, _RET_IP_);
		return LRU_SKIP;
	}
	if (!spin_trylock(&bp->b_lock))
		return LRU_SKIP;

	/*
	 * clear the LRU reference count so the buffer doesn't get
	 * ignored in xfs_buf_rele().
	 */
	atomic_set(&bp->b_lru_ref, 0);
	bp->b_state |= XFS_BSTATE_DISPOSE;
	list_lru_isolate_move(lru, item, dispose);
	spin_unlock(&bp->b_lock);
	return LRU_REMOVED;
}

void
xfs_wait_buftarg(
	struct xfs_buftarg	*btp)
{
	LIST_HEAD(dispose);
	int			loop = 0;
	bool			write_fail = false;

	/*
	 * First wait on the buftarg I/O count for all in-flight buffers to be
	 * released. This is critical as new buffers do not make the LRU until
	 * they are released.
	 *
	 * Next, flush the buffer workqueue to ensure all completion processing
	 * has finished. Just waiting on buffer locks is not sufficient for
	 * async IO as the reference count held over IO is not released until
	 * after the buffer lock is dropped. Hence we need to ensure here that
	 * all reference counts have been dropped before we start walking the
	 * LRU list.
	 */
	while (percpu_counter_sum(&btp->bt_io_count))
		delay(100);
	flush_workqueue(btp->bt_mount->m_buf_workqueue);

	/* loop until there is nothing left on the lru list. */
	while (list_lru_count(&btp->bt_lru)) {
		list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele,
			      &dispose, LONG_MAX);

		while (!list_empty(&dispose)) {
			struct xfs_buf *bp;
			bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
			list_del_init(&bp->b_lru);
			if (bp->b_flags & XBF_WRITE_FAIL) {
				write_fail = true;
				xfs_buf_alert_ratelimited(bp,
					"XFS: Corruption Alert",
"Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!",
					(long long)bp->b_bn);
			}
			xfs_buf_rele(bp);
		}
		if (loop++ != 0)
			delay(100);
	}

	/*
	 * If one or more failed buffers were freed, that means dirty metadata
	 * was thrown away. This should only ever happen after I/O completion
	 * handling has elevated I/O error(s) to permanent failures and shuts
	 * down the fs.
	 */
	if (write_fail) {
		ASSERT(XFS_FORCED_SHUTDOWN(btp->bt_mount));
		xfs_alert(btp->bt_mount,
	"Please run xfs_repair to determine the extent of the problem.");
	}
}

static enum lru_status
xfs_buftarg_isolate(
	struct list_head	*item,
	struct list_lru_one	*lru,
	spinlock_t		*lru_lock,
	void			*arg)
{
	struct xfs_buf		*bp = container_of(item, struct xfs_buf, b_lru);
	struct list_head	*dispose = arg;

	/*
	 * we are inverting the lru lock/bp->b_lock here, so use a trylock.
	 * If we fail to get the lock, just skip it.
	 */
	if (!spin_trylock(&bp->b_lock))
		return LRU_SKIP;
	/*
	 * Decrement the b_lru_ref count unless the value is already
	 * zero. If the value is already zero, we need to reclaim the
	 * buffer, otherwise it gets another trip through the LRU.
	 */
	if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
		spin_unlock(&bp->b_lock);
		return LRU_ROTATE;
	}

	bp->b_state |= XFS_BSTATE_DISPOSE;
	list_lru_isolate_move(lru, item, dispose);
	spin_unlock(&bp->b_lock);
	return LRU_REMOVED;
}

static unsigned long
xfs_buftarg_shrink_scan(
	struct shrinker		*shrink,
	struct shrink_control	*sc)
{
	struct xfs_buftarg	*btp = container_of(shrink,
					struct xfs_buftarg, bt_shrinker);
	LIST_HEAD(dispose);
	unsigned long		freed;

	freed = list_lru_shrink_walk(&btp->bt_lru, sc,
				     xfs_buftarg_isolate, &dispose);

	while (!list_empty(&dispose)) {
		struct xfs_buf *bp;
		bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
		list_del_init(&bp->b_lru);
		xfs_buf_rele(bp);
	}

	return freed;
}

static unsigned long
xfs_buftarg_shrink_count(
	struct shrinker		*shrink,
	struct shrink_control	*sc)
{
	struct xfs_buftarg	*btp = container_of(shrink,
					struct xfs_buftarg, bt_shrinker);
	return list_lru_shrink_count(&btp->bt_lru, sc);
}

1da177e4
LT
1802void
1803xfs_free_buftarg(
b7963133 1804 struct xfs_buftarg *btp)
1da177e4 1805{
ff57ab21 1806 unregister_shrinker(&btp->bt_shrinker);
9c7504aa
BF
1807 ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
1808 percpu_counter_destroy(&btp->bt_io_count);
f5e1dd34 1809 list_lru_destroy(&btp->bt_lru);
ff57ab21 1810
2291dab2 1811 xfs_blkdev_issue_flush(btp);
a6867a68 1812
f0e2d93c 1813 kmem_free(btp);
1da177e4
LT
1814}
1815
3fefdeee
ES
1816int
1817xfs_setsize_buftarg(
1da177e4 1818 xfs_buftarg_t *btp,
3fefdeee 1819 unsigned int sectorsize)
1da177e4 1820{
7c71ee78 1821 /* Set up metadata sector size info */
6da54179
ES
1822 btp->bt_meta_sectorsize = sectorsize;
1823 btp->bt_meta_sectormask = sectorsize - 1;
1da177e4 1824
ce8e922c 1825 if (set_blocksize(btp->bt_bdev, sectorsize)) {
4f10700a 1826 xfs_warn(btp->bt_mount,
a1c6f057
DM
1827 "Cannot set_blocksize to %u on device %pg",
1828 sectorsize, btp->bt_bdev);
2451337d 1829 return -EINVAL;
1da177e4
LT
1830 }
1831
7c71ee78
ES
1832 /* Set up device logical sector size mask */
1833 btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev);
1834 btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1;
1835
1da177e4
LT
1836 return 0;
1837}
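
/*
 * Illustrative sketch (not part of this file): how the sector masks set up
 * above are typically consumed.  A caller validating a direct I/O request
 * can reject anything that is not aligned to the device's logical sector
 * size with a single mask test.  The helper name below is made up for the
 * example.
 */
static inline bool
example_dio_aligned(
	struct xfs_buftarg	*btp,
	loff_t			pos,
	size_t			count)
{
	/* any bits set below the logical sector size mean misalignment */
	return ((pos | count) & btp->bt_logical_sectormask) == 0;
}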
1838
1839/*
3fefdeee
ES
1840 * When allocating the initial buffer target we have not yet
1841 * read in the superblock, so we don't know what sector size is
1842 * in use at this early stage. Play it safe.
ce8e922c 1843 */
1da177e4
LT
1844STATIC int
1845xfs_setsize_buftarg_early(
1846 xfs_buftarg_t *btp,
1847 struct block_device *bdev)
1848{
a96c4151 1849 return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev));
1da177e4
LT
1850}
1851
1da177e4
LT
1852xfs_buftarg_t *
1853xfs_alloc_buftarg(
ebad861b 1854 struct xfs_mount *mp,
486aff5e
DW
1855 struct block_device *bdev,
1856 struct dax_device *dax_dev)
1da177e4
LT
1857{
1858 xfs_buftarg_t *btp;
1859
707e0dda 1860 btp = kmem_zalloc(sizeof(*btp), KM_NOFS);
1da177e4 1861
ebad861b 1862 btp->bt_mount = mp;
ce8e922c
NS
1863 btp->bt_dev = bdev->bd_dev;
1864 btp->bt_bdev = bdev;
486aff5e 1865 btp->bt_daxdev = dax_dev;
0e6e847f 1866
f9bccfcc
BF
1867 /*
1868 * Buffer IO error rate limiting. Limit it to no more than 10 messages
1869 * per 30 seconds so that we do not spam the logs on repeated errors.
1870 */
1871 ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
1872 DEFAULT_RATELIMIT_BURST);
1873
1da177e4 1874 if (xfs_setsize_buftarg_early(btp, bdev))
d210a987 1875 goto error_free;
5ca302c8
GC
1876
1877 if (list_lru_init(&btp->bt_lru))
d210a987 1878 goto error_free;
5ca302c8 1879
9c7504aa 1880 if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
d210a987 1881 goto error_lru;
9c7504aa 1882
e80dfa19
DC
1883 btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
1884 btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
ff57ab21 1885 btp->bt_shrinker.seeks = DEFAULT_SEEKS;
e80dfa19 1886 btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE;
d210a987
MH
1887 if (register_shrinker(&btp->bt_shrinker))
1888 goto error_pcpu;
1da177e4
LT
1889 return btp;
1890
d210a987
MH
1891error_pcpu:
1892 percpu_counter_destroy(&btp->bt_io_count);
1893error_lru:
1894 list_lru_destroy(&btp->bt_lru);
1895error_free:
f0e2d93c 1896 kmem_free(btp);
1da177e4
LT
1897 return NULL;
1898}
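
/*
 * Illustrative sketch (not part of this file): pairing xfs_alloc_buftarg()
 * with xfs_free_buftarg() on a mount-time error path.  The surrounding
 * function is hypothetical; the real device setup lives in the superblock
 * mount code.  The point is simply that each successfully allocated target
 * is torn down again if a later allocation fails.
 */
static int
example_setup_buftargs(
	struct xfs_mount	*mp,
	struct block_device	*ddev,
	struct dax_device	*dax_ddev,
	struct block_device	*logdev,
	struct dax_device	*dax_logdev)
{
	mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev, dax_ddev);
	if (!mp->m_ddev_targp)
		return -ENOMEM;

	mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev, dax_logdev);
	if (!mp->m_logdev_targp) {
		/* unwind the target we already allocated */
		xfs_free_buftarg(mp->m_ddev_targp);
		return -ENOMEM;
	}

	return 0;
}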
1899
20e8a063
BF
1900/*
1901 * Cancel a delayed write list.
1902 *
1903 * Remove each buffer from the list, clear the delwri queue flag and drop the
1904 * associated buffer reference.
1905 */
1906void
1907xfs_buf_delwri_cancel(
1908 struct list_head *list)
1909{
1910 struct xfs_buf *bp;
1911
1912 while (!list_empty(list)) {
1913 bp = list_first_entry(list, struct xfs_buf, b_list);
1914
1915 xfs_buf_lock(bp);
1916 bp->b_flags &= ~_XBF_DELWRI_Q;
1917 list_del_init(&bp->b_list);
1918 xfs_buf_relse(bp);
1919 }
1920}
1921
1da177e4 1922/*
43ff2122
CH
1923 * Add a buffer to the delayed write list.
1924 *
1925 * This queues a buffer for writeout if it hasn't already been queued. Note that
1926 * neither this routine nor the buffer list submission functions perform
1927 * any internal synchronization. It is expected that the lists are thread-local
1928 * to the callers.
1929 *
1930 * Returns true if we queued up the buffer, or false if it was already
1931 * on the buffer list.
1da177e4 1932 */
43ff2122 1933bool
ce8e922c 1934xfs_buf_delwri_queue(
43ff2122
CH
1935 struct xfs_buf *bp,
1936 struct list_head *list)
1da177e4 1937{
43ff2122 1938 ASSERT(xfs_buf_islocked(bp));
5a8ee6ba 1939 ASSERT(!(bp->b_flags & XBF_READ));
1da177e4 1940
43ff2122
CH
1941 /*
1942 * If the buffer is already marked delwri it has already been queued up
1943 * by someone else for immediate writeout. Just ignore it in that
1944 * case.
1945 */
1946 if (bp->b_flags & _XBF_DELWRI_Q) {
1947 trace_xfs_buf_delwri_queued(bp, _RET_IP_);
1948 return false;
1da177e4 1949 }
1da177e4 1950
43ff2122 1951 trace_xfs_buf_delwri_queue(bp, _RET_IP_);
d808f617
DC
1952
1953 /*
43ff2122
CH
1954 * If a buffer gets written out synchronously or marked stale while it
1955 * is on a delwri list we lazily remove it. To do this, the other party
1956 * clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone.
1957 * It remains referenced and on the list. In a rare corner case it
1958 * might get re-added to a delwri list after the synchronous writeout, in
1959 * which case we just need to re-add the flag here.
d808f617 1960 */
43ff2122
CH
1961 bp->b_flags |= _XBF_DELWRI_Q;
1962 if (list_empty(&bp->b_list)) {
1963 atomic_inc(&bp->b_hold);
1964 list_add_tail(&bp->b_list, list);
585e6d88 1965 }
585e6d88 1966
43ff2122 1967 return true;
585e6d88
DC
1968}
1969
089716aa
DC
1970/*
1971 * The compare function is more complex than it needs to be because
1972 * the return value is only 32 bits and we are doing comparisons
1973 * on 64-bit values.
1974 */
1975static int
1976xfs_buf_cmp(
1977 void *priv,
1978 struct list_head *a,
1979 struct list_head *b)
1980{
1981 struct xfs_buf *ap = container_of(a, struct xfs_buf, b_list);
1982 struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list);
1983 xfs_daddr_t diff;
1984
f4b42421 1985 diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn;
089716aa
DC
1986 if (diff < 0)
1987 return -1;
1988 if (diff > 0)
1989 return 1;
1990 return 0;
1991}
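
/*
 * Illustrative sketch (not part of this file): the naive comparator that the
 * comment above warns against.  Truncating the 64-bit xfs_daddr_t difference
 * into the 32-bit return value can flip its sign (or make it zero) for
 * widely separated block numbers, which would break the list_sort() order.
 */
static int
example_buggy_buf_cmp(
	void			*priv,
	struct list_head	*a,
	struct list_head	*b)
{
	struct xfs_buf	*ap = container_of(a, struct xfs_buf, b_list);
	struct xfs_buf	*bp = container_of(b, struct xfs_buf, b_list);

	/* WRONG: a difference of exactly 2^32 truncates to 0 ("equal") */
	return (int)(ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn);
}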
1992
26f1fe85 1993/*
e339dd8d
BF
1994 * Submit buffers for write. If wait_list is specified, the buffers are
1995 * submitted using sync I/O and placed on the wait list such that the caller can
1996 * iowait each buffer. Otherwise async I/O is used and the buffers are released
1997 * at I/O completion time. In either case, buffers remain locked until I/O
1998 * completes and the buffer is released from the queue.
26f1fe85 1999 */
43ff2122 2000static int
26f1fe85 2001xfs_buf_delwri_submit_buffers(
43ff2122 2002 struct list_head *buffer_list,
26f1fe85 2003 struct list_head *wait_list)
1da177e4 2004{
43ff2122
CH
2005 struct xfs_buf *bp, *n;
2006 int pinned = 0;
26f1fe85 2007 struct blk_plug plug;
43ff2122 2008
26f1fe85 2009 list_sort(NULL, buffer_list, xfs_buf_cmp);
43ff2122 2010
26f1fe85 2011 blk_start_plug(&plug);
43ff2122 2012 list_for_each_entry_safe(bp, n, buffer_list, b_list) {
26f1fe85 2013 if (!wait_list) {
43ff2122
CH
2014 if (xfs_buf_ispinned(bp)) {
2015 pinned++;
2016 continue;
2017 }
2018 if (!xfs_buf_trylock(bp))
2019 continue;
2020 } else {
2021 xfs_buf_lock(bp);
2022 }
978c7b2f 2023
43ff2122
CH
2024 /*
2025 * Someone else might have written the buffer synchronously or
2026 * marked it stale in the meantime. In that case only the
2027 * _XBF_DELWRI_Q flag got cleared, and we have to drop the
2028 * reference and remove it from the list here.
2029 */
2030 if (!(bp->b_flags & _XBF_DELWRI_Q)) {
2031 list_del_init(&bp->b_list);
2032 xfs_buf_relse(bp);
2033 continue;
2034 }
c9c12971 2035
43ff2122 2036 trace_xfs_buf_delwri_split(bp, _RET_IP_);
a1b7ea5d 2037
cf53e99d 2038 /*
e339dd8d
BF
2039 * If we have a wait list, each buffer (and associated delwri
2040 * queue reference) transfers to it and is submitted
2041 * synchronously. Otherwise, drop the buffer from the delwri
2042 * queue and submit async.
cf53e99d 2043 */
b6983e80 2044 bp->b_flags &= ~_XBF_DELWRI_Q;
e339dd8d 2045 bp->b_flags |= XBF_WRITE;
26f1fe85 2046 if (wait_list) {
e339dd8d 2047 bp->b_flags &= ~XBF_ASYNC;
26f1fe85 2048 list_move_tail(&bp->b_list, wait_list);
e339dd8d
BF
2049 } else {
2050 bp->b_flags |= XBF_ASYNC;
ce8e922c 2051 list_del_init(&bp->b_list);
e339dd8d 2052 }
6af88cda 2053 __xfs_buf_submit(bp, false);
43ff2122
CH
2054 }
2055 blk_finish_plug(&plug);
1da177e4 2056
43ff2122 2057 return pinned;
1da177e4
LT
2058}
2059
2060/*
43ff2122
CH
2061 * Write out a buffer list asynchronously.
2062 *
2063 * This will take the @buffer_list, write all non-locked and non-pinned buffers
2064 * out and not wait for I/O completion on any of the buffers. This interface
2065 * is only safely usable for callers that can track I/O completion by higher
2066 * level means, e.g. AIL pushing as the @buffer_list is consumed in this
2067 * function.
efc3289c
BF
2068 *
2069 * Note: this function will skip buffers it would block on, and in doing so
2070 * leaves them on @buffer_list so they can be retried on a later pass. As such,
2071 * it is up to the caller to ensure that the buffer list is fully submitted or
2072 * cancelled appropriately when they are finished with the list. Failure to
2073 * cancel or resubmit the list until it is empty will result in leaked buffers
2074 * at unmount time.
1da177e4
LT
2075 */
2076int
43ff2122
CH
2077xfs_buf_delwri_submit_nowait(
2078 struct list_head *buffer_list)
1da177e4 2079{
26f1fe85 2080 return xfs_buf_delwri_submit_buffers(buffer_list, NULL);
43ff2122 2081}
1da177e4 2082
43ff2122
CH
2083/*
2084 * Write out a buffer list synchronously.
2085 *
2086 * This will take the @buffer_list, write all buffers out and wait for I/O
2087 * completion on all of the buffers. @buffer_list is consumed by the function,
2088 * so callers must have some other way of tracking buffers if they require such
2089 * functionality.
2090 */
2091int
2092xfs_buf_delwri_submit(
2093 struct list_head *buffer_list)
2094{
26f1fe85 2095 LIST_HEAD (wait_list);
43ff2122
CH
2096 int error = 0, error2;
2097 struct xfs_buf *bp;
1da177e4 2098
26f1fe85 2099 xfs_buf_delwri_submit_buffers(buffer_list, &wait_list);
1da177e4 2100
43ff2122 2101 /* Wait for IO to complete. */
26f1fe85
DC
2102 while (!list_empty(&wait_list)) {
2103 bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
a1b7ea5d 2104
089716aa 2105 list_del_init(&bp->b_list);
cf53e99d 2106
e339dd8d
BF
2107 /*
2108 * Wait on the locked buffer, check for errors and unlock and
2109 * release the delwri queue reference.
2110 */
2111 error2 = xfs_buf_iowait(bp);
43ff2122
CH
2112 xfs_buf_relse(bp);
2113 if (!error)
2114 error = error2;
1da177e4
LT
2115 }
2116
43ff2122 2117 return error;
1da177e4
LT
2118}
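
/*
 * Illustrative sketch (not part of this file): the intended calling pattern
 * for the delwri list API above.  The list head is local to the caller, so
 * the queue/submit calls need no extra synchronization between themselves.
 * The helper name and the source of the buffers are hypothetical.
 */
static int
example_flush_buffers(
	struct xfs_buf		**bufs,
	int			nbufs)
{
	LIST_HEAD		(buffer_list);
	int			i;

	for (i = 0; i < nbufs; i++) {
		struct xfs_buf	*bp = bufs[i];

		xfs_buf_lock(bp);
		/* returns false if bp is already on someone's delwri list */
		xfs_buf_delwri_queue(bp, &buffer_list);
		xfs_buf_unlock(bp);
	}

	/*
	 * Write everything out and wait; buffer_list is consumed.  An error
	 * path that gives up before submitting would instead call
	 * xfs_buf_delwri_cancel(&buffer_list) to avoid leaking references.
	 */
	return xfs_buf_delwri_submit(&buffer_list);
}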
2119
7912e7fe
BF
2120/*
2121 * Push a single buffer on a delwri queue.
2122 *
2123 * The purpose of this function is to submit a single buffer from a delwri queue
2124 * and return with the buffer still on the original queue. The waiting delwri
2125 * buffer submission infrastructure guarantees transfer of the delwri queue
2126 * buffer reference to a temporary wait list. We reuse this infrastructure to
2127 * transfer the buffer back to the original queue.
2128 *
2129 * Note the buffer transitions from the queued state to the submitted and
2130 * wait-listed state and back to the queued state during this call. The buffer
2131 * locking and queue management logic between _delwri_pushbuf() and
2132 * _delwri_queue() guarantee that the buffer cannot be queued to another list
2133 * before returning.
2134 */
2135int
2136xfs_buf_delwri_pushbuf(
2137 struct xfs_buf *bp,
2138 struct list_head *buffer_list)
2139{
2140 LIST_HEAD (submit_list);
2141 int error;
2142
2143 ASSERT(bp->b_flags & _XBF_DELWRI_Q);
2144
2145 trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_);
2146
2147 /*
2148 * Isolate the buffer to a new local list so we can submit it for I/O
2149 * independently from the rest of the original list.
2150 */
2151 xfs_buf_lock(bp);
2152 list_move(&bp->b_list, &submit_list);
2153 xfs_buf_unlock(bp);
2154
2155 /*
2156 * Delwri submission clears the DELWRI_Q buffer flag and returns with
e339dd8d 2157 * the buffer on the wait list with the original reference. Rather than
7912e7fe
BF
2158 * bounce the buffer from a local wait list back to the original list
2159 * after I/O completion, reuse the original list as the wait list.
2160 */
2161 xfs_buf_delwri_submit_buffers(&submit_list, buffer_list);
2162
2163 /*
e339dd8d
BF
2164 * The buffer is now locked, under I/O and wait listed on the original
2165 * delwri queue. Wait for I/O completion, restore the DELWRI_Q flag and
2166 * return with the buffer unlocked and on the original queue.
7912e7fe 2167 */
e339dd8d 2168 error = xfs_buf_iowait(bp);
7912e7fe
BF
2169 bp->b_flags |= _XBF_DELWRI_Q;
2170 xfs_buf_unlock(bp);
2171
2172 return error;
2173}
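
/*
 * Illustrative sketch (not part of this file): pushing one specific buffer
 * ahead of the rest of a caller's delwri queue.  The caller name is
 * hypothetical; the point is that the buffer is written synchronously here
 * but is still on @buffer_list afterwards, so a later full submission of
 * the list will pick it up again unless it is removed first.
 */
static int
example_push_one_buffer(
	struct xfs_buf		*bp,
	struct list_head	*buffer_list)
{
	/* bp must already be queued on buffer_list with _XBF_DELWRI_Q set */
	return xfs_buf_delwri_pushbuf(bp, buffer_list);
}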
2174
04d8b284 2175int __init
ce8e922c 2176xfs_buf_init(void)
1da177e4 2177{
12eba65b
DC
2178 xfs_buf_zone = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0,
2179 SLAB_HWCACHE_ALIGN |
2180 SLAB_RECLAIM_ACCOUNT |
2181 SLAB_MEM_SPREAD,
2182 NULL);
ce8e922c 2183 if (!xfs_buf_zone)
0b1b213f 2184 goto out;
04d8b284 2185
23ea4032 2186 return 0;
1da177e4 2187
0b1b213f 2188 out:
8758280f 2189 return -ENOMEM;
1da177e4
LT
2190}
2191
1da177e4 2192void
ce8e922c 2193xfs_buf_terminate(void)
1da177e4 2194{
aaf54eb8 2195 kmem_cache_destroy(xfs_buf_zone);
1da177e4 2196}
7561d27e
BF
2197
2198void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
2199{
7561d27e
BF
2200 /*
2201 * Set the lru reference count to 0 based on the error injection tag.
2202 * This allows userspace to disrupt buffer caching for debug/testing
2203 * purposes.
2204 */
dbd329f1 2205 if (XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_LRU_REF))
7561d27e
BF
2206 lru_ref = 0;
2207
2208 atomic_set(&bp->b_lru_ref, lru_ref);
2209}
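
/*
 * Illustrative sketch (not part of this file): biasing LRU reclaim with
 * xfs_buf_set_ref().  Each pass of xfs_buftarg_isolate() decrements
 * b_lru_ref and only isolates the buffer once it reaches zero, so a count
 * of N lets a hot buffer survive roughly N shrinker passes.  The value
 * used below is just an example.
 */
static void
example_keep_buffer_cached_longer(
	struct xfs_buf		*bp)
{
	/* give frequently used metadata three trips through the LRU */
	xfs_buf_set_ref(bp, 3);
}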
8473fee3
BF
2210
2211/*
2212 * Verify an on-disk magic value against the magic value specified in the
2213 * verifier structure. The verifier magic is in disk byte order so the caller is
2214 * expected to pass the value directly from disk.
2215 */
2216bool
2217xfs_verify_magic(
2218 struct xfs_buf *bp,
15baadf7 2219 __be32 dmagic)
8473fee3 2220{
dbd329f1 2221 struct xfs_mount *mp = bp->b_mount;
8473fee3
BF
2222 int idx;
2223
2224 idx = xfs_sb_version_hascrc(&mp->m_sb);
14ed8688 2225 if (WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx]))
8473fee3
BF
2226 return false;
2227 return dmagic == bp->b_ops->magic[idx];
2228}
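
/*
 * Illustrative sketch (not part of this file): how a read verifier is
 * expected to use xfs_verify_magic().  The on-disk header, magic numbers
 * and ops table below are hypothetical; real examples live in the metadata
 * verifiers under fs/xfs/libxfs/.
 */
struct example_ondisk_hdr {
	__be32			eh_magic;
	/* ... rest of the on-disk header ... */
};

static void
example_read_verify(
	struct xfs_buf		*bp)
{
	struct example_ondisk_hdr *hdr = bp->b_addr;

	/* hdr->eh_magic is disk byte order, as xfs_verify_magic() expects */
	if (!xfs_verify_magic(bp, hdr->eh_magic))
		xfs_verifier_error(bp, -EFSCORRUPTED, __this_address);
}

static const struct xfs_buf_ops example_buf_ops = {
	.name		= "example",
	.magic		= { cpu_to_be32(0x12345678),	/* magic[0]: !hascrc */
			    cpu_to_be32(0x12345679) },	/* magic[1]: hascrc */
	.verify_read	= example_read_verify,
	.verify_write	= example_read_verify,		/* sketch only */
};
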
15baadf7
DW
2229/*
2230 * Verify an on-disk magic value against the magic value specified in the
2231 * verifier structure. The verifier magic is in disk byte order so the caller is
2232 * expected to pass the value directly from disk.
2233 */
2234bool
2235xfs_verify_magic16(
2236 struct xfs_buf *bp,
2237 __be16 dmagic)
2238{
dbd329f1 2239 struct xfs_mount *mp = bp->b_mount;
15baadf7
DW
2240 int idx;
2241
2242 idx = xfs_sb_version_hascrc(&mp->m_sb);
14ed8688 2243 if (WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx]))
15baadf7
DW
2244 return false;
2245 return dmagic == bp->b_ops->magic16[idx];
2246}