fs/xfs/xfs_buf.c - git blame (mirror_ubuntu-bionic-kernel.git, git.proxmox.com) at commit "xfs: call xfs_buf_delwri_queue directly"
1da177e4 1/*
f07c2250 2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
7b718769 3 * All Rights Reserved.
1da177e4 4 *
7b718769
NS
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
1da177e4
LT
7 * published by the Free Software Foundation.
8 *
7b718769
NS
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
1da177e4 13 *
7b718769
NS
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
1da177e4 17 */
93c189c1 18#include "xfs.h"
1da177e4
LT
19#include <linux/stddef.h>
20#include <linux/errno.h>
5a0e3ad6 21#include <linux/gfp.h>
1da177e4
LT
22#include <linux/pagemap.h>
23#include <linux/init.h>
24#include <linux/vmalloc.h>
25#include <linux/bio.h>
26#include <linux/sysctl.h>
27#include <linux/proc_fs.h>
28#include <linux/workqueue.h>
29#include <linux/percpu.h>
30#include <linux/blkdev.h>
31#include <linux/hash.h>
4df08c52 32#include <linux/kthread.h>
b20a3503 33#include <linux/migrate.h>
3fcfab16 34#include <linux/backing-dev.h>
7dfb7103 35#include <linux/freezer.h>
1da177e4 36
b7963133
CH
37#include "xfs_sb.h"
38#include "xfs_inum.h"
ed3b4d6c 39#include "xfs_log.h"
b7963133 40#include "xfs_ag.h"
b7963133 41#include "xfs_mount.h"
0b1b213f 42#include "xfs_trace.h"
b7963133 43
7989cb8e 44static kmem_zone_t *xfs_buf_zone;
a6867a68 45STATIC int xfsbufd(void *);
23ea4032 46
7989cb8e 47static struct workqueue_struct *xfslogd_workqueue;
0829c360 48struct workqueue_struct *xfsdatad_workqueue;
c626d174 49struct workqueue_struct *xfsconvertd_workqueue;
1da177e4 50
ce8e922c
NS
51#ifdef XFS_BUF_LOCK_TRACKING
52# define XB_SET_OWNER(bp) ((bp)->b_last_holder = current->pid)
53# define XB_CLEAR_OWNER(bp) ((bp)->b_last_holder = -1)
54# define XB_GET_OWNER(bp) ((bp)->b_last_holder)
1da177e4 55#else
ce8e922c
NS
56# define XB_SET_OWNER(bp) do { } while (0)
57# define XB_CLEAR_OWNER(bp) do { } while (0)
58# define XB_GET_OWNER(bp) do { } while (0)
1da177e4
LT
59#endif
60
ce8e922c
NS
61#define xb_to_gfp(flags) \
62 ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : \
63 ((flags) & XBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) | __GFP_NOWARN)
1da177e4 64
ce8e922c
NS
65#define xb_to_km(flags) \
66 (((flags) & XBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP)
1da177e4 67
ce8e922c
NS
68#define xfs_buf_allocate(flags) \
69 kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags))
70#define xfs_buf_deallocate(bp) \
71 kmem_zone_free(xfs_buf_zone, (bp));
1da177e4 72
73c77e2c
JB
73static inline int
74xfs_buf_is_vmapped(
75 struct xfs_buf *bp)
76{
77 /*
78 * Return true if the buffer is vmapped.
79 *
80 * The XBF_MAPPED flag is set if the buffer should be mapped, but the
81 * code is clever enough to know it doesn't have to map a single page,
82 * so the check has to be both for XBF_MAPPED and bp->b_page_count > 1.
83 */
84 return (bp->b_flags & XBF_MAPPED) && bp->b_page_count > 1;
85}
86
87static inline int
88xfs_buf_vmap_len(
89 struct xfs_buf *bp)
90{
91 return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
92}
93
1da177e4 94/*
430cbeb8
DC
95 * xfs_buf_lru_add - add a buffer to the LRU.
96 *
97 * The LRU takes a new reference to the buffer so that it will only be freed
98 * once the shrinker takes the buffer off the LRU.
99 */
100STATIC void
101xfs_buf_lru_add(
102 struct xfs_buf *bp)
103{
104 struct xfs_buftarg *btp = bp->b_target;
105
106 spin_lock(&btp->bt_lru_lock);
107 if (list_empty(&bp->b_lru)) {
108 atomic_inc(&bp->b_hold);
109 list_add_tail(&bp->b_lru, &btp->bt_lru);
110 btp->bt_lru_nr++;
111 }
112 spin_unlock(&btp->bt_lru_lock);
113}
114
115/*
116 * xfs_buf_lru_del - remove a buffer from the LRU
117 *
 118 * The unlocked check is safe here because it only occurs when there are no
 119 * b_lru_ref counts left on the buffer under the pag->pag_buf_lock. It is there
 120 * to optimise the shrinker removing the buffer from the LRU and calling
25985edc 121 * xfs_buf_free(), i.e. it removes an unnecessary round trip on the
430cbeb8 122 * bt_lru_lock.
1da177e4 123 */
430cbeb8
DC
124STATIC void
125xfs_buf_lru_del(
126 struct xfs_buf *bp)
127{
128 struct xfs_buftarg *btp = bp->b_target;
129
130 if (list_empty(&bp->b_lru))
131 return;
132
133 spin_lock(&btp->bt_lru_lock);
134 if (!list_empty(&bp->b_lru)) {
135 list_del_init(&bp->b_lru);
136 btp->bt_lru_nr--;
137 }
138 spin_unlock(&btp->bt_lru_lock);
139}
140
141/*
142 * When we mark a buffer stale, we remove the buffer from the LRU and clear the
143 * b_lru_ref count so that the buffer is freed immediately when the buffer
144 * reference count falls to zero. If the buffer is already on the LRU, we need
145 * to remove the reference that LRU holds on the buffer.
146 *
147 * This prevents build-up of stale buffers on the LRU.
148 */
149void
150xfs_buf_stale(
151 struct xfs_buf *bp)
152{
153 bp->b_flags |= XBF_STALE;
154 atomic_set(&(bp)->b_lru_ref, 0);
155 if (!list_empty(&bp->b_lru)) {
156 struct xfs_buftarg *btp = bp->b_target;
157
158 spin_lock(&btp->bt_lru_lock);
159 if (!list_empty(&bp->b_lru)) {
160 list_del_init(&bp->b_lru);
161 btp->bt_lru_nr--;
162 atomic_dec(&bp->b_hold);
163 }
164 spin_unlock(&btp->bt_lru_lock);
165 }
166 ASSERT(atomic_read(&bp->b_hold) >= 1);
167}
1da177e4
LT
168
169STATIC void
ce8e922c
NS
170_xfs_buf_initialize(
171 xfs_buf_t *bp,
1da177e4 172 xfs_buftarg_t *target,
204ab25f 173 xfs_off_t range_base,
1da177e4 174 size_t range_length,
ce8e922c 175 xfs_buf_flags_t flags)
1da177e4
LT
176{
177 /*
ce8e922c 178 * We don't want certain flags to appear in b_flags.
1da177e4 179 */
ce8e922c
NS
180 flags &= ~(XBF_LOCK|XBF_MAPPED|XBF_DONT_BLOCK|XBF_READ_AHEAD);
181
182 memset(bp, 0, sizeof(xfs_buf_t));
183 atomic_set(&bp->b_hold, 1);
430cbeb8 184 atomic_set(&bp->b_lru_ref, 1);
b4dd330b 185 init_completion(&bp->b_iowait);
430cbeb8 186 INIT_LIST_HEAD(&bp->b_lru);
ce8e922c 187 INIT_LIST_HEAD(&bp->b_list);
74f75a0c 188 RB_CLEAR_NODE(&bp->b_rbnode);
a731cd11 189 sema_init(&bp->b_sema, 0); /* held, no waiters */
ce8e922c
NS
190 XB_SET_OWNER(bp);
191 bp->b_target = target;
192 bp->b_file_offset = range_base;
1da177e4
LT
193 /*
194 * Set buffer_length and count_desired to the same value initially.
195 * I/O routines should use count_desired, which will be the same in
196 * most cases but may be reset (e.g. XFS recovery).
197 */
ce8e922c
NS
198 bp->b_buffer_length = bp->b_count_desired = range_length;
199 bp->b_flags = flags;
200 bp->b_bn = XFS_BUF_DADDR_NULL;
201 atomic_set(&bp->b_pin_count, 0);
202 init_waitqueue_head(&bp->b_waiters);
203
204 XFS_STATS_INC(xb_create);
0b1b213f
CH
205
206 trace_xfs_buf_init(bp, _RET_IP_);
1da177e4
LT
207}
208
209/*
ce8e922c
NS
210 * Allocate a page array capable of holding a specified number
211 * of pages, and point the page buf at it.
1da177e4
LT
212 */
213STATIC int
ce8e922c
NS
214_xfs_buf_get_pages(
215 xfs_buf_t *bp,
1da177e4 216 int page_count,
ce8e922c 217 xfs_buf_flags_t flags)
1da177e4
LT
218{
219 /* Make sure that we have a page list */
ce8e922c
NS
220 if (bp->b_pages == NULL) {
221 bp->b_offset = xfs_buf_poff(bp->b_file_offset);
222 bp->b_page_count = page_count;
223 if (page_count <= XB_PAGES) {
224 bp->b_pages = bp->b_page_array;
1da177e4 225 } else {
ce8e922c
NS
226 bp->b_pages = kmem_alloc(sizeof(struct page *) *
227 page_count, xb_to_km(flags));
228 if (bp->b_pages == NULL)
1da177e4
LT
229 return -ENOMEM;
230 }
ce8e922c 231 memset(bp->b_pages, 0, sizeof(struct page *) * page_count);
1da177e4
LT
232 }
233 return 0;
234}
235
236/*
ce8e922c 237 * Frees b_pages if it was allocated.
1da177e4
LT
238 */
239STATIC void
ce8e922c 240_xfs_buf_free_pages(
1da177e4
LT
241 xfs_buf_t *bp)
242{
ce8e922c 243 if (bp->b_pages != bp->b_page_array) {
f0e2d93c 244 kmem_free(bp->b_pages);
3fc98b1a 245 bp->b_pages = NULL;
1da177e4
LT
246 }
247}
248
249/*
250 * Releases the specified buffer.
251 *
252 * The modification state of any associated pages is left unchanged.
ce8e922c 253 * The buffer must not be on any hash - use xfs_buf_rele instead for
1da177e4
LT
254 * hashed and refcounted buffers
255 */
256void
ce8e922c 257xfs_buf_free(
1da177e4
LT
258 xfs_buf_t *bp)
259{
0b1b213f 260 trace_xfs_buf_free(bp, _RET_IP_);
1da177e4 261
430cbeb8
DC
262 ASSERT(list_empty(&bp->b_lru));
263
0e6e847f 264 if (bp->b_flags & _XBF_PAGES) {
1da177e4
LT
265 uint i;
266
73c77e2c 267 if (xfs_buf_is_vmapped(bp))
8a262e57
AE
268 vm_unmap_ram(bp->b_addr - bp->b_offset,
269 bp->b_page_count);
1da177e4 270
948ecdb4
NS
271 for (i = 0; i < bp->b_page_count; i++) {
272 struct page *page = bp->b_pages[i];
273
0e6e847f 274 __free_page(page);
948ecdb4 275 }
0e6e847f
DC
276 } else if (bp->b_flags & _XBF_KMEM)
277 kmem_free(bp->b_addr);
3fc98b1a 278 _xfs_buf_free_pages(bp);
ce8e922c 279 xfs_buf_deallocate(bp);
1da177e4
LT
280}
281
282/*
0e6e847f 283 * Allocates all the pages for the buffer in question and builds its page list.
1da177e4
LT
284 */
285STATIC int
0e6e847f 286xfs_buf_allocate_memory(
1da177e4
LT
287 xfs_buf_t *bp,
288 uint flags)
289{
ce8e922c 290 size_t size = bp->b_count_desired;
1da177e4 291 size_t nbytes, offset;
ce8e922c 292 gfp_t gfp_mask = xb_to_gfp(flags);
1da177e4 293 unsigned short page_count, i;
204ab25f 294 xfs_off_t end;
1da177e4
LT
295 int error;
296
0e6e847f
DC
297 /*
298 * for buffers that are contained within a single page, just allocate
299 * the memory from the heap - there's no need for the complexity of
300 * page arrays to keep allocation down to order 0.
301 */
302 if (bp->b_buffer_length < PAGE_SIZE) {
303 bp->b_addr = kmem_alloc(bp->b_buffer_length, xb_to_km(flags));
304 if (!bp->b_addr) {
305 /* low memory - use alloc_page loop instead */
306 goto use_alloc_page;
307 }
308
309 if (((unsigned long)(bp->b_addr + bp->b_buffer_length - 1) &
310 PAGE_MASK) !=
311 ((unsigned long)bp->b_addr & PAGE_MASK)) {
312 /* b_addr spans two pages - use alloc_page instead */
313 kmem_free(bp->b_addr);
314 bp->b_addr = NULL;
315 goto use_alloc_page;
316 }
317 bp->b_offset = offset_in_page(bp->b_addr);
318 bp->b_pages = bp->b_page_array;
319 bp->b_pages[0] = virt_to_page(bp->b_addr);
320 bp->b_page_count = 1;
321 bp->b_flags |= XBF_MAPPED | _XBF_KMEM;
322 return 0;
323 }
324
325use_alloc_page:
ce8e922c
NS
326 end = bp->b_file_offset + bp->b_buffer_length;
327 page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset);
ce8e922c 328 error = _xfs_buf_get_pages(bp, page_count, flags);
1da177e4
LT
329 if (unlikely(error))
330 return error;
1da177e4 331
ce8e922c 332 offset = bp->b_offset;
0e6e847f 333 bp->b_flags |= _XBF_PAGES;
1da177e4 334
ce8e922c 335 for (i = 0; i < bp->b_page_count; i++) {
1da177e4
LT
336 struct page *page;
337 uint retries = 0;
0e6e847f
DC
338retry:
339 page = alloc_page(gfp_mask);
1da177e4 340 if (unlikely(page == NULL)) {
ce8e922c
NS
341 if (flags & XBF_READ_AHEAD) {
342 bp->b_page_count = i;
0e6e847f
DC
343 error = ENOMEM;
344 goto out_free_pages;
1da177e4
LT
345 }
346
347 /*
348 * This could deadlock.
349 *
350 * But until all the XFS lowlevel code is revamped to
351 * handle buffer allocation failures we can't do much.
352 */
353 if (!(++retries % 100))
4f10700a
DC
354 xfs_err(NULL,
355 "possible memory allocation deadlock in %s (mode:0x%x)",
34a622b2 356 __func__, gfp_mask);
1da177e4 357
ce8e922c 358 XFS_STATS_INC(xb_page_retries);
8aa7e847 359 congestion_wait(BLK_RW_ASYNC, HZ/50);
1da177e4
LT
360 goto retry;
361 }
362
ce8e922c 363 XFS_STATS_INC(xb_page_found);
1da177e4 364
0e6e847f 365 nbytes = min_t(size_t, size, PAGE_SIZE - offset);
1da177e4 366 size -= nbytes;
ce8e922c 367 bp->b_pages[i] = page;
1da177e4
LT
368 offset = 0;
369 }
0e6e847f 370 return 0;
1da177e4 371
0e6e847f
DC
372out_free_pages:
373 for (i = 0; i < bp->b_page_count; i++)
374 __free_page(bp->b_pages[i]);
1da177e4
LT
375 return error;
376}
377
378/*
25985edc 379 * Map buffer into kernel address-space if necessary.
1da177e4
LT
380 */
381STATIC int
ce8e922c 382_xfs_buf_map_pages(
1da177e4
LT
383 xfs_buf_t *bp,
384 uint flags)
385{
0e6e847f 386 ASSERT(bp->b_flags & _XBF_PAGES);
ce8e922c 387 if (bp->b_page_count == 1) {
0e6e847f 388 /* A single page buffer is always mappable */
ce8e922c
NS
389 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
390 bp->b_flags |= XBF_MAPPED;
391 } else if (flags & XBF_MAPPED) {
a19fb380
DC
392 int retried = 0;
393
394 do {
395 bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
396 -1, PAGE_KERNEL);
397 if (bp->b_addr)
398 break;
399 vm_unmap_aliases();
400 } while (retried++ <= 1);
401
402 if (!bp->b_addr)
1da177e4 403 return -ENOMEM;
ce8e922c
NS
404 bp->b_addr += bp->b_offset;
405 bp->b_flags |= XBF_MAPPED;
1da177e4
LT
406 }
407
408 return 0;
409}
410
411/*
412 * Finding and Reading Buffers
413 */
414
415/*
ce8e922c 416 * Looks up, and creates if absent, a lockable buffer for
1da177e4
LT
417 * a given range of an inode. The buffer is returned
418 * locked. If other overlapping buffers exist, they are
419 * released before the new buffer is created and locked,
420 * which may imply that this call will block until those buffers
421 * are unlocked. No I/O is implied by this call.
422 */
423xfs_buf_t *
ce8e922c 424_xfs_buf_find(
1da177e4 425 xfs_buftarg_t *btp, /* block device target */
204ab25f 426 xfs_off_t ioff, /* starting offset of range */
1da177e4 427 size_t isize, /* length of range */
ce8e922c
NS
428 xfs_buf_flags_t flags,
429 xfs_buf_t *new_bp)
1da177e4 430{
204ab25f 431 xfs_off_t range_base;
1da177e4 432 size_t range_length;
74f75a0c
DC
433 struct xfs_perag *pag;
434 struct rb_node **rbp;
435 struct rb_node *parent;
436 xfs_buf_t *bp;
1da177e4
LT
437
438 range_base = (ioff << BBSHIFT);
439 range_length = (isize << BBSHIFT);
440
441 /* Check for IOs smaller than the sector size / not sector aligned */
ce8e922c 442 ASSERT(!(range_length < (1 << btp->bt_sshift)));
204ab25f 443 ASSERT(!(range_base & (xfs_off_t)btp->bt_smask));
1da177e4 444
74f75a0c
DC
445 /* get tree root */
446 pag = xfs_perag_get(btp->bt_mount,
447 xfs_daddr_to_agno(btp->bt_mount, ioff));
448
449 /* walk tree */
450 spin_lock(&pag->pag_buf_lock);
451 rbp = &pag->pag_buf_tree.rb_node;
452 parent = NULL;
453 bp = NULL;
454 while (*rbp) {
455 parent = *rbp;
456 bp = rb_entry(parent, struct xfs_buf, b_rbnode);
457
458 if (range_base < bp->b_file_offset)
459 rbp = &(*rbp)->rb_left;
460 else if (range_base > bp->b_file_offset)
461 rbp = &(*rbp)->rb_right;
462 else {
463 /*
464 * found a block offset match. If the range doesn't
465 * match, the only way this is allowed is if the buffer
466 * in the cache is stale and the transaction that made
467 * it stale has not yet committed. i.e. we are
468 * reallocating a busy extent. Skip this buffer and
469 * continue searching to the right for an exact match.
470 */
471 if (bp->b_buffer_length != range_length) {
472 ASSERT(bp->b_flags & XBF_STALE);
473 rbp = &(*rbp)->rb_right;
474 continue;
475 }
ce8e922c 476 atomic_inc(&bp->b_hold);
1da177e4
LT
477 goto found;
478 }
479 }
480
481 /* No match found */
ce8e922c
NS
482 if (new_bp) {
483 _xfs_buf_initialize(new_bp, btp, range_base,
1da177e4 484 range_length, flags);
74f75a0c
DC
485 rb_link_node(&new_bp->b_rbnode, parent, rbp);
486 rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
487 /* the buffer keeps the perag reference until it is freed */
488 new_bp->b_pag = pag;
489 spin_unlock(&pag->pag_buf_lock);
1da177e4 490 } else {
ce8e922c 491 XFS_STATS_INC(xb_miss_locked);
74f75a0c
DC
492 spin_unlock(&pag->pag_buf_lock);
493 xfs_perag_put(pag);
1da177e4 494 }
ce8e922c 495 return new_bp;
1da177e4
LT
496
497found:
74f75a0c
DC
498 spin_unlock(&pag->pag_buf_lock);
499 xfs_perag_put(pag);
1da177e4 500
0c842ad4
CH
501 if (!xfs_buf_trylock(bp)) {
502 if (flags & XBF_TRYLOCK) {
ce8e922c
NS
503 xfs_buf_rele(bp);
504 XFS_STATS_INC(xb_busy_locked);
505 return NULL;
1da177e4 506 }
0c842ad4
CH
507 xfs_buf_lock(bp);
508 XFS_STATS_INC(xb_get_locked_waited);
1da177e4
LT
509 }
510
0e6e847f
DC
511 /*
512 * if the buffer is stale, clear all the external state associated with
513 * it. We need to keep flags such as how we allocated the buffer memory
514 * intact here.
515 */
ce8e922c
NS
516 if (bp->b_flags & XBF_STALE) {
517 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
0e6e847f 518 bp->b_flags &= XBF_MAPPED | _XBF_KMEM | _XBF_PAGES;
2f926587 519 }
0b1b213f
CH
520
521 trace_xfs_buf_find(bp, flags, _RET_IP_);
ce8e922c
NS
522 XFS_STATS_INC(xb_get_locked);
523 return bp;
1da177e4
LT
524}
525
526/*
ce8e922c 527 * Assembles a buffer covering the specified range.
1da177e4
LT
528 * Storage in memory for all portions of the buffer will be allocated,
529 * although backing storage may not be.
530 */
531xfs_buf_t *
6ad112bf 532xfs_buf_get(
1da177e4 533 xfs_buftarg_t *target,/* target for buffer */
204ab25f 534 xfs_off_t ioff, /* starting offset of range */
1da177e4 535 size_t isize, /* length of range */
ce8e922c 536 xfs_buf_flags_t flags)
1da177e4 537{
ce8e922c 538 xfs_buf_t *bp, *new_bp;
0e6e847f 539 int error = 0;
1da177e4 540
ce8e922c
NS
541 new_bp = xfs_buf_allocate(flags);
542 if (unlikely(!new_bp))
1da177e4
LT
543 return NULL;
544
ce8e922c
NS
545 bp = _xfs_buf_find(target, ioff, isize, flags, new_bp);
546 if (bp == new_bp) {
0e6e847f 547 error = xfs_buf_allocate_memory(bp, flags);
1da177e4
LT
548 if (error)
549 goto no_buffer;
550 } else {
ce8e922c
NS
551 xfs_buf_deallocate(new_bp);
552 if (unlikely(bp == NULL))
1da177e4
LT
553 return NULL;
554 }
555
ce8e922c
NS
556 if (!(bp->b_flags & XBF_MAPPED)) {
557 error = _xfs_buf_map_pages(bp, flags);
1da177e4 558 if (unlikely(error)) {
4f10700a
DC
559 xfs_warn(target->bt_mount,
560 "%s: failed to map pages\n", __func__);
1da177e4
LT
561 goto no_buffer;
562 }
563 }
564
ce8e922c 565 XFS_STATS_INC(xb_get);
1da177e4
LT
566
567 /*
568 * Always fill in the block number now, the mapped cases can do
569 * their own overlay of this later.
570 */
ce8e922c
NS
571 bp->b_bn = ioff;
572 bp->b_count_desired = bp->b_buffer_length;
1da177e4 573
0b1b213f 574 trace_xfs_buf_get(bp, flags, _RET_IP_);
ce8e922c 575 return bp;
1da177e4
LT
576
577 no_buffer:
ce8e922c
NS
578 if (flags & (XBF_LOCK | XBF_TRYLOCK))
579 xfs_buf_unlock(bp);
580 xfs_buf_rele(bp);
1da177e4
LT
581 return NULL;
582}
583
5d765b97
CH
584STATIC int
585_xfs_buf_read(
586 xfs_buf_t *bp,
587 xfs_buf_flags_t flags)
588{
589 int status;
590
5d765b97
CH
591 ASSERT(!(flags & (XBF_DELWRI|XBF_WRITE)));
592 ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);
593
1d5ae5df
CH
594 bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_DELWRI | XBF_READ_AHEAD);
595 bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
5d765b97
CH
596
597 status = xfs_buf_iorequest(bp);
5a52c2a5 598 if (status || bp->b_error || (flags & XBF_ASYNC))
ec53d1db
DC
599 return status;
600 return xfs_buf_iowait(bp);
5d765b97
CH
601}
602
1da177e4 603xfs_buf_t *
6ad112bf 604xfs_buf_read(
1da177e4 605 xfs_buftarg_t *target,
204ab25f 606 xfs_off_t ioff,
1da177e4 607 size_t isize,
ce8e922c 608 xfs_buf_flags_t flags)
1da177e4 609{
ce8e922c
NS
610 xfs_buf_t *bp;
611
612 flags |= XBF_READ;
613
6ad112bf 614 bp = xfs_buf_get(target, ioff, isize, flags);
ce8e922c 615 if (bp) {
0b1b213f
CH
616 trace_xfs_buf_read(bp, flags, _RET_IP_);
617
ce8e922c 618 if (!XFS_BUF_ISDONE(bp)) {
ce8e922c 619 XFS_STATS_INC(xb_get_read);
5d765b97 620 _xfs_buf_read(bp, flags);
ce8e922c 621 } else if (flags & XBF_ASYNC) {
1da177e4
LT
622 /*
623 * Read ahead call which is already satisfied,
624 * drop the buffer
625 */
626 goto no_buffer;
627 } else {
1da177e4 628 /* We do not want read in the flags */
ce8e922c 629 bp->b_flags &= ~XBF_READ;
1da177e4
LT
630 }
631 }
632
ce8e922c 633 return bp;
1da177e4
LT
634
635 no_buffer:
ce8e922c
NS
636 if (flags & (XBF_LOCK | XBF_TRYLOCK))
637 xfs_buf_unlock(bp);
638 xfs_buf_rele(bp);
1da177e4
LT
639 return NULL;
640}
641
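/*
 * Minimal read-side usage sketch (a hypothetical helper, not part of this
 * file): look the range up through the cache, read it in if needed, and
 * return the locked buffer's error state.  ioff/isize are in basic blocks,
 * as for xfs_buf_get() above.
 */
STATIC int
xfs_buf_read_example(
	xfs_buftarg_t		*target,
	xfs_off_t		ioff,
	size_t			isize)
{
	xfs_buf_t		*bp;
	int			error;

	bp = xfs_buf_read(target, ioff, isize, XBF_LOCK | XBF_MAPPED);
	if (!bp)
		return ENOMEM;

	error = bp->b_error;		/* I/O errors are latched here */
	if (!error) {
		/* bp->b_addr points at the mapped data; use it here */
	}

	xfs_buf_relse(bp);		/* drop the lock and the reference */
	return error;
}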
1da177e4 642/*
ce8e922c
NS
643 * If we are not low on memory then do the readahead in a deadlock
644 * safe manner.
1da177e4
LT
645 */
646void
ce8e922c 647xfs_buf_readahead(
1da177e4 648 xfs_buftarg_t *target,
204ab25f 649 xfs_off_t ioff,
1a1a3e97 650 size_t isize)
1da177e4 651{
0e6e847f 652 if (bdi_read_congested(target->bt_bdi))
1da177e4
LT
653 return;
654
1a1a3e97
CH
655 xfs_buf_read(target, ioff, isize,
656 XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD|XBF_DONT_BLOCK);
1da177e4
LT
657}
658
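/*
 * Readahead sketch: hint that a range will be wanted shortly.  No buffer
 * is returned and the hint is silently dropped if the backing device is
 * congested, so the later xfs_buf_read() must still cope with a cache
 * miss.  Hypothetical caller; ioff/isize are in basic blocks.
 */
STATIC void
xfs_buf_readahead_example(
	xfs_buftarg_t		*target,
	xfs_off_t		ioff,
	size_t			isize)
{
	xfs_buf_readahead(target, ioff, isize);
	/* later: xfs_buf_read(target, ioff, isize, XBF_LOCK | XBF_MAPPED) */
}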
5adc94c2
DC
659/*
660 * Read an uncached buffer from disk. Allocates and returns a locked
661 * buffer containing the disk contents or nothing.
662 */
663struct xfs_buf *
664xfs_buf_read_uncached(
665 struct xfs_mount *mp,
666 struct xfs_buftarg *target,
667 xfs_daddr_t daddr,
668 size_t length,
669 int flags)
670{
671 xfs_buf_t *bp;
672 int error;
673
674 bp = xfs_buf_get_uncached(target, length, flags);
675 if (!bp)
676 return NULL;
677
678 /* set up the buffer for a read IO */
5adc94c2
DC
679 XFS_BUF_SET_ADDR(bp, daddr);
680 XFS_BUF_READ(bp);
5adc94c2
DC
681
682 xfsbdstrat(mp, bp);
1a1a3e97 683 error = xfs_buf_iowait(bp);
5adc94c2
DC
684 if (error || bp->b_error) {
685 xfs_buf_relse(bp);
686 return NULL;
687 }
688 return bp;
1da177e4
LT
689}
690
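/*
 * Uncached read sketch: pull a range straight from disk without inserting
 * it into the per-ag buffer cache, e.g. for one-off probes of the device.
 * The buffer comes back locked and held, or NULL on any failure.  "len" is
 * in bytes and daddr is a disk address in basic blocks; the helper itself
 * is hypothetical and shown only for illustration.
 */
STATIC int
xfs_buf_probe_example(
	struct xfs_mount	*mp,
	struct xfs_buftarg	*target,
	xfs_daddr_t		daddr,
	size_t			len)
{
	struct xfs_buf		*bp;

	bp = xfs_buf_read_uncached(mp, target, daddr, len, 0);
	if (!bp)
		return EIO;

	/* ... inspect bp->b_addr (uncached buffers are always mapped) ... */

	xfs_buf_relse(bp);
	return 0;
}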
691xfs_buf_t *
ce8e922c 692xfs_buf_get_empty(
1da177e4
LT
693 size_t len,
694 xfs_buftarg_t *target)
695{
ce8e922c 696 xfs_buf_t *bp;
1da177e4 697
ce8e922c
NS
698 bp = xfs_buf_allocate(0);
699 if (bp)
700 _xfs_buf_initialize(bp, target, 0, len, 0);
701 return bp;
1da177e4
LT
702}
703
44396476
DC
704/*
705 * Return a buffer allocated as an empty buffer and associated with external
706 * memory via xfs_buf_associate_memory() back to its empty state.
707 */
708void
709xfs_buf_set_empty(
710 struct xfs_buf *bp,
711 size_t len)
712{
713 if (bp->b_pages)
714 _xfs_buf_free_pages(bp);
715
716 bp->b_pages = NULL;
717 bp->b_page_count = 0;
718 bp->b_addr = NULL;
719 bp->b_file_offset = 0;
720 bp->b_buffer_length = bp->b_count_desired = len;
721 bp->b_bn = XFS_BUF_DADDR_NULL;
722 bp->b_flags &= ~XBF_MAPPED;
723}
724
1da177e4
LT
725static inline struct page *
726mem_to_page(
727 void *addr)
728{
9e2779fa 729 if ((!is_vmalloc_addr(addr))) {
1da177e4
LT
730 return virt_to_page(addr);
731 } else {
732 return vmalloc_to_page(addr);
733 }
734}
735
736int
ce8e922c
NS
737xfs_buf_associate_memory(
738 xfs_buf_t *bp,
1da177e4
LT
739 void *mem,
740 size_t len)
741{
742 int rval;
743 int i = 0;
d1afb678
LM
744 unsigned long pageaddr;
745 unsigned long offset;
746 size_t buflen;
1da177e4
LT
747 int page_count;
748
0e6e847f 749 pageaddr = (unsigned long)mem & PAGE_MASK;
d1afb678 750 offset = (unsigned long)mem - pageaddr;
0e6e847f
DC
751 buflen = PAGE_ALIGN(len + offset);
752 page_count = buflen >> PAGE_SHIFT;
1da177e4
LT
753
754 /* Free any previous set of page pointers */
ce8e922c
NS
755 if (bp->b_pages)
756 _xfs_buf_free_pages(bp);
1da177e4 757
ce8e922c
NS
758 bp->b_pages = NULL;
759 bp->b_addr = mem;
1da177e4 760
36fae17a 761 rval = _xfs_buf_get_pages(bp, page_count, XBF_DONT_BLOCK);
1da177e4
LT
762 if (rval)
763 return rval;
764
ce8e922c 765 bp->b_offset = offset;
d1afb678
LM
766
767 for (i = 0; i < bp->b_page_count; i++) {
768 bp->b_pages[i] = mem_to_page((void *)pageaddr);
0e6e847f 769 pageaddr += PAGE_SIZE;
1da177e4 770 }
1da177e4 771
d1afb678
LM
772 bp->b_count_desired = len;
773 bp->b_buffer_length = buflen;
ce8e922c 774 bp->b_flags |= XBF_MAPPED;
1da177e4
LT
775
776 return 0;
777}
778
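/*
 * Sketch of wrapping caller-owned memory in a buffer header by pairing
 * xfs_buf_get_empty() with xfs_buf_associate_memory(), roughly in the
 * style the log code uses for its own allocations.  Hypothetical helper;
 * the caller keeps ownership of "mem".
 */
STATIC xfs_buf_t *
xfs_buf_wrap_memory_example(
	xfs_buftarg_t		*target,
	void			*mem,
	size_t			len)
{
	xfs_buf_t		*bp;

	bp = xfs_buf_get_empty(len, target);
	if (!bp)
		return NULL;

	if (xfs_buf_associate_memory(bp, mem, len)) {
		xfs_buf_free(bp);	/* not hashed, so free it directly */
		return NULL;
	}
	return bp;			/* mapped: bp->b_addr == mem */
}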
779xfs_buf_t *
686865f7
DC
780xfs_buf_get_uncached(
781 struct xfs_buftarg *target,
1da177e4 782 size_t len,
686865f7 783 int flags)
1da177e4 784{
1fa40b01
CH
785 unsigned long page_count = PAGE_ALIGN(len) >> PAGE_SHIFT;
786 int error, i;
1da177e4 787 xfs_buf_t *bp;
1da177e4 788
ce8e922c 789 bp = xfs_buf_allocate(0);
1da177e4
LT
790 if (unlikely(bp == NULL))
791 goto fail;
ce8e922c 792 _xfs_buf_initialize(bp, target, 0, len, 0);
1da177e4 793
1fa40b01
CH
794 error = _xfs_buf_get_pages(bp, page_count, 0);
795 if (error)
1da177e4
LT
796 goto fail_free_buf;
797
1fa40b01 798 for (i = 0; i < page_count; i++) {
686865f7 799 bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
1fa40b01
CH
800 if (!bp->b_pages[i])
801 goto fail_free_mem;
1da177e4 802 }
1fa40b01 803 bp->b_flags |= _XBF_PAGES;
1da177e4 804
1fa40b01
CH
805 error = _xfs_buf_map_pages(bp, XBF_MAPPED);
806 if (unlikely(error)) {
4f10700a
DC
807 xfs_warn(target->bt_mount,
808 "%s: failed to map pages\n", __func__);
1da177e4 809 goto fail_free_mem;
1fa40b01 810 }
1da177e4 811
686865f7 812 trace_xfs_buf_get_uncached(bp, _RET_IP_);
1da177e4 813 return bp;
1fa40b01 814
1da177e4 815 fail_free_mem:
1fa40b01
CH
816 while (--i >= 0)
817 __free_page(bp->b_pages[i]);
ca165b88 818 _xfs_buf_free_pages(bp);
1da177e4 819 fail_free_buf:
ca165b88 820 xfs_buf_deallocate(bp);
1da177e4
LT
821 fail:
822 return NULL;
823}
824
825/*
1da177e4
LT
826 * Increment reference count on buffer, to hold the buffer concurrently
827 * with another thread which may release (free) the buffer asynchronously.
1da177e4
LT
828 * Must hold the buffer already to call this function.
829 */
830void
ce8e922c
NS
831xfs_buf_hold(
832 xfs_buf_t *bp)
1da177e4 833{
0b1b213f 834 trace_xfs_buf_hold(bp, _RET_IP_);
ce8e922c 835 atomic_inc(&bp->b_hold);
1da177e4
LT
836}
837
838/*
ce8e922c
NS
839 * Releases a hold on the specified buffer. If the
840 * hold count is 1, calls xfs_buf_free.
1da177e4
LT
841 */
842void
ce8e922c
NS
843xfs_buf_rele(
844 xfs_buf_t *bp)
1da177e4 845{
74f75a0c 846 struct xfs_perag *pag = bp->b_pag;
1da177e4 847
0b1b213f 848 trace_xfs_buf_rele(bp, _RET_IP_);
1da177e4 849
74f75a0c 850 if (!pag) {
430cbeb8 851 ASSERT(list_empty(&bp->b_lru));
74f75a0c 852 ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
fad3aa1e
NS
853 if (atomic_dec_and_test(&bp->b_hold))
854 xfs_buf_free(bp);
855 return;
856 }
857
74f75a0c 858 ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
430cbeb8 859
3790689f 860 ASSERT(atomic_read(&bp->b_hold) > 0);
74f75a0c 861 if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
bfc60177 862 if (!(bp->b_flags & XBF_STALE) &&
430cbeb8
DC
863 atomic_read(&bp->b_lru_ref)) {
864 xfs_buf_lru_add(bp);
865 spin_unlock(&pag->pag_buf_lock);
1da177e4 866 } else {
430cbeb8 867 xfs_buf_lru_del(bp);
ce8e922c 868 ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
74f75a0c
DC
869 rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
870 spin_unlock(&pag->pag_buf_lock);
871 xfs_perag_put(pag);
ce8e922c 872 xfs_buf_free(bp);
1da177e4
LT
873 }
874 }
875}
876
877
878/*
0e6e847f 879 * Lock a buffer object, if it is not already locked.
90810b9e
DC
880 *
881 * If we come across a stale, pinned, locked buffer, we know that we are
882 * being asked to lock a buffer that has been reallocated. Because it is
883 * pinned, we know that the log has not been pushed to disk and hence it
884 * will still be locked. Rather than continuing to have trylock attempts
885 * fail until someone else pushes the log, push it ourselves before
886 * returning. This means that the xfsaild will not get stuck trying
887 * to push on stale inode buffers.
1da177e4
LT
888 */
889int
0c842ad4
CH
890xfs_buf_trylock(
891 struct xfs_buf *bp)
1da177e4
LT
892{
893 int locked;
894
ce8e922c 895 locked = down_trylock(&bp->b_sema) == 0;
0b1b213f 896 if (locked)
ce8e922c 897 XB_SET_OWNER(bp);
90810b9e
DC
898 else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
899 xfs_log_force(bp->b_target->bt_mount, 0);
0b1b213f 900
0c842ad4
CH
901 trace_xfs_buf_trylock(bp, _RET_IP_);
902 return locked;
1da177e4 903}
1da177e4
LT
904
905/*
0e6e847f 906 * Lock a buffer object.
ed3b4d6c
DC
907 *
908 * If we come across a stale, pinned, locked buffer, we know that we
909 * are being asked to lock a buffer that has been reallocated. Because
910 * it is pinned, we know that the log has not been pushed to disk and
911 * hence it will still be locked. Rather than sleeping until someone
912 * else pushes the log, push it ourselves before trying to get the lock.
1da177e4 913 */
ce8e922c
NS
914void
915xfs_buf_lock(
0c842ad4 916 struct xfs_buf *bp)
1da177e4 917{
0b1b213f
CH
918 trace_xfs_buf_lock(bp, _RET_IP_);
919
ed3b4d6c 920 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
ebad861b 921 xfs_log_force(bp->b_target->bt_mount, 0);
ce8e922c
NS
922 down(&bp->b_sema);
923 XB_SET_OWNER(bp);
0b1b213f
CH
924
925 trace_xfs_buf_lock_done(bp, _RET_IP_);
1da177e4
LT
926}
927
928/*
ce8e922c 929 * Releases the lock on the buffer object.
2f926587 930 * If the buffer is marked delwri but is not queued, do so before we
ce8e922c 931 * unlock the buffer as we need to set flags correctly. We also need to
2f926587
DC
932 * take a reference for the delwri queue because the unlocker is going to
933 * drop theirs and they don't know we just queued it.
1da177e4
LT
934 */
935void
ce8e922c 936xfs_buf_unlock(
0c842ad4 937 struct xfs_buf *bp)
1da177e4 938{
ce8e922c
NS
939 XB_CLEAR_OWNER(bp);
940 up(&bp->b_sema);
0b1b213f
CH
941
942 trace_xfs_buf_unlock(bp, _RET_IP_);
1da177e4
LT
943}
944
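/*
 * Locking sketch: scanning paths (like the delwri split further down) use
 * xfs_buf_trylock() so they can skip contended buffers, while paths that
 * must have the buffer fall back to the blocking xfs_buf_lock(), which may
 * push the log for stale pinned buffers.  Illustrative only.
 */
STATIC void
xfs_buf_lock_example(
	struct xfs_buf		*bp)
{
	if (!xfs_buf_trylock(bp))
		xfs_buf_lock(bp);	/* sleep until we own b_sema */

	/* ... the buffer is exclusively owned here ... */

	xfs_buf_unlock(bp);
}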
ce8e922c
NS
945STATIC void
946xfs_buf_wait_unpin(
947 xfs_buf_t *bp)
1da177e4
LT
948{
949 DECLARE_WAITQUEUE (wait, current);
950
ce8e922c 951 if (atomic_read(&bp->b_pin_count) == 0)
1da177e4
LT
952 return;
953
ce8e922c 954 add_wait_queue(&bp->b_waiters, &wait);
1da177e4
LT
955 for (;;) {
956 set_current_state(TASK_UNINTERRUPTIBLE);
ce8e922c 957 if (atomic_read(&bp->b_pin_count) == 0)
1da177e4 958 break;
7eaceacc 959 io_schedule();
1da177e4 960 }
ce8e922c 961 remove_wait_queue(&bp->b_waiters, &wait);
1da177e4
LT
962 set_current_state(TASK_RUNNING);
963}
964
965/*
966 * Buffer Utility Routines
967 */
968
1da177e4 969STATIC void
ce8e922c 970xfs_buf_iodone_work(
c4028958 971 struct work_struct *work)
1da177e4 972{
c4028958
DH
973 xfs_buf_t *bp =
974 container_of(work, xfs_buf_t, b_iodone_work);
1da177e4 975
80f6c29d 976 if (bp->b_iodone)
ce8e922c
NS
977 (*(bp->b_iodone))(bp);
978 else if (bp->b_flags & XBF_ASYNC)
1da177e4
LT
979 xfs_buf_relse(bp);
980}
981
982void
ce8e922c
NS
983xfs_buf_ioend(
984 xfs_buf_t *bp,
1da177e4
LT
985 int schedule)
986{
0b1b213f
CH
987 trace_xfs_buf_iodone(bp, _RET_IP_);
988
77be55a5 989 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
ce8e922c
NS
990 if (bp->b_error == 0)
991 bp->b_flags |= XBF_DONE;
1da177e4 992
ce8e922c 993 if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) {
1da177e4 994 if (schedule) {
c4028958 995 INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work);
ce8e922c 996 queue_work(xfslogd_workqueue, &bp->b_iodone_work);
1da177e4 997 } else {
c4028958 998 xfs_buf_iodone_work(&bp->b_iodone_work);
1da177e4
LT
999 }
1000 } else {
b4dd330b 1001 complete(&bp->b_iowait);
1da177e4
LT
1002 }
1003}
1004
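/*
 * Asynchronous completion sketch: instead of blocking in xfs_buf_iowait(),
 * a caller can point b_iodone at a callback before issuing async I/O, and
 * xfs_buf_ioend() above will invoke it (via the xfslogd workqueue when
 * scheduled).  Both functions below are hypothetical and trimmed to the
 * minimum needed to show the hand-off.
 */
STATIC void
xfs_buf_example_iodone(
	struct xfs_buf		*bp)
{
	/* bp->b_error holds any latched I/O error at this point */
	bp->b_iodone = NULL;
	xfs_buf_relse(bp);		/* the callback owns the final release */
}

STATIC void
xfs_buf_example_write_async(
	struct xfs_buf		*bp)
{
	bp->b_iodone = xfs_buf_example_iodone;
	bp->b_flags &= ~XBF_READ;
	bp->b_flags |= XBF_WRITE | XBF_ASYNC;
	xfs_buf_delwri_dequeue(bp);	/* as xfs_bwrite() does */
	xfs_bdstrat_cb(bp);		/* completion lands in b_iodone */
}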
1da177e4 1005void
ce8e922c
NS
1006xfs_buf_ioerror(
1007 xfs_buf_t *bp,
1008 int error)
1da177e4
LT
1009{
1010 ASSERT(error >= 0 && error <= 0xffff);
ce8e922c 1011 bp->b_error = (unsigned short)error;
0b1b213f 1012 trace_xfs_buf_ioerror(bp, error, _RET_IP_);
1da177e4
LT
1013}
1014
1da177e4 1015int
64e0bc7d
CH
1016xfs_bwrite(
1017 struct xfs_mount *mp,
5d765b97 1018 struct xfs_buf *bp)
1da177e4 1019{
8c38366f 1020 int error;
1da177e4 1021
64e0bc7d 1022 bp->b_flags |= XBF_WRITE;
8c38366f 1023 bp->b_flags &= ~(XBF_ASYNC | XBF_READ);
1da177e4 1024
5d765b97 1025 xfs_buf_delwri_dequeue(bp);
939d723b 1026 xfs_bdstrat_cb(bp);
1da177e4 1027
8c38366f
CH
1028 error = xfs_buf_iowait(bp);
1029 if (error)
1030 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1031 xfs_buf_relse(bp);
64e0bc7d 1032 return error;
5d765b97 1033}
1da177e4 1034
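/*
 * Synchronous read-modify-write sketch built from the primitives above:
 * read the range, change it in memory, then let xfs_bwrite() dequeue any
 * delwri state, issue the write, wait for it and release the buffer.
 * Hypothetical helper; blkno/numblks are in basic blocks and the zeroing
 * is only a stand-in for a real modification.
 */
STATIC int
xfs_buf_rmw_example(
	struct xfs_mount	*mp,
	xfs_buftarg_t		*target,
	xfs_off_t		blkno,
	size_t			numblks)
{
	xfs_buf_t		*bp;
	int			error;

	bp = xfs_buf_read(target, blkno, numblks, XBF_LOCK | XBF_MAPPED);
	if (!bp)
		return ENOMEM;
	if (bp->b_error) {
		error = bp->b_error;
		xfs_buf_relse(bp);
		return error;
	}

	memset(bp->b_addr, 0, BBTOB(numblks));	/* stand-in modification */

	return xfs_bwrite(mp, bp);	/* waits for I/O and releases bp */
}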
4e23471a
CH
1035/*
1036 * Called when we want to stop a buffer from getting written or read.
1a1a3e97 1037 * We attach the EIO error, muck with its flags, and call xfs_buf_ioend
4e23471a
CH
1038 * so that the proper iodone callbacks get called.
1039 */
1040STATIC int
1041xfs_bioerror(
1042 xfs_buf_t *bp)
1043{
1044#ifdef XFSERRORDEBUG
1045 ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
1046#endif
1047
1048 /*
1049 * No need to wait until the buffer is unpinned, we aren't flushing it.
1050 */
5a52c2a5 1051 xfs_buf_ioerror(bp, EIO);
4e23471a
CH
1052
1053 /*
1a1a3e97 1054 * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
4e23471a
CH
1055 */
1056 XFS_BUF_UNREAD(bp);
61551f1e 1057 xfs_buf_delwri_dequeue(bp);
4e23471a
CH
1058 XFS_BUF_UNDONE(bp);
1059 XFS_BUF_STALE(bp);
1060
1a1a3e97 1061 xfs_buf_ioend(bp, 0);
4e23471a
CH
1062
1063 return EIO;
1064}
1065
1066/*
1067 * Same as xfs_bioerror, except that we are releasing the buffer
1a1a3e97 1068 * here ourselves, and avoiding the xfs_buf_ioend call.
4e23471a
CH
1069 * This is meant for userdata errors; metadata bufs come with
1070 * iodone functions attached, so that we can track down errors.
1071 */
1072STATIC int
1073xfs_bioerror_relse(
1074 struct xfs_buf *bp)
1075{
ed43233b 1076 int64_t fl = bp->b_flags;
4e23471a
CH
1077 /*
1078 * No need to wait until the buffer is unpinned.
1079 * We aren't flushing it.
1080 *
1081 * chunkhold expects B_DONE to be set, whether
1082 * we actually finish the I/O or not. We don't want to
1083 * change that interface.
1084 */
1085 XFS_BUF_UNREAD(bp);
61551f1e 1086 xfs_buf_delwri_dequeue(bp);
4e23471a
CH
1087 XFS_BUF_DONE(bp);
1088 XFS_BUF_STALE(bp);
cb669ca5 1089 bp->b_iodone = NULL;
0cadda1c 1090 if (!(fl & XBF_ASYNC)) {
4e23471a
CH
1091 /*
1092 * Mark b_error and B_ERROR _both_.
1093 * Lots of chunkcache code assumes that.
1094 * There's no reason to mark error for
1095 * ASYNC buffers.
1096 */
5a52c2a5 1097 xfs_buf_ioerror(bp, EIO);
4e23471a
CH
1098 XFS_BUF_FINISH_IOWAIT(bp);
1099 } else {
1100 xfs_buf_relse(bp);
1101 }
1102
1103 return EIO;
1104}
1105
1106
1107/*
1108 * All xfs metadata buffers except log state machine buffers
1109 * get this attached as their b_bdstrat callback function.
1110 * This is so that we can catch a buffer
1111 * after prematurely unpinning it to forcibly shutdown the filesystem.
1112 */
1113int
1114xfs_bdstrat_cb(
1115 struct xfs_buf *bp)
1116{
ebad861b 1117 if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
4e23471a
CH
1118 trace_xfs_bdstrat_shut(bp, _RET_IP_);
1119 /*
1120 * Metadata write that didn't get logged but
1121 * written delayed anyway. These aren't associated
1122 * with a transaction, and can be ignored.
1123 */
1124 if (!bp->b_iodone && !XFS_BUF_ISREAD(bp))
1125 return xfs_bioerror_relse(bp);
1126 else
1127 return xfs_bioerror(bp);
1128 }
1129
1130 xfs_buf_iorequest(bp);
1131 return 0;
1132}
1133
1134/*
1135 * Wrapper around bdstrat so that we can stop data from going to disk in case
1136 * we are shutting down the filesystem. Typically user data goes through this
1137 * path; one of the exceptions is the superblock.
1138 */
1139void
1140xfsbdstrat(
1141 struct xfs_mount *mp,
1142 struct xfs_buf *bp)
1143{
1144 if (XFS_FORCED_SHUTDOWN(mp)) {
1145 trace_xfs_bdstrat_shut(bp, _RET_IP_);
1146 xfs_bioerror_relse(bp);
1147 return;
1148 }
1149
1150 xfs_buf_iorequest(bp);
1151}
1152
b8f82a4a 1153STATIC void
ce8e922c
NS
1154_xfs_buf_ioend(
1155 xfs_buf_t *bp,
1da177e4
LT
1156 int schedule)
1157{
0e6e847f 1158 if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
ce8e922c 1159 xfs_buf_ioend(bp, schedule);
1da177e4
LT
1160}
1161
782e3b3b 1162STATIC void
ce8e922c 1163xfs_buf_bio_end_io(
1da177e4 1164 struct bio *bio,
1da177e4
LT
1165 int error)
1166{
ce8e922c 1167 xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private;
1da177e4 1168
cfbe5267 1169 xfs_buf_ioerror(bp, -error);
1da177e4 1170
73c77e2c
JB
1171 if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
1172 invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
1173
ce8e922c 1174 _xfs_buf_ioend(bp, 1);
1da177e4 1175 bio_put(bio);
1da177e4
LT
1176}
1177
1178STATIC void
ce8e922c
NS
1179_xfs_buf_ioapply(
1180 xfs_buf_t *bp)
1da177e4 1181{
a9759f2d 1182 int rw, map_i, total_nr_pages, nr_pages;
1da177e4 1183 struct bio *bio;
ce8e922c
NS
1184 int offset = bp->b_offset;
1185 int size = bp->b_count_desired;
1186 sector_t sector = bp->b_bn;
1da177e4 1187
ce8e922c 1188 total_nr_pages = bp->b_page_count;
1da177e4
LT
1189 map_i = 0;
1190
1d5ae5df
CH
1191 if (bp->b_flags & XBF_WRITE) {
1192 if (bp->b_flags & XBF_SYNCIO)
1193 rw = WRITE_SYNC;
1194 else
1195 rw = WRITE;
1196 if (bp->b_flags & XBF_FUA)
1197 rw |= REQ_FUA;
1198 if (bp->b_flags & XBF_FLUSH)
1199 rw |= REQ_FLUSH;
1200 } else if (bp->b_flags & XBF_READ_AHEAD) {
1201 rw = READA;
51bdd706 1202 } else {
1d5ae5df 1203 rw = READ;
f538d4da
CH
1204 }
1205
34951f5c
CH
1206 /* we only use the buffer cache for meta-data */
1207 rw |= REQ_META;
1208
1da177e4 1209next_chunk:
ce8e922c 1210 atomic_inc(&bp->b_io_remaining);
1da177e4
LT
1211 nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT);
1212 if (nr_pages > total_nr_pages)
1213 nr_pages = total_nr_pages;
1214
1215 bio = bio_alloc(GFP_NOIO, nr_pages);
ce8e922c 1216 bio->bi_bdev = bp->b_target->bt_bdev;
1da177e4 1217 bio->bi_sector = sector;
ce8e922c
NS
1218 bio->bi_end_io = xfs_buf_bio_end_io;
1219 bio->bi_private = bp;
1da177e4 1220
0e6e847f 1221
1da177e4 1222 for (; size && nr_pages; nr_pages--, map_i++) {
0e6e847f 1223 int rbytes, nbytes = PAGE_SIZE - offset;
1da177e4
LT
1224
1225 if (nbytes > size)
1226 nbytes = size;
1227
ce8e922c
NS
1228 rbytes = bio_add_page(bio, bp->b_pages[map_i], nbytes, offset);
1229 if (rbytes < nbytes)
1da177e4
LT
1230 break;
1231
1232 offset = 0;
1233 sector += nbytes >> BBSHIFT;
1234 size -= nbytes;
1235 total_nr_pages--;
1236 }
1237
1da177e4 1238 if (likely(bio->bi_size)) {
73c77e2c
JB
1239 if (xfs_buf_is_vmapped(bp)) {
1240 flush_kernel_vmap_range(bp->b_addr,
1241 xfs_buf_vmap_len(bp));
1242 }
1da177e4
LT
1243 submit_bio(rw, bio);
1244 if (size)
1245 goto next_chunk;
1246 } else {
ce8e922c 1247 xfs_buf_ioerror(bp, EIO);
ec53d1db 1248 bio_put(bio);
1da177e4
LT
1249 }
1250}
1251
1da177e4 1252int
ce8e922c
NS
1253xfs_buf_iorequest(
1254 xfs_buf_t *bp)
1da177e4 1255{
0b1b213f 1256 trace_xfs_buf_iorequest(bp, _RET_IP_);
1da177e4 1257
375ec69d 1258 ASSERT(!(bp->b_flags & XBF_DELWRI));
1da177e4 1259
375ec69d 1260 if (bp->b_flags & XBF_WRITE)
ce8e922c 1261 xfs_buf_wait_unpin(bp);
ce8e922c 1262 xfs_buf_hold(bp);
1da177e4
LT
1263
1264 /* Set the count to 1 initially; this will stop an I/O
1265 * completion callout which happens before we have started
ce8e922c 1266 * all the I/O from calling xfs_buf_ioend too early.
1da177e4 1267 */
ce8e922c
NS
1268 atomic_set(&bp->b_io_remaining, 1);
1269 _xfs_buf_ioapply(bp);
1270 _xfs_buf_ioend(bp, 0);
1da177e4 1271
ce8e922c 1272 xfs_buf_rele(bp);
1da177e4
LT
1273 return 0;
1274}
1275
1276/*
ce8e922c
NS
1277 * Waits for I/O to complete on the buffer supplied.
1278 * It returns immediately if no I/O is pending.
1279 * It returns the I/O error code, if any, or 0 if there was no error.
1da177e4
LT
1280 */
1281int
ce8e922c
NS
1282xfs_buf_iowait(
1283 xfs_buf_t *bp)
1da177e4 1284{
0b1b213f
CH
1285 trace_xfs_buf_iowait(bp, _RET_IP_);
1286
b4dd330b 1287 wait_for_completion(&bp->b_iowait);
0b1b213f
CH
1288
1289 trace_xfs_buf_iowait_done(bp, _RET_IP_);
ce8e922c 1290 return bp->b_error;
1da177e4
LT
1291}
1292
ce8e922c
NS
1293xfs_caddr_t
1294xfs_buf_offset(
1295 xfs_buf_t *bp,
1da177e4
LT
1296 size_t offset)
1297{
1298 struct page *page;
1299
ce8e922c 1300 if (bp->b_flags & XBF_MAPPED)
62926044 1301 return bp->b_addr + offset;
1da177e4 1302
ce8e922c 1303 offset += bp->b_offset;
0e6e847f
DC
1304 page = bp->b_pages[offset >> PAGE_SHIFT];
1305 return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1));
1da177e4
LT
1306}
1307
1308/*
1da177e4
LT
1309 * Move data into or out of a buffer.
1310 */
1311void
ce8e922c
NS
1312xfs_buf_iomove(
1313 xfs_buf_t *bp, /* buffer to process */
1da177e4
LT
1314 size_t boff, /* starting buffer offset */
1315 size_t bsize, /* length to copy */
b9c48649 1316 void *data, /* data address */
ce8e922c 1317 xfs_buf_rw_t mode) /* read/write/zero flag */
1da177e4
LT
1318{
1319 size_t bend, cpoff, csize;
1320 struct page *page;
1321
1322 bend = boff + bsize;
1323 while (boff < bend) {
ce8e922c
NS
1324 page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)];
1325 cpoff = xfs_buf_poff(boff + bp->b_offset);
1da177e4 1326 csize = min_t(size_t,
0e6e847f 1327 PAGE_SIZE-cpoff, bp->b_count_desired-boff);
1da177e4 1328
0e6e847f 1329 ASSERT(((csize + cpoff) <= PAGE_SIZE));
1da177e4
LT
1330
1331 switch (mode) {
ce8e922c 1332 case XBRW_ZERO:
1da177e4
LT
1333 memset(page_address(page) + cpoff, 0, csize);
1334 break;
ce8e922c 1335 case XBRW_READ:
1da177e4
LT
1336 memcpy(data, page_address(page) + cpoff, csize);
1337 break;
ce8e922c 1338 case XBRW_WRITE:
1da177e4
LT
1339 memcpy(page_address(page) + cpoff, data, csize);
1340 }
1341
1342 boff += csize;
1343 data += csize;
1344 }
1345}
1346
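/*
 * xfs_buf_iomove() usage sketch: zero a region and copy caller data into a
 * buffer without assuming the pages are virtually contiguous.  The offsets
 * and the 512-byte zeroed header are made-up values; the caller must keep
 * them within b_count_desired.
 */
STATIC void
xfs_buf_iomove_example(
	xfs_buf_t		*bp,
	void			*src,
	size_t			len)
{
	/* zero the first 512 bytes of the buffer */
	xfs_buf_iomove(bp, 0, 512, NULL, XBRW_ZERO);

	/* then copy len bytes of caller data in right after it */
	xfs_buf_iomove(bp, 512, len, src, XBRW_WRITE);
}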
1347/*
ce8e922c 1348 * Handling of buffer targets (buftargs).
1da177e4
LT
1349 */
1350
1351/*
430cbeb8
DC
1352 * Wait for any bufs with callbacks that have been submitted but have not yet
1353 * returned. These buffers will have an elevated hold count, so wait on those
1354 * while freeing all the buffers only held by the LRU.
1da177e4
LT
1355 */
1356void
1357xfs_wait_buftarg(
74f75a0c 1358 struct xfs_buftarg *btp)
1da177e4 1359{
430cbeb8
DC
1360 struct xfs_buf *bp;
1361
1362restart:
1363 spin_lock(&btp->bt_lru_lock);
1364 while (!list_empty(&btp->bt_lru)) {
1365 bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
1366 if (atomic_read(&bp->b_hold) > 1) {
1367 spin_unlock(&btp->bt_lru_lock);
26af6552 1368 delay(100);
430cbeb8 1369 goto restart;
1da177e4 1370 }
430cbeb8
DC
1371 /*
1372 * clear the LRU reference count so the buffer doesn't get
1373 * ignored in xfs_buf_rele().
1374 */
1375 atomic_set(&bp->b_lru_ref, 0);
1376 spin_unlock(&btp->bt_lru_lock);
1377 xfs_buf_rele(bp);
1378 spin_lock(&btp->bt_lru_lock);
1da177e4 1379 }
430cbeb8 1380 spin_unlock(&btp->bt_lru_lock);
1da177e4
LT
1381}
1382
ff57ab21
DC
1383int
1384xfs_buftarg_shrink(
1385 struct shrinker *shrink,
1495f230 1386 struct shrink_control *sc)
a6867a68 1387{
ff57ab21
DC
1388 struct xfs_buftarg *btp = container_of(shrink,
1389 struct xfs_buftarg, bt_shrinker);
430cbeb8 1390 struct xfs_buf *bp;
1495f230 1391 int nr_to_scan = sc->nr_to_scan;
430cbeb8
DC
1392 LIST_HEAD(dispose);
1393
1394 if (!nr_to_scan)
1395 return btp->bt_lru_nr;
1396
1397 spin_lock(&btp->bt_lru_lock);
1398 while (!list_empty(&btp->bt_lru)) {
1399 if (nr_to_scan-- <= 0)
1400 break;
1401
1402 bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
1403
1404 /*
1405 * Decrement the b_lru_ref count unless the value is already
1406 * zero. If the value is already zero, we need to reclaim the
1407 * buffer, otherwise it gets another trip through the LRU.
1408 */
1409 if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
1410 list_move_tail(&bp->b_lru, &btp->bt_lru);
1411 continue;
1412 }
1413
1414 /*
1415 * remove the buffer from the LRU now to avoid needing another
1416 * lock round trip inside xfs_buf_rele().
1417 */
1418 list_move(&bp->b_lru, &dispose);
1419 btp->bt_lru_nr--;
ff57ab21 1420 }
430cbeb8
DC
1421 spin_unlock(&btp->bt_lru_lock);
1422
1423 while (!list_empty(&dispose)) {
1424 bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
1425 list_del_init(&bp->b_lru);
1426 xfs_buf_rele(bp);
1427 }
1428
1429 return btp->bt_lru_nr;
a6867a68
DC
1430}
1431
1da177e4
LT
1432void
1433xfs_free_buftarg(
b7963133
CH
1434 struct xfs_mount *mp,
1435 struct xfs_buftarg *btp)
1da177e4 1436{
ff57ab21
DC
1437 unregister_shrinker(&btp->bt_shrinker);
1438
1da177e4 1439 xfs_flush_buftarg(btp, 1);
b7963133
CH
1440 if (mp->m_flags & XFS_MOUNT_BARRIER)
1441 xfs_blkdev_issue_flush(btp);
a6867a68 1442
a6867a68 1443 kthread_stop(btp->bt_task);
f0e2d93c 1444 kmem_free(btp);
1da177e4
LT
1445}
1446
1da177e4
LT
1447STATIC int
1448xfs_setsize_buftarg_flags(
1449 xfs_buftarg_t *btp,
1450 unsigned int blocksize,
1451 unsigned int sectorsize,
1452 int verbose)
1453{
ce8e922c
NS
1454 btp->bt_bsize = blocksize;
1455 btp->bt_sshift = ffs(sectorsize) - 1;
1456 btp->bt_smask = sectorsize - 1;
1da177e4 1457
ce8e922c 1458 if (set_blocksize(btp->bt_bdev, sectorsize)) {
4f10700a
DC
1459 xfs_warn(btp->bt_mount,
1460 "Cannot set_blocksize to %u on device %s\n",
c35a549c 1461 sectorsize, xfs_buf_target_name(btp));
1da177e4
LT
1462 return EINVAL;
1463 }
1464
1da177e4
LT
1465 return 0;
1466}
1467
1468/*
ce8e922c
NS
1469 * When allocating the initial buffer target we have not yet
1470 * read in the superblock, so we don't know what size sectors
1471 * are being used at this early stage. Play safe.
1472 */
1da177e4
LT
1473STATIC int
1474xfs_setsize_buftarg_early(
1475 xfs_buftarg_t *btp,
1476 struct block_device *bdev)
1477{
1478 return xfs_setsize_buftarg_flags(btp,
0e6e847f 1479 PAGE_SIZE, bdev_logical_block_size(bdev), 0);
1da177e4
LT
1480}
1481
1482int
1483xfs_setsize_buftarg(
1484 xfs_buftarg_t *btp,
1485 unsigned int blocksize,
1486 unsigned int sectorsize)
1487{
1488 return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1);
1489}
1490
a6867a68
DC
1491STATIC int
1492xfs_alloc_delwrite_queue(
e2a07812
JE
1493 xfs_buftarg_t *btp,
1494 const char *fsname)
a6867a68 1495{
a6867a68 1496 INIT_LIST_HEAD(&btp->bt_delwrite_queue);
007c61c6 1497 spin_lock_init(&btp->bt_delwrite_lock);
a6867a68 1498 btp->bt_flags = 0;
e2a07812 1499 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname);
ff57ab21
DC
1500 if (IS_ERR(btp->bt_task))
1501 return PTR_ERR(btp->bt_task);
1502 return 0;
a6867a68
DC
1503}
1504
1da177e4
LT
1505xfs_buftarg_t *
1506xfs_alloc_buftarg(
ebad861b 1507 struct xfs_mount *mp,
1da177e4 1508 struct block_device *bdev,
e2a07812
JE
1509 int external,
1510 const char *fsname)
1da177e4
LT
1511{
1512 xfs_buftarg_t *btp;
1513
1514 btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);
1515
ebad861b 1516 btp->bt_mount = mp;
ce8e922c
NS
1517 btp->bt_dev = bdev->bd_dev;
1518 btp->bt_bdev = bdev;
0e6e847f
DC
1519 btp->bt_bdi = blk_get_backing_dev_info(bdev);
1520 if (!btp->bt_bdi)
1521 goto error;
1522
430cbeb8
DC
1523 INIT_LIST_HEAD(&btp->bt_lru);
1524 spin_lock_init(&btp->bt_lru_lock);
1da177e4
LT
1525 if (xfs_setsize_buftarg_early(btp, bdev))
1526 goto error;
e2a07812 1527 if (xfs_alloc_delwrite_queue(btp, fsname))
a6867a68 1528 goto error;
ff57ab21
DC
1529 btp->bt_shrinker.shrink = xfs_buftarg_shrink;
1530 btp->bt_shrinker.seeks = DEFAULT_SEEKS;
1531 register_shrinker(&btp->bt_shrinker);
1da177e4
LT
1532 return btp;
1533
1534error:
f0e2d93c 1535 kmem_free(btp);
1da177e4
LT
1536 return NULL;
1537}
1538
1539
1540/*
ce8e922c 1541 * Delayed write buffer handling
1da177e4 1542 */
61551f1e 1543void
ce8e922c 1544xfs_buf_delwri_queue(
527cfdf1 1545 xfs_buf_t *bp)
1da177e4 1546{
ce8e922c
NS
1547 struct list_head *dwq = &bp->b_target->bt_delwrite_queue;
1548 spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock;
a6867a68 1549
0b1b213f
CH
1550 trace_xfs_buf_delwri_queue(bp, _RET_IP_);
1551
5a8ee6ba 1552 ASSERT(!(bp->b_flags & XBF_READ));
1da177e4 1553
a6867a68 1554 spin_lock(dwlk);
ce8e922c 1555 if (!list_empty(&bp->b_list)) {
5a8ee6ba 1556 /* if already in the queue, move it to the tail */
ce8e922c 1557 ASSERT(bp->b_flags & _XBF_DELWRI_Q);
5a8ee6ba
CH
1558 list_move_tail(&bp->b_list, dwq);
1559 } else {
c9c12971 1560 /* start xfsbufd as it is about to have something to do */
5a8ee6ba
CH
1561 if (list_empty(dwq))
1562 wake_up_process(bp->b_target->bt_task);
c9c12971 1563
5a8ee6ba
CH
1564 atomic_inc(&bp->b_hold);
1565 bp->b_flags |= XBF_DELWRI | _XBF_DELWRI_Q | XBF_ASYNC;
1566 list_add_tail(&bp->b_list, dwq);
1567 }
ce8e922c 1568 bp->b_queuetime = jiffies;
a6867a68 1569 spin_unlock(dwlk);
1da177e4
LT
1570}
1571
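/*
 * Delayed write sketch, mirroring how delwri callers hand off a dirty
 * buffer: the queue takes its own hold and sets XBF_DELWRI | _XBF_DELWRI_Q
 * | XBF_ASYNC, the caller then drops its lock and reference, and xfsbufd
 * writes the buffer back once it has aged (or sooner if promoted).
 * Hypothetical caller, for illustration only.
 */
STATIC void
xfs_buf_dirty_example(
	xfs_buf_t		*bp)
{
	xfs_buf_delwri_queue(bp);
	xfs_buf_relse(bp);		/* the queue keeps its own reference */
}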
1572void
ce8e922c
NS
1573xfs_buf_delwri_dequeue(
1574 xfs_buf_t *bp)
1da177e4 1575{
ce8e922c 1576 spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock;
1da177e4
LT
1577 int dequeued = 0;
1578
a6867a68 1579 spin_lock(dwlk);
ce8e922c
NS
1580 if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) {
1581 ASSERT(bp->b_flags & _XBF_DELWRI_Q);
1582 list_del_init(&bp->b_list);
1da177e4
LT
1583 dequeued = 1;
1584 }
ce8e922c 1585 bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q);
a6867a68 1586 spin_unlock(dwlk);
1da177e4
LT
1587
1588 if (dequeued)
ce8e922c 1589 xfs_buf_rele(bp);
1da177e4 1590
0b1b213f 1591 trace_xfs_buf_delwri_dequeue(bp, _RET_IP_);
1da177e4
LT
1592}
1593
d808f617
DC
1594/*
1595 * If a delwri buffer needs to be pushed before it has aged out, then promote
1596 * it to the head of the delwri queue so that it will be flushed on the next
1597 * xfsbufd run. We do this by resetting the queuetime of the buffer to be older
1598 * than the age currently needed to flush the buffer. Hence the next time the
1599 * xfsbufd sees it is guaranteed to be considered old enough to flush.
1600 */
1601void
1602xfs_buf_delwri_promote(
1603 struct xfs_buf *bp)
1604{
1605 struct xfs_buftarg *btp = bp->b_target;
1606 long age = xfs_buf_age_centisecs * msecs_to_jiffies(10) + 1;
1607
1608 ASSERT(bp->b_flags & XBF_DELWRI);
1609 ASSERT(bp->b_flags & _XBF_DELWRI_Q);
1610
1611 /*
1612 * Check the buffer age before locking the delayed write queue as we
1613 * don't need to promote buffers that are already past the flush age.
1614 */
1615 if (bp->b_queuetime < jiffies - age)
1616 return;
1617 bp->b_queuetime = jiffies - age;
1618 spin_lock(&btp->bt_delwrite_lock);
1619 list_move(&bp->b_list, &btp->bt_delwrite_queue);
1620 spin_unlock(&btp->bt_delwrite_lock);
1621}
1622
1da177e4 1623STATIC void
ce8e922c 1624xfs_buf_runall_queues(
1da177e4
LT
1625 struct workqueue_struct *queue)
1626{
1627 flush_workqueue(queue);
1628}
1629
585e6d88
DC
1630/*
1631 * Move as many buffers as specified to the supplied list
1632 * indicating if we skipped any buffers to prevent deadlocks.
1633 */
1634STATIC int
1635xfs_buf_delwri_split(
1636 xfs_buftarg_t *target,
1637 struct list_head *list,
5e6a07df 1638 unsigned long age)
585e6d88
DC
1639{
1640 xfs_buf_t *bp, *n;
1641 struct list_head *dwq = &target->bt_delwrite_queue;
1642 spinlock_t *dwlk = &target->bt_delwrite_lock;
1643 int skipped = 0;
5e6a07df 1644 int force;
585e6d88 1645
5e6a07df 1646 force = test_and_clear_bit(XBT_FORCE_FLUSH, &target->bt_flags);
585e6d88
DC
1647 INIT_LIST_HEAD(list);
1648 spin_lock(dwlk);
1649 list_for_each_entry_safe(bp, n, dwq, b_list) {
585e6d88
DC
1650 ASSERT(bp->b_flags & XBF_DELWRI);
1651
811e64c7 1652 if (!xfs_buf_ispinned(bp) && xfs_buf_trylock(bp)) {
5e6a07df 1653 if (!force &&
585e6d88
DC
1654 time_before(jiffies, bp->b_queuetime + age)) {
1655 xfs_buf_unlock(bp);
1656 break;
1657 }
1658
1d5ae5df 1659 bp->b_flags &= ~(XBF_DELWRI | _XBF_DELWRI_Q);
585e6d88
DC
1660 bp->b_flags |= XBF_WRITE;
1661 list_move_tail(&bp->b_list, list);
bfe27419 1662 trace_xfs_buf_delwri_split(bp, _RET_IP_);
585e6d88
DC
1663 } else
1664 skipped++;
1665 }
1666 spin_unlock(dwlk);
1667
1668 return skipped;
1669
1670}
1671
089716aa
DC
1672/*
1673 * Compare function is more complex than it needs to be because
1674 * the return value is only 32 bits and we are doing comparisons
1675 * on 64 bit values
1676 */
1677static int
1678xfs_buf_cmp(
1679 void *priv,
1680 struct list_head *a,
1681 struct list_head *b)
1682{
1683 struct xfs_buf *ap = container_of(a, struct xfs_buf, b_list);
1684 struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list);
1685 xfs_daddr_t diff;
1686
1687 diff = ap->b_bn - bp->b_bn;
1688 if (diff < 0)
1689 return -1;
1690 if (diff > 0)
1691 return 1;
1692 return 0;
1693}
1694
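/*
 * Why the explicit three-way compare above: b_bn is a 64 bit daddr but the
 * list_sort() callback returns an int, so a naive subtraction can truncate
 * the high bits and report two far-apart buffers as "equal".  Disabled
 * illustration of the broken variant:
 */
#if 0
static int
xfs_buf_cmp_naive(
	void			*priv,
	struct list_head	*a,
	struct list_head	*b)
{
	struct xfs_buf	*ap = container_of(a, struct xfs_buf, b_list);
	struct xfs_buf	*bp = container_of(b, struct xfs_buf, b_list);

	/* e.g. 0x100000000 - 0 truncates to 0 in the int return: "equal" */
	return ap->b_bn - bp->b_bn;
}
#endif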
1da177e4 1695STATIC int
23ea4032 1696xfsbufd(
585e6d88 1697 void *data)
1da177e4 1698{
089716aa 1699 xfs_buftarg_t *target = (xfs_buftarg_t *)data;
1da177e4 1700
1da177e4
LT
1701 current->flags |= PF_MEMALLOC;
1702
978c7b2f
RW
1703 set_freezable();
1704
1da177e4 1705 do {
c9c12971
DC
1706 long age = xfs_buf_age_centisecs * msecs_to_jiffies(10);
1707 long tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10);
089716aa 1708 struct list_head tmp;
a1b7ea5d 1709 struct blk_plug plug;
c9c12971 1710
3e1d1d28 1711 if (unlikely(freezing(current))) {
ce8e922c 1712 set_bit(XBT_FORCE_SLEEP, &target->bt_flags);
3e1d1d28 1713 refrigerator();
abd0cf7a 1714 } else {
ce8e922c 1715 clear_bit(XBT_FORCE_SLEEP, &target->bt_flags);
abd0cf7a 1716 }
1da177e4 1717
c9c12971
DC
1718 /* sleep for a long time if there is nothing to do. */
1719 if (list_empty(&target->bt_delwrite_queue))
1720 tout = MAX_SCHEDULE_TIMEOUT;
1721 schedule_timeout_interruptible(tout);
1da177e4 1722
c9c12971 1723 xfs_buf_delwri_split(target, &tmp, age);
089716aa 1724 list_sort(NULL, &tmp, xfs_buf_cmp);
a1b7ea5d
CH
1725
1726 blk_start_plug(&plug);
1da177e4 1727 while (!list_empty(&tmp)) {
089716aa
DC
1728 struct xfs_buf *bp;
1729 bp = list_first_entry(&tmp, struct xfs_buf, b_list);
ce8e922c 1730 list_del_init(&bp->b_list);
939d723b 1731 xfs_bdstrat_cb(bp);
1da177e4 1732 }
a1b7ea5d 1733 blk_finish_plug(&plug);
4df08c52 1734 } while (!kthread_should_stop());
1da177e4 1735
4df08c52 1736 return 0;
1da177e4
LT
1737}
1738
1739/*
ce8e922c
NS
1740 * Go through all incore buffers, and release buffers if they belong to
1741 * the given device. This is used in filesystem error handling to
1742 * preserve the consistency of its metadata.
1da177e4
LT
1743 */
1744int
1745xfs_flush_buftarg(
585e6d88
DC
1746 xfs_buftarg_t *target,
1747 int wait)
1da177e4 1748{
089716aa 1749 xfs_buf_t *bp;
585e6d88 1750 int pincount = 0;
089716aa
DC
1751 LIST_HEAD(tmp_list);
1752 LIST_HEAD(wait_list);
a1b7ea5d 1753 struct blk_plug plug;
1da177e4 1754
c626d174 1755 xfs_buf_runall_queues(xfsconvertd_workqueue);
ce8e922c
NS
1756 xfs_buf_runall_queues(xfsdatad_workqueue);
1757 xfs_buf_runall_queues(xfslogd_workqueue);
1da177e4 1758
5e6a07df 1759 set_bit(XBT_FORCE_FLUSH, &target->bt_flags);
089716aa 1760 pincount = xfs_buf_delwri_split(target, &tmp_list, 0);
1da177e4
LT
1761
1762 /*
089716aa
DC
1763 * Dropped the delayed write list lock, now walk the temporary list.
1764 * All I/O is issued async and then if we need to wait for completion
1765 * we do that after issuing all the IO.
1da177e4 1766 */
089716aa 1767 list_sort(NULL, &tmp_list, xfs_buf_cmp);
a1b7ea5d
CH
1768
1769 blk_start_plug(&plug);
089716aa
DC
1770 while (!list_empty(&tmp_list)) {
1771 bp = list_first_entry(&tmp_list, struct xfs_buf, b_list);
585e6d88 1772 ASSERT(target == bp->b_target);
089716aa
DC
1773 list_del_init(&bp->b_list);
1774 if (wait) {
ce8e922c 1775 bp->b_flags &= ~XBF_ASYNC;
089716aa
DC
1776 list_add(&bp->b_list, &wait_list);
1777 }
939d723b 1778 xfs_bdstrat_cb(bp);
1da177e4 1779 }
a1b7ea5d 1780 blk_finish_plug(&plug);
1da177e4 1781
089716aa 1782 if (wait) {
a1b7ea5d 1783 /* Wait for IO to complete. */
089716aa
DC
1784 while (!list_empty(&wait_list)) {
1785 bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
f07c2250 1786
089716aa 1787 list_del_init(&bp->b_list);
1a1a3e97 1788 xfs_buf_iowait(bp);
089716aa
DC
1789 xfs_buf_relse(bp);
1790 }
1da177e4
LT
1791 }
1792
1da177e4
LT
1793 return pincount;
1794}
1795
04d8b284 1796int __init
ce8e922c 1797xfs_buf_init(void)
1da177e4 1798{
8758280f
NS
1799 xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf",
1800 KM_ZONE_HWALIGN, NULL);
ce8e922c 1801 if (!xfs_buf_zone)
0b1b213f 1802 goto out;
04d8b284 1803
51749e47 1804 xfslogd_workqueue = alloc_workqueue("xfslogd",
6370a6ad 1805 WQ_MEM_RECLAIM | WQ_HIGHPRI, 1);
23ea4032 1806 if (!xfslogd_workqueue)
04d8b284 1807 goto out_free_buf_zone;
1da177e4 1808
83e75904 1809 xfsdatad_workqueue = alloc_workqueue("xfsdatad", WQ_MEM_RECLAIM, 1);
23ea4032
CH
1810 if (!xfsdatad_workqueue)
1811 goto out_destroy_xfslogd_workqueue;
1da177e4 1812
83e75904
TH
1813 xfsconvertd_workqueue = alloc_workqueue("xfsconvertd",
1814 WQ_MEM_RECLAIM, 1);
c626d174
DC
1815 if (!xfsconvertd_workqueue)
1816 goto out_destroy_xfsdatad_workqueue;
1817
23ea4032 1818 return 0;
1da177e4 1819
c626d174
DC
1820 out_destroy_xfsdatad_workqueue:
1821 destroy_workqueue(xfsdatad_workqueue);
23ea4032
CH
1822 out_destroy_xfslogd_workqueue:
1823 destroy_workqueue(xfslogd_workqueue);
23ea4032 1824 out_free_buf_zone:
ce8e922c 1825 kmem_zone_destroy(xfs_buf_zone);
0b1b213f 1826 out:
8758280f 1827 return -ENOMEM;
1da177e4
LT
1828}
1829
1da177e4 1830void
ce8e922c 1831xfs_buf_terminate(void)
1da177e4 1832{
c626d174 1833 destroy_workqueue(xfsconvertd_workqueue);
04d8b284
CH
1834 destroy_workqueue(xfsdatad_workqueue);
1835 destroy_workqueue(xfslogd_workqueue);
ce8e922c 1836 kmem_zone_destroy(xfs_buf_zone);
1da177e4 1837}
e6a0e9cd
TS
1838
1839#ifdef CONFIG_KDB_MODULES
1840struct list_head *
1841xfs_get_buftarg_list(void)
1842{
1843 return &xfs_buftarg_list;
1844}
1845#endif