]> git.proxmox.com Git - mirror_ubuntu-zesty-kernel.git/blame - fs/nfs/direct.c
NFS: avoid some stat gathering for direct io
[mirror_ubuntu-zesty-kernel.git] / fs / nfs / direct.c
CommitLineData
1da177e4
LT
/*
 * linux/fs/nfs/direct.c
 *
 * Copyright (C) 2003 by Chuck Lever <cel@netapp.com>
 *
 * High-performance uncached I/O for the Linux NFS client
 *
 * There are important applications whose performance or correctness
 * depends on uncached access to file data.  Database clusters
 * (multiple copies of the same instance running on separate hosts)
 * implement their own cache coherency protocol that subsumes file
 * system cache protocols.  Applications that process datasets
 * considerably larger than the client's memory do not always benefit
 * from a local cache.  A streaming video server, for instance, has no
 * need to cache the contents of a file.
 *
 * When an application requests uncached I/O, all read and write requests
 * are made directly to the server; data stored or fetched via these
 * requests is not cached in the Linux page cache.  The client does not
 * correct unaligned requests from applications.  All requested bytes are
 * held on permanent storage before a direct write system call returns to
 * an application.
 *
 * Solaris implements an uncached I/O facility called directio() that
 * is used for backups and sequential I/O to very large files.  Solaris
 * also supports uncaching whole NFS partitions with "-o forcedirectio,"
 * an undocumented mount option.
 *
 * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust, with
 * help from Andrew Morton.
 *
 * 18 Dec 2001	Initial implementation for 2.4  --cel
 * 08 Jul 2002	Version for 2.4.19, with bug fixes --trondmy
 * 08 Jun 2003	Port to 2.5 APIs  --cel
 * 31 Mar 2004	Handle direct I/O without VFS support  --cel
 * 15 Sep 2004	Parallel async reads  --cel
 * 04 May 2005	support O_DIRECT with aio  --cel
 *
 */
40
1da177e4
LT
41#include <linux/errno.h>
42#include <linux/sched.h>
43#include <linux/kernel.h>
1da177e4
LT
44#include <linux/file.h>
45#include <linux/pagemap.h>
46#include <linux/kref.h>
5a0e3ad6 47#include <linux/slab.h>
7ec10f26 48#include <linux/task_io_accounting_ops.h>
1da177e4
LT
49
50#include <linux/nfs_fs.h>
51#include <linux/nfs_page.h>
52#include <linux/sunrpc/clnt.h>
53
1da177e4 54#include <asm/uaccess.h>
60063497 55#include <linux/atomic.h>
1da177e4 56
8d5658c9 57#include "internal.h"
91d5b470
CL
58#include "iostat.h"
59
1da177e4 60#define NFSDBG_FACILITY NFSDBG_VFS
1da177e4 61
e18b890b 62static struct kmem_cache *nfs_direct_cachep;
1da177e4
LT
63
64/*
65 * This represents a set of asynchronous requests that we're waiting on
66 */
67struct nfs_direct_req {
68 struct kref kref; /* release manager */
15ce4a0c
CL
69
70 /* I/O parameters */
a8881f5a 71 struct nfs_open_context *ctx; /* file open context info */
f11ac8db 72 struct nfs_lock_context *l_ctx; /* Lock context info */
99514f8f 73 struct kiocb * iocb; /* controlling i/o request */
88467055 74 struct inode * inode; /* target file of i/o */
15ce4a0c
CL
75
76 /* completion state */
607f31e8 77 atomic_t io_count; /* i/os we're waiting for */
15ce4a0c 78 spinlock_t lock; /* protect completion state */
15ce4a0c 79 ssize_t count, /* bytes actually processed */
1da177e4 80 error; /* any reported error */
d72b7a6b 81 struct completion completion; /* wait for i/o completion */
fad61490
TM
82
83 /* commit state */
607f31e8 84 struct list_head rewrite_list; /* saved nfs_write_data structs */
0b7c0153 85 struct nfs_commit_data *commit_data; /* special write_data for commits */
fad61490
TM
86 int flags;
87#define NFS_ODIRECT_DO_COMMIT (1) /* an unstable reply was received */
88#define NFS_ODIRECT_RESCHED_WRITES (2) /* write verification failed */
89 struct nfs_writeverf verf; /* unstable write verifier */
1da177e4
LT
90};
91
fad61490 92static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode);
607f31e8
TM
93static const struct rpc_call_ops nfs_write_direct_ops;
94
95static inline void get_dreq(struct nfs_direct_req *dreq)
96{
97 atomic_inc(&dreq->io_count);
98}
99
100static inline int put_dreq(struct nfs_direct_req *dreq)
101{
102 return atomic_dec_and_test(&dreq->io_count);
103}
104
1da177e4 105/**
b8a32e2b
CL
106 * nfs_direct_IO - NFS address space operation for direct I/O
107 * @rw: direction (read or write)
108 * @iocb: target I/O control block
109 * @iov: array of vectors that define I/O buffer
110 * @pos: offset in file to begin the operation
111 * @nr_segs: size of iovec array
112 *
113 * The presence of this routine in the address space ops vector means
114 * the NFS client supports direct I/O. However, we shunt off direct
115 * read and write requests before the VFS gets them, so this method
116 * should never be called.
1da177e4 117 */
b8a32e2b
CL
118ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs)
119{
b8a32e2b 120 dprintk("NFS: nfs_direct_IO (%s) off/no(%Ld/%lu) EINVAL\n",
01cce933 121 iocb->ki_filp->f_path.dentry->d_name.name,
e99170ff 122 (long long) pos, nr_segs);
b8a32e2b
CL
123
124 return -EINVAL;
125}
126
/* Drop the page-cache references pinned by get_user_pages(). */
static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
{
	unsigned int i;
	for (i = 0; i < npages; i++)
		page_cache_release(pages[i]);
}
133
93619e59 134static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
1da177e4 135{
93619e59
CL
136 struct nfs_direct_req *dreq;
137
e94b1766 138 dreq = kmem_cache_alloc(nfs_direct_cachep, GFP_KERNEL);
93619e59
CL
139 if (!dreq)
140 return NULL;
141
142 kref_init(&dreq->kref);
607f31e8 143 kref_get(&dreq->kref);
d72b7a6b 144 init_completion(&dreq->completion);
fad61490 145 INIT_LIST_HEAD(&dreq->rewrite_list);
93619e59 146 dreq->iocb = NULL;
a8881f5a 147 dreq->ctx = NULL;
f11ac8db 148 dreq->l_ctx = NULL;
15ce4a0c 149 spin_lock_init(&dreq->lock);
607f31e8 150 atomic_set(&dreq->io_count, 0);
15ce4a0c
CL
151 dreq->count = 0;
152 dreq->error = 0;
fad61490 153 dreq->flags = 0;
93619e59
CL
154
155 return dreq;
1da177e4
LT
156}
157
b4946ffb 158static void nfs_direct_req_free(struct kref *kref)
1da177e4
LT
159{
160 struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);
a8881f5a 161
f11ac8db
TM
162 if (dreq->l_ctx != NULL)
163 nfs_put_lock_context(dreq->l_ctx);
a8881f5a
TM
164 if (dreq->ctx != NULL)
165 put_nfs_open_context(dreq->ctx);
1da177e4
LT
166 kmem_cache_free(nfs_direct_cachep, dreq);
167}
168
b4946ffb
TM
169static void nfs_direct_req_release(struct nfs_direct_req *dreq)
170{
171 kref_put(&dreq->kref, nfs_direct_req_free);
172}
173
bc0fb201
CL
174/*
175 * Collects and returns the final error value/byte-count.
176 */
177static ssize_t nfs_direct_wait(struct nfs_direct_req *dreq)
178{
15ce4a0c 179 ssize_t result = -EIOCBQUEUED;
bc0fb201
CL
180
181 /* Async requests don't wait here */
182 if (dreq->iocb)
183 goto out;
184
150030b7 185 result = wait_for_completion_killable(&dreq->completion);
bc0fb201
CL
186
187 if (!result)
15ce4a0c 188 result = dreq->error;
bc0fb201 189 if (!result)
15ce4a0c 190 result = dreq->count;
bc0fb201
CL
191
192out:
bc0fb201
CL
193 return (ssize_t) result;
194}
195
63ab46ab 196/*
607f31e8
TM
197 * Synchronous I/O uses a stack-allocated iocb. Thus we can't trust
198 * the iocb is still valid here if this is a synchronous request.
63ab46ab
CL
199 */
200static void nfs_direct_complete(struct nfs_direct_req *dreq)
201{
63ab46ab 202 if (dreq->iocb) {
15ce4a0c 203 long res = (long) dreq->error;
63ab46ab 204 if (!res)
15ce4a0c 205 res = (long) dreq->count;
63ab46ab 206 aio_complete(dreq->iocb, res, 0);
d72b7a6b
TM
207 }
208 complete_all(&dreq->completion);
63ab46ab 209
b4946ffb 210 nfs_direct_req_release(dreq);
63ab46ab
CL
211}
212
584aa810 213void nfs_direct_readpage_release(struct nfs_page *req)
1da177e4 214{
584aa810
FI
215 dprintk("NFS: direct read done (%s/%lld %d@%lld)\n",
216 req->wb_context->dentry->d_inode->i_sb->s_id,
217 (long long)NFS_FILEID(req->wb_context->dentry->d_inode),
218 req->wb_bytes,
219 (long long)req_offset(req));
220 nfs_release_request(req);
fdd1e74c
TM
221}
222
584aa810 223static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
fdd1e74c 224{
584aa810
FI
225 unsigned long bytes = 0;
226 struct nfs_direct_req *dreq = hdr->dreq;
fdd1e74c 227
584aa810
FI
228 if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
229 goto out_put;
15ce4a0c
CL
230
231 spin_lock(&dreq->lock);
584aa810
FI
232 if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && (hdr->good_bytes == 0))
233 dreq->error = hdr->error;
234 else
235 dreq->count += hdr->good_bytes;
236 spin_unlock(&dreq->lock);
237
238 if (!test_bit(NFS_IOHDR_ERROR, &hdr->flags)) {
239 while (!list_empty(&hdr->pages)) {
240 struct nfs_page *req = nfs_list_entry(hdr->pages.next);
241 struct page *page = req->wb_page;
242
243 if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) {
244 if (bytes > hdr->good_bytes)
245 zero_user(page, 0, PAGE_SIZE);
246 else if (hdr->good_bytes - bytes < PAGE_SIZE)
247 zero_user_segment(page,
248 hdr->good_bytes & ~PAGE_MASK,
249 PAGE_SIZE);
250 }
251 bytes += req->wb_bytes;
252 nfs_list_remove_request(req);
253 nfs_direct_readpage_release(req);
254 if (!PageCompound(page))
255 set_page_dirty(page);
256 page_cache_release(page);
257 }
d4a8f367 258 } else {
584aa810
FI
259 while (!list_empty(&hdr->pages)) {
260 struct nfs_page *req = nfs_list_entry(hdr->pages.next);
261
262 if (bytes < hdr->good_bytes)
263 if (!PageCompound(req->wb_page))
264 set_page_dirty(req->wb_page);
265 bytes += req->wb_bytes;
266 page_cache_release(req->wb_page);
267 nfs_list_remove_request(req);
268 nfs_direct_readpage_release(req);
269 }
d4a8f367 270 }
584aa810 271out_put:
607f31e8
TM
272 if (put_dreq(dreq))
273 nfs_direct_complete(dreq);
584aa810 274 hdr->release(hdr);
1da177e4
LT
275}
276
584aa810 277static void nfs_sync_pgio_error(struct list_head *head)
cd841605 278{
584aa810 279 struct nfs_page *req;
cd841605 280
584aa810
FI
281 while (!list_empty(head)) {
282 req = nfs_list_entry(head->next);
283 nfs_list_remove_request(req);
284 nfs_release_request(req);
285 }
cd841605
FI
286}
287
584aa810
FI
288static void nfs_direct_pgio_init(struct nfs_pgio_header *hdr)
289{
290 get_dreq(hdr->dreq);
291}
292
293static const struct nfs_pgio_completion_ops nfs_direct_read_completion_ops = {
294 .error_cleanup = nfs_sync_pgio_error,
295 .init_hdr = nfs_direct_pgio_init,
296 .completion = nfs_direct_read_completion,
297};
298
d4cc948b 299/*
607f31e8
TM
300 * For each rsize'd chunk of the user's buffer, dispatch an NFS READ
301 * operation. If nfs_readdata_alloc() or get_user_pages() fails,
302 * bail and stop sending more reads. Read length accounting is
303 * handled automatically by nfs_direct_read_result(). Otherwise, if
304 * no requests have been sent, just return an error.
1da177e4 305 */
584aa810 306static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *desc,
02fe4946
CL
307 const struct iovec *iov,
308 loff_t pos)
1da177e4 309{
584aa810 310 struct nfs_direct_req *dreq = desc->pg_dreq;
a8881f5a 311 struct nfs_open_context *ctx = dreq->ctx;
3d4ff43d 312 struct inode *inode = ctx->dentry->d_inode;
02fe4946
CL
313 unsigned long user_addr = (unsigned long)iov->iov_base;
314 size_t count = iov->iov_len;
5dd602f2 315 size_t rsize = NFS_SERVER(inode)->rsize;
607f31e8
TM
316 unsigned int pgbase;
317 int result;
318 ssize_t started = 0;
584aa810
FI
319 struct page **pagevec = NULL;
320 unsigned int npages;
607f31e8 321
1da177e4 322 do {
5dd602f2 323 size_t bytes;
584aa810 324 int i;
1da177e4 325
e9f7bee1 326 pgbase = user_addr & ~PAGE_MASK;
584aa810 327 bytes = min(max(rsize, PAGE_SIZE), count);
e9f7bee1 328
607f31e8 329 result = -ENOMEM;
584aa810
FI
330 npages = nfs_page_array_len(pgbase, bytes);
331 if (!pagevec)
332 pagevec = kmalloc(npages * sizeof(struct page *),
333 GFP_KERNEL);
334 if (!pagevec)
4db6e0b7 335 break;
607f31e8
TM
336 down_read(&current->mm->mmap_sem);
337 result = get_user_pages(current, current->mm, user_addr,
584aa810 338 npages, 1, 0, pagevec, NULL);
607f31e8 339 up_read(&current->mm->mmap_sem);
584aa810 340 if (result < 0)
749e146e 341 break;
584aa810 342 if ((unsigned)result < npages) {
d9df8d6b
TM
343 bytes = result * PAGE_SIZE;
344 if (bytes <= pgbase) {
584aa810 345 nfs_direct_release_pages(pagevec, result);
d9df8d6b
TM
346 break;
347 }
348 bytes -= pgbase;
584aa810 349 npages = result;
607f31e8
TM
350 }
351
584aa810
FI
352 for (i = 0; i < npages; i++) {
353 struct nfs_page *req;
354 unsigned int req_len = min(bytes, PAGE_SIZE - pgbase);
355 /* XXX do we need to do the eof zeroing found in async_filler? */
356 req = nfs_create_request(dreq->ctx, dreq->inode,
357 pagevec[i],
358 pgbase, req_len);
359 if (IS_ERR(req)) {
360 nfs_direct_release_pages(pagevec + i,
361 npages - i);
362 result = PTR_ERR(req);
363 break;
364 }
365 req->wb_index = pos >> PAGE_SHIFT;
366 req->wb_offset = pos & ~PAGE_MASK;
367 if (!nfs_pageio_add_request(desc, req)) {
368 result = desc->pg_error;
369 nfs_release_request(req);
370 nfs_direct_release_pages(pagevec + i,
371 npages - i);
372 break;
373 }
374 pgbase = 0;
375 bytes -= req_len;
376 started += req_len;
377 user_addr += req_len;
378 pos += req_len;
379 count -= req_len;
380 }
1da177e4 381 } while (count != 0);
607f31e8 382
584aa810
FI
383 kfree(pagevec);
384
607f31e8 385 if (started)
c216fd70 386 return started;
607f31e8 387 return result < 0 ? (ssize_t) result : -EFAULT;
1da177e4
LT
388}
389
19f73787
CL
390static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
391 const struct iovec *iov,
392 unsigned long nr_segs,
393 loff_t pos)
394{
584aa810 395 struct nfs_pageio_descriptor desc;
19f73787
CL
396 ssize_t result = -EINVAL;
397 size_t requested_bytes = 0;
398 unsigned long seg;
399
584aa810
FI
400 nfs_pageio_init_read(&desc, dreq->inode,
401 &nfs_direct_read_completion_ops);
19f73787 402 get_dreq(dreq);
584aa810 403 desc.pg_dreq = dreq;
19f73787
CL
404
405 for (seg = 0; seg < nr_segs; seg++) {
406 const struct iovec *vec = &iov[seg];
584aa810 407 result = nfs_direct_read_schedule_segment(&desc, vec, pos);
19f73787
CL
408 if (result < 0)
409 break;
410 requested_bytes += result;
411 if ((size_t)result < vec->iov_len)
412 break;
413 pos += vec->iov_len;
414 }
415
584aa810
FI
416 nfs_pageio_complete(&desc);
417
839f7ad6
CL
418 /*
419 * If no bytes were started, return the error, and let the
420 * generic layer handle the completion.
421 */
422 if (requested_bytes == 0) {
423 nfs_direct_req_release(dreq);
424 return result < 0 ? result : -EIO;
425 }
426
19f73787
CL
427 if (put_dreq(dreq))
428 nfs_direct_complete(dreq);
839f7ad6 429 return 0;
19f73787
CL
430}
431
c216fd70
CL
432static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
433 unsigned long nr_segs, loff_t pos)
1da177e4 434{
f11ac8db 435 ssize_t result = -ENOMEM;
99514f8f 436 struct inode *inode = iocb->ki_filp->f_mapping->host;
1da177e4
LT
437 struct nfs_direct_req *dreq;
438
607f31e8 439 dreq = nfs_direct_req_alloc();
f11ac8db
TM
440 if (dreq == NULL)
441 goto out;
1da177e4 442
91d5b470 443 dreq->inode = inode;
cd3758e3 444 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
f11ac8db
TM
445 dreq->l_ctx = nfs_get_lock_context(dreq->ctx);
446 if (dreq->l_ctx == NULL)
447 goto out_release;
487b8372
CL
448 if (!is_sync_kiocb(iocb))
449 dreq->iocb = iocb;
1da177e4 450
c216fd70 451 result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos);
607f31e8
TM
452 if (!result)
453 result = nfs_direct_wait(dreq);
f11ac8db 454out_release:
b4946ffb 455 nfs_direct_req_release(dreq);
f11ac8db 456out:
1da177e4
LT
457 return result;
458}
459
cd841605
FI
460static void nfs_direct_writehdr_release(struct nfs_write_header *whdr)
461{
462 struct nfs_write_data *data = &whdr->rpc_data;
463
30dd374f
FI
464 if (data->pages.pagevec != data->pages.page_array)
465 kfree(data->pages.pagevec);
cd841605
FI
466 nfs_writehdr_free(&whdr->header);
467}
468
fad61490 469static void nfs_direct_free_writedata(struct nfs_direct_req *dreq)
1da177e4 470{
607f31e8 471 while (!list_empty(&dreq->rewrite_list)) {
cd841605
FI
472 struct nfs_pgio_header *hdr = list_entry(dreq->rewrite_list.next, struct nfs_pgio_header, pages);
473 struct nfs_write_header *whdr = container_of(hdr, struct nfs_write_header, header);
30dd374f
FI
474 struct nfs_page_array *p = &whdr->rpc_data.pages;
475
cd841605 476 list_del(&hdr->pages);
30dd374f 477 nfs_direct_release_pages(p->pagevec, p->npages);
cd841605 478 nfs_direct_writehdr_release(whdr);
fad61490
TM
479 }
480}
1da177e4 481
fad61490
TM
482#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
483static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
484{
607f31e8
TM
485 struct inode *inode = dreq->inode;
486 struct list_head *p;
487 struct nfs_write_data *data;
cd841605 488 struct nfs_pgio_header *hdr;
07737691 489 struct rpc_task *task;
bdc7f021
TM
490 struct rpc_message msg = {
491 .rpc_cred = dreq->ctx->cred,
492 };
84115e1c
TM
493 struct rpc_task_setup task_setup_data = {
494 .rpc_client = NFS_CLIENT(inode),
a8b40bc7 495 .rpc_message = &msg,
84115e1c 496 .callback_ops = &nfs_write_direct_ops,
101070ca 497 .workqueue = nfsiod_workqueue,
84115e1c
TM
498 .flags = RPC_TASK_ASYNC,
499 };
1da177e4 500
fad61490 501 dreq->count = 0;
607f31e8
TM
502 get_dreq(dreq);
503
504 list_for_each(p, &dreq->rewrite_list) {
cd841605
FI
505 hdr = list_entry(p, struct nfs_pgio_header, pages);
506 data = &(container_of(hdr, struct nfs_write_header, header))->rpc_data;
607f31e8
TM
507
508 get_dreq(dreq);
509
bdc7f021
TM
510 /* Use stable writes */
511 data->args.stable = NFS_FILE_SYNC;
512
607f31e8
TM
513 /*
514 * Reset data->res.
515 */
516 nfs_fattr_init(&data->fattr);
517 data->res.count = data->args.count;
518 memset(&data->verf, 0, sizeof(data->verf));
519
520 /*
521 * Reuse data->task; data->args should not have changed
522 * since the original request was sent.
523 */
07737691 524 task_setup_data.task = &data->task;
84115e1c 525 task_setup_data.callback_data = data;
bdc7f021
TM
526 msg.rpc_argp = &data->args;
527 msg.rpc_resp = &data->res;
528 NFS_PROTO(inode)->write_setup(data, &msg);
607f31e8 529
607f31e8
TM
530 /*
531 * We're called via an RPC callback, so BKL is already held.
532 */
07737691
TM
533 task = rpc_run_task(&task_setup_data);
534 if (!IS_ERR(task))
535 rpc_put_task(task);
607f31e8
TM
536
537 dprintk("NFS: %5u rescheduled direct write call (req %s/%Ld, %u bytes @ offset %Lu)\n",
538 data->task.tk_pid,
539 inode->i_sb->s_id,
540 (long long)NFS_FILEID(inode),
541 data->args.count,
542 (unsigned long long)data->args.offset);
543 }
fedb595c 544
607f31e8
TM
545 if (put_dreq(dreq))
546 nfs_direct_write_complete(dreq, inode);
fad61490
TM
547}
548
549static void nfs_direct_commit_result(struct rpc_task *task, void *calldata)
550{
0b7c0153 551 struct nfs_commit_data *data = calldata;
fad61490
TM
552
553 /* Call the NFS version-specific code */
c9d8f89d
TM
554 NFS_PROTO(data->inode)->commit_done(task, data);
555}
556
557static void nfs_direct_commit_release(void *calldata)
558{
0b7c0153
FI
559 struct nfs_commit_data *data = calldata;
560 struct nfs_direct_req *dreq = data->dreq;
c9d8f89d
TM
561 int status = data->task.tk_status;
562
563 if (status < 0) {
60fa3f76 564 dprintk("NFS: %5u commit failed with error %d.\n",
c9d8f89d 565 data->task.tk_pid, status);
fad61490 566 dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
60fa3f76 567 } else if (memcmp(&dreq->verf, &data->verf, sizeof(data->verf))) {
c9d8f89d 568 dprintk("NFS: %5u commit verify failed\n", data->task.tk_pid);
fad61490 569 dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
1da177e4
LT
570 }
571
c9d8f89d 572 dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status);
fad61490 573 nfs_direct_write_complete(dreq, data->inode);
1ae88b2e 574 nfs_commit_free(data);
1da177e4
LT
575}
576
fad61490 577static const struct rpc_call_ops nfs_commit_direct_ops = {
0b7c0153 578 .rpc_call_prepare = nfs_commit_prepare,
fad61490 579 .rpc_call_done = nfs_direct_commit_result,
c9d8f89d 580 .rpc_release = nfs_direct_commit_release,
fad61490
TM
581};
582
583static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
1da177e4 584{
0b7c0153 585 struct nfs_commit_data *data = dreq->commit_data;
07737691 586 struct rpc_task *task;
bdc7f021
TM
587 struct rpc_message msg = {
588 .rpc_argp = &data->args,
589 .rpc_resp = &data->res,
590 .rpc_cred = dreq->ctx->cred,
591 };
84115e1c 592 struct rpc_task_setup task_setup_data = {
07737691 593 .task = &data->task,
84115e1c 594 .rpc_client = NFS_CLIENT(dreq->inode),
bdc7f021 595 .rpc_message = &msg,
84115e1c
TM
596 .callback_ops = &nfs_commit_direct_ops,
597 .callback_data = data,
101070ca 598 .workqueue = nfsiod_workqueue,
84115e1c
TM
599 .flags = RPC_TASK_ASYNC,
600 };
1da177e4 601
fad61490 602 data->inode = dreq->inode;
bdc7f021 603 data->cred = msg.rpc_cred;
1da177e4 604
fad61490 605 data->args.fh = NFS_FH(data->inode);
607f31e8
TM
606 data->args.offset = 0;
607 data->args.count = 0;
fad61490
TM
608 data->res.fattr = &data->fattr;
609 data->res.verf = &data->verf;
65d26953 610 nfs_fattr_init(&data->fattr);
1da177e4 611
bdc7f021 612 NFS_PROTO(data->inode)->commit_setup(data, &msg);
1da177e4 613
fad61490
TM
614 /* Note: task.tk_ops->rpc_release will free dreq->commit_data */
615 dreq->commit_data = NULL;
1da177e4 616
e99170ff 617 dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
1da177e4 618
07737691
TM
619 task = rpc_run_task(&task_setup_data);
620 if (!IS_ERR(task))
621 rpc_put_task(task);
fad61490 622}
1da177e4 623
fad61490
TM
624static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
625{
626 int flags = dreq->flags;
1da177e4 627
fad61490
TM
628 dreq->flags = 0;
629 switch (flags) {
630 case NFS_ODIRECT_DO_COMMIT:
631 nfs_direct_commit_schedule(dreq);
1da177e4 632 break;
fad61490
TM
633 case NFS_ODIRECT_RESCHED_WRITES:
634 nfs_direct_write_reschedule(dreq);
635 break;
636 default:
fad61490
TM
637 if (dreq->commit_data != NULL)
638 nfs_commit_free(dreq->commit_data);
639 nfs_direct_free_writedata(dreq);
cd9ae2b6 640 nfs_zap_mapping(inode, inode->i_mapping);
fad61490
TM
641 nfs_direct_complete(dreq);
642 }
643}
1da177e4 644
fad61490
TM
645static void nfs_alloc_commit_data(struct nfs_direct_req *dreq)
646{
c9d8f89d 647 dreq->commit_data = nfs_commitdata_alloc();
fad61490 648 if (dreq->commit_data != NULL)
0b7c0153 649 dreq->commit_data->dreq = dreq;
fad61490
TM
650}
651#else
652static inline void nfs_alloc_commit_data(struct nfs_direct_req *dreq)
653{
654 dreq->commit_data = NULL;
655}
1da177e4 656
fad61490
TM
657static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
658{
fad61490 659 nfs_direct_free_writedata(dreq);
cd9ae2b6 660 nfs_zap_mapping(inode, inode->i_mapping);
fad61490
TM
661 nfs_direct_complete(dreq);
662}
663#endif
1da177e4 664
static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
{
	struct nfs_write_data *data = calldata;

	nfs_writeback_done(task, data);
}
671
672/*
673 * NB: Return the value of the first error return code. Subsequent
674 * errors after the first one are ignored.
675 */
676static void nfs_direct_write_release(void *calldata)
677{
678 struct nfs_write_data *data = calldata;
cd841605
FI
679 struct nfs_pgio_header *hdr = data->header;
680 struct nfs_direct_req *dreq = (struct nfs_direct_req *) hdr->req;
c9d8f89d 681 int status = data->task.tk_status;
462d5b32 682
15ce4a0c 683 spin_lock(&dreq->lock);
1da177e4 684
eda3cef8 685 if (unlikely(status < 0)) {
432409ee 686 /* An error has occurred, so we should not commit */
60fa3f76 687 dreq->flags = 0;
eda3cef8 688 dreq->error = status;
eda3cef8 689 }
432409ee
NB
690 if (unlikely(dreq->error != 0))
691 goto out_unlock;
eda3cef8
TM
692
693 dreq->count += data->res.count;
1da177e4 694
fad61490
TM
695 if (data->res.verf->committed != NFS_FILE_SYNC) {
696 switch (dreq->flags) {
697 case 0:
698 memcpy(&dreq->verf, &data->verf, sizeof(dreq->verf));
699 dreq->flags = NFS_ODIRECT_DO_COMMIT;
1da177e4 700 break;
fad61490
TM
701 case NFS_ODIRECT_DO_COMMIT:
702 if (memcmp(&dreq->verf, &data->verf, sizeof(dreq->verf))) {
c9d8f89d 703 dprintk("NFS: %5u write verify failed\n", data->task.tk_pid);
fad61490
TM
704 dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
705 }
1da177e4 706 }
1da177e4 707 }
eda3cef8 708out_unlock:
fad61490 709 spin_unlock(&dreq->lock);
1da177e4 710
607f31e8 711 if (put_dreq(dreq))
cd841605 712 nfs_direct_write_complete(dreq, hdr->inode);
462d5b32
CL
713}
714
715static const struct rpc_call_ops nfs_write_direct_ops = {
def6ed7e 716 .rpc_call_prepare = nfs_write_prepare,
462d5b32 717 .rpc_call_done = nfs_direct_write_result,
fad61490 718 .rpc_release = nfs_direct_write_release,
462d5b32
CL
719};
720
721/*
607f31e8
TM
722 * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE
723 * operation. If nfs_writedata_alloc() or get_user_pages() fails,
724 * bail and stop sending more writes. Write length accounting is
725 * handled automatically by nfs_direct_write_result(). Otherwise, if
726 * no requests have been sent, just return an error.
462d5b32 727 */
02fe4946
CL
728static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
729 const struct iovec *iov,
730 loff_t pos, int sync)
462d5b32 731{
a8881f5a 732 struct nfs_open_context *ctx = dreq->ctx;
3d4ff43d 733 struct inode *inode = ctx->dentry->d_inode;
02fe4946
CL
734 unsigned long user_addr = (unsigned long)iov->iov_base;
735 size_t count = iov->iov_len;
07737691 736 struct rpc_task *task;
bdc7f021
TM
737 struct rpc_message msg = {
738 .rpc_cred = ctx->cred,
739 };
84115e1c
TM
740 struct rpc_task_setup task_setup_data = {
741 .rpc_client = NFS_CLIENT(inode),
bdc7f021 742 .rpc_message = &msg,
84115e1c 743 .callback_ops = &nfs_write_direct_ops,
101070ca 744 .workqueue = nfsiod_workqueue,
84115e1c
TM
745 .flags = RPC_TASK_ASYNC,
746 };
462d5b32 747 size_t wsize = NFS_SERVER(inode)->wsize;
607f31e8
TM
748 unsigned int pgbase;
749 int result;
750 ssize_t started = 0;
82b145c5 751
1da177e4 752 do {
cd841605 753 struct nfs_write_header *whdr;
82b145c5 754 struct nfs_write_data *data;
30dd374f 755 struct nfs_page_array *pages;
462d5b32
CL
756 size_t bytes;
757
e9f7bee1
TM
758 pgbase = user_addr & ~PAGE_MASK;
759 bytes = min(wsize,count);
760
607f31e8 761 result = -ENOMEM;
6c75dc0d 762 whdr = nfs_writehdr_alloc();
cd841605 763 if (unlikely(!whdr))
607f31e8
TM
764 break;
765
6c75dc0d
FI
766 data = nfs_writedata_alloc(&whdr->header, nfs_page_array_len(pgbase, bytes));
767 if (!data) {
768 nfs_writehdr_free(&whdr->header);
769 break;
770 }
771 data->header = &whdr->header;
772 atomic_inc(&data->header->refcnt);
30dd374f 773 pages = &data->pages;
cd841605 774
607f31e8
TM
775 down_read(&current->mm->mmap_sem);
776 result = get_user_pages(current, current->mm, user_addr,
30dd374f 777 pages->npages, 0, 0, pages->pagevec, NULL);
607f31e8 778 up_read(&current->mm->mmap_sem);
749e146e 779 if (result < 0) {
cd841605 780 nfs_direct_writehdr_release(whdr);
749e146e
CL
781 break;
782 }
30dd374f 783 if ((unsigned)result < pages->npages) {
d9df8d6b
TM
784 bytes = result * PAGE_SIZE;
785 if (bytes <= pgbase) {
30dd374f 786 nfs_direct_release_pages(pages->pagevec, result);
cd841605 787 nfs_direct_writehdr_release(whdr);
d9df8d6b
TM
788 break;
789 }
790 bytes -= pgbase;
30dd374f 791 pages->npages = result;
607f31e8
TM
792 }
793
794 get_dreq(dreq);
795
cd841605 796 list_move_tail(&whdr->header.pages, &dreq->rewrite_list);
462d5b32 797
cd841605
FI
798 whdr->header.req = (struct nfs_page *) dreq;
799 whdr->header.inode = inode;
800 whdr->header.cred = msg.rpc_cred;
462d5b32 801 data->args.fh = NFS_FH(inode);
1ae88b2e 802 data->args.context = ctx;
f11ac8db 803 data->args.lock_context = dreq->l_ctx;
88467055 804 data->args.offset = pos;
462d5b32 805 data->args.pgbase = pgbase;
30dd374f 806 data->args.pages = pages->pagevec;
462d5b32 807 data->args.count = bytes;
bdc7f021 808 data->args.stable = sync;
462d5b32
CL
809 data->res.fattr = &data->fattr;
810 data->res.count = bytes;
47989d74 811 data->res.verf = &data->verf;
65d26953 812 nfs_fattr_init(&data->fattr);
462d5b32 813
07737691 814 task_setup_data.task = &data->task;
84115e1c 815 task_setup_data.callback_data = data;
bdc7f021
TM
816 msg.rpc_argp = &data->args;
817 msg.rpc_resp = &data->res;
818 NFS_PROTO(inode)->write_setup(data, &msg);
1da177e4 819
07737691 820 task = rpc_run_task(&task_setup_data);
dbae4c73
TM
821 if (IS_ERR(task))
822 break;
1da177e4 823
a3f565b1
CL
824 dprintk("NFS: %5u initiated direct write call "
825 "(req %s/%Ld, %zu bytes @ offset %Lu)\n",
31f6852a 826 task->tk_pid,
462d5b32
CL
827 inode->i_sb->s_id,
828 (long long)NFS_FILEID(inode),
829 bytes,
830 (unsigned long long)data->args.offset);
31f6852a 831 rpc_put_task(task);
1da177e4 832
607f31e8
TM
833 started += bytes;
834 user_addr += bytes;
88467055 835 pos += bytes;
e9f7bee1
TM
836
837 /* FIXME: Remove this useless math from the final patch */
462d5b32 838 pgbase += bytes;
462d5b32 839 pgbase &= ~PAGE_MASK;
e9f7bee1 840 BUG_ON(pgbase != (user_addr & ~PAGE_MASK));
1da177e4 841
462d5b32
CL
842 count -= bytes;
843 } while (count != 0);
607f31e8 844
607f31e8 845 if (started)
c216fd70 846 return started;
607f31e8 847 return result < 0 ? (ssize_t) result : -EFAULT;
462d5b32 848}
1da177e4 849
19f73787
CL
850static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
851 const struct iovec *iov,
852 unsigned long nr_segs,
853 loff_t pos, int sync)
854{
855 ssize_t result = 0;
856 size_t requested_bytes = 0;
857 unsigned long seg;
858
859 get_dreq(dreq);
860
861 for (seg = 0; seg < nr_segs; seg++) {
862 const struct iovec *vec = &iov[seg];
02fe4946
CL
863 result = nfs_direct_write_schedule_segment(dreq, vec,
864 pos, sync);
19f73787
CL
865 if (result < 0)
866 break;
867 requested_bytes += result;
868 if ((size_t)result < vec->iov_len)
869 break;
870 pos += vec->iov_len;
871 }
872
839f7ad6
CL
873 /*
874 * If no bytes were started, return the error, and let the
875 * generic layer handle the completion.
876 */
877 if (requested_bytes == 0) {
878 nfs_direct_req_release(dreq);
879 return result < 0 ? result : -EIO;
880 }
881
19f73787
CL
882 if (put_dreq(dreq))
883 nfs_direct_write_complete(dreq, dreq->inode);
839f7ad6 884 return 0;
19f73787
CL
885}
886
c216fd70
CL
887static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
888 unsigned long nr_segs, loff_t pos,
889 size_t count)
462d5b32 890{
f11ac8db 891 ssize_t result = -ENOMEM;
c89f2ee5 892 struct inode *inode = iocb->ki_filp->f_mapping->host;
462d5b32 893 struct nfs_direct_req *dreq;
fad61490 894 size_t wsize = NFS_SERVER(inode)->wsize;
bdc7f021 895 int sync = NFS_UNSTABLE;
1da177e4 896
607f31e8 897 dreq = nfs_direct_req_alloc();
462d5b32 898 if (!dreq)
f11ac8db 899 goto out;
607f31e8
TM
900 nfs_alloc_commit_data(dreq);
901
b47d19de 902 if (dreq->commit_data == NULL || count <= wsize)
bdc7f021 903 sync = NFS_FILE_SYNC;
1da177e4 904
c89f2ee5 905 dreq->inode = inode;
cd3758e3 906 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
f11ac8db 907 dreq->l_ctx = nfs_get_lock_context(dreq->ctx);
568a810d 908 if (dreq->l_ctx == NULL)
f11ac8db 909 goto out_release;
c89f2ee5
CL
910 if (!is_sync_kiocb(iocb))
911 dreq->iocb = iocb;
1da177e4 912
c216fd70 913 result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, sync);
607f31e8
TM
914 if (!result)
915 result = nfs_direct_wait(dreq);
f11ac8db 916out_release:
b4946ffb 917 nfs_direct_req_release(dreq);
f11ac8db 918out:
1da177e4
LT
919 return result;
920}
921
922/**
923 * nfs_file_direct_read - file direct read operation for NFS files
924 * @iocb: target I/O control block
027445c3
BP
925 * @iov: vector of user buffers into which to read data
926 * @nr_segs: size of iov vector
88467055 927 * @pos: byte offset in file where reading starts
1da177e4
LT
928 *
929 * We use this function for direct reads instead of calling
930 * generic_file_aio_read() in order to avoid gfar's check to see if
931 * the request starts before the end of the file. For that check
932 * to work, we must generate a GETATTR before each direct read, and
933 * even then there is a window between the GETATTR and the subsequent
88467055 934 * READ where the file size could change. Our preference is simply
1da177e4
LT
935 * to do all reads the application wants, and the server will take
936 * care of managing the end of file boundary.
88467055 937 *
1da177e4
LT
938 * This function also eliminates unnecessarily updating the file's
939 * atime locally, as the NFS server sets the file's atime, and this
940 * client must read the updated atime from the server back into its
941 * cache.
942 */
027445c3
BP
943ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
944 unsigned long nr_segs, loff_t pos)
1da177e4
LT
945{
946 ssize_t retval = -EINVAL;
1da177e4 947 struct file *file = iocb->ki_filp;
1da177e4 948 struct address_space *mapping = file->f_mapping;
c216fd70
CL
949 size_t count;
950
951 count = iov_length(iov, nr_segs);
952 nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);
1da177e4 953
6da24bc9 954 dfprintk(FILE, "NFS: direct read(%s/%s, %zd@%Ld)\n",
01cce933
JJS
955 file->f_path.dentry->d_parent->d_name.name,
956 file->f_path.dentry->d_name.name,
c216fd70 957 count, (long long) pos);
1da177e4 958
1da177e4
LT
959 retval = 0;
960 if (!count)
961 goto out;
962
29884df0
TM
963 retval = nfs_sync_mapping(mapping);
964 if (retval)
965 goto out;
1da177e4 966
7ec10f26
KK
967 task_io_account_read(count);
968
c216fd70 969 retval = nfs_direct_read(iocb, iov, nr_segs, pos);
1da177e4 970 if (retval > 0)
0cdd80d0 971 iocb->ki_pos = pos + retval;
1da177e4
LT
972
973out:
974 return retval;
975}
976
977/**
978 * nfs_file_direct_write - file direct write operation for NFS files
979 * @iocb: target I/O control block
027445c3
BP
980 * @iov: vector of user buffers from which to write data
981 * @nr_segs: size of iov vector
88467055 982 * @pos: byte offset in file where writing starts
1da177e4
LT
983 *
984 * We use this function for direct writes instead of calling
985 * generic_file_aio_write() in order to avoid taking the inode
986 * semaphore and updating the i_size. The NFS server will set
987 * the new i_size and this client must read the updated size
988 * back into its cache. We let the server do generic write
989 * parameter checking and report problems.
990 *
1da177e4
LT
991 * We eliminate local atime updates, see direct read above.
992 *
993 * We avoid unnecessary page cache invalidations for normal cached
994 * readers of this file.
995 *
996 * Note that O_APPEND is not supported for NFS direct writes, as there
997 * is no atomic O_APPEND write facility in the NFS protocol.
998 */
027445c3
BP
999ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
1000 unsigned long nr_segs, loff_t pos)
1da177e4 1001{
070ea602 1002 ssize_t retval = -EINVAL;
1da177e4 1003 struct file *file = iocb->ki_filp;
1da177e4 1004 struct address_space *mapping = file->f_mapping;
c216fd70 1005 size_t count;
1da177e4 1006
c216fd70
CL
1007 count = iov_length(iov, nr_segs);
1008 nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);
1009
6da24bc9 1010 dfprintk(FILE, "NFS: direct write(%s/%s, %zd@%Ld)\n",
01cce933
JJS
1011 file->f_path.dentry->d_parent->d_name.name,
1012 file->f_path.dentry->d_name.name,
c216fd70 1013 count, (long long) pos);
027445c3 1014
ce1a8e67
CL
1015 retval = generic_write_checks(file, &pos, &count, 0);
1016 if (retval)
1da177e4 1017 goto out;
ce1a8e67
CL
1018
1019 retval = -EINVAL;
1020 if ((ssize_t) count < 0)
1da177e4 1021 goto out;
1da177e4
LT
1022 retval = 0;
1023 if (!count)
1024 goto out;
ce1a8e67 1025
29884df0
TM
1026 retval = nfs_sync_mapping(mapping);
1027 if (retval)
1028 goto out;
1da177e4 1029
7ec10f26
KK
1030 task_io_account_write(count);
1031
c216fd70 1032 retval = nfs_direct_write(iocb, iov, nr_segs, pos, count);
9eafa8cc 1033
1da177e4 1034 if (retval > 0)
ce1a8e67 1035 iocb->ki_pos = pos + retval;
1da177e4
LT
1036
1037out:
1038 return retval;
1039}
1040
88467055
CL
1041/**
1042 * nfs_init_directcache - create a slab cache for nfs_direct_req structures
1043 *
1044 */
f7b422b1 1045int __init nfs_init_directcache(void)
1da177e4
LT
1046{
1047 nfs_direct_cachep = kmem_cache_create("nfs_direct_cache",
1048 sizeof(struct nfs_direct_req),
fffb60f9
PJ
1049 0, (SLAB_RECLAIM_ACCOUNT|
1050 SLAB_MEM_SPREAD),
20c2df83 1051 NULL);
1da177e4
LT
1052 if (nfs_direct_cachep == NULL)
1053 return -ENOMEM;
1054
1055 return 0;
1056}
1057
88467055 1058/**
f7b422b1 1059 * nfs_destroy_directcache - destroy the slab cache for nfs_direct_req structures
88467055
CL
1060 *
1061 */
void nfs_destroy_directcache(void)
{
	/* Tear down the nfs_direct_req slab cache created by
	 * nfs_init_directcache().  kmem_cache_destroy() tolerates a
	 * NULL cache pointer, so this is safe even if init failed. */
	kmem_cache_destroy(nfs_direct_cachep);
}