/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
 */


#include <sys/zfs_vfsops.h>
#include <sys/zfs_vnops.h>
#include <sys/zfs_znode.h>
#include <sys/zpl.h>


static int
zpl_open(struct inode *ip, struct file *filp)
{
	cred_t *cr = CRED();
	int error;

	crhold(cr);
	error = -zfs_open(ip, filp->f_mode, filp->f_flags, cr);
	crfree(cr);
	ASSERT3S(error, <=, 0);

	if (error)
		return (error);

	return (generic_file_open(ip, filp));
}

static int
zpl_release(struct inode *ip, struct file *filp)
{
	cred_t *cr = CRED();
	int error;

	if (ITOZ(ip)->z_atime_dirty)
		mark_inode_dirty(ip);

	crhold(cr);
	error = -zfs_close(ip, filp->f_flags, cr);
	crfree(cr);
	ASSERT3S(error, <=, 0);

	return (error);
}

static int
zpl_iterate(struct file *filp, struct dir_context *ctx)
{
	struct dentry *dentry = filp->f_path.dentry;
	cred_t *cr = CRED();
	int error;

	crhold(cr);
	error = -zfs_readdir(dentry->d_inode, ctx, cr);
	crfree(cr);
	ASSERT3S(error, <=, 0);

	return (error);
}

#if !defined(HAVE_VFS_ITERATE)
static int
zpl_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
	struct dir_context ctx = DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos);
	int error;

	error = zpl_iterate(filp, &ctx);
	filp->f_pos = ctx.pos;

	return (error);
}
#endif /* !HAVE_VFS_ITERATE */

#if defined(HAVE_FSYNC_WITH_DENTRY)
/*
 * Linux 2.6.x - 2.6.34 API,
 * Through 2.6.34 the nfsd kernel server would pass a NULL 'struct file *'
 * to the fops->fsync() hook. For this reason, we must be careful not to
 * use filp unconditionally.
 */
static int
zpl_fsync(struct file *filp, struct dentry *dentry, int datasync)
{
	cred_t *cr = CRED();
	int error;

	crhold(cr);
	error = -zfs_fsync(dentry->d_inode, datasync, cr);
	crfree(cr);
	ASSERT3S(error, <=, 0);

	return (error);
}

#elif defined(HAVE_FSYNC_WITHOUT_DENTRY)
/*
 * Linux 2.6.35 - 3.0 API,
 * As of 2.6.35 the dentry argument to the fops->fsync() hook was deemed
 * redundant. The dentry is still accessible via filp->f_path.dentry,
 * and we are guaranteed that filp will never be NULL.
 */
static int
zpl_fsync(struct file *filp, int datasync)
{
	struct inode *inode = filp->f_mapping->host;
	cred_t *cr = CRED();
	int error;

	crhold(cr);
	error = -zfs_fsync(inode, datasync, cr);
	crfree(cr);
	ASSERT3S(error, <=, 0);

	return (error);
}

#elif defined(HAVE_FSYNC_RANGE)
/*
 * Linux 3.1 - 3.x API,
 * As of 3.1 the responsibility to call filemap_write_and_wait_range() has
 * been pushed down into the .fsync() vfs hook. Additionally, the i_mutex
 * lock is no longer held by the caller. ZFS does not require the lock to
 * be held, so we do not acquire it.
 */
static int
zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
{
	struct inode *inode = filp->f_mapping->host;
	cred_t *cr = CRED();
	int error;

	error = filemap_write_and_wait_range(inode->i_mapping, start, end);
	if (error)
		return (error);

	crhold(cr);
	error = -zfs_fsync(inode, datasync, cr);
	crfree(cr);
	ASSERT3S(error, <=, 0);

	return (error);
}
#else
#error "Unsupported fops->fsync() implementation"
#endif

ssize_t
zpl_read_common(struct inode *ip, const char *buf, size_t len, loff_t pos,
    uio_seg_t segment, int flags, cred_t *cr)
{
	int error;
	struct iovec iov;
	uio_t uio;

	iov.iov_base = (void *)buf;
	iov.iov_len = len;

	uio.uio_iov = &iov;
	uio.uio_resid = len;
	uio.uio_iovcnt = 1;
	uio.uio_loffset = pos;
	uio.uio_limit = MAXOFFSET_T;
	uio.uio_segflg = segment;

	error = -zfs_read(ip, &uio, flags, cr);
	if (error < 0)
		return (error);

	return (len - uio.uio_resid);
}
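
/*
 * Example: the helper above is not limited to user pages. A minimal
 * in-kernel sketch, assuming 'ip' is a held inode, 'kbuf' a kernel
 * buffer of 'len' bytes, and 'pos' the file offset; the UIO_SYSSPACE
 * segment flag selects a kernel-space copy:
 *
 *	cred_t *cr = CRED();
 *	ssize_t read;
 *
 *	crhold(cr);
 *	read = zpl_read_common(ip, kbuf, len, pos, UIO_SYSSPACE, 0, cr);
 *	crfree(cr);
 */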

static ssize_t
zpl_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
	cred_t *cr = CRED();
	ssize_t read;

	crhold(cr);
	read = zpl_read_common(filp->f_mapping->host, buf, len, *ppos,
	    UIO_USERSPACE, filp->f_flags, cr);
	crfree(cr);

	if (read < 0)
		return (read);

	*ppos += read;
	return (read);
}

ssize_t
zpl_write_common(struct inode *ip, const char *buf, size_t len, loff_t pos,
    uio_seg_t segment, int flags, cred_t *cr)
{
	int error;
	struct iovec iov;
	uio_t uio;

	iov.iov_base = (void *)buf;
	iov.iov_len = len;

	uio.uio_iov = &iov;
	uio.uio_resid = len;
	uio.uio_iovcnt = 1;
	uio.uio_loffset = pos;
	uio.uio_limit = MAXOFFSET_T;
	uio.uio_segflg = segment;

	error = -zfs_write(ip, &uio, flags, cr);
	if (error < 0)
		return (error);

	return (len - uio.uio_resid);
}

static ssize_t
zpl_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
{
	cred_t *cr = CRED();
	ssize_t wrote;

	crhold(cr);
	wrote = zpl_write_common(filp->f_mapping->host, buf, len, *ppos,
	    UIO_USERSPACE, filp->f_flags, cr);
	crfree(cr);

	if (wrote < 0)
		return (wrote);

	*ppos += wrote;
	return (wrote);
}

static loff_t
zpl_llseek(struct file *filp, loff_t offset, int whence)
{
#if defined(SEEK_HOLE) && defined(SEEK_DATA)
	if (whence == SEEK_DATA || whence == SEEK_HOLE) {
		struct inode *ip = filp->f_mapping->host;
		loff_t maxbytes = ip->i_sb->s_maxbytes;
		loff_t error;

		spl_inode_lock(ip);
		error = -zfs_holey(ip, whence, &offset);
		if (error == 0)
			error = lseek_execute(filp, ip, offset, maxbytes);
		spl_inode_unlock(ip);

		return (error);
	}
#endif /* SEEK_HOLE && SEEK_DATA */

	return (generic_file_llseek(filp, offset, whence));
}
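
/*
 * Example: from user space the SEEK_HOLE/SEEK_DATA branch above is
 * exercised via lseek(2). A minimal sketch, assuming 'fd' refers to a
 * sparse file on a ZFS mount:
 *
 *	#define _GNU_SOURCE
 *	#include <unistd.h>
 *
 *	off_t data = lseek(fd, 0, SEEK_DATA);	// first data at/after 0
 *	off_t hole = lseek(fd, data, SEEK_HOLE);	// next hole after it
 *	if (data == -1 || hole == -1)
 *		perror("lseek");
 */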

/*
 * It's worth taking a moment to describe how mmap is implemented
 * for zfs because it differs considerably from other Linux filesystems.
 * However, this issue is handled the same way under OpenSolaris.
 *
 * The issue is that by design zfs bypasses the Linux page cache and
 * leaves all caching up to the ARC. This has been shown to work
 * well for the common read(2)/write(2) case. However, mmap(2)
 * is a problem because it relies on being tightly integrated with the
 * page cache. To handle this we cache mmap'ed files twice, once in
 * the ARC and a second time in the page cache. The code is careful
 * to keep both copies synchronized.
 *
 * When a file with an mmap'ed region is written to using write(2)
 * both the data in the ARC and existing pages in the page cache
 * are updated. For a read(2) data will be read first from the page
 * cache then the ARC if needed. Neither a write(2) nor a read(2)
 * will ever result in new pages being added to the page cache.
 *
 * New pages are added to the page cache only via .readpage() which
 * is called when the vfs needs to read a page off disk to back the
 * virtual memory region. These pages may be modified without
 * notifying the ARC and will be written out periodically via
 * .writepage(). This will occur due to either a sync or the usual
 * page aging behavior. Note that because a read(2) of an mmap'ed
 * file will always check the page cache first, correct data will
 * still be returned even when the ARC is out of date.
 *
 * While this implementation ensures correct behavior it does have
 * some drawbacks. The most obvious of which is that it increases
 * the required memory footprint when accessing mmap'ed files. It
 * also adds additional complexity to the code to keep both caches
 * synchronized.
 *
 * Longer term it may be possible to cleanly resolve this wart by
 * mapping page cache pages directly onto the ARC buffers. The
 * Linux address space operations are flexible enough to allow
 * selection of which pages back a particular index. The trick
 * would be working out the details of which subsystem is in
 * charge, the ARC, the page cache, or both. It may also prove
 * helpful to move the ARC buffers to scatter-gather lists
 * rather than a vmalloc'ed region.
 */
static int
zpl_mmap(struct file *filp, struct vm_area_struct *vma)
{
	struct inode *ip = filp->f_mapping->host;
	znode_t *zp = ITOZ(ip);
	int error;

	error = -zfs_map(ip, vma->vm_pgoff, (caddr_t *)vma->vm_start,
	    (size_t)(vma->vm_end - vma->vm_start), vma->vm_flags);
	if (error)
		return (error);

	error = generic_file_mmap(filp, vma);
	if (error)
		return (error);

	mutex_enter(&zp->z_lock);
	zp->z_is_mapped = 1;
	mutex_exit(&zp->z_lock);

	return (error);
}
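
/*
 * Example: the coherence described above means a store through an
 * mmap'ed region is visible to a subsequent read(2), which checks the
 * page cache first. A user-space sketch, assuming 'fd' is open
 * read/write on a ZFS file of at least one page:
 *
 *	#include <sys/mman.h>
 *	#include <unistd.h>
 *
 *	char buf;
 *	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
 *	    MAP_SHARED, fd, 0);
 *	if (p != MAP_FAILED) {
 *		p[0] = 'z';			// dirties a page cache page
 *		(void) pread(fd, &buf, 1, 0);	// observes 'z'
 *	}
 */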

/*
 * Populate a page with data for the Linux page cache. This function is
 * only used to support mmap(2). There will be an identical copy of the
 * data in the ARC which is kept up to date via .write() and .writepage().
 *
 * Currently this function relies on zpl_read_common() and the O_DIRECT
 * flag to read in a page. This works but the more correct way is to
 * update zfs_fillpage() to be Linux friendly and use that interface.
 */
static int
zpl_readpage(struct file *filp, struct page *pp)
{
	struct inode *ip;
	struct page *pl[1];
	int error = 0;

	ASSERT(PageLocked(pp));
	ip = pp->mapping->host;
	pl[0] = pp;

	error = -zfs_getpage(ip, pl, 1);

	if (error) {
		SetPageError(pp);
		ClearPageUptodate(pp);
	} else {
		ClearPageError(pp);
		SetPageUptodate(pp);
		flush_dcache_page(pp);
	}

	unlock_page(pp);
	return (error);
}

/*
 * Populate a set of pages with data for the Linux page cache. This
 * function will only be called for read ahead and never for demand
 * paging. For simplicity, the code relies on read_cache_pages() to
 * correctly lock each page for IO and call zpl_readpage().
 */
static int
zpl_readpages(struct file *filp, struct address_space *mapping,
    struct list_head *pages, unsigned nr_pages)
{
	return (read_cache_pages(mapping, pages,
	    (filler_t *)zpl_readpage, filp));
}

int
zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
{
	struct address_space *mapping = data;

	ASSERT(PageLocked(pp));
	ASSERT(!PageWriteback(pp));
	ASSERT(!(current->flags & PF_NOFS));

	/*
	 * Annotate this call path with a flag that indicates that it is
	 * unsafe to use KM_SLEEP during memory allocations due to the
	 * potential for a deadlock. KM_PUSHPAGE should be used instead.
	 */
	current->flags |= PF_NOFS;
	(void) zfs_putpage(mapping->host, pp, wbc);
	current->flags &= ~PF_NOFS;

	return (0);
}
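
/*
 * Example: the PF_NOFS annotation above means any allocation performed
 * under zfs_putpage() must avoid KM_SLEEP, which could recurse into
 * writeback and deadlock. A minimal sketch of the safe pattern, with a
 * hypothetical allocation site:
 *
 *	void *buf = kmem_alloc(size, KM_PUSHPAGE);	// safe under PF_NOFS
 */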

static int
zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
	return (write_cache_pages(mapping, wbc, zpl_putpage, mapping));
}

/*
 * Write out dirty pages to the ARC; this function is only required to
 * support mmap(2). Mapped pages may be dirtied by memory operations
 * which never call .write(). These dirty pages are kept in sync with
 * the ARC buffers via this hook.
 */
static int
zpl_writepage(struct page *pp, struct writeback_control *wbc)
{
	return (zpl_putpage(pp, wbc, pp->mapping));
}

/*
 * The only flag combination which matches the behavior of zfs_space()
 * is FALLOC_FL_PUNCH_HOLE. This flag was introduced in the 2.6.38 kernel.
 */
long
zpl_fallocate_common(struct inode *ip, int mode, loff_t offset, loff_t len)
{
	cred_t *cr = CRED();
	int error = -EOPNOTSUPP;

	if (mode & FALLOC_FL_KEEP_SIZE)
		return (-EOPNOTSUPP);

	crhold(cr);

#ifdef FALLOC_FL_PUNCH_HOLE
	if (mode & FALLOC_FL_PUNCH_HOLE) {
		flock64_t bf;

		bf.l_type = F_WRLCK;
		bf.l_whence = 0;
		bf.l_start = offset;
		bf.l_len = len;
		bf.l_pid = 0;

		error = -zfs_space(ip, F_FREESP, &bf, FWRITE, offset, cr);
	}
#endif /* FALLOC_FL_PUNCH_HOLE */

	crfree(cr);

	ASSERT3S(error, <=, 0);
	return (error);
}
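
/*
 * Example: from user space hole punching is requested with fallocate(2).
 * A minimal sketch, assuming 'fd' is a descriptor opened for writing;
 * note that the VFS pairs FALLOC_FL_PUNCH_HOLE with FALLOC_FL_KEEP_SIZE,
 * a combination the code above currently rejects with EOPNOTSUPP:
 *
 *	#include <fcntl.h>
 *	#include <linux/falloc.h>
 *
 *	// request a 1 MiB hole at offset 0
 *	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
 *	    0, 1 << 20) == -1)
 *		perror("fallocate");
 */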

#ifdef HAVE_FILE_FALLOCATE
static long
zpl_fallocate(struct file *filp, int mode, loff_t offset, loff_t len)
{
	return (zpl_fallocate_common(filp->f_path.dentry->d_inode,
	    mode, offset, len));
}
#endif /* HAVE_FILE_FALLOCATE */

static long
zpl_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case ZFS_IOC_GETFLAGS:
	case ZFS_IOC_SETFLAGS:
		return (-EOPNOTSUPP);
	default:
		return (-ENOTTY);
	}
}
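
/*
 * Example: these are the commands issued by lsattr(1)/chattr(1) style
 * tools. A user-space sketch, assuming ZFS_IOC_GETFLAGS shares the
 * generic FS_IOC_GETFLAGS value and 'fd' is open on a ZFS file:
 *
 *	#include <sys/ioctl.h>
 *	#include <linux/fs.h>
 *
 *	long flags;
 *	if (ioctl(fd, FS_IOC_GETFLAGS, &flags) == -1)
 *		perror("ioctl");	// currently fails with EOPNOTSUPP
 */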

#ifdef CONFIG_COMPAT
static long
zpl_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	return (zpl_ioctl(filp, cmd, arg));
}
#endif /* CONFIG_COMPAT */


const struct address_space_operations zpl_address_space_operations = {
	.readpages	= zpl_readpages,
	.readpage	= zpl_readpage,
	.writepage	= zpl_writepage,
	.writepages	= zpl_writepages,
};

const struct file_operations zpl_file_operations = {
	.open		= zpl_open,
	.release	= zpl_release,
	.llseek		= zpl_llseek,
	.read		= zpl_read,
	.write		= zpl_write,
	.mmap		= zpl_mmap,
	.fsync		= zpl_fsync,
#ifdef HAVE_FILE_FALLOCATE
	.fallocate	= zpl_fallocate,
#endif /* HAVE_FILE_FALLOCATE */
	.unlocked_ioctl	= zpl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= zpl_compat_ioctl,
#endif
};

const struct file_operations zpl_dir_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= generic_read_dir,
#ifdef HAVE_VFS_ITERATE
	.iterate	= zpl_iterate,
#else
	.readdir	= zpl_readdir,
#endif
	.fsync		= zpl_fsync,
	.unlocked_ioctl	= zpl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= zpl_compat_ioctl,
#endif
};