/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
 */

#include <sys/dmu_objset.h>
#include <sys/zfs_vfsops.h>
#include <sys/zfs_vnops.h>
#include <sys/zfs_znode.h>
#include <sys/zpl.h>

static int
zpl_open(struct inode *ip, struct file *filp)
{
	cred_t *cr = CRED();
	int error;

	error = generic_file_open(ip, filp);
	if (error)
		return (error);

	crhold(cr);
	error = -zfs_open(ip, filp->f_mode, filp->f_flags, cr);
	crfree(cr);
	ASSERT3S(error, <=, 0);

	return (error);
}

static int
zpl_release(struct inode *ip, struct file *filp)
{
	cred_t *cr = CRED();
	int error;

	if (ITOZ(ip)->z_atime_dirty)
		mark_inode_dirty(ip);

	crhold(cr);
	error = -zfs_close(ip, filp->f_flags, cr);
	crfree(cr);
	ASSERT3S(error, <=, 0);

	return (error);
}

static int
zpl_iterate(struct file *filp, struct dir_context *ctx)
{
	struct dentry *dentry = filp->f_path.dentry;
	cred_t *cr = CRED();
	int error;

	crhold(cr);
	error = -zfs_readdir(dentry->d_inode, ctx, cr);
	crfree(cr);
	ASSERT3S(error, <=, 0);

	return (error);
}

#if !defined(HAVE_VFS_ITERATE)
static int
zpl_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
	struct dir_context ctx = DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos);
	int error;

	error = zpl_iterate(filp, &ctx);
	filp->f_pos = ctx.pos;

	return (error);
}
#endif /* HAVE_VFS_ITERATE */

#if defined(HAVE_FSYNC_WITH_DENTRY)
/*
 * Linux 2.6.x - 2.6.34 API,
 * Through 2.6.34 the nfsd kernel server would pass a NULL 'file struct *'
 * to the fops->fsync() hook. For this reason, we must be careful not to
 * use filp unconditionally.
 */
static int
zpl_fsync(struct file *filp, struct dentry *dentry, int datasync)
{
	cred_t *cr = CRED();
	int error;

	crhold(cr);
	error = -zfs_fsync(dentry->d_inode, datasync, cr);
	crfree(cr);
	ASSERT3S(error, <=, 0);

	return (error);
}

#elif defined(HAVE_FSYNC_WITHOUT_DENTRY)
/*
 * Linux 2.6.35 - 3.0 API,
 * As of 2.6.35 the dentry argument to the fops->fsync() hook was deemed
 * redundant. The dentry is still accessible via filp->f_path.dentry,
 * and we are guaranteed that filp will never be NULL.
 */
static int
zpl_fsync(struct file *filp, int datasync)
{
	struct inode *inode = filp->f_mapping->host;
	cred_t *cr = CRED();
	int error;

	crhold(cr);
	error = -zfs_fsync(inode, datasync, cr);
	crfree(cr);
	ASSERT3S(error, <=, 0);

	return (error);
}

#elif defined(HAVE_FSYNC_RANGE)
/*
 * Linux 3.1 - 3.x API,
 * As of 3.1 the responsibility to call filemap_write_and_wait_range() has
 * been pushed down in to the .fsync() vfs hook. Additionally, the i_mutex
 * lock is no longer held by the caller. For zfs we don't require the lock
 * to be held, so we don't acquire it.
 */
static int
zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
{
	struct inode *inode = filp->f_mapping->host;
	cred_t *cr = CRED();
	int error;

	error = filemap_write_and_wait_range(inode->i_mapping, start, end);
	if (error)
		return (error);

	crhold(cr);
	error = -zfs_fsync(inode, datasync, cr);
	crfree(cr);
	ASSERT3S(error, <=, 0);

	return (error);
}
#else
#error "Unsupported fops->fsync() implementation"
#endif
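
/*
 * A minimal userspace sketch (not part of this module, path illustrative,
 * error handling omitted) showing how the 'datasync' argument above is
 * driven: fsync(2) reaches zpl_fsync() with datasync == 0, while
 * fdatasync(2) passes datasync == 1, permitting non-essential metadata
 * (e.g. timestamps) to be skipped.
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		int fd = open("/tank/file", O_WRONLY | O_CREAT, 0644);
 *
 *		(void) write(fd, "data", 4);
 *		(void) fsync(fd);	// datasync == 0: data + metadata
 *		(void) fdatasync(fd);	// datasync == 1: data (+ size)
 *		(void) close(fd);
 *		return (0);
 *	}
 */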

ssize_t
zpl_read_common(struct inode *ip, const char *buf, size_t len, loff_t pos,
    uio_seg_t segment, int flags, cred_t *cr)
{
	int error;
	ssize_t read;
	struct iovec iov;
	uio_t uio;

	iov.iov_base = (void *)buf;
	iov.iov_len = len;

	uio.uio_iov = &iov;
	uio.uio_resid = len;
	uio.uio_iovcnt = 1;
	uio.uio_loffset = pos;
	uio.uio_limit = MAXOFFSET_T;
	uio.uio_segflg = segment;

	error = -zfs_read(ip, &uio, flags, cr);
	if (error < 0)
		return (error);

	read = len - uio.uio_resid;
	task_io_account_read(read);

	return (read);
}
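
/*
 * For reference, a hedged sketch of how an in-kernel consumer might call
 * zpl_read_common() on a kernel buffer; 'ip', 'kbuf', and 'kbuf_size' are
 * hypothetical names. Passing UIO_SYSSPACE tells the uio layer the iovec
 * points at kernel memory, so no copy to/from userspace is attempted:
 *
 *	cred_t *cr = CRED();
 *	ssize_t nread;
 *
 *	crhold(cr);
 *	nread = zpl_read_common(ip, kbuf, kbuf_size, 0, UIO_SYSSPACE, 0, cr);
 *	crfree(cr);
 */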

static ssize_t
zpl_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
	cred_t *cr = CRED();
	ssize_t read;

	crhold(cr);
	read = zpl_read_common(filp->f_mapping->host, buf, len, *ppos,
	    UIO_USERSPACE, filp->f_flags, cr);
	crfree(cr);

	if (read < 0)
		return (read);

	*ppos += read;
	return (read);
}

ssize_t
zpl_write_common(struct inode *ip, const char *buf, size_t len, loff_t pos,
    uio_seg_t segment, int flags, cred_t *cr)
{
	int error;
	ssize_t wrote;
	struct iovec iov;
	uio_t uio;

	iov.iov_base = (void *)buf;
	iov.iov_len = len;

	uio.uio_iov = &iov;
	uio.uio_resid = len;
	uio.uio_iovcnt = 1;
	uio.uio_loffset = pos;
	uio.uio_limit = MAXOFFSET_T;
	uio.uio_segflg = segment;

	error = -zfs_write(ip, &uio, flags, cr);
	if (error < 0)
		return (error);

	wrote = len - uio.uio_resid;
	task_io_account_write(wrote);

	return (wrote);
}

static ssize_t
zpl_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
{
	cred_t *cr = CRED();
	ssize_t wrote;

	crhold(cr);
	wrote = zpl_write_common(filp->f_mapping->host, buf, len, *ppos,
	    UIO_USERSPACE, filp->f_flags, cr);
	crfree(cr);

	if (wrote < 0)
		return (wrote);

	*ppos += wrote;
	return (wrote);
}

static loff_t
zpl_llseek(struct file *filp, loff_t offset, int whence)
{
#if defined(SEEK_HOLE) && defined(SEEK_DATA)
	if (whence == SEEK_DATA || whence == SEEK_HOLE) {
		struct inode *ip = filp->f_mapping->host;
		loff_t maxbytes = ip->i_sb->s_maxbytes;
		loff_t error;

		spl_inode_lock(ip);
		error = -zfs_holey(ip, whence, &offset);
		if (error == 0)
			error = lseek_execute(filp, ip, offset, maxbytes);
		spl_inode_unlock(ip);

		return (error);
	}
#endif /* SEEK_HOLE && SEEK_DATA */

	return (generic_file_llseek(filp, offset, whence));
}
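
/*
 * A minimal userspace sketch (illustrative, not part of this module) of
 * the SEEK_HOLE/SEEK_DATA interface serviced above: starting from offset
 * zero, alternating lseek(2) calls walk the data extents of a sparse
 * file until SEEK_DATA fails with ENXIO at end of file.
 *
 *	#define _GNU_SOURCE
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	static void
 *	map_extents(int fd)
 *	{
 *		off_t data = 0, hole;
 *
 *		while ((data = lseek(fd, data, SEEK_DATA)) != -1) {
 *			hole = lseek(fd, data, SEEK_HOLE);
 *			printf("data: %lld-%lld\n", (long long)data,
 *			    (long long)hole);
 *			data = hole;
 *		}
 *	}
 */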

/*
 * It's worth taking a moment to describe how mmap is implemented
 * for zfs because it differs considerably from other Linux filesystems.
 * However, this issue is handled the same way under OpenSolaris.
 *
 * The issue is that by design zfs bypasses the Linux page cache and
 * leaves all caching up to the ARC. This has been shown to work
 * well for the common read(2)/write(2) case. However, mmap(2)
 * is a problem because it relies on being tightly integrated with the
 * page cache. To handle this we cache mmap'ed files twice, once in
 * the ARC and a second time in the page cache. The code is careful
 * to keep both copies synchronized.
 *
 * When a file with an mmap'ed region is written to using write(2)
 * both the data in the ARC and existing pages in the page cache
 * are updated. For a read(2) data will be read first from the page
 * cache then the ARC if needed. Neither a write(2) nor a read(2)
 * will ever result in new pages being added to the page cache.
 *
 * New pages are added to the page cache only via .readpage() which
 * is called when the vfs needs to read a page off disk to back the
 * virtual memory region. These pages may be modified without
 * notifying the ARC and will be written out periodically via
 * .writepage(). This will occur due to either a sync or the usual
 * page aging behavior. Note that because a read(2) of an mmap'ed file
 * will always check the page cache first, correct data will still be
 * returned even when the ARC is out of date.
 *
 * While this implementation ensures correct behavior it does have
 * some drawbacks. The most obvious drawback is that it increases
 * the required memory footprint when accessing mmap'ed files. It
 * also adds additional complexity to the code, which must keep both
 * caches synchronized.
 *
 * Longer term it may be possible to cleanly resolve this wart by
 * mapping page cache pages directly on to the ARC buffers. The
 * Linux address space operations are flexible enough to allow
 * selection of which pages back a particular index. The trick
 * would be working out the details of which subsystem is in
 * charge, the ARC, the page cache, or both. It may also prove
 * helpful to move the ARC buffers to scatter-gather lists
 * rather than vmalloc'ed regions.
 */
static int
zpl_mmap(struct file *filp, struct vm_area_struct *vma)
{
	struct inode *ip = filp->f_mapping->host;
	znode_t *zp = ITOZ(ip);
	int error;

	error = -zfs_map(ip, vma->vm_pgoff, (caddr_t *)vma->vm_start,
	    (size_t)(vma->vm_end - vma->vm_start), vma->vm_flags);
	if (error)
		return (error);

	error = generic_file_mmap(filp, vma);
	if (error)
		return (error);

	mutex_enter(&zp->z_lock);
	zp->z_is_mapped = 1;
	mutex_exit(&zp->z_lock);

	return (error);
}
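
/*
 * A hedged userspace sketch (path illustrative, error handling omitted,
 * assumes the file is at least one page long) of the coherence the
 * comment above promises: a store through an mmap'ed region is visible
 * to a subsequent read(2) of the same offset, because the read path
 * checks the page cache before the ARC.
 *
 *	#include <sys/mman.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *	#include <assert.h>
 *
 *	int
 *	main(void)
 *	{
 *		int fd = open("/tank/file", O_RDWR);
 *		char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
 *		    MAP_SHARED, fd, 0);
 *		char c;
 *
 *		p[0] = 'Z';			// dirty a mapped page
 *		(void) msync(p, 4096, MS_SYNC);	// flush via .writepage()
 *		(void) pread(fd, &c, 1, 0);	// sees the page cache copy
 *		assert(c == 'Z');
 *		return (0);
 *	}
 */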

/*
 * Populate a page with data for the Linux page cache. This function is
 * only used to support mmap(2). There will be an identical copy of the
 * data in the ARC which is kept up to date via .write() and .writepage().
 *
 * Currently this function relies on zpl_read_common() and the O_DIRECT
 * flag to read in a page. This works, but the more correct way is to
 * update zfs_fillpage() to be Linux friendly and use that interface.
 */
static int
zpl_readpage(struct file *filp, struct page *pp)
{
	struct inode *ip;
	struct page *pl[1];
	int error = 0;

	ASSERT(PageLocked(pp));
	ip = pp->mapping->host;
	pl[0] = pp;

	error = -zfs_getpage(ip, pl, 1);

	if (error) {
		SetPageError(pp);
		ClearPageUptodate(pp);
	} else {
		ClearPageError(pp);
		SetPageUptodate(pp);
		flush_dcache_page(pp);
	}

	unlock_page(pp);
	return (error);
}

/*
 * Populate a set of pages with data for the Linux page cache. This
 * function will only be called for read ahead and never for demand
 * paging. For simplicity, the code relies on read_cache_pages() to
 * correctly lock each page for IO and call zpl_readpage().
 */
static int
zpl_readpages(struct file *filp, struct address_space *mapping,
    struct list_head *pages, unsigned nr_pages)
{
	return (read_cache_pages(mapping, pages,
	    (filler_t *)zpl_readpage, filp));
}

int
zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
{
	struct address_space *mapping = data;

	ASSERT(PageLocked(pp));
	ASSERT(!PageWriteback(pp));
	ASSERT(!(current->flags & PF_NOFS));

	/*
	 * Annotate this call path with a flag that indicates that it is
	 * unsafe to use KM_SLEEP during memory allocations due to the
	 * potential for a deadlock. KM_PUSHPAGE should be used instead.
	 */
	current->flags |= PF_NOFS;
	(void) zfs_putpage(mapping->host, pp, wbc);
	current->flags &= ~PF_NOFS;

	return (0);
}
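
/*
 * To make the annotation above concrete, a hypothetical allocation site
 * reachable while PF_NOFS is set would look like:
 *
 *	buf = kmem_alloc(size, KM_PUSHPAGE);	// safe under PF_NOFS
 *	buf = kmem_alloc(size, KM_SLEEP);	// may deadlock in writeback
 */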

static int
zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
	znode_t *zp = ITOZ(mapping->host);
	zfs_sb_t *zsb = ITOZSB(mapping->host);
	enum writeback_sync_modes sync_mode;
	int result;

	ZFS_ENTER(zsb);
	if (zsb->z_os->os_sync == ZFS_SYNC_ALWAYS)
		wbc->sync_mode = WB_SYNC_ALL;
	ZFS_EXIT(zsb);
	sync_mode = wbc->sync_mode;

	/*
	 * We don't want to run write_cache_pages() in SYNC mode here, because
	 * that would make putpage() wait for a single page to be committed to
	 * disk every single time, resulting in atrocious performance. Instead
	 * we run it once in non-SYNC mode so that the ZIL gets all the data,
	 * and then we commit it all in one go.
	 */
	wbc->sync_mode = WB_SYNC_NONE;
	result = write_cache_pages(mapping, wbc, zpl_putpage, mapping);
	if (sync_mode != wbc->sync_mode) {
		ZFS_ENTER(zsb);
		ZFS_VERIFY_ZP(zp);
		zil_commit(zsb->z_log, zp->z_id);
		ZFS_EXIT(zsb);

		/*
		 * We need to call write_cache_pages() again (we can't just
		 * return after the commit) because the previous call in
		 * non-SYNC mode does not guarantee that we got all the dirty
		 * pages (see the implementation of write_cache_pages() for
		 * details). That being said, this is a no-op in most cases.
		 */
		wbc->sync_mode = sync_mode;
		result = write_cache_pages(mapping, wbc, zpl_putpage, mapping);
	}
	return (result);
}

/*
 * Write out dirty pages to the ARC. This function is only required to
 * support mmap(2). Mapped pages may be dirtied by memory operations
 * which never call .write(). These dirty pages are kept in sync with
 * the ARC buffers via this hook.
 */
static int
zpl_writepage(struct page *pp, struct writeback_control *wbc)
{
	if (ITOZSB(pp->mapping->host)->z_os->os_sync == ZFS_SYNC_ALWAYS)
		wbc->sync_mode = WB_SYNC_ALL;

	return (zpl_putpage(pp, wbc, pp->mapping));
}

/*
 * The only flag combination which matches the behavior of zfs_space()
 * is FALLOC_FL_PUNCH_HOLE. This flag was introduced in the 2.6.38 kernel.
 */
long
zpl_fallocate_common(struct inode *ip, int mode, loff_t offset, loff_t len)
{
	cred_t *cr = CRED();
	int error = -EOPNOTSUPP;

	if (mode & FALLOC_FL_KEEP_SIZE)
		return (-EOPNOTSUPP);

	crhold(cr);

#ifdef FALLOC_FL_PUNCH_HOLE
	if (mode & FALLOC_FL_PUNCH_HOLE) {
		flock64_t bf;

		bf.l_type = F_WRLCK;
		bf.l_whence = 0;
		bf.l_start = offset;
		bf.l_len = len;
		bf.l_pid = 0;

		error = -zfs_space(ip, F_FREESP, &bf, FWRITE, offset, cr);
	}
#endif /* FALLOC_FL_PUNCH_HOLE */

	crfree(cr);

	ASSERT3S(error, <=, 0);
	return (error);
}
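
/*
 * A userspace sketch (path illustrative) of the punch-hole request shape
 * this handler accepts: the byte range [4096, 8192) is deallocated and
 * reads back as zeros. Note that fallocate(2) flag validation also
 * happens at the VFS layer and varies by kernel version.
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *
 *	int fd = open("/tank/file", O_WRONLY);
 *
 *	(void) fallocate(fd, FALLOC_FL_PUNCH_HOLE, 4096, 4096);
 */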

#ifdef HAVE_FILE_FALLOCATE
static long
zpl_fallocate(struct file *filp, int mode, loff_t offset, loff_t len)
{
	return (zpl_fallocate_common(filp->f_path.dentry->d_inode,
	    mode, offset, len));
}
#endif /* HAVE_FILE_FALLOCATE */

static long
zpl_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case ZFS_IOC_GETFLAGS:
	case ZFS_IOC_SETFLAGS:
		return (-EOPNOTSUPP);
	default:
		return (-ENOTTY);
	}
}

#ifdef CONFIG_COMPAT
static long
zpl_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	return (zpl_ioctl(filp, cmd, arg));
}
#endif /* CONFIG_COMPAT */

const struct address_space_operations zpl_address_space_operations = {
	.readpages = zpl_readpages,
	.readpage = zpl_readpage,
	.writepage = zpl_writepage,
	.writepages = zpl_writepages,
};

const struct file_operations zpl_file_operations = {
	.open = zpl_open,
	.release = zpl_release,
	.llseek = zpl_llseek,
	.read = zpl_read,
	.write = zpl_write,
	.mmap = zpl_mmap,
	.fsync = zpl_fsync,
#ifdef HAVE_FILE_FALLOCATE
	.fallocate = zpl_fallocate,
#endif /* HAVE_FILE_FALLOCATE */
	.unlocked_ioctl = zpl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = zpl_compat_ioctl,
#endif
};

const struct file_operations zpl_dir_file_operations = {
	.llseek = generic_file_llseek,
	.read = generic_read_dir,
#ifdef HAVE_VFS_ITERATE
	.iterate = zpl_iterate,
#else
	.readdir = zpl_readdir,
#endif
	.fsync = zpl_fsync,
	.unlocked_ioctl = zpl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = zpl_compat_ioctl,
#endif
};