X-Git-Url: https://git.proxmox.com/?a=blobdiff_plain;f=module%2Fzfs%2Fzfs_vnops.c;h=281f58249ad54b43a8cb103963ac5a47099ea1d1;hb=a7628932695997e87e6f03531a9ff342e2e1e1d6;hp=4afae6c36b1aea363b23ce19ea4e3f4095367d7d;hpb=09ec770c2cfdb105e1d4a6e7470f2456d37c65e0;p=mirror_zfs.git

diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index 4afae6c36..281f58249 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -21,7 +21,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
  * Copyright 2017 Nexenta Systems, Inc.
  */
@@ -33,11 +33,8 @@
 #include
 #include
 #include
-#include
 #include
-#include
 #include
-#include
 #include
 #include
 #include
@@ -45,11 +42,9 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -61,23 +56,19 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
 #include
-#include "fs/fs_subr.h"
 #include
 #include
 #include
 #include
-#include
 #include
-#include
-#include
 #include
-#include
 #include
+#include
+#include
 
 /*
  * Programming rules.
@@ -129,7 +120,7 @@
  *
  *  If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
  *  then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
- *  calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT,
+ *  calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
  *  to indicate that this operation has already called dmu_tx_wait().
  *  This will ensure that we don't retry forever, waiting a short bit
  *  each time.
@@ -154,7 +145,7 @@
 *	rw_enter(...);			// grab any other locks you need
 *	tx = dmu_tx_create(...);	// get DMU tx
 *	dmu_tx_hold_*();		// hold each object you might modify
- *	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+ *	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 *	if (error) {
 *		rw_exit(...);		// drop locks
 *		zfs_dirent_unlock(dl);	// unlock directory entry
@@ -278,6 +269,14 @@ zfs_holey_common(struct inode *ip, int cmd, loff_t *off)
 	if (error == ESRCH)
 		return (SET_ERROR(ENXIO));
 
+	/* file was dirty, so fall back to using generic logic */
+	if (error == EBUSY) {
+		if (hole)
+			*off = file_sz;
+
+		return (0);
+	}
+
 	/*
 	 * We could find a hole that begins after the logical end-of-file,
 	 * because dmu_offset_next() only works on whole blocks.  If the
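Spelled out as straight-line code, the retry protocol those two comment hunks document looks roughly like this — a minimal sketch assuming a generic znode zp, with the caller's own lock management elided:

	static int
	example_tx_retry(objset_t *os, znode_t *zp)
	{
		boolean_t waited = B_FALSE;
		int error;
	top:
		/* take locks, then create and hold the tx */
		dmu_tx_t *tx = dmu_tx_create(os);
		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
		error = dmu_tx_assign(tx,
		    (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
		if (error) {
			/* drop locks before waiting */
			if (error == ERESTART) {
				waited = B_TRUE;	/* skip throttle on retry */
				dmu_tx_wait(tx);
				dmu_tx_abort(tx);
				goto top;
			}
			dmu_tx_abort(tx);
			return (error);
		}
		/* apply the changes, log them, then commit */
		dmu_tx_commit(tx);
		return (0);
	}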
@@ -390,6 +389,7 @@ mappedread(struct inode *ip, int nbytes, uio_t *uio)
 		pp = find_lock_page(mp, start >> PAGE_SHIFT);
 		if (pp) {
 			ASSERT(PageUptodate(pp));
+			unlock_page(pp);
 
 			pb = kmap(pp);
 			error = uiomove(pb + off, bytes, UIO_READ, uio);
@@ -399,7 +399,6 @@ mappedread(struct inode *ip, int nbytes, uio_t *uio)
 				flush_dcache_page(pp);
 
 			mark_page_accessed(pp);
-			unlock_page(pp);
 			put_page(pp);
 		} else {
 			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
@@ -439,15 +438,11 @@ unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT;
 int
 zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 {
-	znode_t		*zp = ITOZ(ip);
-	zfsvfs_t	*zfsvfs = ITOZSB(ip);
-	ssize_t		n, nbytes;
-	int		error = 0;
-	rl_t		*rl;
-#ifdef HAVE_UIO_ZEROCOPY
-	xuio_t		*xuio = NULL;
-#endif /* HAVE_UIO_ZEROCOPY */
+	int error = 0;
+	boolean_t frsync = B_FALSE;
 
+	znode_t *zp = ITOZ(ip);
+	zfsvfs_t *zfsvfs = ITOZSB(ip);
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
@@ -472,17 +467,26 @@ zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 		return (0);
 	}
 
+#ifdef FRSYNC
 	/*
 	 * If we're in FRSYNC mode, sync out this znode before reading it.
+	 * Only do this for non-snapshots.
+	 *
+	 * Some platforms do not support FRSYNC and instead map it
+	 * to FSYNC, which results in unnecessary calls to zil_commit. We
+	 * only honor FRSYNC requests on platforms which support it.
 	 */
-	if (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+	frsync = !!(ioflag & FRSYNC);
+#endif
+	if (zfsvfs->z_log &&
+	    (frsync || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
 		zil_commit(zfsvfs->z_log, zp->z_id);
 
 	/*
 	 * Lock the range against changes.
 	 */
-	rl = zfs_range_lock(&zp->z_range_lock, uio->uio_loffset, uio->uio_resid,
-	    RL_READER);
+	locked_range_t *lr = rangelock_enter(&zp->z_rangelock,
+	    uio->uio_loffset, uio->uio_resid, RL_READER);
 
 	/*
 	 * If we are reading past end-of-file we can skip
@@ -494,9 +498,11 @@ zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 	}
 
 	ASSERT(uio->uio_loffset < zp->z_size);
-	n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
+	ssize_t n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
+	ssize_t start_resid = n;
 
 #ifdef HAVE_UIO_ZEROCOPY
+	xuio_t *xuio = NULL;
 	if ((uio->uio_extflg == UIO_XUIO) &&
 	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
 		int nblk;
@@ -528,7 +534,7 @@ zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 #endif /* HAVE_UIO_ZEROCOPY */
 
 	while (n > 0) {
-		nbytes = MIN(n, zfs_read_chunk_size -
+		ssize_t nbytes = MIN(n, zfs_read_chunk_size -
 		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
 
 		if (zp->z_is_mapped && !(ioflag & O_DIRECT)) {
@@ -547,8 +553,12 @@ zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 
 		n -= nbytes;
 	}
+
+	int64_t nread = start_resid - n;
+	dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread);
+	task_io_account_read(nread);
 out:
-	zfs_range_unlock(rl);
+	rangelock_exit(lr);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
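The zfs_read() hunks above combine three independent changes: the FRSYNC guard, the rangelock rename, and the new per-dataset read accounting. Reduced to a skeleton (a sketch of the shape, not the full function):

	locked_range_t *lr = rangelock_enter(&zp->z_rangelock,
	    uio->uio_loffset, uio->uio_resid, RL_READER);
	ssize_t start_resid = uio->uio_resid;

	/* ... chunked dmu_read_uio_dbuf()/mappedread() loop ... */

	rangelock_exit(lr);

	/* account only for what was actually copied out */
	int64_t nread = start_resid - uio->uio_resid;
	dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread);
	task_io_account_read(nread);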
@@ -577,46 +587,28 @@ out:
 int
 zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 {
-	znode_t		*zp = ITOZ(ip);
-	rlim64_t	limit = uio->uio_limit;
-	ssize_t		start_resid = uio->uio_resid;
-	ssize_t		tx_bytes;
-	uint64_t	end_size;
-	dmu_tx_t	*tx;
-	zfsvfs_t	*zfsvfs = ZTOZSB(zp);
-	zilog_t		*zilog;
-	offset_t	woff;
-	ssize_t		n, nbytes;
-	rl_t		*rl;
-	int		max_blksz = zfsvfs->z_max_blksz;
-	int		error = 0;
-	arc_buf_t	*abuf;
-	const iovec_t	*aiov = NULL;
-	xuio_t		*xuio = NULL;
-	int		write_eof;
-	int		count = 0;
-	sa_bulk_attr_t	bulk[4];
-	uint64_t	mtime[2], ctime[2];
-	uint32_t	uid;
-#ifdef HAVE_UIO_ZEROCOPY
-	int		i_iov = 0;
-	const iovec_t	*iovp = uio->uio_iov;
-	ASSERTV(int	iovcnt = uio->uio_iovcnt);
-#endif
+	int error = 0;
+	ssize_t start_resid = uio->uio_resid;
 
 	/*
 	 * Fasttrack empty write
 	 */
-	n = start_resid;
+	ssize_t n = start_resid;
 	if (n == 0)
 		return (0);
 
+	rlim64_t limit = uio->uio_limit;
 	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
 		limit = MAXOFFSET_T;
 
+	znode_t *zp = ITOZ(ip);
+	zfsvfs_t *zfsvfs = ZTOZSB(zp);
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
+	sa_bulk_attr_t bulk[4];
+	int count = 0;
+	uint64_t mtime[2], ctime[2];
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
@@ -643,17 +635,18 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 		return (SET_ERROR(EPERM));
 	}
 
-	zilog = zfsvfs->z_log;
-
 	/*
 	 * Validate file offset
 	 */
-	woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
+	offset_t woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
 	if (woff < 0) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EINVAL));
 	}
 
+	int max_blksz = zfsvfs->z_max_blksz;
+	xuio_t *xuio = NULL;
+
 	/*
 	 * Pre-fault the pages to ensure slow (eg NFS) pages
 	 * don't hold up txg.
@@ -665,19 +658,23 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 		xuio = (xuio_t *)uio;
 	else
 #endif
-		uio_prefaultpages(MIN(n, max_blksz), uio);
+		if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
+			ZFS_EXIT(zfsvfs);
+			return (SET_ERROR(EFAULT));
+		}
 
 	/*
 	 * If in append mode, set the io offset pointer to eof.
 	 */
+	locked_range_t *lr;
 	if (ioflag & FAPPEND) {
 		/*
 		 * Obtain an appending range lock to guarantee file append
 		 * semantics.  We reset the write offset once we have the lock.
 		 */
-		rl = zfs_range_lock(&zp->z_range_lock, 0, n, RL_APPEND);
-		woff = rl->r_off;
-		if (rl->r_len == UINT64_MAX) {
+		lr = rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND);
+		woff = lr->lr_offset;
+		if (lr->lr_length == UINT64_MAX) {
 			/*
 			 * We overlocked the file because this write will cause
 			 * the file block size to increase.
@@ -692,11 +689,11 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 		 * this write, then this range lock will lock the entire file
 		 * so that we can re-write the block safely.
 		 */
-		rl = zfs_range_lock(&zp->z_range_lock, woff, n, RL_WRITER);
+		lr = rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER);
 	}
 
 	if (woff >= limit) {
-		zfs_range_unlock(rl);
+		rangelock_exit(lr);
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EFBIG));
 	}
@@ -705,9 +702,16 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 		n = limit - woff;
 
 	/* Will this write extend the file length? */
-	write_eof = (woff + n > zp->z_size);
+	int write_eof = (woff + n > zp->z_size);
+
+	uint64_t end_size = MAX(zp->z_size, woff + n);
+	zilog_t *zilog = zfsvfs->z_log;
+#ifdef HAVE_UIO_ZEROCOPY
+	int		i_iov = 0;
+	const iovec_t	*iovp = uio->uio_iov;
+	ASSERTV(int	iovcnt = uio->uio_iovcnt);
+#endif
 
-	end_size = MAX(zp->z_size, woff + n);
 	/*
 	 * Write the file in reasonable size chunks.  Each chunk is written
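The append path relies on two properties of rangelock_enter() with RL_APPEND, both visible in the hunks above: the returned range starts at the current end-of-file, and a range length of UINT64_MAX signals that the whole file was locked because the write may grow the block size. A sketch of just that contract:

	locked_range_t *lr = rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND);
	woff = lr->lr_offset;		/* write offset reset to the locked EOF */
	if (lr->lr_length == UINT64_MAX) {
		/* whole file locked: this write may grow the block size */
		woff = zp->z_size;
	}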
@@ -715,17 +719,22 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 	 * and allows us to do more fine-grained space accounting.
 	 */
 	while (n > 0) {
-		abuf = NULL;
 		woff = uio->uio_loffset;
-		if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
-		    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
-			if (abuf != NULL)
-				dmu_return_arcbuf(abuf);
+
+		if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT,
+		    KUID_TO_SUID(ip->i_uid)) ||
+		    zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT,
+		    KGID_TO_SGID(ip->i_gid)) ||
+		    (zp->z_projid != ZFS_DEFAULT_PROJID &&
+		    zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
+		    zp->z_projid))) {
 			error = SET_ERROR(EDQUOT);
 			break;
 		}
 
-		if (xuio && abuf == NULL) {
+		arc_buf_t *abuf = NULL;
+		const iovec_t *aiov = NULL;
+		if (xuio) {
 #ifdef HAVE_UIO_ZEROCOPY
 			ASSERT(i_iov < iovcnt);
 			ASSERT3U(uio->uio_segflg, !=, UIO_BVEC);
 			aiov = &iovp[i_iov];
 			abuf = dmu_xuio_arcbuf(xuio, i_iov);
 			dmu_xuio_clear(xuio, i_iov);
 			ASSERT((aiov->iov_base == abuf->b_data) ||
 			    ((char *)aiov->iov_base - (char *)abuf->b_data +
 			    aiov->iov_len == arc_buf_size(abuf)));
 			i_iov++;
 #endif
-		} else if (abuf == NULL && n >= max_blksz &&
-		    woff >= zp->z_size &&
+		} else if (n >= max_blksz && woff >= zp->z_size &&
 		    P2PHASE(woff, max_blksz) == 0 &&
 		    zp->z_blksz == max_blksz) {
 			/*
@@ -765,7 +773,7 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 		/*
 		 * Start a transaction.
 		 */
-		tx = dmu_tx_create(zfsvfs->z_os);
+		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
 		zfs_sa_upgrade_txholds(tx, zp);
@@ -778,12 +786,12 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 		}
 
 		/*
-		 * If zfs_range_lock() over-locked we grow the blocksize
+		 * If rangelock_enter() over-locked we grow the blocksize
 		 * and then reduce the lock range.  This will only happen
-		 * on the first iteration since zfs_range_reduce() will
-		 * shrink down r_len to the appropriate size.
+		 * on the first iteration since rangelock_reduce() will
+		 * shrink down lr_length to the appropriate size.
 		 */
-		if (rl->r_len == UINT64_MAX) {
+		if (lr->lr_length == UINT64_MAX) {
 			uint64_t new_blksz;
 
 			if (zp->z_blksz > max_blksz) {
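When the append lock over-locked the file, the first loop iteration grows the block size inside the transaction and then shrinks the lock back down to the range actually being written, so later iterations and other writers are not serialized on the whole file. In outline (a sketch only; the real code picks new_blksz more carefully):

	if (lr->lr_length == UINT64_MAX) {
		uint64_t new_blksz = MIN(end_size, max_blksz);

		zfs_grow_blocksize(zp, new_blksz, tx);
		/* shrink the whole-file lock to just this write */
		rangelock_reduce(lr, woff, n);
	}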
@@ -799,19 +807,31 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 				new_blksz = MIN(end_size, max_blksz);
 			}
 			zfs_grow_blocksize(zp, new_blksz, tx);
-			zfs_range_reduce(rl, woff, n);
+			rangelock_reduce(lr, woff, n);
 		}
 
 		/*
 		 * XXX - should we really limit each write to z_max_blksz?
 		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
 		 */
-		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
+		ssize_t nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
 
+		ssize_t tx_bytes;
 		if (abuf == NULL) {
 			tx_bytes = uio->uio_resid;
+			uio->uio_fault_disable = B_TRUE;
 			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
 			    uio, nbytes, tx);
+			if (error == EFAULT) {
+				dmu_tx_commit(tx);
+				if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
+					break;
+				}
+				continue;
+			} else if (error != 0) {
+				dmu_tx_commit(tx);
+				break;
+			}
 			tx_bytes -= uio->uio_resid;
 		} else {
 			tx_bytes = nbytes;
@@ -826,14 +846,19 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 			    aiov->iov_base != abuf->b_data)) {
 				ASSERT(xuio);
 				dmu_write(zfsvfs->z_os, zp->z_id, woff,
-				    // cppcheck-suppress nullPointer
+				    /* cppcheck-suppress nullPointer */
 				    aiov->iov_len, aiov->iov_base, tx);
 				dmu_return_arcbuf(abuf);
 				xuio_stat_wbuf_copied();
 			} else {
 				ASSERT(xuio || tx_bytes == max_blksz);
-				dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
-				    woff, abuf, tx);
+				error = dmu_assign_arcbuf_by_dbuf(
+				    sa_get_db(zp->z_sa_hdl), woff, abuf, tx);
+				if (error != 0) {
+					dmu_return_arcbuf(abuf);
+					dmu_tx_commit(tx);
+					break;
+				}
 			}
 			ASSERT(tx_bytes <= uio->uio_resid);
 			uioskip(uio, tx_bytes);
@@ -867,7 +892,7 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 		 * user 0 is not an ephemeral uid.
 		 */
 		mutex_enter(&zp->z_acl_lock);
-		uid = KUID_TO_SUID(ip->i_uid);
+		uint32_t uid = KUID_TO_SUID(ip->i_uid);
 		if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
 		    (S_IXUSR >> 6))) != 0 &&
 		    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
@@ -911,12 +936,16 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 		ASSERT(tx_bytes == nbytes);
 		n -= nbytes;
 
-		if (!xuio && n > 0)
-			uio_prefaultpages(MIN(n, max_blksz), uio);
+		if (!xuio && n > 0) {
+			if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
+				error = EFAULT;
+				break;
+			}
+		}
 	}
 
 	zfs_inode_update(zp);
-	zfs_range_unlock(rl);
+	rangelock_exit(lr);
 
 	/*
 	 * If we're in replay mode, or we made no progress, return error.
@@ -931,6 +960,10 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, zp->z_id);
 
+	int64_t nwritten = start_resid - uio->uio_resid;
+	dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten);
+	task_io_account_write(nwritten);
+
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
@@ -956,6 +989,7 @@ zfs_iput_async(struct inode *ip)
 		iput(ip);
 }
 
+/* ARGSUSED */
 void
 zfs_get_done(zgd_t *zgd, int error)
 {
@@ -964,7 +998,7 @@ zfs_get_done(zgd_t *zgd, int error)
 	if (zgd->zgd_db)
 		dmu_buf_rele(zgd->zgd_db, zgd);
 
-	zfs_range_unlock(zgd->zgd_rl);
+	rangelock_exit(zgd->zgd_lr);
 
 	/*
 	 * Release the vnode asynchronously as we currently have the
@@ -972,9 +1006,6 @@ zfs_get_done(zgd_t *zgd, int error)
 	 */
 	zfs_iput_async(ZTOI(zp));
 
-	if (error == 0 && zgd->zgd_bp)
-		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
-
 	kmem_free(zgd, sizeof (zgd_t));
 }
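The EFAULT handling added above exists to avoid faulting user pages while a transaction is open, since the page-fault path can itself block on the txg the transaction belongs to. The copy runs with faults disabled; on EFAULT the tx is committed, the pages are faulted in outside any tx, and the chunk is retried. A sketch of just that control flow, with the surrounding while loop elided:

	uio->uio_fault_disable = B_TRUE;
	error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
	    uio, nbytes, tx);
	if (error == EFAULT) {
		dmu_tx_commit(tx);	/* don't hold the tx across a fault */
		if (uio_prefaultpages(MIN(n, max_blksz), uio))
			break;		/* the address is genuinely bad */
		continue;		/* pages resident now; retry chunk */
	}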
@@ -986,7 +1017,7 @@ static int zil_fault_io = 0;
  * Get data to generate a TX_WRITE intent log record.
  */
 int
-zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
+zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
 {
 	zfsvfs_t *zfsvfs = arg;
 	objset_t *os = zfsvfs->z_os;
@@ -994,13 +1025,13 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
 	uint64_t object = lr->lr_foid;
 	uint64_t offset = lr->lr_offset;
 	uint64_t size = lr->lr_length;
-	blkptr_t *bp = &lr->lr_blkptr;
 	dmu_buf_t *db;
 	zgd_t *zgd;
 	int error = 0;
 
-	ASSERT(zio != NULL);
-	ASSERT(size != 0);
+	ASSERT3P(lwb, !=, NULL);
+	ASSERT3P(zio, !=, NULL);
+	ASSERT3U(size, !=, 0);
 
 	/*
 	 * Nothing to do if the file has been removed
@@ -1017,7 +1048,7 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
 	}
 
 	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
-	zgd->zgd_zilog = zfsvfs->z_log;
+	zgd->zgd_lwb = lwb;
 	zgd->zgd_private = zp;
 
 	/*
@@ -1028,8 +1059,8 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
 	 * we don't have to write the data twice.
 	 */
 	if (buf != NULL) { /* immediate write */
-		zgd->zgd_rl = zfs_range_lock(&zp->z_range_lock, offset, size,
-		    RL_READER);
+		zgd->zgd_lr = rangelock_enter(&zp->z_rangelock,
+		    offset, size, RL_READER);
 		/* test for truncation needs to be done while range locked */
 		if (offset >= zp->z_size) {
 			error = SET_ERROR(ENOENT);
@@ -1041,7 +1072,7 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
 	} else { /* indirect write */
 		/*
 		 * Have to lock the whole block to ensure when it's
-		 * written out and it's checksum is being calculated
+		 * written out and its checksum is being calculated
 		 * that no one can change the data.  We need to re-check
 		 * blocksize after we get the lock in case it's changed!
 		 */
@@ -1050,12 +1081,12 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
 			size = zp->z_blksz;
 			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
 			offset -= blkoff;
-			zgd->zgd_rl = zfs_range_lock(&zp->z_range_lock, offset,
-			    size, RL_READER);
+			zgd->zgd_lr = rangelock_enter(&zp->z_rangelock,
+			    offset, size, RL_READER);
 			if (zp->z_blksz == size)
 				break;
 			offset += blkoff;
-			zfs_range_unlock(zgd->zgd_rl);
+			rangelock_exit(zgd->zgd_lr);
 		}
 		/* test for truncation needs to be done while range locked */
 		if (lr->lr_offset >= zp->z_size)
@@ -1071,11 +1102,7 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
 		    DMU_READ_NO_PREFETCH);
 
 		if (error == 0) {
-			blkptr_t *obp = dmu_buf_get_blkptr(db);
-			if (obp) {
-				ASSERT(BP_IS_HOLE(bp));
-				*bp = *obp;
-			}
+			blkptr_t *bp = &lr->lr_blkptr;
 
 			zgd->zgd_db = db;
 			zgd->zgd_bp = bp;
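For indirect writes, zfs_get_data() must lock the whole block whose checksum the ZIL will reference, but the block size can change between reading z_blksz and taking the lock — hence the re-check loop shown above. The idiom, isolated as a sketch (zgd, zp, and offset come from the surrounding function):

	for (;;) {
		uint64_t size = zp->z_blksz;
		uint64_t blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;

		offset -= blkoff;	/* round down to the block start */
		zgd->zgd_lr = rangelock_enter(&zp->z_rangelock,
		    offset, size, RL_READER);
		if (zp->z_blksz == size)
			break;		/* size stable while locked: done */
		offset += blkoff;	/* raced with a resize; try again */
		rangelock_exit(zgd->zgd_lr);
	}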
@@ -1098,6 +1125,14 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
 
 			if (error == EALREADY) {
 				lr->lr_common.lrc_txtype = TX_WRITE2;
+				/*
+				 * TX_WRITE2 relies on the data previously
+				 * written by the TX_WRITE that caused
+				 * EALREADY.  We zero out the BP because
+				 * it is the old, currently-on-disk BP.
+				 */
+				zgd->zgd_bp = NULL;
+				BP_ZERO(bp);
 				error = 0;
 			}
 		}
@@ -1180,27 +1215,6 @@ zfs_lookup(struct inode *dip, char *nm, struct inode **ipp, int flags,
 				return (0);
 			}
 			return (error);
-#ifdef HAVE_DNLC
-		} else if (!zdp->z_zfsvfs->z_norm &&
-		    (zdp->z_zfsvfs->z_case == ZFS_CASE_SENSITIVE)) {
-
-			vnode_t *tvp = dnlc_lookup(dvp, nm);
-
-			if (tvp) {
-				error = zfs_fastaccesschk_execute(zdp, cr);
-				if (error) {
-					iput(tvp);
-					return (error);
-				}
-				if (tvp == DNLC_NO_VNODE) {
-					iput(tvp);
-					return (SET_ERROR(ENOENT));
-				} else {
-					*vpp = tvp;
-					return (specvp_check(vpp, cr));
-				}
-			}
-#endif /* HAVE_DNLC */
 		}
 	}
 
@@ -1277,7 +1291,7 @@ zfs_lookup(struct inode *dip, char *nm, struct inode **ipp, int flags,
  *		excl	- flag indicating exclusive or non-exclusive mode.
  *		mode	- mode to open file with.
  *		cr	- credentials of caller.
- *		flag	- large file flag [UNUSED].
+ *		flag	- file flag.
  *		vsecp	- ACL to be set
  *
  *	OUT:	ipp	- inode of created or trunc'd entry.
@@ -1373,6 +1387,7 @@ top:
 
 	if (zp == NULL) {
 		uint64_t txtype;
+		uint64_t projid = ZFS_DEFAULT_PROJID;
 
 		/*
 		 * Create a new file object and update the directory
@@ -1401,7 +1416,9 @@ top:
 				goto out;
 			have_acl = B_TRUE;
 
-			if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
+			if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
+				projid = zfs_inherit_projid(dzp);
+			if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
 				zfs_acl_ids_free(&acl_ids);
 				error = SET_ERROR(EDQUOT);
 				goto out;
@@ -1422,7 +1439,9 @@ top:
 			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 			    0, acl_ids.z_aclp->z_acl_bytes);
 		}
-		error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+
+		error = dmu_tx_assign(tx,
+		    (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 		if (error) {
 			zfs_dirent_unlock(dl);
 			if (error == ERESTART) {
@@ -1438,10 +1457,22 @@ top:
 		}
 		zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
 
+		error = zfs_link_create(dl, zp, tx, ZNEW);
+		if (error != 0) {
+			/*
+			 * Since we failed to add the directory entry for it,
+			 * delete the newly created dnode.
+			 */
+			zfs_znode_delete(zp, tx);
+			remove_inode_hash(ZTOI(zp));
+			zfs_acl_ids_free(&acl_ids);
+			dmu_tx_commit(tx);
+			goto out;
+		}
+
 		if (fuid_dirtied)
 			zfs_fuid_sync(zfsvfs, tx);
 
-		(void) zfs_link_create(dl, zp, tx, ZNEW);
 		txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
 		if (flag & FIGNORECASE)
 			txtype |= TX_CI;
@@ -1531,6 +1562,7 @@ zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl,
 	uid_t		uid;
 	gid_t		gid;
 	zfs_acl_ids_t	acl_ids;
+	uint64_t	projid = ZFS_DEFAULT_PROJID;
 	boolean_t	fuid_dirtied;
 	boolean_t	have_acl = B_FALSE;
 	boolean_t	waited = B_FALSE;
@@ -1577,7 +1609,9 @@ top:
 		goto out;
 	have_acl = B_TRUE;
 
-	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
+	if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
+		projid = zfs_inherit_projid(dzp);
+	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
 		zfs_acl_ids_free(&acl_ids);
 		error = SET_ERROR(EDQUOT);
 		goto out;
@@ -1597,7 +1631,7 @@ top:
 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 		    0, acl_ids.z_aclp->z_acl_bytes);
 	}
-	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 	if (error) {
 		if (error == ERESTART) {
 			waited = B_TRUE;
@@ -1717,13 +1751,6 @@ top:
 		goto out;
 	}
 
-#ifdef HAVE_DNLC
-	if (realnmp)
-		dnlc_remove(dvp, realnmp->pn_buf);
-	else
-		dnlc_remove(dvp, name);
-#endif /* HAVE_DNLC */
-
 	mutex_enter(&zp->z_lock);
 	may_delete_now = atomic_read(&ip->i_count) == 1 && !(zp->z_is_mapped);
 	mutex_exit(&zp->z_lock);
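zfs_create(), zfs_mkdir(), and zfs_symlink() all gain the same rollback in this diff: zfs_link_create() may now fail, and when it does the just-allocated znode has to be destroyed inside the same transaction and dropped from the inode hash before the error is returned. The shared shape, as a sketch (per-caller cleanup differs):

	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
	error = zfs_link_create(dl, zp, tx, ZNEW);
	if (error != 0) {
		/* undo the create: the entry never became visible */
		zfs_znode_delete(zp, tx);
		remove_inode_hash(ZTOI(zp));
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_commit(tx);
		goto out;
	}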
@@ -1770,7 +1797,7 @@ top:
 	 */
 	dmu_tx_mark_netfree(tx);
 
-	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 	if (error) {
 		zfs_dirent_unlock(dl);
 		if (error == ERESTART) {
@@ -1988,7 +2015,7 @@ top:
 		return (error);
 	}
 
-	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
+	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) {
 		zfs_acl_ids_free(&acl_ids);
 		zfs_dirent_unlock(dl);
 		ZFS_EXIT(zfsvfs);
@@ -2012,7 +2039,7 @@ top:
 		dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 		    ZFS_SA_BASE_ATTR_SIZE);
 
-	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 	if (error) {
 		zfs_dirent_unlock(dl);
 		if (error == ERESTART) {
@@ -2032,13 +2059,18 @@ top:
 	 */
 	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
 
-	if (fuid_dirtied)
-		zfs_fuid_sync(zfsvfs, tx);
-
 	/*
 	 * Now put new name in parent dir.
 	 */
-	(void) zfs_link_create(dl, zp, tx, ZNEW);
+	error = zfs_link_create(dl, zp, tx, ZNEW);
+	if (error != 0) {
+		zfs_znode_delete(zp, tx);
+		remove_inode_hash(ZTOI(zp));
+		goto out;
+	}
+
+	if (fuid_dirtied)
+		zfs_fuid_sync(zfsvfs, tx);
 
 	*ipp = ZTOI(zp);
 
@@ -2048,6 +2080,7 @@ top:
 	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
 	    acl_ids.z_fuidp, vap);
 
+out:
 	zfs_acl_ids_free(&acl_ids);
 
 	dmu_tx_commit(tx);
@@ -2057,10 +2090,14 @@ top:
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
-	zfs_inode_update(dzp);
-	zfs_inode_update(zp);
+	if (error != 0) {
+		iput(ZTOI(zp));
+	} else {
+		zfs_inode_update(dzp);
+		zfs_inode_update(zp);
+	}
 	ZFS_EXIT(zfsvfs);
-	return (0);
+	return (error);
 }
 
@@ -2151,7 +2188,7 @@ top:
 	zfs_sa_upgrade_txholds(tx, zp);
 	zfs_sa_upgrade_txholds(tx, dzp);
 	dmu_tx_mark_netfree(tx);
-	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 	if (error) {
 		rw_exit(&zp->z_parent_lock);
 		rw_exit(&zp->z_name_lock);
@@ -2218,7 +2255,7 @@ out:
  */
 /* ARGSUSED */
 int
-zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr)
+zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr)
 {
 	znode_t		*zp = ITOZ(ip);
 	zfsvfs_t	*zfsvfs = ITOZSB(ip);
@@ -2323,7 +2360,7 @@ zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr)
 			type = ZFS_DIRENT_TYPE(zap.za_first_integer);
 		}
 
-		done = !dir_emit(ctx, zap.za_name, strlen(zap.za_name),
+		done = !zpl_dir_emit(ctx, zap.za_name, strlen(zap.za_name),
 		    objnum, type);
 		if (done)
 			break;
@@ -2560,6 +2597,17 @@ zfs_getattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr)
 			    ((zp->z_pflags & ZFS_SPARSE) != 0);
 			XVA_SET_RTN(xvap, XAT_SPARSE);
 		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
+			xoap->xoa_projinherit =
+			    ((zp->z_pflags & ZFS_PROJINHERIT) != 0);
+			XVA_SET_RTN(xvap, XAT_PROJINHERIT);
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
+			xoap->xoa_projid = zp->z_projid;
+			XVA_SET_RTN(xvap, XAT_PROJID);
+		}
 	}
 
 	ZFS_TIME_DECODE(&vap->va_atime, atime);
@@ -2637,6 +2685,126 @@ zfs_getattr_fast(struct inode *ip, struct kstat *sp)
 	return (0);
 }
 
+/*
+ * When changing a file's user/group/project, we need to handle not only
+ * the main object that is assigned to the file directly, but also the
+ * objects that are used by the file via the hidden xattr directory.
+ *
+ * Because the xattr directory may contain many EA entries, it may be
+ * impossible to change all of them within the one transaction that
+ * changes the main object's user/group/project attributes.  So we have
+ * to change them one by one via separate, independent transactions.
+ * That is not an ideal solution, but we have no better idea yet.
+ */
+static int
+zfs_setattr_dir(znode_t *dzp)
+{
+	struct inode	*dxip = ZTOI(dzp);
+	struct inode	*xip = NULL;
+	zfsvfs_t	*zfsvfs = ITOZSB(dxip);
+	objset_t	*os = zfsvfs->z_os;
+	zap_cursor_t	zc;
+	zap_attribute_t	zap;
+	zfs_dirlock_t	*dl;
+	znode_t		*zp;
+	dmu_tx_t	*tx = NULL;
+	uint64_t	uid, gid;
+	sa_bulk_attr_t	bulk[4];
+	int		count;
+	int		err;
+
+	zap_cursor_init(&zc, os, dzp->z_id);
+	while ((err = zap_cursor_retrieve(&zc, &zap)) == 0) {
+		count = 0;
+		if (zap.za_integer_length != 8 || zap.za_num_integers != 1) {
+			err = ENXIO;
+			break;
+		}
+
+		err = zfs_dirent_lock(&dl, dzp, (char *)zap.za_name, &zp,
+		    ZEXISTS, NULL, NULL);
+		if (err == ENOENT)
+			goto next;
+		if (err)
+			break;
+
+		xip = ZTOI(zp);
+		if (KUID_TO_SUID(xip->i_uid) == KUID_TO_SUID(dxip->i_uid) &&
+		    KGID_TO_SGID(xip->i_gid) == KGID_TO_SGID(dxip->i_gid) &&
+		    zp->z_projid == dzp->z_projid)
+			goto next;
+
+		tx = dmu_tx_create(os);
+		if (!(zp->z_pflags & ZFS_PROJID))
+			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+		else
+			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+
+		err = dmu_tx_assign(tx, TXG_WAIT);
+		if (err)
+			break;
+
+		mutex_enter(&dzp->z_lock);
+
+		if (KUID_TO_SUID(xip->i_uid) != KUID_TO_SUID(dxip->i_uid)) {
+			xip->i_uid = dxip->i_uid;
+			uid = zfs_uid_read(dxip);
+			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+			    &uid, sizeof (uid));
+		}
+
+		if (KGID_TO_SGID(xip->i_gid) != KGID_TO_SGID(dxip->i_gid)) {
+			xip->i_gid = dxip->i_gid;
+			gid = zfs_gid_read(dxip);
+			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
+			    &gid, sizeof (gid));
+		}
+
+		if (zp->z_projid != dzp->z_projid) {
+			if (!(zp->z_pflags & ZFS_PROJID)) {
+				zp->z_pflags |= ZFS_PROJID;
+				SA_ADD_BULK_ATTR(bulk, count,
+				    SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags,
+				    sizeof (zp->z_pflags));
+			}
+
+			zp->z_projid = dzp->z_projid;
+			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs),
+			    NULL, &zp->z_projid, sizeof (zp->z_projid));
+		}
+
+		mutex_exit(&dzp->z_lock);
+
+		if (likely(count > 0)) {
+			err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+			dmu_tx_commit(tx);
+		} else {
+			dmu_tx_abort(tx);
+		}
+		tx = NULL;
+		if (err != 0 && err != ENOENT)
+			break;
+
+next:
+		if (xip) {
+			iput(xip);
+			xip = NULL;
+			zfs_dirent_unlock(dl);
+		}
+		zap_cursor_advance(&zc);
+	}
+
+	if (tx)
+		dmu_tx_abort(tx);
+	if (xip) {
+		iput(xip);
+		zfs_dirent_unlock(dl);
+	}
+	zap_cursor_fini(&zc);
+
+	return (err == ENOENT ? 0 : err);
+}
+
 /*
  * Set the file attributes to the values contained in the
  * vattr structure.
@@ -2660,6 +2828,7 @@ zfs_setattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr)
 {
 	znode_t		*zp = ITOZ(ip);
 	zfsvfs_t	*zfsvfs = ITOZSB(ip);
+	objset_t	*os = zfsvfs->z_os;
 	zilog_t		*zilog;
 	dmu_tx_t	*tx;
 	vattr_t		oldva;
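zfs_setattr_dir() above cannot fold the xattr-directory children into the caller's transaction, since there may be arbitrarily many of them, so it walks the directory ZAP and gives every child its own TXG_WAIT transaction. Stripped to the cursor walk, as a sketch:

	zap_cursor_t zc;
	zap_attribute_t za;
	int err;

	zap_cursor_init(&zc, os, dzp->z_id);
	while ((err = zap_cursor_retrieve(&zc, &za)) == 0) {
		/* ... zfs_dirent_lock() the child named za.za_name ... */
		dmu_tx_t *tx = dmu_tx_create(os);

		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
		if (dmu_tx_assign(tx, TXG_WAIT) == 0) {
			/* ... sa_bulk_update() of uid/gid/projid ... */
			dmu_tx_commit(tx);
		} else {
			dmu_tx_abort(tx);
		}
		/* ... iput() and zfs_dirent_unlock() the child ... */
		zap_cursor_advance(&zc);
	}
	zap_cursor_fini(&zc);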
@@ -2671,17 +2840,19 @@ zfs_setattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr)
 	uint64_t	new_kuid = 0, new_kgid = 0, new_uid, new_gid;
 	uint64_t	xattr_obj;
 	uint64_t	mtime[2], ctime[2], atime[2];
+	uint64_t	projid = ZFS_INVALID_PROJID;
 	znode_t		*attrzp;
 	int		need_policy = FALSE;
-	int		err, err2;
+	int		err, err2 = 0;
 	zfs_fuid_info_t *fuidp = NULL;
 	xvattr_t	*xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
 	xoptattr_t	*xoap;
 	zfs_acl_t	*aclp;
 	boolean_t	skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
 	boolean_t	fuid_dirtied = B_FALSE;
+	boolean_t	handle_eadir = B_FALSE;
 	sa_bulk_attr_t	*bulk, *xattr_bulk;
-	int		count = 0, xattr_count = 0;
+	int		count = 0, xattr_count = 0, bulks = 8;
 
 	if (mask == 0)
 		return (0);
@@ -2689,6 +2860,41 @@ zfs_setattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr)
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
+	/*
+	 * If this is a xvattr_t, then get a pointer to the structure of
+	 * optional attributes.  If this is NULL, then we have a vattr_t.
+	 */
+	xoap = xva_getxoptattr(xvap);
+	if (xoap != NULL && (mask & ATTR_XVATTR)) {
+		if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
+			if (!dmu_objset_projectquota_enabled(os) ||
+			    (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) {
+				ZFS_EXIT(zfsvfs);
+				return (SET_ERROR(ENOTSUP));
+			}
+
+			projid = xoap->xoa_projid;
+			if (unlikely(projid == ZFS_INVALID_PROJID)) {
+				ZFS_EXIT(zfsvfs);
+				return (SET_ERROR(EINVAL));
+			}
+
+			if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID)
+				projid = ZFS_INVALID_PROJID;
+			else
+				need_policy = TRUE;
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) &&
+		    (xoap->xoa_projinherit !=
+		    ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) &&
+		    (!dmu_objset_projectquota_enabled(os) ||
+		    (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) {
+			ZFS_EXIT(zfsvfs);
+			return (SET_ERROR(ENOTSUP));
+		}
+	}
+
 	zilog = zfsvfs->z_log;
 
 	/*
@@ -2714,17 +2920,11 @@ zfs_setattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr)
 		return (SET_ERROR(EINVAL));
 	}
 
-	/*
-	 * If this is an xvattr_t, then get a pointer to the structure of
-	 * optional attributes.  If this is NULL, then we have a vattr_t.
-	 */
-	xoap = xva_getxoptattr(xvap);
-
 	tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP);
 	xva_init(tmpxvattr);
 
-	bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * 7, KM_SLEEP);
-	xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * 7, KM_SLEEP);
+	bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
+	xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
 
 	/*
 	 * Immutable files can only alter immutable bit and atime
@@ -2732,12 +2932,12 @@ zfs_setattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr)
 	if ((zp->z_pflags & ZFS_IMMUTABLE) &&
 	    ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) ||
 	    ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
-		err = EPERM;
+		err = SET_ERROR(EPERM);
 		goto out3;
 	}
 
 	if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
-		err = EPERM;
+		err = SET_ERROR(EPERM);
 		goto out3;
 	}
 
@@ -2752,7 +2952,7 @@ zfs_setattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr)
 		    TIMESPEC_OVERFLOW(&vap->va_atime)) ||
 		    ((mask & ATTR_MTIME) &&
 		    TIMESPEC_OVERFLOW(&vap->va_mtime))) {
-			err = EOVERFLOW;
+			err = SET_ERROR(EOVERFLOW);
 			goto out3;
 		}
 	}
@@ -2763,7 +2963,7 @@ top:
 
 	/* Can this be moved to before the top label? */
 	if (zfs_is_readonly(zfsvfs)) {
-		err = EROFS;
+		err = SET_ERROR(EROFS);
 		goto out3;
 	}
 
@@ -2870,6 +3070,16 @@ top:
 			}
 		}
 
+		if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
+			if (xoap->xoa_projinherit !=
+			    ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) {
+				need_policy = TRUE;
+			} else {
+				XVA_CLR_REQ(xvap, XAT_PROJINHERIT);
+				XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT);
+			}
+		}
+
 		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
 			if (xoap->xoa_nounlink !=
 			    ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
@@ -2924,7 +3134,7 @@ top:
 
 		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
 			mutex_exit(&zp->z_lock);
-			err = EPERM;
+			err = SET_ERROR(EPERM);
 			goto out3;
 		}
 
@@ -2978,7 +3188,8 @@ top:
 	 */
 	mask = vap->va_mask;
 
-	if ((mask & (ATTR_UID | ATTR_GID))) {
+	if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) {
+		handle_eadir = B_TRUE;
 		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
 		    &xattr_obj, sizeof (xattr_obj));
 
@@ -2991,10 +3202,11 @@ top:
 		new_kuid = zfs_fuid_create(zfsvfs,
 		    (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
 		if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) &&
-		    zfs_fuid_overquota(zfsvfs, B_FALSE, new_kuid)) {
+		    zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT,
+		    new_kuid)) {
 			if (attrzp)
 				iput(ZTOI(attrzp));
-			err = EDQUOT;
+			err = SET_ERROR(EDQUOT);
 			goto out2;
 		}
 	}
@@ -3003,15 +3215,24 @@ top:
 		new_kgid = zfs_fuid_create(zfsvfs,
 		    (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp);
 		if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) &&
-		    zfs_fuid_overquota(zfsvfs, B_TRUE, new_kgid)) {
+		    zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT,
+		    new_kgid)) {
 			if (attrzp)
 				iput(ZTOI(attrzp));
-			err = EDQUOT;
+			err = SET_ERROR(EDQUOT);
 			goto out2;
 		}
 	}
+
+	if (projid != ZFS_INVALID_PROJID &&
+	    zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) {
+		if (attrzp)
+			iput(ZTOI(attrzp));
+		err = EDQUOT;
+		goto out2;
+	}
 	}
 
-	tx = dmu_tx_create(zfsvfs->z_os);
+	tx = dmu_tx_create(os);
 
 	if (mask & ATTR_MODE) {
 		uint64_t pmode = zp->z_mode;
@@ -3044,8 +3265,10 @@ top:
 			mutex_exit(&zp->z_lock);
 			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
 		} else {
-			if ((mask & ATTR_XVATTR) &&
-			    XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
+			if (((mask & ATTR_XVATTR) &&
+			    XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ||
+			    (projid != ZFS_INVALID_PROJID &&
+			    !(zp->z_pflags & ZFS_PROJID)))
 				dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
 			else
 				dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
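The chown path above now funnels user, group, and project checks through one helper keyed by the DMU accounting object instead of per-kind zfs_fuid_overquota() variants. The three calls differ only in that key; condensed as a sketch (new_kuid/new_kgid/projid come from the surrounding function):

	if (zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT, new_kuid) ||
	    zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT, new_kgid) ||
	    (projid != ZFS_INVALID_PROJID &&
	    zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)))
		err = SET_ERROR(EDQUOT);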
@@ -3074,6 +3297,26 @@ top:
 	 * updated as a side-effect of calling this function.
 	 */
 
+	if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) {
+		/*
+		 * For an existing object that was upgraded from an old
+		 * system, the on-disk layout has no slot for the project
+		 * ID attribute.  But the quota accounting logic needs to
+		 * access the related slots by offset directly, so we must
+		 * adjust the old object's layout to move the project ID
+		 * to a unified and fixed offset.
+		 */
+		if (attrzp)
+			err = sa_add_projid(attrzp->z_sa_hdl, tx, projid);
+		if (err == 0)
+			err = sa_add_projid(zp->z_sa_hdl, tx, projid);
+
+		if (unlikely(err == EEXIST))
+			err = 0;
+		else if (err != 0)
+			goto out;
+		else
+			projid = ZFS_INVALID_PROJID;
+	}
 
 	if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
 		mutex_enter(&zp->z_acl_lock);
@@ -3089,6 +3332,12 @@ top:
 		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
 		    SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
 		    sizeof (attrzp->z_pflags));
+		if (projid != ZFS_INVALID_PROJID) {
+			attrzp->z_projid = projid;
+			SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+			    SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid,
+			    sizeof (attrzp->z_projid));
+		}
 	}
 
 	if (mask & (ATTR_UID|ATTR_GID)) {
@@ -3151,23 +3400,30 @@ top:
 		    &atime, sizeof (atime));
 	}
 
-	if (mask & ATTR_MTIME) {
+	if (mask & (ATTR_MTIME | ATTR_SIZE)) {
 		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
-		ZTOI(zp)->i_mtime = timespec_trunc(vap->va_mtime,
+		ZTOI(zp)->i_mtime = zpl_inode_timespec_trunc(vap->va_mtime,
 		    ZTOI(zp)->i_sb->s_time_gran);
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
 		    mtime, sizeof (mtime));
 	}
 
-	if (mask & ATTR_CTIME) {
+	if (mask & (ATTR_CTIME | ATTR_SIZE)) {
 		ZFS_TIME_ENCODE(&vap->va_ctime, ctime);
-		ZTOI(zp)->i_ctime = timespec_trunc(vap->va_ctime,
+		ZTOI(zp)->i_ctime = zpl_inode_timespec_trunc(vap->va_ctime,
 		    ZTOI(zp)->i_sb->s_time_gran);
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
 		    ctime, sizeof (ctime));
 	}
 
+	if (projid != ZFS_INVALID_PROJID) {
+		zp->z_projid = projid;
+		SA_ADD_BULK_ATTR(bulk, count,
+		    SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
+		    sizeof (zp->z_projid));
+	}
+
 	if (attrzp && mask) {
 		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
 		    SA_ZPL_CTIME(zfsvfs), NULL, &ctime,
@@ -3204,6 +3460,9 @@ top:
 		if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) {
 			XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
 		}
+		if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) {
+			XVA_SET_REQ(xvap, XAT_PROJINHERIT);
+		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
 			ASSERT(S_ISREG(ip->i_mode));
@@ -3227,7 +3486,7 @@ top:
 		mutex_exit(&attrzp->z_lock);
 	}
 out:
-	if (err == 0 && attrzp) {
+	if (err == 0 && xattr_count > 0) {
 		err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
 		    xattr_count, tx);
 		ASSERT(err2 == 0);
	}
@@ -3248,20 +3507,24 @@ out:
 		if (err == ERESTART)
 			goto top;
 	} else {
-		err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+		if (count > 0)
+			err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 		dmu_tx_commit(tx);
-		if (attrzp)
+		if (attrzp) {
+			if (err2 == 0 && handle_eadir)
+				err2 = zfs_setattr_dir(attrzp);
 			iput(ZTOI(attrzp));
+		}
 		zfs_inode_update(zp);
 	}
 
 out2:
-	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+	if (os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 out3:
-	kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * 7);
-	kmem_free(bulk, sizeof (sa_bulk_attr_t) * 7);
+	kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks);
+	kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks);
 	kmem_free(tmpxvattr, sizeof (xvattr_t));
 	ZFS_EXIT(zfsvfs);
 	return (err);
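The layout-upgrade hunk above leans on sa_add_projid() being idempotent per object: EEXIST simply means the object has already been moved to the new SA layout (and already carries a project ID), so it is treated as success. The idiom, as a sketch:

	err = sa_add_projid(zp->z_sa_hdl, tx, projid);
	if (err == EEXIST)
		err = 0;	/* layout already upgraded; nothing to do */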
@@ -3554,6 +3817,19 @@ top:
 		return (terr);
 	}
 
+	/*
+	 * If we are using project inheritance, meaning the directory has
+	 * ZFS_PROJINHERIT set, then its descendant directories will inherit
+	 * not only the project ID but also the ZFS_PROJINHERIT flag.  In
+	 * that case, we only allow renames into our tree when the project
+	 * IDs are the same.
+	 */
+	if (tdzp->z_pflags & ZFS_PROJINHERIT &&
+	    tdzp->z_projid != szp->z_projid) {
+		error = SET_ERROR(EXDEV);
+		goto out;
+	}
+
 	/*
 	 * Must have write access at the source to remove the old entry
 	 * and write access at the target to create the new entry.
@@ -3618,7 +3894,7 @@ top:
 		zfs_sa_upgrade_txholds(tx, szp);
 		dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 
-	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 	if (error) {
 		if (zl != NULL)
 			zfs_rename_unlock(&zl);
@@ -3652,6 +3928,8 @@ top:
 	error = zfs_link_create(tdl, szp, tx, ZRENAMING);
 	if (error == 0) {
 		szp->z_pflags |= ZFS_AV_MODIFIED;
+		if (tdzp->z_pflags & ZFS_PROJINHERIT)
+			szp->z_pflags |= ZFS_PROJINHERIT;
 
 		error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
 		    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
@@ -3678,6 +3956,13 @@ top:
 				VERIFY3U(zfs_link_destroy(tdl, szp, tx,
 				    ZRENAMING, NULL), ==, 0);
 			}
+		} else {
+			/*
+			 * If we had removed the existing target, the
+			 * subsequent zfs_link_create() to add back the same
+			 * entry, but with a new dnode (szp), should not fail.
+			 */
+			ASSERT(tzp == NULL);
 		}
 	}
 
@@ -3791,7 +4076,7 @@ top:
 		return (error);
 	}
 
-	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
+	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) {
 		zfs_acl_ids_free(&acl_ids);
 		zfs_dirent_unlock(dl);
 		ZFS_EXIT(zfsvfs);
@@ -3810,7 +4095,7 @@ top:
 	}
 	if (fuid_dirtied)
 		zfs_fuid_txhold(zfsvfs, tx);
-	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 	if (error) {
 		zfs_dirent_unlock(dl);
 		if (error == ERESTART) {
@@ -3848,14 +4133,18 @@ top:
 	/*
 	 * Insert the new object into the directory.
 	 */
-	(void) zfs_link_create(dl, zp, tx, ZNEW);
-
-	if (flags & FIGNORECASE)
-		txtype |= TX_CI;
-	zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
+	error = zfs_link_create(dl, zp, tx, ZNEW);
+	if (error != 0) {
+		zfs_znode_delete(zp, tx);
+		remove_inode_hash(ZTOI(zp));
+	} else {
+		if (flags & FIGNORECASE)
+			txtype |= TX_CI;
+		zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
 
-	zfs_inode_update(dzp);
-	zfs_inode_update(zp);
+		zfs_inode_update(dzp);
+		zfs_inode_update(zp);
+	}
 
 	zfs_acl_ids_free(&acl_ids);
@@ -3863,10 +4152,14 @@ top:
 
 	zfs_dirent_unlock(dl);
 
-	*ipp = ZTOI(zp);
+	if (error == 0) {
+		*ipp = ZTOI(zp);
 
-	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
-		zil_commit(zilog, 0);
+		if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+			zil_commit(zilog, 0);
+	} else {
+		iput(ZTOI(zp));
+	}
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
@@ -3966,6 +4259,18 @@ zfs_link(struct inode *tdip, struct inode *sip, char *name, cred_t *cr,
 	szp = ITOZ(sip);
 	ZFS_VERIFY_ZP(szp);
 
+	/*
+	 * If we are using project inheritance, meaning the directory has
+	 * ZFS_PROJINHERIT set, then its descendant directories will inherit
+	 * not only the project ID but also the ZFS_PROJINHERIT flag.  In
+	 * that case, we only allow hard link creation in our tree when the
+	 * project IDs are the same.
+	 */
+	if (dzp->z_pflags & ZFS_PROJINHERIT && dzp->z_projid != szp->z_projid) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EXDEV));
+	}
+
 	/*
 	 * We check i_sb because snapshots and the ctldir must have different
 	 * super blocks.
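Both zfs_rename() and zfs_link() now enforce the same project-boundary rule: once a directory carries ZFS_PROJINHERIT, entries may only be moved or hard-linked into it from the same project; otherwise the caller gets EXDEV and user space falls back to copy semantics. The test, factored into a predicate for illustration (a hypothetical helper — the diff open-codes this check in both places):

	static boolean_t
	zfs_project_mismatch(znode_t *tdzp, znode_t *szp)
	{
		/* crossing a project boundary into an inheriting dir? */
		return ((tdzp->z_pflags & ZFS_PROJINHERIT) &&
		    tdzp->z_projid != szp->z_projid);
	}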
@@ -4036,7 +4341,7 @@ top:
 	zfs_sa_upgrade_txholds(tx, szp);
 	zfs_sa_upgrade_txholds(tx, dzp);
 
-	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 	if (error) {
 		zfs_dirent_unlock(dl);
 		if (error == ERESTART) {
@@ -4124,7 +4429,6 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
 	loff_t		offset;
 	loff_t		pgoff;
 	unsigned int	pglen;
-	rl_t		*rl;
 	dmu_tx_t	*tx;
 	caddr_t		va;
 	int		err = 0;
@@ -4160,8 +4464,13 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
 	 * is to register a page_mkwrite() handler to count the page
 	 * against its quota when it is about to be dirtied.
 	 */
-	if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
-	    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
+	if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT,
+	    KUID_TO_SUID(ip->i_uid)) ||
+	    zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT,
+	    KGID_TO_SGID(ip->i_gid)) ||
+	    (zp->z_projid != ZFS_DEFAULT_PROJID &&
+	    zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
+	    zp->z_projid))) {
 		err = EDQUOT;
 	}
 #endif
@@ -4193,13 +4502,14 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
 		redirty_page_for_writepage(wbc, pp);
 		unlock_page(pp);
 
-		rl = zfs_range_lock(&zp->z_range_lock, pgoff, pglen, RL_WRITER);
+		locked_range_t *lr = rangelock_enter(&zp->z_rangelock,
+		    pgoff, pglen, RL_WRITER);
 		lock_page(pp);
 
 		/* Page mapping changed or it was no longer dirty, we're done */
 		if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) {
 			unlock_page(pp);
-			zfs_range_unlock(rl);
+			rangelock_exit(lr);
 			ZFS_EXIT(zfsvfs);
 			return (0);
 		}
@@ -4207,7 +4517,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
 		/* Another process started write block if required */
 		if (PageWriteback(pp)) {
 			unlock_page(pp);
-			zfs_range_unlock(rl);
+			rangelock_exit(lr);
 
 			if (wbc->sync_mode != WB_SYNC_NONE)
 				wait_on_page_writeback(pp);
@@ -4219,7 +4529,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
 		/* Clear the dirty flag the required locks are held */
 		if (!clear_page_dirty_for_io(pp)) {
 			unlock_page(pp);
-			zfs_range_unlock(rl);
+			rangelock_exit(lr);
 			ZFS_EXIT(zfsvfs);
 			return (0);
 		}
@@ -4246,7 +4556,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
 		__set_page_dirty_nobuffers(pp);
 		ClearPageError(pp);
 		end_page_writeback(pp);
-		zfs_range_unlock(rl);
+		rangelock_exit(lr);
 		ZFS_EXIT(zfsvfs);
 		return (err);
 	}
@@ -4273,7 +4583,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
 	    zfs_putpage_commit_cb, pp);
 	dmu_tx_commit(tx);
 
-	zfs_range_unlock(rl);
+	rangelock_exit(lr);
 
 	if (wbc->sync_mode != WB_SYNC_NONE) {
 		/*
@@ -4557,19 +4867,19 @@ convoff(struct inode *ip, flock64_t *lckdat, int whence, offset_t offset)
 	vattr_t vap;
 	int error;
 
-	if ((lckdat->l_whence == 2) || (whence == 2)) {
-		if ((error = zfs_getattr(ip, &vap, 0, CRED()) != 0))
+	if ((lckdat->l_whence == SEEK_END) || (whence == SEEK_END)) {
+		if ((error = zfs_getattr(ip, &vap, 0, CRED())))
 			return (error);
 	}
 
 	switch (lckdat->l_whence) {
-	case 1:
+	case SEEK_CUR:
 		lckdat->l_start += offset;
 		break;
-	case 2:
+	case SEEK_END:
 		lckdat->l_start += vap.va_size;
 		/* FALLTHRU */
-	case 0:
+	case SEEK_SET:
 		break;
 	default:
 		return (SET_ERROR(EINVAL));
@@ -4579,13 +4889,13 @@ convoff(struct inode *ip, flock64_t *lckdat, int whence, offset_t offset)
 		return (SET_ERROR(EINVAL));
 
 	switch (whence) {
-	case 1:
+	case SEEK_CUR:
		lckdat->l_start -= offset;
		break;
-	case 2:
+	case SEEK_END:
		lckdat->l_start -= vap.va_size;
		/* FALLTHRU */
-	case 0:
+	case SEEK_SET:
		break;
	default:
		return (SET_ERROR(EINVAL));
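convoff() normalizes an flock64_t from whatever l_whence it carries to the caller's desired whence, which is why the magic 0/1/2 constants above could be replaced with the standard SEEK_SET/SEEK_CUR/SEEK_END names with no behavior change. zfs_space() calls it with SEEK_SET so the free/alloc range is absolute before validation; roughly (a usage sketch):

	flock64_t bf = *bfp;	/* l_whence may be SEEK_CUR or SEEK_END */

	error = convoff(ip, &bf, SEEK_SET, offset);
	if (error == 0) {
		/* bf.l_start is now an absolute file offset */
	}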
@@ -4606,7 +4916,7 @@ convoff(struct inode *ip, flock64_t *lckdat, int whence, offset_t offset)
 *		bfp	- section of file to free/alloc.
 *		flag	- current file open mode flags.
 *		offset	- current file offset.
- *		cr	- credentials of caller [UNUSED].
+ *		cr	- credentials of caller.
 *
 *	RETURN:	0 on success, error code on failure.
 *
@@ -4640,7 +4950,7 @@ zfs_space(struct inode *ip, int cmd, flock64_t *bfp, int flag,
 		return (SET_ERROR(EROFS));
 	}
 
-	if ((error = convoff(ip, bfp, 0, offset))) {
+	if ((error = convoff(ip, bfp, SEEK_SET, offset))) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
@@ -4907,7 +5217,7 @@ zfs_retzcbuf(struct inode *ip, xuio_t *xuio, cred_t *cr)
 }
 #endif /* HAVE_UIO_ZEROCOPY */
 
-#if defined(_KERNEL) && defined(HAVE_SPL)
+#if defined(_KERNEL)
 EXPORT_SYMBOL(zfs_open);
 EXPORT_SYMBOL(zfs_close);
 EXPORT_SYMBOL(zfs_read);