* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2015 by Chunwei Chen. All rights reserved.
* Copyright 2017 Nexenta Systems, Inc.
+ * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek
*/
/* Portions Copyright 2007 Jeremy Teo */
#include <sys/fs/zfs.h>
#include <sys/dmu.h>
#include <sys/dmu_objset.h>
+#include <sys/dsl_crypt.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/dbuf.h>
#include <sys/policy.h>
+#include <sys/zfeature.h>
#include <sys/zfs_vnops.h>
#include <sys/zfs_quota.h>
#include <sys/zfs_vfsops.h>
#include <sys/zfs_znode.h>
-static ulong_t zfs_fsync_sync_cnt = 4;
-
int
zfs_fsync(znode_t *zp, int syncflag, cred_t *cr)
{
+ int error = 0;
zfsvfs_t *zfsvfs = ZTOZSB(zp);
- (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
-
if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
+ if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
+ return (error);
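+ /*
+ * Note an in-flight sync write on this znode for the
+ * duration of the zil_commit().
+ */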
+ atomic_inc_32(&zp->z_sync_writes_cnt);
zil_commit(zfsvfs->z_log, zp->z_id);
- ZFS_EXIT(zfsvfs);
+ atomic_dec_32(&zp->z_sync_writes_cnt);
+ zfs_exit(zfsvfs, FTAG);
}
- tsd_set(zfs_fsyncer_key, NULL);
-
- return (0);
+ return (error);
}
hole = B_FALSE;
/* Flush any mmap()'d data to disk */
- if (zn_has_cached_data(zp))
+ if (zn_has_cached_data(zp, 0, file_sz - 1))
zn_flush_cached_data(zp, B_FALSE);
- lr = zfs_rangelock_enter(&zp->z_rangelock, 0, file_sz, RL_READER);
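+ /*
+ * Lock the whole file rather than just the current size, so
+ * concurrent growth cannot race with the probe.
+ */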
+ lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_READER);
error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff);
zfs_rangelock_exit(lr);
zfsvfs_t *zfsvfs = ZTOZSB(zp);
int error;
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
+ if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
+ return (error);
error = zfs_holey_common(zp, cmd, off);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
#endif /* SEEK_HOLE && SEEK_DATA */
zfsvfs_t *zfsvfs = ZTOZSB(zp);
int error;
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
+ if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
+ return (error);
if (flag & V_ACE_MASK)
- error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
+#if defined(__linux__)
+ error = zfs_zaccess(zp, mode, flag, B_FALSE, cr,
+ zfs_init_idmap);
+#else
+ error = zfs_zaccess(zp, mode, flag, B_FALSE, cr,
+ NULL);
+#endif
else
- error = zfs_zaccess_rwx(zp, mode, flag, cr);
+#if defined(__linux__)
+ error = zfs_zaccess_rwx(zp, mode, flag, cr, zfs_init_idmap);
+#else
+ error = zfs_zaccess_rwx(zp, mode, flag, cr, NULL);
+#endif
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
-static unsigned long zfs_vnops_read_chunk_size = 1024 * 1024; /* Tunable */
+static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024; /* Tunable */
/*
* Read bytes from specified file into supplied buffer.
boolean_t frsync = B_FALSE;
zfsvfs_t *zfsvfs = ZTOZSB(zp);
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
+ if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
+ return (error);
if (zp->z_pflags & ZFS_AV_QUARANTINED) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EACCES));
}
/* We don't copy out anything useful for directories. */
if (Z_ISDIR(ZTOTYPE(zp))) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EISDIR));
}
* Validate file offset
*/
if (zfs_uio_offset(uio) < (offset_t)0) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EINVAL));
}
* Fasttrack empty reads
*/
if (zfs_uio_resid(uio) == 0) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (0);
}
error = mappedread_sf(zp, nbytes, uio);
else
#endif
- if (zn_has_cached_data(zp) && !(ioflag & O_DIRECT)) {
+ if (zn_has_cached_data(zp, zfs_uio_offset(uio),
+ zfs_uio_offset(uio) + nbytes - 1) && !(ioflag & O_DIRECT)) {
error = mappedread(zp, nbytes, uio);
} else {
error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
zfs_rangelock_exit(lr);
ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
if (*clear_setid_bits_txgp != dmu_tx_get_txg(tx)) {
vattr_t va = {0};
- va.va_mask = AT_MODE;
+ va.va_mask = ATTR_MODE;
va.va_nodeid = zp->z_id;
va.va_mode = newmode;
- zfs_log_setattr(zilog, tx, TX_SETATTR, zp, &va, AT_MODE,
- NULL);
+ zfs_log_setattr(zilog, tx, TX_SETATTR, zp, &va,
+ ATTR_MODE, NULL);
*clear_setid_bits_txgp = dmu_tx_get_txg(tx);
}
} else {
return (0);
zfsvfs_t *zfsvfs = ZTOZSB(zp);
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
+ if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
+ return (error);
sa_bulk_attr_t bulk[4];
int count = 0;
* so check it explicitly here.
*/
if (zfs_is_readonly(zfsvfs)) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EROFS));
}
if ((zp->z_pflags & ZFS_IMMUTABLE) ||
((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & O_APPEND) &&
(zfs_uio_offset(uio) < zp->z_size))) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EPERM));
}
*/
offset_t woff = ioflag & O_APPEND ? zp->z_size : zfs_uio_offset(uio);
if (woff < 0) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EINVAL));
}
- const uint64_t max_blksz = zfsvfs->z_max_blksz;
-
/*
* Pre-fault the pages to ensure slow (e.g. NFS) pages
* don't hold up txg.
- * Skip this if uio contains loaned arc_buf.
*/
- if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) {
- ZFS_EXIT(zfsvfs);
+ ssize_t pfbytes = MIN(n, DMU_MAX_ACCESS >> 1);
+ if (zfs_uio_prefaultpages(pfbytes, uio)) {
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EFAULT));
}
lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER);
}
- if (zn_rlimit_fsize(zp, uio)) {
+ if (zn_rlimit_fsize_uio(zp, uio)) {
zfs_rangelock_exit(lr);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EFBIG));
}
if (woff >= limit) {
zfs_rangelock_exit(lr);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EFBIG));
}
uint64_t end_size = MAX(zp->z_size, woff + n);
zilog_t *zilog = zfsvfs->z_log;
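+ /*
+ * Decide up front whether this write must be committed to the ZIL:
+ * either the caller asked for it (O_SYNC/O_DSYNC) or the dataset is
+ * sync=always.
+ */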
+ boolean_t commit = (ioflag & (O_SYNC | O_DSYNC)) ||
+ (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS);
const uint64_t uid = KUID_TO_SUID(ZTOUID(zp));
const uint64_t gid = KGID_TO_SGID(ZTOGID(zp));
break;
}
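+ /*
+ * Estimate the block size the file will end up using for this
+ * write, so the full-block "borrowed" ARC buffer below and the
+ * later zfs_grow_blocksize() agree on it.
+ */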
+ uint64_t blksz;
+ if (lr->lr_length == UINT64_MAX && zp->z_size <= zp->z_blksz) {
+ if (zp->z_blksz > zfsvfs->z_max_blksz &&
+ !ISP2(zp->z_blksz)) {
+ /*
+ * File's blocksize is already larger than the
+ * "recordsize" property. Only let it grow to
+ * the next power of 2.
+ */
+ blksz = 1 << highbit64(zp->z_blksz);
+ } else {
+ blksz = zfsvfs->z_max_blksz;
+ }
+ blksz = MIN(blksz, P2ROUNDUP(end_size,
+ SPA_MINBLOCKSIZE));
+ blksz = MAX(blksz, zp->z_blksz);
+ } else {
+ blksz = zp->z_blksz;
+ }
+
arc_buf_t *abuf = NULL;
- if (n >= max_blksz && woff >= zp->z_size &&
- P2PHASE(woff, max_blksz) == 0 &&
- zp->z_blksz == max_blksz) {
+ ssize_t nbytes = n;
+ if (n >= blksz && woff >= zp->z_size &&
+ P2PHASE(woff, blksz) == 0 &&
+ (blksz >= SPA_OLD_MAXBLOCKSIZE || n < 4 * blksz)) {
/*
* This write covers a full block. "Borrow" a buffer
* from the dmu so that we can fill it before we enter
* holding up the transaction if the data copy hangs
* up on a pagefault (e.g., from an NFS server mapping).
*/
- size_t cbytes;
-
abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
- max_blksz);
+ blksz);
ASSERT(abuf != NULL);
- ASSERT(arc_buf_size(abuf) == max_blksz);
- if ((error = zfs_uiocopy(abuf->b_data, max_blksz,
- UIO_WRITE, uio, &cbytes))) {
+ ASSERT(arc_buf_size(abuf) == blksz);
+ if ((error = zfs_uiocopy(abuf->b_data, blksz,
+ UIO_WRITE, uio, &nbytes))) {
dmu_return_arcbuf(abuf);
break;
}
- ASSERT3S(cbytes, ==, max_blksz);
+ ASSERT3S(nbytes, ==, blksz);
+ } else {
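+ /*
+ * Cap this chunk at half of DMU_MAX_ACCESS so the
+ * transaction stays well under the DMU per-tx limit,
+ * and trim it so the next chunk starts block-aligned.
+ */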
+ nbytes = MIN(n, (DMU_MAX_ACCESS >> 1) -
+ P2PHASE(woff, blksz));
+ if (pfbytes < nbytes) {
+ if (zfs_uio_prefaultpages(nbytes, uio)) {
+ error = SET_ERROR(EFAULT);
+ break;
+ }
+ pfbytes = nbytes;
+ }
}
/*
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
DB_DNODE_ENTER(db);
- dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff,
- MIN(n, max_blksz));
+ dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, nbytes);
DB_DNODE_EXIT(db);
zfs_sa_upgrade_txholds(tx, zp);
error = dmu_tx_assign(tx, TXG_WAIT);
* shrink down lr_length to the appropriate size.
*/
if (lr->lr_length == UINT64_MAX) {
- uint64_t new_blksz;
-
- if (zp->z_blksz > max_blksz) {
- /*
- * File's blocksize is already larger than the
- * "recordsize" property. Only let it grow to
- * the next power of 2.
- */
- ASSERT(!ISP2(zp->z_blksz));
- new_blksz = MIN(end_size,
- 1 << highbit64(zp->z_blksz));
- } else {
- new_blksz = MIN(end_size, max_blksz);
- }
- zfs_grow_blocksize(zp, new_blksz, tx);
+ zfs_grow_blocksize(zp, blksz, tx);
zfs_rangelock_reduce(lr, woff, n);
}
- /*
- * XXX - should we really limit each write to z_max_blksz?
- * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
- */
- const ssize_t nbytes =
- MIN(n, max_blksz - P2PHASE(woff, max_blksz));
-
ssize_t tx_bytes;
if (abuf == NULL) {
tx_bytes = zfs_uio_resid(uio);
* zfs_uio_prefaultpages, or prefaultpages may
* error, and we may break the loop early.
*/
- if (tx_bytes != zfs_uio_resid(uio))
- n -= tx_bytes - zfs_uio_resid(uio);
- if (zfs_uio_prefaultpages(MIN(n, max_blksz),
- uio)) {
- break;
- }
+ n -= tx_bytes - zfs_uio_resid(uio);
+ pfbytes -= tx_bytes - zfs_uio_resid(uio);
continue;
}
#endif
}
tx_bytes -= zfs_uio_resid(uio);
} else {
- /* Implied by abuf != NULL: */
- ASSERT3S(n, >=, max_blksz);
- ASSERT0(P2PHASE(woff, max_blksz));
- /*
- * We can simplify nbytes to MIN(n, max_blksz) since
- * P2PHASE(woff, max_blksz) is 0, and knowing
- * n >= max_blksz lets us simplify further:
- */
- ASSERT3S(nbytes, ==, max_blksz);
/*
* Here we're writing a full block at a block-aligned
* offset and extending the file past EOF.
zfs_uioskip(uio, nbytes);
tx_bytes = nbytes;
}
- if (tx_bytes && zn_has_cached_data(zp) &&
+ if (tx_bytes &&
+ zn_has_cached_data(zp, woff, woff + tx_bytes - 1) &&
!(ioflag & O_DIRECT)) {
update_pages(zp, woff, tx_bytes, zfsvfs->z_os);
}
* zfs_clear_setid_bits_if_necessary must precede any of
* the TX_WRITE records logged here.
*/
- zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag,
+ zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, commit,
NULL, NULL);
dmu_tx_commit(tx);
break;
ASSERT3S(tx_bytes, ==, nbytes);
n -= nbytes;
-
- if (n > 0) {
- if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) {
- error = SET_ERROR(EFAULT);
- break;
- }
- }
+ pfbytes -= nbytes;
}
zfs_znode_update_vfs(zp);
*/
if (zfsvfs->z_replay || zfs_uio_resid(uio) == start_resid ||
error == EFAULT) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
- if (ioflag & (O_SYNC | O_DSYNC) ||
- zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ if (commit)
zil_commit(zilog, zp->z_id);
const int64_t nwritten = start_resid - zfs_uio_resid(uio);
dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten);
task_io_account_write(nwritten);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (0);
}
int error;
boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
+ if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
+ return (error);
error = zfs_getacl(zp, vsecp, skipaclchk, cr);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
zfsvfs_t *zfsvfs = ZTOZSB(zp);
int error;
boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
- zilog_t *zilog = zfsvfs->z_log;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
+ zilog_t *zilog;
+ if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
+ return (error);
+ zilog = zfsvfs->z_log;
error = zfs_setacl(zp, vsecp, skipaclchk, cr);
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
zil_commit(zilog, 0);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
uint64_t zp_gen;
ASSERT3P(lwb, !=, NULL);
- ASSERT3P(zio, !=, NULL);
ASSERT3U(size, !=, 0);
/*
return (SET_ERROR(ENOENT));
}
- zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
+ zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
zgd->zgd_lwb = lwb;
zgd->zgd_private = zp;
}
ASSERT(error == 0 || error == ENOENT);
} else { /* indirect write */
+ ASSERT3P(zio, !=, NULL);
/*
* Have to lock the whole block to ensure when it's
* written out and its checksum is being calculated
}
#endif
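+ /*
+ * For an indirect write the data is already dirty in
+ * memory, so hold the dbuf without a redundant read
+ * from disk.
+ */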
if (error == 0)
- error = dmu_buf_hold(os, object, offset, zgd, &db,
- DMU_READ_NO_PREFETCH);
+ error = dmu_buf_hold_noread(os, object, offset, zgd,
+ &db);
if (error == 0) {
blkptr_t *bp = &lr->lr_blkptr;
kmem_free(zgd, sizeof (zgd_t));
}
+static int
+zfs_enter_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag)
+{
+ int error;
+
+ /* Swap so that zfs_enter() is always called in pointer order. */
+ if (zfsvfs1 > zfsvfs2) {
+ zfsvfs_t *tmpzfsvfs;
+
+ tmpzfsvfs = zfsvfs2;
+ zfsvfs2 = zfsvfs1;
+ zfsvfs1 = tmpzfsvfs;
+ }
+
+ error = zfs_enter(zfsvfs1, tag);
+ if (error != 0)
+ return (error);
+ if (zfsvfs1 != zfsvfs2) {
+ error = zfs_enter(zfsvfs2, tag);
+ if (error != 0) {
+ zfs_exit(zfsvfs1, tag);
+ return (error);
+ }
+ }
+
+ return (0);
+}
+
+static void
+zfs_exit_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag)
+{
+
+ zfs_exit(zfsvfs1, tag);
+ if (zfsvfs1 != zfsvfs2)
+ zfs_exit(zfsvfs2, tag);
+}
+
+/*
+ * We split each clone request into chunks that can fit into a single ZIL
+ * log entry. Each ZIL log entry can fit 130816 bytes for a block cloning
+ * operation (see zil_max_log_data() and zfs_log_clone_range()). This gives
+ * us room for storing 1022 block pointers (130816 / 128 bytes per blkptr_t).
+ *
+ * On success, the function returns the number of bytes copied in *lenp.
+ * Note that it does not return how many bytes are left to be copied.
+ * Errors caused by file system or BRT limitations are returned as EINVAL:
+ * in most cases the caller passed parameters that do not meet the
+ * requirements even though the file itself could be cloned.
+ */
+int
+zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
+ uint64_t *outoffp, uint64_t *lenp, cred_t *cr)
+{
+ zfsvfs_t *inzfsvfs, *outzfsvfs;
+ objset_t *inos, *outos;
+ zfs_locked_range_t *inlr, *outlr;
+ dmu_buf_impl_t *db;
+ dmu_tx_t *tx;
+ zilog_t *zilog;
+ uint64_t inoff, outoff, len, done;
+ uint64_t outsize, size;
+ int error;
+ int count = 0;
+ sa_bulk_attr_t bulk[3];
+ uint64_t mtime[2], ctime[2];
+ uint64_t uid, gid, projid;
+ blkptr_t *bps;
+ size_t maxblocks, nbps;
+ uint_t inblksz;
+ uint64_t clear_setid_bits_txg = 0;
+
+ inoff = *inoffp;
+ outoff = *outoffp;
+ len = *lenp;
+ done = 0;
+
+ inzfsvfs = ZTOZSB(inzp);
+ outzfsvfs = ZTOZSB(outzp);
+
+ /*
+ * We may need to call zfs_enter() on two different datasets, hence
+ * the dedicated zfs_enter_two() helper.
+ */
+ error = zfs_enter_two(inzfsvfs, outzfsvfs, FTAG);
+ if (error != 0)
+ return (error);
+
+ inos = inzfsvfs->z_os;
+ outos = outzfsvfs->z_os;
+
+ /*
+ * Both source and destination have to belong to the same storage pool.
+ */
+ if (dmu_objset_spa(inos) != dmu_objset_spa(outos)) {
+ zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+ return (SET_ERROR(EXDEV));
+ }
+
+ /*
+ * outos and inos belong to the same storage pool (checked just
+ * above), so testing the feature flag on either one is enough.
+ */
+ if (!spa_feature_is_enabled(dmu_objset_spa(outos),
+ SPA_FEATURE_BLOCK_CLONING)) {
+ zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+ return (SET_ERROR(EOPNOTSUPP));
+ }
+
+ ASSERT(!outzfsvfs->z_replay);
+
+ /*
+ * Block cloning from an unencrypted dataset into an encrypted
+ * dataset and vice versa is not supported.
+ */
+ if (inos->os_encrypted != outos->os_encrypted) {
+ zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+ return (SET_ERROR(EXDEV));
+ }
+
+ /*
+ * Cloning across encrypted datasets is possible only if they
+ * share the same master key.
+ */
+ if (inos != outos && inos->os_encrypted &&
+ !dmu_objset_crypto_key_equal(inos, outos)) {
+ zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+ return (SET_ERROR(EXDEV));
+ }
+
+ error = zfs_verify_zp(inzp);
+ if (error == 0)
+ error = zfs_verify_zp(outzp);
+ if (error != 0) {
+ zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+ return (error);
+ }
+
+ /*
+ * We don't copy the source file's flags, so we can't allow cloning
+ * of files that are quarantined.
+ */
+ if (inzp->z_pflags & ZFS_AV_QUARANTINED) {
+ zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+ return (SET_ERROR(EACCES));
+ }
+
+ if (inoff >= inzp->z_size) {
+ *lenp = 0;
+ zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+ return (0);
+ }
+ if (len > inzp->z_size - inoff) {
+ len = inzp->z_size - inoff;
+ }
+ if (len == 0) {
+ *lenp = 0;
+ zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+ return (0);
+ }
+
+ /*
+ * Callers might not be able to detect properly that we are read-only,
+ * so check it explicitly here.
+ */
+ if (zfs_is_readonly(outzfsvfs)) {
+ zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+ return (SET_ERROR(EROFS));
+ }
+
+ /*
+ * If the destination file is immutable, return EPERM.
+ * Intentionally allow ZFS_READONLY through here.
+ * See zfs_zaccess_common()
+ */
+ if ((outzp->z_pflags & ZFS_IMMUTABLE) != 0) {
+ zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+ return (SET_ERROR(EPERM));
+ }
+
+ /*
+ * The ranges must not overlap when cloning within the same file.
+ */
+ if (inzp == outzp) {
+ if (inoff < outoff + len && outoff < inoff + len) {
+ zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+ }
+
+ /*
+ * Maintain a predictable lock order: the lower znode first (or the
+ * lower offset within the same file), so concurrent clones cannot
+ * deadlock.
+ */
+ if (inzp < outzp || (inzp == outzp && inoff < outoff)) {
+ inlr = zfs_rangelock_enter(&inzp->z_rangelock, inoff, len,
+ RL_READER);
+ outlr = zfs_rangelock_enter(&outzp->z_rangelock, outoff, len,
+ RL_WRITER);
+ } else {
+ outlr = zfs_rangelock_enter(&outzp->z_rangelock, outoff, len,
+ RL_WRITER);
+ inlr = zfs_rangelock_enter(&inzp->z_rangelock, inoff, len,
+ RL_READER);
+ }
+
+ inblksz = inzp->z_blksz;
+
+ /*
+ * We cannot clone into a file with a different block size if we
+ * can't grow it (the block size is already bigger, the file has
+ * more than one block, or it is not locked for growth). There are
+ * other possible reasons the grow could fail, but we check what we
+ * can before opening the transaction and detect the rest after we
+ * try to do it.
+ */
+ if (inblksz < outzp->z_blksz) {
+ error = SET_ERROR(EINVAL);
+ goto unlock;
+ }
+ if (inblksz != outzp->z_blksz && (outzp->z_size > outzp->z_blksz ||
+ outlr->lr_length != UINT64_MAX)) {
+ error = SET_ERROR(EINVAL);
+ goto unlock;
+ }
+
+ /*
+ * The block size must be a power of 2 if the destination offset is
+ * not 0: a file can never consist of multiple blocks of a
+ * non-power-of-2 size.
+ */
+ if (outoff != 0 && !ISP2(inblksz)) {
+ error = SET_ERROR(EINVAL);
+ goto unlock;
+ }
+
+ /*
+ * Both offsets must be at block boundaries.
+ */
+ if ((inoff % inblksz) != 0 || (outoff % inblksz) != 0) {
+ error = SET_ERROR(EINVAL);
+ goto unlock;
+ }
+ /*
+ * The length must be a multiple of the block size, except at the end
+ * of the file.
+ */
+ if ((len % inblksz) != 0 &&
+ (len < inzp->z_size - inoff || len < outzp->z_size - outoff)) {
+ error = SET_ERROR(EINVAL);
+ goto unlock;
+ }
+
+ /*
+ * If we are copying only one block and it is smaller than the
+ * recordsize property, do not allow the destination to grow beyond
+ * one block if it is not there yet. Otherwise the destination
+ * would get stuck with that block size forever; it can be as small
+ * as 512 bytes, no matter how big the destination grows later.
+ */
+ if (len <= inblksz && inblksz < outzfsvfs->z_max_blksz &&
+ outzp->z_size <= inblksz && outoff + len > inblksz) {
+ error = SET_ERROR(EINVAL);
+ goto unlock;
+ }
+
+ error = zn_rlimit_fsize(outoff + len);
+ if (error != 0) {
+ goto unlock;
+ }
+
+ if (inoff >= MAXOFFSET_T || outoff >= MAXOFFSET_T) {
+ error = SET_ERROR(EFBIG);
+ goto unlock;
+ }
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(outzfsvfs), NULL,
+ &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(outzfsvfs), NULL,
+ &ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(outzfsvfs), NULL,
+ &outzp->z_size, 8);
+
+ zilog = outzfsvfs->z_log;
+ maxblocks = zil_max_log_data(zilog, sizeof (lr_clone_range_t)) /
+ sizeof (bps[0]);
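+ /*
+ * maxblocks is the number of block pointers that fit into a single
+ * TX_CLONE_RANGE record (1022 with the current limits; see the
+ * comment above this function).
+ */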
+
+ uid = KUID_TO_SUID(ZTOUID(outzp));
+ gid = KGID_TO_SGID(ZTOGID(outzp));
+ projid = outzp->z_projid;
+
+ bps = vmem_alloc(sizeof (bps[0]) * maxblocks, KM_SLEEP);
+
+ /*
+ * Clone the file in reasonably sized chunks. Each chunk is cloned
+ * in a separate transaction; this keeps the intent log records small
+ * and allows us to do more fine-grained space accounting.
+ */
+ while (len > 0) {
+ size = MIN(inblksz * maxblocks, len);
+
+ if (zfs_id_overblockquota(outzfsvfs, DMU_USERUSED_OBJECT,
+ uid) ||
+ zfs_id_overblockquota(outzfsvfs, DMU_GROUPUSED_OBJECT,
+ gid) ||
+ (projid != ZFS_DEFAULT_PROJID &&
+ zfs_id_overblockquota(outzfsvfs, DMU_PROJECTUSED_OBJECT,
+ projid))) {
+ error = SET_ERROR(EDQUOT);
+ break;
+ }
+
+ nbps = maxblocks;
+ error = dmu_read_l0_bps(inos, inzp->z_id, inoff, size, bps,
+ &nbps);
+ if (error != 0) {
+ /*
+ * If we are trying to clone a block that was created
+ * in the current transaction group, error will be
+ * EAGAIN here, which we can just return to the caller
+ * so it can fall back if it likes.
+ */
+ break;
+ }
+
+ /*
+ * Start a transaction.
+ */
+ tx = dmu_tx_create(outos);
+ dmu_tx_hold_sa(tx, outzp->z_sa_hdl, B_FALSE);
+ db = (dmu_buf_impl_t *)sa_get_db(outzp->z_sa_hdl);
+ DB_DNODE_ENTER(db);
+ dmu_tx_hold_clone_by_dnode(tx, DB_DNODE(db), outoff, size);
+ DB_DNODE_EXIT(db);
+ zfs_sa_upgrade_txholds(tx, outzp);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error != 0) {
+ dmu_tx_abort(tx);
+ break;
+ }
+
+ /*
+ * Copy source znode's block size. This is done only if the
+ * whole znode is locked (see zfs_rangelock_cb()) and only
+ * on the first iteration since zfs_rangelock_reduce() will
+ * shrink down lr_length to the appropriate size.
+ */
+ if (outlr->lr_length == UINT64_MAX) {
+ zfs_grow_blocksize(outzp, inblksz, tx);
+
+ /*
+ * Block growth may fail for many reasons we cannot
+ * predict here. If it happens, the cloning is doomed.
+ */
+ if (inblksz != outzp->z_blksz) {
+ error = SET_ERROR(EINVAL);
+ dmu_tx_abort(tx);
+ break;
+ }
+
+ /*
+ * Round the range lock up to the block boundary, so we
+ * prevent appends until we are done.
+ */
+ zfs_rangelock_reduce(outlr, outoff,
+ ((len - 1) / inblksz + 1) * inblksz);
+ }
+
+ error = dmu_brt_clone(outos, outzp->z_id, outoff, size, tx,
+ bps, nbps);
+ if (error != 0) {
+ dmu_tx_commit(tx);
+ break;
+ }
+
+ zfs_clear_setid_bits_if_necessary(outzfsvfs, outzp, cr,
+ &clear_setid_bits_txg, tx);
+
+ zfs_tstamp_update_setup(outzp, CONTENT_MODIFIED, mtime, ctime);
+
+ /*
+ * Update the file size (z_size) if it has changed;
+ * account for possible concurrent updates.
+ */
+ while ((outsize = outzp->z_size) < outoff + size) {
+ (void) atomic_cas_64(&outzp->z_size, outsize,
+ outoff + size);
+ }
+
+ error = sa_bulk_update(outzp->z_sa_hdl, bulk, count, tx);
+
+ zfs_log_clone_range(zilog, tx, TX_CLONE_RANGE, outzp, outoff,
+ size, inblksz, bps, nbps);
+
+ dmu_tx_commit(tx);
+
+ if (error != 0)
+ break;
+
+ inoff += size;
+ outoff += size;
+ len -= size;
+ done += size;
+ }
+
+ vmem_free(bps, sizeof (bps[0]) * maxblocks);
+ zfs_znode_update_vfs(outzp);
+
+unlock:
+ zfs_rangelock_exit(outlr);
+ zfs_rangelock_exit(inlr);
+
+ if (done > 0) {
+ /*
+ * If we have made at least partial progress, reset the error.
+ */
+ error = 0;
+
+ ZFS_ACCESSTIME_STAMP(inzfsvfs, inzp);
+
+ if (outos->os_sync == ZFS_SYNC_ALWAYS) {
+ zil_commit(zilog, outzp->z_id);
+ }
+
+ *inoffp += done;
+ *outoffp += done;
+ *lenp = done;
+ } else {
+ /*
+ * If we made no progress, there must be a good reason.
+ * EOF is handled explicitly above, before the loop.
+ */
+ ASSERT3S(error, !=, 0);
+ }
+
+ zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+
+ return (error);
+}
+
+/*
+ * The usual pattern would be to call zfs_clone_range() from
+ * zfs_replay_clone(), but we cannot do that, because when replaying we
+ * don't have the source znode available. This is why we need a dedicated
+ * replay function.
+ */
+int
+zfs_clone_range_replay(znode_t *zp, uint64_t off, uint64_t len, uint64_t blksz,
+ const blkptr_t *bps, size_t nbps)
+{
+ zfsvfs_t *zfsvfs;
+ dmu_buf_impl_t *db;
+ dmu_tx_t *tx;
+ int error;
+ int count = 0;
+ sa_bulk_attr_t bulk[3];
+ uint64_t mtime[2], ctime[2];
+
+ ASSERT3U(off, <, MAXOFFSET_T);
+ ASSERT3U(len, >, 0);
+ ASSERT3U(nbps, >, 0);
+
+ zfsvfs = ZTOZSB(zp);
+
+ ASSERT(spa_feature_is_enabled(dmu_objset_spa(zfsvfs->z_os),
+ SPA_FEATURE_BLOCK_CLONING));
+
+ if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
+ return (error);
+
+ ASSERT(zfsvfs->z_replay);
+ ASSERT(!zfs_is_readonly(zfsvfs));
+
+ if ((off % blksz) != 0) {
+ zfs_exit(zfsvfs, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &zp->z_size, 8);
+
+ /*
+ * Start a transaction.
+ */
+ tx = dmu_tx_create(zfsvfs->z_os);
+
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
+ DB_DNODE_ENTER(db);
+ dmu_tx_hold_clone_by_dnode(tx, DB_DNODE(db), off, len);
+ DB_DNODE_EXIT(db);
+ zfs_sa_upgrade_txholds(tx, zp);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error != 0) {
+ dmu_tx_abort(tx);
+ zfs_exit(zfsvfs, FTAG);
+ return (error);
+ }
+
+ if (zp->z_blksz < blksz)
+ zfs_grow_blocksize(zp, blksz, tx);
+
+ dmu_brt_clone(zfsvfs->z_os, zp->z_id, off, len, tx, bps, nbps);
+
+ zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
+
+ if (zp->z_size < off + len)
+ zp->z_size = off + len;
+
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+
+ /*
+ * zil_replaying() not only checks whether we are replaying the ZIL,
+ * it also updates the ZIL header to record replay progress.
+ */
+ VERIFY(zil_replaying(zfsvfs->z_log, tx));
+
+ dmu_tx_commit(tx);
+
+ zfs_znode_update_vfs(zp);
+
+ zfs_exit(zfsvfs, FTAG);
+
+ return (error);
+}
+
EXPORT_SYMBOL(zfs_access);
EXPORT_SYMBOL(zfs_fsync);
EXPORT_SYMBOL(zfs_holey);
EXPORT_SYMBOL(zfs_write);
EXPORT_SYMBOL(zfs_getsecattr);
EXPORT_SYMBOL(zfs_setsecattr);
+EXPORT_SYMBOL(zfs_clone_range);
+EXPORT_SYMBOL(zfs_clone_range_replay);
-ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, ULONG, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, U64, ZMOD_RW,
"Bytes to read per chunk");