#define LONG_FID_LEN (sizeof (zfid_long_t) - sizeof (uint16_t))
extern int zfs_super_owner;
-extern int zfs_bclone_enabled;
extern void zfs_init(void);
extern void zfs_fini(void);
typedef struct zfsvfs zfsvfs_t;
struct znode;
-extern int zfs_bclone_enabled;
-
/*
* This structure emulates the vfs_t from other platforms. It's purpose
* is to facilitate the handling of mount options and minimize structural
#ifndef _SYS_FS_ZFS_VNOPS_H
#define _SYS_FS_ZFS_VNOPS_H
+
#include <sys/zfs_vnops_os.h>
+extern int zfs_bclone_enabled;
+
extern int zfs_fsync(znode_t *, int, cred_t *);
extern int zfs_read(znode_t *, zfs_uio_t *, int, cred_t *);
extern int zfs_write(znode_t *, zfs_uio_t *, int, cred_t *);
If this setting is 0, then even if feature@block_cloning is enabled,
attempts to clone blocks will act as though the feature is disabled.
.
+.It Sy zfs_bclone_wait_dirty Ns = Ns Sy 0 Ns | Ns 1 Pq int
+When set to 1 the FICLONE and FICLONERANGE ioctls wait for dirty data to be
+written to disk.
+This allows the clone operation to reliably succeed when a file is
+modified and then immediately cloned.
+For small files this may be slower than making a copy of the file.
+Therefore, this setting defaults to 0 which causes a clone operation to
+immediately fail when encountering a dirty block.
+.
.It Sy zfs_blake3_impl Ns = Ns Sy fastest Pq string
Select a BLAKE3 implementation.
.Pp
SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0,
"Debug level");
-int zfs_bclone_enabled = 1;
-SYSCTL_INT(_vfs_zfs, OID_AUTO, bclone_enabled, CTLFLAG_RWTUN,
- &zfs_bclone_enabled, 0, "Enable block cloning");
-
struct zfs_jailparam {
int mount_snapshot;
};
/* CSTYLED */
module_param(zfs_delete_blocks, ulong, 0644);
MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async");
-
-/* CSTYLED */
-module_param(zfs_bclone_enabled, uint, 0644);
-MODULE_PARM_DESC(zfs_bclone_enabled, "Enable block cloning");
-
#endif
#include <sys/zfs_vnops.h>
#include <sys/zfeature.h>
-int zfs_bclone_enabled = 1;
-
/*
* Clone part of a file via block cloning.
*
* care of that depending on how it was called.
*/
static ssize_t
-__zpl_clone_file_range(struct file *src_file, loff_t src_off,
+zpl_clone_file_range_impl(struct file *src_file, loff_t src_off,
struct file *dst_file, loff_t dst_off, size_t len)
{
struct inode *src_i = file_inode(src_file);
{
ssize_t ret;
+ /* Flags is reserved for future extensions and must be zero. */
if (flags != 0)
return (-EINVAL);
- /* Try to do it via zfs_clone_range() */
- ret = __zpl_clone_file_range(src_file, src_off,
+ /* Try to do it via zfs_clone_range() and allow shortening. */
+ ret = zpl_clone_file_range_impl(src_file, src_off,
dst_file, dst_off, len);
#ifdef HAVE_VFS_GENERIC_COPY_FILE_RANGE
* FIDEDUPERANGE is for turning a non-clone into a clone, that is, compare the
* range in both files and if they're the same, arrange for them to be backed
* by the same storage.
+ *
+ * REMAP_FILE_CAN_SHORTEN lets us know we can clone less than the given range
+ * if we want. It's designed for filesystems that may need to shorten the
+ * length for alignment, EOF, or any other requirement. ZFS may shorten the
+ * request when there is outstanding dirty data which hasn't been written.
*/
loff_t
zpl_remap_file_range(struct file *src_file, loff_t src_off,
if (flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_CAN_SHORTEN))
return (-EINVAL);
- /*
- * REMAP_FILE_CAN_SHORTEN lets us know we can clone less than the given
- * range if we want. Its designed for filesystems that make data past
- * EOF available, and don't want it to be visible in both files. ZFS
- * doesn't do that, so we just turn the flag off.
- */
- flags &= ~REMAP_FILE_CAN_SHORTEN;
-
+ /* No support for dedup yet */
if (flags & REMAP_FILE_DEDUP)
- /* No support for dedup yet */
return (-EOPNOTSUPP);
/* Zero length means to clone everything to the end of the file */
if (len == 0)
len = i_size_read(file_inode(src_file)) - src_off;
- return (__zpl_clone_file_range(src_file, src_off,
- dst_file, dst_off, len));
+ ssize_t ret = zpl_clone_file_range_impl(src_file, src_off,
+ dst_file, dst_off, len);
+
+ if (!(flags & REMAP_FILE_CAN_SHORTEN) && ret >= 0 && ret != len)
+ ret = -EINVAL;
+
+ return (ret);
}
#endif /* HAVE_VFS_REMAP_FILE_RANGE */
if (len == 0)
len = i_size_read(file_inode(src_file)) - src_off;
- return (__zpl_clone_file_range(src_file, src_off,
- dst_file, dst_off, len));
+ /* The entire length must be cloned or this is an error. */
+ ssize_t ret = zpl_clone_file_range_impl(src_file, src_off,
+ dst_file, dst_off, len);
+
+ if (ret >= 0 && ret != len)
+ ret = -EINVAL;
+
+ return (ret);
}
#endif /* HAVE_VFS_CLONE_FILE_RANGE || HAVE_VFS_FILE_OPERATIONS_EXTEND */
size_t len = i_size_read(file_inode(src_file));
- ssize_t ret =
- __zpl_clone_file_range(src_file, 0, dst_file, 0, len);
+ ssize_t ret = zpl_clone_file_range_impl(src_file, 0, dst_file, 0, len);
fput(src_file);
if (len == 0)
len = i_size_read(file_inode(src_file)) - fcr.fcr_src_offset;
- ssize_t ret = __zpl_clone_file_range(src_file, fcr.fcr_src_offset,
+ ssize_t ret = zpl_clone_file_range_impl(src_file, fcr.fcr_src_offset,
dst_file, fcr.fcr_dest_offset, len);
fput(src_file);
#include <sys/zfs_vfsops.h>
#include <sys/zfs_znode.h>
+/*
+ * Enable the experimental block cloning feature. If this setting is 0, then
+ * even if feature@block_cloning is enabled, attempts to clone blocks will act
+ * as though the feature is disabled.
+ */
+int zfs_bclone_enabled = 1;
+
+/*
+ * When set zfs_clone_range() waits for dirty data to be written to disk.
+ * This allows the clone operation to reliably succeed when a file is modified
+ * and then immediately cloned. For small files this may be slower than making
+ * a copy of the file and is therefore not the default. However, in certain
+ * scenarios this behavior may be desirable so a tunable is provided.
+ */
+static int zfs_bclone_wait_dirty = 0;
+
+/*
+ * Maximum bytes to read per chunk in zfs_read().
+ */
+static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024;
int
zfs_fsync(znode_t *zp, int syncflag, cred_t *cr)
return (error);
}
-static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024; /* Tunable */
-
/*
* Read bytes from specified file into supplied buffer.
*
size_t maxblocks, nbps;
uint_t inblksz;
uint64_t clear_setid_bits_txg = 0;
+ uint64_t last_synced_txg = 0;
inoff = *inoffp;
outoff = *outoffp;
}
nbps = maxblocks;
+ last_synced_txg = spa_last_synced_txg(dmu_objset_spa(inos));
error = dmu_read_l0_bps(inos, inzp->z_id, inoff, size, bps,
&nbps);
if (error != 0) {
/*
* If we are trying to clone a block that was created
- * in the current transaction group, error will be
- * EAGAIN here, which we can just return to the caller
- * so it can fallback if it likes.
+ * in the current transaction group, the error will be
+ * EAGAIN here. Based on zfs_bclone_wait_dirty either
+ * return a shortened range to the caller so it can
+ * fallback, or wait for the next TXG and check again.
*/
+ if (error == EAGAIN && zfs_bclone_wait_dirty) {
+ txg_wait_synced(dmu_objset_pool(inos),
+ last_synced_txg + 1);
+ continue;
+ }
+
break;
}
ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, U64, ZMOD_RW,
"Bytes to read per chunk");
+
+ZFS_MODULE_PARAM(zfs, zfs_, bclone_enabled, INT, ZMOD_RW,
+ "Enable block cloning");
+
+ZFS_MODULE_PARAM(zfs, zfs_, bclone_wait_dirty, INT, ZMOD_RW,
+ "Wait for dirty blocks when cloning");
tags = ['functional', 'compression']
[tests/functional/cp_files]
-tests = ['cp_files_001_pos', 'cp_stress']
+tests = ['cp_files_001_pos', 'cp_files_002_pos', 'cp_stress']
tags = ['functional', 'cp_files']
[tests/functional/crtime]
'cli_root/zpool_wait/zpool_wait_trim_cancel': ['SKIP', trim_reason],
'cli_root/zpool_wait/zpool_wait_trim_flag': ['SKIP', trim_reason],
'cli_root/zfs_unshare/zfs_unshare_008_pos': ['SKIP', na_reason],
+ 'cp_files/cp_files_002_pos': ['SKIP', na_reason],
'link_count/link_count_001': ['SKIP', na_reason],
'casenorm/mixed_create_failure': ['FAIL', 13215],
'mmap/mmap_sync_001_pos': ['SKIP', na_reason],
['SKIP', cfr_reason],
'cli_root/zfs_rename/zfs_rename_002_pos': ['FAIL', known_reason],
'cli_root/zpool_reopen/zpool_reopen_003_pos': ['FAIL', known_reason],
+ 'cp_files/cp_files_002_pos': ['SKIP', cfr_reason],
'fault/auto_online_002_pos': ['FAIL', 11889],
'fault/auto_replace_001_pos': ['FAIL', 14851],
'fault/auto_spare_002_pos': ['FAIL', 11889],
VOL_RECURSIVE vol.recursive UNSUPPORTED
VOL_USE_BLK_MQ UNSUPPORTED zvol_use_blk_mq
BCLONE_ENABLED zfs_bclone_enabled zfs_bclone_enabled
+BCLONE_WAIT_DIRTY bclone_wait_dirty zfs_bclone_wait_dirty
XATTR_COMPAT xattr_compat zfs_xattr_compat
ZEVENT_LEN_MAX zevent.len_max zfs_zevent_len_max
ZEVENT_RETAIN_MAX zevent.retain_max zfs_zevent_retain_max
functional/compression/setup.ksh \
functional/cp_files/cleanup.ksh \
functional/cp_files/cp_files_001_pos.ksh \
+ functional/cp_files/cp_files_002_pos.ksh \
functional/cp_files/cp_stress.ksh \
functional/cp_files/setup.ksh \
functional/crtime/cleanup.ksh \
--- /dev/null
+#! /bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2024 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/bclone/bclone_common.kshlib
+
+#
+# DESCRIPTION:
+# Verify all cp --reflink modes work with modified file.
+#
+# STRATEGY:
+# 1. Verify "cp --reflink=never|auto|always" behaves as expected.
+# Two different modes of operation are tested.
+#
+# a. zfs_bclone_wait_dirty=0: FICLONE and FICLONERANGE fail with EINVAL
+# when there are dirty blocks which cannot be immediately cloned.
+# This is the default behavior.
+#
+# b. zfs_bclone_wait_dirty=1: FICLONE and FICLONERANGE wait for
+# dirty blocks to be written to disk allowing the clone to succeed.
+# The downside to this is it may be slow which depending on the
+# situation may defeat the point of making a clone.
+#
+
+verify_runnable "global"
+verify_block_cloning
+
+if ! is_linux; then
+ log_unsupported "cp --reflink is a GNU coreutils option"
+fi
+
+# Exit handler: destroy the test dataset if it exists and put the
+# BCLONE_WAIT_DIRTY tunable back to 0.
+function cleanup
+{
+	# Use a single '$' here.  '$$' is the shell's PID, so '$$TESTPOOL'
+	# would expand to "<pid>TESTPOOL" and name the wrong dataset.
+	datasetexists $TESTPOOL/cp-reflink && \
+	    destroy_dataset $TESTPOOL/cp-reflink -f
+	log_must set_tunable32 BCLONE_WAIT_DIRTY 0
+}
+
+# Compare the digests reported by sha256digest for $1 (source) and
+# $2 (copy).  If they differ, list $CP_TESTDIR and fail the test.
+function verify_copy
+{
+	src_cksum=$(sha256digest $1)
+	dst_cksum=$(sha256digest $2)
+
+	# Digests agree: the copy is good, nothing else to do.
+	[[ "$src_cksum" == "$dst_cksum" ]] && return 0
+
+	log_must ls -l $CP_TESTDIR
+	log_fail "checksum mismatch ($src_cksum != $dst_cksum)"
+}
+
+log_assert "Verify all cp --reflink modes work with modified file"
+
+log_onexit cleanup
+
+SRC_FILE=src.data
+DST_FILE=dst.data
+
+# Source file size in records, chosen at random from 1..2047.  It must
+# never be 0: the value is used as a dd block count when creating the
+# file and as the modulus when picking a random overwrite offset, so 0
+# would give an empty file and a division by zero.
+SRC_SIZE=$((1 + $RANDOM % 2047))
+
+# A smaller recordsize is used merely to speed up the test.
+RECORDSIZE=4096
+
+log_must zfs create -o recordsize=$RECORDSIZE $TESTPOOL/cp-reflink
+CP_TESTDIR=$(get_prop mountpoint $TESTPOOL/cp-reflink)
+
+log_must cd $CP_TESTDIR
+
+# Never wait on dirty blocks (zfs_bclone_wait_dirty=0)
+log_must set_tunable32 BCLONE_WAIT_DIRTY 0
+
+# With zfs_bclone_wait_dirty=0 a clone of freshly written (dirty) data is
+# expected to fail, so "always" must fail while "never" and "auto" must
+# still produce an identical copy.
+for mode in "never" "auto" "always"; do
+	log_note "Checking 'cp --reflink=$mode'"
+
+	# Create a new file and immediately copy it.
+	log_must dd if=/dev/urandom of=$SRC_FILE bs=$RECORDSIZE count=$SRC_SIZE
+
+	if [[ "$mode" == "always" ]]; then
+		log_mustnot cp --reflink=$mode $SRC_FILE $DST_FILE
+		log_must ls -l $CP_TESTDIR
+	else
+		log_must cp --reflink=$mode $SRC_FILE $DST_FILE
+		verify_copy $SRC_FILE $DST_FILE
+	fi
+	log_must rm -f $DST_FILE
+
+	# Append to an existing file and immediately copy it.
+	sync_pool $TESTPOOL
+	log_must dd if=/dev/urandom of=$SRC_FILE bs=$RECORDSIZE seek=$SRC_SIZE \
+	    count=1 conv=notrunc
+	if [[ "$mode" == "always" ]]; then
+		log_mustnot cp --reflink=$mode $SRC_FILE $DST_FILE
+		log_must ls -l $CP_TESTDIR
+	else
+		log_must cp --reflink=$mode $SRC_FILE $DST_FILE
+		verify_copy $SRC_FILE $DST_FILE
+	fi
+	log_must rm -f $DST_FILE
+
+	# Overwrite a random range of an existing file and immediately copy it.
+	# The block count is 1..16, never 0: after sync_pool a zero-length
+	# write would leave the file clean, the "always" clone would then
+	# succeed, and the log_mustnot below would fail spuriously.
+	sync_pool $TESTPOOL
+	log_must dd if=/dev/urandom of=$SRC_FILE bs=$((RECORDSIZE / 2)) \
+	    seek=$(($RANDOM % $SRC_SIZE)) count=$((1 + $RANDOM % 16)) \
+	    conv=notrunc
+	if [[ "$mode" == "always" ]]; then
+		log_mustnot cp --reflink=$mode $SRC_FILE $DST_FILE
+		log_must ls -l $CP_TESTDIR
+	else
+		log_must cp --reflink=$mode $SRC_FILE $DST_FILE
+		verify_copy $SRC_FILE $DST_FILE
+	fi
+	log_must rm -f $SRC_FILE $DST_FILE
+done
+
+# Wait on dirty blocks (zfs_bclone_wait_dirty=1)
+log_must set_tunable32 BCLONE_WAIT_DIRTY 1
+
+# With zfs_bclone_wait_dirty=1 the clone waits for dirty data to be
+# written, so every mode, including "always", must succeed and produce
+# an identical copy.
+for mode in "never" "auto" "always"; do
+	log_note "Checking 'cp --reflink=$mode'"
+
+	# Create a new file and immediately copy it.
+	log_must dd if=/dev/urandom of=$SRC_FILE bs=$RECORDSIZE count=$SRC_SIZE
+	log_must cp --reflink=$mode $SRC_FILE $DST_FILE
+	verify_copy $SRC_FILE $DST_FILE
+	log_must rm -f $DST_FILE
+
+	# Append to an existing file and immediately copy it.
+	log_must dd if=/dev/urandom of=$SRC_FILE bs=$RECORDSIZE seek=$SRC_SIZE \
+	    count=1 conv=notrunc
+	log_must cp --reflink=$mode $SRC_FILE $DST_FILE
+	verify_copy $SRC_FILE $DST_FILE
+	log_must rm -f $DST_FILE
+
+	# Overwrite a random range of an existing file and immediately copy it.
+	# The block count is 1..16, never 0, so this step always dirties at
+	# least one block instead of occasionally being a no-op.
+	log_must dd if=/dev/urandom of=$SRC_FILE bs=$((RECORDSIZE / 2)) \
+	    seek=$(($RANDOM % $SRC_SIZE)) count=$((1 + $RANDOM % 16)) \
+	    conv=notrunc
+	log_must cp --reflink=$mode $SRC_FILE $DST_FILE
+	verify_copy $SRC_FILE $DST_FILE
+	log_must rm -f $SRC_FILE $DST_FILE
+done
+
+log_pass