]> git.proxmox.com Git - mirror_zfs.git/commitdiff
Add Linux posix_fadvise support
authorFinix1979 <yanchongwen@hotmail.com>
Thu, 8 Sep 2022 17:29:41 +0000 (01:29 +0800)
committerGitHub <noreply@github.com>
Thu, 8 Sep 2022 17:29:41 +0000 (10:29 -0700)
The purpose of this PR is to accept posix_fadvise() calls from userland
to do read-ahead on demand.

It could dramatically improve sequential read performance especially
when primarycache is set to metadata or zfs_prefetch_disable is 1.

If the file is mmapped, generic_fadvise is also called for page cache
read-ahead in addition to dmu_prefetch.

Only POSIX_FADV_WILLNEED and POSIX_FADV_SEQUENTIAL are supported in
this PR currently.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Finix Yan <yancw@info2soft.com>
Closes #13694

15 files changed:
config/kernel-fadvise.m4 [new file with mode: 0644]
config/kernel-generic_fadvise.m4 [new file with mode: 0644]
config/kernel.m4
module/os/linux/zfs/zpl_file.c
tests/runfiles/linux.run
tests/zfs-tests/cmd/.gitignore
tests/zfs-tests/cmd/Makefile.am
tests/zfs-tests/cmd/file/file_fadvise.c [new file with mode: 0644]
tests/zfs-tests/include/commands.cfg
tests/zfs-tests/tests/Makefile.am
tests/zfs-tests/tests/functional/checksum/filetest_002_pos.ksh
tests/zfs-tests/tests/functional/fadvise/cleanup.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/fadvise/fadvise_sequential.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/fadvise/setup.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/fault/auto_spare_002_pos.ksh

diff --git a/config/kernel-fadvise.m4 b/config/kernel-fadvise.m4
new file mode 100644 (file)
index 0000000..08912de
--- /dev/null
@@ -0,0 +1,23 @@
+dnl #
+dnl # Linux 4.19 API
+dnl #
+dnl # Compile test for a "fadvise" member in struct file_operations.
+dnl # The member is only initialized to NULL, so the test depends solely
+dnl # on whether the field exists.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_FADVISE], [
+       ZFS_LINUX_TEST_SRC([file_fadvise], [
+               #include <linux/fs.h>
+
+               static const struct file_operations
+                   fops __attribute__ ((unused)) = {
+                       .fadvise = NULL,
+               };
+       ],[])
+])
+
+dnl #
+dnl # Report the outcome of the file_fadvise compile test and define
+dnl # HAVE_FILE_FADVISE when it succeeded.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_FADVISE], [
+       AC_MSG_CHECKING([whether fops->fadvise() exists])
+       ZFS_LINUX_TEST_RESULT([file_fadvise], [
+               AC_MSG_RESULT(yes)
+               AC_DEFINE(HAVE_FILE_FADVISE, 1, [fops->fadvise() exists])
+       ],[
+               AC_MSG_RESULT(no)
+       ])
+])
diff --git a/config/kernel-generic_fadvise.m4 b/config/kernel-generic_fadvise.m4
new file mode 100644 (file)
index 0000000..8d12206
--- /dev/null
@@ -0,0 +1,27 @@
+dnl #
+dnl # 5.3 API change
+dnl # The generic_fadvise() function is present since 4.19 kernel
+dnl # but it was not exported until Linux 5.3.
+dnl #
+dnl # The test program calls generic_fadvise() with dummy arguments
+dnl # (a NULL file pointer and zero offset, length and advice); it only
+dnl # needs to compile, it is never executed.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_GENERIC_FADVISE], [
+       ZFS_LINUX_TEST_SRC([generic_fadvise], [
+               #include <linux/fs.h>
+       ], [
+               struct file *fp __attribute__ ((unused)) = NULL;
+               loff_t offset __attribute__ ((unused)) = 0;
+               loff_t len __attribute__ ((unused)) = 0;
+               int advise __attribute__ ((unused)) = 0;
+               generic_fadvise(fp, offset, len, advise);
+       ])
+])
+
+dnl #
+dnl # Report whether generic_fadvise() is usable and, if so, define
+dnl # HAVE_GENERIC_FADVISE.  The symbol name and mm/fadvise.c are handed
+dnl # to ZFS_LINUX_TEST_RESULT_SYMBOL together with the compile test
+dnl # name, since the function existed before it was exported.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_GENERIC_FADVISE], [
+       AC_MSG_CHECKING([whether generic_fadvise() is available])
+       ZFS_LINUX_TEST_RESULT_SYMBOL([generic_fadvise],
+       [generic_fadvise], [mm/fadvise.c], [
+               AC_MSG_RESULT(yes)
+               AC_DEFINE(HAVE_GENERIC_FADVISE, 1,
+                   [generic_fadvise() is available])
+       ],[
+               AC_MSG_RESULT(no)
+       ])
+])
index 1f274cbe4f30342889f001dbcae6176f72d56a90..6aad2cf88e0208f574027534d14aa45f5567aa8e 100644 (file)
@@ -42,6 +42,8 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
        ZFS_AC_KERNEL_SRC_ACCESS_OK_TYPE
        ZFS_AC_KERNEL_SRC_PDE_DATA
        ZFS_AC_KERNEL_SRC_FALLOCATE
+       ZFS_AC_KERNEL_SRC_FADVISE
+       ZFS_AC_KERNEL_SRC_GENERIC_FADVISE
        ZFS_AC_KERNEL_SRC_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE
        ZFS_AC_KERNEL_SRC_RWSEM
        ZFS_AC_KERNEL_SRC_SCHED
@@ -161,6 +163,8 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
        ZFS_AC_KERNEL_OBJTOOL
        ZFS_AC_KERNEL_PDE_DATA
        ZFS_AC_KERNEL_FALLOCATE
+       ZFS_AC_KERNEL_FADVISE
+       ZFS_AC_KERNEL_GENERIC_FADVISE
        ZFS_AC_KERNEL_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE
        ZFS_AC_KERNEL_RWSEM
        ZFS_AC_KERNEL_SCHED
index 43b7fb60a99708d4f790834248602c75933b6388..b0d9f37a3ec0a22fe80e2b5298c5ace8f70ead99 100644 (file)
@@ -27,6 +27,7 @@
 #ifdef CONFIG_COMPAT
 #include <linux/compat.h>
 #endif
+#include <linux/fs.h>
 #include <sys/file.h>
 #include <sys/dmu_objset.h>
 #include <sys/zfs_znode.h>
@@ -37,6 +38,9 @@
     defined(HAVE_VFS_FILEMAP_DIRTY_FOLIO)
 #include <linux/pagemap.h>
 #endif
+#ifdef HAVE_FILE_FADVISE
+#include <linux/fadvise.h>
+#endif
 #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
 #include <linux/writeback.h>
 #endif
@@ -906,6 +910,61 @@ zpl_ioctl_getversion(struct file *filp, void __user *arg)
        return (copy_to_user(arg, &generation, sizeof (generation)));
 }
 
+#ifdef HAVE_FILE_FADVISE
+static int
+zpl_fadvise(struct file *filp, loff_t offset, loff_t len, int advice)
+{
+       struct inode *ip = file_inode(filp);
+       znode_t *zp = ITOZ(ip);
+       zfsvfs_t *zfsvfs = ITOZSB(ip);
+       objset_t *os = zfsvfs->z_os;
+       int error = 0;
+
+       if (S_ISFIFO(ip->i_mode))
+               return (-ESPIPE);
+
+       if (offset < 0 || len < 0)
+               return (-EINVAL);
+
+       ZFS_ENTER(zfsvfs);
+       ZFS_VERIFY_ZP(zp);
+
+       switch (advice) {
+       case POSIX_FADV_SEQUENTIAL:
+       case POSIX_FADV_WILLNEED:
+#ifdef HAVE_GENERIC_FADVISE
+               if (zn_has_cached_data(zp))
+                       error = generic_fadvise(filp, offset, len, advice);
+#endif
+               /*
+                * Pass on the caller's size directly, but note that
+                * dmu_prefetch_max will effectively cap it.  If there
+                * really is a larger sequential access pattern, perhaps
+                * dmu_zfetch will detect it.
+                */
+               if (len == 0)
+                       len = i_size_read(ip) - offset;
+
+               dmu_prefetch(os, zp->z_id, 0, offset, len,
+                   ZIO_PRIORITY_ASYNC_READ);
+               break;
+       case POSIX_FADV_NORMAL:
+       case POSIX_FADV_RANDOM:
+       case POSIX_FADV_DONTNEED:
+       case POSIX_FADV_NOREUSE:
+               /* ignored for now */
+               break;
+       default:
+               error = -EINVAL;
+               break;
+       }
+
+       ZFS_EXIT(zfsvfs);
+
+       return (error);
+}
+#endif /* HAVE_FILE_FADVISE */
+
 #define        ZFS_FL_USER_VISIBLE     (FS_FL_USER_VISIBLE | ZFS_PROJINHERIT_FL)
 #define        ZFS_FL_USER_MODIFIABLE  (FS_FL_USER_MODIFIABLE | ZFS_PROJINHERIT_FL)
 
@@ -1259,6 +1318,9 @@ const struct file_operations zpl_file_operations = {
        .aio_fsync      = zpl_aio_fsync,
 #endif
        .fallocate      = zpl_fallocate,
+#ifdef HAVE_FILE_FADVISE
+       .fadvise        = zpl_fadvise,
+#endif
        .unlocked_ioctl = zpl_ioctl,
 #ifdef CONFIG_COMPAT
        .compat_ioctl   = zpl_compat_ioctl,
index 9b32e73afb1e51f48e2b447bcd08672b7d559b45..09dfb5eb1e1d299ae917225991ef431dd60be2fe 100644 (file)
@@ -89,6 +89,10 @@ tags = ['functional', 'devices']
 tests = ['events_001_pos', 'events_002_pos', 'zed_rc_filter', 'zed_fd_spill']
 tags = ['functional', 'events']
 
+[tests/functional/fadvise:Linux]
+tests = ['fadvise_sequential']
+tags = ['functional', 'fadvise']
+
 [tests/functional/fallocate:Linux]
 tests = ['fallocate_prealloc', 'fallocate_zero-range']
 tags = ['functional', 'fallocate']
index 20d1382532bd3d8a55a0ba6488bb6708caed4786..1fd54c1dd5101db6eface81a66a75e1a6f549b05 100644 (file)
@@ -4,6 +4,7 @@
 /devname2devid
 /dir_rd_update
 /draid
+/file_fadvise
 /file_append
 /file_check
 /file_trunc
index 3c8faf5afbbb98013387e667ed9562ccec785274..c19c870cf6986595024da5aa41e3f51b0fad7323 100644 (file)
@@ -128,4 +128,7 @@ scripts_zfs_tests_bin_PROGRAMS  += %D%/read_dos_attributes %D%/write_dos_attribu
 
 scripts_zfs_tests_bin_PROGRAMS += %D%/randfree_file
 %C%_randfree_file_SOURCES       = %D%/file/randfree_file.c
+
+scripts_zfs_tests_bin_PROGRAMS += %D%/file_fadvise
+%C%_file_fadvise_SOURCES  = %D%/file/file_fadvise.c
 endif
diff --git a/tests/zfs-tests/cmd/file/file_fadvise.c b/tests/zfs-tests/cmd/file/file_fadvise.c
new file mode 100644 (file)
index 0000000..e1afb6d
--- /dev/null
@@ -0,0 +1,97 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2022 by Information2 Software, Inc. All rights reserved.
+ */
+
+#include "file_common.h"
+#include <sys/types.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <string.h>
+
+/*
+ * Call fadvise to prefetch data
+ */
+static const char *execname = "file_fadvise";
+
+/*
+ * Print the command-line synopsis to stderr.
+ */
+static void
+usage(void)
+{
+       (void) fprintf(stderr,
+           "usage: %s -f filename -a advise \n", execname);
+}
+
+/*
+ * Open the file named by -f and apply the posix_fadvise() advice given
+ * by -a (a decimal POSIX_FADV_* value) to the whole file (offset 0,
+ * len 0).
+ *
+ * Exits 0 on success and 1 on a usage error, an open() failure or a
+ * posix_fadvise() failure.
+ */
+int
+main(int argc, char *argv[])
+{
+       char *filename = NULL;
+       char *end = NULL;
+       long advise = 0;
+       int bad_advise = 0;
+       int fd, ch, rc;
+       int err = 0;
+
+       while ((ch = getopt(argc, argv, "a:f:")) != -1) {
+               switch (ch) {
+               case 'a':
+                       /*
+                        * strtol() lets us reject non-numeric input
+                        * and trailing junk, which atoll() silently
+                        * accepts.
+                        */
+                       advise = strtol(optarg, &end, 10);
+                       bad_advise = (end == optarg || *end != '\0');
+                       break;
+               case 'f':
+                       filename = optarg;
+                       break;
+               default:
+                       /* unknown option or missing option argument */
+                       (void) printf("unknown arg %c\n", optopt);
+                       err++;
+                       break;
+               }
+       }
+
+       if (!filename) {
+               (void) printf("Filename not specified (-f <file>)\n");
+               err++;
+       }
+
+       if (bad_advise ||
+           advise < POSIX_FADV_NORMAL || advise > POSIX_FADV_NOREUSE) {
+               (void) printf("advise is invalid\n");
+               err++;
+       }
+
+       if (err) {
+               usage();
+               return (1);
+       }
+
+       if ((fd = open(filename, O_RDWR, 0666)) < 0) {
+               perror("open");
+               return (1);
+       }
+
+       /*
+        * posix_fadvise() returns the error number directly instead of
+        * setting errno, so perror() cannot be used here.
+        */
+       rc = posix_fadvise(fd, 0, 0, (int)advise);
+       if (rc != 0) {
+               (void) fprintf(stderr, "posix_fadvise: %s\n",
+                   strerror(rc));
+       }
+
+       (void) close(fd);
+
+       return (rc != 0 ? 1 : 0);
+}
index 4098562210bbacfc66c14e417141a11f63316936..c05b918325b75953723d68b59cb3f8bfa9276881 100644 (file)
@@ -184,6 +184,7 @@ export ZFSTEST_FILES='badsend
     devname2devid
     dir_rd_update
     draid
+    file_fadvise
     file_append
     file_check
     file_trunc
index 89b2ca866c214559692db89454fd79f5673120ad..d53316643bc5819b62d545f38199c986a7bc09d3 100644 (file)
@@ -1370,6 +1370,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
        functional/exec/exec_001_pos.ksh \
        functional/exec/exec_002_neg.ksh \
        functional/exec/setup.ksh \
+       functional/fadvise/cleanup.ksh \
+       functional/fadvise/fadvise_sequential.ksh \
+       functional/fadvise/setup.ksh \
        functional/fallocate/cleanup.ksh \
        functional/fallocate/fallocate_prealloc.ksh \
        functional/fallocate/fallocate_punch-hole.ksh \
index a0be1c2050b9e6b7f043c3073c177e783675e2ea..23e7aa57748476491630c242a9c9090cbd2820cb 100755 (executable)
@@ -76,7 +76,7 @@ while [[ $j -lt ${#CHECKSUM_TYPES[*]} ]]; do
        log_must zpool export $TESTPOOL
        log_must zpool import $TESTPOOL
 
-       log_mustnot eval "cat $TESTDIR/test_$type >/dev/null"
+       log_mustnot eval "dd if=$TESTDIR/test_$type of=/dev/null bs=$WRITESZ count=$NWRITES"
 
        cksum=$(zpool status -P -v $TESTPOOL | grep "$firstvdev" | \
            awk '{print $5}')
diff --git a/tests/zfs-tests/tests/functional/fadvise/cleanup.ksh b/tests/zfs-tests/tests/functional/fadvise/cleanup.ksh
new file mode 100755 (executable)
index 0000000..8b5b43a
--- /dev/null
@@ -0,0 +1,28 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+
+#
+# Portions Copyright (c) 2022 Information2 Software, Inc.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+default_cleanup
diff --git a/tests/zfs-tests/tests/functional/fadvise/fadvise_sequential.ksh b/tests/zfs-tests/tests/functional/fadvise/fadvise_sequential.ksh
new file mode 100755 (executable)
index 0000000..7b7d1d3
--- /dev/null
@@ -0,0 +1,80 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Portions Copyright (c) 2022 Information2 Software, Inc.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/include/math.shlib
+
+#
+# DESCRIPTION:
+# Test posix_fadvise.
+#
+# STRATEGY:
+# 1. Set primarycache to metadata in order to disable prefetch
+# 2. Write some data to file 
+# 3. get data_size field from arcstat
+# 4. call file_fadvise with POSIX_FADV_SEQUENTIAL
+# 5. get data_size field from arcstat again
+# 6. latter data_size should be bigger than former one
+#
+
+# NOTE: if HAVE_FILE_FADVISE is not defined, the former data_size
+# should be less than or equal to the latter one
+
+verify_runnable "global"
+
+FILE=$TESTDIR/$TESTFILE0
+BLKSZ=$(get_prop recordsize $TESTPOOL)
+
+# Exit handler: set primarycache back to "all" on the test pool and
+# remove everything under $TESTDIR if that directory exists.
+function cleanup
+{
+       log_must zfs set primarycache=all $TESTPOOL
+       [[ -e $TESTDIR ]] && log_must rm -Rf $TESTDIR/*
+}
+
+# Print the third field of the first line in
+# /proc/spl/kstat/zfs/arcstats whose first field equals $1.
+getstat() {
+       awk -v c="$1" '$1 == c {print $3; exit}' /proc/spl/kstat/zfs/arcstats
+}
+
+log_assert "Ensure fadvise prefetch data"
+
+log_onexit cleanup
+
+# STRATEGY step 1: primarycache=metadata
+log_must zfs set primarycache=metadata $TESTPOOL
+
+# STRATEGY step 2: create the test file, then sync the pool
+# (presumably -b is the write size and -c the number of writes)
+log_must file_write -o create -f $FILE -b $BLKSZ -c 1000
+sync_pool $TESTPOOL
+
+# STRATEGY step 3: sample data_size before issuing the advice
+data_size1=$(getstat data_size)
+
+# STRATEGY step 4: advice 2 is POSIX_FADV_SEQUENTIAL
+log_must file_fadvise -f $FILE -a 2
+# NOTE(review): the fixed sleep presumably gives the asynchronous
+# prefetch time to complete -- confirm
+sleep 10
+
+# STRATEGY step 5: sample data_size again
+data_size2=$(getstat data_size)
+log_note "original data_size is $data_size1, final data_size is $data_size2"
+
+# Equality is tolerated here (-le), not only growth
+log_must [ $data_size1 -le $data_size2 ]
+
+log_pass "Ensure data could be prefetched"
diff --git a/tests/zfs-tests/tests/functional/fadvise/setup.ksh b/tests/zfs-tests/tests/functional/fadvise/setup.ksh
new file mode 100755 (executable)
index 0000000..8ddd733
--- /dev/null
@@ -0,0 +1,30 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+
+#
+# Portions Copyright (c) 2022 Information2 Software, Inc.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+# Keep only the first space-separated entry of $DISKS and hand it to
+# default_setup_noexit; log_pass then reports the setup as passed.
+DISK=${DISKS%% *}
+default_setup_noexit $DISK
+log_pass
index e9517bad71318a58a9edd7fbafef4753623a69c9..bd32be9a4ff8ac922dd9dc0ba3de29bbbefb95ae 100755 (executable)
@@ -73,7 +73,7 @@ for type in "mirror" "raidz" "raidz2"; do
 
        # 4. Inject CHECKSUM ERRORS on read with a zinject error handler
        log_must zinject -d $FAULT_FILE -e corrupt -f 50 -T read $TESTPOOL
-       log_must cp $TESTFILE /dev/null
+       log_must dd if=$TESTFILE of=/dev/null bs=1M count=64
 
        # 5. Verify the ZED kicks in a hot spare and expected pool/device status
        log_note "Wait for ZED to auto-spare"