git.proxmox.com Git - mirror_zfs.git/commitdiff
zfs_rename: support RENAME_* flags
author Aleksa Sarai <cyphar@cyphar.com>
Sat, 22 Jun 2019 00:35:11 +0000 (10:35 +1000)
committer Brian Behlendorf <behlendorf1@llnl.gov>
Fri, 28 Oct 2022 16:49:20 +0000 (09:49 -0700)
Implement support for Linux's RENAME_* flags (for renameat2). Aside from
being quite useful for userspace (providing race-free ways to exchange
paths and implement mv --no-clobber), they are used by overlayfs and are
thus required in order to use overlayfs-on-ZFS.

In order for us to represent the new renameat2(2) flags in the ZIL, we
create two new transaction types for the two flags which need
transactional-level support (RENAME_EXCHANGE and RENAME_WHITEOUT).
RENAME_NOREPLACE does not need any ZIL support because we know that if
the operation succeeded before creating the ZIL entry, there was no file
to be clobbered and thus it can be treated as a regular TX_RENAME.

Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Pavel Snajdr <snajpa@snajpa.net>
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
Closes #12209
Closes #14070

33 files changed:
AUTHORS
cmd/zdb/zdb_il.c
cmd/ztest.c
config/kernel-rename.m4
include/os/freebsd/zfs/sys/zfs_vnops_os.h
include/os/linux/kernel/linux/vfs_compat.h
include/os/linux/spl/sys/sysmacros.h
include/os/linux/zfs/sys/zfs_vnops_os.h
include/os/linux/zfs/sys/zpl.h
include/sys/zfs_znode.h
include/sys/zil.h
module/os/freebsd/zfs/zfs_vnops_os.c
module/os/linux/zfs/zfs_dir.c
module/os/linux/zfs/zfs_vnops_os.c
module/os/linux/zfs/zfs_znode.c
module/os/linux/zfs/zpl_inode.c
module/zfs/zfs_log.c
module/zfs/zfs_replay.c
module/zfs/zil.c
module/zfs/zvol.c
tests/runfiles/linux.run
tests/test-runner/bin/zts-report.py.in
tests/zfs-tests/cmd/.gitignore
tests/zfs-tests/cmd/Makefile.am
tests/zfs-tests/cmd/renameat2.c [new file with mode: 0644]
tests/zfs-tests/include/commands.cfg
tests/zfs-tests/tests/functional/renameat2/Makefile.am [new file with mode: 0644]
tests/zfs-tests/tests/functional/renameat2/cleanup.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/renameat2/renameat2_exchange.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/renameat2/renameat2_noreplace.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/renameat2/renameat2_whiteout.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/renameat2/setup.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/slog/slog_replay_fs_001.ksh

diff --git a/AUTHORS b/AUTHORS
index 86083ba8771514e5f0ed05c1cde811a6277275f0..c2af58d75085d79eb747399413a83387a2d9a855 100644 (file)
--- a/AUTHORS
+++ b/AUTHORS
@@ -20,6 +20,7 @@ CONTRIBUTORS:
     Alec Salazar <alec.j.salazar@gmail.com>
     Alejandro R. Sedeño <asedeno@mit.edu>
     Alek Pinchuk <alek@nexenta.com>
+    Aleksa Sarai <cyphar@cyphar.com>
     Alex Braunegg <alex.braunegg@gmail.com>
     Alex McWhirter <alexmcwhirter@triadic.us>
     Alex Reece <alex@delphix.com>
@@ -236,6 +237,7 @@ CONTRIBUTORS:
     Paul Dagnelie <pcd@delphix.com>
     Paul Zuchowski <pzuchowski@datto.com>
     Pavel Boldin <boldin.pavel@gmail.com>
+    Pavel Snajdr <snajpa@snajpa.net>
     Pavel Zakharov <pavel.zakharov@delphix.com>
     Pawel Jakub Dawidek <pjd@FreeBSD.org>
     Pedro Giffuni <pfg@freebsd.org>
index 02cc10fb7817429d652302fc4dbc70c5e10169fe..55df1f559f6e218378de3132e65b52b19175e5c9 100644 (file)
@@ -128,6 +128,14 @@ zil_prt_rec_rename(zilog_t *zilog, int txtype, const void *arg)
        (void) printf("%ssdoid %llu, tdoid %llu\n", tab_prefix,
            (u_longlong_t)lr->lr_sdoid, (u_longlong_t)lr->lr_tdoid);
        (void) printf("%ssrc %s tgt %s\n", tab_prefix, snm, tnm);
+       switch (txtype) {
+       case TX_RENAME_EXCHANGE:
+               (void) printf("%sflags RENAME_EXCHANGE\n", tab_prefix);
+               break;
+       case TX_RENAME_WHITEOUT:
+               (void) printf("%sflags RENAME_WHITEOUT\n", tab_prefix);
+               break;
+       }
 }
 
 static int
@@ -330,6 +338,8 @@ static zil_rec_info_t zil_rec_info[TX_MAX_TYPE] = {
        {.zri_print = zil_prt_rec_write,    .zri_name = "TX_WRITE2          "},
        {.zri_print = zil_prt_rec_setsaxattr,
            .zri_name = "TX_SETSAXATTR      "},
+       {.zri_print = zil_prt_rec_rename,   .zri_name = "TX_RENAME_EXCHANGE "},
+       {.zri_print = zil_prt_rec_rename,   .zri_name = "TX_RENAME_WHITEOUT "},
 };
 
 static int
index a8f9e6b8760a14de7cccab03163f9b19537b9b1b..19edab4eb7a285edec058d821559084fc75ee453 100644 (file)
@@ -2368,6 +2368,8 @@ static zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = {
        NULL,                   /* TX_MKDIR_ACL_ATTR */
        NULL,                   /* TX_WRITE2 */
        NULL,                   /* TX_SETSAXATTR */
+       NULL,                   /* TX_RENAME_EXCHANGE */
+       NULL,                   /* TX_RENAME_WHITEOUT */
 };
 
 /*
index 302db43f57488ccc0fea41d3e394cc592062e8f4..a2b0800ab4d29e054cb28167123e27cb7df7bbfa 100644 (file)
@@ -1,8 +1,28 @@
 AC_DEFUN([ZFS_AC_KERNEL_SRC_RENAME], [
+       dnl #
+       dnl # 3.9 (to 4.9) API change,
+       dnl #
+       dnl # A new version of iops->rename() was added (rename2) that takes a flag
+       dnl # argument (to support renameat2). However this separate function was
+       dnl # merged back into iops->rename() in Linux 4.9.
+       dnl #
+       ZFS_LINUX_TEST_SRC([inode_operations_rename2], [
+               #include <linux/fs.h>
+               int rename2_fn(struct inode *sip, struct dentry *sdp,
+                       struct inode *tip, struct dentry *tdp,
+                       unsigned int flags) { return 0; }
+
+               static const struct inode_operations
+                   iops __attribute__ ((unused)) = {
+                       .rename2 = rename2_fn,
+               };
+       ],[])
+
        dnl #
        dnl # 4.9 API change,
-       dnl # iops->rename2() merged into iops->rename(), and iops->rename() now wants
-       dnl # flags.
+       dnl #
+       dnl # iops->rename2() merged into iops->rename(), and iops->rename() now
+       dnl # wants flags.
        dnl #
        ZFS_LINUX_TEST_SRC([inode_operations_rename_flags], [
                #include <linux/fs.h>
@@ -16,11 +36,29 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_RENAME], [
                };
        ],[])
 
+       dnl #
+       dnl # EL7 compatibility
+       dnl #
+       dnl # EL7 has backported renameat2 support, but it's done by defining a
+       dnl # separate iops wrapper structure that takes the .rename2 function.
+       dnl #
+       ZFS_LINUX_TEST_SRC([dir_inode_operations_wrapper_rename2], [
+               #include <linux/fs.h>
+               int rename2_fn(struct inode *sip, struct dentry *sdp,
+                       struct inode *tip, struct dentry *tdp,
+                       unsigned int flags) { return 0; }
+
+               static const struct inode_operations_wrapper
+                   iops __attribute__ ((unused)) = {
+                       .rename2 = rename2_fn,
+               };
+       ],[])
+
        dnl #
        dnl # 5.12 API change,
        dnl #
-       dnl # Linux 5.12 introduced passing struct user_namespace* as the first argument
-       dnl # of the rename() and other inode_operations members.
+       dnl # Linux 5.12 introduced passing struct user_namespace* as the first
+       dnl # argument of the rename() and other inode_operations members.
        dnl #
        ZFS_LINUX_TEST_SRC([inode_operations_rename_userns], [
                #include <linux/fs.h>
@@ -44,13 +82,30 @@ AC_DEFUN([ZFS_AC_KERNEL_RENAME], [
        ],[
                AC_MSG_RESULT(no)
 
-               AC_MSG_CHECKING([whether iop->rename() wants flags])
-               ZFS_LINUX_TEST_RESULT([inode_operations_rename_flags], [
+               AC_MSG_CHECKING([whether iops->rename2() exists])
+               ZFS_LINUX_TEST_RESULT([inode_operations_rename2], [
                        AC_MSG_RESULT(yes)
-                       AC_DEFINE(HAVE_RENAME_WANTS_FLAGS, 1,
-                               [iops->rename() wants flags])
+                       AC_DEFINE(HAVE_RENAME2, 1, [iops->rename2() exists])
                ],[
                        AC_MSG_RESULT(no)
+
+                       AC_MSG_CHECKING([whether iops->rename() wants flags])
+                       ZFS_LINUX_TEST_RESULT([inode_operations_rename_flags], [
+                               AC_MSG_RESULT(yes)
+                               AC_DEFINE(HAVE_RENAME_WANTS_FLAGS, 1,
+                                       [iops->rename() wants flags])
+                       ],[
+                               AC_MSG_RESULT(no)
+
+                               AC_MSG_CHECKING([whether struct inode_operations_wrapper takes .rename2()])
+                               ZFS_LINUX_TEST_RESULT([dir_inode_operations_wrapper_rename2], [
+                                       AC_MSG_RESULT(yes)
+                                       AC_DEFINE(HAVE_RENAME2_OPERATIONS_WRAPPER, 1,
+                                               [struct inode_operations_wrapper takes .rename2()])
+                               ],[
+                                       AC_MSG_RESULT(no)
+                               ])
+                       ])
                ])
        ])
 ])
index 460aecd2e708da9aa0a55a56a709f351960a55b0..839ee629a5ab25b92bf8cc92c6d33579db78023a 100644 (file)
@@ -41,7 +41,8 @@ extern int zfs_rmdir(znode_t *dzp, const char *name, znode_t *cwd,
 extern int zfs_setattr(znode_t *zp, vattr_t *vap, int flag, cred_t *cr,
     zuserns_t *mnt_ns);
 extern int zfs_rename(znode_t *sdzp, const char *snm, znode_t *tdzp,
-    const char *tnm, cred_t *cr, int flags, zuserns_t *mnt_ns);
+    const char *tnm, cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap,
+    zuserns_t *mnt_ns);
 extern int zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap,
     const char *link, znode_t **zpp, cred_t *cr, int flags, zuserns_t *mnt_ns);
 extern int zfs_link(znode_t *tdzp, znode_t *sp,
index eeed0a388ce4f8f52913aa68e8070900e0de34c6..fd0b9e8e1068eaa3f15cb04ace0022bf71e26451 100644 (file)
@@ -324,6 +324,19 @@ static inline void zfs_gid_write(struct inode *ip, gid_t gid)
        ip->i_gid = make_kgid(kcred->user_ns, gid);
 }
 
+/*
+ * 3.15 API change
+ */
+#ifndef RENAME_NOREPLACE
+#define        RENAME_NOREPLACE        (1 << 0) /* Don't overwrite target */
+#endif
+#ifndef RENAME_EXCHANGE
+#define        RENAME_EXCHANGE         (1 << 1) /* Exchange source and dest */
+#endif
+#ifndef RENAME_WHITEOUT
+#define        RENAME_WHITEOUT         (1 << 2) /* Whiteout source */
+#endif
+
 /*
  * 4.9 API change
  */
index be1f77e43bda420c763638826cd79ab4aa606d5f..99e3a6fb41c634f018c90404f310a9eb8c68e2f0 100644 (file)
@@ -120,6 +120,16 @@ extern uint32_t zone_get_hostid(void *zone);
 extern void spl_setup(void);
 extern void spl_cleanup(void);
 
+/*
+ * Only handles the first 4096 majors and first 256 minors. We don't have a
+ * libc for the kernel module so we define this inline.
+ */
+static inline dev_t
+makedev(unsigned int major, unsigned int minor)
+{
+       return ((major & 0xFFF) << 8) | (minor & 0xFF);
+}
+
 #define        highbit(x)              __fls(x)
 #define        lowbit(x)               __ffs(x)
 
index 787d258e13888ac71be3b4deaf45dbbc5161d07f..197ea9bec50054df0ccc80b03065530edcdea88a 100644 (file)
@@ -61,7 +61,8 @@ extern int zfs_getattr_fast(struct user_namespace *, struct inode *ip,
 extern int zfs_setattr(znode_t *zp, vattr_t *vap, int flag, cred_t *cr,
     zuserns_t *mnt_ns);
 extern int zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp,
-    char *tnm, cred_t *cr, int flags, zuserns_t *mnt_ns);
+    char *tnm, cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap,
+    zuserns_t *mnt_ns);
 extern int zfs_symlink(znode_t *dzp, char *name, vattr_t *vap,
     char *link, znode_t **zpp, cred_t *cr, int flags, zuserns_t *mnt_ns);
 extern int zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr);
index 83416d64744ce36ecd8e3cceae8b13cec3ae367b..c3ee0ae4a6006673cca243c50dc60ba7f0f672b6 100644 (file)
@@ -42,7 +42,11 @@ extern void zpl_vap_init(vattr_t *vap, struct inode *dir,
     umode_t mode, cred_t *cr, zuserns_t *mnt_ns);
 
 extern const struct inode_operations zpl_inode_operations;
+#ifdef HAVE_RENAME2_OPERATIONS_WRAPPER
+extern const struct inode_operations_wrapper zpl_dir_inode_operations;
+#else
 extern const struct inode_operations zpl_dir_inode_operations;
+#endif
 extern const struct inode_operations zpl_symlink_inode_operations;
 extern const struct inode_operations zpl_special_inode_operations;
 
index c8656b3f6162b7af96432929c6b349dc7aa91570..88d642350691ee29919213c0519d090a32ef33e2 100644 (file)
@@ -299,6 +299,12 @@ extern void zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
 extern void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
     znode_t *sdzp, const char *sname, znode_t *tdzp, const char *dname,
     znode_t *szp);
+extern void zfs_log_rename_exchange(zilog_t *zilog, dmu_tx_t *tx,
+    uint64_t txtype, znode_t *sdzp, const char *sname, znode_t *tdzp,
+    const char *dname, znode_t *szp);
+extern void zfs_log_rename_whiteout(zilog_t *zilog, dmu_tx_t *tx,
+    uint64_t txtype, znode_t *sdzp, const char *sname, znode_t *tdzp,
+    const char *dname, znode_t *szp, znode_t *wzp);
 extern void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
     znode_t *zp, offset_t off, ssize_t len, int ioflag,
     zil_callback_t callback, void *callback_data);
index cec04f120ce335f979eee395d05348304c0585fe..9591fb4f64407644e405acc568f46ecddf8adf56 100644 (file)
@@ -164,7 +164,9 @@ typedef enum zil_create {
 #define        TX_MKDIR_ACL_ATTR       19      /* mkdir with ACL + attrs */
 #define        TX_WRITE2               20      /* dmu_sync EALREADY write */
 #define        TX_SETSAXATTR           21      /* Set sa xattrs on file */
-#define        TX_MAX_TYPE             22      /* Max transaction type */
+#define        TX_RENAME_EXCHANGE      22      /* Atomic swap via renameat2 */
+#define        TX_RENAME_WHITEOUT      23      /* Atomic whiteout via renameat2 */
+#define        TX_MAX_TYPE             24      /* Max transaction type */
 
 /*
  * The transactions for mkdir, symlink, remove, rmdir, link, and rename
@@ -317,6 +319,19 @@ typedef struct {
        /* 2 strings: names of source and destination follow this */
 } lr_rename_t;
 
+typedef struct {
+       lr_rename_t     lr_rename;      /* common rename portion */
+       /* members related to the whiteout file (based on lr_create_t) */
+       uint64_t        lr_wfoid;       /* obj id of the new whiteout file */
+       uint64_t        lr_wmode;       /* mode of object */
+       uint64_t        lr_wuid;        /* uid of whiteout */
+       uint64_t        lr_wgid;        /* gid of whiteout */
+       uint64_t        lr_wgen;        /* generation (txg of creation) */
+       uint64_t        lr_wcrtime[2];  /* creation time */
+       uint64_t        lr_wrdev;       /* always makedev(0, 0) */
+       /* 2 strings: names of source and destination follow this */
+} lr_rename_whiteout_t;
+
 typedef struct {
        lr_t            lr_common;      /* common portion of log record */
        uint64_t        lr_foid;        /* file object to write */
index 362e02751ee440c9e14f6f13a3a39d437f307ff2..bcf4e2f18d83d15508ca37f9c3c86f55196dd8d8 100644 (file)
@@ -3420,7 +3420,7 @@ out:
 
 int
 zfs_rename(znode_t *sdzp, const char *sname, znode_t *tdzp, const char *tname,
-    cred_t *cr, int flags, zuserns_t *mnt_ns)
+    cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, zuserns_t *mnt_ns)
 {
        struct componentname scn, tcn;
        vnode_t *sdvp, *tdvp;
@@ -3428,6 +3428,9 @@ zfs_rename(znode_t *sdzp, const char *sname, znode_t *tdzp, const char *tname,
        int error;
        svp = tvp = NULL;
 
+       if (rflags != 0 || wo_vap != NULL)
+               return (SET_ERROR(EINVAL));
+
        sdvp = ZTOV(sdzp);
        tdvp = ZTOV(tdzp);
        error = zfs_lookup_internal(sdzp, sname, &svp, &scn, DELETE);
index fb6c28f95c3b65f5743a9fb1cdecf8eeb7ba041c..b4e4146b09e9e73bfb0385135d4454e01ed67c91 100644 (file)
@@ -1035,7 +1035,8 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
                }
 
                /* The only error is !zfs_dirempty() and we checked earlier. */
-               ASSERT3U(zfs_drop_nlink_locked(zp, tx, &unlinked), ==, 0);
+               error = zfs_drop_nlink_locked(zp, tx, &unlinked);
+               ASSERT3U(error, ==, 0);
                mutex_exit(&zp->z_lock);
        } else {
                error = zfs_dropname(dl, zp, dzp, tx, flag);
index f02cefea222b1acd721dfbe49d299091489ddd4e..545d8ad8d79c5af974fcce28ccff198c2a8172d6 100644 (file)
@@ -2655,6 +2655,8 @@ zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
  *             tnm     - New entry name.
  *             cr      - credentials of caller.
  *             flags   - case flags
+ *             rflags  - RENAME_* flags
+ *             wo_vap  - attributes for RENAME_WHITEOUT (must be a char 0:0).
  *             mnt_ns  - user namespace of the mount
  *
  *     RETURN: 0 on success, error code on failure.
@@ -2664,7 +2666,7 @@ zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
  */
 int
 zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm,
-    cred_t *cr, int flags, zuserns_t *mnt_ns)
+    cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, zuserns_t *mnt_ns)
 {
        znode_t         *szp, *tzp;
        zfsvfs_t        *zfsvfs = ZTOZSB(sdzp);
@@ -2676,10 +2678,33 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm,
        int             error = 0;
        int             zflg = 0;
        boolean_t       waited = B_FALSE;
+       /* Needed for whiteout inode creation. */
+       boolean_t       fuid_dirtied;
+       zfs_acl_ids_t   acl_ids;
+       boolean_t       have_acl = B_FALSE;
+       znode_t         *wzp = NULL;
+
 
        if (snm == NULL || tnm == NULL)
                return (SET_ERROR(EINVAL));
 
+       if (rflags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
+               return (SET_ERROR(EINVAL));
+
+       /* Already checked by Linux VFS, but just to make sure. */
+       if (rflags & RENAME_EXCHANGE &&
+           (rflags & (RENAME_NOREPLACE | RENAME_WHITEOUT)))
+               return (SET_ERROR(EINVAL));
+
+       /*
+        * Make sure we get wo_vap if and only if RENAME_WHITEOUT is set, and
+        * that it's the right kind of vattr_t for the whiteout file. These are
+        * set internally by ZFS so should never be incorrect.
+        */
+       VERIFY_EQUIV(rflags & RENAME_WHITEOUT, wo_vap != NULL);
+       VERIFY_IMPLY(wo_vap, wo_vap->va_mode == S_IFCHR);
+       VERIFY_IMPLY(wo_vap, wo_vap->va_rdev == makedevice(0, 0));
+
        if ((error = zfs_enter_verify_zp(zfsvfs, sdzp, FTAG)) != 0)
                return (error);
        zilog = zfsvfs->z_log;
@@ -2856,7 +2881,6 @@ top:
         * Note that if target and source are the same, this can be
         * done in a single check.
         */
-
        if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr, mnt_ns)))
                goto out;
 
@@ -2873,15 +2897,21 @@ top:
         * Does target exist?
         */
        if (tzp) {
+               if (rflags & RENAME_NOREPLACE) {
+                       error = SET_ERROR(EEXIST);
+                       goto out;
+               }
                /*
-                * Source and target must be the same type.
+                * Source and target must be the same type (unless exchanging).
                 */
-               boolean_t s_is_dir = S_ISDIR(ZTOI(szp)->i_mode) != 0;
-               boolean_t t_is_dir = S_ISDIR(ZTOI(tzp)->i_mode) != 0;
+               if (!(rflags & RENAME_EXCHANGE)) {
+                       boolean_t s_is_dir = S_ISDIR(ZTOI(szp)->i_mode) != 0;
+                       boolean_t t_is_dir = S_ISDIR(ZTOI(tzp)->i_mode) != 0;
 
-               if (s_is_dir != t_is_dir) {
-                       error = SET_ERROR(s_is_dir ? ENOTDIR : EISDIR);
-                       goto out;
+                       if (s_is_dir != t_is_dir) {
+                               error = SET_ERROR(s_is_dir ? ENOTDIR : EISDIR);
+                               goto out;
+                       }
                }
                /*
                 * POSIX dictates that when the source and target
@@ -2892,12 +2922,43 @@ top:
                        error = 0;
                        goto out;
                }
+       } else if (rflags & RENAME_EXCHANGE) {
+               /* Target must exist for RENAME_EXCHANGE. */
+               error = SET_ERROR(ENOENT);
+               goto out;
+       }
+
+       /* Set up inode creation for RENAME_WHITEOUT. */
+       if (rflags & RENAME_WHITEOUT) {
+               /*
+                * Whiteout files are not regular files or directories, so to
+                * match zfs_create() we do not inherit the project id.
+                */
+               uint64_t wo_projid = ZFS_DEFAULT_PROJID;
+
+               error = zfs_zaccess(sdzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns);
+               if (error)
+                       goto out;
+
+               if (!have_acl) {
+                       error = zfs_acl_ids_create(sdzp, 0, wo_vap, cr, NULL,
+                           &acl_ids, mnt_ns);
+                       if (error)
+                               goto out;
+                       have_acl = B_TRUE;
+               }
+
+               if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, wo_projid)) {
+                       error = SET_ERROR(EDQUOT);
+                       goto out;
+               }
        }
 
        tx = dmu_tx_create(zfsvfs->z_os);
        dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
        dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
-       dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
+       dmu_tx_hold_zap(tx, sdzp->z_id,
+           (rflags & RENAME_EXCHANGE) ? TRUE : FALSE, snm);
        dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
        if (sdzp != tdzp) {
                dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
@@ -2907,7 +2968,21 @@ top:
                dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
                zfs_sa_upgrade_txholds(tx, tzp);
        }
+       if (rflags & RENAME_WHITEOUT) {
+               dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+                   ZFS_SA_BASE_ATTR_SIZE);
 
+               dmu_tx_hold_zap(tx, sdzp->z_id, TRUE, snm);
+               dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
+               if (!zfsvfs->z_use_sa &&
+                   acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+                       dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+                           0, acl_ids.z_aclp->z_acl_bytes);
+               }
+       }
+       fuid_dirtied = zfsvfs->z_fuid_dirty;
+       if (fuid_dirtied)
+               zfs_fuid_txhold(zfsvfs, tx);
        zfs_sa_upgrade_txholds(tx, szp);
        dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
        error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
@@ -2946,7 +3021,7 @@ top:
 
        error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
            (void *)&szp->z_pflags, sizeof (uint64_t), tx);
-       ASSERT0(error);
+       VERIFY0(error);
 
        error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
        if (error)
@@ -2956,13 +3031,30 @@ top:
         * Unlink the target.
         */
        if (tzp) {
-               error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
+               int tzflg = zflg;
+
+               if (rflags & RENAME_EXCHANGE) {
+                       /* This inode will be re-linked soon. */
+                       tzflg |= ZRENAMING;
+
+                       tzp->z_pflags |= ZFS_AV_MODIFIED;
+                       if (sdzp->z_pflags & ZFS_PROJINHERIT)
+                               tzp->z_pflags |= ZFS_PROJINHERIT;
+
+                       error = sa_update(tzp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
+                           (void *)&tzp->z_pflags, sizeof (uint64_t), tx);
+                       ASSERT0(error);
+               }
+               error = zfs_link_destroy(tdl, tzp, tx, tzflg, NULL);
                if (error)
                        goto commit_link_szp;
        }
 
        /*
-        * Create a new link at the target.
+        * Create the new target links:
+        *   * We always link the target.
+        *   * RENAME_EXCHANGE: Link the old target to the source.
+        *   * RENAME_WHITEOUT: Create a whiteout inode in-place of the source.
         */
        error = zfs_link_create(tdl, szp, tx, ZRENAMING);
        if (error) {
@@ -2975,18 +3067,55 @@ top:
                goto commit_link_tzp;
        }
 
-       zfs_log_rename(zilog, tx, TX_RENAME |
-           (flags & FIGNORECASE ? TX_CI : 0), sdzp,
-           sdl->dl_name, tdzp, tdl->dl_name, szp);
+       switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) {
+       case RENAME_EXCHANGE:
+               error = zfs_link_create(sdl, tzp, tx, ZRENAMING);
+               /*
+                * The same argument as zfs_link_create() failing for
+                * szp applies here, since the source directory must
+                * have had an entry we are replacing.
+                */
+               ASSERT0(error);
+               if (error)
+                       goto commit_unlink_td_szp;
+               break;
+       case RENAME_WHITEOUT:
+               zfs_mknode(sdzp, wo_vap, tx, cr, 0, &wzp, &acl_ids);
+               error = zfs_link_create(sdl, wzp, tx, ZNEW);
+               if (error) {
+                       zfs_znode_delete(wzp, tx);
+                       remove_inode_hash(ZTOI(wzp));
+                       goto commit_unlink_td_szp;
+               }
+               break;
+       }
+
+       if (fuid_dirtied)
+               zfs_fuid_sync(zfsvfs, tx);
+
+       switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) {
+       case RENAME_EXCHANGE:
+               zfs_log_rename_exchange(zilog, tx,
+                   (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name,
+                   tdzp, tdl->dl_name, szp);
+               break;
+       case RENAME_WHITEOUT:
+               zfs_log_rename_whiteout(zilog, tx,
+                   (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name,
+                   tdzp, tdl->dl_name, szp, wzp);
+               break;
+       default:
+               ASSERT0(rflags & ~RENAME_NOREPLACE);
+               zfs_log_rename(zilog, tx, (flags & FIGNORECASE ? TX_CI : 0),
+                   sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp);
+               break;
+       }
 
 commit:
        dmu_tx_commit(tx);
 out:
-       if (zl != NULL)
-               zfs_rename_unlock(&zl);
-
-       zfs_dirent_unlock(sdl);
-       zfs_dirent_unlock(tdl);
+       if (have_acl)
+               zfs_acl_ids_free(&acl_ids);
 
        zfs_znode_update_vfs(sdzp);
        if (sdzp == tdzp)
@@ -2997,11 +3126,21 @@ out:
 
        zfs_znode_update_vfs(szp);
        zrele(szp);
+       if (wzp) {
+               zfs_znode_update_vfs(wzp);
+               zrele(wzp);
+       }
        if (tzp) {
                zfs_znode_update_vfs(tzp);
                zrele(tzp);
        }
 
+       if (zl != NULL)
+               zfs_rename_unlock(&zl);
+
+       zfs_dirent_unlock(sdl);
+       zfs_dirent_unlock(tdl);
+
        if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
                zil_commit(zilog, 0);
 
@@ -3012,23 +3151,31 @@ out:
         * Clean-up path for broken link state.
         *
         * At this point we are in a (very) bad state, so we need to do our
-        * best to correct the state. In particular, the nlink of szp is wrong
-        * because we were destroying and creating links with ZRENAMING.
+        * best to correct the state. In particular, all of the nlinks are
+        * wrong because we were destroying and creating links with ZRENAMING.
+        *
+        * In some form, all of these operations have to resolve the state:
+        *
+        *  * link_destroy() *must* succeed. Fortunately, this is very likely
+        *    since we only just created it.
         *
-        * link_create()s are allowed to fail (though they shouldn't because we
-        * only just unlinked them and are putting the entries back during
-        * clean-up). But if they fail, we can just forcefully drop the nlink
-        * value to (at the very least) avoid broken nlink values -- though in
-        * the case of non-empty directories we will have to panic.
+        *  * link_create()s are allowed to fail (though they shouldn't because
+        *    we only just unlinked them and are putting the entries back
+        *    during clean-up). But if they fail, we can just forcefully drop
+        *    the nlink value to (at the very least) avoid broken nlink values
+        *    -- though in the case of non-empty directories we will have to
+        *    panic (otherwise we'd have a leaked directory with a broken ..).
         */
+commit_unlink_td_szp:
+       VERIFY0(zfs_link_destroy(tdl, szp, tx, ZRENAMING, NULL));
 commit_link_tzp:
        if (tzp) {
                if (zfs_link_create(tdl, tzp, tx, ZRENAMING))
-                       VERIFY3U(zfs_drop_nlink(tzp, tx, NULL), ==, 0);
+                       VERIFY0(zfs_drop_nlink(tzp, tx, NULL));
        }
 commit_link_szp:
        if (zfs_link_create(sdl, szp, tx, ZRENAMING))
-               VERIFY3U(zfs_drop_nlink(szp, tx, NULL), ==, 0);
+               VERIFY0(zfs_drop_nlink(szp, tx, NULL));
        goto commit;
 }
 
index 3ded79a30a6f2d2bea1707dc9a5330b6b021220c..c8f6e02bd2248c222d9f2bdd70abe05703e17c85 100644 (file)
@@ -422,7 +422,12 @@ zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip)
                break;
 
        case S_IFDIR:
+#ifdef HAVE_RENAME2_OPERATIONS_WRAPPER
+               ip->i_flags |= S_IOPS_WRAPPER;
+               ip->i_op = &zpl_dir_inode_operations.ops;
+#else
                ip->i_op = &zpl_dir_inode_operations;
+#endif
                ip->i_fop = &zpl_dir_file_operations;
                ITOZ(ip)->z_zn_prefetch = B_TRUE;
                break;
index 9b702c535ea7a11515b651c8e2c400ee9cb7b364..64016f9ac1deb978d1b283d61243c2240371bbfd 100644 (file)
@@ -24,6 +24,7 @@
  */
 
 
+#include <sys/sysmacros.h>
 #include <sys/zfs_ctldir.h>
 #include <sys/zfs_vfsops.h>
 #include <sys/zfs_vnops.h>
@@ -498,35 +499,42 @@ static int
 #ifdef HAVE_IOPS_RENAME_USERNS
 zpl_rename2(struct user_namespace *user_ns, struct inode *sdip,
     struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry,
-    unsigned int flags)
+    unsigned int rflags)
 #else
 zpl_rename2(struct inode *sdip, struct dentry *sdentry,
-    struct inode *tdip, struct dentry *tdentry, unsigned int flags)
+    struct inode *tdip, struct dentry *tdentry, unsigned int rflags)
 #endif
 {
        cred_t *cr = CRED();
+       vattr_t *wo_vap = NULL;
        int error;
        fstrans_cookie_t cookie;
 #ifndef HAVE_IOPS_RENAME_USERNS
        zuserns_t *user_ns = NULL;
 #endif
 
-       /* We don't have renameat2(2) support */
-       if (flags)
-               return (-EINVAL);
-
        crhold(cr);
+       if (rflags & RENAME_WHITEOUT) {
+               wo_vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
+               zpl_vap_init(wo_vap, sdip, S_IFCHR, cr, user_ns);
+               wo_vap->va_rdev = makedevice(0, 0);
+       }
+
        cookie = spl_fstrans_mark();
        error = -zfs_rename(ITOZ(sdip), dname(sdentry), ITOZ(tdip),
-           dname(tdentry), cr, 0, user_ns);
+           dname(tdentry), cr, 0, rflags, wo_vap, user_ns);
        spl_fstrans_unmark(cookie);
+       if (wo_vap)
+               kmem_free(wo_vap, sizeof (vattr_t));
        crfree(cr);
        ASSERT3S(error, <=, 0);
 
        return (error);
 }
 
-#if !defined(HAVE_RENAME_WANTS_FLAGS) && !defined(HAVE_IOPS_RENAME_USERNS)
+#if !defined(HAVE_IOPS_RENAME_USERNS) && \
+       !defined(HAVE_RENAME_WANTS_FLAGS) && \
+       !defined(HAVE_RENAME2)
 static int
 zpl_rename(struct inode *sdip, struct dentry *sdentry,
     struct inode *tdip, struct dentry *tdentry)
@@ -745,7 +753,12 @@ const struct inode_operations zpl_inode_operations = {
 #endif /* CONFIG_FS_POSIX_ACL */
 };
 
+#ifdef HAVE_RENAME2_OPERATIONS_WRAPPER
+const struct inode_operations_wrapper zpl_dir_inode_operations = {
+       .ops = {
+#else
 const struct inode_operations zpl_dir_inode_operations = {
+#endif
        .create         = zpl_create,
        .lookup         = zpl_lookup,
        .link           = zpl_link,
@@ -754,7 +767,9 @@ const struct inode_operations zpl_dir_inode_operations = {
        .mkdir          = zpl_mkdir,
        .rmdir          = zpl_rmdir,
        .mknod          = zpl_mknod,
-#if defined(HAVE_RENAME_WANTS_FLAGS) || defined(HAVE_IOPS_RENAME_USERNS)
+#ifdef HAVE_RENAME2
+       .rename2        = zpl_rename2,
+#elif defined(HAVE_RENAME_WANTS_FLAGS) || defined(HAVE_IOPS_RENAME_USERNS)
        .rename         = zpl_rename2,
 #else
        .rename         = zpl_rename,
@@ -776,6 +791,10 @@ const struct inode_operations zpl_dir_inode_operations = {
 #endif /* HAVE_SET_ACL */
        .get_acl        = zpl_get_acl,
 #endif /* CONFIG_FS_POSIX_ACL */
+#ifdef HAVE_RENAME2_OPERATIONS_WRAPPER
+       },
+       .rename2        = zpl_rename2,
+#endif
 };
 
 const struct inode_operations zpl_symlink_inode_operations = {
index 245699882aa9e103979c9aa443727e22ed58c717..77bf9140d52d013bd0bd5c4ceb5de58cd44b652e 100644 (file)
@@ -494,25 +494,101 @@ zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
        zil_itx_assign(zilog, itx, tx);
 }
 
+static void
+do_zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *sdzp,
+    const char *sname, znode_t *tdzp, const char *dname, znode_t *szp)
+{
+       itx_t *itx;
+       lr_rename_t *lr;
+       size_t snamesize = strlen(sname) + 1;
+       size_t dnamesize = strlen(dname) + 1;
+
+       if (zil_replaying(zilog, tx))
+               return;
+
+       itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize);
+       lr = (lr_rename_t *)&itx->itx_lr;
+       lr->lr_sdoid = sdzp->z_id;
+       lr->lr_tdoid = tdzp->z_id;
+       memcpy((char *)(lr + 1), sname, snamesize);
+       memcpy((char *)(lr + 1) + snamesize, dname, dnamesize);
+       itx->itx_oid = szp->z_id;
+
+       zil_itx_assign(zilog, itx, tx);
+}
+
 /*
  * Handles TX_RENAME transactions.
  */
 void
 zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *sdzp,
     const char *sname, znode_t *tdzp, const char *dname, znode_t *szp)
+{
+       txtype |= TX_RENAME;
+       do_zfs_log_rename(zilog, tx, txtype, sdzp, sname, tdzp, dname, szp);
+}
+
+/*
+ * Handles TX_RENAME_EXCHANGE transactions.
+ */
+void
+zfs_log_rename_exchange(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
+    znode_t *sdzp, const char *sname, znode_t *tdzp, const char *dname,
+    znode_t *szp)
+{
+       txtype |= TX_RENAME_EXCHANGE;
+       do_zfs_log_rename(zilog, tx, txtype, sdzp, sname, tdzp, dname, szp);
+}
+
+/*
+ * Handles TX_RENAME_WHITEOUT transactions.
+ *
+ * Unfortunately we cannot reuse do_zfs_log_rename because we need to call
+ * zfs_mknode() on replay which requires stashing bits as with TX_CREATE.
+ */
+void
+zfs_log_rename_whiteout(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
+    znode_t *sdzp, const char *sname, znode_t *tdzp, const char *dname,
+    znode_t *szp, znode_t *wzp)
 {
        itx_t *itx;
-       lr_rename_t *lr;
+       lr_rename_whiteout_t *lr;
        size_t snamesize = strlen(sname) + 1;
        size_t dnamesize = strlen(dname) + 1;
 
        if (zil_replaying(zilog, tx))
                return;
 
+       txtype |= TX_RENAME_WHITEOUT;
        itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize);
-       lr = (lr_rename_t *)&itx->itx_lr;
-       lr->lr_sdoid = sdzp->z_id;
-       lr->lr_tdoid = tdzp->z_id;
+       lr = (lr_rename_whiteout_t *)&itx->itx_lr;
+       lr->lr_rename.lr_sdoid = sdzp->z_id;
+       lr->lr_rename.lr_tdoid = tdzp->z_id;
+
+       /*
+        * RENAME_WHITEOUT will create an entry at the source znode, so we need
+        * to store the same data that the equivalent call to zfs_log_create()
+        * would.
+        */
+       lr->lr_wfoid = wzp->z_id;
+       LR_FOID_SET_SLOTS(lr->lr_wfoid, wzp->z_dnodesize >> DNODE_SHIFT);
+       (void) sa_lookup(wzp->z_sa_hdl, SA_ZPL_GEN(ZTOZSB(wzp)), &lr->lr_wgen,
+           sizeof (uint64_t));
+       (void) sa_lookup(wzp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(wzp)),
+           lr->lr_wcrtime, sizeof (uint64_t) * 2);
+       lr->lr_wmode = wzp->z_mode;
+       lr->lr_wuid = (uint64_t)KUID_TO_SUID(ZTOUID(wzp));
+       lr->lr_wgid = (uint64_t)KGID_TO_SGID(ZTOGID(wzp));
+
+       /*
+        * This rdev will always be makedevice(0, 0) but because the ZIL log and
+        * replay code needs to be platform independent (and there is no
+        * platform independent makedev()) we need to copy the one created
+        * during the rename operation.
+        */
+       (void) sa_lookup(wzp->z_sa_hdl, SA_ZPL_RDEV(ZTOZSB(wzp)), &lr->lr_wrdev,
+           sizeof (lr->lr_wrdev));
+
        memcpy((char *)(lr + 1), sname, snamesize);
        memcpy((char *)(lr + 1) + snamesize, dname, dnamesize);
        itx->itx_oid = szp->z_id;
index 45c2fa3720cff67f5f173f9624d9142d8e326327..5e20ce3319b43f9a71162d848584ff3ddc51e0ca 100644 (file)
@@ -643,18 +643,21 @@ zfs_replay_link(void *arg1, void *arg2, boolean_t byteswap)
 }
 
 static int
-zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap)
+do_zfs_replay_rename(zfsvfs_t *zfsvfs, lr_rename_t *lr, char *sname,
+    char *tname, uint64_t rflags, vattr_t *wo_vap)
 {
-       zfsvfs_t *zfsvfs = arg1;
-       lr_rename_t *lr = arg2;
-       char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */
-       char *tname = sname + strlen(sname) + 1;
        znode_t *sdzp, *tdzp;
-       int error;
-       int vflg = 0;
+       int error, vflg = 0;
 
-       if (byteswap)
-               byteswap_uint64_array(lr, sizeof (*lr));
+       /* Only Linux currently supports RENAME_* flags. */
+#ifdef __linux__
+       VERIFY0(rflags & ~(RENAME_EXCHANGE | RENAME_WHITEOUT));
+
+       /* wo_vap must be non-NULL iff. we're doing RENAME_WHITEOUT */
+       VERIFY_EQUIV(rflags & RENAME_WHITEOUT, wo_vap != NULL);
+#else
+       VERIFY0(rflags);
+#endif
 
        if ((error = zfs_zget(zfsvfs, lr->lr_sdoid, &sdzp)) != 0)
                return (error);
@@ -667,13 +670,94 @@ zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap)
        if (lr->lr_common.lrc_txtype & TX_CI)
                vflg |= FIGNORECASE;
 
-       error = zfs_rename(sdzp, sname, tdzp, tname, kcred, vflg, NULL);
+       error = zfs_rename(sdzp, sname, tdzp, tname, kcred, vflg, rflags,
+           wo_vap, NULL);
 
        zrele(tdzp);
        zrele(sdzp);
        return (error);
 }
 
+static int
+zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap)
+{
+       zfsvfs_t *zfsvfs = arg1;
+       lr_rename_t *lr = arg2;
+       char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */
+       char *tname = sname + strlen(sname) + 1;
+
+       if (byteswap)
+               byteswap_uint64_array(lr, sizeof (*lr));
+
+       return (do_zfs_replay_rename(zfsvfs, lr, sname, tname, 0, NULL));
+}
+
+static int
+zfs_replay_rename_exchange(void *arg1, void *arg2, boolean_t byteswap)
+{
+#ifdef __linux__
+       zfsvfs_t *zfsvfs = arg1;
+       lr_rename_t *lr = arg2;
+       char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */
+       char *tname = sname + strlen(sname) + 1;
+
+       if (byteswap)
+               byteswap_uint64_array(lr, sizeof (*lr));
+
+       return (do_zfs_replay_rename(zfsvfs, lr, sname, tname, RENAME_EXCHANGE,
+           NULL));
+#else
+       return (SET_ERROR(ENOTSUP));
+#endif
+}
+
+static int
+zfs_replay_rename_whiteout(void *arg1, void *arg2, boolean_t byteswap)
+{
+#ifdef __linux__
+       zfsvfs_t *zfsvfs = arg1;
+       lr_rename_whiteout_t *lr = arg2;
+       int error;
+       /* sname and tname follow lr_rename_whiteout_t */
+       char *sname = (char *)(lr + 1);
+       char *tname = sname + strlen(sname) + 1;
+       /* For the whiteout file. */
+       xvattr_t xva;
+       uint64_t objid;
+       uint64_t dnodesize;
+
+       if (byteswap)
+               byteswap_uint64_array(lr, sizeof (*lr));
+
+       objid = LR_FOID_GET_OBJ(lr->lr_wfoid);
+       dnodesize = LR_FOID_GET_SLOTS(lr->lr_wfoid) << DNODE_SHIFT;
+
+       xva_init(&xva);
+       zfs_init_vattr(&xva.xva_vattr, ATTR_MODE | ATTR_UID | ATTR_GID,
+           lr->lr_wmode, lr->lr_wuid, lr->lr_wgid, lr->lr_wrdev, objid);
+
+       /*
+        * As with TX_CREATE, RENAME_WHITEOUT ends up in zfs_mknode(), which
+        * assigns the object's creation time, generation number, and dnode
+        * slot count. The generic zfs_rename() has no concept of these
+        * attributes, so we smuggle the values inside the vattr's otherwise
+        * unused va_ctime, va_nblocks, and va_fsid fields.
+        */
+       ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_wcrtime);
+       xva.xva_vattr.va_nblocks = lr->lr_wgen;
+       xva.xva_vattr.va_fsid = dnodesize;
+
+       error = dnode_try_claim(zfsvfs->z_os, objid, dnodesize >> DNODE_SHIFT);
+       if (error)
+               return (error);
+
+       return (do_zfs_replay_rename(zfsvfs, &lr->lr_rename, sname, tname,
+           RENAME_WHITEOUT, &xva.xva_vattr));
+#else
+       return (SET_ERROR(ENOTSUP));
+#endif
+}
+
 static int
 zfs_replay_write(void *arg1, void *arg2, boolean_t byteswap)
 {
@@ -1069,4 +1153,6 @@ zil_replay_func_t *const zfs_replay_vector[TX_MAX_TYPE] = {
        zfs_replay_create_acl,  /* TX_MKDIR_ACL_ATTR */
        zfs_replay_write2,      /* TX_WRITE2 */
        zfs_replay_setsaxattr,  /* TX_SETSAXATTR */
+       zfs_replay_rename_exchange,     /* TX_RENAME_EXCHANGE */
+       zfs_replay_rename_whiteout,     /* TX_RENAME_WHITEOUT */
 };
index 6bb99c4b1cdf53789a3e36986a12a5e47511b22b..23afc8a40bb4fd283811d1962ab42cad25e32be1 100644 (file)
@@ -759,11 +759,9 @@ zil_commit_activate_saxattr_feature(zilog_t *zilog)
        uint64_t txg = 0;
        dmu_tx_t *tx = NULL;
 
-       if (spa_feature_is_enabled(zilog->zl_spa,
-           SPA_FEATURE_ZILSAXATTR) &&
+       if (spa_feature_is_enabled(zilog->zl_spa, SPA_FEATURE_ZILSAXATTR) &&
            dmu_objset_type(zilog->zl_os) != DMU_OST_ZVOL &&
-           !dsl_dataset_feature_is_active(ds,
-           SPA_FEATURE_ZILSAXATTR)) {
+           !dsl_dataset_feature_is_active(ds, SPA_FEATURE_ZILSAXATTR)) {
                tx = dmu_tx_create(zilog->zl_os);
                VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
                dsl_dataset_dirty(ds, tx);
index be8ee34f27ae4496db081c7c71c4ceb4eb5f4ca9..20578a8223b2892b9d7b626e561ae2061b55efe1 100644 (file)
@@ -514,6 +514,8 @@ zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = {
        zvol_replay_err,        /* TX_MKDIR_ACL_ATTR */
        zvol_replay_err,        /* TX_WRITE2 */
        zvol_replay_err,        /* TX_SETSAXATTR */
+       zvol_replay_err,        /* TX_RENAME_EXCHANGE */
+       zvol_replay_err,        /* TX_RENAME_WHITEOUT */
 };
 
 /*
index 21e0f882dc4029c78b0b8b9b251427b9044fd68f..13f7efd96bd3e9d62ec19711bc565adf9b1624da 100644 (file)
@@ -157,6 +157,10 @@ tags = ['functional', 'projectquota']
 tests = ['read_dos_attrs_001', 'write_dos_attrs_001']
 tags = ['functional', 'dos_attributes']
 
+[tests/functional/renameat2:Linux]
+tests = ['renameat2_noreplace', 'renameat2_exchange', 'renameat2_whiteout']
+tags = ['functional', 'renameat2']
+
 [tests/functional/rsend:Linux]
 tests = ['send_realloc_dnode_size', 'send_encrypted_files']
 tags = ['functional', 'rsend']
index e7d338fcf8a98ae06a6249ba283b214549575106..1cebf50827b9776e0ffcb7d6d645498fa05a9bbb 100755 (executable)
@@ -69,6 +69,11 @@ exec_reason = 'Test user execute permissions required for utilities'
 #
 python_deps_reason = 'Python modules missing: python3-cffi'
 
+#
+# Some tests require that the kernel supports the renameat2 syscall.
+#
+renameat2_reason = 'Kernel renameat2 support required'
+
 #
 # Some tests require the O_TMPFILE flag which was first introduced in the
 # 3.11 kernel.
@@ -231,6 +236,7 @@ maybe = {
     'pool_checkpoint/checkpoint_discard_busy': ['FAIL', 11946],
     'projectquota/setup': ['SKIP', exec_reason],
     'removal/removal_condense_export': ['FAIL', known_reason],
+    'renameat2/setup': ['SKIP', renameat2_reason],
     'reservation/reservation_008_pos': ['FAIL', 7741],
     'reservation/reservation_018_pos': ['FAIL', 5642],
     'snapshot/clone_001_pos': ['FAIL', known_reason],
index 0ec450e248db97a6b9c76ac20c9939286977620c..f68f580728189afeeb7c125842218955fe548672 100644 (file)
@@ -27,6 +27,7 @@
 /randwritecomp
 /read_dos_attributes
 /readmmap
+/renameat2
 /rename_dir
 /rm_lnkcnt_zero_file
 /send_doall
index 673a18b4c083bf8b12d1c2571982266cab791890..066abb6ce3b52702f6971aedaecfea4dcca48cee 100644 (file)
@@ -112,10 +112,10 @@ scripts_zfs_tests_bin_PROGRAMS += %D%/edonr_test %D%/skein_test \
 %C%_edonr_test_LDADD = $(%C%_skein_test_LDADD)
 %C%_blake3_test_LDADD = $(%C%_skein_test_LDADD)
 
-
 if BUILD_LINUX
 scripts_zfs_tests_bin_PROGRAMS += %D%/getversion
 scripts_zfs_tests_bin_PROGRAMS += %D%/user_ns_exec
+scripts_zfs_tests_bin_PROGRAMS += %D%/renameat2
 scripts_zfs_tests_bin_PROGRAMS += %D%/xattrtest
 scripts_zfs_tests_bin_PROGRAMS += %D%/zed_fd_spill-zedlet
 scripts_zfs_tests_bin_PROGRAMS += %D%/idmap_util
@@ -127,7 +127,6 @@ scripts_zfs_tests_bin_PROGRAMS  += %D%/read_dos_attributes %D%/write_dos_attribu
 %C%_read_dos_attributes_SOURCES  = %D%/linux_dos_attributes/read_dos_attributes.c
 %C%_write_dos_attributes_SOURCES = %D%/linux_dos_attributes/write_dos_attributes.c
 
-
 scripts_zfs_tests_bin_PROGRAMS += %D%/randfree_file
 %C%_randfree_file_SOURCES       = %D%/file/randfree_file.c
 
diff --git a/tests/zfs-tests/cmd/renameat2.c b/tests/zfs-tests/cmd/renameat2.c
new file mode 100644 (file)
index 0000000..a9d0a8b
--- /dev/null
@@ -0,0 +1,128 @@
+/* SPDX-License-Identifier: CDDL-1.0 OR MPL-2.0 */
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (C) 2019 Aleksa Sarai <cyphar@cyphar.com>
+ * Copyright (C) 2019 SUSE LLC
+ */
+
+/*
+ * mv(1) doesn't currently support RENAME_{EXCHANGE,WHITEOUT} so this is a very
+ * simple renameat2(2) wrapper for the OpenZFS self-tests.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/syscall.h>
+
+#ifndef SYS_renameat2
+#ifdef __NR_renameat2
+#define        SYS_renameat2 __NR_renameat2
+#elif defined(__x86_64__)
+#define        SYS_renameat2 316
+#elif defined(__i386__)
+#define        SYS_renameat2 353
+#elif defined(__arm__) || defined(__aarch64__)
+#define        SYS_renameat2 382
+#else
+#error "SYS_renameat2 not known for this architecture."
+#endif
+#endif
+
+#ifndef RENAME_NOREPLACE
+#define        RENAME_NOREPLACE        (1 << 0) /* Don't overwrite target */
+#endif
+#ifndef RENAME_EXCHANGE
+#define        RENAME_EXCHANGE         (1 << 1) /* Exchange source and dest */
+#endif
+#ifndef RENAME_WHITEOUT
+#define        RENAME_WHITEOUT         (1 << 2) /* Whiteout source */
+#endif
+
+/* glibc doesn't provide renameat2 wrapper, let's use our own */
+static int
+sys_renameat2(int olddirfd, const char *oldpath,
+    int newdirfd, const char *newpath, unsigned int flags)
+{
+       int ret = syscall(SYS_renameat2, olddirfd, oldpath, newdirfd, newpath,
+           flags);
+       return ((ret < 0) ? -errno : ret);
+}
+
+static void
+usage(void)
+{
+       fprintf(stderr, "usage: renameat2 [-Cnwx] src dst\n");
+       exit(1);
+}
+
+static void
+check(void)
+{
+       int err = sys_renameat2(AT_FDCWD, ".", AT_FDCWD, ".", RENAME_EXCHANGE);
+       exit(err == -ENOSYS);
+}
+
+int
+main(int argc, char **argv)
+{
+       char *src, *dst;
+       int ch, err;
+       unsigned int flags = 0;
+
+       while ((ch = getopt(argc, argv, "Cnwx")) >= 0) {
+               switch (ch) {
+                       case 'C':
+                               check();
+                               break;
+                       case 'n':
+                               flags |= RENAME_NOREPLACE;
+                               break;
+                       case 'w':
+                               flags |= RENAME_WHITEOUT;
+                               break;
+                       case 'x':
+                               flags |= RENAME_EXCHANGE;
+                               break;
+                       default:
+                               usage();
+                               break;
+               }
+       }
+
+       argc -= optind;
+       argv += optind;
+
+       if (argc != 2)
+               usage();
+       src = argv[0];
+       dst = argv[1];
+
+       err = sys_renameat2(AT_FDCWD, src, AT_FDCWD, dst, flags);
+       if (err < 0)
+               fprintf(stderr, "renameat2: %s", strerror(-err));
+       return (err != 0);
+}
index 30514361ad5749f655779b966bbd131bd41d2712..b3cfe149ffa744d41cac908b7aa430b5c67da836 100644 (file)
@@ -208,6 +208,7 @@ export ZFSTEST_FILES='badsend
     randwritecomp
     readmmap
     read_dos_attributes
+    renameat2
     rename_dir
     rm_lnkcnt_zero_file
     send_doall
diff --git a/tests/zfs-tests/tests/functional/renameat2/Makefile.am b/tests/zfs-tests/tests/functional/renameat2/Makefile.am
new file mode 100644 (file)
index 0000000..bd8d6c9
--- /dev/null
@@ -0,0 +1,7 @@
+pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/renameat2
+dist_pkgdata_SCRIPTS = \
+       setup.ksh \
+       cleanup.ksh \
+       renameat2_noreplace.ksh \
+       renameat2_exchange.ksh \
+       renameat2_whiteout.ksh
diff --git a/tests/zfs-tests/tests/functional/renameat2/cleanup.ksh b/tests/zfs-tests/tests/functional/renameat2/cleanup.ksh
new file mode 100755 (executable)
index 0000000..3166bd6
--- /dev/null
@@ -0,0 +1,34 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+
+#
+# Copyright (c) 2013 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+default_cleanup
diff --git a/tests/zfs-tests/tests/functional/renameat2/renameat2_exchange.ksh b/tests/zfs-tests/tests/functional/renameat2/renameat2_exchange.ksh
new file mode 100755 (executable)
index 0000000..94e5623
--- /dev/null
@@ -0,0 +1,61 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (C) 2019 Aleksa Sarai <cyphar@cyphar.com>
+# Copyright (C) 2019 SUSE LLC
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+verify_runnable "both"
+
+function cleanup
+{
+       log_must rm -rf $TESTDIR/*
+}
+
+log_assert "ZFS supports RENAME_EXCHANGE."
+log_onexit cleanup
+
+cd $TESTDIR
+echo "foo" > foo
+echo "bar" > bar
+
+# Self-exchange is a no-op.
+log_must renameat2 -x foo foo
+log_must grep '^foo$' foo
+
+# Basic exchange.
+log_must renameat2 -x foo bar
+log_must grep '^bar$' foo
+log_must grep '^foo$' bar
+
+# And exchange back.
+log_must renameat2 -x foo bar
+log_must grep '^foo$' foo
+log_must grep '^bar$' bar
+
+# Exchange with a bad path should fail.
+log_mustnot renameat2 -x bar baz
+
+log_pass "ZFS supports RENAME_EXCHANGE as expected."
diff --git a/tests/zfs-tests/tests/functional/renameat2/renameat2_noreplace.ksh b/tests/zfs-tests/tests/functional/renameat2/renameat2_noreplace.ksh
new file mode 100755 (executable)
index 0000000..d75b94f
--- /dev/null
@@ -0,0 +1,51 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (C) 2019 Aleksa Sarai <cyphar@cyphar.com>
+# Copyright (C) 2019 SUSE LLC
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+verify_runnable "both"
+
+function cleanup
+{
+       log_must rm -rf $TESTDIR/*
+}
+
+log_assert "ZFS supports RENAME_NOREPLACE."
+log_onexit cleanup
+
+cd $TESTDIR
+touch foo bar
+
+# Clobbers should always fail.
+log_mustnot renameat2 -n foo foo
+log_mustnot renameat2 -n foo bar
+log_mustnot renameat2 -n bar foo
+
+# Regular renames should succeed.
+log_must renameat2 -n bar baz
+
+log_pass "ZFS supports RENAME_NOREPLACE as expected."
diff --git a/tests/zfs-tests/tests/functional/renameat2/renameat2_whiteout.ksh b/tests/zfs-tests/tests/functional/renameat2/renameat2_whiteout.ksh
new file mode 100755 (executable)
index 0000000..8ecb074
--- /dev/null
@@ -0,0 +1,50 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (C) 2019 Aleksa Sarai <cyphar@cyphar.com>
+# Copyright (C) 2019 SUSE LLC
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+verify_runnable "both"
+
+function cleanup
+{
+       log_must rm -rf $TESTDIR/*
+}
+
+log_assert "ZFS supports RENAME_WHITEOUT."
+log_onexit cleanup
+
+cd $TESTDIR
+echo "whiteout" > whiteout
+
+# Straight-forward rename-with-whiteout.
+log_must renameat2 -w whiteout new
+# Check new file.
+log_must grep '^whiteout$' new
+# Check that the whiteout is actually a {0,0} char device.
+log_must grep '^character special file:0:0$' <<<"$(stat -c '%F:%t:%T' whiteout)"
+
+log_pass "ZFS supports RENAME_WHITEOUT as expected."
diff --git a/tests/zfs-tests/tests/functional/renameat2/setup.ksh b/tests/zfs-tests/tests/functional/renameat2/setup.ksh
new file mode 100755 (executable)
index 0000000..b8c26d5
--- /dev/null
@@ -0,0 +1,37 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (C) 2019 Aleksa Sarai <cyphar@cyphar.com>
+# Copyright (C) 2019 SUSE LLC
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+if ! is_linux ; then
+       log_unsupported "renameat2 is linux-only"
+elif ! renameat2 -C ; then
+       log_unsupported "renameat2 not supported on this (pre-3.15) linux kernel"
+fi
+
+DISK=${DISKS%% *}
+default_setup $DISK
index eddecbc2db7ec37306c08427199effd15f1c2bd1..8f3585a5997f5b5cc04d6f4700fe67bfa1e0cb8b 100755 (executable)
@@ -175,6 +175,29 @@ log_must ln /$TESTPOOL/$TESTFS/link_and_unlink \
    /$TESTPOOL/$TESTFS/link_and_unlink.link
 log_must rm /$TESTPOOL/$TESTFS/link_and_unlink.link
 
+# We can't test RENAME_* flags without renameat2(2) support.
+if ! is_linux ; then
+       log_note "renameat2 is linux-only"
+elif ! renameat2 -C ; then
+       log_note "renameat2 not supported on this (pre-3.15) linux kernel"
+else
+       # TX_RENAME_EXCHANGE
+       log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/xchg-a bs=1k count=1
+       log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/xchg-b bs=1k count=1
+       log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/xchg-c bs=1k count=1
+       log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/xchg-d bs=1k count=1
+       # rotate the files around
+       log_must renameat2 -x /$TESTPOOL/$TESTFS/xchg-{a,b}
+       log_must renameat2 -x /$TESTPOOL/$TESTFS/xchg-{b,c}
+       log_must renameat2 -x /$TESTPOOL/$TESTFS/xchg-{c,a}
+       # exchange same path
+       log_must renameat2 -x /$TESTPOOL/$TESTFS/xchg-{d,d}
+
+       # TX_RENAME_WHITEOUT
+       log_must mkfile 1k /$TESTPOOL/$TESTFS/whiteout
+       log_must renameat2 -w /$TESTPOOL/$TESTFS/whiteout{,-moved}
+fi
+
 #
 # 4. Copy TESTFS to temporary location (TESTDIR/copy)
 #