module/os/linux/zfs/zfs_vnops_os.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or https://opensource.org/licenses/CDDL-1.0.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  25  * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
  26  * Copyright 2017 Nexenta Systems, Inc.
  27  */
  28
  29 /* Portions Copyright 2007 Jeremy Teo */
  30 /* Portions Copyright 2010 Robert Milkowski */
  31
  32
  33 #include <sys/types.h>
  34 #include <sys/param.h>
  35 #include <sys/time.h>
  36 #include <sys/sysmacros.h>
  37 #include <sys/vfs.h>
  38 #include <sys/file.h>
  39 #include <sys/stat.h>
  40 #include <sys/kmem.h>
  41 #include <sys/taskq.h>
  42 #include <sys/uio.h>
  43 #include <sys/vmsystm.h>
  44 #include <sys/atomic.h>
  45 #include <sys/pathname.h>
  46 #include <sys/cmn_err.h>
  47 #include <sys/errno.h>
  48 #include <sys/zfs_dir.h>
  49 #include <sys/zfs_acl.h>
  50 #include <sys/zfs_ioctl.h>
  51 #include <sys/fs/zfs.h>
  52 #include <sys/dmu.h>
  53 #include <sys/dmu_objset.h>
  54 #include <sys/spa.h>
  55 #include <sys/txg.h>
  56 #include <sys/dbuf.h>
  57 #include <sys/zap.h>
  58 #include <sys/sa.h>
  59 #include <sys/policy.h>
  60 #include <sys/sunddi.h>
  61 #include <sys/sid.h>
  62 #include <sys/zfs_ctldir.h>
  63 #include <sys/zfs_fuid.h>
  64 #include <sys/zfs_quota.h>
  65 #include <sys/zfs_sa.h>
  66 #include <sys/zfs_vnops.h>
  67 #include <sys/zfs_rlock.h>
  68 #include <sys/cred.h>
  69 #include <sys/zpl.h>
  70 #include <sys/zil.h>
  71 #include <sys/sa_impl.h>
  72
  73 /*
  74  * Programming rules.
  75  *
  76  * Each vnode op performs some logical unit of work.  To do this, the ZPL must
  77  * properly lock its in-core state, create a DMU transaction, do the work,
  78  * record this work in the intent log (ZIL), commit the DMU transaction,
  79  * and wait for the intent log to commit if it is a synchronous operation.
  80  * Moreover, the vnode ops must work in both normal and log replay context.
  81  * The ordering of events is important to avoid deadlocks and references
  82  * to freed memory.  The example below illustrates the following Big Rules:
  83  *
  84  *  (1) A check must be made in each zfs thread for a mounted file system.
  85  *      This is done avoiding races using zfs_enter(zfsvfs).
  86  *      A zfs_exit(zfsvfs) is needed before all returns.  Any znodes
  87  *      must be checked with zfs_verify_zp(zp).  Both of these macros
  88  *      can return EIO from the calling function.
  89  *
  90  *  (2) zrele() should always be the last thing except for zil_commit() (if
  91  *      necessary) and zfs_exit(). This is for 3 reasons: First, if it's the
  92  *      last reference, the vnode/znode can be freed, so the zp may point to
  93  *      freed memory.  Second, the last reference will call zfs_zinactive(),
  94  *      which may induce a lot of work -- pushing cached pages (which acquires
  95  *      range locks) and syncing out cached atime changes.  Third,
  96  *      zfs_zinactive() may require a new tx, which could deadlock the system
  97  *      if you were already holding one. This deadlock occurs because the tx
  98  *      currently being operated on prevents a txg from syncing, which
  99  *      prevents the new tx from progressing, resulting in a deadlock.  If you
 100  *      must call zrele() within a tx, use zfs_zrele_async(). Note that iput()
 101  *      is a synonym for zrele().
 102  *
 103  *  (3) All range locks must be grabbed before calling dmu_tx_assign(),
 104  *      as they can span dmu_tx_assign() calls.
 105  *
 106  *  (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
 107  *      dmu_tx_assign().  This is critical because we don't want to block
 108  *      while holding locks.
 109  *
 110  *      If no ZPL locks are held (aside from zfs_enter()), use TXG_WAIT.  This
 111  *      reduces lock contention and CPU usage when we must wait (note that if
 112  *      throughput is constrained by the storage, nearly every transaction
 113  *      must wait).
 114  *
 115  *      Note, in particular, that if a lock is sometimes acquired before
 116  *      the tx assigns, and sometimes after (e.g. z_lock), then failing
 117  *      to use a non-blocking assign can deadlock the system.  The scenario:
 118  *
 119  *      Thread A has grabbed a lock before calling dmu_tx_assign().
 120  *      Thread B is in an already-assigned tx, and blocks for this lock.
 121  *      Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
 122  *      forever, because the previous txg can't quiesce until B's tx commits.
 123  *
 124  *      If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
 125  *      then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
 126  *      calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
 127  *      to indicate that this operation has already called dmu_tx_wait().
 128  *      This will ensure that we don't retry forever, waiting a short bit
 129  *      each time.
 130  *
 131  *  (5) If the operation succeeded, generate the intent log entry for it
 132  *      before dropping locks.  This ensures that the ordering of events
 133  *      in the intent log matches the order in which they actually occurred.
 134  *      During ZIL replay the zfs_log_* functions will update the sequence
 135  *      number to indicate the zil transaction has replayed.
 136  *
 137  *  (6) At the end of each vnode op, the DMU tx must always commit,
 138  *      regardless of whether there were any errors.
 139  *
 140  *  (7) After dropping all locks, invoke zil_commit(zilog, foid)
 141  *      to ensure that synchronous semantics are provided when necessary.
 142  *
 143  * In general, this is how things should be ordered in each vnode op:
 144  *
 145  *      zfs_enter(zfsvfs);              // exit if unmounted
 146  * top:
 147  *      zfs_dirent_lock(&dl, ...)       // lock directory entry (may igrab())
 148  *      rw_enter(...);                  // grab any other locks you need
 149  *      tx = dmu_tx_create(...);        // get DMU tx
 150  *      dmu_tx_hold_*();                // hold each object you might modify
 151  *      error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 152  *      if (error) {
 153  *              rw_exit(...);           // drop locks
 154  *              zfs_dirent_unlock(dl);  // unlock directory entry
 155  *              zrele(...);             // release held znodes
 156  *              if (error == ERESTART) {
 157  *                      waited = B_TRUE;
 158  *                      dmu_tx_wait(tx);
 159  *                      dmu_tx_abort(tx);
 160  *                      goto top;
 161  *              }
 162  *              dmu_tx_abort(tx);       // abort DMU tx
 163  *              zfs_exit(zfsvfs);       // finished in zfs
 164  *              return (error);         // really out of space
 165  *      }
 166  *      error = do_real_work();         // do whatever this VOP does
 167  *      if (error == 0)
 168  *              zfs_log_*(...);         // on success, make ZIL entry
 169  *      dmu_tx_commit(tx);              // commit DMU tx -- error or not
 170  *      rw_exit(...);                   // drop locks
 171  *      zfs_dirent_unlock(dl);          // unlock directory entry
 172  *      zrele(...);                     // release held znodes
 173  *      zil_commit(zilog, foid);        // synchronous when necessary
 174  *      zfs_exit(zfsvfs);               // finished in zfs
 175  *      return (error);                 // done, report error
 176  */
 177 int
 178 zfs_open(struct inode *ip, int mode, int flag, cred_t *cr)
 179 {
 180         (void) cr;
 181         znode_t *zp = ITOZ(ip);
 182         zfsvfs_t *zfsvfs = ITOZSB(ip);
 183         int error;
 184
 185         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 186                 return (error);
 187
 188         /* Honor ZFS_APPENDONLY file attribute */
 189         if (blk_mode_is_open_write(mode) && (zp->z_pflags & ZFS_APPENDONLY) &&
 190             ((flag & O_APPEND) == 0)) {
 191                 zfs_exit(zfsvfs, FTAG);
 192                 return (SET_ERROR(EPERM));
 193         }
 194
 195         /* Keep a count of the synchronous opens in the znode */
 196         if (flag & O_SYNC)
 197                 atomic_inc_32(&zp->z_sync_cnt);
 198
 199         zfs_exit(zfsvfs, FTAG);
 200         return (0);
 201 }
 202
 203 int
 204 zfs_close(struct inode *ip, int flag, cred_t *cr)
 205 {
 206         (void) cr;
 207         znode_t *zp = ITOZ(ip);
 208         zfsvfs_t *zfsvfs = ITOZSB(ip);
 209         int error;
 210
 211         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 212                 return (error);
 213
 214         /* Decrement the synchronous opens in the znode */
 215         if (flag & O_SYNC)
 216                 atomic_dec_32(&zp->z_sync_cnt);
 217
 218         zfs_exit(zfsvfs, FTAG);
 219         return (0);
 220 }
 221
 222 #if defined(_KERNEL)
 223
 224 static int zfs_fillpage(struct inode *ip, struct page *pp);
 225
 226 /*
 227  * When a file is memory mapped, we must keep the IO data synchronized
 228  * between the DMU cache and the memory mapped pages.  Update all mapped
 229  * pages with the contents of the coresponding dmu buffer.
 230  */
 231 void
 232 update_pages(znode_t *zp, int64_t start, int len, objset_t *os)
 233 {
 234         struct address_space *mp = ZTOI(zp)->i_mapping;
 235         int64_t off = start & (PAGE_SIZE - 1);
 236
 237         for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
 238                 uint64_t nbytes = MIN(PAGE_SIZE - off, len);
 239
 240                 struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT);
 241                 if (pp) {
 242                         if (mapping_writably_mapped(mp))
 243                                 flush_dcache_page(pp);
 244
 245                         void *pb = kmap(pp);
 246                         int error = dmu_read(os, zp->z_id, start + off,
 247                             nbytes, pb + off, DMU_READ_PREFETCH);
 248                         kunmap(pp);
 249
 250                         if (error) {
 251                                 SetPageError(pp);
 252                                 ClearPageUptodate(pp);
 253                         } else {
 254                                 ClearPageError(pp);
 255                                 SetPageUptodate(pp);
 256
 257                                 if (mapping_writably_mapped(mp))
 258                                         flush_dcache_page(pp);
 259
 260                                 mark_page_accessed(pp);
 261                         }
 262
 263                         unlock_page(pp);
 264                         put_page(pp);
 265                 }
 266
 267                 len -= nbytes;
 268                 off = 0;
 269         }
 270 }
 271
 272 /*
 273  * When a file is memory mapped, we must keep the I/O data synchronized
 274  * between the DMU cache and the memory mapped pages.  Preferentially read
 275  * from memory mapped pages, otherwise fallback to reading through the dmu.
 276  */
 277 int
 278 mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
 279 {
 280         struct inode *ip = ZTOI(zp);
 281         struct address_space *mp = ip->i_mapping;
 282         int64_t start = uio->uio_loffset;
 283         int64_t off = start & (PAGE_SIZE - 1);
 284         int len = nbytes;
 285         int error = 0;
 286
 287         for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
 288                 uint64_t bytes = MIN(PAGE_SIZE - off, len);
 289
 290                 struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT);
 291                 if (pp) {
 292                         /*
 293                          * If filemap_fault() retries there exists a window
 294                          * where the page will be unlocked and not up to date.
 295                          * In this case we must try and fill the page.
 296                          */
 297                         if (unlikely(!PageUptodate(pp))) {
 298                                 error = zfs_fillpage(ip, pp);
 299                                 if (error) {
 300                                         unlock_page(pp);
 301                                         put_page(pp);
 302                                         return (error);
 303                                 }
 304                         }
 305
 306                         ASSERT(PageUptodate(pp) || PageDirty(pp));
 307
 308                         unlock_page(pp);
 309
 310                         void *pb = kmap(pp);
 311                         error = zfs_uiomove(pb + off, bytes, UIO_READ, uio);
 312                         kunmap(pp);
 313
 314                         if (mapping_writably_mapped(mp))
 315                                 flush_dcache_page(pp);
 316
 317                         mark_page_accessed(pp);
 318                         put_page(pp);
 319                 } else {
 320                         error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
 321                             uio, bytes);
 322                 }
 323
 324                 len -= bytes;
 325                 off = 0;
 326
 327                 if (error)
 328                         break;
 329         }
 330
 331         return (error);
 332 }
 333 #endif /* _KERNEL */
 334
/*
 * File-scope tunable, initialised to DMU_MAX_DELETEBLKCNT.
 * NOTE(review): not referenced in this part of the file; presumably a
 * block-count threshold used by the delete path -- confirm at its users.
 */
static unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT;
 336
 337 /*
 338  * Write the bytes to a file.
 339  *
 340  *      IN:     zp      - znode of file to be written to
 341  *              data    - bytes to write
 342  *              len     - number of bytes to write
 343  *              pos     - offset to start writing at
 344  *
 345  *      OUT:    resid   - remaining bytes to write
 346  *
 347  *      RETURN: 0 if success
 348  *              positive error code if failure.  EIO is returned
 349  *              for a short write when residp isn't provided.
 350  *
 351  * Timestamps:
 352  *      zp - ctime|mtime updated if byte count > 0
 353  */
 354 int
 355 zfs_write_simple(znode_t *zp, const void *data, size_t len,
 356     loff_t pos, size_t *residp)
 357 {
 358         fstrans_cookie_t cookie;
 359         int error;
 360
 361         struct iovec iov;
 362         iov.iov_base = (void *)data;
 363         iov.iov_len = len;
 364
 365         zfs_uio_t uio;
 366         zfs_uio_iovec_init(&uio, &iov, 1, pos, UIO_SYSSPACE, len, 0);
 367
 368         cookie = spl_fstrans_mark();
 369         error = zfs_write(zp, &uio, 0, kcred);
 370         spl_fstrans_unmark(cookie);
 371
 372         if (error == 0) {
 373                 if (residp != NULL)
 374                         *residp = zfs_uio_resid(&uio);
 375                 else if (zfs_uio_resid(&uio) != 0)
 376                         error = SET_ERROR(EIO);
 377         }
 378
 379         return (error);
 380 }
 381
 382 static void
 383 zfs_rele_async_task(void *arg)
 384 {
 385         iput(arg);
 386 }
 387
 388 void
 389 zfs_zrele_async(znode_t *zp)
 390 {
 391         struct inode *ip = ZTOI(zp);
 392         objset_t *os = ITOZSB(ip)->z_os;
 393
 394         ASSERT(atomic_read(&ip->i_count) > 0);
 395         ASSERT(os != NULL);
 396
 397         /*
 398          * If decrementing the count would put us at 0, we can't do it inline
 399          * here, because that would be synchronous. Instead, dispatch an iput
 400          * to run later.
 401          *
 402          * For more information on the dangers of a synchronous iput, see the
 403          * header comment of this file.
 404          */
 405         if (!atomic_add_unless(&ip->i_count, -1, 1)) {
 406                 VERIFY(taskq_dispatch(dsl_pool_zrele_taskq(dmu_objset_pool(os)),
 407                     zfs_rele_async_task, ip, TQ_SLEEP) != TASKQID_INVALID);
 408         }
 409 }
 410
 411
 412 /*
 413  * Lookup an entry in a directory, or an extended attribute directory.
 414  * If it exists, return a held inode reference for it.
 415  *
 416  *      IN:     zdp     - znode of directory to search.
 417  *              nm      - name of entry to lookup.
 418  *              flags   - LOOKUP_XATTR set if looking for an attribute.
 419  *              cr      - credentials of caller.
 420  *              direntflags - directory lookup flags
 421  *              realpnp - returned pathname.
 422  *
 423  *      OUT:    zpp     - znode of located entry, NULL if not found.
 424  *
 425  *      RETURN: 0 on success, error code on failure.
 426  *
 427  * Timestamps:
 428  *      NA
 429  */
 430 int
 431 zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr,
 432     int *direntflags, pathname_t *realpnp)
 433 {
 434         zfsvfs_t *zfsvfs = ZTOZSB(zdp);
 435         int error = 0;
 436
 437         /*
 438          * Fast path lookup, however we must skip DNLC lookup
 439          * for case folding or normalizing lookups because the
 440          * DNLC code only stores the passed in name.  This means
 441          * creating 'a' and removing 'A' on a case insensitive
 442          * file system would work, but DNLC still thinks 'a'
 443          * exists and won't let you create it again on the next
 444          * pass through fast path.
 445          */
 446         if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
 447
 448                 if (!S_ISDIR(ZTOI(zdp)->i_mode)) {
 449                         return (SET_ERROR(ENOTDIR));
 450                 } else if (zdp->z_sa_hdl == NULL) {
 451                         return (SET_ERROR(EIO));
 452                 }
 453
 454                 if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
 455                         error = zfs_fastaccesschk_execute(zdp, cr);
 456                         if (!error) {
 457                                 *zpp = zdp;
 458                                 zhold(*zpp);
 459                                 return (0);
 460                         }
 461                         return (error);
 462                 }
 463         }
 464
 465         if ((error = zfs_enter_verify_zp(zfsvfs, zdp, FTAG)) != 0)
 466                 return (error);
 467
 468         *zpp = NULL;
 469
 470         if (flags & LOOKUP_XATTR) {
 471                 /*
 472                  * We don't allow recursive attributes..
 473                  * Maybe someday we will.
 474                  */
 475                 if (zdp->z_pflags & ZFS_XATTR) {
 476                         zfs_exit(zfsvfs, FTAG);
 477                         return (SET_ERROR(EINVAL));
 478                 }
 479
 480                 if ((error = zfs_get_xattrdir(zdp, zpp, cr, flags))) {
 481                         zfs_exit(zfsvfs, FTAG);
 482                         return (error);
 483                 }
 484
 485                 /*
 486                  * Do we have permission to get into attribute directory?
 487                  */
 488
 489                 if ((error = zfs_zaccess(*zpp, ACE_EXECUTE, 0,
 490                     B_TRUE, cr, zfs_init_idmap))) {
 491                         zrele(*zpp);
 492                         *zpp = NULL;
 493                 }
 494
 495                 zfs_exit(zfsvfs, FTAG);
 496                 return (error);
 497         }
 498
 499         if (!S_ISDIR(ZTOI(zdp)->i_mode)) {
 500                 zfs_exit(zfsvfs, FTAG);
 501                 return (SET_ERROR(ENOTDIR));
 502         }
 503
 504         /*
 505          * Check accessibility of directory.
 506          */
 507
 508         if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr,
 509             zfs_init_idmap))) {
 510                 zfs_exit(zfsvfs, FTAG);
 511                 return (error);
 512         }
 513
 514         if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
 515             NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 516                 zfs_exit(zfsvfs, FTAG);
 517                 return (SET_ERROR(EILSEQ));
 518         }
 519
 520         error = zfs_dirlook(zdp, nm, zpp, flags, direntflags, realpnp);
 521         if ((error == 0) && (*zpp))
 522                 zfs_znode_update_vfs(*zpp);
 523
 524         zfs_exit(zfsvfs, FTAG);
 525         return (error);
 526 }
 527
/*
 * Attempt to create a new entry in a directory.  If the entry
 * already exists, truncate the file if permissible, else return
 * an error.  Return the znode of the created or trunc'd file.
 *
 *      IN:     dzp     - znode of directory to put new file entry in.
 *              name    - name of new file entry.  An empty string refers
 *                        to the directory itself; NULL yields EINVAL.
 *              vap     - attributes of new file.
 *              excl    - flag indicating exclusive or non-exclusive mode.
 *              mode    - mode to open file with.
 *              cr      - credentials of caller.
 *              flag    - file flag.
 *              vsecp   - ACL to be set
 *              mnt_ns  - user namespace of the mount
 *
 *      OUT:    zpp     - znode of created or trunc'd entry.  Reset to NULL
 *                        before the lookup and assigned only on success;
 *                        the early argument/validation failures return
 *                        without touching it.
 *
 *      RETURN: 0 on success, error code on failure.
 *
 * Timestamps:
 *      dzp - ctime|mtime updated if new entry created
 *       zp - ctime|mtime always, atime if new
 */
int
zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl,
    int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp,
    zidmap_t *mnt_ns)
{
        znode_t         *zp;
        zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
        zilog_t         *zilog;
        objset_t        *os;
        zfs_dirlock_t   *dl;
        dmu_tx_t        *tx;
        int             error;
        uid_t           uid;
        gid_t           gid;
        zfs_acl_ids_t   acl_ids;
        boolean_t       fuid_dirtied;
        /* B_TRUE while acl_ids holds state that must be freed. */
        boolean_t       have_acl = B_FALSE;
        /* Set once dmu_tx_wait() has been called for this operation. */
        boolean_t       waited = B_FALSE;
        boolean_t       skip_acl = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;

        /*
         * If we have an ephemeral id, ACL, or XVATTR then
         * make sure file system is at proper version
         */

        gid = crgetgid(cr);
        uid = crgetuid(cr);

        if (zfsvfs->z_use_fuids == B_FALSE &&
            (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
                return (SET_ERROR(EINVAL));

        if (name == NULL)
                return (SET_ERROR(EINVAL));

        if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
                return (error);
        os = zfsvfs->z_os;
        zilog = zfsvfs->z_log;

        if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
            NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
                zfs_exit(zfsvfs, FTAG);
                return (SET_ERROR(EILSEQ));
        }

        if (vap->va_mask & ATTR_XVATTR) {
                if ((error = secpolicy_xvattr((xvattr_t *)vap,
                    crgetuid(cr), cr, vap->va_mode)) != 0) {
                        zfs_exit(zfsvfs, FTAG);
                        return (error);
                }
        }

        /*
         * Retry point: control returns here after dmu_tx_assign() reported
         * ERESTART.  have_acl and waited keep their values across retries,
         * so the ACL ids are not created a second time.
         */
top:
        *zpp = NULL;
        if (*name == '\0') {
                /*
                 * Null component name refers to the directory itself.
                 */
                zhold(dzp);
                zp = dzp;
                dl = NULL;
                error = 0;
        } else {
                /* possible igrab(zp) */
                int zflg = 0;

                if (flag & FIGNORECASE)
                        zflg |= ZCILOOK;

                error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
                    NULL, NULL);
                if (error) {
                        /* acl_ids may survive from an earlier pass. */
                        if (have_acl)
                                zfs_acl_ids_free(&acl_ids);
                        /* Any failure on ".." is reported as EISDIR. */
                        if (strcmp(name, "..") == 0)
                                error = SET_ERROR(EISDIR);
                        zfs_exit(zfsvfs, FTAG);
                        return (error);
                }
        }

        if (zp == NULL) {
                uint64_t txtype;
                uint64_t projid = ZFS_DEFAULT_PROJID;

                /*
                 * Create a new file object and update the directory
                 * to reference it.
                 */
                if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, skip_acl, cr,
                    mnt_ns))) {
                        if (have_acl)
                                zfs_acl_ids_free(&acl_ids);
                        goto out;
                }

                /*
                 * We only support the creation of regular files in
                 * extended attribute directories.
                 */

                if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) {
                        if (have_acl)
                                zfs_acl_ids_free(&acl_ids);
                        error = SET_ERROR(EINVAL);
                        goto out;
                }

                /* Build the ACL ids once; a retry reuses them. */
                if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
                    cr, vsecp, &acl_ids, mnt_ns)) != 0)
                        goto out;
                have_acl = B_TRUE;

                if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
                        projid = zfs_inherit_projid(dzp);
                if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
                        zfs_acl_ids_free(&acl_ids);
                        error = SET_ERROR(EDQUOT);
                        goto out;
                }

                tx = dmu_tx_create(os);

                dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
                    ZFS_SA_BASE_ATTR_SIZE);

                fuid_dirtied = zfsvfs->z_fuid_dirty;
                if (fuid_dirtied)
                        zfs_fuid_txhold(zfsvfs, tx);
                dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
                dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
                if (!zfsvfs->z_use_sa &&
                    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
                        dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
                            0, acl_ids.z_aclp->z_acl_bytes);
                }

                /*
                 * The dirent lock is held, so the assign must not block
                 * (see rule 4 in the header comment of this file).
                 */
                error = dmu_tx_assign(tx,
                    (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
                if (error) {
                        zfs_dirent_unlock(dl);
                        if (error == ERESTART) {
                                /* acl_ids is kept for the next pass. */
                                waited = B_TRUE;
                                dmu_tx_wait(tx);
                                dmu_tx_abort(tx);
                                goto top;
                        }
                        /*
                         * Return directly instead of through "out": the
                         * dirent lock has already been dropped above.
                         */
                        zfs_acl_ids_free(&acl_ids);
                        dmu_tx_abort(tx);
                        zfs_exit(zfsvfs, FTAG);
                        return (error);
                }
                zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

                error = zfs_link_create(dl, zp, tx, ZNEW);
                if (error != 0) {
                        /*
                         * Since, we failed to add the directory entry for it,
                         * delete the newly created dnode.
                         */
                        zfs_znode_delete(zp, tx);
                        remove_inode_hash(ZTOI(zp));
                        zfs_acl_ids_free(&acl_ids);
                        dmu_tx_commit(tx);
                        goto out;
                }

                if (fuid_dirtied)
                        zfs_fuid_sync(zfsvfs, tx);

                /* Log the create before the tx commits and locks drop. */
                txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
                if (flag & FIGNORECASE)
                        txtype |= TX_CI;
                zfs_log_create(zilog, tx, txtype, dzp, zp, name,
                    vsecp, acl_ids.z_fuidp, vap);
                zfs_acl_ids_free(&acl_ids);
                dmu_tx_commit(tx);
        } else {
                int aflags = (flag & O_APPEND) ? V_APPEND : 0;

                /* Nothing is created, so leftover ACL ids are not needed. */
                if (have_acl)
                        zfs_acl_ids_free(&acl_ids);

                /*
                 * A directory entry already exists for this name.
                 */
                /*
                 * Can't truncate an existing file if in exclusive mode.
                 */
                if (excl) {
                        error = SET_ERROR(EEXIST);
                        goto out;
                }
                /*
                 * Can't open a directory for writing.
                 */
                if (S_ISDIR(ZTOI(zp)->i_mode)) {
                        error = SET_ERROR(EISDIR);
                        goto out;
                }
                /*
                 * Verify requested access to file.
                 */
                if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr,
                    mnt_ns))) {
                        goto out;
                }

                mutex_enter(&dzp->z_lock);
                dzp->z_seq++;
                mutex_exit(&dzp->z_lock);

                /*
                 * Truncate regular files if requested.
                 */
                if (S_ISREG(ZTOI(zp)->i_mode) &&
                    (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) {
                        /* we can't hold any locks when calling zfs_freesp() */
                        if (dl) {
                                zfs_dirent_unlock(dl);
                                dl = NULL;
                        }
                        error = zfs_freesp(zp, 0, 0, mode, TRUE);
                }
        }
        /*
         * Common exit: drop the dirent lock if it is still held, then either
         * release the znode (failure) or hand it to the caller (success).
         */
out:

        if (dl)
                zfs_dirent_unlock(dl);

        if (error) {
                if (zp)
                        zrele(zp);
        } else {
                zfs_znode_update_vfs(dzp);
                zfs_znode_update_vfs(zp);
                *zpp = zp;
        }

        /* With sync=always, commit the intent log before returning. */
        if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
                zil_commit(zilog, 0);

        zfs_exit(zfsvfs, FTAG);
        return (error);
}
 798
     /*
      * Create an unnamed temporary file under the directory dip.  The new
      * znode is created with IS_TMPFILE and put directly on the unlinked
      * set; no directory entry is added for it here.
      *
      *      IN:     dip     - inode of directory the file is created under.
      *              vap     - attributes of new file.
      *              excl    - unused.
      *              mode    - unused.
      *              cr      - credentials of caller.
      *              flag    - unused.
      *              vsecp   - ACL to be set
      *              mnt_ns  - user namespace of the mount
      *
      *      OUT:    ipp     - inode of created file; NULL if the create was
      *                        attempted and failed.
      *
      *      RETURN: 0 if success
      *              error code if failure
      */
 799 int
 800 zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl,
 801     int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp,
 802     zidmap_t *mnt_ns)
 803 {
 804         (void) excl, (void) mode, (void) flag;
 805         znode_t         *zp = NULL, *dzp = ITOZ(dip);
 806         zfsvfs_t        *zfsvfs = ITOZSB(dip);
 807         objset_t        *os;
 808         dmu_tx_t        *tx;
 809         int             error;
 810         uid_t           uid;
 811         gid_t           gid;
 812         zfs_acl_ids_t   acl_ids;
 813         uint64_t        projid = ZFS_DEFAULT_PROJID;
 814         boolean_t       fuid_dirtied;
 815         boolean_t       have_acl = B_FALSE;
 816         boolean_t       waited = B_FALSE;
 817
 818         /*
 819          * If we have an ephemeral id, ACL, or XVATTR then
 820          * make sure file system is at proper version
 821          */
 822
 823         gid = crgetgid(cr);
 824         uid = crgetuid(cr);
 825
 826         if (zfsvfs->z_use_fuids == B_FALSE &&
 827             (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
 828                 return (SET_ERROR(EINVAL));
 829
 830         if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
 831                 return (error);
 832         os = zfsvfs->z_os;
 833
 834         if (vap->va_mask & ATTR_XVATTR) {
 835                 if ((error = secpolicy_xvattr((xvattr_t *)vap,
 836                     crgetuid(cr), cr, vap->va_mode)) != 0) {
 837                         zfs_exit(zfsvfs, FTAG);
 838                         return (error);
 839                 }
 840         }
 841
             /* Re-entered from the ERESTART branch below. */
 842 top:
 843         *ipp = NULL;
 844
 845         /*
 846          * The caller must be allowed to add a file to dzp.  If this is
 847          * a retry, release the ACL ids kept from the earlier pass.
 848          */
 849         if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) {
 850                 if (have_acl)
 851                         zfs_acl_ids_free(&acl_ids);
 852                 goto out;
 853         }
 854
             /* Build the ACL ids once; a retry from top reuses them. */
 855         if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
 856             cr, vsecp, &acl_ids, mnt_ns)) != 0)
 857                 goto out;
 858         have_acl = B_TRUE;
 859
             /*
              * Regular files and directories use zfs_inherit_projid(dzp);
              * any other type keeps ZFS_DEFAULT_PROJID.
              */
 860         if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
 861                 projid = zfs_inherit_projid(dzp);
 862         if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
 863                 zfs_acl_ids_free(&acl_ids);
 864                 error = SET_ERROR(EDQUOT);
 865                 goto out;
 866         }
 867
 868         tx = dmu_tx_create(os);
 869
 870         dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 871             ZFS_SA_BASE_ATTR_SIZE);
             /* The new znode is added to the unlinked set below. */
 872         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 873
 874         fuid_dirtied = zfsvfs->z_fuid_dirty;
 875         if (fuid_dirtied)
 876                 zfs_fuid_txhold(zfsvfs, tx);
 877         if (!zfsvfs->z_use_sa &&
 878             acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 879                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 880                     0, acl_ids.z_aclp->z_acl_bytes);
 881         }
 882         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 883         if (error) {
 884                 if (error == ERESTART) {
                             /*
                              * dmu_tx_assign() asked for a restart: call
                              * dmu_tx_wait(), drop this tx and retry from
                              * top.  acl_ids is kept (have_acl stays set)
                              * and the retry adds TXG_NOTHROTTLE.
                              */
 885                         waited = B_TRUE;
 886                         dmu_tx_wait(tx);
 887                         dmu_tx_abort(tx);
 888                         goto top;
 889                 }
                     /* Any other failure returns directly, bypassing out. */
 890                 zfs_acl_ids_free(&acl_ids);
 891                 dmu_tx_abort(tx);
 892                 zfs_exit(zfsvfs, FTAG);
 893                 return (error);
 894         }
             /* error is 0 from here on; nothing below sets it. */
 895         zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids);
 896
 897         if (fuid_dirtied)
 898                 zfs_fuid_sync(zfsvfs, tx);
 899
 900         /* Add to unlinked set */
 901         zp->z_unlinked = B_TRUE;
 902         zfs_unlinked_add(zp, tx);
 903         zfs_acl_ids_free(&acl_ids);
 904         dmu_tx_commit(tx);
 905 out:
 906
             /*
              * Every error path that jumps here does so before zfs_mknode()
              * runs, so zp is still NULL when error is set.
              */
 907         if (error) {
 908                 if (zp)
 909                         zrele(zp);
 910         } else {
 911                 zfs_znode_update_vfs(dzp);
 912                 zfs_znode_update_vfs(zp);
 913                 *ipp = ZTOI(zp);
 914         }
 915
 916         zfs_exit(zfsvfs, FTAG);
 917         return (error);
 918 }
 919
 920 /*
 921  * Remove an entry from a directory.
 922  *
      * Directories are refused with EPERM (zfs_rmdir() must be used) and a
      * NULL name with EINVAL.  If the entry has an extended attribute
      * directory whose znode cannot be obtained, the remove fails with the
      * error from zfs_zget().
      *
 923  *      IN:     dzp     - znode of directory to remove entry from.
 924  *              name    - name of entry to remove.
 925  *              cr      - credentials of caller.
 926  *              flags   - case flags.
 927  *
 928  *      RETURN: 0 if success
 929  *              error code if failure
 930  *
 931  * Timestamps:
 932  *      dzp - ctime|mtime
 933  *       ip - ctime (if nlink > 0)
 934  */
 935
     /*
      * Zero value written to SA_ZPL_XATTR by zfs_remove() to clear the xattr
      * reference of a znode that is not in SA format (z_is_sa unset).
      */
 936 static uint64_t null_xattr = 0;
 937
 938 int
 939 zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags)
 940 {
 941         znode_t         *zp;
 942         znode_t         *xzp;
 943         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
 944         zilog_t         *zilog;
 945         uint64_t        acl_obj, xattr_obj;
 946         uint64_t        xattr_obj_unlinked = 0;
 947         uint64_t        obj = 0;
 948         uint64_t        links;
 949         zfs_dirlock_t   *dl;
 950         dmu_tx_t        *tx;
 951         boolean_t       may_delete_now, delete_now = FALSE;
 952         boolean_t       unlinked, toobig = FALSE;
 953         uint64_t        txtype;
 954         pathname_t      *realnmp = NULL;
 955         pathname_t      realnm;
 956         int             error;
 957         int             zflg = ZEXISTS;
 958         boolean_t       waited = B_FALSE;
 959
 960         if (name == NULL)
 961                 return (SET_ERROR(EINVAL));
 962
 963         if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
 964                 return (error);
 965         zilog = zfsvfs->z_log;
 966
             /*
              * realnm is allocated once here and freed on every exit path;
              * it stays allocated across ERESTART retries.
              */
 967         if (flags & FIGNORECASE) {
 968                 zflg |= ZCILOOK;
 969                 pn_alloc(&realnm);
 970                 realnmp = &realnm;
 971         }
 972
             /* Re-entered from the ERESTART branch below. */
 973 top:
 974         xattr_obj = 0;
 975         xzp = NULL;
 976         /*
 977          * Attempt to lock directory; fail if entry doesn't exist.
 978          */
 979         if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
 980             NULL, realnmp))) {
 981                 if (realnmp)
 982                         pn_free(realnmp);
 983                 zfs_exit(zfsvfs, FTAG);
 984                 return (error);
 985         }
 986
 987         if ((error = zfs_zaccess_delete(dzp, zp, cr, zfs_init_idmap))) {
 988                 goto out;
 989         }
 990
 991         /*
 992          * Need to use rmdir for removing directories.
 993          */
 994         if (S_ISDIR(ZTOI(zp)->i_mode)) {
 995                 error = SET_ERROR(EPERM);
 996                 goto out;
 997         }
 998
             /*
              * First estimate only: the same conditions are re-checked under
              * z_lock once the link has been destroyed.
              */
 999         mutex_enter(&zp->z_lock);
1000         may_delete_now = atomic_read(&ZTOI(zp)->i_count) == 1 &&
1001             !zn_has_cached_data(zp, 0, LLONG_MAX);
1002         mutex_exit(&zp->z_lock);
1003
1004         /*
1005          * We may delete the znode now, or we may put it in the unlinked set;
1006          * it depends on whether we're the last link, and on whether there are
1007          * other holds on the inode.  So we dmu_tx_hold() the right things to
1008          * allow for either case.
1009          */
1010         obj = zp->z_id;
1011         tx = dmu_tx_create(zfsvfs->z_os);
1012         dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1013         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1014         zfs_sa_upgrade_txholds(tx, zp);
1015         zfs_sa_upgrade_txholds(tx, dzp);
1016         if (may_delete_now) {
1017                 toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks;
1018                 /* if the file is too big, only hold_free a token amount */
1019                 dmu_tx_hold_free(tx, zp->z_id, 0,
1020                     (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
1021         }
1022
1023         /* are there any extended attributes? */
1024         error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1025             &xattr_obj, sizeof (xattr_obj));
1026         if (error == 0 && xattr_obj) {
1027                 error = zfs_zget(zfsvfs, xattr_obj, &xzp);
                     if (error != 0) {
                             /*
                              * The xattr directory znode could not be
                              * obtained.  An assertion alone is not enough
                              * here: in builds where ASSERT0() is a no-op,
                              * xzp would be dereferenced just below while
                              * invalid.  Drop the still-unassigned tx and
                              * fail the remove through the common exit.
                              */
                             xzp = NULL;
                             dmu_tx_abort(tx);
                             goto out;
                     }
1029                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1030                 dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
1031         }
1032
1033         mutex_enter(&zp->z_lock);
1034         if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
1035                 dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
1036         mutex_exit(&zp->z_lock);
1037
1038         /* charge as an update -- would be nice not to charge at all */
1039         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1040
1041         /*
1042          * Mark this transaction as typically resulting in a net free of space
1043          */
1044         dmu_tx_mark_netfree(tx);
1045
1046         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
1047         if (error) {
1048                 zfs_dirent_unlock(dl);
1049                 if (error == ERESTART) {
                             /*
                              * Retry from top: the dirent lock is already
                              * dropped, and zp and xzp are released so the
                              * lookup is redone.  The retry adds
                              * TXG_NOTHROTTLE.
                              */
1050                         waited = B_TRUE;
1051                         dmu_tx_wait(tx);
1052                         dmu_tx_abort(tx);
1053                         zrele(zp);
1054                         if (xzp)
1055                                 zrele(xzp);
1056                         goto top;
1057                 }
1058                 if (realnmp)
1059                         pn_free(realnmp);
1060                 dmu_tx_abort(tx);
1061                 zrele(zp);
1062                 if (xzp)
1063                         zrele(xzp);
1064                 zfs_exit(zfsvfs, FTAG);
1065                 return (error);
1066         }
1067
1068         /*
1069          * Remove the directory entry.
1070          */
1071         error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
1072
1073         if (error) {
                     /* The tx is assigned by now, so it is committed. */
1074                 dmu_tx_commit(tx);
1075                 goto out;
1076         }
1077
1078         if (unlinked) {
1079                 /*
1080                  * Hold z_lock so that we can make sure that the ACL obj
1081                  * hasn't changed.  Could have been deleted due to
1082                  * zfs_sa_upgrade().
                      *
                      * The lock taken here is released in whichever of the
                      * two branches below runs (delete_now, or unlinked).
1083                  */
1084                 mutex_enter(&zp->z_lock);
1085                 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1086                     &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
1087                 delete_now = may_delete_now && !toobig &&
1088                     atomic_read(&ZTOI(zp)->i_count) == 1 &&
1089                     !zn_has_cached_data(zp, 0, LLONG_MAX) &&
1090                     xattr_obj == xattr_obj_unlinked &&
1091                     zfs_external_acl(zp) == acl_obj;
1092                 VERIFY_IMPLY(xattr_obj_unlinked, xzp);
1093         }
1094
1095         if (delete_now) {
1096                 if (xattr_obj_unlinked) {
                             /*
                              * Zero the xattr directory's link count, put
                              * it on the unlinked set and clear zp's
                              * reference to it.
                              */
1097                         ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2);
1098                         mutex_enter(&xzp->z_lock);
1099                         xzp->z_unlinked = B_TRUE;
1100                         clear_nlink(ZTOI(xzp));
1101                         links = 0;
1102                         error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
1103                             &links, sizeof (links), tx);
1104                         ASSERT3U(error,  ==,  0);
1105                         mutex_exit(&xzp->z_lock);
1106                         zfs_unlinked_add(xzp, tx);
1107
1108                         if (zp->z_is_sa)
1109                                 error = sa_remove(zp->z_sa_hdl,
1110                                     SA_ZPL_XATTR(zfsvfs), tx);
1111                         else
1112                                 error = sa_update(zp->z_sa_hdl,
1113                                     SA_ZPL_XATTR(zfsvfs), &null_xattr,
1114                                     sizeof (uint64_t), tx);
1115                         ASSERT0(error);
1116                 }
1117                 /*
1118                  * Add to the unlinked set because a new reference could be
1119                  * taken concurrently resulting in a deferred destruction.
1120                  */
1121                 zfs_unlinked_add(zp, tx);
1122                 mutex_exit(&zp->z_lock);
1123         } else if (unlinked) {
1124                 mutex_exit(&zp->z_lock);
1125                 zfs_unlinked_add(zp, tx);
1126         }
1127
1128         txtype = TX_REMOVE;
1129         if (flags & FIGNORECASE)
1130                 txtype |= TX_CI;
1131         zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked);
1132
1133         dmu_tx_commit(tx);
             /*
              * Common exit.  Also reached on error, in which case delete_now
              * is still FALSE and zp is released asynchronously.
              */
1134 out:
1135         if (realnmp)
1136                 pn_free(realnmp);
1137
1138         zfs_dirent_unlock(dl);
1139         zfs_znode_update_vfs(dzp);
1140         zfs_znode_update_vfs(zp);
1141
1142         if (delete_now)
1143                 zrele(zp);
1144         else
1145                 zfs_zrele_async(zp);
1146
1147         if (xzp) {
1148                 zfs_znode_update_vfs(xzp);
1149                 zfs_zrele_async(xzp);
1150         }
1151
1152         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1153                 zil_commit(zilog, 0);
1154
1155         zfs_exit(zfsvfs, FTAG);
1156         return (error);
1157 }
1158
1159 /*
1160  * Create a new directory and insert it into dzp using the name
1161  * provided.  Return a pointer to the inserted directory.
1162  *
1163  *      IN:     dzp     - znode of directory to add subdir to.
1164  *              dirname - name of new directory.
1165  *              vap     - attributes of new directory.
1166  *              cr      - credentials of caller.
1167  *              flags   - case flags.
1168  *              vsecp   - ACL to be set
1169  *              mnt_ns  - user namespace of the mount
1170  *
1171  *      OUT:    zpp     - znode of created directory.
1172  *
1173  *      RETURN: 0 if success
1174  *              error code if failure
1175  *
      * Errors produced directly by this function:
      *      EINVAL - dirname is NULL, dzp has ZFS_XATTR set, or an ACL or
      *               ephemeral uid/gid is given while z_use_fuids is false.
      *      EILSEQ - z_utf8 is set and dirname fails u8_validate().
      *      EDQUOT - zfs_acl_ids_overquota() reports the ids over quota.
      *
1176  * Timestamps:
1177  *      dzp - ctime|mtime updated
1178  *      zpp - ctime|mtime|atime updated
1179  */
1180 int
1181 zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp,
1182     cred_t *cr, int flags, vsecattr_t *vsecp, zidmap_t *mnt_ns)
1183 {
1184         znode_t         *zp;
1185         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
1186         zilog_t         *zilog;
1187         zfs_dirlock_t   *dl;
1188         uint64_t        txtype;
1189         dmu_tx_t        *tx;
1190         int             error;
1191         int             zf = ZNEW;
1192         uid_t           uid;
1193         gid_t           gid = crgetgid(cr);
1194         zfs_acl_ids_t   acl_ids;
1195         boolean_t       fuid_dirtied;
1196         boolean_t       waited = B_FALSE;
1197
1198         ASSERT(S_ISDIR(vap->va_mode));
1199
1200         /*
1201          * If we have an ephemeral id, ACL, or XVATTR then
1202          * make sure file system is at proper version
1203          */
1204
1205         uid = crgetuid(cr);
1206         if (zfsvfs->z_use_fuids == B_FALSE &&
1207             (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1208                 return (SET_ERROR(EINVAL));
1209
1210         if (dirname == NULL)
1211                 return (SET_ERROR(EINVAL));
1212
1213         if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
1214                 return (error);
1215         zilog = zfsvfs->z_log;
1216
             /* A parent flagged ZFS_XATTR is refused. */
1217         if (dzp->z_pflags & ZFS_XATTR) {
1218                 zfs_exit(zfsvfs, FTAG);
1219                 return (SET_ERROR(EINVAL));
1220         }
1221
1222         if (zfsvfs->z_utf8 && u8_validate(dirname,
1223             strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1224                 zfs_exit(zfsvfs, FTAG);
1225                 return (SET_ERROR(EILSEQ));
1226         }
1227         if (flags & FIGNORECASE)
1228                 zf |= ZCILOOK;
1229
1230         if (vap->va_mask & ATTR_XVATTR) {
1231                 if ((error = secpolicy_xvattr((xvattr_t *)vap,
1232                     crgetuid(cr), cr, vap->va_mode)) != 0) {
1233                         zfs_exit(zfsvfs, FTAG);
1234                         return (error);
1235                 }
1236         }
1237
             /*
              * The ACL ids are created once, before the retry label, and
              * must be freed on every exit path from here on.
              */
1238         if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
1239             vsecp, &acl_ids, mnt_ns)) != 0) {
1240                 zfs_exit(zfsvfs, FTAG);
1241                 return (error);
1242         }
1243         /*
1244          * First make sure the new directory doesn't exist.
1245          *
1246          * Existence is checked first to make sure we don't return
1247          * EACCES instead of EEXIST which can cause some applications
1248          * to fail.
1249          */
1250 top:
1251         *zpp = NULL;
1252
1253         if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
1254             NULL, NULL))) {
1255                 zfs_acl_ids_free(&acl_ids);
1256                 zfs_exit(zfsvfs, FTAG);
1257                 return (error);
1258         }
1259
1260         if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr,
1261             mnt_ns))) {
1262                 zfs_acl_ids_free(&acl_ids);
1263                 zfs_dirent_unlock(dl);
1264                 zfs_exit(zfsvfs, FTAG);
1265                 return (error);
1266         }
1267
1268         if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) {
1269                 zfs_acl_ids_free(&acl_ids);
1270                 zfs_dirent_unlock(dl);
1271                 zfs_exit(zfsvfs, FTAG);
1272                 return (SET_ERROR(EDQUOT));
1273         }
1274
1275         /*
1276          * Add a new entry to the directory.
1277          */
1278         tx = dmu_tx_create(zfsvfs->z_os);
1279         dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
1280         dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
1281         fuid_dirtied = zfsvfs->z_fuid_dirty;
1282         if (fuid_dirtied)
1283                 zfs_fuid_txhold(zfsvfs, tx);
1284         if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1285                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
1286                     acl_ids.z_aclp->z_acl_bytes);
1287         }
1288
1289         dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1290             ZFS_SA_BASE_ATTR_SIZE);
1291
1292         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
1293         if (error) {
1294                 zfs_dirent_unlock(dl);
1295                 if (error == ERESTART) {
                             /*
                              * Retry from top with the dirent lock already
                              * dropped; acl_ids is kept for the retry,
                              * which adds TXG_NOTHROTTLE.
                              */
1296                         waited = B_TRUE;
1297                         dmu_tx_wait(tx);
1298                         dmu_tx_abort(tx);
1299                         goto top;
1300                 }
1301                 zfs_acl_ids_free(&acl_ids);
1302                 dmu_tx_abort(tx);
1303                 zfs_exit(zfsvfs, FTAG);
1304                 return (error);
1305         }
1306
1307         /*
1308          * Create new node.
1309          */
1310         zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1311
1312         /*
1313          * Now put new name in parent dir.
1314          */
1315         error = zfs_link_create(dl, zp, tx, ZNEW);
1316         if (error != 0) {
                     /*
                      * Undo the node creation.  The tx is still committed
                      * and zp released at out; *zpp stays NULL.
                      */
1317                 zfs_znode_delete(zp, tx);
1318                 remove_inode_hash(ZTOI(zp));
1319                 goto out;
1320         }
1321
1322         if (fuid_dirtied)
1323                 zfs_fuid_sync(zfsvfs, tx);
1324
1325         *zpp = zp;
1326
1327         txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
1328         if (flags & FIGNORECASE)
1329                 txtype |= TX_CI;
1330         zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
1331             acl_ids.z_fuidp, vap);
1332
             /* Reached on success and on zfs_link_create() failure. */
1333 out:
1334         zfs_acl_ids_free(&acl_ids);
1335
1336         dmu_tx_commit(tx);
1337
1338         zfs_dirent_unlock(dl);
1339
1340         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1341                 zil_commit(zilog, 0);
1342
1343         if (error != 0) {
1344                 zrele(zp);
1345         } else {
1346                 zfs_znode_update_vfs(dzp);
1347                 zfs_znode_update_vfs(zp);
1348         }
1349         zfs_exit(zfsvfs, FTAG);
1350         return (error);
1351 }
1352
1353 /*
1354  * Remove a directory subdir entry.  If the current working
1355  * directory is the same as the subdir to be removed, the
1356  * remove will fail.
1357  *
1358  *      IN:     dzp     - znode of directory to remove from.
1359  *              name    - name of directory to be removed.
1360  *              cwd     - znode of current working directory.
1361  *              cr      - credentials of caller.
1362  *              flags   - case flags
1363  *
1364  *      RETURN: 0 on success, error code on failure.
1365  *
      * Errors produced directly by this function:
      *      EINVAL  - name is NULL, or the entry is cwd.
      *      ENOTDIR - the entry is not a directory.
      *
1366  * Timestamps:
1367  *      dzp - ctime|mtime updated
1368  */
1369 int
1370 zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr,
1371     int flags)
1372 {
1373         znode_t         *zp;
1374         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
1375         zilog_t         *zilog;
1376         zfs_dirlock_t   *dl;
1377         dmu_tx_t        *tx;
1378         int             error;
1379         int             zflg = ZEXISTS;
1380         boolean_t       waited = B_FALSE;
1381
1382         if (name == NULL)
1383                 return (SET_ERROR(EINVAL));
1384
1385         if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
1386                 return (error);
1387         zilog = zfsvfs->z_log;
1388
1389         if (flags & FIGNORECASE)
1390                 zflg |= ZCILOOK;
             /* Re-entered from the ERESTART branch below. */
1391 top:
1392         zp = NULL;
1393
1394         /*
1395          * Attempt to lock directory; fail if entry doesn't exist.
1396          */
1397         if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1398             NULL, NULL))) {
1399                 zfs_exit(zfsvfs, FTAG);
1400                 return (error);
1401         }
1402
             /*
              * The three checks below jump to out before z_name_lock and
              * z_parent_lock are taken; out does not release them.
              */
1403         if ((error = zfs_zaccess_delete(dzp, zp, cr, zfs_init_idmap))) {
1404                 goto out;
1405         }
1406
1407         if (!S_ISDIR(ZTOI(zp)->i_mode)) {
1408                 error = SET_ERROR(ENOTDIR);
1409                 goto out;
1410         }
1411
1412         if (zp == cwd) {
1413                 error = SET_ERROR(EINVAL);
1414                 goto out;
1415         }
1416
1417         /*
1418          * Grab a lock on the directory to make sure that no one is
1419          * trying to add (or lookup) entries while we are removing it.
1420          */
1421         rw_enter(&zp->z_name_lock, RW_WRITER);
1422
1423         /*
1424          * Grab a lock on the parent pointer to make sure we play well
1425          * with the treewalk and directory rename code.
1426          */
1427         rw_enter(&zp->z_parent_lock, RW_WRITER);
1428
1429         tx = dmu_tx_create(zfsvfs->z_os);
1430         dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1431         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1432         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1433         zfs_sa_upgrade_txholds(tx, zp);
1434         zfs_sa_upgrade_txholds(tx, dzp);
1435         dmu_tx_mark_netfree(tx);
1436         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
1437         if (error) {
                     /* Locks are dropped in reverse order of acquisition. */
1438                 rw_exit(&zp->z_parent_lock);
1439                 rw_exit(&zp->z_name_lock);
1440                 zfs_dirent_unlock(dl);
1441                 if (error == ERESTART) {
                             /*
                              * Retry from top with every lock dropped and
                              * zp released, so the lookup is redone.  The
                              * retry adds TXG_NOTHROTTLE.
                              */
1442                         waited = B_TRUE;
1443                         dmu_tx_wait(tx);
1444                         dmu_tx_abort(tx);
1445                         zrele(zp);
1446                         goto top;
1447                 }
1448                 dmu_tx_abort(tx);
1449                 zrele(zp);
1450                 zfs_exit(zfsvfs, FTAG);
1451                 return (error);
1452         }
1453
1454         error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
1455
             /* Only a successful removal is logged. */
1456         if (error == 0) {
1457                 uint64_t txtype = TX_RMDIR;
1458                 if (flags & FIGNORECASE)
1459                         txtype |= TX_CI;
1460                 zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT,
1461                     B_FALSE);
1462         }
1463
             /* The tx is committed whether or not the link was destroyed. */
1464         dmu_tx_commit(tx);
1465
1466         rw_exit(&zp->z_parent_lock);
1467         rw_exit(&zp->z_name_lock);
1468 out:
1469         zfs_dirent_unlock(dl);
1470
1471         zfs_znode_update_vfs(dzp);
1472         zfs_znode_update_vfs(zp);
1473         zrele(zp);
1474
1475         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1476                 zil_commit(zilog, 0);
1477
1478         zfs_exit(zfsvfs, FTAG);
1479         return (error);
1480 }
1481
1482 /*
1483  * Read directory entries from the given directory cursor position and emit
1484  * name and position for each entry.
1485  *
1486  *      IN:     ip      - inode of directory to read.
1487  *              ctx     - directory entry context.
1488  *              cr      - credentials of caller.
1489  *
1490  *      RETURN: 0 if success
1491  *              error code if failure
1492  *
1493  * Timestamps:
1494  *      ip - atime updated
1495  *
1496  * Note that the low 4 bits of the cookie returned by zap is always zero.
1497  * This allows us to use the low range for "special" directory entries:
1498  * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
1499  * we use the offset 2 for the '.zfs' directory.
      *
      * cr is unused.  ctx->pos is read as the starting offset and is advanced
      * only after an entry has been emitted successfully.  ENXIO is returned
      * for a directory entry with an unexpected integer size or count.
1500  */
1501 int
1502 zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr)
1503 {
1504         (void) cr;
1505         znode_t         *zp = ITOZ(ip);
1506         zfsvfs_t        *zfsvfs = ITOZSB(ip);
1507         objset_t        *os;
1508         zap_cursor_t    zc;
1509         zap_attribute_t zap;
1510         int             error;
1511         uint8_t         prefetch;
1512         uint8_t         type;
1513         int             done = 0;
1514         uint64_t        parent;
1515         uint64_t        offset; /* must be unsigned; checks for < 1 */
1516
1517         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
1518                 return (error);
1519
             /* parent supplies the object number reported for "..". */
1520         if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
1521             &parent, sizeof (parent))) != 0)
1522                 goto out;
1523
1524         /*
1525          * Quit if directory has been removed (posix)
              *
              * error is 0 at this point, so the caller sees success with
              * no entries emitted.
1526          */
1527         if (zp->z_unlinked)
1528                 goto out;
1529
1530         error = 0;
1531         os = zfsvfs->z_os;
1532         offset = ctx->pos;
1533         prefetch = zp->z_zn_prefetch;
1534
1535         /*
1536          * Initialize the iterator cursor.
              *
              * Offsets 0-2 are the special entries described above and 3 is
              * the value reached right after them, so for all of these the
              * zap is walked from its first entry.
1537          */
1538         if (offset <= 3) {
1539                 /*
1540                  * Start iteration from the beginning of the directory.
1541                  */
1542                 zap_cursor_init(&zc, os, zp->z_id);
1543         } else {
1544                 /*
1545                  * The offset is a serialized cursor.
1546                  */
1547                 zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
1548         }
1549
1550         /*
1551          * Transform to file-system independent format
1552          */
1553         while (!done) {
1554                 uint64_t objnum;
1555                 /*
1556                  * Special case `.', `..', and `.zfs'.
1557                  */
1558                 if (offset == 0) {
1559                         (void) strcpy(zap.za_name, ".");
1560                         zap.za_normalization_conflict = 0;
1561                         objnum = zp->z_id;
1562                         type = DT_DIR;
1563                 } else if (offset == 1) {
1564                         (void) strcpy(zap.za_name, "..");
1565                         zap.za_normalization_conflict = 0;
1566                         objnum = parent;
1567                         type = DT_DIR;
1568                 } else if (offset == 2 && zfs_show_ctldir(zp)) {
1569                         (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
1570                         zap.za_normalization_conflict = 0;
1571                         objnum = ZFSCTL_INO_ROOT;
1572                         type = DT_DIR;
1573                 } else {
1574                         /*
1575                          * Grab next entry.
                              *
                              * ENOENT ends the loop normally and is turned
                              * into success at update; any other error is
                              * returned to the caller.
1576                          */
1577                         if ((error = zap_cursor_retrieve(&zc, &zap))) {
1578                                 if (error == ENOENT)
1579                                         break;
1580                                 else
1581                                         goto update;
1582                         }
1583
1584                         /*
1585                          * Allow multiple entries provided the first entry is
1586                          * the object id.  Non-zpl consumers may safely make
1587                          * use of the additional space.
1588                          *
1589                          * XXX: This should be a feature flag for compatibility
1590                          */
1591                         if (zap.za_integer_length != 8 ||
1592                             zap.za_num_integers == 0) {
1593                                 cmn_err(CE_WARN, "zap_readdir: bad directory "
1594                                     "entry, obj = %lld, offset = %lld, "
1595                                     "length = %d, num = %lld\n",
1596                                     (u_longlong_t)zp->z_id,
1597                                     (u_longlong_t)offset,
1598                                     zap.za_integer_length,
1599                                     (u_longlong_t)zap.za_num_integers);
1600                                 error = SET_ERROR(ENXIO);
1601                                 goto update;
1602                         }
1603
1604                         objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
1605                         type = ZFS_DIRENT_TYPE(zap.za_first_integer);
1606                 }
1607
                     /*
                      * Stop when zpl_dir_emit() returns zero; the entry is
                      * then not counted and ctx->pos keeps its old value.
                      */
1608                 done = !zpl_dir_emit(ctx, zap.za_name, strlen(zap.za_name),
1609                     objnum, type);
1610                 if (done)
1611                         break;
1612
1613                 /* Prefetch znode */
1614                 if (prefetch) {
1615                         dmu_prefetch(os, objnum, 0, 0, 0,
1616                             ZIO_PRIORITY_SYNC_READ);
1617                 }
1618
1619                 /*
1620                  * Move to the next entry, fill in the previous offset.
                      *
                      * The zap cursor is advanced only for entries that came
                      * from it; the special entries just increment offset.
1621                  */
1622                 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
1623                         zap_cursor_advance(&zc);
1624                         offset = zap_cursor_serialize(&zc);
1625                 } else {
1626                         offset += 1;
1627                 }
1628                 ctx->pos = offset;
1629         }
1630         zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
1631
             /* Every path that reaches update has initialized zc. */
1632 update:
1633         zap_cursor_fini(&zc);
1634         if (error == ENOENT)
1635                 error = 0;
             /* Jumps to out happen before zc is set up, so no fini there. */
1636 out:
1637         zfs_exit(zfsvfs, FTAG);
1638
1639         return (error);
1640 }
1641
1642 /*
1643  * Get the basic file attributes and place them in the provided kstat
1644  * structure.  The inode is assumed to be the authoritative source
1645  * for most of the attributes.  However, the znode currently has the
1646  * authoritative atime, blksize, and block count.
1647  *
1648  *      IN:     ip      - inode of file.
      *              user_ns - passed through to zpl_generic_fillattr().
1649  *
1650  *      OUT:    sp      - kstat values.
1651  *
1652  *      RETURN: 0 on success, or the error from zfs_enter_verify_zp().
1653  */
1654 int
1655 zfs_getattr_fast(zidmap_t *user_ns, struct inode *ip, struct kstat *sp)
1656 {
1657         znode_t *zp = ITOZ(ip);
1658         zfsvfs_t *zfsvfs = ITOZSB(ip);
1659         uint32_t blksize;
1660         u_longlong_t nblocks;
1661         int error;
1662
1663         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
1664                 return (error);
1665
             /* z_lock is held while sp is filled in from the inode and znode. */
1666         mutex_enter(&zp->z_lock);
1667
1668         zpl_generic_fillattr(user_ns, ip, sp);
1669         /*
1670          * +1 link count for root inode with visible '.zfs' directory.
              * The count is left alone once it has reached ZFS_LINK_MAX.
1671          */
1672         if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp))
1673                 if (sp->nlink < ZFS_LINK_MAX)
1674                         sp->nlink++;
1675
             /* Block size and block count are taken from the SA handle. */
1676         sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
1677         sp->blksize = blksize;
1678         sp->blocks = nblocks;
1679
1680         if (unlikely(zp->z_blksz == 0)) {
1681                 /*
1682                  * Block size hasn't been set; suggest maximal I/O transfers.
1683                  */
1684                 sp->blksize = zfsvfs->z_max_blksz;
1685         }
1686
1687         mutex_exit(&zp->z_lock);
1688
1689         /*
1690          * Required to prevent NFS client from detecting different inode
1691          * numbers of snapshot root dentry before and after snapshot mount.
              * Applies only to the root inode of a snapshot filesystem.
1692          */
1693         if (zfsvfs->z_issnap) {
1694                 if (ip->i_sb->s_root->d_inode == ip)
1695                         sp->ino = ZFSCTL_INO_SNAPDIRS -
1696                             dmu_objset_id(zfsvfs->z_os);
1697         }
1698
1699         zfs_exit(zfsvfs, FTAG);
1700
1701         return (0);
1702 }
1703
1704 /*
1705  * For the operation of changing file's user/group/project, we need to
1706  * handle not only the main object that is assigned to the file directly,
1707  * but also the ones that are used by the file via hidden xattr directory.
1708  *
1709  * Because the xattr directory may contains many EA entries, as to it may
1710  * be impossible to change all of them via the transaction of changing the
1711  * main object's user/group/project attributes. Then we have to change them
1712  * via other multiple independent transactions one by one. It may be not good
1713  * solution, but we have no better idea yet.
1714  */
1715 static int
1716 zfs_setattr_dir(znode_t *dzp)
1717 {
1718         struct inode    *dxip = ZTOI(dzp);
1719         struct inode    *xip = NULL;
1720         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
1721         objset_t        *os = zfsvfs->z_os;
1722         zap_cursor_t    zc;
1723         zap_attribute_t zap;
1724         zfs_dirlock_t   *dl;
1725         znode_t         *zp = NULL;
1726         dmu_tx_t        *tx = NULL;
1727         uint64_t        uid, gid;
1728         sa_bulk_attr_t  bulk[4];
1729         int             count;
1730         int             err;
1731
1732         zap_cursor_init(&zc, os, dzp->z_id);
1733         while ((err = zap_cursor_retrieve(&zc, &zap)) == 0) {
1734                 count = 0;
1735                 if (zap.za_integer_length != 8 || zap.za_num_integers != 1) {
1736                         err = ENXIO;
1737                         break;
1738                 }
1739
1740                 err = zfs_dirent_lock(&dl, dzp, (char *)zap.za_name, &zp,
1741                     ZEXISTS, NULL, NULL);
1742                 if (err == ENOENT)
1743                         goto next;
1744                 if (err)
1745                         break;
1746
1747                 xip = ZTOI(zp);
1748                 if (KUID_TO_SUID(xip->i_uid) == KUID_TO_SUID(dxip->i_uid) &&
1749                     KGID_TO_SGID(xip->i_gid) == KGID_TO_SGID(dxip->i_gid) &&
1750                     zp->z_projid == dzp->z_projid)
1751                         goto next;
1752
1753                 tx = dmu_tx_create(os);
1754                 if (!(zp->z_pflags & ZFS_PROJID))
1755                         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1756                 else
1757                         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1758
1759                 err = dmu_tx_assign(tx, TXG_WAIT);
1760                 if (err)
1761                         break;
1762
1763                 mutex_enter(&dzp->z_lock);
1764
1765                 if (KUID_TO_SUID(xip->i_uid) != KUID_TO_SUID(dxip->i_uid)) {
1766                         xip->i_uid = dxip->i_uid;
1767                         uid = zfs_uid_read(dxip);
1768                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
1769                             &uid, sizeof (uid));
1770                 }
1771
1772                 if (KGID_TO_SGID(xip->i_gid) != KGID_TO_SGID(dxip->i_gid)) {
1773                         xip->i_gid = dxip->i_gid;
1774                         gid = zfs_gid_read(dxip);
1775                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
1776                             &gid, sizeof (gid));
1777                 }
1778
1779                 if (zp->z_projid != dzp->z_projid) {
1780                         if (!(zp->z_pflags & ZFS_PROJID)) {
1781                                 zp->z_pflags |= ZFS_PROJID;
1782                                 SA_ADD_BULK_ATTR(bulk, count,
1783                                     SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags,
1784                                     sizeof (zp->z_pflags));
1785                         }
1786
1787                         zp->z_projid = dzp->z_projid;
1788                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs),
1789                             NULL, &zp->z_projid, sizeof (zp->z_projid));
1790                 }
1791
1792                 mutex_exit(&dzp->z_lock);
1793
1794                 if (likely(count > 0)) {
1795                         err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1796                         dmu_tx_commit(tx);
1797                 } else {
1798                         dmu_tx_abort(tx);
1799                 }
1800                 tx = NULL;
1801                 if (err != 0 && err != ENOENT)
1802                         break;
1803
1804 next:
1805                 if (zp) {
1806                         zrele(zp);
1807                         zp = NULL;
1808                         zfs_dirent_unlock(dl);
1809                 }
1810                 zap_cursor_advance(&zc);
1811         }
1812
1813         if (tx)
1814                 dmu_tx_abort(tx);
1815         if (zp) {
1816                 zrele(zp);
1817                 zfs_dirent_unlock(dl);
1818         }
1819         zap_cursor_fini(&zc);
1820
1821         return (err == ENOENT ? 0 : err);
1822 }
1823
1824 /*
1825  * Set the file attributes to the values contained in the
1826  * vattr structure.
1827  *
1828  *      IN:     zp      - znode of file to be modified.
1829  *              vap     - new attribute values.
1830  *                        If ATTR_XVATTR set, then optional attrs are being set
1831  *              flags   - ATTR_UTIME set if non-default time values provided.
1832  *                      - ATTR_NOACLCHECK (CIFS context only).
1833  *              cr      - credentials of caller.
1834  *              mnt_ns  - user namespace of the mount
1835  *
1836  *      RETURN: 0 if success
1837  *              error code if failure
1838  *
1839  * Timestamps:
1840  *      ip - ctime updated, mtime updated if size changed.
1841  */
1842 int
1843 zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns)
1844 {
1845         struct inode    *ip;
1846         zfsvfs_t        *zfsvfs = ZTOZSB(zp);
1847         objset_t        *os = zfsvfs->z_os;
1848         zilog_t         *zilog;
1849         dmu_tx_t        *tx;
1850         vattr_t         oldva;
1851         xvattr_t        *tmpxvattr;
1852         uint_t          mask = vap->va_mask;
1853         uint_t          saved_mask = 0;
1854         int             trim_mask = 0;
1855         uint64_t        new_mode;
1856         uint64_t        new_kuid = 0, new_kgid = 0, new_uid, new_gid;
1857         uint64_t        xattr_obj;
1858         uint64_t        mtime[2], ctime[2], atime[2];
1859         uint64_t        projid = ZFS_INVALID_PROJID;
1860         znode_t         *attrzp;
1861         int             need_policy = FALSE;
1862         int             err, err2 = 0;
1863         zfs_fuid_info_t *fuidp = NULL;
1864         xvattr_t *xvap = (xvattr_t *)vap;       /* vap may be an xvattr_t * */
1865         xoptattr_t      *xoap;
1866         zfs_acl_t       *aclp;
1867         boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
1868         boolean_t       fuid_dirtied = B_FALSE;
1869         boolean_t       handle_eadir = B_FALSE;
1870         sa_bulk_attr_t  *bulk, *xattr_bulk;
1871         int             count = 0, xattr_count = 0, bulks = 8;
1872
1873         if (mask == 0)
1874                 return (0);
1875
1876         if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
1877                 return (err);
1878         ip = ZTOI(zp);
1879
1880         /*
1881          * If this is a xvattr_t, then get a pointer to the structure of
1882          * optional attributes.  If this is NULL, then we have a vattr_t.
1883          */
1884         xoap = xva_getxoptattr(xvap);
1885         if (xoap != NULL && (mask & ATTR_XVATTR)) {
1886                 if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
1887                         if (!dmu_objset_projectquota_enabled(os) ||
1888                             (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) {
1889                                 zfs_exit(zfsvfs, FTAG);
1890                                 return (SET_ERROR(ENOTSUP));
1891                         }
1892
1893                         projid = xoap->xoa_projid;
1894                         if (unlikely(projid == ZFS_INVALID_PROJID)) {
1895                                 zfs_exit(zfsvfs, FTAG);
1896                                 return (SET_ERROR(EINVAL));
1897                         }
1898
1899                         if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID)
1900                                 projid = ZFS_INVALID_PROJID;
1901                         else
1902                                 need_policy = TRUE;
1903                 }
1904
1905                 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) &&
1906                     (xoap->xoa_projinherit !=
1907                     ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) &&
1908                     (!dmu_objset_projectquota_enabled(os) ||
1909                     (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) {
1910                         zfs_exit(zfsvfs, FTAG);
1911                         return (SET_ERROR(ENOTSUP));
1912                 }
1913         }
1914
1915         zilog = zfsvfs->z_log;
1916
1917         /*
1918          * Make sure that if we have ephemeral uid/gid or xvattr specified
1919          * that file system is at proper version level
1920          */
1921
1922         if (zfsvfs->z_use_fuids == B_FALSE &&
1923             (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) ||
1924             ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) ||
1925             (mask & ATTR_XVATTR))) {
1926                 zfs_exit(zfsvfs, FTAG);
1927                 return (SET_ERROR(EINVAL));
1928         }
1929
1930         if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) {
1931                 zfs_exit(zfsvfs, FTAG);
1932                 return (SET_ERROR(EISDIR));
1933         }
1934
1935         if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) {
1936                 zfs_exit(zfsvfs, FTAG);
1937                 return (SET_ERROR(EINVAL));
1938         }
1939
1940         tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP);
1941         xva_init(tmpxvattr);
1942
1943         bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
1944         xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
1945
1946         /*
1947          * Immutable files can only alter immutable bit and atime
1948          */
1949         if ((zp->z_pflags & ZFS_IMMUTABLE) &&
1950             ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) ||
1951             ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
1952                 err = SET_ERROR(EPERM);
1953                 goto out3;
1954         }
1955
1956         if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
1957                 err = SET_ERROR(EPERM);
1958                 goto out3;
1959         }
1960
1961         /*
1962          * Verify timestamps doesn't overflow 32 bits.
1963          * ZFS can handle large timestamps, but 32bit syscalls can't
1964          * handle times greater than 2039.  This check should be removed
1965          * once large timestamps are fully supported.
1966          */
1967         if (mask & (ATTR_ATIME | ATTR_MTIME)) {
1968                 if (((mask & ATTR_ATIME) &&
1969                     TIMESPEC_OVERFLOW(&vap->va_atime)) ||
1970                     ((mask & ATTR_MTIME) &&
1971                     TIMESPEC_OVERFLOW(&vap->va_mtime))) {
1972                         err = SET_ERROR(EOVERFLOW);
1973                         goto out3;
1974                 }
1975         }
1976
1977 top:
1978         attrzp = NULL;
1979         aclp = NULL;
1980
1981         /* Can this be moved to before the top label? */
1982         if (zfs_is_readonly(zfsvfs)) {
1983                 err = SET_ERROR(EROFS);
1984                 goto out3;
1985         }
1986
1987         /*
1988          * First validate permissions
1989          */
1990
1991         if (mask & ATTR_SIZE) {
1992                 err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr,
1993                     mnt_ns);
1994                 if (err)
1995                         goto out3;
1996
1997                 /*
1998                  * XXX - Note, we are not providing any open
1999                  * mode flags here (like FNDELAY), so we may
2000                  * block if there are locks present... this
2001                  * should be addressed in openat().
2002                  */
2003                 /* XXX - would it be OK to generate a log record here? */
2004                 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2005                 if (err)
2006                         goto out3;
2007         }
2008
2009         if (mask & (ATTR_ATIME|ATTR_MTIME) ||
2010             ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
2011             XVA_ISSET_REQ(xvap, XAT_READONLY) ||
2012             XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
2013             XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
2014             XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
2015             XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
2016             XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
2017                 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
2018                     skipaclchk, cr, mnt_ns);
2019         }
2020
2021         if (mask & (ATTR_UID|ATTR_GID)) {
2022                 int     idmask = (mask & (ATTR_UID|ATTR_GID));
2023                 int     take_owner;
2024                 int     take_group;
2025                 uid_t   uid;
2026                 gid_t   gid;
2027
2028                 /*
2029                  * NOTE: even if a new mode is being set,
2030                  * we may clear S_ISUID/S_ISGID bits.
2031                  */
2032
2033                 if (!(mask & ATTR_MODE))
2034                         vap->va_mode = zp->z_mode;
2035
2036                 /*
2037                  * Take ownership or chgrp to group we are a member of
2038                  */
2039
2040                 uid = zfs_uid_to_vfsuid(mnt_ns, zfs_i_user_ns(ip),
2041                     vap->va_uid);
2042                 gid = zfs_gid_to_vfsgid(mnt_ns, zfs_i_user_ns(ip),
2043                     vap->va_gid);
2044                 take_owner = (mask & ATTR_UID) && (uid == crgetuid(cr));
2045                 take_group = (mask & ATTR_GID) &&
2046                     zfs_groupmember(zfsvfs, gid, cr);
2047
2048                 /*
2049                  * If both ATTR_UID and ATTR_GID are set then take_owner and
2050                  * take_group must both be set in order to allow taking
2051                  * ownership.
2052                  *
2053                  * Otherwise, send the check through secpolicy_vnode_setattr()
2054                  *
2055                  */
2056
2057                 if (((idmask == (ATTR_UID|ATTR_GID)) &&
2058                     take_owner && take_group) ||
2059                     ((idmask == ATTR_UID) && take_owner) ||
2060                     ((idmask == ATTR_GID) && take_group)) {
2061                         if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
2062                             skipaclchk, cr, mnt_ns) == 0) {
2063                                 /*
2064                                  * Remove setuid/setgid for non-privileged users
2065                                  */
2066                                 (void) secpolicy_setid_clear(vap, cr);
2067                                 trim_mask = (mask & (ATTR_UID|ATTR_GID));
2068                         } else {
2069                                 need_policy =  TRUE;
2070                         }
2071                 } else {
2072                         need_policy =  TRUE;
2073                 }
2074         }
2075
2076         mutex_enter(&zp->z_lock);
2077         oldva.va_mode = zp->z_mode;
2078         zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
2079         if (mask & ATTR_XVATTR) {
2080                 /*
2081                  * Update xvattr mask to include only those attributes
2082                  * that are actually changing.
2083                  *
2084                  * the bits will be restored prior to actually setting
2085                  * the attributes so the caller thinks they were set.
2086                  */
2087                 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2088                         if (xoap->xoa_appendonly !=
2089                             ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
2090                                 need_policy = TRUE;
2091                         } else {
2092                                 XVA_CLR_REQ(xvap, XAT_APPENDONLY);
2093                                 XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY);
2094                         }
2095                 }
2096
2097                 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
2098                         if (xoap->xoa_projinherit !=
2099                             ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) {
2100                                 need_policy = TRUE;
2101                         } else {
2102                                 XVA_CLR_REQ(xvap, XAT_PROJINHERIT);
2103                                 XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT);
2104                         }
2105                 }
2106
2107                 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2108                         if (xoap->xoa_nounlink !=
2109                             ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
2110                                 need_policy = TRUE;
2111                         } else {
2112                                 XVA_CLR_REQ(xvap, XAT_NOUNLINK);
2113                                 XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK);
2114                         }
2115                 }
2116
2117                 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2118                         if (xoap->xoa_immutable !=
2119                             ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
2120                                 need_policy = TRUE;
2121                         } else {
2122                                 XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
2123                                 XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE);
2124                         }
2125                 }
2126
2127                 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2128                         if (xoap->xoa_nodump !=
2129                             ((zp->z_pflags & ZFS_NODUMP) != 0)) {
2130                                 need_policy = TRUE;
2131                         } else {
2132                                 XVA_CLR_REQ(xvap, XAT_NODUMP);
2133                                 XVA_SET_REQ(tmpxvattr, XAT_NODUMP);
2134                         }
2135                 }
2136
2137                 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2138                         if (xoap->xoa_av_modified !=
2139                             ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
2140                                 need_policy = TRUE;
2141                         } else {
2142                                 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
2143                                 XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED);
2144                         }
2145                 }
2146
2147                 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2148                         if ((!S_ISREG(ip->i_mode) &&
2149                             xoap->xoa_av_quarantined) ||
2150                             xoap->xoa_av_quarantined !=
2151                             ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
2152                                 need_policy = TRUE;
2153                         } else {
2154                                 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
2155                                 XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED);
2156                         }
2157                 }
2158
2159                 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2160                         mutex_exit(&zp->z_lock);
2161                         err = SET_ERROR(EPERM);
2162                         goto out3;
2163                 }
2164
2165                 if (need_policy == FALSE &&
2166                     (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
2167                     XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
2168                         need_policy = TRUE;
2169                 }
2170         }
2171
2172         mutex_exit(&zp->z_lock);
2173
2174         if (mask & ATTR_MODE) {
2175                 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr,
2176                     mnt_ns) == 0) {
2177                         err = secpolicy_setid_setsticky_clear(ip, vap,
2178                             &oldva, cr, mnt_ns, zfs_i_user_ns(ip));
2179                         if (err)
2180                                 goto out3;
2181                         trim_mask |= ATTR_MODE;
2182                 } else {
2183                         need_policy = TRUE;
2184                 }
2185         }
2186
2187         if (need_policy) {
2188                 /*
2189                  * If trim_mask is set then take ownership
2190                  * has been granted or write_acl is present and user
2191                  * has the ability to modify mode.  In that case remove
2192                  * UID|GID and or MODE from mask so that
2193                  * secpolicy_vnode_setattr() doesn't revoke it.
2194                  */
2195
2196                 if (trim_mask) {
2197                         saved_mask = vap->va_mask;
2198                         vap->va_mask &= ~trim_mask;
2199                 }
2200                 err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags,
2201                     zfs_zaccess_unix, zp);
2202                 if (err)
2203                         goto out3;
2204
2205                 if (trim_mask)
2206                         vap->va_mask |= saved_mask;
2207         }
2208
2209         /*
2210          * secpolicy_vnode_setattr, or take ownership may have
2211          * changed va_mask
2212          */
2213         mask = vap->va_mask;
2214
2215         if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) {
2216                 handle_eadir = B_TRUE;
2217                 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
2218                     &xattr_obj, sizeof (xattr_obj));
2219
2220                 if (err == 0 && xattr_obj) {
2221                         err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp);
2222                         if (err)
2223                                 goto out2;
2224                 }
2225                 if (mask & ATTR_UID) {
2226                         new_kuid = zfs_fuid_create(zfsvfs,
2227                             (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
2228                         if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) &&
2229                             zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT,
2230                             new_kuid)) {
2231                                 if (attrzp)
2232                                         zrele(attrzp);
2233                                 err = SET_ERROR(EDQUOT);
2234                                 goto out2;
2235                         }
2236                 }
2237
2238                 if (mask & ATTR_GID) {
2239                         new_kgid = zfs_fuid_create(zfsvfs,
2240                             (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp);
2241                         if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) &&
2242                             zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT,
2243                             new_kgid)) {
2244                                 if (attrzp)
2245                                         zrele(attrzp);
2246                                 err = SET_ERROR(EDQUOT);
2247                                 goto out2;
2248                         }
2249                 }
2250
2251                 if (projid != ZFS_INVALID_PROJID &&
2252                     zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) {
2253                         if (attrzp)
2254                                 zrele(attrzp);
2255                         err = EDQUOT;
2256                         goto out2;
2257                 }
2258         }
2259         tx = dmu_tx_create(os);
2260
2261         if (mask & ATTR_MODE) {
2262                 uint64_t pmode = zp->z_mode;
2263                 uint64_t acl_obj;
2264                 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
2265
2266                 if (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_RESTRICTED &&
2267                     !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
2268                         err = EPERM;
2269                         goto out;
2270                 }
2271
2272                 if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)))
2273                         goto out;
2274
2275                 mutex_enter(&zp->z_lock);
2276                 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
2277                         /*
2278                          * Are we upgrading ACL from old V0 format
2279                          * to V1 format?
2280                          */
2281                         if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
2282                             zfs_znode_acl_version(zp) ==
2283                             ZFS_ACL_VERSION_INITIAL) {
2284                                 dmu_tx_hold_free(tx, acl_obj, 0,
2285                                     DMU_OBJECT_END);
2286                                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2287                                     0, aclp->z_acl_bytes);
2288                         } else {
2289                                 dmu_tx_hold_write(tx, acl_obj, 0,
2290                                     aclp->z_acl_bytes);
2291                         }
2292                 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2293                         dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2294                             0, aclp->z_acl_bytes);
2295                 }
2296                 mutex_exit(&zp->z_lock);
2297                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
2298         } else {
2299                 if (((mask & ATTR_XVATTR) &&
2300                     XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ||
2301                     (projid != ZFS_INVALID_PROJID &&
2302                     !(zp->z_pflags & ZFS_PROJID)))
2303                         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
2304                 else
2305                         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2306         }
2307
2308         if (attrzp) {
2309                 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
2310         }
2311
2312         fuid_dirtied = zfsvfs->z_fuid_dirty;
2313         if (fuid_dirtied)
2314                 zfs_fuid_txhold(zfsvfs, tx);
2315
2316         zfs_sa_upgrade_txholds(tx, zp);
2317
2318         err = dmu_tx_assign(tx, TXG_WAIT);
2319         if (err)
2320                 goto out;
2321
2322         count = 0;
2323         /*
2324          * Set each attribute requested.
2325          * We group settings according to the locks they need to acquire.
2326          *
2327          * Note: you cannot set ctime directly, although it will be
2328          * updated as a side-effect of calling this function.
2329          */
2330
2331         if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) {
2332                 /*
2333                  * For the existed object that is upgraded from old system,
2334                  * its on-disk layout has no slot for the project ID attribute.
2335                  * But quota accounting logic needs to access related slots by
2336                  * offset directly. So we need to adjust old objects' layout
2337                  * to make the project ID to some unified and fixed offset.
2338                  */
2339                 if (attrzp)
2340                         err = sa_add_projid(attrzp->z_sa_hdl, tx, projid);
2341                 if (err == 0)
2342                         err = sa_add_projid(zp->z_sa_hdl, tx, projid);
2343
2344                 if (unlikely(err == EEXIST))
2345                         err = 0;
2346                 else if (err != 0)
2347                         goto out;
2348                 else
2349                         projid = ZFS_INVALID_PROJID;
2350         }
2351
2352         if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2353                 mutex_enter(&zp->z_acl_lock);
2354         mutex_enter(&zp->z_lock);
2355
2356         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
2357             &zp->z_pflags, sizeof (zp->z_pflags));
2358
2359         if (attrzp) {
2360                 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2361                         mutex_enter(&attrzp->z_acl_lock);
2362                 mutex_enter(&attrzp->z_lock);
2363                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2364                     SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
2365                     sizeof (attrzp->z_pflags));
2366                 if (projid != ZFS_INVALID_PROJID) {
2367                         attrzp->z_projid = projid;
2368                         SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2369                             SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid,
2370                             sizeof (attrzp->z_projid));
2371                 }
2372         }
2373
2374         if (mask & (ATTR_UID|ATTR_GID)) {
2375
2376                 if (mask & ATTR_UID) {
2377                         ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid);
2378                         new_uid = zfs_uid_read(ZTOI(zp));
2379                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
2380                             &new_uid, sizeof (new_uid));
2381                         if (attrzp) {
2382                                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2383                                     SA_ZPL_UID(zfsvfs), NULL, &new_uid,
2384                                     sizeof (new_uid));
2385                                 ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid);
2386                         }
2387                 }
2388
2389                 if (mask & ATTR_GID) {
2390                         ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid);
2391                         new_gid = zfs_gid_read(ZTOI(zp));
2392                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
2393                             NULL, &new_gid, sizeof (new_gid));
2394                         if (attrzp) {
2395                                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2396                                     SA_ZPL_GID(zfsvfs), NULL, &new_gid,
2397                                     sizeof (new_gid));
2398                                 ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid);
2399                         }
2400                 }
2401                 if (!(mask & ATTR_MODE)) {
2402                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
2403                             NULL, &new_mode, sizeof (new_mode));
2404                         new_mode = zp->z_mode;
2405                 }
2406                 err = zfs_acl_chown_setattr(zp);
2407                 ASSERT(err == 0);
2408                 if (attrzp) {
2409                         err = zfs_acl_chown_setattr(attrzp);
2410                         ASSERT(err == 0);
2411                 }
2412         }
2413
2414         if (mask & ATTR_MODE) {
2415                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
2416                     &new_mode, sizeof (new_mode));
2417                 zp->z_mode = ZTOI(zp)->i_mode = new_mode;
2418                 ASSERT3P(aclp, !=, NULL);
2419                 err = zfs_aclset_common(zp, aclp, cr, tx);
2420                 ASSERT0(err);
2421                 if (zp->z_acl_cached)
2422                         zfs_acl_free(zp->z_acl_cached);
2423                 zp->z_acl_cached = aclp;
2424                 aclp = NULL;
2425         }
2426
2427         if ((mask & ATTR_ATIME) || zp->z_atime_dirty) {
2428                 zp->z_atime_dirty = B_FALSE;
2429                 ZFS_TIME_ENCODE(&ip->i_atime, atime);
2430                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
2431                     &atime, sizeof (atime));
2432         }
2433
2434         if (mask & (ATTR_MTIME | ATTR_SIZE)) {
2435                 ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
2436                 ZTOI(zp)->i_mtime = zpl_inode_timestamp_truncate(
2437                     vap->va_mtime, ZTOI(zp));
2438
2439                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
2440                     mtime, sizeof (mtime));
2441         }
2442
2443         if (mask & (ATTR_CTIME | ATTR_SIZE)) {
2444                 ZFS_TIME_ENCODE(&vap->va_ctime, ctime);
2445                 ZTOI(zp)->i_ctime = zpl_inode_timestamp_truncate(vap->va_ctime,
2446                     ZTOI(zp));
2447                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
2448                     ctime, sizeof (ctime));
2449         }
2450
2451         if (projid != ZFS_INVALID_PROJID) {
2452                 zp->z_projid = projid;
2453                 SA_ADD_BULK_ATTR(bulk, count,
2454                     SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
2455                     sizeof (zp->z_projid));
2456         }
2457
2458         if (attrzp && mask) {
2459                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2460                     SA_ZPL_CTIME(zfsvfs), NULL, &ctime,
2461                     sizeof (ctime));
2462         }
2463
2464         /*
2465          * Do this after setting timestamps to prevent timestamp
2466          * update from toggling bit
2467          */
2468
2469         if (xoap && (mask & ATTR_XVATTR)) {
2470
2471                 /*
2472                  * restore trimmed off masks
2473                  * so that return masks can be set for caller.
2474                  */
2475
2476                 if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) {
2477                         XVA_SET_REQ(xvap, XAT_APPENDONLY);
2478                 }
2479                 if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) {
2480                         XVA_SET_REQ(xvap, XAT_NOUNLINK);
2481                 }
2482                 if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) {
2483                         XVA_SET_REQ(xvap, XAT_IMMUTABLE);
2484                 }
2485                 if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) {
2486                         XVA_SET_REQ(xvap, XAT_NODUMP);
2487                 }
2488                 if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) {
2489                         XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
2490                 }
2491                 if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) {
2492                         XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
2493                 }
2494                 if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) {
2495                         XVA_SET_REQ(xvap, XAT_PROJINHERIT);
2496                 }
2497
2498                 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
2499                         ASSERT(S_ISREG(ip->i_mode));
2500
2501                 zfs_xvattr_set(zp, xvap, tx);
2502         }
2503
2504         if (fuid_dirtied)
2505                 zfs_fuid_sync(zfsvfs, tx);
2506
2507         if (mask != 0)
2508                 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
2509
2510         mutex_exit(&zp->z_lock);
2511         if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2512                 mutex_exit(&zp->z_acl_lock);
2513
2514         if (attrzp) {
2515                 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2516                         mutex_exit(&attrzp->z_acl_lock);
2517                 mutex_exit(&attrzp->z_lock);
2518         }
2519 out:
2520         if (err == 0 && xattr_count > 0) {
2521                 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
2522                     xattr_count, tx);
2523                 ASSERT(err2 == 0);
2524         }
2525
2526         if (aclp)
2527                 zfs_acl_free(aclp);
2528
2529         if (fuidp) {
2530                 zfs_fuid_info_free(fuidp);
2531                 fuidp = NULL;
2532         }
2533
2534         if (err) {
2535                 dmu_tx_abort(tx);
2536                 if (attrzp)
2537                         zrele(attrzp);
2538                 if (err == ERESTART)
2539                         goto top;
2540         } else {
2541                 if (count > 0)
2542                         err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
2543                 dmu_tx_commit(tx);
2544                 if (attrzp) {
2545                         if (err2 == 0 && handle_eadir)
2546                                 err = zfs_setattr_dir(attrzp);
2547                         zrele(attrzp);
2548                 }
2549                 zfs_znode_update_vfs(zp);
2550         }
2551
2552 out2:
2553         if (os->os_sync == ZFS_SYNC_ALWAYS)
2554                 zil_commit(zilog, 0);
2555
2556 out3:
2557         kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks);
2558         kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks);
2559         kmem_free(tmpxvattr, sizeof (xvattr_t));
2560         zfs_exit(zfsvfs, FTAG);
2561         return (err);
2562 }
2563
2564 typedef struct zfs_zlock {
2565         krwlock_t       *zl_rwlock;     /* lock we acquired */
2566         znode_t         *zl_znode;      /* znode we held */
2567         struct zfs_zlock *zl_next;      /* next in list */
2568 } zfs_zlock_t;
2569
2570 /*
2571  * Drop locks and release vnodes that were held by zfs_rename_lock().
2572  */
2573 static void
2574 zfs_rename_unlock(zfs_zlock_t **zlpp)
2575 {
2576         zfs_zlock_t *zl;
2577
2578         while ((zl = *zlpp) != NULL) {
2579                 if (zl->zl_znode != NULL)
2580                         zfs_zrele_async(zl->zl_znode);
2581                 rw_exit(zl->zl_rwlock);
2582                 *zlpp = zl->zl_next;
2583                 kmem_free(zl, sizeof (*zl));
2584         }
2585 }
2586
2587 /*
2588  * Search back through the directory tree, using the ".." entries.
2589  * Lock each directory in the chain to prevent concurrent renames.
2590  * Fail any attempt to move a directory into one of its own descendants.
2591  * XXX - z_parent_lock can overlap with map or grow locks
2592  */
2593 static int
2594 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
2595 {
2596         zfs_zlock_t     *zl;
2597         znode_t         *zp = tdzp;
2598         uint64_t        rootid = ZTOZSB(zp)->z_root;
2599         uint64_t        oidp = zp->z_id;
2600         krwlock_t       *rwlp = &szp->z_parent_lock;
2601         krw_t           rw = RW_WRITER;
2602
2603         /*
2604          * First pass write-locks szp and compares to zp->z_id.
2605          * Later passes read-lock zp and compare to zp->z_parent.
2606          */
2607         do {
2608                 if (!rw_tryenter(rwlp, rw)) {
2609                         /*
2610                          * Another thread is renaming in this path.
2611                          * Note that if we are a WRITER, we don't have any
2612                          * parent_locks held yet.
2613                          */
2614                         if (rw == RW_READER && zp->z_id > szp->z_id) {
2615                                 /*
2616                                  * Drop our locks and restart
2617                                  */
2618                                 zfs_rename_unlock(&zl);
2619                                 *zlpp = NULL;
2620                                 zp = tdzp;
2621                                 oidp = zp->z_id;
2622                                 rwlp = &szp->z_parent_lock;
2623                                 rw = RW_WRITER;
2624                                 continue;
2625                         } else {
2626                                 /*
2627                                  * Wait for other thread to drop its locks
2628                                  */
2629                                 rw_enter(rwlp, rw);
2630                         }
2631                 }
2632
2633                 zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
2634                 zl->zl_rwlock = rwlp;
2635                 zl->zl_znode = NULL;
2636                 zl->zl_next = *zlpp;
2637                 *zlpp = zl;
2638
2639                 if (oidp == szp->z_id)          /* We're a descendant of szp */
2640                         return (SET_ERROR(EINVAL));
2641
2642                 if (oidp == rootid)             /* We've hit the top */
2643                         return (0);
2644
2645                 if (rw == RW_READER) {          /* i.e. not the first pass */
2646                         int error = zfs_zget(ZTOZSB(zp), oidp, &zp);
2647                         if (error)
2648                                 return (error);
2649                         zl->zl_znode = zp;
2650                 }
2651                 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)),
2652                     &oidp, sizeof (oidp));
2653                 rwlp = &zp->z_parent_lock;
2654                 rw = RW_READER;
2655
2656         } while (zp->z_id != sdzp->z_id);
2657
2658         return (0);
2659 }
2660
2661 /*
2662  * Move an entry from the provided source directory to the target
2663  * directory.  Change the entry name as indicated.
2664  *
2665  *      IN:     sdzp    - Source directory containing the "old entry".
2666  *              snm     - Old entry name.
2667  *              tdzp    - Target directory to contain the "new entry".
2668  *              tnm     - New entry name.
2669  *              cr      - credentials of caller.
2670  *              flags   - case flags
2671  *              rflags  - RENAME_* flags
2672  *              wa_vap  - attributes for RENAME_WHITEOUT (must be a char 0:0).
2673  *              mnt_ns  - user namespace of the mount
2674  *
2675  *      RETURN: 0 on success, error code on failure.
2676  *
2677  * Timestamps:
2678  *      sdzp,tdzp - ctime|mtime updated
2679  */
2680 int
2681 zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm,
2682     cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, zidmap_t *mnt_ns)
2683 {
2684         znode_t         *szp, *tzp;
2685         zfsvfs_t        *zfsvfs = ZTOZSB(sdzp);
2686         zilog_t         *zilog;
2687         zfs_dirlock_t   *sdl, *tdl;
2688         dmu_tx_t        *tx;
2689         zfs_zlock_t     *zl;
2690         int             cmp, serr, terr;
2691         int             error = 0;
2692         int             zflg = 0;
2693         boolean_t       waited = B_FALSE;
2694         /* Needed for whiteout inode creation. */
2695         boolean_t       fuid_dirtied;
2696         zfs_acl_ids_t   acl_ids;
2697         boolean_t       have_acl = B_FALSE;
2698         znode_t         *wzp = NULL;
2699
2700
2701         if (snm == NULL || tnm == NULL)
2702                 return (SET_ERROR(EINVAL));
2703
2704         if (rflags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
2705                 return (SET_ERROR(EINVAL));
2706
2707         /* Already checked by Linux VFS, but just to make sure. */
2708         if (rflags & RENAME_EXCHANGE &&
2709             (rflags & (RENAME_NOREPLACE | RENAME_WHITEOUT)))
2710                 return (SET_ERROR(EINVAL));
2711
2712         /*
2713          * Make sure we only get wo_vap iff. RENAME_WHITEOUT and that it's the
2714          * right kind of vattr_t for the whiteout file. These are set
2715          * internally by ZFS so should never be incorrect.
2716          */
2717         VERIFY_EQUIV(rflags & RENAME_WHITEOUT, wo_vap != NULL);
2718         VERIFY_IMPLY(wo_vap, wo_vap->va_mode == S_IFCHR);
2719         VERIFY_IMPLY(wo_vap, wo_vap->va_rdev == makedevice(0, 0));
2720
2721         if ((error = zfs_enter_verify_zp(zfsvfs, sdzp, FTAG)) != 0)
2722                 return (error);
2723         zilog = zfsvfs->z_log;
2724
2725         if ((error = zfs_verify_zp(tdzp)) != 0) {
2726                 zfs_exit(zfsvfs, FTAG);
2727                 return (error);
2728         }
2729
2730         /*
2731          * We check i_sb because snapshots and the ctldir must have different
2732          * super blocks.
2733          */
2734         if (ZTOI(tdzp)->i_sb != ZTOI(sdzp)->i_sb ||
2735             zfsctl_is_node(ZTOI(tdzp))) {
2736                 zfs_exit(zfsvfs, FTAG);
2737                 return (SET_ERROR(EXDEV));
2738         }
2739
2740         if (zfsvfs->z_utf8 && u8_validate(tnm,
2741             strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
2742                 zfs_exit(zfsvfs, FTAG);
2743                 return (SET_ERROR(EILSEQ));
2744         }
2745
2746         if (flags & FIGNORECASE)
2747                 zflg |= ZCILOOK;
2748
2749 top:
2750         szp = NULL;
2751         tzp = NULL;
2752         zl = NULL;
2753
2754         /*
2755          * This is to prevent the creation of links into attribute space
2756          * by renaming a linked file into/outof an attribute directory.
2757          * See the comment in zfs_link() for why this is considered bad.
2758          */
2759         if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
2760                 zfs_exit(zfsvfs, FTAG);
2761                 return (SET_ERROR(EINVAL));
2762         }
2763
2764         /*
2765          * Lock source and target directory entries.  To prevent deadlock,
2766          * a lock ordering must be defined.  We lock the directory with
2767          * the smallest object id first, or if it's a tie, the one with
2768          * the lexically first name.
2769          */
2770         if (sdzp->z_id < tdzp->z_id) {
2771                 cmp = -1;
2772         } else if (sdzp->z_id > tdzp->z_id) {
2773                 cmp = 1;
2774         } else {
2775                 /*
2776                  * First compare the two name arguments without
2777                  * considering any case folding.
2778                  */
2779                 int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
2780
2781                 cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
2782                 ASSERT(error == 0 || !zfsvfs->z_utf8);
2783                 if (cmp == 0) {
2784                         /*
2785                          * POSIX: "If the old argument and the new argument
2786                          * both refer to links to the same existing file,
2787                          * the rename() function shall return successfully
2788                          * and perform no other action."
2789                          */
2790                         zfs_exit(zfsvfs, FTAG);
2791                         return (0);
2792                 }
2793                 /*
2794                  * If the file system is case-folding, then we may
2795                  * have some more checking to do.  A case-folding file
2796                  * system is either supporting mixed case sensitivity
2797                  * access or is completely case-insensitive.  Note
2798                  * that the file system is always case preserving.
2799                  *
2800                  * In mixed sensitivity mode case sensitive behavior
2801                  * is the default.  FIGNORECASE must be used to
2802                  * explicitly request case insensitive behavior.
2803                  *
2804                  * If the source and target names provided differ only
2805                  * by case (e.g., a request to rename 'tim' to 'Tim'),
2806                  * we will treat this as a special case in the
2807                  * case-insensitive mode: as long as the source name
2808                  * is an exact match, we will allow this to proceed as
2809                  * a name-change request.
2810                  */
2811                 if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
2812                     (zfsvfs->z_case == ZFS_CASE_MIXED &&
2813                     flags & FIGNORECASE)) &&
2814                     u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
2815                     &error) == 0) {
2816                         /*
2817                          * case preserving rename request, require exact
2818                          * name matches
2819                          */
2820                         zflg |= ZCIEXACT;
2821                         zflg &= ~ZCILOOK;
2822                 }
2823         }
2824
2825         /*
2826          * If the source and destination directories are the same, we should
2827          * grab the z_name_lock of that directory only once.
2828          */
2829         if (sdzp == tdzp) {
2830                 zflg |= ZHAVELOCK;
2831                 rw_enter(&sdzp->z_name_lock, RW_READER);
2832         }
2833
2834         if (cmp < 0) {
2835                 serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
2836                     ZEXISTS | zflg, NULL, NULL);
2837                 terr = zfs_dirent_lock(&tdl,
2838                     tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
2839         } else {
2840                 terr = zfs_dirent_lock(&tdl,
2841                     tdzp, tnm, &tzp, zflg, NULL, NULL);
2842                 serr = zfs_dirent_lock(&sdl,
2843                     sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
2844                     NULL, NULL);
2845         }
2846
2847         if (serr) {
2848                 /*
2849                  * Source entry invalid or not there.
2850                  */
2851                 if (!terr) {
2852                         zfs_dirent_unlock(tdl);
2853                         if (tzp)
2854                                 zrele(tzp);
2855                 }
2856
2857                 if (sdzp == tdzp)
2858                         rw_exit(&sdzp->z_name_lock);
2859
2860                 if (strcmp(snm, "..") == 0)
2861                         serr = EINVAL;
2862                 zfs_exit(zfsvfs, FTAG);
2863                 return (serr);
2864         }
2865         if (terr) {
2866                 zfs_dirent_unlock(sdl);
2867                 zrele(szp);
2868
2869                 if (sdzp == tdzp)
2870                         rw_exit(&sdzp->z_name_lock);
2871
2872                 if (strcmp(tnm, "..") == 0)
2873                         terr = EINVAL;
2874                 zfs_exit(zfsvfs, FTAG);
2875                 return (terr);
2876         }
2877
2878         /*
2879          * If we are using project inheritance, means if the directory has
2880          * ZFS_PROJINHERIT set, then its descendant directories will inherit
2881          * not only the project ID, but also the ZFS_PROJINHERIT flag. Under
2882          * such case, we only allow renames into our tree when the project
2883          * IDs are the same.
2884          */
2885         if (tdzp->z_pflags & ZFS_PROJINHERIT &&
2886             tdzp->z_projid != szp->z_projid) {
2887                 error = SET_ERROR(EXDEV);
2888                 goto out;
2889         }
2890
2891         /*
2892          * Must have write access at the source to remove the old entry
2893          * and write access at the target to create the new entry.
2894          * Note that if target and source are the same, this can be
2895          * done in a single check.
2896          */
2897         if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr, mnt_ns)))
2898                 goto out;
2899
2900         if (S_ISDIR(ZTOI(szp)->i_mode)) {
2901                 /*
2902                  * Check to make sure rename is valid.
2903                  * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
2904                  */
2905                 if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl)))
2906                         goto out;
2907         }
2908
2909         /*
2910          * Does target exist?
2911          */
2912         if (tzp) {
2913                 if (rflags & RENAME_NOREPLACE) {
2914                         error = SET_ERROR(EEXIST);
2915                         goto out;
2916                 }
2917                 /*
2918                  * Source and target must be the same type (unless exchanging).
2919                  */
2920                 if (!(rflags & RENAME_EXCHANGE)) {
2921                         boolean_t s_is_dir = S_ISDIR(ZTOI(szp)->i_mode) != 0;
2922                         boolean_t t_is_dir = S_ISDIR(ZTOI(tzp)->i_mode) != 0;
2923
2924                         if (s_is_dir != t_is_dir) {
2925                                 error = SET_ERROR(s_is_dir ? ENOTDIR : EISDIR);
2926                                 goto out;
2927                         }
2928                 }
2929                 /*
2930                  * POSIX dictates that when the source and target
2931                  * entries refer to the same file object, rename
2932                  * must do nothing and exit without error.
2933                  */
2934                 if (szp->z_id == tzp->z_id) {
2935                         error = 0;
2936                         goto out;
2937                 }
2938         } else if (rflags & RENAME_EXCHANGE) {
2939                 /* Target must exist for RENAME_EXCHANGE. */
2940                 error = SET_ERROR(ENOENT);
2941                 goto out;
2942         }
2943
2944         /* Set up inode creation for RENAME_WHITEOUT. */
2945         if (rflags & RENAME_WHITEOUT) {
2946                 /*
2947                  * Whiteout files are not regular files or directories, so to
2948                  * match zfs_create() we do not inherit the project id.
2949                  */
2950                 uint64_t wo_projid = ZFS_DEFAULT_PROJID;
2951
2952                 error = zfs_zaccess(sdzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns);
2953                 if (error)
2954                         goto out;
2955
2956                 if (!have_acl) {
2957                         error = zfs_acl_ids_create(sdzp, 0, wo_vap, cr, NULL,
2958                             &acl_ids, mnt_ns);
2959                         if (error)
2960                                 goto out;
2961                         have_acl = B_TRUE;
2962                 }
2963
2964                 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, wo_projid)) {
2965                         error = SET_ERROR(EDQUOT);
2966                         goto out;
2967                 }
2968         }
2969
2970         tx = dmu_tx_create(zfsvfs->z_os);
2971         dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
2972         dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
2973         dmu_tx_hold_zap(tx, sdzp->z_id,
2974             (rflags & RENAME_EXCHANGE) ? TRUE : FALSE, snm);
2975         dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
2976         if (sdzp != tdzp) {
2977                 dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
2978                 zfs_sa_upgrade_txholds(tx, tdzp);
2979         }
2980         if (tzp) {
2981                 dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
2982                 zfs_sa_upgrade_txholds(tx, tzp);
2983         }
2984         if (rflags & RENAME_WHITEOUT) {
2985                 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
2986                     ZFS_SA_BASE_ATTR_SIZE);
2987
2988                 dmu_tx_hold_zap(tx, sdzp->z_id, TRUE, snm);
2989                 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
2990                 if (!zfsvfs->z_use_sa &&
2991                     acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2992                         dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2993                             0, acl_ids.z_aclp->z_acl_bytes);
2994                 }
2995         }
2996         fuid_dirtied = zfsvfs->z_fuid_dirty;
2997         if (fuid_dirtied)
2998                 zfs_fuid_txhold(zfsvfs, tx);
2999         zfs_sa_upgrade_txholds(tx, szp);
3000         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3001         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
3002         if (error) {
3003                 if (zl != NULL)
3004                         zfs_rename_unlock(&zl);
3005                 zfs_dirent_unlock(sdl);
3006                 zfs_dirent_unlock(tdl);
3007
3008                 if (sdzp == tdzp)
3009                         rw_exit(&sdzp->z_name_lock);
3010
3011                 if (error == ERESTART) {
3012                         waited = B_TRUE;
3013                         dmu_tx_wait(tx);
3014                         dmu_tx_abort(tx);
3015                         zrele(szp);
3016                         if (tzp)
3017                                 zrele(tzp);
3018                         goto top;
3019                 }
3020                 dmu_tx_abort(tx);
3021                 zrele(szp);
3022                 if (tzp)
3023                         zrele(tzp);
3024                 zfs_exit(zfsvfs, FTAG);
3025                 return (error);
3026         }
3027
3028         /*
3029          * Unlink the source.
3030          */
3031         szp->z_pflags |= ZFS_AV_MODIFIED;
3032         if (tdzp->z_pflags & ZFS_PROJINHERIT)
3033                 szp->z_pflags |= ZFS_PROJINHERIT;
3034
3035         error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
3036             (void *)&szp->z_pflags, sizeof (uint64_t), tx);
3037         VERIFY0(error);
3038
3039         error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
3040         if (error)
3041                 goto commit;
3042
3043         /*
3044          * Unlink the target.
3045          */
3046         if (tzp) {
3047                 int tzflg = zflg;
3048
3049                 if (rflags & RENAME_EXCHANGE) {
3050                         /* This inode will be re-linked soon. */
3051                         tzflg |= ZRENAMING;
3052
3053                         tzp->z_pflags |= ZFS_AV_MODIFIED;
3054                         if (sdzp->z_pflags & ZFS_PROJINHERIT)
3055                                 tzp->z_pflags |= ZFS_PROJINHERIT;
3056
3057                         error = sa_update(tzp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
3058                             (void *)&tzp->z_pflags, sizeof (uint64_t), tx);
3059                         ASSERT0(error);
3060                 }
3061                 error = zfs_link_destroy(tdl, tzp, tx, tzflg, NULL);
3062                 if (error)
3063                         goto commit_link_szp;
3064         }
3065
3066         /*
3067          * Create the new target links:
3068          *   * We always link the target.
3069          *   * RENAME_EXCHANGE: Link the old target to the source.
3070          *   * RENAME_WHITEOUT: Create a whiteout inode in-place of the source.
3071          */
3072         error = zfs_link_create(tdl, szp, tx, ZRENAMING);
3073         if (error) {
3074                 /*
3075                  * If we have removed the existing target, a subsequent call to
3076                  * zfs_link_create() to add back the same entry, but with a new
3077                  * dnode (szp), should not fail.
3078                  */
3079                 ASSERT3P(tzp, ==, NULL);
3080                 goto commit_link_tzp;
3081         }
3082
3083         switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) {
3084         case RENAME_EXCHANGE:
3085                 error = zfs_link_create(sdl, tzp, tx, ZRENAMING);
3086                 /*
3087                  * The same argument as zfs_link_create() failing for
3088                  * szp applies here, since the source directory must
3089                  * have had an entry we are replacing.
3090                  */
3091                 ASSERT0(error);
3092                 if (error)
3093                         goto commit_unlink_td_szp;
3094                 break;
3095         case RENAME_WHITEOUT:
3096                 zfs_mknode(sdzp, wo_vap, tx, cr, 0, &wzp, &acl_ids);
3097                 error = zfs_link_create(sdl, wzp, tx, ZNEW);
3098                 if (error) {
3099                         zfs_znode_delete(wzp, tx);
3100                         remove_inode_hash(ZTOI(wzp));
3101                         goto commit_unlink_td_szp;
3102                 }
3103                 break;
3104         }
3105
3106         if (fuid_dirtied)
3107                 zfs_fuid_sync(zfsvfs, tx);
3108
3109         switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) {
3110         case RENAME_EXCHANGE:
3111                 zfs_log_rename_exchange(zilog, tx,
3112                     (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name,
3113                     tdzp, tdl->dl_name, szp);
3114                 break;
3115         case RENAME_WHITEOUT:
3116                 zfs_log_rename_whiteout(zilog, tx,
3117                     (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name,
3118                     tdzp, tdl->dl_name, szp, wzp);
3119                 break;
3120         default:
3121                 ASSERT0(rflags & ~RENAME_NOREPLACE);
3122                 zfs_log_rename(zilog, tx, (flags & FIGNORECASE ? TX_CI : 0),
3123                     sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp);
3124                 break;
3125         }
3126
3127 commit:
3128         dmu_tx_commit(tx);
3129 out:
3130         if (have_acl)
3131                 zfs_acl_ids_free(&acl_ids);
3132
3133         zfs_znode_update_vfs(sdzp);
3134         if (sdzp == tdzp)
3135                 rw_exit(&sdzp->z_name_lock);
3136
3137         if (sdzp != tdzp)
3138                 zfs_znode_update_vfs(tdzp);
3139
3140         zfs_znode_update_vfs(szp);
3141         zrele(szp);
3142         if (wzp) {
3143                 zfs_znode_update_vfs(wzp);
3144                 zrele(wzp);
3145         }
3146         if (tzp) {
3147                 zfs_znode_update_vfs(tzp);
3148                 zrele(tzp);
3149         }
3150
3151         if (zl != NULL)
3152                 zfs_rename_unlock(&zl);
3153
3154         zfs_dirent_unlock(sdl);
3155         zfs_dirent_unlock(tdl);
3156
3157         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3158                 zil_commit(zilog, 0);
3159
3160         zfs_exit(zfsvfs, FTAG);
3161         return (error);
3162
3163         /*
3164          * Clean-up path for broken link state.
3165          *
3166          * At this point we are in a (very) bad state, so we need to do our
3167          * best to correct the state. In particular, all of the nlinks are
3168          * wrong because we were destroying and creating links with ZRENAMING.
3169          *
3170          * In some form, all of these operations have to resolve the state:
3171          *
3172          *  * link_destroy() *must* succeed. Fortunately, this is very likely
3173          *    since we only just created it.
3174          *
3175          *  * link_create()s are allowed to fail (though they shouldn't because
3176          *    we only just unlinked them and are putting the entries back
3177          *    during clean-up). But if they fail, we can just forcefully drop
3178          *    the nlink value to (at the very least) avoid broken nlink values
3179          *    -- though in the case of non-empty directories we will have to
3180          *    panic (otherwise we'd have a leaked directory with a broken ..).
3181          */
3182 commit_unlink_td_szp:
3183         VERIFY0(zfs_link_destroy(tdl, szp, tx, ZRENAMING, NULL));
3184 commit_link_tzp:
3185         if (tzp) {
3186                 if (zfs_link_create(tdl, tzp, tx, ZRENAMING))
3187                         VERIFY0(zfs_drop_nlink(tzp, tx, NULL));
3188         }
3189 commit_link_szp:
3190         if (zfs_link_create(sdl, szp, tx, ZRENAMING))
3191                 VERIFY0(zfs_drop_nlink(szp, tx, NULL));
3192         goto commit;
3193 }
3194
3195 /*
3196  * Insert the indicated symbolic reference entry into the directory.
3197  *
3198  *      IN:     dzp     - Directory to contain new symbolic link.
3199  *              name    - Name of directory entry in dzp.
3200  *              vap     - Attributes of new entry.
3201  *              link    - Target path stored in the new symlink entry.
3202  *              cr      - credentials of caller.
3203  *              flags   - case flags (FIGNORECASE selects ZCILOOK and TX_CI)
3204  *              mnt_ns  - user namespace of the mount
3205  *
3206  *      OUT:    zpp     - Znode for new symbolic link.  Cleared to NULL
      *                        once the initial validation has passed and set
      *                        to the new znode only on success.
3207  *
3208  *      RETURN: 0 on success, error code on failure.
      *              EINVAL       - name is NULL.
      *              EILSEQ       - name fails u8_validate() on a z_utf8 fs.
      *              ENAMETOOLONG - strlen(link) exceeds MAXPATHLEN.
      *              EDQUOT       - zfs_acl_ids_overquota() reports over quota.
      *              Errors from zfs_enter_verify_zp(), zfs_acl_ids_create(),
      *              zfs_dirent_lock(), zfs_zaccess(), dmu_tx_assign() and
      *              zfs_link_create() are returned unchanged.
3209  *
3210  * Timestamps:
3211  *      dzp - ctime|mtime updated
3212  */
3213 int
3214 zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link,
3215     znode_t **zpp, cred_t *cr, int flags, zidmap_t *mnt_ns)
3216 {
3217         znode_t         *zp;
3218         zfs_dirlock_t   *dl;
3219         dmu_tx_t        *tx;
3220         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
3221         zilog_t         *zilog;
3222         uint64_t        len = strlen(link);
3223         int             error;
3224         int             zflg = ZNEW;
3225         zfs_acl_ids_t   acl_ids;
3226         boolean_t       fuid_dirtied;
3227         uint64_t        txtype = TX_SYMLINK;
3228         boolean_t       waited = B_FALSE;
3229
3230         ASSERT(S_ISLNK(vap->va_mode));
3231
3232         if (name == NULL)
3233                 return (SET_ERROR(EINVAL));
3234
3235         if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
3236                 return (error);
3237         zilog = zfsvfs->z_log;
3238
             /* On UTF-8 only filesystems the entry name must validate. */
3239         if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
3240             NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3241                 zfs_exit(zfsvfs, FTAG);
3242                 return (SET_ERROR(EILSEQ));
3243         }
3244         if (flags & FIGNORECASE)
3245                 zflg |= ZCILOOK;
3246
3247         if (len > MAXPATHLEN) {
3248                 zfs_exit(zfsvfs, FTAG);
3249                 return (SET_ERROR(ENAMETOOLONG));
3250         }
3251
             /*
              * The ACL ids are created once, before the retry label, and
              * must be freed on every exit path from here on.
              */
3252         if ((error = zfs_acl_ids_create(dzp, 0,
3253             vap, cr, NULL, &acl_ids, mnt_ns)) != 0) {
3254                 zfs_exit(zfsvfs, FTAG);
3255                 return (error);
3256         }
             /* Retry point after dmu_tx_assign() returns ERESTART. */
3257 top:
3258         *zpp = NULL;
3259
3260         /*
3261          * Attempt to lock directory; fail if entry already exists.
3262          */
3263         error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
3264         if (error) {
3265                 zfs_acl_ids_free(&acl_ids);
3266                 zfs_exit(zfsvfs, FTAG);
3267                 return (error);
3268         }
3269
3270         if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) {
3271                 zfs_acl_ids_free(&acl_ids);
3272                 zfs_dirent_unlock(dl);
3273                 zfs_exit(zfsvfs, FTAG);
3274                 return (error);
3275         }
3276
3277         if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) {
3278                 zfs_acl_ids_free(&acl_ids);
3279                 zfs_dirent_unlock(dl);
3280                 zfs_exit(zfsvfs, FTAG);
3281                 return (SET_ERROR(EDQUOT));
3282         }
             /*
              * Declare the holds the transaction needs: the new object's
              * data (at least one byte), the directory ZAP entry, the new
              * SA, the parent's SA and, when required, an external ACL
              * object and the FUID table.
              */
3283         tx = dmu_tx_create(zfsvfs->z_os);
3284         fuid_dirtied = zfsvfs->z_fuid_dirty;
3285         dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
3286         dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
3287         dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
3288             ZFS_SA_BASE_ATTR_SIZE + len);
3289         dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
3290         if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3291                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
3292                     acl_ids.z_aclp->z_acl_bytes);
3293         }
3294         if (fuid_dirtied)
3295                 zfs_fuid_txhold(zfsvfs, tx);
3296         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
3297         if (error) {
                     /*
                      * The dirent lock is dropped before waiting; on
                      * ERESTART the whole lock/check sequence is redone
                      * from "top" with TXG_NOTHROTTLE set.
                      */
3298                 zfs_dirent_unlock(dl);
3299                 if (error == ERESTART) {
3300                         waited = B_TRUE;
3301                         dmu_tx_wait(tx);
3302                         dmu_tx_abort(tx);
3303                         goto top;
3304                 }
3305                 zfs_acl_ids_free(&acl_ids);
3306                 dmu_tx_abort(tx);
3307                 zfs_exit(zfsvfs, FTAG);
3308                 return (error);
3309         }
3310
3311         /*
3312          * Create a new object for the symlink.
3313          * for version 4 ZPL datasets the symlink will be an SA attribute
3314          */
3315         zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
3316
3317         if (fuid_dirtied)
3318                 zfs_fuid_sync(zfsvfs, tx);
3319
             /*
              * Store the link target under z_lock.
              * NOTE(review): the sa_update() result assigned to error here
              * is overwritten by zfs_link_create() below without being
              * checked -- confirm that ignoring it is intentional.
              */
3320         mutex_enter(&zp->z_lock);
3321         if (zp->z_is_sa)
3322                 error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
3323                     link, len, tx);
3324         else
3325                 zfs_sa_symlink(zp, link, len, tx);
3326         mutex_exit(&zp->z_lock);
3327
             /* The symlink's size is the length of its target path. */
3328         zp->z_size = len;
3329         (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
3330             &zp->z_size, sizeof (zp->z_size), tx);
3331         /*
3332          * Insert the new object into the directory.
3333          */
3334         error = zfs_link_create(dl, zp, tx, ZNEW);
3335         if (error != 0) {
                     /* Undo the create: delete the znode, unhash its inode. */
3336                 zfs_znode_delete(zp, tx);
3337                 remove_inode_hash(ZTOI(zp));
3338         } else {
3339                 if (flags & FIGNORECASE)
3340                         txtype |= TX_CI;
3341                 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
3342
3343                 zfs_znode_update_vfs(dzp);
3344                 zfs_znode_update_vfs(zp);
3345         }
3346
3347         zfs_acl_ids_free(&acl_ids);
3348
3349         dmu_tx_commit(tx);
3350
3351         zfs_dirent_unlock(dl);
3352
3353         if (error == 0) {
3354                 *zpp = zp;
3355
3356                 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3357                         zil_commit(zilog, 0);
3358         } else {
                     /* Failure: drop the reference on the new znode. */
3359                 zrele(zp);
3360         }
3361
3362         zfs_exit(zfsvfs, FTAG);
3363         return (error);
3364 }
3365
3366 /*
3367  * Return, in the buffer contained in the provided uio structure,
3368  * the symbolic path referred to by ip.
3369  *
3370  *      IN:     ip      - inode of symbolic link
3371  *              uio     - structure to contain the link path.
3372  *              cr      - credentials of caller (unused).
3373  *
3374  *      RETURN: 0 if success
3375  *              error code if failure
      *
      * The link target is read while holding zp->z_lock: through
      * sa_lookup_uio() when the znode uses system attributes (z_is_sa),
      * otherwise through zfs_sa_readlink().
3376  *
3377  * Timestamps:
3378  *      ip - atime updated
3379  */
3380 int
3381 zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr)
3382 {
3383         (void) cr;
3384         znode_t         *zp = ITOZ(ip);
3385         zfsvfs_t        *zfsvfs = ITOZSB(ip);
3386         int             error;
3387
3388         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
3389                 return (error);
3390
3391         mutex_enter(&zp->z_lock);
3392         if (zp->z_is_sa)
3393                 error = sa_lookup_uio(zp->z_sa_hdl,
3394                     SA_ZPL_SYMLINK(zfsvfs), uio);
3395         else
3396                 error = zfs_sa_readlink(zp, uio);
3397         mutex_exit(&zp->z_lock);
3398
3399         zfs_exit(zfsvfs, FTAG);
3400         return (error);
3401 }
3402
3403 /*
3404  * Insert a new entry into directory tdzp referencing szp.
3405  *
3406  *      IN:     tdzp    - Directory to contain new entry.
3407  *              szp     - znode of new entry.
3408  *              name    - name of new entry.
3409  *              cr      - credentials of caller.
3410  *              flags   - case flags (FIGNORECASE selects ZCILOOK and TX_CI).
3411  *
3412  *      RETURN: 0 if success
3413  *              error code if failure
      *              EINVAL - name is NULL, or exactly one of szp/tdzp has
      *                       ZFS_XATTR set.
      *              EPERM  - szp is a directory, szp's parent is the shares
      *                       directory, or the caller is not the owner and
      *                       secpolicy_basic_link() denies the request.
      *              EXDEV  - project ids differ under ZFS_PROJINHERIT, the
      *                       super blocks differ, or szp is a ctldir node.
      *              EILSEQ - name fails u8_validate() on a z_utf8 fs.
      *
      * When HAVE_TMPFILE is defined and szp has no links but is marked
      * I_LINKABLE, the link is not logged to the ZIL; the entry is removed
      * from the unlinked set and the txg is waited on instead (unless sync
      * is disabled).
3414  *
3415  * Timestamps:
3416  *      tdzp - ctime|mtime updated
3417  *       szp - ctime updated
3418  */
3419 int
3420 zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr,
3421     int flags)
3422 {
3423         struct inode *sip = ZTOI(szp);
3424         znode_t         *tzp;
3425         zfsvfs_t        *zfsvfs = ZTOZSB(tdzp);
3426         zilog_t         *zilog;
3427         zfs_dirlock_t   *dl;
3428         dmu_tx_t        *tx;
3429         int             error;
3430         int             zf = ZNEW;
3431         uint64_t        parent;
3432         uid_t           owner;
3433         boolean_t       waited = B_FALSE;
3434         boolean_t       is_tmpfile = 0;
3435         uint64_t        txg;
3436 #ifdef HAVE_TMPFILE
3437         is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE));
3438 #endif
3439         ASSERT(S_ISDIR(ZTOI(tdzp)->i_mode));
3440
3441         if (name == NULL)
3442                 return (SET_ERROR(EINVAL));
3443
3444         if ((error = zfs_enter_verify_zp(zfsvfs, tdzp, FTAG)) != 0)
3445                 return (error);
3446         zilog = zfsvfs->z_log;
3447
3448         /*
3449          * POSIX dictates that we return EPERM here.
3450          * Better choices include ENOTSUP or EISDIR.
3451          */
3452         if (S_ISDIR(sip->i_mode)) {
3453                 zfs_exit(zfsvfs, FTAG);
3454                 return (SET_ERROR(EPERM));
3455         }
3456
3457         if ((error = zfs_verify_zp(szp)) != 0) {
3458                 zfs_exit(zfsvfs, FTAG);
3459                 return (error);
3460         }
3461
3462         /*
3463          * If we are using project inheritance, means if the directory has
3464          * ZFS_PROJINHERIT set, then its descendant directories will inherit
3465          * not only the project ID, but also the ZFS_PROJINHERIT flag. Under
3466          * such case, we only allow hard link creation in our tree when the
3467          * project IDs are the same.
3468          */
3469         if (tdzp->z_pflags & ZFS_PROJINHERIT &&
3470             tdzp->z_projid != szp->z_projid) {
3471                 zfs_exit(zfsvfs, FTAG);
3472                 return (SET_ERROR(EXDEV));
3473         }
3474
3475         /*
3476          * We check i_sb because snapshots and the ctldir must have different
3477          * super blocks.
3478          */
3479         if (sip->i_sb != ZTOI(tdzp)->i_sb || zfsctl_is_node(sip)) {
3480                 zfs_exit(zfsvfs, FTAG);
3481                 return (SET_ERROR(EXDEV));
3482         }
3483
3484         /* Prevent links to .zfs/shares files */
3485
3486         if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
3487             &parent, sizeof (uint64_t))) != 0) {
3488                 zfs_exit(zfsvfs, FTAG);
3489                 return (error);
3490         }
3491         if (parent == zfsvfs->z_shares_dir) {
3492                 zfs_exit(zfsvfs, FTAG);
3493                 return (SET_ERROR(EPERM));
3494         }
3495
3496         if (zfsvfs->z_utf8 && u8_validate(name,
3497             strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3498                 zfs_exit(zfsvfs, FTAG);
3499                 return (SET_ERROR(EILSEQ));
3500         }
3501         if (flags & FIGNORECASE)
3502                 zf |= ZCILOOK;
3503
3504         /*
3505          * We do not support links between attributes and non-attributes
3506          * because of the potential security risk of creating links
3507          * into "normal" file space in order to circumvent restrictions
3508          * imposed in attribute space.
3509          */
3510         if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) {
3511                 zfs_exit(zfsvfs, FTAG);
3512                 return (SET_ERROR(EINVAL));
3513         }
3514
             /*
              * Only the owner of the source file may link it unless
              * secpolicy_basic_link() allows the caller to do so.
              */
3515         owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid),
3516             cr, ZFS_OWNER);
3517         if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
3518                 zfs_exit(zfsvfs, FTAG);
3519                 return (SET_ERROR(EPERM));
3520         }
3521
             /*
              * This function takes no mount namespace argument; the access
              * check is made against zfs_init_idmap.
              */
3522         if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr,
3523             zfs_init_idmap))) {
3524                 zfs_exit(zfsvfs, FTAG);
3525                 return (error);
3526         }
3527
             /* Retry point after dmu_tx_assign() returns ERESTART. */
3528 top:
3529         /*
3530          * Attempt to lock directory; fail if entry already exists.
3531          */
3532         error = zfs_dirent_lock(&dl, tdzp, name, &tzp, zf, NULL, NULL);
3533         if (error) {
3534                 zfs_exit(zfsvfs, FTAG);
3535                 return (error);
3536         }
3537
             /*
              * Hold the source SA and the target directory ZAP; a tmpfile
              * additionally needs the unlinked set, from which it is
              * removed below.
              */
3538         tx = dmu_tx_create(zfsvfs->z_os);
3539         dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3540         dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name);
3541         if (is_tmpfile)
3542                 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3543
3544         zfs_sa_upgrade_txholds(tx, szp);
3545         zfs_sa_upgrade_txholds(tx, tdzp);
3546         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
3547         if (error) {
                     /* Drop the dirent lock before waiting or failing. */
3548                 zfs_dirent_unlock(dl);
3549                 if (error == ERESTART) {
3550                         waited = B_TRUE;
3551                         dmu_tx_wait(tx);
3552                         dmu_tx_abort(tx);
3553                         goto top;
3554                 }
3555                 dmu_tx_abort(tx);
3556                 zfs_exit(zfsvfs, FTAG);
3557                 return (error);
3558         }
3559         /* unmark z_unlinked so zfs_link_create will not reject */
3560         if (is_tmpfile)
3561                 szp->z_unlinked = B_FALSE;
3562         error = zfs_link_create(dl, szp, tx, 0);
3563
3564         if (error == 0) {
3565                 uint64_t txtype = TX_LINK;
3566                 /*
3567                  * tmpfile is created to be in z_unlinkedobj, so remove it.
3568                  * Also, we don't log in ZIL, because all previous file
3569                  * operation on the tmpfile are ignored by ZIL. Instead we
3570                  * always wait for txg to sync to make sure all previous
3571                  * operation are sync safe.
3572                  */
3573                 if (is_tmpfile) {
3574                         VERIFY(zap_remove_int(zfsvfs->z_os,
3575                             zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0);
3576                 } else {
3577                         if (flags & FIGNORECASE)
3578                                 txtype |= TX_CI;
3579                         zfs_log_link(zilog, tx, txtype, tdzp, szp, name);
3580                 }
3581         } else if (is_tmpfile) {
3582                 /* restore z_unlinked since when linking failed */
3583                 szp->z_unlinked = B_TRUE;
3584         }
             /* Record the txg before committing; the tmpfile path waits on it. */
3585         txg = dmu_tx_get_txg(tx);
3586         dmu_tx_commit(tx);
3587
3588         zfs_dirent_unlock(dl);
3589
3590         if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3591                 zil_commit(zilog, 0);
3592
3593         if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED)
3594                 txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), txg);
3595
             /* Both znodes are refreshed whether or not the link succeeded. */
3596         zfs_znode_update_vfs(tdzp);
3597         zfs_znode_update_vfs(szp);
3598         zfs_exit(zfsvfs, FTAG);
3599         return (error);
3600 }
3601
     /*
      * Commit callback handed to zfs_log_write() by zfs_putpage() when the
      * caller asked for synchronous writeback (for_sync).  arg is the page
      * that was written; its error flag is cleared and writeback is ended.
      */
3602 static void
3603 zfs_putpage_sync_commit_cb(void *arg)
3604 {
3605         struct page *pp = arg;
3606
3607         ClearPageError(pp);
3608         end_page_writeback(pp);
3609 }
3610
     /*
      * Commit callback handed to zfs_log_write() by zfs_putpage() when the
      * caller did not ask for synchronous writeback.  Besides clearing the
      * page error flag and ending writeback, it drops the
      * z_async_writes_cnt reference that zfs_putpage() took on the znode
      * owning the page's mapping.
      */
3611 static void
3612 zfs_putpage_async_commit_cb(void *arg)
3613 {
3614         struct page *pp = arg;
3615         znode_t *zp = ITOZ(pp->mapping->host);
3616
3617         ClearPageError(pp);
3618         end_page_writeback(pp);
3619         atomic_dec_32(&zp->z_async_writes_cnt);
3620 }
3621
3622 /*
3623  * Push a page out to disk, once the page is on stable storage the
3624  * registered commit callback will be run as notification of completion.
3625  *
3626  *      IN:     ip       - page mapped for inode.
3627  *              pp       - page to push (page is locked)
3628  *              wbc      - writeback control data
3629  *              for_sync - does the caller intend to wait synchronously for the
3630  *                         page writeback to complete?
3631  *
3632  *      RETURN: 0 if success
3633  *              error code if failure
      *
      * 0 is also returned when nothing is written: the page lies beyond
      * end of file, its mapping changed, it is no longer dirty, or it is
      * already under writeback.  Errors come from zfs_enter_verify_zp(),
      * dmu_tx_assign() or sa_bulk_update().
      *
      * The page is unlocked on every return path except the
      * zfs_enter_verify_zp() failure, which returns before touching it.
3634  *
3635  * Timestamps:
3636  *      ip - ctime|mtime updated
3637  */
3638 int
3639 zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
3640     boolean_t for_sync)
3641 {
3642         znode_t         *zp = ITOZ(ip);
3643         zfsvfs_t        *zfsvfs = ITOZSB(ip);
3644         loff_t          offset;
3645         loff_t          pgoff;
3646         unsigned int    pglen;
3647         dmu_tx_t        *tx;
3648         caddr_t         va;
3649         int             err = 0;
3650         uint64_t        mtime[2], ctime[2];
3651         sa_bulk_attr_t  bulk[3];
3652         int             cnt = 0;
3653         struct address_space *mapping;
3654
3655         if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
3656                 return (err);
3657
3658         ASSERT(PageLocked(pp));
3659
3660         pgoff = page_offset(pp);        /* Page byte-offset in file */
3661         offset = i_size_read(ip);       /* File length in bytes */
3662         pglen = MIN(PAGE_SIZE,          /* Page length in bytes */
3663             P2ROUNDUP(offset, PAGE_SIZE)-pgoff);
3664
3665         /* Page is beyond end of file */
3666         if (pgoff >= offset) {
3667                 unlock_page(pp);
3668                 zfs_exit(zfsvfs, FTAG);
3669                 return (0);
3670         }
3671
3672         /* Truncate page length to end of file */
3673         if (pgoff + pglen > offset)
3674                 pglen = offset - pgoff;
3675
3676 #if 0
3677         /*
3678          * FIXME: Allow mmap writes past its quota.  The correct fix
3679          * is to register a page_mkwrite() handler to count the page
3680          * against its quota when it is about to be dirtied.
3681          */
3682         if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT,
3683             KUID_TO_SUID(ip->i_uid)) ||
3684             zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT,
3685             KGID_TO_SGID(ip->i_gid)) ||
3686             (zp->z_projid != ZFS_DEFAULT_PROJID &&
3687             zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
3688             zp->z_projid))) {
3689                 err = EDQUOT;
3690         }
3691 #endif
3692
3693         /*
3694          * The ordering here is critical and must adhere to the following
3695          * rules in order to avoid deadlocking in either zfs_read() or
3696          * zfs_free_range() due to a lock inversion.
3697          *
3698          * 1) The page must be unlocked prior to acquiring the range lock.
3699          *    This is critical because zfs_read() calls find_lock_page()
3700          *    which may block on the page lock while holding the range lock.
3701          *
3702          * 2) Before setting or clearing write back on a page the range lock
3703          *    must be held in order to prevent a lock inversion with the
3704          *    zfs_free_range() function.
3705          *
3706          * This presents a problem because upon entering this function the
3707          * page lock is already held.  To safely acquire the range lock the
3708          * page lock must be dropped.  This creates a window where another
3709          * process could truncate, invalidate, dirty, or write out the page.
3710          *
3711          * Therefore, after successfully reacquiring the range and page locks
3712          * the current page state is checked.  In the common case everything
3713          * will be as is expected and it can be written out.  However, if
3714          * the page state has changed it must be handled accordingly.
3715          */
             /* Remember the mapping so a change can be detected after relocking. */
3716         mapping = pp->mapping;
3717         redirty_page_for_writepage(wbc, pp);
3718         unlock_page(pp);
3719
3720         zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock,
3721             pgoff, pglen, RL_WRITER);
3722         lock_page(pp);
3723
3724         /* Page mapping changed or it was no longer dirty, we're done */
3725         if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) {
3726                 unlock_page(pp);
3727                 zfs_rangelock_exit(lr);
3728                 zfs_exit(zfsvfs, FTAG);
3729                 return (0);
3730         }
3731
3732         /* Another process already started writeback; block on it if required */
3733         if (PageWriteback(pp)) {
3734                 unlock_page(pp);
3735                 zfs_rangelock_exit(lr);
3736
3737                 if (wbc->sync_mode != WB_SYNC_NONE) {
3738                         /*
3739                          * Speed up any non-sync page writebacks since
3740                          * they may take several seconds to complete.
3741                          * Refer to the comment in zpl_fsync() (when
3742                          * HAVE_FSYNC_RANGE is defined) for details.
3743                          */
3744                         if (atomic_load_32(&zp->z_async_writes_cnt) > 0) {
3745                                 zil_commit(zfsvfs->z_log, zp->z_id);
3746                         }
3747
3748                         if (PageWriteback(pp))
3749 #ifdef HAVE_PAGEMAP_FOLIO_WAIT_BIT
3750                                 folio_wait_bit(page_folio(pp), PG_writeback);
3751 #else
3752                                 wait_on_page_bit(pp, PG_writeback);
3753 #endif
3754                 }
3755
3756                 zfs_exit(zfsvfs, FTAG);
3757                 return (0);
3758         }
3759
3760         /* Clear the dirty flag now that the required locks are held */
3761         if (!clear_page_dirty_for_io(pp)) {
3762                 unlock_page(pp);
3763                 zfs_rangelock_exit(lr);
3764                 zfs_exit(zfsvfs, FTAG);
3765                 return (0);
3766         }
3767
3768         /*
3769          * Counterpart for redirty_page_for_writepage() above.  This page
3770          * was in fact not skipped and should not be counted as if it were.
3771          */
3772         wbc->pages_skipped--;
             /* Dropped again by zfs_putpage_async_commit_cb() or on tx failure. */
3773         if (!for_sync)
3774                 atomic_inc_32(&zp->z_async_writes_cnt);
3775         set_page_writeback(pp);
3776         unlock_page(pp);
3777
3778         tx = dmu_tx_create(zfsvfs->z_os);
3779         dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen);
3780         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3781         zfs_sa_upgrade_txholds(tx, zp);
3782
3783         err = dmu_tx_assign(tx, TXG_NOWAIT);
3784         if (err != 0) {
                     /*
                      * The write is not retried here: after an optional wait
                      * the tx is aborted, the page is marked dirty again,
                      * writeback is ended and the error is returned.
                      */
3785                 if (err == ERESTART)
3786                         dmu_tx_wait(tx);
3787
3788                 dmu_tx_abort(tx);
3789 #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
3790                 filemap_dirty_folio(page_mapping(pp), page_folio(pp));
3791 #else
3792                 __set_page_dirty_nobuffers(pp);
3793 #endif
3794                 ClearPageError(pp);
3795                 end_page_writeback(pp);
3796                 if (!for_sync)
3797                         atomic_dec_32(&zp->z_async_writes_cnt);
3798                 zfs_rangelock_exit(lr);
3799                 zfs_exit(zfsvfs, FTAG);
3800                 return (err);
3801         }
3802
             /* Copy the (possibly EOF-truncated) page contents into the DMU. */
3803         va = kmap(pp);
3804         ASSERT3U(pglen, <=, PAGE_SIZE);
3805         dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx);
3806         kunmap(pp);
3807
3808         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
3809         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
3810         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL,
3811             &zp->z_pflags, 8);
3812
3813         /* Preserve the mtime and ctime provided by the inode */
3814         ZFS_TIME_ENCODE(&ip->i_mtime, mtime);
3815         ZFS_TIME_ENCODE(&ip->i_ctime, ctime);
3816         zp->z_atime_dirty = B_FALSE;
3817         zp->z_seq++;
3818
3819         err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
3820
             /*
              * Log the write; the callback chosen here ends writeback on
              * the page once it runs.
              */
3821         zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0,
3822             for_sync ? zfs_putpage_sync_commit_cb :
3823             zfs_putpage_async_commit_cb, pp);
3824
3825         dmu_tx_commit(tx);
3826
3827         zfs_rangelock_exit(lr);
3828
3829         if (wbc->sync_mode != WB_SYNC_NONE) {
3830                 /*
3831                  * Note that this is rarely called under writepages(), because
3832                  * writepages() normally handles the entire commit for
3833                  * performance reasons.
3834                  */
3835                 zil_commit(zfsvfs->z_log, zp->z_id);
3836         } else if (!for_sync && atomic_load_32(&zp->z_sync_writes_cnt) > 0) {
3837                 /*
3838                  * If the caller does not intend to wait synchronously
3839                  * for this page writeback to complete and there are active
3840                  * synchronous calls on this file, do a commit so that
3841                  * the latter don't accidentally end up waiting for
3842                  * our writeback to complete. Refer to the comment in
3843                  * zpl_fsync() (when HAVE_FSYNC_RANGE is defined) for details.
3844                  */
3845                 zil_commit(zfsvfs->z_log, zp->z_id);
3846         }
3847
3848         dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen);
3849
3850         zfs_exit(zfsvfs, FTAG);
3851         return (err);
3852 }
3853
3854 /*
3855  * Update the system attributes when the inode has been dirtied.  For the
3856  * moment we only update the mode, atime, mtime, and ctime.
      *
      *      IN:     ip      - inode whose attributes are written back.
      *              flags   - dirty flags; when I_DIRTY_TIME is defined and
      *                        flags equals it, only z_atime_dirty is set and
      *                        no transaction is started.
      *
      *      RETURN: 0 on success, on a read-only filesystem and on a
      *              snapshot; otherwise the error from
      *              zfs_enter_verify_zp(), dmu_tx_assign() or
      *              sa_bulk_update().
3857  */
3858 int
3859 zfs_dirty_inode(struct inode *ip, int flags)
3860 {
3861         znode_t         *zp = ITOZ(ip);
3862         zfsvfs_t        *zfsvfs = ITOZSB(ip);
3863         dmu_tx_t        *tx;
3864         uint64_t        mode, atime[2], mtime[2], ctime[2];
3865         sa_bulk_attr_t  bulk[4];
3866         int             error = 0;
3867         int             cnt = 0;
3868
             /* Nothing can be written back on a read-only fs or a snapshot. */
3869         if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
3870                 return (0);
3871
3872         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
3873                 return (error);
3874
3875 #ifdef I_DIRTY_TIME
3876         /*
3877          * This is the lazytime semantic introduced in Linux 4.0
3878          * This flag will only be called from update_time when lazytime is set.
3879          * (Note, I_DIRTY_SYNC will also set if not lazytime)
3880          * Fortunately mtime and ctime are managed within ZFS itself, so we
3881          * only need to dirty atime.
3882          */
3883         if (flags == I_DIRTY_TIME) {
3884                 zp->z_atime_dirty = B_TRUE;
3885                 goto out;
3886         }
3887 #endif
3888
3889         tx = dmu_tx_create(zfsvfs->z_os);
3890
3891         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3892         zfs_sa_upgrade_txholds(tx, zp);
3893
3894         error = dmu_tx_assign(tx, TXG_WAIT);
3895         if (error) {
3896                 dmu_tx_abort(tx);
3897                 goto out;
3898         }
3899
             /* The SA update and the z_mode/z_atime_dirty changes run under z_lock. */
3900         mutex_enter(&zp->z_lock);
3901         zp->z_atime_dirty = B_FALSE;
3902
3903         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
3904         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
3905         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
3906         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
3907
3908         /* Preserve the mode, atime, mtime and ctime provided by the inode */
3909         ZFS_TIME_ENCODE(&ip->i_atime, atime);
3910         ZFS_TIME_ENCODE(&ip->i_mtime, mtime);
3911         ZFS_TIME_ENCODE(&ip->i_ctime, ctime);
3912         mode = ip->i_mode;
3913
             /* Keep the znode's cached mode in step with the inode. */
3914         zp->z_mode = mode;
3915
3916         error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
3917         mutex_exit(&zp->z_lock);
3918
3919         dmu_tx_commit(tx);
3920 out:
3921         zfs_exit(zfsvfs, FTAG);
3922         return (error);
3923 }
3924
     /*
      * Called when the inode is going inactive.  If the znode still has an
      * SA handle, a pending (dirty) atime on a znode that is not unlinked is
      * written out in its own transaction, and the znode is then handed to
      * zfs_zinactive().  A znode without an SA handle is left untouched.
      *
      * z_teardown_inactive_lock is taken as reader for the duration of the
      * call unless the current thread already holds it as writer.
      *
      *      IN:     ip      - inode being deactivated.
      */
3925 void
3926 zfs_inactive(struct inode *ip)
3927 {
3928         znode_t *zp = ITOZ(ip);
3929         zfsvfs_t *zfsvfs = ITOZSB(ip);
3930         uint64_t atime[2];
3931         int error;
3932         int need_unlock = 0;
3933
3934         /* Only read lock if we haven't already write locked, e.g. rollback */
3935         if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) {
3936                 need_unlock = 1;
3937                 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
3938         }
3939         if (zp->z_sa_hdl == NULL) {
3940                 if (need_unlock)
3941                         rw_exit(&zfsvfs->z_teardown_inactive_lock);
3942                 return;
3943         }
3944
3945         if (zp->z_atime_dirty && zp->z_unlinked == B_FALSE) {
3946                 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
3947
3948                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3949                 zfs_sa_upgrade_txholds(tx, zp);
3950                 error = dmu_tx_assign(tx, TXG_WAIT);
3951                 if (error) {
                             /* Best effort: the atime update is dropped. */
3952                         dmu_tx_abort(tx);
3953                 } else {
3954                         ZFS_TIME_ENCODE(&ip->i_atime, atime);
3955                         mutex_enter(&zp->z_lock);
3956                         (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
3957                             (void *)&atime, sizeof (atime), tx);
3958                         zp->z_atime_dirty = B_FALSE;
3959                         mutex_exit(&zp->z_lock);
3960                         dmu_tx_commit(tx);
3961                 }
3962         }
3963
3964         zfs_zinactive(zp);
3965         if (need_unlock)
3966                 rw_exit(&zfsvfs->z_teardown_inactive_lock);
3967 }
3968
3969 /*
3970  * Fill pages with data from the disk.
      *
      * Reads up to PAGE_SIZE bytes starting at the page's file offset into
      * the page with dmu_read(); the read is shortened at end of file and
      * the rest of the page is zero-filled.  The page's offset must lie
      * below the inode size (asserted).
      *
      *      IN:     ip      - inode of the file backing the page.
      *              pp      - page to fill.
      *
      *      RETURN: 0 on success, with the page marked up to date;
      *              otherwise the dmu_read() error (ECKSUM is reported as
      *              EIO), with the page error flag set and the up-to-date
      *              flag cleared.
3971  */
3972 static int
3973 zfs_fillpage(struct inode *ip, struct page *pp)
3974 {
3975         zfsvfs_t *zfsvfs = ITOZSB(ip);
3976         loff_t i_size = i_size_read(ip);
3977         u_offset_t io_off = page_offset(pp);
3978         size_t io_len = PAGE_SIZE;
3979
3980         ASSERT3U(io_off, <, i_size);
3981
             /* Do not read past end of file. */
3982         if (io_off + io_len > i_size)
3983                 io_len = i_size - io_off;
3984
3985         void *va = kmap(pp);
3986         int error = dmu_read(zfsvfs->z_os, ITOZ(ip)->z_id, io_off,
3987             io_len, va, DMU_READ_PREFETCH);
             /* Zero the tail of a partial page; done whether or not the read failed. */
3988         if (io_len != PAGE_SIZE)
3989                 memset((char *)va + io_len, 0, PAGE_SIZE - io_len);
3990         kunmap(pp);
3991
3992         if (error) {
3993                 /* convert checksum errors into IO errors */
3994                 if (error == ECKSUM)
3995                         error = SET_ERROR(EIO);
3996
3997                 SetPageError(pp);
3998                 ClearPageUptodate(pp);
3999         } else {
4000                 ClearPageError(pp);
4001                 SetPageUptodate(pp);
4002         }
4003
4004         return (error);
4005 }
4006
4007 /*
4008  * Uses zfs_fillpage to read data from the file and fill the page.
4009  *
4010  *      IN:     ip       - inode of file to get data from.
4011  *              pp       - page to read
4012  *
4013  *      RETURN: 0 on success, error code on failure.
4014  *
4015  * Timestamps:
4016  *      vp - atime updated
4017  */
4018 int
4019 zfs_getpage(struct inode *ip, struct page *pp)
4020 {
4021         zfsvfs_t *zfsvfs = ITOZSB(ip);
4022         znode_t *zp = ITOZ(ip);
4023         int error;
4024
4025         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
4026                 return (error);
4027
4028         error = zfs_fillpage(ip, pp);
4029         if (error == 0)
4030                 dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, PAGE_SIZE);
4031
4032         zfs_exit(zfsvfs, FTAG);
4033
4034         return (error);
4035 }
4036
4037 /*
4038  * Check ZFS specific permissions to memory map a section of a file.
4039  *
4040  *      IN:     ip      - inode of the file to mmap
4041  *              off     - file offset
4042  *              addrp   - start address in memory region
4043  *              len     - length of memory region
4044  *              vm_flags- address flags
4045  *
4046  *      RETURN: 0 if success
4047  *              error code if failure
4048  */
4049 int
4050 zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len,
4051     unsigned long vm_flags)
4052 {
4053         (void) addrp;
4054         znode_t  *zp = ITOZ(ip);
4055         zfsvfs_t *zfsvfs = ITOZSB(ip);
4056         int error;
4057
4058         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
4059                 return (error);
4060
4061         if ((vm_flags & VM_WRITE) && (zp->z_pflags &
4062             (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
4063                 zfs_exit(zfsvfs, FTAG);
4064                 return (SET_ERROR(EPERM));
4065         }
4066
4067         if ((vm_flags & (VM_READ | VM_EXEC)) &&
4068             (zp->z_pflags & ZFS_AV_QUARANTINED)) {
4069                 zfs_exit(zfsvfs, FTAG);
4070                 return (SET_ERROR(EACCES));
4071         }
4072
4073         if (off < 0 || len > MAXOFFSET_T - off) {
4074                 zfs_exit(zfsvfs, FTAG);
4075                 return (SET_ERROR(ENXIO));
4076         }
4077
4078         zfs_exit(zfsvfs, FTAG);
4079         return (0);
4080 }
4081
4082 /*
4083  * Free or allocate space in a file.  Currently, this function only
4084  * supports the `F_FREESP' command.  However, this command is somewhat
4085  * misnamed, as its functionality includes the ability to allocate as
4086  * well as free space.
4087  *
4088  *      IN:     zp      - znode of file to free data in.
4089  *              cmd     - action to take (only F_FREESP supported).
4090  *              bfp     - section of file to free/alloc.
4091  *              flag    - current file open mode flags.
4092  *              offset  - current file offset.
4093  *              cr      - credentials of caller.
4094  *
4095  *      RETURN: 0 on success, error code on failure.
4096  *
4097  * Timestamps:
4098  *      zp - ctime|mtime updated
4099  */
4100 int
4101 zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag,
4102     offset_t offset, cred_t *cr)
4103 {
4104         (void) offset;
4105         zfsvfs_t        *zfsvfs = ZTOZSB(zp);
4106         uint64_t        off, len;
4107         int             error;
4108
4109         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
4110                 return (error);
4111
4112         if (cmd != F_FREESP) {
4113                 zfs_exit(zfsvfs, FTAG);
4114                 return (SET_ERROR(EINVAL));
4115         }
4116
4117         /*
4118          * Callers might not be able to detect properly that we are read-only,
4119          * so check it explicitly here.
4120          */
4121         if (zfs_is_readonly(zfsvfs)) {
4122                 zfs_exit(zfsvfs, FTAG);
4123                 return (SET_ERROR(EROFS));
4124         }
4125
4126         if (bfp->l_len < 0) {
4127                 zfs_exit(zfsvfs, FTAG);
4128                 return (SET_ERROR(EINVAL));
4129         }
4130
4131         /*
4132          * Permissions aren't checked on Solaris because on this OS
4133          * zfs_space() can only be called with an opened file handle.
4134          * On Linux we can get here through truncate_range() which
4135          * operates directly on inodes, so we need to check access rights.
4136          */
4137         if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr,
4138             zfs_init_idmap))) {
4139                 zfs_exit(zfsvfs, FTAG);
4140                 return (error);
4141         }
4142
4143         off = bfp->l_start;
4144         len = bfp->l_len; /* 0 means from off to end of file */
4145
4146         error = zfs_freesp(zp, off, len, flag, TRUE);
4147
4148         zfs_exit(zfsvfs, FTAG);
4149         return (error);
4150 }
4151
4152 int
4153 zfs_fid(struct inode *ip, fid_t *fidp)
4154 {
4155         znode_t         *zp = ITOZ(ip);
4156         zfsvfs_t        *zfsvfs = ITOZSB(ip);
4157         uint32_t        gen;
4158         uint64_t        gen64;
4159         uint64_t        object = zp->z_id;
4160         zfid_short_t    *zfid;
4161         int             size, i, error;
4162
4163         if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
4164                 return (error);
4165
4166         if (fidp->fid_len < SHORT_FID_LEN) {
4167                 fidp->fid_len = SHORT_FID_LEN;
4168                 zfs_exit(zfsvfs, FTAG);
4169                 return (SET_ERROR(ENOSPC));
4170         }
4171
4172         if ((error = zfs_verify_zp(zp)) != 0) {
4173                 zfs_exit(zfsvfs, FTAG);
4174                 return (error);
4175         }
4176
4177         if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
4178             &gen64, sizeof (uint64_t))) != 0) {
4179                 zfs_exit(zfsvfs, FTAG);
4180                 return (error);
4181         }
4182
4183         gen = (uint32_t)gen64;
4184
4185         size = SHORT_FID_LEN;
4186
4187         zfid = (zfid_short_t *)fidp;
4188
4189         zfid->zf_len = size;
4190
4191         for (i = 0; i < sizeof (zfid->zf_object); i++)
4192                 zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
4193
4194         /* Must have a non-zero generation number to distinguish from .zfs */
4195         if (gen == 0)
4196                 gen = 1;
4197         for (i = 0; i < sizeof (zfid->zf_gen); i++)
4198                 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
4199
4200         zfs_exit(zfsvfs, FTAG);
4201         return (0);
4202 }
4203
#ifdef _KERNEL
/* Symbols exported from this file, listed alphabetically. */
EXPORT_SYMBOL(zfs_close);
EXPORT_SYMBOL(zfs_create);
EXPORT_SYMBOL(zfs_dirty_inode);
EXPORT_SYMBOL(zfs_fid);
EXPORT_SYMBOL(zfs_getattr_fast);
EXPORT_SYMBOL(zfs_getpage);
EXPORT_SYMBOL(zfs_inactive);
EXPORT_SYMBOL(zfs_link);
EXPORT_SYMBOL(zfs_lookup);
EXPORT_SYMBOL(zfs_map);
EXPORT_SYMBOL(zfs_mkdir);
EXPORT_SYMBOL(zfs_open);
EXPORT_SYMBOL(zfs_putpage);
EXPORT_SYMBOL(zfs_readdir);
EXPORT_SYMBOL(zfs_readlink);
EXPORT_SYMBOL(zfs_remove);
EXPORT_SYMBOL(zfs_rename);
EXPORT_SYMBOL(zfs_rmdir);
EXPORT_SYMBOL(zfs_setattr);
EXPORT_SYMBOL(zfs_space);
EXPORT_SYMBOL(zfs_symlink);
EXPORT_SYMBOL(zfs_tmpfile);

/* Module parameter: unsigned long, permissions 0644. */
/* CSTYLED */
module_param(zfs_delete_blocks, ulong, 0644);
MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async");

#endif /* _KERNEL */