module/os/linux/zfs/zfs_vnops_os.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or https://opensource.org/licenses/CDDL-1.0.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  25  * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
  26  * Copyright 2017 Nexenta Systems, Inc.
  27  */
  28
  29 /* Portions Copyright 2007 Jeremy Teo */
  30 /* Portions Copyright 2010 Robert Milkowski */
  31
  32
  33 #include <sys/types.h>
  34 #include <sys/param.h>
  35 #include <sys/time.h>
  36 #include <sys/sysmacros.h>
  37 #include <sys/vfs.h>
  38 #include <sys/file.h>
  39 #include <sys/stat.h>
  40 #include <sys/kmem.h>
  41 #include <sys/taskq.h>
  42 #include <sys/uio.h>
  43 #include <sys/vmsystm.h>
  44 #include <sys/atomic.h>
  45 #include <sys/pathname.h>
  46 #include <sys/cmn_err.h>
  47 #include <sys/errno.h>
  48 #include <sys/zfs_dir.h>
  49 #include <sys/zfs_acl.h>
  50 #include <sys/zfs_ioctl.h>
  51 #include <sys/fs/zfs.h>
  52 #include <sys/dmu.h>
  53 #include <sys/dmu_objset.h>
  54 #include <sys/spa.h>
  55 #include <sys/txg.h>
  56 #include <sys/dbuf.h>
  57 #include <sys/zap.h>
  58 #include <sys/sa.h>
  59 #include <sys/policy.h>
  60 #include <sys/sunddi.h>
  61 #include <sys/sid.h>
  62 #include <sys/zfs_ctldir.h>
  63 #include <sys/zfs_fuid.h>
  64 #include <sys/zfs_quota.h>
  65 #include <sys/zfs_sa.h>
  66 #include <sys/zfs_vnops.h>
  67 #include <sys/zfs_rlock.h>
  68 #include <sys/cred.h>
  69 #include <sys/zpl.h>
  70 #include <sys/zil.h>
  71 #include <sys/sa_impl.h>
  72
  73 /*
  74  * Programming rules.
  75  *
  76  * Each vnode op performs some logical unit of work.  To do this, the ZPL must
  77  * properly lock its in-core state, create a DMU transaction, do the work,
  78  * record this work in the intent log (ZIL), commit the DMU transaction,
  79  * and wait for the intent log to commit if it is a synchronous operation.
  80  * Moreover, the vnode ops must work in both normal and log replay context.
  81  * The ordering of events is important to avoid deadlocks and references
  82  * to freed memory.  The example below illustrates the following Big Rules:
  83  *
  84  *  (1) A check must be made in each zfs thread for a mounted file system.
  85  *      This is done avoiding races using zfs_enter(zfsvfs).
  86  *      A zfs_exit(zfsvfs) is needed before all returns.  Any znodes
  87  *      must be checked with zfs_verify_zp(zp).  Both of these macros
  88  *      can return EIO from the calling function.
  89  *
  90  *  (2) zrele() should always be the last thing except for zil_commit() (if
  91  *      necessary) and zfs_exit(). This is for 3 reasons: First, if it's the
  92  *      last reference, the vnode/znode can be freed, so the zp may point to
  93  *      freed memory.  Second, the last reference will call zfs_zinactive(),
  94  *      which may induce a lot of work -- pushing cached pages (which acquires
  95  *      range locks) and syncing out cached atime changes.  Third,
  96  *      zfs_zinactive() may require a new tx, which could deadlock the system
  97  *      if you were already holding one. This deadlock occurs because the tx
  98  *      currently being operated on prevents a txg from syncing, which
  99  *      prevents the new tx from progressing, resulting in a deadlock.  If you
 100  *      must call zrele() within a tx, use zfs_zrele_async(). Note that iput()
 101  *      is a synonym for zrele().
 102  *
 103  *  (3) All range locks must be grabbed before calling dmu_tx_assign(),
 104  *      as they can span dmu_tx_assign() calls.
 105  *
 106  *  (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
 107  *      dmu_tx_assign().  This is critical because we don't want to block
 108  *      while holding locks.
 109  *
 110  *      If no ZPL locks are held (aside from zfs_enter()), use TXG_WAIT.  This
 111  *      reduces lock contention and CPU usage when we must wait (note that if
 112  *      throughput is constrained by the storage, nearly every transaction
 113  *      must wait).
 114  *
 115  *      Note, in particular, that if a lock is sometimes acquired before
 116  *      the tx assigns, and sometimes after (e.g. z_lock), then failing
 117  *      to use a non-blocking assign can deadlock the system.  The scenario:
 118  *
 119  *      Thread A has grabbed a lock before calling dmu_tx_assign().
 120  *      Thread B is in an already-assigned tx, and blocks for this lock.
 121  *      Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
 122  *      forever, because the previous txg can't quiesce until B's tx commits.
 123  *
 124  *      If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
 125  *      then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
 126  *      calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
 127  *      to indicate that this operation has already called dmu_tx_wait().
 128  *      This will ensure that we don't retry forever, waiting a short bit
 129  *      each time.
 130  *
 131  *  (5) If the operation succeeded, generate the intent log entry for it
 132  *      before dropping locks.  This ensures that the ordering of events
 133  *      in the intent log matches the order in which they actually occurred.
 134  *      During ZIL replay the zfs_log_* functions will update the sequence
 135  *      number to indicate the zil transaction has replayed.
 136  *
 137  *  (6) At the end of each vnode op, the DMU tx must always commit,
 138  *      regardless of whether there were any errors.
 139  *
 140  *  (7) After dropping all locks, invoke zil_commit(zilog, foid)
 141  *      to ensure that synchronous semantics are provided when necessary.
 142  *
 143  * In general, this is how things should be ordered in each vnode op:
 144  *
 145  *      zfs_enter(zfsvfs);              // exit if unmounted
 146  * top:
 147  *      zfs_dirent_lock(&dl, ...)       // lock directory entry (may igrab())
 148  *      rw_enter(...);                  // grab any other locks you need
 149  *      tx = dmu_tx_create(...);        // get DMU tx
 150  *      dmu_tx_hold_*();                // hold each object you might modify
 151  *      error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 152  *      if (error) {
 153  *              rw_exit(...);           // drop locks
 154  *              zfs_dirent_unlock(dl);  // unlock directory entry
 155  *              zrele(...);             // release held znodes
 156  *              if (error == ERESTART) {
 157  *                      waited = B_TRUE;
 158  *                      dmu_tx_wait(tx);
 159  *                      dmu_tx_abort(tx);
 160  *                      goto top;
 161  *              }
 162  *              dmu_tx_abort(tx);       // abort DMU tx
 163  *              zfs_exit(zfsvfs);       // finished in zfs
 164  *              return (error);         // really out of space
 165  *      }
 166  *      error = do_real_work();         // do whatever this VOP does
 167  *      if (error == 0)
 168  *              zfs_log_*(...);         // on success, make ZIL entry
 169  *      dmu_tx_commit(tx);              // commit DMU tx -- error or not
 170  *      rw_exit(...);                   // drop locks
 171  *      zfs_dirent_unlock(dl);          // unlock directory entry
 172  *      zrele(...);                     // release held znodes
 173  *      zil_commit(zilog, foid);        // synchronous when necessary
 174  *      zfs_exit(zfsvfs);               // finished in zfs
 175  *      return (error);                 // done, report error
 176  */
 177 int
 178 zfs_open(struct inode *ip, int mode, int flag, cred_t *cr)
 179 {
 180         (void) cr;
 181         znode_t *zp = ITOZ(ip);
 182         zfsvfs_t *zfsvfs = ITOZSB(ip);
 183         int error;
 184
 185         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 186                 return (error);
 187
 188         /* Honor ZFS_APPENDONLY file attribute */
 189         if (blk_mode_is_open_write(mode) && (zp->z_pflags & ZFS_APPENDONLY) &&
 190             ((flag & O_APPEND) == 0)) {
 191                 zfs_exit(zfsvfs, FTAG);
 192                 return (SET_ERROR(EPERM));
 193         }
 194
 195         /*
 196          * Keep a count of the synchronous opens in the znode.  On first
 197          * synchronous open we must convert all previous async transactions
 198          * into sync to keep correct ordering.
 199          */
 200         if (flag & O_SYNC) {
 201                 if (atomic_inc_32_nv(&zp->z_sync_cnt) == 1)
 202                         zil_async_to_sync(zfsvfs->z_log, zp->z_id);
 203         }
 204
 205         zfs_exit(zfsvfs, FTAG);
 206         return (0);
 207 }
 208
 209 int
 210 zfs_close(struct inode *ip, int flag, cred_t *cr)
 211 {
 212         (void) cr;
 213         znode_t *zp = ITOZ(ip);
 214         zfsvfs_t *zfsvfs = ITOZSB(ip);
 215         int error;
 216
 217         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 218                 return (error);
 219
 220         /* Decrement the synchronous opens in the znode */
 221         if (flag & O_SYNC)
 222                 atomic_dec_32(&zp->z_sync_cnt);
 223
 224         zfs_exit(zfsvfs, FTAG);
 225         return (0);
 226 }
 227
 228 #if defined(_KERNEL)
 229
      /*
       * Forward declaration.  mappedread() below calls zfs_fillpage() to
       * populate a cached page that is not up to date; being static, the
       * definition lives elsewhere in this translation unit.
       */
  230 static int zfs_fillpage(struct inode *ip, struct page *pp);
 231
 232 /*
 233  * When a file is memory mapped, we must keep the IO data synchronized
 234  * between the DMU cache and the memory mapped pages.  Update all mapped
 235  * pages with the contents of the coresponding dmu buffer.
 236  */
 237 void
 238 update_pages(znode_t *zp, int64_t start, int len, objset_t *os)
 239 {
 240         struct address_space *mp = ZTOI(zp)->i_mapping;
 241         int64_t off = start & (PAGE_SIZE - 1);
 242
 243         for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
 244                 uint64_t nbytes = MIN(PAGE_SIZE - off, len);
 245
 246                 struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT);
 247                 if (pp) {
 248                         if (mapping_writably_mapped(mp))
 249                                 flush_dcache_page(pp);
 250
 251                         void *pb = kmap(pp);
 252                         int error = dmu_read(os, zp->z_id, start + off,
 253                             nbytes, pb + off, DMU_READ_PREFETCH);
 254                         kunmap(pp);
 255
 256                         if (error) {
 257                                 SetPageError(pp);
 258                                 ClearPageUptodate(pp);
 259                         } else {
 260                                 ClearPageError(pp);
 261                                 SetPageUptodate(pp);
 262
 263                                 if (mapping_writably_mapped(mp))
 264                                         flush_dcache_page(pp);
 265
 266                                 mark_page_accessed(pp);
 267                         }
 268
 269                         unlock_page(pp);
 270                         put_page(pp);
 271                 }
 272
 273                 len -= nbytes;
 274                 off = 0;
 275         }
 276 }
 277
 278 /*
 279  * When a file is memory mapped, we must keep the I/O data synchronized
 280  * between the DMU cache and the memory mapped pages.  Preferentially read
 281  * from memory mapped pages, otherwise fallback to reading through the dmu.
 282  */
 283 int
 284 mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
 285 {
 286         struct inode *ip = ZTOI(zp);
 287         struct address_space *mp = ip->i_mapping;
 288         int64_t start = uio->uio_loffset;
 289         int64_t off = start & (PAGE_SIZE - 1);
 290         int len = nbytes;
 291         int error = 0;
 292
 293         for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
 294                 uint64_t bytes = MIN(PAGE_SIZE - off, len);
 295
 296                 struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT);
 297                 if (pp) {
 298                         /*
 299                          * If filemap_fault() retries there exists a window
 300                          * where the page will be unlocked and not up to date.
 301                          * In this case we must try and fill the page.
 302                          */
 303                         if (unlikely(!PageUptodate(pp))) {
 304                                 error = zfs_fillpage(ip, pp);
 305                                 if (error) {
 306                                         unlock_page(pp);
 307                                         put_page(pp);
 308                                         return (error);
 309                                 }
 310                         }
 311
 312                         ASSERT(PageUptodate(pp) || PageDirty(pp));
 313
 314                         unlock_page(pp);
 315
 316                         void *pb = kmap(pp);
 317                         error = zfs_uiomove(pb + off, bytes, UIO_READ, uio);
 318                         kunmap(pp);
 319
 320                         if (mapping_writably_mapped(mp))
 321                                 flush_dcache_page(pp);
 322
 323                         mark_page_accessed(pp);
 324                         put_page(pp);
 325                 } else {
 326                         error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
 327                             uio, bytes);
 328                 }
 329
 330                 len -= bytes;
 331                 off = 0;
 332
 333                 if (error)
 334                         break;
 335         }
 336
 337         return (error);
 338 }
 339 #endif /* _KERNEL */
 340
      /*
       * File-scope variable initialized to DMU_MAX_DELETEBLKCNT.
       * NOTE(review): no user is visible in this part of the file;
       * presumably a block-count threshold consulted by the delete path --
       * confirm against its users.
       */
  341 static unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT;
 342
 343 /*
 344  * Write the bytes to a file.
 345  *
 346  *      IN:     zp      - znode of file to be written to
 347  *              data    - bytes to write
 348  *              len     - number of bytes to write
 349  *              pos     - offset to start writing at
 350  *
 351  *      OUT:    resid   - remaining bytes to write
 352  *
 353  *      RETURN: 0 if success
 354  *              positive error code if failure.  EIO is returned
 355  *              for a short write when residp isn't provided.
 356  *
 357  * Timestamps:
 358  *      zp - ctime|mtime updated if byte count > 0
 359  */
 360 int
 361 zfs_write_simple(znode_t *zp, const void *data, size_t len,
 362     loff_t pos, size_t *residp)
 363 {
 364         fstrans_cookie_t cookie;
 365         int error;
 366
 367         struct iovec iov;
 368         iov.iov_base = (void *)data;
 369         iov.iov_len = len;
 370
 371         zfs_uio_t uio;
 372         zfs_uio_iovec_init(&uio, &iov, 1, pos, UIO_SYSSPACE, len, 0);
 373
 374         cookie = spl_fstrans_mark();
 375         error = zfs_write(zp, &uio, 0, kcred);
 376         spl_fstrans_unmark(cookie);
 377
 378         if (error == 0) {
 379                 if (residp != NULL)
 380                         *residp = zfs_uio_resid(&uio);
 381                 else if (zfs_uio_resid(&uio) != 0)
 382                         error = SET_ERROR(EIO);
 383         }
 384
 385         return (error);
 386 }
 387
 388 static void
 389 zfs_rele_async_task(void *arg)
 390 {
 391         iput(arg);
 392 }
 393
 394 void
 395 zfs_zrele_async(znode_t *zp)
 396 {
 397         struct inode *ip = ZTOI(zp);
 398         objset_t *os = ITOZSB(ip)->z_os;
 399
 400         ASSERT(atomic_read(&ip->i_count) > 0);
 401         ASSERT(os != NULL);
 402
 403         /*
 404          * If decrementing the count would put us at 0, we can't do it inline
 405          * here, because that would be synchronous. Instead, dispatch an iput
 406          * to run later.
 407          *
 408          * For more information on the dangers of a synchronous iput, see the
 409          * header comment of this file.
 410          */
 411         if (!atomic_add_unless(&ip->i_count, -1, 1)) {
 412                 VERIFY(taskq_dispatch(dsl_pool_zrele_taskq(dmu_objset_pool(os)),
 413                     zfs_rele_async_task, ip, TQ_SLEEP) != TASKQID_INVALID);
 414         }
 415 }
 416
 417
 418 /*
 419  * Lookup an entry in a directory, or an extended attribute directory.
 420  * If it exists, return a held inode reference for it.
 421  *
 422  *      IN:     zdp     - znode of directory to search.
 423  *              nm      - name of entry to lookup.
 424  *              flags   - LOOKUP_XATTR set if looking for an attribute.
 425  *              cr      - credentials of caller.
 426  *              direntflags - directory lookup flags
 427  *              realpnp - returned pathname.
 428  *
 429  *      OUT:    zpp     - znode of located entry, NULL if not found.
 430  *
 431  *      RETURN: 0 on success, error code on failure.
 432  *
 433  * Timestamps:
 434  *      NA
 435  */
 436 int
 437 zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr,
 438     int *direntflags, pathname_t *realpnp)
 439 {
 440         zfsvfs_t *zfsvfs = ZTOZSB(zdp);
 441         int error = 0;
 442
 443         /*
 444          * Fast path lookup, however we must skip DNLC lookup
 445          * for case folding or normalizing lookups because the
 446          * DNLC code only stores the passed in name.  This means
 447          * creating 'a' and removing 'A' on a case insensitive
 448          * file system would work, but DNLC still thinks 'a'
 449          * exists and won't let you create it again on the next
 450          * pass through fast path.
 451          */
 452         if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
 453
 454                 if (!S_ISDIR(ZTOI(zdp)->i_mode)) {
 455                         return (SET_ERROR(ENOTDIR));
 456                 } else if (zdp->z_sa_hdl == NULL) {
 457                         return (SET_ERROR(EIO));
 458                 }
 459
 460                 if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
 461                         error = zfs_fastaccesschk_execute(zdp, cr);
 462                         if (!error) {
 463                                 *zpp = zdp;
 464                                 zhold(*zpp);
 465                                 return (0);
 466                         }
 467                         return (error);
 468                 }
 469         }
 470
 471         if ((error = zfs_enter_verify_zp(zfsvfs, zdp, FTAG)) != 0)
 472                 return (error);
 473
 474         *zpp = NULL;
 475
 476         if (flags & LOOKUP_XATTR) {
 477                 /*
 478                  * We don't allow recursive attributes..
 479                  * Maybe someday we will.
 480                  */
 481                 if (zdp->z_pflags & ZFS_XATTR) {
 482                         zfs_exit(zfsvfs, FTAG);
 483                         return (SET_ERROR(EINVAL));
 484                 }
 485
 486                 if ((error = zfs_get_xattrdir(zdp, zpp, cr, flags))) {
 487                         zfs_exit(zfsvfs, FTAG);
 488                         return (error);
 489                 }
 490
 491                 /*
 492                  * Do we have permission to get into attribute directory?
 493                  */
 494
 495                 if ((error = zfs_zaccess(*zpp, ACE_EXECUTE, 0,
 496                     B_TRUE, cr, zfs_init_idmap))) {
 497                         zrele(*zpp);
 498                         *zpp = NULL;
 499                 }
 500
 501                 zfs_exit(zfsvfs, FTAG);
 502                 return (error);
 503         }
 504
 505         if (!S_ISDIR(ZTOI(zdp)->i_mode)) {
 506                 zfs_exit(zfsvfs, FTAG);
 507                 return (SET_ERROR(ENOTDIR));
 508         }
 509
 510         /*
 511          * Check accessibility of directory.
 512          */
 513
 514         if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr,
 515             zfs_init_idmap))) {
 516                 zfs_exit(zfsvfs, FTAG);
 517                 return (error);
 518         }
 519
 520         if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
 521             NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 522                 zfs_exit(zfsvfs, FTAG);
 523                 return (SET_ERROR(EILSEQ));
 524         }
 525
 526         error = zfs_dirlook(zdp, nm, zpp, flags, direntflags, realpnp);
 527         if ((error == 0) && (*zpp))
 528                 zfs_znode_update_vfs(*zpp);
 529
 530         zfs_exit(zfsvfs, FTAG);
 531         return (error);
 532 }
 533
  534 /*
  535  * Attempt to create a new entry in a directory.  If the entry
  536  * already exists, truncate the file if permissible, else return
  537  * an error.  Return the znode of the created or trunc'd file.
  538  *
  539  *      IN:     dzp     - znode of directory to put new file entry in.
  540  *              name    - name of new file entry.  An empty name refers
       *                        to the directory itself.
  541  *              vap     - attributes of new file.
  542  *              excl    - flag indicating exclusive or non-exclusive mode.
       *                        Non-zero fails with EEXIST if the entry exists.
  543  *              mode    - mode to open file with.  When non-zero and the
       *                        entry exists, access is verified with
       *                        zfs_zaccess_rwx().
  544  *              cr      - credentials of caller.
  545  *              flag    - file flag.  FIGNORECASE, ATTR_NOACLCHECK and
       *                        O_APPEND are examined.
  546  *              vsecp   - ACL to be set
  547  *              mnt_ns  - user namespace of the mount
  548  *
  549  *      OUT:    zpp     - znode of created or trunc'd entry.
  550  *
  551  *      RETURN: 0 on success, error code on failure.
       *              EINVAL for a NULL name, for FUID-dependent input on a
       *              file system without z_use_fuids, or for a non-regular
       *              file in an extended attribute directory; EILSEQ if
       *              name fails UTF-8 validation; EISDIR for ".." or an
       *              existing directory; EEXIST for an exclusive create of
       *              an existing entry; EDQUOT when over quota.
  552  *
  553  * Timestamps:
  554  *      dzp - ctime|mtime updated if new entry created
  555  *       zp - ctime|mtime always, atime if new
  556  */
  557 int
  558 zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl,
  559     int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp,
  560     zidmap_t *mnt_ns)
  561 {
  562         znode_t         *zp;
  563         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
  564         zilog_t         *zilog;
  565         objset_t        *os;
  566         zfs_dirlock_t   *dl;
  567         dmu_tx_t        *tx;
  568         int             error;
  569         uid_t           uid;
  570         gid_t           gid;
  571         zfs_acl_ids_t   acl_ids;
  572         boolean_t       fuid_dirtied;
  573         boolean_t       have_acl = B_FALSE;
  574         boolean_t       waited = B_FALSE;
  575         boolean_t       skip_acl = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
  576
  577         /*
  578          * If we have an ephemeral id, ACL, or XVATTR then
  579          * make sure file system is at proper version
  580          */
  581
  582         gid = crgetgid(cr);
  583         uid = crgetuid(cr);
  584
  585         if (zfsvfs->z_use_fuids == B_FALSE &&
  586             (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
  587                 return (SET_ERROR(EINVAL));
  588
  589         if (name == NULL)
  590                 return (SET_ERROR(EINVAL));
  591
  592         if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
  593                 return (error);
  594         os = zfsvfs->z_os;
  595         zilog = zfsvfs->z_log;
  596
  597         if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
  598             NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
  599                 zfs_exit(zfsvfs, FTAG);
  600                 return (SET_ERROR(EILSEQ));
  601         }
  602
  603         if (vap->va_mask & ATTR_XVATTR) {
  604                 if ((error = secpolicy_xvattr((xvattr_t *)vap,
  605                     crgetuid(cr), cr, vap->va_mode)) != 0) {
  606                         zfs_exit(zfsvfs, FTAG);
  607                         return (error);
  608                 }
  609         }
  610
              /*
               * Retry point: re-entered after dmu_tx_assign() returns ERESTART.
               * have_acl and acl_ids survive the jump, so the ACL ids created
               * on an earlier pass are reused rather than created again, and
               * every exit taken while have_acl is set must free them.
               */
  611 top:
  612         *zpp = NULL;
  613         if (*name == '\0') {
  614                 /*
  615                  * Null component name refers to the directory itself.
  616                  */
  617                 zhold(dzp);
  618                 zp = dzp;
  619                 dl = NULL;
  620                 error = 0;
  621         } else {
  622                 /* possible igrab(zp) */
  623                 int zflg = 0;
  624
  625                 if (flag & FIGNORECASE)
  626                         zflg |= ZCILOOK;
  627
  628                 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
  629                     NULL, NULL);
  630                 if (error) {
  631                         if (have_acl)
  632                                 zfs_acl_ids_free(&acl_ids);
  633                         if (strcmp(name, "..") == 0)
  634                                 error = SET_ERROR(EISDIR);
  635                         zfs_exit(zfsvfs, FTAG);
  636                         return (error);
  637                 }
  638         }
  639
  640         if (zp == NULL) {
  641                 uint64_t txtype;
  642                 uint64_t projid = ZFS_DEFAULT_PROJID;
  643
  644                 /*
  645                  * Create a new file object and update the directory
  646                  * to reference it.
  647                  */
  648                 if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, skip_acl, cr,
  649                     mnt_ns))) {
  650                         if (have_acl)
  651                                 zfs_acl_ids_free(&acl_ids);
  652                         goto out;
  653                 }
  654
  655                 /*
  656                  * We only support the creation of regular files in
  657                  * extended attribute directories.
  658                  */
  659
  660                 if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) {
  661                         if (have_acl)
  662                                 zfs_acl_ids_free(&acl_ids);
  663                         error = SET_ERROR(EINVAL);
  664                         goto out;
  665                 }
  666
  667                 if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
  668                     cr, vsecp, &acl_ids, mnt_ns)) != 0)
  669                         goto out;
  670                 have_acl = B_TRUE;
  671
  672                 if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
  673                         projid = zfs_inherit_projid(dzp);
  674                 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
  675                         zfs_acl_ids_free(&acl_ids);
  676                         error = SET_ERROR(EDQUOT);
  677                         goto out;
  678                 }
  679
  680                 tx = dmu_tx_create(os);
  681
  682                 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
  683                     ZFS_SA_BASE_ATTR_SIZE);
  684
  685                 fuid_dirtied = zfsvfs->z_fuid_dirty;
  686                 if (fuid_dirtied)
  687                         zfs_fuid_txhold(zfsvfs, tx);
  688                 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
  689                 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
  690                 if (!zfsvfs->z_use_sa &&
  691                     acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
  692                         dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
  693                             0, acl_ids.z_aclp->z_acl_bytes);
  694                 }
  695
                      /*
                       * The directory entry lock is held here, so the assign
                       * must not block (rule 4 in the header comment of this
                       * file).  On ERESTART the lock is dropped, we wait, and
                       * the whole operation restarts from top with
                       * TXG_NOTHROTTLE set.
                       */
  696                 error = dmu_tx_assign(tx,
  697                     (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
  698                 if (error) {
  699                         zfs_dirent_unlock(dl);
  700                         if (error == ERESTART) {
  701                                 waited = B_TRUE;
  702                                 dmu_tx_wait(tx);
  703                                 dmu_tx_abort(tx);
  704                                 goto top;
  705                         }
  706                         zfs_acl_ids_free(&acl_ids);
  707                         dmu_tx_abort(tx);
  708                         zfs_exit(zfsvfs, FTAG);
  709                         return (error);
  710                 }
  711                 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
  712
  713                 error = zfs_link_create(dl, zp, tx, ZNEW);
  714                 if (error != 0) {
  715                         /*
  716                          * Since, we failed to add the directory entry for it,
  717                          * delete the newly created dnode.
  718                          */
  719                         zfs_znode_delete(zp, tx);
  720                         remove_inode_hash(ZTOI(zp));
  721                         zfs_acl_ids_free(&acl_ids);
  722                         dmu_tx_commit(tx);
  723                         goto out;
  724                 }
  725
  726                 if (fuid_dirtied)
  727                         zfs_fuid_sync(zfsvfs, tx);
  728
                      /* Log the create before the tx commits and locks drop. */
  729                 txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
  730                 if (flag & FIGNORECASE)
  731                         txtype |= TX_CI;
  732                 zfs_log_create(zilog, tx, txtype, dzp, zp, name,
  733                     vsecp, acl_ids.z_fuidp, vap);
  734                 zfs_acl_ids_free(&acl_ids);
  735                 dmu_tx_commit(tx);
  736         } else {
  737                 int aflags = (flag & O_APPEND) ? V_APPEND : 0;
  738
                      /*
                       * ACL ids may be left over from an earlier pass that
                       * found no entry and then had to restart.
                       */
  739                 if (have_acl)
  740                         zfs_acl_ids_free(&acl_ids);
  741
  742                 /*
  743                  * A directory entry already exists for this name.
  744                  */
  745                 /*
  746                  * Can't truncate an existing file if in exclusive mode.
  747                  */
  748                 if (excl) {
  749                         error = SET_ERROR(EEXIST);
  750                         goto out;
  751                 }
  752                 /*
  753                  * Can't open a directory for writing.
  754                  */
  755                 if (S_ISDIR(ZTOI(zp)->i_mode)) {
  756                         error = SET_ERROR(EISDIR);
  757                         goto out;
  758                 }
  759                 /*
  760                  * Verify requested access to file.
  761                  */
  762                 if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr,
  763                     mnt_ns))) {
  764                         goto out;
  765                 }
  766
  767                 mutex_enter(&dzp->z_lock);
  768                 dzp->z_seq++;
  769                 mutex_exit(&dzp->z_lock);
  770
  771                 /*
  772                  * Truncate regular files if requested.
  773                  */
  774                 if (S_ISREG(ZTOI(zp)->i_mode) &&
  775                     (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) {
  776                         /* we can't hold any locks when calling zfs_freesp() */
  777                         if (dl) {
  778                                 zfs_dirent_unlock(dl);
  779                                 dl = NULL;
  780                         }
  781                         error = zfs_freesp(zp, 0, 0, mode, TRUE);
  782                 }
  783         }
              /*
               * Common exit: drop the directory entry lock if it is still
               * held, then either release zp (on error) or hand it to the
               * caller through *zpp after refreshing both inodes.
               */
  784 out:
  785
  786         if (dl)
  787                 zfs_dirent_unlock(dl);
  788
  789         if (error) {
  790                 if (zp)
  791                         zrele(zp);
  792         } else {
  793                 zfs_znode_update_vfs(dzp);
  794                 zfs_znode_update_vfs(zp);
  795                 *zpp = zp;
  796         }
  797
  798         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
  799                 zil_commit(zilog, 0);
  800
  801         zfs_exit(zfsvfs, FTAG);
  802         return (error);
  803 }
 804
     /*
      * Create a new file that has no name in any directory.  The znode is
      * created with the IS_TMPFILE flag and is added to the unlinked set in
      * the same transaction; no directory entry is created here.
      *
      *      IN:     dip     - inode of the directory used for the access
      *                        check and for ACL / project ID inheritance.
      *              vap     - attributes of the new file.
      *              excl    - unused.
      *              mode    - unused.
      *              cr      - credentials of caller.
      *              flag    - unused.
      *              vsecp   - ACL to be set.
      *              mnt_ns  - handed to the access check and to ACL creation.
      *
      *      OUT:    ipp     - inode of the created file (set on success).
      *
      *      RETURN: 0 if success
      *              error code if failure (EINVAL and EDQUOT are generated
      *              here; other codes come from the helpers called below).
      */
 805 int
 806 zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl,
 807     int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp,
 808     zidmap_t *mnt_ns)
 809 {
 810         (void) excl, (void) mode, (void) flag;
 811         znode_t         *zp = NULL, *dzp = ITOZ(dip);
 812         zfsvfs_t        *zfsvfs = ITOZSB(dip);
 813         objset_t        *os;
 814         dmu_tx_t        *tx;
 815         int             error;
 816         uid_t           uid;
 817         gid_t           gid;
 818         zfs_acl_ids_t   acl_ids;
 819         uint64_t        projid = ZFS_DEFAULT_PROJID;
 820         boolean_t       fuid_dirtied;
 821         boolean_t       have_acl = B_FALSE;
 822         boolean_t       waited = B_FALSE;
 823
 824         /*
 825          * If we have an ephemeral id, ACL, or XVATTR then
 826          * make sure file system is at proper version
 827          */
 828
 829         gid = crgetgid(cr);
 830         uid = crgetuid(cr);
 831
 832         if (zfsvfs->z_use_fuids == B_FALSE &&
 833             (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
 834                 return (SET_ERROR(EINVAL));
 835
 836         if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
 837                 return (error);
 838         os = zfsvfs->z_os;
 839
 840         if (vap->va_mask & ATTR_XVATTR) {
 841                 if ((error = secpolicy_xvattr((xvattr_t *)vap,
 842                     crgetuid(cr), cr, vap->va_mode)) != 0) {
 843                         zfs_exit(zfsvfs, FTAG);
 844                         return (error);
 845                 }
 846         }
 847
             /* Retry point: re-entered after waiting on an ERESTART below. */
 848 top:
 849         *ipp = NULL;
 850
 851         /*
 852          * The caller must be allowed to add a file to dzp, even though
 853          * no directory entry is created by this function.  On a retry
              * the ACL ids built by the previous pass are still held, so
              * they have to be released before bailing out.
 854          */
 855         if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) {
 856                 if (have_acl)
 857                         zfs_acl_ids_free(&acl_ids);
 858                 goto out;
 859         }
 860
             /* Build the ACL ids only once; have_acl keeps them across retries. */
 861         if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
 862             cr, vsecp, &acl_ids, mnt_ns)) != 0)
 863                 goto out;
 864         have_acl = B_TRUE;
 865
             /* Regular files and directories take their project ID from dzp. */
 866         if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
 867                 projid = zfs_inherit_projid(dzp);
 868         if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
 869                 zfs_acl_ids_free(&acl_ids);
 870                 error = SET_ERROR(EDQUOT);
 871                 goto out;
 872         }
 873
 874         tx = dmu_tx_create(os);
 875
             /* Holds for the new object's SA and for the unlinked-set ZAP. */
 876         dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 877             ZFS_SA_BASE_ATTR_SIZE);
 878         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 879
 880         fuid_dirtied = zfsvfs->z_fuid_dirty;
 881         if (fuid_dirtied)
 882                 zfs_fuid_txhold(zfsvfs, tx);
 883         if (!zfsvfs->z_use_sa &&
 884             acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 885                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 886                     0, acl_ids.z_aclp->z_acl_bytes);
 887         }
             /*
              * The first attempt uses plain TXG_NOWAIT; once we have waited,
              * later attempts also pass TXG_NOTHROTTLE.
              */
 888         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 889         if (error) {
 890                 if (error == ERESTART) {
 891                         waited = B_TRUE;
 892                         dmu_tx_wait(tx);
 893                         dmu_tx_abort(tx);
 894                         goto top;
 895                 }
                     /* Hard failure: nothing was created, so clean up and leave. */
 896                 zfs_acl_ids_free(&acl_ids);
 897                 dmu_tx_abort(tx);
 898                 zfs_exit(zfsvfs, FTAG);
 899                 return (error);
 900         }
 901         zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids);
 902
 903         if (fuid_dirtied)
 904                 zfs_fuid_sync(zfsvfs, tx);
 905
 906         /* Add to unlinked set */
 907         zp->z_unlinked = B_TRUE;
 908         zfs_unlinked_add(zp, tx);
 909         zfs_acl_ids_free(&acl_ids);
 910         dmu_tx_commit(tx);
 911 out:
 912
             /*
              * Every "goto out" above happens before zfs_mknode(), so zp is
              * still NULL whenever error is set here.
              */
 913         if (error) {
 914                 if (zp)
 915                         zrele(zp);
 916         } else {
 917                 zfs_znode_update_vfs(dzp);
 918                 zfs_znode_update_vfs(zp);
 919                 *ipp = ZTOI(zp);
 920         }
 921
 922         zfs_exit(zfsvfs, FTAG);
 923         return (error);
 924 }
 925
 926 /*
 927  * Remove an entry from a directory.
      *
      * Directories are refused with EPERM; zfs_rmdir() handles those.
 928  *
 929  *      IN:     dzp     - znode of directory to remove entry from.
 930  *              name    - name of entry to remove.
 931  *              cr      - credentials of caller.
 932  *              flags   - case flags.
      *                        FIGNORECASE adds ZCILOOK to the lookup and
      *                        TX_CI to the log record.
 933  *
 934  *      RETURN: 0 if success
 935  *              error code if failure
      *              (EINVAL if name is NULL)
 936  *
 937  * Timestamps:
 938  *      dzp - ctime|mtime
 939  *       ip - ctime (if nlink > 0)
 940  */
 941
     /*
      * Zero object id written to SA_ZPL_XATTR by zfs_remove() when the znode
      * is not SA-based and the attribute therefore is updated, not removed.
      */
 942 static uint64_t null_xattr = 0;
 943
 944 int
 945 zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags)
 946 {
 947         znode_t         *zp;
 948         znode_t         *xzp;
 949         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
 950         zilog_t         *zilog;
 951         uint64_t        acl_obj, xattr_obj;
 952         uint64_t        xattr_obj_unlinked = 0;
 953         uint64_t        obj = 0;
 954         uint64_t        links;
 955         zfs_dirlock_t   *dl;
 956         dmu_tx_t        *tx;
 957         boolean_t       may_delete_now, delete_now = FALSE;
 958         boolean_t       unlinked, toobig = FALSE;
 959         uint64_t        txtype;
 960         pathname_t      *realnmp = NULL;
 961         pathname_t      realnm;
 962         int             error;
 963         int             zflg = ZEXISTS;
 964         boolean_t       waited = B_FALSE;
 965
 966         if (name == NULL)
 967                 return (SET_ERROR(EINVAL));
 968
 969         if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
 970                 return (error);
 971         zilog = zfsvfs->z_log;
 972
             /*
              * Case-insensitive request: realnm is handed to zfs_dirent_lock()
              * and is freed on every exit path below.  It is kept, not freed,
              * across an ERESTART retry.
              */
 973         if (flags & FIGNORECASE) {
 974                 zflg |= ZCILOOK;
 975                 pn_alloc(&realnm);
 976                 realnmp = &realnm;
 977         }
 978
             /* Retry point: re-entered after waiting on an ERESTART below. */
 979 top:
 980         xattr_obj = 0;
 981         xzp = NULL;
 982         /*
 983          * Attempt to lock directory; fail if entry doesn't exist.
 984          */
 985         if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
 986             NULL, realnmp))) {
 987                 if (realnmp)
 988                         pn_free(realnmp);
 989                 zfs_exit(zfsvfs, FTAG);
 990                 return (error);
 991         }
 992
 993         if ((error = zfs_zaccess_delete(dzp, zp, cr, zfs_init_idmap))) {
 994                 goto out;
 995         }
 996
 997         /*
 998          * Need to use rmdir for removing directories.
 999          */
1000         if (S_ISDIR(ZTOI(zp)->i_mode)) {
1001                 error = SET_ERROR(EPERM);
1002                 goto out;
1003         }
1004
             /*
              * First look, under z_lock, at whether the inode has a single
              * reference and no cached data.  The same conditions are checked
              * again after the link has been destroyed.
              */
1005         mutex_enter(&zp->z_lock);
1006         may_delete_now = atomic_read(&ZTOI(zp)->i_count) == 1 &&
1007             !zn_has_cached_data(zp, 0, LLONG_MAX);
1008         mutex_exit(&zp->z_lock);
1009
1010         /*
1011          * We may delete the znode now, or we may put it in the unlinked set;
1012          * it depends on whether we're the last link, and on whether there are
1013          * other holds on the inode.  So we dmu_tx_hold() the right things to
1014          * allow for either case.
1015          */
             /* obj is remembered for the log record written further down. */
1016         obj = zp->z_id;
1017         tx = dmu_tx_create(zfsvfs->z_os);
1018         dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1019         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1020         zfs_sa_upgrade_txholds(tx, zp);
1021         zfs_sa_upgrade_txholds(tx, dzp);
1022         if (may_delete_now) {
1023                 toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks;
1024                 /* if the file is too big, only hold_free a token amount */
1025                 dmu_tx_hold_free(tx, zp->z_id, 0,
1026                     (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
1027         }
1028
1029         /* are there any extended attributes? */
1030         error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1031             &xattr_obj, sizeof (xattr_obj));
1032         if (error == 0 && xattr_obj) {
                     /*
                      * NOTE(review): a zfs_zget() failure is caught only by
                      * ASSERT0; if that check compiles out in production
                      * builds, xzp would still be NULL when dereferenced two
                      * lines below -- confirm.
                      */
1033                 error = zfs_zget(zfsvfs, xattr_obj, &xzp);
1034                 ASSERT0(error);
1035                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1036                 dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
1037         }
1038
             /* An external ACL object only needs a free hold if we may delete. */
1039         mutex_enter(&zp->z_lock);
1040         if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
1041                 dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
1042         mutex_exit(&zp->z_lock);
1043
1044         /* charge as an update -- would be nice not to charge at all */
1045         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1046
1047         /*
1048          * Mark this transaction as typically resulting in a net free of space
1049          */
1050         dmu_tx_mark_netfree(tx);
1051
1052         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
1053         if (error) {
                     /*
                      * The dirent lock and both znode holds are dropped before
                      * retrying, so the lookup is redone from "top".
                      */
1054                 zfs_dirent_unlock(dl);
1055                 if (error == ERESTART) {
1056                         waited = B_TRUE;
1057                         dmu_tx_wait(tx);
1058                         dmu_tx_abort(tx);
1059                         zrele(zp);
1060                         if (xzp)
1061                                 zrele(xzp);
1062                         goto top;
1063                 }
1064                 if (realnmp)
1065                         pn_free(realnmp);
1066                 dmu_tx_abort(tx);
1067                 zrele(zp);
1068                 if (xzp)
1069                         zrele(xzp);
1070                 zfs_exit(zfsvfs, FTAG);
1071                 return (error);
1072         }
1073
1074         /*
1075          * Remove the directory entry.
1076          */
1077         error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
1078
1079         if (error) {
                     /* The assigned tx must still be committed before cleanup. */
1080                 dmu_tx_commit(tx);
1081                 goto out;
1082         }
1083
1084         if (unlinked) {
1085                 /*
1086                  * Hold z_lock so that we can make sure that the ACL obj
1087                  * hasn't changed.  Could have been deleted due to
1088                  * zfs_sa_upgrade().
                      *
                      * The lock taken here is released in whichever of the two
                      * branches below runs (delete_now, or the "else if").
1089                  */
1090                 mutex_enter(&zp->z_lock);
1091                 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1092                     &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
1093                 delete_now = may_delete_now && !toobig &&
1094                     atomic_read(&ZTOI(zp)->i_count) == 1 &&
1095                     !zn_has_cached_data(zp, 0, LLONG_MAX) &&
1096                     xattr_obj == xattr_obj_unlinked &&
1097                     zfs_external_acl(zp) == acl_obj;
1098                 VERIFY_IMPLY(xattr_obj_unlinked, xzp);
1099         }
1100
1101         if (delete_now) {
1102                 if (xattr_obj_unlinked) {
                             /*
                              * Unlink the xattr directory too: zero its link
                              * count, queue it on the unlinked set, and drop
                              * the reference to it from zp.
                              */
1103                         ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2);
1104                         mutex_enter(&xzp->z_lock);
1105                         xzp->z_unlinked = B_TRUE;
1106                         clear_nlink(ZTOI(xzp));
1107                         links = 0;
1108                         error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
1109                             &links, sizeof (links), tx);
1110                         ASSERT3U(error,  ==,  0);
1111                         mutex_exit(&xzp->z_lock);
1112                         zfs_unlinked_add(xzp, tx);
1113
1114                         if (zp->z_is_sa)
1115                                 error = sa_remove(zp->z_sa_hdl,
1116                                     SA_ZPL_XATTR(zfsvfs), tx);
1117                         else
1118                                 error = sa_update(zp->z_sa_hdl,
1119                                     SA_ZPL_XATTR(zfsvfs), &null_xattr,
1120                                     sizeof (uint64_t), tx);
1121                         ASSERT0(error);
1122                 }
1123                 /*
1124                  * Add to the unlinked set because a new reference could be
1125                  * taken concurrently resulting in a deferred destruction.
1126                  */
1127                 zfs_unlinked_add(zp, tx);
1128                 mutex_exit(&zp->z_lock);
1129         } else if (unlinked) {
1130                 mutex_exit(&zp->z_lock);
1131                 zfs_unlinked_add(zp, tx);
1132         }
1133
1134         txtype = TX_REMOVE;
1135         if (flags & FIGNORECASE)
1136                 txtype |= TX_CI;
1137         zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked);
1138
1139         dmu_tx_commit(tx);
             /*
              * Common cleanup.  Every "goto out" above is taken while delete_now
              * is still FALSE, so on those paths zp is released asynchronously.
              */
1140 out:
1141         if (realnmp)
1142                 pn_free(realnmp);
1143
1144         zfs_dirent_unlock(dl);
1145         zfs_znode_update_vfs(dzp);
1146         zfs_znode_update_vfs(zp);
1147
1148         if (delete_now)
1149                 zrele(zp);
1150         else
1151                 zfs_zrele_async(zp);
1152
1153         if (xzp) {
1154                 zfs_znode_update_vfs(xzp);
1155                 zfs_zrele_async(xzp);
1156         }
1157
1158         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1159                 zil_commit(zilog, 0);
1160
1161         zfs_exit(zfsvfs, FTAG);
1162         return (error);
1163 }
1164
1165 /*
1166  * Create a new directory and insert it into dzp using the name
1167  * provided.  Return a pointer to the inserted directory.
1168  *
1169  *      IN:     dzp     - znode of directory to add subdir to.
1170  *              dirname - name of new directory.
1171  *              vap     - attributes of new directory.
1172  *              cr      - credentials of caller.
1173  *              flags   - case flags.
1174  *              vsecp   - ACL to be set
1175  *              mnt_ns  - user namespace of the mount
1176  *
1177  *      OUT:    zpp     - znode of created directory.
      *                        Assigned only on success; it is reset to NULL
      *                        at the start of each create attempt.
1178  *
1179  *      RETURN: 0 if success
1180  *              error code if failure
      *              EINVAL - dirname is NULL, dzp carries ZFS_XATTR, or an
      *                       ACL / ephemeral id was supplied while
      *                       z_use_fuids is false.
      *              EILSEQ - z_utf8 is set and dirname fails u8_validate().
      *              EDQUOT - zfs_acl_ids_overquota() reported over quota.
1181  *
1182  * Timestamps:
1183  *      dzp - ctime|mtime updated
1184  *      zpp - ctime|mtime|atime updated
1185  */
1186 int
1187 zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp,
1188     cred_t *cr, int flags, vsecattr_t *vsecp, zidmap_t *mnt_ns)
1189 {
1190         znode_t         *zp;
1191         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
1192         zilog_t         *zilog;
1193         zfs_dirlock_t   *dl;
1194         uint64_t        txtype;
1195         dmu_tx_t        *tx;
1196         int             error;
1197         int             zf = ZNEW;
1198         uid_t           uid;
1199         gid_t           gid = crgetgid(cr);
1200         zfs_acl_ids_t   acl_ids;
1201         boolean_t       fuid_dirtied;
1202         boolean_t       waited = B_FALSE;
1203
1204         ASSERT(S_ISDIR(vap->va_mode));
1205
1206         /*
1207          * If we have an ephemeral id, ACL, or XVATTR then
1208          * make sure file system is at proper version
1209          */
1210
1211         uid = crgetuid(cr);
1212         if (zfsvfs->z_use_fuids == B_FALSE &&
1213             (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1214                 return (SET_ERROR(EINVAL));
1215
1216         if (dirname == NULL)
1217                 return (SET_ERROR(EINVAL));
1218
1219         if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
1220                 return (error);
1221         zilog = zfsvfs->z_log;
1222
             /* dzp flagged ZFS_XATTR: creating a subdirectory is refused. */
1223         if (dzp->z_pflags & ZFS_XATTR) {
1224                 zfs_exit(zfsvfs, FTAG);
1225                 return (SET_ERROR(EINVAL));
1226         }
1227
             /*
              * u8_validate() stores its own code through &error, but the value
              * returned to the caller is always EILSEQ.
              */
1228         if (zfsvfs->z_utf8 && u8_validate(dirname,
1229             strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1230                 zfs_exit(zfsvfs, FTAG);
1231                 return (SET_ERROR(EILSEQ));
1232         }
1233         if (flags & FIGNORECASE)
1234                 zf |= ZCILOOK;
1235
1236         if (vap->va_mask & ATTR_XVATTR) {
1237                 if ((error = secpolicy_xvattr((xvattr_t *)vap,
1238                     crgetuid(cr), cr, vap->va_mode)) != 0) {
1239                         zfs_exit(zfsvfs, FTAG);
1240                         return (error);
1241                 }
1242         }
1243
             /*
              * The ACL ids are built once, before the retry label, and are
              * reused on every ERESTART retry.  From here on each exit path
              * has to free them.
              */
1244         if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
1245             vsecp, &acl_ids, mnt_ns)) != 0) {
1246                 zfs_exit(zfsvfs, FTAG);
1247                 return (error);
1248         }
1249         /*
1250          * First make sure the new directory doesn't exist.
1251          *
1252          * Existence is checked first to make sure we don't return
1253          * EACCES instead of EEXIST which can cause some applications
1254          * to fail.
1255          */
1256 top:
1257         *zpp = NULL;
1258
1259         if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
1260             NULL, NULL))) {
1261                 zfs_acl_ids_free(&acl_ids);
1262                 zfs_exit(zfsvfs, FTAG);
1263                 return (error);
1264         }
1265
1266         if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr,
1267             mnt_ns))) {
1268                 zfs_acl_ids_free(&acl_ids);
1269                 zfs_dirent_unlock(dl);
1270                 zfs_exit(zfsvfs, FTAG);
1271                 return (error);
1272         }
1273
1274         if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) {
1275                 zfs_acl_ids_free(&acl_ids);
1276                 zfs_dirent_unlock(dl);
1277                 zfs_exit(zfsvfs, FTAG);
1278                 return (SET_ERROR(EDQUOT));
1279         }
1280
1281         /*
1282          * Add a new entry to the directory.
1283          */
1284         tx = dmu_tx_create(zfsvfs->z_os);
1285         dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
1286         dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
1287         fuid_dirtied = zfsvfs->z_fuid_dirty;
1288         if (fuid_dirtied)
1289                 zfs_fuid_txhold(zfsvfs, tx);
1290         if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1291                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
1292                     acl_ids.z_aclp->z_acl_bytes);
1293         }
1294
1295         dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1296             ZFS_SA_BASE_ATTR_SIZE);
1297
1298         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
1299         if (error) {
                     /* Drop the dirent lock before waiting; "top" retakes it. */
1300                 zfs_dirent_unlock(dl);
1301                 if (error == ERESTART) {
1302                         waited = B_TRUE;
1303                         dmu_tx_wait(tx);
1304                         dmu_tx_abort(tx);
1305                         goto top;
1306                 }
1307                 zfs_acl_ids_free(&acl_ids);
1308                 dmu_tx_abort(tx);
1309                 zfs_exit(zfsvfs, FTAG);
1310                 return (error);
1311         }
1312
1313         /*
1314          * Create new node.
1315          */
1316         zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1317
1318         /*
1319          * Now put new name in parent dir.
1320          */
1321         error = zfs_link_create(dl, zp, tx, ZNEW);
1322         if (error != 0) {
                     /*
                      * Undo the node just created.  The tx is still committed
                      * and the hold on zp is dropped at "out"; *zpp stays NULL.
                      */
1323                 zfs_znode_delete(zp, tx);
1324                 remove_inode_hash(ZTOI(zp));
1325                 goto out;
1326         }
1327
1328         if (fuid_dirtied)
1329                 zfs_fuid_sync(zfsvfs, tx);
1330
1331         *zpp = zp;
1332
1333         txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
1334         if (flags & FIGNORECASE)
1335                 txtype |= TX_CI;
1336         zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
1337             acl_ids.z_fuidp, vap);
1338
             /* Reached with an assigned tx and the dirent lock held. */
1339 out:
1340         zfs_acl_ids_free(&acl_ids);
1341
1342         dmu_tx_commit(tx);
1343
1344         zfs_dirent_unlock(dl);
1345
1346         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1347                 zil_commit(zilog, 0);
1348
1349         if (error != 0) {
1350                 zrele(zp);
1351         } else {
1352                 zfs_znode_update_vfs(dzp);
1353                 zfs_znode_update_vfs(zp);
1354         }
1355         zfs_exit(zfsvfs, FTAG);
1356         return (error);
1357 }
1358
1359 /*
1360  * Remove a directory subdir entry.  If the current working
1361  * directory is the same as the subdir to be removed, the
1362  * remove will fail.
1363  *
1364  *      IN:     dzp     - znode of directory to remove from.
1365  *              name    - name of directory to be removed.
1366  *              cwd     - znode of current working directory.
1367  *              cr      - credentials of caller.
1368  *              flags   - case flags
1369  *
1370  *      RETURN: 0 on success, error code on failure.
      *              EINVAL  - name is NULL, or the entry found is cwd.
      *              ENOTDIR - the entry found is not a directory.
1371  *
1372  * Timestamps:
1373  *      dzp - ctime|mtime updated
1374  */
1375 int
1376 zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr,
1377     int flags)
1378 {
1379         znode_t         *zp;
1380         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
1381         zilog_t         *zilog;
1382         zfs_dirlock_t   *dl;
1383         dmu_tx_t        *tx;
1384         int             error;
1385         int             zflg = ZEXISTS;
1386         boolean_t       waited = B_FALSE;
1387
1388         if (name == NULL)
1389                 return (SET_ERROR(EINVAL));
1390
1391         if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
1392                 return (error);
1393         zilog = zfsvfs->z_log;
1394
1395         if (flags & FIGNORECASE)
1396                 zflg |= ZCILOOK;
             /* Retry point: re-entered after waiting on an ERESTART below. */
1397 top:
1398         zp = NULL;
1399
1400         /*
1401          * Attempt to lock directory; fail if entry doesn't exist.
1402          */
1403         if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1404             NULL, NULL))) {
1405                 zfs_exit(zfsvfs, FTAG);
1406                 return (error);
1407         }
1408
             /*
              * The three checks below jump to "out" before z_name_lock and
              * z_parent_lock are taken, which is why "out" releases only the
              * dirent lock and the hold on zp.
              */
1409         if ((error = zfs_zaccess_delete(dzp, zp, cr, zfs_init_idmap))) {
1410                 goto out;
1411         }
1412
1413         if (!S_ISDIR(ZTOI(zp)->i_mode)) {
1414                 error = SET_ERROR(ENOTDIR);
1415                 goto out;
1416         }
1417
1418         if (zp == cwd) {
1419                 error = SET_ERROR(EINVAL);
1420                 goto out;
1421         }
1422
1423         /*
1424          * Grab a lock on the directory to make sure that no one is
1425          * trying to add (or lookup) entries while we are removing it.
1426          */
1427         rw_enter(&zp->z_name_lock, RW_WRITER);
1428
1429         /*
1430          * Grab a lock on the parent pointer to make sure we play well
1431          * with the treewalk and directory rename code.
1432          */
1433         rw_enter(&zp->z_parent_lock, RW_WRITER);
1434
1435         tx = dmu_tx_create(zfsvfs->z_os);
1436         dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1437         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1438         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1439         zfs_sa_upgrade_txholds(tx, zp);
1440         zfs_sa_upgrade_txholds(tx, dzp);
1441         dmu_tx_mark_netfree(tx);
1442         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
1443         if (error) {
                     /*
                      * Release the locks in the reverse order of acquisition.
                      * The hold on zp is dropped as well, so a retry looks the
                      * entry up again from "top".
                      */
1444                 rw_exit(&zp->z_parent_lock);
1445                 rw_exit(&zp->z_name_lock);
1446                 zfs_dirent_unlock(dl);
1447                 if (error == ERESTART) {
1448                         waited = B_TRUE;
1449                         dmu_tx_wait(tx);
1450                         dmu_tx_abort(tx);
1451                         zrele(zp);
1452                         goto top;
1453                 }
1454                 dmu_tx_abort(tx);
1455                 zrele(zp);
1456                 zfs_exit(zfsvfs, FTAG);
1457                 return (error);
1458         }
1459
1460         error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
1461
             /* Log only a successful removal; the tx is committed either way. */
1462         if (error == 0) {
1463                 uint64_t txtype = TX_RMDIR;
1464                 if (flags & FIGNORECASE)
1465                         txtype |= TX_CI;
1466                 zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT,
1467                     B_FALSE);
1468         }
1469
1470         dmu_tx_commit(tx);
1471
1472         rw_exit(&zp->z_parent_lock);
1473         rw_exit(&zp->z_name_lock);
1474 out:
1475         zfs_dirent_unlock(dl);
1476
1477         zfs_znode_update_vfs(dzp);
1478         zfs_znode_update_vfs(zp);
1479         zrele(zp);
1480
1481         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1482                 zil_commit(zilog, 0);
1483
1484         zfs_exit(zfsvfs, FTAG);
1485         return (error);
1486 }
1487
1488 /*
1489  * Read directory entries from the given directory cursor position and emit
1490  * name and position for each entry.
1491  *
1492  *      IN:     ip      - inode of directory to read.
1493  *              ctx     - directory entry context.
      *                        ctx->pos is read as the starting offset and is
      *                        advanced after each entry that was emitted.
1494  *              cr      - credentials of caller.
      *                        (unused)
1495  *
1496  *      RETURN: 0 if success
1497  *              error code if failure
      *              (ENXIO for a malformed directory entry)
1498  *
1499  * Timestamps:
1500  *      ip - atime updated
1501  *
1502  * Note that the low 4 bits of the cookie returned by zap are always zero.
1503  * This allows us to use the low range for "special" directory entries:
1504  * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
1505  * we use the offset 2 for the '.zfs' directory.
1506  */
1507 int
1508 zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr)
1509 {
1510         (void) cr;
1511         znode_t         *zp = ITOZ(ip);
1512         zfsvfs_t        *zfsvfs = ITOZSB(ip);
1513         objset_t        *os;
1514         zap_cursor_t    zc;
1515         zap_attribute_t zap;
1516         int             error;
1517         uint8_t         prefetch;
1518         uint8_t         type;
1519         int             done = 0;
1520         uint64_t        parent;
1521         uint64_t        offset; /* must be unsigned; checks for < 1 */
1522
1523         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
1524                 return (error);
1525
             /* The parent object number is what gets reported for "..". */
1526         if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
1527             &parent, sizeof (parent))) != 0)
1528                 goto out;
1529
1530         /*
1531          * Quit if directory has been removed (posix)
              *
              * error is 0 at this point, so this returns success without
              * emitting anything.
1532          */
1533         if (zp->z_unlinked)
1534                 goto out;
1535
1536         error = 0;
1537         os = zfsvfs->z_os;
1538         offset = ctx->pos;
1539         prefetch = zp->z_zn_prefetch;
1540
1541         /*
1542          * Initialize the iterator cursor.
              *
              * Offsets 0-2 are the synthetic entries and 3 is the position
              * recorded after '.zfs' was emitted (2 + 1), so none of them is
              * a serialized zap cursor.
1543          */
1544         if (offset <= 3) {
1545                 /*
1546                  * Start iteration from the beginning of the directory.
1547                  */
1548                 zap_cursor_init(&zc, os, zp->z_id);
1549         } else {
1550                 /*
1551                  * The offset is a serialized cursor.
1552                  */
1553                 zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
1554         }
1555
1556         /*
1557          * Transform to file-system independent format
1558          */
1559         while (!done) {
1560                 uint64_t objnum;
1561                 /*
1562                  * Special case `.', `..', and `.zfs'.
1563                  */
1564                 if (offset == 0) {
1565                         (void) strcpy(zap.za_name, ".");
1566                         zap.za_normalization_conflict = 0;
1567                         objnum = zp->z_id;
1568                         type = DT_DIR;
1569                 } else if (offset == 1) {
1570                         (void) strcpy(zap.za_name, "..");
1571                         zap.za_normalization_conflict = 0;
1572                         objnum = parent;
1573                         type = DT_DIR;
1574                 } else if (offset == 2 && zfs_show_ctldir(zp)) {
1575                         (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
1576                         zap.za_normalization_conflict = 0;
1577                         objnum = ZFSCTL_INO_ROOT;
1578                         type = DT_DIR;
1579                 } else {
1580                         /*
1581                          * Grab next entry.
                              *
                              * ENOENT ends the loop and is turned into
                              * success at "update"; any other error is
                              * returned as is.
1582                          */
1583                         if ((error = zap_cursor_retrieve(&zc, &zap))) {
1584                                 if (error == ENOENT)
1585                                         break;
1586                                 else
1587                                         goto update;
1588                         }
1589
1590                         /*
1591                          * Allow multiple entries provided the first entry is
1592                          * the object id.  Non-zpl consumers may safely make
1593                          * use of the additional space.
1594                          *
1595                          * XXX: This should be a feature flag for compatibility
1596                          */
1597                         if (zap.za_integer_length != 8 ||
1598                             zap.za_num_integers == 0) {
1599                                 cmn_err(CE_WARN, "zap_readdir: bad directory "
1600                                     "entry, obj = %lld, offset = %lld, "
1601                                     "length = %d, num = %lld\n",
1602                                     (u_longlong_t)zp->z_id,
1603                                     (u_longlong_t)offset,
1604                                     zap.za_integer_length,
1605                                     (u_longlong_t)zap.za_num_integers);
1606                                 error = SET_ERROR(ENXIO);
1607                                 goto update;
1608                         }
1609
1610                         objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
1611                         type = ZFS_DIRENT_TYPE(zap.za_first_integer);
1612                 }
1613
                     /*
                      * A zero return from zpl_dir_emit() stops the walk.
                      * ctx->pos is not advanced in that case, so it still
                      * refers to the entry that was not emitted.
                      */
1614                 done = !zpl_dir_emit(ctx, zap.za_name, strlen(zap.za_name),
1615                     objnum, type);
1616                 if (done)
1617                         break;
1618
                     /* Prefetch each emitted entry's dnode while enabled. */
1619                 if (prefetch)
1620                         dmu_prefetch_dnode(os, objnum, ZIO_PRIORITY_SYNC_READ);
1621
1622                 /*
1623                  * Move to the next entry, fill in the previous offset.
                      *
                      * Real zap entries advance the cursor and store its
                      * serialized form; the synthetic entries just count up.
1624                  */
1625                 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
1626                         zap_cursor_advance(&zc);
1627                         offset = zap_cursor_serialize(&zc);
1628                 } else {
1629                         offset += 1;
1630                 }
1631                 ctx->pos = offset;
1632         }
1633         zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
1634
             /* The "goto update" error paths above skip the prefetch reset. */
1635 update:
1636         zap_cursor_fini(&zc);
1637         if (error == ENOENT)
1638                 error = 0;
1639 out:
1640         zfs_exit(zfsvfs, FTAG);
1641
1642         return (error);
1643 }
1644
1645 /*
1646  * Get the basic file attributes and place them in the provided kstat
1647  * structure.  The inode is assumed to be the authoritative source
1648  * for most of the attributes.  However, the znode currently has the
1649  * authoritative atime, blksize, and block count.
1650  *
1651  *      IN:     ip      - inode of file.
      *              user_ns - handed unchanged to zpl_generic_fillattr().
      *              request_mask - only present when
      *                        HAVE_GENERIC_FILLATTR_IDMAP_REQMASK is defined;
      *                        also handed to zpl_generic_fillattr().
1652  *
1653  *      OUT:    sp      - kstat values.
1654  *
1655  *      RETURN: 0 if success
      *              error code if zfs_enter_verify_zp() fails
1656  */
1657 int
1658 #ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK
1659 zfs_getattr_fast(zidmap_t *user_ns, u32 request_mask, struct inode *ip,
1660     struct kstat *sp)
1661 #else
1662 zfs_getattr_fast(zidmap_t *user_ns, struct inode *ip, struct kstat *sp)
1663 #endif
1664 {
1665         znode_t *zp = ITOZ(ip);
1666         zfsvfs_t *zfsvfs = ITOZSB(ip);
1667         uint32_t blksize;
1668         u_longlong_t nblocks;
1669         int error;
1670
             /*
              * A failure here is returned as-is; nothing in sp has been
              * written yet.  On success the matching zfs_exit() is below.
              */
1671         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
1672                 return (error);
1673
             /* z_lock is held while sp is filled in from the inode and znode. */
1674         mutex_enter(&zp->z_lock);
1675
1676 #ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK
1677         zpl_generic_fillattr(user_ns, request_mask, ip, sp);
1678 #else
1679         zpl_generic_fillattr(user_ns, ip, sp);
1680 #endif
1681         /*
1682          * +1 link count for root inode with visible '.zfs' directory.
              * The count is only bumped while it is below ZFS_LINK_MAX.
1683          */
1684         if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp))
1685                 if (sp->nlink < ZFS_LINK_MAX)
1686                         sp->nlink++;
1687
             /*
              * Block size and block count are taken from the znode's SA
              * handle and stored over the values currently in sp.
              */
1688         sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
1689         sp->blksize = blksize;
1690         sp->blocks = nblocks;
1691
1692         if (unlikely(zp->z_blksz == 0)) {
1693                 /*
1694                  * Block size hasn't been set; suggest maximal I/O transfers.
1695                  */
1696                 sp->blksize = zfsvfs->z_max_blksz;
1697         }
1698
1699         mutex_exit(&zp->z_lock);
1700
1701         /*
1702          * Required to prevent NFS client from detecting different inode
1703          * numbers of snapshot root dentry before and after snapshot mount.
              * Only the superblock's root inode of a snapshot filesystem gets
              * its reported inode number replaced; this is done after z_lock
              * has been dropped.
1704          */
1705         if (zfsvfs->z_issnap) {
1706                 if (ip->i_sb->s_root->d_inode == ip)
1707                         sp->ino = ZFSCTL_INO_SNAPDIRS -
1708                             dmu_objset_id(zfsvfs->z_os);
1709         }
1710
1711         zfs_exit(zfsvfs, FTAG);
1712
1713         return (0);
1714 }
1715
1716 /*
1717  * For the operation of changing file's user/group/project, we need to
1718  * handle not only the main object that is assigned to the file directly,
1719  * but also the ones that are used by the file via hidden xattr directory.
1720  *
1721  * Because the xattr directory may contain many EA entries, it may be
1722  * impossible to change all of them within the transaction that changes the
1723  * main object's user/group/project attributes.  So we have to change them
1724  * one by one, each in its own independent transaction.  It may not be a
1725  * good solution, but we have no better idea yet.
      *
      *      IN:     dzp     - znode whose ZAP entries are walked; every entry
      *                        whose uid, gid or project ID differs from dzp's
      *                        is updated to match.  Presumably the hidden
      *                        xattr directory described above - confirm
      *                        against the caller.
      *
      *      RETURN: 0 if success (a final ENOENT is reported as 0)
      *              error code if failure
1726  */
1727 static int
1728 zfs_setattr_dir(znode_t *dzp)
1729 {
1730         struct inode    *dxip = ZTOI(dzp);
1731         struct inode    *xip = NULL;
1732         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
1733         objset_t        *os = zfsvfs->z_os;
1734         zap_cursor_t    zc;
1735         zap_attribute_t zap;
1736         zfs_dirlock_t   *dl;
1737         znode_t         *zp = NULL;
1738         dmu_tx_t        *tx = NULL;
1739         uint64_t        uid, gid;
             /* At most four attributes are queued per entry: uid, gid, flags, projid. */
1740         sa_bulk_attr_t  bulk[4];
1741         int             count;
1742         int             err;
1743
1744         zap_cursor_init(&zc, os, dzp->z_id);
1745         while ((err = zap_cursor_retrieve(&zc, &zap)) == 0) {
                     /* The bulk array is refilled from scratch for each entry. */
1746                 count = 0;
                     /* Anything but a single 8-byte integer ends the walk with ENXIO. */
1747                 if (zap.za_integer_length != 8 || zap.za_num_integers != 1) {
1748                         err = ENXIO;
1749                         break;
1750                 }
1751
                     /*
                      * ENOENT from the lookup is not fatal: skip this entry.
                      * Any other error ends the walk.
                      */
1752                 err = zfs_dirent_lock(&dl, dzp, (char *)zap.za_name, &zp,
1753                     ZEXISTS, NULL, NULL);
1754                 if (err == ENOENT)
1755                         goto next;
1756                 if (err)
1757                         break;
1758
                     /* Nothing to do when uid, gid and projid already match dzp. */
1759                 xip = ZTOI(zp);
1760                 if (KUID_TO_SUID(xip->i_uid) == KUID_TO_SUID(dxip->i_uid) &&
1761                     KGID_TO_SGID(xip->i_gid) == KGID_TO_SGID(dxip->i_gid) &&
1762                     zp->z_projid == dzp->z_projid)
1763                         goto next;
1764
                     /*
                      * One transaction per entry.  The last argument of
                      * dmu_tx_hold_sa() is B_TRUE only when the entry does
                      * not have ZFS_PROJID set yet.
                      * NOTE(review): presumably because adding the project
                      * ID can grow the SA layout - confirm.
                      */
1765                 tx = dmu_tx_create(os);
1766                 if (!(zp->z_pflags & ZFS_PROJID))
1767                         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1768                 else
1769                         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1770
                     /*
                      * On failure tx and zp are both still set; the cleanup
                      * after the loop aborts tx and releases zp and dl.
                      */
1771                 err = dmu_tx_assign(tx, TXG_WAIT);
1772                 if (err)
1773                         break;
1774
                     /* Note: this is the z_lock of dzp, not of the entry zp. */
1775                 mutex_enter(&dzp->z_lock);
1776
1777                 if (KUID_TO_SUID(xip->i_uid) != KUID_TO_SUID(dxip->i_uid)) {
1778                         xip->i_uid = dxip->i_uid;
1779                         uid = zfs_uid_read(dxip);
1780                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
1781                             &uid, sizeof (uid));
1782                 }
1783
1784                 if (KGID_TO_SGID(xip->i_gid) != KGID_TO_SGID(dxip->i_gid)) {
1785                         xip->i_gid = dxip->i_gid;
1786                         gid = zfs_gid_read(dxip);
1787                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
1788                             &gid, sizeof (gid));
1789                 }
1790
1791                 if (zp->z_projid != dzp->z_projid) {
                             /* First project ID on this entry: set and queue the flag too. */
1792                         if (!(zp->z_pflags & ZFS_PROJID)) {
1793                                 zp->z_pflags |= ZFS_PROJID;
1794                                 SA_ADD_BULK_ATTR(bulk, count,
1795                                     SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags,
1796                                     sizeof (zp->z_pflags));
1797                         }
1798
1799                         zp->z_projid = dzp->z_projid;
1800                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs),
1801                             NULL, &zp->z_projid, sizeof (zp->z_projid));
1802                 }
1803
1804                 mutex_exit(&dzp->z_lock);
1805
                     /*
                      * Commit when something was queued, otherwise abort.
                      * Either way tx is finished, so it is cleared to keep
                      * the post-loop cleanup from aborting it again.
                      */
1806                 if (likely(count > 0)) {
1807                         err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1808                         dmu_tx_commit(tx);
1809                 } else {
1810                         dmu_tx_abort(tx);
1811                 }
1812                 tx = NULL;
                     /* ENOENT from the update is tolerated; other errors end the walk. */
1813                 if (err != 0 && err != ENOENT)
1814                         break;
1815
1816 next:
                     /*
                      * zp is NULL here only when no znode is held for this
                      * entry; otherwise drop the znode and its dirent lock
                      * before moving on.
                      */
1817                 if (zp) {
1818                         zrele(zp);
1819                         zp = NULL;
1820                         zfs_dirent_unlock(dl);
1821                 }
1822                 zap_cursor_advance(&zc);
1823         }
1824
             /* Cleanup for the paths that left the loop via break. */
1825         if (tx)
1826                 dmu_tx_abort(tx);
1827         if (zp) {
1828                 zrele(zp);
1829                 zfs_dirent_unlock(dl);
1830         }
1831         zap_cursor_fini(&zc);
1832
             /* ENOENT (e.g. the cursor running out of entries) is not an error. */
1833         return (err == ENOENT ? 0 : err);
1834 }
1835
1836 /*
1837  * Set the file attributes to the values contained in the
1838  * vattr structure.
1839  *
1840  *      IN:     zp      - znode of file to be modified.
1841  *              vap     - new attribute values.
1842  *                        If ATTR_XVATTR set, then optional attrs are being set
1843  *              flags   - ATTR_UTIME set if non-default time values provided.
1844  *                      - ATTR_NOACLCHECK (CIFS context only).
1845  *              cr      - credentials of caller.
1846  *              mnt_ns  - user namespace of the mount
1847  *
1848  *      RETURN: 0 if success
1849  *              error code if failure
1850  *
1851  * Timestamps:
1852  *      ip - ctime updated, mtime updated if size changed.
1853  */
1854 int
1855 zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns)
1856 {
1857         struct inode    *ip;
1858         zfsvfs_t        *zfsvfs = ZTOZSB(zp);
1859         objset_t        *os;
1860         zilog_t         *zilog;
1861         dmu_tx_t        *tx;
1862         vattr_t         oldva;
1863         xvattr_t        *tmpxvattr;
1864         uint_t          mask = vap->va_mask;
1865         uint_t          saved_mask = 0;
1866         int             trim_mask = 0;
1867         uint64_t        new_mode;
1868         uint64_t        new_kuid = 0, new_kgid = 0, new_uid, new_gid;
1869         uint64_t        xattr_obj;
1870         uint64_t        mtime[2], ctime[2], atime[2];
1871         uint64_t        projid = ZFS_INVALID_PROJID;
1872         znode_t         *attrzp;
1873         int             need_policy = FALSE;
1874         int             err, err2 = 0;
1875         zfs_fuid_info_t *fuidp = NULL;
1876         xvattr_t *xvap = (xvattr_t *)vap;       /* vap may be an xvattr_t * */
1877         xoptattr_t      *xoap;
1878         zfs_acl_t       *aclp;
1879         boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
1880         boolean_t       fuid_dirtied = B_FALSE;
1881         boolean_t       handle_eadir = B_FALSE;
1882         sa_bulk_attr_t  *bulk, *xattr_bulk;
1883         int             count = 0, xattr_count = 0, bulks = 8;
1884
1885         if (mask == 0)
1886                 return (0);
1887
1888         if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
1889                 return (err);
1890         ip = ZTOI(zp);
1891         os = zfsvfs->z_os;
1892
1893         /*
1894          * If this is a xvattr_t, then get a pointer to the structure of
1895          * optional attributes.  If this is NULL, then we have a vattr_t.
1896          */
1897         xoap = xva_getxoptattr(xvap);
1898         if (xoap != NULL && (mask & ATTR_XVATTR)) {
1899                 if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
1900                         if (!dmu_objset_projectquota_enabled(os) ||
1901                             (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) {
1902                                 zfs_exit(zfsvfs, FTAG);
1903                                 return (SET_ERROR(ENOTSUP));
1904                         }
1905
1906                         projid = xoap->xoa_projid;
1907                         if (unlikely(projid == ZFS_INVALID_PROJID)) {
1908                                 zfs_exit(zfsvfs, FTAG);
1909                                 return (SET_ERROR(EINVAL));
1910                         }
1911
1912                         if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID)
1913                                 projid = ZFS_INVALID_PROJID;
1914                         else
1915                                 need_policy = TRUE;
1916                 }
1917
1918                 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) &&
1919                     (xoap->xoa_projinherit !=
1920                     ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) &&
1921                     (!dmu_objset_projectquota_enabled(os) ||
1922                     (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) {
1923                         zfs_exit(zfsvfs, FTAG);
1924                         return (SET_ERROR(ENOTSUP));
1925                 }
1926         }
1927
1928         zilog = zfsvfs->z_log;
1929
1930         /*
1931          * Make sure that if we have ephemeral uid/gid or xvattr specified
1932          * that file system is at proper version level
1933          */
1934
1935         if (zfsvfs->z_use_fuids == B_FALSE &&
1936             (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) ||
1937             ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) ||
1938             (mask & ATTR_XVATTR))) {
1939                 zfs_exit(zfsvfs, FTAG);
1940                 return (SET_ERROR(EINVAL));
1941         }
1942
1943         if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) {
1944                 zfs_exit(zfsvfs, FTAG);
1945                 return (SET_ERROR(EISDIR));
1946         }
1947
1948         if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) {
1949                 zfs_exit(zfsvfs, FTAG);
1950                 return (SET_ERROR(EINVAL));
1951         }
1952
1953         tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP);
1954         xva_init(tmpxvattr);
1955
1956         bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
1957         xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
1958
1959         /*
1960          * Immutable files can only alter immutable bit and atime
1961          */
1962         if ((zp->z_pflags & ZFS_IMMUTABLE) &&
1963             ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) ||
1964             ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
1965                 err = SET_ERROR(EPERM);
1966                 goto out3;
1967         }
1968
1969         if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
1970                 err = SET_ERROR(EPERM);
1971                 goto out3;
1972         }
1973
1974         /*
1975          * Verify timestamps don't overflow 32 bits.
1976          * ZFS can handle large timestamps, but 32bit syscalls can't
1977          * handle times greater than 2039.  This check should be removed
1978          * once large timestamps are fully supported.
1979          */
1980         if (mask & (ATTR_ATIME | ATTR_MTIME)) {
1981                 if (((mask & ATTR_ATIME) &&
1982                     TIMESPEC_OVERFLOW(&vap->va_atime)) ||
1983                     ((mask & ATTR_MTIME) &&
1984                     TIMESPEC_OVERFLOW(&vap->va_mtime))) {
1985                         err = SET_ERROR(EOVERFLOW);
1986                         goto out3;
1987                 }
1988         }
1989
1990 top:
1991         attrzp = NULL;
1992         aclp = NULL;
1993
1994         /* Can this be moved to before the top label? */
1995         if (zfs_is_readonly(zfsvfs)) {
1996                 err = SET_ERROR(EROFS);
1997                 goto out3;
1998         }
1999
2000         /*
2001          * First validate permissions
2002          */
2003
2004         if (mask & ATTR_SIZE) {
2005                 err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr,
2006                     mnt_ns);
2007                 if (err)
2008                         goto out3;
2009
2010                 /*
2011                  * XXX - Note, we are not providing any open
2012                  * mode flags here (like FNDELAY), so we may
2013                  * block if there are locks present... this
2014                  * should be addressed in openat().
2015                  */
2016                 /* XXX - would it be OK to generate a log record here? */
2017                 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2018                 if (err)
2019                         goto out3;
2020         }
2021
2022         if (mask & (ATTR_ATIME|ATTR_MTIME) ||
2023             ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
2024             XVA_ISSET_REQ(xvap, XAT_READONLY) ||
2025             XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
2026             XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
2027             XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
2028             XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
2029             XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
2030                 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
2031                     skipaclchk, cr, mnt_ns);
2032         }
2033
2034         if (mask & (ATTR_UID|ATTR_GID)) {
2035                 int     idmask = (mask & (ATTR_UID|ATTR_GID));
2036                 int     take_owner;
2037                 int     take_group;
2038                 uid_t   uid;
2039                 gid_t   gid;
2040
2041                 /*
2042                  * NOTE: even if a new mode is being set,
2043                  * we may clear S_ISUID/S_ISGID bits.
2044                  */
2045
2046                 if (!(mask & ATTR_MODE))
2047                         vap->va_mode = zp->z_mode;
2048
2049                 /*
2050                  * Take ownership or chgrp to group we are a member of
2051                  */
2052
2053                 uid = zfs_uid_to_vfsuid(mnt_ns, zfs_i_user_ns(ip),
2054                     vap->va_uid);
2055                 gid = zfs_gid_to_vfsgid(mnt_ns, zfs_i_user_ns(ip),
2056                     vap->va_gid);
2057                 take_owner = (mask & ATTR_UID) && (uid == crgetuid(cr));
2058                 take_group = (mask & ATTR_GID) &&
2059                     zfs_groupmember(zfsvfs, gid, cr);
2060
2061                 /*
2062                  * If both ATTR_UID and ATTR_GID are set then take_owner and
2063                  * take_group must both be set in order to allow taking
2064                  * ownership.
2065                  *
2066                  * Otherwise, send the check through secpolicy_vnode_setattr()
2067                  *
2068                  */
2069
2070                 if (((idmask == (ATTR_UID|ATTR_GID)) &&
2071                     take_owner && take_group) ||
2072                     ((idmask == ATTR_UID) && take_owner) ||
2073                     ((idmask == ATTR_GID) && take_group)) {
2074                         if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
2075                             skipaclchk, cr, mnt_ns) == 0) {
2076                                 /*
2077                                  * Remove setuid/setgid for non-privileged users
2078                                  */
2079                                 (void) secpolicy_setid_clear(vap, cr);
2080                                 trim_mask = (mask & (ATTR_UID|ATTR_GID));
2081                         } else {
2082                                 need_policy =  TRUE;
2083                         }
2084                 } else {
2085                         need_policy =  TRUE;
2086                 }
2087         }
2088
2089         mutex_enter(&zp->z_lock);
2090         oldva.va_mode = zp->z_mode;
2091         zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
2092         if (mask & ATTR_XVATTR) {
2093                 /*
2094                  * Update xvattr mask to include only those attributes
2095                  * that are actually changing.
2096                  *
2097                  * the bits will be restored prior to actually setting
2098                  * the attributes so the caller thinks they were set.
2099                  */
2100                 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2101                         if (xoap->xoa_appendonly !=
2102                             ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
2103                                 need_policy = TRUE;
2104                         } else {
2105                                 XVA_CLR_REQ(xvap, XAT_APPENDONLY);
2106                                 XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY);
2107                         }
2108                 }
2109
2110                 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
2111                         if (xoap->xoa_projinherit !=
2112                             ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) {
2113                                 need_policy = TRUE;
2114                         } else {
2115                                 XVA_CLR_REQ(xvap, XAT_PROJINHERIT);
2116                                 XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT);
2117                         }
2118                 }
2119
2120                 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2121                         if (xoap->xoa_nounlink !=
2122                             ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
2123                                 need_policy = TRUE;
2124                         } else {
2125                                 XVA_CLR_REQ(xvap, XAT_NOUNLINK);
2126                                 XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK);
2127                         }
2128                 }
2129
2130                 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2131                         if (xoap->xoa_immutable !=
2132                             ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
2133                                 need_policy = TRUE;
2134                         } else {
2135                                 XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
2136                                 XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE);
2137                         }
2138                 }
2139
2140                 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2141                         if (xoap->xoa_nodump !=
2142                             ((zp->z_pflags & ZFS_NODUMP) != 0)) {
2143                                 need_policy = TRUE;
2144                         } else {
2145                                 XVA_CLR_REQ(xvap, XAT_NODUMP);
2146                                 XVA_SET_REQ(tmpxvattr, XAT_NODUMP);
2147                         }
2148                 }
2149
2150                 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2151                         if (xoap->xoa_av_modified !=
2152                             ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
2153                                 need_policy = TRUE;
2154                         } else {
2155                                 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
2156                                 XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED);
2157                         }
2158                 }
2159
2160                 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2161                         if ((!S_ISREG(ip->i_mode) &&
2162                             xoap->xoa_av_quarantined) ||
2163                             xoap->xoa_av_quarantined !=
2164                             ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
2165                                 need_policy = TRUE;
2166                         } else {
2167                                 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
2168                                 XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED);
2169                         }
2170                 }
2171
2172                 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2173                         mutex_exit(&zp->z_lock);
2174                         err = SET_ERROR(EPERM);
2175                         goto out3;
2176                 }
2177
2178                 if (need_policy == FALSE &&
2179                     (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
2180                     XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
2181                         need_policy = TRUE;
2182                 }
2183         }
2184
2185         mutex_exit(&zp->z_lock);
2186
2187         if (mask & ATTR_MODE) {
2188                 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr,
2189                     mnt_ns) == 0) {
2190                         err = secpolicy_setid_setsticky_clear(ip, vap,
2191                             &oldva, cr, mnt_ns, zfs_i_user_ns(ip));
2192                         if (err)
2193                                 goto out3;
2194                         trim_mask |= ATTR_MODE;
2195                 } else {
2196                         need_policy = TRUE;
2197                 }
2198         }
2199
2200         if (need_policy) {
2201                 /*
2202                  * If trim_mask is set then take ownership
2203                  * has been granted or write_acl is present and user
2204                  * has the ability to modify mode.  In that case remove
2205                  * UID|GID and or MODE from mask so that
2206                  * secpolicy_vnode_setattr() doesn't revoke it.
2207                  */
2208
2209                 if (trim_mask) {
2210                         saved_mask = vap->va_mask;
2211                         vap->va_mask &= ~trim_mask;
2212                 }
2213                 err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags,
2214                     zfs_zaccess_unix, zp);
2215                 if (err)
2216                         goto out3;
2217
2218                 if (trim_mask)
2219                         vap->va_mask |= saved_mask;
2220         }
2221
2222         /*
2223          * secpolicy_vnode_setattr, or take ownership may have
2224          * changed va_mask
2225          */
2226         mask = vap->va_mask;
2227
2228         if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) {
2229                 handle_eadir = B_TRUE;
2230                 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
2231                     &xattr_obj, sizeof (xattr_obj));
2232
2233                 if (err == 0 && xattr_obj) {
2234                         err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp);
2235                         if (err)
2236                                 goto out2;
2237                 }
2238                 if (mask & ATTR_UID) {
2239                         new_kuid = zfs_fuid_create(zfsvfs,
2240                             (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
2241                         if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) &&
2242                             zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT,
2243                             new_kuid)) {
2244                                 if (attrzp)
2245                                         zrele(attrzp);
2246                                 err = SET_ERROR(EDQUOT);
2247                                 goto out2;
2248                         }
2249                 }
2250
2251                 if (mask & ATTR_GID) {
2252                         new_kgid = zfs_fuid_create(zfsvfs,
2253                             (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp);
2254                         if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) &&
2255                             zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT,
2256                             new_kgid)) {
2257                                 if (attrzp)
2258                                         zrele(attrzp);
2259                                 err = SET_ERROR(EDQUOT);
2260                                 goto out2;
2261                         }
2262                 }
2263
2264                 if (projid != ZFS_INVALID_PROJID &&
2265                     zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) {
2266                         if (attrzp)
2267                                 zrele(attrzp);
2268                         err = EDQUOT;
2269                         goto out2;
2270                 }
2271         }
2272         tx = dmu_tx_create(os);
2273
2274         if (mask & ATTR_MODE) {
2275                 uint64_t pmode = zp->z_mode;
2276                 uint64_t acl_obj;
2277                 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
2278
2279                 if (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_RESTRICTED &&
2280                     !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
2281                         err = EPERM;
2282                         goto out;
2283                 }
2284
2285                 if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)))
2286                         goto out;
2287
2288                 mutex_enter(&zp->z_lock);
2289                 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
2290                         /*
2291                          * Are we upgrading ACL from old V0 format
2292                          * to V1 format?
2293                          */
2294                         if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
2295                             zfs_znode_acl_version(zp) ==
2296                             ZFS_ACL_VERSION_INITIAL) {
2297                                 dmu_tx_hold_free(tx, acl_obj, 0,
2298                                     DMU_OBJECT_END);
2299                                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2300                                     0, aclp->z_acl_bytes);
2301                         } else {
2302                                 dmu_tx_hold_write(tx, acl_obj, 0,
2303                                     aclp->z_acl_bytes);
2304                         }
2305                 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2306                         dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2307                             0, aclp->z_acl_bytes);
2308                 }
2309                 mutex_exit(&zp->z_lock);
2310                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
2311         } else {
2312                 if (((mask & ATTR_XVATTR) &&
2313                     XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ||
2314                     (projid != ZFS_INVALID_PROJID &&
2315                     !(zp->z_pflags & ZFS_PROJID)))
2316                         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
2317                 else
2318                         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2319         }
2320
2321         if (attrzp) {
2322                 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
2323         }
2324
2325         fuid_dirtied = zfsvfs->z_fuid_dirty;
2326         if (fuid_dirtied)
2327                 zfs_fuid_txhold(zfsvfs, tx);
2328
2329         zfs_sa_upgrade_txholds(tx, zp);
2330
2331         err = dmu_tx_assign(tx, TXG_WAIT);
2332         if (err)
2333                 goto out;
2334
2335         count = 0;
2336         /*
2337          * Set each attribute requested.
2338          * We group settings according to the locks they need to acquire.
2339          *
2340          * Note: you cannot set ctime directly, although it will be
2341          * updated as a side-effect of calling this function.
2342          */
2343
2344         if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) {
2345                 /*
2346                  * For the existed object that is upgraded from old system,
2347                  * its on-disk layout has no slot for the project ID attribute.
2348                  * But quota accounting logic needs to access related slots by
2349                  * offset directly. So we need to adjust old objects' layout
2350                  * to make the project ID to some unified and fixed offset.
2351                  */
2352                 if (attrzp)
2353                         err = sa_add_projid(attrzp->z_sa_hdl, tx, projid);
2354                 if (err == 0)
2355                         err = sa_add_projid(zp->z_sa_hdl, tx, projid);
2356
2357                 if (unlikely(err == EEXIST))
2358                         err = 0;
2359                 else if (err != 0)
2360                         goto out;
2361                 else
2362                         projid = ZFS_INVALID_PROJID;
2363         }
2364
2365         if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2366                 mutex_enter(&zp->z_acl_lock);
2367         mutex_enter(&zp->z_lock);
2368
2369         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
2370             &zp->z_pflags, sizeof (zp->z_pflags));
2371
2372         if (attrzp) {
2373                 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2374                         mutex_enter(&attrzp->z_acl_lock);
2375                 mutex_enter(&attrzp->z_lock);
2376                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2377                     SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
2378                     sizeof (attrzp->z_pflags));
2379                 if (projid != ZFS_INVALID_PROJID) {
2380                         attrzp->z_projid = projid;
2381                         SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2382                             SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid,
2383                             sizeof (attrzp->z_projid));
2384                 }
2385         }
2386
2387         if (mask & (ATTR_UID|ATTR_GID)) {
2388
2389                 if (mask & ATTR_UID) {
2390                         ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid);
2391                         new_uid = zfs_uid_read(ZTOI(zp));
2392                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
2393                             &new_uid, sizeof (new_uid));
2394                         if (attrzp) {
2395                                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2396                                     SA_ZPL_UID(zfsvfs), NULL, &new_uid,
2397                                     sizeof (new_uid));
2398                                 ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid);
2399                         }
2400                 }
2401
2402                 if (mask & ATTR_GID) {
2403                         ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid);
2404                         new_gid = zfs_gid_read(ZTOI(zp));
2405                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
2406                             NULL, &new_gid, sizeof (new_gid));
2407                         if (attrzp) {
2408                                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2409                                     SA_ZPL_GID(zfsvfs), NULL, &new_gid,
2410                                     sizeof (new_gid));
2411                                 ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid);
2412                         }
2413                 }
2414                 if (!(mask & ATTR_MODE)) {
2415                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
2416                             NULL, &new_mode, sizeof (new_mode));
2417                         new_mode = zp->z_mode;
2418                 }
2419                 err = zfs_acl_chown_setattr(zp);
2420                 ASSERT(err == 0);
2421                 if (attrzp) {
2422                         err = zfs_acl_chown_setattr(attrzp);
2423                         ASSERT(err == 0);
2424                 }
2425         }
2426
2427         if (mask & ATTR_MODE) {
2428                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
2429                     &new_mode, sizeof (new_mode));
2430                 zp->z_mode = ZTOI(zp)->i_mode = new_mode;
2431                 ASSERT3P(aclp, !=, NULL);
2432                 err = zfs_aclset_common(zp, aclp, cr, tx);
2433                 ASSERT0(err);
2434                 if (zp->z_acl_cached)
2435                         zfs_acl_free(zp->z_acl_cached);
2436                 zp->z_acl_cached = aclp;
2437                 aclp = NULL;
2438         }
2439
2440         if ((mask & ATTR_ATIME) || zp->z_atime_dirty) {
2441                 zp->z_atime_dirty = B_FALSE;
2442                 inode_timespec_t tmp_atime = zpl_inode_get_atime(ip);
2443                 ZFS_TIME_ENCODE(&tmp_atime, atime);
2444                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
2445                     &atime, sizeof (atime));
2446         }
2447
2448         if (mask & (ATTR_MTIME | ATTR_SIZE)) {
2449                 ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
2450                 zpl_inode_set_mtime_to_ts(ZTOI(zp),
2451                     zpl_inode_timestamp_truncate(vap->va_mtime, ZTOI(zp)));
2452
2453                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
2454                     mtime, sizeof (mtime));
2455         }
2456
2457         if (mask & (ATTR_CTIME | ATTR_SIZE)) {
2458                 ZFS_TIME_ENCODE(&vap->va_ctime, ctime);
2459                 zpl_inode_set_ctime_to_ts(ZTOI(zp),
2460                     zpl_inode_timestamp_truncate(vap->va_ctime, ZTOI(zp)));
2461                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
2462                     ctime, sizeof (ctime));
2463         }
2464
2465         if (projid != ZFS_INVALID_PROJID) {
2466                 zp->z_projid = projid;
2467                 SA_ADD_BULK_ATTR(bulk, count,
2468                     SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
2469                     sizeof (zp->z_projid));
2470         }
2471
2472         if (attrzp && mask) {
2473                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2474                     SA_ZPL_CTIME(zfsvfs), NULL, &ctime,
2475                     sizeof (ctime));
2476         }
2477
2478         /*
2479          * Do this after setting timestamps to prevent timestamp
2480          * update from toggling bit
2481          */
2482
2483         if (xoap && (mask & ATTR_XVATTR)) {
2484
2485                 /*
2486                  * restore trimmed off masks
2487                  * so that return masks can be set for caller.
2488                  */
2489
2490                 if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) {
2491                         XVA_SET_REQ(xvap, XAT_APPENDONLY);
2492                 }
2493                 if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) {
2494                         XVA_SET_REQ(xvap, XAT_NOUNLINK);
2495                 }
2496                 if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) {
2497                         XVA_SET_REQ(xvap, XAT_IMMUTABLE);
2498                 }
2499                 if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) {
2500                         XVA_SET_REQ(xvap, XAT_NODUMP);
2501                 }
2502                 if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) {
2503                         XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
2504                 }
2505                 if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) {
2506                         XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
2507                 }
2508                 if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) {
2509                         XVA_SET_REQ(xvap, XAT_PROJINHERIT);
2510                 }
2511
2512                 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
2513                         ASSERT(S_ISREG(ip->i_mode));
2514
2515                 zfs_xvattr_set(zp, xvap, tx);
2516         }
2517
2518         if (fuid_dirtied)
2519                 zfs_fuid_sync(zfsvfs, tx);
2520
2521         if (mask != 0)
2522                 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
2523
2524         mutex_exit(&zp->z_lock);
2525         if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2526                 mutex_exit(&zp->z_acl_lock);
2527
2528         if (attrzp) {
2529                 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2530                         mutex_exit(&attrzp->z_acl_lock);
2531                 mutex_exit(&attrzp->z_lock);
2532         }
2533 out:
2534         if (err == 0 && xattr_count > 0) {
2535                 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
2536                     xattr_count, tx);
2537                 ASSERT(err2 == 0);
2538         }
2539
2540         if (aclp)
2541                 zfs_acl_free(aclp);
2542
2543         if (fuidp) {
2544                 zfs_fuid_info_free(fuidp);
2545                 fuidp = NULL;
2546         }
2547
2548         if (err) {
2549                 dmu_tx_abort(tx);
2550                 if (attrzp)
2551                         zrele(attrzp);
2552                 if (err == ERESTART)
2553                         goto top;
2554         } else {
2555                 if (count > 0)
2556                         err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
2557                 dmu_tx_commit(tx);
2558                 if (attrzp) {
2559                         if (err2 == 0 && handle_eadir)
2560                                 err = zfs_setattr_dir(attrzp);
2561                         zrele(attrzp);
2562                 }
2563                 zfs_znode_update_vfs(zp);
2564         }
2565
2566 out2:
2567         if (os->os_sync == ZFS_SYNC_ALWAYS)
2568                 zil_commit(zilog, 0);
2569
2570 out3:
2571         kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks);
2572         kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks);
2573         kmem_free(tmpxvattr, sizeof (xvattr_t));
2574         zfs_exit(zfsvfs, FTAG);
2575         return (err);
2576 }
2577
2578 typedef struct zfs_zlock {
2579         krwlock_t       *zl_rwlock;     /* lock we acquired */
2580         znode_t         *zl_znode;      /* znode we held */
2581         struct zfs_zlock *zl_next;      /* next in list */
2582 } zfs_zlock_t;
2583
2584 /*
2585  * Drop locks and release vnodes that were held by zfs_rename_lock().
2586  */
2587 static void
2588 zfs_rename_unlock(zfs_zlock_t **zlpp)
2589 {
2590         zfs_zlock_t *zl;
2591
2592         while ((zl = *zlpp) != NULL) {
2593                 if (zl->zl_znode != NULL)
2594                         zfs_zrele_async(zl->zl_znode);
2595                 rw_exit(zl->zl_rwlock);
2596                 *zlpp = zl->zl_next;
2597                 kmem_free(zl, sizeof (*zl));
2598         }
2599 }
2600
2601 /*
2602  * Search back through the directory tree, using the ".." entries.
2603  * Lock each directory in the chain to prevent concurrent renames.
2604  * Fail any attempt to move a directory into one of its own descendants.
2605  * XXX - z_parent_lock can overlap with map or grow locks
2606  */
2607 static int
2608 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
2609 {
2610         zfs_zlock_t     *zl;
2611         znode_t         *zp = tdzp;
2612         uint64_t        rootid = ZTOZSB(zp)->z_root;
2613         uint64_t        oidp = zp->z_id;
2614         krwlock_t       *rwlp = &szp->z_parent_lock;
2615         krw_t           rw = RW_WRITER;
2616
2617         /*
2618          * First pass write-locks szp and compares to zp->z_id.
2619          * Later passes read-lock zp and compare to zp->z_parent.
2620          */
2621         do {
2622                 if (!rw_tryenter(rwlp, rw)) {
2623                         /*
2624                          * Another thread is renaming in this path.
2625                          * Note that if we are a WRITER, we don't have any
2626                          * parent_locks held yet.
2627                          */
2628                         if (rw == RW_READER && zp->z_id > szp->z_id) {
2629                                 /*
2630                                  * Drop our locks and restart
2631                                  */
2632                                 zfs_rename_unlock(&zl);
2633                                 *zlpp = NULL;
2634                                 zp = tdzp;
2635                                 oidp = zp->z_id;
2636                                 rwlp = &szp->z_parent_lock;
2637                                 rw = RW_WRITER;
2638                                 continue;
2639                         } else {
2640                                 /*
2641                                  * Wait for other thread to drop its locks
2642                                  */
2643                                 rw_enter(rwlp, rw);
2644                         }
2645                 }
2646
2647                 zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
2648                 zl->zl_rwlock = rwlp;
2649                 zl->zl_znode = NULL;
2650                 zl->zl_next = *zlpp;
2651                 *zlpp = zl;
2652
2653                 if (oidp == szp->z_id)          /* We're a descendant of szp */
2654                         return (SET_ERROR(EINVAL));
2655
2656                 if (oidp == rootid)             /* We've hit the top */
2657                         return (0);
2658
2659                 if (rw == RW_READER) {          /* i.e. not the first pass */
2660                         int error = zfs_zget(ZTOZSB(zp), oidp, &zp);
2661                         if (error)
2662                                 return (error);
2663                         zl->zl_znode = zp;
2664                 }
2665                 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)),
2666                     &oidp, sizeof (oidp));
2667                 rwlp = &zp->z_parent_lock;
2668                 rw = RW_READER;
2669
2670         } while (zp->z_id != sdzp->z_id);
2671
2672         return (0);
2673 }
2674
2675 /*
2676  * Move an entry from the provided source directory to the target
2677  * directory.  Change the entry name as indicated.
2678  *
2679  *      IN:     sdzp    - Source directory containing the "old entry".
2680  *              snm     - Old entry name.
2681  *              tdzp    - Target directory to contain the "new entry".
2682  *              tnm     - New entry name.
2683  *              cr      - credentials of caller.
2684  *              flags   - case flags
2685  *              rflags  - RENAME_* flags
2686  *              wo_vap  - attributes for RENAME_WHITEOUT (must be a char 0:0).
2687  *              mnt_ns  - user namespace of the mount
2688  *
2689  *      RETURN: 0 on success, error code on failure.
2690  *
2691  * Timestamps:
2692  *      sdzp,tdzp - ctime|mtime updated
2693  */
2694 int
2695 zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm,
2696     cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, zidmap_t *mnt_ns)
2697 {
2698         znode_t         *szp, *tzp;
2699         zfsvfs_t        *zfsvfs = ZTOZSB(sdzp);
2700         zilog_t         *zilog;
2701         zfs_dirlock_t   *sdl, *tdl;
2702         dmu_tx_t        *tx;
2703         zfs_zlock_t     *zl;
2704         int             cmp, serr, terr;
2705         int             error = 0;
2706         int             zflg = 0;
2707         boolean_t       waited = B_FALSE;
2708         /* Needed for whiteout inode creation. */
2709         boolean_t       fuid_dirtied;
2710         zfs_acl_ids_t   acl_ids;
2711         boolean_t       have_acl = B_FALSE;
2712         znode_t         *wzp = NULL;
2713
2714
2715         if (snm == NULL || tnm == NULL)
2716                 return (SET_ERROR(EINVAL));
2717
2718         if (rflags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
2719                 return (SET_ERROR(EINVAL));
2720
2721         /* Already checked by Linux VFS, but just to make sure. */
2722         if (rflags & RENAME_EXCHANGE &&
2723             (rflags & (RENAME_NOREPLACE | RENAME_WHITEOUT)))
2724                 return (SET_ERROR(EINVAL));
2725
2726         /*
2727          * Make sure we only get wo_vap iff. RENAME_WHITEOUT and that it's the
2728          * right kind of vattr_t for the whiteout file. These are set
2729          * internally by ZFS so should never be incorrect.
2730          */
2731         VERIFY_EQUIV(rflags & RENAME_WHITEOUT, wo_vap != NULL);
2732         VERIFY_IMPLY(wo_vap, wo_vap->va_mode == S_IFCHR);
2733         VERIFY_IMPLY(wo_vap, wo_vap->va_rdev == makedevice(0, 0));
2734
2735         if ((error = zfs_enter_verify_zp(zfsvfs, sdzp, FTAG)) != 0)
2736                 return (error);
2737         zilog = zfsvfs->z_log;
2738
2739         if ((error = zfs_verify_zp(tdzp)) != 0) {
2740                 zfs_exit(zfsvfs, FTAG);
2741                 return (error);
2742         }
2743
2744         /*
2745          * We check i_sb because snapshots and the ctldir must have different
2746          * super blocks.
2747          */
2748         if (ZTOI(tdzp)->i_sb != ZTOI(sdzp)->i_sb ||
2749             zfsctl_is_node(ZTOI(tdzp))) {
2750                 zfs_exit(zfsvfs, FTAG);
2751                 return (SET_ERROR(EXDEV));
2752         }
2753
2754         if (zfsvfs->z_utf8 && u8_validate(tnm,
2755             strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
2756                 zfs_exit(zfsvfs, FTAG);
2757                 return (SET_ERROR(EILSEQ));
2758         }
2759
2760         if (flags & FIGNORECASE)
2761                 zflg |= ZCILOOK;
2762
2763 top:
2764         szp = NULL;
2765         tzp = NULL;
2766         zl = NULL;
2767
2768         /*
2769          * This is to prevent the creation of links into attribute space
2770          * by renaming a linked file into/outof an attribute directory.
2771          * See the comment in zfs_link() for why this is considered bad.
2772          */
2773         if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
2774                 zfs_exit(zfsvfs, FTAG);
2775                 return (SET_ERROR(EINVAL));
2776         }
2777
2778         /*
2779          * Lock source and target directory entries.  To prevent deadlock,
2780          * a lock ordering must be defined.  We lock the directory with
2781          * the smallest object id first, or if it's a tie, the one with
2782          * the lexically first name.
2783          */
2784         if (sdzp->z_id < tdzp->z_id) {
2785                 cmp = -1;
2786         } else if (sdzp->z_id > tdzp->z_id) {
2787                 cmp = 1;
2788         } else {
2789                 /*
2790                  * First compare the two name arguments without
2791                  * considering any case folding.
2792                  */
2793                 int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
2794
2795                 cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
2796                 ASSERT(error == 0 || !zfsvfs->z_utf8);
2797                 if (cmp == 0) {
2798                         /*
2799                          * POSIX: "If the old argument and the new argument
2800                          * both refer to links to the same existing file,
2801                          * the rename() function shall return successfully
2802                          * and perform no other action."
2803                          */
2804                         zfs_exit(zfsvfs, FTAG);
2805                         return (0);
2806                 }
2807                 /*
2808                  * If the file system is case-folding, then we may
2809                  * have some more checking to do.  A case-folding file
2810                  * system is either supporting mixed case sensitivity
2811                  * access or is completely case-insensitive.  Note
2812                  * that the file system is always case preserving.
2813                  *
2814                  * In mixed sensitivity mode case sensitive behavior
2815                  * is the default.  FIGNORECASE must be used to
2816                  * explicitly request case insensitive behavior.
2817                  *
2818                  * If the source and target names provided differ only
2819                  * by case (e.g., a request to rename 'tim' to 'Tim'),
2820                  * we will treat this as a special case in the
2821                  * case-insensitive mode: as long as the source name
2822                  * is an exact match, we will allow this to proceed as
2823                  * a name-change request.
2824                  */
2825                 if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
2826                     (zfsvfs->z_case == ZFS_CASE_MIXED &&
2827                     flags & FIGNORECASE)) &&
2828                     u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
2829                     &error) == 0) {
2830                         /*
2831                          * case preserving rename request, require exact
2832                          * name matches
2833                          */
2834                         zflg |= ZCIEXACT;
2835                         zflg &= ~ZCILOOK;
2836                 }
2837         }
2838
2839         /*
2840          * If the source and destination directories are the same, we should
2841          * grab the z_name_lock of that directory only once.
2842          */
2843         if (sdzp == tdzp) {
2844                 zflg |= ZHAVELOCK;
2845                 rw_enter(&sdzp->z_name_lock, RW_READER);
2846         }
2847
2848         if (cmp < 0) {
2849                 serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
2850                     ZEXISTS | zflg, NULL, NULL);
2851                 terr = zfs_dirent_lock(&tdl,
2852                     tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
2853         } else {
2854                 terr = zfs_dirent_lock(&tdl,
2855                     tdzp, tnm, &tzp, zflg, NULL, NULL);
2856                 serr = zfs_dirent_lock(&sdl,
2857                     sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
2858                     NULL, NULL);
2859         }
2860
2861         if (serr) {
2862                 /*
2863                  * Source entry invalid or not there.
2864                  */
2865                 if (!terr) {
2866                         zfs_dirent_unlock(tdl);
2867                         if (tzp)
2868                                 zrele(tzp);
2869                 }
2870
2871                 if (sdzp == tdzp)
2872                         rw_exit(&sdzp->z_name_lock);
2873
2874                 if (strcmp(snm, "..") == 0)
2875                         serr = EINVAL;
2876                 zfs_exit(zfsvfs, FTAG);
2877                 return (serr);
2878         }
2879         if (terr) {
2880                 zfs_dirent_unlock(sdl);
2881                 zrele(szp);
2882
2883                 if (sdzp == tdzp)
2884                         rw_exit(&sdzp->z_name_lock);
2885
2886                 if (strcmp(tnm, "..") == 0)
2887                         terr = EINVAL;
2888                 zfs_exit(zfsvfs, FTAG);
2889                 return (terr);
2890         }
2891
2892         /*
2893          * If we are using project inheritance, means if the directory has
2894          * ZFS_PROJINHERIT set, then its descendant directories will inherit
2895          * not only the project ID, but also the ZFS_PROJINHERIT flag. Under
2896          * such case, we only allow renames into our tree when the project
2897          * IDs are the same.
2898          */
2899         if (tdzp->z_pflags & ZFS_PROJINHERIT &&
2900             tdzp->z_projid != szp->z_projid) {
2901                 error = SET_ERROR(EXDEV);
2902                 goto out;
2903         }
2904
2905         /*
2906          * Must have write access at the source to remove the old entry
2907          * and write access at the target to create the new entry.
2908          * Note that if target and source are the same, this can be
2909          * done in a single check.
2910          */
2911         if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr, mnt_ns)))
2912                 goto out;
2913
2914         if (S_ISDIR(ZTOI(szp)->i_mode)) {
2915                 /*
2916                  * Check to make sure rename is valid.
2917                  * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
2918                  */
2919                 if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl)))
2920                         goto out;
2921         }
2922
2923         /*
2924          * Does target exist?
2925          */
2926         if (tzp) {
2927                 if (rflags & RENAME_NOREPLACE) {
2928                         error = SET_ERROR(EEXIST);
2929                         goto out;
2930                 }
2931                 /*
2932                  * Source and target must be the same type (unless exchanging).
2933                  */
2934                 if (!(rflags & RENAME_EXCHANGE)) {
2935                         boolean_t s_is_dir = S_ISDIR(ZTOI(szp)->i_mode) != 0;
2936                         boolean_t t_is_dir = S_ISDIR(ZTOI(tzp)->i_mode) != 0;
2937
2938                         if (s_is_dir != t_is_dir) {
2939                                 error = SET_ERROR(s_is_dir ? ENOTDIR : EISDIR);
2940                                 goto out;
2941                         }
2942                 }
2943                 /*
2944                  * POSIX dictates that when the source and target
2945                  * entries refer to the same file object, rename
2946                  * must do nothing and exit without error.
2947                  */
2948                 if (szp->z_id == tzp->z_id) {
2949                         error = 0;
2950                         goto out;
2951                 }
2952         } else if (rflags & RENAME_EXCHANGE) {
2953                 /* Target must exist for RENAME_EXCHANGE. */
2954                 error = SET_ERROR(ENOENT);
2955                 goto out;
2956         }
2957
2958         /* Set up inode creation for RENAME_WHITEOUT. */
2959         if (rflags & RENAME_WHITEOUT) {
2960                 /*
2961                  * Whiteout files are not regular files or directories, so to
2962                  * match zfs_create() we do not inherit the project id.
2963                  */
2964                 uint64_t wo_projid = ZFS_DEFAULT_PROJID;
2965
2966                 error = zfs_zaccess(sdzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns);
2967                 if (error)
2968                         goto out;
2969
2970                 if (!have_acl) {
2971                         error = zfs_acl_ids_create(sdzp, 0, wo_vap, cr, NULL,
2972                             &acl_ids, mnt_ns);
2973                         if (error)
2974                                 goto out;
2975                         have_acl = B_TRUE;
2976                 }
2977
2978                 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, wo_projid)) {
2979                         error = SET_ERROR(EDQUOT);
2980                         goto out;
2981                 }
2982         }
2983
2984         tx = dmu_tx_create(zfsvfs->z_os);
2985         dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
2986         dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
2987         dmu_tx_hold_zap(tx, sdzp->z_id,
2988             (rflags & RENAME_EXCHANGE) ? TRUE : FALSE, snm);
2989         dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
2990         if (sdzp != tdzp) {
2991                 dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
2992                 zfs_sa_upgrade_txholds(tx, tdzp);
2993         }
2994         if (tzp) {
2995                 dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
2996                 zfs_sa_upgrade_txholds(tx, tzp);
2997         }
2998         if (rflags & RENAME_WHITEOUT) {
2999                 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
3000                     ZFS_SA_BASE_ATTR_SIZE);
3001
3002                 dmu_tx_hold_zap(tx, sdzp->z_id, TRUE, snm);
3003                 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
3004                 if (!zfsvfs->z_use_sa &&
3005                     acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3006                         dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3007                             0, acl_ids.z_aclp->z_acl_bytes);
3008                 }
3009         }
3010         fuid_dirtied = zfsvfs->z_fuid_dirty;
3011         if (fuid_dirtied)
3012                 zfs_fuid_txhold(zfsvfs, tx);
3013         zfs_sa_upgrade_txholds(tx, szp);
3014         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3015         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
3016         if (error) {
3017                 if (zl != NULL)
3018                         zfs_rename_unlock(&zl);
3019                 zfs_dirent_unlock(sdl);
3020                 zfs_dirent_unlock(tdl);
3021
3022                 if (sdzp == tdzp)
3023                         rw_exit(&sdzp->z_name_lock);
3024
3025                 if (error == ERESTART) {
3026                         waited = B_TRUE;
3027                         dmu_tx_wait(tx);
3028                         dmu_tx_abort(tx);
3029                         zrele(szp);
3030                         if (tzp)
3031                                 zrele(tzp);
3032                         goto top;
3033                 }
3034                 dmu_tx_abort(tx);
3035                 zrele(szp);
3036                 if (tzp)
3037                         zrele(tzp);
3038                 zfs_exit(zfsvfs, FTAG);
3039                 return (error);
3040         }
3041
3042         /*
3043          * Unlink the source.
3044          */
3045         szp->z_pflags |= ZFS_AV_MODIFIED;
3046         if (tdzp->z_pflags & ZFS_PROJINHERIT)
3047                 szp->z_pflags |= ZFS_PROJINHERIT;
3048
3049         error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
3050             (void *)&szp->z_pflags, sizeof (uint64_t), tx);
3051         VERIFY0(error);
3052
3053         error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
3054         if (error)
3055                 goto commit;
3056
3057         /*
3058          * Unlink the target.
3059          */
3060         if (tzp) {
3061                 int tzflg = zflg;
3062
3063                 if (rflags & RENAME_EXCHANGE) {
3064                         /* This inode will be re-linked soon. */
3065                         tzflg |= ZRENAMING;
3066
3067                         tzp->z_pflags |= ZFS_AV_MODIFIED;
3068                         if (sdzp->z_pflags & ZFS_PROJINHERIT)
3069                                 tzp->z_pflags |= ZFS_PROJINHERIT;
3070
3071                         error = sa_update(tzp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
3072                             (void *)&tzp->z_pflags, sizeof (uint64_t), tx);
3073                         ASSERT0(error);
3074                 }
3075                 error = zfs_link_destroy(tdl, tzp, tx, tzflg, NULL);
3076                 if (error)
3077                         goto commit_link_szp;
3078         }
3079
3080         /*
3081          * Create the new target links:
3082          *   * We always link the target.
3083          *   * RENAME_EXCHANGE: Link the old target to the source.
3084          *   * RENAME_WHITEOUT: Create a whiteout inode in-place of the source.
3085          */
3086         error = zfs_link_create(tdl, szp, tx, ZRENAMING);
3087         if (error) {
3088                 /*
3089                  * If we have removed the existing target, a subsequent call to
3090                  * zfs_link_create() to add back the same entry, but with a new
3091                  * dnode (szp), should not fail.
3092                  */
3093                 ASSERT3P(tzp, ==, NULL);
3094                 goto commit_link_tzp;
3095         }
3096
3097         switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) {
3098         case RENAME_EXCHANGE:
3099                 error = zfs_link_create(sdl, tzp, tx, ZRENAMING);
3100                 /*
3101                  * The same argument as zfs_link_create() failing for
3102                  * szp applies here, since the source directory must
3103                  * have had an entry we are replacing.
3104                  */
3105                 ASSERT0(error);
3106                 if (error)
3107                         goto commit_unlink_td_szp;
3108                 break;
3109         case RENAME_WHITEOUT:
3110                 zfs_mknode(sdzp, wo_vap, tx, cr, 0, &wzp, &acl_ids);
3111                 error = zfs_link_create(sdl, wzp, tx, ZNEW);
3112                 if (error) {
3113                         zfs_znode_delete(wzp, tx);
3114                         remove_inode_hash(ZTOI(wzp));
3115                         goto commit_unlink_td_szp;
3116                 }
3117                 break;
3118         }
3119
3120         if (fuid_dirtied)
3121                 zfs_fuid_sync(zfsvfs, tx);
3122
3123         switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) {
3124         case RENAME_EXCHANGE:
3125                 zfs_log_rename_exchange(zilog, tx,
3126                     (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name,
3127                     tdzp, tdl->dl_name, szp);
3128                 break;
3129         case RENAME_WHITEOUT:
3130                 zfs_log_rename_whiteout(zilog, tx,
3131                     (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name,
3132                     tdzp, tdl->dl_name, szp, wzp);
3133                 break;
3134         default:
3135                 ASSERT0(rflags & ~RENAME_NOREPLACE);
3136                 zfs_log_rename(zilog, tx, (flags & FIGNORECASE ? TX_CI : 0),
3137                     sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp);
3138                 break;
3139         }
3140
3141 commit:
3142         dmu_tx_commit(tx);
3143 out:
3144         if (have_acl)
3145                 zfs_acl_ids_free(&acl_ids);
3146
3147         zfs_znode_update_vfs(sdzp);
3148         if (sdzp == tdzp)
3149                 rw_exit(&sdzp->z_name_lock);
3150
3151         if (sdzp != tdzp)
3152                 zfs_znode_update_vfs(tdzp);
3153
3154         zfs_znode_update_vfs(szp);
3155         zrele(szp);
3156         if (wzp) {
3157                 zfs_znode_update_vfs(wzp);
3158                 zrele(wzp);
3159         }
3160         if (tzp) {
3161                 zfs_znode_update_vfs(tzp);
3162                 zrele(tzp);
3163         }
3164
3165         if (zl != NULL)
3166                 zfs_rename_unlock(&zl);
3167
3168         zfs_dirent_unlock(sdl);
3169         zfs_dirent_unlock(tdl);
3170
3171         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3172                 zil_commit(zilog, 0);
3173
3174         zfs_exit(zfsvfs, FTAG);
3175         return (error);
3176
3177         /*
3178          * Clean-up path for broken link state.
3179          *
3180          * At this point we are in a (very) bad state, so we need to do our
3181          * best to correct the state. In particular, all of the nlinks are
3182          * wrong because we were destroying and creating links with ZRENAMING.
3183          *
3184          * In some form, all of these operations have to resolve the state:
3185          *
3186          *  * link_destroy() *must* succeed. Fortunately, this is very likely
3187          *    since we only just created it.
3188          *
3189          *  * link_create()s are allowed to fail (though they shouldn't because
3190          *    we only just unlinked them and are putting the entries back
3191          *    during clean-up). But if they fail, we can just forcefully drop
3192          *    the nlink value to (at the very least) avoid broken nlink values
3193          *    -- though in the case of non-empty directories we will have to
3194          *    panic (otherwise we'd have a leaked directory with a broken ..).
3195          */
3196 commit_unlink_td_szp:
3197         VERIFY0(zfs_link_destroy(tdl, szp, tx, ZRENAMING, NULL));
3198 commit_link_tzp:
3199         if (tzp) {
3200                 if (zfs_link_create(tdl, tzp, tx, ZRENAMING))
3201                         VERIFY0(zfs_drop_nlink(tzp, tx, NULL));
3202         }
3203 commit_link_szp:
3204         if (zfs_link_create(sdl, szp, tx, ZRENAMING))
3205                 VERIFY0(zfs_drop_nlink(szp, tx, NULL));
3206         goto commit;
3207 }
3208
3209 /*
3210  * Create a symbolic link named 'name' in directory dzp that points at 'link'.
3211  *
3212  *      IN:     dzp     - Directory to contain new symbolic link.
3213  *              name    - Name of the new directory entry in dzp.
3214  *              vap     - Attributes of new entry (mode is asserted S_ISLNK).
3215  *              link    - Target path stored as the symlink contents.
3216  *              cr      - credentials of caller.
3217  *              flags   - case flags (FIGNORECASE adds ZCILOOK and TX_CI)
3218  *              mnt_ns  - user namespace of the mount
3219  *
3220  *      OUT:    zpp     - Znode for new symbolic link; valid only on success.
3221  *
3222  *      RETURN: 0 on success, error code on failure.
      *              Generated here: EINVAL (name is NULL), EILSEQ (z_utf8 is
      *              set and name fails u8_validate()), ENAMETOOLONG (link is
      *              longer than MAXPATHLEN) and EDQUOT (ACL ids over quota).
      *              Any other error is passed through from a callee.
3223  *
3224  * Timestamps:
3225  *      dzp - ctime|mtime updated
3226  */
3227 int
3228 zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link,
3229     znode_t **zpp, cred_t *cr, int flags, zidmap_t *mnt_ns)
3230 {
3231         znode_t         *zp;
3232         zfs_dirlock_t   *dl;
3233         dmu_tx_t        *tx;
3234         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
3235         zilog_t         *zilog;
3236         uint64_t        len = strlen(link);     /* excludes the NUL */
3237         int             error;
3238         int             zflg = ZNEW;
3239         zfs_acl_ids_t   acl_ids;
3240         boolean_t       fuid_dirtied;
3241         uint64_t        txtype = TX_SYMLINK;
3242         boolean_t       waited = B_FALSE;
3243
3244         ASSERT(S_ISLNK(vap->va_mode));
3245
3246         if (name == NULL)
3247                 return (SET_ERROR(EINVAL));
3248
3249         if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
3250                 return (error);
3251         zilog = zfsvfs->z_log;
3252
3253         if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
3254             NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3255                 zfs_exit(zfsvfs, FTAG);
3256                 return (SET_ERROR(EILSEQ));
3257         }
3258         if (flags & FIGNORECASE)
3259                 zflg |= ZCILOOK;
3260
3261         if (len > MAXPATHLEN) {
3262                 zfs_exit(zfsvfs, FTAG);
3263                 return (SET_ERROR(ENAMETOOLONG));
3264         }
3265
             /*
              * acl_ids is created once, before the retry label, and is
              * released by zfs_acl_ids_free() on every exit path below.
              */
3266         if ((error = zfs_acl_ids_create(dzp, 0,
3267             vap, cr, NULL, &acl_ids, mnt_ns)) != 0) {
3268                 zfs_exit(zfsvfs, FTAG);
3269                 return (error);
3270         }
             /*
              * Retry point: when dmu_tx_assign() returns ERESTART the dirent
              * lock is dropped, the tx is waited on and aborted, and control
              * comes back here to take the lock and build a new tx.
              */
3271 top:
3272         *zpp = NULL;
3273
3274         /*
3275          * Attempt to lock directory; fail if entry already exists.
3276          */
3277         error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
3278         if (error) {
3279                 zfs_acl_ids_free(&acl_ids);
3280                 zfs_exit(zfsvfs, FTAG);
3281                 return (error);
3282         }
3283
3284         if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) {
3285                 zfs_acl_ids_free(&acl_ids);
3286                 zfs_dirent_unlock(dl);
3287                 zfs_exit(zfsvfs, FTAG);
3288                 return (error);
3289         }
3290
3291         if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) {
3292                 zfs_acl_ids_free(&acl_ids);
3293                 zfs_dirent_unlock(dl);
3294                 zfs_exit(zfsvfs, FTAG);
3295                 return (SET_ERROR(EDQUOT));
3296         }
             /*
              * Declare everything the tx may touch: the new object's data
              * (at least one byte even for an empty target), the directory
              * ZAP entry, the new object's SA, the directory's SA and, when
              * needed, an external ACL and the FUID table.
              */
3297         tx = dmu_tx_create(zfsvfs->z_os);
3298         fuid_dirtied = zfsvfs->z_fuid_dirty;
3299         dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
3300         dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
3301         dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
3302             ZFS_SA_BASE_ATTR_SIZE + len);
3303         dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
3304         if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3305                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
3306                     acl_ids.z_aclp->z_acl_bytes);
3307         }
3308         if (fuid_dirtied)
3309                 zfs_fuid_txhold(zfsvfs, tx);
             /* Never block here; once we have waited, add TXG_NOTHROTTLE. */
3310         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
3311         if (error) {
3312                 zfs_dirent_unlock(dl);
3313                 if (error == ERESTART) {
3314                         waited = B_TRUE;
3315                         dmu_tx_wait(tx);
3316                         dmu_tx_abort(tx);
3317                         goto top;
3318                 }
3319                 zfs_acl_ids_free(&acl_ids);
3320                 dmu_tx_abort(tx);
3321                 zfs_exit(zfsvfs, FTAG);
3322                 return (error);
3323         }
3324
3325         /*
3326          * Create a new object for the symlink.
3327          * For version 4 ZPL datasets the symlink will be an SA attribute.
3328          */
3329         zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
3330
3331         if (fuid_dirtied)
3332                 zfs_fuid_sync(zfsvfs, tx);
3333
             /*
              * Store the target, under z_lock, either as the SA_ZPL_SYMLINK
              * attribute or through zfs_sa_symlink().
              *
              * NOTE(review): the result assigned to 'error' by sa_update() is
              * never examined; it is overwritten by zfs_link_create() below.
              */
3334         mutex_enter(&zp->z_lock);
3335         if (zp->z_is_sa)
3336                 error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
3337                     link, len, tx);
3338         else
3339                 zfs_sa_symlink(zp, link, len, tx);
3340         mutex_exit(&zp->z_lock);
3341
             /* The file size of a symlink is the length of its target. */
3342         zp->z_size = len;
3343         (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
3344             &zp->z_size, sizeof (zp->z_size), tx);
3345         /*
3346          * Insert the new object into the directory.
3347          */
3348         error = zfs_link_create(dl, zp, tx, ZNEW);
3349         if (error != 0) {
                     /*
                      * Could not link the new object: delete it in this same
                      * tx and take its inode out of the inode hash.  The
                      * reference is dropped by the zrele() further down.
                      */
3350                 zfs_znode_delete(zp, tx);
3351                 remove_inode_hash(ZTOI(zp));
3352         } else {
3353                 if (flags & FIGNORECASE)
3354                         txtype |= TX_CI;
3355                 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
3356
3357                 zfs_znode_update_vfs(dzp);
3358                 zfs_znode_update_vfs(zp);
3359         }
3360
3361         zfs_acl_ids_free(&acl_ids);
3362
3363         dmu_tx_commit(tx);
3364
3365         zfs_dirent_unlock(dl);
3366
3367         if (error == 0) {
3368                 *zpp = zp;
3369
                     /* sync=always: push the log record out before returning */
3370                 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3371                         zil_commit(zilog, 0);
3372         } else {
3373                 zrele(zp);
3374         }
3375
3376         zfs_exit(zfsvfs, FTAG);
3377         return (error);
3378 }
3379
3380 /*
3381  * Copy the target path stored in the symbolic link ip into the
3382  * caller's uio.  The target is read while holding zp->z_lock.
3383  *
3384  *      IN:     ip      - inode of the symbolic link to read.
3385  *              uio     - structure that receives the link path.
3386  *              cr      - credentials of caller (unused here).
3387  *
3388  *      RETURN: 0 if success
3389  *              error code from zfs_enter_verify_zp() or from the lookup
3390  *
3391  * Timestamps:
3392  *      ip - atime updated (not by this function; presumably by the caller)
3393  */
3394 int
3395 zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr)
3396 {
3397         (void) cr;
3398         zfsvfs_t        *zfsvfs = ITOZSB(ip);
3399         znode_t         *zp = ITOZ(ip);
3400         int             rc = zfs_enter_verify_zp(zfsvfs, zp, FTAG);
3401
3402         if (rc != 0)
3403                 return (rc);
3404
3405         mutex_enter(&zp->z_lock);
3406         if (!zp->z_is_sa)
3407                 rc = zfs_sa_readlink(zp, uio);
3408         else
3409                 rc = sa_lookup_uio(zp->z_sa_hdl,
3410                     SA_ZPL_SYMLINK(zfsvfs), uio);
3411         mutex_exit(&zp->z_lock);
3412
3413         zfs_exit(zfsvfs, FTAG);
3414         return (rc);
3415 }
3416
3417 /*
3418  * Create a hard link: insert a new entry into directory tdzp referencing szp.
3419  *
3420  *      IN:     tdzp    - Directory to contain new entry.
3421  *              szp     - existing znode the new entry refers to.
3422  *              name    - name of new entry.
3423  *              cr      - credentials of caller.
3424  *              flags   - case flags (FIGNORECASE adds ZCILOOK and TX_CI).
3425  *
3426  *      RETURN: 0 if success
3427  *              error code if failure
      *              Generated here: EINVAL (name is NULL, or an xattr /
      *              non-xattr mismatch), EPERM (szp is a directory, lives in
      *              the shares directory, or the caller is neither the owner
      *              nor privileged), EXDEV (project id or super block
      *              mismatch, or a ctldir node) and EILSEQ (invalid UTF-8).
3428  *
3429  * Timestamps:
3430  *      tdzp - ctime|mtime updated
3431  *       szp - ctime updated
3432  */
3433 int
3434 zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr,
3435     int flags)
3436 {
3437         struct inode *sip = ZTOI(szp);
3438         znode_t         *tzp;
3439         zfsvfs_t        *zfsvfs = ZTOZSB(tdzp);
3440         zilog_t         *zilog;
3441         zfs_dirlock_t   *dl;
3442         dmu_tx_t        *tx;
3443         int             error;
3444         int             zf = ZNEW;
3445         uint64_t        parent;
3446         uid_t           owner;
3447         boolean_t       waited = B_FALSE;
3448         boolean_t       is_tmpfile = 0;
3449         uint64_t        txg;
3450 #ifdef HAVE_TMPFILE
             /* A tmpfile has no links yet but is flagged as linkable. */
3451         is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE));
3452 #endif
3453         ASSERT(S_ISDIR(ZTOI(tdzp)->i_mode));
3454
3455         if (name == NULL)
3456                 return (SET_ERROR(EINVAL));
3457
3458         if ((error = zfs_enter_verify_zp(zfsvfs, tdzp, FTAG)) != 0)
3459                 return (error);
3460         zilog = zfsvfs->z_log;
3461
3462         /*
3463          * POSIX dictates that we return EPERM here.
3464          * Better choices include ENOTSUP or EISDIR.
3465          */
3466         if (S_ISDIR(sip->i_mode)) {
3467                 zfs_exit(zfsvfs, FTAG);
3468                 return (SET_ERROR(EPERM));
3469         }
3470
             /* Only tdzp was verified on entry; verify the source as well. */
3471         if ((error = zfs_verify_zp(szp)) != 0) {
3472                 zfs_exit(zfsvfs, FTAG);
3473                 return (error);
3474         }
3475
3476         /*
3477          * With project inheritance, i.e. when the directory has
3478          * ZFS_PROJINHERIT set, its descendant directories inherit not
3479          * only the project ID but also the ZFS_PROJINHERIT flag. In that
3480          * case we only allow hard link creation in our tree when the
3481          * project IDs are the same.
3482          */
3483         if (tdzp->z_pflags & ZFS_PROJINHERIT &&
3484             tdzp->z_projid != szp->z_projid) {
3485                 zfs_exit(zfsvfs, FTAG);
3486                 return (SET_ERROR(EXDEV));
3487         }
3488
3489         /*
3490          * We check i_sb because snapshots and the ctldir must have different
3491          * super blocks.
3492          */
3493         if (sip->i_sb != ZTOI(tdzp)->i_sb || zfsctl_is_node(sip)) {
3494                 zfs_exit(zfsvfs, FTAG);
3495                 return (SET_ERROR(EXDEV));
3496         }
3497
3498         /* Prevent links to .zfs/shares files */
3499
3500         if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
3501             &parent, sizeof (uint64_t))) != 0) {
3502                 zfs_exit(zfsvfs, FTAG);
3503                 return (error);
3504         }
3505         if (parent == zfsvfs->z_shares_dir) {
3506                 zfs_exit(zfsvfs, FTAG);
3507                 return (SET_ERROR(EPERM));
3508         }
3509
3510         if (zfsvfs->z_utf8 && u8_validate(name,
3511             strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3512                 zfs_exit(zfsvfs, FTAG);
3513                 return (SET_ERROR(EILSEQ));
3514         }
3515         if (flags & FIGNORECASE)
3516                 zf |= ZCILOOK;
3517
3518         /*
3519          * We do not support links between attributes and non-attributes
3520          * because of the potential security risk of creating links
3521          * into "normal" file space in order to circumvent restrictions
3522          * imposed in attribute space.
3523          */
3524         if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) {
3525                 zfs_exit(zfsvfs, FTAG);
3526                 return (SET_ERROR(EINVAL));
3527         }
3528
             /*
              * Only the owner of the source file may link it, unless
              * secpolicy_basic_link() allows the caller to do so anyway.
              */
3529         owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid),
3530             cr, ZFS_OWNER);
3531         if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
3532                 zfs_exit(zfsvfs, FTAG);
3533                 return (SET_ERROR(EPERM));
3534         }
3535
             /*
              * This function is not handed a mount idmap, so the access
              * check is made against zfs_init_idmap.
              */
3536         if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr,
3537             zfs_init_idmap))) {
3538                 zfs_exit(zfsvfs, FTAG);
3539                 return (error);
3540         }
3541
             /* Retry point for the ERESTART case of dmu_tx_assign() below. */
3542 top:
3543         /*
3544          * Attempt to lock directory; fail if entry already exists.
3545          */
3546         error = zfs_dirent_lock(&dl, tdzp, name, &tzp, zf, NULL, NULL);
3547         if (error) {
3548                 zfs_exit(zfsvfs, FTAG);
3549                 return (error);
3550         }
3551
3552         tx = dmu_tx_create(zfsvfs->z_os);
3553         dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3554         dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name);
             /* A tmpfile must also be removed from the unlinked set. */
3555         if (is_tmpfile)
3556                 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3557
3558         zfs_sa_upgrade_txholds(tx, szp);
3559         zfs_sa_upgrade_txholds(tx, tdzp);
             /* Never block here; once we have waited, add TXG_NOTHROTTLE. */
3560         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
3561         if (error) {
3562                 zfs_dirent_unlock(dl);
3563                 if (error == ERESTART) {
3564                         waited = B_TRUE;
3565                         dmu_tx_wait(tx);
3566                         dmu_tx_abort(tx);
3567                         goto top;
3568                 }
3569                 dmu_tx_abort(tx);
3570                 zfs_exit(zfsvfs, FTAG);
3571                 return (error);
3572         }
3573         /* unmark z_unlinked so zfs_link_create will not reject */
3574         if (is_tmpfile)
3575                 szp->z_unlinked = B_FALSE;
3576         error = zfs_link_create(dl, szp, tx, 0);
3577
3578         if (error == 0) {
3579                 uint64_t txtype = TX_LINK;
3580                 /*
3581                  * tmpfile is created to be in z_unlinkedobj, so remove it.
3582                  * Also, we don't log in ZIL, because all previous file
3583                  * operations on the tmpfile are ignored by ZIL. Instead we
3584                  * always wait for txg to sync to make sure all previous
3585                  * operations are sync safe.
3586                  */
3587                 if (is_tmpfile) {
3588                         VERIFY(zap_remove_int(zfsvfs->z_os,
3589                             zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0);
3590                 } else {
3591                         if (flags & FIGNORECASE)
3592                                 txtype |= TX_CI;
3593                         zfs_log_link(zilog, tx, txtype, tdzp, szp, name);
3594                 }
3595         } else if (is_tmpfile) {
3596                 /* restore z_unlinked since linking failed */
3597                 szp->z_unlinked = B_TRUE;
3598         }
             /* Remember the txg before commit so a tmpfile link can wait on it. */
3599         txg = dmu_tx_get_txg(tx);
3600         dmu_tx_commit(tx);
3601
3602         zfs_dirent_unlock(dl);
3603
3604         if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3605                 zil_commit(zilog, 0);
3606
             /*
              * NOTE(review): this wait is not conditional on 'error', so a
              * tmpfile link that failed above still waits for the txg.
              */
3607         if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED)
3608                 txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), txg);
3609
3610         zfs_znode_update_vfs(tdzp);
3611         zfs_znode_update_vfs(szp);
3612         zfs_exit(zfsvfs, FTAG);
3613         return (error);
3614 }
3615
     /*
      * Commit callback that zfs_putpage() hands to zfs_log_write() when the
      * caller waits synchronously (for_sync).  'arg' is the page that was
      * written: clear its error flag and end writeback on it.
      */
3616 static void
3617 zfs_putpage_sync_commit_cb(void *arg)
3618 {
3619         struct page *pp = arg;
3620
3621         ClearPageError(pp);
3622         end_page_writeback(pp);
3623 }
3624
     /*
      * Commit callback that zfs_putpage() hands to zfs_log_write() when the
      * caller does not wait synchronously (!for_sync).  'arg' is the page
      * that was written.  In addition to clearing the error flag and ending
      * writeback, drop the z_async_writes_cnt reference that zfs_putpage()
      * took on the owning znode before starting writeback.
      */
3625 static void
3626 zfs_putpage_async_commit_cb(void *arg)
3627 {
3628         struct page *pp = arg;
             /* The znode is found through the page's mapping host inode. */
3629         znode_t *zp = ITOZ(pp->mapping->host);
3630
3631         ClearPageError(pp);
3632         end_page_writeback(pp);
3633         atomic_dec_32(&zp->z_async_writes_cnt);
3634 }
3635
3636 /*
3637  * Push a page out to disk.  Once the page is on stable storage the
3638  * registered commit callback will be run as notification of completion.
3639  *
3640  *      IN:     ip       - page mapped for inode.
3641  *              pp       - page to push (page is locked)
3642  *              wbc      - writeback control data
3643  *              for_sync - does the caller intend to wait synchronously for the
3644  *                         page writeback to complete?
3645  *
3646  *      RETURN: 0 if success
3647  *              error code if failure
      *
      * 0 is also returned, without writing anything, when the page lies
      * beyond end of file, is no longer dirty or mapped once the locks have
      * been retaken, or is already under writeback started by someone else.
      * The page is unlocked on every return path.
3648  *
3649  * Timestamps:
3650  *      ip - ctime|mtime updated
3651  */
3652 int
3653 zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
3654     boolean_t for_sync)
3655 {
3656         znode_t         *zp = ITOZ(ip);
3657         zfsvfs_t        *zfsvfs = ITOZSB(ip);
3658         loff_t          offset;
3659         loff_t          pgoff;
3660         unsigned int    pglen;
3661         dmu_tx_t        *tx;
3662         caddr_t         va;
3663         int             err = 0;
3664         uint64_t        mtime[2], ctime[2];
3665         inode_timespec_t tmp_ts;
3666         sa_bulk_attr_t  bulk[3];
3667         int             cnt = 0;
3668         struct address_space *mapping;
3669
3670         if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
3671                 return (err);
3672
3673         ASSERT(PageLocked(pp));
3674
3675         pgoff = page_offset(pp);        /* Page byte-offset in file */
3676         offset = i_size_read(ip);       /* File length in bytes */
3677         pglen = MIN(PAGE_SIZE,          /* Page length in bytes */
3678             P2ROUNDUP(offset, PAGE_SIZE)-pgoff);
3679
3680         /* Page is beyond end of file */
3681         if (pgoff >= offset) {
3682                 unlock_page(pp);
3683                 zfs_exit(zfsvfs, FTAG);
3684                 return (0);
3685         }
3686
3687         /* Truncate page length to end of file */
3688         if (pgoff + pglen > offset)
3689                 pglen = offset - pgoff;
3690
3691 #if 0
3692         /*
3693          * FIXME: Allow mmap writes past its quota.  The correct fix
3694          * is to register a page_mkwrite() handler to count the page
3695          * against its quota when it is about to be dirtied.
3696          */
3697         if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT,
3698             KUID_TO_SUID(ip->i_uid)) ||
3699             zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT,
3700             KGID_TO_SGID(ip->i_gid)) ||
3701             (zp->z_projid != ZFS_DEFAULT_PROJID &&
3702             zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
3703             zp->z_projid))) {
3704                 err = EDQUOT;
3705         }
3706 #endif
3707
3708         /*
3709          * The ordering here is critical and must adhere to the following
3710          * rules in order to avoid deadlocking in either zfs_read() or
3711          * zfs_free_range() due to a lock inversion.
3712          *
3713          * 1) The page must be unlocked prior to acquiring the range lock.
3714          *    This is critical because zfs_read() calls find_lock_page()
3715          *    which may block on the page lock while holding the range lock.
3716          *
3717          * 2) Before setting or clearing write back on a page the range lock
3718          *    must be held in order to prevent a lock inversion with the
3719          *    zfs_free_range() function.
3720          *
3721          * This presents a problem because upon entering this function the
3722          * page lock is already held.  To safely acquire the range lock the
3723          * page lock must be dropped.  This creates a window where another
3724          * process could truncate, invalidate, dirty, or write out the page.
3725          *
3726          * Therefore, after successfully reacquiring the range and page locks
3727          * the current page state is checked.  In the common case everything
3728          * will be as is expected and it can be written out.  However, if
3729          * the page state has changed it must be handled accordingly.
3730          */
             /* Remember the mapping so a change across the unlock is seen. */
3731         mapping = pp->mapping;
3732         redirty_page_for_writepage(wbc, pp);
3733         unlock_page(pp);
3734
3735         zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock,
3736             pgoff, pglen, RL_WRITER);
3737         lock_page(pp);
3738
3739         /* Page mapping changed or it was no longer dirty, we're done */
3740         if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) {
3741                 unlock_page(pp);
3742                 zfs_rangelock_exit(lr);
3743                 zfs_exit(zfsvfs, FTAG);
3744                 return (0);
3745         }
3746
3747         /* Another process already started writeback; wait if required */
3748         if (PageWriteback(pp)) {
3749                 unlock_page(pp);
3750                 zfs_rangelock_exit(lr);
3751
3752                 if (wbc->sync_mode != WB_SYNC_NONE) {
3753                         /*
3754                          * Speed up any non-sync page writebacks since
3755                          * they may take several seconds to complete.
3756                          * Refer to the comment in zpl_fsync() (when
3757                          * HAVE_FSYNC_RANGE is defined) for details.
3758                          */
3759                         if (atomic_load_32(&zp->z_async_writes_cnt) > 0) {
3760                                 zil_commit(zfsvfs->z_log, zp->z_id);
3761                         }
3762
                             /* Still under writeback: wait for it to end. */
3763                         if (PageWriteback(pp))
3764 #ifdef HAVE_PAGEMAP_FOLIO_WAIT_BIT
3765                                 folio_wait_bit(page_folio(pp), PG_writeback);
3766 #else
3767                                 wait_on_page_bit(pp, PG_writeback);
3768 #endif
3769                 }
3770
3771                 zfs_exit(zfsvfs, FTAG);
3772                 return (0);
3773         }
3774
3775         /* Clear the dirty flag now that the required locks are held */
3776         if (!clear_page_dirty_for_io(pp)) {
3777                 unlock_page(pp);
3778                 zfs_rangelock_exit(lr);
3779                 zfs_exit(zfsvfs, FTAG);
3780                 return (0);
3781         }
3782
3783         /*
3784          * Counterpart for redirty_page_for_writepage() above.  This page
3785          * was in fact not skipped and should not be counted as if it were.
3786          */
3787         wbc->pages_skipped--;
             /*
              * Count async writebacks in flight; the matching decrement is
              * in zfs_putpage_async_commit_cb() or in the error path below.
              */
3788         if (!for_sync)
3789                 atomic_inc_32(&zp->z_async_writes_cnt);
3790         set_page_writeback(pp);
3791         unlock_page(pp);
3792
3793         tx = dmu_tx_create(zfsvfs->z_os);
3794         dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen);
3795         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3796         zfs_sa_upgrade_txholds(tx, zp);
3797
3798         err = dmu_tx_assign(tx, TXG_NOWAIT);
3799         if (err != 0) {
                     /*
                      * No tx could be assigned.  On ERESTART wait for the
                      * tx first; the write is not retried here.  Instead the
                      * page is marked dirty again, writeback is ended, the
                      * async count taken above is dropped and the error is
                      * returned to the caller.
                      */
3800                 if (err == ERESTART)
3801                         dmu_tx_wait(tx);
3802
3803                 dmu_tx_abort(tx);
3804 #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
3805                 filemap_dirty_folio(page_mapping(pp), page_folio(pp));
3806 #else
3807                 __set_page_dirty_nobuffers(pp);
3808 #endif
3809                 ClearPageError(pp);
3810                 end_page_writeback(pp);
3811                 if (!for_sync)
3812                         atomic_dec_32(&zp->z_async_writes_cnt);
3813                 zfs_rangelock_exit(lr);
3814                 zfs_exit(zfsvfs, FTAG);
3815                 return (err);
3816         }
3817
             /* Copy the in-file part of the page into the DMU under this tx. */
3818         va = kmap(pp);
3819         ASSERT3U(pglen, <=, PAGE_SIZE);
3820         dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx);
3821         kunmap(pp);
3822
             /*
              * The bulk array records the addresses of mtime[] and ctime[];
              * the values are filled in below, before sa_bulk_update().
              */
3823         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
3824         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
3825         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL,
3826             &zp->z_pflags, 8);
3827
3828         /* Preserve the mtime and ctime provided by the inode */
3829         tmp_ts = zpl_inode_get_mtime(ip);
3830         ZFS_TIME_ENCODE(&tmp_ts, mtime);
3831         tmp_ts = zpl_inode_get_ctime(ip);
3832         ZFS_TIME_ENCODE(&tmp_ts, ctime);
3833         zp->z_atime_dirty = B_FALSE;
3834         zp->z_seq++;
3835
             /* This result is what the function returns on the normal path. */
3836         err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
3837
3838         boolean_t commit = B_FALSE;
3839         if (wbc->sync_mode != WB_SYNC_NONE) {
3840                 /*
3841                  * Note that this is rarely called under writepages(), because
3842                  * writepages() normally handles the entire commit for
3843                  * performance reasons.
3844                  */
3845                 commit = B_TRUE;
3846         } else if (!for_sync && atomic_load_32(&zp->z_sync_writes_cnt) > 0) {
3847                 /*
3848                  * If the caller does not intend to wait synchronously
3849                  * for this page writeback to complete and there are active
3850                  * synchronous calls on this file, do a commit so that
3851                  * the latter don't accidentally end up waiting for
3852                  * our writeback to complete. Refer to the comment in
3853                  * zpl_fsync() (when HAVE_FSYNC_RANGE is defined) for details.
3854                  */
3855                 commit = B_TRUE;
3856         }
3857
             /*
              * Log the write, registering the commit callback that matches
              * for_sync with the page as its argument; the callback is what
              * ends writeback on the page.
              */
3858         zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, commit,
3859             for_sync ? zfs_putpage_sync_commit_cb :
3860             zfs_putpage_async_commit_cb, pp);
3861
3862         dmu_tx_commit(tx);
3863
3864         zfs_rangelock_exit(lr);
3865
3866         if (commit)
3867                 zil_commit(zfsvfs->z_log, zp->z_id);
3868
3869         dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen);
3870
3871         zfs_exit(zfsvfs, FTAG);
3872         return (err);
3873 }
3874
3875 /*
3876  * Update the system attributes when the inode has been dirtied.  For the
3877  * moment we only update the mode, atime, mtime, and ctime.
      *
      *      IN:     ip      - inode that was dirtied.
      *              flags   - dirty flags; when I_DIRTY_TIME is defined and
      *                        flags is exactly I_DIRTY_TIME, only the atime
      *                        is marked dirty and nothing is written.
      *
      *      RETURN: 0 if success (also for a read-only filesystem or a
      *              snapshot, where nothing is done)
      *              error code if failure
3878  */
3879 int
3880 zfs_dirty_inode(struct inode *ip, int flags)
3881 {
3882         znode_t         *zp = ITOZ(ip);
3883         zfsvfs_t        *zfsvfs = ITOZSB(ip);
3884         dmu_tx_t        *tx;
3885         uint64_t        mode, atime[2], mtime[2], ctime[2];
3886         inode_timespec_t tmp_ts;
3887         sa_bulk_attr_t  bulk[4];
3888         int             error = 0;
3889         int             cnt = 0;
3890
             /* Nothing can be written back in these cases. */
3891         if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
3892                 return (0);
3893
3894         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
3895                 return (error);
3896
3897 #ifdef I_DIRTY_TIME
3898         /*
3899          * This is the lazytime semantic introduced in Linux 4.0.
3900          * This flag is only passed from update_time when lazytime is set.
3901          * (Note, I_DIRTY_SYNC will also be set if not lazytime.)
3902          * Fortunately mtime and ctime are managed within ZFS itself, so we
3903          * only need to dirty atime.
3904          */
3905         if (flags == I_DIRTY_TIME) {
3906                 zp->z_atime_dirty = B_TRUE;
3907                 goto out;
3908         }
3909 #endif
3910
3911         tx = dmu_tx_create(zfsvfs->z_os);
3912
3913         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3914         zfs_sa_upgrade_txholds(tx, zp);
3915
3916         error = dmu_tx_assign(tx, TXG_WAIT);
3917         if (error) {
3918                 dmu_tx_abort(tx);
3919                 goto out;
3920         }
3921
             /* The atime is written below, so it is no longer pending. */
3922         mutex_enter(&zp->z_lock);
3923         zp->z_atime_dirty = B_FALSE;
3924
             /*
              * The bulk array records the addresses of the locals; their
              * values are filled in below, before sa_bulk_update().
              */
3925         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
3926         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
3927         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
3928         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
3929
3930         /* Preserve the mode, atime, mtime and ctime provided by the inode */
3931         tmp_ts = zpl_inode_get_atime(ip);
3932         ZFS_TIME_ENCODE(&tmp_ts, atime);
3933         tmp_ts = zpl_inode_get_mtime(ip);
3934         ZFS_TIME_ENCODE(&tmp_ts, mtime);
3935         tmp_ts = zpl_inode_get_ctime(ip);
3936         ZFS_TIME_ENCODE(&tmp_ts, ctime);
3937         mode = ip->i_mode;
3938
             /* Keep the znode's cached mode in step with the inode. */
3939         zp->z_mode = mode;
3940
3941         error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
3942         mutex_exit(&zp->z_lock);
3943
3944         dmu_tx_commit(tx);
3945 out:
3946         zfs_exit(zfsvfs, FTAG);
3947         return (error);
3948 }
3949
     /*
      * Inactivate the znode behind ip.  If its atime is marked dirty and it
      * has not been unlinked, the inode's atime is first written to
      * SA_ZPL_ATIME; the znode is then passed to zfs_zinactive().
      *
      * z_teardown_inactive_lock is held as reader for the duration unless
      * the caller already holds it as writer.  Nothing is done for a znode
      * without an SA handle.  A failure to assign the tx is not reported:
      * the atime update is skipped and zfs_zinactive() is still called.
      */
3950 void
3951 zfs_inactive(struct inode *ip)
3952 {
3953         znode_t *zp = ITOZ(ip);
3954         zfsvfs_t *zfsvfs = ITOZSB(ip);
3955         uint64_t atime[2];
3956         int error;
3957         int need_unlock = 0;
3958
3959         /* Only read lock if we haven't already write locked, e.g. rollback */
3960         if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) {
3961                 need_unlock = 1;
3962                 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
3963         }
             /* No SA handle: nothing to update and zfs_zinactive() is skipped. */
3964         if (zp->z_sa_hdl == NULL) {
3965                 if (need_unlock)
3966                         rw_exit(&zfsvfs->z_teardown_inactive_lock);
3967                 return;
3968         }
3969
3970         if (zp->z_atime_dirty && zp->z_unlinked == B_FALSE) {
3971                 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
3972
3973                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3974                 zfs_sa_upgrade_txholds(tx, zp);
3975                 error = dmu_tx_assign(tx, TXG_WAIT);
3976                 if (error) {
3977                         dmu_tx_abort(tx);
3978                 } else {
3979                         inode_timespec_t tmp_atime;
3980                         tmp_atime = zpl_inode_get_atime(ip);
3981                         ZFS_TIME_ENCODE(&tmp_atime, atime);
                             /* Write the atime and clear the flag under z_lock. */
3982                         mutex_enter(&zp->z_lock);
3983                         (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
3984                             (void *)&atime, sizeof (atime), tx);
3985                         zp->z_atime_dirty = B_FALSE;
3986                         mutex_exit(&zp->z_lock);
3987                         dmu_tx_commit(tx);
3988                 }
3989         }
3990
3991         zfs_zinactive(zp);
3992         if (need_unlock)
3993                 rw_exit(&zfsvfs->z_teardown_inactive_lock);
3994 }
3995
3996 /*
3997  * Fill pages with data from the disk.
3998  */
3999 static int
4000 zfs_fillpage(struct inode *ip, struct page *pp)
4001 {
4002         zfsvfs_t *zfsvfs = ITOZSB(ip);
4003         loff_t i_size = i_size_read(ip);
4004         u_offset_t io_off = page_offset(pp);
4005         size_t io_len = PAGE_SIZE;
4006
4007         ASSERT3U(io_off, <, i_size);
4008
4009         if (io_off + io_len > i_size)
4010                 io_len = i_size - io_off;
4011
4012         void *va = kmap(pp);
4013         int error = dmu_read(zfsvfs->z_os, ITOZ(ip)->z_id, io_off,
4014             io_len, va, DMU_READ_PREFETCH);
4015         if (io_len != PAGE_SIZE)
4016                 memset((char *)va + io_len, 0, PAGE_SIZE - io_len);
4017         kunmap(pp);
4018
4019         if (error) {
4020                 /* convert checksum errors into IO errors */
4021                 if (error == ECKSUM)
4022                         error = SET_ERROR(EIO);
4023
4024                 SetPageError(pp);
4025                 ClearPageUptodate(pp);
4026         } else {
4027                 ClearPageError(pp);
4028                 SetPageUptodate(pp);
4029         }
4030
4031         return (error);
4032 }
4033
4034 /*
4035  * Uses zfs_fillpage to read data from the file and fill the page.
4036  *
4037  *      IN:     ip       - inode of file to get data from.
4038  *              pp       - page to read
4039  *
4040  *      RETURN: 0 on success, error code on failure.
4041  *
4042  * Timestamps:
4043  *      vp - atime updated
4044  */
4045 int
4046 zfs_getpage(struct inode *ip, struct page *pp)
4047 {
4048         zfsvfs_t *zfsvfs = ITOZSB(ip);
4049         znode_t *zp = ITOZ(ip);
4050         int error;
4051
4052         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
4053                 return (error);
4054
4055         error = zfs_fillpage(ip, pp);
4056         if (error == 0)
4057                 dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, PAGE_SIZE);
4058
4059         zfs_exit(zfsvfs, FTAG);
4060
4061         return (error);
4062 }
4063
4064 /*
4065  * Check ZFS specific permissions to memory map a section of a file.
4066  *
4067  *      IN:     ip      - inode of the file to mmap
4068  *              off     - file offset
4069  *              addrp   - start address in memory region
4070  *              len     - length of memory region
4071  *              vm_flags- address flags
4072  *
4073  *      RETURN: 0 if success
4074  *              error code if failure
4075  */
4076 int
4077 zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len,
4078     unsigned long vm_flags)
4079 {
4080         (void) addrp;
4081         znode_t  *zp = ITOZ(ip);
4082         zfsvfs_t *zfsvfs = ITOZSB(ip);
4083         int error;
4084
4085         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
4086                 return (error);
4087
4088         if ((vm_flags & VM_WRITE) && (vm_flags & VM_SHARED) &&
4089             (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
4090                 zfs_exit(zfsvfs, FTAG);
4091                 return (SET_ERROR(EPERM));
4092         }
4093
4094         if ((vm_flags & (VM_READ | VM_EXEC)) &&
4095             (zp->z_pflags & ZFS_AV_QUARANTINED)) {
4096                 zfs_exit(zfsvfs, FTAG);
4097                 return (SET_ERROR(EACCES));
4098         }
4099
4100         if (off < 0 || len > MAXOFFSET_T - off) {
4101                 zfs_exit(zfsvfs, FTAG);
4102                 return (SET_ERROR(ENXIO));
4103         }
4104
4105         zfs_exit(zfsvfs, FTAG);
4106         return (0);
4107 }
4108
4109 /*
4110  * Free or allocate space in a file.  Currently, this function only
4111  * supports the `F_FREESP' command.  However, this command is somewhat
4112  * misnamed, as its functionality includes the ability to allocate as
4113  * well as free space.
4114  *
4115  *      IN:     zp      - znode of file to free data in.
4116  *              cmd     - action to take (only F_FREESP supported).
4117  *              bfp     - section of file to free/alloc.
4118  *              flag    - current file open mode flags.
4119  *              offset  - current file offset.
4120  *              cr      - credentials of caller.
4121  *
4122  *      RETURN: 0 on success, error code on failure.
4123  *
4124  * Timestamps:
4125  *      zp - ctime|mtime updated
4126  */
4127 int
4128 zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag,
4129     offset_t offset, cred_t *cr)
4130 {
4131         (void) offset;
4132         zfsvfs_t        *zfsvfs = ZTOZSB(zp);
4133         uint64_t        off, len;
4134         int             error;
4135
4136         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
4137                 return (error);
4138
4139         if (cmd != F_FREESP) {
4140                 zfs_exit(zfsvfs, FTAG);
4141                 return (SET_ERROR(EINVAL));
4142         }
4143
4144         /*
4145          * Callers might not be able to detect properly that we are read-only,
4146          * so check it explicitly here.
4147          */
4148         if (zfs_is_readonly(zfsvfs)) {
4149                 zfs_exit(zfsvfs, FTAG);
4150                 return (SET_ERROR(EROFS));
4151         }
4152
4153         if (bfp->l_len < 0) {
4154                 zfs_exit(zfsvfs, FTAG);
4155                 return (SET_ERROR(EINVAL));
4156         }
4157
4158         /*
4159          * Permissions aren't checked on Solaris because on this OS
4160          * zfs_space() can only be called with an opened file handle.
4161          * On Linux we can get here through truncate_range() which
4162          * operates directly on inodes, so we need to check access rights.
4163          */
4164         if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr,
4165             zfs_init_idmap))) {
4166                 zfs_exit(zfsvfs, FTAG);
4167                 return (error);
4168         }
4169
4170         off = bfp->l_start;
4171         len = bfp->l_len; /* 0 means from off to end of file */
4172
4173         error = zfs_freesp(zp, off, len, flag, TRUE);
4174
4175         zfs_exit(zfsvfs, FTAG);
4176         return (error);
4177 }
4178
4179 int
4180 zfs_fid(struct inode *ip, fid_t *fidp)
4181 {
4182         znode_t         *zp = ITOZ(ip);
4183         zfsvfs_t        *zfsvfs = ITOZSB(ip);
4184         uint32_t        gen;
4185         uint64_t        gen64;
4186         uint64_t        object = zp->z_id;
4187         zfid_short_t    *zfid;
4188         int             size, i, error;
4189
4190         if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
4191                 return (error);
4192
4193         if (fidp->fid_len < SHORT_FID_LEN) {
4194                 fidp->fid_len = SHORT_FID_LEN;
4195                 zfs_exit(zfsvfs, FTAG);
4196                 return (SET_ERROR(ENOSPC));
4197         }
4198
4199         if ((error = zfs_verify_zp(zp)) != 0) {
4200                 zfs_exit(zfsvfs, FTAG);
4201                 return (error);
4202         }
4203
4204         if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
4205             &gen64, sizeof (uint64_t))) != 0) {
4206                 zfs_exit(zfsvfs, FTAG);
4207                 return (error);
4208         }
4209
4210         gen = (uint32_t)gen64;
4211
4212         size = SHORT_FID_LEN;
4213
4214         zfid = (zfid_short_t *)fidp;
4215
4216         zfid->zf_len = size;
4217
4218         for (i = 0; i < sizeof (zfid->zf_object); i++)
4219                 zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
4220
4221         /* Must have a non-zero generation number to distinguish from .zfs */
4222         if (gen == 0)
4223                 gen = 1;
4224         for (i = 0; i < sizeof (zfid->zf_gen); i++)
4225                 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
4226
4227         zfs_exit(zfsvfs, FTAG);
4228         return (0);
4229 }
4230
#ifdef _KERNEL
/*
 * Kernel-build only: export the vnode operation entry points and register
 * the module parameter.
 */
EXPORT_SYMBOL(zfs_open);
EXPORT_SYMBOL(zfs_close);
EXPORT_SYMBOL(zfs_lookup);
EXPORT_SYMBOL(zfs_create);
EXPORT_SYMBOL(zfs_tmpfile);
EXPORT_SYMBOL(zfs_remove);
EXPORT_SYMBOL(zfs_mkdir);
EXPORT_SYMBOL(zfs_rmdir);
EXPORT_SYMBOL(zfs_readdir);
EXPORT_SYMBOL(zfs_getattr_fast);
EXPORT_SYMBOL(zfs_setattr);
EXPORT_SYMBOL(zfs_rename);
EXPORT_SYMBOL(zfs_symlink);
EXPORT_SYMBOL(zfs_readlink);
EXPORT_SYMBOL(zfs_link);
EXPORT_SYMBOL(zfs_inactive);
EXPORT_SYMBOL(zfs_space);
EXPORT_SYMBOL(zfs_fid);
EXPORT_SYMBOL(zfs_getpage);
EXPORT_SYMBOL(zfs_putpage);
EXPORT_SYMBOL(zfs_dirty_inode);
EXPORT_SYMBOL(zfs_map);

/* zfs_delete_blocks is exposed as a ulong parameter with mode 0644. */
/* CSTYLED */
module_param(zfs_delete_blocks, ulong, 0644);
MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async");
#endif /* _KERNEL */