module/os/linux/zfs/zfs_vnops_os.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  25  * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
  26  * Copyright 2017 Nexenta Systems, Inc.
  27  */
  28
  29 /* Portions Copyright 2007 Jeremy Teo */
  30 /* Portions Copyright 2010 Robert Milkowski */
  31
  32
  33 #include <sys/types.h>
  34 #include <sys/param.h>
  35 #include <sys/time.h>
  36 #include <sys/sysmacros.h>
  37 #include <sys/vfs.h>
  38 #include <sys/file.h>
  39 #include <sys/stat.h>
  40 #include <sys/kmem.h>
  41 #include <sys/taskq.h>
  42 #include <sys/uio.h>
  43 #include <sys/vmsystm.h>
  44 #include <sys/atomic.h>
  45 #include <sys/pathname.h>
  46 #include <sys/cmn_err.h>
  47 #include <sys/errno.h>
  48 #include <sys/zfs_dir.h>
  49 #include <sys/zfs_acl.h>
  50 #include <sys/zfs_ioctl.h>
  51 #include <sys/fs/zfs.h>
  52 #include <sys/dmu.h>
  53 #include <sys/dmu_objset.h>
  54 #include <sys/spa.h>
  55 #include <sys/txg.h>
  56 #include <sys/dbuf.h>
  57 #include <sys/zap.h>
  58 #include <sys/sa.h>
  59 #include <sys/policy.h>
  60 #include <sys/sunddi.h>
  61 #include <sys/sid.h>
  62 #include <sys/zfs_ctldir.h>
  63 #include <sys/zfs_fuid.h>
  64 #include <sys/zfs_quota.h>
  65 #include <sys/zfs_sa.h>
  66 #include <sys/zfs_vnops.h>
  67 #include <sys/zfs_rlock.h>
  68 #include <sys/cred.h>
  69 #include <sys/zpl.h>
  70 #include <sys/zil.h>
  71 #include <sys/sa_impl.h>
  72
  73 /*
  74  * Programming rules.
  75  *
  76  * Each vnode op performs some logical unit of work.  To do this, the ZPL must
  77  * properly lock its in-core state, create a DMU transaction, do the work,
  78  * record this work in the intent log (ZIL), commit the DMU transaction,
  79  * and wait for the intent log to commit if it is a synchronous operation.
  80  * Moreover, the vnode ops must work in both normal and log replay context.
  81  * The ordering of events is important to avoid deadlocks and references
  82  * to freed memory.  The example below illustrates the following Big Rules:
  83  *
  84  *  (1) A check must be made in each zfs thread for a mounted file system.
  85  *      This is done avoiding races using ZFS_ENTER(zfsvfs).
  86  *      A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
  87  *      must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
  88  *      can return EIO from the calling function.
  89  *
  90  *  (2) zrele() should always be the last thing except for zil_commit() (if
  91  *      necessary) and ZFS_EXIT(). This is for 3 reasons: First, if it's the
  92  *      last reference, the vnode/znode can be freed, so the zp may point to
  93  *      freed memory.  Second, the last reference will call zfs_zinactive(),
  94  *      which may induce a lot of work -- pushing cached pages (which acquires
  95  *      range locks) and syncing out cached atime changes.  Third,
  96  *      zfs_zinactive() may require a new tx, which could deadlock the system
  97  *      if you were already holding one. This deadlock occurs because the tx
  98  *      currently being operated on prevents a txg from syncing, which
  99  *      prevents the new tx from progressing, resulting in a deadlock.  If you
 100  *      must call zrele() within a tx, use zfs_zrele_async(). Note that iput()
 101  *      is a synonym for zrele().
 102  *
 103  *  (3) All range locks must be grabbed before calling dmu_tx_assign(),
 104  *      as they can span dmu_tx_assign() calls.
 105  *
 106  *  (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
 107  *      dmu_tx_assign().  This is critical because we don't want to block
 108  *      while holding locks.
 109  *
 110  *      If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT.  This
 111  *      reduces lock contention and CPU usage when we must wait (note that if
 112  *      throughput is constrained by the storage, nearly every transaction
 113  *      must wait).
 114  *
 115  *      Note, in particular, that if a lock is sometimes acquired before
 116  *      the tx assigns, and sometimes after (e.g. z_lock), then failing
 117  *      to use a non-blocking assign can deadlock the system.  The scenario:
 118  *
 119  *      Thread A has grabbed a lock before calling dmu_tx_assign().
 120  *      Thread B is in an already-assigned tx, and blocks for this lock.
 121  *      Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
 122  *      forever, because the previous txg can't quiesce until B's tx commits.
 123  *
 124  *      If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
 125  *      then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
 126  *      calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
 127  *      to indicate that this operation has already called dmu_tx_wait().
 128  *      This will ensure that we don't retry forever, waiting a short bit
 129  *      each time.
 130  *
 131  *  (5) If the operation succeeded, generate the intent log entry for it
 132  *      before dropping locks.  This ensures that the ordering of events
 133  *      in the intent log matches the order in which they actually occurred.
 134  *      During ZIL replay the zfs_log_* functions will update the sequence
 135  *      number to indicate the zil transaction has replayed.
 136  *
 137  *  (6) At the end of each vnode op, the DMU tx must always commit,
 138  *      regardless of whether there were any errors.
 139  *
 140  *  (7) After dropping all locks, invoke zil_commit(zilog, foid)
 141  *      to ensure that synchronous semantics are provided when necessary.
 142  *
 143  * In general, this is how things should be ordered in each vnode op:
 144  *
 145  *      ZFS_ENTER(zfsvfs);              // exit if unmounted
 146  * top:
 147  *      zfs_dirent_lock(&dl, ...)       // lock directory entry (may igrab())
 148  *      rw_enter(...);                  // grab any other locks you need
 149  *      tx = dmu_tx_create(...);        // get DMU tx
 150  *      dmu_tx_hold_*();                // hold each object you might modify
 151  *      error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 152  *      if (error) {
 153  *              rw_exit(...);           // drop locks
 154  *              zfs_dirent_unlock(dl);  // unlock directory entry
 155  *              zrele(...);             // release held znodes
 156  *              if (error == ERESTART) {
 157  *                      waited = B_TRUE;
 158  *                      dmu_tx_wait(tx);
 159  *                      dmu_tx_abort(tx);
 160  *                      goto top;
 161  *              }
 162  *              dmu_tx_abort(tx);       // abort DMU tx
 163  *              ZFS_EXIT(zfsvfs);       // finished in zfs
 164  *              return (error);         // really out of space
 165  *      }
 166  *      error = do_real_work();         // do whatever this VOP does
 167  *      if (error == 0)
 168  *              zfs_log_*(...);         // on success, make ZIL entry
 169  *      dmu_tx_commit(tx);              // commit DMU tx -- error or not
 170  *      rw_exit(...);                   // drop locks
 171  *      zfs_dirent_unlock(dl);          // unlock directory entry
 172  *      zrele(...);                     // release held znodes
 173  *      zil_commit(zilog, foid);        // synchronous when necessary
 174  *      ZFS_EXIT(zfsvfs);               // finished in zfs
 175  *      return (error);                 // done, report error
 176  */
 177
 178 /* ARGSUSED */
 179 int
 180 zfs_open(struct inode *ip, int mode, int flag, cred_t *cr)
 181 {
 182         znode_t *zp = ITOZ(ip);
 183         zfsvfs_t *zfsvfs = ITOZSB(ip);
 184
 185         ZFS_ENTER(zfsvfs);
 186         ZFS_VERIFY_ZP(zp);
 187
 188         /* Honor ZFS_APPENDONLY file attribute */
 189         if ((mode & FMODE_WRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
 190             ((flag & O_APPEND) == 0)) {
 191                 ZFS_EXIT(zfsvfs);
 192                 return (SET_ERROR(EPERM));
 193         }
 194
 195         /* Keep a count of the synchronous opens in the znode */
 196         if (flag & O_SYNC)
 197                 atomic_inc_32(&zp->z_sync_cnt);
 198
 199         ZFS_EXIT(zfsvfs);
 200         return (0);
 201 }
 202
 203 /* ARGSUSED */
 204 int
 205 zfs_close(struct inode *ip, int flag, cred_t *cr)
 206 {
 207         znode_t *zp = ITOZ(ip);
 208         zfsvfs_t *zfsvfs = ITOZSB(ip);
 209
 210         ZFS_ENTER(zfsvfs);
 211         ZFS_VERIFY_ZP(zp);
 212
 213         /* Decrement the synchronous opens in the znode */
 214         if (flag & O_SYNC)
 215                 atomic_dec_32(&zp->z_sync_cnt);
 216
 217         ZFS_EXIT(zfsvfs);
 218         return (0);
 219 }
 220
 221 #if defined(_KERNEL)
 222 /*
 223  * When a file is memory mapped, we must keep the IO data synchronized
 224  * between the DMU cache and the memory mapped pages.  What this means:
 225  *
 226  * On Write:    If we find a memory mapped page, we write to *both*
 227  *              the page and the dmu buffer.
 228  */
 229 void
 230 update_pages(znode_t *zp, int64_t start, int len, objset_t *os)
 231 {
 232         struct inode *ip = ZTOI(zp);
 233         struct address_space *mp = ip->i_mapping;
 234         struct page *pp;
 235         uint64_t nbytes;
 236         int64_t off;
 237         void *pb;
 238
 239         off = start & (PAGE_SIZE-1);
 240         for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
 241                 nbytes = MIN(PAGE_SIZE - off, len);
 242
 243                 pp = find_lock_page(mp, start >> PAGE_SHIFT);
 244                 if (pp) {
 245                         if (mapping_writably_mapped(mp))
 246                                 flush_dcache_page(pp);
 247
 248                         pb = kmap(pp);
 249                         (void) dmu_read(os, zp->z_id, start + off, nbytes,
 250                             pb + off, DMU_READ_PREFETCH);
 251                         kunmap(pp);
 252
 253                         if (mapping_writably_mapped(mp))
 254                                 flush_dcache_page(pp);
 255
 256                         mark_page_accessed(pp);
 257                         SetPageUptodate(pp);
 258                         ClearPageError(pp);
 259                         unlock_page(pp);
 260                         put_page(pp);
 261                 }
 262
 263                 len -= nbytes;
 264                 off = 0;
 265         }
 266 }
 267
 268 /*
 269  * When a file is memory mapped, we must keep the IO data synchronized
 270  * between the DMU cache and the memory mapped pages.  What this means:
 271  *
 272  * On Read:     We "read" preferentially from memory mapped pages,
 273  *              else we default from the dmu buffer.
 274  *
 275  * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
 276  *       the file is memory mapped.
 277  */
 278 int
 279 mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
 280 {
 281         struct inode *ip = ZTOI(zp);
 282         struct address_space *mp = ip->i_mapping;
 283         struct page *pp;
 284         int64_t start, off;
 285         uint64_t bytes;
 286         int len = nbytes;
 287         int error = 0;
 288         void *pb;
 289
 290         start = uio->uio_loffset;
 291         off = start & (PAGE_SIZE-1);
 292         for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
 293                 bytes = MIN(PAGE_SIZE - off, len);
 294
 295                 pp = find_lock_page(mp, start >> PAGE_SHIFT);
 296                 if (pp) {
 297                         ASSERT(PageUptodate(pp));
 298                         unlock_page(pp);
 299
 300                         pb = kmap(pp);
 301                         error = zfs_uiomove(pb + off, bytes, UIO_READ, uio);
 302                         kunmap(pp);
 303
 304                         if (mapping_writably_mapped(mp))
 305                                 flush_dcache_page(pp);
 306
 307                         mark_page_accessed(pp);
 308                         put_page(pp);
 309                 } else {
 310                         error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
 311                             uio, bytes);
 312                 }
 313
 314                 len -= bytes;
 315                 off = 0;
 316                 if (error)
 317                         break;
 318         }
 319         return (error);
 320 }
 321 #endif /* _KERNEL */
 322
/*
 * File-local value initialised from DMU_MAX_DELETEBLKCNT.  It is not
 * referenced in this part of the file.
 * NOTE(review): presumably a block-count threshold used when deleting
 * files -- confirm against its users.
 */
static unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT;
 324
 325 /*
 326  * Write the bytes to a file.
 327  *
 328  *      IN:     zp      - znode of file to be written to
 329  *              data    - bytes to write
 330  *              len     - number of bytes to write
 331  *              pos     - offset to start writing at
 332  *
 333  *      OUT:    resid   - remaining bytes to write
 334  *
 335  *      RETURN: 0 if success
 336  *              positive error code if failure.  EIO is returned
 337  *              for a short write when residp isn't provided.
 338  *
 339  * Timestamps:
 340  *      zp - ctime|mtime updated if byte count > 0
 341  */
 342 int
 343 zfs_write_simple(znode_t *zp, const void *data, size_t len,
 344     loff_t pos, size_t *residp)
 345 {
 346         fstrans_cookie_t cookie;
 347         int error;
 348
 349         struct iovec iov;
 350         iov.iov_base = (void *)data;
 351         iov.iov_len = len;
 352
 353         zfs_uio_t uio;
 354         zfs_uio_iovec_init(&uio, &iov, 1, pos, UIO_SYSSPACE, len, 0);
 355
 356         cookie = spl_fstrans_mark();
 357         error = zfs_write(zp, &uio, 0, kcred);
 358         spl_fstrans_unmark(cookie);
 359
 360         if (error == 0) {
 361                 if (residp != NULL)
 362                         *residp = zfs_uio_resid(&uio);
 363                 else if (zfs_uio_resid(&uio) != 0)
 364                         error = SET_ERROR(EIO);
 365         }
 366
 367         return (error);
 368 }
 369
 370 static void
 371 zfs_rele_async_task(void *arg)
 372 {
 373         iput(arg);
 374 }
 375
 376 void
 377 zfs_zrele_async(znode_t *zp)
 378 {
 379         struct inode *ip = ZTOI(zp);
 380         objset_t *os = ITOZSB(ip)->z_os;
 381
 382         ASSERT(atomic_read(&ip->i_count) > 0);
 383         ASSERT(os != NULL);
 384
 385         /*
 386          * If decrementing the count would put us at 0, we can't do it inline
 387          * here, because that would be synchronous. Instead, dispatch an iput
 388          * to run later.
 389          *
 390          * For more information on the dangers of a synchronous iput, see the
 391          * header comment of this file.
 392          */
 393         if (!atomic_add_unless(&ip->i_count, -1, 1)) {
 394                 VERIFY(taskq_dispatch(dsl_pool_zrele_taskq(dmu_objset_pool(os)),
 395                     zfs_rele_async_task, ip, TQ_SLEEP) != TASKQID_INVALID);
 396         }
 397 }
 398
 399
 400 /*
 401  * Lookup an entry in a directory, or an extended attribute directory.
 402  * If it exists, return a held inode reference for it.
 403  *
 404  *      IN:     zdp     - znode of directory to search.
 405  *              nm      - name of entry to lookup.
 406  *              flags   - LOOKUP_XATTR set if looking for an attribute.
 407  *              cr      - credentials of caller.
 408  *              direntflags - directory lookup flags
 409  *              realpnp - returned pathname.
 410  *
 411  *      OUT:    zpp     - znode of located entry, NULL if not found.
 412  *
 413  *      RETURN: 0 on success, error code on failure.
 414  *
 415  * Timestamps:
 416  *      NA
 417  */
 418 /* ARGSUSED */
 419 int
 420 zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr,
 421     int *direntflags, pathname_t *realpnp)
 422 {
 423         zfsvfs_t *zfsvfs = ZTOZSB(zdp);
 424         int error = 0;
 425
 426         /*
 427          * Fast path lookup, however we must skip DNLC lookup
 428          * for case folding or normalizing lookups because the
 429          * DNLC code only stores the passed in name.  This means
 430          * creating 'a' and removing 'A' on a case insensitive
 431          * file system would work, but DNLC still thinks 'a'
 432          * exists and won't let you create it again on the next
 433          * pass through fast path.
 434          */
 435         if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
 436
 437                 if (!S_ISDIR(ZTOI(zdp)->i_mode)) {
 438                         return (SET_ERROR(ENOTDIR));
 439                 } else if (zdp->z_sa_hdl == NULL) {
 440                         return (SET_ERROR(EIO));
 441                 }
 442
 443                 if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
 444                         error = zfs_fastaccesschk_execute(zdp, cr);
 445                         if (!error) {
 446                                 *zpp = zdp;
 447                                 zhold(*zpp);
 448                                 return (0);
 449                         }
 450                         return (error);
 451                 }
 452         }
 453
 454         ZFS_ENTER(zfsvfs);
 455         ZFS_VERIFY_ZP(zdp);
 456
 457         *zpp = NULL;
 458
 459         if (flags & LOOKUP_XATTR) {
 460                 /*
 461                  * We don't allow recursive attributes..
 462                  * Maybe someday we will.
 463                  */
 464                 if (zdp->z_pflags & ZFS_XATTR) {
 465                         ZFS_EXIT(zfsvfs);
 466                         return (SET_ERROR(EINVAL));
 467                 }
 468
 469                 if ((error = zfs_get_xattrdir(zdp, zpp, cr, flags))) {
 470                         ZFS_EXIT(zfsvfs);
 471                         return (error);
 472                 }
 473
 474                 /*
 475                  * Do we have permission to get into attribute directory?
 476                  */
 477
 478                 if ((error = zfs_zaccess(*zpp, ACE_EXECUTE, 0,
 479                     B_FALSE, cr))) {
 480                         zrele(*zpp);
 481                         *zpp = NULL;
 482                 }
 483
 484                 ZFS_EXIT(zfsvfs);
 485                 return (error);
 486         }
 487
 488         if (!S_ISDIR(ZTOI(zdp)->i_mode)) {
 489                 ZFS_EXIT(zfsvfs);
 490                 return (SET_ERROR(ENOTDIR));
 491         }
 492
 493         /*
 494          * Check accessibility of directory.
 495          */
 496
 497         if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr))) {
 498                 ZFS_EXIT(zfsvfs);
 499                 return (error);
 500         }
 501
 502         if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
 503             NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 504                 ZFS_EXIT(zfsvfs);
 505                 return (SET_ERROR(EILSEQ));
 506         }
 507
 508         error = zfs_dirlook(zdp, nm, zpp, flags, direntflags, realpnp);
 509         if ((error == 0) && (*zpp))
 510                 zfs_znode_update_vfs(*zpp);
 511
 512         ZFS_EXIT(zfsvfs);
 513         return (error);
 514 }
 515
/*
 * Attempt to create a new entry in a directory.  If the entry
 * already exists, truncate the file if permissible, else return
 * an error.  Return the znode of the created or trunc'd file.
 *
 *	IN:	dzp	- znode of directory to put new file entry in.
 *		name	- name of new file entry.  An empty name refers to
 *			  the directory itself.
 *		vap	- attributes of new file.
 *		excl	- flag indicating exclusive or non-exclusive mode.
 *		mode	- mode to open file with.
 *		cr	- credentials of caller.
 *		flag	- file flag.
 *		vsecp	- ACL to be set
 *
 *	OUT:	zpp	- znode of created or trunc'd entry.  Set to NULL
 *			  once the function is past its initial checks and
 *			  only filled in on success.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dzp - ctime|mtime updated if new entry created
 *	 zp - ctime|mtime always, atime if new
 */

/* ARGSUSED */
int
zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl,
    int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp)
{
	znode_t		*zp;
	zfsvfs_t	*zfsvfs = ZTOZSB(dzp);
	zilog_t		*zilog;
	objset_t	*os;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	int		error;
	uid_t		uid;
	gid_t		gid;
	zfs_acl_ids_t	acl_ids;
	boolean_t	fuid_dirtied;
	/* B_TRUE while acl_ids holds state that must be freed. */
	boolean_t	have_acl = B_FALSE;
	/* B_TRUE once dmu_tx_wait() has been called for this operation. */
	boolean_t	waited = B_FALSE;

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */

	gid = crgetgid(cr);
	uid = crgetuid(cr);

	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (SET_ERROR(EINVAL));

	if (name == NULL)
		return (SET_ERROR(EINVAL));

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	os = zfsvfs->z_os;
	zilog = zfsvfs->z_log;

	/* On a UTF-8-only file system reject names that do not validate. */
	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	if (vap->va_mask & ATTR_XVATTR) {
		if ((error = secpolicy_xvattr((xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_mode)) != 0) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	/*
	 * Retry point.  We come back here after dmu_tx_assign() returned
	 * ERESTART and dmu_tx_wait() has been called.  acl_ids is kept
	 * across retries (see have_acl); the directory entry lock is not.
	 */
top:
	*zpp = NULL;
	if (*name == '\0') {
		/*
		 * Null component name refers to the directory itself.
		 */
		zhold(dzp);
		zp = dzp;
		dl = NULL;
		error = 0;
	} else {
		/* possible igrab(zp) */
		int zflg = 0;

		if (flag & FIGNORECASE)
			zflg |= ZCILOOK;

		error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
		    NULL, NULL);
		if (error) {
			if (have_acl)
				zfs_acl_ids_free(&acl_ids);
			/* Any failure on ".." is reported as EISDIR. */
			if (strcmp(name, "..") == 0)
				error = SET_ERROR(EISDIR);
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	if (zp == NULL) {
		uint64_t txtype;
		uint64_t projid = ZFS_DEFAULT_PROJID;

		/*
		 * Create a new file object and update the directory
		 * to reference it.
		 */
		if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
			if (have_acl)
				zfs_acl_ids_free(&acl_ids);
			goto out;
		}

		/*
		 * We only support the creation of regular files in
		 * extended attribute directories.
		 */

		if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) {
			if (have_acl)
				zfs_acl_ids_free(&acl_ids);
			error = SET_ERROR(EINVAL);
			goto out;
		}

		/* Only build the ACL ids on the first pass through top:. */
		if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
		    cr, vsecp, &acl_ids)) != 0)
			goto out;
		have_acl = B_TRUE;

		if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
			projid = zfs_inherit_projid(dzp);
		if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
			zfs_acl_ids_free(&acl_ids);
			error = SET_ERROR(EDQUOT);
			goto out;
		}

		tx = dmu_tx_create(os);

		/* Declare everything this transaction might modify. */
		dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
		    ZFS_SA_BASE_ATTR_SIZE);

		fuid_dirtied = zfsvfs->z_fuid_dirty;
		if (fuid_dirtied)
			zfs_fuid_txhold(zfsvfs, tx);
		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
		dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
		if (!zfsvfs->z_use_sa &&
		    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
			    0, acl_ids.z_aclp->z_acl_bytes);
		}

		/*
		 * The directory entry lock is held, so the assignment must
		 * not block (rule 4 in the header comment of this file).
		 */
		error = dmu_tx_assign(tx,
		    (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
		if (error) {
			zfs_dirent_unlock(dl);
			if (error == ERESTART) {
				/* Wait with no locks held, then start over. */
				waited = B_TRUE;
				dmu_tx_wait(tx);
				dmu_tx_abort(tx);
				goto top;
			}
			zfs_acl_ids_free(&acl_ids);
			dmu_tx_abort(tx);
			ZFS_EXIT(zfsvfs);
			return (error);
		}
		zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

		error = zfs_link_create(dl, zp, tx, ZNEW);
		if (error != 0) {
			/*
			 * Since we failed to add the directory entry for it,
			 * delete the newly created dnode.  The tx is still
			 * committed, and the hold on zp is dropped at out:.
			 */
			zfs_znode_delete(zp, tx);
			remove_inode_hash(ZTOI(zp));
			zfs_acl_ids_free(&acl_ids);
			dmu_tx_commit(tx);
			goto out;
		}

		if (fuid_dirtied)
			zfs_fuid_sync(zfsvfs, tx);

		/* Log the create while the tx and dirent lock are held. */
		txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
		if (flag & FIGNORECASE)
			txtype |= TX_CI;
		zfs_log_create(zilog, tx, txtype, dzp, zp, name,
		    vsecp, acl_ids.z_fuidp, vap);
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_commit(tx);
	} else {
		int aflags = (flag & O_APPEND) ? V_APPEND : 0;

		/*
		 * acl_ids can only be live here if an earlier pass through
		 * top: created it; an existing entry has no use for it.
		 */
		if (have_acl)
			zfs_acl_ids_free(&acl_ids);
		have_acl = B_FALSE;

		/*
		 * A directory entry already exists for this name.
		 */
		/*
		 * Can't truncate an existing file if in exclusive mode.
		 */
		if (excl) {
			error = SET_ERROR(EEXIST);
			goto out;
		}
		/*
		 * Can't open a directory for writing.
		 */
		if (S_ISDIR(ZTOI(zp)->i_mode)) {
			error = SET_ERROR(EISDIR);
			goto out;
		}
		/*
		 * Verify requested access to file.
		 */
		if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
			goto out;
		}

		/* Bump the directory's sequence number under its z_lock. */
		mutex_enter(&dzp->z_lock);
		dzp->z_seq++;
		mutex_exit(&dzp->z_lock);

		/*
		 * Truncate regular files if requested.
		 */
		if (S_ISREG(ZTOI(zp)->i_mode) &&
		    (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) {
			/* we can't hold any locks when calling zfs_freesp() */
			if (dl) {
				zfs_dirent_unlock(dl);
				dl = NULL;
			}
			error = zfs_freesp(zp, 0, 0, mode, TRUE);
		}
	}
	/*
	 * Common exit: drop the dirent lock if still held, then either
	 * release the znode (failure) or hand it to the caller (success).
	 */
out:

	if (dl)
		zfs_dirent_unlock(dl);

	if (error) {
		if (zp)
			zrele(zp);
	} else {
		zfs_znode_update_vfs(dzp);
		zfs_znode_update_vfs(zp);
		*zpp = zp;
	}

	/* With sync=always, commit the intent log before returning. */
	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}
 784
 785 /* ARGSUSED */
 786 int
 787 zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl,
 788     int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp)
 789 {
 790         znode_t         *zp = NULL, *dzp = ITOZ(dip);
 791         zfsvfs_t        *zfsvfs = ITOZSB(dip);
 792         objset_t        *os;
 793         dmu_tx_t        *tx;
 794         int             error;
 795         uid_t           uid;
 796         gid_t           gid;
 797         zfs_acl_ids_t   acl_ids;
 798         uint64_t        projid = ZFS_DEFAULT_PROJID;
 799         boolean_t       fuid_dirtied;
 800         boolean_t       have_acl = B_FALSE;
 801         boolean_t       waited = B_FALSE;
 802
 803         /*
 804          * If we have an ephemeral id, ACL, or XVATTR then
 805          * make sure file system is at proper version
 806          */
 807
 808         gid = crgetgid(cr);
 809         uid = crgetuid(cr);
 810
 811         if (zfsvfs->z_use_fuids == B_FALSE &&
 812             (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
 813                 return (SET_ERROR(EINVAL));
 814
 815         ZFS_ENTER(zfsvfs);
 816         ZFS_VERIFY_ZP(dzp);
 817         os = zfsvfs->z_os;
 818
 819         if (vap->va_mask & ATTR_XVATTR) {
 820                 if ((error = secpolicy_xvattr((xvattr_t *)vap,
 821                     crgetuid(cr), cr, vap->va_mode)) != 0) {
 822                         ZFS_EXIT(zfsvfs);
 823                         return (error);
 824                 }
 825         }
 826
 827 top:
 828         *ipp = NULL;
 829
 830         /*
 831          * Create a new file object and update the directory
 832          * to reference it.
 833          */
 834         if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
 835                 if (have_acl)
 836                         zfs_acl_ids_free(&acl_ids);
 837                 goto out;
 838         }
 839
 840         if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
 841             cr, vsecp, &acl_ids)) != 0)
 842                 goto out;
 843         have_acl = B_TRUE;
 844
 845         if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
 846                 projid = zfs_inherit_projid(dzp);
 847         if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
 848                 zfs_acl_ids_free(&acl_ids);
 849                 error = SET_ERROR(EDQUOT);
 850                 goto out;
 851         }
 852
 853         tx = dmu_tx_create(os);
 854
 855         dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 856             ZFS_SA_BASE_ATTR_SIZE);
 857         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 858
 859         fuid_dirtied = zfsvfs->z_fuid_dirty;
 860         if (fuid_dirtied)
 861                 zfs_fuid_txhold(zfsvfs, tx);
 862         if (!zfsvfs->z_use_sa &&
 863             acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 864                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 865                     0, acl_ids.z_aclp->z_acl_bytes);
 866         }
 867         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 868         if (error) {
 869                 if (error == ERESTART) {
 870                         waited = B_TRUE;
 871                         dmu_tx_wait(tx);
 872                         dmu_tx_abort(tx);
 873                         goto top;
 874                 }
 875                 zfs_acl_ids_free(&acl_ids);
 876                 dmu_tx_abort(tx);
 877                 ZFS_EXIT(zfsvfs);
 878                 return (error);
 879         }
 880         zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids);
 881
 882         if (fuid_dirtied)
 883                 zfs_fuid_sync(zfsvfs, tx);
 884
 885         /* Add to unlinked set */
 886         zp->z_unlinked = B_TRUE;
 887         zfs_unlinked_add(zp, tx);
 888         zfs_acl_ids_free(&acl_ids);
 889         dmu_tx_commit(tx);
 890 out:
 891
 892         if (error) {
 893                 if (zp)
 894                         zrele(zp);
 895         } else {
 896                 zfs_znode_update_vfs(dzp);
 897                 zfs_znode_update_vfs(zp);
 898                 *ipp = ZTOI(zp);
 899         }
 900
 901         ZFS_EXIT(zfsvfs);
 902         return (error);
 903 }
 904
 905 /*
 906  * Remove an entry from a directory.
 907  *
 908  *      IN:     dzp     - znode of directory to remove entry from.
 909  *              name    - name of entry to remove.
 910  *              cr      - credentials of caller.
 911  *              flags   - case flags.
 912  *
 913  *      RETURN: 0 if success
 914  *              error code if failure
 915  *
 916  * Timestamps:
 917  *      dzp - ctime|mtime
 918  *       ip - ctime (if nlink > 0)
 919  */
 920
 921 uint64_t null_xattr = 0;
 922
 923 /*ARGSUSED*/
 924 int
 925 zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags)
 926 {
 927         znode_t         *zp;
 928         znode_t         *xzp;
 929         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
 930         zilog_t         *zilog;
 931         uint64_t        acl_obj, xattr_obj;
 932         uint64_t        xattr_obj_unlinked = 0;
 933         uint64_t        obj = 0;
 934         uint64_t        links;
 935         zfs_dirlock_t   *dl;
 936         dmu_tx_t        *tx;
 937         boolean_t       may_delete_now, delete_now = FALSE;
 938         boolean_t       unlinked, toobig = FALSE;
 939         uint64_t        txtype;
 940         pathname_t      *realnmp = NULL;
 941         pathname_t      realnm;
 942         int             error;
 943         int             zflg = ZEXISTS;
 944         boolean_t       waited = B_FALSE;
 945
 946         if (name == NULL)
 947                 return (SET_ERROR(EINVAL));
 948
 949         ZFS_ENTER(zfsvfs);
 950         ZFS_VERIFY_ZP(dzp);
 951         zilog = zfsvfs->z_log;
 952
 953         if (flags & FIGNORECASE) {
 954                 zflg |= ZCILOOK;
 955                 pn_alloc(&realnm);
 956                 realnmp = &realnm;
 957         }
 958
 959 top:
 960         xattr_obj = 0;
 961         xzp = NULL;
 962         /*
 963          * Attempt to lock directory; fail if entry doesn't exist.
 964          */
 965         if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
 966             NULL, realnmp))) {
 967                 if (realnmp)
 968                         pn_free(realnmp);
 969                 ZFS_EXIT(zfsvfs);
 970                 return (error);
 971         }
 972
 973         if ((error = zfs_zaccess_delete(dzp, zp, cr))) {
 974                 goto out;
 975         }
 976
 977         /*
 978          * Need to use rmdir for removing directories.
 979          */
 980         if (S_ISDIR(ZTOI(zp)->i_mode)) {
 981                 error = SET_ERROR(EPERM);
 982                 goto out;
 983         }
 984
 985         mutex_enter(&zp->z_lock);
 986         may_delete_now = atomic_read(&ZTOI(zp)->i_count) == 1 &&
 987             !(zp->z_is_mapped);
 988         mutex_exit(&zp->z_lock);
 989
 990         /*
 991          * We may delete the znode now, or we may put it in the unlinked set;
 992          * it depends on whether we're the last link, and on whether there are
 993          * other holds on the inode.  So we dmu_tx_hold() the right things to
 994          * allow for either case.
 995          */
 996         obj = zp->z_id;
 997         tx = dmu_tx_create(zfsvfs->z_os);
 998         dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
 999         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1000         zfs_sa_upgrade_txholds(tx, zp);
1001         zfs_sa_upgrade_txholds(tx, dzp);
1002         if (may_delete_now) {
1003                 toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks;
1004                 /* if the file is too big, only hold_free a token amount */
1005                 dmu_tx_hold_free(tx, zp->z_id, 0,
1006                     (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
1007         }
1008
1009         /* are there any extended attributes? */
1010         error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1011             &xattr_obj, sizeof (xattr_obj));
1012         if (error == 0 && xattr_obj) {
1013                 error = zfs_zget(zfsvfs, xattr_obj, &xzp);
1014                 ASSERT0(error);
1015                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1016                 dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
1017         }
1018
1019         mutex_enter(&zp->z_lock);
1020         if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
1021                 dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
1022         mutex_exit(&zp->z_lock);
1023
1024         /* charge as an update -- would be nice not to charge at all */
1025         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1026
1027         /*
1028          * Mark this transaction as typically resulting in a net free of space
1029          */
1030         dmu_tx_mark_netfree(tx);
1031
1032         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
1033         if (error) {
1034                 zfs_dirent_unlock(dl);
1035                 if (error == ERESTART) {
1036                         waited = B_TRUE;
1037                         dmu_tx_wait(tx);
1038                         dmu_tx_abort(tx);
1039                         zrele(zp);
1040                         if (xzp)
1041                                 zrele(xzp);
1042                         goto top;
1043                 }
1044                 if (realnmp)
1045                         pn_free(realnmp);
1046                 dmu_tx_abort(tx);
1047                 zrele(zp);
1048                 if (xzp)
1049                         zrele(xzp);
1050                 ZFS_EXIT(zfsvfs);
1051                 return (error);
1052         }
1053
1054         /*
1055          * Remove the directory entry.
1056          */
1057         error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
1058
1059         if (error) {
1060                 dmu_tx_commit(tx);
1061                 goto out;
1062         }
1063
1064         if (unlinked) {
1065                 /*
1066                  * Hold z_lock so that we can make sure that the ACL obj
1067                  * hasn't changed.  Could have been deleted due to
1068                  * zfs_sa_upgrade().
1069                  */
1070                 mutex_enter(&zp->z_lock);
1071                 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1072                     &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
1073                 delete_now = may_delete_now && !toobig &&
1074                     atomic_read(&ZTOI(zp)->i_count) == 1 &&
1075                     !(zp->z_is_mapped) && xattr_obj == xattr_obj_unlinked &&
1076                     zfs_external_acl(zp) == acl_obj;
1077         }
1078
1079         if (delete_now) {
1080                 if (xattr_obj_unlinked) {
1081                         ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2);
1082                         mutex_enter(&xzp->z_lock);
1083                         xzp->z_unlinked = B_TRUE;
1084                         clear_nlink(ZTOI(xzp));
1085                         links = 0;
1086                         error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
1087                             &links, sizeof (links), tx);
1088                         ASSERT3U(error,  ==,  0);
1089                         mutex_exit(&xzp->z_lock);
1090                         zfs_unlinked_add(xzp, tx);
1091
1092                         if (zp->z_is_sa)
1093                                 error = sa_remove(zp->z_sa_hdl,
1094                                     SA_ZPL_XATTR(zfsvfs), tx);
1095                         else
1096                                 error = sa_update(zp->z_sa_hdl,
1097                                     SA_ZPL_XATTR(zfsvfs), &null_xattr,
1098                                     sizeof (uint64_t), tx);
1099                         ASSERT0(error);
1100                 }
1101                 /*
1102                  * Add to the unlinked set because a new reference could be
1103                  * taken concurrently resulting in a deferred destruction.
1104                  */
1105                 zfs_unlinked_add(zp, tx);
1106                 mutex_exit(&zp->z_lock);
1107         } else if (unlinked) {
1108                 mutex_exit(&zp->z_lock);
1109                 zfs_unlinked_add(zp, tx);
1110         }
1111
1112         txtype = TX_REMOVE;
1113         if (flags & FIGNORECASE)
1114                 txtype |= TX_CI;
1115         zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked);
1116
1117         dmu_tx_commit(tx);
1118 out:
1119         if (realnmp)
1120                 pn_free(realnmp);
1121
1122         zfs_dirent_unlock(dl);
1123         zfs_znode_update_vfs(dzp);
1124         zfs_znode_update_vfs(zp);
1125
1126         if (delete_now)
1127                 zrele(zp);
1128         else
1129                 zfs_zrele_async(zp);
1130
1131         if (xzp) {
1132                 zfs_znode_update_vfs(xzp);
1133                 zfs_zrele_async(xzp);
1134         }
1135
1136         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1137                 zil_commit(zilog, 0);
1138
1139         ZFS_EXIT(zfsvfs);
1140         return (error);
1141 }
1142
1143 /*
1144  * Create a new directory and insert it into dzp using the name
1145  * provided.  Return a pointer to the inserted directory.
1146  *
1147  *      IN:     dzp     - znode of directory to add subdir to.
1148  *              dirname - name of new directory.
1149  *              vap     - attributes of new directory.
1150  *              cr      - credentials of caller.
1151  *              flags   - case flags.
1152  *              vsecp   - ACL to be set
1153  *
1154  *      OUT:    zpp     - znode of created directory.
1155  *
1156  *      RETURN: 0 if success
1157  *              error code if failure
1158  *
1159  * Timestamps:
1160  *      dzp - ctime|mtime updated
1161  *      zpp - ctime|mtime|atime updated
1162  */
1163 /*ARGSUSED*/
1164 int
1165 zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp,
1166     cred_t *cr, int flags, vsecattr_t *vsecp)
1167 {
1168         znode_t         *zp;
1169         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
1170         zilog_t         *zilog;
1171         zfs_dirlock_t   *dl;
1172         uint64_t        txtype;
1173         dmu_tx_t        *tx;
1174         int             error;
1175         int             zf = ZNEW;
1176         uid_t           uid;
1177         gid_t           gid = crgetgid(cr);
1178         zfs_acl_ids_t   acl_ids;
1179         boolean_t       fuid_dirtied;
1180         boolean_t       waited = B_FALSE;
1181
1182         ASSERT(S_ISDIR(vap->va_mode));
1183
1184         /*
1185          * If we have an ephemeral id, ACL, or XVATTR then
1186          * make sure file system is at proper version
1187          */
1188
1189         uid = crgetuid(cr);
1190         if (zfsvfs->z_use_fuids == B_FALSE &&
1191             (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1192                 return (SET_ERROR(EINVAL));
1193
1194         if (dirname == NULL)
1195                 return (SET_ERROR(EINVAL));
1196
1197         ZFS_ENTER(zfsvfs);
1198         ZFS_VERIFY_ZP(dzp);
1199         zilog = zfsvfs->z_log;
1200
1201         if (dzp->z_pflags & ZFS_XATTR) {
1202                 ZFS_EXIT(zfsvfs);
1203                 return (SET_ERROR(EINVAL));
1204         }
1205
1206         if (zfsvfs->z_utf8 && u8_validate(dirname,
1207             strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1208                 ZFS_EXIT(zfsvfs);
1209                 return (SET_ERROR(EILSEQ));
1210         }
1211         if (flags & FIGNORECASE)
1212                 zf |= ZCILOOK;
1213
1214         if (vap->va_mask & ATTR_XVATTR) {
1215                 if ((error = secpolicy_xvattr((xvattr_t *)vap,
1216                     crgetuid(cr), cr, vap->va_mode)) != 0) {
1217                         ZFS_EXIT(zfsvfs);
1218                         return (error);
1219                 }
1220         }
1221
1222         if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
1223             vsecp, &acl_ids)) != 0) {
1224                 ZFS_EXIT(zfsvfs);
1225                 return (error);
1226         }
1227         /*
1228          * First make sure the new directory doesn't exist.
1229          *
1230          * Existence is checked first to make sure we don't return
1231          * EACCES instead of EEXIST which can cause some applications
1232          * to fail.
1233          */
1234 top:
1235         *zpp = NULL;
1236
1237         if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
1238             NULL, NULL))) {
1239                 zfs_acl_ids_free(&acl_ids);
1240                 ZFS_EXIT(zfsvfs);
1241                 return (error);
1242         }
1243
1244         if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) {
1245                 zfs_acl_ids_free(&acl_ids);
1246                 zfs_dirent_unlock(dl);
1247                 ZFS_EXIT(zfsvfs);
1248                 return (error);
1249         }
1250
1251         if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) {
1252                 zfs_acl_ids_free(&acl_ids);
1253                 zfs_dirent_unlock(dl);
1254                 ZFS_EXIT(zfsvfs);
1255                 return (SET_ERROR(EDQUOT));
1256         }
1257
1258         /*
1259          * Add a new entry to the directory.
1260          */
1261         tx = dmu_tx_create(zfsvfs->z_os);
1262         dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
1263         dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
1264         fuid_dirtied = zfsvfs->z_fuid_dirty;
1265         if (fuid_dirtied)
1266                 zfs_fuid_txhold(zfsvfs, tx);
1267         if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1268                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
1269                     acl_ids.z_aclp->z_acl_bytes);
1270         }
1271
1272         dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1273             ZFS_SA_BASE_ATTR_SIZE);
1274
1275         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
1276         if (error) {
1277                 zfs_dirent_unlock(dl);
1278                 if (error == ERESTART) {
1279                         waited = B_TRUE;
1280                         dmu_tx_wait(tx);
1281                         dmu_tx_abort(tx);
1282                         goto top;
1283                 }
1284                 zfs_acl_ids_free(&acl_ids);
1285                 dmu_tx_abort(tx);
1286                 ZFS_EXIT(zfsvfs);
1287                 return (error);
1288         }
1289
1290         /*
1291          * Create new node.
1292          */
1293         zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1294
1295         /*
1296          * Now put new name in parent dir.
1297          */
1298         error = zfs_link_create(dl, zp, tx, ZNEW);
1299         if (error != 0) {
1300                 zfs_znode_delete(zp, tx);
1301                 remove_inode_hash(ZTOI(zp));
1302                 goto out;
1303         }
1304
1305         if (fuid_dirtied)
1306                 zfs_fuid_sync(zfsvfs, tx);
1307
1308         *zpp = zp;
1309
1310         txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
1311         if (flags & FIGNORECASE)
1312                 txtype |= TX_CI;
1313         zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
1314             acl_ids.z_fuidp, vap);
1315
1316 out:
1317         zfs_acl_ids_free(&acl_ids);
1318
1319         dmu_tx_commit(tx);
1320
1321         zfs_dirent_unlock(dl);
1322
1323         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1324                 zil_commit(zilog, 0);
1325
1326         if (error != 0) {
1327                 zrele(zp);
1328         } else {
1329                 zfs_znode_update_vfs(dzp);
1330                 zfs_znode_update_vfs(zp);
1331         }
1332         ZFS_EXIT(zfsvfs);
1333         return (error);
1334 }
1335
1336 /*
1337  * Remove a directory subdir entry.  If the current working
1338  * directory is the same as the subdir to be removed, the
1339  * remove will fail.
1340  *
1341  *      IN:     dzp     - znode of directory to remove from.
1342  *              name    - name of directory to be removed.
1343  *              cwd     - inode of current working directory.
1344  *              cr      - credentials of caller.
1345  *              flags   - case flags
1346  *
1347  *      RETURN: 0 on success, error code on failure.
1348  *
1349  * Timestamps:
1350  *      dzp - ctime|mtime updated
1351  */
1352 /*ARGSUSED*/
1353 int
1354 zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr,
1355     int flags)
1356 {
1357         znode_t         *zp;
1358         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
1359         zilog_t         *zilog;
1360         zfs_dirlock_t   *dl;
1361         dmu_tx_t        *tx;
1362         int             error;
1363         int             zflg = ZEXISTS;
1364         boolean_t       waited = B_FALSE;
1365
1366         if (name == NULL)
1367                 return (SET_ERROR(EINVAL));
1368
1369         ZFS_ENTER(zfsvfs);
1370         ZFS_VERIFY_ZP(dzp);
1371         zilog = zfsvfs->z_log;
1372
1373         if (flags & FIGNORECASE)
1374                 zflg |= ZCILOOK;
1375 top:
1376         zp = NULL;
1377
1378         /*
1379          * Attempt to lock directory; fail if entry doesn't exist.
1380          */
1381         if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1382             NULL, NULL))) {
1383                 ZFS_EXIT(zfsvfs);
1384                 return (error);
1385         }
1386
1387         if ((error = zfs_zaccess_delete(dzp, zp, cr))) {
1388                 goto out;
1389         }
1390
1391         if (!S_ISDIR(ZTOI(zp)->i_mode)) {
1392                 error = SET_ERROR(ENOTDIR);
1393                 goto out;
1394         }
1395
1396         if (zp == cwd) {
1397                 error = SET_ERROR(EINVAL);
1398                 goto out;
1399         }
1400
1401         /*
1402          * Grab a lock on the directory to make sure that no one is
1403          * trying to add (or lookup) entries while we are removing it.
1404          */
1405         rw_enter(&zp->z_name_lock, RW_WRITER);
1406
1407         /*
1408          * Grab a lock on the parent pointer to make sure we play well
1409          * with the treewalk and directory rename code.
1410          */
1411         rw_enter(&zp->z_parent_lock, RW_WRITER);
1412
1413         tx = dmu_tx_create(zfsvfs->z_os);
1414         dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1415         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1416         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1417         zfs_sa_upgrade_txholds(tx, zp);
1418         zfs_sa_upgrade_txholds(tx, dzp);
1419         dmu_tx_mark_netfree(tx);
1420         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
1421         if (error) {
1422                 rw_exit(&zp->z_parent_lock);
1423                 rw_exit(&zp->z_name_lock);
1424                 zfs_dirent_unlock(dl);
1425                 if (error == ERESTART) {
1426                         waited = B_TRUE;
1427                         dmu_tx_wait(tx);
1428                         dmu_tx_abort(tx);
1429                         zrele(zp);
1430                         goto top;
1431                 }
1432                 dmu_tx_abort(tx);
1433                 zrele(zp);
1434                 ZFS_EXIT(zfsvfs);
1435                 return (error);
1436         }
1437
1438         error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
1439
1440         if (error == 0) {
1441                 uint64_t txtype = TX_RMDIR;
1442                 if (flags & FIGNORECASE)
1443                         txtype |= TX_CI;
1444                 zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT,
1445                     B_FALSE);
1446         }
1447
1448         dmu_tx_commit(tx);
1449
1450         rw_exit(&zp->z_parent_lock);
1451         rw_exit(&zp->z_name_lock);
1452 out:
1453         zfs_dirent_unlock(dl);
1454
1455         zfs_znode_update_vfs(dzp);
1456         zfs_znode_update_vfs(zp);
1457         zrele(zp);
1458
1459         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1460                 zil_commit(zilog, 0);
1461
1462         ZFS_EXIT(zfsvfs);
1463         return (error);
1464 }
1465
1466 /*
1467  * Read directory entries from the given directory cursor position and emit
1468  * name and position for each entry.
1469  *
1470  *      IN:     ip      - inode of directory to read.
1471  *              ctx     - directory entry context.
1472  *              cr      - credentials of caller.
1473  *
1474  *      RETURN: 0 if success
1475  *              error code if failure
1476  *
1477  * Timestamps:
1478  *      ip - atime updated
1479  *
1480  * Note that the low 4 bits of the cookie returned by zap is always zero.
1481  * This allows us to use the low range for "special" directory entries:
1482  * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
1483  * we use the offset 2 for the '.zfs' directory.
1484  */
1485 /* ARGSUSED */
1486 int
1487 zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr)
1488 {
1489         znode_t         *zp = ITOZ(ip);
1490         zfsvfs_t        *zfsvfs = ITOZSB(ip);
1491         objset_t        *os;
1492         zap_cursor_t    zc;
1493         zap_attribute_t zap;
1494         int             error;
1495         uint8_t         prefetch;
1496         uint8_t         type;
1497         int             done = 0;
1498         uint64_t        parent;
1499         uint64_t        offset; /* must be unsigned; checks for < 1 */
1500
1501         ZFS_ENTER(zfsvfs);
1502         ZFS_VERIFY_ZP(zp);
1503
1504         if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
1505             &parent, sizeof (parent))) != 0)
1506                 goto out;
1507
1508         /*
1509          * Quit if directory has been removed (posix)
1510          */
1511         if (zp->z_unlinked)
1512                 goto out;
1513
1514         error = 0;
1515         os = zfsvfs->z_os;
1516         offset = ctx->pos;
1517         prefetch = zp->z_zn_prefetch;
1518
1519         /*
1520          * Initialize the iterator cursor.
1521          */
1522         if (offset <= 3) {
1523                 /*
1524                  * Start iteration from the beginning of the directory.
1525                  */
1526                 zap_cursor_init(&zc, os, zp->z_id);
1527         } else {
1528                 /*
1529                  * The offset is a serialized cursor.
1530                  */
1531                 zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
1532         }
1533
1534         /*
1535          * Transform to file-system independent format
1536          */
1537         while (!done) {
1538                 uint64_t objnum;
1539                 /*
1540                  * Special case `.', `..', and `.zfs'.
1541                  */
1542                 if (offset == 0) {
1543                         (void) strcpy(zap.za_name, ".");
1544                         zap.za_normalization_conflict = 0;
1545                         objnum = zp->z_id;
1546                         type = DT_DIR;
1547                 } else if (offset == 1) {
1548                         (void) strcpy(zap.za_name, "..");
1549                         zap.za_normalization_conflict = 0;
1550                         objnum = parent;
1551                         type = DT_DIR;
1552                 } else if (offset == 2 && zfs_show_ctldir(zp)) {
1553                         (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
1554                         zap.za_normalization_conflict = 0;
1555                         objnum = ZFSCTL_INO_ROOT;
1556                         type = DT_DIR;
1557                 } else {
1558                         /*
1559                          * Grab next entry.
1560                          */
1561                         if ((error = zap_cursor_retrieve(&zc, &zap))) {
1562                                 if (error == ENOENT)
1563                                         break;
1564                                 else
1565                                         goto update;
1566                         }
1567
1568                         /*
1569                          * Allow multiple entries provided the first entry is
1570                          * the object id.  Non-zpl consumers may safely make
1571                          * use of the additional space.
1572                          *
1573                          * XXX: This should be a feature flag for compatibility
1574                          */
1575                         if (zap.za_integer_length != 8 ||
1576                             zap.za_num_integers == 0) {
1577                                 cmn_err(CE_WARN, "zap_readdir: bad directory "
1578                                     "entry, obj = %lld, offset = %lld, "
1579                                     "length = %d, num = %lld\n",
1580                                     (u_longlong_t)zp->z_id,
1581                                     (u_longlong_t)offset,
1582                                     zap.za_integer_length,
1583                                     (u_longlong_t)zap.za_num_integers);
1584                                 error = SET_ERROR(ENXIO);
1585                                 goto update;
1586                         }
1587
1588                         objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
1589                         type = ZFS_DIRENT_TYPE(zap.za_first_integer);
1590                 }
1591
1592                 done = !zpl_dir_emit(ctx, zap.za_name, strlen(zap.za_name),
1593                     objnum, type);
1594                 if (done)
1595                         break;
1596
1597                 /* Prefetch znode */
1598                 if (prefetch) {
1599                         dmu_prefetch(os, objnum, 0, 0, 0,
1600                             ZIO_PRIORITY_SYNC_READ);
1601                 }
1602
1603                 /*
1604                  * Move to the next entry, fill in the previous offset.
1605                  */
1606                 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
1607                         zap_cursor_advance(&zc);
1608                         offset = zap_cursor_serialize(&zc);
1609                 } else {
1610                         offset += 1;
1611                 }
1612                 ctx->pos = offset;
1613         }
1614         zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
1615
1616 update:
1617         zap_cursor_fini(&zc);
1618         if (error == ENOENT)
1619                 error = 0;
1620 out:
1621         ZFS_EXIT(zfsvfs);
1622
1623         return (error);
1624 }
1625
1626 /*
1627  * Get the basic file attributes and place them in the provided kstat
1628  * structure.  The inode is assumed to be the authoritative source
1629  * for most of the attributes.  However, the znode currently has the
1630  * authoritative atime, blksize, and block count.
1631  *
1632  *      IN:     ip      - inode of file.
1633  *
1634  *      OUT:    sp      - kstat values.
1635  *
1636  *      RETURN: 0 (always succeeds)
1637  */
1638 /* ARGSUSED */
1639 int
1640 zfs_getattr_fast(struct user_namespace *user_ns, struct inode *ip,
1641     struct kstat *sp)
1642 {
1643         znode_t *zp = ITOZ(ip);
1644         zfsvfs_t *zfsvfs = ITOZSB(ip);
1645         uint32_t blksize;
1646         u_longlong_t nblocks;
1647
1648         ZFS_ENTER(zfsvfs);
1649         ZFS_VERIFY_ZP(zp);
1650
1651         mutex_enter(&zp->z_lock);
1652
1653         zpl_generic_fillattr(user_ns, ip, sp);
1654         /*
1655          * +1 link count for root inode with visible '.zfs' directory.
1656          */
1657         if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp))
1658                 if (sp->nlink < ZFS_LINK_MAX)
1659                         sp->nlink++;
1660
1661         sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
1662         sp->blksize = blksize;
1663         sp->blocks = nblocks;
1664
1665         if (unlikely(zp->z_blksz == 0)) {
1666                 /*
1667                  * Block size hasn't been set; suggest maximal I/O transfers.
1668                  */
1669                 sp->blksize = zfsvfs->z_max_blksz;
1670         }
1671
1672         mutex_exit(&zp->z_lock);
1673
1674         /*
1675          * Required to prevent NFS client from detecting different inode
1676          * numbers of snapshot root dentry before and after snapshot mount.
1677          */
1678         if (zfsvfs->z_issnap) {
1679                 if (ip->i_sb->s_root->d_inode == ip)
1680                         sp->ino = ZFSCTL_INO_SNAPDIRS -
1681                             dmu_objset_id(zfsvfs->z_os);
1682         }
1683
1684         ZFS_EXIT(zfsvfs);
1685
1686         return (0);
1687 }
1688
1689 /*
1690  * For the operation of changing file's user/group/project, we need to
1691  * handle not only the main object that is assigned to the file directly,
1692  * but also the ones that are used by the file via hidden xattr directory.
1693  *
1694  * Because the xattr directory may contains many EA entries, as to it may
1695  * be impossible to change all of them via the transaction of changing the
1696  * main object's user/group/project attributes. Then we have to change them
1697  * via other multiple independent transactions one by one. It may be not good
1698  * solution, but we have no better idea yet.
1699  */
1700 static int
1701 zfs_setattr_dir(znode_t *dzp)
1702 {
1703         struct inode    *dxip = ZTOI(dzp);
1704         struct inode    *xip = NULL;
1705         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
1706         objset_t        *os = zfsvfs->z_os;
1707         zap_cursor_t    zc;
1708         zap_attribute_t zap;
1709         zfs_dirlock_t   *dl;
1710         znode_t         *zp = NULL;
1711         dmu_tx_t        *tx = NULL;
1712         uint64_t        uid, gid;
1713         sa_bulk_attr_t  bulk[4];
1714         int             count;
1715         int             err;
1716
1717         zap_cursor_init(&zc, os, dzp->z_id);
1718         while ((err = zap_cursor_retrieve(&zc, &zap)) == 0) {
1719                 count = 0;
1720                 if (zap.za_integer_length != 8 || zap.za_num_integers != 1) {
1721                         err = ENXIO;
1722                         break;
1723                 }
1724
1725                 err = zfs_dirent_lock(&dl, dzp, (char *)zap.za_name, &zp,
1726                     ZEXISTS, NULL, NULL);
1727                 if (err == ENOENT)
1728                         goto next;
1729                 if (err)
1730                         break;
1731
1732                 xip = ZTOI(zp);
1733                 if (KUID_TO_SUID(xip->i_uid) == KUID_TO_SUID(dxip->i_uid) &&
1734                     KGID_TO_SGID(xip->i_gid) == KGID_TO_SGID(dxip->i_gid) &&
1735                     zp->z_projid == dzp->z_projid)
1736                         goto next;
1737
1738                 tx = dmu_tx_create(os);
1739                 if (!(zp->z_pflags & ZFS_PROJID))
1740                         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1741                 else
1742                         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1743
1744                 err = dmu_tx_assign(tx, TXG_WAIT);
1745                 if (err)
1746                         break;
1747
1748                 mutex_enter(&dzp->z_lock);
1749
1750                 if (KUID_TO_SUID(xip->i_uid) != KUID_TO_SUID(dxip->i_uid)) {
1751                         xip->i_uid = dxip->i_uid;
1752                         uid = zfs_uid_read(dxip);
1753                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
1754                             &uid, sizeof (uid));
1755                 }
1756
1757                 if (KGID_TO_SGID(xip->i_gid) != KGID_TO_SGID(dxip->i_gid)) {
1758                         xip->i_gid = dxip->i_gid;
1759                         gid = zfs_gid_read(dxip);
1760                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
1761                             &gid, sizeof (gid));
1762                 }
1763
1764                 if (zp->z_projid != dzp->z_projid) {
1765                         if (!(zp->z_pflags & ZFS_PROJID)) {
1766                                 zp->z_pflags |= ZFS_PROJID;
1767                                 SA_ADD_BULK_ATTR(bulk, count,
1768                                     SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags,
1769                                     sizeof (zp->z_pflags));
1770                         }
1771
1772                         zp->z_projid = dzp->z_projid;
1773                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs),
1774                             NULL, &zp->z_projid, sizeof (zp->z_projid));
1775                 }
1776
1777                 mutex_exit(&dzp->z_lock);
1778
1779                 if (likely(count > 0)) {
1780                         err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1781                         dmu_tx_commit(tx);
1782                 } else {
1783                         dmu_tx_abort(tx);
1784                 }
1785                 tx = NULL;
1786                 if (err != 0 && err != ENOENT)
1787                         break;
1788
1789 next:
1790                 if (zp) {
1791                         zrele(zp);
1792                         zp = NULL;
1793                         zfs_dirent_unlock(dl);
1794                 }
1795                 zap_cursor_advance(&zc);
1796         }
1797
1798         if (tx)
1799                 dmu_tx_abort(tx);
1800         if (zp) {
1801                 zrele(zp);
1802                 zfs_dirent_unlock(dl);
1803         }
1804         zap_cursor_fini(&zc);
1805
1806         return (err == ENOENT ? 0 : err);
1807 }
1808
1809 /*
1810  * Set the file attributes to the values contained in the
1811  * vattr structure.
1812  *
1813  *      IN:     zp      - znode of file to be modified.
1814  *              vap     - new attribute values.
1815  *                        If ATTR_XVATTR set, then optional attrs are being set
1816  *              flags   - ATTR_UTIME set if non-default time values provided.
1817  *                      - ATTR_NOACLCHECK (CIFS context only).
1818  *              cr      - credentials of caller.
1819  *
1820  *      RETURN: 0 if success
1821  *              error code if failure
1822  *
1823  * Timestamps:
1824  *      ip - ctime updated, mtime updated if size changed.
1825  */
1826 /* ARGSUSED */
1827 int
1828 zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr)
1829 {
1830         struct inode    *ip;
1831         zfsvfs_t        *zfsvfs = ZTOZSB(zp);
1832         objset_t        *os = zfsvfs->z_os;
1833         zilog_t         *zilog;
1834         dmu_tx_t        *tx;
1835         vattr_t         oldva;
1836         xvattr_t        *tmpxvattr;
1837         uint_t          mask = vap->va_mask;
1838         uint_t          saved_mask = 0;
1839         int             trim_mask = 0;
1840         uint64_t        new_mode;
1841         uint64_t        new_kuid = 0, new_kgid = 0, new_uid, new_gid;
1842         uint64_t        xattr_obj;
1843         uint64_t        mtime[2], ctime[2], atime[2];
1844         uint64_t        projid = ZFS_INVALID_PROJID;
1845         znode_t         *attrzp;
1846         int             need_policy = FALSE;
1847         int             err, err2 = 0;
1848         zfs_fuid_info_t *fuidp = NULL;
1849         xvattr_t *xvap = (xvattr_t *)vap;       /* vap may be an xvattr_t * */
1850         xoptattr_t      *xoap;
1851         zfs_acl_t       *aclp;
1852         boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
1853         boolean_t       fuid_dirtied = B_FALSE;
1854         boolean_t       handle_eadir = B_FALSE;
1855         sa_bulk_attr_t  *bulk, *xattr_bulk;
1856         int             count = 0, xattr_count = 0, bulks = 8;
1857
1858         if (mask == 0)
1859                 return (0);
1860
1861         ZFS_ENTER(zfsvfs);
1862         ZFS_VERIFY_ZP(zp);
1863         ip = ZTOI(zp);
1864
1865         /*
1866          * If this is a xvattr_t, then get a pointer to the structure of
1867          * optional attributes.  If this is NULL, then we have a vattr_t.
1868          */
1869         xoap = xva_getxoptattr(xvap);
1870         if (xoap != NULL && (mask & ATTR_XVATTR)) {
1871                 if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
1872                         if (!dmu_objset_projectquota_enabled(os) ||
1873                             (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) {
1874                                 ZFS_EXIT(zfsvfs);
1875                                 return (SET_ERROR(ENOTSUP));
1876                         }
1877
1878                         projid = xoap->xoa_projid;
1879                         if (unlikely(projid == ZFS_INVALID_PROJID)) {
1880                                 ZFS_EXIT(zfsvfs);
1881                                 return (SET_ERROR(EINVAL));
1882                         }
1883
1884                         if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID)
1885                                 projid = ZFS_INVALID_PROJID;
1886                         else
1887                                 need_policy = TRUE;
1888                 }
1889
1890                 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) &&
1891                     (xoap->xoa_projinherit !=
1892                     ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) &&
1893                     (!dmu_objset_projectquota_enabled(os) ||
1894                     (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) {
1895                         ZFS_EXIT(zfsvfs);
1896                         return (SET_ERROR(ENOTSUP));
1897                 }
1898         }
1899
1900         zilog = zfsvfs->z_log;
1901
1902         /*
1903          * Make sure that if we have ephemeral uid/gid or xvattr specified
1904          * that file system is at proper version level
1905          */
1906
1907         if (zfsvfs->z_use_fuids == B_FALSE &&
1908             (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) ||
1909             ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) ||
1910             (mask & ATTR_XVATTR))) {
1911                 ZFS_EXIT(zfsvfs);
1912                 return (SET_ERROR(EINVAL));
1913         }
1914
1915         if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) {
1916                 ZFS_EXIT(zfsvfs);
1917                 return (SET_ERROR(EISDIR));
1918         }
1919
1920         if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) {
1921                 ZFS_EXIT(zfsvfs);
1922                 return (SET_ERROR(EINVAL));
1923         }
1924
1925         tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP);
1926         xva_init(tmpxvattr);
1927
1928         bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
1929         xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
1930
1931         /*
1932          * Immutable files can only alter immutable bit and atime
1933          */
1934         if ((zp->z_pflags & ZFS_IMMUTABLE) &&
1935             ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) ||
1936             ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
1937                 err = SET_ERROR(EPERM);
1938                 goto out3;
1939         }
1940
1941         if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
1942                 err = SET_ERROR(EPERM);
1943                 goto out3;
1944         }
1945
1946         /*
1947          * Verify timestamps doesn't overflow 32 bits.
1948          * ZFS can handle large timestamps, but 32bit syscalls can't
1949          * handle times greater than 2039.  This check should be removed
1950          * once large timestamps are fully supported.
1951          */
1952         if (mask & (ATTR_ATIME | ATTR_MTIME)) {
1953                 if (((mask & ATTR_ATIME) &&
1954                     TIMESPEC_OVERFLOW(&vap->va_atime)) ||
1955                     ((mask & ATTR_MTIME) &&
1956                     TIMESPEC_OVERFLOW(&vap->va_mtime))) {
1957                         err = SET_ERROR(EOVERFLOW);
1958                         goto out3;
1959                 }
1960         }
1961
1962 top:
1963         attrzp = NULL;
1964         aclp = NULL;
1965
1966         /* Can this be moved to before the top label? */
1967         if (zfs_is_readonly(zfsvfs)) {
1968                 err = SET_ERROR(EROFS);
1969                 goto out3;
1970         }
1971
1972         /*
1973          * First validate permissions
1974          */
1975
1976         if (mask & ATTR_SIZE) {
1977                 err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr);
1978                 if (err)
1979                         goto out3;
1980
1981                 /*
1982                  * XXX - Note, we are not providing any open
1983                  * mode flags here (like FNDELAY), so we may
1984                  * block if there are locks present... this
1985                  * should be addressed in openat().
1986                  */
1987                 /* XXX - would it be OK to generate a log record here? */
1988                 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
1989                 if (err)
1990                         goto out3;
1991         }
1992
1993         if (mask & (ATTR_ATIME|ATTR_MTIME) ||
1994             ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
1995             XVA_ISSET_REQ(xvap, XAT_READONLY) ||
1996             XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
1997             XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
1998             XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
1999             XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
2000             XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
2001                 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
2002                     skipaclchk, cr);
2003         }
2004
2005         if (mask & (ATTR_UID|ATTR_GID)) {
2006                 int     idmask = (mask & (ATTR_UID|ATTR_GID));
2007                 int     take_owner;
2008                 int     take_group;
2009
2010                 /*
2011                  * NOTE: even if a new mode is being set,
2012                  * we may clear S_ISUID/S_ISGID bits.
2013                  */
2014
2015                 if (!(mask & ATTR_MODE))
2016                         vap->va_mode = zp->z_mode;
2017
2018                 /*
2019                  * Take ownership or chgrp to group we are a member of
2020                  */
2021
2022                 take_owner = (mask & ATTR_UID) && (vap->va_uid == crgetuid(cr));
2023                 take_group = (mask & ATTR_GID) &&
2024                     zfs_groupmember(zfsvfs, vap->va_gid, cr);
2025
2026                 /*
2027                  * If both ATTR_UID and ATTR_GID are set then take_owner and
2028                  * take_group must both be set in order to allow taking
2029                  * ownership.
2030                  *
2031                  * Otherwise, send the check through secpolicy_vnode_setattr()
2032                  *
2033                  */
2034
2035                 if (((idmask == (ATTR_UID|ATTR_GID)) &&
2036                     take_owner && take_group) ||
2037                     ((idmask == ATTR_UID) && take_owner) ||
2038                     ((idmask == ATTR_GID) && take_group)) {
2039                         if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
2040                             skipaclchk, cr) == 0) {
2041                                 /*
2042                                  * Remove setuid/setgid for non-privileged users
2043                                  */
2044                                 (void) secpolicy_setid_clear(vap, cr);
2045                                 trim_mask = (mask & (ATTR_UID|ATTR_GID));
2046                         } else {
2047                                 need_policy =  TRUE;
2048                         }
2049                 } else {
2050                         need_policy =  TRUE;
2051                 }
2052         }
2053
2054         mutex_enter(&zp->z_lock);
2055         oldva.va_mode = zp->z_mode;
2056         zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
2057         if (mask & ATTR_XVATTR) {
2058                 /*
2059                  * Update xvattr mask to include only those attributes
2060                  * that are actually changing.
2061                  *
2062                  * the bits will be restored prior to actually setting
2063                  * the attributes so the caller thinks they were set.
2064                  */
2065                 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2066                         if (xoap->xoa_appendonly !=
2067                             ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
2068                                 need_policy = TRUE;
2069                         } else {
2070                                 XVA_CLR_REQ(xvap, XAT_APPENDONLY);
2071                                 XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY);
2072                         }
2073                 }
2074
2075                 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
2076                         if (xoap->xoa_projinherit !=
2077                             ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) {
2078                                 need_policy = TRUE;
2079                         } else {
2080                                 XVA_CLR_REQ(xvap, XAT_PROJINHERIT);
2081                                 XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT);
2082                         }
2083                 }
2084
2085                 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2086                         if (xoap->xoa_nounlink !=
2087                             ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
2088                                 need_policy = TRUE;
2089                         } else {
2090                                 XVA_CLR_REQ(xvap, XAT_NOUNLINK);
2091                                 XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK);
2092                         }
2093                 }
2094
2095                 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2096                         if (xoap->xoa_immutable !=
2097                             ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
2098                                 need_policy = TRUE;
2099                         } else {
2100                                 XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
2101                                 XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE);
2102                         }
2103                 }
2104
2105                 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2106                         if (xoap->xoa_nodump !=
2107                             ((zp->z_pflags & ZFS_NODUMP) != 0)) {
2108                                 need_policy = TRUE;
2109                         } else {
2110                                 XVA_CLR_REQ(xvap, XAT_NODUMP);
2111                                 XVA_SET_REQ(tmpxvattr, XAT_NODUMP);
2112                         }
2113                 }
2114
2115                 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2116                         if (xoap->xoa_av_modified !=
2117                             ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
2118                                 need_policy = TRUE;
2119                         } else {
2120                                 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
2121                                 XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED);
2122                         }
2123                 }
2124
2125                 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2126                         if ((!S_ISREG(ip->i_mode) &&
2127                             xoap->xoa_av_quarantined) ||
2128                             xoap->xoa_av_quarantined !=
2129                             ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
2130                                 need_policy = TRUE;
2131                         } else {
2132                                 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
2133                                 XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED);
2134                         }
2135                 }
2136
2137                 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2138                         mutex_exit(&zp->z_lock);
2139                         err = SET_ERROR(EPERM);
2140                         goto out3;
2141                 }
2142
2143                 if (need_policy == FALSE &&
2144                     (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
2145                     XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
2146                         need_policy = TRUE;
2147                 }
2148         }
2149
2150         mutex_exit(&zp->z_lock);
2151
2152         if (mask & ATTR_MODE) {
2153                 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
2154                         err = secpolicy_setid_setsticky_clear(ip, vap,
2155                             &oldva, cr);
2156                         if (err)
2157                                 goto out3;
2158
2159                         trim_mask |= ATTR_MODE;
2160                 } else {
2161                         need_policy = TRUE;
2162                 }
2163         }
2164
2165         if (need_policy) {
2166                 /*
2167                  * If trim_mask is set then take ownership
2168                  * has been granted or write_acl is present and user
2169                  * has the ability to modify mode.  In that case remove
2170                  * UID|GID and or MODE from mask so that
2171                  * secpolicy_vnode_setattr() doesn't revoke it.
2172                  */
2173
2174                 if (trim_mask) {
2175                         saved_mask = vap->va_mask;
2176                         vap->va_mask &= ~trim_mask;
2177                 }
2178                 err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags,
2179                     (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
2180                 if (err)
2181                         goto out3;
2182
2183                 if (trim_mask)
2184                         vap->va_mask |= saved_mask;
2185         }
2186
2187         /*
2188          * secpolicy_vnode_setattr, or take ownership may have
2189          * changed va_mask
2190          */
2191         mask = vap->va_mask;
2192
2193         if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) {
2194                 handle_eadir = B_TRUE;
2195                 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
2196                     &xattr_obj, sizeof (xattr_obj));
2197
2198                 if (err == 0 && xattr_obj) {
2199                         err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp);
2200                         if (err)
2201                                 goto out2;
2202                 }
2203                 if (mask & ATTR_UID) {
2204                         new_kuid = zfs_fuid_create(zfsvfs,
2205                             (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
2206                         if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) &&
2207                             zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT,
2208                             new_kuid)) {
2209                                 if (attrzp)
2210                                         zrele(attrzp);
2211                                 err = SET_ERROR(EDQUOT);
2212                                 goto out2;
2213                         }
2214                 }
2215
2216                 if (mask & ATTR_GID) {
2217                         new_kgid = zfs_fuid_create(zfsvfs,
2218                             (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp);
2219                         if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) &&
2220                             zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT,
2221                             new_kgid)) {
2222                                 if (attrzp)
2223                                         zrele(attrzp);
2224                                 err = SET_ERROR(EDQUOT);
2225                                 goto out2;
2226                         }
2227                 }
2228
2229                 if (projid != ZFS_INVALID_PROJID &&
2230                     zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) {
2231                         if (attrzp)
2232                                 zrele(attrzp);
2233                         err = EDQUOT;
2234                         goto out2;
2235                 }
2236         }
2237         tx = dmu_tx_create(os);
2238
2239         if (mask & ATTR_MODE) {
2240                 uint64_t pmode = zp->z_mode;
2241                 uint64_t acl_obj;
2242                 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
2243
2244                 if (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_RESTRICTED &&
2245                     !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
2246                         err = EPERM;
2247                         goto out;
2248                 }
2249
2250                 if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)))
2251                         goto out;
2252
2253                 mutex_enter(&zp->z_lock);
2254                 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
2255                         /*
2256                          * Are we upgrading ACL from old V0 format
2257                          * to V1 format?
2258                          */
2259                         if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
2260                             zfs_znode_acl_version(zp) ==
2261                             ZFS_ACL_VERSION_INITIAL) {
2262                                 dmu_tx_hold_free(tx, acl_obj, 0,
2263                                     DMU_OBJECT_END);
2264                                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2265                                     0, aclp->z_acl_bytes);
2266                         } else {
2267                                 dmu_tx_hold_write(tx, acl_obj, 0,
2268                                     aclp->z_acl_bytes);
2269                         }
2270                 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2271                         dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2272                             0, aclp->z_acl_bytes);
2273                 }
2274                 mutex_exit(&zp->z_lock);
2275                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
2276         } else {
2277                 if (((mask & ATTR_XVATTR) &&
2278                     XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ||
2279                     (projid != ZFS_INVALID_PROJID &&
2280                     !(zp->z_pflags & ZFS_PROJID)))
2281                         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
2282                 else
2283                         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2284         }
2285
2286         if (attrzp) {
2287                 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
2288         }
2289
2290         fuid_dirtied = zfsvfs->z_fuid_dirty;
2291         if (fuid_dirtied)
2292                 zfs_fuid_txhold(zfsvfs, tx);
2293
2294         zfs_sa_upgrade_txholds(tx, zp);
2295
2296         err = dmu_tx_assign(tx, TXG_WAIT);
2297         if (err)
2298                 goto out;
2299
2300         count = 0;
2301         /*
2302          * Set each attribute requested.
2303          * We group settings according to the locks they need to acquire.
2304          *
2305          * Note: you cannot set ctime directly, although it will be
2306          * updated as a side-effect of calling this function.
2307          */
2308
2309         if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) {
2310                 /*
2311                  * For the existed object that is upgraded from old system,
2312                  * its on-disk layout has no slot for the project ID attribute.
2313                  * But quota accounting logic needs to access related slots by
2314                  * offset directly. So we need to adjust old objects' layout
2315                  * to make the project ID to some unified and fixed offset.
2316                  */
2317                 if (attrzp)
2318                         err = sa_add_projid(attrzp->z_sa_hdl, tx, projid);
2319                 if (err == 0)
2320                         err = sa_add_projid(zp->z_sa_hdl, tx, projid);
2321
2322                 if (unlikely(err == EEXIST))
2323                         err = 0;
2324                 else if (err != 0)
2325                         goto out;
2326                 else
2327                         projid = ZFS_INVALID_PROJID;
2328         }
2329
2330         if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2331                 mutex_enter(&zp->z_acl_lock);
2332         mutex_enter(&zp->z_lock);
2333
2334         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
2335             &zp->z_pflags, sizeof (zp->z_pflags));
2336
2337         if (attrzp) {
2338                 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2339                         mutex_enter(&attrzp->z_acl_lock);
2340                 mutex_enter(&attrzp->z_lock);
2341                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2342                     SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
2343                     sizeof (attrzp->z_pflags));
2344                 if (projid != ZFS_INVALID_PROJID) {
2345                         attrzp->z_projid = projid;
2346                         SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2347                             SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid,
2348                             sizeof (attrzp->z_projid));
2349                 }
2350         }
2351
2352         if (mask & (ATTR_UID|ATTR_GID)) {
2353
2354                 if (mask & ATTR_UID) {
2355                         ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid);
2356                         new_uid = zfs_uid_read(ZTOI(zp));
2357                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
2358                             &new_uid, sizeof (new_uid));
2359                         if (attrzp) {
2360                                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2361                                     SA_ZPL_UID(zfsvfs), NULL, &new_uid,
2362                                     sizeof (new_uid));
2363                                 ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid);
2364                         }
2365                 }
2366
2367                 if (mask & ATTR_GID) {
2368                         ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid);
2369                         new_gid = zfs_gid_read(ZTOI(zp));
2370                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
2371                             NULL, &new_gid, sizeof (new_gid));
2372                         if (attrzp) {
2373                                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2374                                     SA_ZPL_GID(zfsvfs), NULL, &new_gid,
2375                                     sizeof (new_gid));
2376                                 ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid);
2377                         }
2378                 }
2379                 if (!(mask & ATTR_MODE)) {
2380                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
2381                             NULL, &new_mode, sizeof (new_mode));
2382                         new_mode = zp->z_mode;
2383                 }
2384                 err = zfs_acl_chown_setattr(zp);
2385                 ASSERT(err == 0);
2386                 if (attrzp) {
2387                         err = zfs_acl_chown_setattr(attrzp);
2388                         ASSERT(err == 0);
2389                 }
2390         }
2391
2392         if (mask & ATTR_MODE) {
2393                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
2394                     &new_mode, sizeof (new_mode));
2395                 zp->z_mode = ZTOI(zp)->i_mode = new_mode;
2396                 ASSERT3P(aclp, !=, NULL);
2397                 err = zfs_aclset_common(zp, aclp, cr, tx);
2398                 ASSERT0(err);
2399                 if (zp->z_acl_cached)
2400                         zfs_acl_free(zp->z_acl_cached);
2401                 zp->z_acl_cached = aclp;
2402                 aclp = NULL;
2403         }
2404
2405         if ((mask & ATTR_ATIME) || zp->z_atime_dirty) {
2406                 zp->z_atime_dirty = B_FALSE;
2407                 ZFS_TIME_ENCODE(&ip->i_atime, atime);
2408                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
2409                     &atime, sizeof (atime));
2410         }
2411
2412         if (mask & (ATTR_MTIME | ATTR_SIZE)) {
2413                 ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
2414                 ZTOI(zp)->i_mtime = zpl_inode_timestamp_truncate(
2415                     vap->va_mtime, ZTOI(zp));
2416
2417                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
2418                     mtime, sizeof (mtime));
2419         }
2420
2421         if (mask & (ATTR_CTIME | ATTR_SIZE)) {
2422                 ZFS_TIME_ENCODE(&vap->va_ctime, ctime);
2423                 ZTOI(zp)->i_ctime = zpl_inode_timestamp_truncate(vap->va_ctime,
2424                     ZTOI(zp));
2425                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
2426                     ctime, sizeof (ctime));
2427         }
2428
2429         if (projid != ZFS_INVALID_PROJID) {
2430                 zp->z_projid = projid;
2431                 SA_ADD_BULK_ATTR(bulk, count,
2432                     SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
2433                     sizeof (zp->z_projid));
2434         }
2435
2436         if (attrzp && mask) {
2437                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2438                     SA_ZPL_CTIME(zfsvfs), NULL, &ctime,
2439                     sizeof (ctime));
2440         }
2441
2442         /*
2443          * Do this after setting timestamps to prevent timestamp
2444          * update from toggling bit
2445          */
2446
2447         if (xoap && (mask & ATTR_XVATTR)) {
2448
2449                 /*
2450                  * restore trimmed off masks
2451                  * so that return masks can be set for caller.
2452                  */
2453
2454                 if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) {
2455                         XVA_SET_REQ(xvap, XAT_APPENDONLY);
2456                 }
2457                 if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) {
2458                         XVA_SET_REQ(xvap, XAT_NOUNLINK);
2459                 }
2460                 if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) {
2461                         XVA_SET_REQ(xvap, XAT_IMMUTABLE);
2462                 }
2463                 if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) {
2464                         XVA_SET_REQ(xvap, XAT_NODUMP);
2465                 }
2466                 if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) {
2467                         XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
2468                 }
2469                 if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) {
2470                         XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
2471                 }
2472                 if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) {
2473                         XVA_SET_REQ(xvap, XAT_PROJINHERIT);
2474                 }
2475
2476                 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
2477                         ASSERT(S_ISREG(ip->i_mode));
2478
2479                 zfs_xvattr_set(zp, xvap, tx);
2480         }
2481
2482         if (fuid_dirtied)
2483                 zfs_fuid_sync(zfsvfs, tx);
2484
2485         if (mask != 0)
2486                 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
2487
2488         mutex_exit(&zp->z_lock);
2489         if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2490                 mutex_exit(&zp->z_acl_lock);
2491
2492         if (attrzp) {
2493                 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2494                         mutex_exit(&attrzp->z_acl_lock);
2495                 mutex_exit(&attrzp->z_lock);
2496         }
2497 out:
2498         if (err == 0 && xattr_count > 0) {
2499                 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
2500                     xattr_count, tx);
2501                 ASSERT(err2 == 0);
2502         }
2503
2504         if (aclp)
2505                 zfs_acl_free(aclp);
2506
2507         if (fuidp) {
2508                 zfs_fuid_info_free(fuidp);
2509                 fuidp = NULL;
2510         }
2511
2512         if (err) {
2513                 dmu_tx_abort(tx);
2514                 if (attrzp)
2515                         zrele(attrzp);
2516                 if (err == ERESTART)
2517                         goto top;
2518         } else {
2519                 if (count > 0)
2520                         err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
2521                 dmu_tx_commit(tx);
2522                 if (attrzp) {
2523                         if (err2 == 0 && handle_eadir)
2524                                 err2 = zfs_setattr_dir(attrzp);
2525                         zrele(attrzp);
2526                 }
2527                 zfs_znode_update_vfs(zp);
2528         }
2529
2530 out2:
2531         if (os->os_sync == ZFS_SYNC_ALWAYS)
2532                 zil_commit(zilog, 0);
2533
2534 out3:
2535         kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks);
2536         kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks);
2537         kmem_free(tmpxvattr, sizeof (xvattr_t));
2538         ZFS_EXIT(zfsvfs);
2539         return (err);
2540 }
2541
2542 typedef struct zfs_zlock {
2543         krwlock_t       *zl_rwlock;     /* lock we acquired */
2544         znode_t         *zl_znode;      /* znode we held */
2545         struct zfs_zlock *zl_next;      /* next in list */
2546 } zfs_zlock_t;
2547
2548 /*
2549  * Drop locks and release vnodes that were held by zfs_rename_lock().
2550  */
2551 static void
2552 zfs_rename_unlock(zfs_zlock_t **zlpp)
2553 {
2554         zfs_zlock_t *zl;
2555
2556         while ((zl = *zlpp) != NULL) {
2557                 if (zl->zl_znode != NULL)
2558                         zfs_zrele_async(zl->zl_znode);
2559                 rw_exit(zl->zl_rwlock);
2560                 *zlpp = zl->zl_next;
2561                 kmem_free(zl, sizeof (*zl));
2562         }
2563 }
2564
2565 /*
2566  * Search back through the directory tree, using the ".." entries.
2567  * Lock each directory in the chain to prevent concurrent renames.
2568  * Fail any attempt to move a directory into one of its own descendants.
2569  * XXX - z_parent_lock can overlap with map or grow locks
      *
      *      IN:     szp     - znode being renamed; its z_parent_lock is taken
      *                        as writer on the first pass.
      *              tdzp    - target directory; the upward walk starts here.
      *              sdzp    - source directory; the walk ends once the znode
      *                        being examined has sdzp's object id.
      *
      *      IN/OUT: zlpp    - head of the list of locks taken so far.  Every
      *                        lock acquired is pushed onto the front of it.
      *
      *      RETURN: 0 if the walk reached sdzp or the root object.
      *              EINVAL if szp is tdzp or one of tdzp's ancestors.
      *              error code from zfs_zget() if an ancestor cannot be held.
      *
      * Entries already pushed onto *zlpp stay there on every return path,
      * including the error returns; zfs_rename() releases them through
      * zfs_rename_unlock().
2570  */
2571 static int
2572 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
2573 {
2574         zfs_zlock_t     *zl;
2575         znode_t         *zp = tdzp;
2576         uint64_t        rootid = ZTOZSB(zp)->z_root;
2577         uint64_t        oidp = zp->z_id;
2578         krwlock_t       *rwlp = &szp->z_parent_lock;
2579         krw_t           rw = RW_WRITER;
2580
2581         /*
2582          * First pass write-locks szp and compares to zp->z_id.
2583          * Later passes read-lock zp and compare to zp->z_parent.
2584          */
2585         do {
2586                 if (!rw_tryenter(rwlp, rw)) {
2587                         /*
2588                          * Another thread is renaming in this path.
2589                          * Note that if we are a WRITER, we don't have any
2590                          * parent_locks held yet.
2591                          */
                              /*
                               * NOTE(review): the object-id comparison decides
                               * between backing off and blocking; presumably an
                               * ordering rule to avoid deadlock between
                               * concurrent renames -- confirm.
                               */
2592                         if (rw == RW_READER && zp->z_id > szp->z_id) {
2593                                 /*
2594                                  * Drop our locks and restart
2595                                  */
                                      /*
                                       * rw is RW_READER only after at least one
                                       * completed pass, so zl is the entry pushed
                                       * last, i.e. the current head of *zlpp.
                                       * Unlocking from zl therefore releases the
                                       * whole list.
                                       */
2596                                 zfs_rename_unlock(&zl);
2597                                 *zlpp = NULL;
2598                                 zp = tdzp;
2599                                 oidp = zp->z_id;
2600                                 rwlp = &szp->z_parent_lock;
2601                                 rw = RW_WRITER;
                                      /*
                                       * In a do/while, continue jumps to the
                                       * loop condition, which is evaluated here
                                       * with zp reset to tdzp.
                                       */
2602                                 continue;
2603                         } else {
2604                                 /*
2605                                  * Wait for other thread to drop its locks
2606                                  */
2607                                 rw_enter(rwlp, rw);
2608                         }
2609                 }
2610
                      /* Record the lock just taken at the front of the list. */
2611                 zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
2612                 zl->zl_rwlock = rwlp;
2613                 zl->zl_znode = NULL;
2614                 zl->zl_next = *zlpp;
2615                 *zlpp = zl;
2616
2617                 if (oidp == szp->z_id)          /* We're a descendant of szp */
2618                         return (SET_ERROR(EINVAL));
2619
2620                 if (oidp == rootid)             /* We've hit the top */
2621                         return (0);
2622
2623                 if (rw == RW_READER) {          /* i.e. not the first pass */
                              /*
                               * Hold the ancestor whose id was read on the
                               * previous pass; the hold is stored in the list
                               * entry so zfs_rename_unlock() can release it.
                               */
2624                         int error = zfs_zget(ZTOZSB(zp), oidp, &zp);
2625                         if (error)
2626                                 return (error);
2627                         zl->zl_znode = zp;
2628                 }
                      /*
                       * Read zp's parent object id into oidp for the next
                       * pass; the return value is discarded by the (void) cast.
                       */
2629                 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)),
2630                     &oidp, sizeof (oidp));
2631                 rwlp = &zp->z_parent_lock;
2632                 rw = RW_READER;
2633
2634         } while (zp->z_id != sdzp->z_id);
2635
2636         return (0);
2637 }
2638
2639 /*
2640  * Move an entry from the provided source directory to the target
2641  * directory.  Change the entry name as indicated.
2642  *
2643  *      IN:     sdzp    - Source directory containing the "old entry".
2644  *              snm     - Old entry name.
2645  *              tdzp    - Target directory to contain the "new entry".
2646  *              tnm     - New entry name.
2647  *              cr      - credentials of caller.
2648  *              flags   - case flags
2649  *
2650  *      RETURN: 0 on success, error code on failure.
      *
      *              Errors produced directly by this function:
      *              EINVAL  - snm or tnm is NULL, only one of the two
      *                        directories has ZFS_XATTR set, or a failed
      *                        entry lookup was for the name "..".
      *              EXDEV   - the directories have different super blocks,
      *                        zfsctl_is_node() is true for tdzp, or tdzp
      *                        has ZFS_PROJINHERIT set and its project ID
      *                        differs from the source entry's.
      *              EILSEQ  - zfsvfs->z_utf8 is set and tnm fails
      *                        u8_validate().
      *              ENOTDIR - source is a directory, existing target is not.
      *              EISDIR  - existing target is a directory, source is not.
      *              Other values come from the helpers called below.
      *
      *              0 is also returned, with nothing changed, when both
      *              names compare equal in the same directory or when
      *              source and target resolve to the same object.
2651  *
2652  * Timestamps:
2653  *      sdzp,tdzp - ctime|mtime updated
2654  */
2655 /*ARGSUSED*/
2656 int
2657 zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm,
2658     cred_t *cr, int flags)
2659 {
2660         znode_t         *szp, *tzp;
2661         zfsvfs_t        *zfsvfs = ZTOZSB(sdzp);
2662         zilog_t         *zilog;
2663         zfs_dirlock_t   *sdl, *tdl;
2664         dmu_tx_t        *tx;
2665         zfs_zlock_t     *zl;
2666         int             cmp, serr, terr;
2667         int             error = 0;
2668         int             zflg = 0;
2669         boolean_t       waited = B_FALSE;
2670
2671         if (snm == NULL || tnm == NULL)
2672                 return (SET_ERROR(EINVAL));
2673
2674         ZFS_ENTER(zfsvfs);
2675         ZFS_VERIFY_ZP(sdzp);
2676         zilog = zfsvfs->z_log;
2677
2678         ZFS_VERIFY_ZP(tdzp);
2679
2680         /*
2681          * We check i_sb because snapshots and the ctldir must have different
2682          * super blocks.
2683          */
2684         if (ZTOI(tdzp)->i_sb != ZTOI(sdzp)->i_sb ||
2685             zfsctl_is_node(ZTOI(tdzp))) {
2686                 ZFS_EXIT(zfsvfs);
2687                 return (SET_ERROR(EXDEV));
2688         }
2689
2690         if (zfsvfs->z_utf8 && u8_validate(tnm,
2691             strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
2692                 ZFS_EXIT(zfsvfs);
2693                 return (SET_ERROR(EILSEQ));
2694         }
2695
2696         if (flags & FIGNORECASE)
2697                 zflg |= ZCILOOK;
2698
2699 top:
              /*
               * Retry point: control returns here after dmu_tx_assign()
               * reports ERESTART, by which time every lock and hold taken
               * below has been dropped again.
               */
2700         szp = NULL;
2701         tzp = NULL;
2702         zl = NULL;
2703
2704         /*
2705          * This is to prevent the creation of links into attribute space
2706          * by renaming a linked file into/outof an attribute directory.
2707          * See the comment in zfs_link() for why this is considered bad.
2708          */
2709         if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
2710                 ZFS_EXIT(zfsvfs);
2711                 return (SET_ERROR(EINVAL));
2712         }
2713
2714         /*
2715          * Lock source and target directory entries.  To prevent deadlock,
2716          * a lock ordering must be defined.  We lock the directory with
2717          * the smallest object id first, or if it's a tie, the one with
2718          * the lexically first name.
2719          */
2720         if (sdzp->z_id < tdzp->z_id) {
2721                 cmp = -1;
2722         } else if (sdzp->z_id > tdzp->z_id) {
2723                 cmp = 1;
2724         } else {
2725                 /*
2726                  * First compare the two name arguments without
2727                  * considering any case folding.
2728                  */
2729                 int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
2730
2731                 cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
2732                 ASSERT(error == 0 || !zfsvfs->z_utf8);
2733                 if (cmp == 0) {
2734                         /*
2735                          * POSIX: "If the old argument and the new argument
2736                          * both refer to links to the same existing file,
2737                          * the rename() function shall return successfully
2738                          * and perform no other action."
2739                          */
2740                         ZFS_EXIT(zfsvfs);
2741                         return (0);
2742                 }
2743                 /*
2744                  * If the file system is case-folding, then we may
2745                  * have some more checking to do.  A case-folding file
2746                  * system is either supporting mixed case sensitivity
2747                  * access or is completely case-insensitive.  Note
2748                  * that the file system is always case preserving.
2749                  *
2750                  * In mixed sensitivity mode case sensitive behavior
2751                  * is the default.  FIGNORECASE must be used to
2752                  * explicitly request case insensitive behavior.
2753                  *
2754                  * If the source and target names provided differ only
2755                  * by case (e.g., a request to rename 'tim' to 'Tim'),
2756                  * we will treat this as a special case in the
2757                  * case-insensitive mode: as long as the source name
2758                  * is an exact match, we will allow this to proceed as
2759                  * a name-change request.
2760                  */
2761                 if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
2762                     (zfsvfs->z_case == ZFS_CASE_MIXED &&
2763                     flags & FIGNORECASE)) &&
2764                     u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
2765                     &error) == 0) {
2766                         /*
2767                          * case preserving rename request, require exact
2768                          * name matches
2769                          */
2770                         zflg |= ZCIEXACT;
2771                         zflg &= ~ZCILOOK;
2772                 }
2773         }
2774
2775         /*
2776          * If the source and destination directories are the same, we should
2777          * grab the z_name_lock of that directory only once.
2778          */
2779         if (sdzp == tdzp) {
2780                 zflg |= ZHAVELOCK;
2781                 rw_enter(&sdzp->z_name_lock, RW_READER);
2782         }
2783
              /*
               * Take both directory-entry locks in the order chosen above.
               * The source lookup always carries ZEXISTS; ZRENAMING is added
               * to whichever of the two lookups is performed second.
               */
2784         if (cmp < 0) {
2785                 serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
2786                     ZEXISTS | zflg, NULL, NULL);
2787                 terr = zfs_dirent_lock(&tdl,
2788                     tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
2789         } else {
2790                 terr = zfs_dirent_lock(&tdl,
2791                     tdzp, tnm, &tzp, zflg, NULL, NULL);
2792                 serr = zfs_dirent_lock(&sdl,
2793                     sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
2794                     NULL, NULL);
2795         }
2796
2797         if (serr) {
2798                 /*
2799                  * Source entry invalid or not there.
2800                  */
2801                 if (!terr) {
2802                         zfs_dirent_unlock(tdl);
2803                         if (tzp)
2804                                 zrele(tzp);
2805                 }
2806
2807                 if (sdzp == tdzp)
2808                         rw_exit(&sdzp->z_name_lock);
2809
                      /* A failed lookup of ".." is reported as EINVAL. */
2810                 if (strcmp(snm, "..") == 0)
2811                         serr = EINVAL;
2812                 ZFS_EXIT(zfsvfs);
2813                 return (serr);
2814         }
2815         if (terr) {
                      /* Source side succeeded, so undo its lock and hold. */
2816                 zfs_dirent_unlock(sdl);
2817                 zrele(szp);
2818
2819                 if (sdzp == tdzp)
2820                         rw_exit(&sdzp->z_name_lock);
2821
2822                 if (strcmp(tnm, "..") == 0)
2823                         terr = EINVAL;
2824                 ZFS_EXIT(zfsvfs);
2825                 return (terr);
2826         }
2827
2828         /*
2829          * If we are using project inheritance, means if the directory has
2830          * ZFS_PROJINHERIT set, then its descendant directories will inherit
2831          * not only the project ID, but also the ZFS_PROJINHERIT flag. Under
2832          * such case, we only allow renames into our tree when the project
2833          * IDs are the same.
2834          */
2835         if (tdzp->z_pflags & ZFS_PROJINHERIT &&
2836             tdzp->z_projid != szp->z_projid) {
2837                 error = SET_ERROR(EXDEV);
2838                 goto out;
2839         }
2840
2841         /*
2842          * Must have write access at the source to remove the old entry
2843          * and write access at the target to create the new entry.
2844          * Note that if target and source are the same, this can be
2845          * done in a single check.
2846          */
2847
2848         if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)))
2849                 goto out;
2850
2851         if (S_ISDIR(ZTOI(szp)->i_mode)) {
2852                 /*
2853                  * Check to make sure rename is valid.
2854                  * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
2855                  */
                      /*
                       * On failure zl may already hold entries; the out path
                       * below releases them.
                       */
2856                 if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl)))
2857                         goto out;
2858         }
2859
2860         /*
2861          * Does target exist?
2862          */
2863         if (tzp) {
2864                 /*
2865                  * Source and target must be the same type.
2866                  */
2867                 if (S_ISDIR(ZTOI(szp)->i_mode)) {
2868                         if (!S_ISDIR(ZTOI(tzp)->i_mode)) {
2869                                 error = SET_ERROR(ENOTDIR);
2870                                 goto out;
2871                         }
2872                 } else {
2873                         if (S_ISDIR(ZTOI(tzp)->i_mode)) {
2874                                 error = SET_ERROR(EISDIR);
2875                                 goto out;
2876                         }
2877                 }
2878                 /*
2879                  * POSIX dictates that when the source and target
2880                  * entries refer to the same file object, rename
2881                  * must do nothing and exit without error.
2882                  */
2883                 if (szp->z_id == tzp->z_id) {
2884                         error = 0;
2885                         goto out;
2886                 }
2887         }
2888
              /*
               * Declare everything the transaction may touch: the SA of the
               * renamed znode and of both directories, the two directory
               * ZAPs, the SA of an existing target, and the unlinked-set ZAP.
               */
2889         tx = dmu_tx_create(zfsvfs->z_os);
2890         dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
2891         dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
2892         dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
2893         dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
2894         if (sdzp != tdzp) {
2895                 dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
2896                 zfs_sa_upgrade_txholds(tx, tdzp);
2897         }
2898         if (tzp) {
2899                 dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
2900                 zfs_sa_upgrade_txholds(tx, tzp);
2901         }
2902
2903         zfs_sa_upgrade_txholds(tx, szp);
2904         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
              /*
               * TXG_NOWAIT: the assignment does not block while the locks
               * above are held.  TXG_NOTHROTTLE is added once a previous
               * attempt has already waited.
               */
2905         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
2906         if (error) {
                      /* Release every lock before waiting or returning. */
2907                 if (zl != NULL)
2908                         zfs_rename_unlock(&zl);
2909                 zfs_dirent_unlock(sdl);
2910                 zfs_dirent_unlock(tdl);
2911
2912                 if (sdzp == tdzp)
2913                         rw_exit(&sdzp->z_name_lock);
2914
2915                 if (error == ERESTART) {
                              /* Wait, discard this tx and start over. */
2916                         waited = B_TRUE;
2917                         dmu_tx_wait(tx);
2918                         dmu_tx_abort(tx);
2919                         zrele(szp);
2920                         if (tzp)
2921                                 zrele(tzp);
2922                         goto top;
2923                 }
2924                 dmu_tx_abort(tx);
2925                 zrele(szp);
2926                 if (tzp)
2927                         zrele(tzp);
2928                 ZFS_EXIT(zfsvfs);
2929                 return (error);
2930         }
2931
2932         if (tzp)        /* Attempt to remove the existing target */
2933                 error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
2934
2935         if (error == 0) {
2936                 error = zfs_link_create(tdl, szp, tx, ZRENAMING);
2937                 if (error == 0) {
                              /*
                               * Update the renamed znode's flags in memory
                               * and write them out in the same transaction.
                               */
2938                         szp->z_pflags |= ZFS_AV_MODIFIED;
2939                         if (tdzp->z_pflags & ZFS_PROJINHERIT)
2940                                 szp->z_pflags |= ZFS_PROJINHERIT;
2941
2942                         error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
2943                             (void *)&szp->z_pflags, sizeof (uint64_t), tx);
2944                         ASSERT0(error);
2945
2946                         error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
2947                         if (error == 0) {
2948                                 zfs_log_rename(zilog, tx, TX_RENAME |
2949                                     (flags & FIGNORECASE ? TX_CI : 0), sdzp,
2950                                     sdl->dl_name, tdzp, tdl->dl_name, szp);
2951                         } else {
2952                                 /*
2953                                  * At this point, we have successfully created
2954                                  * the target name, but have failed to remove
2955                                  * the source name.  Since the create was done
2956                                  * with the ZRENAMING flag, there are
2957                                  * complications; for one, the link count is
2958                                  * wrong.  The easiest way to deal with this
2959                                  * is to remove the newly created target, and
2960                                  * return the original error.  This must
2961                                  * succeed; fortunately, it is very unlikely to
2962                                  * fail, since we just created it.
2963                                  */
2964                                 VERIFY3U(zfs_link_destroy(tdl, szp, tx,
2965                                     ZRENAMING, NULL), ==, 0);
2966                         }
2967                 } else {
2968                         /*
2969                          * If we had removed the existing target, subsequent
2970                          * call to zfs_link_create() to add back the same entry
2971                          * but, the new dnode (szp) should not fail.
2972                          */
2973                         ASSERT(tzp == NULL);
2974                 }
2975         }
2976
              /* The transaction is committed whether or not error is set. */
2977         dmu_tx_commit(tx);
2978 out:
              /*
               * Common exit, reached after the commit above and from every
               * failure detected once both dirent locks were obtained.  Here
               * sdl and tdl are locked and szp is held; tzp may be NULL.
               */
2979         if (zl != NULL)
2980                 zfs_rename_unlock(&zl);
2981
2982         zfs_dirent_unlock(sdl);
2983         zfs_dirent_unlock(tdl);
2984
2985         zfs_znode_update_vfs(sdzp);
2986         if (sdzp == tdzp)
2987                 rw_exit(&sdzp->z_name_lock);
2988
2989         if (sdzp != tdzp)
2990                 zfs_znode_update_vfs(tdzp);
2991
2992         zfs_znode_update_vfs(szp);
2993         zrele(szp);
2994         if (tzp) {
2995                 zfs_znode_update_vfs(tzp);
2996                 zrele(tzp);
2997         }
2998
              /* Commit the intent log when os_sync is ZFS_SYNC_ALWAYS. */
2999         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3000                 zil_commit(zilog, 0);
3001
3002         ZFS_EXIT(zfsvfs);
3003         return (error);
3004 }
3005
3006 /*
3007  * Insert the indicated symbolic reference entry into the directory.
3008  *
3009  *      IN:     dzp     - Directory to contain new symbolic link.
3010  *              name    - Name of directory entry in dzp.
3011  *              vap     - Attributes of new entry.
3012  *              link    - Name for new symlink entry.
3013  *              cr      - credentials of caller.
3014  *              flags   - case flags
3015  *
3016  *      OUT:    zpp     - Znode for new symbolic link.
      *                        Set to NULL before the directory lock is
      *                        attempted and to the new znode only on success.
3017  *
3018  *      RETURN: 0 on success, error code on failure.
      *
      *              Errors produced directly by this function:
      *              EINVAL       - name is NULL.
      *              EILSEQ       - zfsvfs->z_utf8 is set and name fails
      *                             u8_validate().
      *              ENAMETOOLONG - strlen(link) exceeds MAXPATHLEN.
      *              EDQUOT       - zfs_acl_ids_overquota() reports over quota.
      *              Other values come from the helpers called below.
3019  *
3020  * Timestamps:
3021  *      dzp - ctime|mtime updated
3022  */
3023 /*ARGSUSED*/
3024 int
3025 zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link,
3026     znode_t **zpp, cred_t *cr, int flags)
3027 {
3028         znode_t         *zp;
3029         zfs_dirlock_t   *dl;
3030         dmu_tx_t        *tx;
3031         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
3032         zilog_t         *zilog;
3033         uint64_t        len = strlen(link);
3034         int             error;
3035         int             zflg = ZNEW;
3036         zfs_acl_ids_t   acl_ids;
3037         boolean_t       fuid_dirtied;
3038         uint64_t        txtype = TX_SYMLINK;
3039         boolean_t       waited = B_FALSE;
3040
3041         ASSERT(S_ISLNK(vap->va_mode));
3042
3043         if (name == NULL)
3044                 return (SET_ERROR(EINVAL));
3045
3046         ZFS_ENTER(zfsvfs);
3047         ZFS_VERIFY_ZP(dzp);
3048         zilog = zfsvfs->z_log;
3049
3050         if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
3051             NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3052                 ZFS_EXIT(zfsvfs);
3053                 return (SET_ERROR(EILSEQ));
3054         }
3055         if (flags & FIGNORECASE)
3056                 zflg |= ZCILOOK;
3057
3058         if (len > MAXPATHLEN) {
3059                 ZFS_EXIT(zfsvfs);
3060                 return (SET_ERROR(ENAMETOOLONG));
3061         }
3062
              /*
               * acl_ids is built once, before the retry label, and reused on
               * every retry; each exit path after this point frees it.
               */
3063         if ((error = zfs_acl_ids_create(dzp, 0,
3064             vap, cr, NULL, &acl_ids)) != 0) {
3065                 ZFS_EXIT(zfsvfs);
3066                 return (error);
3067         }
3068 top:
              /*
               * Retry point: control returns here after dmu_tx_assign()
               * reports ERESTART, with the directory lock already dropped.
               */
3069         *zpp = NULL;
3070
3071         /*
3072          * Attempt to lock directory; fail if entry already exists.
3073          */
3074         error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
3075         if (error) {
3076                 zfs_acl_ids_free(&acl_ids);
3077                 ZFS_EXIT(zfsvfs);
3078                 return (error);
3079         }
3080
3081         if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
3082                 zfs_acl_ids_free(&acl_ids);
3083                 zfs_dirent_unlock(dl);
3084                 ZFS_EXIT(zfsvfs);
3085                 return (error);
3086         }
3087
3088         if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) {
3089                 zfs_acl_ids_free(&acl_ids);
3090                 zfs_dirent_unlock(dl);
3091                 ZFS_EXIT(zfsvfs);
3092                 return (SET_ERROR(EDQUOT));
3093         }
              /*
               * Declare what the transaction may touch: the new object's data
               * (at least one byte even when len is 0), the directory ZAP
               * entry, the new object's SA (ACL + base attributes + link
               * text) and the directory's SA.
               */
3094         tx = dmu_tx_create(zfsvfs->z_os);
3095         fuid_dirtied = zfsvfs->z_fuid_dirty;
3096         dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
3097         dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
3098         dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
3099             ZFS_SA_BASE_ATTR_SIZE + len);
3100         dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
3101         if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3102                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
3103                     acl_ids.z_aclp->z_acl_bytes);
3104         }
3105         if (fuid_dirtied)
3106                 zfs_fuid_txhold(zfsvfs, tx);
              /*
               * TXG_NOWAIT: the assignment does not block while the directory
               * entry is locked.  TXG_NOTHROTTLE is added once a previous
               * attempt has already waited.
               */
3107         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
3108         if (error) {
3109                 zfs_dirent_unlock(dl);
3110                 if (error == ERESTART) {
                              /* acl_ids is kept for the next attempt. */
3111                         waited = B_TRUE;
3112                         dmu_tx_wait(tx);
3113                         dmu_tx_abort(tx);
3114                         goto top;
3115                 }
3116                 zfs_acl_ids_free(&acl_ids);
3117                 dmu_tx_abort(tx);
3118                 ZFS_EXIT(zfsvfs);
3119                 return (error);
3120         }
3121
3122         /*
3123          * Create a new object for the symlink.
3124          * for version 4 ZPL datasets the symlink will be an SA attribute
3125          */
3126         zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
3127
3128         if (fuid_dirtied)
3129                 zfs_fuid_sync(zfsvfs, tx);
3130
              /*
               * Store the link text under z_lock.
               * NOTE(review): the value assigned to error by sa_update() here
               * is never examined; it is overwritten by zfs_link_create()
               * below.  Confirm that ignoring it is intended.
               */
3131         mutex_enter(&zp->z_lock);
3132         if (zp->z_is_sa)
3133                 error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
3134                     link, len, tx);
3135         else
3136                 zfs_sa_symlink(zp, link, len, tx);
3137         mutex_exit(&zp->z_lock);
3138
              /* Record the link length as the object size; result ignored. */
3139         zp->z_size = len;
3140         (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
3141             &zp->z_size, sizeof (zp->z_size), tx);
3142         /*
3143          * Insert the new object into the directory.
3144          */
3145         error = zfs_link_create(dl, zp, tx, ZNEW);
3146         if (error != 0) {
                      /* Undo the object creation inside the same transaction. */
3147                 zfs_znode_delete(zp, tx);
3148                 remove_inode_hash(ZTOI(zp));
3149         } else {
3150                 if (flags & FIGNORECASE)
3151                         txtype |= TX_CI;
3152                 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
3153
3154                 zfs_znode_update_vfs(dzp);
3155                 zfs_znode_update_vfs(zp);
3156         }
3157
3158         zfs_acl_ids_free(&acl_ids);
3159
              /* The transaction is committed on both outcomes. */
3160         dmu_tx_commit(tx);
3161
3162         zfs_dirent_unlock(dl);
3163
3164         if (error == 0) {
3165                 *zpp = zp;
3166
                      /* Commit the intent log when os_sync is ZFS_SYNC_ALWAYS. */
3167                 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3168                         zil_commit(zilog, 0);
3169         } else {
                      /* Drop the reference on the znode that was not linked. */
3170                 zrele(zp);
3171         }
3172
3173         ZFS_EXIT(zfsvfs);
3174         return (error);
3175 }
3176
3177 /*
3178  * Return, in the buffer contained in the provided uio structure,
3179  * the symbolic path referred to by ip.
3180  *
3181  *      IN:     ip      - inode of symbolic link
3182  *              uio     - structure to contain the link path.
3183  *              cr      - credentials of caller.
3184  *
3185  *      RETURN: 0 if success
3186  *              error code if failure
3187  *
3188  * Timestamps:
3189  *      ip - atime updated
3190  */
3191 /* ARGSUSED */
3192 int
3193 zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr)
3194 {
3195         znode_t         *zp = ITOZ(ip);
3196         zfsvfs_t        *zfsvfs = ITOZSB(ip);
3197         int             error;
3198
3199         ZFS_ENTER(zfsvfs);
3200         ZFS_VERIFY_ZP(zp);
3201
3202         mutex_enter(&zp->z_lock);
3203         if (zp->z_is_sa)
3204                 error = sa_lookup_uio(zp->z_sa_hdl,
3205                     SA_ZPL_SYMLINK(zfsvfs), uio);
3206         else
3207                 error = zfs_sa_readlink(zp, uio);
3208         mutex_exit(&zp->z_lock);
3209
3210         ZFS_EXIT(zfsvfs);
3211         return (error);
3212 }
3213
/*
 * Insert a new entry into directory tdzp referencing szp.
 *
 *      IN:     tdzp    - Directory to contain new entry.
 *              szp     - znode of new entry.
 *              name    - name of new entry.
 *              cr      - credentials of caller.
 *              flags   - case flags.
 *
 *      RETURN: 0 if success
 *              error code if failure
 *
 * Errors produced directly by this function:
 *      EINVAL  - name is NULL, or exactly one of szp/tdzp carries ZFS_XATTR.
 *      EPERM   - szp is a directory, szp's parent is the shares directory,
 *                or the caller is not the owner and secpolicy_basic_link()
 *                refuses.
 *      EXDEV   - tdzp has ZFS_PROJINHERIT and a different project ID, the
 *                two inodes have different super blocks, or szp is a
 *                control-directory node.
 *      EILSEQ  - the filesystem is UTF-8 only and name fails validation.
 * Errors from sa_lookup(), zfs_zaccess(), zfs_dirent_lock(),
 * dmu_tx_assign() and zfs_link_create() are passed through unchanged.
 *
 * When HAVE_TMPFILE is defined and szp is a linkable inode with a zero
 * link count, the link is treated as the first name of a tmpfile: the
 * object is removed from the unlinked set, nothing is logged to the ZIL,
 * and the function waits for the transaction group to sync instead
 * (unless sync is disabled).
 *
 * Timestamps:
 *      tdzp - ctime|mtime updated
 *       szp - ctime updated
 *      (not written directly in this body; presumably done by
 *      zfs_link_create() - TODO confirm)
 */
/* ARGSUSED */
int
zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr,
    int flags)
{
        struct inode *sip = ZTOI(szp);
        znode_t         *tzp;
        zfsvfs_t        *zfsvfs = ZTOZSB(tdzp);
        zilog_t         *zilog;
        zfs_dirlock_t   *dl;
        dmu_tx_t        *tx;
        int             error;
        int             zf = ZNEW;
        uint64_t        parent;
        uid_t           owner;
        boolean_t       waited = B_FALSE;
        boolean_t       is_tmpfile = 0;
        uint64_t        txg;
#ifdef HAVE_TMPFILE
        /* A tmpfile has no links yet but is explicitly marked linkable. */
        is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE));
#endif
        ASSERT(S_ISDIR(ZTOI(tdzp)->i_mode));

        if (name == NULL)
                return (SET_ERROR(EINVAL));

        ZFS_ENTER(zfsvfs);
        ZFS_VERIFY_ZP(tdzp);
        zilog = zfsvfs->z_log;

        /*
         * POSIX dictates that we return EPERM here.
         * Better choices include ENOTSUP or EISDIR.
         */
        if (S_ISDIR(sip->i_mode)) {
                ZFS_EXIT(zfsvfs);
                return (SET_ERROR(EPERM));
        }

        ZFS_VERIFY_ZP(szp);

        /*
         * If we are using project inheritance, means if the directory has
         * ZFS_PROJINHERIT set, then its descendant directories will inherit
         * not only the project ID, but also the ZFS_PROJINHERIT flag. Under
         * such case, we only allow hard link creation in our tree when the
         * project IDs are the same.
         */
        if (tdzp->z_pflags & ZFS_PROJINHERIT &&
            tdzp->z_projid != szp->z_projid) {
                ZFS_EXIT(zfsvfs);
                return (SET_ERROR(EXDEV));
        }

        /*
         * We check i_sb because snapshots and the ctldir must have different
         * super blocks.
         */
        if (sip->i_sb != ZTOI(tdzp)->i_sb || zfsctl_is_node(sip)) {
                ZFS_EXIT(zfsvfs);
                return (SET_ERROR(EXDEV));
        }

        /* Prevent links to .zfs/shares files */

        if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
            &parent, sizeof (uint64_t))) != 0) {
                ZFS_EXIT(zfsvfs);
                return (error);
        }
        if (parent == zfsvfs->z_shares_dir) {
                ZFS_EXIT(zfsvfs);
                return (SET_ERROR(EPERM));
        }

        /* On UTF-8-only filesystems the new name must be valid UTF-8. */
        if (zfsvfs->z_utf8 && u8_validate(name,
            strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
                ZFS_EXIT(zfsvfs);
                return (SET_ERROR(EILSEQ));
        }
        /* Case-insensitive callers get a case-insensitive dirent lookup. */
        if (flags & FIGNORECASE)
                zf |= ZCILOOK;

        /*
         * We do not support links between attributes and non-attributes
         * because of the potential security risk of creating links
         * into "normal" file space in order to circumvent restrictions
         * imposed in attribute space.
         */
        if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) {
                ZFS_EXIT(zfsvfs);
                return (SET_ERROR(EINVAL));
        }

        /*
         * Only the owner of the source file may link it, unless
         * secpolicy_basic_link() permits the caller to do so anyway.
         */
        owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid),
            cr, ZFS_OWNER);
        if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
                ZFS_EXIT(zfsvfs);
                return (SET_ERROR(EPERM));
        }

        /* The caller must be allowed to add a file to the target directory. */
        if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
                ZFS_EXIT(zfsvfs);
                return (error);
        }

top:
        /*
         * Attempt to lock directory; fail if entry already exists.
         */
        error = zfs_dirent_lock(&dl, tdzp, name, &tzp, zf, NULL, NULL);
        if (error) {
                ZFS_EXIT(zfsvfs);
                return (error);
        }

        /*
         * Declare what the transaction will touch: szp's SA, the target
         * directory's ZAP entry for name and, for a tmpfile, the unlinked
         * set it is about to be removed from.
         */
        tx = dmu_tx_create(zfsvfs->z_os);
        dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
        dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name);
        if (is_tmpfile)
                dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);

        zfs_sa_upgrade_txholds(tx, szp);
        zfs_sa_upgrade_txholds(tx, tdzp);
        error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
        if (error) {
                /* Never block in dmu_tx_wait() while holding the dirent lock. */
                zfs_dirent_unlock(dl);
                if (error == ERESTART) {
                        /*
                         * Wait, discard this tx and start over from the
                         * dirent lock; the retry adds TXG_NOTHROTTLE.
                         */
                        waited = B_TRUE;
                        dmu_tx_wait(tx);
                        dmu_tx_abort(tx);
                        goto top;
                }
                dmu_tx_abort(tx);
                ZFS_EXIT(zfsvfs);
                return (error);
        }
        /* unmark z_unlinked so zfs_link_create will not reject */
        if (is_tmpfile)
                szp->z_unlinked = B_FALSE;
        error = zfs_link_create(dl, szp, tx, 0);

        if (error == 0) {
                uint64_t txtype = TX_LINK;
                /*
                 * tmpfile is created to be in z_unlinkedobj, so remove it.
                 * Also, we don't log in ZIL, because all previous file
                 * operation on the tmpfile are ignored by ZIL. Instead we
                 * always wait for txg to sync to make sure all previous
                 * operation are sync safe.
                 */
                if (is_tmpfile) {
                        VERIFY(zap_remove_int(zfsvfs->z_os,
                            zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0);
                } else {
                        if (flags & FIGNORECASE)
                                txtype |= TX_CI;
                        zfs_log_link(zilog, tx, txtype, tdzp, szp, name);
                }
        } else if (is_tmpfile) {
                /* restore z_unlinked since linking failed */
                szp->z_unlinked = B_TRUE;
        }
        /* Remember the txg before committing; the tmpfile wait below needs it. */
        txg = dmu_tx_get_txg(tx);
        dmu_tx_commit(tx);

        zfs_dirent_unlock(dl);

        /* Regular links: flush the ZIL only when sync=always. */
        if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
                zil_commit(zilog, 0);

        /*
         * tmpfile links were not logged, so wait for the whole txg to reach
         * disk instead, unless sync is disabled.  Note this runs whether or
         * not zfs_link_create() succeeded.
         */
        if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED)
                txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), txg);

        /* Refresh both inodes from their znodes, on success and failure. */
        zfs_znode_update_vfs(tdzp);
        zfs_znode_update_vfs(szp);
        ZFS_EXIT(zfsvfs);
        return (error);
}
3409
/*
 * Completion callback that zfs_putpage() hands to zfs_log_write().
 * arg is the page that was written; its error flag is cleared and
 * writeback on it is ended.
 */
static void
zfs_putpage_commit_cb(void *arg)
{
        struct page *page = (struct page *)arg;

        ClearPageError(page);
        end_page_writeback(page);
}
3418
/*
 * Push a page out to disk, once the page is on stable storage the
 * registered commit callback will be run as notification of completion.
 *
 *      IN:     ip      - page mapped for inode.
 *              pp      - page to push (page is locked)
 *              wbc     - writeback control data
 *
 *      RETURN: 0 if success
 *              error code if failure
 *
 * The page is unlocked on every return path.  0 is also returned when
 * nothing is written: the page lies beyond end of file, its mapping
 * changed or it was cleaned while the page lock was dropped, or another
 * thread already has it under writeback.
 *
 * Timestamps:
 *      ip - ctime|mtime updated
 */
/* ARGSUSED */
int
zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
{
        znode_t         *zp = ITOZ(ip);
        zfsvfs_t        *zfsvfs = ITOZSB(ip);
        loff_t          offset;
        loff_t          pgoff;
        unsigned int    pglen;
        dmu_tx_t        *tx;
        caddr_t         va;
        int             err = 0;
        uint64_t        mtime[2], ctime[2];
        sa_bulk_attr_t  bulk[3];
        int             cnt = 0;
        struct address_space *mapping;

        ZFS_ENTER(zfsvfs);
        ZFS_VERIFY_ZP(zp);

        ASSERT(PageLocked(pp));

        pgoff = page_offset(pp);        /* Page byte-offset in file */
        offset = i_size_read(ip);       /* File length in bytes */
        pglen = MIN(PAGE_SIZE,          /* Page length in bytes */
            P2ROUNDUP(offset, PAGE_SIZE)-pgoff);

        /* Page is beyond end of file */
        if (pgoff >= offset) {
                unlock_page(pp);
                ZFS_EXIT(zfsvfs);
                return (0);
        }

        /* Truncate page length to end of file */
        if (pgoff + pglen > offset)
                pglen = offset - pgoff;

#if 0
        /*
         * FIXME: Allow mmap writes past its quota.  The correct fix
         * is to register a page_mkwrite() handler to count the page
         * against its quota when it is about to be dirtied.
         */
        if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT,
            KUID_TO_SUID(ip->i_uid)) ||
            zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT,
            KGID_TO_SGID(ip->i_gid)) ||
            (zp->z_projid != ZFS_DEFAULT_PROJID &&
            zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
            zp->z_projid))) {
                err = EDQUOT;
        }
#endif

        /*
         * The ordering here is critical and must adhere to the following
         * rules in order to avoid deadlocking in either zfs_read() or
         * zfs_free_range() due to a lock inversion.
         *
         * 1) The page must be unlocked prior to acquiring the range lock.
         *    This is critical because zfs_read() calls find_lock_page()
         *    which may block on the page lock while holding the range lock.
         *
         * 2) Before setting or clearing write back on a page the range lock
         *    must be held in order to prevent a lock inversion with the
         *    zfs_free_range() function.
         *
         * This presents a problem because upon entering this function the
         * page lock is already held.  To safely acquire the range lock the
         * page lock must be dropped.  This creates a window where another
         * process could truncate, invalidate, dirty, or write out the page.
         *
         * Therefore, after successfully reacquiring the range and page locks
         * the current page state is checked.  In the common case everything
         * will be as is expected and it can be written out.  However, if
         * the page state has changed it must be handled accordingly.
         */
        /* Remember the mapping so a change during the unlocked window shows. */
        mapping = pp->mapping;
        /* Keep the page dirty while unlocked; this bumps wbc->pages_skipped. */
        redirty_page_for_writepage(wbc, pp);
        unlock_page(pp);

        zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock,
            pgoff, pglen, RL_WRITER);
        lock_page(pp);

        /* Page mapping changed or it was no longer dirty, we're done */
        if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) {
                unlock_page(pp);
                zfs_rangelock_exit(lr);
                ZFS_EXIT(zfsvfs);
                return (0);
        }

        /*
         * Another process already started writeback on this page.  Drop our
         * locks and, only when the caller asked for synchronous writeback,
         * wait for that writeback to finish.
         */
        if (PageWriteback(pp)) {
                unlock_page(pp);
                zfs_rangelock_exit(lr);

                if (wbc->sync_mode != WB_SYNC_NONE) {
                        if (PageWriteback(pp))
#ifdef HAVE_PAGEMAP_FOLIO_WAIT_BIT
                                folio_wait_bit(page_folio(pp), PG_writeback);
#else
                                wait_on_page_bit(pp, PG_writeback);
#endif
                }

                ZFS_EXIT(zfsvfs);
                return (0);
        }

        /* Clear the dirty flag now that the required locks are held */
        if (!clear_page_dirty_for_io(pp)) {
                unlock_page(pp);
                zfs_rangelock_exit(lr);
                ZFS_EXIT(zfsvfs);
                return (0);
        }

        /*
         * Counterpart for redirty_page_for_writepage() above.  This page
         * was in fact not skipped and should not be counted as if it were.
         */
        wbc->pages_skipped--;
        set_page_writeback(pp);
        unlock_page(pp);

        tx = dmu_tx_create(zfsvfs->z_os);
        dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen);
        dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
        zfs_sa_upgrade_txholds(tx, zp);

        err = dmu_tx_assign(tx, TXG_NOWAIT);
        if (err != 0) {
                if (err == ERESTART)
                        dmu_tx_wait(tx);

                /*
                 * The write is not retried here.  Undo the writeback state
                 * and mark the page dirty again (presumably so a later
                 * writeback pass picks it up), then report the error.
                 */
                dmu_tx_abort(tx);
                __set_page_dirty_nobuffers(pp);
                ClearPageError(pp);
                end_page_writeback(pp);
                zfs_rangelock_exit(lr);
                ZFS_EXIT(zfsvfs);
                return (err);
        }

        /* Copy the first pglen bytes of the page into the DMU. */
        va = kmap(pp);
        ASSERT3U(pglen, <=, PAGE_SIZE);
        dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx);
        kunmap(pp);

        SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
        SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
        SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL,
            &zp->z_pflags, 8);

        /* Preserve the mtime and ctime provided by the inode */
        ZFS_TIME_ENCODE(&ip->i_mtime, mtime);
        ZFS_TIME_ENCODE(&ip->i_ctime, ctime);
        zp->z_atime_dirty = B_FALSE;
        zp->z_seq++;

        /*
         * A failure here is returned to the caller, but the write is still
         * logged and the transaction still committed below.
         */
        err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);

        /* zfs_putpage_commit_cb() ends writeback on pp. */
        zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0,
            zfs_putpage_commit_cb, pp);
        dmu_tx_commit(tx);

        zfs_rangelock_exit(lr);

        if (wbc->sync_mode != WB_SYNC_NONE) {
                /*
                 * Note that this is rarely called under writepages(), because
                 * writepages() normally handles the entire commit for
                 * performance reasons.
                 */
                zil_commit(zfsvfs->z_log, zp->z_id);
        }

        ZFS_EXIT(zfsvfs);
        return (err);
}
3616
3617 /*
3618  * Update the system attributes when the inode has been dirtied.  For the
3619  * moment we only update the mode, atime, mtime, and ctime.
3620  */
3621 int
3622 zfs_dirty_inode(struct inode *ip, int flags)
3623 {
3624         znode_t         *zp = ITOZ(ip);
3625         zfsvfs_t        *zfsvfs = ITOZSB(ip);
3626         dmu_tx_t        *tx;
3627         uint64_t        mode, atime[2], mtime[2], ctime[2];
3628         sa_bulk_attr_t  bulk[4];
3629         int             error = 0;
3630         int             cnt = 0;
3631
3632         if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
3633                 return (0);
3634
3635         ZFS_ENTER(zfsvfs);
3636         ZFS_VERIFY_ZP(zp);
3637
3638 #ifdef I_DIRTY_TIME
3639         /*
3640          * This is the lazytime semantic introduced in Linux 4.0
3641          * This flag will only be called from update_time when lazytime is set.
3642          * (Note, I_DIRTY_SYNC will also set if not lazytime)
3643          * Fortunately mtime and ctime are managed within ZFS itself, so we
3644          * only need to dirty atime.
3645          */
3646         if (flags == I_DIRTY_TIME) {
3647                 zp->z_atime_dirty = B_TRUE;
3648                 goto out;
3649         }
3650 #endif
3651
3652         tx = dmu_tx_create(zfsvfs->z_os);
3653
3654         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3655         zfs_sa_upgrade_txholds(tx, zp);
3656
3657         error = dmu_tx_assign(tx, TXG_WAIT);
3658         if (error) {
3659                 dmu_tx_abort(tx);
3660                 goto out;
3661         }
3662
3663         mutex_enter(&zp->z_lock);
3664         zp->z_atime_dirty = B_FALSE;
3665
3666         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
3667         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
3668         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
3669         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
3670
3671         /* Preserve the mode, mtime and ctime provided by the inode */
3672         ZFS_TIME_ENCODE(&ip->i_atime, atime);
3673         ZFS_TIME_ENCODE(&ip->i_mtime, mtime);
3674         ZFS_TIME_ENCODE(&ip->i_ctime, ctime);
3675         mode = ip->i_mode;
3676
3677         zp->z_mode = mode;
3678
3679         error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
3680         mutex_exit(&zp->z_lock);
3681
3682         dmu_tx_commit(tx);
3683 out:
3684         ZFS_EXIT(zfsvfs);
3685         return (error);
3686 }
3687
3688 /*ARGSUSED*/
3689 void
3690 zfs_inactive(struct inode *ip)
3691 {
3692         znode_t *zp = ITOZ(ip);
3693         zfsvfs_t *zfsvfs = ITOZSB(ip);
3694         uint64_t atime[2];
3695         int error;
3696         int need_unlock = 0;
3697
3698         /* Only read lock if we haven't already write locked, e.g. rollback */
3699         if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) {
3700                 need_unlock = 1;
3701                 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
3702         }
3703         if (zp->z_sa_hdl == NULL) {
3704                 if (need_unlock)
3705                         rw_exit(&zfsvfs->z_teardown_inactive_lock);
3706                 return;
3707         }
3708
3709         if (zp->z_atime_dirty && zp->z_unlinked == B_FALSE) {
3710                 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
3711
3712                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3713                 zfs_sa_upgrade_txholds(tx, zp);
3714                 error = dmu_tx_assign(tx, TXG_WAIT);
3715                 if (error) {
3716                         dmu_tx_abort(tx);
3717                 } else {
3718                         ZFS_TIME_ENCODE(&ip->i_atime, atime);
3719                         mutex_enter(&zp->z_lock);
3720                         (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
3721                             (void *)&atime, sizeof (atime), tx);
3722                         zp->z_atime_dirty = B_FALSE;
3723                         mutex_exit(&zp->z_lock);
3724                         dmu_tx_commit(tx);
3725                 }
3726         }
3727
3728         zfs_zinactive(zp);
3729         if (need_unlock)
3730                 rw_exit(&zfsvfs->z_teardown_inactive_lock);
3731 }
3732
3733 /*
3734  * Fill pages with data from the disk.
3735  */
3736 static int
3737 zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages)
3738 {
3739         znode_t *zp = ITOZ(ip);
3740         zfsvfs_t *zfsvfs = ITOZSB(ip);
3741         objset_t *os;
3742         struct page *cur_pp;
3743         u_offset_t io_off, total;
3744         size_t io_len;
3745         loff_t i_size;
3746         unsigned page_idx;
3747         int err;
3748
3749         os = zfsvfs->z_os;
3750         io_len = nr_pages << PAGE_SHIFT;
3751         i_size = i_size_read(ip);
3752         io_off = page_offset(pl[0]);
3753
3754         if (io_off + io_len > i_size)
3755                 io_len = i_size - io_off;
3756
3757         /*
3758          * Iterate over list of pages and read each page individually.
3759          */
3760         page_idx = 0;
3761         for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
3762                 caddr_t va;
3763
3764                 cur_pp = pl[page_idx++];
3765                 va = kmap(cur_pp);
3766                 err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
3767                     DMU_READ_PREFETCH);
3768                 kunmap(cur_pp);
3769                 if (err) {
3770                         /* convert checksum errors into IO errors */
3771                         if (err == ECKSUM)
3772                                 err = SET_ERROR(EIO);
3773                         return (err);
3774                 }
3775         }
3776
3777         return (0);
3778 }
3779
3780 /*
3781  * Uses zfs_fillpage to read data from the file and fill the pages.
3782  *
3783  *      IN:     ip       - inode of file to get data from.
3784  *              pl       - list of pages to read
3785  *              nr_pages - number of pages to read
3786  *
3787  *      RETURN: 0 on success, error code on failure.
3788  *
3789  * Timestamps:
3790  *      vp - atime updated
3791  */
3792 /* ARGSUSED */
3793 int
3794 zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages)
3795 {
3796         znode_t  *zp  = ITOZ(ip);
3797         zfsvfs_t *zfsvfs = ITOZSB(ip);
3798         int      err;
3799
3800         if (pl == NULL)
3801                 return (0);
3802
3803         ZFS_ENTER(zfsvfs);
3804         ZFS_VERIFY_ZP(zp);
3805
3806         err = zfs_fillpage(ip, pl, nr_pages);
3807
3808         ZFS_EXIT(zfsvfs);
3809         return (err);
3810 }
3811
3812 /*
3813  * Check ZFS specific permissions to memory map a section of a file.
3814  *
3815  *      IN:     ip      - inode of the file to mmap
3816  *              off     - file offset
3817  *              addrp   - start address in memory region
3818  *              len     - length of memory region
3819  *              vm_flags- address flags
3820  *
3821  *      RETURN: 0 if success
3822  *              error code if failure
3823  */
3824 /*ARGSUSED*/
3825 int
3826 zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len,
3827     unsigned long vm_flags)
3828 {
3829         znode_t  *zp = ITOZ(ip);
3830         zfsvfs_t *zfsvfs = ITOZSB(ip);
3831
3832         ZFS_ENTER(zfsvfs);
3833         ZFS_VERIFY_ZP(zp);
3834
3835         if ((vm_flags & VM_WRITE) && (zp->z_pflags &
3836             (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
3837                 ZFS_EXIT(zfsvfs);
3838                 return (SET_ERROR(EPERM));
3839         }
3840
3841         if ((vm_flags & (VM_READ | VM_EXEC)) &&
3842             (zp->z_pflags & ZFS_AV_QUARANTINED)) {
3843                 ZFS_EXIT(zfsvfs);
3844                 return (SET_ERROR(EACCES));
3845         }
3846
3847         if (off < 0 || len > MAXOFFSET_T - off) {
3848                 ZFS_EXIT(zfsvfs);
3849                 return (SET_ERROR(ENXIO));
3850         }
3851
3852         ZFS_EXIT(zfsvfs);
3853         return (0);
3854 }
3855
3856 /*
3857  * Free or allocate space in a file.  Currently, this function only
3858  * supports the `F_FREESP' command.  However, this command is somewhat
3859  * misnamed, as its functionality includes the ability to allocate as
3860  * well as free space.
3861  *
3862  *      IN:     zp      - znode of file to free data in.
3863  *              cmd     - action to take (only F_FREESP supported).
3864  *              bfp     - section of file to free/alloc.
3865  *              flag    - current file open mode flags.
3866  *              offset  - current file offset.
3867  *              cr      - credentials of caller.
3868  *
3869  *      RETURN: 0 on success, error code on failure.
3870  *
3871  * Timestamps:
3872  *      zp - ctime|mtime updated
3873  */
3874 /* ARGSUSED */
3875 int
3876 zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag,
3877     offset_t offset, cred_t *cr)
3878 {
3879         zfsvfs_t        *zfsvfs = ZTOZSB(zp);
3880         uint64_t        off, len;
3881         int             error;
3882
3883         ZFS_ENTER(zfsvfs);
3884         ZFS_VERIFY_ZP(zp);
3885
3886         if (cmd != F_FREESP) {
3887                 ZFS_EXIT(zfsvfs);
3888                 return (SET_ERROR(EINVAL));
3889         }
3890
3891         /*
3892          * Callers might not be able to detect properly that we are read-only,
3893          * so check it explicitly here.
3894          */
3895         if (zfs_is_readonly(zfsvfs)) {
3896                 ZFS_EXIT(zfsvfs);
3897                 return (SET_ERROR(EROFS));
3898         }
3899
3900         if (bfp->l_len < 0) {
3901                 ZFS_EXIT(zfsvfs);
3902                 return (SET_ERROR(EINVAL));
3903         }
3904
3905         /*
3906          * Permissions aren't checked on Solaris because on this OS
3907          * zfs_space() can only be called with an opened file handle.
3908          * On Linux we can get here through truncate_range() which
3909          * operates directly on inodes, so we need to check access rights.
3910          */
3911         if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) {
3912                 ZFS_EXIT(zfsvfs);
3913                 return (error);
3914         }
3915
3916         off = bfp->l_start;
3917         len = bfp->l_len; /* 0 means from off to end of file */
3918
3919         error = zfs_freesp(zp, off, len, flag, TRUE);
3920
3921         ZFS_EXIT(zfsvfs);
3922         return (error);
3923 }
3924
3925 /*ARGSUSED*/
3926 int
3927 zfs_fid(struct inode *ip, fid_t *fidp)
3928 {
3929         znode_t         *zp = ITOZ(ip);
3930         zfsvfs_t        *zfsvfs = ITOZSB(ip);
3931         uint32_t        gen;
3932         uint64_t        gen64;
3933         uint64_t        object = zp->z_id;
3934         zfid_short_t    *zfid;
3935         int             size, i, error;
3936
3937         ZFS_ENTER(zfsvfs);
3938
3939         if (fidp->fid_len < SHORT_FID_LEN) {
3940                 fidp->fid_len = SHORT_FID_LEN;
3941                 ZFS_EXIT(zfsvfs);
3942                 return (SET_ERROR(ENOSPC));
3943         }
3944
3945         ZFS_VERIFY_ZP(zp);
3946
3947         if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
3948             &gen64, sizeof (uint64_t))) != 0) {
3949                 ZFS_EXIT(zfsvfs);
3950                 return (error);
3951         }
3952
3953         gen = (uint32_t)gen64;
3954
3955         size = SHORT_FID_LEN;
3956
3957         zfid = (zfid_short_t *)fidp;
3958
3959         zfid->zf_len = size;
3960
3961         for (i = 0; i < sizeof (zfid->zf_object); i++)
3962                 zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
3963
3964         /* Must have a non-zero generation number to distinguish from .zfs */
3965         if (gen == 0)
3966                 gen = 1;
3967         for (i = 0; i < sizeof (zfid->zf_gen); i++)
3968                 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
3969
3970         ZFS_EXIT(zfsvfs);
3971         return (0);
3972 }
3973
#if defined(_KERNEL)
/* Symbols exported to other kernel modules; only built when _KERNEL is set. */
EXPORT_SYMBOL(zfs_open);
EXPORT_SYMBOL(zfs_close);
EXPORT_SYMBOL(zfs_lookup);
EXPORT_SYMBOL(zfs_create);
EXPORT_SYMBOL(zfs_tmpfile);
EXPORT_SYMBOL(zfs_remove);
EXPORT_SYMBOL(zfs_mkdir);
EXPORT_SYMBOL(zfs_rmdir);
EXPORT_SYMBOL(zfs_readdir);
EXPORT_SYMBOL(zfs_getattr_fast);
EXPORT_SYMBOL(zfs_setattr);
EXPORT_SYMBOL(zfs_rename);
EXPORT_SYMBOL(zfs_symlink);
EXPORT_SYMBOL(zfs_readlink);
EXPORT_SYMBOL(zfs_link);
EXPORT_SYMBOL(zfs_inactive);
EXPORT_SYMBOL(zfs_space);
EXPORT_SYMBOL(zfs_fid);
EXPORT_SYMBOL(zfs_getpage);
EXPORT_SYMBOL(zfs_putpage);
EXPORT_SYMBOL(zfs_dirty_inode);
EXPORT_SYMBOL(zfs_map);

/* Module parameter zfs_delete_blocks: type ulong, permissions 0644. */
/* CSTYLED */
module_param(zfs_delete_blocks, ulong, 0644);
MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async");

#endif