module/zfs/zfs_vnops.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24
  25 /* Portions Copyright 2007 Jeremy Teo */
  26 /* Portions Copyright 2010 Robert Milkowski */
  27
  28
  29 #include <sys/types.h>
  30 #include <sys/param.h>
  31 #include <sys/time.h>
  32 #include <sys/systm.h>
  33 #include <sys/sysmacros.h>
  34 #include <sys/resource.h>
  35 #include <sys/vfs.h>
  36 #include <sys/vfs_opreg.h>
  37 #include <sys/file.h>
  38 #include <sys/stat.h>
  39 #include <sys/kmem.h>
  40 #include <sys/taskq.h>
  41 #include <sys/uio.h>
  42 #include <sys/vmsystm.h>
  43 #include <sys/atomic.h>
  44 #include <vm/pvn.h>
  45 #include <sys/pathname.h>
  46 #include <sys/cmn_err.h>
  47 #include <sys/errno.h>
  48 #include <sys/unistd.h>
  49 #include <sys/zfs_dir.h>
  50 #include <sys/zfs_acl.h>
  51 #include <sys/zfs_ioctl.h>
  52 #include <sys/fs/zfs.h>
  53 #include <sys/dmu.h>
  54 #include <sys/dmu_objset.h>
  55 #include <sys/spa.h>
  56 #include <sys/txg.h>
  57 #include <sys/dbuf.h>
  58 #include <sys/zap.h>
  59 #include <sys/sa.h>
  60 #include <sys/dirent.h>
  61 #include <sys/policy.h>
  62 #include <sys/sunddi.h>
  63 #include <sys/sid.h>
  64 #include <sys/mode.h>
  65 #include "fs/fs_subr.h"
  66 #include <sys/zfs_fuid.h>
  67 #include <sys/zfs_sa.h>
  68 #include <sys/zfs_vnops.h>
  69 #include <sys/dnlc.h>
  70 #include <sys/zfs_rlock.h>
  71 #include <sys/extdirent.h>
  72 #include <sys/kidmap.h>
  73 #include <sys/cred.h>
  74 #include <sys/attr.h>
  75
  76 /*
  77  * Programming rules.
  78  *
  79  * Each vnode op performs some logical unit of work.  To do this, the ZPL must
  80  * properly lock its in-core state, create a DMU transaction, do the work,
  81  * record this work in the intent log (ZIL), commit the DMU transaction,
  82  * and wait for the intent log to commit if it is a synchronous operation.
  83  * Moreover, the vnode ops must work in both normal and log replay context.
  84  * The ordering of events is important to avoid deadlocks and references
  85  * to freed memory.  The example below illustrates the following Big Rules:
  86  *
  87  *  (1) A check must be made in each zfs thread for a mounted file system.
  88  *      This is done avoiding races using ZFS_ENTER(zsb).
  89  *      A ZFS_EXIT(zsb) is needed before all returns.  Any znodes
  90  *      must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
  91  *      can return EIO from the calling function.
  92  *
  93  *  (2) iput() should always be the last thing except for zil_commit()
  94  *      (if necessary) and ZFS_EXIT(). This is for 3 reasons:
  95  *      First, if it's the last reference, the vnode/znode
  96  *      can be freed, so the zp may point to freed memory.  Second, the last
  97  *      reference will call zfs_zinactive(), which may induce a lot of work --
  98  *      pushing cached pages (which acquires range locks) and syncing out
  99  *      cached atime changes.  Third, zfs_zinactive() may require a new tx,
 100  *      which could deadlock the system if you were already holding one.
  101  *      If you must call iput() within a tx then use iput_async().
 102  *
 103  *  (3) All range locks must be grabbed before calling dmu_tx_assign(),
 104  *      as they can span dmu_tx_assign() calls.
 105  *
 106  *  (4) Always pass TXG_NOWAIT as the second argument to dmu_tx_assign().
 107  *      This is critical because we don't want to block while holding locks.
 108  *      Note, in particular, that if a lock is sometimes acquired before
 109  *      the tx assigns, and sometimes after (e.g. z_lock), then failing to
 110  *      use a non-blocking assign can deadlock the system.  The scenario:
 111  *
 112  *      Thread A has grabbed a lock before calling dmu_tx_assign().
 113  *      Thread B is in an already-assigned tx, and blocks for this lock.
 114  *      Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
 115  *      forever, because the previous txg can't quiesce until B's tx commits.
 116  *
 117  *      If dmu_tx_assign() returns ERESTART and zsb->z_assign is TXG_NOWAIT,
 118  *      then drop all locks, call dmu_tx_wait(), and try again.
 119  *
 120  *  (5) If the operation succeeded, generate the intent log entry for it
 121  *      before dropping locks.  This ensures that the ordering of events
 122  *      in the intent log matches the order in which they actually occurred.
 123  *      During ZIL replay the zfs_log_* functions will update the sequence
 124  *      number to indicate the zil transaction has replayed.
 125  *
 126  *  (6) At the end of each vnode op, the DMU tx must always commit,
 127  *      regardless of whether there were any errors.
 128  *
 129  *  (7) After dropping all locks, invoke zil_commit(zilog, foid)
 130  *      to ensure that synchronous semantics are provided when necessary.
 131  *
 132  * In general, this is how things should be ordered in each vnode op:
 133  *
 134  *      ZFS_ENTER(zsb);         // exit if unmounted
 135  * top:
 136  *      zfs_dirent_lock(&dl, ...)       // lock directory entry (may igrab())
 137  *      rw_enter(...);                  // grab any other locks you need
 138  *      tx = dmu_tx_create(...);        // get DMU tx
 139  *      dmu_tx_hold_*();                // hold each object you might modify
 140  *      error = dmu_tx_assign(tx, TXG_NOWAIT);  // try to assign
 141  *      if (error) {
 142  *              rw_exit(...);           // drop locks
 143  *              zfs_dirent_unlock(dl);  // unlock directory entry
 144  *              iput(...);              // release held vnodes
 145  *              if (error == ERESTART) {
 146  *                      dmu_tx_wait(tx);
 147  *                      dmu_tx_abort(tx);
 148  *                      goto top;
 149  *              }
 150  *              dmu_tx_abort(tx);       // abort DMU tx
 151  *              ZFS_EXIT(zsb);  // finished in zfs
 152  *              return (error);         // really out of space
 153  *      }
 154  *      error = do_real_work();         // do whatever this VOP does
 155  *      if (error == 0)
 156  *              zfs_log_*(...);         // on success, make ZIL entry
 157  *      dmu_tx_commit(tx);              // commit DMU tx -- error or not
 158  *      rw_exit(...);                   // drop locks
 159  *      zfs_dirent_unlock(dl);          // unlock directory entry
 160  *      iput(...);                      // release held vnodes
 161  *      zil_commit(zilog, foid);        // synchronous when necessary
 162  *      ZFS_EXIT(zsb);          // finished in zfs
 163  *      return (error);                 // done, report error
 164  */
 165
 166 #if defined(_KERNEL) && defined(HAVE_MMAP)
 167 /*
 168  * Utility functions to map and unmap a single physical page.  These
 169  * are used to manage the mappable copies of ZFS file data, and therefore
 170  * do not update ref/mod bits.
 171  */
 172 caddr_t
 173 zfs_map_page(page_t *pp, enum seg_rw rw)
 174 {
 175         if (kpm_enable)
 176                 return (hat_kpm_mapin(pp, 0));
 177         ASSERT(rw == S_READ || rw == S_WRITE);
 178         return (ppmapin(pp, PROT_READ | ((rw == S_WRITE) ? PROT_WRITE : 0),
 179             (caddr_t)-1));
 180 }
 181
 182 void
 183 zfs_unmap_page(page_t *pp, caddr_t addr)
 184 {
 185         if (kpm_enable) {
 186                 hat_kpm_mapout(pp, 0, addr);
 187         } else {
 188                 ppmapout(addr);
 189         }
 190 }
 191
 192 /*
 193  * When a file is memory mapped, we must keep the IO data synchronized
 194  * between the DMU cache and the memory mapped pages.  What this means:
 195  *
 196  * On Write:    If we find a memory mapped page, we write to *both*
 197  *              the page and the dmu buffer.
 198  */
 199 static void
 200 update_pages(struct inode *ip, int64_t start, int len, objset_t *os,
 201     uint64_t oid)
 202 {
 203         int64_t off;
 204
 205         off = start & PAGEOFFSET;
 206         for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
 207                 page_t *pp;
 208                 uint64_t nbytes = MIN(PAGESIZE - off, len);
 209
 210                 if (pp = page_lookup(ip, start, SE_SHARED)) {
 211                         caddr_t va;
 212
 213                         va = zfs_map_page(pp, S_WRITE);
 214                         (void) dmu_read(os, oid, start+off, nbytes, va+off,
 215                             DMU_READ_PREFETCH);
 216                         zfs_unmap_page(pp, va);
 217                         page_unlock(pp);
 218                 }
 219                 len -= nbytes;
 220                 off = 0;
 221         }
 222 }
 223
 224 /*
 225  * When a file is memory mapped, we must keep the IO data synchronized
 226  * between the DMU cache and the memory mapped pages.  What this means:
 227  *
 228  * On Read:     We "read" preferentially from memory mapped pages,
 229  *              else we default from the dmu buffer.
 230  *
 231  * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
 232  *      the file is memory mapped.
 233  */
 234 static int
 235 mappedread(struct inode *ip, int nbytes, uio_t *uio)
 236 {
 237         znode_t *zp = ITOZ(ip);
 238         objset_t *os = ITOZSB(ip)->z_os;
 239         int64_t start, off;
 240         int len = nbytes;
 241         int error = 0;
 242
 243         start = uio->uio_loffset;
 244         off = start & PAGEOFFSET;
 245         for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
 246                 page_t *pp;
 247                 uint64_t bytes = MIN(PAGESIZE - off, len);
 248
 249                 if (pp = page_lookup(ip, start, SE_SHARED)) {
 250                         caddr_t va;
 251
 252                         va = zfs_map_page(pp, S_READ);
 253                         error = uiomove(va + off, bytes, UIO_READ, uio);
 254                         zfs_unmap_page(pp, va);
 255                         page_unlock(pp);
 256                 } else {
 257                         error = dmu_read_uio(os, zp->z_id, uio, bytes);
 258                 }
 259                 len -= bytes;
 260                 off = 0;
 261                 if (error)
 262                         break;
 263         }
 264         return (error);
 265 }
 266 #endif /* _KERNEL && HAVE_MMAP */
 267
  268 offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable: max bytes per zfs_read() chunk */
 269
  270 /*
  271  * Read bytes from specified file into supplied buffer.
  272  *
  273  *      IN:     ip      - inode of file to be read from.
  274  *              uio     - structure supplying read location, range info,
  275  *                        and return buffer.
  276  *              ioflag  - SYNC flags; used to provide FRSYNC semantics.
  277  *              cr      - credentials of caller.
  278  *
  279  *      OUT:    uio     - updated offset and range, buffer filled.
  280  *
  281  *      RETURN: 0 if success (including empty reads and reads at/past EOF)
  282  *              error code if failure (ECKSUM is reported as EIO)
  283  *
  284  * Side Effects:
  285  *      inode - atime updated if byte count > 0
  286  */
  287 /* ARGSUSED */
  288 int
  289 zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
  290 {
  291         znode_t         *zp = ITOZ(ip);
  292         zfs_sb_t        *zsb = ITOZSB(ip);
  293         objset_t        *os;
  294         ssize_t         n, nbytes;
  295         int             error = 0;
  296         rl_t            *rl;
  297 #ifdef HAVE_UIO_ZEROCOPY
  298         xuio_t          *xuio = NULL;
  299 #endif /* HAVE_UIO_ZEROCOPY */
  300
  301         ZFS_ENTER(zsb);
  302         ZFS_VERIFY_ZP(zp);
  303         os = zsb->z_os;
  304
  305         if (zp->z_pflags & ZFS_AV_QUARANTINED) {
  306                 ZFS_EXIT(zsb);
  307                 return (EACCES);
  308         }
  309
  310         /*
  311          * Validate file offset
  312          */
  313         if (uio->uio_loffset < (offset_t)0) {
  314                 ZFS_EXIT(zsb);
  315                 return (EINVAL);
  316         }
  317
  318         /*
  319          * Fasttrack empty reads
  320          */
  321         if (uio->uio_resid == 0) {
  322                 ZFS_EXIT(zsb);
  323                 return (0);
  324         }
  325
  326 #ifdef HAVE_MANDLOCKS
  327         /*
  328          * Check for mandatory locks.  NOTE(review): 'ct' is not declared here.
  329          */
  330         if (MANDMODE(zp->z_mode)) {
  331                 if (error = chklock(ip, FREAD,
  332                     uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
  333                         ZFS_EXIT(zsb);
  334                         return (error);
  335                 }
  336         }
  337 #endif /* HAVE_MANDLOCKS */
  338
  339         /*
  340          * If in FRSYNC mode or os_sync is ZFS_SYNC_ALWAYS, commit the ZIL first.
  341          */
  342         if (ioflag & FRSYNC || zsb->z_os->os_sync == ZFS_SYNC_ALWAYS)
  343                 zil_commit(zsb->z_log, zp->z_id);
  344
  345         /*
  346          * Lock the range against changes.
  347          */
  348         rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
  349
  350         /*
  351          * If we are reading past end-of-file we can skip
  352          * to the end; but we might still need to set atime.
  353          */
  354         if (uio->uio_loffset >= zp->z_size) {
  355                 error = 0;
  356                 goto out;
  357         }
  358
  359         ASSERT(uio->uio_loffset < zp->z_size);
  360         n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
  361
  362 #ifdef HAVE_UIO_ZEROCOPY
  363         if ((uio->uio_extflg == UIO_XUIO) &&
  364             (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
  365                 int nblk;
  366                 int blksz = zp->z_blksz;
  367                 uint64_t offset = uio->uio_loffset;
  368
  369                 xuio = (xuio_t *)uio;
  370                 if ((ISP2(blksz))) {
  371                         nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
  372                             blksz)) / blksz;
  373                 } else {
  374                         ASSERT(offset + n <= blksz);
  375                         nblk = 1;
  376                 }
  377                 (void) dmu_xuio_init(xuio, nblk);
  378
  379                 if (vn_has_cached_data(ip)) {
  380                         /*
  381                          * For simplicity, we always allocate a full buffer
  382                          * even if we only expect to read a portion of a block.
  383                          */
  384                         while (--nblk >= 0) {
  385                                 (void) dmu_xuio_add(xuio,
  386                                     dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
  387                                     blksz), 0, blksz);
  388                         }
  389                 }
  390         }
  391 #endif /* HAVE_UIO_ZEROCOPY */
  392
  393         while (n > 0) {
  394                 nbytes = MIN(n, zfs_read_chunk_size -
  395                     P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
  396
  397 /* XXX: Drop this, ARC update handled by zpl layer */
  398 #ifdef HAVE_MMAP
  399                 if (vn_has_cached_data(ip))
  400                         error = mappedread(ip, nbytes, uio);
  401                 else
  402                         error = dmu_read_uio(os, zp->z_id, uio, nbytes);
  403 #else
  404                 error = dmu_read_uio(os, zp->z_id, uio, nbytes);
  405 #endif /* HAVE_MMAP */
  406                 if (error) {
  407                         /* convert checksum errors into IO errors */
  408                         if (error == ECKSUM)
  409                                 error = EIO;
  410                         break;
  411                 }
  412
  413                 n -= nbytes;
  414         }
  415 out:
  416         zfs_range_unlock(rl);
  417
  418         ZFS_ACCESSTIME_STAMP(zsb, zp);
  419         zfs_inode_update(zp);
  420         ZFS_EXIT(zsb);
  421         return (error);
  422 }
  423 EXPORT_SYMBOL(zfs_read);
 424
  425 /*
  426  * Write the bytes to a file.
  427  *
  428  *      IN:     ip      - inode of file to be written to.
  429  *              uio     - structure supplying write location, range info,
  430  *                        and data buffer.
  431  *              ioflag  - FAPPEND flag set if in append mode.
  432  *              cr      - credentials of caller.
  433  *
  434  *      OUT:    uio     - updated offset and range.
  435  *
  436  *      RETURN: 0 if success (outside replay, a partial write returns 0)
  437  *              error code if failure
  438  *
  439  * Timestamps:
  440  *      ip - ctime|mtime updated if byte count > 0
  441  */
  442
  443 /* ARGSUSED */
  444 int
  445 zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
  446 {
  447         znode_t         *zp = ITOZ(ip);
  448         rlim64_t        limit = uio->uio_limit;
  449         ssize_t         start_resid = uio->uio_resid;
  450         ssize_t         tx_bytes;
  451         uint64_t        end_size;
  452         dmu_tx_t        *tx;
  453         zfs_sb_t        *zsb = ZTOZSB(zp);
  454         zilog_t         *zilog;
  455         offset_t        woff;
  456         ssize_t         n, nbytes;
  457         rl_t            *rl;
  458         int             max_blksz = zsb->z_max_blksz;
  459         int             error = 0;
  460         arc_buf_t       *abuf;
  461         iovec_t         *aiov = NULL;
  462         xuio_t          *xuio = NULL;
  463         int             i_iov = 0;
  464         iovec_t         *iovp = uio->uio_iov;
  465         int             write_eof;
  466         int             count = 0;
  467         sa_bulk_attr_t  bulk[4];
  468         uint64_t        mtime[2], ctime[2];
  469         ASSERTV(int     iovcnt = uio->uio_iovcnt);
  470
  471         /*
  472          * Fasttrack empty write
  473          */
  474         n = start_resid;
  475         if (n == 0)
  476                 return (0);
  477
  478         if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
  479                 limit = MAXOFFSET_T;
  480
  481         ZFS_ENTER(zsb);
  482         ZFS_VERIFY_ZP(zp);
  483
  484         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zsb), NULL, &mtime, 16);
  485         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zsb), NULL, &ctime, 16);
  486         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zsb), NULL, &zp->z_size, 8);
  487         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zsb), NULL,
  488             &zp->z_pflags, 8);
  489
  490         /*
  491          * If immutable/read-only, or append-only and not appending, return EPERM
  492          */
  493         if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
  494             ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
  495             (uio->uio_loffset < zp->z_size))) {
  496                 ZFS_EXIT(zsb);
  497                 return (EPERM);
  498         }
  499
  500         zilog = zsb->z_log;
  501
  502         /*
  503          * Validate file offset
  504          */
  505         woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
  506         if (woff < 0) {
  507                 ZFS_EXIT(zsb);
  508                 return (EINVAL);
  509         }
  510
  511 #ifdef HAVE_MANDLOCKS
  512         /*
  513          * Check mandatory locks before zfs_range_lock() to avoid deadlock
  514          * with fcntl() locks.  NOTE(review): 'ct' is not declared here.
  515          */
  516         if (MANDMODE((mode_t)zp->z_mode) &&
  517             (error = chklock(ip, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
  518                 ZFS_EXIT(zsb);
  519                 return (error);
  520         }
  521 #endif /* HAVE_MANDLOCKS */
  522
  523 #ifdef HAVE_UIO_ZEROCOPY
  524         /*
  525          * Pre-fault the pages to ensure slow (eg NFS) pages
  526          * don't hold up txg.
  527          * Skip this if uio contains loaned arc_buf.
  528          */
  529         if ((uio->uio_extflg == UIO_XUIO) &&
  530             (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
  531                 xuio = (xuio_t *)uio;
  532         else
  533                 uio_prefaultpages(MIN(n, max_blksz), uio);
  534 #endif /* HAVE_UIO_ZEROCOPY */
  535
  536         /*
  537          * If in append mode, set the io offset pointer to eof.
  538          */
  539         if (ioflag & FAPPEND) {
  540                 /*
  541                  * Obtain an appending range lock to guarantee file append
  542                  * semantics.  We reset the write offset once we have the lock.
  543                  */
  544                 rl = zfs_range_lock(zp, 0, n, RL_APPEND);
  545                 woff = rl->r_off;
  546                 if (rl->r_len == UINT64_MAX) {
  547                         /*
  548                          * We overlocked the file because this write will cause
  549                          * the file block size to increase.
  550                          * Note that z_size cannot change with this lock held.
  551                          */
  552                         woff = zp->z_size;
  553                 }
  554                 uio->uio_loffset = woff;
  555         } else {
  556                 /*
  557                  * Note that if the file block size will change as a result of
  558                  * this write, then this range lock will lock the entire file
  559                  * so that we can re-write the block safely.
  560                  */
  561                 rl = zfs_range_lock(zp, woff, n, RL_WRITER);
  562         }
  563
  564         if (woff >= limit) {
  565                 zfs_range_unlock(rl);
  566                 ZFS_EXIT(zsb);
  567                 return (EFBIG);
  568         }
  569
  570         if ((woff + n) > limit || woff > (limit - n))
  571                 n = limit - woff;
  572
  573         /* Will this write extend the file length? */
  574         write_eof = (woff + n > zp->z_size);
  575
  576         end_size = MAX(zp->z_size, woff + n);
  577
  578         /*
  579          * Write the file in reasonable size chunks.  Each chunk is written
  580          * in a separate transaction; this keeps the intent log records small
  581          * and allows us to do more fine-grained space accounting.
  582          */
  583         while (n > 0) {
  584                 abuf = NULL;
  585                 woff = uio->uio_loffset;
  586 again:
  587                 if (zfs_owner_overquota(zsb, zp, B_FALSE) ||
  588                     zfs_owner_overquota(zsb, zp, B_TRUE)) {
  589                         if (abuf != NULL)
  590                                 dmu_return_arcbuf(abuf);
  591                         error = EDQUOT;
  592                         break;
  593                 }
  594
  595                 if (xuio && abuf == NULL) {
  596                         ASSERT(i_iov < iovcnt);
  597                         aiov = &iovp[i_iov];
  598                         abuf = dmu_xuio_arcbuf(xuio, i_iov);
  599                         dmu_xuio_clear(xuio, i_iov);
  600                         ASSERT((aiov->iov_base == abuf->b_data) ||
  601                             ((char *)aiov->iov_base - (char *)abuf->b_data +
  602                             aiov->iov_len == arc_buf_size(abuf)));
  603                         i_iov++;
  604                 } else if (abuf == NULL && n >= max_blksz &&
  605                     woff >= zp->z_size &&
  606                     P2PHASE(woff, max_blksz) == 0 &&
  607                     zp->z_blksz == max_blksz) {
  608                         /*
  609                          * This write covers a full block.  "Borrow" a buffer
  610                          * from the dmu so that we can fill it before we enter
  611                          * a transaction.  This avoids the possibility of
  612                          * holding up the transaction if the data copy hangs
  613                          * up on a pagefault (e.g., from an NFS server mapping).
  614                          */
  615                         size_t cbytes;
  616
  617                         abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
  618                             max_blksz);
  619                         ASSERT(abuf != NULL);
  620                         ASSERT(arc_buf_size(abuf) == max_blksz);
  621                         if ((error = uiocopy(abuf->b_data, max_blksz,
  622                             UIO_WRITE, uio, &cbytes))) {
  623                                 dmu_return_arcbuf(abuf);
  624                                 break;
  625                         }
  626                         ASSERT(cbytes == max_blksz);
  627                 }
  628
  629                 /*
  630                  * Start a transaction.
  631                  */
  632                 tx = dmu_tx_create(zsb->z_os);
  633                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
  634                 dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
  635                 zfs_sa_upgrade_txholds(tx, zp);
  636                 error = dmu_tx_assign(tx, TXG_NOWAIT);
  637                 if (error) {
  638                         if (error == ERESTART) {
  639                                 dmu_tx_wait(tx);
  640                                 dmu_tx_abort(tx);
  641                                 goto again;
  642                         }
  643                         dmu_tx_abort(tx);
  644                         if (abuf != NULL)
  645                                 dmu_return_arcbuf(abuf);
  646                         break;
  647                 }
  648
  649                 /*
  650                  * If zfs_range_lock() over-locked we grow the blocksize
  651                  * and then reduce the lock range.  This will only happen
  652                  * on the first iteration since zfs_range_reduce() will
  653                  * shrink down r_len to the appropriate size.
  654                  */
  655                 if (rl->r_len == UINT64_MAX) {
  656                         uint64_t new_blksz;
  657
  658                         if (zp->z_blksz > max_blksz) {
  659                                 ASSERT(!ISP2(zp->z_blksz));
  660                                 new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
  661                         } else {
  662                                 new_blksz = MIN(end_size, max_blksz);
  663                         }
  664                         zfs_grow_blocksize(zp, new_blksz, tx);
  665                         zfs_range_reduce(rl, woff, n);
  666                 }
  667
  668                 /*
  669                  * XXX - should we really limit each write to z_max_blksz?
  670                  * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
  671                  */
  672                 nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
  673
  674                 if (abuf == NULL) {
  675                         tx_bytes = uio->uio_resid;
  676                         error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
  677                             uio, nbytes, tx);
  678                         tx_bytes -= uio->uio_resid;
  679                 } else {
  680                         tx_bytes = nbytes;
  681                         ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
  682                         /*
  683                          * If this is not a full block write, but we are
  684                          * extending the file past EOF and this data starts
  685                          * block-aligned, use dmu_assign_arcbuf().  Otherwise,
  686                          * write via dmu_write().
  687                          */
  688                         if (tx_bytes < max_blksz && (!write_eof ||
  689                             aiov->iov_base != abuf->b_data)) {
  690                                 ASSERT(xuio);
  691                                 dmu_write(zsb->z_os, zp->z_id, woff,
  692                                     aiov->iov_len, aiov->iov_base, tx);
  693                                 dmu_return_arcbuf(abuf);
  694                                 xuio_stat_wbuf_copied();
  695                         } else {
  696                                 ASSERT(xuio || tx_bytes == max_blksz);
  697                                 dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
  698                                     woff, abuf, tx);
  699                         }
  700                         ASSERT(tx_bytes <= uio->uio_resid);
  701                         uioskip(uio, tx_bytes);
  702                 }
  703 /* XXX: Drop this, ARC update handled by zpl layer */
  704 #ifdef HAVE_MMAP
  705                 if (tx_bytes && vn_has_cached_data(ip)) {
  706                         update_pages(ip, woff,
  707                             tx_bytes, zsb->z_os, zp->z_id);
  708                 }
  709 #endif /* HAVE_MMAP */
  710
  711                 /*
  712                  * If we made no progress, we're done.  If we made even
  713                  * partial progress, update the znode and ZIL accordingly.
  714                  */
  715                 if (tx_bytes == 0) {
  716                         (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zsb),
  717                             (void *)&zp->z_size, sizeof (uint64_t), tx);
  718                         dmu_tx_commit(tx);
  719                         ASSERT(error != 0);
  720                         break;
  721                 }
  722
  723                 /*
  724                  * Clear Set-UID/Set-GID bits on successful write if not
  725                  * privileged and at least one of the execute bits is set.
  726                  *
  727                  * It would be nice to do this after all writes have
  728                  * been done, but that would still expose the ISUID/ISGID
  729                  * to another app after the partial write is committed.
  730                  *
  731                  * Note: we don't call zfs_fuid_map_id() here because
  732                  * user 0 is not an ephemeral uid.
  733                  */
  734                 mutex_enter(&zp->z_acl_lock);
  735                 if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
  736                     (S_IXUSR >> 6))) != 0 &&
  737                     (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
  738                     secpolicy_vnode_setid_retain(cr,
  739                     (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
  740                         uint64_t newmode;
  741                         zp->z_mode &= ~(S_ISUID | S_ISGID);
  742                         newmode = zp->z_mode;
  743                         (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zsb),
  744                             (void *)&newmode, sizeof (uint64_t), tx);
  745                 }
  746                 mutex_exit(&zp->z_acl_lock);
  747
  748                 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
  749                     B_TRUE);
  750
  751                 /*
  752                  * Update the file size (z_size) if it has changed;
  753                  * account for possible concurrent updates.
  754                  */
  755                 while ((end_size = zp->z_size) < uio->uio_loffset) {
  756                         (void) atomic_cas_64(&zp->z_size, end_size,
  757                             uio->uio_loffset);
  758                         ASSERT(error == 0);
  759                 }
  760                 /*
  761                  * If we are replaying and eof is non zero then force
  762                  * the file size to the specified eof. Note, there's no
  763                  * concurrency during replay.
  764                  */
  765                 if (zsb->z_replay && zsb->z_replay_eof != 0)
  766                         zp->z_size = zsb->z_replay_eof;
  767
  768                 error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
  769
  770                 zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
  771                 dmu_tx_commit(tx);
  772
  773                 if (error != 0)
  774                         break;
  775                 ASSERT(tx_bytes == nbytes);
  776                 n -= nbytes;
  777
  778                 if (!xuio && n > 0)
  779                         uio_prefaultpages(MIN(n, max_blksz), uio);
  780         }
  781
  782         zfs_range_unlock(rl);
  783
  784         /*
  785          * If we're in replay mode, or we made no progress, return error.
  786          * Otherwise, it's at least a partial write, so it's successful.
  787          */
  788         if (zsb->z_replay || uio->uio_resid == start_resid) {
  789                 ZFS_EXIT(zsb);
  790                 return (error);
  791         }
  792
  793         if (ioflag & (FSYNC | FDSYNC) ||
  794             zsb->z_os->os_sync == ZFS_SYNC_ALWAYS)
  795                 zil_commit(zilog, zp->z_id);
  796
  797         zfs_inode_update(zp);
  798         ZFS_EXIT(zsb);
  799         return (0);
  800 }
  801 EXPORT_SYMBOL(zfs_write);
 802
 803 static void
 804 iput_async(struct inode *ip, taskq_t *taskq)
 805 {
 806         ASSERT(atomic_read(&ip->i_count) > 0);
 807         if (atomic_read(&ip->i_count) == 1)
 808                 taskq_dispatch(taskq, (task_func_t *)iput, ip, TQ_SLEEP);
 809         else
 810                 iput(ip);
 811 }
 812
 813 void
 814 zfs_get_done(zgd_t *zgd, int error)
 815 {
 816         znode_t *zp = zgd->zgd_private;
 817         objset_t *os = ZTOZSB(zp)->z_os;
 818
 819         if (zgd->zgd_db)
 820                 dmu_buf_rele(zgd->zgd_db, zgd);
 821
 822         zfs_range_unlock(zgd->zgd_rl);
 823
 824         /*
 825          * Release the vnode asynchronously as we currently have the
 826          * txg stopped from syncing.
 827          */
 828         iput_async(ZTOI(zp), dsl_pool_iput_taskq(dmu_objset_pool(os)));
 829
 830         if (error == 0 && zgd->zgd_bp)
 831                 zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
 832
 833         kmem_free(zgd, sizeof (zgd_t));
 834 }
 835
 836 #ifdef DEBUG
 837 static int zil_fault_io = 0;
 838 #endif
 839
 840 /*
 841  * Get data to generate a TX_WRITE intent log record.
 842  */
 843 int
 844 zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
 845 {
 846         zfs_sb_t *zsb = arg;
 847         objset_t *os = zsb->z_os;
 848         znode_t *zp;
 849         uint64_t object = lr->lr_foid;
 850         uint64_t offset = lr->lr_offset;
 851         uint64_t size = lr->lr_length;
 852         blkptr_t *bp = &lr->lr_blkptr;
 853         dmu_buf_t *db;
 854         zgd_t *zgd;
 855         int error = 0;
 856
 857         ASSERT(zio != NULL);
 858         ASSERT(size != 0);
 859
 860         /*
 861          * Nothing to do if the file has been removed
 862          */
 863         if (zfs_zget(zsb, object, &zp) != 0)
 864                 return (ENOENT);
 865         if (zp->z_unlinked) {
 866                 /*
 867                  * Release the vnode asynchronously as we currently have the
 868                  * txg stopped from syncing.
 869                  */
 870                 iput_async(ZTOI(zp), dsl_pool_iput_taskq(dmu_objset_pool(os)));
 871                 return (ENOENT);
 872         }
 873
 874         zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
 875         zgd->zgd_zilog = zsb->z_log;
 876         zgd->zgd_private = zp;
 877
 878         /*
 879          * Write records come in two flavors: immediate and indirect.
 880          * For small writes it's cheaper to store the data with the
 881          * log record (immediate); for large writes it's cheaper to
 882          * sync the data and get a pointer to it (indirect) so that
 883          * we don't have to write the data twice.
 884          */
 885         if (buf != NULL) { /* immediate write */
 886                 zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
 887                 /* test for truncation needs to be done while range locked */
 888                 if (offset >= zp->z_size) {
 889                         error = ENOENT;
 890                 } else {
 891                         error = dmu_read(os, object, offset, size, buf,
 892                             DMU_READ_NO_PREFETCH);
 893                 }
 894                 ASSERT(error == 0 || error == ENOENT);
 895         } else { /* indirect write */
 896                 /*
 897                  * Have to lock the whole block to ensure when it's
 898                  * written out and it's checksum is being calculated
 899                  * that no one can change the data. We need to re-check
 900                  * blocksize after we get the lock in case it's changed!
 901                  */
 902                 for (;;) {
 903                         uint64_t blkoff;
 904                         size = zp->z_blksz;
 905                         blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
 906                         offset -= blkoff;
 907                         zgd->zgd_rl = zfs_range_lock(zp, offset, size,
 908                             RL_READER);
 909                         if (zp->z_blksz == size)
 910                                 break;
 911                         offset += blkoff;
 912                         zfs_range_unlock(zgd->zgd_rl);
 913                 }
 914                 /* test for truncation needs to be done while range locked */
 915                 if (lr->lr_offset >= zp->z_size)
 916                         error = ENOENT;
 917 #ifdef DEBUG
 918                 if (zil_fault_io) {
 919                         error = EIO;
 920                         zil_fault_io = 0;
 921                 }
 922 #endif
 923                 if (error == 0)
 924                         error = dmu_buf_hold(os, object, offset, zgd, &db,
 925                             DMU_READ_NO_PREFETCH);
 926
 927                 if (error == 0) {
 928                         zgd->zgd_db = db;
 929                         zgd->zgd_bp = bp;
 930
 931                         ASSERT(db->db_offset == offset);
 932                         ASSERT(db->db_size == size);
 933
 934                         error = dmu_sync(zio, lr->lr_common.lrc_txg,
 935                             zfs_get_done, zgd);
 936                         ASSERT(error || lr->lr_length <= zp->z_blksz);
 937
 938                         /*
 939                          * On success, we need to wait for the write I/O
 940                          * initiated by dmu_sync() to complete before we can
 941                          * release this dbuf.  We will finish everything up
 942                          * in the zfs_get_done() callback.
 943                          */
 944                         if (error == 0)
 945                                 return (0);
 946
 947                         if (error == EALREADY) {
 948                                 lr->lr_common.lrc_txtype = TX_WRITE2;
 949                                 error = 0;
 950                         }
 951                 }
 952         }
 953
 954         zfs_get_done(zgd, error);
 955
 956         return (error);
 957 }
 958
 959 /*ARGSUSED*/
 960 int
 961 zfs_access(struct inode *ip, int mode, int flag, cred_t *cr)
 962 {
 963         znode_t *zp = ITOZ(ip);
 964         zfs_sb_t *zsb = ITOZSB(ip);
 965         int error;
 966
 967         ZFS_ENTER(zsb);
 968         ZFS_VERIFY_ZP(zp);
 969
 970         if (flag & V_ACE_MASK)
 971                 error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
 972         else
 973                 error = zfs_zaccess_rwx(zp, mode, flag, cr);
 974
 975         ZFS_EXIT(zsb);
 976         return (error);
 977 }
 978 EXPORT_SYMBOL(zfs_access);
 979
 980 /*
 981  * Lookup an entry in a directory, or an extended attribute directory.
 982  * If it exists, return a held inode reference for it.
 983  *
 984  *      IN:     dip     - inode of directory to search.
 985  *              nm      - name of entry to lookup.
 986  *              flags   - LOOKUP_XATTR set if looking for an attribute.
 987  *              cr      - credentials of caller.
 988  *              direntflags - directory lookup flags
 989  *              realpnp - returned pathname.
 990  *
 991  *      OUT:    ipp     - inode of located entry, NULL if not found.
 992  *
 993  *      RETURN: 0 if success
 994  *              error code if failure
 995  *
 996  * Timestamps:
 997  *      NA
 998  */
 999 /* ARGSUSED */
1000 int
1001 zfs_lookup(struct inode *dip, char *nm, struct inode **ipp, int flags,
1002     cred_t *cr, int *direntflags, pathname_t *realpnp)
1003 {
1004         znode_t *zdp = ITOZ(dip);
1005         zfs_sb_t *zsb = ITOZSB(dip);
1006         int error = 0;
1007
1008         /* fast path */
1009         if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
1010
1011                 if (!S_ISDIR(dip->i_mode)) {
1012                         return (ENOTDIR);
1013                 } else if (zdp->z_sa_hdl == NULL) {
1014                         return (EIO);
1015                 }
1016
1017                 if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
1018                         error = zfs_fastaccesschk_execute(zdp, cr);
1019                         if (!error) {
1020                                 *ipp = dip;
1021                                 igrab(*ipp);
1022                                 return (0);
1023                         }
1024                         return (error);
1025 #ifdef HAVE_DNLC
1026                 } else {
1027                         vnode_t *tvp = dnlc_lookup(dvp, nm);
1028
1029                         if (tvp) {
1030                                 error = zfs_fastaccesschk_execute(zdp, cr);
1031                                 if (error) {
1032                                         iput(tvp);
1033                                         return (error);
1034                                 }
1035                                 if (tvp == DNLC_NO_VNODE) {
1036                                         iput(tvp);
1037                                         return (ENOENT);
1038                                 } else {
1039                                         *vpp = tvp;
1040                                         return (specvp_check(vpp, cr));
1041                                 }
1042                         }
1043 #endif /* HAVE_DNLC */
1044                 }
1045         }
1046
1047         ZFS_ENTER(zsb);
1048         ZFS_VERIFY_ZP(zdp);
1049
1050         *ipp = NULL;
1051
1052         if (flags & LOOKUP_XATTR) {
1053                 /*
1054                  * If the xattr property is off, refuse the lookup request.
1055                  */
1056                 if (!(zsb->z_flags & ZSB_XATTR_USER)) {
1057                         ZFS_EXIT(zsb);
1058                         return (EINVAL);
1059                 }
1060
1061                 /*
1062                  * We don't allow recursive attributes..
1063                  * Maybe someday we will.
1064                  */
1065                 if (zdp->z_pflags & ZFS_XATTR) {
1066                         ZFS_EXIT(zsb);
1067                         return (EINVAL);
1068                 }
1069
1070                 if ((error = zfs_get_xattrdir(zdp, ipp, cr, flags))) {
1071                         ZFS_EXIT(zsb);
1072                         return (error);
1073                 }
1074
1075                 /*
1076                  * Do we have permission to get into attribute directory?
1077                  */
1078
1079                 if ((error = zfs_zaccess(ITOZ(*ipp), ACE_EXECUTE, 0,
1080                     B_FALSE, cr))) {
1081                         iput(*ipp);
1082                         *ipp = NULL;
1083                 }
1084
1085                 ZFS_EXIT(zsb);
1086                 return (error);
1087         }
1088
1089         if (!S_ISDIR(dip->i_mode)) {
1090                 ZFS_EXIT(zsb);
1091                 return (ENOTDIR);
1092         }
1093
1094         /*
1095          * Check accessibility of directory.
1096          */
1097
1098         if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr))) {
1099                 ZFS_EXIT(zsb);
1100                 return (error);
1101         }
1102
1103         if (zsb->z_utf8 && u8_validate(nm, strlen(nm),
1104             NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1105                 ZFS_EXIT(zsb);
1106                 return (EILSEQ);
1107         }
1108
1109         error = zfs_dirlook(zdp, nm, ipp, flags, direntflags, realpnp);
1110         if ((error == 0) && (*ipp))
1111                 zfs_inode_update(ITOZ(*ipp));
1112
1113         ZFS_EXIT(zsb);
1114         return (error);
1115 }
1116 EXPORT_SYMBOL(zfs_lookup);
1117
1118 /*
1119  * Attempt to create a new entry in a directory.  If the entry
1120  * already exists, truncate the file if permissible, else return
1121  * an error.  Return the ip of the created or trunc'd file.
1122  *
1123  *      IN:     dip     - inode of directory to put new file entry in.
1124  *              name    - name of new file entry.
1125  *              vap     - attributes of new file.
1126  *              excl    - flag indicating exclusive or non-exclusive mode.
1127  *              mode    - mode to open file with.
1128  *              cr      - credentials of caller.
1129  *              flag    - large file flag [UNUSED].
1130  *              vsecp   - ACL to be set
1131  *
1132  *      OUT:    ipp     - inode of created or trunc'd entry.
1133  *
1134  *      RETURN: 0 if success
1135  *              error code if failure
1136  *
1137  * Timestamps:
1138  *      dip - ctime|mtime updated if new entry created
1139  *       ip - ctime|mtime always, atime if new
1140  */
1141
1142 /* ARGSUSED */
1143 int
1144 zfs_create(struct inode *dip, char *name, vattr_t *vap, int excl,
1145     int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp)
1146 {
1147         znode_t         *zp, *dzp = ITOZ(dip);
1148         zfs_sb_t        *zsb = ITOZSB(dip);
1149         zilog_t         *zilog;
1150         objset_t        *os;
1151         zfs_dirlock_t   *dl;
1152         dmu_tx_t        *tx;
1153         int             error;
1154         uid_t           uid;
1155         gid_t           gid;
1156         zfs_acl_ids_t   acl_ids;
1157         boolean_t       fuid_dirtied;
1158         boolean_t       have_acl = B_FALSE;
1159
1160         /*
1161          * If we have an ephemeral id, ACL, or XVATTR then
1162          * make sure file system is at proper version
1163          */
1164
1165         gid = crgetgid(cr);
1166         uid = crgetuid(cr);
1167
1168         if (zsb->z_use_fuids == B_FALSE &&
1169             (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1170                 return (EINVAL);
1171
1172         ZFS_ENTER(zsb);
1173         ZFS_VERIFY_ZP(dzp);
1174         os = zsb->z_os;
1175         zilog = zsb->z_log;
1176
1177         if (zsb->z_utf8 && u8_validate(name, strlen(name),
1178             NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1179                 ZFS_EXIT(zsb);
1180                 return (EILSEQ);
1181         }
1182
1183 #ifdef HAVE_XVATTR
1184         if (vap->va_mask & AT_XVATTR) {
1185                 if ((error = secpolicy_xvattr((xvattr_t *)vap,
1186                     crgetuid(cr), cr, vap->va_mode)) != 0) {
1187                         ZFS_EXIT(zsb);
1188                         return (error);
1189                 }
1190         }
1191 #endif /* HAVE_XVATTR */
1192
1193 top:
1194         *ipp = NULL;
1195         if (*name == '\0') {
1196                 /*
1197                  * Null component name refers to the directory itself.
1198                  */
1199                 igrab(dip);
1200                 zp = dzp;
1201                 dl = NULL;
1202                 error = 0;
1203         } else {
1204                 /* possible igrab(zp) */
1205                 int zflg = 0;
1206
1207                 if (flag & FIGNORECASE)
1208                         zflg |= ZCILOOK;
1209
1210                 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1211                     NULL, NULL);
1212                 if (error) {
1213                         if (have_acl)
1214                                 zfs_acl_ids_free(&acl_ids);
1215                         if (strcmp(name, "..") == 0)
1216                                 error = EISDIR;
1217                         ZFS_EXIT(zsb);
1218                         return (error);
1219                 }
1220         }
1221
1222         if (zp == NULL) {
1223                 uint64_t txtype;
1224
1225                 /*
1226                  * Create a new file object and update the directory
1227                  * to reference it.
1228                  */
1229                 if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
1230                         if (have_acl)
1231                                 zfs_acl_ids_free(&acl_ids);
1232                         goto out;
1233                 }
1234
1235                 /*
1236                  * We only support the creation of regular files in
1237                  * extended attribute directories.
1238                  */
1239
1240                 if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) {
1241                         if (have_acl)
1242                                 zfs_acl_ids_free(&acl_ids);
1243                         error = EINVAL;
1244                         goto out;
1245                 }
1246
1247                 if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
1248                     cr, vsecp, &acl_ids)) != 0)
1249                         goto out;
1250                 have_acl = B_TRUE;
1251
1252                 if (zfs_acl_ids_overquota(zsb, &acl_ids)) {
1253                         zfs_acl_ids_free(&acl_ids);
1254                         error = EDQUOT;
1255                         goto out;
1256                 }
1257
1258                 tx = dmu_tx_create(os);
1259
1260                 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1261                     ZFS_SA_BASE_ATTR_SIZE);
1262
1263                 fuid_dirtied = zsb->z_fuid_dirty;
1264                 if (fuid_dirtied)
1265                         zfs_fuid_txhold(zsb, tx);
1266                 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
1267                 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
1268                 if (!zsb->z_use_sa &&
1269                     acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1270                         dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1271                             0, acl_ids.z_aclp->z_acl_bytes);
1272                 }
1273                 error = dmu_tx_assign(tx, TXG_NOWAIT);
1274                 if (error) {
1275                         zfs_dirent_unlock(dl);
1276                         if (error == ERESTART) {
1277                                 dmu_tx_wait(tx);
1278                                 dmu_tx_abort(tx);
1279                                 goto top;
1280                         }
1281                         zfs_acl_ids_free(&acl_ids);
1282                         dmu_tx_abort(tx);
1283                         ZFS_EXIT(zsb);
1284                         return (error);
1285                 }
1286                 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1287
1288                 if (fuid_dirtied)
1289                         zfs_fuid_sync(zsb, tx);
1290
1291                 (void) zfs_link_create(dl, zp, tx, ZNEW);
1292                 txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
1293                 if (flag & FIGNORECASE)
1294                         txtype |= TX_CI;
1295                 zfs_log_create(zilog, tx, txtype, dzp, zp, name,
1296                     vsecp, acl_ids.z_fuidp, vap);
1297                 zfs_acl_ids_free(&acl_ids);
1298                 dmu_tx_commit(tx);
1299         } else {
1300                 int aflags = (flag & FAPPEND) ? V_APPEND : 0;
1301
1302                 if (have_acl)
1303                         zfs_acl_ids_free(&acl_ids);
1304                 have_acl = B_FALSE;
1305
1306                 /*
1307                  * A directory entry already exists for this name.
1308                  */
1309                 /*
1310                  * Can't truncate an existing file if in exclusive mode.
1311                  */
1312                 if (excl) {
1313                         error = EEXIST;
1314                         goto out;
1315                 }
1316                 /*
1317                  * Can't open a directory for writing.
1318                  */
1319                 if (S_ISDIR(ZTOI(zp)->i_mode)) {
1320                         error = EISDIR;
1321                         goto out;
1322                 }
1323                 /*
1324                  * Verify requested access to file.
1325                  */
1326                 if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
1327                         goto out;
1328                 }
1329
1330                 mutex_enter(&dzp->z_lock);
1331                 dzp->z_seq++;
1332                 mutex_exit(&dzp->z_lock);
1333
1334                 /*
1335                  * Truncate regular files if requested.
1336                  */
1337                 if (S_ISREG(ZTOI(zp)->i_mode) &&
1338                     (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) {
1339                         /* we can't hold any locks when calling zfs_freesp() */
1340                         zfs_dirent_unlock(dl);
1341                         dl = NULL;
1342                         error = zfs_freesp(zp, 0, 0, mode, TRUE);
1343                 }
1344         }
1345 out:
1346
1347         if (dl)
1348                 zfs_dirent_unlock(dl);
1349
1350         if (error) {
1351                 if (zp)
1352                         iput(ZTOI(zp));
1353         } else {
1354                 zfs_inode_update(dzp);
1355                 zfs_inode_update(zp);
1356                 *ipp = ZTOI(zp);
1357         }
1358
1359         if (zsb->z_os->os_sync == ZFS_SYNC_ALWAYS)
1360                 zil_commit(zilog, 0);
1361
1362         ZFS_EXIT(zsb);
1363         return (error);
1364 }
1365 EXPORT_SYMBOL(zfs_create);
1366
1367 /*
1368  * Remove an entry from a directory.
1369  *
1370  *      IN:     dip     - inode of directory to remove entry from.
1371  *              name    - name of entry to remove.
1372  *              cr      - credentials of caller.
1373  *
1374  *      RETURN: 0 if success
1375  *              error code if failure
1376  *
1377  * Timestamps:
1378  *      dip - ctime|mtime
1379  *       ip - ctime (if nlink > 0)
1380  */
1381
1382 uint64_t null_xattr = 0;
1383
1384 /*ARGSUSED*/
1385 int
1386 zfs_remove(struct inode *dip, char *name, cred_t *cr)
1387 {
1388         znode_t         *zp, *dzp = ITOZ(dip);
1389         znode_t         *xzp;
1390         struct inode    *ip;
1391         zfs_sb_t        *zsb = ITOZSB(dip);
1392         zilog_t         *zilog;
1393         uint64_t        xattr_obj;
1394         uint64_t        xattr_obj_unlinked = 0;
1395         uint64_t        obj = 0;
1396         zfs_dirlock_t   *dl;
1397         dmu_tx_t        *tx;
1398         boolean_t       unlinked;
1399         uint64_t        txtype;
1400         pathname_t      *realnmp = NULL;
1401 #ifdef HAVE_PN_UTILS
1402         pathname_t      realnm;
1403 #endif /* HAVE_PN_UTILS */
1404         int             error;
1405         int             zflg = ZEXISTS;
1406
1407         ZFS_ENTER(zsb);
1408         ZFS_VERIFY_ZP(dzp);
1409         zilog = zsb->z_log;
1410
1411 #ifdef HAVE_PN_UTILS
1412         if (flags & FIGNORECASE) {
1413                 zflg |= ZCILOOK;
1414                 pn_alloc(&realnm);
1415                 realnmp = &realnm;
1416         }
1417 #endif /* HAVE_PN_UTILS */
1418
1419 top:
1420         xattr_obj = 0;
1421         xzp = NULL;
1422         /*
1423          * Attempt to lock directory; fail if entry doesn't exist.
1424          */
1425         if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1426             NULL, realnmp))) {
1427 #ifdef HAVE_PN_UTILS
1428                 if (realnmp)
1429                         pn_free(realnmp);
1430 #endif /* HAVE_PN_UTILS */
1431                 ZFS_EXIT(zsb);
1432                 return (error);
1433         }
1434
1435         ip = ZTOI(zp);
1436
1437         if ((error = zfs_zaccess_delete(dzp, zp, cr))) {
1438                 goto out;
1439         }
1440
1441         /*
1442          * Need to use rmdir for removing directories.
1443          */
1444         if (S_ISDIR(ip->i_mode)) {
1445                 error = EPERM;
1446                 goto out;
1447         }
1448
1449 #ifdef HAVE_DNLC
1450         if (realnmp)
1451                 dnlc_remove(dvp, realnmp->pn_buf);
1452         else
1453                 dnlc_remove(dvp, name);
1454 #endif /* HAVE_DNLC */
1455
1456         /*
1457          * We never delete the znode and always place it in the unlinked
1458          * set.  The dentry cache will always hold the last reference and
1459          * is responsible for safely freeing the znode.
1460          */
1461         obj = zp->z_id;
1462         tx = dmu_tx_create(zsb->z_os);
1463         dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1464         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1465         zfs_sa_upgrade_txholds(tx, zp);
1466         zfs_sa_upgrade_txholds(tx, dzp);
1467
1468         /* are there any extended attributes? */
1469         error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zsb),
1470             &xattr_obj, sizeof (xattr_obj));
1471         if (error == 0 && xattr_obj) {
1472                 error = zfs_zget(zsb, xattr_obj, &xzp);
1473                 ASSERT3U(error, ==, 0);
1474                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1475                 dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
1476         }
1477
1478         /* charge as an update -- would be nice not to charge at all */
1479         dmu_tx_hold_zap(tx, zsb->z_unlinkedobj, FALSE, NULL);
1480
1481         error = dmu_tx_assign(tx, TXG_NOWAIT);
1482         if (error) {
1483                 zfs_dirent_unlock(dl);
1484                 iput(ip);
1485                 if (xzp)
1486                         iput(ZTOI(xzp));
1487                 if (error == ERESTART) {
1488                         dmu_tx_wait(tx);
1489                         dmu_tx_abort(tx);
1490                         goto top;
1491                 }
1492 #ifdef HAVE_PN_UTILS
1493                 if (realnmp)
1494                         pn_free(realnmp);
1495 #endif /* HAVE_PN_UTILS */
1496                 dmu_tx_abort(tx);
1497                 ZFS_EXIT(zsb);
1498                 return (error);
1499         }
1500
1501         /*
1502          * Remove the directory entry.
1503          */
1504         error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
1505
1506         if (error) {
1507                 dmu_tx_commit(tx);
1508                 goto out;
1509         }
1510
1511         if (unlinked) {
1512                 /*
1513                  * Hold z_lock so that we can make sure that the ACL obj
1514                  * hasn't changed.  Could have been deleted due to
1515                  * zfs_sa_upgrade().
1516                  */
1517                 mutex_enter(&zp->z_lock);
1518                 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zsb),
1519                     &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
1520                 mutex_exit(&zp->z_lock);
1521                 zfs_unlinked_add(zp, tx);
1522         }
1523
1524         txtype = TX_REMOVE;
1525 #ifdef HAVE_PN_UTILS
1526         if (flags & FIGNORECASE)
1527                 txtype |= TX_CI;
1528 #endif /* HAVE_PN_UTILS */
1529         zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
1530
1531         dmu_tx_commit(tx);
1532 out:
1533 #ifdef HAVE_PN_UTILS
1534         if (realnmp)
1535                 pn_free(realnmp);
1536 #endif /* HAVE_PN_UTILS */
1537
1538         zfs_dirent_unlock(dl);
1539         zfs_inode_update(dzp);
1540         zfs_inode_update(zp);
1541         if (xzp)
1542                 zfs_inode_update(xzp);
1543
1544         iput(ip);
1545         if (xzp)
1546                 iput(ZTOI(xzp));
1547
1548         if (zsb->z_os->os_sync == ZFS_SYNC_ALWAYS)
1549                 zil_commit(zilog, 0);
1550
1551         ZFS_EXIT(zsb);
1552         return (error);
1553 }
1554 EXPORT_SYMBOL(zfs_remove);
1555
1556 /*
1557  * Create a new directory and insert it into dip using the name
1558  * provided.  Return a pointer to the inserted directory.
1559  *
1560  *      IN:     dip     - inode of directory to add subdir to.
1561  *              dirname - name of new directory.
1562  *              vap     - attributes of new directory.
1563  *              cr      - credentials of caller.
1564  *              vsecp   - ACL to be set
1565  *
1566  *      OUT:    ipp     - inode of created directory.
1567  *
1568  *      RETURN: 0 if success
1569  *              error code if failure
1570  *
1571  * Timestamps:
1572  *      dip - ctime|mtime updated
1573  *      ipp - ctime|mtime|atime updated
1574  */
1575 /*ARGSUSED*/
1576 int
1577 zfs_mkdir(struct inode *dip, char *dirname, vattr_t *vap, struct inode **ipp,
1578     cred_t *cr, int flags, vsecattr_t *vsecp)
1579 {
1580         znode_t         *zp, *dzp = ITOZ(dip);
1581         zfs_sb_t        *zsb = ITOZSB(dip);
1582         zilog_t         *zilog;
1583         zfs_dirlock_t   *dl;
1584         uint64_t        txtype;
1585         dmu_tx_t        *tx;
1586         int             error;
1587         int             zf = ZNEW;
1588         uid_t           uid;
1589         gid_t           gid = crgetgid(cr);
1590         zfs_acl_ids_t   acl_ids;
1591         boolean_t       fuid_dirtied;
1592
1593         ASSERT(S_ISDIR(vap->va_mode));
1594
1595         /*
1596          * If we have an ephemeral id, ACL, or XVATTR then
1597          * make sure file system is at proper version
1598          */
1599
1600         uid = crgetuid(cr);
1601         if (zsb->z_use_fuids == B_FALSE &&
1602             (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1603                 return (EINVAL);
1604
1605         ZFS_ENTER(zsb);
1606         ZFS_VERIFY_ZP(dzp);
1607         zilog = zsb->z_log;
1608
1609         if (dzp->z_pflags & ZFS_XATTR) {
1610                 ZFS_EXIT(zsb);
1611                 return (EINVAL);
1612         }
1613
1614         if (zsb->z_utf8 && u8_validate(dirname,
1615             strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1616                 ZFS_EXIT(zsb);
1617                 return (EILSEQ);
1618         }
1619         if (flags & FIGNORECASE)
1620                 zf |= ZCILOOK;
1621
1622 #ifdef HAVE_XVATTR
1623         if (vap->va_mask & AT_XVATTR) {
1624                 if ((error = secpolicy_xvattr((xvattr_t *)vap,
1625                     crgetuid(cr), cr, vap->va_mode)) != 0) {
1626                         ZFS_EXIT(zsb);
1627                         return (error);
1628                 }
1629         }
1630 #endif /* HAVE_XVATTR */
1631
1632         if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
1633             vsecp, &acl_ids)) != 0) {
1634                 ZFS_EXIT(zsb);
1635                 return (error);
1636         }
1637         /*
1638          * First make sure the new directory doesn't exist.
1639          *
1640          * Existence is checked first to make sure we don't return
1641          * EACCES instead of EEXIST which can cause some applications
1642          * to fail.
1643          */
1644 top:
1645         *ipp = NULL;
1646
1647         if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
1648             NULL, NULL))) {
1649                 zfs_acl_ids_free(&acl_ids);
1650                 ZFS_EXIT(zsb);
1651                 return (error);
1652         }
1653
1654         if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) {
1655                 zfs_acl_ids_free(&acl_ids);
1656                 zfs_dirent_unlock(dl);
1657                 ZFS_EXIT(zsb);
1658                 return (error);
1659         }
1660
1661         if (zfs_acl_ids_overquota(zsb, &acl_ids)) {
1662                 zfs_acl_ids_free(&acl_ids);
1663                 zfs_dirent_unlock(dl);
1664                 ZFS_EXIT(zsb);
1665                 return (EDQUOT);
1666         }
1667
1668         /*
1669          * Add a new entry to the directory.
1670          */
1671         tx = dmu_tx_create(zsb->z_os);
1672         dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
1673         dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
1674         fuid_dirtied = zsb->z_fuid_dirty;
1675         if (fuid_dirtied)
1676                 zfs_fuid_txhold(zsb, tx);
1677         if (!zsb->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1678                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
1679                     acl_ids.z_aclp->z_acl_bytes);
1680         }
1681
1682         dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1683             ZFS_SA_BASE_ATTR_SIZE);
1684
1685         error = dmu_tx_assign(tx, TXG_NOWAIT);
1686         if (error) {
1687                 zfs_dirent_unlock(dl);
1688                 if (error == ERESTART) {
1689                         dmu_tx_wait(tx);
1690                         dmu_tx_abort(tx);
1691                         goto top;
1692                 }
1693                 zfs_acl_ids_free(&acl_ids);
1694                 dmu_tx_abort(tx);
1695                 ZFS_EXIT(zsb);
1696                 return (error);
1697         }
1698
1699         /*
1700          * Create new node.
1701          */
1702         zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1703
1704         if (fuid_dirtied)
1705                 zfs_fuid_sync(zsb, tx);
1706
1707         /*
1708          * Now put new name in parent dir.
1709          */
1710         (void) zfs_link_create(dl, zp, tx, ZNEW);
1711
1712         *ipp = ZTOI(zp);
1713
1714         txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
1715         if (flags & FIGNORECASE)
1716                 txtype |= TX_CI;
1717         zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
1718             acl_ids.z_fuidp, vap);
1719
1720         zfs_acl_ids_free(&acl_ids);
1721
1722         dmu_tx_commit(tx);
1723
1724         zfs_dirent_unlock(dl);
1725
1726         if (zsb->z_os->os_sync == ZFS_SYNC_ALWAYS)
1727                 zil_commit(zilog, 0);
1728
1729         zfs_inode_update(dzp);
1730         zfs_inode_update(zp);
1731         ZFS_EXIT(zsb);
1732         return (0);
1733 }
1734 EXPORT_SYMBOL(zfs_mkdir);
1735
1736 /*
1737  * Remove a directory subdir entry.  If the current working
1738  * directory is the same as the subdir to be removed, the
1739  * remove will fail.
1740  *
1741  *      IN:     dip     - inode of directory to remove from.
1742  *              name    - name of directory to be removed.
1743  *              cwd     - inode of current working directory.
1744  *              cr      - credentials of caller.
1745  *              flags   - case flags
1746  *
1747  *      RETURN: 0 if success
1748  *              error code if failure
1749  *
1750  * Timestamps:
1751  *      dip - ctime|mtime updated
1752  */
1753 /*ARGSUSED*/
1754 int
1755 zfs_rmdir(struct inode *dip, char *name, struct inode *cwd, cred_t *cr,
1756     int flags)
1757 {
1758         znode_t         *dzp = ITOZ(dip);
1759         znode_t         *zp;
1760         struct inode    *ip;
1761         zfs_sb_t        *zsb = ITOZSB(dip);
1762         zilog_t         *zilog;
1763         zfs_dirlock_t   *dl;
1764         dmu_tx_t        *tx;
1765         int             error;
1766         int             zflg = ZEXISTS;
1767
1768         ZFS_ENTER(zsb);
1769         ZFS_VERIFY_ZP(dzp);
1770         zilog = zsb->z_log;
1771
1772         if (flags & FIGNORECASE)
1773                 zflg |= ZCILOOK;
1774 top:
1775         zp = NULL;
1776
1777         /*
1778          * Attempt to lock directory; fail if entry doesn't exist.
1779          */
1780         if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1781             NULL, NULL))) {
1782                 ZFS_EXIT(zsb);
1783                 return (error);
1784         }
1785
1786         ip = ZTOI(zp);
1787
1788         if ((error = zfs_zaccess_delete(dzp, zp, cr))) {
1789                 goto out;
1790         }
1791
1792         if (!S_ISDIR(ip->i_mode)) {
1793                 error = ENOTDIR;
1794                 goto out;
1795         }
1796
1797         if (ip == cwd) {
1798                 error = EINVAL;
1799                 goto out;
1800         }
1801
1802         /*
1803          * Grab a lock on the directory to make sure that noone is
1804          * trying to add (or lookup) entries while we are removing it.
1805          */
1806         rw_enter(&zp->z_name_lock, RW_WRITER);
1807
1808         /*
1809          * Grab a lock on the parent pointer to make sure we play well
1810          * with the treewalk and directory rename code.
1811          */
1812         rw_enter(&zp->z_parent_lock, RW_WRITER);
1813
1814         tx = dmu_tx_create(zsb->z_os);
1815         dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1816         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1817         dmu_tx_hold_zap(tx, zsb->z_unlinkedobj, FALSE, NULL);
1818         zfs_sa_upgrade_txholds(tx, zp);
1819         zfs_sa_upgrade_txholds(tx, dzp);
1820         error = dmu_tx_assign(tx, TXG_NOWAIT);
1821         if (error) {
1822                 rw_exit(&zp->z_parent_lock);
1823                 rw_exit(&zp->z_name_lock);
1824                 zfs_dirent_unlock(dl);
1825                 iput(ip);
1826                 if (error == ERESTART) {
1827                         dmu_tx_wait(tx);
1828                         dmu_tx_abort(tx);
1829                         goto top;
1830                 }
1831                 dmu_tx_abort(tx);
1832                 ZFS_EXIT(zsb);
1833                 return (error);
1834         }
1835
1836         error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
1837
1838         if (error == 0) {
1839                 uint64_t txtype = TX_RMDIR;
1840                 if (flags & FIGNORECASE)
1841                         txtype |= TX_CI;
1842                 zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
1843         }
1844
1845         dmu_tx_commit(tx);
1846
1847         rw_exit(&zp->z_parent_lock);
1848         rw_exit(&zp->z_name_lock);
1849 out:
1850         zfs_dirent_unlock(dl);
1851
1852         iput(ip);
1853
1854         if (zsb->z_os->os_sync == ZFS_SYNC_ALWAYS)
1855                 zil_commit(zilog, 0);
1856
1857         zfs_inode_update(dzp);
1858         zfs_inode_update(zp);
1859         ZFS_EXIT(zsb);
1860         return (error);
1861 }
1862 EXPORT_SYMBOL(zfs_rmdir);
1863
1864 /*
1865  * Read as many directory entries as will fit into the provided
1866  * dirent buffer from the given directory cursor position.
1867  *
1868  *      IN:     ip      - inode of directory to read.
1869  *              dirent  - buffer for directory entries.
1870  *
1871  *      OUT:    dirent  - filler buffer of directory entries.
1872  *
1873  *      RETURN: 0 if success
1874  *              error code if failure
1875  *
1876  * Timestamps:
1877  *      ip - atime updated
1878  *
1879  * Note that the low 4 bits of the cookie returned by zap is always zero.
1880  * This allows us to use the low range for "special" directory entries:
1881  * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
1882  * we use the offset 2 for the '.zfs' directory.
1883  */
1884 /* ARGSUSED */
1885 int
1886 zfs_readdir(struct inode *ip, void *dirent, filldir_t filldir,
1887     loff_t *pos, cred_t *cr)
1888 {
1889         znode_t         *zp = ITOZ(ip);
1890         zfs_sb_t        *zsb = ITOZSB(ip);
1891         objset_t        *os;
1892         zap_cursor_t    zc;
1893         zap_attribute_t zap;
1894         int             outcount;
1895         int             error;
1896         uint8_t         prefetch;
1897         int             done = 0;
1898         uint64_t        parent;
1899
1900         ZFS_ENTER(zsb);
1901         ZFS_VERIFY_ZP(zp);
1902
1903         if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zsb),
1904             &parent, sizeof (parent))) != 0)
1905                 goto out;
1906
1907         /*
1908          * Quit if directory has been removed (posix)
1909          */
1910         error = 0;
1911         if (zp->z_unlinked)
1912                 goto out;
1913
1914         os = zsb->z_os;
1915         prefetch = zp->z_zn_prefetch;
1916
1917         /*
1918          * Initialize the iterator cursor.
1919          */
1920         if (*pos <= 3) {
1921                 /*
1922                  * Start iteration from the beginning of the directory.
1923                  */
1924                 zap_cursor_init(&zc, os, zp->z_id);
1925         } else {
1926                 /*
1927                  * The offset is a serialized cursor.
1928                  */
1929                 zap_cursor_init_serialized(&zc, os, zp->z_id, *pos);
1930         }
1931
1932         /*
1933          * Transform to file-system independent format
1934          */
1935         outcount = 0;
1936
1937         while (!done) {
1938                 uint64_t objnum;
1939                 /*
1940                  * Special case `.', `..', and `.zfs'.
1941                  */
1942                 if (*pos == 0) {
1943                         (void) strcpy(zap.za_name, ".");
1944                         zap.za_normalization_conflict = 0;
1945                         objnum = zp->z_id;
1946                 } else if (*pos == 1) {
1947                         (void) strcpy(zap.za_name, "..");
1948                         zap.za_normalization_conflict = 0;
1949                         objnum = parent;
1950                 } else if (*pos == 2 && zfs_show_ctldir(zp)) {
1951                         (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
1952                         zap.za_normalization_conflict = 0;
1953                         objnum = ZFSCTL_INO_ROOT;
1954                 } else {
1955                         /*
1956                          * Grab next entry.
1957                          */
1958                         if ((error = zap_cursor_retrieve(&zc, &zap))) {
1959                                 if (error == ENOENT)
1960                                         break;
1961                                 else
1962                                         goto update;
1963                         }
1964
1965                         if (zap.za_integer_length != 8 ||
1966                             zap.za_num_integers != 1) {
1967                                 cmn_err(CE_WARN, "zap_readdir: bad directory "
1968                                     "entry, obj = %lld, offset = %lld\n",
1969                                     (u_longlong_t)zp->z_id,
1970                                     (u_longlong_t)*pos);
1971                                 error = ENXIO;
1972                                 goto update;
1973                         }
1974
1975                         objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
1976                 }
1977                 done = filldir(dirent, zap.za_name, strlen(zap.za_name),
1978                                zap_cursor_serialize(&zc), objnum, 0);
1979                 if (done) {
1980                         break;
1981                 }
1982
1983                 /* Prefetch znode */
1984                 if (prefetch) {
1985                         dmu_prefetch(os, objnum, 0, 0);
1986                 }
1987
1988                 if (*pos >= 2) {
1989                         zap_cursor_advance(&zc);
1990                         *pos = zap_cursor_serialize(&zc);
1991                 } else {
1992                         (*pos)++;
1993                 }
1994         }
1995         zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
1996
1997 update:
1998         zap_cursor_fini(&zc);
1999         if (error == ENOENT)
2000                 error = 0;
2001
2002         ZFS_ACCESSTIME_STAMP(zsb, zp);
2003         zfs_inode_update(zp);
2004
2005 out:
2006         ZFS_EXIT(zsb);
2007
2008         return (error);
2009 }
2010 EXPORT_SYMBOL(zfs_readdir);
2011
2012 ulong_t zfs_fsync_sync_cnt = 4;
2013
2014 int
2015 zfs_fsync(struct inode *ip, int syncflag, cred_t *cr)
2016 {
2017         znode_t *zp = ITOZ(ip);
2018         zfs_sb_t *zsb = ITOZSB(ip);
2019
2020         (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
2021
2022         if (zsb->z_os->os_sync != ZFS_SYNC_DISABLED) {
2023                 ZFS_ENTER(zsb);
2024                 ZFS_VERIFY_ZP(zp);
2025                 zil_commit(zsb->z_log, zp->z_id);
2026                 ZFS_EXIT(zsb);
2027         }
2028         return (0);
2029 }
2030 EXPORT_SYMBOL(zfs_fsync);
2031
2032
2033 /*
2034  * Get the requested file attributes and place them in the provided
2035  * vattr structure.
2036  *
2037  *      IN:     ip      - inode of file.
2038  *              stat    - kstat structure to fill in.
2039  *              flags   - ATTR_NOACLCHECK (CIFS server context)
2040  *              cr      - credentials of caller.
2041  *
2042  *      OUT:    stat    - filled in kstat values.
2043  */
2044 /* ARGSUSED */
2045 int
2046 zfs_getattr(struct inode *ip, struct kstat *stat, int flags, cred_t *cr)
2047 {
2048         znode_t *zp = ITOZ(ip);
2049         zfs_sb_t *zsb = ITOZSB(ip);
2050         int     error = 0;
2051         uint64_t links;
2052         uint64_t mtime[2], ctime[2];
2053         uint32_t blksz;
2054         boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2055         sa_bulk_attr_t bulk[2];
2056         int count = 0;
2057
2058         ZFS_ENTER(zsb);
2059         ZFS_VERIFY_ZP(zp);
2060
2061         zfs_fuid_map_ids(zp, cr, &stat->uid, &stat->gid);
2062
2063         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zsb), NULL, &mtime, 16);
2064         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zsb), NULL, &ctime, 16);
2065
2066         if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
2067                 ZFS_EXIT(zsb);
2068                 return (error);
2069         }
2070
2071         /*
2072          * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
2073          * Also, if we are the owner don't bother, since owner should
2074          * always be allowed to read basic attributes of file.
2075          */
2076         if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
2077             (stat->uid != crgetuid(cr))) {
2078                 if ((error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
2079                     skipaclchk, cr))) {
2080                         ZFS_EXIT(zsb);
2081                         return (error);
2082                 }
2083         }
2084
2085         /*
2086          * Return all attributes.  It's cheaper to provide the answer
2087          * than to determine whether we were asked the question.
2088          */
2089
2090         mutex_enter(&zp->z_lock);
2091         stat->ino = ip->i_ino;
2092         stat->mode = zp->z_mode;
2093         stat->uid = zp->z_uid;
2094         stat->gid = zp->z_gid;
2095         if ((zp->z_id == zsb->z_root) && zfs_show_ctldir(zp))
2096                 links = zp->z_links + 1;
2097         else
2098                 links = zp->z_links;
2099         stat->nlink = MIN(links, ZFS_LINK_MAX);
2100         stat->size = i_size_read(ip);
2101         stat->rdev = ip->i_rdev;
2102         stat->dev = ip->i_rdev;
2103
2104         ZFS_TIME_DECODE(&stat->atime, zp->z_atime);
2105         ZFS_TIME_DECODE(&stat->mtime, mtime);
2106         ZFS_TIME_DECODE(&stat->ctime, ctime);
2107
2108         mutex_exit(&zp->z_lock);
2109
2110         sa_object_size(zp->z_sa_hdl, &blksz, &stat->blocks);
2111         stat->blksize = (1 << ip->i_blkbits);
2112
2113         if (zp->z_blksz == 0) {
2114                 /*
2115                  * Block size hasn't been set; suggest maximal I/O transfers.
2116                  */
2117                 stat->blksize = zsb->z_max_blksz;
2118         }
2119
2120         ZFS_EXIT(zsb);
2121         return (0);
2122 }
2123 EXPORT_SYMBOL(zfs_getattr);
2124
/*
 * Set the file attributes to the values contained in the
 * iattr structure.
 *
 *	IN:	ip	- inode of file to be modified.
 *		attr	- new attribute values.
 *			  If AT_XVATTR set, then optional attrs are being set
 *		flags	- ATTR_UTIME set if non-default time values provided.
 *			- ATTR_NOACLCHECK (CIFS context only).
 *		cr	- credentials of caller.
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	ip - ctime updated, mtime updated if size changed.
 */
2142 /* ARGSUSED */
2143 int
2144 zfs_setattr(struct inode *ip, struct iattr *attr, int flags, cred_t *cr)
2145 {
2146         znode_t         *zp = ITOZ(ip);
2147         zfs_sb_t        *zsb = ITOZSB(ip);
2148         zilog_t         *zilog;
2149         dmu_tx_t        *tx;
2150         vattr_t         oldva;
2151         uint_t          mask = attr->ia_valid;
2152         uint_t          saved_mask;
2153         int             trim_mask = 0;
2154         uint64_t        new_mode;
2155         uint64_t        new_uid, new_gid;
2156         uint64_t        xattr_obj;
2157         uint64_t        mtime[2], ctime[2];
2158         znode_t         *attrzp;
2159         int             need_policy = FALSE;
2160         int             err, err2;
2161         zfs_fuid_info_t *fuidp = NULL;
2162         boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2163         zfs_acl_t       *aclp = NULL;
2164         boolean_t       fuid_dirtied = B_FALSE;
2165         sa_bulk_attr_t  bulk[7], xattr_bulk[7];
2166         int             count = 0, xattr_count = 0;
2167
2168         if (mask == 0)
2169                 return (0);
2170
2171         ZFS_ENTER(zsb);
2172         ZFS_VERIFY_ZP(zp);
2173
2174         zilog = zsb->z_log;
2175
2176         /*
2177          * Make sure that if we have ephemeral uid/gid or xvattr specified
2178          * that file system is at proper version level
2179          */
2180         if (zsb->z_use_fuids == B_FALSE &&
2181             (((mask & ATTR_UID) && IS_EPHEMERAL(attr->ia_uid)) ||
2182             ((mask & ATTR_GID) && IS_EPHEMERAL(attr->ia_gid)))) {
2183                 ZFS_EXIT(zsb);
2184                 return (EINVAL);
2185         }
2186
2187         if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) {
2188                 ZFS_EXIT(zsb);
2189                 return (EISDIR);
2190         }
2191
2192         if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) {
2193                 ZFS_EXIT(zsb);
2194                 return (EINVAL);
2195         }
2196
2197         if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
2198                 ZFS_EXIT(zsb);
2199                 return (EPERM);
2200         }
2201
2202 top:
2203         attrzp = NULL;
2204         aclp = NULL;
2205
2206         /* Can this be moved to before the top label? */
2207         if (zsb->z_vfs->mnt_flags & MNT_READONLY) {
2208                 ZFS_EXIT(zsb);
2209                 return (EROFS);
2210         }
2211
2212         /*
2213          * First validate permissions
2214          */
2215
2216         if (mask & ATTR_SIZE) {
2217                 err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr);
2218                 if (err) {
2219                         ZFS_EXIT(zsb);
2220                         return (err);
2221                 }
2222                 /*
2223                  * XXX - Note, we are not providing any open
2224                  * mode flags here (like FNDELAY), so we may
2225                  * block if there are locks present... this
2226                  * should be addressed in openat().
2227                  */
2228                 /* XXX - would it be OK to generate a log record here? */
2229                 err = zfs_freesp(zp, attr->ia_size, 0, 0, FALSE);
2230                 if (err) {
2231                         ZFS_EXIT(zsb);
2232                         return (err);
2233                 }
2234
2235                 /* Careful negative Linux return code here */
2236                 err = -vmtruncate(ip, attr->ia_size);
2237                 if (err) {
2238                         ZFS_EXIT(zsb);
2239                         return (err);
2240                 }
2241         }
2242
2243         if (mask & (ATTR_UID|ATTR_GID)) {
2244                 int     idmask = (mask & (ATTR_UID|ATTR_GID));
2245                 int     take_owner;
2246                 int     take_group;
2247
2248                 /*
2249                  * NOTE: even if a new mode is being set,
2250                  * we may clear S_ISUID/S_ISGID bits.
2251                  */
2252
2253                 if (!(mask & ATTR_MODE))
2254                         attr->ia_mode = zp->z_mode;
2255
2256                 /*
2257                  * Take ownership or chgrp to group we are a member of
2258                  */
2259
2260                 take_owner = (mask & ATTR_UID) &&
2261                     (attr->ia_uid == crgetuid(cr));
2262                 take_group = (mask & ATTR_GID) &&
2263                     zfs_groupmember(zsb, attr->ia_gid, cr);
2264
2265                 /*
2266                  * If both AT_UID and AT_GID are set then take_owner and
2267                  * take_group must both be set in order to allow taking
2268                  * ownership.
2269                  *
2270                  * Otherwise, send the check through secpolicy_vnode_setattr()
2271                  *
2272                  */
2273
2274                 if (((idmask == (ATTR_UID|ATTR_GID)) &&
2275                     take_owner && take_group) ||
2276                     ((idmask == ATTR_UID) && take_owner) ||
2277                     ((idmask == ATTR_GID) && take_group)) {
2278                         if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
2279                             skipaclchk, cr) == 0) {
2280                                 /*
2281                                  * Remove setuid/setgid for non-privileged users
2282                                  */
2283                                 secpolicy_setid_clear(attr, cr);
2284                                 trim_mask = (mask & (ATTR_UID|ATTR_GID));
2285                         } else {
2286                                 need_policy =  TRUE;
2287                         }
2288                 } else {
2289                         need_policy =  TRUE;
2290                 }
2291         }
2292
2293         mutex_enter(&zp->z_lock);
2294         oldva.va_mode = zp->z_mode;
2295         zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
2296
2297         mutex_exit(&zp->z_lock);
2298
2299         if (mask & ATTR_MODE) {
2300                 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
2301                         err = secpolicy_setid_setsticky_clear(ip, attr,
2302                             &oldva, cr);
2303                         if (err) {
2304                                 ZFS_EXIT(zsb);
2305                                 return (err);
2306                         }
2307                         trim_mask |= ATTR_MODE;
2308                 } else {
2309                         need_policy = TRUE;
2310                 }
2311         }
2312
2313         if (need_policy) {
2314                 /*
2315                  * If trim_mask is set then take ownership
2316                  * has been granted or write_acl is present and user
2317                  * has the ability to modify mode.  In that case remove
2318                  * UID|GID and or MODE from mask so that
2319                  * secpolicy_vnode_setattr() doesn't revoke it.
2320                  */
2321
2322                 if (trim_mask) {
2323                         saved_mask = attr->ia_valid;
2324                         attr->ia_valid &= ~trim_mask;
2325                 }
2326                 err = secpolicy_vnode_setattr(cr, ip, attr, &oldva, flags,
2327                     (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
2328                 if (err) {
2329                         ZFS_EXIT(zsb);
2330                         return (err);
2331                 }
2332
2333                 if (trim_mask)
2334                         attr->ia_valid |= saved_mask;
2335         }
2336
2337         /*
2338          * secpolicy_vnode_setattr, or take ownership may have
2339          * changed va_mask
2340          */
2341         mask = attr->ia_valid;
2342
2343         if ((mask & (ATTR_UID | ATTR_GID))) {
2344                 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zsb),
2345                     &xattr_obj, sizeof (xattr_obj));
2346
2347                 if (err == 0 && xattr_obj) {
2348                         err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp);
2349                         if (err)
2350                                 goto out2;
2351                 }
2352                 if (mask & ATTR_UID) {
2353                         new_uid = zfs_fuid_create(zsb,
2354                             (uint64_t)attr->ia_uid, cr, ZFS_OWNER, &fuidp);
2355                         if (new_uid != zp->z_uid &&
2356                             zfs_fuid_overquota(zsb, B_FALSE, new_uid)) {
2357                                 if (attrzp)
2358                                         iput(ZTOI(attrzp));
2359                                 err = EDQUOT;
2360                                 goto out2;
2361                         }
2362                 }
2363
2364                 if (mask & ATTR_GID) {
2365                         new_gid = zfs_fuid_create(zsb, (uint64_t)attr->ia_gid,
2366                             cr, ZFS_GROUP, &fuidp);
2367                         if (new_gid != zp->z_gid &&
2368                             zfs_fuid_overquota(zsb, B_TRUE, new_gid)) {
2369                                 if (attrzp)
2370                                         iput(ZTOI(attrzp));
2371                                 err = EDQUOT;
2372                                 goto out2;
2373                         }
2374                 }
2375         }
2376         tx = dmu_tx_create(zsb->z_os);
2377
2378         if (mask & ATTR_MODE) {
2379                 uint64_t pmode = zp->z_mode;
2380                 uint64_t acl_obj;
2381                 new_mode = (pmode & S_IFMT) | (attr->ia_mode & ~S_IFMT);
2382
2383                 zfs_acl_chmod_setattr(zp, &aclp, new_mode);
2384
2385                 mutex_enter(&zp->z_lock);
2386                 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
2387                         /*
2388                          * Are we upgrading ACL from old V0 format
2389                          * to V1 format?
2390                          */
2391                         if (zsb->z_version >= ZPL_VERSION_FUID &&
2392                             zfs_znode_acl_version(zp) ==
2393                             ZFS_ACL_VERSION_INITIAL) {
2394                                 dmu_tx_hold_free(tx, acl_obj, 0,
2395                                     DMU_OBJECT_END);
2396                                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2397                                     0, aclp->z_acl_bytes);
2398                         } else {
2399                                 dmu_tx_hold_write(tx, acl_obj, 0,
2400                                     aclp->z_acl_bytes);
2401                         }
2402                 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2403                         dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2404                             0, aclp->z_acl_bytes);
2405                 }
2406                 mutex_exit(&zp->z_lock);
2407                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
2408         } else {
2409                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2410         }
2411
2412         if (attrzp) {
2413                 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
2414         }
2415
2416         fuid_dirtied = zsb->z_fuid_dirty;
2417         if (fuid_dirtied)
2418                 zfs_fuid_txhold(zsb, tx);
2419
2420         zfs_sa_upgrade_txholds(tx, zp);
2421
2422         err = dmu_tx_assign(tx, TXG_NOWAIT);
2423         if (err) {
2424                 if (err == ERESTART)
2425                         dmu_tx_wait(tx);
2426                 goto out;
2427         }
2428
2429         count = 0;
2430         /*
2431          * Set each attribute requested.
2432          * We group settings according to the locks they need to acquire.
2433          *
2434          * Note: you cannot set ctime directly, although it will be
2435          * updated as a side-effect of calling this function.
2436          */
2437
2438
2439         if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2440                 mutex_enter(&zp->z_acl_lock);
2441         mutex_enter(&zp->z_lock);
2442
2443         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zsb), NULL,
2444             &zp->z_pflags, sizeof (zp->z_pflags));
2445
2446         if (attrzp) {
2447                 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2448                         mutex_enter(&attrzp->z_acl_lock);
2449                 mutex_enter(&attrzp->z_lock);
2450                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2451                     SA_ZPL_FLAGS(zsb), NULL, &attrzp->z_pflags,
2452                     sizeof (attrzp->z_pflags));
2453         }
2454
2455         if (mask & (ATTR_UID|ATTR_GID)) {
2456
2457                 if (mask & ATTR_UID) {
2458                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zsb), NULL,
2459                             &new_uid, sizeof (new_uid));
2460                         zp->z_uid = new_uid;
2461                         if (attrzp) {
2462                                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2463                                     SA_ZPL_UID(zsb), NULL, &new_uid,
2464                                     sizeof (new_uid));
2465                                 attrzp->z_uid = new_uid;
2466                         }
2467                 }
2468
2469                 if (mask & ATTR_GID) {
2470                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zsb),
2471                             NULL, &new_gid, sizeof (new_gid));
2472                         zp->z_gid = new_gid;
2473                         if (attrzp) {
2474                                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2475                                     SA_ZPL_GID(zsb), NULL, &new_gid,
2476                                     sizeof (new_gid));
2477                                 attrzp->z_gid = new_gid;
2478                         }
2479                 }
2480                 if (!(mask & ATTR_MODE)) {
2481                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zsb),
2482                             NULL, &new_mode, sizeof (new_mode));
2483                         new_mode = zp->z_mode;
2484                 }
2485                 err = zfs_acl_chown_setattr(zp);
2486                 ASSERT(err == 0);
2487                 if (attrzp) {
2488                         err = zfs_acl_chown_setattr(attrzp);
2489                         ASSERT(err == 0);
2490                 }
2491         }
2492
2493         if (mask & ATTR_MODE) {
2494                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zsb), NULL,
2495                     &new_mode, sizeof (new_mode));
2496                 zp->z_mode = new_mode;
2497                 ASSERT3U((uintptr_t)aclp, !=, NULL);
2498                 err = zfs_aclset_common(zp, aclp, cr, tx);
2499                 ASSERT3U(err, ==, 0);
2500                 if (zp->z_acl_cached)
2501                         zfs_acl_free(zp->z_acl_cached);
2502                 zp->z_acl_cached = aclp;
2503                 aclp = NULL;
2504         }
2505
2506
2507         if (mask & ATTR_ATIME) {
2508                 ZFS_TIME_ENCODE(&attr->ia_atime, zp->z_atime);
2509                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zsb), NULL,
2510                     &zp->z_atime, sizeof (zp->z_atime));
2511         }
2512
2513         if (mask & ATTR_MTIME) {
2514                 ZFS_TIME_ENCODE(&attr->ia_mtime, mtime);
2515                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zsb), NULL,
2516                     mtime, sizeof (mtime));
2517         }
2518
2519         /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
2520         if (mask & ATTR_SIZE && !(mask & ATTR_MTIME)) {
2521                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zsb),
2522                     NULL, mtime, sizeof (mtime));
2523                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zsb), NULL,
2524                     &ctime, sizeof (ctime));
2525                 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
2526                     B_TRUE);
2527         } else if (mask != 0) {
2528                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zsb), NULL,
2529                     &ctime, sizeof (ctime));
2530                 zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
2531                     B_TRUE);
2532                 if (attrzp) {
2533                         SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2534                             SA_ZPL_CTIME(zsb), NULL,
2535                             &ctime, sizeof (ctime));
2536                         zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
2537                             mtime, ctime, B_TRUE);
2538                 }
2539         }
2540         /*
2541          * Do this after setting timestamps to prevent timestamp
2542          * update from toggling bit
2543          */
2544
2545         if (fuid_dirtied)
2546                 zfs_fuid_sync(zsb, tx);
2547
2548         if (mask != 0)
2549                 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, attr, mask, fuidp);
2550
2551         mutex_exit(&zp->z_lock);
2552         if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2553                 mutex_exit(&zp->z_acl_lock);
2554
2555         if (attrzp) {
2556                 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2557                         mutex_exit(&attrzp->z_acl_lock);
2558                 mutex_exit(&attrzp->z_lock);
2559         }
2560 out:
2561         if (err == 0 && attrzp) {
2562                 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
2563                     xattr_count, tx);
2564                 ASSERT(err2 == 0);
2565         }
2566
2567         if (attrzp)
2568                 iput(ZTOI(attrzp));
2569         if (aclp)
2570                 zfs_acl_free(aclp);
2571
2572         if (fuidp) {
2573                 zfs_fuid_info_free(fuidp);
2574                 fuidp = NULL;
2575         }
2576
2577         if (err) {
2578                 dmu_tx_abort(tx);
2579                 if (err == ERESTART)
2580                         goto top;
2581         } else {
2582                 err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
2583                 dmu_tx_commit(tx);
2584                  zfs_inode_update(zp);
2585         }
2586
2587 out2:
2588         if (zsb->z_os->os_sync == ZFS_SYNC_ALWAYS)
2589                 zil_commit(zilog, 0);
2590
2591         ZFS_EXIT(zsb);
2592         return (err);
2593 }
2594 EXPORT_SYMBOL(zfs_setattr);
2595
2596 typedef struct zfs_zlock {
2597         krwlock_t       *zl_rwlock;     /* lock we acquired */
2598         znode_t         *zl_znode;      /* znode we held */
2599         struct zfs_zlock *zl_next;      /* next in list */
2600 } zfs_zlock_t;
2601
2602 /*
2603  * Drop locks and release vnodes that were held by zfs_rename_lock().
2604  */
2605 static void
2606 zfs_rename_unlock(zfs_zlock_t **zlpp)
2607 {
2608         zfs_zlock_t *zl;
2609
2610         while ((zl = *zlpp) != NULL) {
2611                 if (zl->zl_znode != NULL)
2612                         iput(ZTOI(zl->zl_znode));
2613                 rw_exit(zl->zl_rwlock);
2614                 *zlpp = zl->zl_next;
2615                 kmem_free(zl, sizeof (*zl));
2616         }
2617 }
2618
2619 /*
2620  * Search back through the directory tree, using the ".." entries.
2621  * Lock each directory in the chain to prevent concurrent renames.
2622  * Fail any attempt to move a directory into one of its own descendants.
2623  * XXX - z_parent_lock can overlap with map or grow locks
2624  */
2625 static int
2626 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
2627 {
2628         zfs_zlock_t     *zl;
2629         znode_t         *zp = tdzp;
2630         uint64_t        rootid = ZTOZSB(zp)->z_root;
2631         uint64_t        oidp = zp->z_id;
2632         krwlock_t       *rwlp = &szp->z_parent_lock;
2633         krw_t           rw = RW_WRITER;
2634
2635         /*
2636          * First pass write-locks szp and compares to zp->z_id.
2637          * Later passes read-lock zp and compare to zp->z_parent.
2638          */
2639         do {
2640                 if (!rw_tryenter(rwlp, rw)) {
2641                         /*
2642                          * Another thread is renaming in this path.
2643                          * Note that if we are a WRITER, we don't have any
2644                          * parent_locks held yet.
2645                          */
2646                         if (rw == RW_READER && zp->z_id > szp->z_id) {
2647                                 /*
2648                                  * Drop our locks and restart
2649                                  */
2650                                 zfs_rename_unlock(&zl);
2651                                 *zlpp = NULL;
2652                                 zp = tdzp;
2653                                 oidp = zp->z_id;
2654                                 rwlp = &szp->z_parent_lock;
2655                                 rw = RW_WRITER;
2656                                 continue;
2657                         } else {
2658                                 /*
2659                                  * Wait for other thread to drop its locks
2660                                  */
2661                                 rw_enter(rwlp, rw);
2662                         }
2663                 }
2664
2665                 zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
2666                 zl->zl_rwlock = rwlp;
2667                 zl->zl_znode = NULL;
2668                 zl->zl_next = *zlpp;
2669                 *zlpp = zl;
2670
2671                 if (oidp == szp->z_id)          /* We're a descendant of szp */
2672                         return (EINVAL);
2673
2674                 if (oidp == rootid)             /* We've hit the top */
2675                         return (0);
2676
2677                 if (rw == RW_READER) {          /* i.e. not the first pass */
2678                         int error = zfs_zget(ZTOZSB(zp), oidp, &zp);
2679                         if (error)
2680                                 return (error);
2681                         zl->zl_znode = zp;
2682                 }
2683                 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)),
2684                     &oidp, sizeof (oidp));
2685                 rwlp = &zp->z_parent_lock;
2686                 rw = RW_READER;
2687
2688         } while (zp->z_id != sdzp->z_id);
2689
2690         return (0);
2691 }
2692
2693 /*
2694  * Move an entry from the provided source directory to the target
2695  * directory.  Change the entry name as indicated.
2696  *
2697  *      IN:     sdip    - Source directory containing the "old entry".
2698  *              snm     - Old entry name.
2699  *              tdip    - Target directory to contain the "new entry".
2700  *              tnm     - New entry name.
2701  *              cr      - credentials of caller.
2702  *              flags   - case flags
2703  *
2704  *      RETURN: 0 if success
2705  *              error code if failure
2706  *
2707  * Timestamps:
2708  *      sdip,tdip - ctime|mtime updated
2709  */
2710 /*ARGSUSED*/
2711 int
2712 zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm,
2713     cred_t *cr, int flags)
2714 {
2715         znode_t         *tdzp, *szp, *tzp;
2716         znode_t         *sdzp = ITOZ(sdip);
2717         zfs_sb_t        *zsb = ITOZSB(sdip);
2718         zilog_t         *zilog;
2719         zfs_dirlock_t   *sdl, *tdl;
2720         dmu_tx_t        *tx;
2721         zfs_zlock_t     *zl;
2722         int             cmp, serr, terr;
2723         int             error = 0;
2724         int             zflg = 0;
2725
2726         ZFS_ENTER(zsb);
2727         ZFS_VERIFY_ZP(sdzp);
2728         zilog = zsb->z_log;
2729
2730         if (tdip->i_sb != sdip->i_sb) {
2731                 ZFS_EXIT(zsb);
2732                 return (EXDEV);
2733         }
2734
2735         tdzp = ITOZ(tdip);
2736         ZFS_VERIFY_ZP(tdzp);
2737         if (zsb->z_utf8 && u8_validate(tnm,
2738             strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
2739                 ZFS_EXIT(zsb);
2740                 return (EILSEQ);
2741         }
2742
2743         if (flags & FIGNORECASE)
2744                 zflg |= ZCILOOK;
2745
2746 top:
2747         szp = NULL;
2748         tzp = NULL;
2749         zl = NULL;
2750
2751         /*
2752          * This is to prevent the creation of links into attribute space
2753          * by renaming a linked file into/outof an attribute directory.
2754          * See the comment in zfs_link() for why this is considered bad.
2755          */
2756         if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
2757                 ZFS_EXIT(zsb);
2758                 return (EINVAL);
2759         }
2760
2761         /*
2762          * Lock source and target directory entries.  To prevent deadlock,
2763          * a lock ordering must be defined.  We lock the directory with
2764          * the smallest object id first, or if it's a tie, the one with
2765          * the lexically first name.
2766          */
2767         if (sdzp->z_id < tdzp->z_id) {
2768                 cmp = -1;
2769         } else if (sdzp->z_id > tdzp->z_id) {
2770                 cmp = 1;
2771         } else {
2772                 /*
2773                  * First compare the two name arguments without
2774                  * considering any case folding.
2775                  */
2776                 int nofold = (zsb->z_norm & ~U8_TEXTPREP_TOUPPER);
2777
2778                 cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
2779                 ASSERT(error == 0 || !zsb->z_utf8);
2780                 if (cmp == 0) {
2781                         /*
2782                          * POSIX: "If the old argument and the new argument
2783                          * both refer to links to the same existing file,
2784                          * the rename() function shall return successfully
2785                          * and perform no other action."
2786                          */
2787                         ZFS_EXIT(zsb);
2788                         return (0);
2789                 }
2790                 /*
2791                  * If the file system is case-folding, then we may
2792                  * have some more checking to do.  A case-folding file
2793                  * system is either supporting mixed case sensitivity
2794                  * access or is completely case-insensitive.  Note
2795                  * that the file system is always case preserving.
2796                  *
2797                  * In mixed sensitivity mode case sensitive behavior
2798                  * is the default.  FIGNORECASE must be used to
2799                  * explicitly request case insensitive behavior.
2800                  *
2801                  * If the source and target names provided differ only
2802                  * by case (e.g., a request to rename 'tim' to 'Tim'),
2803                  * we will treat this as a special case in the
2804                  * case-insensitive mode: as long as the source name
2805                  * is an exact match, we will allow this to proceed as
2806                  * a name-change request.
2807                  */
2808                 if ((zsb->z_case == ZFS_CASE_INSENSITIVE ||
2809                     (zsb->z_case == ZFS_CASE_MIXED &&
2810                     flags & FIGNORECASE)) &&
2811                     u8_strcmp(snm, tnm, 0, zsb->z_norm, U8_UNICODE_LATEST,
2812                     &error) == 0) {
2813                         /*
2814                          * case preserving rename request, require exact
2815                          * name matches
2816                          */
2817                         zflg |= ZCIEXACT;
2818                         zflg &= ~ZCILOOK;
2819                 }
2820         }
2821
2822         /*
2823          * If the source and destination directories are the same, we should
2824          * grab the z_name_lock of that directory only once.
2825          */
2826         if (sdzp == tdzp) {
2827                 zflg |= ZHAVELOCK;
2828                 rw_enter(&sdzp->z_name_lock, RW_READER);
2829         }
2830
2831         if (cmp < 0) {
2832                 serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
2833                     ZEXISTS | zflg, NULL, NULL);
2834                 terr = zfs_dirent_lock(&tdl,
2835                     tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
2836         } else {
2837                 terr = zfs_dirent_lock(&tdl,
2838                     tdzp, tnm, &tzp, zflg, NULL, NULL);
2839                 serr = zfs_dirent_lock(&sdl,
2840                     sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
2841                     NULL, NULL);
2842         }
2843
2844         if (serr) {
2845                 /*
2846                  * Source entry invalid or not there.
2847                  */
2848                 if (!terr) {
2849                         zfs_dirent_unlock(tdl);
2850                         if (tzp)
2851                                 iput(ZTOI(tzp));
2852                 }
2853
2854                 if (sdzp == tdzp)
2855                         rw_exit(&sdzp->z_name_lock);
2856
2857                 if (strcmp(snm, "..") == 0)
2858                         serr = EINVAL;
2859                 ZFS_EXIT(zsb);
2860                 return (serr);
2861         }
2862         if (terr) {
2863                 zfs_dirent_unlock(sdl);
2864                 iput(ZTOI(szp));
2865
2866                 if (sdzp == tdzp)
2867                         rw_exit(&sdzp->z_name_lock);
2868
2869                 if (strcmp(tnm, "..") == 0)
2870                         terr = EINVAL;
2871                 ZFS_EXIT(zsb);
2872                 return (terr);
2873         }
2874
2875         /*
2876          * Must have write access at the source to remove the old entry
2877          * and write access at the target to create the new entry.
2878          * Note that if target and source are the same, this can be
2879          * done in a single check.
2880          */
2881
2882         if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)))
2883                 goto out;
2884
2885         if (S_ISDIR(ZTOI(szp)->i_mode)) {
2886                 /*
2887                  * Check to make sure rename is valid.
2888                  * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
2889                  */
2890                 if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl)))
2891                         goto out;
2892         }
2893
2894         /*
2895          * Does target exist?
2896          */
2897         if (tzp) {
2898                 /*
2899                  * Source and target must be the same type.
2900                  */
2901                 if (S_ISDIR(ZTOI(szp)->i_mode)) {
2902                         if (!S_ISDIR(ZTOI(tzp)->i_mode)) {
2903                                 error = ENOTDIR;
2904                                 goto out;
2905                         }
2906                 } else {
2907                         if (S_ISDIR(ZTOI(tzp)->i_mode)) {
2908                                 error = EISDIR;
2909                                 goto out;
2910                         }
2911                 }
2912                 /*
2913                  * POSIX dictates that when the source and target
2914                  * entries refer to the same file object, rename
2915                  * must do nothing and exit without error.
2916                  */
2917                 if (szp->z_id == tzp->z_id) {
2918                         error = 0;
2919                         goto out;
2920                 }
2921         }
2922
2923         tx = dmu_tx_create(zsb->z_os);
2924         dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
2925         dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
2926         dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
2927         dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
2928         if (sdzp != tdzp) {
2929                 dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
2930                 zfs_sa_upgrade_txholds(tx, tdzp);
2931         }
2932         if (tzp) {
2933                 dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
2934                 zfs_sa_upgrade_txholds(tx, tzp);
2935         }
2936
2937         zfs_sa_upgrade_txholds(tx, szp);
2938         dmu_tx_hold_zap(tx, zsb->z_unlinkedobj, FALSE, NULL);
2939         error = dmu_tx_assign(tx, TXG_NOWAIT);
2940         if (error) {
2941                 if (zl != NULL)
2942                         zfs_rename_unlock(&zl);
2943                 zfs_dirent_unlock(sdl);
2944                 zfs_dirent_unlock(tdl);
2945
2946                 if (sdzp == tdzp)
2947                         rw_exit(&sdzp->z_name_lock);
2948
2949                 iput(ZTOI(szp));
2950                 if (tzp)
2951                         iput(ZTOI(tzp));
2952                 if (error == ERESTART) {
2953                         dmu_tx_wait(tx);
2954                         dmu_tx_abort(tx);
2955                         goto top;
2956                 }
2957                 dmu_tx_abort(tx);
2958                 ZFS_EXIT(zsb);
2959                 return (error);
2960         }
2961
2962         if (tzp)        /* Attempt to remove the existing target */
2963                 error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
2964
2965         if (error == 0) {
2966                 error = zfs_link_create(tdl, szp, tx, ZRENAMING);
2967                 if (error == 0) {
2968                         szp->z_pflags |= ZFS_AV_MODIFIED;
2969
2970                         error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zsb),
2971                             (void *)&szp->z_pflags, sizeof (uint64_t), tx);
2972                         ASSERT3U(error, ==, 0);
2973
2974                         error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
2975                         if (error == 0) {
2976                                 zfs_log_rename(zilog, tx, TX_RENAME |
2977                                     (flags & FIGNORECASE ? TX_CI : 0), sdzp,
2978                                     sdl->dl_name, tdzp, tdl->dl_name, szp);
2979                         } else {
2980                                 /*
2981                                  * At this point, we have successfully created
2982                                  * the target name, but have failed to remove
2983                                  * the source name.  Since the create was done
2984                                  * with the ZRENAMING flag, there are
2985                                  * complications; for one, the link count is
2986                                  * wrong.  The easiest way to deal with this
2987                                  * is to remove the newly created target, and
2988                                  * return the original error.  This must
2989                                  * succeed; fortunately, it is very unlikely to
2990                                  * fail, since we just created it.
2991                                  */
2992                                 VERIFY3U(zfs_link_destroy(tdl, szp, tx,
2993                                     ZRENAMING, NULL), ==, 0);
2994                         }
2995                 }
2996         }
2997
2998         dmu_tx_commit(tx);
2999 out:
3000         if (zl != NULL)
3001                 zfs_rename_unlock(&zl);
3002
3003         zfs_dirent_unlock(sdl);
3004         zfs_dirent_unlock(tdl);
3005
3006         zfs_inode_update(sdzp);
3007         if (sdzp == tdzp)
3008                 rw_exit(&sdzp->z_name_lock);
3009
3010         if (sdzp != tdzp)
3011                 zfs_inode_update(tdzp);
3012
3013         zfs_inode_update(szp);
3014         iput(ZTOI(szp));
3015         if (tzp) {
3016                 zfs_inode_update(tzp);
3017                 iput(ZTOI(tzp));
3018         }
3019
3020         if (zsb->z_os->os_sync == ZFS_SYNC_ALWAYS)
3021                 zil_commit(zilog, 0);
3022
3023         ZFS_EXIT(zsb);
3024         return (error);
3025 }
3026 EXPORT_SYMBOL(zfs_rename);
3027
3028 /*
3029  * Insert the indicated symbolic reference entry into the directory.
3030  *
3031  *      IN:     dip     - Directory to contain new symbolic link.
3032  *              link    - Name for new symlink entry.
3033  *              vap     - Attributes of new entry.
3034  *              target  - Target path of new symlink.
3035  *
3036  *              cr      - credentials of caller.
3037  *              flags   - case flags
3038  *
3039  *      RETURN: 0 if success
3040  *              error code if failure
3041  *
3042  * Timestamps:
3043  *      dip - ctime|mtime updated
3044  */
3045 /*ARGSUSED*/
3046 int
3047 zfs_symlink(struct inode *dip, char *name, vattr_t *vap, char *link,
3048     struct inode **ipp, cred_t *cr, int flags)
3049 {
3050         znode_t         *zp, *dzp = ITOZ(dip);
3051         zfs_dirlock_t   *dl;
3052         dmu_tx_t        *tx;
3053         zfs_sb_t        *zsb = ITOZSB(dip);
3054         zilog_t         *zilog;
3055         uint64_t        len = strlen(link);
3056         int             error;
3057         int             zflg = ZNEW;
3058         zfs_acl_ids_t   acl_ids;
3059         boolean_t       fuid_dirtied;
3060         uint64_t        txtype = TX_SYMLINK;
3061
3062         ASSERT(S_ISLNK(vap->va_mode));
3063
3064         ZFS_ENTER(zsb);
3065         ZFS_VERIFY_ZP(dzp);
3066         zilog = zsb->z_log;
3067
3068         if (zsb->z_utf8 && u8_validate(name, strlen(name),
3069             NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3070                 ZFS_EXIT(zsb);
3071                 return (EILSEQ);
3072         }
3073         if (flags & FIGNORECASE)
3074                 zflg |= ZCILOOK;
3075
3076         if (len > MAXPATHLEN) {
3077                 ZFS_EXIT(zsb);
3078                 return (ENAMETOOLONG);
3079         }
3080
3081         if ((error = zfs_acl_ids_create(dzp, 0,
3082             vap, cr, NULL, &acl_ids)) != 0) {
3083                 ZFS_EXIT(zsb);
3084                 return (error);
3085         }
3086 top:
3087         *ipp = NULL;
3088
3089         /*
3090          * Attempt to lock directory; fail if entry already exists.
3091          */
3092         error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
3093         if (error) {
3094                 zfs_acl_ids_free(&acl_ids);
3095                 ZFS_EXIT(zsb);
3096                 return (error);
3097         }
3098
3099         if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
3100                 zfs_acl_ids_free(&acl_ids);
3101                 zfs_dirent_unlock(dl);
3102                 ZFS_EXIT(zsb);
3103                 return (error);
3104         }
3105
3106         if (zfs_acl_ids_overquota(zsb, &acl_ids)) {
3107                 zfs_acl_ids_free(&acl_ids);
3108                 zfs_dirent_unlock(dl);
3109                 ZFS_EXIT(zsb);
3110                 return (EDQUOT);
3111         }
3112         tx = dmu_tx_create(zsb->z_os);
3113         fuid_dirtied = zsb->z_fuid_dirty;
3114         dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
3115         dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
3116         dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
3117             ZFS_SA_BASE_ATTR_SIZE + len);
3118         dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
3119         if (!zsb->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3120                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
3121                     acl_ids.z_aclp->z_acl_bytes);
3122         }
3123         if (fuid_dirtied)
3124                 zfs_fuid_txhold(zsb, tx);
3125         error = dmu_tx_assign(tx, TXG_NOWAIT);
3126         if (error) {
3127                 zfs_dirent_unlock(dl);
3128                 if (error == ERESTART) {
3129                         dmu_tx_wait(tx);
3130                         dmu_tx_abort(tx);
3131                         goto top;
3132                 }
3133                 zfs_acl_ids_free(&acl_ids);
3134                 dmu_tx_abort(tx);
3135                 ZFS_EXIT(zsb);
3136                 return (error);
3137         }
3138
3139         /*
3140          * Create a new object for the symlink.
3141          * for version 4 ZPL datsets the symlink will be an SA attribute
3142          */
3143         zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
3144
3145         if (fuid_dirtied)
3146                 zfs_fuid_sync(zsb, tx);
3147
3148         mutex_enter(&zp->z_lock);
3149         if (zp->z_is_sa)
3150                 error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zsb),
3151                     link, len, tx);
3152         else
3153                 zfs_sa_symlink(zp, link, len, tx);
3154         mutex_exit(&zp->z_lock);
3155
3156         zp->z_size = len;
3157         (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zsb),
3158             &zp->z_size, sizeof (zp->z_size), tx);
3159         /*
3160          * Insert the new object into the directory.
3161          */
3162         (void) zfs_link_create(dl, zp, tx, ZNEW);
3163
3164         if (flags & FIGNORECASE)
3165                 txtype |= TX_CI;
3166         zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
3167
3168         zfs_inode_update(dzp);
3169         zfs_inode_update(zp);
3170
3171         zfs_acl_ids_free(&acl_ids);
3172
3173         dmu_tx_commit(tx);
3174
3175         zfs_dirent_unlock(dl);
3176
3177         *ipp = ZTOI(zp);
3178         iput(ZTOI(zp));
3179
3180         if (zsb->z_os->os_sync == ZFS_SYNC_ALWAYS)
3181                 zil_commit(zilog, 0);
3182
3183         ZFS_EXIT(zsb);
3184         return (error);
3185 }
3186 EXPORT_SYMBOL(zfs_symlink);
3187
3188 /*
3189  * Return, in the buffer contained in the provided uio structure,
3190  * the symbolic path referred to by ip.
3191  *
3192  *      IN:     dentry  - dentry of symbolic link.
3193  *              nd      - namedata for symlink
3194  *
3195  *      RETURN: 0 if success
3196  *              error code if failure
3197  *
3198  * Timestamps:
3199  *      ip - atime updated
3200  */
3201 /* ARGSUSED */
3202 int
3203 zfs_follow_link(struct dentry *dentry, struct nameidata *nd)
3204 {
3205         struct inode    *ip = dentry->d_inode;
3206         znode_t         *zp = ITOZ(ip);
3207         zfs_sb_t        *zsb = ITOZSB(ip);
3208         struct iovec    iov;
3209         uio_t           uio;
3210         int             error;
3211
3212         ZFS_ENTER(zsb);
3213         ZFS_VERIFY_ZP(zp);
3214
3215         iov.iov_len = MAXPATHLEN + 1;
3216         iov.iov_base = kmem_zalloc(iov.iov_len, KM_SLEEP);
3217
3218         uio.uio_iov = &iov;
3219         uio.uio_iovcnt = 1;
3220         uio.uio_resid = iov.iov_len;
3221         uio.uio_segflg = UIO_SYSSPACE;
3222
3223         mutex_enter(&zp->z_lock);
3224         if (zp->z_is_sa)
3225                 error = sa_lookup_uio(zp->z_sa_hdl, SA_ZPL_SYMLINK(zsb), &uio);
3226         else
3227                 error = zfs_sa_readlink(zp, &uio);
3228         mutex_exit(&zp->z_lock);
3229
3230         ZFS_ACCESSTIME_STAMP(zsb, zp);
3231         zfs_inode_update(zp);
3232
3233         if (error) {
3234                 kmem_free(iov.iov_base, iov.iov_len);
3235                 nd_set_link(nd, ERR_PTR(error));
3236         } else {
3237                 nd_set_link(nd, iov.iov_base);
3238         }
3239
3240         ZFS_EXIT(zsb);
3241         return (error);
3242 }
3243 EXPORT_SYMBOL(zfs_follow_link);
3244
3245 /*
3246  * Insert a new entry into directory tdip referencing sip.
3247  *
3248  *      IN:     tdip    - Directory to contain new entry.
3249  *              sip     - inode of new entry.
3250  *              name    - name of new entry.
3251  *              cr      - credentials of caller.
3252  *
3253  *      RETURN: 0 if success
3254  *              error code if failure
3255  *
3256  * Timestamps:
3257  *      tdip - ctime|mtime updated
3258  *       sip - ctime updated
3259  */
3260 /* ARGSUSED */
3261 int
3262 zfs_link(struct inode *tdip, struct inode *sip, char *name, cred_t *cr)
3263 {
3264         znode_t         *dzp = ITOZ(tdip);
3265         znode_t         *tzp, *szp;
3266         zfs_sb_t        *zsb = ITOZSB(tdip);
3267         zilog_t         *zilog;
3268         zfs_dirlock_t   *dl;
3269         dmu_tx_t        *tx;
3270         int             error;
3271         int             zf = ZNEW;
3272         uint64_t        parent;
3273         uid_t           owner;
3274
3275         ASSERT(S_ISDIR(tdip->i_mode));
3276
3277         ZFS_ENTER(zsb);
3278         ZFS_VERIFY_ZP(dzp);
3279         zilog = zsb->z_log;
3280
3281         /*
3282          * POSIX dictates that we return EPERM here.
3283          * Better choices include ENOTSUP or EISDIR.
3284          */
3285         if (S_ISDIR(sip->i_mode)) {
3286                 ZFS_EXIT(zsb);
3287                 return (EPERM);
3288         }
3289
3290         if (sip->i_sb != tdip->i_sb) {
3291                 ZFS_EXIT(zsb);
3292                 return (EXDEV);
3293         }
3294
3295         szp = ITOZ(sip);
3296         ZFS_VERIFY_ZP(szp);
3297
3298         /* Prevent links to .zfs/shares files */
3299
3300         if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zsb),
3301             &parent, sizeof (uint64_t))) != 0) {
3302                 ZFS_EXIT(zsb);
3303                 return (error);
3304         }
3305         if (parent == zsb->z_shares_dir) {
3306                 ZFS_EXIT(zsb);
3307                 return (EPERM);
3308         }
3309
3310         if (zsb->z_utf8 && u8_validate(name,
3311             strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3312                 ZFS_EXIT(zsb);
3313                 return (EILSEQ);
3314         }
3315 #ifdef HAVE_PN_UTILS
3316         if (flags & FIGNORECASE)
3317                 zf |= ZCILOOK;
3318 #endif /* HAVE_PN_UTILS */
3319
3320         /*
3321          * We do not support links between attributes and non-attributes
3322          * because of the potential security risk of creating links
3323          * into "normal" file space in order to circumvent restrictions
3324          * imposed in attribute space.
3325          */
3326         if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
3327                 ZFS_EXIT(zsb);
3328                 return (EINVAL);
3329         }
3330
3331         owner = zfs_fuid_map_id(zsb, szp->z_uid, cr, ZFS_OWNER);
3332         if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
3333                 ZFS_EXIT(zsb);
3334                 return (EPERM);
3335         }
3336
3337         if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
3338                 ZFS_EXIT(zsb);
3339                 return (error);
3340         }
3341
3342 top:
3343         /*
3344          * Attempt to lock directory; fail if entry already exists.
3345          */
3346         error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
3347         if (error) {
3348                 ZFS_EXIT(zsb);
3349                 return (error);
3350         }
3351
3352         tx = dmu_tx_create(zsb->z_os);
3353         dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3354         dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
3355         zfs_sa_upgrade_txholds(tx, szp);
3356         zfs_sa_upgrade_txholds(tx, dzp);
3357         error = dmu_tx_assign(tx, TXG_NOWAIT);
3358         if (error) {
3359                 zfs_dirent_unlock(dl);
3360                 if (error == ERESTART) {
3361                         dmu_tx_wait(tx);
3362                         dmu_tx_abort(tx);
3363                         goto top;
3364                 }
3365                 dmu_tx_abort(tx);
3366                 ZFS_EXIT(zsb);
3367                 return (error);
3368         }
3369
3370         error = zfs_link_create(dl, szp, tx, 0);
3371
3372         if (error == 0) {
3373                 uint64_t txtype = TX_LINK;
3374 #ifdef HAVE_PN_UTILS
3375                 if (flags & FIGNORECASE)
3376                         txtype |= TX_CI;
3377 #endif /* HAVE_PN_UTILS */
3378                 zfs_log_link(zilog, tx, txtype, dzp, szp, name);
3379         }
3380
3381         dmu_tx_commit(tx);
3382
3383         zfs_dirent_unlock(dl);
3384
3385         if (zsb->z_os->os_sync == ZFS_SYNC_ALWAYS)
3386                 zil_commit(zilog, 0);
3387
3388         zfs_inode_update(dzp);
3389         zfs_inode_update(szp);
3390         ZFS_EXIT(zsb);
3391         return (error);
3392 }
3393 EXPORT_SYMBOL(zfs_link);
3394
3395 /*
3396  * zfs_null_putapage() is used when the file system has been force
3397  * unmounted. It just drops the pages.
3398  */
3399 /* ARGSUSED */
3400 static int
3401 zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
3402                 size_t *lenp, int flags, cred_t *cr)
3403 {
3404         pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR);
3405         return (0);
3406 }
3407
3408 /*
3409  * Push a page out to disk, klustering if possible.
3410  *
3411  *      IN:     vp      - file to push page to.
3412  *              pp      - page to push.
3413  *              flags   - additional flags.
3414  *              cr      - credentials of caller.
3415  *
3416  *      OUT:    offp    - start of range pushed.
3417  *              lenp    - len of range pushed.
3418  *
3419  *      RETURN: 0 if success
3420  *              error code if failure
3421  *
3422  * NOTE: callers must have locked the page to be pushed.  On
3423  * exit, the page (and all other pages in the kluster) must be
3424  * unlocked.
3425  */
3426 /* ARGSUSED */
3427 static int
3428 zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
3429                 size_t *lenp, int flags, cred_t *cr)
3430 {
3431         znode_t         *zp = VTOZ(vp);
3432         zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
3433         dmu_tx_t        *tx;
3434         u_offset_t      off, koff;
3435         size_t          len, klen;
3436         int             err;
3437
3438         off = pp->p_offset;
3439         len = PAGESIZE;
3440         /*
3441          * If our blocksize is bigger than the page size, try to kluster
3442          * multiple pages so that we write a full block (thus avoiding
3443          * a read-modify-write).
3444          */
3445         if (off < zp->z_size && zp->z_blksz > PAGESIZE) {
3446                 klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
3447                 koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0;
3448                 ASSERT(koff <= zp->z_size);
3449                 if (koff + klen > zp->z_size)
3450                         klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE);
3451                 pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags);
3452         }
3453         ASSERT3U(btop(len), ==, btopr(len));
3454
3455         /*
3456          * Can't push pages past end-of-file.
3457          */
3458         if (off >= zp->z_size) {
3459                 /* ignore all pages */
3460                 err = 0;
3461                 goto out;
3462         } else if (off + len > zp->z_size) {
3463                 int npages = btopr(zp->z_size - off);
3464                 page_t *trunc;
3465
3466                 page_list_break(&pp, &trunc, npages);
3467                 /* ignore pages past end of file */
3468                 if (trunc)
3469                         pvn_write_done(trunc, flags);
3470                 len = zp->z_size - off;
3471         }
3472
3473         if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
3474             zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
3475                 err = EDQUOT;
3476                 goto out;
3477         }
3478 top:
3479         tx = dmu_tx_create(zfsvfs->z_os);
3480         dmu_tx_hold_write(tx, zp->z_id, off, len);
3481
3482         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3483         zfs_sa_upgrade_txholds(tx, zp);
3484         err = dmu_tx_assign(tx, TXG_NOWAIT);
3485         if (err != 0) {
3486                 if (err == ERESTART) {
3487                         dmu_tx_wait(tx);
3488                         dmu_tx_abort(tx);
3489                         goto top;
3490                 }
3491                 dmu_tx_abort(tx);
3492                 goto out;
3493         }
3494
3495         if (zp->z_blksz <= PAGESIZE) {
3496                 caddr_t va = zfs_map_page(pp, S_READ);
3497                 ASSERT3U(len, <=, PAGESIZE);
3498                 dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
3499                 zfs_unmap_page(pp, va);
3500         } else {
3501                 err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx);
3502         }
3503
3504         if (err == 0) {
3505                 uint64_t mtime[2], ctime[2];
3506                 sa_bulk_attr_t bulk[3];
3507                 int count = 0;
3508
3509                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
3510                     &mtime, 16);
3511                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3512                     &ctime, 16);
3513                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
3514                     &zp->z_pflags, 8);
3515                 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
3516                     B_TRUE);
3517                 zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
3518         }
3519         dmu_tx_commit(tx);
3520
3521 out:
3522         pvn_write_done(pp, (err ? B_ERROR : 0) | flags);
3523         if (offp)
3524                 *offp = off;
3525         if (lenp)
3526                 *lenp = len;
3527
3528         return (err);
3529 }
3530
3531 /*
3532  * Copy the portion of the file indicated from pages into the file.
3533  * The pages are stored in a page list attached to the files vnode.
3534  *
3535  *      IN:     vp      - vnode of file to push page data to.
3536  *              off     - position in file to put data.
3537  *              len     - amount of data to write.
3538  *              flags   - flags to control the operation.
3539  *              cr      - credentials of caller.
3540  *              ct      - caller context.
3541  *
3542  *      RETURN: 0 if success
3543  *              error code if failure
3544  *
3545  * Timestamps:
3546  *      vp - ctime|mtime updated
3547  */
3548 /*ARGSUSED*/
3549 static int
3550 zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
3551     caller_context_t *ct)
3552 {
3553         znode_t         *zp = VTOZ(vp);
3554         zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
3555         page_t          *pp;
3556         size_t          io_len;
3557         u_offset_t      io_off;
3558         uint_t          blksz;
3559         rl_t            *rl;
3560         int             error = 0;
3561
3562         ZFS_ENTER(zfsvfs);
3563         ZFS_VERIFY_ZP(zp);
3564
3565         /*
3566          * Align this request to the file block size in case we kluster.
3567          * XXX - this can result in pretty aggresive locking, which can
3568          * impact simultanious read/write access.  One option might be
3569          * to break up long requests (len == 0) into block-by-block
3570          * operations to get narrower locking.
3571          */
3572         blksz = zp->z_blksz;
3573         if (ISP2(blksz))
3574                 io_off = P2ALIGN_TYPED(off, blksz, u_offset_t);
3575         else
3576                 io_off = 0;
3577         if (len > 0 && ISP2(blksz))
3578                 io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t);
3579         else
3580                 io_len = 0;
3581
3582         if (io_len == 0) {
3583                 /*
3584                  * Search the entire vp list for pages >= io_off.
3585                  */
3586                 rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER);
3587                 error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr);
3588                 goto out;
3589         }
3590         rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER);
3591
3592         if (off > zp->z_size) {
3593                 /* past end of file */
3594                 zfs_range_unlock(rl);
3595                 ZFS_EXIT(zfsvfs);
3596                 return (0);
3597         }
3598
3599         len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off);
3600
3601         for (off = io_off; io_off < off + len; io_off += io_len) {
3602                 if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
3603                         pp = page_lookup(vp, io_off,
3604                             (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED);
3605                 } else {
3606                         pp = page_lookup_nowait(vp, io_off,
3607                             (flags & B_FREE) ? SE_EXCL : SE_SHARED);
3608                 }
3609
3610                 if (pp != NULL && pvn_getdirty(pp, flags)) {
3611                         int err;
3612
3613                         /*
3614                          * Found a dirty page to push
3615                          */
3616                         err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr);
3617                         if (err)
3618                                 error = err;
3619                 } else {
3620                         io_len = PAGESIZE;
3621                 }
3622         }
3623 out:
3624         zfs_range_unlock(rl);
3625         if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3626                 zil_commit(zfsvfs->z_log, zp->z_id);
3627         ZFS_EXIT(zfsvfs);
3628         return (error);
3629 }
3630
3631 /*ARGSUSED*/
3632 void
3633 zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
3634 {
3635         znode_t *zp = VTOZ(vp);
3636         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
3637         int error;
3638
3639         rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
3640         if (zp->z_sa_hdl == NULL) {
3641                 /*
3642                  * The fs has been unmounted, or we did a
3643                  * suspend/resume and this file no longer exists.
3644                  */
3645                 if (vn_has_cached_data(vp)) {
3646                         (void) pvn_vplist_dirty(vp, 0, zfs_null_putapage,
3647                             B_INVAL, cr);
3648                 }
3649
3650                 mutex_enter(&zp->z_lock);
3651                 mutex_enter(&vp->v_lock);
3652                 ASSERT(vp->v_count == 1);
3653                 vp->v_count = 0;
3654                 mutex_exit(&vp->v_lock);
3655                 mutex_exit(&zp->z_lock);
3656                 rw_exit(&zfsvfs->z_teardown_inactive_lock);
3657                 zfs_znode_free(zp);
3658                 return;
3659         }
3660
3661         /*
3662          * Attempt to push any data in the page cache.  If this fails
3663          * we will get kicked out later in zfs_zinactive().
3664          */
3665         if (vn_has_cached_data(vp)) {
3666                 (void) pvn_vplist_dirty(vp, 0, zfs_putapage, B_INVAL|B_ASYNC,
3667                     cr);
3668         }
3669
3670         if (zp->z_atime_dirty && zp->z_unlinked == 0) {
3671                 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
3672
3673                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3674                 zfs_sa_upgrade_txholds(tx, zp);
3675                 error = dmu_tx_assign(tx, TXG_WAIT);
3676                 if (error) {
3677                         dmu_tx_abort(tx);
3678                 } else {
3679                         mutex_enter(&zp->z_lock);
3680                         (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zsb),
3681                             (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
3682                         zp->z_atime_dirty = 0;
3683                         mutex_exit(&zp->z_lock);
3684                         dmu_tx_commit(tx);
3685                 }
3686         }
3687
3688         zfs_zinactive(zp);
3689         rw_exit(&zsb->z_teardown_inactive_lock);
3690 }
3691 EXPORT_SYMBOL(zfs_inactive);
3692
3693 /*
3694  * Bounds-check the seek operation.
3695  *
3696  *      IN:     ip      - inode seeking within
3697  *              ooff    - old file offset
3698  *              noffp   - pointer to new file offset
3699  *              ct      - caller context
3700  *
3701  *      RETURN: 0 if success
3702  *              EINVAL if new offset invalid
3703  */
3704 /* ARGSUSED */
3705 int
3706 zfs_seek(struct inode *ip, offset_t ooff, offset_t *noffp,
3707     caller_context_t *ct)
3708 {
3709         if (S_ISDIR(ip->i_mode))
3710                 return (0);
3711         return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
3712 }
3713 EXPORT_SYMBOL(zfs_seek);
3714
3715 /*
3716  * Pre-filter the generic locking function to trap attempts to place
3717  * a mandatory lock on a memory mapped file.
3718  */
3719 static int
3720 zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
3721     flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct)
3722 {
3723         znode_t *zp = VTOZ(vp);
3724         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
3725
3726         ZFS_ENTER(zfsvfs);
3727         ZFS_VERIFY_ZP(zp);
3728
3729         /*
3730          * We are following the UFS semantics with respect to mapcnt
3731          * here: If we see that the file is mapped already, then we will
3732          * return an error, but we don't worry about races between this
3733          * function and zfs_map().
3734          */
3735         if (zp->z_mapcnt > 0 && MANDMODE(zp->z_mode)) {
3736                 ZFS_EXIT(zfsvfs);
3737                 return (EAGAIN);
3738         }
3739         ZFS_EXIT(zfsvfs);
3740         return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
3741 }
3742
3743 /*
3744  * If we can't find a page in the cache, we will create a new page
3745  * and fill it with file data.  For efficiency, we may try to fill
3746  * multiple pages at once (klustering) to fill up the supplied page
3747  * list.  Note that the pages to be filled are held with an exclusive
3748  * lock to prevent access by other threads while they are being filled.
3749  */
3750 static int
3751 zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
3752     caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw)
3753 {
3754         znode_t *zp = VTOZ(vp);
3755         page_t *pp, *cur_pp;
3756         objset_t *os = zp->z_zfsvfs->z_os;
3757         u_offset_t io_off, total;
3758         size_t io_len;
3759         int err;
3760
3761         if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) {
3762                 /*
3763                  * We only have a single page, don't bother klustering
3764                  */
3765                 io_off = off;
3766                 io_len = PAGESIZE;
3767                 pp = page_create_va(vp, io_off, io_len,
3768                     PG_EXCL | PG_WAIT, seg, addr);
3769         } else {
3770                 /*
3771                  * Try to find enough pages to fill the page list
3772                  */
3773                 pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
3774                     &io_len, off, plsz, 0);
3775         }
3776         if (pp == NULL) {
3777                 /*
3778                  * The page already exists, nothing to do here.
3779                  */
3780                 *pl = NULL;
3781                 return (0);
3782         }
3783
3784         /*
3785          * Fill the pages in the kluster.
3786          */
3787         cur_pp = pp;
3788         for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
3789                 caddr_t va;
3790
3791                 ASSERT3U(io_off, ==, cur_pp->p_offset);
3792                 va = zfs_map_page(cur_pp, S_WRITE);
3793                 err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
3794                     DMU_READ_PREFETCH);
3795                 zfs_unmap_page(cur_pp, va);
3796                 if (err) {
3797                         /* On error, toss the entire kluster */
3798                         pvn_read_done(pp, B_ERROR);
3799                         /* convert checksum errors into IO errors */
3800                         if (err == ECKSUM)
3801                                 err = EIO;
3802                         return (err);
3803                 }
3804                 cur_pp = cur_pp->p_next;
3805         }
3806
3807         /*
3808          * Fill in the page list array from the kluster starting
3809          * from the desired offset `off'.
3810          * NOTE: the page list will always be null terminated.
3811          */
3812         pvn_plist_init(pp, pl, plsz, off, io_len, rw);
3813         ASSERT(pl == NULL || (*pl)->p_offset == off);
3814
3815         return (0);
3816 }
3817
3818 /*
3819  * Return pointers to the pages for the file region [off, off + len]
3820  * in the pl array.  If plsz is greater than len, this function may
3821  * also return page pointers from after the specified region
3822  * (i.e. the region [off, off + plsz]).  These additional pages are
3823  * only returned if they are already in the cache, or were created as
3824  * part of a klustered read.
3825  *
3826  *      IN:     vp      - vnode of file to get data from.
3827  *              off     - position in file to get data from.
3828  *              len     - amount of data to retrieve.
3829  *              plsz    - length of provided page list.
3830  *              seg     - segment to obtain pages for.
3831  *              addr    - virtual address of fault.
3832  *              rw      - mode of created pages.
3833  *              cr      - credentials of caller.
3834  *              ct      - caller context.
3835  *
3836  *      OUT:    protp   - protection mode of created pages.
3837  *              pl      - list of pages created.
3838  *
3839  *      RETURN: 0 if success
3840  *              error code if failure
3841  *
3842  * Timestamps:
3843  *      vp - atime updated
3844  */
3845 /* ARGSUSED */
3846 static int
3847 zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
3848         page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
3849         enum seg_rw rw, cred_t *cr, caller_context_t *ct)
3850 {
3851         znode_t         *zp = VTOZ(vp);
3852         zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
3853         page_t          **pl0 = pl;
3854         int             err = 0;
3855
3856         /* we do our own caching, faultahead is unnecessary */
3857         if (pl == NULL)
3858                 return (0);
3859         else if (len > plsz)
3860                 len = plsz;
3861         else
3862                 len = P2ROUNDUP(len, PAGESIZE);
3863         ASSERT(plsz >= len);
3864
3865         ZFS_ENTER(zfsvfs);
3866         ZFS_VERIFY_ZP(zp);
3867
3868         if (protp)
3869                 *protp = PROT_ALL;
3870
3871         /*
3872          * Loop through the requested range [off, off + len) looking
3873          * for pages.  If we don't find a page, we will need to create
3874          * a new page and fill it with data from the file.
3875          */
3876         while (len > 0) {
3877                 if (*pl = page_lookup(vp, off, SE_SHARED))
3878                         *(pl+1) = NULL;
3879                 else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw))
3880                         goto out;
3881                 while (*pl) {
3882                         ASSERT3U((*pl)->p_offset, ==, off);
3883                         off += PAGESIZE;
3884                         addr += PAGESIZE;
3885                         if (len > 0) {
3886                                 ASSERT3U(len, >=, PAGESIZE);
3887                                 len -= PAGESIZE;
3888                         }
3889                         ASSERT3U(plsz, >=, PAGESIZE);
3890                         plsz -= PAGESIZE;
3891                         pl++;
3892                 }
3893         }
3894
3895         /*
3896          * Fill out the page array with any pages already in the cache.
3897          */
3898         while (plsz > 0 &&
3899             (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) {
3900                         off += PAGESIZE;
3901                         plsz -= PAGESIZE;
3902         }
3903 out:
3904         if (err) {
3905                 /*
3906                  * Release any pages we have previously locked.
3907                  */
3908                 while (pl > pl0)
3909                         page_unlock(*--pl);
3910         } else {
3911                 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
3912         }
3913
3914         *pl = NULL;
3915
3916         ZFS_EXIT(zfsvfs);
3917         return (err);
3918 }
3919
3920 /*
3921  * Request a memory map for a section of a file.  This code interacts
3922  * with common code and the VM system as follows:
3923  *
3924  *      common code calls mmap(), which ends up in smmap_common()
3925  *
3926  *      this calls VOP_MAP(), which takes you into (say) zfs
3927  *
3928  *      zfs_map() calls as_map(), passing segvn_create() as the callback
3929  *
3930  *      segvn_create() creates the new segment and calls VOP_ADDMAP()
3931  *
3932  *      zfs_addmap() updates z_mapcnt
3933  */
3934 /*ARGSUSED*/
3935 static int
3936 zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
3937     size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
3938     caller_context_t *ct)
3939 {
3940         znode_t *zp = VTOZ(vp);
3941         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
3942         segvn_crargs_t  vn_a;
3943         int             error;
3944
3945         ZFS_ENTER(zfsvfs);
3946         ZFS_VERIFY_ZP(zp);
3947
3948         if ((prot & PROT_WRITE) && (zp->z_pflags &
3949             (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
3950                 ZFS_EXIT(zfsvfs);
3951                 return (EPERM);
3952         }
3953
3954         if ((prot & (PROT_READ | PROT_EXEC)) &&
3955             (zp->z_pflags & ZFS_AV_QUARANTINED)) {
3956                 ZFS_EXIT(zfsvfs);
3957                 return (EACCES);
3958         }
3959
3960         if (vp->v_flag & VNOMAP) {
3961                 ZFS_EXIT(zfsvfs);
3962                 return (ENOSYS);
3963         }
3964
3965         if (off < 0 || len > MAXOFFSET_T - off) {
3966                 ZFS_EXIT(zfsvfs);
3967                 return (ENXIO);
3968         }
3969
3970         if (vp->v_type != VREG) {
3971                 ZFS_EXIT(zfsvfs);
3972                 return (ENODEV);
3973         }
3974
3975         /*
3976          * If file is locked, disallow mapping.
3977          */
3978         if (MANDMODE(zp->z_mode) && vn_has_flocks(vp)) {
3979                 ZFS_EXIT(zfsvfs);
3980                 return (EAGAIN);
3981         }
3982
3983         as_rangelock(as);
3984         error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
3985         if (error != 0) {
3986                 as_rangeunlock(as);
3987                 ZFS_EXIT(zfsvfs);
3988                 return (error);
3989         }
3990
3991         vn_a.vp = vp;
3992         vn_a.offset = (u_offset_t)off;
3993         vn_a.type = flags & MAP_TYPE;
3994         vn_a.prot = prot;
3995         vn_a.maxprot = maxprot;
3996         vn_a.cred = cr;
3997         vn_a.amp = NULL;
3998         vn_a.flags = flags & ~MAP_TYPE;
3999         vn_a.szc = 0;
4000         vn_a.lgrp_mem_policy_flags = 0;
4001
4002         error = as_map(as, *addrp, len, segvn_create, &vn_a);
4003
4004         as_rangeunlock(as);
4005         ZFS_EXIT(zfsvfs);
4006         return (error);
4007 }
4008
4009 /* ARGSUSED */
4010 static int
4011 zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4012     size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
4013     caller_context_t *ct)
4014 {
4015         uint64_t pages = btopr(len);
4016
4017         atomic_add_64(&VTOZ(vp)->z_mapcnt, pages);
4018         return (0);
4019 }
4020
4021 /*
4022  * The reason we push dirty pages as part of zfs_delmap() is so that we get a
4023  * more accurate mtime for the associated file.  Since we don't have a way of
4024  * detecting when the data was actually modified, we have to resort to
4025  * heuristics.  If an explicit msync() is done, then we mark the mtime when the
4026  * last page is pushed.  The problem occurs when the msync() call is omitted,
4027  * which by far the most common case:
4028  *
4029  *      open()
4030  *      mmap()
4031  *      <modify memory>
4032  *      munmap()
4033  *      close()
4034  *      <time lapse>
4035  *      putpage() via fsflush
4036  *
4037  * If we wait until fsflush to come along, we can have a modification time that
4038  * is some arbitrary point in the future.  In order to prevent this in the
4039  * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is
4040  * torn down.
4041  */
4042 /* ARGSUSED */
4043 static int
4044 zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4045     size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
4046     caller_context_t *ct)
4047 {
4048         uint64_t pages = btopr(len);
4049
4050         ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages);
4051         atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages);
4052
4053         if ((flags & MAP_SHARED) && (prot & PROT_WRITE) &&
4054             vn_has_cached_data(vp))
4055                 (void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct);
4056
4057         return (0);
4058 }
4059
4060 /*
4061  * convoff - converts the given data (start, whence) to the
4062  * given whence.
4063  */
4064 int
4065 convoff(struct inode *ip, flock64_t *lckdat, int  whence, offset_t offset)
4066 {
4067         struct kstat stat;
4068         int error;
4069
4070         if ((lckdat->l_whence == 2) || (whence == 2)) {
4071                 if ((error = zfs_getattr(ip, &stat, 0, CRED()) != 0))
4072                         return (error);
4073         }
4074
4075         switch (lckdat->l_whence) {
4076         case 1:
4077                 lckdat->l_start += offset;
4078                 break;
4079         case 2:
4080                 lckdat->l_start += stat.size;
4081                 /* FALLTHRU */
4082         case 0:
4083                 break;
4084         default:
4085                 return (EINVAL);
4086         }
4087
4088         if (lckdat->l_start < 0)
4089                 return (EINVAL);
4090
4091         switch (whence) {
4092         case 1:
4093                 lckdat->l_start -= offset;
4094                 break;
4095         case 2:
4096                 lckdat->l_start -= stat.size;
4097                 /* FALLTHRU */
4098         case 0:
4099                 break;
4100         default:
4101                 return (EINVAL);
4102         }
4103
4104         lckdat->l_whence = (short)whence;
4105         return (0);
4106 }
4107
4108 /*
4109  * Free or allocate space in a file.  Currently, this function only
4110  * supports the `F_FREESP' command.  However, this command is somewhat
4111  * misnamed, as its functionality includes the ability to allocate as
4112  * well as free space.
4113  *
4114  *      IN:     ip      - inode of file to free data in.
4115  *              cmd     - action to take (only F_FREESP supported).
4116  *              bfp     - section of file to free/alloc.
4117  *              flag    - current file open mode flags.
4118  *              offset  - current file offset.
4119  *              cr      - credentials of caller [UNUSED].
4120  *
4121  *      RETURN: 0 if success
4122  *              error code if failure
4123  *
4124  * Timestamps:
4125  *      ip - ctime|mtime updated
4126  */
4127 /* ARGSUSED */
4128 int
4129 zfs_space(struct inode *ip, int cmd, flock64_t *bfp, int flag,
4130     offset_t offset, cred_t *cr)
4131 {
4132         znode_t         *zp = ITOZ(ip);
4133         zfs_sb_t        *zsb = ITOZSB(ip);
4134         uint64_t        off, len;
4135         int             error;
4136
4137         ZFS_ENTER(zsb);
4138         ZFS_VERIFY_ZP(zp);
4139
4140         if (cmd != F_FREESP) {
4141                 ZFS_EXIT(zsb);
4142                 return (EINVAL);
4143         }
4144
4145         if ((error = convoff(ip, bfp, 0, offset))) {
4146                 ZFS_EXIT(zsb);
4147                 return (error);
4148         }
4149
4150         if (bfp->l_len < 0) {
4151                 ZFS_EXIT(zsb);
4152                 return (EINVAL);
4153         }
4154
4155         off = bfp->l_start;
4156         len = bfp->l_len; /* 0 means from off to end of file */
4157
4158         error = zfs_freesp(zp, off, len, flag, TRUE);
4159
4160         ZFS_EXIT(zsb);
4161         return (error);
4162 }
4163 EXPORT_SYMBOL(zfs_space);
4164
4165 /*ARGSUSED*/
4166 int
4167 zfs_fid(struct inode *ip, fid_t *fidp)
4168 {
4169         znode_t         *zp = ITOZ(ip);
4170         zfs_sb_t        *zsb = ITOZSB(ip);
4171         uint32_t        gen;
4172         uint64_t        gen64;
4173         uint64_t        object = zp->z_id;
4174         zfid_short_t    *zfid;
4175         int             size, i, error;
4176
4177         ZFS_ENTER(zsb);
4178         ZFS_VERIFY_ZP(zp);
4179
4180         if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zsb),
4181             &gen64, sizeof (uint64_t))) != 0) {
4182                 ZFS_EXIT(zsb);
4183                 return (error);
4184         }
4185
4186         gen = (uint32_t)gen64;
4187
4188         size = (zsb->z_parent != zsb) ? LONG_FID_LEN : SHORT_FID_LEN;
4189         if (fidp->fid_len < size) {
4190                 fidp->fid_len = size;
4191                 ZFS_EXIT(zsb);
4192                 return (ENOSPC);
4193         }
4194
4195         zfid = (zfid_short_t *)fidp;
4196
4197         zfid->zf_len = size;
4198
4199         for (i = 0; i < sizeof (zfid->zf_object); i++)
4200                 zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
4201
4202         /* Must have a non-zero generation number to distinguish from .zfs */
4203         if (gen == 0)
4204                 gen = 1;
4205         for (i = 0; i < sizeof (zfid->zf_gen); i++)
4206                 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
4207
4208         if (size == LONG_FID_LEN) {
4209                 uint64_t        objsetid = dmu_objset_id(zsb->z_os);
4210                 zfid_long_t     *zlfid;
4211
4212                 zlfid = (zfid_long_t *)fidp;
4213
4214                 for (i = 0; i < sizeof (zlfid->zf_setid); i++)
4215                         zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
4216
4217                 /* XXX - this should be the generation number for the objset */
4218                 for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
4219                         zlfid->zf_setgen[i] = 0;
4220         }
4221
4222         ZFS_EXIT(zsb);
4223         return (0);
4224 }
4225 EXPORT_SYMBOL(zfs_fid);
4226
4227 /*ARGSUSED*/
4228 int
4229 zfs_getsecattr(struct inode *ip, vsecattr_t *vsecp, int flag, cred_t *cr)
4230 {
4231         znode_t *zp = ITOZ(ip);
4232         zfs_sb_t *zsb = ITOZSB(ip);
4233         int error;
4234         boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4235
4236         ZFS_ENTER(zsb);
4237         ZFS_VERIFY_ZP(zp);
4238         error = zfs_getacl(zp, vsecp, skipaclchk, cr);
4239         ZFS_EXIT(zsb);
4240
4241         return (error);
4242 }
4243 EXPORT_SYMBOL(zfs_getsecattr);
4244
4245 /*ARGSUSED*/
4246 int
4247 zfs_setsecattr(struct inode *ip, vsecattr_t *vsecp, int flag, cred_t *cr)
4248 {
4249         znode_t *zp = ITOZ(ip);
4250         zfs_sb_t *zsb = ITOZSB(ip);
4251         int error;
4252         boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4253         zilog_t *zilog = zsb->z_log;
4254
4255         ZFS_ENTER(zsb);
4256         ZFS_VERIFY_ZP(zp);
4257
4258         error = zfs_setacl(zp, vsecp, skipaclchk, cr);
4259
4260         if (zsb->z_os->os_sync == ZFS_SYNC_ALWAYS)
4261                 zil_commit(zilog, 0);
4262
4263         ZFS_EXIT(zsb);
4264         return (error);
4265 }
4266 EXPORT_SYMBOL(zfs_setsecattr);
4267
4268 #ifdef HAVE_UIO_ZEROCOPY
/*
 * Tunables; both must be a power of 2.
 *
 * zcr_blksz_min: the smallest read we may consider to loan out an arcbuf
 * zcr_blksz_max: if set to less than the file block size, allow loaning out of
 *              an arcbuf for a partial block read
 *
 * For UIO_READ requests zfs_reqzcbuf() starts from the file's block size,
 * raises it to zcr_blksz_min if it is smaller and then lowers it to
 * zcr_blksz_max if it is larger.  A read request shorter than the
 * resulting size is rejected with EINVAL.
 */
int zcr_blksz_min = (1 << 10);  /* 1K */
int zcr_blksz_max = (1 << 17);  /* 128K */
4278
4279 /*ARGSUSED*/
4280 static int
4281 zfs_reqzcbuf(struct inode *ip, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr)
4282 {
4283         znode_t *zp = ITOZ(ip);
4284         zfs_sb_t *zsb = ITOZSB(ip);
4285         int max_blksz = zsb->z_max_blksz;
4286         uio_t *uio = &xuio->xu_uio;
4287         ssize_t size = uio->uio_resid;
4288         offset_t offset = uio->uio_loffset;
4289         int blksz;
4290         int fullblk, i;
4291         arc_buf_t *abuf;
4292         ssize_t maxsize;
4293         int preamble, postamble;
4294
4295         if (xuio->xu_type != UIOTYPE_ZEROCOPY)
4296                 return (EINVAL);
4297
4298         ZFS_ENTER(zsb);
4299         ZFS_VERIFY_ZP(zp);
4300         switch (ioflag) {
4301         case UIO_WRITE:
4302                 /*
4303                  * Loan out an arc_buf for write if write size is bigger than
4304                  * max_blksz, and the file's block size is also max_blksz.
4305                  */
4306                 blksz = max_blksz;
4307                 if (size < blksz || zp->z_blksz != blksz) {
4308                         ZFS_EXIT(zsb);
4309                         return (EINVAL);
4310                 }
4311                 /*
4312                  * Caller requests buffers for write before knowing where the
4313                  * write offset might be (e.g. NFS TCP write).
4314                  */
4315                 if (offset == -1) {
4316                         preamble = 0;
4317                 } else {
4318                         preamble = P2PHASE(offset, blksz);
4319                         if (preamble) {
4320                                 preamble = blksz - preamble;
4321                                 size -= preamble;
4322                         }
4323                 }
4324
4325                 postamble = P2PHASE(size, blksz);
4326                 size -= postamble;
4327
4328                 fullblk = size / blksz;
4329                 (void) dmu_xuio_init(xuio,
4330                     (preamble != 0) + fullblk + (postamble != 0));
4331
4332                 /*
4333                  * Have to fix iov base/len for partial buffers.  They
4334                  * currently represent full arc_buf's.
4335                  */
4336                 if (preamble) {
4337                         /* data begins in the middle of the arc_buf */
4338                         abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
4339                             blksz);
4340                         ASSERT(abuf);
4341                         (void) dmu_xuio_add(xuio, abuf,
4342                             blksz - preamble, preamble);
4343                 }
4344
4345                 for (i = 0; i < fullblk; i++) {
4346                         abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
4347                             blksz);
4348                         ASSERT(abuf);
4349                         (void) dmu_xuio_add(xuio, abuf, 0, blksz);
4350                 }
4351
4352                 if (postamble) {
4353                         /* data ends in the middle of the arc_buf */
4354                         abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
4355                             blksz);
4356                         ASSERT(abuf);
4357                         (void) dmu_xuio_add(xuio, abuf, 0, postamble);
4358                 }
4359                 break;
4360         case UIO_READ:
4361                 /*
4362                  * Loan out an arc_buf for read if the read size is larger than
4363                  * the current file block size.  Block alignment is not
4364                  * considered.  Partial arc_buf will be loaned out for read.
4365                  */
4366                 blksz = zp->z_blksz;
4367                 if (blksz < zcr_blksz_min)
4368                         blksz = zcr_blksz_min;
4369                 if (blksz > zcr_blksz_max)
4370                         blksz = zcr_blksz_max;
4371                 /* avoid potential complexity of dealing with it */
4372                 if (blksz > max_blksz) {
4373                         ZFS_EXIT(zsb);
4374                         return (EINVAL);
4375                 }
4376
4377                 maxsize = zp->z_size - uio->uio_loffset;
4378                 if (size > maxsize)
4379                         size = maxsize;
4380
4381                 if (size < blksz) {
4382                         ZFS_EXIT(zsb);
4383                         return (EINVAL);
4384                 }
4385                 break;
4386         default:
4387                 ZFS_EXIT(zsb);
4388                 return (EINVAL);
4389         }
4390
4391         uio->uio_extflg = UIO_XUIO;
4392         XUIO_XUZC_RW(xuio) = ioflag;
4393         ZFS_EXIT(zsb);
4394         return (0);
4395 }
4396
4397 /*ARGSUSED*/
4398 static int
4399 zfs_retzcbuf(struct inode *ip, xuio_t *xuio, cred_t *cr)
4400 {
4401         int i;
4402         arc_buf_t *abuf;
4403         int ioflag = XUIO_XUZC_RW(xuio);
4404
4405         ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY);
4406
4407         i = dmu_xuio_cnt(xuio);
4408         while (i-- > 0) {
4409                 abuf = dmu_xuio_arcbuf(xuio, i);
4410                 /*
4411                  * if abuf == NULL, it must be a write buffer
4412                  * that has been returned in zfs_write().
4413                  */
4414                 if (abuf)
4415                         dmu_return_arcbuf(abuf);
4416                 ASSERT(abuf || ioflag == UIO_WRITE);
4417         }
4418
4419         dmu_xuio_fini(xuio);
4420         return (0);
4421 }
4422 #endif /* HAVE_UIO_ZEROCOPY */