module/zfs/zfs_vnops.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2013 by Delphix. All rights reserved.
  24  */
  25
  26 /* Portions Copyright 2007 Jeremy Teo */
  27 /* Portions Copyright 2010 Robert Milkowski */
  28
  29
  30 #include <sys/types.h>
  31 #include <sys/param.h>
  32 #include <sys/time.h>
  33 #include <sys/systm.h>
  34 #include <sys/sysmacros.h>
  35 #include <sys/resource.h>
  36 #include <sys/vfs.h>
  37 #include <sys/vfs_opreg.h>
  38 #include <sys/file.h>
  39 #include <sys/stat.h>
  40 #include <sys/kmem.h>
  41 #include <sys/taskq.h>
  42 #include <sys/uio.h>
  43 #include <sys/vmsystm.h>
  44 #include <sys/atomic.h>
  45 #include <vm/pvn.h>
  46 #include <sys/pathname.h>
  47 #include <sys/cmn_err.h>
  48 #include <sys/errno.h>
  49 #include <sys/unistd.h>
  50 #include <sys/zfs_dir.h>
  51 #include <sys/zfs_acl.h>
  52 #include <sys/zfs_ioctl.h>
  53 #include <sys/fs/zfs.h>
  54 #include <sys/dmu.h>
  55 #include <sys/dmu_objset.h>
  56 #include <sys/spa.h>
  57 #include <sys/txg.h>
  58 #include <sys/dbuf.h>
  59 #include <sys/zap.h>
  60 #include <sys/sa.h>
  61 #include <sys/dirent.h>
  62 #include <sys/policy.h>
  63 #include <sys/sunddi.h>
  64 #include <sys/sid.h>
  65 #include <sys/mode.h>
  66 #include "fs/fs_subr.h"
  67 #include <sys/zfs_ctldir.h>
  68 #include <sys/zfs_fuid.h>
  69 #include <sys/zfs_sa.h>
  70 #include <sys/zfs_vnops.h>
  71 #include <sys/dnlc.h>
  72 #include <sys/zfs_rlock.h>
  73 #include <sys/extdirent.h>
  74 #include <sys/kidmap.h>
  75 #include <sys/cred.h>
  76 #include <sys/attr.h>
  77 #include <sys/zpl.h>
  78
  79 /*
  80  * Programming rules.
  81  *
  82  * Each vnode op performs some logical unit of work.  To do this, the ZPL must
  83  * properly lock its in-core state, create a DMU transaction, do the work,
  84  * record this work in the intent log (ZIL), commit the DMU transaction,
  85  * and wait for the intent log to commit if it is a synchronous operation.
  86  * Moreover, the vnode ops must work in both normal and log replay context.
  87  * The ordering of events is important to avoid deadlocks and references
  88  * to freed memory.  The example below illustrates the following Big Rules:
  89  *
  90  *  (1) A check must be made in each zfs thread for a mounted file system.
  91  *      This is done avoiding races using ZFS_ENTER(zsb).
  92  *      A ZFS_EXIT(zsb) is needed before all returns.  Any znodes
  93  *      must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
  94  *      can return EIO from the calling function.
  95  *
  96  *  (2) iput() should always be the last thing except for zil_commit()
  97  *      (if necessary) and ZFS_EXIT(). This is for 3 reasons:
  98  *      First, if it's the last reference, the vnode/znode
  99  *      can be freed, so the zp may point to freed memory.  Second, the last
 100  *      reference will call zfs_zinactive(), which may induce a lot of work --
 101  *      pushing cached pages (which acquires range locks) and syncing out
 102  *      cached atime changes.  Third, zfs_zinactive() may require a new tx,
 103  *      which could deadlock the system if you were already holding one.
 104  *      If you must call iput() within a tx then use iput_ASYNC().
 105  *
 106  *  (3) All range locks must be grabbed before calling dmu_tx_assign(),
 107  *      as they can span dmu_tx_assign() calls.
 108  *
 109  *  (4) Always pass TXG_NOWAIT as the second argument to dmu_tx_assign().
 110  *      This is critical because we don't want to block while holding locks.
 111  *      Note, in particular, that if a lock is sometimes acquired before
 112  *      the tx assigns, and sometimes after (e.g. z_lock), then failing to
 113  *      use a non-blocking assign can deadlock the system.  The scenario:
 114  *
 115  *      Thread A has grabbed a lock before calling dmu_tx_assign().
 116  *      Thread B is in an already-assigned tx, and blocks for this lock.
 117  *      Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
 118  *      forever, because the previous txg can't quiesce until B's tx commits.
 119  *
 120  *      If dmu_tx_assign() returns ERESTART and zsb->z_assign is TXG_NOWAIT,
 121  *      then drop all locks, call dmu_tx_wait(), and try again.
 122  *
 123  *  (5) If the operation succeeded, generate the intent log entry for it
 124  *      before dropping locks.  This ensures that the ordering of events
 125  *      in the intent log matches the order in which they actually occurred.
 126  *      During ZIL replay the zfs_log_* functions will update the sequence
 127  *      number to indicate the zil transaction has replayed.
 128  *
 129  *  (6) At the end of each vnode op, the DMU tx must always commit,
 130  *      regardless of whether there were any errors.
 131  *
 132  *  (7) After dropping all locks, invoke zil_commit(zilog, foid)
 133  *      to ensure that synchronous semantics are provided when necessary.
 134  *
 135  * In general, this is how things should be ordered in each vnode op:
 136  *
 137  *      ZFS_ENTER(zsb);         // exit if unmounted
 138  * top:
 139  *      zfs_dirent_lock(&dl, ...)       // lock directory entry (may igrab())
 140  *      rw_enter(...);                  // grab any other locks you need
 141  *      tx = dmu_tx_create(...);        // get DMU tx
 142  *      dmu_tx_hold_*();                // hold each object you might modify
 143  *      error = dmu_tx_assign(tx, TXG_NOWAIT);  // try to assign
 144  *      if (error) {
 145  *              rw_exit(...);           // drop locks
 146  *              zfs_dirent_unlock(dl);  // unlock directory entry
 147  *              iput(...);              // release held vnodes
 148  *              if (error == ERESTART) {
 149  *                      dmu_tx_wait(tx);
 150  *                      dmu_tx_abort(tx);
 151  *                      goto top;
 152  *              }
 153  *              dmu_tx_abort(tx);       // abort DMU tx
 154  *              ZFS_EXIT(zsb);  // finished in zfs
 155  *              return (error);         // really out of space
 156  *      }
 157  *      error = do_real_work();         // do whatever this VOP does
 158  *      if (error == 0)
 159  *              zfs_log_*(...);         // on success, make ZIL entry
 160  *      dmu_tx_commit(tx);              // commit DMU tx -- error or not
 161  *      rw_exit(...);                   // drop locks
 162  *      zfs_dirent_unlock(dl);          // unlock directory entry
 163  *      iput(...);                      // release held vnodes
 164  *      zil_commit(zilog, foid);        // synchronous when necessary
 165  *      ZFS_EXIT(zsb);          // finished in zfs
 166  *      return (error);                 // done, report error
 167  */
 168
 169 /*
 170  * Virus scanning is unsupported.  It would be possible to add a hook
 171  * here to performance the required virus scan.  This could be done
 172  * entirely in the kernel or potentially as an update to invoke a
 173  * scanning utility.
 174  */
 175 static int
 176 zfs_vscan(struct inode *ip, cred_t *cr, int async)
 177 {
 178         return (0);
 179 }
 180
 181 /* ARGSUSED */
 182 int
 183 zfs_open(struct inode *ip, int mode, int flag, cred_t *cr)
 184 {
 185         znode_t *zp = ITOZ(ip);
 186         zfs_sb_t *zsb = ITOZSB(ip);
 187
 188         ZFS_ENTER(zsb);
 189         ZFS_VERIFY_ZP(zp);
 190
 191         /* Honor ZFS_APPENDONLY file attribute */
 192         if ((mode & FMODE_WRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
 193             ((flag & O_APPEND) == 0)) {
 194                 ZFS_EXIT(zsb);
 195                 return (SET_ERROR(EPERM));
 196         }
 197
 198         /* Virus scan eligible files on open */
 199         if (!zfs_has_ctldir(zp) && zsb->z_vscan && S_ISREG(ip->i_mode) &&
 200             !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
 201                 if (zfs_vscan(ip, cr, 0) != 0) {
 202                         ZFS_EXIT(zsb);
 203                         return (SET_ERROR(EACCES));
 204                 }
 205         }
 206
 207         /* Keep a count of the synchronous opens in the znode */
 208         if (flag & O_SYNC)
 209                 atomic_inc_32(&zp->z_sync_cnt);
 210
 211         ZFS_EXIT(zsb);
 212         return (0);
 213 }
 214 EXPORT_SYMBOL(zfs_open);
 215
 216 /* ARGSUSED */
 217 int
 218 zfs_close(struct inode *ip, int flag, cred_t *cr)
 219 {
 220         znode_t *zp = ITOZ(ip);
 221         zfs_sb_t *zsb = ITOZSB(ip);
 222
 223         ZFS_ENTER(zsb);
 224         ZFS_VERIFY_ZP(zp);
 225
 226         /*
 227          * Zero the synchronous opens in the znode.  Under Linux the
 228          * zfs_close() hook is not symmetric with zfs_open(), it is
 229          * only called once when the last reference is dropped.
 230          */
 231         if (flag & O_SYNC)
 232                 zp->z_sync_cnt = 0;
 233
 234         if (!zfs_has_ctldir(zp) && zsb->z_vscan && S_ISREG(ip->i_mode) &&
 235             !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
 236                 VERIFY(zfs_vscan(ip, cr, 1) == 0);
 237
 238         ZFS_EXIT(zsb);
 239         return (0);
 240 }
 241 EXPORT_SYMBOL(zfs_close);
 242
 243 #if defined(SEEK_HOLE) && defined(SEEK_DATA)
 244 /*
 245  * Lseek support for finding holes (cmd == SEEK_HOLE) and
 246  * data (cmd == SEEK_DATA). "off" is an in/out parameter.
 247  */
 248 static int
 249 zfs_holey_common(struct inode *ip, int cmd, loff_t *off)
 250 {
 251         znode_t *zp = ITOZ(ip);
 252         uint64_t noff = (uint64_t)*off; /* new offset */
 253         uint64_t file_sz;
 254         int error;
 255         boolean_t hole;
 256
 257         file_sz = zp->z_size;
 258         if (noff >= file_sz)  {
 259                 return (SET_ERROR(ENXIO));
 260         }
 261
 262         if (cmd == SEEK_HOLE)
 263                 hole = B_TRUE;
 264         else
 265                 hole = B_FALSE;
 266
 267         error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff);
 268
 269         /* end of file? */
 270         if ((error == ESRCH) || (noff > file_sz)) {
 271                 /*
 272                  * Handle the virtual hole at the end of file.
 273                  */
 274                 if (hole) {
 275                         *off = file_sz;
 276                         return (0);
 277                 }
 278                 return (SET_ERROR(ENXIO));
 279         }
 280
 281         if (noff < *off)
 282                 return (error);
 283         *off = noff;
 284         return (error);
 285 }
 286
 287 int
 288 zfs_holey(struct inode *ip, int cmd, loff_t *off)
 289 {
 290         znode_t *zp = ITOZ(ip);
 291         zfs_sb_t *zsb = ITOZSB(ip);
 292         int error;
 293
 294         ZFS_ENTER(zsb);
 295         ZFS_VERIFY_ZP(zp);
 296
 297         error = zfs_holey_common(ip, cmd, off);
 298
 299         ZFS_EXIT(zsb);
 300         return (error);
 301 }
 302 EXPORT_SYMBOL(zfs_holey);
 303 #endif /* SEEK_HOLE && SEEK_DATA */
 304
 305 #if defined(_KERNEL)
 306 /*
 307  * When a file is memory mapped, we must keep the IO data synchronized
 308  * between the DMU cache and the memory mapped pages.  What this means:
 309  *
 310  * On Write:    If we find a memory mapped page, we write to *both*
 311  *              the page and the dmu buffer.
 312  */
 313 static void
 314 update_pages(struct inode *ip, int64_t start, int len,
 315     objset_t *os, uint64_t oid)
 316 {
 317         struct address_space *mp = ip->i_mapping;
 318         struct page *pp;
 319         uint64_t nbytes;
 320         int64_t off;
 321         void *pb;
 322
 323         off = start & (PAGE_CACHE_SIZE-1);
 324         for (start &= PAGE_CACHE_MASK; len > 0; start += PAGE_CACHE_SIZE) {
 325                 nbytes = MIN(PAGE_CACHE_SIZE - off, len);
 326
 327                 pp = find_lock_page(mp, start >> PAGE_CACHE_SHIFT);
 328                 if (pp) {
 329                         if (mapping_writably_mapped(mp))
 330                                 flush_dcache_page(pp);
 331
 332                         pb = kmap(pp);
 333                         (void) dmu_read(os, oid, start+off, nbytes, pb+off,
 334                             DMU_READ_PREFETCH);
 335                         kunmap(pp);
 336
 337                         if (mapping_writably_mapped(mp))
 338                                 flush_dcache_page(pp);
 339
 340                         mark_page_accessed(pp);
 341                         SetPageUptodate(pp);
 342                         ClearPageError(pp);
 343                         unlock_page(pp);
 344                         page_cache_release(pp);
 345                 }
 346
 347                 len -= nbytes;
 348                 off = 0;
 349         }
 350 }
 351
 352 /*
 353  * When a file is memory mapped, we must keep the IO data synchronized
 354  * between the DMU cache and the memory mapped pages.  What this means:
 355  *
 356  * On Read:     We "read" preferentially from memory mapped pages,
 357  *              else we default from the dmu buffer.
 358  *
 359  * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
 360  *       the file is memory mapped.
 361  */
 362 static int
 363 mappedread(struct inode *ip, int nbytes, uio_t *uio)
 364 {
 365         struct address_space *mp = ip->i_mapping;
 366         struct page *pp;
 367         znode_t *zp = ITOZ(ip);
 368         objset_t *os = ITOZSB(ip)->z_os;
 369         int64_t start, off;
 370         uint64_t bytes;
 371         int len = nbytes;
 372         int error = 0;
 373         void *pb;
 374
 375         start = uio->uio_loffset;
 376         off = start & (PAGE_CACHE_SIZE-1);
 377         for (start &= PAGE_CACHE_MASK; len > 0; start += PAGE_CACHE_SIZE) {
 378                 bytes = MIN(PAGE_CACHE_SIZE - off, len);
 379
 380                 pp = find_lock_page(mp, start >> PAGE_CACHE_SHIFT);
 381                 if (pp) {
 382                         ASSERT(PageUptodate(pp));
 383
 384                         pb = kmap(pp);
 385                         error = uiomove(pb + off, bytes, UIO_READ, uio);
 386                         kunmap(pp);
 387
 388                         if (mapping_writably_mapped(mp))
 389                                 flush_dcache_page(pp);
 390
 391                         mark_page_accessed(pp);
 392                         unlock_page(pp);
 393                         page_cache_release(pp);
 394                 } else {
 395                         error = dmu_read_uio(os, zp->z_id, uio, bytes);
 396                 }
 397
 398                 len -= bytes;
 399                 off = 0;
 400                 if (error)
 401                         break;
 402         }
 403         return (error);
 404 }
 405 #endif /* _KERNEL */
 406
 407 unsigned long zfs_read_chunk_size = 1024 * 1024; /* Tunable */
 408
 409 /*
 410  * Read bytes from specified file into supplied buffer.
 411  *
 412  *      IN:     ip      - inode of file to be read from.
 413  *              uio     - structure supplying read location, range info,
 414  *                        and return buffer.
 415  *              ioflag  - FSYNC flags; used to provide FRSYNC semantics.
 416  *                        O_DIRECT flag; used to bypass page cache.
 417  *              cr      - credentials of caller.
 418  *
 419  *      OUT:    uio     - updated offset and range, buffer filled.
 420  *
 421  *      RETURN: 0 on success, error code on failure.
 422  *
 423  * Side Effects:
 424  *      inode - atime updated if byte count > 0
 425  */
 426 /* ARGSUSED */
 427 int
 428 zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 429 {
 430         znode_t         *zp = ITOZ(ip);
 431         zfs_sb_t        *zsb = ITOZSB(ip);
 432         objset_t        *os;
 433         ssize_t         n, nbytes;
 434         int             error = 0;
 435         rl_t            *rl;
 436 #ifdef HAVE_UIO_ZEROCOPY
 437         xuio_t          *xuio = NULL;
 438 #endif /* HAVE_UIO_ZEROCOPY */
 439
 440         ZFS_ENTER(zsb);
 441         ZFS_VERIFY_ZP(zp);
 442         os = zsb->z_os;
 443
 444         if (zp->z_pflags & ZFS_AV_QUARANTINED) {
 445                 ZFS_EXIT(zsb);
 446                 return (SET_ERROR(EACCES));
 447         }
 448
 449         /*
 450          * Validate file offset
 451          */
 452         if (uio->uio_loffset < (offset_t)0) {
 453                 ZFS_EXIT(zsb);
 454                 return (SET_ERROR(EINVAL));
 455         }
 456
 457         /*
 458          * Fasttrack empty reads
 459          */
 460         if (uio->uio_resid == 0) {
 461                 ZFS_EXIT(zsb);
 462                 return (0);
 463         }
 464
 465         /*
 466          * Check for mandatory locks
 467          */
 468         if (mandatory_lock(ip) &&
 469             !lock_may_read(ip, uio->uio_loffset, uio->uio_resid)) {
 470                 ZFS_EXIT(zsb);
 471                 return (SET_ERROR(EAGAIN));
 472         }
 473
 474         /*
 475          * If we're in FRSYNC mode, sync out this znode before reading it.
 476          */
 477         if (ioflag & FRSYNC || zsb->z_os->os_sync == ZFS_SYNC_ALWAYS)
 478                 zil_commit(zsb->z_log, zp->z_id);
 479
 480         /*
 481          * Lock the range against changes.
 482          */
 483         rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
 484
 485         /*
 486          * If we are reading past end-of-file we can skip
 487          * to the end; but we might still need to set atime.
 488          */
 489         if (uio->uio_loffset >= zp->z_size) {
 490                 error = 0;
 491                 goto out;
 492         }
 493
 494         ASSERT(uio->uio_loffset < zp->z_size);
 495         n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
 496
 497 #ifdef HAVE_UIO_ZEROCOPY
 498         if ((uio->uio_extflg == UIO_XUIO) &&
 499             (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
 500                 int nblk;
 501                 int blksz = zp->z_blksz;
 502                 uint64_t offset = uio->uio_loffset;
 503
 504                 xuio = (xuio_t *)uio;
 505                 if ((ISP2(blksz))) {
 506                         nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
 507                             blksz)) / blksz;
 508                 } else {
 509                         ASSERT(offset + n <= blksz);
 510                         nblk = 1;
 511                 }
 512                 (void) dmu_xuio_init(xuio, nblk);
 513
 514                 if (vn_has_cached_data(ip)) {
 515                         /*
 516                          * For simplicity, we always allocate a full buffer
 517                          * even if we only expect to read a portion of a block.
 518                          */
 519                         while (--nblk >= 0) {
 520                                 (void) dmu_xuio_add(xuio,
 521                                     dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
 522                                     blksz), 0, blksz);
 523                         }
 524                 }
 525         }
 526 #endif /* HAVE_UIO_ZEROCOPY */
 527
 528         while (n > 0) {
 529                 nbytes = MIN(n, zfs_read_chunk_size -
 530                     P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
 531
 532                 if (zp->z_is_mapped && !(ioflag & O_DIRECT))
 533                         error = mappedread(ip, nbytes, uio);
 534                 else
 535                         error = dmu_read_uio(os, zp->z_id, uio, nbytes);
 536
 537                 if (error) {
 538                         /* convert checksum errors into IO errors */
 539                         if (error == ECKSUM)
 540                                 error = SET_ERROR(EIO);
 541                         break;
 542                 }
 543
 544                 n -= nbytes;
 545         }
 546 out:
 547         zfs_range_unlock(rl);
 548
 549         ZFS_ACCESSTIME_STAMP(zsb, zp);
 550         zfs_inode_update(zp);
 551         ZFS_EXIT(zsb);
 552         return (error);
 553 }
 554 EXPORT_SYMBOL(zfs_read);
 555
 556 /*
 557  * Write the bytes to a file.
 558  *
 559  *      IN:     ip      - inode of file to be written to.
 560  *              uio     - structure supplying write location, range info,
 561  *                        and data buffer.
 562  *              ioflag  - FAPPEND flag set if in append mode.
 563  *                        O_DIRECT flag; used to bypass page cache.
 564  *              cr      - credentials of caller.
 565  *
 566  *      OUT:    uio     - updated offset and range.
 567  *
 568  *      RETURN: 0 if success
 569  *              error code if failure
 570  *
 571  * Timestamps:
 572  *      ip - ctime|mtime updated if byte count > 0
 573  */
 574
 575 /* ARGSUSED */
 576 int
 577 zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 578 {
 579         znode_t         *zp = ITOZ(ip);
 580         rlim64_t        limit = uio->uio_limit;
 581         ssize_t         start_resid = uio->uio_resid;
 582         ssize_t         tx_bytes;
 583         uint64_t        end_size;
 584         dmu_tx_t        *tx;
 585         zfs_sb_t        *zsb = ZTOZSB(zp);
 586         zilog_t         *zilog;
 587         offset_t        woff;
 588         ssize_t         n, nbytes;
 589         rl_t            *rl;
 590         int             max_blksz = zsb->z_max_blksz;
 591         int             error = 0;
 592         arc_buf_t       *abuf;
 593         iovec_t         *aiov = NULL;
 594         xuio_t          *xuio = NULL;
 595         int             i_iov = 0;
 596         iovec_t         *iovp = uio->uio_iov;
 597         int             write_eof;
 598         int             count = 0;
 599         sa_bulk_attr_t  bulk[4];
 600         uint64_t        mtime[2], ctime[2];
 601         ASSERTV(int     iovcnt = uio->uio_iovcnt);
 602
 603         /*
 604          * Fasttrack empty write
 605          */
 606         n = start_resid;
 607         if (n == 0)
 608                 return (0);
 609
 610         if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
 611                 limit = MAXOFFSET_T;
 612
 613         ZFS_ENTER(zsb);
 614         ZFS_VERIFY_ZP(zp);
 615
 616         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zsb), NULL, &mtime, 16);
 617         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zsb), NULL, &ctime, 16);
 618         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zsb), NULL, &zp->z_size, 8);
 619         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zsb), NULL,
 620             &zp->z_pflags, 8);
 621
 622         /*
 623          * If immutable or not appending then return EPERM
 624          */
 625         if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
 626             ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
 627             (uio->uio_loffset < zp->z_size))) {
 628                 ZFS_EXIT(zsb);
 629                 return (SET_ERROR(EPERM));
 630         }
 631
 632         zilog = zsb->z_log;
 633
 634         /*
 635          * Validate file offset
 636          */
 637         woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
 638         if (woff < 0) {
 639                 ZFS_EXIT(zsb);
 640                 return (SET_ERROR(EINVAL));
 641         }
 642
 643         /*
 644          * Check for mandatory locks before calling zfs_range_lock()
 645          * in order to prevent a deadlock with locks set via fcntl().
 646          */
 647         if (mandatory_lock(ip) && !lock_may_write(ip, woff, n)) {
 648                 ZFS_EXIT(zsb);
 649                 return (SET_ERROR(EAGAIN));
 650         }
 651
 652         /*
 653          * Pre-fault the pages to ensure slow (eg NFS) pages
 654          * don't hold up txg.
 655          * Skip this if uio contains loaned arc_buf.
 656          */
 657 #ifdef HAVE_UIO_ZEROCOPY
 658         if ((uio->uio_extflg == UIO_XUIO) &&
 659             (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
 660                 xuio = (xuio_t *)uio;
 661         else
 662 #endif
 663                 uio_prefaultpages(MIN(n, max_blksz), uio);
 664
 665         /*
 666          * If in append mode, set the io offset pointer to eof.
 667          */
 668         if (ioflag & FAPPEND) {
 669                 /*
 670                  * Obtain an appending range lock to guarantee file append
 671                  * semantics.  We reset the write offset once we have the lock.
 672                  */
 673                 rl = zfs_range_lock(zp, 0, n, RL_APPEND);
 674                 woff = rl->r_off;
 675                 if (rl->r_len == UINT64_MAX) {
 676                         /*
 677                          * We overlocked the file because this write will cause
 678                          * the file block size to increase.
 679                          * Note that zp_size cannot change with this lock held.
 680                          */
 681                         woff = zp->z_size;
 682                 }
 683                 uio->uio_loffset = woff;
 684         } else {
 685                 /*
 686                  * Note that if the file block size will change as a result of
 687                  * this write, then this range lock will lock the entire file
 688                  * so that we can re-write the block safely.
 689                  */
 690                 rl = zfs_range_lock(zp, woff, n, RL_WRITER);
 691         }
 692
 693         if (woff >= limit) {
 694                 zfs_range_unlock(rl);
 695                 ZFS_EXIT(zsb);
 696                 return (SET_ERROR(EFBIG));
 697         }
 698
 699         if ((woff + n) > limit || woff > (limit - n))
 700                 n = limit - woff;
 701
 702         /* Will this write extend the file length? */
 703         write_eof = (woff + n > zp->z_size);
 704
 705         end_size = MAX(zp->z_size, woff + n);
 706
 707         /*
 708          * Write the file in reasonable size chunks.  Each chunk is written
 709          * in a separate transaction; this keeps the intent log records small
 710          * and allows us to do more fine-grained space accounting.
 711          */
 712         while (n > 0) {
 713                 abuf = NULL;
 714                 woff = uio->uio_loffset;
 715 again:
 716                 if (zfs_owner_overquota(zsb, zp, B_FALSE) ||
 717                     zfs_owner_overquota(zsb, zp, B_TRUE)) {
 718                         if (abuf != NULL)
 719                                 dmu_return_arcbuf(abuf);
 720                         error = SET_ERROR(EDQUOT);
 721                         break;
 722                 }
 723
 724                 if (xuio && abuf == NULL) {
 725                         ASSERT(i_iov < iovcnt);
 726                         aiov = &iovp[i_iov];
 727                         abuf = dmu_xuio_arcbuf(xuio, i_iov);
 728                         dmu_xuio_clear(xuio, i_iov);
 729                         ASSERT((aiov->iov_base == abuf->b_data) ||
 730                             ((char *)aiov->iov_base - (char *)abuf->b_data +
 731                             aiov->iov_len == arc_buf_size(abuf)));
 732                         i_iov++;
 733                 } else if (abuf == NULL && n >= max_blksz &&
 734                     woff >= zp->z_size &&
 735                     P2PHASE(woff, max_blksz) == 0 &&
 736                     zp->z_blksz == max_blksz) {
 737                         /*
 738                          * This write covers a full block.  "Borrow" a buffer
 739                          * from the dmu so that we can fill it before we enter
 740                          * a transaction.  This avoids the possibility of
 741                          * holding up the transaction if the data copy hangs
 742                          * up on a pagefault (e.g., from an NFS server mapping).
 743                          */
 744                         size_t cbytes;
 745
 746                         abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
 747                             max_blksz);
 748                         ASSERT(abuf != NULL);
 749                         ASSERT(arc_buf_size(abuf) == max_blksz);
 750                         if ((error = uiocopy(abuf->b_data, max_blksz,
 751                             UIO_WRITE, uio, &cbytes))) {
 752                                 dmu_return_arcbuf(abuf);
 753                                 break;
 754                         }
 755                         ASSERT(cbytes == max_blksz);
 756                 }
 757
 758                 /*
 759                  * Start a transaction.
 760                  */
 761                 tx = dmu_tx_create(zsb->z_os);
 762                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 763                 dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
 764                 zfs_sa_upgrade_txholds(tx, zp);
 765                 error = dmu_tx_assign(tx, TXG_NOWAIT);
 766                 if (error) {
 767                         if (error == ERESTART) {
 768                                 dmu_tx_wait(tx);
 769                                 dmu_tx_abort(tx);
 770                                 goto again;
 771                         }
 772                         dmu_tx_abort(tx);
 773                         if (abuf != NULL)
 774                                 dmu_return_arcbuf(abuf);
 775                         break;
 776                 }
 777
 778                 /*
 779                  * If zfs_range_lock() over-locked we grow the blocksize
 780                  * and then reduce the lock range.  This will only happen
 781                  * on the first iteration since zfs_range_reduce() will
 782                  * shrink down r_len to the appropriate size.
 783                  */
 784                 if (rl->r_len == UINT64_MAX) {
 785                         uint64_t new_blksz;
 786
 787                         if (zp->z_blksz > max_blksz) {
 788                                 ASSERT(!ISP2(zp->z_blksz));
 789                                 new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
 790                         } else {
 791                                 new_blksz = MIN(end_size, max_blksz);
 792                         }
 793                         zfs_grow_blocksize(zp, new_blksz, tx);
 794                         zfs_range_reduce(rl, woff, n);
 795                 }
 796
 797                 /*
 798                  * XXX - should we really limit each write to z_max_blksz?
 799                  * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
 800                  */
 801                 nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
 802
 803                 if (abuf == NULL) {
 804                         tx_bytes = uio->uio_resid;
 805                         error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
 806                             uio, nbytes, tx);
 807                         tx_bytes -= uio->uio_resid;
 808                 } else {
 809                         tx_bytes = nbytes;
 810                         ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
 811                         /*
 812                          * If this is not a full block write, but we are
 813                          * extending the file past EOF and this data starts
 814                          * block-aligned, use assign_arcbuf().  Otherwise,
 815                          * write via dmu_write().
 816                          */
 817                         if (tx_bytes < max_blksz && (!write_eof ||
 818                             aiov->iov_base != abuf->b_data)) {
 819                                 ASSERT(xuio);
 820                                 dmu_write(zsb->z_os, zp->z_id, woff,
 821                                     aiov->iov_len, aiov->iov_base, tx);
 822                                 dmu_return_arcbuf(abuf);
 823                                 xuio_stat_wbuf_copied();
 824                         } else {
 825                                 ASSERT(xuio || tx_bytes == max_blksz);
 826                                 dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
 827                                     woff, abuf, tx);
 828                         }
 829                         ASSERT(tx_bytes <= uio->uio_resid);
 830                         uioskip(uio, tx_bytes);
 831                 }
 832
 833                 if (tx_bytes && zp->z_is_mapped && !(ioflag & O_DIRECT))
 834                         update_pages(ip, woff, tx_bytes, zsb->z_os, zp->z_id);
 835
 836                 /*
 837                  * If we made no progress, we're done.  If we made even
 838                  * partial progress, update the znode and ZIL accordingly.
 839                  */
 840                 if (tx_bytes == 0) {
 841                         (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zsb),
 842                             (void *)&zp->z_size, sizeof (uint64_t), tx);
 843                         dmu_tx_commit(tx);
 844                         ASSERT(error != 0);
 845                         break;
 846                 }
 847
 848                 /*
 849                  * Clear Set-UID/Set-GID bits on successful write if not
 850                  * privileged and at least one of the execute bits is set.
 851                  *
 852                  * It would be nice to do this after all writes have
 853                  * been done, but that would still expose the ISUID/ISGID
 854                  * to another app after the partial write is committed.
 855                  *
 856                  * Note: we don't call zfs_fuid_map_id() here because
 857                  * user 0 is not an ephemeral uid.
 858                  */
 859                 mutex_enter(&zp->z_acl_lock);
 860                 if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
 861                     (S_IXUSR >> 6))) != 0 &&
 862                     (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
 863                     secpolicy_vnode_setid_retain(cr,
 864                     (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
 865                         uint64_t newmode;
 866                         zp->z_mode &= ~(S_ISUID | S_ISGID);
 867                         newmode = zp->z_mode;
 868                         (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zsb),
 869                             (void *)&newmode, sizeof (uint64_t), tx);
 870                 }
 871                 mutex_exit(&zp->z_acl_lock);
 872
 873                 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
 874                     B_TRUE);
 875
 876                 /*
 877                  * Update the file size (zp_size) if it has changed;
 878                  * account for possible concurrent updates.
 879                  */
 880                 while ((end_size = zp->z_size) < uio->uio_loffset) {
 881                         (void) atomic_cas_64(&zp->z_size, end_size,
 882                             uio->uio_loffset);
 883                         ASSERT(error == 0);
 884                 }
 885                 /*
 886                  * If we are replaying and eof is non zero then force
 887                  * the file size to the specified eof. Note, there's no
 888                  * concurrency during replay.
 889                  */
 890                 if (zsb->z_replay && zsb->z_replay_eof != 0)
 891                         zp->z_size = zsb->z_replay_eof;
 892
 893                 error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 894
 895                 zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
 896                 dmu_tx_commit(tx);
 897
 898                 if (error != 0)
 899                         break;
 900                 ASSERT(tx_bytes == nbytes);
 901                 n -= nbytes;
 902
 903                 if (!xuio && n > 0)
 904                         uio_prefaultpages(MIN(n, max_blksz), uio);
 905         }
 906
 907         zfs_range_unlock(rl);
 908
 909         /*
 910          * If we're in replay mode, or we made no progress, return error.
 911          * Otherwise, it's at least a partial write, so it's successful.
 912          */
 913         if (zsb->z_replay || uio->uio_resid == start_resid) {
 914                 ZFS_EXIT(zsb);
 915                 return (error);
 916         }
 917
 918         if (ioflag & (FSYNC | FDSYNC) ||
 919             zsb->z_os->os_sync == ZFS_SYNC_ALWAYS)
 920                 zil_commit(zilog, zp->z_id);
 921
 922         zfs_inode_update(zp);
 923         ZFS_EXIT(zsb);
 924         return (0);
 925 }
 926 EXPORT_SYMBOL(zfs_write);
 927
 928 static void
 929 iput_async(struct inode *ip, taskq_t *taskq)
 930 {
 931         ASSERT(atomic_read(&ip->i_count) > 0);
 932         if (atomic_read(&ip->i_count) == 1)
 933                 taskq_dispatch(taskq, (task_func_t *)iput, ip, TQ_PUSHPAGE);
 934         else
 935                 iput(ip);
 936 }
 937
 938 void
 939 zfs_get_done(zgd_t *zgd, int error)
 940 {
 941         znode_t *zp = zgd->zgd_private;
 942         objset_t *os = ZTOZSB(zp)->z_os;
 943
 944         if (zgd->zgd_db)
 945                 dmu_buf_rele(zgd->zgd_db, zgd);
 946
 947         zfs_range_unlock(zgd->zgd_rl);
 948
 949         /*
 950          * Release the vnode asynchronously as we currently have the
 951          * txg stopped from syncing.
 952          */
 953         iput_async(ZTOI(zp), dsl_pool_iput_taskq(dmu_objset_pool(os)));
 954
 955         if (error == 0 && zgd->zgd_bp)
 956                 zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
 957
 958         kmem_free(zgd, sizeof (zgd_t));
 959 }
 960
 961 #ifdef DEBUG
     /*
      * Debug-only, one-shot fault injection switch.  When non-zero, the next
      * indirect write processed by zfs_get_data() fails with EIO and the
      * switch is reset to zero.
      */
 962 static int zil_fault_io = 0;
 963 #endif
 964
 965 /*
 966  * Get data to generate a TX_WRITE intent log record.
      *
      *      IN:     arg     - zfs_sb_t of the file system that owns the record.
      *              lr      - write record; supplies the object id, offset and
      *                        length, and receives the block pointer when the
      *                        write is indirect.
      *              buf     - buffer to fill for an immediate write, or NULL
      *                        to request an indirect write.
      *              zio     - zio handed to dmu_sync() for indirect writes.
      *
      *      RETURN: 0 on success, error code on failure.  ENOENT is
      *              returned when the object cannot be looked up, has been
      *              unlinked, or the record starts at or beyond z_size.
      *
      * When the indirect path's dmu_sync() call returns 0, the zgd (and the
      * range lock, dbuf hold and inode hold it tracks) is released later by
      * the zfs_get_done() callback.  On every other path that allocated a
      * zgd, zfs_get_done() is called here before returning.
 967  */
 968 int
 969 zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
 970 {
 971         zfs_sb_t *zsb = arg;
 972         objset_t *os = zsb->z_os;
 973         znode_t *zp;
 974         uint64_t object = lr->lr_foid;
 975         uint64_t offset = lr->lr_offset;
 976         uint64_t size = lr->lr_length;
 977         blkptr_t *bp = &lr->lr_blkptr;
 978         dmu_buf_t *db;
 979         zgd_t *zgd;
 980         int error = 0;
 981
 982         ASSERT(zio != NULL);
 983         ASSERT(size != 0);
 984
 985         /*
 986          * Nothing to do if the file has been removed
 987          */
 988         if (zfs_zget(zsb, object, &zp) != 0)
 989                 return (SET_ERROR(ENOENT));
 990         if (zp->z_unlinked) {
 991                 /*
 992                  * Release the vnode asynchronously as we currently have the
 993                  * txg stopped from syncing.
 994                  */
 995                 iput_async(ZTOI(zp), dsl_pool_iput_taskq(dmu_objset_pool(os)));
 996                 return (SET_ERROR(ENOENT));
 997         }
 998
             /*
              * From here on all cleanup goes through zfs_get_done(), which
              * relies on the zgd being zero-filled so that unset zgd_db and
              * zgd_bp fields read as NULL.
              */
 999         zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_PUSHPAGE);
1000         zgd->zgd_zilog = zsb->z_log;
1001         zgd->zgd_private = zp;
1002
1003         /*
1004          * Write records come in two flavors: immediate and indirect.
1005          * For small writes it's cheaper to store the data with the
1006          * log record (immediate); for large writes it's cheaper to
1007          * sync the data and get a pointer to it (indirect) so that
1008          * we don't have to write the data twice.
1009          */
1010         if (buf != NULL) { /* immediate write */
1011                 zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
1012                 /* test for truncation needs to be done while range locked */
1013                 if (offset >= zp->z_size) {
1014                         error = SET_ERROR(ENOENT);
1015                 } else {
1016                         error = dmu_read(os, object, offset, size, buf,
1017                             DMU_READ_NO_PREFETCH);
1018                 }
1019                 ASSERT(error == 0 || error == ENOENT);
1020         } else { /* indirect write */
1021                 /*
1022                  * Have to lock the whole block to ensure when it's
1023                  * written out and its checksum is being calculated
1024                  * that no one can change the data. We need to re-check
1025                  * blocksize after we get the lock in case it's changed!
1026                  */
1027                 for (;;) {
1028                         uint64_t blkoff;
1029                         size = zp->z_blksz;
                             /*
                              * Round offset down to the start of its block.
                              * When the block size is not a power of two the
                              * whole offset is subtracted, so the lock starts
                              * at offset 0.
                              */
1030                         blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
1031                         offset -= blkoff;
1032                         zgd->zgd_rl = zfs_range_lock(zp, offset, size,
1033                             RL_READER);
1034                         if (zp->z_blksz == size)
1035                                 break;
                             /*
                              * The block size changed before the lock was
                              * granted: restore the original offset, drop the
                              * lock and recompute with the new size.
                              */
1036                         offset += blkoff;
1037                         zfs_range_unlock(zgd->zgd_rl);
1038                 }
1039                 /* test for truncation needs to be done while range locked */
                     /*
                      * The record's own offset is used here because 'offset'
                      * has been rounded down to the block start above.
                      */
1040                 if (lr->lr_offset >= zp->z_size)
1041                         error = SET_ERROR(ENOENT);
1042 #ifdef DEBUG
                     /* One-shot injected failure; see zil_fault_io. */
1043                 if (zil_fault_io) {
1044                         error = SET_ERROR(EIO);
1045                         zil_fault_io = 0;
1046                 }
1047 #endif
1048                 if (error == 0)
1049                         error = dmu_buf_hold(os, object, offset, zgd, &db,
1050                             DMU_READ_NO_PREFETCH);
1051
1052                 if (error == 0) {
                             /*
                              * If the dbuf already has a block pointer, copy
                              * it into the log record's (still empty) one.
                              */
1053                         blkptr_t *obp = dmu_buf_get_blkptr(db);
1054                         if (obp) {
1055                                 ASSERT(BP_IS_HOLE(bp));
1056                                 *bp = *obp;
1057                         }
1058
1059                         zgd->zgd_db = db;
1060                         zgd->zgd_bp = bp;
1061
1062                         ASSERT(db->db_offset == offset);
1063                         ASSERT(db->db_size == size);
1064
1065                         error = dmu_sync(zio, lr->lr_common.lrc_txg,
1066                             zfs_get_done, zgd);
1067                         ASSERT(error || lr->lr_length <= zp->z_blksz);
1068
1069                         /*
1070                          * On success, we need to wait for the write I/O
1071                          * initiated by dmu_sync() to complete before we can
1072                          * release this dbuf.  We will finish everything up
1073                          * in the zfs_get_done() callback.
1074                          */
1075                         if (error == 0)
1076                                 return (0);
1077
                             /*
                              * EALREADY from dmu_sync() is not a failure: the
                              * record is retyped as TX_WRITE2 and success is
                              * reported.  (Presumably the block has already
                              * been written -- confirm against dmu_sync().)
                              */
1078                         if (error == EALREADY) {
1079                                 lr->lr_common.lrc_txtype = TX_WRITE2;
1080                                 error = 0;
1081                         }
1082                 }
1083         }
1084
             /*
              * Reached by the immediate path and by every indirect path on
              * which dmu_sync() did not return 0: release everything now.
              */
1085         zfs_get_done(zgd, error);
1086
1087         return (error);
1088 }
1089
1090 /*ARGSUSED*/
1091 int
1092 zfs_access(struct inode *ip, int mode, int flag, cred_t *cr)
1093 {
1094         znode_t *zp = ITOZ(ip);
1095         zfs_sb_t *zsb = ITOZSB(ip);
1096         int error;
1097
1098         ZFS_ENTER(zsb);
1099         ZFS_VERIFY_ZP(zp);
1100
1101         if (flag & V_ACE_MASK)
1102                 error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
1103         else
1104                 error = zfs_zaccess_rwx(zp, mode, flag, cr);
1105
1106         ZFS_EXIT(zsb);
1107         return (error);
1108 }
1109 EXPORT_SYMBOL(zfs_access);
1110
1111 /*
1112  * Lookup an entry in a directory, or an extended attribute directory.
1113  * If it exists, return a held inode reference for it.
1114  *
1115  *      IN:     dip     - inode of directory to search.
1116  *              nm      - name of entry to lookup.
1117  *              flags   - LOOKUP_XATTR set if looking for an attribute.
1118  *              cr      - credentials of caller.
1119  *              direntflags - directory lookup flags
1120  *              realpnp - returned pathname.
1121  *
1122  *      OUT:    ipp     - inode of located entry, NULL if not found.
1123  *
1124  *      RETURN: 0 on success, error code on failure.
1125  *
1126  * Timestamps:
1127  *      NA
      *
      * A fast path is taken when neither LOOKUP_XATTR nor FIGNORECASE is
      * set in flags; it runs before ZFS_ENTER() and handles the empty name
      * and "." by returning a new hold on dip itself.
      *
      * NOTE(review): the fast path's error returns (ENOTDIR, EIO, failed
      * execute check) happen before *ipp is cleared, so *ipp is left
      * untouched on those paths rather than being set to NULL.
1128  */
1129 /* ARGSUSED */
1130 int
1131 zfs_lookup(struct inode *dip, char *nm, struct inode **ipp, int flags,
1132     cred_t *cr, int *direntflags, pathname_t *realpnp)
1133 {
1134         znode_t *zdp = ITOZ(dip);
1135         zfs_sb_t *zsb = ITOZSB(dip);
1136         int error = 0;
1137
1138         /* fast path */
1139         if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
1140
1141                 if (!S_ISDIR(dip->i_mode)) {
1142                         return (SET_ERROR(ENOTDIR));
1143                 } else if (zdp->z_sa_hdl == NULL) {
1144                         return (SET_ERROR(EIO));
1145                 }
1146
                     /* Empty name or "." refers to the directory itself. */
1147                 if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
1148                         error = zfs_fastaccesschk_execute(zdp, cr);
1149                         if (!error) {
1150                                 *ipp = dip;
1151                                 igrab(*ipp);
1152                                 return (0);
1153                         }
1154                         return (error);
1155 #ifdef HAVE_DNLC
                     /*
                      * NOTE(review): this conditional block refers to dvp and
                      * vpp, which are not declared in this function; unless
                      * they are macros defined elsewhere it would not build
                      * if HAVE_DNLC were defined.
                      */
1156                 } else {
1157                         vnode_t *tvp = dnlc_lookup(dvp, nm);
1158
1159                         if (tvp) {
1160                                 error = zfs_fastaccesschk_execute(zdp, cr);
1161                                 if (error) {
1162                                         iput(tvp);
1163                                         return (error);
1164                                 }
1165                                 if (tvp == DNLC_NO_VNODE) {
1166                                         iput(tvp);
1167                                         return (SET_ERROR(ENOENT));
1168                                 } else {
1169                                         *vpp = tvp;
1170                                         return (specvp_check(vpp, cr));
1171                                 }
1172                         }
1173 #endif /* HAVE_DNLC */
1174                 }
1175         }
1176
             /* Slow path: every return below must be paired with ZFS_EXIT(). */
1177         ZFS_ENTER(zsb);
1178         ZFS_VERIFY_ZP(zdp);
1179
1180         *ipp = NULL;
1181
1182         if (flags & LOOKUP_XATTR) {
1183                 /*
1184                  * We don't allow recursive attributes..
1185                  * Maybe someday we will.
1186                  */
1187                 if (zdp->z_pflags & ZFS_XATTR) {
1188                         ZFS_EXIT(zsb);
1189                         return (SET_ERROR(EINVAL));
1190                 }
1191
1192                 if ((error = zfs_get_xattrdir(zdp, ipp, cr, flags))) {
1193                         ZFS_EXIT(zsb);
1194                         return (error);
1195                 }
1196
1197                 /*
1198                  * Do we have permission to get into attribute directory?
1199                  */
1200
                     /* On denial, drop the hold just obtained and clear *ipp. */
1201                 if ((error = zfs_zaccess(ITOZ(*ipp), ACE_EXECUTE, 0,
1202                     B_FALSE, cr))) {
1203                         iput(*ipp);
1204                         *ipp = NULL;
1205                 }
1206
1207                 ZFS_EXIT(zsb);
1208                 return (error);
1209         }
1210
1211         if (!S_ISDIR(dip->i_mode)) {
1212                 ZFS_EXIT(zsb);
1213                 return (SET_ERROR(ENOTDIR));
1214         }
1215
1216         /*
1217          * Check accessibility of directory.
1218          */
1219
1220         if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr))) {
1221                 ZFS_EXIT(zsb);
1222                 return (error);
1223         }
1224
             /*
              * On UTF-8-only file systems reject names that fail validation.
              * u8_validate() stores its own code in 'error', but the caller
              * is always given EILSEQ.
              */
1225         if (zsb->z_utf8 && u8_validate(nm, strlen(nm),
1226             NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1227                 ZFS_EXIT(zsb);
1228                 return (SET_ERROR(EILSEQ));
1229         }
1230
1231         error = zfs_dirlook(zdp, nm, ipp, flags, direntflags, realpnp);
             /* Refresh the found entry's inode from its znode. */
1232         if ((error == 0) && (*ipp))
1233                 zfs_inode_update(ITOZ(*ipp));
1234
1235         ZFS_EXIT(zsb);
1236         return (error);
1237 }
1238 EXPORT_SYMBOL(zfs_lookup);
1239
1240 /*
1241  * Attempt to create a new entry in a directory.  If the entry
1242  * already exists, truncate the file if permissible, else return
1243  * an error.  Return the ip of the created or trunc'd file.
1244  *
1245  *      IN:     dip     - inode of directory to put new file entry in.
1246  *              name    - name of new file entry.
1247  *              vap     - attributes of new file.
1248  *              excl    - flag indicating exclusive or non-exclusive mode.
1249  *              mode    - mode to open file with.
1250  *              cr      - credentials of caller.
1251  *              flag    - large file flag [UNUSED].
1252  *              vsecp   - ACL to be set
1253  *
1254  *      OUT:    ipp     - inode of created or trunc'd entry.
1255  *
1256  *      RETURN: 0 on success, error code on failure.
1257  *
1258  * Timestamps:
1259  *      dip - ctime|mtime updated if new entry created
1260  *       ip - ctime|mtime always, atime if new
      *
      * NOTE(review): although 'flag' is documented as unused above, the body
      * tests it for FIGNORECASE and FAPPEND.
      *
      * If dmu_tx_assign() returns ERESTART the whole lookup is retried from
      * the 'top' label; ACL ids that were already created are kept across
      * the retry (tracked by have_acl).
1261  */
1262
1263 /* ARGSUSED */
1264 int
1265 zfs_create(struct inode *dip, char *name, vattr_t *vap, int excl,
1266     int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp)
1267 {
1268         znode_t         *zp, *dzp = ITOZ(dip);
1269         zfs_sb_t        *zsb = ITOZSB(dip);
1270         zilog_t         *zilog;
1271         objset_t        *os;
1272         zfs_dirlock_t   *dl;
1273         dmu_tx_t        *tx;
1274         int             error;
1275         uid_t           uid;
1276         gid_t           gid;
1277         zfs_acl_ids_t   acl_ids;
1278         boolean_t       fuid_dirtied;
             /* B_TRUE while acl_ids holds ids that still need to be freed. */
1279         boolean_t       have_acl = B_FALSE;
1280
1281         /*
1282          * If we have an ephemeral id, ACL, or XVATTR then
1283          * make sure file system is at proper version
1284          */
1285
1286         gid = crgetgid(cr);
1287         uid = crgetuid(cr);
1288
             /* This check runs before ZFS_ENTER(), so no ZFS_EXIT() is needed. */
1289         if (zsb->z_use_fuids == B_FALSE &&
1290             (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1291                 return (SET_ERROR(EINVAL));
1292
1293         ZFS_ENTER(zsb);
1294         ZFS_VERIFY_ZP(dzp);
1295         os = zsb->z_os;
1296         zilog = zsb->z_log;
1297
1298         if (zsb->z_utf8 && u8_validate(name, strlen(name),
1299             NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1300                 ZFS_EXIT(zsb);
1301                 return (SET_ERROR(EILSEQ));
1302         }
1303
1304         if (vap->va_mask & ATTR_XVATTR) {
1305                 if ((error = secpolicy_xvattr((xvattr_t *)vap,
1306                     crgetuid(cr), cr, vap->va_mode)) != 0) {
1307                         ZFS_EXIT(zsb);
1308                         return (error);
1309                 }
1310         }
1311
     /* Retry point after dmu_tx_assign() returns ERESTART. */
1312 top:
1313         *ipp = NULL;
1314         if (*name == '\0') {
1315                 /*
1316                  * Null component name refers to the directory itself.
1317                  */
                     /* Take a hold on dip: zp aliases dzp from here on. */
1318                 igrab(dip);
1319                 zp = dzp;
1320                 dl = NULL;
1321                 error = 0;
1322         } else {
1323                 /* possible igrab(zp) */
1324                 int zflg = 0;
1325
1326                 if (flag & FIGNORECASE)
1327                         zflg |= ZCILOOK;
1328
1329                 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1330                     NULL, NULL);
1331                 if (error) {
1332                         if (have_acl)
1333                                 zfs_acl_ids_free(&acl_ids);
                             /* Any failure on ".." is reported as EISDIR. */
1334                         if (strcmp(name, "..") == 0)
1335                                 error = SET_ERROR(EISDIR);
1336                         ZFS_EXIT(zsb);
1337                         return (error);
1338                 }
1339         }
1340
             /* zp == NULL: no existing entry, so create one. */
1341         if (zp == NULL) {
1342                 uint64_t txtype;
1343
1344                 /*
1345                  * Create a new file object and update the directory
1346                  * to reference it.
1347                  */
1348                 if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
1349                         if (have_acl)
1350                                 zfs_acl_ids_free(&acl_ids);
1351                         goto out;
1352                 }
1353
1354                 /*
1355                  * We only support the creation of regular files in
1356                  * extended attribute directories.
1357                  */
1358
1359                 if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) {
1360                         if (have_acl)
1361                                 zfs_acl_ids_free(&acl_ids);
1362                         error = SET_ERROR(EINVAL);
1363                         goto out;
1364                 }
1365
                     /* Create the ACL ids once; a retry reuses them. */
1366                 if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
1367                     cr, vsecp, &acl_ids)) != 0)
1368                         goto out;
1369                 have_acl = B_TRUE;
1370
1371                 if (zfs_acl_ids_overquota(zsb, &acl_ids)) {
1372                         zfs_acl_ids_free(&acl_ids);
1373                         error = SET_ERROR(EDQUOT);
1374                         goto out;
1375                 }
1376
1377                 tx = dmu_tx_create(os);
1378
1379                 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1380                     ZFS_SA_BASE_ATTR_SIZE);
1381
1382                 fuid_dirtied = zsb->z_fuid_dirty;
1383                 if (fuid_dirtied)
1384                         zfs_fuid_txhold(zsb, tx);
1385                 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
1386                 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
1387                 if (!zsb->z_use_sa &&
1388                     acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1389                         dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1390                             0, acl_ids.z_aclp->z_acl_bytes);
1391                 }
1392                 error = dmu_tx_assign(tx, TXG_NOWAIT);
1393                 if (error) {
                             /*
                              * Drop the dirent lock before waiting or
                              * failing.  On ERESTART acl_ids is kept for the
                              * retry; otherwise it is freed here.
                              */
1394                         zfs_dirent_unlock(dl);
1395                         if (error == ERESTART) {
1396                                 dmu_tx_wait(tx);
1397                                 dmu_tx_abort(tx);
1398                                 goto top;
1399                         }
1400                         zfs_acl_ids_free(&acl_ids);
1401                         dmu_tx_abort(tx);
1402                         ZFS_EXIT(zsb);
1403                         return (error);
1404                 }
1405                 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1406
1407                 if (fuid_dirtied)
1408                         zfs_fuid_sync(zsb, tx);
1409
                     /*
                      * NOTE(review): the result of zfs_link_create() is
                      * discarded, so a failure to link the new znode is not
                      * reported to the caller.
                      */
1410                 (void) zfs_link_create(dl, zp, tx, ZNEW);
1411                 txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
1412                 if (flag & FIGNORECASE)
1413                         txtype |= TX_CI;
1414                 zfs_log_create(zilog, tx, txtype, dzp, zp, name,
1415                     vsecp, acl_ids.z_fuidp, vap);
1416                 zfs_acl_ids_free(&acl_ids);
1417                 dmu_tx_commit(tx);
1418         } else {
1419                 int aflags = (flag & FAPPEND) ? V_APPEND : 0;
1420
                     /* ACL ids left over from an earlier attempt are unused. */
1421                 if (have_acl)
1422                         zfs_acl_ids_free(&acl_ids);
1423                 have_acl = B_FALSE;
1424
1425                 /*
1426                  * A directory entry already exists for this name.
1427                  */
1428                 /*
1429                  * Can't truncate an existing file if in exclusive mode.
1430                  */
1431                 if (excl) {
1432                         error = SET_ERROR(EEXIST);
1433                         goto out;
1434                 }
1435                 /*
1436                  * Can't open a directory for writing.
1437                  */
1438                 if (S_ISDIR(ZTOI(zp)->i_mode)) {
1439                         error = SET_ERROR(EISDIR);
1440                         goto out;
1441                 }
1442                 /*
1443                  * Verify requested access to file.
1444                  */
1445                 if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
1446                         goto out;
1447                 }
1448
1449                 mutex_enter(&dzp->z_lock);
1450                 dzp->z_seq++;
1451                 mutex_exit(&dzp->z_lock);
1452
1453                 /*
1454                  * Truncate regular files if requested.
1455                  */
1456                 if (S_ISREG(ZTOI(zp)->i_mode) &&
1457                     (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) {
1458                         /* we can't hold any locks when calling zfs_freesp() */
1459                         zfs_dirent_unlock(dl);
1460                         dl = NULL;
1461                         error = zfs_freesp(zp, 0, 0, mode, TRUE);
1462                 }
1463         }
     /*
      * Common exit: release the dirent lock if still held, then either drop
      * the hold on zp (error) or refresh both inodes and return zp's inode
      * through ipp (success).
      */
1464 out:
1465
1466         if (dl)
1467                 zfs_dirent_unlock(dl);
1468
1469         if (error) {
1470                 if (zp)
1471                         iput(ZTOI(zp));
1472         } else {
1473                 zfs_inode_update(dzp);
1474                 zfs_inode_update(zp);
1475                 *ipp = ZTOI(zp);
1476         }
1477
             /* Performed on both the success and the error path. */
1478         if (zsb->z_os->os_sync == ZFS_SYNC_ALWAYS)
1479                 zil_commit(zilog, 0);
1480
1481         ZFS_EXIT(zsb);
1482         return (error);
1483 }
1484 EXPORT_SYMBOL(zfs_create);
1485
1486 /*
1487  * Remove an entry from a directory.
1488  *
1489  *      IN:     dip     - inode of directory to remove entry from.
1490  *              name    - name of entry to remove.
1491  *              cr      - credentials of caller.
1492  *
1493  *      RETURN: 0 if success
1494  *              error code if failure
1495  *
1496  * Timestamps:
1497  *      dip - ctime|mtime
1498  *       ip - ctime (if nlink > 0)
      *
      * Directories are refused with EPERM; they must be removed with rmdir.
      * If dmu_tx_assign() returns ERESTART, all holds are dropped and the
      * operation is retried from the 'top' label.
1499  */
1500
     /*
      * NOTE(review): null_xattr is not referenced by zfs_remove() below;
      * presumably it is used elsewhere in the file -- TODO confirm.
      */
1501 uint64_t null_xattr = 0;
1502
1503 /*ARGSUSED*/
1504 int
1505 zfs_remove(struct inode *dip, char *name, cred_t *cr)
1506 {
1507         znode_t         *zp, *dzp = ITOZ(dip);
1508         znode_t         *xzp;
1509         struct inode    *ip;
1510         zfs_sb_t        *zsb = ITOZSB(dip);
1511         zilog_t         *zilog;
1512         uint64_t        xattr_obj;
             /*
              * NOTE(review): xattr_obj_unlinked is filled in below but never
              * read anywhere in this function.
              */
1513         uint64_t        xattr_obj_unlinked = 0;
1514         uint64_t        obj = 0;
1515         zfs_dirlock_t   *dl;
1516         dmu_tx_t        *tx;
1517         boolean_t       unlinked;
1518         uint64_t        txtype;
1519         pathname_t      *realnmp = NULL;
1520 #ifdef HAVE_PN_UTILS
1521         pathname_t      realnm;
1522 #endif /* HAVE_PN_UTILS */
1523         int             error;
1524         int             zflg = ZEXISTS;
1525
1526         ZFS_ENTER(zsb);
1527         ZFS_VERIFY_ZP(dzp);
1528         zilog = zsb->z_log;
1529
     /*
      * NOTE(review): the HAVE_PN_UTILS blocks test 'flags', which is neither
      * a parameter nor a local of this function; they would not build as
      * written if HAVE_PN_UTILS were defined.
      */
1530 #ifdef HAVE_PN_UTILS
1531         if (flags & FIGNORECASE) {
1532                 zflg |= ZCILOOK;
1533                 pn_alloc(&realnm);
1534                 realnmp = &realnm;
1535         }
1536 #endif /* HAVE_PN_UTILS */
1537
     /* Retry point after dmu_tx_assign() returns ERESTART. */
1538 top:
1539         xattr_obj = 0;
1540         xzp = NULL;
1541         /*
1542          * Attempt to lock directory; fail if entry doesn't exist.
1543          */
1544         if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1545             NULL, realnmp))) {
1546 #ifdef HAVE_PN_UTILS
1547                 if (realnmp)
1548                         pn_free(realnmp);
1549 #endif /* HAVE_PN_UTILS */
1550                 ZFS_EXIT(zsb);
1551                 return (error);
1552         }
1553
             /* The hold on zp is released via iput(ip) on every later path. */
1554         ip = ZTOI(zp);
1555
1556         if ((error = zfs_zaccess_delete(dzp, zp, cr))) {
1557                 goto out;
1558         }
1559
1560         /*
1561          * Need to use rmdir for removing directories.
1562          */
1563         if (S_ISDIR(ip->i_mode)) {
1564                 error = SET_ERROR(EPERM);
1565                 goto out;
1566         }
1567
     /*
      * NOTE(review): dvp is not declared in this function; this block would
      * not build as written if HAVE_DNLC were defined.
      */
1568 #ifdef HAVE_DNLC
1569         if (realnmp)
1570                 dnlc_remove(dvp, realnmp->pn_buf);
1571         else
1572                 dnlc_remove(dvp, name);
1573 #endif /* HAVE_DNLC */
1574
1575         /*
1576          * We never delete the znode and always place it in the unlinked
1577          * set.  The dentry cache will always hold the last reference and
1578          * is responsible for safely freeing the znode.
1579          */
             /* Remember the object id now; it is logged after the unlink. */
1580         obj = zp->z_id;
1581         tx = dmu_tx_create(zsb->z_os);
1582         dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1583         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1584         zfs_sa_upgrade_txholds(tx, zp);
1585         zfs_sa_upgrade_txholds(tx, dzp);
1586
1587         /* are there any extended attributes? */
1588         error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zsb),
1589             &xattr_obj, sizeof (xattr_obj));
1590         if (error == 0 && xattr_obj) {
                     /*
                      * NOTE(review): a zfs_zget() failure is only caught by
                      * ASSERT0(); xzp is dereferenced immediately afterwards
                      * with no runtime check.
                      */
1591                 error = zfs_zget(zsb, xattr_obj, &xzp);
1592                 ASSERT0(error);
1593                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1594                 dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
1595         }
1596
1597         /* charge as an update -- would be nice not to charge at all */
1598         dmu_tx_hold_zap(tx, zsb->z_unlinkedobj, FALSE, NULL);
1599
1600         error = dmu_tx_assign(tx, TXG_NOWAIT);
1601         if (error) {
                     /*
                      * Drop the dirent lock and both inode holds before
                      * either waiting and retrying (ERESTART) or failing.
                      */
1602                 zfs_dirent_unlock(dl);
1603                 iput(ip);
1604                 if (xzp)
1605                         iput(ZTOI(xzp));
1606                 if (error == ERESTART) {
1607                         dmu_tx_wait(tx);
1608                         dmu_tx_abort(tx);
1609                         goto top;
1610                 }
1611 #ifdef HAVE_PN_UTILS
1612                 if (realnmp)
1613                         pn_free(realnmp);
1614 #endif /* HAVE_PN_UTILS */
1615                 dmu_tx_abort(tx);
1616                 ZFS_EXIT(zsb);
1617                 return (error);
1618         }
1619
1620         /*
1621          * Remove the directory entry.
1622          */
1623         error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
1624
             /* The assigned tx is committed even when the unlink failed. */
1625         if (error) {
1626                 dmu_tx_commit(tx);
1627                 goto out;
1628         }
1629
1630         if (unlinked) {
1631                 /*
1632                  * Hold z_lock so that we can make sure that the ACL obj
1633                  * hasn't changed.  Could have been deleted due to
1634                  * zfs_sa_upgrade().
1635                  */
1636                 mutex_enter(&zp->z_lock);
1637                 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zsb),
1638                     &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
1639                 mutex_exit(&zp->z_lock);
1640                 zfs_unlinked_add(zp, tx);
1641         }
1642
1643         txtype = TX_REMOVE;
1644 #ifdef HAVE_PN_UTILS
1645         if (flags & FIGNORECASE)
1646                 txtype |= TX_CI;
1647 #endif /* HAVE_PN_UTILS */
1648         zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
1649
1650         dmu_tx_commit(tx);
     /*
      * Common exit for success and for failures after the dirent lock was
      * taken: unlock, refresh the inodes, then drop the inode holds.
      */
1651 out:
1652 #ifdef HAVE_PN_UTILS
1653         if (realnmp)
1654                 pn_free(realnmp);
1655 #endif /* HAVE_PN_UTILS */
1656
1657         zfs_dirent_unlock(dl);
1658         zfs_inode_update(dzp);
1659         zfs_inode_update(zp);
1660         if (xzp)
1661                 zfs_inode_update(xzp);
1662
1663         iput(ip);
1664         if (xzp)
1665                 iput(ZTOI(xzp));
1666
             /* Performed on both the success and the error path. */
1667         if (zsb->z_os->os_sync == ZFS_SYNC_ALWAYS)
1668                 zil_commit(zilog, 0);
1669
1670         ZFS_EXIT(zsb);
1671         return (error);
1672 }
1673 EXPORT_SYMBOL(zfs_remove);
1674
1675 /*
1676  * Create a new directory and insert it into dip using the name
1677  * provided.  Return a pointer to the inserted directory.
1678  *
1679  *      IN:     dip     - inode of directory to add subdir to.
1680  *              dirname - name of new directory.
1681  *              vap     - attributes of new directory.
1682  *              cr      - credentials of caller.
1683  *              vsecp   - ACL to be set
1684  *
1685  *      OUT:    ipp     - inode of created directory.
1686  *
1687  *      RETURN: 0 if success
1688  *              error code if failure
1689  *
1690  * Timestamps:
1691  *      dip - ctime|mtime updated
1692  *      ipp - ctime|mtime|atime updated
1693  */
1694 /*ARGSUSED*/
1695 int
1696 zfs_mkdir(struct inode *dip, char *dirname, vattr_t *vap, struct inode **ipp,
1697     cred_t *cr, int flags, vsecattr_t *vsecp)
1698 {
1699         znode_t         *zp, *dzp = ITOZ(dip);
1700         zfs_sb_t        *zsb = ITOZSB(dip);
1701         zilog_t         *zilog;
1702         zfs_dirlock_t   *dl;
1703         uint64_t        txtype;
1704         dmu_tx_t        *tx;
1705         int             error;
1706         int             zf = ZNEW;
1707         uid_t           uid;
1708         gid_t           gid = crgetgid(cr);
1709         zfs_acl_ids_t   acl_ids;
1710         boolean_t       fuid_dirtied;
1711
1712         ASSERT(S_ISDIR(vap->va_mode));
1713
1714         /*
1715          * If we have an ephemeral id, ACL, or XVATTR then
1716          * make sure file system is at proper version
1717          */
1718
1719         uid = crgetuid(cr);
1720         if (zsb->z_use_fuids == B_FALSE &&
1721             (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1722                 return (SET_ERROR(EINVAL));
1723
1724         ZFS_ENTER(zsb);
1725         ZFS_VERIFY_ZP(dzp);
1726         zilog = zsb->z_log;
1727
1728         if (dzp->z_pflags & ZFS_XATTR) {
1729                 ZFS_EXIT(zsb);
1730                 return (SET_ERROR(EINVAL));
1731         }
1732
1733         if (zsb->z_utf8 && u8_validate(dirname,
1734             strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1735                 ZFS_EXIT(zsb);
1736                 return (SET_ERROR(EILSEQ));
1737         }
1738         if (flags & FIGNORECASE)
1739                 zf |= ZCILOOK;
1740
1741         if (vap->va_mask & ATTR_XVATTR) {
1742                 if ((error = secpolicy_xvattr((xvattr_t *)vap,
1743                     crgetuid(cr), cr, vap->va_mode)) != 0) {
1744                         ZFS_EXIT(zsb);
1745                         return (error);
1746                 }
1747         }
1748
1749         if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
1750             vsecp, &acl_ids)) != 0) {
1751                 ZFS_EXIT(zsb);
1752                 return (error);
1753         }
1754         /*
1755          * First make sure the new directory doesn't exist.
1756          *
1757          * Existence is checked first to make sure we don't return
1758          * EACCES instead of EEXIST which can cause some applications
1759          * to fail.
1760          */
1761 top:
1762         *ipp = NULL;
1763
1764         if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
1765             NULL, NULL))) {
1766                 zfs_acl_ids_free(&acl_ids);
1767                 ZFS_EXIT(zsb);
1768                 return (error);
1769         }
1770
1771         if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) {
1772                 zfs_acl_ids_free(&acl_ids);
1773                 zfs_dirent_unlock(dl);
1774                 ZFS_EXIT(zsb);
1775                 return (error);
1776         }
1777
1778         if (zfs_acl_ids_overquota(zsb, &acl_ids)) {
1779                 zfs_acl_ids_free(&acl_ids);
1780                 zfs_dirent_unlock(dl);
1781                 ZFS_EXIT(zsb);
1782                 return (SET_ERROR(EDQUOT));
1783         }
1784
1785         /*
1786          * Add a new entry to the directory.
1787          */
1788         tx = dmu_tx_create(zsb->z_os);
1789         dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
1790         dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
1791         fuid_dirtied = zsb->z_fuid_dirty;
1792         if (fuid_dirtied)
1793                 zfs_fuid_txhold(zsb, tx);
1794         if (!zsb->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1795                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
1796                     acl_ids.z_aclp->z_acl_bytes);
1797         }
1798
1799         dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1800             ZFS_SA_BASE_ATTR_SIZE);
1801
1802         error = dmu_tx_assign(tx, TXG_NOWAIT);
1803         if (error) {
1804                 zfs_dirent_unlock(dl);
1805                 if (error == ERESTART) {
1806                         dmu_tx_wait(tx);
1807                         dmu_tx_abort(tx);
1808                         goto top;
1809                 }
1810                 zfs_acl_ids_free(&acl_ids);
1811                 dmu_tx_abort(tx);
1812                 ZFS_EXIT(zsb);
1813                 return (error);
1814         }
1815
1816         /*
1817          * Create new node.
1818          */
1819         zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1820
1821         if (fuid_dirtied)
1822                 zfs_fuid_sync(zsb, tx);
1823
1824         /*
1825          * Now put new name in parent dir.
1826          */
1827         (void) zfs_link_create(dl, zp, tx, ZNEW);
1828
1829         *ipp = ZTOI(zp);
1830
1831         txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
1832         if (flags & FIGNORECASE)
1833                 txtype |= TX_CI;
1834         zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
1835             acl_ids.z_fuidp, vap);
1836
1837         zfs_acl_ids_free(&acl_ids);
1838
1839         dmu_tx_commit(tx);
1840
1841         zfs_dirent_unlock(dl);
1842
1843         if (zsb->z_os->os_sync == ZFS_SYNC_ALWAYS)
1844                 zil_commit(zilog, 0);
1845
1846         zfs_inode_update(dzp);
1847         zfs_inode_update(zp);
1848         ZFS_EXIT(zsb);
1849         return (0);
1850 }
1851 EXPORT_SYMBOL(zfs_mkdir);
1852
1853 /*
1854  * Remove a directory subdir entry.  If the current working
1855  * directory is the same as the subdir to be removed, the
1856  * remove will fail.
1857  *
1858  *      IN:     dip     - inode of directory to remove from.
1859  *              name    - name of directory to be removed.
1860  *              cwd     - inode of current working directory.
1861  *              cr      - credentials of caller.
1862  *              flags   - case flags
1863  *
1864  *      RETURN: 0 on success, error code on failure.
1865  *
1866  * Timestamps:
1867  *      dip - ctime|mtime updated
1868  */
1869 /*ARGSUSED*/
1870 int
1871 zfs_rmdir(struct inode *dip, char *name, struct inode *cwd, cred_t *cr,
1872     int flags)
1873 {
1874         znode_t         *dzp = ITOZ(dip);
1875         znode_t         *zp;
1876         struct inode    *ip;
1877         zfs_sb_t        *zsb = ITOZSB(dip);
1878         zilog_t         *zilog;
1879         zfs_dirlock_t   *dl;
1880         dmu_tx_t        *tx;
1881         int             error;
1882         int             zflg = ZEXISTS;
1883
1884         ZFS_ENTER(zsb);
1885         ZFS_VERIFY_ZP(dzp);
1886         zilog = zsb->z_log;
1887
1888         if (flags & FIGNORECASE)
1889                 zflg |= ZCILOOK;
1890 top:
1891         zp = NULL;
1892
1893         /*
1894          * Attempt to lock directory; fail if entry doesn't exist.
1895          */
1896         if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1897             NULL, NULL))) {
1898                 ZFS_EXIT(zsb);
1899                 return (error);
1900         }
1901
1902         ip = ZTOI(zp);
1903
1904         if ((error = zfs_zaccess_delete(dzp, zp, cr))) {
1905                 goto out;
1906         }
1907
1908         if (!S_ISDIR(ip->i_mode)) {
1909                 error = SET_ERROR(ENOTDIR);
1910                 goto out;
1911         }
1912
1913         if (ip == cwd) {
1914                 error = SET_ERROR(EINVAL);
1915                 goto out;
1916         }
1917
1918         /*
1919          * Grab a lock on the directory to make sure that noone is
1920          * trying to add (or lookup) entries while we are removing it.
1921          */
1922         rw_enter(&zp->z_name_lock, RW_WRITER);
1923
1924         /*
1925          * Grab a lock on the parent pointer to make sure we play well
1926          * with the treewalk and directory rename code.
1927          */
1928         rw_enter(&zp->z_parent_lock, RW_WRITER);
1929
1930         tx = dmu_tx_create(zsb->z_os);
1931         dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1932         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1933         dmu_tx_hold_zap(tx, zsb->z_unlinkedobj, FALSE, NULL);
1934         zfs_sa_upgrade_txholds(tx, zp);
1935         zfs_sa_upgrade_txholds(tx, dzp);
1936         error = dmu_tx_assign(tx, TXG_NOWAIT);
1937         if (error) {
1938                 rw_exit(&zp->z_parent_lock);
1939                 rw_exit(&zp->z_name_lock);
1940                 zfs_dirent_unlock(dl);
1941                 iput(ip);
1942                 if (error == ERESTART) {
1943                         dmu_tx_wait(tx);
1944                         dmu_tx_abort(tx);
1945                         goto top;
1946                 }
1947                 dmu_tx_abort(tx);
1948                 ZFS_EXIT(zsb);
1949                 return (error);
1950         }
1951
1952         error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
1953
1954         if (error == 0) {
1955                 uint64_t txtype = TX_RMDIR;
1956                 if (flags & FIGNORECASE)
1957                         txtype |= TX_CI;
1958                 zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
1959         }
1960
1961         dmu_tx_commit(tx);
1962
1963         rw_exit(&zp->z_parent_lock);
1964         rw_exit(&zp->z_name_lock);
1965 out:
1966         zfs_dirent_unlock(dl);
1967
1968         zfs_inode_update(dzp);
1969         zfs_inode_update(zp);
1970         iput(ip);
1971
1972         if (zsb->z_os->os_sync == ZFS_SYNC_ALWAYS)
1973                 zil_commit(zilog, 0);
1974
1975         ZFS_EXIT(zsb);
1976         return (error);
1977 }
1978 EXPORT_SYMBOL(zfs_rmdir);
1979
1980 /*
1981  * Read as many directory entries as will fit into the provided
1982  * dirent buffer from the given directory cursor position.
1983  *
1984  *      IN:     ip      - inode of directory to read.
1985  *              dirent  - buffer for directory entries.
1986  *
1987  *      OUT:    dirent  - filler buffer of directory entries.
1988  *
1989  *      RETURN: 0 if success
1990  *              error code if failure
1991  *
1992  * Timestamps:
1993  *      ip - atime updated
1994  *
1995  * Note that the low 4 bits of the cookie returned by zap is always zero.
1996  * This allows us to use the low range for "special" directory entries:
1997  * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
1998  * we use the offset 2 for the '.zfs' directory.
1999  */
2000 /* ARGSUSED */
2001 int
2002 zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr)
2003 {
2004         znode_t         *zp = ITOZ(ip);
2005         zfs_sb_t        *zsb = ITOZSB(ip);
2006         objset_t        *os;
2007         zap_cursor_t    zc;
2008         zap_attribute_t zap;
2009         int             error;
2010         uint8_t         prefetch;
2011         uint8_t         type;
2012         int             done = 0;
2013         uint64_t        parent;
2014         uint64_t        offset; /* must be unsigned; checks for < 1 */
2015
2016         ZFS_ENTER(zsb);
2017         ZFS_VERIFY_ZP(zp);
2018
2019         if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zsb),
2020             &parent, sizeof (parent))) != 0)
2021                 goto out;
2022
2023         /*
2024          * Quit if directory has been removed (posix)
2025          */
2026         if (zp->z_unlinked)
2027                 goto out;
2028
2029         error = 0;
2030         os = zsb->z_os;
2031         offset = ctx->pos;
2032         prefetch = zp->z_zn_prefetch;
2033
2034         /*
2035          * Initialize the iterator cursor.
2036          */
2037         if (offset <= 3) {
2038                 /*
2039                  * Start iteration from the beginning of the directory.
2040                  */
2041                 zap_cursor_init(&zc, os, zp->z_id);
2042         } else {
2043                 /*
2044                  * The offset is a serialized cursor.
2045                  */
2046                 zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
2047         }
2048
2049         /*
2050          * Transform to file-system independent format
2051          */
2052         while (!done) {
2053                 uint64_t objnum;
2054                 /*
2055                  * Special case `.', `..', and `.zfs'.
2056                  */
2057                 if (offset == 0) {
2058                         (void) strcpy(zap.za_name, ".");
2059                         zap.za_normalization_conflict = 0;
2060                         objnum = zp->z_id;
2061                         type = DT_DIR;
2062                 } else if (offset == 1) {
2063                         (void) strcpy(zap.za_name, "..");
2064                         zap.za_normalization_conflict = 0;
2065                         objnum = parent;
2066                         type = DT_DIR;
2067                 } else if (offset == 2 && zfs_show_ctldir(zp)) {
2068                         (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
2069                         zap.za_normalization_conflict = 0;
2070                         objnum = ZFSCTL_INO_ROOT;
2071                         type = DT_DIR;
2072                 } else {
2073                         /*
2074                          * Grab next entry.
2075                          */
2076                         if ((error = zap_cursor_retrieve(&zc, &zap))) {
2077                                 if (error == ENOENT)
2078                                         break;
2079                                 else
2080                                         goto update;
2081                         }
2082
2083                         /*
2084                          * Allow multiple entries provided the first entry is
2085                          * the object id.  Non-zpl consumers may safely make
2086                          * use of the additional space.
2087                          *
2088                          * XXX: This should be a feature flag for compatibility
2089                          */
2090                         if (zap.za_integer_length != 8 ||
2091                             zap.za_num_integers == 0) {
2092                                 cmn_err(CE_WARN, "zap_readdir: bad directory "
2093                                     "entry, obj = %lld, offset = %lld, "
2094                                     "length = %d, num = %lld\n",
2095                                     (u_longlong_t)zp->z_id,
2096                                     (u_longlong_t)offset,
2097                                     zap.za_integer_length,
2098                                     (u_longlong_t)zap.za_num_integers);
2099                                 error = SET_ERROR(ENXIO);
2100                                 goto update;
2101                         }
2102
2103                         objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
2104                         type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2105                 }
2106
2107                 done = !dir_emit(ctx, zap.za_name, strlen(zap.za_name),
2108                     objnum, type);
2109                 if (done)
2110                         break;
2111
2112                 /* Prefetch znode */
2113                 if (prefetch) {
2114                         dmu_prefetch(os, objnum, 0, 0);
2115                 }
2116
2117                 /*
2118                  * Move to the next entry, fill in the previous offset.
2119                  */
2120                 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
2121                         zap_cursor_advance(&zc);
2122                         offset = zap_cursor_serialize(&zc);
2123                 } else {
2124                         offset += 1;
2125                 }
2126                 ctx->pos = offset;
2127         }
2128         zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
2129
2130 update:
2131         zap_cursor_fini(&zc);
2132         if (error == ENOENT)
2133                 error = 0;
2134
2135         ZFS_ACCESSTIME_STAMP(zsb, zp);
2136         zfs_inode_update(zp);
2137
2138 out:
2139         ZFS_EXIT(zsb);
2140
2141         return (error);
2142 }
2143 EXPORT_SYMBOL(zfs_readdir);
2144
2145 ulong_t zfs_fsync_sync_cnt = 4;
2146
2147 int
2148 zfs_fsync(struct inode *ip, int syncflag, cred_t *cr)
2149 {
2150         znode_t *zp = ITOZ(ip);
2151         zfs_sb_t *zsb = ITOZSB(ip);
2152
2153         (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
2154
2155         if (zsb->z_os->os_sync != ZFS_SYNC_DISABLED) {
2156                 ZFS_ENTER(zsb);
2157                 ZFS_VERIFY_ZP(zp);
2158                 zil_commit(zsb->z_log, zp->z_id);
2159                 ZFS_EXIT(zsb);
2160         }
2161         return (0);
2162 }
2163 EXPORT_SYMBOL(zfs_fsync);
2164
2165
2166 /*
2167  * Get the requested file attributes and place them in the provided
2168  * vattr structure.
2169  *
2170  *      IN:     ip      - inode of file.
2171  *              vap     - va_mask identifies requested attributes.
2172  *                        If ATTR_XVATTR set, then optional attrs are requested
2173  *              flags   - ATTR_NOACLCHECK (CIFS server context)
2174  *              cr      - credentials of caller.
2175  *
2176  *      OUT:    vap     - attribute values.
2177  *
2178  *      RETURN: 0 (always succeeds)
2179  */
2180 /* ARGSUSED */
2181 int
2182 zfs_getattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr)
2183 {
2184         znode_t *zp = ITOZ(ip);
2185         zfs_sb_t *zsb = ITOZSB(ip);
2186         int     error = 0;
2187         uint64_t links;
2188         uint64_t mtime[2], ctime[2];
2189         xvattr_t *xvap = (xvattr_t *)vap;       /* vap may be an xvattr_t * */
2190         xoptattr_t *xoap = NULL;
2191         boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2192         sa_bulk_attr_t bulk[2];
2193         int count = 0;
2194
2195         ZFS_ENTER(zsb);
2196         ZFS_VERIFY_ZP(zp);
2197
2198         zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
2199
2200         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zsb), NULL, &mtime, 16);
2201         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zsb), NULL, &ctime, 16);
2202
2203         if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
2204                 ZFS_EXIT(zsb);
2205                 return (error);
2206         }
2207
2208         /*
2209          * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
2210          * Also, if we are the owner don't bother, since owner should
2211          * always be allowed to read basic attributes of file.
2212          */
2213         if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
2214             (vap->va_uid != crgetuid(cr))) {
2215                 if ((error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
2216                     skipaclchk, cr))) {
2217                         ZFS_EXIT(zsb);
2218                         return (error);
2219                 }
2220         }
2221
2222         /*
2223          * Return all attributes.  It's cheaper to provide the answer
2224          * than to determine whether we were asked the question.
2225          */
2226
2227         mutex_enter(&zp->z_lock);
2228         vap->va_type = vn_mode_to_vtype(zp->z_mode);
2229         vap->va_mode = zp->z_mode;
2230         vap->va_fsid = ZTOI(zp)->i_sb->s_dev;
2231         vap->va_nodeid = zp->z_id;
2232         if ((zp->z_id == zsb->z_root) && zfs_show_ctldir(zp))
2233                 links = zp->z_links + 1;
2234         else
2235                 links = zp->z_links;
2236         vap->va_nlink = MIN(links, ZFS_LINK_MAX);
2237         vap->va_size = i_size_read(ip);
2238         vap->va_rdev = ip->i_rdev;
2239         vap->va_seq = ip->i_generation;
2240
2241         /*
2242          * Add in any requested optional attributes and the create time.
2243          * Also set the corresponding bits in the returned attribute bitmap.
2244          */
2245         if ((xoap = xva_getxoptattr(xvap)) != NULL && zsb->z_use_fuids) {
2246                 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
2247                         xoap->xoa_archive =
2248                             ((zp->z_pflags & ZFS_ARCHIVE) != 0);
2249                         XVA_SET_RTN(xvap, XAT_ARCHIVE);
2250                 }
2251
2252                 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
2253                         xoap->xoa_readonly =
2254                             ((zp->z_pflags & ZFS_READONLY) != 0);
2255                         XVA_SET_RTN(xvap, XAT_READONLY);
2256                 }
2257
2258                 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
2259                         xoap->xoa_system =
2260                             ((zp->z_pflags & ZFS_SYSTEM) != 0);
2261                         XVA_SET_RTN(xvap, XAT_SYSTEM);
2262                 }
2263
2264                 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
2265                         xoap->xoa_hidden =
2266                             ((zp->z_pflags & ZFS_HIDDEN) != 0);
2267                         XVA_SET_RTN(xvap, XAT_HIDDEN);
2268                 }
2269
2270                 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2271                         xoap->xoa_nounlink =
2272                             ((zp->z_pflags & ZFS_NOUNLINK) != 0);
2273                         XVA_SET_RTN(xvap, XAT_NOUNLINK);
2274                 }
2275
2276                 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2277                         xoap->xoa_immutable =
2278                             ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
2279                         XVA_SET_RTN(xvap, XAT_IMMUTABLE);
2280                 }
2281
2282                 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2283                         xoap->xoa_appendonly =
2284                             ((zp->z_pflags & ZFS_APPENDONLY) != 0);
2285                         XVA_SET_RTN(xvap, XAT_APPENDONLY);
2286                 }
2287
2288                 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2289                         xoap->xoa_nodump =
2290                             ((zp->z_pflags & ZFS_NODUMP) != 0);
2291                         XVA_SET_RTN(xvap, XAT_NODUMP);
2292                 }
2293
2294                 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
2295                         xoap->xoa_opaque =
2296                             ((zp->z_pflags & ZFS_OPAQUE) != 0);
2297                         XVA_SET_RTN(xvap, XAT_OPAQUE);
2298                 }
2299
2300                 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2301                         xoap->xoa_av_quarantined =
2302                             ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
2303                         XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
2304                 }
2305
2306                 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2307                         xoap->xoa_av_modified =
2308                             ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
2309                         XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
2310                 }
2311
2312                 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
2313                     S_ISREG(ip->i_mode)) {
2314                         zfs_sa_get_scanstamp(zp, xvap);
2315                 }
2316
2317                 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
2318                         uint64_t times[2];
2319
2320                         (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zsb),
2321                             times, sizeof (times));
2322                         ZFS_TIME_DECODE(&xoap->xoa_createtime, times);
2323                         XVA_SET_RTN(xvap, XAT_CREATETIME);
2324                 }
2325
2326                 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2327                         xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
2328                         XVA_SET_RTN(xvap, XAT_REPARSE);
2329                 }
2330                 if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
2331                         xoap->xoa_generation = zp->z_gen;
2332                         XVA_SET_RTN(xvap, XAT_GEN);
2333                 }
2334
2335                 if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
2336                         xoap->xoa_offline =
2337                             ((zp->z_pflags & ZFS_OFFLINE) != 0);
2338                         XVA_SET_RTN(xvap, XAT_OFFLINE);
2339                 }
2340
2341                 if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
2342                         xoap->xoa_sparse =
2343                             ((zp->z_pflags & ZFS_SPARSE) != 0);
2344                         XVA_SET_RTN(xvap, XAT_SPARSE);
2345                 }
2346         }
2347
2348         ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
2349         ZFS_TIME_DECODE(&vap->va_mtime, mtime);
2350         ZFS_TIME_DECODE(&vap->va_ctime, ctime);
2351
2352         mutex_exit(&zp->z_lock);
2353
2354         sa_object_size(zp->z_sa_hdl, &vap->va_blksize, &vap->va_nblocks);
2355
2356         if (zp->z_blksz == 0) {
2357                 /*
2358                  * Block size hasn't been set; suggest maximal I/O transfers.
2359                  */
2360                 vap->va_blksize = zsb->z_max_blksz;
2361         }
2362
2363         ZFS_EXIT(zsb);
2364         return (0);
2365 }
2366 EXPORT_SYMBOL(zfs_getattr);
2367
2368 /*
2369  * Get the basic file attributes and place them in the provided kstat
2370  * structure.  The inode is assumed to be the authoritative source
2371  * for most of the attributes.  However, the znode currently has the
2372  * authoritative atime, blksize, and block count.
2373  *
2374  *      IN:     ip      - inode of file.
2375  *
2376  *      OUT:    sp      - kstat values.
2377  *
2378  *      RETURN: 0 (always succeeds)
2379  */
2380 /* ARGSUSED */
2381 int
2382 zfs_getattr_fast(struct inode *ip, struct kstat *sp)
2383 {
2384         znode_t *zp = ITOZ(ip);
2385         zfs_sb_t *zsb = ITOZSB(ip);
2386
2387         ZFS_ENTER(zsb);
2388         ZFS_VERIFY_ZP(zp);
2389
2390         mutex_enter(&zp->z_lock);
2391
2392         generic_fillattr(ip, sp);
2393         ZFS_TIME_DECODE(&sp->atime, zp->z_atime);
2394
2395         sa_object_size(zp->z_sa_hdl, (uint32_t *)&sp->blksize, &sp->blocks);
2396         if (unlikely(zp->z_blksz == 0)) {
2397                 /*
2398                  * Block size hasn't been set; suggest maximal I/O transfers.
2399                  */
2400                 sp->blksize = zsb->z_max_blksz;
2401         }
2402
2403         mutex_exit(&zp->z_lock);
2404
2405         ZFS_EXIT(zsb);
2406
2407         return (0);
2408 }
2409 EXPORT_SYMBOL(zfs_getattr_fast);
2410
2411 /*
2412  * Set the file attributes to the values contained in the
2413  * vattr structure.
2414  *
2415  *      IN:     ip      - inode of file to be modified.
2416  *              vap     - new attribute values.
2417  *                        If ATTR_XVATTR set, then optional attrs are being set
2418  *              flags   - ATTR_UTIME set if non-default time values provided.
2419  *                      - ATTR_NOACLCHECK (CIFS context only).
2420  *              cr      - credentials of caller.
2421  *
2422  *      RETURN: 0 if success
2423  *              error code if failure
2424  *
2425  * Timestamps:
2426  *      ip - ctime updated, mtime updated if size changed.
2427  */
2428 /* ARGSUSED */
2429 int
2430 zfs_setattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr)
2431 {
2432         znode_t         *zp = ITOZ(ip);
2433         zfs_sb_t        *zsb = ITOZSB(ip);
2434         zilog_t         *zilog;
2435         dmu_tx_t        *tx;
2436         vattr_t         oldva;
2437         xvattr_t        *tmpxvattr;
2438         uint_t          mask = vap->va_mask;
2439         uint_t          saved_mask = 0;
2440         int             trim_mask = 0;
2441         uint64_t        new_mode;
2442         uint64_t        new_uid, new_gid;
2443         uint64_t        xattr_obj;
2444         uint64_t        mtime[2], ctime[2];
2445         znode_t         *attrzp;
2446         int             need_policy = FALSE;
2447         int             err, err2;
2448         zfs_fuid_info_t *fuidp = NULL;
2449         xvattr_t *xvap = (xvattr_t *)vap;       /* vap may be an xvattr_t * */
2450         xoptattr_t      *xoap;
2451         zfs_acl_t       *aclp;
2452         boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2453         boolean_t       fuid_dirtied = B_FALSE;
2454         sa_bulk_attr_t  *bulk, *xattr_bulk;
2455         int             count = 0, xattr_count = 0;
2456
2457         if (mask == 0)
2458                 return (0);
2459
2460         ZFS_ENTER(zsb);
2461         ZFS_VERIFY_ZP(zp);
2462
2463         zilog = zsb->z_log;
2464
2465         /*
2466          * Make sure that if we have ephemeral uid/gid or xvattr specified
2467          * that file system is at proper version level
2468          */
2469
2470         if (zsb->z_use_fuids == B_FALSE &&
2471             (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2472             ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) ||
2473             (mask & ATTR_XVATTR))) {
2474                 ZFS_EXIT(zsb);
2475                 return (SET_ERROR(EINVAL));
2476         }
2477
2478         if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) {
2479                 ZFS_EXIT(zsb);
2480                 return (SET_ERROR(EISDIR));
2481         }
2482
2483         if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) {
2484                 ZFS_EXIT(zsb);
2485                 return (SET_ERROR(EINVAL));
2486         }
2487
2488         /*
2489          * If this is an xvattr_t, then get a pointer to the structure of
2490          * optional attributes.  If this is NULL, then we have a vattr_t.
2491          */
2492         xoap = xva_getxoptattr(xvap);
2493
2494         tmpxvattr = kmem_alloc(sizeof(xvattr_t), KM_SLEEP);
2495         xva_init(tmpxvattr);
2496
2497         bulk = kmem_alloc(sizeof(sa_bulk_attr_t) * 7, KM_SLEEP);
2498         xattr_bulk = kmem_alloc(sizeof(sa_bulk_attr_t) * 7, KM_SLEEP);
2499
2500         /*
2501          * Immutable files can only alter immutable bit and atime
2502          */
2503         if ((zp->z_pflags & ZFS_IMMUTABLE) &&
2504             ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) ||
2505             ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
2506                 err = EPERM;
2507                 goto out3;
2508         }
2509
2510         if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
2511                 err = EPERM;
2512                 goto out3;
2513         }
2514
2515         /*
2516          * Verify timestamps doesn't overflow 32 bits.
2517          * ZFS can handle large timestamps, but 32bit syscalls can't
2518          * handle times greater than 2039.  This check should be removed
2519          * once large timestamps are fully supported.
2520          */
2521         if (mask & (ATTR_ATIME | ATTR_MTIME)) {
2522                 if (((mask & ATTR_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2523                     ((mask & ATTR_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2524                         err = EOVERFLOW;
2525                         goto out3;
2526                 }
2527         }
2528
2529 top:
2530         attrzp = NULL;
2531         aclp = NULL;
2532
2533         /* Can this be moved to before the top label? */
2534         if (zfs_is_readonly(zsb)) {
2535                 err = EROFS;
2536                 goto out3;
2537         }
2538
2539         /*
2540          * First validate permissions
2541          */
2542
2543         if (mask & ATTR_SIZE) {
2544                 err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr);
2545                 if (err)
2546                         goto out3;
2547
2548                 truncate_setsize(ip, vap->va_size);
2549
2550                 /*
2551                  * XXX - Note, we are not providing any open
2552                  * mode flags here (like FNDELAY), so we may
2553                  * block if there are locks present... this
2554                  * should be addressed in openat().
2555                  */
2556                 /* XXX - would it be OK to generate a log record here? */
2557                 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2558                 if (err)
2559                         goto out3;
2560         }
2561
2562         if (mask & (ATTR_ATIME|ATTR_MTIME) ||
2563             ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
2564             XVA_ISSET_REQ(xvap, XAT_READONLY) ||
2565             XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
2566             XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
2567             XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
2568             XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
2569             XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
2570                 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
2571                     skipaclchk, cr);
2572         }
2573
2574         if (mask & (ATTR_UID|ATTR_GID)) {
2575                 int     idmask = (mask & (ATTR_UID|ATTR_GID));
2576                 int     take_owner;
2577                 int     take_group;
2578
2579                 /*
2580                  * NOTE: even if a new mode is being set,
2581                  * we may clear S_ISUID/S_ISGID bits.
2582                  */
2583
2584                 if (!(mask & ATTR_MODE))
2585                         vap->va_mode = zp->z_mode;
2586
2587                 /*
2588                  * Take ownership or chgrp to group we are a member of
2589                  */
2590
2591                 take_owner = (mask & ATTR_UID) && (vap->va_uid == crgetuid(cr));
2592                 take_group = (mask & ATTR_GID) &&
2593                     zfs_groupmember(zsb, vap->va_gid, cr);
2594
2595                 /*
2596                  * If both ATTR_UID and ATTR_GID are set then take_owner and
2597                  * take_group must both be set in order to allow taking
2598                  * ownership.
2599                  *
2600                  * Otherwise, send the check through secpolicy_vnode_setattr()
2601                  *
2602                  */
2603
2604                 if (((idmask == (ATTR_UID|ATTR_GID)) &&
2605                     take_owner && take_group) ||
2606                     ((idmask == ATTR_UID) && take_owner) ||
2607                     ((idmask == ATTR_GID) && take_group)) {
2608                         if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
2609                             skipaclchk, cr) == 0) {
2610                                 /*
2611                                  * Remove setuid/setgid for non-privileged users
2612                                  */
2613                                 (void) secpolicy_setid_clear(vap, cr);
2614                                 trim_mask = (mask & (ATTR_UID|ATTR_GID));
2615                         } else {
2616                                 need_policy =  TRUE;
2617                         }
2618                 } else {
2619                         need_policy =  TRUE;
2620                 }
2621         }
2622
2623         mutex_enter(&zp->z_lock);
2624         oldva.va_mode = zp->z_mode;
2625         zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
2626         if (mask & ATTR_XVATTR) {
2627                 /*
2628                  * Update xvattr mask to include only those attributes
2629                  * that are actually changing.
2630                  *
2631                  * the bits will be restored prior to actually setting
2632                  * the attributes so the caller thinks they were set.
2633                  */
2634                 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2635                         if (xoap->xoa_appendonly !=
2636                             ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
2637                                 need_policy = TRUE;
2638                         } else {
2639                                 XVA_CLR_REQ(xvap, XAT_APPENDONLY);
2640                                 XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY);
2641                         }
2642                 }
2643
2644                 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2645                         if (xoap->xoa_nounlink !=
2646                             ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
2647                                 need_policy = TRUE;
2648                         } else {
2649                                 XVA_CLR_REQ(xvap, XAT_NOUNLINK);
2650                                 XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK);
2651                         }
2652                 }
2653
2654                 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2655                         if (xoap->xoa_immutable !=
2656                             ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
2657                                 need_policy = TRUE;
2658                         } else {
2659                                 XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
2660                                 XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE);
2661                         }
2662                 }
2663
2664                 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2665                         if (xoap->xoa_nodump !=
2666                             ((zp->z_pflags & ZFS_NODUMP) != 0)) {
2667                                 need_policy = TRUE;
2668                         } else {
2669                                 XVA_CLR_REQ(xvap, XAT_NODUMP);
2670                                 XVA_SET_REQ(tmpxvattr, XAT_NODUMP);
2671                         }
2672                 }
2673
2674                 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2675                         if (xoap->xoa_av_modified !=
2676                             ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
2677                                 need_policy = TRUE;
2678                         } else {
2679                                 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
2680                                 XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED);
2681                         }
2682                 }
2683
2684                 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2685                         if ((!S_ISREG(ip->i_mode) &&
2686                             xoap->xoa_av_quarantined) ||
2687                             xoap->xoa_av_quarantined !=
2688                             ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
2689                                 need_policy = TRUE;
2690                         } else {
2691                                 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
2692                                 XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED);
2693                         }
2694                 }
2695
2696                 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2697                         mutex_exit(&zp->z_lock);
2698                         err = EPERM;
2699                         goto out3;
2700                 }
2701
2702                 if (need_policy == FALSE &&
2703                     (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
2704                     XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
2705                         need_policy = TRUE;
2706                 }
2707         }
2708
2709         mutex_exit(&zp->z_lock);
2710
2711         if (mask & ATTR_MODE) {
2712                 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
2713                         err = secpolicy_setid_setsticky_clear(ip, vap,
2714                             &oldva, cr);
2715                         if (err)
2716                                 goto out3;
2717
2718                         trim_mask |= ATTR_MODE;
2719                 } else {
2720                         need_policy = TRUE;
2721                 }
2722         }
2723
2724         if (need_policy) {
2725                 /*
2726                  * If trim_mask is set then take ownership
2727                  * has been granted or write_acl is present and user
2728                  * has the ability to modify mode.  In that case remove
2729                  * UID|GID and or MODE from mask so that
2730                  * secpolicy_vnode_setattr() doesn't revoke it.
2731                  */
2732
2733                 if (trim_mask) {
2734                         saved_mask = vap->va_mask;
2735                         vap->va_mask &= ~trim_mask;
2736                 }
2737                 err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags,
2738                     (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
2739                 if (err)
2740                         goto out3;
2741
2742                 if (trim_mask)
2743                         vap->va_mask |= saved_mask;
2744         }
2745
2746         /*
2747          * secpolicy_vnode_setattr, or take ownership may have
2748          * changed va_mask
2749          */
2750         mask = vap->va_mask;
2751
2752         if ((mask & (ATTR_UID | ATTR_GID))) {
2753                 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zsb),
2754                     &xattr_obj, sizeof (xattr_obj));
2755
2756                 if (err == 0 && xattr_obj) {
2757                         err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp);
2758                         if (err)
2759                                 goto out2;
2760                 }
2761                 if (mask & ATTR_UID) {
2762                         new_uid = zfs_fuid_create(zsb,
2763                             (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
2764                         if (new_uid != zp->z_uid &&
2765                             zfs_fuid_overquota(zsb, B_FALSE, new_uid)) {
2766                                 if (attrzp)
2767                                         iput(ZTOI(attrzp));
2768                                 err = EDQUOT;
2769                                 goto out2;
2770                         }
2771                 }
2772
2773                 if (mask & ATTR_GID) {
2774                         new_gid = zfs_fuid_create(zsb, (uint64_t)vap->va_gid,
2775                             cr, ZFS_GROUP, &fuidp);
2776                         if (new_gid != zp->z_gid &&
2777                             zfs_fuid_overquota(zsb, B_TRUE, new_gid)) {
2778                                 if (attrzp)
2779                                         iput(ZTOI(attrzp));
2780                                 err = EDQUOT;
2781                                 goto out2;
2782                         }
2783                 }
2784         }
2785         tx = dmu_tx_create(zsb->z_os);
2786
2787         if (mask & ATTR_MODE) {
2788                 uint64_t pmode = zp->z_mode;
2789                 uint64_t acl_obj;
2790                 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
2791
2792                 zfs_acl_chmod_setattr(zp, &aclp, new_mode);
2793
2794                 mutex_enter(&zp->z_lock);
2795                 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
2796                         /*
2797                          * Are we upgrading ACL from old V0 format
2798                          * to V1 format?
2799                          */
2800                         if (zsb->z_version >= ZPL_VERSION_FUID &&
2801                             zfs_znode_acl_version(zp) ==
2802                             ZFS_ACL_VERSION_INITIAL) {
2803                                 dmu_tx_hold_free(tx, acl_obj, 0,
2804                                     DMU_OBJECT_END);
2805                                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2806                                     0, aclp->z_acl_bytes);
2807                         } else {
2808                                 dmu_tx_hold_write(tx, acl_obj, 0,
2809                                     aclp->z_acl_bytes);
2810                         }
2811                 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2812                         dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2813                             0, aclp->z_acl_bytes);
2814                 }
2815                 mutex_exit(&zp->z_lock);
2816                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
2817         } else {
2818                 if ((mask & ATTR_XVATTR) &&
2819                     XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
2820                         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
2821                 else
2822                         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2823         }
2824
2825         if (attrzp) {
2826                 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
2827         }
2828
2829         fuid_dirtied = zsb->z_fuid_dirty;
2830         if (fuid_dirtied)
2831                 zfs_fuid_txhold(zsb, tx);
2832
2833         zfs_sa_upgrade_txholds(tx, zp);
2834
2835         err = dmu_tx_assign(tx, TXG_NOWAIT);
2836         if (err) {
2837                 if (err == ERESTART)
2838                         dmu_tx_wait(tx);
2839                 goto out;
2840         }
2841
2842         count = 0;
2843         /*
2844          * Set each attribute requested.
2845          * We group settings according to the locks they need to acquire.
2846          *
2847          * Note: you cannot set ctime directly, although it will be
2848          * updated as a side-effect of calling this function.
2849          */
2850
2851
2852         if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2853                 mutex_enter(&zp->z_acl_lock);
2854         mutex_enter(&zp->z_lock);
2855
2856         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zsb), NULL,
2857             &zp->z_pflags, sizeof (zp->z_pflags));
2858
2859         if (attrzp) {
2860                 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2861                         mutex_enter(&attrzp->z_acl_lock);
2862                 mutex_enter(&attrzp->z_lock);
2863                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2864                     SA_ZPL_FLAGS(zsb), NULL, &attrzp->z_pflags,
2865                     sizeof (attrzp->z_pflags));
2866         }
2867
2868         if (mask & (ATTR_UID|ATTR_GID)) {
2869
2870                 if (mask & ATTR_UID) {
2871                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zsb), NULL,
2872                             &new_uid, sizeof (new_uid));
2873                         zp->z_uid = new_uid;
2874                         if (attrzp) {
2875                                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2876                                     SA_ZPL_UID(zsb), NULL, &new_uid,
2877                                     sizeof (new_uid));
2878                                 attrzp->z_uid = new_uid;
2879                         }
2880                 }
2881
2882                 if (mask & ATTR_GID) {
2883                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zsb),
2884                             NULL, &new_gid, sizeof (new_gid));
2885                         zp->z_gid = new_gid;
2886                         if (attrzp) {
2887                                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2888                                     SA_ZPL_GID(zsb), NULL, &new_gid,
2889                                     sizeof (new_gid));
2890                                 attrzp->z_gid = new_gid;
2891                         }
2892                 }
2893                 if (!(mask & ATTR_MODE)) {
2894                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zsb),
2895                             NULL, &new_mode, sizeof (new_mode));
2896                         new_mode = zp->z_mode;
2897                 }
2898                 err = zfs_acl_chown_setattr(zp);
2899                 ASSERT(err == 0);
2900                 if (attrzp) {
2901                         err = zfs_acl_chown_setattr(attrzp);
2902                         ASSERT(err == 0);
2903                 }
2904         }
2905
2906         if (mask & ATTR_MODE) {
2907                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zsb), NULL,
2908                     &new_mode, sizeof (new_mode));
2909                 zp->z_mode = new_mode;
2910                 ASSERT3P(aclp, !=, NULL);
2911                 err = zfs_aclset_common(zp, aclp, cr, tx);
2912                 ASSERT0(err);
2913                 if (zp->z_acl_cached)
2914                         zfs_acl_free(zp->z_acl_cached);
2915                 zp->z_acl_cached = aclp;
2916                 aclp = NULL;
2917         }
2918
2919
2920         if (mask & ATTR_ATIME) {
2921                 ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
2922                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zsb), NULL,
2923                     &zp->z_atime, sizeof (zp->z_atime));
2924         }
2925
2926         if (mask & ATTR_MTIME) {
2927                 ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
2928                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zsb), NULL,
2929                     mtime, sizeof (mtime));
2930         }
2931
2932         /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
2933         if (mask & ATTR_SIZE && !(mask & ATTR_MTIME)) {
2934                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zsb),
2935                     NULL, mtime, sizeof (mtime));
2936                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zsb), NULL,
2937                     &ctime, sizeof (ctime));
2938                 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
2939                     B_TRUE);
2940         } else if (mask != 0) {
2941                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zsb), NULL,
2942                     &ctime, sizeof (ctime));
2943                 zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
2944                     B_TRUE);
2945                 if (attrzp) {
2946                         SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2947                             SA_ZPL_CTIME(zsb), NULL,
2948                             &ctime, sizeof (ctime));
2949                         zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
2950                             mtime, ctime, B_TRUE);
2951                 }
2952         }
2953         /*
2954          * Do this after setting timestamps to prevent timestamp
2955          * update from toggling bit
2956          */
2957
2958         if (xoap && (mask & ATTR_XVATTR)) {
2959
2960                 /*
2961                  * restore trimmed off masks
2962                  * so that return masks can be set for caller.
2963                  */
2964
2965                 if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) {
2966                         XVA_SET_REQ(xvap, XAT_APPENDONLY);
2967                 }
2968                 if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) {
2969                         XVA_SET_REQ(xvap, XAT_NOUNLINK);
2970                 }
2971                 if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) {
2972                         XVA_SET_REQ(xvap, XAT_IMMUTABLE);
2973                 }
2974                 if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) {
2975                         XVA_SET_REQ(xvap, XAT_NODUMP);
2976                 }
2977                 if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) {
2978                         XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
2979                 }
2980                 if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) {
2981                         XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
2982                 }
2983
2984                 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
2985                         ASSERT(S_ISREG(ip->i_mode));
2986
2987                 zfs_xvattr_set(zp, xvap, tx);
2988         }
2989
2990         if (fuid_dirtied)
2991                 zfs_fuid_sync(zsb, tx);
2992
2993         if (mask != 0)
2994                 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
2995
2996         mutex_exit(&zp->z_lock);
2997         if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2998                 mutex_exit(&zp->z_acl_lock);
2999
3000         if (attrzp) {
3001                 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
3002                         mutex_exit(&attrzp->z_acl_lock);
3003                 mutex_exit(&attrzp->z_lock);
3004         }
3005 out:
3006         if (err == 0 && attrzp) {
3007                 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
3008                     xattr_count, tx);
3009                 ASSERT(err2 == 0);
3010         }
3011
3012         if (attrzp)
3013                 iput(ZTOI(attrzp));
3014         if (aclp)
3015                 zfs_acl_free(aclp);
3016
3017         if (fuidp) {
3018                 zfs_fuid_info_free(fuidp);
3019                 fuidp = NULL;
3020         }
3021
3022         if (err) {
3023                 dmu_tx_abort(tx);
3024                 if (err == ERESTART)
3025                         goto top;
3026         } else {
3027                 err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
3028                 dmu_tx_commit(tx);
3029                 zfs_inode_update(zp);
3030         }
3031
3032 out2:
3033         if (zsb->z_os->os_sync == ZFS_SYNC_ALWAYS)
3034                 zil_commit(zilog, 0);
3035
3036 out3:
3037         kmem_free(xattr_bulk, sizeof(sa_bulk_attr_t) * 7);
3038         kmem_free(bulk, sizeof(sa_bulk_attr_t) * 7);
3039         kmem_free(tmpxvattr, sizeof(xvattr_t));
3040         ZFS_EXIT(zsb);
3041         return (err);
3042 }
3043 EXPORT_SYMBOL(zfs_setattr);
3044
3045 typedef struct zfs_zlock {
3046         krwlock_t       *zl_rwlock;     /* lock we acquired */
3047         znode_t         *zl_znode;      /* znode we held */
3048         struct zfs_zlock *zl_next;      /* next in list */
3049 } zfs_zlock_t;
3050
3051 /*
3052  * Drop locks and release vnodes that were held by zfs_rename_lock().
3053  */
3054 static void
3055 zfs_rename_unlock(zfs_zlock_t **zlpp)
3056 {
3057         zfs_zlock_t *zl;
3058
3059         while ((zl = *zlpp) != NULL) {
3060                 if (zl->zl_znode != NULL)
3061                         iput(ZTOI(zl->zl_znode));
3062                 rw_exit(zl->zl_rwlock);
3063                 *zlpp = zl->zl_next;
3064                 kmem_free(zl, sizeof (*zl));
3065         }
3066 }
3067
3068 /*
3069  * Search back through the directory tree, using the ".." entries.
3070  * Lock each directory in the chain to prevent concurrent renames.
3071  * Fail any attempt to move a directory into one of its own descendants.
3072  * XXX - z_parent_lock can overlap with map or grow locks
3073  */
3074 static int
3075 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
3076 {
3077         zfs_zlock_t     *zl;
3078         znode_t         *zp = tdzp;
3079         uint64_t        rootid = ZTOZSB(zp)->z_root;
3080         uint64_t        oidp = zp->z_id;
3081         krwlock_t       *rwlp = &szp->z_parent_lock;
3082         krw_t           rw = RW_WRITER;
3083
3084         /*
3085          * First pass write-locks szp and compares to zp->z_id.
3086          * Later passes read-lock zp and compare to zp->z_parent.
3087          */
3088         do {
3089                 if (!rw_tryenter(rwlp, rw)) {
3090                         /*
3091                          * Another thread is renaming in this path.
3092                          * Note that if we are a WRITER, we don't have any
3093                          * parent_locks held yet.
3094                          */
3095                         if (rw == RW_READER && zp->z_id > szp->z_id) {
3096                                 /*
3097                                  * Drop our locks and restart
3098                                  */
3099                                 zfs_rename_unlock(&zl);
3100                                 *zlpp = NULL;
3101                                 zp = tdzp;
3102                                 oidp = zp->z_id;
3103                                 rwlp = &szp->z_parent_lock;
3104                                 rw = RW_WRITER;
3105                                 continue;
3106                         } else {
3107                                 /*
3108                                  * Wait for other thread to drop its locks
3109                                  */
3110                                 rw_enter(rwlp, rw);
3111                         }
3112                 }
3113
3114                 zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
3115                 zl->zl_rwlock = rwlp;
3116                 zl->zl_znode = NULL;
3117                 zl->zl_next = *zlpp;
3118                 *zlpp = zl;
3119
3120                 if (oidp == szp->z_id)          /* We're a descendant of szp */
3121                         return (SET_ERROR(EINVAL));
3122
3123                 if (oidp == rootid)             /* We've hit the top */
3124                         return (0);
3125
3126                 if (rw == RW_READER) {          /* i.e. not the first pass */
3127                         int error = zfs_zget(ZTOZSB(zp), oidp, &zp);
3128                         if (error)
3129                                 return (error);
3130                         zl->zl_znode = zp;
3131                 }
3132                 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)),
3133                     &oidp, sizeof (oidp));
3134                 rwlp = &zp->z_parent_lock;
3135                 rw = RW_READER;
3136
3137         } while (zp->z_id != sdzp->z_id);
3138
3139         return (0);
3140 }
3141
3142 /*
3143  * Move an entry from the provided source directory to the target
3144  * directory.  Change the entry name as indicated.
3145  *
3146  *      IN:     sdip    - Source directory containing the "old entry".
3147  *              snm     - Old entry name.
3148  *              tdip    - Target directory to contain the "new entry".
3149  *              tnm     - New entry name.
3150  *              cr      - credentials of caller.
3151  *              flags   - case flags
3152  *
3153  *      RETURN: 0 on success, error code on failure.
3154  *
3155  * Timestamps:
3156  *      sdip,tdip - ctime|mtime updated
3157  */
3158 /*ARGSUSED*/
3159 int
3160 zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm,
3161     cred_t *cr, int flags)
3162 {
3163         znode_t         *tdzp, *szp, *tzp;
3164         znode_t         *sdzp = ITOZ(sdip);
3165         zfs_sb_t        *zsb = ITOZSB(sdip);
3166         zilog_t         *zilog;
3167         zfs_dirlock_t   *sdl, *tdl;
3168         dmu_tx_t        *tx;
3169         zfs_zlock_t     *zl;
3170         int             cmp, serr, terr;
3171         int             error = 0;
3172         int             zflg = 0;
3173
3174         ZFS_ENTER(zsb);
3175         ZFS_VERIFY_ZP(sdzp);
3176         zilog = zsb->z_log;
3177
3178         if (tdip->i_sb != sdip->i_sb || zfsctl_is_node(tdip)) {
3179                 ZFS_EXIT(zsb);
3180                 return (SET_ERROR(EXDEV));
3181         }
3182
3183         tdzp = ITOZ(tdip);
3184         ZFS_VERIFY_ZP(tdzp);
3185         if (zsb->z_utf8 && u8_validate(tnm,
3186             strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3187                 ZFS_EXIT(zsb);
3188                 return (SET_ERROR(EILSEQ));
3189         }
3190
3191         if (flags & FIGNORECASE)
3192                 zflg |= ZCILOOK;
3193
3194 top:
3195         szp = NULL;
3196         tzp = NULL;
3197         zl = NULL;
3198
3199         /*
3200          * This is to prevent the creation of links into attribute space
3201          * by renaming a linked file into/outof an attribute directory.
3202          * See the comment in zfs_link() for why this is considered bad.
3203          */
3204         if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
3205                 ZFS_EXIT(zsb);
3206                 return (SET_ERROR(EINVAL));
3207         }
3208
3209         /*
3210          * Lock source and target directory entries.  To prevent deadlock,
3211          * a lock ordering must be defined.  We lock the directory with
3212          * the smallest object id first, or if it's a tie, the one with
3213          * the lexically first name.
3214          */
3215         if (sdzp->z_id < tdzp->z_id) {
3216                 cmp = -1;
3217         } else if (sdzp->z_id > tdzp->z_id) {
3218                 cmp = 1;
3219         } else {
3220                 /*
3221                  * First compare the two name arguments without
3222                  * considering any case folding.
3223                  */
3224                 int nofold = (zsb->z_norm & ~U8_TEXTPREP_TOUPPER);
3225
3226                 cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
3227                 ASSERT(error == 0 || !zsb->z_utf8);
3228                 if (cmp == 0) {
3229                         /*
3230                          * POSIX: "If the old argument and the new argument
3231                          * both refer to links to the same existing file,
3232                          * the rename() function shall return successfully
3233                          * and perform no other action."
3234                          */
3235                         ZFS_EXIT(zsb);
3236                         return (0);
3237                 }
3238                 /*
3239                  * If the file system is case-folding, then we may
3240                  * have some more checking to do.  A case-folding file
3241                  * system is either supporting mixed case sensitivity
3242                  * access or is completely case-insensitive.  Note
3243                  * that the file system is always case preserving.
3244                  *
3245                  * In mixed sensitivity mode case sensitive behavior
3246                  * is the default.  FIGNORECASE must be used to
3247                  * explicitly request case insensitive behavior.
3248                  *
3249                  * If the source and target names provided differ only
3250                  * by case (e.g., a request to rename 'tim' to 'Tim'),
3251                  * we will treat this as a special case in the
3252                  * case-insensitive mode: as long as the source name
3253                  * is an exact match, we will allow this to proceed as
3254                  * a name-change request.
3255                  */
3256                 if ((zsb->z_case == ZFS_CASE_INSENSITIVE ||
3257                     (zsb->z_case == ZFS_CASE_MIXED &&
3258                     flags & FIGNORECASE)) &&
3259                     u8_strcmp(snm, tnm, 0, zsb->z_norm, U8_UNICODE_LATEST,
3260                     &error) == 0) {
3261                         /*
3262                          * case preserving rename request, require exact
3263                          * name matches
3264                          */
3265                         zflg |= ZCIEXACT;
3266                         zflg &= ~ZCILOOK;
3267                 }
3268         }
3269
3270         /*
3271          * If the source and destination directories are the same, we should
3272          * grab the z_name_lock of that directory only once.
3273          */
3274         if (sdzp == tdzp) {
3275                 zflg |= ZHAVELOCK;
3276                 rw_enter(&sdzp->z_name_lock, RW_READER);
3277         }
3278
3279         if (cmp < 0) {
3280                 serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
3281                     ZEXISTS | zflg, NULL, NULL);
3282                 terr = zfs_dirent_lock(&tdl,
3283                     tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
3284         } else {
3285                 terr = zfs_dirent_lock(&tdl,
3286                     tdzp, tnm, &tzp, zflg, NULL, NULL);
3287                 serr = zfs_dirent_lock(&sdl,
3288                     sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
3289                     NULL, NULL);
3290         }
3291
3292         if (serr) {
3293                 /*
3294                  * Source entry invalid or not there.
3295                  */
3296                 if (!terr) {
3297                         zfs_dirent_unlock(tdl);
3298                         if (tzp)
3299                                 iput(ZTOI(tzp));
3300                 }
3301
3302                 if (sdzp == tdzp)
3303                         rw_exit(&sdzp->z_name_lock);
3304
3305                 if (strcmp(snm, "..") == 0)
3306                         serr = EINVAL;
3307                 ZFS_EXIT(zsb);
3308                 return (serr);
3309         }
3310         if (terr) {
3311                 zfs_dirent_unlock(sdl);
3312                 iput(ZTOI(szp));
3313
3314                 if (sdzp == tdzp)
3315                         rw_exit(&sdzp->z_name_lock);
3316
3317                 if (strcmp(tnm, "..") == 0)
3318                         terr = EINVAL;
3319                 ZFS_EXIT(zsb);
3320                 return (terr);
3321         }
3322
3323         /*
3324          * Must have write access at the source to remove the old entry
3325          * and write access at the target to create the new entry.
3326          * Note that if target and source are the same, this can be
3327          * done in a single check.
3328          */
3329
3330         if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)))
3331                 goto out;
3332
3333         if (S_ISDIR(ZTOI(szp)->i_mode)) {
3334                 /*
3335                  * Check to make sure rename is valid.
3336                  * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
3337                  */
3338                 if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl)))
3339                         goto out;
3340         }
3341
3342         /*
3343          * Does target exist?
3344          */
3345         if (tzp) {
3346                 /*
3347                  * Source and target must be the same type.
3348                  */
3349                 if (S_ISDIR(ZTOI(szp)->i_mode)) {
3350                         if (!S_ISDIR(ZTOI(tzp)->i_mode)) {
3351                                 error = SET_ERROR(ENOTDIR);
3352                                 goto out;
3353                         }
3354                 } else {
3355                         if (S_ISDIR(ZTOI(tzp)->i_mode)) {
3356                                 error = SET_ERROR(EISDIR);
3357                                 goto out;
3358                         }
3359                 }
3360                 /*
3361                  * POSIX dictates that when the source and target
3362                  * entries refer to the same file object, rename
3363                  * must do nothing and exit without error.
3364                  */
3365                 if (szp->z_id == tzp->z_id) {
3366                         error = 0;
3367                         goto out;
3368                 }
3369         }
3370
3371         tx = dmu_tx_create(zsb->z_os);
3372         dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3373         dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
3374         dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
3375         dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
3376         if (sdzp != tdzp) {
3377                 dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
3378                 zfs_sa_upgrade_txholds(tx, tdzp);
3379         }
3380         if (tzp) {
3381                 dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
3382                 zfs_sa_upgrade_txholds(tx, tzp);
3383         }
3384
3385         zfs_sa_upgrade_txholds(tx, szp);
3386         dmu_tx_hold_zap(tx, zsb->z_unlinkedobj, FALSE, NULL);
3387         error = dmu_tx_assign(tx, TXG_NOWAIT);
3388         if (error) {
3389                 if (zl != NULL)
3390                         zfs_rename_unlock(&zl);
3391                 zfs_dirent_unlock(sdl);
3392                 zfs_dirent_unlock(tdl);
3393
3394                 if (sdzp == tdzp)
3395                         rw_exit(&sdzp->z_name_lock);
3396
3397                 iput(ZTOI(szp));
3398                 if (tzp)
3399                         iput(ZTOI(tzp));
3400                 if (error == ERESTART) {
3401                         dmu_tx_wait(tx);
3402                         dmu_tx_abort(tx);
3403                         goto top;
3404                 }
3405                 dmu_tx_abort(tx);
3406                 ZFS_EXIT(zsb);
3407                 return (error);
3408         }
3409
3410         if (tzp)        /* Attempt to remove the existing target */
3411                 error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
3412
3413         if (error == 0) {
3414                 error = zfs_link_create(tdl, szp, tx, ZRENAMING);
3415                 if (error == 0) {
3416                         szp->z_pflags |= ZFS_AV_MODIFIED;
3417
3418                         error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zsb),
3419                             (void *)&szp->z_pflags, sizeof (uint64_t), tx);
3420                         ASSERT0(error);
3421
3422                         error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
3423                         if (error == 0) {
3424                                 zfs_log_rename(zilog, tx, TX_RENAME |
3425                                     (flags & FIGNORECASE ? TX_CI : 0), sdzp,
3426                                     sdl->dl_name, tdzp, tdl->dl_name, szp);
3427                         } else {
3428                                 /*
3429                                  * At this point, we have successfully created
3430                                  * the target name, but have failed to remove
3431                                  * the source name.  Since the create was done
3432                                  * with the ZRENAMING flag, there are
3433                                  * complications; for one, the link count is
3434                                  * wrong.  The easiest way to deal with this
3435                                  * is to remove the newly created target, and
3436                                  * return the original error.  This must
3437                                  * succeed; fortunately, it is very unlikely to
3438                                  * fail, since we just created it.
3439                                  */
3440                                 VERIFY3U(zfs_link_destroy(tdl, szp, tx,
3441                                     ZRENAMING, NULL), ==, 0);
3442                         }
3443                 }
3444         }
3445
3446         dmu_tx_commit(tx);
3447 out:
3448         if (zl != NULL)
3449                 zfs_rename_unlock(&zl);
3450
3451         zfs_dirent_unlock(sdl);
3452         zfs_dirent_unlock(tdl);
3453
3454         zfs_inode_update(sdzp);
3455         if (sdzp == tdzp)
3456                 rw_exit(&sdzp->z_name_lock);
3457
3458         if (sdzp != tdzp)
3459                 zfs_inode_update(tdzp);
3460
3461         zfs_inode_update(szp);
3462         iput(ZTOI(szp));
3463         if (tzp) {
3464                 zfs_inode_update(tzp);
3465                 iput(ZTOI(tzp));
3466         }
3467
3468         if (zsb->z_os->os_sync == ZFS_SYNC_ALWAYS)
3469                 zil_commit(zilog, 0);
3470
3471         ZFS_EXIT(zsb);
3472         return (error);
3473 }
3474 EXPORT_SYMBOL(zfs_rename);
3475
3476 /*
3477  * Insert the indicated symbolic reference entry into the directory.
3478  *
3479  *      IN:     dip     - Directory to contain new symbolic link.
3480  *              link    - Name for new symlink entry.
3481  *              vap     - Attributes of new entry.
3482  *              target  - Target path of new symlink.
3483  *
3484  *              cr      - credentials of caller.
3485  *              flags   - case flags
3486  *
3487  *      RETURN: 0 on success, error code on failure.
3488  *
3489  * Timestamps:
3490  *      dip - ctime|mtime updated
3491  */
3492 /*ARGSUSED*/
3493 int
3494 zfs_symlink(struct inode *dip, char *name, vattr_t *vap, char *link,
3495     struct inode **ipp, cred_t *cr, int flags)
3496 {
3497         znode_t         *zp, *dzp = ITOZ(dip);
3498         zfs_dirlock_t   *dl;
3499         dmu_tx_t        *tx;
3500         zfs_sb_t        *zsb = ITOZSB(dip);
3501         zilog_t         *zilog;
3502         uint64_t        len = strlen(link);
3503         int             error;
3504         int             zflg = ZNEW;
3505         zfs_acl_ids_t   acl_ids;
3506         boolean_t       fuid_dirtied;
3507         uint64_t        txtype = TX_SYMLINK;
3508
3509         ASSERT(S_ISLNK(vap->va_mode));
3510
3511         ZFS_ENTER(zsb);
3512         ZFS_VERIFY_ZP(dzp);
3513         zilog = zsb->z_log;
3514
3515         if (zsb->z_utf8 && u8_validate(name, strlen(name),
3516             NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3517                 ZFS_EXIT(zsb);
3518                 return (SET_ERROR(EILSEQ));
3519         }
3520         if (flags & FIGNORECASE)
3521                 zflg |= ZCILOOK;
3522
3523         if (len > MAXPATHLEN) {
3524                 ZFS_EXIT(zsb);
3525                 return (SET_ERROR(ENAMETOOLONG));
3526         }
3527
3528         if ((error = zfs_acl_ids_create(dzp, 0,
3529             vap, cr, NULL, &acl_ids)) != 0) {
3530                 ZFS_EXIT(zsb);
3531                 return (error);
3532         }
3533 top:
3534         *ipp = NULL;
3535
3536         /*
3537          * Attempt to lock directory; fail if entry already exists.
3538          */
3539         error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
3540         if (error) {
3541                 zfs_acl_ids_free(&acl_ids);
3542                 ZFS_EXIT(zsb);
3543                 return (error);
3544         }
3545
3546         if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
3547                 zfs_acl_ids_free(&acl_ids);
3548                 zfs_dirent_unlock(dl);
3549                 ZFS_EXIT(zsb);
3550                 return (error);
3551         }
3552
3553         if (zfs_acl_ids_overquota(zsb, &acl_ids)) {
3554                 zfs_acl_ids_free(&acl_ids);
3555                 zfs_dirent_unlock(dl);
3556                 ZFS_EXIT(zsb);
3557                 return (SET_ERROR(EDQUOT));
3558         }
3559         tx = dmu_tx_create(zsb->z_os);
3560         fuid_dirtied = zsb->z_fuid_dirty;
3561         dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
3562         dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
3563         dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
3564             ZFS_SA_BASE_ATTR_SIZE + len);
3565         dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
3566         if (!zsb->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3567                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
3568                     acl_ids.z_aclp->z_acl_bytes);
3569         }
3570         if (fuid_dirtied)
3571                 zfs_fuid_txhold(zsb, tx);
3572         error = dmu_tx_assign(tx, TXG_NOWAIT);
3573         if (error) {
3574                 zfs_dirent_unlock(dl);
3575                 if (error == ERESTART) {
3576                         dmu_tx_wait(tx);
3577                         dmu_tx_abort(tx);
3578                         goto top;
3579                 }
3580                 zfs_acl_ids_free(&acl_ids);
3581                 dmu_tx_abort(tx);
3582                 ZFS_EXIT(zsb);
3583                 return (error);
3584         }
3585
3586         /*
3587          * Create a new object for the symlink.
3588          * for version 4 ZPL datsets the symlink will be an SA attribute
3589          */
3590         zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
3591
3592         if (fuid_dirtied)
3593                 zfs_fuid_sync(zsb, tx);
3594
3595         mutex_enter(&zp->z_lock);
3596         if (zp->z_is_sa)
3597                 error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zsb),
3598                     link, len, tx);
3599         else
3600                 zfs_sa_symlink(zp, link, len, tx);
3601         mutex_exit(&zp->z_lock);
3602
3603         zp->z_size = len;
3604         (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zsb),
3605             &zp->z_size, sizeof (zp->z_size), tx);
3606         /*
3607          * Insert the new object into the directory.
3608          */
3609         (void) zfs_link_create(dl, zp, tx, ZNEW);
3610
3611         if (flags & FIGNORECASE)
3612                 txtype |= TX_CI;
3613         zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
3614
3615         zfs_inode_update(dzp);
3616         zfs_inode_update(zp);
3617
3618         zfs_acl_ids_free(&acl_ids);
3619
3620         dmu_tx_commit(tx);
3621
3622         zfs_dirent_unlock(dl);
3623
3624         *ipp = ZTOI(zp);
3625
3626         if (zsb->z_os->os_sync == ZFS_SYNC_ALWAYS)
3627                 zil_commit(zilog, 0);
3628
3629         ZFS_EXIT(zsb);
3630         return (error);
3631 }
3632 EXPORT_SYMBOL(zfs_symlink);
3633
3634 /*
3635  * Return, in the buffer contained in the provided uio structure,
3636  * the symbolic path referred to by ip.
3637  *
3638  *      IN:     ip      - inode of symbolic link
3639  *              uio     - structure to contain the link path.
3640  *              cr      - credentials of caller.
3641  *
3642  *      RETURN: 0 if success
3643  *              error code if failure
3644  *
3645  * Timestamps:
3646  *      ip - atime updated
3647  */
3648 /* ARGSUSED */
3649 int
3650 zfs_readlink(struct inode *ip, uio_t *uio, cred_t *cr)
3651 {
3652         znode_t         *zp = ITOZ(ip);
3653         zfs_sb_t        *zsb = ITOZSB(ip);
3654         int             error;
3655
3656         ZFS_ENTER(zsb);
3657         ZFS_VERIFY_ZP(zp);
3658
3659         mutex_enter(&zp->z_lock);
3660         if (zp->z_is_sa)
3661                 error = sa_lookup_uio(zp->z_sa_hdl,
3662                     SA_ZPL_SYMLINK(zsb), uio);
3663         else
3664                 error = zfs_sa_readlink(zp, uio);
3665         mutex_exit(&zp->z_lock);
3666
3667         ZFS_ACCESSTIME_STAMP(zsb, zp);
3668         zfs_inode_update(zp);
3669         ZFS_EXIT(zsb);
3670         return (error);
3671 }
3672 EXPORT_SYMBOL(zfs_readlink);
3673
3674 /*
3675  * Insert a new entry into directory tdip referencing sip.
3676  *
3677  *      IN:     tdip    - Directory to contain new entry.
3678  *              sip     - inode of new entry.
3679  *              name    - name of new entry.
3680  *              cr      - credentials of caller.
3681  *
3682  *      RETURN: 0 if success
3683  *              error code if failure
3684  *
3685  * Timestamps:
3686  *      tdip - ctime|mtime updated
3687  *       sip - ctime updated
3688  */
3689 /* ARGSUSED */
3690 int
3691 zfs_link(struct inode *tdip, struct inode *sip, char *name, cred_t *cr)
3692 {
3693         znode_t         *dzp = ITOZ(tdip);
3694         znode_t         *tzp, *szp;
3695         zfs_sb_t        *zsb = ITOZSB(tdip);
3696         zilog_t         *zilog;
3697         zfs_dirlock_t   *dl;
3698         dmu_tx_t        *tx;
3699         int             error;
3700         int             zf = ZNEW;
3701         uint64_t        parent;
3702         uid_t           owner;
3703
3704         ASSERT(S_ISDIR(tdip->i_mode));
3705
3706         ZFS_ENTER(zsb);
3707         ZFS_VERIFY_ZP(dzp);
3708         zilog = zsb->z_log;
3709
3710         /*
3711          * POSIX dictates that we return EPERM here.
3712          * Better choices include ENOTSUP or EISDIR.
3713          */
3714         if (S_ISDIR(sip->i_mode)) {
3715                 ZFS_EXIT(zsb);
3716                 return (SET_ERROR(EPERM));
3717         }
3718
3719         if (sip->i_sb != tdip->i_sb || zfsctl_is_node(sip)) {
3720                 ZFS_EXIT(zsb);
3721                 return (SET_ERROR(EXDEV));
3722         }
3723
3724         szp = ITOZ(sip);
3725         ZFS_VERIFY_ZP(szp);
3726
3727         /* Prevent links to .zfs/shares files */
3728
3729         if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zsb),
3730             &parent, sizeof (uint64_t))) != 0) {
3731                 ZFS_EXIT(zsb);
3732                 return (error);
3733         }
3734         if (parent == zsb->z_shares_dir) {
3735                 ZFS_EXIT(zsb);
3736                 return (SET_ERROR(EPERM));
3737         }
3738
3739         if (zsb->z_utf8 && u8_validate(name,
3740             strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3741                 ZFS_EXIT(zsb);
3742                 return (SET_ERROR(EILSEQ));
3743         }
3744 #ifdef HAVE_PN_UTILS
3745         if (flags & FIGNORECASE)
3746                 zf |= ZCILOOK;
3747 #endif /* HAVE_PN_UTILS */
3748
3749         /*
3750          * We do not support links between attributes and non-attributes
3751          * because of the potential security risk of creating links
3752          * into "normal" file space in order to circumvent restrictions
3753          * imposed in attribute space.
3754          */
3755         if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
3756                 ZFS_EXIT(zsb);
3757                 return (SET_ERROR(EINVAL));
3758         }
3759
3760         owner = zfs_fuid_map_id(zsb, szp->z_uid, cr, ZFS_OWNER);
3761         if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
3762                 ZFS_EXIT(zsb);
3763                 return (SET_ERROR(EPERM));
3764         }
3765
3766         if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
3767                 ZFS_EXIT(zsb);
3768                 return (error);
3769         }
3770
3771 top:
3772         /*
3773          * Attempt to lock directory; fail if entry already exists.
3774          */
3775         error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
3776         if (error) {
3777                 ZFS_EXIT(zsb);
3778                 return (error);
3779         }
3780
3781         tx = dmu_tx_create(zsb->z_os);
3782         dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3783         dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
3784         zfs_sa_upgrade_txholds(tx, szp);
3785         zfs_sa_upgrade_txholds(tx, dzp);
3786         error = dmu_tx_assign(tx, TXG_NOWAIT);
3787         if (error) {
3788                 zfs_dirent_unlock(dl);
3789                 if (error == ERESTART) {
3790                         dmu_tx_wait(tx);
3791                         dmu_tx_abort(tx);
3792                         goto top;
3793                 }
3794                 dmu_tx_abort(tx);
3795                 ZFS_EXIT(zsb);
3796                 return (error);
3797         }
3798
3799         error = zfs_link_create(dl, szp, tx, 0);
3800
3801         if (error == 0) {
3802                 uint64_t txtype = TX_LINK;
3803 #ifdef HAVE_PN_UTILS
3804                 if (flags & FIGNORECASE)
3805                         txtype |= TX_CI;
3806 #endif /* HAVE_PN_UTILS */
3807                 zfs_log_link(zilog, tx, txtype, dzp, szp, name);
3808         }
3809
3810         dmu_tx_commit(tx);
3811
3812         zfs_dirent_unlock(dl);
3813
3814         if (zsb->z_os->os_sync == ZFS_SYNC_ALWAYS)
3815                 zil_commit(zilog, 0);
3816
3817         zfs_inode_update(dzp);
3818         zfs_inode_update(szp);
3819         ZFS_EXIT(zsb);
3820         return (error);
3821 }
3822 EXPORT_SYMBOL(zfs_link);
3823
3824 static void
3825 zfs_putpage_commit_cb(void *arg, int error)
3826 {
3827         struct page *pp = arg;
3828
3829         if (error) {
3830                 __set_page_dirty_nobuffers(pp);
3831
3832                 if (error != ECANCELED)
3833                         SetPageError(pp);
3834         } else {
3835                 ClearPageError(pp);
3836         }
3837
3838         end_page_writeback(pp);
3839 }
3840
3841 /*
3842  * Push a page out to disk, once the page is on stable storage the
3843  * registered commit callback will be run as notification of completion.
3844  *
3845  *      IN:     ip      - page mapped for inode.
3846  *              pp      - page to push (page is locked)
3847  *              wbc     - writeback control data
3848  *
3849  *      RETURN: 0 if success
3850  *              error code if failure
3851  *
3852  * Timestamps:
3853  *      ip - ctime|mtime updated
3854  */
3855 /* ARGSUSED */
3856 int
3857 zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
3858 {
3859         znode_t         *zp = ITOZ(ip);
3860         zfs_sb_t        *zsb = ITOZSB(ip);
3861         loff_t          offset;
3862         loff_t          pgoff;
3863         unsigned int    pglen;
3864         rl_t            *rl;
3865         dmu_tx_t        *tx;
3866         caddr_t         va;
3867         int             err = 0;
3868         uint64_t        mtime[2], ctime[2];
3869         sa_bulk_attr_t  bulk[3];
3870         int             cnt = 0;
3871         int             sync;
3872
3873         ZFS_ENTER(zsb);
3874         ZFS_VERIFY_ZP(zp);
3875
3876         ASSERT(PageLocked(pp));
3877
3878         pgoff = page_offset(pp);     /* Page byte-offset in file */
3879         offset = i_size_read(ip);    /* File length in bytes */
3880         pglen = MIN(PAGE_CACHE_SIZE, /* Page length in bytes */
3881             P2ROUNDUP(offset, PAGE_CACHE_SIZE)-pgoff);
3882
3883         /* Page is beyond end of file */
3884         if (pgoff >= offset) {
3885                 unlock_page(pp);
3886                 ZFS_EXIT(zsb);
3887                 return (0);
3888         }
3889
3890         /* Truncate page length to end of file */
3891         if (pgoff + pglen > offset)
3892                 pglen = offset - pgoff;
3893
3894 #if 0
3895         /*
3896          * FIXME: Allow mmap writes past its quota.  The correct fix
3897          * is to register a page_mkwrite() handler to count the page
3898          * against its quota when it is about to be dirtied.
3899          */
3900         if (zfs_owner_overquota(zsb, zp, B_FALSE) ||
3901             zfs_owner_overquota(zsb, zp, B_TRUE)) {
3902                 err = EDQUOT;
3903         }
3904 #endif
3905
3906         set_page_writeback(pp);
3907         unlock_page(pp);
3908
3909         rl = zfs_range_lock(zp, pgoff, pglen, RL_WRITER);
3910         tx = dmu_tx_create(zsb->z_os);
3911
3912         sync = ((zsb->z_os->os_sync == ZFS_SYNC_ALWAYS) ||
3913                 (wbc->sync_mode == WB_SYNC_ALL));
3914         if (!sync)
3915                 dmu_tx_callback_register(tx, zfs_putpage_commit_cb, pp);
3916
3917         dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen);
3918
3919         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3920         zfs_sa_upgrade_txholds(tx, zp);
3921         err = dmu_tx_assign(tx, TXG_NOWAIT);
3922         if (err != 0) {
3923                 if (err == ERESTART)
3924                         dmu_tx_wait(tx);
3925
3926                 /* Will call all registered commit callbacks */
3927                 dmu_tx_abort(tx);
3928
3929                 /*
3930                  * For the synchronous case the commit callback must be
3931                  * explicitly called because there is no registered callback.
3932                  */
3933                 if (sync)
3934                         zfs_putpage_commit_cb(pp, ECANCELED);
3935
3936                 zfs_range_unlock(rl);
3937                 ZFS_EXIT(zsb);
3938                 return (err);
3939         }
3940
3941         va = kmap(pp);
3942         ASSERT3U(pglen, <=, PAGE_CACHE_SIZE);
3943         dmu_write(zsb->z_os, zp->z_id, pgoff, pglen, va, tx);
3944         kunmap(pp);
3945
3946         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zsb), NULL, &mtime, 16);
3947         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zsb), NULL, &ctime, 16);
3948         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zsb), NULL, &zp->z_pflags, 8);
3949
3950         /* Preserve the mtime and ctime provided by the inode */
3951         ZFS_TIME_ENCODE(&ip->i_mtime, mtime);
3952         ZFS_TIME_ENCODE(&ip->i_ctime, ctime);
3953         zp->z_atime_dirty = 0;
3954         zp->z_seq++;
3955
3956         err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
3957
3958         zfs_log_write(zsb->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0);
3959         dmu_tx_commit(tx);
3960
3961         zfs_range_unlock(rl);
3962
3963         if (sync) {
3964                 zil_commit(zsb->z_log, zp->z_id);
3965                 zfs_putpage_commit_cb(pp, err);
3966         }
3967
3968         ZFS_EXIT(zsb);
3969         return (err);
3970 }
3971
3972 /*
3973  * Update the system attributes when the inode has been dirtied.  For the
3974  * moment we only update the mode, atime, mtime, and ctime.
3975  */
3976 int
3977 zfs_dirty_inode(struct inode *ip, int flags)
3978 {
3979         znode_t         *zp = ITOZ(ip);
3980         zfs_sb_t        *zsb = ITOZSB(ip);
3981         dmu_tx_t        *tx;
3982         uint64_t        mode, atime[2], mtime[2], ctime[2];
3983         sa_bulk_attr_t  bulk[4];
3984         int             error;
3985         int             cnt = 0;
3986
3987         ZFS_ENTER(zsb);
3988         ZFS_VERIFY_ZP(zp);
3989
3990         tx = dmu_tx_create(zsb->z_os);
3991
3992         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3993         zfs_sa_upgrade_txholds(tx, zp);
3994
3995         error = dmu_tx_assign(tx, TXG_WAIT);
3996         if (error) {
3997                 dmu_tx_abort(tx);
3998                 goto out;
3999         }
4000
4001         mutex_enter(&zp->z_lock);
4002         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zsb), NULL, &mode, 8);
4003         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zsb), NULL, &atime, 16);
4004         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zsb), NULL, &mtime, 16);
4005         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zsb), NULL, &ctime, 16);
4006
4007         /* Preserve the mode, mtime and ctime provided by the inode */
4008         ZFS_TIME_ENCODE(&ip->i_atime, atime);
4009         ZFS_TIME_ENCODE(&ip->i_mtime, mtime);
4010         ZFS_TIME_ENCODE(&ip->i_ctime, ctime);
4011         mode = ip->i_mode;
4012
4013         zp->z_mode = mode;
4014         zp->z_atime_dirty = 0;
4015
4016         error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
4017         mutex_exit(&zp->z_lock);
4018
4019         dmu_tx_commit(tx);
4020 out:
4021         ZFS_EXIT(zsb);
4022         return (error);
4023 }
4024 EXPORT_SYMBOL(zfs_dirty_inode);
4025
4026 /*ARGSUSED*/
4027 void
4028 zfs_inactive(struct inode *ip)
4029 {
4030         znode_t *zp = ITOZ(ip);
4031         zfs_sb_t *zsb = ITOZSB(ip);
4032         int error;
4033
4034         if (zfsctl_is_node(ip)) {
4035                 zfsctl_inode_inactive(ip);
4036                 return;
4037         }
4038
4039         rw_enter(&zsb->z_teardown_inactive_lock, RW_READER);
4040         if (zp->z_sa_hdl == NULL) {
4041                 rw_exit(&zsb->z_teardown_inactive_lock);
4042                 return;
4043         }
4044
4045         if (zp->z_atime_dirty && zp->z_unlinked == 0) {
4046                 dmu_tx_t *tx = dmu_tx_create(zsb->z_os);
4047
4048                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4049                 zfs_sa_upgrade_txholds(tx, zp);
4050                 error = dmu_tx_assign(tx, TXG_WAIT);
4051                 if (error) {
4052                         dmu_tx_abort(tx);
4053                 } else {
4054                         mutex_enter(&zp->z_lock);
4055                         (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zsb),
4056                             (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
4057                         zp->z_atime_dirty = 0;
4058                         mutex_exit(&zp->z_lock);
4059                         dmu_tx_commit(tx);
4060                 }
4061         }
4062
4063         zfs_zinactive(zp);
4064         rw_exit(&zsb->z_teardown_inactive_lock);
4065 }
4066 EXPORT_SYMBOL(zfs_inactive);
4067
4068 /*
4069  * Bounds-check the seek operation.
4070  *
4071  *      IN:     ip      - inode seeking within
4072  *              ooff    - old file offset
4073  *              noffp   - pointer to new file offset
4074  *              ct      - caller context
4075  *
4076  *      RETURN: 0 if success
4077  *              EINVAL if new offset invalid
4078  */
4079 /* ARGSUSED */
4080 int
4081 zfs_seek(struct inode *ip, offset_t ooff, offset_t *noffp)
4082 {
4083         if (S_ISDIR(ip->i_mode))
4084                 return (0);
4085         return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
4086 }
4087 EXPORT_SYMBOL(zfs_seek);
4088
4089 /*
4090  * Fill pages with data from the disk.
4091  */
4092 static int
4093 zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages)
4094 {
4095         znode_t     *zp = ITOZ(ip);
4096         zfs_sb_t    *zsb = ITOZSB(ip);
4097         objset_t    *os;
4098         struct page *cur_pp;
4099         u_offset_t  io_off, total;
4100         size_t      io_len;
4101         loff_t      i_size;
4102         unsigned    page_idx;
4103         int         err;
4104
4105         os     = zsb->z_os;
4106         io_len = nr_pages << PAGE_CACHE_SHIFT;
4107         i_size = i_size_read(ip);
4108         io_off = page_offset(pl[0]);
4109
4110         if (io_off + io_len > i_size)
4111                 io_len = i_size - io_off;
4112
4113         /*
4114          * Iterate over list of pages and read each page individually.
4115          */
4116         page_idx = 0;
4117         cur_pp   = pl[0];
4118         for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
4119                 caddr_t va;
4120
4121                 va = kmap(cur_pp);
4122                 err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
4123                     DMU_READ_PREFETCH);
4124                 kunmap(cur_pp);
4125                 if (err) {
4126                         /* convert checksum errors into IO errors */
4127                         if (err == ECKSUM)
4128                                 err = SET_ERROR(EIO);
4129                         return (err);
4130                 }
4131                 cur_pp = pl[++page_idx];
4132         }
4133
4134         return (0);
4135 }
4136
4137 /*
4138  * Uses zfs_fillpage to read data from the file and fill the pages.
4139  *
4140  *      IN:     ip       - inode of file to get data from.
4141  *              pl       - list of pages to read
4142  *              nr_pages - number of pages to read
4143  *
4144  *      RETURN: 0 on success, error code on failure.
4145  *
4146  * Timestamps:
4147  *      vp - atime updated
4148  */
4149 /* ARGSUSED */
4150 int
4151 zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages)
4152 {
4153         znode_t  *zp  = ITOZ(ip);
4154         zfs_sb_t *zsb = ITOZSB(ip);
4155         int      err;
4156
4157         if (pl == NULL)
4158                 return (0);
4159
4160         ZFS_ENTER(zsb);
4161         ZFS_VERIFY_ZP(zp);
4162
4163         err = zfs_fillpage(ip, pl, nr_pages);
4164
4165         if (!err)
4166                 ZFS_ACCESSTIME_STAMP(zsb, zp);
4167
4168         ZFS_EXIT(zsb);
4169         return (err);
4170 }
4171 EXPORT_SYMBOL(zfs_getpage);
4172
4173 /*
4174  * Check ZFS specific permissions to memory map a section of a file.
4175  *
4176  *      IN:     ip      - inode of the file to mmap
4177  *              off     - file offset
4178  *              addrp   - start address in memory region
4179  *              len     - length of memory region
4180  *              vm_flags- address flags
4181  *
4182  *      RETURN: 0 if success
4183  *              error code if failure
4184  */
4185 /*ARGSUSED*/
4186 int
4187 zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len,
4188     unsigned long vm_flags)
4189 {
4190         znode_t  *zp = ITOZ(ip);
4191         zfs_sb_t *zsb = ITOZSB(ip);
4192
4193         ZFS_ENTER(zsb);
4194         ZFS_VERIFY_ZP(zp);
4195
4196         if ((vm_flags & VM_WRITE) && (zp->z_pflags &
4197             (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
4198                 ZFS_EXIT(zsb);
4199                 return (SET_ERROR(EPERM));
4200         }
4201
4202         if ((vm_flags & (VM_READ | VM_EXEC)) &&
4203             (zp->z_pflags & ZFS_AV_QUARANTINED)) {
4204                 ZFS_EXIT(zsb);
4205                 return (SET_ERROR(EACCES));
4206         }
4207
4208         if (off < 0 || len > MAXOFFSET_T - off) {
4209                 ZFS_EXIT(zsb);
4210                 return (SET_ERROR(ENXIO));
4211         }
4212
4213         ZFS_EXIT(zsb);
4214         return (0);
4215 }
4216 EXPORT_SYMBOL(zfs_map);
4217
4218 /*
4219  * convoff - converts the given data (start, whence) to the
4220  * given whence.
4221  */
4222 int
4223 convoff(struct inode *ip, flock64_t *lckdat, int  whence, offset_t offset)
4224 {
4225         vattr_t vap;
4226         int error;
4227
4228         if ((lckdat->l_whence == 2) || (whence == 2)) {
4229                 if ((error = zfs_getattr(ip, &vap, 0, CRED()) != 0))
4230                         return (error);
4231         }
4232
4233         switch (lckdat->l_whence) {
4234         case 1:
4235                 lckdat->l_start += offset;
4236                 break;
4237         case 2:
4238                 lckdat->l_start += vap.va_size;
4239                 /* FALLTHRU */
4240         case 0:
4241                 break;
4242         default:
4243                 return (SET_ERROR(EINVAL));
4244         }
4245
4246         if (lckdat->l_start < 0)
4247                 return (SET_ERROR(EINVAL));
4248
4249         switch (whence) {
4250         case 1:
4251                 lckdat->l_start -= offset;
4252                 break;
4253         case 2:
4254                 lckdat->l_start -= vap.va_size;
4255                 /* FALLTHRU */
4256         case 0:
4257                 break;
4258         default:
4259                 return (SET_ERROR(EINVAL));
4260         }
4261
4262         lckdat->l_whence = (short)whence;
4263         return (0);
4264 }
4265
4266 /*
4267  * Free or allocate space in a file.  Currently, this function only
4268  * supports the `F_FREESP' command.  However, this command is somewhat
4269  * misnamed, as its functionality includes the ability to allocate as
4270  * well as free space.
4271  *
4272  *      IN:     ip      - inode of file to free data in.
4273  *              cmd     - action to take (only F_FREESP supported).
4274  *              bfp     - section of file to free/alloc.
4275  *              flag    - current file open mode flags.
4276  *              offset  - current file offset.
4277  *              cr      - credentials of caller [UNUSED].
4278  *
4279  *      RETURN: 0 on success, error code on failure.
4280  *
4281  * Timestamps:
4282  *      ip - ctime|mtime updated
4283  */
4284 /* ARGSUSED */
4285 int
4286 zfs_space(struct inode *ip, int cmd, flock64_t *bfp, int flag,
4287     offset_t offset, cred_t *cr)
4288 {
4289         znode_t         *zp = ITOZ(ip);
4290         zfs_sb_t        *zsb = ITOZSB(ip);
4291         uint64_t        off, len;
4292         int             error;
4293
4294         ZFS_ENTER(zsb);
4295         ZFS_VERIFY_ZP(zp);
4296
4297         if (cmd != F_FREESP) {
4298                 ZFS_EXIT(zsb);
4299                 return (SET_ERROR(EINVAL));
4300         }
4301
4302         if ((error = convoff(ip, bfp, 0, offset))) {
4303                 ZFS_EXIT(zsb);
4304                 return (error);
4305         }
4306
4307         if (bfp->l_len < 0) {
4308                 ZFS_EXIT(zsb);
4309                 return (SET_ERROR(EINVAL));
4310         }
4311
4312         /*
4313          * Permissions aren't checked on Solaris because on this OS
4314          * zfs_space() can only be called with an opened file handle.
4315          * On Linux we can get here through truncate_range() which
4316          * operates directly on inodes, so we need to check access rights.
4317          */
4318         if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) {
4319                 ZFS_EXIT(zsb);
4320                 return (error);
4321         }
4322
4323         off = bfp->l_start;
4324         len = bfp->l_len; /* 0 means from off to end of file */
4325
4326         error = zfs_freesp(zp, off, len, flag, TRUE);
4327
4328         ZFS_EXIT(zsb);
4329         return (error);
4330 }
4331 EXPORT_SYMBOL(zfs_space);
4332
4333 /*ARGSUSED*/
4334 int
4335 zfs_fid(struct inode *ip, fid_t *fidp)
4336 {
4337         znode_t         *zp = ITOZ(ip);
4338         zfs_sb_t        *zsb = ITOZSB(ip);
4339         uint32_t        gen;
4340         uint64_t        gen64;
4341         uint64_t        object = zp->z_id;
4342         zfid_short_t    *zfid;
4343         int             size, i, error;
4344
4345         ZFS_ENTER(zsb);
4346         ZFS_VERIFY_ZP(zp);
4347
4348         if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zsb),
4349             &gen64, sizeof (uint64_t))) != 0) {
4350                 ZFS_EXIT(zsb);
4351                 return (error);
4352         }
4353
4354         gen = (uint32_t)gen64;
4355
4356         size = (zsb->z_parent != zsb) ? LONG_FID_LEN : SHORT_FID_LEN;
4357         if (fidp->fid_len < size) {
4358                 fidp->fid_len = size;
4359                 ZFS_EXIT(zsb);
4360                 return (SET_ERROR(ENOSPC));
4361         }
4362
4363         zfid = (zfid_short_t *)fidp;
4364
4365         zfid->zf_len = size;
4366
4367         for (i = 0; i < sizeof (zfid->zf_object); i++)
4368                 zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
4369
4370         /* Must have a non-zero generation number to distinguish from .zfs */
4371         if (gen == 0)
4372                 gen = 1;
4373         for (i = 0; i < sizeof (zfid->zf_gen); i++)
4374                 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
4375
4376         if (size == LONG_FID_LEN) {
4377                 uint64_t        objsetid = dmu_objset_id(zsb->z_os);
4378                 zfid_long_t     *zlfid;
4379
4380                 zlfid = (zfid_long_t *)fidp;
4381
4382                 for (i = 0; i < sizeof (zlfid->zf_setid); i++)
4383                         zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
4384
4385                 /* XXX - this should be the generation number for the objset */
4386                 for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
4387                         zlfid->zf_setgen[i] = 0;
4388         }
4389
4390         ZFS_EXIT(zsb);
4391         return (0);
4392 }
4393 EXPORT_SYMBOL(zfs_fid);
4394
4395 /*ARGSUSED*/
4396 int
4397 zfs_getsecattr(struct inode *ip, vsecattr_t *vsecp, int flag, cred_t *cr)
4398 {
4399         znode_t *zp = ITOZ(ip);
4400         zfs_sb_t *zsb = ITOZSB(ip);
4401         int error;
4402         boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4403
4404         ZFS_ENTER(zsb);
4405         ZFS_VERIFY_ZP(zp);
4406         error = zfs_getacl(zp, vsecp, skipaclchk, cr);
4407         ZFS_EXIT(zsb);
4408
4409         return (error);
4410 }
4411 EXPORT_SYMBOL(zfs_getsecattr);
4412
4413 /*ARGSUSED*/
4414 int
4415 zfs_setsecattr(struct inode *ip, vsecattr_t *vsecp, int flag, cred_t *cr)
4416 {
4417         znode_t *zp = ITOZ(ip);
4418         zfs_sb_t *zsb = ITOZSB(ip);
4419         int error;
4420         boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4421         zilog_t *zilog = zsb->z_log;
4422
4423         ZFS_ENTER(zsb);
4424         ZFS_VERIFY_ZP(zp);
4425
4426         error = zfs_setacl(zp, vsecp, skipaclchk, cr);
4427
4428         if (zsb->z_os->os_sync == ZFS_SYNC_ALWAYS)
4429                 zil_commit(zilog, 0);
4430
4431         ZFS_EXIT(zsb);
4432         return (error);
4433 }
4434 EXPORT_SYMBOL(zfs_setsecattr);
4435
4436 #ifdef HAVE_UIO_ZEROCOPY
/*
 * Zero-copy read tunables; both must be a power of 2.  zfs_reqzcbuf()
 * clamps the block size it uses for read requests to this range.
 *
 * zcr_blksz_min: the smallest read we may consider to loan out an arcbuf
 * zcr_blksz_max: if set to less than the file block size, allow loaning out of
 *              an arcbuf for a partial block read
 */
int zcr_blksz_min = 1024;               /* 1K */
int zcr_blksz_max = 128 * 1024;         /* 128K */
4446
4447 /*ARGSUSED*/
4448 static int
4449 zfs_reqzcbuf(struct inode *ip, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr)
4450 {
4451         znode_t *zp = ITOZ(ip);
4452         zfs_sb_t *zsb = ITOZSB(ip);
4453         int max_blksz = zsb->z_max_blksz;
4454         uio_t *uio = &xuio->xu_uio;
4455         ssize_t size = uio->uio_resid;
4456         offset_t offset = uio->uio_loffset;
4457         int blksz;
4458         int fullblk, i;
4459         arc_buf_t *abuf;
4460         ssize_t maxsize;
4461         int preamble, postamble;
4462
4463         if (xuio->xu_type != UIOTYPE_ZEROCOPY)
4464                 return (SET_ERROR(EINVAL));
4465
4466         ZFS_ENTER(zsb);
4467         ZFS_VERIFY_ZP(zp);
4468         switch (ioflag) {
4469         case UIO_WRITE:
4470                 /*
4471                  * Loan out an arc_buf for write if write size is bigger than
4472                  * max_blksz, and the file's block size is also max_blksz.
4473                  */
4474                 blksz = max_blksz;
4475                 if (size < blksz || zp->z_blksz != blksz) {
4476                         ZFS_EXIT(zsb);
4477                         return (SET_ERROR(EINVAL));
4478                 }
4479                 /*
4480                  * Caller requests buffers for write before knowing where the
4481                  * write offset might be (e.g. NFS TCP write).
4482                  */
4483                 if (offset == -1) {
4484                         preamble = 0;
4485                 } else {
4486                         preamble = P2PHASE(offset, blksz);
4487                         if (preamble) {
4488                                 preamble = blksz - preamble;
4489                                 size -= preamble;
4490                         }
4491                 }
4492
4493                 postamble = P2PHASE(size, blksz);
4494                 size -= postamble;
4495
4496                 fullblk = size / blksz;
4497                 (void) dmu_xuio_init(xuio,
4498                     (preamble != 0) + fullblk + (postamble != 0));
4499
4500                 /*
4501                  * Have to fix iov base/len for partial buffers.  They
4502                  * currently represent full arc_buf's.
4503                  */
4504                 if (preamble) {
4505                         /* data begins in the middle of the arc_buf */
4506                         abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
4507                             blksz);
4508                         ASSERT(abuf);
4509                         (void) dmu_xuio_add(xuio, abuf,
4510                             blksz - preamble, preamble);
4511                 }
4512
4513                 for (i = 0; i < fullblk; i++) {
4514                         abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
4515                             blksz);
4516                         ASSERT(abuf);
4517                         (void) dmu_xuio_add(xuio, abuf, 0, blksz);
4518                 }
4519
4520                 if (postamble) {
4521                         /* data ends in the middle of the arc_buf */
4522                         abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
4523                             blksz);
4524                         ASSERT(abuf);
4525                         (void) dmu_xuio_add(xuio, abuf, 0, postamble);
4526                 }
4527                 break;
4528         case UIO_READ:
4529                 /*
4530                  * Loan out an arc_buf for read if the read size is larger than
4531                  * the current file block size.  Block alignment is not
4532                  * considered.  Partial arc_buf will be loaned out for read.
4533                  */
4534                 blksz = zp->z_blksz;
4535                 if (blksz < zcr_blksz_min)
4536                         blksz = zcr_blksz_min;
4537                 if (blksz > zcr_blksz_max)
4538                         blksz = zcr_blksz_max;
4539                 /* avoid potential complexity of dealing with it */
4540                 if (blksz > max_blksz) {
4541                         ZFS_EXIT(zsb);
4542                         return (SET_ERROR(EINVAL));
4543                 }
4544
4545                 maxsize = zp->z_size - uio->uio_loffset;
4546                 if (size > maxsize)
4547                         size = maxsize;
4548
4549                 if (size < blksz) {
4550                         ZFS_EXIT(zsb);
4551                         return (SET_ERROR(EINVAL));
4552                 }
4553                 break;
4554         default:
4555                 ZFS_EXIT(zsb);
4556                 return (SET_ERROR(EINVAL));
4557         }
4558
4559         uio->uio_extflg = UIO_XUIO;
4560         XUIO_XUZC_RW(xuio) = ioflag;
4561         ZFS_EXIT(zsb);
4562         return (0);
4563 }
4564
4565 /*ARGSUSED*/
4566 static int
4567 zfs_retzcbuf(struct inode *ip, xuio_t *xuio, cred_t *cr)
4568 {
4569         int i;
4570         arc_buf_t *abuf;
4571         int ioflag = XUIO_XUZC_RW(xuio);
4572
4573         ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY);
4574
4575         i = dmu_xuio_cnt(xuio);
4576         while (i-- > 0) {
4577                 abuf = dmu_xuio_arcbuf(xuio, i);
4578                 /*
4579                  * if abuf == NULL, it must be a write buffer
4580                  * that has been returned in zfs_write().
4581                  */
4582                 if (abuf)
4583                         dmu_return_arcbuf(abuf);
4584                 ASSERT(abuf || ioflag == UIO_WRITE);
4585         }
4586
4587         dmu_xuio_fini(xuio);
4588         return (0);
4589 }
4590 #endif /* HAVE_UIO_ZEROCOPY */
4591
#if defined(_KERNEL) && defined(HAVE_SPL)
/*
 * Register zfs_read_chunk_size as a module parameter of type long with
 * permission bits 0644, together with its description string.
 */
module_param(zfs_read_chunk_size, long, 0644);
MODULE_PARM_DESC(zfs_read_chunk_size, "Bytes to read per chunk");
#endif