/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
 * Copyright 2017 Nexenta Systems, Inc.
 * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek
 */

/* Portions Copyright 2007 Jeremy Teo */
/* Portions Copyright 2010 Robert Milkowski */
#include <sys/types.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/uio_impl.h>
#include <sys/cmn_err.h>
#include <sys/errno.h>
#include <sys/zfs_dir.h>
#include <sys/zfs_acl.h>
#include <sys/zfs_ioctl.h>
#include <sys/fs/zfs.h>
#include <sys/dmu_objset.h>
#include <sys/policy.h>
#include <sys/zfeature.h>
#include <sys/zfs_vnops.h>
#include <sys/zfs_quota.h>
#include <sys/zfs_vfsops.h>
#include <sys/zfs_znode.h>
static ulong_t zfs_fsync_sync_cnt = 4;

int
zfs_fsync(znode_t *zp, int syncflag, cred_t *cr)
{
	int error = 0;
	zfsvfs_t *zfsvfs = ZTOZSB(zp);

	(void) tsd_set(zfs_fsyncer_key, (void *)(uintptr_t)zfs_fsync_sync_cnt);

	if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
		if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
			goto out;
		atomic_inc_32(&zp->z_sync_writes_cnt);
		zil_commit(zfsvfs->z_log, zp->z_id);
		atomic_dec_32(&zp->z_sync_writes_cnt);
		zfs_exit(zfsvfs, FTAG);
	}
out:
	tsd_set(zfs_fsyncer_key, NULL);

	return (error);
}
#if defined(SEEK_HOLE) && defined(SEEK_DATA)
/*
 * Lseek support for finding holes (cmd == SEEK_HOLE) and
 * data (cmd == SEEK_DATA). "off" is an in/out parameter.
 */
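/*
 * Illustrative usage note (added commentary, not from the original header):
 * userspace typically reaches this code via lseek(2), for example
 *
 *	off_t data = lseek(fd, 0, SEEK_DATA);		(first data region)
 *	off_t hole = lseek(fd, data, SEEK_HOLE);	(first hole after it)
 *
 * As the code below shows, ENXIO is returned once the starting offset is
 * at or beyond end-of-file.
 */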
static int
zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off)
{
	zfs_locked_range_t *lr;
	uint64_t noff = (uint64_t)*off; /* new offset */
	uint64_t file_sz;
	int error;
	boolean_t hole;

	file_sz = zp->z_size;
	if (noff >= file_sz) {
		return (SET_ERROR(ENXIO));
	}

	if (cmd == F_SEEK_HOLE)
		hole = B_TRUE;
	else
		hole = B_FALSE;

	/* Flush any mmap()'d data to disk */
	if (zn_has_cached_data(zp, 0, file_sz - 1))
		zn_flush_cached_data(zp, B_FALSE);

	lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_READER);
	error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff);
	zfs_rangelock_exit(lr);

	if (error == ESRCH)
		return (SET_ERROR(ENXIO));

	/* File was dirty, so fall back to using generic logic */
	if (error == EBUSY) {
		if (hole)
			*off = file_sz;
		return (0);
	}

	/*
	 * We could find a hole that begins after the logical end-of-file,
	 * because dmu_offset_next() only works on whole blocks. If the
	 * EOF falls mid-block, then indicate that the "virtual hole"
	 * at the end of the file begins at the logical EOF, rather than
	 * at the end of the last block.
	 */
	if (noff > file_sz) {
		ASSERT(hole);
		noff = file_sz;
	}

	if (noff < *off)
		return (error);
	*off = noff;
	return (error);
}
int
zfs_holey(znode_t *zp, ulong_t cmd, loff_t *off)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	int error;

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	error = zfs_holey_common(zp, cmd, off);

	zfs_exit(zfsvfs, FTAG);
	return (error);
}
#endif /* SEEK_HOLE && SEEK_DATA */
int
zfs_access(znode_t *zp, int mode, int flag, cred_t *cr)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	int error;

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	if (flag & V_ACE_MASK)
#if defined(__linux__)
		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr,
		    zfs_init_idmap);
#else
		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr,
		    NULL);
#endif
	else
#if defined(__linux__)
		error = zfs_zaccess_rwx(zp, mode, flag, cr, zfs_init_idmap);
#else
		error = zfs_zaccess_rwx(zp, mode, flag, cr, NULL);
#endif

	zfs_exit(zfsvfs, FTAG);
	return (error);
}
static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024; /* Tunable */
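/*
 * Added note: zfs_read() below issues its DMU reads in pieces of at most
 * zfs_vnops_read_chunk_size bytes, aligned so that each piece ends on a
 * chunk-size boundary (see the nbytes computation in the read loop). The
 * value is exported as a module parameter at the bottom of this file.
 */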
/*
 * Read bytes from specified file into supplied buffer.
 *
 *	IN:	zp	- inode of file to be read from.
 *		uio	- structure supplying read location, range info,
 *			  and return buffer.
 *		ioflag	- O_SYNC flags; used to provide FRSYNC semantics.
 *			  O_DIRECT flag; used to bypass page cache.
 *		cr	- credentials of caller.
 *
 *	OUT:	uio	- updated offset and range, buffer filled.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Side Effects:
 *	inode - atime updated if byte count > 0
 */
int
zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
{
	(void) cr;
	int error = 0;
	boolean_t frsync = B_FALSE;

	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EACCES));
	}

	/* We don't copy out anything useful for directories. */
	if (Z_ISDIR(ZTOTYPE(zp))) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EISDIR));
	}

	/*
	 * Validate file offset
	 */
	if (zfs_uio_offset(uio) < (offset_t)0) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Fasttrack empty reads
	 */
	if (zfs_uio_resid(uio) == 0) {
		zfs_exit(zfsvfs, FTAG);
		return (0);
	}

#ifdef FRSYNC
	/*
	 * If we're in FRSYNC mode, sync out this znode before reading it.
	 * Only do this for non-snapshots.
	 *
	 * Some platforms do not support FRSYNC and instead map it
	 * to O_SYNC, which results in unnecessary calls to zil_commit. We
	 * only honor FRSYNC requests on platforms which support it.
	 */
	frsync = !!(ioflag & FRSYNC);
#endif
	if (zfsvfs->z_log &&
	    (frsync || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
		zil_commit(zfsvfs->z_log, zp->z_id);

	/*
	 * Lock the range against changes.
	 */
	zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock,
	    zfs_uio_offset(uio), zfs_uio_resid(uio), RL_READER);

	/*
	 * If we are reading past end-of-file we can skip
	 * to the end; but we might still need to set atime.
	 */
	if (zfs_uio_offset(uio) >= zp->z_size) {
		error = 0;
		goto out;
	}

	ASSERT(zfs_uio_offset(uio) < zp->z_size);
#if defined(__linux__)
	ssize_t start_offset = zfs_uio_offset(uio);
#endif
	ssize_t n = MIN(zfs_uio_resid(uio), zp->z_size - zfs_uio_offset(uio));
	ssize_t start_resid = n;

	while (n > 0) {
		ssize_t nbytes = MIN(n, zfs_vnops_read_chunk_size -
		    P2PHASE(zfs_uio_offset(uio), zfs_vnops_read_chunk_size));
#ifdef UIO_NOCOPY
		if (zfs_uio_segflg(uio) == UIO_NOCOPY)
			error = mappedread_sf(zp, nbytes, uio);
		else
#endif
		if (zn_has_cached_data(zp, zfs_uio_offset(uio),
		    zfs_uio_offset(uio) + nbytes - 1) && !(ioflag & O_DIRECT)) {
			error = mappedread(zp, nbytes, uio);
		} else {
			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, nbytes);
		}

		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);

#if defined(__linux__)
			/*
			 * if we actually read some bytes, bubbling EFAULT
			 * up to become EAGAIN isn't what we want here...
			 *
			 * ...on Linux, at least. On FBSD, doing this breaks.
			 */
			if (error == EFAULT &&
			    (zfs_uio_offset(uio) - start_offset) != 0)
				error = 0;
#endif
			break;
		}

		n -= nbytes;
	}

	int64_t nread = start_resid - n;
	dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread);
	task_io_account_read(nread);
out:
	zfs_rangelock_exit(lr);

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	zfs_exit(zfsvfs, FTAG);
	return (error);
}
static void
zfs_clear_setid_bits_if_necessary(zfsvfs_t *zfsvfs, znode_t *zp, cred_t *cr,
    uint64_t *clear_setid_bits_txgp, dmu_tx_t *tx)
{
	zilog_t *zilog = zfsvfs->z_log;
	const uint64_t uid = KUID_TO_SUID(ZTOUID(zp));

	ASSERT(clear_setid_bits_txgp != NULL);
	ASSERT(tx != NULL);

	/*
	 * Clear Set-UID/Set-GID bits on successful write if not
	 * privileged and at least one of the execute bits is set.
	 *
	 * It would be nice to do this after all writes have
	 * been done, but that would still expose the ISUID/ISGID
	 * to another app after the partial write is committed.
	 *
	 * Note: we don't call zfs_fuid_map_id() here because
	 * user 0 is not an ephemeral uid.
	 */
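	/*
	 * Added clarification: S_IXUSR is 0100, so S_IXUSR >> 3 and
	 * S_IXUSR >> 6 are S_IXGRP (0010) and S_IXOTH (0001). The first
	 * condition below therefore just tests whether any execute bit
	 * is set on the file.
	 */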
	mutex_enter(&zp->z_acl_lock);
	if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6))) != 0 &&
	    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
	    secpolicy_vnode_setid_retain(zp, cr,
	    ((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) {
		uint64_t newmode;

		zp->z_mode &= ~(S_ISUID | S_ISGID);
		newmode = zp->z_mode;
		(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
		    (void *)&newmode, sizeof (uint64_t), tx);

		mutex_exit(&zp->z_acl_lock);

		/*
		 * Make sure SUID/SGID bits will be removed when we replay the
		 * log. If the setid bits keep coming back, don't log more
		 * than one TX_SETATTR per transaction group.
		 */
		if (*clear_setid_bits_txgp != dmu_tx_get_txg(tx)) {
			vattr_t va = {0};

			va.va_mask = ATTR_MODE;
			va.va_nodeid = zp->z_id;
			va.va_mode = newmode;
			zfs_log_setattr(zilog, tx, TX_SETATTR, zp, &va,
			    ATTR_MODE, NULL);
			*clear_setid_bits_txgp = dmu_tx_get_txg(tx);
		}
	} else {
		mutex_exit(&zp->z_acl_lock);
	}
}
/*
 * Write the bytes to a file.
 *
 *	IN:	zp	- znode of file to be written to.
 *		uio	- structure supplying write location, range info,
 *			  and data buffer.
 *		ioflag	- O_APPEND flag set if in append mode.
 *			  O_DIRECT flag; used to bypass page cache.
 *		cr	- credentials of caller.
 *
 *	OUT:	uio	- updated offset and range.
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	ip - ctime|mtime updated if byte count > 0
 */
int
zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
{
	int error = 0, error1;
	ssize_t start_resid = zfs_uio_resid(uio);
	uint64_t clear_setid_bits_txg = 0;

	/*
	 * Fasttrack empty write
	 */
	ssize_t n = start_resid;
	if (n == 0)
		return (0);

	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	sa_bulk_attr_t bulk[4];
	int count = 0;
	uint64_t mtime[2], ctime[2];
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &zp->z_size, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, 8);

	/*
	 * Callers might not be able to detect properly that we are read-only,
	 * so check it explicitly here.
	 */
	if (zfs_is_readonly(zfsvfs)) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EROFS));
	}

	/*
	 * If immutable or not appending then return EPERM.
	 * Intentionally allow ZFS_READONLY through here.
	 * See zfs_zaccess_common()
	 */
	if ((zp->z_pflags & ZFS_IMMUTABLE) ||
	    ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & O_APPEND) &&
	    (zfs_uio_offset(uio) < zp->z_size))) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EPERM));
	}

	/*
	 * Validate file offset
	 */
	offset_t woff = ioflag & O_APPEND ? zp->z_size : zfs_uio_offset(uio);
	if (woff < 0) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EINVAL));
	}

	const uint64_t max_blksz = zfsvfs->z_max_blksz;

	/*
	 * Pre-fault the pages to ensure slow (eg NFS) pages
	 * don't hold up txg.
	 * Skip this if uio contains loaned arc_buf.
	 */
	if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EFAULT));
	}

	/*
	 * If in append mode, set the io offset pointer to eof.
	 */
	zfs_locked_range_t *lr;
	if (ioflag & O_APPEND) {
		/*
		 * Obtain an appending range lock to guarantee file append
		 * semantics. We reset the write offset once we have the lock.
		 */
		lr = zfs_rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND);
		woff = lr->lr_offset;
		if (lr->lr_length == UINT64_MAX) {
			/*
			 * We overlocked the file because this write will cause
			 * the file block size to increase.
			 * Note that zp_size cannot change with this lock held.
			 */
			woff = zp->z_size;
		}
		zfs_uio_setoffset(uio, woff);
	} else {
		/*
		 * Note that if the file block size will change as a result of
		 * this write, then this range lock will lock the entire file
		 * so that we can re-write the block safely.
		 */
		lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER);
	}

	if (zn_rlimit_fsize_uio(zp, uio)) {
		zfs_rangelock_exit(lr);
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EFBIG));
	}

	const rlim64_t limit = MAXOFFSET_T;

	if (woff >= limit) {
		zfs_rangelock_exit(lr);
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EFBIG));
	}

	if (n > limit - woff)
		n = limit - woff;

	uint64_t end_size = MAX(zp->z_size, woff + n);
	zilog_t *zilog = zfsvfs->z_log;

	const uint64_t uid = KUID_TO_SUID(ZTOUID(zp));
	const uint64_t gid = KGID_TO_SGID(ZTOGID(zp));
	const uint64_t projid = zp->z_projid;

	/*
	 * Write the file in reasonable size chunks. Each chunk is written
	 * in a separate transaction; this keeps the intent log records small
	 * and allows us to do more fine-grained space accounting.
	 */
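	/*
	 * Added note: each pass through the loop below writes at most one
	 * "chunk", bounded by max_blksz (the dataset's z_max_blksz) and
	 * aligned so the chunk never crosses a block boundary; see the
	 * nbytes computation inside the loop.
	 */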
	while (n > 0) {
		woff = zfs_uio_offset(uio);

		if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, uid) ||
		    zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, gid) ||
		    (projid != ZFS_DEFAULT_PROJID &&
		    zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
		    projid))) {
			error = SET_ERROR(EDQUOT);
			break;
		}

		arc_buf_t *abuf = NULL;
		if (n >= max_blksz && woff >= zp->z_size &&
		    P2PHASE(woff, max_blksz) == 0 &&
		    zp->z_blksz == max_blksz) {
			/*
			 * This write covers a full block. "Borrow" a buffer
			 * from the dmu so that we can fill it before we enter
			 * a transaction. This avoids the possibility of
			 * holding up the transaction if the data copy hangs
			 * up on a pagefault (e.g., from an NFS server mapping).
			 */
			size_t cbytes;

			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
			    max_blksz);
			ASSERT(abuf != NULL);
			ASSERT(arc_buf_size(abuf) == max_blksz);
			if ((error = zfs_uiocopy(abuf->b_data, max_blksz,
			    UIO_WRITE, uio, &cbytes))) {
				dmu_return_arcbuf(abuf);
				break;
			}
			ASSERT3S(cbytes, ==, max_blksz);
		}

		/*
		 * Start a transaction.
		 */
		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
		dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
		DB_DNODE_ENTER(db);
		dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff,
		    MIN(n, max_blksz));
		DB_DNODE_EXIT(db);
		zfs_sa_upgrade_txholds(tx, zp);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			if (abuf != NULL)
				dmu_return_arcbuf(abuf);
			break;
		}

		/*
		 * NB: We must call zfs_clear_setid_bits_if_necessary before
		 * committing the transaction!
		 */

		/*
		 * If rangelock_enter() over-locked we grow the blocksize
		 * and then reduce the lock range. This will only happen
		 * on the first iteration since rangelock_reduce() will
		 * shrink down lr_length to the appropriate size.
		 */
		if (lr->lr_length == UINT64_MAX) {
			uint64_t new_blksz;

			if (zp->z_blksz > max_blksz) {
				/*
				 * File's blocksize is already larger than the
				 * "recordsize" property. Only let it grow to
				 * the next power of 2.
				 */
				ASSERT(!ISP2(zp->z_blksz));
				new_blksz = MIN(end_size,
				    1 << highbit64(zp->z_blksz));
			} else {
				new_blksz = MIN(end_size, max_blksz);
			}
			zfs_grow_blocksize(zp, new_blksz, tx);
			zfs_rangelock_reduce(lr, woff, n);
		}

		/*
		 * XXX - should we really limit each write to z_max_blksz?
		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
		 */
		const ssize_t nbytes =
		    MIN(n, max_blksz - P2PHASE(woff, max_blksz));

		ssize_t tx_bytes;
		if (abuf == NULL) {
			tx_bytes = zfs_uio_resid(uio);
			zfs_uio_fault_disable(uio, B_TRUE);
			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, nbytes, tx);
			zfs_uio_fault_disable(uio, B_FALSE);
#ifdef __linux__
			if (error == EFAULT) {
				zfs_clear_setid_bits_if_necessary(zfsvfs, zp,
				    cr, &clear_setid_bits_txg, tx);
				dmu_tx_commit(tx);
				/*
				 * Account for partial writes before
				 * continuing the loop.
				 * Update needs to occur before the next
				 * zfs_uio_prefaultpages, or prefaultpages may
				 * error, and we may break the loop early.
				 */
				if (tx_bytes != zfs_uio_resid(uio))
					n -= tx_bytes - zfs_uio_resid(uio);
				if (zfs_uio_prefaultpages(MIN(n, max_blksz),
				    uio)) {
					break;
				}
				continue;
			}
#endif
			/*
			 * On FreeBSD, EFAULT should be propagated back to the
			 * VFS, which will handle faulting and will retry.
			 */
			if (error != 0 && error != EFAULT) {
				zfs_clear_setid_bits_if_necessary(zfsvfs, zp,
				    cr, &clear_setid_bits_txg, tx);
				dmu_tx_commit(tx);
				break;
			}
			tx_bytes -= zfs_uio_resid(uio);
		} else {
			/* Implied by abuf != NULL: */
			ASSERT3S(n, >=, max_blksz);
			ASSERT0(P2PHASE(woff, max_blksz));
			/*
			 * We can simplify nbytes to MIN(n, max_blksz) since
			 * P2PHASE(woff, max_blksz) is 0, and knowing
			 * n >= max_blksz lets us simplify further:
			 */
			ASSERT3S(nbytes, ==, max_blksz);
			/*
			 * Thus, we're writing a full block at a block-aligned
			 * offset and extending the file past EOF.
			 *
			 * dmu_assign_arcbuf_by_dbuf() will directly assign the
			 * arc buffer to a dbuf.
			 */
			error = dmu_assign_arcbuf_by_dbuf(
			    sa_get_db(zp->z_sa_hdl), woff, abuf, tx);
			if (error != 0) {
				/*
				 * XXX This might not be necessary if
				 * dmu_assign_arcbuf_by_dbuf is guaranteed
				 * to be atomic.
				 */
				zfs_clear_setid_bits_if_necessary(zfsvfs, zp,
				    cr, &clear_setid_bits_txg, tx);
				dmu_return_arcbuf(abuf);
				dmu_tx_commit(tx);
				break;
			}
			ASSERT3S(nbytes, <=, zfs_uio_resid(uio));
			zfs_uioskip(uio, nbytes);
			tx_bytes = nbytes;
		}
		if (tx_bytes &&
		    zn_has_cached_data(zp, woff, woff + tx_bytes - 1) &&
		    !(ioflag & O_DIRECT)) {
			update_pages(zp, woff, tx_bytes, zfsvfs->z_os);
		}

		/*
		 * If we made no progress, we're done. If we made even
		 * partial progress, update the znode and ZIL accordingly.
		 */
		if (tx_bytes == 0) {
			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
			    (void *)&zp->z_size, sizeof (uint64_t), tx);
			dmu_tx_commit(tx);
			ASSERT(error != 0);
			break;
		}

		zfs_clear_setid_bits_if_necessary(zfsvfs, zp, cr,
		    &clear_setid_bits_txg, tx);

		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);

		/*
		 * Update the file size (zp_size) if it has changed;
		 * account for possible concurrent updates.
		 */
		while ((end_size = zp->z_size) < zfs_uio_offset(uio)) {
			(void) atomic_cas_64(&zp->z_size, end_size,
			    zfs_uio_offset(uio));
			ASSERT(error == 0 || error == EFAULT);
		}
		/*
		 * If we are replaying and eof is non zero then force
		 * the file size to the specified eof. Note, there's no
		 * concurrency during replay.
		 */
		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
			zp->z_size = zfsvfs->z_replay_eof;

		error1 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
		if (error1 != 0)
			/* Avoid clobbering EFAULT. */
			error = error1;

		/*
		 * NB: During replay, the TX_SETATTR record logged by
		 * zfs_clear_setid_bits_if_necessary must precede any of
		 * the TX_WRITE records logged here.
		 */
		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag,
		    NULL, NULL);

		dmu_tx_commit(tx);

		if (error != 0)
			break;
		ASSERT3S(tx_bytes, ==, nbytes);
		n -= nbytes;

		if (n > 0) {
			if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) {
				error = SET_ERROR(EFAULT);
				break;
			}
		}
	}

	zfs_znode_update_vfs(zp);
	zfs_rangelock_exit(lr);

	/*
	 * If we're in replay mode, or we made no progress, or the
	 * uio data is inaccessible return an error. Otherwise, it's
	 * at least a partial write, so it's successful.
	 */
	if (zfsvfs->z_replay || zfs_uio_resid(uio) == start_resid ||
	    error == EFAULT) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	if (ioflag & (O_SYNC | O_DSYNC) ||
	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, zp->z_id);

	const int64_t nwritten = start_resid - zfs_uio_resid(uio);
	dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten);
	task_io_account_write(nwritten);

	zfs_exit(zfsvfs, FTAG);
	return (0);
}
int
zfs_getsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	int error;
	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);
	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
	zfs_exit(zfsvfs, FTAG);

	return (error);
}
int
zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	int error;
	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
	zilog_t *zilog = zfsvfs->z_log;

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	error = zfs_setacl(zp, vsecp, skipaclchk, cr);

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	zfs_exit(zfsvfs, FTAG);
	return (error);
}
#ifdef ZFS_DEBUG
static int zil_fault_io = 0;
#endif

static void zfs_get_done(zgd_t *zgd, int error);

/*
 * Get data to generate a TX_WRITE intent log record.
 */
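/*
 * Added context: this callback is invoked from zil_commit() processing when
 * a TX_WRITE record needs its data either copied into the log record or
 * synced out and referenced by block pointer (see the immediate/indirect
 * comment below).
 */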
int
zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
    struct lwb *lwb, zio_t *zio)
{
	zfsvfs_t *zfsvfs = arg;
	objset_t *os = zfsvfs->z_os;
	znode_t *zp;
	uint64_t object = lr->lr_foid;
	uint64_t offset = lr->lr_offset;
	uint64_t size = lr->lr_length;
	dmu_buf_t *db;
	zgd_t *zgd;
	int error = 0;
	uint64_t zp_gen;

	ASSERT3P(lwb, !=, NULL);
	ASSERT3P(zio, !=, NULL);
	ASSERT3U(size, !=, 0);

	/*
	 * Nothing to do if the file has been removed
	 */
	if (zfs_zget(zfsvfs, object, &zp) != 0)
		return (SET_ERROR(ENOENT));
	if (zp->z_unlinked) {
		/*
		 * Release the vnode asynchronously as we currently have the
		 * txg stopped from syncing.
		 */
		zfs_zrele_async(zp);
		return (SET_ERROR(ENOENT));
	}
	/* check if generation number matches */
	if (sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
	    sizeof (zp_gen)) != 0) {
		zfs_zrele_async(zp);
		return (SET_ERROR(EIO));
	}
	if (zp_gen != gen) {
		zfs_zrele_async(zp);
		return (SET_ERROR(ENOENT));
	}

	zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
	zgd->zgd_lwb = lwb;
	zgd->zgd_private = zp;

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
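	/*
	 * Added note: buf is non-NULL only for the immediate case, where the
	 * ZIL wants the file data copied directly into the log record; for
	 * the indirect case the data is synced in place via dmu_sync() and
	 * only a block pointer is logged.
	 */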
	if (buf != NULL) { /* immediate write */
		zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
		    offset, size, RL_READER);
		/* test for truncation needs to be done while range locked */
		if (offset >= zp->z_size) {
			error = SET_ERROR(ENOENT);
		} else {
			error = dmu_read(os, object, offset, size, buf,
			    DMU_READ_NO_PREFETCH);
		}
		ASSERT(error == 0 || error == ENOENT);
	} else { /* indirect write */
		/*
		 * Have to lock the whole block to ensure when it's
		 * written out and its checksum is being calculated
		 * that no one can change the data. We need to re-check
		 * blocksize after we get the lock in case it's changed!
		 */
		for (;;) {
			uint64_t blkoff;

			size = zp->z_blksz;
			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
			offset -= blkoff;
			zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
			    offset, size, RL_READER);
			if (zp->z_blksz == size)
				break;
			offset += blkoff;
			zfs_rangelock_exit(zgd->zgd_lr);
		}
		/* test for truncation needs to be done while range locked */
		if (lr->lr_offset >= zp->z_size)
			error = SET_ERROR(ENOENT);
#ifdef ZFS_DEBUG
		if (zil_fault_io) {
			error = SET_ERROR(EIO);
			zil_fault_io = 0;
		}
#endif
		if (error == 0)
			error = dmu_buf_hold(os, object, offset, zgd, &db,
			    DMU_READ_NO_PREFETCH);

		if (error == 0) {
			blkptr_t *bp = &lr->lr_blkptr;

			zgd->zgd_db = db;
			zgd->zgd_bp = bp;

			ASSERT(db->db_offset == offset);
			ASSERT(db->db_size == size);

			error = dmu_sync(zio, lr->lr_common.lrc_txg,
			    zfs_get_done, zgd);
			ASSERT(error || lr->lr_length <= size);

			/*
			 * On success, we need to wait for the write I/O
			 * initiated by dmu_sync() to complete before we can
			 * release this dbuf. We will finish everything up
			 * in the zfs_get_done() callback.
			 */
			if (error == 0)
				return (0);

			if (error == EALREADY) {
				lr->lr_common.lrc_txtype = TX_WRITE2;
				/*
				 * TX_WRITE2 relies on the data previously
				 * written by the TX_WRITE that caused
				 * EALREADY. We zero out the BP because
				 * it is the old, currently-on-disk BP.
				 */
				zgd->zgd_bp = NULL;
				BP_ZERO(bp);
				error = 0;
			}
		}
	}

	zfs_get_done(zgd, error);

	return (error);
}
static void
zfs_get_done(zgd_t *zgd, int error)
{
	(void) error;
	znode_t *zp = zgd->zgd_private;

	if (zgd->zgd_db)
		dmu_buf_rele(zgd->zgd_db, zgd);

	zfs_rangelock_exit(zgd->zgd_lr);

	/*
	 * Release the vnode asynchronously as we currently have the
	 * txg stopped from syncing.
	 */
	zfs_zrele_async(zp);

	kmem_free(zgd, sizeof (zgd_t));
}
static int
zfs_enter_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag)
{
	int error;

	/* Swap. Not sure if the order of zfs_enter()s is important. */
	if (zfsvfs1 > zfsvfs2) {
		zfsvfs_t *tmpzfsvfs;

		tmpzfsvfs = zfsvfs2;
		zfsvfs2 = zfsvfs1;
		zfsvfs1 = tmpzfsvfs;
	}

	error = zfs_enter(zfsvfs1, tag);
	if (error != 0)
		return (error);
	if (zfsvfs1 != zfsvfs2) {
		error = zfs_enter(zfsvfs2, tag);
		if (error != 0) {
			zfs_exit(zfsvfs1, tag);
			return (error);
		}
	}

	return (0);
}
static void
zfs_exit_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag)
{
	zfs_exit(zfsvfs1, tag);
	if (zfsvfs1 != zfsvfs2)
		zfs_exit(zfsvfs2, tag);
}
/*
 * We split each clone request in chunks that can fit into a single ZIL
 * log entry. Each ZIL log entry can fit 130816 bytes for a block cloning
 * operation (see zil_max_log_data() and zfs_log_clone_range()). This gives
 * us room for storing 1022 block pointers.
 *
 * On success, the function returns the number of bytes copied in *lenp.
 * Note, it doesn't return how many bytes are left to be copied.
 */
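/*
 * Added arithmetic note: 130816 usable bytes divided by sizeof (blkptr_t)
 * (128 bytes) is 1022, which is where the block-pointer count above comes
 * from. On Linux and FreeBSD this path is normally reached through
 * copy_file_range(2) when block cloning is enabled.
 */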
int
zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
    uint64_t *outoffp, uint64_t *lenp, cred_t *cr)
{
	zfsvfs_t *inzfsvfs, *outzfsvfs;
	objset_t *inos, *outos;
	zfs_locked_range_t *inlr, *outlr;
	dmu_buf_impl_t *db;
	dmu_tx_t *tx;
	zilog_t *zilog;
	uint64_t inoff, outoff, len, done;
	uint64_t outsize, size;
	int error;
	int count = 0;
	sa_bulk_attr_t bulk[3];
	uint64_t mtime[2], ctime[2];
	uint64_t uid, gid, projid;
	blkptr_t *bps;
	size_t maxblocks, nbps;
	uint_t inblksz;
	uint64_t clear_setid_bits_txg = 0;

	inoff = *inoffp;
	outoff = *outoffp;
	len = *lenp;
	done = 0;

	inzfsvfs = ZTOZSB(inzp);
	outzfsvfs = ZTOZSB(outzp);
	inos = inzfsvfs->z_os;
	outos = outzfsvfs->z_os;

	/*
	 * Both source and destination have to belong to the same storage pool.
	 */
	if (dmu_objset_spa(inos) != dmu_objset_spa(outos)) {
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (SET_ERROR(EXDEV));
	}

	/*
	 * We need to call zfs_enter() potentially on two different datasets,
	 * so we need a dedicated function for that.
	 */
	error = zfs_enter_two(inzfsvfs, outzfsvfs, FTAG);
	if (error != 0)
		return (error);

	ASSERT(!outzfsvfs->z_replay);

	error = zfs_verify_zp(inzp);
	if (error == 0)
		error = zfs_verify_zp(outzp);
	if (error != 0) {
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (error);
	}

	if (!spa_feature_is_enabled(dmu_objset_spa(outos),
	    SPA_FEATURE_BLOCK_CLONING)) {
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (SET_ERROR(EXDEV));
	}
	/*
	 * We don't copy the source file's flags, which is why we don't allow
	 * cloning files that are in quarantine.
	 */
	if (inzp->z_pflags & ZFS_AV_QUARANTINED) {
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (SET_ERROR(EACCES));
	}

	if (inoff >= inzp->z_size) {
		*lenp = 0;
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (0);
	}
	if (len > inzp->z_size - inoff) {
		len = inzp->z_size - inoff;
	}
	if (len == 0) {
		*lenp = 0;
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (0);
	}

	/*
	 * Callers might not be able to detect properly that we are read-only,
	 * so check it explicitly here.
	 */
	if (zfs_is_readonly(outzfsvfs)) {
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (SET_ERROR(EROFS));
	}

	/*
	 * If immutable or not appending then return EPERM.
	 * Intentionally allow ZFS_READONLY through here.
	 * See zfs_zaccess_common()
	 */
	if ((outzp->z_pflags & ZFS_IMMUTABLE) != 0) {
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (SET_ERROR(EPERM));
	}

	/*
	 * No overlapping if we are cloning within the same file.
	 */
	if (inzp == outzp) {
		if (inoff < outoff + len && outoff < inoff + len) {
			zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
			return (SET_ERROR(EINVAL));
		}
	}

	/*
	 * Maintain predictable lock order.
	 */
	if (inzp < outzp || (inzp == outzp && inoff < outoff)) {
		inlr = zfs_rangelock_enter(&inzp->z_rangelock, inoff, len,
		    RL_READER);
		outlr = zfs_rangelock_enter(&outzp->z_rangelock, outoff, len,
		    RL_APPEND);
	} else {
		outlr = zfs_rangelock_enter(&outzp->z_rangelock, outoff, len,
		    RL_APPEND);
		inlr = zfs_rangelock_enter(&inzp->z_rangelock, inoff, len,
		    RL_READER);
	}

	inblksz = inzp->z_blksz;

	/*
	 * We cannot clone into files with different block size.
	 */
	if (inblksz != outzp->z_blksz && outzp->z_size > inblksz) {
		error = SET_ERROR(EXDEV);
		goto unlock;
	}

	/*
	 * Offsets and len must be at block boundaries.
	 */
	if ((inoff % inblksz) != 0 || (outoff % inblksz) != 0) {
		error = SET_ERROR(EXDEV);
		goto unlock;
	}
	/*
	 * Length must be a multiple of blksz, except for the end of the file.
	 */
	if ((len % inblksz) != 0 &&
	    (len < inzp->z_size - inoff || len < outzp->z_size - outoff)) {
		error = SET_ERROR(EXDEV);
		goto unlock;
	}

	error = zn_rlimit_fsize(outoff + len);
	if (error != 0) {
		goto unlock;
	}

	if (inoff >= MAXOFFSET_T || outoff >= MAXOFFSET_T) {
		error = SET_ERROR(EFBIG);
		goto unlock;
	}

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(outzfsvfs), NULL,
	    &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(outzfsvfs), NULL,
	    &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(outzfsvfs), NULL,
	    &outzp->z_size, 8);

	zilog = outzfsvfs->z_log;
	maxblocks = zil_max_log_data(zilog, sizeof (lr_clone_range_t)) /
	    sizeof (bps[0]);

	uid = KUID_TO_SUID(ZTOUID(outzp));
	gid = KGID_TO_SGID(ZTOGID(outzp));
	projid = outzp->z_projid;

	bps = kmem_alloc(sizeof (bps[0]) * maxblocks, KM_SLEEP);
	/*
	 * Clone the file in reasonable size chunks. Each chunk is cloned
	 * in a separate transaction; this keeps the intent log records small
	 * and allows us to do more fine-grained space accounting.
	 */
	while (len > 0) {
		size = MIN(inblksz * maxblocks, len);

		if (zfs_id_overblockquota(outzfsvfs, DMU_USERUSED_OBJECT,
		    uid) ||
		    zfs_id_overblockquota(outzfsvfs, DMU_GROUPUSED_OBJECT,
		    gid) ||
		    (projid != ZFS_DEFAULT_PROJID &&
		    zfs_id_overblockquota(outzfsvfs, DMU_PROJECTUSED_OBJECT,
		    projid))) {
			error = SET_ERROR(EDQUOT);
			break;
		}

		/*
		 * Start a transaction.
		 */
		tx = dmu_tx_create(outos);

		nbps = maxblocks;
		error = dmu_read_l0_bps(inos, inzp->z_id, inoff, size, tx, bps,
		    &nbps);
		if (error != 0) {
			dmu_tx_abort(tx);
			/*
			 * If we are trying to clone a block that was created
			 * in the current transaction group, return an error
			 * so the caller can fall back to just copying the
			 * data.
			 */
			if (error == EAGAIN) {
				error = SET_ERROR(EXDEV);
			}
			break;
		}
		/*
		 * Encrypted data is fine as long as it comes from the same
		 * dataset.
		 * TODO: We want to extend it in the future to allow cloning to
		 * datasets with the same keys, like clones or to be able to
		 * clone a file from a snapshot of an encrypted dataset into the
		 * dataset itself.
		 */
		if (BP_IS_PROTECTED(&bps[0])) {
			if (inzfsvfs != outzfsvfs) {
				dmu_tx_abort(tx);
				error = SET_ERROR(EXDEV);
				break;
			}
		}

		dmu_tx_hold_sa(tx, outzp->z_sa_hdl, B_FALSE);
		db = (dmu_buf_impl_t *)sa_get_db(outzp->z_sa_hdl);
		DB_DNODE_ENTER(db);
		dmu_tx_hold_clone_by_dnode(tx, DB_DNODE(db), outoff, size);
		DB_DNODE_EXIT(db);
		zfs_sa_upgrade_txholds(tx, outzp);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			dmu_tx_abort(tx);
			break;
		}

		/*
		 * Copy source znode's block size. This only happens on the
		 * first iteration since zfs_rangelock_reduce() will shrink down
		 * lr_len to the appropriate size.
		 */
		if (outlr->lr_length == UINT64_MAX) {
			zfs_grow_blocksize(outzp, inblksz, tx);
			/*
			 * Round range lock up to the block boundary, so we
			 * prevent appends until we are done.
			 */
			zfs_rangelock_reduce(outlr, outoff,
			    ((len - 1) / inblksz + 1) * inblksz);
		}

		error = dmu_brt_clone(outos, outzp->z_id, outoff, size, tx,
		    bps, nbps, B_FALSE);
		if (error != 0) {
			dmu_tx_commit(tx);
			break;
		}

		zfs_clear_setid_bits_if_necessary(outzfsvfs, outzp, cr,
		    &clear_setid_bits_txg, tx);

		zfs_tstamp_update_setup(outzp, CONTENT_MODIFIED, mtime, ctime);

		/*
		 * Update the file size (zp_size) if it has changed;
		 * account for possible concurrent updates.
		 */
		while ((outsize = outzp->z_size) < outoff + size) {
			(void) atomic_cas_64(&outzp->z_size, outsize,
			    outoff + size);
		}

		error = sa_bulk_update(outzp->z_sa_hdl, bulk, count, tx);

		zfs_log_clone_range(zilog, tx, TX_CLONE_RANGE, outzp, outoff,
		    size, inblksz, bps, nbps);

		dmu_tx_commit(tx);

		if (error != 0)
			break;

		inoff += size;
		outoff += size;
		len -= size;
		done += size;
	}

	kmem_free(bps, sizeof (bps[0]) * maxblocks);
	zfs_znode_update_vfs(outzp);

unlock:
	zfs_rangelock_exit(outlr);
	zfs_rangelock_exit(inlr);

	if (done > 0) {
		/*
		 * If we have made at least partial progress, reset the error.
		 */
		error = 0;

		ZFS_ACCESSTIME_STAMP(inzfsvfs, inzp);

		if (outos->os_sync == ZFS_SYNC_ALWAYS) {
			zil_commit(zilog, outzp->z_id);
		}

		*inoffp += done;
		*outoffp += done;
		*lenp = done;
	}

	zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);

	return (error);
}
/*
 * The usual pattern would be to call zfs_clone_range() from
 * zfs_replay_clone(), but we cannot do that, because when replaying we
 * don't have the source znode available. This is why we need a dedicated
 * replay function.
 */
int
zfs_clone_range_replay(znode_t *zp, uint64_t off, uint64_t len, uint64_t blksz,
    const blkptr_t *bps, size_t nbps)
{
	zfsvfs_t *zfsvfs;
	dmu_buf_impl_t *db;
	dmu_tx_t *tx;
	int error;
	int count = 0;
	sa_bulk_attr_t bulk[3];
	uint64_t mtime[2], ctime[2];

	ASSERT3U(off, <, MAXOFFSET_T);
	ASSERT3U(len, >, 0);
	ASSERT3U(nbps, >, 0);

	zfsvfs = ZTOZSB(zp);

	ASSERT(spa_feature_is_enabled(dmu_objset_spa(zfsvfs->z_os),
	    SPA_FEATURE_BLOCK_CLONING));

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	ASSERT(zfsvfs->z_replay);
	ASSERT(!zfs_is_readonly(zfsvfs));

	if ((off % blksz) != 0) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EINVAL));
	}

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &zp->z_size, 8);

	/*
	 * Start a transaction.
	 */
	tx = dmu_tx_create(zfsvfs->z_os);

	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
	DB_DNODE_ENTER(db);
	dmu_tx_hold_clone_by_dnode(tx, DB_DNODE(db), off, len);
	DB_DNODE_EXIT(db);
	zfs_sa_upgrade_txholds(tx, zp);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error != 0) {
		dmu_tx_abort(tx);
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	if (zp->z_blksz < blksz)
		zfs_grow_blocksize(zp, blksz, tx);

	dmu_brt_clone(zfsvfs->z_os, zp->z_id, off, len, tx, bps, nbps, B_TRUE);

	zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);

	if (zp->z_size < off + len)
		zp->z_size = off + len;

	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);

	/*
	 * zil_replaying() not only checks if we are replaying ZIL, but also
	 * updates the ZIL header to record replay progress.
	 */
	VERIFY(zil_replaying(zfsvfs->z_log, tx));

	dmu_tx_commit(tx);

	zfs_znode_update_vfs(zp);

	zfs_exit(zfsvfs, FTAG);

	return (error);
}
);
1466 EXPORT_SYMBOL(zfs_fsync
);
1467 EXPORT_SYMBOL(zfs_holey
);
1468 EXPORT_SYMBOL(zfs_read
);
1469 EXPORT_SYMBOL(zfs_write
);
1470 EXPORT_SYMBOL(zfs_getsecattr
);
1471 EXPORT_SYMBOL(zfs_setsecattr
);
1472 EXPORT_SYMBOL(zfs_clone_range
);
1473 EXPORT_SYMBOL(zfs_clone_range_replay
);
ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, U64, ZMOD_RW,
	"Bytes to read per chunk");