]>
Commit | Line | Data |
---|---|---|
e53d678d MM |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
1d3ba0bf | 9 | * or https://opensource.org/licenses/CDDL-1.0. |
e53d678d MM |
10 | * See the License for the specific language governing permissions |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | ||
22 | /* | |
23 | * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. | |
24 | * Copyright (c) 2012, 2018 by Delphix. All rights reserved. | |
25 | * Copyright (c) 2015 by Chunwei Chen. All rights reserved. | |
26 | * Copyright 2017 Nexenta Systems, Inc. | |
67a1b037 | 27 | * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek |
e53d678d MM |
28 | */ |
29 | ||
30 | /* Portions Copyright 2007 Jeremy Teo */ | |
31 | /* Portions Copyright 2010 Robert Milkowski */ | |
32 | ||
33 | #include <sys/types.h> | |
34 | #include <sys/param.h> | |
35 | #include <sys/time.h> | |
36 | #include <sys/sysmacros.h> | |
37 | #include <sys/vfs.h> | |
c0801bf3 | 38 | #include <sys/uio_impl.h> |
e53d678d MM |
39 | #include <sys/file.h> |
40 | #include <sys/stat.h> | |
41 | #include <sys/kmem.h> | |
42 | #include <sys/cmn_err.h> | |
43 | #include <sys/errno.h> | |
44 | #include <sys/zfs_dir.h> | |
45 | #include <sys/zfs_acl.h> | |
46 | #include <sys/zfs_ioctl.h> | |
47 | #include <sys/fs/zfs.h> | |
48 | #include <sys/dmu.h> | |
49 | #include <sys/dmu_objset.h> | |
c7b61192 | 50 | #include <sys/dsl_crypt.h> |
e53d678d MM |
51 | #include <sys/spa.h> |
52 | #include <sys/txg.h> | |
53 | #include <sys/dbuf.h> | |
54 | #include <sys/policy.h> | |
67a1b037 | 55 | #include <sys/zfeature.h> |
e53d678d MM |
56 | #include <sys/zfs_vnops.h> |
57 | #include <sys/zfs_quota.h> | |
ab8c935e CS |
58 | #include <sys/zfs_vfsops.h> |
59 | #include <sys/zfs_znode.h> | |
e53d678d | 60 | |
6dccdf50 BB |
61 | /* |
62 | * Enable the experimental block cloning feature. If this setting is 0, then | |
63 | * even if feature@block_cloning is enabled, attempts to clone blocks will act | |
64 | * as though the feature is disabled. | |
65 | */ | |
66 | int zfs_bclone_enabled = 1; | |
67 | ||
68 | /* | |
69 | * When set, zfs_clone_range() waits for dirty data to be written to disk. | |
70 | * This allows the clone operation to reliably succeed when a file is modified | |
71 | * and then immediately cloned. For small files this may be slower than making | |
72 | * a copy of the file and is therefore not the default. However, in certain | |
73 | * scenarios this behavior may be desirable so a tunable is provided. | |
74 | */ | |
75 | static int zfs_bclone_wait_dirty = 0; | |
76 | ||
77 | /* | |
78 | * Maximum bytes to read per chunk in zfs_read(). | |
79 | */ | |
80 | static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024; | |
e53d678d | 81 | |
e53d678d MM |
82 | int |
83 | zfs_fsync(znode_t *zp, int syncflag, cred_t *cr) | |
84 | { | |
768eaced | 85 | int error = 0; |
e53d678d MM |
86 | zfsvfs_t *zfsvfs = ZTOZSB(zp); |
87 | ||
e53d678d | 88 | if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { |
768eaced | 89 | if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) |
c3773de1 | 90 | return (error); |
411f4a01 | 91 | atomic_inc_32(&zp->z_sync_writes_cnt); |
e53d678d | 92 | zil_commit(zfsvfs->z_log, zp->z_id); |
411f4a01 | 93 | atomic_dec_32(&zp->z_sync_writes_cnt); |
768eaced | 94 | zfs_exit(zfsvfs, FTAG); |
e53d678d | 95 | } |
768eaced | 96 | return (error); |
e53d678d MM |
97 | } |
98 | ||
8583540c MM |
99 | |
100 | #if defined(SEEK_HOLE) && defined(SEEK_DATA) | |
101 | /* | |
102 | * Lseek support for finding holes (cmd == SEEK_HOLE) and | |
103 | * data (cmd == SEEK_DATA). "off" is an in/out parameter. | |
104 | */ | |
105 | static int | |
106 | zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off) | |
107 | { | |
de198f2d | 108 | zfs_locked_range_t *lr; |
8583540c MM |
109 | uint64_t noff = (uint64_t)*off; /* new offset */ |
110 | uint64_t file_sz; | |
111 | int error; | |
112 | boolean_t hole; | |
113 | ||
114 | file_sz = zp->z_size; | |
115 | if (noff >= file_sz) { | |
116 | return (SET_ERROR(ENXIO)); | |
117 | } | |
118 | ||
119 | if (cmd == F_SEEK_HOLE) | |
120 | hole = B_TRUE; | |
121 | else | |
122 | hole = B_FALSE; | |
123 | ||
de198f2d | 124 | /* Flush any mmap()'d data to disk */ |
3fc92adc | 125 | if (zn_has_cached_data(zp, 0, file_sz - 1)) |
de198f2d BB |
126 | zn_flush_cached_data(zp, B_FALSE); |
127 | ||
64bfa6ba | 128 | lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_READER); |
8583540c | 129 | error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff); |
de198f2d | 130 | zfs_rangelock_exit(lr); |
8583540c MM |
131 | |
132 | if (error == ESRCH) | |
133 | return (SET_ERROR(ENXIO)); | |
134 | ||
de198f2d | 135 | /* File was dirty, so fall back to using generic logic */ |
8583540c MM |
136 | if (error == EBUSY) { |
137 | if (hole) | |
138 | *off = file_sz; | |
139 | ||
140 | return (0); | |
141 | } | |
142 | ||
143 | /* | |
144 | * We could find a hole that begins after the logical end-of-file, | |
145 | * because dmu_offset_next() only works on whole blocks. If the | |
146 | * EOF falls mid-block, then indicate that the "virtual hole" | |
147 | * at the end of the file begins at the logical EOF, rather than | |
148 | * at the end of the last block. | |
149 | */ | |
150 | if (noff > file_sz) { | |
151 | ASSERT(hole); | |
152 | noff = file_sz; | |
153 | } | |
154 | ||
155 | if (noff < *off) | |
156 | return (error); | |
157 | *off = noff; | |
158 | return (error); | |
159 | } | |
160 | ||
161 | int | |
162 | zfs_holey(znode_t *zp, ulong_t cmd, loff_t *off) | |
163 | { | |
164 | zfsvfs_t *zfsvfs = ZTOZSB(zp); | |
165 | int error; | |
166 | ||
768eaced CC |
167 | if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) |
168 | return (error); | |
8583540c MM |
169 | |
170 | error = zfs_holey_common(zp, cmd, off); | |
171 | ||
768eaced | 172 | zfs_exit(zfsvfs, FTAG); |
8583540c MM |
173 | return (error); |
174 | } | |
175 | #endif /* SEEK_HOLE && SEEK_DATA */ | |
176 | ||
8583540c MM |
177 | int |
178 | zfs_access(znode_t *zp, int mode, int flag, cred_t *cr) | |
179 | { | |
180 | zfsvfs_t *zfsvfs = ZTOZSB(zp); | |
181 | int error; | |
182 | ||
768eaced CC |
183 | if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) |
184 | return (error); | |
8583540c MM |
185 | |
186 | if (flag & V_ACE_MASK) | |
f224eddf YY |
187 | #if defined(__linux__) |
188 | error = zfs_zaccess(zp, mode, flag, B_FALSE, cr, | |
d4dc53da | 189 | zfs_init_idmap); |
f224eddf YY |
190 | #else |
191 | error = zfs_zaccess(zp, mode, flag, B_FALSE, cr, | |
192 | NULL); | |
193 | #endif | |
8583540c | 194 | else |
f224eddf | 195 | #if defined(__linux__) |
d4dc53da | 196 | error = zfs_zaccess_rwx(zp, mode, flag, cr, zfs_init_idmap); |
f224eddf | 197 | #else |
2a068a13 | 198 | error = zfs_zaccess_rwx(zp, mode, flag, cr, NULL); |
f224eddf | 199 | #endif |
8583540c | 200 | |
768eaced | 201 | zfs_exit(zfsvfs, FTAG); |
8583540c MM |
202 | return (error); |
203 | } | |
204 | ||
e53d678d MM |
205 | /* |
206 | * Read bytes from specified file into supplied buffer. | |
207 | * | |
208 | * IN: zp - inode of file to be read from. | |
209 | * uio - structure supplying read location, range info, | |
210 | * and return buffer. | |
211 | * ioflag - O_SYNC flags; used to provide FRSYNC semantics. | |
212 | * O_DIRECT flag; used to bypass page cache. | |
213 | * cr - credentials of caller. | |
214 | * | |
215 | * OUT: uio - updated offset and range, buffer filled. | |
216 | * | |
217 | * RETURN: 0 on success, error code on failure. | |
218 | * | |
219 | * Side Effects: | |
220 | * inode - atime updated if byte count > 0 | |
221 | */ | |
e53d678d | 222 | int |
d0cd9a5c | 223 | zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) |
e53d678d | 224 | { |
ef70eff1 | 225 | (void) cr; |
e53d678d MM |
226 | int error = 0; |
227 | boolean_t frsync = B_FALSE; | |
228 | ||
229 | zfsvfs_t *zfsvfs = ZTOZSB(zp); | |
768eaced CC |
230 | if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) |
231 | return (error); | |
e53d678d MM |
232 | |
233 | if (zp->z_pflags & ZFS_AV_QUARANTINED) { | |
768eaced | 234 | zfs_exit(zfsvfs, FTAG); |
e53d678d MM |
235 | return (SET_ERROR(EACCES)); |
236 | } | |
237 | ||
238 | /* We don't copy out anything useful for directories. */ | |
239 | if (Z_ISDIR(ZTOTYPE(zp))) { | |
768eaced | 240 | zfs_exit(zfsvfs, FTAG); |
e53d678d MM |
241 | return (SET_ERROR(EISDIR)); |
242 | } | |
243 | ||
244 | /* | |
245 | * Validate file offset | |
246 | */ | |
d0cd9a5c | 247 | if (zfs_uio_offset(uio) < (offset_t)0) { |
768eaced | 248 | zfs_exit(zfsvfs, FTAG); |
e53d678d MM |
249 | return (SET_ERROR(EINVAL)); |
250 | } | |
251 | ||
252 | /* | |
253 | * Fasttrack empty reads | |
254 | */ | |
d0cd9a5c | 255 | if (zfs_uio_resid(uio) == 0) { |
768eaced | 256 | zfs_exit(zfsvfs, FTAG); |
e53d678d MM |
257 | return (0); |
258 | } | |
259 | ||
260 | #ifdef FRSYNC | |
261 | /* | |
262 | * If we're in FRSYNC mode, sync out this znode before reading it. | |
263 | * Only do this for non-snapshots. | |
264 | * | |
265 | * Some platforms do not support FRSYNC and instead map it | |
266 | * to O_SYNC, which results in unnecessary calls to zil_commit. We | |
267 | * only honor FRSYNC requests on platforms which support it. | |
268 | */ | |
269 | frsync = !!(ioflag & FRSYNC); | |
270 | #endif | |
271 | if (zfsvfs->z_log && | |
272 | (frsync || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)) | |
273 | zil_commit(zfsvfs->z_log, zp->z_id); | |
274 | ||
275 | /* | |
276 | * Lock the range against changes. | |
277 | */ | |
278 | zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock, | |
d0cd9a5c | 279 | zfs_uio_offset(uio), zfs_uio_resid(uio), RL_READER); |
e53d678d MM |
280 | |
281 | /* | |
282 | * If we are reading past end-of-file we can skip | |
283 | * to the end; but we might still need to set atime. | |
284 | */ | |
d0cd9a5c | 285 | if (zfs_uio_offset(uio) >= zp->z_size) { |
e53d678d MM |
286 | error = 0; |
287 | goto out; | |
288 | } | |
289 | ||
d0cd9a5c | 290 | ASSERT(zfs_uio_offset(uio) < zp->z_size); |
05679465 | 291 | #if defined(__linux__) |
59eab109 | 292 | ssize_t start_offset = zfs_uio_offset(uio); |
05679465 | 293 | #endif |
d0cd9a5c | 294 | ssize_t n = MIN(zfs_uio_resid(uio), zp->z_size - zfs_uio_offset(uio)); |
e53d678d MM |
295 | ssize_t start_resid = n; |
296 | ||
297 | while (n > 0) { | |
298 | ssize_t nbytes = MIN(n, zfs_vnops_read_chunk_size - | |
d0cd9a5c | 299 | P2PHASE(zfs_uio_offset(uio), zfs_vnops_read_chunk_size)); |
e53d678d | 300 | #ifdef UIO_NOCOPY |
d0cd9a5c | 301 | if (zfs_uio_segflg(uio) == UIO_NOCOPY) |
e53d678d MM |
302 | error = mappedread_sf(zp, nbytes, uio); |
303 | else | |
304 | #endif | |
3fc92adc BB |
305 | if (zn_has_cached_data(zp, zfs_uio_offset(uio), |
306 | zfs_uio_offset(uio) + nbytes - 1) && !(ioflag & O_DIRECT)) { | |
e53d678d MM |
307 | error = mappedread(zp, nbytes, uio); |
308 | } else { | |
309 | error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), | |
310 | uio, nbytes); | |
311 | } | |
312 | ||
313 | if (error) { | |
314 | /* convert checksum errors into IO errors */ | |
315 | if (error == ECKSUM) | |
316 | error = SET_ERROR(EIO); | |
05679465 RE |
317 | |
318 | #if defined(__linux__) | |
59eab109 RE |
319 | /* |
320 | * if we actually read some bytes, bubbling EFAULT | |
05679465 RE |
321 | * up to become EAGAIN isn't what we want here... |
322 | * | |
323 | * ...on Linux, at least. On FBSD, doing this breaks. | |
59eab109 RE |
324 | */ |
325 | if (error == EFAULT && | |
326 | (zfs_uio_offset(uio) - start_offset) != 0) | |
327 | error = 0; | |
05679465 | 328 | #endif |
e53d678d MM |
329 | break; |
330 | } | |
331 | ||
332 | n -= nbytes; | |
333 | } | |
334 | ||
335 | int64_t nread = start_resid - n; | |
336 | dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread); | |
337 | task_io_account_read(nread); | |
338 | out: | |
339 | zfs_rangelock_exit(lr); | |
340 | ||
341 | ZFS_ACCESSTIME_STAMP(zfsvfs, zp); | |
768eaced | 342 | zfs_exit(zfsvfs, FTAG); |
e53d678d MM |
343 | return (error); |
344 | } | |
345 | ||
3d244b48 PJD |
346 | static void |
347 | zfs_clear_setid_bits_if_necessary(zfsvfs_t *zfsvfs, znode_t *zp, cred_t *cr, | |
348 | uint64_t *clear_setid_bits_txgp, dmu_tx_t *tx) | |
349 | { | |
350 | zilog_t *zilog = zfsvfs->z_log; | |
351 | const uint64_t uid = KUID_TO_SUID(ZTOUID(zp)); | |
352 | ||
353 | ASSERT(clear_setid_bits_txgp != NULL); | |
354 | ASSERT(tx != NULL); | |
355 | ||
356 | /* | |
357 | * Clear Set-UID/Set-GID bits on successful write if not | |
358 | * privileged and at least one of the execute bits is set. | |
359 | * | |
360 | * It would be nice to do this after all writes have | |
361 | * been done, but that would still expose the ISUID/ISGID | |
362 | * to another app after the partial write is committed. | |
363 | * | |
364 | * Note: we don't call zfs_fuid_map_id() here because | |
365 | * user 0 is not an ephemeral uid. | |
366 | */ | |
367 | mutex_enter(&zp->z_acl_lock); | |
368 | if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6))) != 0 && | |
369 | (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && | |
370 | secpolicy_vnode_setid_retain(zp, cr, | |
371 | ((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) { | |
372 | uint64_t newmode; | |
373 | ||
374 | zp->z_mode &= ~(S_ISUID | S_ISGID); | |
375 | newmode = zp->z_mode; | |
376 | (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), | |
377 | (void *)&newmode, sizeof (uint64_t), tx); | |
378 | ||
379 | mutex_exit(&zp->z_acl_lock); | |
380 | ||
381 | /* | |
382 | * Make sure SUID/SGID bits will be removed when we replay the | |
383 | * log. If the setid bits are keep coming back, don't log more | |
384 | * than one TX_SETATTR per transaction group. | |
385 | */ | |
386 | if (*clear_setid_bits_txgp != dmu_tx_get_txg(tx)) { | |
861166b0 | 387 | vattr_t va = {0}; |
3d244b48 | 388 | |
4d972ab5 | 389 | va.va_mask = ATTR_MODE; |
3d244b48 PJD |
390 | va.va_nodeid = zp->z_id; |
391 | va.va_mode = newmode; | |
4d972ab5 JL |
392 | zfs_log_setattr(zilog, tx, TX_SETATTR, zp, &va, |
393 | ATTR_MODE, NULL); | |
3d244b48 PJD |
394 | *clear_setid_bits_txgp = dmu_tx_get_txg(tx); |
395 | } | |
396 | } else { | |
397 | mutex_exit(&zp->z_acl_lock); | |
398 | } | |
399 | } | |
400 | ||
e53d678d MM |
401 | /* |
402 | * Write the bytes to a file. | |
403 | * | |
404 | * IN: zp - znode of file to be written to. | |
405 | * uio - structure supplying write location, range info, | |
406 | * and data buffer. | |
407 | * ioflag - O_APPEND flag set if in append mode. | |
408 | * O_DIRECT flag; used to bypass page cache. | |
409 | * cr - credentials of caller. | |
410 | * | |
411 | * OUT: uio - updated offset and range. | |
412 | * | |
413 | * RETURN: 0 if success | |
414 | * error code if failure | |
415 | * | |
416 | * Timestamps: | |
417 | * ip - ctime|mtime updated if byte count > 0 | |
418 | */ | |
e53d678d | 419 | int |
d0cd9a5c | 420 | zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) |
e53d678d | 421 | { |
063daa83 | 422 | int error = 0, error1; |
d0cd9a5c | 423 | ssize_t start_resid = zfs_uio_resid(uio); |
3d244b48 | 424 | uint64_t clear_setid_bits_txg = 0; |
e53d678d MM |
425 | |
426 | /* | |
427 | * Fasttrack empty write | |
428 | */ | |
429 | ssize_t n = start_resid; | |
430 | if (n == 0) | |
431 | return (0); | |
432 | ||
e53d678d | 433 | zfsvfs_t *zfsvfs = ZTOZSB(zp); |
768eaced CC |
434 | if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) |
435 | return (error); | |
e53d678d MM |
436 | |
437 | sa_bulk_attr_t bulk[4]; | |
438 | int count = 0; | |
439 | uint64_t mtime[2], ctime[2]; | |
440 | SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); | |
441 | SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); | |
442 | SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, | |
443 | &zp->z_size, 8); | |
444 | SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, | |
445 | &zp->z_pflags, 8); | |
446 | ||
447 | /* | |
448 | * Callers might not be able to detect properly that we are read-only, | |
449 | * so check it explicitly here. | |
450 | */ | |
451 | if (zfs_is_readonly(zfsvfs)) { | |
768eaced | 452 | zfs_exit(zfsvfs, FTAG); |
e53d678d MM |
453 | return (SET_ERROR(EROFS)); |
454 | } | |
455 | ||
456 | /* | |
4b2e2082 RM |
457 | * If immutable or not appending then return EPERM. |
458 | * Intentionally allow ZFS_READONLY through here. | |
459 | * See zfs_zaccess_common() | |
e53d678d | 460 | */ |
4b2e2082 | 461 | if ((zp->z_pflags & ZFS_IMMUTABLE) || |
e53d678d | 462 | ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & O_APPEND) && |
d0cd9a5c | 463 | (zfs_uio_offset(uio) < zp->z_size))) { |
768eaced | 464 | zfs_exit(zfsvfs, FTAG); |
e53d678d MM |
465 | return (SET_ERROR(EPERM)); |
466 | } | |
467 | ||
468 | /* | |
469 | * Validate file offset | |
470 | */ | |
d0cd9a5c | 471 | offset_t woff = ioflag & O_APPEND ? zp->z_size : zfs_uio_offset(uio); |
e53d678d | 472 | if (woff < 0) { |
768eaced | 473 | zfs_exit(zfsvfs, FTAG); |
e53d678d MM |
474 | return (SET_ERROR(EINVAL)); |
475 | } | |
476 | ||
e53d678d MM |
477 | /* |
478 | * Pre-fault the pages to ensure slow (e.g. NFS) pages | |
479 | * don't hold up txg. | |
e53d678d | 480 | */ |
b0cbc1aa AM |
481 | ssize_t pfbytes = MIN(n, DMU_MAX_ACCESS >> 1); |
482 | if (zfs_uio_prefaultpages(pfbytes, uio)) { | |
768eaced | 483 | zfs_exit(zfsvfs, FTAG); |
e53d678d MM |
484 | return (SET_ERROR(EFAULT)); |
485 | } | |
486 | ||
487 | /* | |
488 | * If in append mode, set the io offset pointer to eof. | |
489 | */ | |
490 | zfs_locked_range_t *lr; | |
491 | if (ioflag & O_APPEND) { | |
492 | /* | |
493 | * Obtain an appending range lock to guarantee file append | |
494 | * semantics. We reset the write offset once we have the lock. | |
495 | */ | |
496 | lr = zfs_rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND); | |
497 | woff = lr->lr_offset; | |
498 | if (lr->lr_length == UINT64_MAX) { | |
499 | /* | |
500 | * We overlocked the file because this write will cause | |
501 | * the file block size to increase. | |
502 | * Note that zp_size cannot change with this lock held. | |
503 | */ | |
504 | woff = zp->z_size; | |
505 | } | |
d0cd9a5c | 506 | zfs_uio_setoffset(uio, woff); |
e53d678d MM |
507 | } else { |
508 | /* | |
509 | * Note that if the file block size will change as a result of | |
510 | * this write, then this range lock will lock the entire file | |
511 | * so that we can re-write the block safely. | |
512 | */ | |
513 | lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER); | |
514 | } | |
515 | ||
67a1b037 | 516 | if (zn_rlimit_fsize_uio(zp, uio)) { |
e53d678d | 517 | zfs_rangelock_exit(lr); |
768eaced | 518 | zfs_exit(zfsvfs, FTAG); |
7e3617de | 519 | return (SET_ERROR(EFBIG)); |
e53d678d MM |
520 | } |
521 | ||
d1dd72a2 RM |
522 | const rlim64_t limit = MAXOFFSET_T; |
523 | ||
e53d678d MM |
524 | if (woff >= limit) { |
525 | zfs_rangelock_exit(lr); | |
768eaced | 526 | zfs_exit(zfsvfs, FTAG); |
e53d678d MM |
527 | return (SET_ERROR(EFBIG)); |
528 | } | |
529 | ||
d1dd72a2 | 530 | if (n > limit - woff) |
e53d678d MM |
531 | n = limit - woff; |
532 | ||
533 | uint64_t end_size = MAX(zp->z_size, woff + n); | |
534 | zilog_t *zilog = zfsvfs->z_log; | |
c3773de1 AM |
535 | boolean_t commit = (ioflag & (O_SYNC | O_DSYNC)) || |
536 | (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS); | |
e53d678d | 537 | |
eec6646e RM |
538 | const uint64_t uid = KUID_TO_SUID(ZTOUID(zp)); |
539 | const uint64_t gid = KGID_TO_SGID(ZTOGID(zp)); | |
540 | const uint64_t projid = zp->z_projid; | |
541 | ||
e53d678d MM |
542 | /* |
543 | * Write the file in reasonable size chunks. Each chunk is written | |
544 | * in a separate transaction; this keeps the intent log records small | |
545 | * and allows us to do more fine-grained space accounting. | |
546 | */ | |
547 | while (n > 0) { | |
d0cd9a5c | 548 | woff = zfs_uio_offset(uio); |
e53d678d | 549 | |
eec6646e RM |
550 | if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, uid) || |
551 | zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, gid) || | |
552 | (projid != ZFS_DEFAULT_PROJID && | |
e53d678d | 553 | zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, |
eec6646e | 554 | projid))) { |
e53d678d MM |
555 | error = SET_ERROR(EDQUOT); |
556 | break; | |
557 | } | |
558 | ||
b0cbc1aa AM |
559 | uint64_t blksz; |
560 | if (lr->lr_length == UINT64_MAX && zp->z_size <= zp->z_blksz) { | |
561 | if (zp->z_blksz > zfsvfs->z_max_blksz && | |
562 | !ISP2(zp->z_blksz)) { | |
563 | /* | |
564 | * File's blocksize is already larger than the | |
565 | * "recordsize" property. Only let it grow to | |
566 | * the next power of 2. | |
567 | */ | |
568 | blksz = 1 << highbit64(zp->z_blksz); | |
569 | } else { | |
570 | blksz = zfsvfs->z_max_blksz; | |
571 | } | |
572 | blksz = MIN(blksz, P2ROUNDUP(end_size, | |
573 | SPA_MINBLOCKSIZE)); | |
574 | blksz = MAX(blksz, zp->z_blksz); | |
575 | } else { | |
576 | blksz = zp->z_blksz; | |
577 | } | |
578 | ||
e53d678d | 579 | arc_buf_t *abuf = NULL; |
b0cbc1aa AM |
580 | ssize_t nbytes = n; |
581 | if (n >= blksz && woff >= zp->z_size && | |
582 | P2PHASE(woff, blksz) == 0 && | |
583 | (blksz >= SPA_OLD_MAXBLOCKSIZE || n < 4 * blksz)) { | |
e53d678d MM |
584 | /* |
585 | * This write covers a full block. "Borrow" a buffer | |
586 | * from the dmu so that we can fill it before we enter | |
587 | * a transaction. This avoids the possibility of | |
588 | * holding up the transaction if the data copy hangs | |
589 | * up on a pagefault (e.g., from an NFS server mapping). | |
590 | */ | |
e53d678d | 591 | abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), |
b0cbc1aa | 592 | blksz); |
e53d678d | 593 | ASSERT(abuf != NULL); |
b0cbc1aa AM |
594 | ASSERT(arc_buf_size(abuf) == blksz); |
595 | if ((error = zfs_uiocopy(abuf->b_data, blksz, | |
596 | UIO_WRITE, uio, &nbytes))) { | |
e53d678d MM |
597 | dmu_return_arcbuf(abuf); |
598 | break; | |
599 | } | |
b0cbc1aa AM |
600 | ASSERT3S(nbytes, ==, blksz); |
601 | } else { | |
602 | nbytes = MIN(n, (DMU_MAX_ACCESS >> 1) - | |
603 | P2PHASE(woff, blksz)); | |
604 | if (pfbytes < nbytes) { | |
605 | if (zfs_uio_prefaultpages(nbytes, uio)) { | |
606 | error = SET_ERROR(EFAULT); | |
607 | break; | |
608 | } | |
609 | pfbytes = nbytes; | |
610 | } | |
e53d678d MM |
611 | } |
612 | ||
613 | /* | |
614 | * Start a transaction. | |
615 | */ | |
616 | dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); | |
617 | dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); | |
618 | dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); | |
619 | DB_DNODE_ENTER(db); | |
b0cbc1aa | 620 | dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, nbytes); |
e53d678d MM |
621 | DB_DNODE_EXIT(db); |
622 | zfs_sa_upgrade_txholds(tx, zp); | |
623 | error = dmu_tx_assign(tx, TXG_WAIT); | |
624 | if (error) { | |
625 | dmu_tx_abort(tx); | |
626 | if (abuf != NULL) | |
627 | dmu_return_arcbuf(abuf); | |
628 | break; | |
629 | } | |
630 | ||
3d244b48 PJD |
631 | /* |
632 | * NB: We must call zfs_clear_setid_bits_if_necessary before | |
633 | * committing the transaction! | |
634 | */ | |
635 | ||
e53d678d MM |
636 | /* |
637 | * If rangelock_enter() over-locked we grow the blocksize | |
638 | * and then reduce the lock range. This will only happen | |
639 | * on the first iteration since rangelock_reduce() will | |
640 | * shrink down lr_length to the appropriate size. | |
641 | */ | |
642 | if (lr->lr_length == UINT64_MAX) { | |
b0cbc1aa | 643 | zfs_grow_blocksize(zp, blksz, tx); |
e53d678d MM |
644 | zfs_rangelock_reduce(lr, woff, n); |
645 | } | |
646 | ||
e53d678d MM |
647 | ssize_t tx_bytes; |
648 | if (abuf == NULL) { | |
d0cd9a5c BA |
649 | tx_bytes = zfs_uio_resid(uio); |
650 | zfs_uio_fault_disable(uio, B_TRUE); | |
e53d678d MM |
651 | error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), |
652 | uio, nbytes, tx); | |
d0cd9a5c | 653 | zfs_uio_fault_disable(uio, B_FALSE); |
e53d678d MM |
654 | #ifdef __linux__ |
655 | if (error == EFAULT) { | |
3d244b48 PJD |
656 | zfs_clear_setid_bits_if_necessary(zfsvfs, zp, |
657 | cr, &clear_setid_bits_txg, tx); | |
e53d678d MM |
658 | dmu_tx_commit(tx); |
659 | /* | |
660 | * Account for partial writes before | |
661 | * continuing the loop. | |
662 | * Update needs to occur before the next | |
d0cd9a5c | 663 | * zfs_uio_prefaultpages, or prefaultpages may |
e53d678d MM |
664 | * error, and we may break the loop early. |
665 | */ | |
b0cbc1aa AM |
666 | n -= tx_bytes - zfs_uio_resid(uio); |
667 | pfbytes -= tx_bytes - zfs_uio_resid(uio); | |
e53d678d MM |
668 | continue; |
669 | } | |
670 | #endif | |
063daa83 MJ |
671 | /* |
672 | * On FreeBSD, EFAULT should be propagated back to the | |
673 | * VFS, which will handle faulting and will retry. | |
674 | */ | |
675 | if (error != 0 && error != EFAULT) { | |
3d244b48 PJD |
676 | zfs_clear_setid_bits_if_necessary(zfsvfs, zp, |
677 | cr, &clear_setid_bits_txg, tx); | |
e53d678d MM |
678 | dmu_tx_commit(tx); |
679 | break; | |
680 | } | |
d0cd9a5c | 681 | tx_bytes -= zfs_uio_resid(uio); |
e53d678d | 682 | } else { |
e53d678d | 683 | /* |
85703f61 RM |
684 | * Thus, we're writing a full block at a block-aligned |
685 | * offset and extending the file past EOF. | |
686 | * | |
687 | * dmu_assign_arcbuf_by_dbuf() will directly assign the | |
688 | * arc buffer to a dbuf. | |
e53d678d | 689 | */ |
85703f61 RM |
690 | error = dmu_assign_arcbuf_by_dbuf( |
691 | sa_get_db(zp->z_sa_hdl), woff, abuf, tx); | |
692 | if (error != 0) { | |
3d244b48 PJD |
693 | /* |
694 | * XXX This might not be necessary if | |
695 | * dmu_assign_arcbuf_by_dbuf is guaranteed | |
696 | * to be atomic. | |
697 | */ | |
698 | zfs_clear_setid_bits_if_necessary(zfsvfs, zp, | |
699 | cr, &clear_setid_bits_txg, tx); | |
85703f61 RM |
700 | dmu_return_arcbuf(abuf); |
701 | dmu_tx_commit(tx); | |
702 | break; | |
e53d678d | 703 | } |
d0cd9a5c BA |
704 | ASSERT3S(nbytes, <=, zfs_uio_resid(uio)); |
705 | zfs_uioskip(uio, nbytes); | |
85703f61 | 706 | tx_bytes = nbytes; |
e53d678d | 707 | } |
3fc92adc BB |
708 | if (tx_bytes && |
709 | zn_has_cached_data(zp, woff, woff + tx_bytes - 1) && | |
e53d678d | 710 | !(ioflag & O_DIRECT)) { |
8a9634e2 | 711 | update_pages(zp, woff, tx_bytes, zfsvfs->z_os); |
e53d678d MM |
712 | } |
713 | ||
714 | /* | |
715 | * If we made no progress, we're done. If we made even | |
716 | * partial progress, update the znode and ZIL accordingly. | |
717 | */ | |
718 | if (tx_bytes == 0) { | |
719 | (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), | |
720 | (void *)&zp->z_size, sizeof (uint64_t), tx); | |
721 | dmu_tx_commit(tx); | |
722 | ASSERT(error != 0); | |
723 | break; | |
724 | } | |
725 | ||
3d244b48 PJD |
726 | zfs_clear_setid_bits_if_necessary(zfsvfs, zp, cr, |
727 | &clear_setid_bits_txg, tx); | |
e53d678d MM |
728 | |
729 | zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); | |
730 | ||
731 | /* | |
732 | * Update the file size (zp->z_size) if it has changed; | |
733 | * account for possible concurrent updates. | |
734 | */ | |
d0cd9a5c | 735 | while ((end_size = zp->z_size) < zfs_uio_offset(uio)) { |
e53d678d | 736 | (void) atomic_cas_64(&zp->z_size, end_size, |
d0cd9a5c | 737 | zfs_uio_offset(uio)); |
063daa83 | 738 | ASSERT(error == 0 || error == EFAULT); |
e53d678d MM |
739 | } |
740 | /* | |
741 | * If we are replaying and eof is non zero then force | |
742 | * the file size to the specified eof. Note, there's no | |
743 | * concurrency during replay. | |
744 | */ | |
745 | if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) | |
746 | zp->z_size = zfsvfs->z_replay_eof; | |
747 | ||
063daa83 MJ |
748 | error1 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); |
749 | if (error1 != 0) | |
750 | /* Avoid clobbering EFAULT. */ | |
751 | error = error1; | |
e53d678d | 752 | |
3d244b48 PJD |
753 | /* |
754 | * NB: During replay, the TX_SETATTR record logged by | |
755 | * zfs_clear_setid_bits_if_necessary must precede any of | |
756 | * the TX_WRITE records logged here. | |
757 | */ | |
c3773de1 | 758 | zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, commit, |
e53d678d | 759 | NULL, NULL); |
3d244b48 | 760 | |
e53d678d MM |
761 | dmu_tx_commit(tx); |
762 | ||
763 | if (error != 0) | |
764 | break; | |
1c2358c1 | 765 | ASSERT3S(tx_bytes, ==, nbytes); |
e53d678d | 766 | n -= nbytes; |
b0cbc1aa | 767 | pfbytes -= nbytes; |
e53d678d MM |
768 | } |
769 | ||
fc273894 | 770 | zfs_znode_update_vfs(zp); |
e53d678d MM |
771 | zfs_rangelock_exit(lr); |
772 | ||
773 | /* | |
7e3617de RM |
774 | * If we're in replay mode, or we made no progress, or the |
775 | * uio data is inaccessible return an error. Otherwise, it's | |
776 | * at least a partial write, so it's successful. | |
e53d678d | 777 | */ |
d0cd9a5c | 778 | if (zfsvfs->z_replay || zfs_uio_resid(uio) == start_resid || |
7e3617de | 779 | error == EFAULT) { |
768eaced | 780 | zfs_exit(zfsvfs, FTAG); |
e53d678d MM |
781 | return (error); |
782 | } | |
783 | ||
c3773de1 | 784 | if (commit) |
e53d678d MM |
785 | zil_commit(zilog, zp->z_id); |
786 | ||
d0cd9a5c | 787 | const int64_t nwritten = start_resid - zfs_uio_resid(uio); |
e53d678d MM |
788 | dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten); |
789 | task_io_account_write(nwritten); | |
790 | ||
768eaced | 791 | zfs_exit(zfsvfs, FTAG); |
e53d678d MM |
792 | return (0); |
793 | } | |
794 | ||
e53d678d MM |
795 | int |
796 | zfs_getsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr) | |
797 | { | |
798 | zfsvfs_t *zfsvfs = ZTOZSB(zp); | |
799 | int error; | |
800 | boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; | |
801 | ||
768eaced CC |
802 | if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) |
803 | return (error); | |
e53d678d | 804 | error = zfs_getacl(zp, vsecp, skipaclchk, cr); |
768eaced | 805 | zfs_exit(zfsvfs, FTAG); |
e53d678d MM |
806 | |
807 | return (error); | |
808 | } | |
809 | ||
e53d678d MM |
810 | int |
811 | zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr) | |
812 | { | |
813 | zfsvfs_t *zfsvfs = ZTOZSB(zp); | |
814 | int error; | |
815 | boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; | |
1a11ad9d | 816 | zilog_t *zilog; |
e53d678d | 817 | |
768eaced CC |
818 | if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) |
819 | return (error); | |
1a11ad9d | 820 | zilog = zfsvfs->z_log; |
e53d678d MM |
821 | error = zfs_setacl(zp, vsecp, skipaclchk, cr); |
822 | ||
823 | if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) | |
824 | zil_commit(zilog, 0); | |
825 | ||
768eaced | 826 | zfs_exit(zfsvfs, FTAG); |
e53d678d MM |
827 | return (error); |
828 | } | |
829 | ||
ab8c935e CS |
830 | #ifdef ZFS_DEBUG |
831 | static int zil_fault_io = 0; | |
832 | #endif | |
833 | ||
834 | static void zfs_get_done(zgd_t *zgd, int error); | |
835 | ||
/*
 * Get data to generate a TX_WRITE intent log record.
 *
 *	IN:	arg	- the zfsvfs_t of the filesystem being logged.
 *		gen	- generation number the znode is expected to have.
 *		lr	- the write log record; lr_foid, lr_offset and
 *			  lr_length identify the data.  For indirect writes
 *			  lr_blkptr (and possibly lrc_txtype) is updated.
 *		buf	- destination for an immediate write, or NULL to
 *			  request an indirect write.
 *		lwb	- log write block, stored in the zgd (must not be
 *			  NULL).
 *		zio	- parent zio handed to dmu_sync(); must not be NULL
 *			  for indirect writes.
 *
 *	RETURN:	0 on success.
 *		ENOENT if the object cannot be looked up, is unlinked, has
 *		a different generation, or the record starts at or beyond
 *		the current file size.
 *		EIO if the generation attribute cannot be read.
 *		Otherwise an error from dmu_read(), dmu_buf_hold_noread()
 *		or dmu_sync().
 *
 * The zgd allocated here, the range lock and the znode hold are released
 * by zfs_get_done().  It is called directly on every path after the zgd
 * is set up, except when dmu_sync() returns 0: in that case the function
 * returns immediately and the cleanup is left to the zfs_get_done()
 * callback registered with dmu_sync().
 */
int
zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
    struct lwb *lwb, zio_t *zio)
{
	zfsvfs_t *zfsvfs = arg;
	objset_t *os = zfsvfs->z_os;
	znode_t *zp;
	uint64_t object = lr->lr_foid;
	uint64_t offset = lr->lr_offset;
	uint64_t size = lr->lr_length;
	dmu_buf_t *db;
	zgd_t *zgd;
	int error = 0;
	uint64_t zp_gen;

	ASSERT3P(lwb, !=, NULL);
	ASSERT3U(size, !=, 0);

	/*
	 * Nothing to do if the file has been removed
	 */
	if (zfs_zget(zfsvfs, object, &zp) != 0)
		return (SET_ERROR(ENOENT));
	if (zp->z_unlinked) {
		/*
		 * Release the vnode asynchronously as we currently have the
		 * txg stopped from syncing.
		 */
		zfs_zrele_async(zp);
		return (SET_ERROR(ENOENT));
	}
	/* check if generation number matches */
	if (sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
	    sizeof (zp_gen)) != 0) {
		zfs_zrele_async(zp);
		return (SET_ERROR(EIO));
	}
	if (zp_gen != gen) {
		zfs_zrele_async(zp);
		return (SET_ERROR(ENOENT));
	}

	/* From here on, cleanup goes through zfs_get_done(). */
	zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
	zgd->zgd_lwb = lwb;
	zgd->zgd_private = zp;

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
	if (buf != NULL) { /* immediate write */
		zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
		    offset, size, RL_READER);
		/* test for truncation needs to be done while range locked */
		if (offset >= zp->z_size) {
			error = SET_ERROR(ENOENT);
		} else {
			error = dmu_read(os, object, offset, size, buf,
			    DMU_READ_NO_PREFETCH);
		}
		ASSERT(error == 0 || error == ENOENT);
	} else { /* indirect write */
		ASSERT3P(zio, !=, NULL);
		/*
		 * Have to lock the whole block to ensure when it's
		 * written out and its checksum is being calculated
		 * that no one can change the data. We need to re-check
		 * blocksize after we get the lock in case it's changed!
		 */
		for (;;) {
			uint64_t blkoff;
			size = zp->z_blksz;
			/*
			 * Align offset down to the start of its block; for a
			 * non-power-of-2 block size the block starts at 0.
			 */
			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
			offset -= blkoff;
			zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
			    offset, size, RL_READER);
			if (zp->z_blksz == size)
				break;
			/* Block size changed: undo the alignment and retry. */
			offset += blkoff;
			zfs_rangelock_exit(zgd->zgd_lr);
		}
		/* test for truncation needs to be done while range locked */
		if (lr->lr_offset >= zp->z_size)
			error = SET_ERROR(ENOENT);
#ifdef ZFS_DEBUG
		/* One-shot injected failure, see zil_fault_io. */
		if (zil_fault_io) {
			error = SET_ERROR(EIO);
			zil_fault_io = 0;
		}
#endif
		if (error == 0)
			error = dmu_buf_hold_noread(os, object, offset, zgd,
			    &db);

		if (error == 0) {
			blkptr_t *bp = &lr->lr_blkptr;

			zgd->zgd_db = db;
			zgd->zgd_bp = bp;

			ASSERT(db->db_offset == offset);
			ASSERT(db->db_size == size);

			error = dmu_sync(zio, lr->lr_common.lrc_txg,
			    zfs_get_done, zgd);
			ASSERT(error || lr->lr_length <= size);

			/*
			 * On success, we need to wait for the write I/O
			 * initiated by dmu_sync() to complete before we can
			 * release this dbuf.  We will finish everything up
			 * in the zfs_get_done() callback.
			 */
			if (error == 0)
				return (0);

			if (error == EALREADY) {
				lr->lr_common.lrc_txtype = TX_WRITE2;
				/*
				 * TX_WRITE2 relies on the data previously
				 * written by the TX_WRITE that caused
				 * EALREADY.  We zero out the BP because
				 * it is the old, currently-on-disk BP.
				 */
				zgd->zgd_bp = NULL;
				BP_ZERO(bp);
				error = 0;
			}
		}
	}

	/* Synchronous cleanup: drop the dbuf, range lock, znode and zgd. */
	zfs_get_done(zgd, error);

	return (error);
}
977 | ||
978 | ||
ab8c935e CS |
979 | static void |
980 | zfs_get_done(zgd_t *zgd, int error) | |
981 | { | |
ef70eff1 | 982 | (void) error; |
ab8c935e CS |
983 | znode_t *zp = zgd->zgd_private; |
984 | ||
985 | if (zgd->zgd_db) | |
986 | dmu_buf_rele(zgd->zgd_db, zgd); | |
987 | ||
988 | zfs_rangelock_exit(zgd->zgd_lr); | |
989 | ||
990 | /* | |
991 | * Release the vnode asynchronously as we currently have the | |
992 | * txg stopped from syncing. | |
993 | */ | |
994 | zfs_zrele_async(zp); | |
995 | ||
996 | kmem_free(zgd, sizeof (zgd_t)); | |
997 | } | |
998 | ||
67a1b037 PJD |
999 | static int |
1000 | zfs_enter_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag) | |
1001 | { | |
1002 | int error; | |
1003 | ||
1004 | /* Swap. Not sure if the order of zfs_enter()s is important. */ | |
1005 | if (zfsvfs1 > zfsvfs2) { | |
1006 | zfsvfs_t *tmpzfsvfs; | |
1007 | ||
1008 | tmpzfsvfs = zfsvfs2; | |
1009 | zfsvfs2 = zfsvfs1; | |
1010 | zfsvfs1 = tmpzfsvfs; | |
1011 | } | |
1012 | ||
1013 | error = zfs_enter(zfsvfs1, tag); | |
1014 | if (error != 0) | |
1015 | return (error); | |
1016 | if (zfsvfs1 != zfsvfs2) { | |
1017 | error = zfs_enter(zfsvfs2, tag); | |
1018 | if (error != 0) { | |
1019 | zfs_exit(zfsvfs1, tag); | |
1020 | return (error); | |
1021 | } | |
1022 | } | |
1023 | ||
1024 | return (0); | |
1025 | } | |
1026 | ||
1027 | static void | |
1028 | zfs_exit_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag) | |
1029 | { | |
1030 | ||
1031 | zfs_exit(zfsvfs1, tag); | |
1032 | if (zfsvfs1 != zfsvfs2) | |
1033 | zfs_exit(zfsvfs2, tag); | |
1034 | } | |
1035 | ||
/*
 * We split each clone request in chunks that can fit into a single ZIL
 * log entry. Each ZIL log entry can fit 130816 bytes for a block cloning
 * operation (see zil_max_log_data() and zfs_log_clone_range()). This gives
 * us room for storing 1022 block pointers.
 *
 * On success, the function returns the number of bytes copied in *lenp.
 * Note, it doesn't return how many bytes are left to be copied.
 * On errors which are caused by any file system limitations or
 * brt limitations `EINVAL` is returned. In most cases the user
 * requested bad parameters; it could be possible to clone the file but
 * some parameters don't match the requirements.
 *
 *	IN:	inzp	- source znode.
 *		outzp	- destination znode.
 *		cr	- credentials, used when clearing setid bits.
 *
 *	IN/OUT:	inoffp	- source offset; advanced by the bytes cloned.
 *		outoffp	- destination offset; advanced by the bytes cloned.
 *		lenp	- requested length in; bytes actually cloned out.
 *
 *	RETURN:	0 if at least one chunk was cloned (even if a later chunk
 *		failed), or if the source offset is at/after EOF (in which
 *		case *lenp is set to 0).  Otherwise an error code.
 */
int
zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
    uint64_t *outoffp, uint64_t *lenp, cred_t *cr)
{
	zfsvfs_t *inzfsvfs, *outzfsvfs;
	objset_t *inos, *outos;
	zfs_locked_range_t *inlr, *outlr;
	dmu_buf_impl_t *db;
	dmu_tx_t *tx;
	zilog_t *zilog;
	uint64_t inoff, outoff, len, done;
	uint64_t outsize, size;
	int error;
	int count = 0;
	sa_bulk_attr_t bulk[3];
	uint64_t mtime[2], ctime[2];
	uint64_t uid, gid, projid;
	blkptr_t *bps;
	size_t maxblocks, nbps;
	uint_t inblksz;
	uint64_t clear_setid_bits_txg = 0;
	uint64_t last_synced_txg = 0;

	inoff = *inoffp;
	outoff = *outoffp;
	len = *lenp;
	done = 0;

	inzfsvfs = ZTOZSB(inzp);
	outzfsvfs = ZTOZSB(outzp);

	/*
	 * We need to call zfs_enter() potentially on two different datasets,
	 * so we need a dedicated function for that.
	 */
	error = zfs_enter_two(inzfsvfs, outzfsvfs, FTAG);
	if (error != 0)
		return (error);

	inos = inzfsvfs->z_os;
	outos = outzfsvfs->z_os;

	/*
	 * Both source and destination have to belong to the same storage pool.
	 */
	if (dmu_objset_spa(inos) != dmu_objset_spa(outos)) {
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (SET_ERROR(EXDEV));
	}

	/*
	 * outos and inos belong to the same storage pool (checked a few
	 * lines above), so checking the feature on one of them is enough.
	 */
	if (!spa_feature_is_enabled(dmu_objset_spa(outos),
	    SPA_FEATURE_BLOCK_CLONING)) {
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (SET_ERROR(EOPNOTSUPP));
	}

	ASSERT(!outzfsvfs->z_replay);

	/*
	 * Block cloning from an unencrypted dataset into an encrypted
	 * dataset and vice versa is not supported.
	 */
	if (inos->os_encrypted != outos->os_encrypted) {
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (SET_ERROR(EXDEV));
	}

	/*
	 * Cloning across encrypted datasets is possible only if they
	 * share the same master key.
	 */
	if (inos != outos && inos->os_encrypted &&
	    !dmu_objset_crypto_key_equal(inos, outos)) {
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (SET_ERROR(EXDEV));
	}

	error = zfs_verify_zp(inzp);
	if (error == 0)
		error = zfs_verify_zp(outzp);
	if (error != 0) {
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (error);
	}

	/*
	 * We don't copy source file's flags that's why we don't allow to clone
	 * files that are in quarantine.
	 */
	if (inzp->z_pflags & ZFS_AV_QUARANTINED) {
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (SET_ERROR(EACCES));
	}

	/* Starting at or past EOF is not an error: nothing to clone. */
	if (inoff >= inzp->z_size) {
		*lenp = 0;
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (0);
	}
	/* Clamp the request to the end of the source file. */
	if (len > inzp->z_size - inoff) {
		len = inzp->z_size - inoff;
	}
	if (len == 0) {
		*lenp = 0;
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (0);
	}

	/*
	 * Callers might not be able to detect properly that we are read-only,
	 * so check it explicitly here.
	 */
	if (zfs_is_readonly(outzfsvfs)) {
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (SET_ERROR(EROFS));
	}

	/*
	 * If immutable or not appending then return EPERM.
	 * Intentionally allow ZFS_READONLY through here.
	 * See zfs_zaccess_common()
	 */
	if ((outzp->z_pflags & ZFS_IMMUTABLE) != 0) {
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (SET_ERROR(EPERM));
	}

	/*
	 * No overlapping if we are cloning within the same file.
	 */
	if (inzp == outzp) {
		if (inoff < outoff + len && outoff < inoff + len) {
			zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
			return (SET_ERROR(EINVAL));
		}
	}

	/*
	 * Maintain predictable lock order.
	 */
	if (inzp < outzp || (inzp == outzp && inoff < outoff)) {
		inlr = zfs_rangelock_enter(&inzp->z_rangelock, inoff, len,
		    RL_READER);
		outlr = zfs_rangelock_enter(&outzp->z_rangelock, outoff, len,
		    RL_WRITER);
	} else {
		outlr = zfs_rangelock_enter(&outzp->z_rangelock, outoff, len,
		    RL_WRITER);
		inlr = zfs_rangelock_enter(&inzp->z_rangelock, inoff, len,
		    RL_READER);
	}

	inblksz = inzp->z_blksz;

	/*
	 * We cannot clone into a file with different block size if we can't
	 * grow it (block size is already bigger, has more than one block, or
	 * not locked for growth).  There are other possible reasons for the
	 * grow to fail, but we cover what we can before opening transaction
	 * and the rest detect after we try to do it.
	 */
	if (inblksz < outzp->z_blksz) {
		error = SET_ERROR(EINVAL);
		goto unlock;
	}
	if (inblksz != outzp->z_blksz && (outzp->z_size > outzp->z_blksz ||
	    outlr->lr_length != UINT64_MAX)) {
		error = SET_ERROR(EINVAL);
		goto unlock;
	}

	/*
	 * Block size must be power-of-2 if destination offset != 0.
	 * There can be no multiple blocks of non-power-of-2 size.
	 */
	if (outoff != 0 && !ISP2(inblksz)) {
		error = SET_ERROR(EINVAL);
		goto unlock;
	}

	/*
	 * Offsets and len must be at block boundaries.
	 */
	if ((inoff % inblksz) != 0 || (outoff % inblksz) != 0) {
		error = SET_ERROR(EINVAL);
		goto unlock;
	}
	/*
	 * Length must be multiple of blksz, except for the end of the file.
	 */
	if ((len % inblksz) != 0 &&
	    (len < inzp->z_size - inoff || len < outzp->z_size - outoff)) {
		error = SET_ERROR(EINVAL);
		goto unlock;
	}

	/*
	 * If we are copying only one block and it is smaller than recordsize
	 * property, do not allow destination to grow beyond one block if it
	 * is not there yet.  Otherwise the destination will get stuck with
	 * that block size forever, that can be as small as 512 bytes, no
	 * matter how big the destination grows later.
	 */
	if (len <= inblksz && inblksz < outzfsvfs->z_max_blksz &&
	    outzp->z_size <= inblksz && outoff + len > inblksz) {
		error = SET_ERROR(EINVAL);
		goto unlock;
	}

	error = zn_rlimit_fsize(outoff + len);
	if (error != 0) {
		goto unlock;
	}

	if (inoff >= MAXOFFSET_T || outoff >= MAXOFFSET_T) {
		error = SET_ERROR(EFBIG);
		goto unlock;
	}

	/*
	 * Attributes written back to the destination after every chunk:
	 * mtime, ctime and the (possibly grown) file size.
	 */
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(outzfsvfs), NULL,
	    &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(outzfsvfs), NULL,
	    &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(outzfsvfs), NULL,
	    &outzp->z_size, 8);

	/* Number of block pointers that fit in one clone log record. */
	zilog = outzfsvfs->z_log;
	maxblocks = zil_max_log_data(zilog, sizeof (lr_clone_range_t)) /
	    sizeof (bps[0]);

	uid = KUID_TO_SUID(ZTOUID(outzp));
	gid = KGID_TO_SGID(ZTOGID(outzp));
	projid = outzp->z_projid;

	bps = vmem_alloc(sizeof (bps[0]) * maxblocks, KM_SLEEP);

	/*
	 * Clone the file in reasonable size chunks.  Each chunk is cloned
	 * in a separate transaction; this keeps the intent log records small
	 * and allows us to do more fine-grained space accounting.
	 */
	while (len > 0) {
		size = MIN(inblksz * maxblocks, len);

		if (zfs_id_overblockquota(outzfsvfs, DMU_USERUSED_OBJECT,
		    uid) ||
		    zfs_id_overblockquota(outzfsvfs, DMU_GROUPUSED_OBJECT,
		    gid) ||
		    (projid != ZFS_DEFAULT_PROJID &&
		    zfs_id_overblockquota(outzfsvfs, DMU_PROJECTUSED_OBJECT,
		    projid))) {
			error = SET_ERROR(EDQUOT);
			break;
		}

		nbps = maxblocks;
		/*
		 * Sampled before reading the block pointers so the EAGAIN
		 * path below can wait for the txg after this one.
		 */
		last_synced_txg = spa_last_synced_txg(dmu_objset_spa(inos));
		error = dmu_read_l0_bps(inos, inzp->z_id, inoff, size, bps,
		    &nbps);
		if (error != 0) {
			/*
			 * If we are trying to clone a block that was created
			 * in the current transaction group, the error will be
			 * EAGAIN here.  Based on zfs_bclone_wait_dirty either
			 * return a shortened range to the caller so it can
			 * fallback, or wait for the next TXG and check again.
			 */
			if (error == EAGAIN && zfs_bclone_wait_dirty) {
				txg_wait_synced(dmu_objset_pool(inos),
				    last_synced_txg + 1);
				continue;
			}

			break;
		}

		/*
		 * Start a transaction.
		 */
		tx = dmu_tx_create(outos);
		dmu_tx_hold_sa(tx, outzp->z_sa_hdl, B_FALSE);
		db = (dmu_buf_impl_t *)sa_get_db(outzp->z_sa_hdl);
		DB_DNODE_ENTER(db);
		dmu_tx_hold_clone_by_dnode(tx, DB_DNODE(db), outoff, size);
		DB_DNODE_EXIT(db);
		zfs_sa_upgrade_txholds(tx, outzp);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			dmu_tx_abort(tx);
			break;
		}

		/*
		 * Copy source znode's block size.  This is done only if the
		 * whole znode is locked (see zfs_rangelock_cb()) and only
		 * on the first iteration since zfs_rangelock_reduce() will
		 * shrink down lr_length to the appropriate size.
		 */
		if (outlr->lr_length == UINT64_MAX) {
			zfs_grow_blocksize(outzp, inblksz, tx);

			/*
			 * Block growth may fail for many reasons we can not
			 * predict here.  If it happens the cloning is doomed.
			 *
			 * NOTE(review): tx has already been assigned at this
			 * point, yet it is aborted rather than committed
			 * (compare the dmu_brt_clone() failure path below,
			 * which uses dmu_tx_commit()).  Confirm that
			 * dmu_tx_abort() is valid on an assigned tx.
			 */
			if (inblksz != outzp->z_blksz) {
				error = SET_ERROR(EINVAL);
				dmu_tx_abort(tx);
				break;
			}

			/*
			 * Round range lock up to the block boundary, so we
			 * prevent appends until we are done.
			 */
			zfs_rangelock_reduce(outlr, outoff,
			    ((len - 1) / inblksz + 1) * inblksz);
		}

		error = dmu_brt_clone(outos, outzp->z_id, outoff, size, tx,
		    bps, nbps);
		if (error != 0) {
			dmu_tx_commit(tx);
			break;
		}

		if (zn_has_cached_data(outzp, outoff, outoff + size - 1)) {
			update_pages(outzp, outoff, size, outos);
		}

		zfs_clear_setid_bits_if_necessary(outzfsvfs, outzp, cr,
		    &clear_setid_bits_txg, tx);

		zfs_tstamp_update_setup(outzp, CONTENT_MODIFIED, mtime, ctime);

		/*
		 * Update the file size (zp_size) if it has changed;
		 * account for possible concurrent updates.
		 */
		while ((outsize = outzp->z_size) < outoff + size) {
			(void) atomic_cas_64(&outzp->z_size, outsize,
			    outoff + size);
		}

		error = sa_bulk_update(outzp->z_sa_hdl, bulk, count, tx);

		/*
		 * The chunk is logged and the tx committed even if the
		 * attribute update failed; the error is checked afterwards.
		 */
		zfs_log_clone_range(zilog, tx, TX_CLONE_RANGE, outzp, outoff,
		    size, inblksz, bps, nbps);

		dmu_tx_commit(tx);

		if (error != 0)
			break;

		inoff += size;
		outoff += size;
		len -= size;
		done += size;
	}

	vmem_free(bps, sizeof (bps[0]) * maxblocks);
	zfs_znode_update_vfs(outzp);

unlock:
	zfs_rangelock_exit(outlr);
	zfs_rangelock_exit(inlr);

	if (done > 0) {
		/*
		 * If we have made at least partial progress, reset the error.
		 */
		error = 0;

		ZFS_ACCESSTIME_STAMP(inzfsvfs, inzp);

		if (outos->os_sync == ZFS_SYNC_ALWAYS) {
			zil_commit(zilog, outzp->z_id);
		}

		*inoffp += done;
		*outoffp += done;
		*lenp = done;
	} else {
		/*
		 * If we made no progress, there must be a good reason.
		 * EOF is handled explicitly above, before the loop.
		 */
		ASSERT3S(error, !=, 0);
	}

	zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);

	return (error);
}
1447 | ||
1448 | /* | |
1449 | * Usual pattern would be to call zfs_clone_range() from zfs_replay_clone(), | |
1450 | * but we cannot do that, because when replaying we don't have source znode | |
1451 | * available. This is why we need a dedicated replay function. | |
1452 | */ | |
1453 | int | |
1454 | zfs_clone_range_replay(znode_t *zp, uint64_t off, uint64_t len, uint64_t blksz, | |
1455 | const blkptr_t *bps, size_t nbps) | |
1456 | { | |
1457 | zfsvfs_t *zfsvfs; | |
1458 | dmu_buf_impl_t *db; | |
1459 | dmu_tx_t *tx; | |
1460 | int error; | |
1461 | int count = 0; | |
1462 | sa_bulk_attr_t bulk[3]; | |
1463 | uint64_t mtime[2], ctime[2]; | |
1464 | ||
1465 | ASSERT3U(off, <, MAXOFFSET_T); | |
1466 | ASSERT3U(len, >, 0); | |
1467 | ASSERT3U(nbps, >, 0); | |
1468 | ||
1469 | zfsvfs = ZTOZSB(zp); | |
1470 | ||
1471 | ASSERT(spa_feature_is_enabled(dmu_objset_spa(zfsvfs->z_os), | |
1472 | SPA_FEATURE_BLOCK_CLONING)); | |
1473 | ||
1474 | if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) | |
1475 | return (error); | |
1476 | ||
1477 | ASSERT(zfsvfs->z_replay); | |
1478 | ASSERT(!zfs_is_readonly(zfsvfs)); | |
1479 | ||
1480 | if ((off % blksz) != 0) { | |
1481 | zfs_exit(zfsvfs, FTAG); | |
1482 | return (SET_ERROR(EINVAL)); | |
1483 | } | |
1484 | ||
1485 | SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); | |
1486 | SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); | |
1487 | SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, | |
1488 | &zp->z_size, 8); | |
1489 | ||
1490 | /* | |
1491 | * Start a transaction. | |
1492 | */ | |
1493 | tx = dmu_tx_create(zfsvfs->z_os); | |
1494 | ||
1495 | dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); | |
1496 | db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); | |
1497 | DB_DNODE_ENTER(db); | |
1498 | dmu_tx_hold_clone_by_dnode(tx, DB_DNODE(db), off, len); | |
1499 | DB_DNODE_EXIT(db); | |
1500 | zfs_sa_upgrade_txholds(tx, zp); | |
1501 | error = dmu_tx_assign(tx, TXG_WAIT); | |
1502 | if (error != 0) { | |
1503 | dmu_tx_abort(tx); | |
1504 | zfs_exit(zfsvfs, FTAG); | |
1505 | return (error); | |
1506 | } | |
1507 | ||
1508 | if (zp->z_blksz < blksz) | |
1509 | zfs_grow_blocksize(zp, blksz, tx); | |
1510 | ||
a03ebd9b | 1511 | dmu_brt_clone(zfsvfs->z_os, zp->z_id, off, len, tx, bps, nbps); |
67a1b037 PJD |
1512 | |
1513 | zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); | |
1514 | ||
1515 | if (zp->z_size < off + len) | |
1516 | zp->z_size = off + len; | |
1517 | ||
1518 | error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); | |
1519 | ||
1520 | /* | |
1521 | * zil_replaying() not only check if we are replaying ZIL, but also | |
1522 | * updates the ZIL header to record replay progress. | |
1523 | */ | |
1524 | VERIFY(zil_replaying(zfsvfs->z_log, tx)); | |
1525 | ||
1526 | dmu_tx_commit(tx); | |
1527 | ||
1528 | zfs_znode_update_vfs(zp); | |
1529 | ||
1530 | zfs_exit(zfsvfs, FTAG); | |
1531 | ||
1532 | return (error); | |
1533 | } | |
1534 | ||
/*
 * Entry points published through EXPORT_SYMBOL (macro defined elsewhere).
 * zfs_access, zfs_fsync, zfs_holey, zfs_read and zfs_write are defined
 * earlier in this file.
 */
EXPORT_SYMBOL(zfs_access);
EXPORT_SYMBOL(zfs_fsync);
EXPORT_SYMBOL(zfs_holey);
EXPORT_SYMBOL(zfs_read);
EXPORT_SYMBOL(zfs_write);
EXPORT_SYMBOL(zfs_getsecattr);
EXPORT_SYMBOL(zfs_setsecattr);
EXPORT_SYMBOL(zfs_clone_range);
EXPORT_SYMBOL(zfs_clone_range_replay);

/*
 * Tunables, all declared read-write (ZMOD_RW).  The string argument is the
 * human-readable description of each parameter.
 */
ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, U64, ZMOD_RW,
	"Bytes to read per chunk");

/* NOTE(review): presumably backs zfs_bclone_enabled; not read in this part of the file. */
ZFS_MODULE_PARAM(zfs, zfs_, bclone_enabled, INT, ZMOD_RW,
	"Enable block cloning");

/*
 * Presumably backs zfs_bclone_wait_dirty, which zfs_clone_range() consults
 * when dmu_read_l0_bps() returns EAGAIN.
 */
ZFS_MODULE_PARAM(zfs, zfs_, bclone_wait_dirty, INT, ZMOD_RW,
	"Wait for dirty blocks when cloning");