]>
Commit | Line | Data |
---|---|---|
e53d678d MM |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
1d3ba0bf | 9 | * or https://opensource.org/licenses/CDDL-1.0. |
e53d678d MM |
10 | * See the License for the specific language governing permissions |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | ||
22 | /* | |
23 | * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. | |
24 | * Copyright (c) 2012, 2018 by Delphix. All rights reserved. | |
25 | * Copyright (c) 2015 by Chunwei Chen. All rights reserved. | |
26 | * Copyright 2017 Nexenta Systems, Inc. | |
67a1b037 | 27 | * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek |
e53d678d MM |
28 | */ |
29 | ||
30 | /* Portions Copyright 2007 Jeremy Teo */ | |
31 | /* Portions Copyright 2010 Robert Milkowski */ | |
32 | ||
33 | #include <sys/types.h> | |
34 | #include <sys/param.h> | |
35 | #include <sys/time.h> | |
36 | #include <sys/sysmacros.h> | |
37 | #include <sys/vfs.h> | |
c0801bf3 | 38 | #include <sys/uio_impl.h> |
e53d678d MM |
39 | #include <sys/file.h> |
40 | #include <sys/stat.h> | |
41 | #include <sys/kmem.h> | |
42 | #include <sys/cmn_err.h> | |
43 | #include <sys/errno.h> | |
44 | #include <sys/zfs_dir.h> | |
45 | #include <sys/zfs_acl.h> | |
46 | #include <sys/zfs_ioctl.h> | |
47 | #include <sys/fs/zfs.h> | |
48 | #include <sys/dmu.h> | |
49 | #include <sys/dmu_objset.h> | |
50 | #include <sys/spa.h> | |
51 | #include <sys/txg.h> | |
52 | #include <sys/dbuf.h> | |
53 | #include <sys/policy.h> | |
67a1b037 | 54 | #include <sys/zfeature.h> |
e53d678d MM |
55 | #include <sys/zfs_vnops.h> |
56 | #include <sys/zfs_quota.h> | |
ab8c935e CS |
57 | #include <sys/zfs_vfsops.h> |
58 | #include <sys/zfs_znode.h> | |
e53d678d MM |
59 | |
60 | ||
61 | static ulong_t zfs_fsync_sync_cnt = 4; | |
62 | ||
63 | int | |
64 | zfs_fsync(znode_t *zp, int syncflag, cred_t *cr) | |
65 | { | |
768eaced | 66 | int error = 0; |
e53d678d MM |
67 | zfsvfs_t *zfsvfs = ZTOZSB(zp); |
68 | ||
b9041e1f | 69 | (void) tsd_set(zfs_fsyncer_key, (void *)(uintptr_t)zfs_fsync_sync_cnt); |
e53d678d MM |
70 | |
71 | if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { | |
768eaced CC |
72 | if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) |
73 | goto out; | |
411f4a01 | 74 | atomic_inc_32(&zp->z_sync_writes_cnt); |
e53d678d | 75 | zil_commit(zfsvfs->z_log, zp->z_id); |
411f4a01 | 76 | atomic_dec_32(&zp->z_sync_writes_cnt); |
768eaced | 77 | zfs_exit(zfsvfs, FTAG); |
e53d678d | 78 | } |
768eaced | 79 | out: |
e53d678d MM |
80 | tsd_set(zfs_fsyncer_key, NULL); |
81 | ||
768eaced | 82 | return (error); |
e53d678d MM |
83 | } |
84 | ||
8583540c MM |
85 | |
#if defined(SEEK_HOLE) && defined(SEEK_DATA)
/*
 * Worker for lseek(SEEK_HOLE) / lseek(SEEK_DATA).
 *
 *	IN:	zp  - znode of the file being searched.
 *		cmd - F_SEEK_HOLE to look for a hole; any other value looks
 *		      for data.
 *	IN/OUT:	off - starting offset on entry; on success it may be advanced
 *		      to the offset that was found.
 *
 *	RETURN:	0 on success, ENXIO when the starting offset is at or past
 *		end-of-file or dmu_offset_next() reports ESRCH, otherwise
 *		the error from dmu_offset_next().
 */
static int
zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off)
{
	const boolean_t want_hole = (cmd == F_SEEK_HOLE) ? B_TRUE : B_FALSE;
	uint64_t found = (uint64_t)*off;
	uint64_t size = zp->z_size;
	zfs_locked_range_t *rl;
	int err;

	/* There is nothing to find at or beyond end-of-file. */
	if (found >= size)
		return (SET_ERROR(ENXIO));

	/* Push any mmap()'d data out before asking the DMU. */
	if (zn_has_cached_data(zp, 0, size - 1))
		zn_flush_cached_data(zp, B_FALSE);

	/* Search with the entire file range locked as a reader. */
	rl = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_READER);
	err = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, want_hole, &found);
	zfs_rangelock_exit(rl);

	if (err == ESRCH)
		return (SET_ERROR(ENXIO));

	/*
	 * EBUSY means the file was dirty; use the generic answer instead:
	 * the only hole reported is the one at end-of-file, and a data
	 * search leaves the offset where it was.
	 */
	if (err == EBUSY) {
		if (want_hole)
			*off = size;

		return (0);
	}

	/*
	 * dmu_offset_next() works on whole blocks, so when EOF falls in the
	 * middle of a block it can report a hole that starts past the
	 * logical end of the file.  Report that "virtual hole" as starting
	 * at the logical EOF rather than at the end of the last block.
	 */
	if (found > size) {
		ASSERT(want_hole);
		found = size;
	}

	/* Never move the caller's offset backwards. */
	if (!(found < *off))
		*off = found;

	return (err);
}

/*
 * Entry point for SEEK_HOLE / SEEK_DATA: enters the filesystem, verifies
 * the znode, and delegates to zfs_holey_common().
 *
 *	RETURN:	the error from zfs_enter_verify_zp() if it fails, otherwise
 *		the result of zfs_holey_common().
 */
int
zfs_holey(znode_t *zp, ulong_t cmd, loff_t *off)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	int err = zfs_enter_verify_zp(zfsvfs, zp, FTAG);

	if (err != 0)
		return (err);

	err = zfs_holey_common(zp, cmd, off);
	zfs_exit(zfsvfs, FTAG);

	return (err);
}
#endif /* SEEK_HOLE && SEEK_DATA */
162 | ||
8583540c MM |
163 | int |
164 | zfs_access(znode_t *zp, int mode, int flag, cred_t *cr) | |
165 | { | |
166 | zfsvfs_t *zfsvfs = ZTOZSB(zp); | |
167 | int error; | |
168 | ||
768eaced CC |
169 | if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) |
170 | return (error); | |
8583540c MM |
171 | |
172 | if (flag & V_ACE_MASK) | |
f224eddf YY |
173 | #if defined(__linux__) |
174 | error = zfs_zaccess(zp, mode, flag, B_FALSE, cr, | |
d4dc53da | 175 | zfs_init_idmap); |
f224eddf YY |
176 | #else |
177 | error = zfs_zaccess(zp, mode, flag, B_FALSE, cr, | |
178 | NULL); | |
179 | #endif | |
8583540c | 180 | else |
f224eddf | 181 | #if defined(__linux__) |
d4dc53da | 182 | error = zfs_zaccess_rwx(zp, mode, flag, cr, zfs_init_idmap); |
f224eddf | 183 | #else |
2a068a13 | 184 | error = zfs_zaccess_rwx(zp, mode, flag, cr, NULL); |
f224eddf | 185 | #endif |
8583540c | 186 | |
768eaced | 187 | zfs_exit(zfsvfs, FTAG); |
8583540c MM |
188 | return (error); |
189 | } | |
190 | ||
ab8d9c17 | 191 | static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024; /* Tunable */ |
e53d678d MM |
192 | |
193 | /* | |
194 | * Read bytes from specified file into supplied buffer. | |
195 | * | |
196 | * IN: zp - inode of file to be read from. | |
197 | * uio - structure supplying read location, range info, | |
198 | * and return buffer. | |
199 | * ioflag - O_SYNC flags; used to provide FRSYNC semantics. | |
200 | * O_DIRECT flag; used to bypass page cache. | |
201 | * cr - credentials of caller. | |
202 | * | |
203 | * OUT: uio - updated offset and range, buffer filled. | |
204 | * | |
205 | * RETURN: 0 on success, error code on failure. | |
206 | * | |
207 | * Side Effects: | |
208 | * inode - atime updated if byte count > 0 | |
209 | */ | |
e53d678d | 210 | int |
d0cd9a5c | 211 | zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) |
e53d678d | 212 | { |
ef70eff1 | 213 | (void) cr; |
e53d678d MM |
214 | int error = 0; |
215 | boolean_t frsync = B_FALSE; | |
216 | ||
217 | zfsvfs_t *zfsvfs = ZTOZSB(zp); | |
768eaced CC |
218 | if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) |
219 | return (error); | |
e53d678d MM |
220 | |
221 | if (zp->z_pflags & ZFS_AV_QUARANTINED) { | |
768eaced | 222 | zfs_exit(zfsvfs, FTAG); |
e53d678d MM |
223 | return (SET_ERROR(EACCES)); |
224 | } | |
225 | ||
226 | /* We don't copy out anything useful for directories. */ | |
227 | if (Z_ISDIR(ZTOTYPE(zp))) { | |
768eaced | 228 | zfs_exit(zfsvfs, FTAG); |
e53d678d MM |
229 | return (SET_ERROR(EISDIR)); |
230 | } | |
231 | ||
232 | /* | |
233 | * Validate file offset | |
234 | */ | |
d0cd9a5c | 235 | if (zfs_uio_offset(uio) < (offset_t)0) { |
768eaced | 236 | zfs_exit(zfsvfs, FTAG); |
e53d678d MM |
237 | return (SET_ERROR(EINVAL)); |
238 | } | |
239 | ||
240 | /* | |
241 | * Fasttrack empty reads | |
242 | */ | |
d0cd9a5c | 243 | if (zfs_uio_resid(uio) == 0) { |
768eaced | 244 | zfs_exit(zfsvfs, FTAG); |
e53d678d MM |
245 | return (0); |
246 | } | |
247 | ||
248 | #ifdef FRSYNC | |
249 | /* | |
250 | * If we're in FRSYNC mode, sync out this znode before reading it. | |
251 | * Only do this for non-snapshots. | |
252 | * | |
253 | * Some platforms do not support FRSYNC and instead map it | |
254 | * to O_SYNC, which results in unnecessary calls to zil_commit. We | |
255 | * only honor FRSYNC requests on platforms which support it. | |
256 | */ | |
257 | frsync = !!(ioflag & FRSYNC); | |
258 | #endif | |
259 | if (zfsvfs->z_log && | |
260 | (frsync || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)) | |
261 | zil_commit(zfsvfs->z_log, zp->z_id); | |
262 | ||
263 | /* | |
264 | * Lock the range against changes. | |
265 | */ | |
266 | zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock, | |
d0cd9a5c | 267 | zfs_uio_offset(uio), zfs_uio_resid(uio), RL_READER); |
e53d678d MM |
268 | |
269 | /* | |
270 | * If we are reading past end-of-file we can skip | |
271 | * to the end; but we might still need to set atime. | |
272 | */ | |
d0cd9a5c | 273 | if (zfs_uio_offset(uio) >= zp->z_size) { |
e53d678d MM |
274 | error = 0; |
275 | goto out; | |
276 | } | |
277 | ||
d0cd9a5c | 278 | ASSERT(zfs_uio_offset(uio) < zp->z_size); |
05679465 | 279 | #if defined(__linux__) |
59eab109 | 280 | ssize_t start_offset = zfs_uio_offset(uio); |
05679465 | 281 | #endif |
d0cd9a5c | 282 | ssize_t n = MIN(zfs_uio_resid(uio), zp->z_size - zfs_uio_offset(uio)); |
e53d678d MM |
283 | ssize_t start_resid = n; |
284 | ||
285 | while (n > 0) { | |
286 | ssize_t nbytes = MIN(n, zfs_vnops_read_chunk_size - | |
d0cd9a5c | 287 | P2PHASE(zfs_uio_offset(uio), zfs_vnops_read_chunk_size)); |
e53d678d | 288 | #ifdef UIO_NOCOPY |
d0cd9a5c | 289 | if (zfs_uio_segflg(uio) == UIO_NOCOPY) |
e53d678d MM |
290 | error = mappedread_sf(zp, nbytes, uio); |
291 | else | |
292 | #endif | |
3fc92adc BB |
293 | if (zn_has_cached_data(zp, zfs_uio_offset(uio), |
294 | zfs_uio_offset(uio) + nbytes - 1) && !(ioflag & O_DIRECT)) { | |
e53d678d MM |
295 | error = mappedread(zp, nbytes, uio); |
296 | } else { | |
297 | error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), | |
298 | uio, nbytes); | |
299 | } | |
300 | ||
301 | if (error) { | |
302 | /* convert checksum errors into IO errors */ | |
303 | if (error == ECKSUM) | |
304 | error = SET_ERROR(EIO); | |
05679465 RE |
305 | |
306 | #if defined(__linux__) | |
59eab109 RE |
307 | /* |
308 | * if we actually read some bytes, bubbling EFAULT | |
05679465 RE |
309 | * up to become EAGAIN isn't what we want here... |
310 | * | |
311 | * ...on Linux, at least. On FBSD, doing this breaks. | |
59eab109 RE |
312 | */ |
313 | if (error == EFAULT && | |
314 | (zfs_uio_offset(uio) - start_offset) != 0) | |
315 | error = 0; | |
05679465 | 316 | #endif |
e53d678d MM |
317 | break; |
318 | } | |
319 | ||
320 | n -= nbytes; | |
321 | } | |
322 | ||
323 | int64_t nread = start_resid - n; | |
324 | dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread); | |
325 | task_io_account_read(nread); | |
326 | out: | |
327 | zfs_rangelock_exit(lr); | |
328 | ||
329 | ZFS_ACCESSTIME_STAMP(zfsvfs, zp); | |
768eaced | 330 | zfs_exit(zfsvfs, FTAG); |
e53d678d MM |
331 | return (error); |
332 | } | |
333 | ||
3d244b48 PJD |
334 | static void |
335 | zfs_clear_setid_bits_if_necessary(zfsvfs_t *zfsvfs, znode_t *zp, cred_t *cr, | |
336 | uint64_t *clear_setid_bits_txgp, dmu_tx_t *tx) | |
337 | { | |
338 | zilog_t *zilog = zfsvfs->z_log; | |
339 | const uint64_t uid = KUID_TO_SUID(ZTOUID(zp)); | |
340 | ||
341 | ASSERT(clear_setid_bits_txgp != NULL); | |
342 | ASSERT(tx != NULL); | |
343 | ||
344 | /* | |
345 | * Clear Set-UID/Set-GID bits on successful write if not | |
346 | * privileged and at least one of the execute bits is set. | |
347 | * | |
348 | * It would be nice to do this after all writes have | |
349 | * been done, but that would still expose the ISUID/ISGID | |
350 | * to another app after the partial write is committed. | |
351 | * | |
352 | * Note: we don't call zfs_fuid_map_id() here because | |
353 | * user 0 is not an ephemeral uid. | |
354 | */ | |
355 | mutex_enter(&zp->z_acl_lock); | |
356 | if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6))) != 0 && | |
357 | (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && | |
358 | secpolicy_vnode_setid_retain(zp, cr, | |
359 | ((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) { | |
360 | uint64_t newmode; | |
361 | ||
362 | zp->z_mode &= ~(S_ISUID | S_ISGID); | |
363 | newmode = zp->z_mode; | |
364 | (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), | |
365 | (void *)&newmode, sizeof (uint64_t), tx); | |
366 | ||
367 | mutex_exit(&zp->z_acl_lock); | |
368 | ||
369 | /* | |
370 | * Make sure SUID/SGID bits will be removed when we replay the | |
371 | * log. If the setid bits are keep coming back, don't log more | |
372 | * than one TX_SETATTR per transaction group. | |
373 | */ | |
374 | if (*clear_setid_bits_txgp != dmu_tx_get_txg(tx)) { | |
861166b0 | 375 | vattr_t va = {0}; |
3d244b48 | 376 | |
4d972ab5 | 377 | va.va_mask = ATTR_MODE; |
3d244b48 PJD |
378 | va.va_nodeid = zp->z_id; |
379 | va.va_mode = newmode; | |
4d972ab5 JL |
380 | zfs_log_setattr(zilog, tx, TX_SETATTR, zp, &va, |
381 | ATTR_MODE, NULL); | |
3d244b48 PJD |
382 | *clear_setid_bits_txgp = dmu_tx_get_txg(tx); |
383 | } | |
384 | } else { | |
385 | mutex_exit(&zp->z_acl_lock); | |
386 | } | |
387 | } | |
388 | ||
e53d678d MM |
389 | /* |
390 | * Write the bytes to a file. | |
391 | * | |
392 | * IN: zp - znode of file to be written to. | |
393 | * uio - structure supplying write location, range info, | |
394 | * and data buffer. | |
395 | * ioflag - O_APPEND flag set if in append mode. | |
396 | * O_DIRECT flag; used to bypass page cache. | |
397 | * cr - credentials of caller. | |
398 | * | |
399 | * OUT: uio - updated offset and range. | |
400 | * | |
401 | * RETURN: 0 if success | |
402 | * error code if failure | |
403 | * | |
404 | * Timestamps: | |
405 | * ip - ctime|mtime updated if byte count > 0 | |
406 | */ | |
e53d678d | 407 | int |
d0cd9a5c | 408 | zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) |
e53d678d | 409 | { |
063daa83 | 410 | int error = 0, error1; |
d0cd9a5c | 411 | ssize_t start_resid = zfs_uio_resid(uio); |
3d244b48 | 412 | uint64_t clear_setid_bits_txg = 0; |
e53d678d MM |
413 | |
414 | /* | |
415 | * Fasttrack empty write | |
416 | */ | |
417 | ssize_t n = start_resid; | |
418 | if (n == 0) | |
419 | return (0); | |
420 | ||
e53d678d | 421 | zfsvfs_t *zfsvfs = ZTOZSB(zp); |
768eaced CC |
422 | if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) |
423 | return (error); | |
e53d678d MM |
424 | |
425 | sa_bulk_attr_t bulk[4]; | |
426 | int count = 0; | |
427 | uint64_t mtime[2], ctime[2]; | |
428 | SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); | |
429 | SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); | |
430 | SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, | |
431 | &zp->z_size, 8); | |
432 | SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, | |
433 | &zp->z_pflags, 8); | |
434 | ||
435 | /* | |
436 | * Callers might not be able to detect properly that we are read-only, | |
437 | * so check it explicitly here. | |
438 | */ | |
439 | if (zfs_is_readonly(zfsvfs)) { | |
768eaced | 440 | zfs_exit(zfsvfs, FTAG); |
e53d678d MM |
441 | return (SET_ERROR(EROFS)); |
442 | } | |
443 | ||
444 | /* | |
4b2e2082 RM |
445 | * If immutable or not appending then return EPERM. |
446 | * Intentionally allow ZFS_READONLY through here. | |
447 | * See zfs_zaccess_common() | |
e53d678d | 448 | */ |
4b2e2082 | 449 | if ((zp->z_pflags & ZFS_IMMUTABLE) || |
e53d678d | 450 | ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & O_APPEND) && |
d0cd9a5c | 451 | (zfs_uio_offset(uio) < zp->z_size))) { |
768eaced | 452 | zfs_exit(zfsvfs, FTAG); |
e53d678d MM |
453 | return (SET_ERROR(EPERM)); |
454 | } | |
455 | ||
456 | /* | |
457 | * Validate file offset | |
458 | */ | |
d0cd9a5c | 459 | offset_t woff = ioflag & O_APPEND ? zp->z_size : zfs_uio_offset(uio); |
e53d678d | 460 | if (woff < 0) { |
768eaced | 461 | zfs_exit(zfsvfs, FTAG); |
e53d678d MM |
462 | return (SET_ERROR(EINVAL)); |
463 | } | |
464 | ||
e53d678d MM |
465 | /* |
466 | * Pre-fault the pages to ensure slow (eg NFS) pages | |
467 | * don't hold up txg. | |
e53d678d | 468 | */ |
b0cbc1aa AM |
469 | ssize_t pfbytes = MIN(n, DMU_MAX_ACCESS >> 1); |
470 | if (zfs_uio_prefaultpages(pfbytes, uio)) { | |
768eaced | 471 | zfs_exit(zfsvfs, FTAG); |
e53d678d MM |
472 | return (SET_ERROR(EFAULT)); |
473 | } | |
474 | ||
475 | /* | |
476 | * If in append mode, set the io offset pointer to eof. | |
477 | */ | |
478 | zfs_locked_range_t *lr; | |
479 | if (ioflag & O_APPEND) { | |
480 | /* | |
481 | * Obtain an appending range lock to guarantee file append | |
482 | * semantics. We reset the write offset once we have the lock. | |
483 | */ | |
484 | lr = zfs_rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND); | |
485 | woff = lr->lr_offset; | |
486 | if (lr->lr_length == UINT64_MAX) { | |
487 | /* | |
488 | * We overlocked the file because this write will cause | |
489 | * the file block size to increase. | |
490 | * Note that zp_size cannot change with this lock held. | |
491 | */ | |
492 | woff = zp->z_size; | |
493 | } | |
d0cd9a5c | 494 | zfs_uio_setoffset(uio, woff); |
e53d678d MM |
495 | } else { |
496 | /* | |
497 | * Note that if the file block size will change as a result of | |
498 | * this write, then this range lock will lock the entire file | |
499 | * so that we can re-write the block safely. | |
500 | */ | |
501 | lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER); | |
502 | } | |
503 | ||
67a1b037 | 504 | if (zn_rlimit_fsize_uio(zp, uio)) { |
e53d678d | 505 | zfs_rangelock_exit(lr); |
768eaced | 506 | zfs_exit(zfsvfs, FTAG); |
7e3617de | 507 | return (SET_ERROR(EFBIG)); |
e53d678d MM |
508 | } |
509 | ||
d1dd72a2 RM |
510 | const rlim64_t limit = MAXOFFSET_T; |
511 | ||
e53d678d MM |
512 | if (woff >= limit) { |
513 | zfs_rangelock_exit(lr); | |
768eaced | 514 | zfs_exit(zfsvfs, FTAG); |
e53d678d MM |
515 | return (SET_ERROR(EFBIG)); |
516 | } | |
517 | ||
d1dd72a2 | 518 | if (n > limit - woff) |
e53d678d MM |
519 | n = limit - woff; |
520 | ||
521 | uint64_t end_size = MAX(zp->z_size, woff + n); | |
522 | zilog_t *zilog = zfsvfs->z_log; | |
523 | ||
eec6646e RM |
524 | const uint64_t uid = KUID_TO_SUID(ZTOUID(zp)); |
525 | const uint64_t gid = KGID_TO_SGID(ZTOGID(zp)); | |
526 | const uint64_t projid = zp->z_projid; | |
527 | ||
e53d678d MM |
528 | /* |
529 | * Write the file in reasonable size chunks. Each chunk is written | |
530 | * in a separate transaction; this keeps the intent log records small | |
531 | * and allows us to do more fine-grained space accounting. | |
532 | */ | |
533 | while (n > 0) { | |
d0cd9a5c | 534 | woff = zfs_uio_offset(uio); |
e53d678d | 535 | |
eec6646e RM |
536 | if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, uid) || |
537 | zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, gid) || | |
538 | (projid != ZFS_DEFAULT_PROJID && | |
e53d678d | 539 | zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, |
eec6646e | 540 | projid))) { |
e53d678d MM |
541 | error = SET_ERROR(EDQUOT); |
542 | break; | |
543 | } | |
544 | ||
b0cbc1aa AM |
545 | uint64_t blksz; |
546 | if (lr->lr_length == UINT64_MAX && zp->z_size <= zp->z_blksz) { | |
547 | if (zp->z_blksz > zfsvfs->z_max_blksz && | |
548 | !ISP2(zp->z_blksz)) { | |
549 | /* | |
550 | * File's blocksize is already larger than the | |
551 | * "recordsize" property. Only let it grow to | |
552 | * the next power of 2. | |
553 | */ | |
554 | blksz = 1 << highbit64(zp->z_blksz); | |
555 | } else { | |
556 | blksz = zfsvfs->z_max_blksz; | |
557 | } | |
558 | blksz = MIN(blksz, P2ROUNDUP(end_size, | |
559 | SPA_MINBLOCKSIZE)); | |
560 | blksz = MAX(blksz, zp->z_blksz); | |
561 | } else { | |
562 | blksz = zp->z_blksz; | |
563 | } | |
564 | ||
e53d678d | 565 | arc_buf_t *abuf = NULL; |
b0cbc1aa AM |
566 | ssize_t nbytes = n; |
567 | if (n >= blksz && woff >= zp->z_size && | |
568 | P2PHASE(woff, blksz) == 0 && | |
569 | (blksz >= SPA_OLD_MAXBLOCKSIZE || n < 4 * blksz)) { | |
e53d678d MM |
570 | /* |
571 | * This write covers a full block. "Borrow" a buffer | |
572 | * from the dmu so that we can fill it before we enter | |
573 | * a transaction. This avoids the possibility of | |
574 | * holding up the transaction if the data copy hangs | |
575 | * up on a pagefault (e.g., from an NFS server mapping). | |
576 | */ | |
e53d678d | 577 | abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), |
b0cbc1aa | 578 | blksz); |
e53d678d | 579 | ASSERT(abuf != NULL); |
b0cbc1aa AM |
580 | ASSERT(arc_buf_size(abuf) == blksz); |
581 | if ((error = zfs_uiocopy(abuf->b_data, blksz, | |
582 | UIO_WRITE, uio, &nbytes))) { | |
e53d678d MM |
583 | dmu_return_arcbuf(abuf); |
584 | break; | |
585 | } | |
b0cbc1aa AM |
586 | ASSERT3S(nbytes, ==, blksz); |
587 | } else { | |
588 | nbytes = MIN(n, (DMU_MAX_ACCESS >> 1) - | |
589 | P2PHASE(woff, blksz)); | |
590 | if (pfbytes < nbytes) { | |
591 | if (zfs_uio_prefaultpages(nbytes, uio)) { | |
592 | error = SET_ERROR(EFAULT); | |
593 | break; | |
594 | } | |
595 | pfbytes = nbytes; | |
596 | } | |
e53d678d MM |
597 | } |
598 | ||
599 | /* | |
600 | * Start a transaction. | |
601 | */ | |
602 | dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); | |
603 | dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); | |
604 | dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); | |
605 | DB_DNODE_ENTER(db); | |
b0cbc1aa | 606 | dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, nbytes); |
e53d678d MM |
607 | DB_DNODE_EXIT(db); |
608 | zfs_sa_upgrade_txholds(tx, zp); | |
609 | error = dmu_tx_assign(tx, TXG_WAIT); | |
610 | if (error) { | |
611 | dmu_tx_abort(tx); | |
612 | if (abuf != NULL) | |
613 | dmu_return_arcbuf(abuf); | |
614 | break; | |
615 | } | |
616 | ||
3d244b48 PJD |
617 | /* |
618 | * NB: We must call zfs_clear_setid_bits_if_necessary before | |
619 | * committing the transaction! | |
620 | */ | |
621 | ||
e53d678d MM |
622 | /* |
623 | * If rangelock_enter() over-locked we grow the blocksize | |
624 | * and then reduce the lock range. This will only happen | |
625 | * on the first iteration since rangelock_reduce() will | |
626 | * shrink down lr_length to the appropriate size. | |
627 | */ | |
628 | if (lr->lr_length == UINT64_MAX) { | |
b0cbc1aa | 629 | zfs_grow_blocksize(zp, blksz, tx); |
e53d678d MM |
630 | zfs_rangelock_reduce(lr, woff, n); |
631 | } | |
632 | ||
e53d678d MM |
633 | ssize_t tx_bytes; |
634 | if (abuf == NULL) { | |
d0cd9a5c BA |
635 | tx_bytes = zfs_uio_resid(uio); |
636 | zfs_uio_fault_disable(uio, B_TRUE); | |
e53d678d MM |
637 | error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), |
638 | uio, nbytes, tx); | |
d0cd9a5c | 639 | zfs_uio_fault_disable(uio, B_FALSE); |
e53d678d MM |
640 | #ifdef __linux__ |
641 | if (error == EFAULT) { | |
3d244b48 PJD |
642 | zfs_clear_setid_bits_if_necessary(zfsvfs, zp, |
643 | cr, &clear_setid_bits_txg, tx); | |
e53d678d MM |
644 | dmu_tx_commit(tx); |
645 | /* | |
646 | * Account for partial writes before | |
647 | * continuing the loop. | |
648 | * Update needs to occur before the next | |
d0cd9a5c | 649 | * zfs_uio_prefaultpages, or prefaultpages may |
e53d678d MM |
650 | * error, and we may break the loop early. |
651 | */ | |
b0cbc1aa AM |
652 | n -= tx_bytes - zfs_uio_resid(uio); |
653 | pfbytes -= tx_bytes - zfs_uio_resid(uio); | |
e53d678d MM |
654 | continue; |
655 | } | |
656 | #endif | |
063daa83 MJ |
657 | /* |
658 | * On FreeBSD, EFAULT should be propagated back to the | |
659 | * VFS, which will handle faulting and will retry. | |
660 | */ | |
661 | if (error != 0 && error != EFAULT) { | |
3d244b48 PJD |
662 | zfs_clear_setid_bits_if_necessary(zfsvfs, zp, |
663 | cr, &clear_setid_bits_txg, tx); | |
e53d678d MM |
664 | dmu_tx_commit(tx); |
665 | break; | |
666 | } | |
d0cd9a5c | 667 | tx_bytes -= zfs_uio_resid(uio); |
e53d678d | 668 | } else { |
e53d678d | 669 | /* |
85703f61 RM |
670 | * Thus, we're writing a full block at a block-aligned |
671 | * offset and extending the file past EOF. | |
672 | * | |
673 | * dmu_assign_arcbuf_by_dbuf() will directly assign the | |
674 | * arc buffer to a dbuf. | |
e53d678d | 675 | */ |
85703f61 RM |
676 | error = dmu_assign_arcbuf_by_dbuf( |
677 | sa_get_db(zp->z_sa_hdl), woff, abuf, tx); | |
678 | if (error != 0) { | |
3d244b48 PJD |
679 | /* |
680 | * XXX This might not be necessary if | |
681 | * dmu_assign_arcbuf_by_dbuf is guaranteed | |
682 | * to be atomic. | |
683 | */ | |
684 | zfs_clear_setid_bits_if_necessary(zfsvfs, zp, | |
685 | cr, &clear_setid_bits_txg, tx); | |
85703f61 RM |
686 | dmu_return_arcbuf(abuf); |
687 | dmu_tx_commit(tx); | |
688 | break; | |
e53d678d | 689 | } |
d0cd9a5c BA |
690 | ASSERT3S(nbytes, <=, zfs_uio_resid(uio)); |
691 | zfs_uioskip(uio, nbytes); | |
85703f61 | 692 | tx_bytes = nbytes; |
e53d678d | 693 | } |
3fc92adc BB |
694 | if (tx_bytes && |
695 | zn_has_cached_data(zp, woff, woff + tx_bytes - 1) && | |
e53d678d | 696 | !(ioflag & O_DIRECT)) { |
8a9634e2 | 697 | update_pages(zp, woff, tx_bytes, zfsvfs->z_os); |
e53d678d MM |
698 | } |
699 | ||
700 | /* | |
701 | * If we made no progress, we're done. If we made even | |
702 | * partial progress, update the znode and ZIL accordingly. | |
703 | */ | |
704 | if (tx_bytes == 0) { | |
705 | (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), | |
706 | (void *)&zp->z_size, sizeof (uint64_t), tx); | |
707 | dmu_tx_commit(tx); | |
708 | ASSERT(error != 0); | |
709 | break; | |
710 | } | |
711 | ||
3d244b48 PJD |
712 | zfs_clear_setid_bits_if_necessary(zfsvfs, zp, cr, |
713 | &clear_setid_bits_txg, tx); | |
e53d678d MM |
714 | |
715 | zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); | |
716 | ||
717 | /* | |
718 | * Update the file size (zp_size) if it has changed; | |
719 | * account for possible concurrent updates. | |
720 | */ | |
d0cd9a5c | 721 | while ((end_size = zp->z_size) < zfs_uio_offset(uio)) { |
e53d678d | 722 | (void) atomic_cas_64(&zp->z_size, end_size, |
d0cd9a5c | 723 | zfs_uio_offset(uio)); |
063daa83 | 724 | ASSERT(error == 0 || error == EFAULT); |
e53d678d MM |
725 | } |
726 | /* | |
727 | * If we are replaying and eof is non zero then force | |
728 | * the file size to the specified eof. Note, there's no | |
729 | * concurrency during replay. | |
730 | */ | |
731 | if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) | |
732 | zp->z_size = zfsvfs->z_replay_eof; | |
733 | ||
063daa83 MJ |
734 | error1 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); |
735 | if (error1 != 0) | |
736 | /* Avoid clobbering EFAULT. */ | |
737 | error = error1; | |
e53d678d | 738 | |
3d244b48 PJD |
739 | /* |
740 | * NB: During replay, the TX_SETATTR record logged by | |
741 | * zfs_clear_setid_bits_if_necessary must precede any of | |
742 | * the TX_WRITE records logged here. | |
743 | */ | |
e53d678d MM |
744 | zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag, |
745 | NULL, NULL); | |
3d244b48 | 746 | |
e53d678d MM |
747 | dmu_tx_commit(tx); |
748 | ||
749 | if (error != 0) | |
750 | break; | |
1c2358c1 | 751 | ASSERT3S(tx_bytes, ==, nbytes); |
e53d678d | 752 | n -= nbytes; |
b0cbc1aa | 753 | pfbytes -= nbytes; |
e53d678d MM |
754 | } |
755 | ||
fc273894 | 756 | zfs_znode_update_vfs(zp); |
e53d678d MM |
757 | zfs_rangelock_exit(lr); |
758 | ||
759 | /* | |
7e3617de RM |
760 | * If we're in replay mode, or we made no progress, or the |
761 | * uio data is inaccessible return an error. Otherwise, it's | |
762 | * at least a partial write, so it's successful. | |
e53d678d | 763 | */ |
d0cd9a5c | 764 | if (zfsvfs->z_replay || zfs_uio_resid(uio) == start_resid || |
7e3617de | 765 | error == EFAULT) { |
768eaced | 766 | zfs_exit(zfsvfs, FTAG); |
e53d678d MM |
767 | return (error); |
768 | } | |
769 | ||
770 | if (ioflag & (O_SYNC | O_DSYNC) || | |
771 | zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) | |
772 | zil_commit(zilog, zp->z_id); | |
773 | ||
d0cd9a5c | 774 | const int64_t nwritten = start_resid - zfs_uio_resid(uio); |
e53d678d MM |
775 | dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten); |
776 | task_io_account_write(nwritten); | |
777 | ||
768eaced | 778 | zfs_exit(zfsvfs, FTAG); |
e53d678d MM |
779 | return (0); |
780 | } | |
781 | ||
e53d678d MM |
782 | int |
783 | zfs_getsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr) | |
784 | { | |
785 | zfsvfs_t *zfsvfs = ZTOZSB(zp); | |
786 | int error; | |
787 | boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; | |
788 | ||
768eaced CC |
789 | if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) |
790 | return (error); | |
e53d678d | 791 | error = zfs_getacl(zp, vsecp, skipaclchk, cr); |
768eaced | 792 | zfs_exit(zfsvfs, FTAG); |
e53d678d MM |
793 | |
794 | return (error); | |
795 | } | |
796 | ||
e53d678d MM |
797 | int |
798 | zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr) | |
799 | { | |
800 | zfsvfs_t *zfsvfs = ZTOZSB(zp); | |
801 | int error; | |
802 | boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; | |
803 | zilog_t *zilog = zfsvfs->z_log; | |
804 | ||
768eaced CC |
805 | if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) |
806 | return (error); | |
e53d678d MM |
807 | |
808 | error = zfs_setacl(zp, vsecp, skipaclchk, cr); | |
809 | ||
810 | if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) | |
811 | zil_commit(zilog, 0); | |
812 | ||
768eaced | 813 | zfs_exit(zfsvfs, FTAG); |
e53d678d MM |
814 | return (error); |
815 | } | |
816 | ||
ab8c935e CS |
817 | #ifdef ZFS_DEBUG |
818 | static int zil_fault_io = 0; | |
819 | #endif | |
820 | ||
821 | static void zfs_get_done(zgd_t *zgd, int error); | |
822 | ||
/*
 * Get data to generate a TX_WRITE intent log record.
 *
 *	arg	- the zfsvfs_t the log record belongs to
 *	gen	- generation number the caller expects the object to have; it
 *		  is compared against the znode's SA_ZPL_GEN attribute
 *	lr	- the write record; lr_foid, lr_offset and lr_length select
 *		  the object and range, and lr_blkptr receives the block
 *		  pointer on the indirect path
 *	buf	- if non-NULL, destination for the data (immediate write);
 *		  if NULL, the block is synced via dmu_sync() (indirect write)
 *	lwb	- stored in the zgd; must not be NULL
 *	zio	- parent zio handed to dmu_sync(); must not be NULL on the
 *		  indirect path
 *
 * Returns 0 on success. Returns ENOENT when the object cannot be obtained,
 * has been unlinked, has a different generation, or the record's offset is
 * at or beyond the current file size. Returns EIO when the generation
 * attribute cannot be looked up. Other errors come from dmu_read(),
 * dmu_buf_hold_noread() or dmu_sync().
 *
 * When dmu_sync() succeeds, cleanup is deferred to the zfs_get_done()
 * callback; on every other path past the zgd allocation zfs_get_done() is
 * called directly before returning.
 */
int
zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
    struct lwb *lwb, zio_t *zio)
{
	zfsvfs_t *zfsvfs = arg;
	objset_t *os = zfsvfs->z_os;
	znode_t *zp;
	uint64_t object = lr->lr_foid;
	uint64_t offset = lr->lr_offset;
	uint64_t size = lr->lr_length;
	dmu_buf_t *db;
	zgd_t *zgd;
	int error = 0;
	uint64_t zp_gen;

	ASSERT3P(lwb, !=, NULL);
	ASSERT3U(size, !=, 0);

	/*
	 * Nothing to do if the file has been removed
	 */
	if (zfs_zget(zfsvfs, object, &zp) != 0)
		return (SET_ERROR(ENOENT));
	if (zp->z_unlinked) {
		/*
		 * Release the vnode asynchronously as we currently have the
		 * txg stopped from syncing.
		 */
		zfs_zrele_async(zp);
		return (SET_ERROR(ENOENT));
	}
	/* check if generation number matches */
	if (sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
	    sizeof (zp_gen)) != 0) {
		zfs_zrele_async(zp);
		return (SET_ERROR(EIO));
	}
	if (zp_gen != gen) {
		zfs_zrele_async(zp);
		return (SET_ERROR(ENOENT));
	}

	/*
	 * From here on the zgd owns the znode hold (zgd_private); it is
	 * dropped by zfs_get_done().
	 */
	zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
	zgd->zgd_lwb = lwb;
	zgd->zgd_private = zp;

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
	if (buf != NULL) { /* immediate write */
		zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
		    offset, size, RL_READER);
		/* test for truncation needs to be done while range locked */
		if (offset >= zp->z_size) {
			error = SET_ERROR(ENOENT);
		} else {
			error = dmu_read(os, object, offset, size, buf,
			    DMU_READ_NO_PREFETCH);
		}
		ASSERT(error == 0 || error == ENOENT);
	} else { /* indirect write */
		ASSERT3P(zio, !=, NULL);
		/*
		 * Have to lock the whole block to ensure when it's
		 * written out and its checksum is being calculated
		 * that no one can change the data. We need to re-check
		 * blocksize after we get the lock in case it's changed!
		 */
		for (;;) {
			uint64_t blkoff;
			size = zp->z_blksz;
			/*
			 * Move the lock start back to the beginning of the
			 * block; for a non-power-of-2 block size the lock
			 * starts at offset 0.
			 */
			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
			offset -= blkoff;
			zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
			    offset, size, RL_READER);
			if (zp->z_blksz == size)
				break;
			/* Block size changed: restore offset and retry. */
			offset += blkoff;
			zfs_rangelock_exit(zgd->zgd_lr);
		}
		/* test for truncation needs to be done while range locked */
		if (lr->lr_offset >= zp->z_size)
			error = SET_ERROR(ENOENT);
#ifdef ZFS_DEBUG
		/* One-shot injected failure; see zil_fault_io. */
		if (zil_fault_io) {
			error = SET_ERROR(EIO);
			zil_fault_io = 0;
		}
#endif
		if (error == 0)
			error = dmu_buf_hold_noread(os, object, offset, zgd,
			    &db);

		if (error == 0) {
			blkptr_t *bp = &lr->lr_blkptr;

			zgd->zgd_db = db;
			zgd->zgd_bp = bp;

			ASSERT(db->db_offset == offset);
			ASSERT(db->db_size == size);

			error = dmu_sync(zio, lr->lr_common.lrc_txg,
			    zfs_get_done, zgd);
			ASSERT(error || lr->lr_length <= size);

			/*
			 * On success, we need to wait for the write I/O
			 * initiated by dmu_sync() to complete before we can
			 * release this dbuf. We will finish everything up
			 * in the zfs_get_done() callback.
			 */
			if (error == 0)
				return (0);

			if (error == EALREADY) {
				lr->lr_common.lrc_txtype = TX_WRITE2;
				/*
				 * TX_WRITE2 relies on the data previously
				 * written by the TX_WRITE that caused
				 * EALREADY. We zero out the BP because
				 * it is the old, currently-on-disk BP.
				 */
				zgd->zgd_bp = NULL;
				BP_ZERO(bp);
				error = 0;
			}
		}
	}

	/*
	 * Synchronous completion: release the dbuf hold (if any), the range
	 * lock, the znode and the zgd ourselves.
	 */
	zfs_get_done(zgd, error);

	return (error);
}
964 | ||
965 | ||
ab8c935e CS |
966 | static void |
967 | zfs_get_done(zgd_t *zgd, int error) | |
968 | { | |
ef70eff1 | 969 | (void) error; |
ab8c935e CS |
970 | znode_t *zp = zgd->zgd_private; |
971 | ||
972 | if (zgd->zgd_db) | |
973 | dmu_buf_rele(zgd->zgd_db, zgd); | |
974 | ||
975 | zfs_rangelock_exit(zgd->zgd_lr); | |
976 | ||
977 | /* | |
978 | * Release the vnode asynchronously as we currently have the | |
979 | * txg stopped from syncing. | |
980 | */ | |
981 | zfs_zrele_async(zp); | |
982 | ||
983 | kmem_free(zgd, sizeof (zgd_t)); | |
984 | } | |
985 | ||
67a1b037 PJD |
986 | static int |
987 | zfs_enter_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag) | |
988 | { | |
989 | int error; | |
990 | ||
991 | /* Swap. Not sure if the order of zfs_enter()s is important. */ | |
992 | if (zfsvfs1 > zfsvfs2) { | |
993 | zfsvfs_t *tmpzfsvfs; | |
994 | ||
995 | tmpzfsvfs = zfsvfs2; | |
996 | zfsvfs2 = zfsvfs1; | |
997 | zfsvfs1 = tmpzfsvfs; | |
998 | } | |
999 | ||
1000 | error = zfs_enter(zfsvfs1, tag); | |
1001 | if (error != 0) | |
1002 | return (error); | |
1003 | if (zfsvfs1 != zfsvfs2) { | |
1004 | error = zfs_enter(zfsvfs2, tag); | |
1005 | if (error != 0) { | |
1006 | zfs_exit(zfsvfs1, tag); | |
1007 | return (error); | |
1008 | } | |
1009 | } | |
1010 | ||
1011 | return (0); | |
1012 | } | |
1013 | ||
1014 | static void | |
1015 | zfs_exit_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag) | |
1016 | { | |
1017 | ||
1018 | zfs_exit(zfsvfs1, tag); | |
1019 | if (zfsvfs1 != zfsvfs2) | |
1020 | zfs_exit(zfsvfs2, tag); | |
1021 | } | |
1022 | ||
/*
 * Clone (reference rather than copy) a range of blocks from inzp into outzp.
 *
 * We split each clone request in chunks that can fit into a single ZIL
 * log entry. Each ZIL log entry can fit 130816 bytes for a block cloning
 * operation (see zil_max_log_data() and zfs_log_clone_range()). This gives
 * us room for storing 1022 block pointers.
 *
 *	inzp, inoffp	- source znode and byte offset
 *	outzp, outoffp	- destination znode and byte offset
 *	lenp		- in: number of bytes requested; out: bytes cloned
 *	cr		- credentials, used when clearing setid bits
 *
 * On success, the function returns the number of bytes cloned in *lenp and
 * advances *inoffp and *outoffp by the same amount.
 * Note, it doesn't return how many bytes are left to be cloned.
 * If at least one chunk was cloned, a later error is dropped and 0 is
 * returned. A source offset at or past EOF yields 0 with *lenp set to 0.
 *
 * On errors which are caused by any file system limitations or
 * brt limitations `EINVAL` is returned. In most cases the user
 * requested bad parameters; it could be possible to clone the file but
 * some parameters don't match the requirements.
 *
 * Other errors returned directly by this function: EXDEV (different pools,
 * encryption mismatch, or protected blocks cloned across datasets),
 * EOPNOTSUPP (block cloning feature not enabled), EACCES (source is
 * quarantined), EROFS, EPERM (destination is immutable), EFBIG and EDQUOT.
 * Errors from the called DMU/SA routines are passed through.
 */
int
zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
    uint64_t *outoffp, uint64_t *lenp, cred_t *cr)
{
	zfsvfs_t *inzfsvfs, *outzfsvfs;
	objset_t *inos, *outos;
	zfs_locked_range_t *inlr, *outlr;
	dmu_buf_impl_t *db;
	dmu_tx_t *tx;
	zilog_t *zilog;
	uint64_t inoff, outoff, len, done;
	uint64_t outsize, size;
	int error;
	int count = 0;
	sa_bulk_attr_t bulk[3];
	uint64_t mtime[2], ctime[2];
	uint64_t uid, gid, projid;
	blkptr_t *bps;
	size_t maxblocks, nbps;
	uint_t inblksz;
	uint64_t clear_setid_bits_txg = 0;

	inoff = *inoffp;
	outoff = *outoffp;
	len = *lenp;
	done = 0;

	inzfsvfs = ZTOZSB(inzp);
	outzfsvfs = ZTOZSB(outzp);

	/*
	 * We need to call zfs_enter() potentially on two different datasets,
	 * so we need a dedicated function for that.
	 */
	error = zfs_enter_two(inzfsvfs, outzfsvfs, FTAG);
	if (error != 0)
		return (error);

	inos = inzfsvfs->z_os;
	outos = outzfsvfs->z_os;

	/*
	 * Both source and destination have to belong to the same storage pool.
	 */
	if (dmu_objset_spa(inos) != dmu_objset_spa(outos)) {
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (SET_ERROR(EXDEV));
	}

	/*
	 * outos and inos belong to the same storage pool (checked a few
	 * lines above), so checking the feature on one of them is enough.
	 */
	if (!spa_feature_is_enabled(dmu_objset_spa(outos),
	    SPA_FEATURE_BLOCK_CLONING)) {
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (SET_ERROR(EOPNOTSUPP));
	}

	ASSERT(!outzfsvfs->z_replay);

	/*
	 * Block cloning from an unencrypted dataset into an encrypted
	 * dataset and vice versa is not supported.
	 */
	if (inos->os_encrypted != outos->os_encrypted) {
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (SET_ERROR(EXDEV));
	}

	error = zfs_verify_zp(inzp);
	if (error == 0)
		error = zfs_verify_zp(outzp);
	if (error != 0) {
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (error);
	}

	/*
	 * We don't copy source file's flags that's why we don't allow to clone
	 * files that are in quarantine.
	 */
	if (inzp->z_pflags & ZFS_AV_QUARANTINED) {
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (SET_ERROR(EACCES));
	}

	/* Starting at or past EOF of the source: nothing to clone. */
	if (inoff >= inzp->z_size) {
		*lenp = 0;
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (0);
	}
	/* Clamp the request to the end of the source file. */
	if (len > inzp->z_size - inoff) {
		len = inzp->z_size - inoff;
	}
	if (len == 0) {
		*lenp = 0;
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (0);
	}

	/*
	 * Callers might not be able to detect properly that we are read-only,
	 * so check it explicitly here.
	 */
	if (zfs_is_readonly(outzfsvfs)) {
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (SET_ERROR(EROFS));
	}

	/*
	 * If immutable or not appending then return EPERM.
	 * Intentionally allow ZFS_READONLY through here.
	 * See zfs_zaccess_common()
	 */
	if ((outzp->z_pflags & ZFS_IMMUTABLE) != 0) {
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (SET_ERROR(EPERM));
	}

	/*
	 * No overlapping if we are cloning within the same file.
	 */
	if (inzp == outzp) {
		if (inoff < outoff + len && outoff < inoff + len) {
			zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
			return (SET_ERROR(EINVAL));
		}
	}

	/*
	 * Maintain predictable lock order.
	 */
	if (inzp < outzp || (inzp == outzp && inoff < outoff)) {
		inlr = zfs_rangelock_enter(&inzp->z_rangelock, inoff, len,
		    RL_READER);
		outlr = zfs_rangelock_enter(&outzp->z_rangelock, outoff, len,
		    RL_WRITER);
	} else {
		outlr = zfs_rangelock_enter(&outzp->z_rangelock, outoff, len,
		    RL_WRITER);
		inlr = zfs_rangelock_enter(&inzp->z_rangelock, inoff, len,
		    RL_READER);
	}

	inblksz = inzp->z_blksz;

	/*
	 * We cannot clone into files with different block size if we can't
	 * grow it (block size is already bigger or more than one block).
	 */
	if (inblksz != outzp->z_blksz && (outzp->z_size > outzp->z_blksz ||
	    outzp->z_size > inblksz)) {
		error = SET_ERROR(EINVAL);
		goto unlock;
	}

	/*
	 * Block size must be power-of-2 if destination offset != 0.
	 * There can be no multiple blocks of non-power-of-2 size.
	 */
	if (outoff != 0 && !ISP2(inblksz)) {
		error = SET_ERROR(EINVAL);
		goto unlock;
	}

	/*
	 * Offsets and len must be at block boundaries.
	 */
	if ((inoff % inblksz) != 0 || (outoff % inblksz) != 0) {
		error = SET_ERROR(EINVAL);
		goto unlock;
	}
	/*
	 * Length must be multiple of blksz, except for the end of the file.
	 */
	if ((len % inblksz) != 0 &&
	    (len < inzp->z_size - inoff || len < outzp->z_size - outoff)) {
		error = SET_ERROR(EINVAL);
		goto unlock;
	}

	/*
	 * If we are copying only one block and it is smaller than recordsize
	 * property, do not allow destination to grow beyond one block if it
	 * is not there yet. Otherwise the destination will get stuck with
	 * that block size forever, that can be as small as 512 bytes, no
	 * matter how big the destination grow later.
	 */
	if (len <= inblksz && inblksz < outzfsvfs->z_max_blksz &&
	    outzp->z_size <= inblksz && outoff + len > inblksz) {
		error = SET_ERROR(EINVAL);
		goto unlock;
	}

	error = zn_rlimit_fsize(outoff + len);
	if (error != 0) {
		goto unlock;
	}

	if (inoff >= MAXOFFSET_T || outoff >= MAXOFFSET_T) {
		error = SET_ERROR(EFBIG);
		goto unlock;
	}

	/*
	 * Attributes rewritten on the destination after every chunk:
	 * mtime, ctime and the file size.
	 */
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(outzfsvfs), NULL,
	    &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(outzfsvfs), NULL,
	    &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(outzfsvfs), NULL,
	    &outzp->z_size, 8);

	/* Number of block pointers that fit into one clone log record. */
	zilog = outzfsvfs->z_log;
	maxblocks = zil_max_log_data(zilog, sizeof (lr_clone_range_t)) /
	    sizeof (bps[0]);

	uid = KUID_TO_SUID(ZTOUID(outzp));
	gid = KGID_TO_SGID(ZTOGID(outzp));
	projid = outzp->z_projid;

	bps = vmem_alloc(sizeof (bps[0]) * maxblocks, KM_SLEEP);

	/*
	 * Clone the file in reasonable size chunks. Each chunk is cloned
	 * in a separate transaction; this keeps the intent log records small
	 * and allows us to do more fine-grained space accounting.
	 */
	while (len > 0) {
		size = MIN(inblksz * maxblocks, len);

		if (zfs_id_overblockquota(outzfsvfs, DMU_USERUSED_OBJECT,
		    uid) ||
		    zfs_id_overblockquota(outzfsvfs, DMU_GROUPUSED_OBJECT,
		    gid) ||
		    (projid != ZFS_DEFAULT_PROJID &&
		    zfs_id_overblockquota(outzfsvfs, DMU_PROJECTUSED_OBJECT,
		    projid))) {
			error = SET_ERROR(EDQUOT);
			break;
		}

		/* In: capacity of bps[]; out: number of BPs actually read. */
		nbps = maxblocks;
		error = dmu_read_l0_bps(inos, inzp->z_id, inoff, size, bps,
		    &nbps);
		if (error != 0) {
			/*
			 * If we are trying to clone a block that was created
			 * in the current transaction group, error will be
			 * EAGAIN here, which we can just return to the caller
			 * so it can fallback if it likes.
			 */
			break;
		}
		/*
		 * Encrypted data is fine as long as it comes from the same
		 * dataset.
		 * TODO: We want to extend it in the future to allow cloning to
		 * datasets with the same keys, like clones or to be able to
		 * clone a file from a snapshot of an encrypted dataset into the
		 * dataset itself.
		 */
		if (BP_IS_PROTECTED(&bps[0])) {
			if (inzfsvfs != outzfsvfs) {
				error = SET_ERROR(EXDEV);
				break;
			}
		}

		/*
		 * Start a transaction.
		 */
		tx = dmu_tx_create(outos);
		dmu_tx_hold_sa(tx, outzp->z_sa_hdl, B_FALSE);
		db = (dmu_buf_impl_t *)sa_get_db(outzp->z_sa_hdl);
		DB_DNODE_ENTER(db);
		dmu_tx_hold_clone_by_dnode(tx, DB_DNODE(db), outoff, size);
		DB_DNODE_EXIT(db);
		zfs_sa_upgrade_txholds(tx, outzp);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			dmu_tx_abort(tx);
			break;
		}

		/*
		 * Copy source znode's block size. This only happens on the
		 * first iteration since zfs_rangelock_reduce() will shrink down
		 * lr_len to the appropriate size.
		 */
		if (outlr->lr_length == UINT64_MAX) {
			zfs_grow_blocksize(outzp, inblksz, tx);
			/*
			 * Round range lock up to the block boundary, so we
			 * prevent appends until we are done.
			 */
			zfs_rangelock_reduce(outlr, outoff,
			    ((len - 1) / inblksz + 1) * inblksz);
		}

		error = dmu_brt_clone(outos, outzp->z_id, outoff, size, tx,
		    bps, nbps);
		if (error != 0) {
			/* The tx is already assigned, so commit, not abort. */
			dmu_tx_commit(tx);
			break;
		}

		zfs_clear_setid_bits_if_necessary(outzfsvfs, outzp, cr,
		    &clear_setid_bits_txg, tx);

		zfs_tstamp_update_setup(outzp, CONTENT_MODIFIED, mtime, ctime);

		/*
		 * Update the file size (zp_size) if it has changed;
		 * account for possible concurrent updates.
		 */
		while ((outsize = outzp->z_size) < outoff + size) {
			(void) atomic_cas_64(&outzp->z_size, outsize,
			    outoff + size);
		}

		error = sa_bulk_update(outzp->z_sa_hdl, bulk, count, tx);

		/*
		 * The chunk is logged and the tx committed even when
		 * sa_bulk_update() failed; the error is acted on only after
		 * the commit below.
		 */
		zfs_log_clone_range(zilog, tx, TX_CLONE_RANGE, outzp, outoff,
		    size, inblksz, bps, nbps);

		dmu_tx_commit(tx);

		if (error != 0)
			break;

		inoff += size;
		outoff += size;
		len -= size;
		done += size;
	}

	vmem_free(bps, sizeof (bps[0]) * maxblocks);
	zfs_znode_update_vfs(outzp);

unlock:
	zfs_rangelock_exit(outlr);
	zfs_rangelock_exit(inlr);

	if (done > 0) {
		/*
		 * If we have made at least partial progress, reset the error.
		 */
		error = 0;

		ZFS_ACCESSTIME_STAMP(inzfsvfs, inzp);

		if (outos->os_sync == ZFS_SYNC_ALWAYS) {
			zil_commit(zilog, outzp->z_id);
		}

		/* Report progress back through the in/out parameters. */
		*inoffp += done;
		*outoffp += done;
		*lenp = done;
	} else {
		/*
		 * If we made no progress, there must be a good reason.
		 * EOF is handled explicitly above, before the loop.
		 */
		ASSERT3S(error, !=, 0);
	}

	zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);

	return (error);
}
1406 | ||
/*
 * Usual pattern would be to call zfs_clone_range() from zfs_replay_clone(),
 * but we cannot do that, because when replaying we don't have source znode
 * available. This is why we need a dedicated replay function.
 *
 *	zp	- destination znode
 *	off	- destination byte offset; must be a multiple of blksz
 *	len	- number of bytes covered by the cloned blocks
 *	blksz	- block size recorded with the clone; the destination block
 *		  size is grown to it when currently smaller
 *	bps	- block pointers to clone into the destination
 *	nbps	- number of entries in bps
 *
 * Returns 0 on success, EINVAL when off is not aligned to blksz, or the
 * error from zfs_enter_verify_zp(), dmu_tx_assign() or sa_bulk_update().
 */
int
zfs_clone_range_replay(znode_t *zp, uint64_t off, uint64_t len, uint64_t blksz,
    const blkptr_t *bps, size_t nbps)
{
	zfsvfs_t *zfsvfs;
	dmu_buf_impl_t *db;
	dmu_tx_t *tx;
	int error;
	int count = 0;
	sa_bulk_attr_t bulk[3];
	uint64_t mtime[2], ctime[2];

	ASSERT3U(off, <, MAXOFFSET_T);
	ASSERT3U(len, >, 0);
	ASSERT3U(nbps, >, 0);

	zfsvfs = ZTOZSB(zp);

	ASSERT(spa_feature_is_enabled(dmu_objset_spa(zfsvfs->z_os),
	    SPA_FEATURE_BLOCK_CLONING));

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	ASSERT(zfsvfs->z_replay);
	ASSERT(!zfs_is_readonly(zfsvfs));

	if ((off % blksz) != 0) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EINVAL));
	}

	/* Attributes rewritten after the clone: mtime, ctime and size. */
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &zp->z_size, 8);

	/*
	 * Start a transaction.
	 */
	tx = dmu_tx_create(zfsvfs->z_os);

	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
	DB_DNODE_ENTER(db);
	dmu_tx_hold_clone_by_dnode(tx, DB_DNODE(db), off, len);
	DB_DNODE_EXIT(db);
	zfs_sa_upgrade_txholds(tx, zp);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error != 0) {
		dmu_tx_abort(tx);
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	if (zp->z_blksz < blksz)
		zfs_grow_blocksize(zp, blksz, tx);

	/*
	 * NOTE(review): the return value of dmu_brt_clone() is discarded
	 * here, while zfs_clone_range() checks it and stops on failure.
	 * Confirm that ignoring a failure during replay is intentional.
	 */
	dmu_brt_clone(zfsvfs->z_os, zp->z_id, off, len, tx, bps, nbps);

	zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);

	if (zp->z_size < off + len)
		zp->z_size = off + len;

	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);

	/*
	 * zil_replaying() not only checks if we are replaying ZIL, but also
	 * updates the ZIL header to record replay progress.
	 */
	VERIFY(zil_replaying(zfsvfs->z_log, tx));

	dmu_tx_commit(tx);

	zfs_znode_update_vfs(zp);

	zfs_exit(zfsvfs, FTAG);

	return (error);
}
1493 | ||
/* Symbols exported with EXPORT_SYMBOL(). */
EXPORT_SYMBOL(zfs_access);
EXPORT_SYMBOL(zfs_fsync);
EXPORT_SYMBOL(zfs_holey);
EXPORT_SYMBOL(zfs_read);
EXPORT_SYMBOL(zfs_write);
EXPORT_SYMBOL(zfs_getsecattr);
EXPORT_SYMBOL(zfs_setsecattr);
EXPORT_SYMBOL(zfs_clone_range);
EXPORT_SYMBOL(zfs_clone_range_replay);

/* Module parameter declared with the ZMOD_RW flag. */
ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, U64, ZMOD_RW,
	"Bytes to read per chunk");