/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
 * Copyright 2017 Nexenta Systems, Inc.
 */

/* Portions Copyright 2007 Jeremy Teo */
/* Portions Copyright 2010 Robert Milkowski */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/sysmacros.h>
#include <sys/vfs.h>
#include <sys/uio_impl.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/errno.h>
#include <sys/zfs_dir.h>
#include <sys/zfs_acl.h>
#include <sys/zfs_ioctl.h>
#include <sys/fs/zfs.h>
#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/dbuf.h>
#include <sys/policy.h>
#include <sys/zfs_vnops.h>
#include <sys/zfs_quota.h>
#include <sys/zfs_vfsops.h>
#include <sys/zfs_znode.h>

static ulong_t zfs_fsync_sync_cnt = 4;

int
zfs_fsync(znode_t *zp, int syncflag, cred_t *cr)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);

	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);

	if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);
		zil_commit(zfsvfs->z_log, zp->z_id);
		ZFS_EXIT(zfsvfs);
	}
	tsd_set(zfs_fsyncer_key, NULL);

	return (0);
}

#if defined(SEEK_HOLE) && defined(SEEK_DATA)
/*
 * Lseek support for finding holes (cmd == SEEK_HOLE) and
 * data (cmd == SEEK_DATA).  "off" is an in/out parameter.
 */
static int
zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off)
{
	zfs_locked_range_t *lr;
	uint64_t noff = (uint64_t)*off; /* new offset */
	uint64_t file_sz;
	int error;
	boolean_t hole;

	file_sz = zp->z_size;
	if (noff >= file_sz) {
		return (SET_ERROR(ENXIO));
	}

	if (cmd == F_SEEK_HOLE)
		hole = B_TRUE;
	else
		hole = B_FALSE;

	/* Flush any mmap()'d data to disk */
	if (zn_has_cached_data(zp))
		zn_flush_cached_data(zp, B_FALSE);

	lr = zfs_rangelock_enter(&zp->z_rangelock, 0, file_sz, RL_READER);
	error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff);
	zfs_rangelock_exit(lr);

	if (error == ESRCH)
		return (SET_ERROR(ENXIO));

	/* File was dirty, so fall back to using generic logic */
	if (error == EBUSY) {
		if (hole)
			*off = file_sz;

		return (0);
	}

	/*
	 * We could find a hole that begins after the logical end-of-file,
	 * because dmu_offset_next() only works on whole blocks.  If the
	 * EOF falls mid-block, then indicate that the "virtual hole"
	 * at the end of the file begins at the logical EOF, rather than
	 * at the end of the last block.
	 */
	if (noff > file_sz) {
		ASSERT(hole);
		noff = file_sz;
	}

	if (noff < *off)
		return (error);
	*off = noff;
	return (error);
}
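
/*
 * Wrapper around zfs_holey_common() that takes the filesystem-level
 * hold and znode checks.  A platform lseek handler would call it
 * roughly like this (illustrative sketch only):
 *
 *	loff_t off = start;
 *	error = zfs_holey(zp, F_SEEK_HOLE, &off);
 *	if (error == 0)
 *		new_offset = off;	(start of the next hole)
 */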
140 | ||
141 | int | |
142 | zfs_holey(znode_t *zp, ulong_t cmd, loff_t *off) | |
143 | { | |
144 | zfsvfs_t *zfsvfs = ZTOZSB(zp); | |
145 | int error; | |
146 | ||
147 | ZFS_ENTER(zfsvfs); | |
148 | ZFS_VERIFY_ZP(zp); | |
149 | ||
150 | error = zfs_holey_common(zp, cmd, off); | |
151 | ||
152 | ZFS_EXIT(zfsvfs); | |
153 | return (error); | |
154 | } | |
155 | #endif /* SEEK_HOLE && SEEK_DATA */ | |
156 | ||
8583540c MM |
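/*
 * Check whether the credential has the requested access to the file:
 * against the full ZFS ACL when called with ACE mask bits set, or
 * against the traditional rwx mode bits otherwise.
 */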
int
zfs_access(znode_t *zp, int mode, int flag, cred_t *cr)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	int error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if (flag & V_ACE_MASK)
		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
	else
		error = zfs_zaccess_rwx(zp, mode, flag, cr);

	ZFS_EXIT(zfsvfs);
	return (error);
}

static unsigned long zfs_vnops_read_chunk_size = 1024 * 1024; /* Tunable */

/*
 * Read bytes from specified file into supplied buffer.
 *
 *	IN:	zp	- inode of file to be read from.
 *		uio	- structure supplying read location, range info,
 *			  and return buffer.
 *		ioflag	- O_SYNC flags; used to provide FRSYNC semantics.
 *			  O_DIRECT flag; used to bypass page cache.
 *		cr	- credentials of caller.
 *
 *	OUT:	uio	- updated offset and range, buffer filled.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Side Effects:
 *	inode - atime updated if byte count > 0
 */
int
zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
{
	(void) cr;
	int error = 0;
	boolean_t frsync = B_FALSE;

	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EACCES));
	}

	/* We don't copy out anything useful for directories. */
	if (Z_ISDIR(ZTOTYPE(zp))) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EISDIR));
	}

	/*
	 * Validate file offset
	 */
	if (zfs_uio_offset(uio) < (offset_t)0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Fasttrack empty reads
	 */
	if (zfs_uio_resid(uio) == 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

#ifdef FRSYNC
	/*
	 * If we're in FRSYNC mode, sync out this znode before reading it.
	 * Only do this for non-snapshots.
	 *
	 * Some platforms do not support FRSYNC and instead map it
	 * to O_SYNC, which results in unnecessary calls to zil_commit.  We
	 * only honor FRSYNC requests on platforms which support it.
	 */
	frsync = !!(ioflag & FRSYNC);
#endif
	if (zfsvfs->z_log &&
	    (frsync || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
		zil_commit(zfsvfs->z_log, zp->z_id);

	/*
	 * Lock the range against changes.
	 */
	zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock,
	    zfs_uio_offset(uio), zfs_uio_resid(uio), RL_READER);

	/*
	 * If we are reading past end-of-file we can skip
	 * to the end; but we might still need to set atime.
	 */
	if (zfs_uio_offset(uio) >= zp->z_size) {
		error = 0;
		goto out;
	}

	ASSERT(zfs_uio_offset(uio) < zp->z_size);
#if defined(__linux__)
	ssize_t start_offset = zfs_uio_offset(uio);
#endif
	ssize_t n = MIN(zfs_uio_resid(uio), zp->z_size - zfs_uio_offset(uio));
	ssize_t start_resid = n;

	while (n > 0) {
		ssize_t nbytes = MIN(n, zfs_vnops_read_chunk_size -
		    P2PHASE(zfs_uio_offset(uio), zfs_vnops_read_chunk_size));
#ifdef UIO_NOCOPY
		if (zfs_uio_segflg(uio) == UIO_NOCOPY)
			error = mappedread_sf(zp, nbytes, uio);
		else
#endif
		if (zn_has_cached_data(zp) && !(ioflag & O_DIRECT)) {
			error = mappedread(zp, nbytes, uio);
		} else {
			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, nbytes);
		}

		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);

#if defined(__linux__)
			/*
			 * If we actually read some bytes, bubbling EFAULT
			 * up to become EAGAIN isn't what we want here...
			 *
			 * ...on Linux, at least.  On FreeBSD, doing this
			 * breaks.
			 */
			if (error == EFAULT &&
			    (zfs_uio_offset(uio) - start_offset) != 0)
				error = 0;
#endif
			break;
		}

		n -= nbytes;
	}

	int64_t nread = start_resid - n;
	dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread);
	task_io_account_read(nread);
out:
	zfs_rangelock_exit(lr);

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	ZFS_EXIT(zfsvfs);
	return (error);
}

static void
zfs_clear_setid_bits_if_necessary(zfsvfs_t *zfsvfs, znode_t *zp, cred_t *cr,
    uint64_t *clear_setid_bits_txgp, dmu_tx_t *tx)
{
	zilog_t *zilog = zfsvfs->z_log;
	const uint64_t uid = KUID_TO_SUID(ZTOUID(zp));

	ASSERT(clear_setid_bits_txgp != NULL);
	ASSERT(tx != NULL);

	/*
	 * Clear Set-UID/Set-GID bits on successful write if not
	 * privileged and at least one of the execute bits is set.
	 *
	 * It would be nice to do this after all writes have
	 * been done, but that would still expose the ISUID/ISGID
	 * to another app after the partial write is committed.
	 *
	 * Note: we don't call zfs_fuid_map_id() here because
	 * user 0 is not an ephemeral uid.
	 */
	mutex_enter(&zp->z_acl_lock);
	if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6))) != 0 &&
	    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
	    secpolicy_vnode_setid_retain(zp, cr,
	    ((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) {
		uint64_t newmode;

		zp->z_mode &= ~(S_ISUID | S_ISGID);
		newmode = zp->z_mode;
		(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
		    (void *)&newmode, sizeof (uint64_t), tx);

		mutex_exit(&zp->z_acl_lock);

		/*
		 * Make sure SUID/SGID bits will be removed when we replay the
		 * log.  If the setid bits keep coming back, don't log more
		 * than one TX_SETATTR per transaction group.
		 */
		if (*clear_setid_bits_txgp != dmu_tx_get_txg(tx)) {
			vattr_t va = {0};

			va.va_mask = ATTR_MODE;
			va.va_nodeid = zp->z_id;
			va.va_mode = newmode;
			zfs_log_setattr(zilog, tx, TX_SETATTR, zp, &va,
			    ATTR_MODE, NULL);
			*clear_setid_bits_txgp = dmu_tx_get_txg(tx);
		}
	} else {
		mutex_exit(&zp->z_acl_lock);
	}
}

/*
 * Write the bytes to a file.
 *
 *	IN:	zp	- znode of file to be written to.
 *		uio	- structure supplying write location, range info,
 *			  and data buffer.
 *		ioflag	- O_APPEND flag set if in append mode.
 *			  O_DIRECT flag; used to bypass page cache.
 *		cr	- credentials of caller.
 *
 *	OUT:	uio	- updated offset and range.
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	ip - ctime|mtime updated if byte count > 0
 */
int
zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
{
	int error = 0, error1;
	ssize_t start_resid = zfs_uio_resid(uio);
	uint64_t clear_setid_bits_txg = 0;

	/*
	 * Fasttrack empty write
	 */
	ssize_t n = start_resid;
	if (n == 0)
		return (0);

	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	sa_bulk_attr_t bulk[4];
	int count = 0;
	uint64_t mtime[2], ctime[2];
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &zp->z_size, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, 8);

	/*
	 * Callers might not be able to detect properly that we are read-only,
	 * so check it explicitly here.
	 */
	if (zfs_is_readonly(zfsvfs)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EROFS));
	}

	/*
	 * If immutable or not appending then return EPERM.
	 * Intentionally allow ZFS_READONLY through here.
	 * See zfs_zaccess_common()
	 */
	if ((zp->z_pflags & ZFS_IMMUTABLE) ||
	    ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & O_APPEND) &&
	    (zfs_uio_offset(uio) < zp->z_size))) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	/*
	 * Validate file offset
	 */
	offset_t woff = ioflag & O_APPEND ? zp->z_size : zfs_uio_offset(uio);
	if (woff < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	const uint64_t max_blksz = zfsvfs->z_max_blksz;

	/*
	 * Pre-fault the pages to ensure slow (e.g. NFS) pages
	 * don't hold up txg.
	 * Skip this if uio contains loaned arc_buf.
	 */
	if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EFAULT));
	}

	/*
	 * If in append mode, set the io offset pointer to eof.
	 */
	zfs_locked_range_t *lr;
	if (ioflag & O_APPEND) {
		/*
		 * Obtain an appending range lock to guarantee file append
		 * semantics.  We reset the write offset once we have the lock.
		 */
		lr = zfs_rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND);
		woff = lr->lr_offset;
		if (lr->lr_length == UINT64_MAX) {
			/*
			 * We overlocked the file because this write will cause
			 * the file block size to increase.
			 * Note that zp_size cannot change with this lock held.
			 */
			woff = zp->z_size;
		}
		zfs_uio_setoffset(uio, woff);
	} else {
		/*
		 * Note that if the file block size will change as a result of
		 * this write, then this range lock will lock the entire file
		 * so that we can re-write the block safely.
		 */
		lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER);
	}

	if (zn_rlimit_fsize(zp, uio)) {
		zfs_rangelock_exit(lr);
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EFBIG));
	}

	const rlim64_t limit = MAXOFFSET_T;

	if (woff >= limit) {
		zfs_rangelock_exit(lr);
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EFBIG));
	}

	if (n > limit - woff)
		n = limit - woff;

	uint64_t end_size = MAX(zp->z_size, woff + n);
	zilog_t *zilog = zfsvfs->z_log;

	const uint64_t uid = KUID_TO_SUID(ZTOUID(zp));
	const uint64_t gid = KGID_TO_SGID(ZTOGID(zp));
	const uint64_t projid = zp->z_projid;

	/*
	 * Write the file in reasonable size chunks.  Each chunk is written
	 * in a separate transaction; this keeps the intent log records small
	 * and allows us to do more fine-grained space accounting.
	 */
	while (n > 0) {
		woff = zfs_uio_offset(uio);

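		/*
		 * Stop writing if the file owner's user, group, or project
		 * block quota has been exceeded.
		 */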
		if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, uid) ||
		    zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, gid) ||
		    (projid != ZFS_DEFAULT_PROJID &&
		    zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
		    projid))) {
			error = SET_ERROR(EDQUOT);
			break;
		}

		arc_buf_t *abuf = NULL;
		if (n >= max_blksz && woff >= zp->z_size &&
		    P2PHASE(woff, max_blksz) == 0 &&
		    zp->z_blksz == max_blksz) {
			/*
			 * This write covers a full block.  "Borrow" a buffer
			 * from the dmu so that we can fill it before we enter
			 * a transaction.  This avoids the possibility of
			 * holding up the transaction if the data copy hangs
			 * up on a pagefault (e.g., from an NFS server mapping).
			 */
			size_t cbytes;

			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
			    max_blksz);
			ASSERT(abuf != NULL);
			ASSERT(arc_buf_size(abuf) == max_blksz);
			if ((error = zfs_uiocopy(abuf->b_data, max_blksz,
			    UIO_WRITE, uio, &cbytes))) {
				dmu_return_arcbuf(abuf);
				break;
			}
			ASSERT3S(cbytes, ==, max_blksz);
		}

		/*
		 * Start a transaction.
		 */
		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
		dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
		DB_DNODE_ENTER(db);
		dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff,
		    MIN(n, max_blksz));
		DB_DNODE_EXIT(db);
		zfs_sa_upgrade_txholds(tx, zp);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			if (abuf != NULL)
				dmu_return_arcbuf(abuf);
			break;
		}

		/*
		 * NB: We must call zfs_clear_setid_bits_if_necessary before
		 * committing the transaction!
		 */

		/*
		 * If rangelock_enter() over-locked we grow the blocksize
		 * and then reduce the lock range.  This will only happen
		 * on the first iteration since rangelock_reduce() will
		 * shrink down lr_length to the appropriate size.
		 */
		if (lr->lr_length == UINT64_MAX) {
			uint64_t new_blksz;

			if (zp->z_blksz > max_blksz) {
				/*
				 * File's blocksize is already larger than the
				 * "recordsize" property.  Only let it grow to
				 * the next power of 2.
				 */
				ASSERT(!ISP2(zp->z_blksz));
				new_blksz = MIN(end_size,
				    1 << highbit64(zp->z_blksz));
			} else {
				new_blksz = MIN(end_size, max_blksz);
			}
			zfs_grow_blocksize(zp, new_blksz, tx);
			zfs_rangelock_reduce(lr, woff, n);
		}

		/*
		 * XXX - should we really limit each write to z_max_blksz?
		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
		 */
		const ssize_t nbytes =
		    MIN(n, max_blksz - P2PHASE(woff, max_blksz));

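		/*
		 * Copy this chunk into the transaction: stream it in through
		 * the uio, or, for a full pre-staged block, assign the
		 * borrowed arc buffer directly.
		 */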
		ssize_t tx_bytes;
		if (abuf == NULL) {
			tx_bytes = zfs_uio_resid(uio);
			zfs_uio_fault_disable(uio, B_TRUE);
			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, nbytes, tx);
			zfs_uio_fault_disable(uio, B_FALSE);
#ifdef __linux__
			if (error == EFAULT) {
				zfs_clear_setid_bits_if_necessary(zfsvfs, zp,
				    cr, &clear_setid_bits_txg, tx);
				dmu_tx_commit(tx);
				/*
				 * Account for partial writes before
				 * continuing the loop.
				 * Update needs to occur before the next
				 * zfs_uio_prefaultpages, or prefaultpages may
				 * error, and we may break the loop early.
				 */
				if (tx_bytes != zfs_uio_resid(uio))
					n -= tx_bytes - zfs_uio_resid(uio);
				if (zfs_uio_prefaultpages(MIN(n, max_blksz),
				    uio)) {
					break;
				}
				continue;
			}
#endif
			/*
			 * On FreeBSD, EFAULT should be propagated back to the
			 * VFS, which will handle faulting and will retry.
			 */
			if (error != 0 && error != EFAULT) {
				zfs_clear_setid_bits_if_necessary(zfsvfs, zp,
				    cr, &clear_setid_bits_txg, tx);
				dmu_tx_commit(tx);
				break;
			}
			tx_bytes -= zfs_uio_resid(uio);
		} else {
			/* Implied by abuf != NULL: */
			ASSERT3S(n, >=, max_blksz);
			ASSERT0(P2PHASE(woff, max_blksz));
			/*
			 * We can simplify nbytes to MIN(n, max_blksz) since
			 * P2PHASE(woff, max_blksz) is 0, and knowing
			 * n >= max_blksz lets us simplify further:
			 */
			ASSERT3S(nbytes, ==, max_blksz);
			/*
			 * Thus, we're writing a full block at a block-aligned
			 * offset and extending the file past EOF.
			 *
			 * dmu_assign_arcbuf_by_dbuf() will directly assign the
			 * arc buffer to a dbuf.
			 */
			error = dmu_assign_arcbuf_by_dbuf(
			    sa_get_db(zp->z_sa_hdl), woff, abuf, tx);
			if (error != 0) {
				/*
				 * XXX This might not be necessary if
				 * dmu_assign_arcbuf_by_dbuf is guaranteed
				 * to be atomic.
				 */
				zfs_clear_setid_bits_if_necessary(zfsvfs, zp,
				    cr, &clear_setid_bits_txg, tx);
				dmu_return_arcbuf(abuf);
				dmu_tx_commit(tx);
				break;
			}
			ASSERT3S(nbytes, <=, zfs_uio_resid(uio));
			zfs_uioskip(uio, nbytes);
			tx_bytes = nbytes;
		}
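		/*
		 * Keep any mmap()'d pages of the file coherent with the data
		 * just written through the DMU.
		 */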
		if (tx_bytes && zn_has_cached_data(zp) &&
		    !(ioflag & O_DIRECT)) {
			update_pages(zp, woff, tx_bytes, zfsvfs->z_os);
		}

		/*
		 * If we made no progress, we're done.  If we made even
		 * partial progress, update the znode and ZIL accordingly.
		 */
		if (tx_bytes == 0) {
			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
			    (void *)&zp->z_size, sizeof (uint64_t), tx);
			dmu_tx_commit(tx);
			ASSERT(error != 0);
			break;
		}

		zfs_clear_setid_bits_if_necessary(zfsvfs, zp, cr,
		    &clear_setid_bits_txg, tx);

		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);

		/*
		 * Update the file size (zp_size) if it has changed;
		 * account for possible concurrent updates.
		 */
		while ((end_size = zp->z_size) < zfs_uio_offset(uio)) {
			(void) atomic_cas_64(&zp->z_size, end_size,
			    zfs_uio_offset(uio));
			ASSERT(error == 0 || error == EFAULT);
		}
		/*
		 * If we are replaying and eof is non-zero then force
		 * the file size to the specified eof.  Note, there's no
		 * concurrency during replay.
		 */
		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
			zp->z_size = zfsvfs->z_replay_eof;

		error1 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
		if (error1 != 0)
			/* Avoid clobbering EFAULT. */
			error = error1;

		/*
		 * NB: During replay, the TX_SETATTR record logged by
		 * zfs_clear_setid_bits_if_necessary must precede any of
		 * the TX_WRITE records logged here.
		 */
		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag,
		    NULL, NULL);

		dmu_tx_commit(tx);

		if (error != 0)
			break;
		ASSERT3S(tx_bytes, ==, nbytes);
		n -= nbytes;

		if (n > 0) {
			if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) {
				error = SET_ERROR(EFAULT);
				break;
			}
		}
	}

	zfs_znode_update_vfs(zp);
	zfs_rangelock_exit(lr);

	/*
	 * If we're in replay mode, or we made no progress, or the
	 * uio data is inaccessible, return an error.  Otherwise, it's
	 * at least a partial write, so it's successful.
	 */
	if (zfsvfs->z_replay || zfs_uio_resid(uio) == start_resid ||
	    error == EFAULT) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (ioflag & (O_SYNC | O_DSYNC) ||
	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, zp->z_id);

	const int64_t nwritten = start_resid - zfs_uio_resid(uio);
	dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten);
	task_io_account_write(nwritten);

	ZFS_EXIT(zfsvfs);
	return (0);
}

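/*
 * Retrieve a file's Access Control List into *vsecp.  Callers that pass
 * ATTR_NOACLCHECK skip the ACL-based permission check.
 */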
int
zfs_getsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	int error;
	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);
	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
	ZFS_EXIT(zfsvfs);

	return (error);
}

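/*
 * Set a file's Access Control List from *vsecp.  When the dataset uses
 * sync=always, the change is made durable immediately by committing the
 * intent log.
 */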
int
zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	int error;
	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
	zilog_t *zilog = zfsvfs->z_log;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	error = zfs_setacl(zp, vsecp, skipaclchk, cr);

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

#ifdef ZFS_DEBUG
static int zil_fault_io = 0;
#endif

static void zfs_get_done(zgd_t *zgd, int error);

/*
 * Get data to generate a TX_WRITE intent log record.
 */
int
zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
    struct lwb *lwb, zio_t *zio)
{
	zfsvfs_t *zfsvfs = arg;
	objset_t *os = zfsvfs->z_os;
	znode_t *zp;
	uint64_t object = lr->lr_foid;
	uint64_t offset = lr->lr_offset;
	uint64_t size = lr->lr_length;
	dmu_buf_t *db;
	zgd_t *zgd;
	int error = 0;
	uint64_t zp_gen;

	ASSERT3P(lwb, !=, NULL);
	ASSERT3P(zio, !=, NULL);
	ASSERT3U(size, !=, 0);

	/*
	 * Nothing to do if the file has been removed
	 */
	if (zfs_zget(zfsvfs, object, &zp) != 0)
		return (SET_ERROR(ENOENT));
	if (zp->z_unlinked) {
		/*
		 * Release the vnode asynchronously as we currently have the
		 * txg stopped from syncing.
		 */
		zfs_zrele_async(zp);
		return (SET_ERROR(ENOENT));
	}
	/*
	 * Check that the generation number matches; the object number may
	 * have been freed and reused for an unrelated new file.
	 */
	if (sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
	    sizeof (zp_gen)) != 0) {
		zfs_zrele_async(zp);
		return (SET_ERROR(EIO));
	}
	if (zp_gen != gen) {
		zfs_zrele_async(zp);
		return (SET_ERROR(ENOENT));
	}

	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
	zgd->zgd_lwb = lwb;
	zgd->zgd_private = zp;

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
	if (buf != NULL) { /* immediate write */
		zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
		    offset, size, RL_READER);
		/* test for truncation needs to be done while range locked */
		if (offset >= zp->z_size) {
			error = SET_ERROR(ENOENT);
		} else {
			error = dmu_read(os, object, offset, size, buf,
			    DMU_READ_NO_PREFETCH);
		}
		ASSERT(error == 0 || error == ENOENT);
	} else { /* indirect write */
		/*
		 * Have to lock the whole block to ensure when it's
		 * written out and its checksum is being calculated
		 * that no one can change the data.  We need to re-check
		 * blocksize after we get the lock in case it's changed!
		 */
		for (;;) {
			uint64_t blkoff;
			size = zp->z_blksz;
			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
			offset -= blkoff;
			zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
			    offset, size, RL_READER);
			if (zp->z_blksz == size)
				break;
			offset += blkoff;
			zfs_rangelock_exit(zgd->zgd_lr);
		}
		/* test for truncation needs to be done while range locked */
		if (lr->lr_offset >= zp->z_size)
			error = SET_ERROR(ENOENT);
#ifdef ZFS_DEBUG
		if (zil_fault_io) {
			error = SET_ERROR(EIO);
			zil_fault_io = 0;
		}
#endif
		if (error == 0)
			error = dmu_buf_hold(os, object, offset, zgd, &db,
			    DMU_READ_NO_PREFETCH);

		if (error == 0) {
			blkptr_t *bp = &lr->lr_blkptr;

			zgd->zgd_db = db;
			zgd->zgd_bp = bp;

			ASSERT(db->db_offset == offset);
			ASSERT(db->db_size == size);

			error = dmu_sync(zio, lr->lr_common.lrc_txg,
			    zfs_get_done, zgd);
			ASSERT(error || lr->lr_length <= size);

			/*
			 * On success, we need to wait for the write I/O
			 * initiated by dmu_sync() to complete before we can
			 * release this dbuf.  We will finish everything up
			 * in the zfs_get_done() callback.
			 */
			if (error == 0)
				return (0);

			if (error == EALREADY) {
				lr->lr_common.lrc_txtype = TX_WRITE2;
				/*
				 * TX_WRITE2 relies on the data previously
				 * written by the TX_WRITE that caused
				 * EALREADY.  We zero out the BP because
				 * it is the old, currently-on-disk BP.
				 */
				zgd->zgd_bp = NULL;
				BP_ZERO(bp);
				error = 0;
			}
		}
	}

	zfs_get_done(zgd, error);

	return (error);
}

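/*
 * Completion callback for dmu_sync(), also invoked directly on the error
 * paths of zfs_get_data() above: releases the dbuf, the range lock, and
 * the znode hold taken in zfs_get_data().
 */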
static void
zfs_get_done(zgd_t *zgd, int error)
{
	(void) error;
	znode_t *zp = zgd->zgd_private;

	if (zgd->zgd_db)
		dmu_buf_rele(zgd->zgd_db, zgd);

	zfs_rangelock_exit(zgd->zgd_lr);

	/*
	 * Release the vnode asynchronously as we currently have the
	 * txg stopped from syncing.
	 */
	zfs_zrele_async(zp);

	kmem_free(zgd, sizeof (zgd_t));
}

EXPORT_SYMBOL(zfs_access);
EXPORT_SYMBOL(zfs_fsync);
EXPORT_SYMBOL(zfs_holey);
EXPORT_SYMBOL(zfs_read);
EXPORT_SYMBOL(zfs_write);
EXPORT_SYMBOL(zfs_getsecattr);
EXPORT_SYMBOL(zfs_setsecattr);

ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, ULONG, ZMOD_RW,
	"Bytes to read per chunk");