/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/* Portions Copyright 2007 Jeremy Teo */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/kmem.h>
#include <sys/taskq.h>
#include <sys/uio.h>
#include <sys/vmsystm.h>
#include <sys/atomic.h>
#include <sys/vm.h>
#include <vm/seg_vn.h>
#include <vm/pvn.h>
#include <vm/as.h>
#include <vm/kpm.h>
#include <vm/seg_kpm.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/cmn_err.h>
#include <sys/errno.h>
#include <sys/unistd.h>
#include <sys/zfs_dir.h>
#include <sys/zfs_acl.h>
#include <sys/zfs_ioctl.h>
#include <sys/fs/zfs.h>
#include <sys/dmu.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/sunddi.h>
#include <sys/filio.h>
#include <sys/sid.h>
#include "fs/fs_subr.h"
#include <sys/zfs_ctldir.h>
#include <sys/zfs_fuid.h>
#include <sys/dnlc.h>
#include <sys/zfs_rlock.h>
#include <sys/extdirent.h>
#include <sys/kidmap.h>
#include <sys/cred_impl.h>
#include <sys/attr.h>

/*
 * Programming rules.
 *
 * Each vnode op performs some logical unit of work. To do this, the ZPL must
 * properly lock its in-core state, create a DMU transaction, do the work,
 * record this work in the intent log (ZIL), commit the DMU transaction,
 * and wait for the intent log to commit if it is a synchronous operation.
 * Moreover, the vnode ops must work in both normal and log replay context.
 * The ordering of events is important to avoid deadlocks and references
 * to freed memory. The example below illustrates the following Big Rules:
 *
 *  (1)	A check must be made in each zfs thread for a mounted file system.
 *	This is done, while avoiding races, by using ZFS_ENTER(zfsvfs).
 *	A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes
 *	must be checked with ZFS_VERIFY_ZP(zp). Both of these macros
 *	can return EIO from the calling function.
 *
 *  (2)	VN_RELE() should always be the last thing except for zil_commit()
 *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
 *	First, if it's the last reference, the vnode/znode
 *	can be freed, so the zp may point to freed memory. Second, the last
 *	reference will call zfs_zinactive(), which may induce a lot of work --
 *	pushing cached pages (which acquires range locks) and syncing out
 *	cached atime changes. Third, zfs_zinactive() may require a new tx,
 *	which could deadlock the system if you were already holding one.
 *
 *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
 *	as they can span dmu_tx_assign() calls.
 *
 *  (4)	Always pass TXG_NOWAIT as the second argument to dmu_tx_assign().
 *	This is critical because we don't want to block while holding locks.
 *	Note, in particular, that if a lock is sometimes acquired before
 *	the tx assigns, and sometimes after (e.g. z_lock), then failing to
 *	use a non-blocking assign can deadlock the system. The scenario:
 *
 *	Thread A has grabbed a lock before calling dmu_tx_assign().
 *	Thread B is in an already-assigned tx, and blocks for this lock.
 *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
 *	forever, because the previous txg can't quiesce until B's tx commits.
 *
 *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
 *	then drop all locks, call dmu_tx_wait(), and try again.
 *
 *  (5)	If the operation succeeded, generate the intent log entry for it
 *	before dropping locks. This ensures that the ordering of events
 *	in the intent log matches the order in which they actually occurred.
 *	During ZIL replay the zfs_log_* functions will update the sequence
 *	number to indicate the zil transaction has replayed.
 *
 *  (6)	At the end of each vnode op, the DMU tx must always commit,
 *	regardless of whether there were any errors.
 *
 *  (7)	After dropping all locks, invoke zil_commit(zilog, seq, foid)
 *	to ensure that synchronous semantics are provided when necessary.
 *
 * In general, this is how things should be ordered in each vnode op:
 *
 *	ZFS_ENTER(zfsvfs);		// exit if unmounted
 * top:
 *	zfs_dirent_lock(&dl, ...)	// lock directory entry (may VN_HOLD())
 *	rw_enter(...);			// grab any other locks you need
 *	tx = dmu_tx_create(...);	// get DMU tx
 *	dmu_tx_hold_*();		// hold each object you might modify
 *	error = dmu_tx_assign(tx, TXG_NOWAIT);	// try to assign
 *	if (error) {
 *		rw_exit(...);		// drop locks
 *		zfs_dirent_unlock(dl);	// unlock directory entry
 *		VN_RELE(...);		// release held vnodes
 *		if (error == ERESTART) {
 *			dmu_tx_wait(tx);
 *			dmu_tx_abort(tx);
 *			goto top;
 *		}
 *		dmu_tx_abort(tx);	// abort DMU tx
 *		ZFS_EXIT(zfsvfs);	// finished in zfs
 *		return (error);		// really out of space
 *	}
 *	error = do_real_work();		// do whatever this VOP does
 *	if (error == 0)
 *		zfs_log_*(...);		// on success, make ZIL entry
 *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
 *	rw_exit(...);			// drop locks
 *	zfs_dirent_unlock(dl);		// unlock directory entry
 *	VN_RELE(...);			// release held vnodes
 *	zil_commit(zilog, seq, foid);	// synchronous when necessary
 *	ZFS_EXIT(zfsvfs);		// finished in zfs
 *	return (error);			// done, report error
 */

/* ARGSUSED */
static int
zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(*vpp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if ((flag & FWRITE) && (zp->z_phys->zp_flags & ZFS_APPENDONLY) &&
	    ((flag & FAPPEND) == 0)) {
		ZFS_EXIT(zfsvfs);
		return (EPERM);
	}

	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) &&
	    zp->z_phys->zp_size > 0) {
		if (fs_vscan(*vpp, cr, 0) != 0) {
			ZFS_EXIT(zfsvfs);
			return (EACCES);
		}
	}

	/* Keep a count of the synchronous opens in the znode */
	if (flag & (FSYNC | FDSYNC))
		atomic_inc_32(&zp->z_sync_cnt);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/* ARGSUSED */
static int
zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
    caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Decrement the synchronous opens in the znode */
	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
		atomic_dec_32(&zp->z_sync_cnt);

	/*
	 * Clean up any locks held by this process on the vp.
	 */
	cleanlocks(vp, ddi_get_pid(), 0);
	cleanshares(vp, ddi_get_pid());

	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) &&
	    zp->z_phys->zp_size > 0)
		VERIFY(fs_vscan(vp, cr, 1) == 0);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
 */
static int
zfs_holey(vnode_t *vp, int cmd, offset_t *off)
{
	znode_t	*zp = VTOZ(vp);
	uint64_t noff = (uint64_t)*off; /* new offset */
	uint64_t file_sz;
	int error;
	boolean_t hole;

	file_sz = zp->z_phys->zp_size;
	if (noff >= file_sz) {
		return (ENXIO);
	}

	if (cmd == _FIO_SEEK_HOLE)
		hole = B_TRUE;
	else
		hole = B_FALSE;

	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);

	/* end of file? */
	if ((error == ESRCH) || (noff > file_sz)) {
		/*
		 * Handle the virtual hole at the end of file.
		 */
		if (hole) {
			*off = file_sz;
			return (0);
		}
		return (ENXIO);
	}

	if (noff < *off)
		return (error);
	*off = noff;
	return (error);
}
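
/*
 * For illustration only: a minimal userland sketch (not part of this file)
 * of how an application might walk a sparse file's data regions using the
 * in/out ioctls above. The descriptor fd and the error handling around it
 * are assumptions; only _FIO_SEEK_DATA/_FIO_SEEK_HOLE come from this code.
 *
 *	offset_t off = 0;
 *	for (;;) {
 *		if (ioctl(fd, _FIO_SEEK_DATA, &off) != 0)
 *			break;			// ENXIO: no more data
 *		offset_t data = off;
 *		(void) ioctl(fd, _FIO_SEEK_HOLE, &off);	// virtual EOF hole
 *		(void) printf("data: [%lld, %lld)\n",
 *		    (long long)data, (long long)off);
 *	}
 */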

/* ARGSUSED */
static int
zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred,
    int *rvalp, caller_context_t *ct)
{
	offset_t off;
	int error;
	zfsvfs_t *zfsvfs;
	znode_t *zp;

	switch (com) {
	case _FIOFFS:
		return (zfs_sync(vp->v_vfsp, 0, cred));

	/*
	 * The following two ioctls are used by bfu. Faking them out
	 * is necessary to avoid bfu errors.
	 */
	case _FIOGDIO:
	case _FIOSDIO:
		return (0);

	case _FIO_SEEK_DATA:
	case _FIO_SEEK_HOLE:
		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
			return (EFAULT);

		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/* offset parameter is in/out */
		error = zfs_holey(vp, com, &off);
		ZFS_EXIT(zfsvfs);
		if (error)
			return (error);
		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
			return (EFAULT);
		return (0);
	}
	return (ENOTTY);
}

/*
 * Utility functions to map and unmap a single physical page. These
 * are used to manage the mappable copies of ZFS file data, and therefore
 * do not update ref/mod bits.
 */
caddr_t
zfs_map_page(page_t *pp, enum seg_rw rw)
{
	if (kpm_enable)
		return (hat_kpm_mapin(pp, 0));
	ASSERT(rw == S_READ || rw == S_WRITE);
	return (ppmapin(pp, PROT_READ | ((rw == S_WRITE) ? PROT_WRITE : 0),
	    (caddr_t)-1));
}

void
zfs_unmap_page(page_t *pp, caddr_t addr)
{
	if (kpm_enable) {
		hat_kpm_mapout(pp, 0, addr);
	} else {
		ppmapout(addr);
	}
}

/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages. What this means:
 *
 * On Write:	If we find a memory mapped page, we write to *both*
 *		the page and the dmu buffer.
 *
 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
 *	the file is memory mapped.
 */
static int
mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int64_t	start, off;
	int len = nbytes;
	int error = 0;

	start = uio->uio_loffset;
	off = start & PAGEOFFSET;
	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
		page_t *pp;
		uint64_t bytes = MIN(PAGESIZE - off, len);
		uint64_t woff = uio->uio_loffset;

		/*
		 * We don't want a new page to "appear" in the middle of
		 * the file update (because it may not get the write
		 * update data), so we grab a lock to block
		 * zfs_getpage().
		 */
		rw_enter(&zp->z_map_lock, RW_WRITER);
		if (pp = page_lookup(vp, start, SE_SHARED)) {
			caddr_t va;

			rw_exit(&zp->z_map_lock);
			va = zfs_map_page(pp, S_WRITE);
			error = uiomove(va+off, bytes, UIO_WRITE, uio);
			if (error == 0) {
				dmu_write(zfsvfs->z_os, zp->z_id,
				    woff, bytes, va+off, tx);
			}
			zfs_unmap_page(pp, va);
			page_unlock(pp);
		} else {
			error = dmu_write_uio(zfsvfs->z_os, zp->z_id,
			    uio, bytes, tx);
			rw_exit(&zp->z_map_lock);
		}
		len -= bytes;
		off = 0;
		if (error)
			break;
	}
	return (error);
}
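
/*
 * Worked example of the page splitting above (assuming PAGESIZE == 8192,
 * so PAGEOFFSET == 8191): a 20000-byte write at file offset 12000 gives
 * off = 12000 & PAGEOFFSET = 3808 and start = 12000 & PAGEMASK = 8192,
 * so the loop issues uiomoves of 4384 bytes (the rest of the first page),
 * then 8192 bytes, then 7424 bytes, with off reset to 0 after the first
 * iteration.
 */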

/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages. What this means:
 *
 * On Read:	We "read" preferentially from memory mapped pages,
 *		else we fall back to the dmu buffer.
 *
 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
 *	the file is memory mapped.
 */
static int
mappedread(vnode_t *vp, int nbytes, uio_t *uio)
{
	znode_t *zp = VTOZ(vp);
	objset_t *os = zp->z_zfsvfs->z_os;
	int64_t	start, off;
	int len = nbytes;
	int error = 0;

	start = uio->uio_loffset;
	off = start & PAGEOFFSET;
	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
		page_t *pp;
		uint64_t bytes = MIN(PAGESIZE - off, len);

		if (pp = page_lookup(vp, start, SE_SHARED)) {
			caddr_t va;

			va = zfs_map_page(pp, S_READ);
			error = uiomove(va + off, bytes, UIO_READ, uio);
			zfs_unmap_page(pp, va);
			page_unlock(pp);
		} else {
			error = dmu_read_uio(os, zp->z_id, uio, bytes);
		}
		len -= bytes;
		off = 0;
		if (error)
			break;
	}
	return (error);
}

offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */

/*
 * Read bytes from specified file into supplied buffer.
 *
 *	IN:	vp	- vnode of file to be read from.
 *		uio	- structure supplying read location, range info,
 *			  and return buffer.
 *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *
 *	OUT:	uio	- updated offset and range, buffer filled.
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Side Effects:
 *	vp - atime updated if byte count > 0
 */
/* ARGSUSED */
static int
zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	objset_t	*os;
	ssize_t		n, nbytes;
	int		error;
	rl_t		*rl;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);
	os = zfsvfs->z_os;

	if (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) {
		ZFS_EXIT(zfsvfs);
		return (EACCES);
	}

	/*
	 * Validate file offset
	 */
	if (uio->uio_loffset < (offset_t)0) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	/*
	 * Fasttrack empty reads
	 */
	if (uio->uio_resid == 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	/*
	 * Check for mandatory locks
	 */
	if (MANDMODE((mode_t)zp->z_phys->zp_mode)) {
		if (error = chklock(vp, FREAD,
		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	/*
	 * If we're in FRSYNC mode, sync out this znode before reading it.
	 */
	if (ioflag & FRSYNC)
		zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);

	/*
	 * Lock the range against changes.
	 */
	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);

	/*
	 * If we are reading past end-of-file we can skip
	 * to the end; but we might still need to set atime.
	 */
	if (uio->uio_loffset >= zp->z_phys->zp_size) {
		error = 0;
		goto out;
	}

	ASSERT(uio->uio_loffset < zp->z_phys->zp_size);
	n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset);

	while (n > 0) {
		nbytes = MIN(n, zfs_read_chunk_size -
		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));

		if (vn_has_cached_data(vp))
			error = mappedread(vp, nbytes, uio);
		else
			error = dmu_read_uio(os, zp->z_id, uio, nbytes);
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = EIO;
			break;
		}

		n -= nbytes;
	}

out:
	zfs_range_unlock(rl);

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Write the bytes to a file.
 *
 *	IN:	vp	- vnode of file to be written to.
 *		uio	- structure supplying write location, range info,
 *			  and data buffer.
 *		ioflag	- FAPPEND flag set if in append mode.
 *		cr	- credentials of caller.
 *		ct	- caller context (NFS/CIFS fem monitor only)
 *
 *	OUT:	uio	- updated offset and range.
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	vp - ctime|mtime updated if byte count > 0
 */
/* ARGSUSED */
static int
zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	rlim64_t	limit = uio->uio_llimit;
	ssize_t		start_resid = uio->uio_resid;
	ssize_t		tx_bytes;
	uint64_t	end_size;
	dmu_tx_t	*tx;
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	zilog_t		*zilog;
	offset_t	woff;
	ssize_t		n, nbytes;
	rl_t		*rl;
	int		max_blksz = zfsvfs->z_max_blksz;
	uint64_t	pflags;
	int		error;

	/*
	 * Fasttrack empty write
	 */
	n = start_resid;
	if (n == 0)
		return (0);

	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
		limit = MAXOFFSET_T;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/*
	 * If immutable or not appending then return EPERM
	 */
	pflags = zp->z_phys->zp_flags;
	if ((pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
	    ((pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
	    (uio->uio_loffset < zp->z_phys->zp_size))) {
		ZFS_EXIT(zfsvfs);
		return (EPERM);
	}

	zilog = zfsvfs->z_log;

	/*
	 * Pre-fault the pages to ensure slow (e.g., NFS) pages
	 * don't hold up txg.
	 */
	uio_prefaultpages(n, uio);

	/*
	 * If in append mode, set the io offset pointer to eof.
	 */
	if (ioflag & FAPPEND) {
		/*
		 * Range lock for a file append:
		 * The value for the start of range will be determined by
		 * zfs_range_lock() (to guarantee append semantics).
		 * If this write will cause the block size to increase,
		 * zfs_range_lock() will lock the entire file, so we must
		 * later reduce the range after we grow the block size.
		 */
		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
		if (rl->r_len == UINT64_MAX) {
			/* overlocked, zp_size can't change */
			woff = uio->uio_loffset = zp->z_phys->zp_size;
		} else {
			woff = uio->uio_loffset = rl->r_off;
		}
	} else {
		woff = uio->uio_loffset;
		/*
		 * Validate file offset
		 */
		if (woff < 0) {
			ZFS_EXIT(zfsvfs);
			return (EINVAL);
		}

		/*
		 * If we need to grow the block size then zfs_range_lock()
		 * will lock a wider range than we request here.
		 * Later after growing the block size we reduce the range.
		 */
		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
	}

	if (woff >= limit) {
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (EFBIG);
	}

	if ((woff + n) > limit || woff > (limit - n))
		n = limit - woff;

	/*
	 * Check for mandatory locks
	 */
	if (MANDMODE((mode_t)zp->z_phys->zp_mode) &&
	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	end_size = MAX(zp->z_phys->zp_size, woff + n);

	/*
	 * Write the file in reasonable size chunks. Each chunk is written
	 * in a separate transaction; this keeps the intent log records small
	 * and allows us to do more fine-grained space accounting.
	 */
	while (n > 0) {
		/*
		 * Start a transaction.
		 */
		woff = uio->uio_loffset;
		tx = dmu_tx_create(zfsvfs->z_os);
		dmu_tx_hold_bonus(tx, zp->z_id);
		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
		error = dmu_tx_assign(tx, TXG_NOWAIT);
		if (error) {
			if (error == ERESTART) {
				dmu_tx_wait(tx);
				dmu_tx_abort(tx);
				continue;
			}
			dmu_tx_abort(tx);
			break;
		}

		/*
		 * If zfs_range_lock() over-locked we grow the blocksize
		 * and then reduce the lock range. This will only happen
		 * on the first iteration since zfs_range_reduce() will
		 * shrink down r_len to the appropriate size.
		 */
		if (rl->r_len == UINT64_MAX) {
			uint64_t new_blksz;

			if (zp->z_blksz > max_blksz) {
				ASSERT(!ISP2(zp->z_blksz));
				new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
			} else {
				new_blksz = MIN(end_size, max_blksz);
			}
			zfs_grow_blocksize(zp, new_blksz, tx);
			zfs_range_reduce(rl, woff, n);
		}

		/*
		 * XXX - should we really limit each write to z_max_blksz?
		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
		 */
		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
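		/*
		 * Example (assuming max_blksz == 128K): a write starting
		 * at woff == 100K first gets a 28K chunk to reach the
		 * 128K block boundary, then full 128K chunks after that.
		 */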
		rw_enter(&zp->z_map_lock, RW_READER);

		tx_bytes = uio->uio_resid;
		if (vn_has_cached_data(vp)) {
			rw_exit(&zp->z_map_lock);
			error = mappedwrite(vp, nbytes, uio, tx);
		} else {
			error = dmu_write_uio(zfsvfs->z_os, zp->z_id,
			    uio, nbytes, tx);
			rw_exit(&zp->z_map_lock);
		}
		tx_bytes -= uio->uio_resid;

		/*
		 * If we made no progress, we're done. If we made even
		 * partial progress, update the znode and ZIL accordingly.
		 */
		if (tx_bytes == 0) {
			dmu_tx_commit(tx);
			ASSERT(error != 0);
			break;
		}

		/*
		 * Clear Set-UID/Set-GID bits on successful write if not
		 * privileged and at least one of the execute bits is set.
		 *
		 * It would be nice to do this after all writes have
		 * been done, but that would still expose the ISUID/ISGID
		 * to another app after the partial write is committed.
		 *
		 * Note: we don't call zfs_fuid_map_id() here because
		 * user 0 is not an ephemeral uid.
		 */
		mutex_enter(&zp->z_acl_lock);
		if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) |
		    (S_IXUSR >> 6))) != 0 &&
		    (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 &&
		    secpolicy_vnode_setid_retain(cr,
		    (zp->z_phys->zp_mode & S_ISUID) != 0 &&
		    zp->z_phys->zp_uid == 0) != 0) {
			zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID);
		}
		mutex_exit(&zp->z_acl_lock);

		/*
		 * Update time stamp. NOTE: This marks the bonus buffer as
		 * dirty, so we don't have to do it again for zp_size.
		 */
		zfs_time_stamper(zp, CONTENT_MODIFIED, tx);

		/*
		 * Update the file size (zp_size) if it has changed;
		 * account for possible concurrent updates.
		 */
		while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset)
			(void) atomic_cas_64(&zp->z_phys->zp_size, end_size,
			    uio->uio_loffset);
		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
		dmu_tx_commit(tx);

		if (error != 0)
			break;
		ASSERT(tx_bytes == nbytes);
		n -= nbytes;
	}

	zfs_range_unlock(rl);

	/*
	 * If we're in replay mode, or we made no progress, return error.
	 * Otherwise, it's at least a partial write, so it's successful.
	 */
	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (ioflag & (FSYNC | FDSYNC))
		zil_commit(zilog, zp->z_last_itx, zp->z_id);

	ZFS_EXIT(zfsvfs);
	return (0);
}

void
zfs_get_done(dmu_buf_t *db, void *vzgd)
{
	zgd_t *zgd = (zgd_t *)vzgd;
	rl_t *rl = zgd->zgd_rl;
	vnode_t *vp = ZTOV(rl->r_zp);

	dmu_buf_rele(db, vzgd);
	zfs_range_unlock(rl);
	VN_RELE(vp);
	zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
	kmem_free(zgd, sizeof (zgd_t));
}

/*
 * Get data to generate a TX_WRITE intent log record.
 */
int
zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
{
	zfsvfs_t *zfsvfs = arg;
	objset_t *os = zfsvfs->z_os;
	znode_t *zp;
	uint64_t off = lr->lr_offset;
	dmu_buf_t *db;
	rl_t *rl;
	zgd_t *zgd;
	int dlen = lr->lr_length;	/* length of user data */
	int error = 0;

	ASSERT(zio);
	ASSERT(dlen != 0);

	/*
	 * Nothing to do if the file has been removed
	 */
	if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0)
		return (ENOENT);
	if (zp->z_unlinked) {
		VN_RELE(ZTOV(zp));
		return (ENOENT);
	}

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
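	/*
	 * Illustrative only (the immediate/indirect cutoff is chosen by the
	 * ZIL logging code before we get here, and these sizes are
	 * assumptions): a few hundred bytes would typically arrive with
	 * buf != NULL and be copied into the record by the dmu_read()
	 * below, while a 128K write would arrive with buf == NULL and be
	 * pushed out by dmu_sync(), leaving only a block pointer in the
	 * record.
	 */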
	if (buf != NULL) {	/* immediate write */
		rl = zfs_range_lock(zp, off, dlen, RL_READER);
		/* test for truncation needs to be done while range locked */
		if (off >= zp->z_phys->zp_size) {
			error = ENOENT;
			goto out;
		}
		VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf));
	} else { /* indirect write */
		uint64_t boff; /* block starting offset */

		/*
		 * Have to lock the whole block to ensure that when it's
		 * written out and its checksum is being calculated
		 * no one can change the data. We need to re-check
		 * the blocksize after we get the lock in case it's changed!
		 */
		for (;;) {
			if (ISP2(zp->z_blksz)) {
				boff = P2ALIGN_TYPED(off, zp->z_blksz,
				    uint64_t);
			} else {
				boff = 0;
			}
			dlen = zp->z_blksz;
			rl = zfs_range_lock(zp, boff, dlen, RL_READER);
			if (zp->z_blksz == dlen)
				break;
			zfs_range_unlock(rl);
		}
		/* test for truncation needs to be done while range locked */
		if (off >= zp->z_phys->zp_size) {
			error = ENOENT;
			goto out;
		}
		zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
		zgd->zgd_rl = rl;
		zgd->zgd_zilog = zfsvfs->z_log;
		zgd->zgd_bp = &lr->lr_blkptr;
		VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db));
		ASSERT(boff == db->db_offset);
		lr->lr_blkoff = off - boff;
		error = dmu_sync(zio, db, &lr->lr_blkptr,
		    lr->lr_common.lrc_txg, zfs_get_done, zgd);
		ASSERT((error && error != EINPROGRESS) ||
		    lr->lr_length <= zp->z_blksz);
		if (error == 0)
			zil_add_block(zfsvfs->z_log, &lr->lr_blkptr);
		/*
		 * If we get EINPROGRESS, then we need to wait for a
		 * write IO initiated by dmu_sync() to complete before
		 * we can release this dbuf. We will finish everything
		 * up in the zfs_get_done() callback.
		 */
		if (error == EINPROGRESS)
			return (0);
		dmu_buf_rele(db, zgd);
		kmem_free(zgd, sizeof (zgd_t));
	}
out:
	zfs_range_unlock(rl);
	VN_RELE(ZTOV(zp));
	return (error);
}

/*ARGSUSED*/
static int
zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if (flag & V_ACE_MASK)
		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
	else
		error = zfs_zaccess_rwx(zp, mode, flag, cr);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Lookup an entry in a directory, or an extended attribute directory.
 * If it exists, return a held vnode reference for it.
 *
 *	IN:	dvp	- vnode of directory to search.
 *		nm	- name of entry to lookup.
 *		pnp	- full pathname to lookup [UNUSED].
 *		flags	- LOOKUP_XATTR set if looking for an attribute.
 *		rdir	- root directory vnode [UNUSED].
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		direntflags - directory lookup flags
 *		realpnp	- returned pathname.
 *
 *	OUT:	vpp	- vnode of located entry, NULL if not found.
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	NA
 */
/* ARGSUSED */
static int
zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
    int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
    int *direntflags, pathname_t *realpnp)
{
	znode_t *zdp = VTOZ(dvp);
	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
	int	error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zdp);

	*vpp = NULL;

	if (flags & LOOKUP_XATTR) {
		/*
		 * If the xattr property is off, refuse the lookup request.
		 */
		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
			ZFS_EXIT(zfsvfs);
			return (EINVAL);
		}

		/*
		 * We don't allow recursive attributes.
		 * Maybe someday we will.
		 */
		if (zdp->z_phys->zp_flags & ZFS_XATTR) {
			ZFS_EXIT(zfsvfs);
			return (EINVAL);
		}

		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * Do we have permission to get into attribute directory?
		 */

		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
		    B_FALSE, cr)) {
			VN_RELE(*vpp);
			*vpp = NULL;
		}

		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (dvp->v_type != VDIR) {
		ZFS_EXIT(zfsvfs);
		return (ENOTDIR);
	}

	/*
	 * Check accessibility of directory.
	 */

	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (EILSEQ);
	}

	error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
	if (error == 0) {
		/*
		 * Convert device special files
		 */
		if (IS_DEVVP(*vpp)) {
			vnode_t	*svp;

			svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
			VN_RELE(*vpp);
			if (svp == NULL)
				error = ENOSYS;
			else
				*vpp = svp;
		}
	}

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Attempt to create a new entry in a directory. If the entry
 * already exists, truncate the file if permissible, else return
 * an error. Return the vp of the created or trunc'd file.
 *
 *	IN:	dvp	- vnode of directory to put new file entry in.
 *		name	- name of new file entry.
 *		vap	- attributes of new file.
 *		excl	- flag indicating exclusive or non-exclusive mode.
 *		mode	- mode to open file with.
 *		cr	- credentials of caller.
 *		flag	- large file flag [UNUSED].
 *		ct	- caller context
 *		vsecp	- ACL to be set
 *
 *	OUT:	vpp	- vnode of created or trunc'd entry.
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	dvp - ctime|mtime updated if new entry created
 *	 vp - ctime|mtime always, atime if new
 */

/* ARGSUSED */
static int
zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl,
    int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct,
    vsecattr_t *vsecp)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	objset_t	*os;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	int		error;
	zfs_acl_t	*aclp = NULL;
	zfs_fuid_info_t	*fuidp = NULL;
	ksid_t		*ksid;
	uid_t		uid;
	gid_t		gid = crgetgid(cr);

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */

	ksid = crgetsid(cr, KSID_OWNER);
	if (ksid)
		uid = ksid_getid(ksid);
	else
		uid = crgetuid(cr);

	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || (vap->va_mask & AT_XVATTR) ||
	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (EINVAL);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	os = zfsvfs->z_os;
	zilog = zfsvfs->z_log;

	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (EILSEQ);
	}

	if (vap->va_mask & AT_XVATTR) {
		if ((error = secpolicy_xvattr((xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_type)) != 0) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}
top:
	*vpp = NULL;

	if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr))
		vap->va_mode &= ~VSVTX;

	if (*name == '\0') {
		/*
		 * Null component name refers to the directory itself.
		 */
		VN_HOLD(dvp);
		zp = dzp;
		dl = NULL;
		error = 0;
	} else {
		/* possible VN_HOLD(zp) */
		int zflg = 0;

		if (flag & FIGNORECASE)
			zflg |= ZCILOOK;

		error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
		    NULL, NULL);
		if (error) {
			if (strcmp(name, "..") == 0)
				error = EISDIR;
			ZFS_EXIT(zfsvfs);
			if (aclp)
				zfs_acl_free(aclp);
			return (error);
		}
	}
	if (vsecp && aclp == NULL) {
		error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp);
		if (error) {
			ZFS_EXIT(zfsvfs);
			if (dl)
				zfs_dirent_unlock(dl);
			return (error);
		}
	}

	if (zp == NULL) {
		uint64_t txtype;

		/*
		 * Create a new file object and update the directory
		 * to reference it.
		 */
		if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
			goto out;
		}

		/*
		 * We only support the creation of regular files in
		 * extended attribute directories.
		 */
		if ((dzp->z_phys->zp_flags & ZFS_XATTR) &&
		    (vap->va_type != VREG)) {
			error = EINVAL;
			goto out;
		}

		tx = dmu_tx_create(os);
		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
		if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(uid) ||
		    IS_EPHEMERAL(gid)) {
			if (zfsvfs->z_fuid_obj == 0) {
				dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
				dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
				    FUID_SIZE_ESTIMATE(zfsvfs));
				dmu_tx_hold_zap(tx, MASTER_NODE_OBJ,
				    FALSE, NULL);
			} else {
				dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
				dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
				    FUID_SIZE_ESTIMATE(zfsvfs));
			}
		}
		dmu_tx_hold_bonus(tx, dzp->z_id);
		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
		if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp) {
			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
			    0, SPA_MAXBLOCKSIZE);
		}
		error = dmu_tx_assign(tx, TXG_NOWAIT);
		if (error) {
			zfs_dirent_unlock(dl);
			if (error == ERESTART) {
				dmu_tx_wait(tx);
				dmu_tx_abort(tx);
				goto top;
			}
			dmu_tx_abort(tx);
			ZFS_EXIT(zfsvfs);
			if (aclp)
				zfs_acl_free(aclp);
			return (error);
		}
		zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp);
		(void) zfs_link_create(dl, zp, tx, ZNEW);
		txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
		if (flag & FIGNORECASE)
			txtype |= TX_CI;
		zfs_log_create(zilog, tx, txtype, dzp, zp, name,
		    vsecp, fuidp, vap);
		if (fuidp)
			zfs_fuid_info_free(fuidp);
		dmu_tx_commit(tx);
	} else {
		int aflags = (flag & FAPPEND) ? V_APPEND : 0;

		/*
		 * A directory entry already exists for this name.
		 */
		/*
		 * Can't truncate an existing file if in exclusive mode.
		 */
		if (excl == EXCL) {
			error = EEXIST;
			goto out;
		}
		/*
		 * Can't open a directory for writing.
		 */
		if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
			error = EISDIR;
			goto out;
		}
		/*
		 * Verify requested access to file.
		 */
		if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
			goto out;
		}

		mutex_enter(&dzp->z_lock);
		dzp->z_seq++;
		mutex_exit(&dzp->z_lock);

		/*
		 * Truncate regular files if requested.
		 */
		if ((ZTOV(zp)->v_type == VREG) &&
		    (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
			/* we can't hold any locks when calling zfs_freesp() */
			zfs_dirent_unlock(dl);
			dl = NULL;
			error = zfs_freesp(zp, 0, 0, mode, TRUE);
			if (error == 0) {
				vnevent_create(ZTOV(zp), ct);
			}
		}
	}
out:

	if (dl)
		zfs_dirent_unlock(dl);

	if (error) {
		if (zp)
			VN_RELE(ZTOV(zp));
	} else {
		*vpp = ZTOV(zp);
		/*
		 * If vnode is for a device return a specfs vnode instead.
		 */
		if (IS_DEVVP(*vpp)) {
			struct vnode *svp;

			svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
			VN_RELE(*vpp);
			if (svp == NULL) {
				error = ENOSYS;
			}
			*vpp = svp;
		}
	}
	if (aclp)
		zfs_acl_free(aclp);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Remove an entry from a directory.
 *
 *	IN:	dvp	- vnode of directory to remove entry from.
 *		name	- name of entry to remove.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		flags	- case flags
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	dvp - ctime|mtime
 *	 vp - ctime (if nlink > 0)
 */
/*ARGSUSED*/
static int
zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
    int flags)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	znode_t		*xzp = NULL;
	vnode_t		*vp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	uint64_t	acl_obj, xattr_obj;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	boolean_t	may_delete_now, delete_now = FALSE;
	boolean_t	unlinked, toobig = FALSE;
	uint64_t	txtype;
	pathname_t	*realnmp = NULL;
	pathname_t	realnm;
	int		error;
	int		zflg = ZEXISTS;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	if (flags & FIGNORECASE) {
		zflg |= ZCILOOK;
		pn_alloc(&realnm);
		realnmp = &realnm;
	}

top:
	/*
	 * Attempt to lock directory; fail if entry doesn't exist.
	 */
	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
	    NULL, realnmp)) {
		if (realnmp)
			pn_free(realnmp);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	vp = ZTOV(zp);

	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
		goto out;
	}

	/*
	 * Need to use rmdir for removing directories.
	 */
	if (vp->v_type == VDIR) {
		error = EPERM;
		goto out;
	}

	vnevent_remove(vp, dvp, name, ct);

	if (realnmp)
		dnlc_remove(dvp, realnmp->pn_buf);
	else
		dnlc_remove(dvp, name);

	mutex_enter(&vp->v_lock);
	may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp);
	mutex_exit(&vp->v_lock);

	/*
	 * We may delete the znode now, or we may put it in the unlinked set;
	 * it depends on whether we're the last link, and on whether there are
	 * other holds on the vnode. So we dmu_tx_hold() the right things to
	 * allow for either case.
	 */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_bonus(tx, zp->z_id);
	if (may_delete_now) {
		toobig =
		    zp->z_phys->zp_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
		/* if the file is too big, only hold_free a token amount */
		dmu_tx_hold_free(tx, zp->z_id, 0,
		    (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
	}

	/* are there any extended attributes? */
	if ((xattr_obj = zp->z_phys->zp_xattr) != 0) {
		/* XXX - do we need this if we are deleting? */
		dmu_tx_hold_bonus(tx, xattr_obj);
	}

	/* are there any additional acls */
	if ((acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj) != 0 &&
	    may_delete_now)
		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);

	/* charge as an update -- would be nice not to charge at all */
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);

	error = dmu_tx_assign(tx, TXG_NOWAIT);
	if (error) {
		zfs_dirent_unlock(dl);
		VN_RELE(vp);
		if (error == ERESTART) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		if (realnmp)
			pn_free(realnmp);
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Remove the directory entry.
	 */
	error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);

	if (error) {
		dmu_tx_commit(tx);
		goto out;
	}

	if (unlinked) {
		mutex_enter(&vp->v_lock);
		delete_now = may_delete_now && !toobig &&
		    vp->v_count == 1 && !vn_has_cached_data(vp) &&
		    zp->z_phys->zp_xattr == xattr_obj &&
		    zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj;
		mutex_exit(&vp->v_lock);
	}

	if (delete_now) {
		if (zp->z_phys->zp_xattr) {
			error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
			ASSERT3U(error, ==, 0);
			ASSERT3U(xzp->z_phys->zp_links, ==, 2);
			dmu_buf_will_dirty(xzp->z_dbuf, tx);
			mutex_enter(&xzp->z_lock);
			xzp->z_unlinked = 1;
			xzp->z_phys->zp_links = 0;
			mutex_exit(&xzp->z_lock);
			zfs_unlinked_add(xzp, tx);
			zp->z_phys->zp_xattr = 0; /* probably unnecessary */
		}
		mutex_enter(&zp->z_lock);
		mutex_enter(&vp->v_lock);
		vp->v_count--;
		ASSERT3U(vp->v_count, ==, 0);
		mutex_exit(&vp->v_lock);
		mutex_exit(&zp->z_lock);
		zfs_znode_delete(zp, tx);
	} else if (unlinked) {
		zfs_unlinked_add(zp, tx);
	}

	txtype = TX_REMOVE;
	if (flags & FIGNORECASE)
		txtype |= TX_CI;
	zfs_log_remove(zilog, tx, txtype, dzp, name);

	dmu_tx_commit(tx);
out:
	if (realnmp)
		pn_free(realnmp);

	zfs_dirent_unlock(dl);

	if (!delete_now) {
		VN_RELE(vp);
	} else if (xzp) {
		/* this rele is delayed to prevent nesting transactions */
		VN_RELE(ZTOV(xzp));
	}

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Create a new directory and insert it into dvp using the name
 * provided. Return a pointer to the inserted directory.
 *
 *	IN:	dvp	- vnode of directory to add subdir to.
 *		dirname	- name of new directory.
 *		vap	- attributes of new directory.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		vsecp	- ACL to be set
 *
 *	OUT:	vpp	- vnode of created directory.
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	dvp - ctime|mtime updated
 *	 vp - ctime|mtime|atime updated
 */
/*ARGSUSED*/
static int
zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
    caller_context_t *ct, int flags, vsecattr_t *vsecp)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	zfs_dirlock_t	*dl;
	uint64_t	txtype;
	dmu_tx_t	*tx;
	int		error;
	zfs_acl_t	*aclp = NULL;
	zfs_fuid_info_t	*fuidp = NULL;
	int		zf = ZNEW;
	ksid_t		*ksid;
	uid_t		uid;
	gid_t		gid = crgetgid(cr);

	ASSERT(vap->va_type == VDIR);

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */

	ksid = crgetsid(cr, KSID_OWNER);
	if (ksid)
		uid = ksid_getid(ksid);
	else
		uid = crgetuid(cr);
	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || (vap->va_mask & AT_XVATTR) ||
	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (EINVAL);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	if (dzp->z_phys->zp_flags & ZFS_XATTR) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	if (zfsvfs->z_utf8 && u8_validate(dirname,
	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (EILSEQ);
	}
	if (flags & FIGNORECASE)
		zf |= ZCILOOK;

	if (vap->va_mask & AT_XVATTR)
		if ((error = secpolicy_xvattr((xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_type)) != 0) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

	/*
	 * First make sure the new directory doesn't exist.
	 */
top:
	*vpp = NULL;

	if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
	    NULL, NULL)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
		zfs_dirent_unlock(dl);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (vsecp && aclp == NULL) {
		error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp);
		if (error) {
			zfs_dirent_unlock(dl);
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}
	/*
	 * Add a new entry to the directory.
	 */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
	if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(uid) ||
	    IS_EPHEMERAL(gid)) {
		if (zfsvfs->z_fuid_obj == 0) {
			dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
			dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
			    FUID_SIZE_ESTIMATE(zfsvfs));
			dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
		} else {
			dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
			dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
			    FUID_SIZE_ESTIMATE(zfsvfs));
		}
	}
	if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp)
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
		    0, SPA_MAXBLOCKSIZE);
	error = dmu_tx_assign(tx, TXG_NOWAIT);
	if (error) {
		zfs_dirent_unlock(dl);
		if (error == ERESTART) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		if (aclp)
			zfs_acl_free(aclp);
		return (error);
	}

	/*
	 * Create new node.
	 */
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp);

	if (aclp)
		zfs_acl_free(aclp);

	/*
	 * Now put new name in parent dir.
	 */
	(void) zfs_link_create(dl, zp, tx, ZNEW);

	*vpp = ZTOV(zp);

	txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
	if (flags & FIGNORECASE)
		txtype |= TX_CI;
	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, fuidp, vap);

	if (fuidp)
		zfs_fuid_info_free(fuidp);
	dmu_tx_commit(tx);

	zfs_dirent_unlock(dl);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Remove a directory subdir entry. If the current working
 * directory is the same as the subdir to be removed, the
 * remove will fail.
 *
 *	IN:	dvp	- vnode of directory to remove from.
 *		name	- name of directory to be removed.
 *		cwd	- vnode of current working directory.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		flags	- case flags
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	dvp - ctime|mtime updated
 */
/*ARGSUSED*/
static int
zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
    caller_context_t *ct, int flags)
{
	znode_t		*dzp = VTOZ(dvp);
	znode_t		*zp;
	vnode_t		*vp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	int		error;
	int		zflg = ZEXISTS;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	if (flags & FIGNORECASE)
		zflg |= ZCILOOK;
top:
	zp = NULL;

	/*
	 * Attempt to lock directory; fail if entry doesn't exist.
	 */
	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
	    NULL, NULL)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	vp = ZTOV(zp);

	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
		goto out;
	}

	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

	if (vp == cwd) {
		error = EINVAL;
		goto out;
	}

	vnevent_rmdir(vp, dvp, name, ct);

	/*
	 * Grab a lock on the directory to make sure that no one is
	 * trying to add (or lookup) entries while we are removing it.
	 */
	rw_enter(&zp->z_name_lock, RW_WRITER);

	/*
	 * Grab a lock on the parent pointer to make sure we play well
	 * with the treewalk and directory rename code.
	 */
	rw_enter(&zp->z_parent_lock, RW_WRITER);

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_bonus(tx, zp->z_id);
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	error = dmu_tx_assign(tx, TXG_NOWAIT);
	if (error) {
		rw_exit(&zp->z_parent_lock);
		rw_exit(&zp->z_name_lock);
		zfs_dirent_unlock(dl);
		VN_RELE(vp);
		if (error == ERESTART) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	error = zfs_link_destroy(dl, zp, tx, zflg, NULL);

	if (error == 0) {
		uint64_t txtype = TX_RMDIR;
		if (flags & FIGNORECASE)
			txtype |= TX_CI;
		zfs_log_remove(zilog, tx, txtype, dzp, name);
	}

	dmu_tx_commit(tx);

	rw_exit(&zp->z_parent_lock);
	rw_exit(&zp->z_name_lock);
out:
	zfs_dirent_unlock(dl);

	VN_RELE(vp);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Read as many directory entries as will fit into the provided
 * buffer from the given directory cursor position (specified in
 * the uio structure).
 *
 *	IN:	vp	- vnode of directory to read.
 *		uio	- structure supplying read location, range info,
 *			  and return buffer.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		flags	- case flags
 *
 *	OUT:	uio	- updated offset and range, buffer filled.
 *		eofp	- set to true if end-of-file detected.
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	vp - atime updated
 *
 * Note that the low 4 bits of the cookie returned by zap are always zero.
 * This allows us to use the low range for "special" directory entries:
 * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem,
 * we use the offset 2 for the '.zfs' directory.
 */
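/*
 * For example, a sequence of readdir calls on the root directory starting
 * at offset 0 first synthesizes "." (cookie 0), ".." (cookie 1), and
 * ".zfs" (cookie 2), then switches to serialized ZAP cursors, whose low
 * 4 bits are zero and therefore never collide with the reserved offsets.
 */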
/* ARGSUSED */
static int
zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp,
    caller_context_t *ct, int flags)
{
	znode_t		*zp = VTOZ(vp);
	iovec_t		*iovp;
	edirent_t	*eodp;
	dirent64_t	*odp;
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	objset_t	*os;
	caddr_t		outbuf;
	size_t		bufsize;
	zap_cursor_t	zc;
	zap_attribute_t	zap;
	uint_t		bytes_wanted;
	uint64_t	offset; /* must be unsigned; checks for < 1 */
	int		local_eof;
	int		outcount;
	int		error;
	uint8_t		prefetch;
	boolean_t	check_sysattrs;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/*
	 * If we are not given an eof variable,
	 * use a local one.
	 */
	if (eofp == NULL)
		eofp = &local_eof;

	/*
	 * Check for valid iov_len.
	 */
	if (uio->uio_iov->iov_len <= 0) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	/*
	 * Quit if directory has been removed (posix)
	 */
	if ((*eofp = zp->z_unlinked) != 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	error = 0;
	os = zfsvfs->z_os;
	offset = uio->uio_loffset;
	prefetch = zp->z_zn_prefetch;

	/*
	 * Initialize the iterator cursor.
	 */
	if (offset <= 3) {
		/*
		 * Start iteration from the beginning of the directory.
		 */
		zap_cursor_init(&zc, os, zp->z_id);
	} else {
		/*
		 * The offset is a serialized cursor.
		 */
		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
	}

	/*
	 * Get space to change directory entries into fs independent format.
	 */
	iovp = uio->uio_iov;
	bytes_wanted = iovp->iov_len;
	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
		bufsize = bytes_wanted;
		outbuf = kmem_alloc(bufsize, KM_SLEEP);
		odp = (struct dirent64 *)outbuf;
	} else {
		bufsize = bytes_wanted;
		odp = (struct dirent64 *)iovp->iov_base;
	}
	eodp = (struct edirent *)odp;

	/*
	 * If this VFS supports the system attribute view interface; and
	 * we're looking at an extended attribute directory; and we care
	 * about normalization conflicts on this vfs; then we must check
	 * for normalization conflicts with the sysattr name space.
	 */
	check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
	    (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
	    (flags & V_RDDIR_ENTFLAGS);

	/*
	 * Transform to file-system independent format
	 */
	outcount = 0;
	while (outcount < bytes_wanted) {
		ino64_t objnum;
		ushort_t reclen;
		off64_t *next;

		/*
		 * Special case `.', `..', and `.zfs'.
		 */
		if (offset == 0) {
			(void) strcpy(zap.za_name, ".");
			zap.za_normalization_conflict = 0;
			objnum = zp->z_id;
		} else if (offset == 1) {
			(void) strcpy(zap.za_name, "..");
			zap.za_normalization_conflict = 0;
			objnum = zp->z_phys->zp_parent;
		} else if (offset == 2 && zfs_show_ctldir(zp)) {
			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
			zap.za_normalization_conflict = 0;
			objnum = ZFSCTL_INO_ROOT;
		} else {
			/*
			 * Grab next entry.
			 */
			if (error = zap_cursor_retrieve(&zc, &zap)) {
				if ((*eofp = (error == ENOENT)) != 0)
					break;
				else
					goto update;
			}

			if (zap.za_integer_length != 8 ||
			    zap.za_num_integers != 1) {
				cmn_err(CE_WARN, "zap_readdir: bad directory "
				    "entry, obj = %lld, offset = %lld\n",
				    (u_longlong_t)zp->z_id,
				    (u_longlong_t)offset);
				error = ENXIO;
				goto update;
			}

			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
			/*
			 * Mac OS X can extract the object type here, e.g.:
1996 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
1997 */
1998
1999 if (check_sysattrs && !zap.za_normalization_conflict) {
2000 zap.za_normalization_conflict =
2001 xattr_sysattr_casechk(zap.za_name);
2002 }
2003 }
2004
2005 if (flags & V_RDDIR_ENTFLAGS)
2006 reclen = EDIRENT_RECLEN(strlen(zap.za_name));
2007 else
2008 reclen = DIRENT64_RECLEN(strlen(zap.za_name));
2009
2010 /*
2011 * Will this entry fit in the buffer?
2012 */
2013 if (outcount + reclen > bufsize) {
2014 /*
2015 * Did we manage to fit anything in the buffer?
2016 */
2017 if (!outcount) {
2018 error = EINVAL;
2019 goto update;
2020 }
2021 break;
2022 }
2023 if (flags & V_RDDIR_ENTFLAGS) {
2024 /*
2025 * Add extended flag entry:
2026 */
2027 eodp->ed_ino = objnum;
2028 eodp->ed_reclen = reclen;
2029 /* NOTE: ed_off is the offset for the *next* entry */
2030 next = &(eodp->ed_off);
2031 eodp->ed_eflags = zap.za_normalization_conflict ?
2032 ED_CASE_CONFLICT : 0;
2033 (void) strncpy(eodp->ed_name, zap.za_name,
2034 EDIRENT_NAMELEN(reclen));
2035 eodp = (edirent_t *)((intptr_t)eodp + reclen);
2036 } else {
2037 /*
2038 * Add normal entry:
2039 */
2040 odp->d_ino = objnum;
2041 odp->d_reclen = reclen;
2042 /* NOTE: d_off is the offset for the *next* entry */
2043 next = &(odp->d_off);
2044 (void) strncpy(odp->d_name, zap.za_name,
2045 DIRENT64_NAMELEN(reclen));
2046 odp = (dirent64_t *)((intptr_t)odp + reclen);
2047 }
2048 outcount += reclen;
2049
2050 ASSERT(outcount <= bufsize);
2051
2052 /* Prefetch znode */
2053 if (prefetch)
2054 dmu_prefetch(os, objnum, 0, 0);
2055
2056 /*
2057 * Move to the next entry, fill in the previous offset.
2058 */
2059 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
2060 zap_cursor_advance(&zc);
2061 offset = zap_cursor_serialize(&zc);
2062 } else {
2063 offset += 1;
2064 }
2065 *next = offset;
2066 }
2067 zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
2068
2069 if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
2070 iovp->iov_base += outcount;
2071 iovp->iov_len -= outcount;
2072 uio->uio_resid -= outcount;
2073 } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
2074 /*
2075 * Reset the pointer.
2076 */
2077 offset = uio->uio_loffset;
2078 }
2079
2080 update:
2081 zap_cursor_fini(&zc);
2082 if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
2083 kmem_free(outbuf, bufsize);
2084
2085 if (error == ENOENT)
2086 error = 0;
2087
2088 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
2089
2090 uio->uio_loffset = offset;
2091 ZFS_EXIT(zfsvfs);
2092 return (error);
2093 }
2094
2095 ulong_t zfs_fsync_sync_cnt = 4;
2096
2097 static int
2098 zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
2099 {
2100 znode_t *zp = VTOZ(vp);
2101 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2102
2103 /*
2104 * Regardless of whether this is required for standards conformance,
2105 * this is the logical behavior when fsync() is called on a file with
2106 * dirty pages. We use B_ASYNC since the ZIL transactions are already
2107 * going to be pushed out as part of the zil_commit().
2108 */
2109 if (vn_has_cached_data(vp) && !(syncflag & FNODSYNC) &&
2110 (vp->v_type == VREG) && !(IS_SWAPVP(vp)))
2111 (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, B_ASYNC, cr, ct);
2112
2113 (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
2114
2115 ZFS_ENTER(zfsvfs);
2116 ZFS_VERIFY_ZP(zp);
2117 zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
2118 ZFS_EXIT(zfsvfs);
2119 return (0);
2120 }
2121
2122
2123 /*
2124 * Get the requested file attributes and place them in the provided
2125 * vattr structure.
2126 *
2127 * IN: vp - vnode of file.
2128 * vap - va_mask identifies requested attributes.
2129 * If AT_XVATTR set, then optional attrs are requested
2130 * flags - ATTR_NOACLCHECK (CIFS server context)
2131 * cr - credentials of caller.
2132 * ct - caller context
2133 *
2134 * OUT: vap - attribute values.
2135 *
2136 * RETURN: 0 on success; error code if the attribute access check fails.
2137 */
2138 /* ARGSUSED */
2139 static int
2140 zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2141 caller_context_t *ct)
2142 {
2143 znode_t *zp = VTOZ(vp);
2144 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2145 znode_phys_t *pzp;
2146 int error = 0;
2147 uint64_t links;
2148 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
2149 xoptattr_t *xoap = NULL;
2150 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2151
2152 ZFS_ENTER(zfsvfs);
2153 ZFS_VERIFY_ZP(zp);
2154 pzp = zp->z_phys;
2155
2156 mutex_enter(&zp->z_lock);
2157
2158 /*
2159 * If the ACL is trivial, don't bother looking for ACE_READ_ATTRIBUTES.
2160 * Also, if we are the owner, don't bother, since the owner is
2161 * always allowed to read the basic attributes of the file.
2162 */
2163 if (!(pzp->zp_flags & ZFS_ACL_TRIVIAL) &&
2164 (pzp->zp_uid != crgetuid(cr))) {
2165 if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
2166 skipaclchk, cr)) {
2167 mutex_exit(&zp->z_lock);
2168 ZFS_EXIT(zfsvfs);
2169 return (error);
2170 }
2171 }
2172
2173 /*
2174 * Return all attributes. It's cheaper to provide the answer
2175 * than to determine whether we were asked the question.
2176 */
2177
2178 vap->va_type = vp->v_type;
2179 vap->va_mode = pzp->zp_mode & MODEMASK;
2180 zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
2181 vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
2182 vap->va_nodeid = zp->z_id;
2183 if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
2184 links = pzp->zp_links + 1;
2185 else
2186 links = pzp->zp_links;
2187 vap->va_nlink = MIN(links, UINT32_MAX); /* nlink_t limit! */
2188 vap->va_size = pzp->zp_size;
2189 vap->va_rdev = vp->v_rdev;
2190 vap->va_seq = zp->z_seq;
2191
2192 /*
2193 * Add in any requested optional attributes and the create time.
2194 * Also set the corresponding bits in the returned attribute bitmap.
2195 */
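/*
 * Editorial sketch of the caller side (hypothetical code, not from
 * this file): a consumer requests an optional attribute by building
 * an xvattr_t and then checks the returned bitmap, e.g.:
 *
 *	xvattr_t xva;
 *
 *	xva_init(&xva);
 *	XVA_SET_REQ(&xva, XAT_READONLY);
 *	if (VOP_GETATTR(vp, &xva.xva_vattr, 0, cr, ct) == 0 &&
 *	    XVA_ISSET_RTN(&xva, XAT_READONLY)) {
 *		xoptattr_t *xoa = xva_getxoptattr(&xva);
 *		... xoa->xoa_readonly is now valid ...
 *	}
 */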
2196 if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
2197 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
2198 xoap->xoa_archive =
2199 ((pzp->zp_flags & ZFS_ARCHIVE) != 0);
2200 XVA_SET_RTN(xvap, XAT_ARCHIVE);
2201 }
2202
2203 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
2204 xoap->xoa_readonly =
2205 ((pzp->zp_flags & ZFS_READONLY) != 0);
2206 XVA_SET_RTN(xvap, XAT_READONLY);
2207 }
2208
2209 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
2210 xoap->xoa_system =
2211 ((pzp->zp_flags & ZFS_SYSTEM) != 0);
2212 XVA_SET_RTN(xvap, XAT_SYSTEM);
2213 }
2214
2215 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
2216 xoap->xoa_hidden =
2217 ((pzp->zp_flags & ZFS_HIDDEN) != 0);
2218 XVA_SET_RTN(xvap, XAT_HIDDEN);
2219 }
2220
2221 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2222 xoap->xoa_nounlink =
2223 ((pzp->zp_flags & ZFS_NOUNLINK) != 0);
2224 XVA_SET_RTN(xvap, XAT_NOUNLINK);
2225 }
2226
2227 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2228 xoap->xoa_immutable =
2229 ((pzp->zp_flags & ZFS_IMMUTABLE) != 0);
2230 XVA_SET_RTN(xvap, XAT_IMMUTABLE);
2231 }
2232
2233 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2234 xoap->xoa_appendonly =
2235 ((pzp->zp_flags & ZFS_APPENDONLY) != 0);
2236 XVA_SET_RTN(xvap, XAT_APPENDONLY);
2237 }
2238
2239 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2240 xoap->xoa_nodump =
2241 ((pzp->zp_flags & ZFS_NODUMP) != 0);
2242 XVA_SET_RTN(xvap, XAT_NODUMP);
2243 }
2244
2245 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
2246 xoap->xoa_opaque =
2247 ((pzp->zp_flags & ZFS_OPAQUE) != 0);
2248 XVA_SET_RTN(xvap, XAT_OPAQUE);
2249 }
2250
2251 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2252 xoap->xoa_av_quarantined =
2253 ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0);
2254 XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
2255 }
2256
2257 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2258 xoap->xoa_av_modified =
2259 ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0);
2260 XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
2261 }
2262
2263 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
2264 vp->v_type == VREG &&
2265 (pzp->zp_flags & ZFS_BONUS_SCANSTAMP)) {
2266 size_t len;
2267 dmu_object_info_t doi;
2268
2269 /*
2270 * Only VREG files have anti-virus scanstamps, so we
2271 * won't conflict with symlinks in the bonus buffer.
2272 */
2273 dmu_object_info_from_db(zp->z_dbuf, &doi);
2274 len = sizeof (xoap->xoa_av_scanstamp) +
2275 sizeof (znode_phys_t);
2276 if (len <= doi.doi_bonus_size) {
2277 /*
2278 * pzp points to the start of the
2279 * znode_phys_t. pzp + 1 points to the
2280 * first byte after the znode_phys_t.
2281 */
2282 (void) memcpy(xoap->xoa_av_scanstamp,
2283 pzp + 1,
2284 sizeof (xoap->xoa_av_scanstamp));
2285 XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
2286 }
2287 }
2288
2289 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
2290 ZFS_TIME_DECODE(&xoap->xoa_createtime, pzp->zp_crtime);
2291 XVA_SET_RTN(xvap, XAT_CREATETIME);
2292 }
2293 }
2294
2295 ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime);
2296 ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime);
2297 ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime);
2298
2299 mutex_exit(&zp->z_lock);
2300
2301 dmu_object_size_from_db(zp->z_dbuf, &vap->va_blksize, &vap->va_nblocks);
2302
2303 if (zp->z_blksz == 0) {
2304 /*
2305 * Block size hasn't been set; suggest maximal I/O transfers.
2306 */
2307 vap->va_blksize = zfsvfs->z_max_blksz;
2308 }
2309
2310 ZFS_EXIT(zfsvfs);
2311 return (0);
2312 }
2313
2314 /*
2315 * Set the file attributes to the values contained in the
2316 * vattr structure.
2317 *
2318 * IN: vp - vnode of file to be modified.
2319 * vap - new attribute values.
2320 * If AT_XVATTR set, then optional attrs are being set
2321 * flags - ATTR_UTIME set if non-default time values provided.
2322 * - ATTR_NOACLCHECK (CIFS context only).
2323 * cr - credentials of caller.
2324 * ct - caller context
2325 *
2326 * RETURN: 0 if success
2327 * error code if failure
2328 *
2329 * Timestamps:
2330 * vp - ctime updated, mtime updated if size changed.
2331 */
2332 /* ARGSUSED */
2333 static int
2334 zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2335 caller_context_t *ct)
2336 {
2337 znode_t *zp = VTOZ(vp);
2338 znode_phys_t *pzp;
2339 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2340 zilog_t *zilog;
2341 dmu_tx_t *tx;
2342 vattr_t oldva;
2343 xvattr_t tmpxvattr;
2344 uint_t mask = vap->va_mask;
2345 uint_t saved_mask;
2346 int trim_mask = 0;
2347 uint64_t new_mode;
2348 znode_t *attrzp;
2349 int need_policy = FALSE;
2350 int err;
2351 zfs_fuid_info_t *fuidp = NULL;
2352 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
2353 xoptattr_t *xoap;
2354 zfs_acl_t *aclp = NULL;
2355 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2356
2357 if (mask == 0)
2358 return (0);
2359
2360 if (mask & AT_NOSET)
2361 return (EINVAL);
2362
2363 ZFS_ENTER(zfsvfs);
2364 ZFS_VERIFY_ZP(zp);
2365
2366 pzp = zp->z_phys;
2367 zilog = zfsvfs->z_log;
2368
2369 /*
2370 * Make sure that if an ephemeral uid/gid or an xvattr is specified,
2371 * the file system is at the proper version level.
2372 */
2373
2374 if (zfsvfs->z_use_fuids == B_FALSE &&
2375 (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2376 ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
2377 (mask & AT_XVATTR))) {
2378 ZFS_EXIT(zfsvfs);
2379 return (EINVAL);
2380 }
2381
2382 if (mask & AT_SIZE && vp->v_type == VDIR) {
2383 ZFS_EXIT(zfsvfs);
2384 return (EISDIR);
2385 }
2386
2387 if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
2388 ZFS_EXIT(zfsvfs);
2389 return (EINVAL);
2390 }
2391
2392 /*
2393 * If this is an xvattr_t, then get a pointer to the structure of
2394 * optional attributes. If this is NULL, then we have a vattr_t.
2395 */
2396 xoap = xva_getxoptattr(xvap);
2397
2398 xva_init(&tmpxvattr);
2399
2400 /*
2401 * On immutable files, only the immutable bit and atime may be altered.
2402 */
2403 if ((pzp->zp_flags & ZFS_IMMUTABLE) &&
2404 ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
2405 ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
2406 ZFS_EXIT(zfsvfs);
2407 return (EPERM);
2408 }
2409
2410 if ((mask & AT_SIZE) && (pzp->zp_flags & ZFS_READONLY)) {
2411 ZFS_EXIT(zfsvfs);
2412 return (EPERM);
2413 }
2414
2415 /*
2416 * Verify the timestamps don't overflow 32 bits. ZFS can handle
2417 * large timestamps, but 32-bit syscalls can't handle times beyond
2418 * 2038 (the signed 32-bit time_t limit). This check should be
2419 * removed once large timestamps are fully supported.
2420 */
2421 if (mask & (AT_ATIME | AT_MTIME)) {
2422 if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2423 ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2424 ZFS_EXIT(zfsvfs);
2425 return (EOVERFLOW);
2426 }
2427 }
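/*
 * Editorial note: a signed 32-bit time_t tops out at
 * 2^31 - 1 = 2147483647 seconds past the epoch (early 2038),
 * which is the limit TIMESPEC_OVERFLOW() guards against above.
 */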
2428
2429 top:
2430 attrzp = NULL;
2431
2432 if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
2433 ZFS_EXIT(zfsvfs);
2434 return (EROFS);
2435 }
2436
2437 /*
2438 * First validate permissions
2439 */
2440
2441 if (mask & AT_SIZE) {
2442 err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr);
2443 if (err) {
2444 ZFS_EXIT(zfsvfs);
2445 return (err);
2446 }
2447 /*
2448 * XXX - Note, we are not providing any open
2449 * mode flags here (like FNDELAY), so we may
2450 * block if there are locks present... this
2451 * should be addressed in openat().
2452 */
2453 /* XXX - would it be OK to generate a log record here? */
2454 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2455 if (err) {
2456 ZFS_EXIT(zfsvfs);
2457 return (err);
2458 }
2459 }
2460
2461 if (mask & (AT_ATIME|AT_MTIME) ||
2462 ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
2463 XVA_ISSET_REQ(xvap, XAT_READONLY) ||
2464 XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
2465 XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
2466 XVA_ISSET_REQ(xvap, XAT_SYSTEM))))
2467 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
2468 skipaclchk, cr);
2469
2470 if (mask & (AT_UID|AT_GID)) {
2471 int idmask = (mask & (AT_UID|AT_GID));
2472 int take_owner;
2473 int take_group;
2474
2475 /*
2476 * NOTE: even if a new mode is being set,
2477 * we may clear S_ISUID/S_ISGID bits.
2478 */
2479
2480 if (!(mask & AT_MODE))
2481 vap->va_mode = pzp->zp_mode;
2482
2483 /*
2484 * Take ownership or chgrp to group we are a member of
2485 */
2486
2487 take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
2488 take_group = (mask & AT_GID) &&
2489 zfs_groupmember(zfsvfs, vap->va_gid, cr);
2490
2491 /*
2492 * If both AT_UID and AT_GID are set then take_owner and
2493 * take_group must both be set in order to allow taking
2494 * ownership.
2495 *
2496 * Otherwise, send the check through secpolicy_vnode_setattr()
2497 *
2498 */
2499
2500 if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
2501 ((idmask == AT_UID) && take_owner) ||
2502 ((idmask == AT_GID) && take_group)) {
2503 if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
2504 skipaclchk, cr) == 0) {
2505 /*
2506 * Remove setuid/setgid for non-privileged users
2507 */
2508 secpolicy_setid_clear(vap, cr);
2509 trim_mask = (mask & (AT_UID|AT_GID));
2510 } else {
2511 need_policy = TRUE;
2512 }
2513 } else {
2514 need_policy = TRUE;
2515 }
2516 }
2517
2518 mutex_enter(&zp->z_lock);
2519 oldva.va_mode = pzp->zp_mode;
2520 zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
2521 if (mask & AT_XVATTR) {
2522 /*
2523 * Update xvattr mask to include only those attributes
2524 * that are actually changing.
2525 *
2526 * The bits will be restored prior to actually setting
2527 * the attributes, so the caller thinks they were set.
2528 */
2529 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2530 if (xoap->xoa_appendonly !=
2531 ((pzp->zp_flags & ZFS_APPENDONLY) != 0)) {
2532 need_policy = TRUE;
2533 } else {
2534 XVA_CLR_REQ(xvap, XAT_APPENDONLY);
2535 XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
2536 }
2537 }
2538
2539 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2540 if (xoap->xoa_nounlink !=
2541 ((pzp->zp_flags & ZFS_NOUNLINK) != 0)) {
2542 need_policy = TRUE;
2543 } else {
2544 XVA_CLR_REQ(xvap, XAT_NOUNLINK);
2545 XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
2546 }
2547 }
2548
2549 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2550 if (xoap->xoa_immutable !=
2551 ((pzp->zp_flags & ZFS_IMMUTABLE) != 0)) {
2552 need_policy = TRUE;
2553 } else {
2554 XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
2555 XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
2556 }
2557 }
2558
2559 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2560 if (xoap->xoa_nodump !=
2561 ((pzp->zp_flags & ZFS_NODUMP) != 0)) {
2562 need_policy = TRUE;
2563 } else {
2564 XVA_CLR_REQ(xvap, XAT_NODUMP);
2565 XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
2566 }
2567 }
2568
2569 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2570 if (xoap->xoa_av_modified !=
2571 ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0)) {
2572 need_policy = TRUE;
2573 } else {
2574 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
2575 XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
2576 }
2577 }
2578
2579 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2580 if ((vp->v_type != VREG &&
2581 xoap->xoa_av_quarantined) ||
2582 xoap->xoa_av_quarantined !=
2583 ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0)) {
2584 need_policy = TRUE;
2585 } else {
2586 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
2587 XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
2588 }
2589 }
2590
2591 if (need_policy == FALSE &&
2592 (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
2593 XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
2594 need_policy = TRUE;
2595 }
2596 }
2597
2598 mutex_exit(&zp->z_lock);
2599
2600 if (mask & AT_MODE) {
2601 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
2602 err = secpolicy_setid_setsticky_clear(vp, vap,
2603 &oldva, cr);
2604 if (err) {
2605 ZFS_EXIT(zfsvfs);
2606 return (err);
2607 }
2608 trim_mask |= AT_MODE;
2609 } else {
2610 need_policy = TRUE;
2611 }
2612 }
2613
2614 if (need_policy) {
2615 /*
2616 * If trim_mask is set then take-ownership has been granted
2617 * or write_acl is present and the user has the ability to
2618 * modify the mode. In that case remove UID|GID and/or MODE
2619 * from the mask so that secpolicy_vnode_setattr() doesn't
2620 * revoke it.
2621 */
2622
2623 if (trim_mask) {
2624 saved_mask = vap->va_mask;
2625 vap->va_mask &= ~trim_mask;
2626 }
2627 err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
2628 (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
2629 if (err) {
2630 ZFS_EXIT(zfsvfs);
2631 return (err);
2632 }
2633
2634 if (trim_mask)
2635 vap->va_mask |= saved_mask;
2636 }
2637
2638 /*
2639 * secpolicy_vnode_setattr() or take-ownership may have
2640 * changed va_mask
2641 */
2642 mask = vap->va_mask;
2643
2644 tx = dmu_tx_create(zfsvfs->z_os);
2645 dmu_tx_hold_bonus(tx, zp->z_id);
2646 if (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2647 ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid))) {
2648 if (zfsvfs->z_fuid_obj == 0) {
2649 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
2650 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
2651 FUID_SIZE_ESTIMATE(zfsvfs));
2652 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
2653 } else {
2654 dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
2655 dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
2656 FUID_SIZE_ESTIMATE(zfsvfs));
2657 }
2658 }
2659
2660 if (mask & AT_MODE) {
2661 uint64_t pmode = pzp->zp_mode;
2662
2663 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
2664
2665 if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) {
2666 dmu_tx_abort(tx);
2667 ZFS_EXIT(zfsvfs);
2668 return (err);
2669 }
2670 if (pzp->zp_acl.z_acl_extern_obj) {
2671 /* Are we upgrading the ACL from the old V0 format to the new V1? */
2672 if (zfsvfs->z_version <= ZPL_VERSION_FUID &&
2673 pzp->zp_acl.z_acl_version ==
2674 ZFS_ACL_VERSION_INITIAL) {
2675 dmu_tx_hold_free(tx,
2676 pzp->zp_acl.z_acl_extern_obj, 0,
2677 DMU_OBJECT_END);
2678 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2679 0, aclp->z_acl_bytes);
2680 } else {
2681 dmu_tx_hold_write(tx,
2682 pzp->zp_acl.z_acl_extern_obj, 0,
2683 aclp->z_acl_bytes);
2684 }
2685 } else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2686 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2687 0, aclp->z_acl_bytes);
2688 }
2689 }
2690
2691 if ((mask & (AT_UID | AT_GID)) && pzp->zp_xattr != 0) {
2692 err = zfs_zget(zp->z_zfsvfs, pzp->zp_xattr, &attrzp);
2693 if (err) {
2694 dmu_tx_abort(tx);
2695 ZFS_EXIT(zfsvfs);
2696 if (aclp)
2697 zfs_acl_free(aclp);
2698 return (err);
2699 }
2700 dmu_tx_hold_bonus(tx, attrzp->z_id);
2701 }
2702
2703 err = dmu_tx_assign(tx, TXG_NOWAIT);
2704 if (err) {
2705 if (attrzp)
2706 VN_RELE(ZTOV(attrzp));
2707
2708 if (aclp) {
2709 zfs_acl_free(aclp);
2710 aclp = NULL;
2711 }
2712
2713 if (err == ERESTART) {
2714 dmu_tx_wait(tx);
2715 dmu_tx_abort(tx);
2716 goto top;
2717 }
2718 dmu_tx_abort(tx);
2719 ZFS_EXIT(zfsvfs);
2720 return (err);
2721 }
2722
2723 dmu_buf_will_dirty(zp->z_dbuf, tx);
2724
2725 /*
2726 * Set each attribute requested.
2727 * We group settings according to the locks they need to acquire.
2728 *
2729 * Note: you cannot set ctime directly, although it will be
2730 * updated as a side-effect of calling this function.
2731 */
2732
2733 mutex_enter(&zp->z_lock);
2734
2735 if (mask & AT_MODE) {
2736 mutex_enter(&zp->z_acl_lock);
2737 zp->z_phys->zp_mode = new_mode;
2738 err = zfs_aclset_common(zp, aclp, cr, &fuidp, tx);
2739 ASSERT3U(err, ==, 0);
2740 mutex_exit(&zp->z_acl_lock);
2741 }
2742
2743 if (attrzp)
2744 mutex_enter(&attrzp->z_lock);
2745
2746 if (mask & AT_UID) {
2747 pzp->zp_uid = zfs_fuid_create(zfsvfs,
2748 vap->va_uid, cr, ZFS_OWNER, tx, &fuidp);
2749 if (attrzp) {
2750 attrzp->z_phys->zp_uid = zfs_fuid_create(zfsvfs,
2751 vap->va_uid, cr, ZFS_OWNER, tx, &fuidp);
2752 }
2753 }
2754
2755 if (mask & AT_GID) {
2756 pzp->zp_gid = zfs_fuid_create(zfsvfs, vap->va_gid,
2757 cr, ZFS_GROUP, tx, &fuidp);
2758 if (attrzp)
2759 attrzp->z_phys->zp_gid = zfs_fuid_create(zfsvfs,
2760 vap->va_gid, cr, ZFS_GROUP, tx, &fuidp);
2761 }
2762
2763 if (aclp)
2764 zfs_acl_free(aclp);
2765
2766 if (attrzp)
2767 mutex_exit(&attrzp->z_lock);
2768
2769 if (mask & AT_ATIME)
2770 ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
2771
2772 if (mask & AT_MTIME)
2773 ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
2774
2775 /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
2776 if (mask & AT_SIZE)
2777 zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx);
2778 else if (mask != 0)
2779 zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
2780 /*
2781 * Do this after setting the timestamps to prevent the timestamp
2782 * update from toggling the bit.
2783 */
2784
2785 if (xoap && (mask & AT_XVATTR)) {
2786
2787 /*
2788 * Restore the trimmed-off masks
2789 * so that the return masks can be set for the caller.
2790 */
2791
2792 if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
2793 XVA_SET_REQ(xvap, XAT_APPENDONLY);
2794 }
2795 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
2796 XVA_SET_REQ(xvap, XAT_NOUNLINK);
2797 }
2798 if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
2799 XVA_SET_REQ(xvap, XAT_IMMUTABLE);
2800 }
2801 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
2802 XVA_SET_REQ(xvap, XAT_NODUMP);
2803 }
2804 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
2805 XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
2806 }
2807 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
2808 XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
2809 }
2810
2811 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
2812 size_t len;
2813 dmu_object_info_t doi;
2814
2815 ASSERT(vp->v_type == VREG);
2816
2817 /* Grow the bonus buffer if necessary. */
2818 dmu_object_info_from_db(zp->z_dbuf, &doi);
2819 len = sizeof (xoap->xoa_av_scanstamp) +
2820 sizeof (znode_phys_t);
2821 if (len > doi.doi_bonus_size)
2822 VERIFY(dmu_set_bonus(zp->z_dbuf, len, tx) == 0);
2823 }
2824 zfs_xvattr_set(zp, xvap);
2825 }
2826
2827 if (mask != 0)
2828 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
2829
2830 if (fuidp)
2831 zfs_fuid_info_free(fuidp);
2832 mutex_exit(&zp->z_lock);
2833
2834 if (attrzp)
2835 VN_RELE(ZTOV(attrzp));
2836
2837 dmu_tx_commit(tx);
2838
2839 ZFS_EXIT(zfsvfs);
2840 return (err);
2841 }
2842
2843 typedef struct zfs_zlock {
2844 krwlock_t *zl_rwlock; /* lock we acquired */
2845 znode_t *zl_znode; /* znode we held */
2846 struct zfs_zlock *zl_next; /* next in list */
2847 } zfs_zlock_t;
2848
2849 /*
2850 * Drop locks and release vnodes that were held by zfs_rename_lock().
2851 */
2852 static void
2853 zfs_rename_unlock(zfs_zlock_t **zlpp)
2854 {
2855 zfs_zlock_t *zl;
2856
2857 while ((zl = *zlpp) != NULL) {
2858 if (zl->zl_znode != NULL)
2859 VN_RELE(ZTOV(zl->zl_znode));
2860 rw_exit(zl->zl_rwlock);
2861 *zlpp = zl->zl_next;
2862 kmem_free(zl, sizeof (*zl));
2863 }
2864 }
2865
2866 /*
2867 * Search back through the directory tree, using the ".." entries.
2868 * Lock each directory in the chain to prevent concurrent renames.
2869 * Fail any attempt to move a directory into one of its own descendants.
2870 * XXX - z_parent_lock can overlap with map or grow locks
2871 */
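/*
 * Editorial sketch: for "mv /usr/a/b /usr/a/b/c/d", the upward
 * walk starts at the target directory (c) and follows the
 * zp_parent chain; it reaches b == szp before hitting the root,
 * so the descendant check below returns EINVAL and the rename
 * fails without touching any directory entries.
 */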
2872 static int
2873 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
2874 {
2875 zfs_zlock_t *zl;
2876 znode_t *zp = tdzp;
2877 uint64_t rootid = zp->z_zfsvfs->z_root;
2878 uint64_t *oidp = &zp->z_id;
2879 krwlock_t *rwlp = &szp->z_parent_lock;
2880 krw_t rw = RW_WRITER;
2881
2882 /*
2883 * First pass write-locks szp and compares to zp->z_id.
2884 * Later passes read-lock zp and compare to zp->z_parent.
2885 */
2886 do {
2887 if (!rw_tryenter(rwlp, rw)) {
2888 /*
2889 * Another thread is renaming in this path.
2890 * Note that if we are a WRITER, we don't have any
2891 * parent_locks held yet.
2892 */
2893 if (rw == RW_READER && zp->z_id > szp->z_id) {
2894 /*
2895 * Drop our locks and restart
2896 */
2897 zfs_rename_unlock(&zl);
2898 *zlpp = NULL;
2899 zp = tdzp;
2900 oidp = &zp->z_id;
2901 rwlp = &szp->z_parent_lock;
2902 rw = RW_WRITER;
2903 continue;
2904 } else {
2905 /*
2906 * Wait for other thread to drop its locks
2907 */
2908 rw_enter(rwlp, rw);
2909 }
2910 }
2911
2912 zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
2913 zl->zl_rwlock = rwlp;
2914 zl->zl_znode = NULL;
2915 zl->zl_next = *zlpp;
2916 *zlpp = zl;
2917
2918 if (*oidp == szp->z_id) /* We're a descendant of szp */
2919 return (EINVAL);
2920
2921 if (*oidp == rootid) /* We've hit the top */
2922 return (0);
2923
2924 if (rw == RW_READER) { /* i.e. not the first pass */
2925 int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp);
2926 if (error)
2927 return (error);
2928 zl->zl_znode = zp;
2929 }
2930 oidp = &zp->z_phys->zp_parent;
2931 rwlp = &zp->z_parent_lock;
2932 rw = RW_READER;
2933
2934 } while (zp->z_id != sdzp->z_id);
2935
2936 return (0);
2937 }
2938
2939 /*
2940 * Move an entry from the provided source directory to the target
2941 * directory. Change the entry name as indicated.
2942 *
2943 * IN: sdvp - Source directory containing the "old entry".
2944 * snm - Old entry name.
2945 * tdvp - Target directory to contain the "new entry".
2946 * tnm - New entry name.
2947 * cr - credentials of caller.
2948 * ct - caller context
2949 * flags - case flags
2950 *
2951 * RETURN: 0 if success
2952 * error code if failure
2953 *
2954 * Timestamps:
2955 * sdvp,tdvp - ctime|mtime updated
2956 */
2957 /*ARGSUSED*/
2958 static int
2959 zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
2960 caller_context_t *ct, int flags)
2961 {
2962 znode_t *tdzp, *szp, *tzp;
2963 znode_t *sdzp = VTOZ(sdvp);
2964 zfsvfs_t *zfsvfs = sdzp->z_zfsvfs;
2965 zilog_t *zilog;
2966 vnode_t *realvp;
2967 zfs_dirlock_t *sdl, *tdl;
2968 dmu_tx_t *tx;
2969 zfs_zlock_t *zl;
2970 int cmp, serr, terr;
2971 int error = 0;
2972 int zflg = 0;
2973
2974 ZFS_ENTER(zfsvfs);
2975 ZFS_VERIFY_ZP(sdzp);
2976 zilog = zfsvfs->z_log;
2977
2978 /*
2979 * Make sure we have the real vp for the target directory.
2980 */
2981 if (VOP_REALVP(tdvp, &realvp, ct) == 0)
2982 tdvp = realvp;
2983
2984 if (tdvp->v_vfsp != sdvp->v_vfsp) {
2985 ZFS_EXIT(zfsvfs);
2986 return (EXDEV);
2987 }
2988
2989 tdzp = VTOZ(tdvp);
2990 ZFS_VERIFY_ZP(tdzp);
2991 if (zfsvfs->z_utf8 && u8_validate(tnm,
2992 strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
2993 ZFS_EXIT(zfsvfs);
2994 return (EILSEQ);
2995 }
2996
2997 if (flags & FIGNORECASE)
2998 zflg |= ZCILOOK;
2999
3000 top:
3001 szp = NULL;
3002 tzp = NULL;
3003 zl = NULL;
3004
3005 /*
3006 * This is to prevent the creation of links into attribute space
3007 * by renaming a linked file into/out of an attribute directory.
3008 * See the comment in zfs_link() for why this is considered bad.
3009 */
3010 if ((tdzp->z_phys->zp_flags & ZFS_XATTR) !=
3011 (sdzp->z_phys->zp_flags & ZFS_XATTR)) {
3012 ZFS_EXIT(zfsvfs);
3013 return (EINVAL);
3014 }
3015
3016 /*
3017 * Lock source and target directory entries. To prevent deadlock,
3018 * a lock ordering must be defined. We lock the directory with
3019 * the smallest object id first, or if it's a tie, the one with
3020 * the lexically first name.
3021 */
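/*
 * Editorial sketch of the deadlock being avoided: if thread 1 does
 * "mv A/x B/x" while thread 2 does "mv B/y A/y" and each locked its
 * source directory first, thread 1 would hold A waiting for B while
 * thread 2 holds B waiting for A. Ordering by object id (with the
 * name as tie-breaker) makes both threads take the two dirlocks in
 * the same order.
 */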
3022 if (sdzp->z_id < tdzp->z_id) {
3023 cmp = -1;
3024 } else if (sdzp->z_id > tdzp->z_id) {
3025 cmp = 1;
3026 } else {
3027 /*
3028 * First compare the two name arguments without
3029 * considering any case folding.
3030 */
3031 int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
3032
3033 cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
3034 ASSERT(error == 0 || !zfsvfs->z_utf8);
3035 if (cmp == 0) {
3036 /*
3037 * POSIX: "If the old argument and the new argument
3038 * both refer to links to the same existing file,
3039 * the rename() function shall return successfully
3040 * and perform no other action."
3041 */
3042 ZFS_EXIT(zfsvfs);
3043 return (0);
3044 }
3045 /*
3046 * If the file system is case-folding, then we may
3047 * have some more checking to do. A case-folding file
3048 * system either supports mixed case sensitivity
3049 * access or is completely case-insensitive. Note
3050 * that the file system is always case-preserving.
3051 *
3052 * In mixed sensitivity mode case sensitive behavior
3053 * is the default. FIGNORECASE must be used to
3054 * explicitly request case insensitive behavior.
3055 *
3056 * If the source and target names provided differ only
3057 * by case (e.g., a request to rename 'tim' to 'Tim'),
3058 * we will treat this as a special case in the
3059 * case-insensitive mode: as long as the source name
3060 * is an exact match, we will allow this to proceed as
3061 * a name-change request.
3062 */
3063 if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
3064 (zfsvfs->z_case == ZFS_CASE_MIXED &&
3065 flags & FIGNORECASE)) &&
3066 u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
3067 &error) == 0) {
3068 /*
3069 * Case-preserving rename request; require exact
3070 * name matches.
3071 */
3072 zflg |= ZCIEXACT;
3073 zflg &= ~ZCILOOK;
3074 }
3075 }
3076
3077 if (cmp < 0) {
3078 serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
3079 ZEXISTS | zflg, NULL, NULL);
3080 terr = zfs_dirent_lock(&tdl,
3081 tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
3082 } else {
3083 terr = zfs_dirent_lock(&tdl,
3084 tdzp, tnm, &tzp, zflg, NULL, NULL);
3085 serr = zfs_dirent_lock(&sdl,
3086 sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
3087 NULL, NULL);
3088 }
3089
3090 if (serr) {
3091 /*
3092 * Source entry invalid or not there.
3093 */
3094 if (!terr) {
3095 zfs_dirent_unlock(tdl);
3096 if (tzp)
3097 VN_RELE(ZTOV(tzp));
3098 }
3099 if (strcmp(snm, "..") == 0)
3100 serr = EINVAL;
3101 ZFS_EXIT(zfsvfs);
3102 return (serr);
3103 }
3104 if (terr) {
3105 zfs_dirent_unlock(sdl);
3106 VN_RELE(ZTOV(szp));
3107 if (strcmp(tnm, "..") == 0)
3108 terr = EINVAL;
3109 ZFS_EXIT(zfsvfs);
3110 return (terr);
3111 }
3112
3113 /*
3114 * Must have write access at the source to remove the old entry
3115 * and write access at the target to create the new entry.
3116 * Note that if target and source are the same, this can be
3117 * done in a single check.
3118 */
3119
3120 if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
3121 goto out;
3122
3123 if (ZTOV(szp)->v_type == VDIR) {
3124 /*
3125 * Check to make sure rename is valid.
3126 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
3127 */
3128 if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
3129 goto out;
3130 }
3131
3132 /*
3133 * Does target exist?
3134 */
3135 if (tzp) {
3136 /*
3137 * Source and target must be the same type.
3138 */
3139 if (ZTOV(szp)->v_type == VDIR) {
3140 if (ZTOV(tzp)->v_type != VDIR) {
3141 error = ENOTDIR;
3142 goto out;
3143 }
3144 } else {
3145 if (ZTOV(tzp)->v_type == VDIR) {
3146 error = EISDIR;
3147 goto out;
3148 }
3149 }
3150 /*
3151 * POSIX dictates that when the source and target
3152 * entries refer to the same file object, rename
3153 * must do nothing and exit without error.
3154 */
3155 if (szp->z_id == tzp->z_id) {
3156 error = 0;
3157 goto out;
3158 }
3159 }
3160
3161 vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
3162 if (tzp)
3163 vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct);
3164
3165 /*
3166 * Notify the target directory if it is not the same
3167 * as the source directory.
3168 */
3169 if (tdvp != sdvp) {
3170 vnevent_rename_dest_dir(tdvp, ct);
3171 }
3172
3173 tx = dmu_tx_create(zfsvfs->z_os);
3174 dmu_tx_hold_bonus(tx, szp->z_id); /* nlink changes */
3175 dmu_tx_hold_bonus(tx, sdzp->z_id); /* nlink changes */
3176 dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
3177 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
3178 if (sdzp != tdzp)
3179 dmu_tx_hold_bonus(tx, tdzp->z_id); /* nlink changes */
3180 if (tzp)
3181 dmu_tx_hold_bonus(tx, tzp->z_id); /* parent changes */
3182 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3183 error = dmu_tx_assign(tx, TXG_NOWAIT);
3184 if (error) {
3185 if (zl != NULL)
3186 zfs_rename_unlock(&zl);
3187 zfs_dirent_unlock(sdl);
3188 zfs_dirent_unlock(tdl);
3189 VN_RELE(ZTOV(szp));
3190 if (tzp)
3191 VN_RELE(ZTOV(tzp));
3192 if (error == ERESTART) {
3193 dmu_tx_wait(tx);
3194 dmu_tx_abort(tx);
3195 goto top;
3196 }
3197 dmu_tx_abort(tx);
3198 ZFS_EXIT(zfsvfs);
3199 return (error);
3200 }
3201
3202 if (tzp) /* Attempt to remove the existing target */
3203 error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
3204
3205 if (error == 0) {
3206 error = zfs_link_create(tdl, szp, tx, ZRENAMING);
3207 if (error == 0) {
3208 szp->z_phys->zp_flags |= ZFS_AV_MODIFIED;
3209
3210 error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
3211 ASSERT(error == 0);
3212
3213 zfs_log_rename(zilog, tx,
3214 TX_RENAME | (flags & FIGNORECASE ? TX_CI : 0),
3215 sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp);
3216
3217 /* Update path information for the target vnode */
3218 vn_renamepath(tdvp, ZTOV(szp), tnm, strlen(tnm));
3219 }
3220 }
3221
3222 dmu_tx_commit(tx);
3223 out:
3224 if (zl != NULL)
3225 zfs_rename_unlock(&zl);
3226
3227 zfs_dirent_unlock(sdl);
3228 zfs_dirent_unlock(tdl);
3229
3230 VN_RELE(ZTOV(szp));
3231 if (tzp)
3232 VN_RELE(ZTOV(tzp));
3233
3234 ZFS_EXIT(zfsvfs);
3235 return (error);
3236 }
3237
3238 /*
3239 * Insert the indicated symbolic reference entry into the directory.
3240 *
3241 * IN: dvp - Directory to contain new symbolic link.
3242 * name - Name for new symlink entry.
3243 * vap - Attributes of new entry.
3244 * link - Target path of new symlink.
3245 * cr - credentials of caller.
3246 * ct - caller context
3247 * flags - case flags
3248 *
3249 * RETURN: 0 if success
3250 * error code if failure
3251 *
3252 * Timestamps:
3253 * dvp - ctime|mtime updated
3254 */
3255 /*ARGSUSED*/
3256 static int
3257 zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr,
3258 caller_context_t *ct, int flags)
3259 {
3260 znode_t *zp, *dzp = VTOZ(dvp);
3261 zfs_dirlock_t *dl;
3262 dmu_tx_t *tx;
3263 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
3264 zilog_t *zilog;
3265 int len = strlen(link);
3266 int error;
3267 int zflg = ZNEW;
3268 zfs_fuid_info_t *fuidp = NULL;
3269
3270 ASSERT(vap->va_type == VLNK);
3271
3272 ZFS_ENTER(zfsvfs);
3273 ZFS_VERIFY_ZP(dzp);
3274 zilog = zfsvfs->z_log;
3275
3276 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
3277 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3278 ZFS_EXIT(zfsvfs);
3279 return (EILSEQ);
3280 }
3281 if (flags & FIGNORECASE)
3282 zflg |= ZCILOOK;
3283 top:
3284 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
3285 ZFS_EXIT(zfsvfs);
3286 return (error);
3287 }
3288
3289 if (len > MAXPATHLEN) {
3290 ZFS_EXIT(zfsvfs);
3291 return (ENAMETOOLONG);
3292 }
3293
3294 /*
3295 * Attempt to lock directory; fail if entry already exists.
3296 */
3297 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
3298 if (error) {
3299 ZFS_EXIT(zfsvfs);
3300 return (error);
3301 }
3302
3303 tx = dmu_tx_create(zfsvfs->z_os);
3304 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
3305 dmu_tx_hold_bonus(tx, dzp->z_id);
3306 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
3307 if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
3308 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE);
3309 if (IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))) {
3310 if (zfsvfs->z_fuid_obj == 0) {
3311 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
3312 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
3313 FUID_SIZE_ESTIMATE(zfsvfs));
3314 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
3315 } else {
3316 dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
3317 dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
3318 FUID_SIZE_ESTIMATE(zfsvfs));
3319 }
3320 }
3321 error = dmu_tx_assign(tx, TXG_NOWAIT);
3322 if (error) {
3323 zfs_dirent_unlock(dl);
3324 if (error == ERESTART) {
3325 dmu_tx_wait(tx);
3326 dmu_tx_abort(tx);
3327 goto top;
3328 }
3329 dmu_tx_abort(tx);
3330 ZFS_EXIT(zfsvfs);
3331 return (error);
3332 }
3333
3334 dmu_buf_will_dirty(dzp->z_dbuf, tx);
3335
3336 /*
3337 * Create a new object for the symlink.
3338 * Put the link content into bonus buffer if it will fit;
3339 * otherwise, store it just like any other file data.
3340 */
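/*
 * Editorial note: the inline case below requires
 *
 *	sizeof (znode_phys_t) + len <= dmu_bonus_max()
 *
 * and the bonus buffer is only a few hundred bytes in this on-disk
 * format, so just short link targets are stored inline; longer
 * targets get their own data block via zfs_grow_blocksize() and an
 * explicit dmu_buf_hold()/bcopy().
 */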
3341 if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) {
3342 zfs_mknode(dzp, vap, tx, cr, 0, &zp, len, NULL, &fuidp);
3343 if (len != 0)
3344 bcopy(link, zp->z_phys + 1, len);
3345 } else {
3346 dmu_buf_t *dbp;
3347
3348 zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, NULL, &fuidp);
3349 /*
3350 * Nothing can access the znode yet so no locking needed
3351 * for growing the znode's blocksize.
3352 */
3353 zfs_grow_blocksize(zp, len, tx);
3354
3355 VERIFY(0 == dmu_buf_hold(zfsvfs->z_os,
3356 zp->z_id, 0, FTAG, &dbp));
3357 dmu_buf_will_dirty(dbp, tx);
3358
3359 ASSERT3U(len, <=, dbp->db_size);
3360 bcopy(link, dbp->db_data, len);
3361 dmu_buf_rele(dbp, FTAG);
3362 }
3363 zp->z_phys->zp_size = len;
3364
3365 /*
3366 * Insert the new object into the directory.
3367 */
3368 (void) zfs_link_create(dl, zp, tx, ZNEW);
3369 out:
3370 if (error == 0) {
3371 uint64_t txtype = TX_SYMLINK;
3372 if (flags & FIGNORECASE)
3373 txtype |= TX_CI;
3374 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
3375 }
3376 if (fuidp)
3377 zfs_fuid_info_free(fuidp);
3378
3379 dmu_tx_commit(tx);
3380
3381 zfs_dirent_unlock(dl);
3382
3383 VN_RELE(ZTOV(zp));
3384
3385 ZFS_EXIT(zfsvfs);
3386 return (error);
3387 }
3388
3389 /*
3390 * Return, in the buffer contained in the provided uio structure,
3391 * the symbolic path referred to by vp.
3392 *
3393 * IN: vp - vnode of symbolic link.
3394 * uio - structure to contain the link path.
3395 * cr - credentials of caller.
3396 * ct - caller context
3397 *
3398 * OUT: uio - structure to contain the link path.
3399 *
3400 * RETURN: 0 if success
3401 * error code if failure
3402 *
3403 * Timestamps:
3404 * vp - atime updated
3405 */
3406 /* ARGSUSED */
3407 static int
3408 zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
3409 {
3410 znode_t *zp = VTOZ(vp);
3411 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
3412 size_t bufsz;
3413 int error;
3414
3415 ZFS_ENTER(zfsvfs);
3416 ZFS_VERIFY_ZP(zp);
3417
3418 bufsz = (size_t)zp->z_phys->zp_size;
3419 if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) {
3420 error = uiomove(zp->z_phys + 1,
3421 MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
3422 } else {
3423 dmu_buf_t *dbp;
3424 error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp);
3425 if (error) {
3426 ZFS_EXIT(zfsvfs);
3427 return (error);
3428 }
3429 error = uiomove(dbp->db_data,
3430 MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
3431 dmu_buf_rele(dbp, FTAG);
3432 }
3433
3434 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
3435 ZFS_EXIT(zfsvfs);
3436 return (error);
3437 }
3438
3439 /*
3440 * Insert a new entry into directory tdvp referencing svp.
3441 *
3442 * IN: tdvp - Directory to contain new entry.
3443 * svp - vnode of new entry.
3444 * name - name of new entry.
3445 * cr - credentials of caller.
3446 * ct - caller context
3447 *
3448 * RETURN: 0 if success
3449 * error code if failure
3450 *
3451 * Timestamps:
3452 * tdvp - ctime|mtime updated
3453 * svp - ctime updated
3454 */
3455 /* ARGSUSED */
3456 static int
3457 zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
3458 caller_context_t *ct, int flags)
3459 {
3460 znode_t *dzp = VTOZ(tdvp);
3461 znode_t *tzp, *szp;
3462 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
3463 zilog_t *zilog;
3464 zfs_dirlock_t *dl;
3465 dmu_tx_t *tx;
3466 vnode_t *realvp;
3467 int error;
3468 int zf = ZNEW;
3469 uid_t owner;
3470
3471 ASSERT(tdvp->v_type == VDIR);
3472
3473 ZFS_ENTER(zfsvfs);
3474 ZFS_VERIFY_ZP(dzp);
3475 zilog = zfsvfs->z_log;
3476
3477 if (VOP_REALVP(svp, &realvp, ct) == 0)
3478 svp = realvp;
3479
3480 if (svp->v_vfsp != tdvp->v_vfsp) {
3481 ZFS_EXIT(zfsvfs);
3482 return (EXDEV);
3483 }
3484 szp = VTOZ(svp);
3485 ZFS_VERIFY_ZP(szp);
3486
3487 if (zfsvfs->z_utf8 && u8_validate(name,
3488 strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3489 ZFS_EXIT(zfsvfs);
3490 return (EILSEQ);
3491 }
3492 if (flags & FIGNORECASE)
3493 zf |= ZCILOOK;
3494
3495 top:
3496 /*
3497 * We do not support links between attributes and non-attributes
3498 * because of the potential security risk of creating links
3499 * into "normal" file space in order to circumvent restrictions
3500 * imposed in attribute space.
3501 */
3502 if ((szp->z_phys->zp_flags & ZFS_XATTR) !=
3503 (dzp->z_phys->zp_flags & ZFS_XATTR)) {
3504 ZFS_EXIT(zfsvfs);
3505 return (EINVAL);
3506 }
3507
3508 /*
3509 * POSIX dictates that we return EPERM here.
3510 * Better choices include ENOTSUP or EISDIR.
3511 */
3512 if (svp->v_type == VDIR) {
3513 ZFS_EXIT(zfsvfs);
3514 return (EPERM);
3515 }
3516
3517 owner = zfs_fuid_map_id(zfsvfs, szp->z_phys->zp_uid, cr, ZFS_OWNER);
3518 if (owner != crgetuid(cr) &&
3519 secpolicy_basic_link(cr) != 0) {
3520 ZFS_EXIT(zfsvfs);
3521 return (EPERM);
3522 }
3523
3524 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
3525 ZFS_EXIT(zfsvfs);
3526 return (error);
3527 }
3528
3529 /*
3530 * Attempt to lock directory; fail if entry already exists.
3531 */
3532 error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
3533 if (error) {
3534 ZFS_EXIT(zfsvfs);
3535 return (error);
3536 }
3537
3538 tx = dmu_tx_create(zfsvfs->z_os);
3539 dmu_tx_hold_bonus(tx, szp->z_id);
3540 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
3541 error = dmu_tx_assign(tx, TXG_NOWAIT);
3542 if (error) {
3543 zfs_dirent_unlock(dl);
3544 if (error == ERESTART) {
3545 dmu_tx_wait(tx);
3546 dmu_tx_abort(tx);
3547 goto top;
3548 }
3549 dmu_tx_abort(tx);
3550 ZFS_EXIT(zfsvfs);
3551 return (error);
3552 }
3553
3554 error = zfs_link_create(dl, szp, tx, 0);
3555
3556 if (error == 0) {
3557 uint64_t txtype = TX_LINK;
3558 if (flags & FIGNORECASE)
3559 txtype |= TX_CI;
3560 zfs_log_link(zilog, tx, txtype, dzp, szp, name);
3561 }
3562
3563 dmu_tx_commit(tx);
3564
3565 zfs_dirent_unlock(dl);
3566
3567 if (error == 0) {
3568 vnevent_link(svp, ct);
3569 }
3570
3571 ZFS_EXIT(zfsvfs);
3572 return (error);
3573 }
3574
3575 /*
3576 * zfs_null_putapage() is used when the file system has been force
3577 * unmounted. It just drops the pages.
3578 */
3579 /* ARGSUSED */
3580 static int
3581 zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
3582 size_t *lenp, int flags, cred_t *cr)
3583 {
3584 pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR);
3585 return (0);
3586 }
3587
3588 /*
3589 * Push a page out to disk, klustering if possible.
3590 *
3591 * IN: vp - file to push page to.
3592 * pp - page to push.
3593 * flags - additional flags.
3594 * cr - credentials of caller.
3595 *
3596 * OUT: offp - start of range pushed.
3597 * lenp - len of range pushed.
3598 *
3599 * RETURN: 0 if success
3600 * error code if failure
3601 *
3602 * NOTE: callers must have locked the page to be pushed. On
3603 * exit, the page (and all other pages in the kluster) must be
3604 * unlocked.
3605 */
3606 /* ARGSUSED */
3607 static int
3608 zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
3609 size_t *lenp, int flags, cred_t *cr)
3610 {
3611 znode_t *zp = VTOZ(vp);
3612 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
3613 zilog_t *zilog = zfsvfs->z_log;
3614 dmu_tx_t *tx;
3615 rl_t *rl;
3616 u_offset_t off, koff;
3617 size_t len, klen;
3618 uint64_t filesz;
3619 int err;
3620
3621 filesz = zp->z_phys->zp_size;
3622 off = pp->p_offset;
3623 len = PAGESIZE;
3624 /*
3625 * If our blocksize is bigger than the page size, try to kluster
3626 * multiple pages so that we write a full block (thus avoiding
3627 * a read-modify-write).
3628 */
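/*
 * Worked example (editorial; assumes 8K pages): with
 * zp->z_blksz == 128K and off == 0x22000,
 *
 *	koff = P2ALIGN(0x22000, 128K) = 0x20000
 *	klen = 128K
 *
 * so pvn_write_kluster() can gather all 16 pages of that block and
 * the DMU writes it once instead of read-modify-writing per page.
 */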
3629 if (off < filesz && zp->z_blksz > PAGESIZE) {
3630 if (!ISP2(zp->z_blksz)) {
3631 /* Only one block in the file. */
3632 klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
3633 koff = 0;
3634 } else {
3635 klen = zp->z_blksz;
3636 koff = P2ALIGN(off, (u_offset_t)klen);
3637 }
3638 ASSERT(koff <= filesz);
3639 if (koff + klen > filesz)
3640 klen = P2ROUNDUP(filesz - koff, (uint64_t)PAGESIZE);
3641 pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags);
3642 }
3643 ASSERT3U(btop(len), ==, btopr(len));
3644 top:
3645 rl = zfs_range_lock(zp, off, len, RL_WRITER);
3646 /*
3647 * Can't push pages past end-of-file.
3648 */
3649 filesz = zp->z_phys->zp_size;
3650 if (off >= filesz) {
3651 /* ignore all pages */
3652 err = 0;
3653 goto out;
3654 } else if (off + len > filesz) {
3655 int npages = btopr(filesz - off);
3656 page_t *trunc;
3657
3658 page_list_break(&pp, &trunc, npages);
3659 /* ignore pages past end of file */
3660 if (trunc)
3661 pvn_write_done(trunc, flags);
3662 len = filesz - off;
3663 }
3664
3665 tx = dmu_tx_create(zfsvfs->z_os);
3666 dmu_tx_hold_write(tx, zp->z_id, off, len);
3667 dmu_tx_hold_bonus(tx, zp->z_id);
3668 err = dmu_tx_assign(tx, TXG_NOWAIT);
3669 if (err != 0) {
3670 if (err == ERESTART) {
3671 zfs_range_unlock(rl);
3672 dmu_tx_wait(tx);
3673 dmu_tx_abort(tx);
3674 err = 0;
3675 goto top;
3676 }
3677 dmu_tx_abort(tx);
3678 goto out;
3679 }
3680
3681 if (zp->z_blksz <= PAGESIZE) {
3682 caddr_t va = zfs_map_page(pp, S_READ);
3683 ASSERT3U(len, <=, PAGESIZE);
3684 dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
3685 zfs_unmap_page(pp, va);
3686 } else {
3687 err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx);
3688 }
3689
3690 if (err == 0) {
3691 zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
3692 zfs_log_write(zilog, tx, TX_WRITE, zp, off, len, 0);
3693 dmu_tx_commit(tx);
3694 }
3695
3696 out:
3697 zfs_range_unlock(rl);
3698 pvn_write_done(pp, (err ? B_ERROR : 0) | flags);
3699 if (offp)
3700 *offp = off;
3701 if (lenp)
3702 *lenp = len;
3703
3704 return (err);
3705 }
3706
3707 /*
3708 * Copy the portion of the file indicated from pages into the file.
3709 * The pages are stored in a page list attached to the file's vnode.
3710 *
3711 * IN: vp - vnode of file to push page data to.
3712 * off - position in file to put data.
3713 * len - amount of data to write.
3714 * flags - flags to control the operation.
3715 * cr - credentials of caller.
3716 * ct - caller context.
3717 *
3718 * RETURN: 0 if success
3719 * error code if failure
3720 *
3721 * Timestamps:
3722 * vp - ctime|mtime updated
3723 */
3724 /*ARGSUSED*/
3725 static int
3726 zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
3727 caller_context_t *ct)
3728 {
3729 znode_t *zp = VTOZ(vp);
3730 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
3731 page_t *pp;
3732 size_t io_len;
3733 u_offset_t io_off;
3734 uint64_t filesz;
3735 int error = 0;
3736
3737 ZFS_ENTER(zfsvfs);
3738 ZFS_VERIFY_ZP(zp);
3739
3740 if (len == 0) {
3741 /*
3742 * Search the entire vp list for pages >= off.
3743 */
3744 error = pvn_vplist_dirty(vp, (u_offset_t)off, zfs_putapage,
3745 flags, cr);
3746 goto out;
3747 }
3748
3749 filesz = zp->z_phys->zp_size; /* get consistent copy of zp_size */
3750 if (off > filesz) {
3751 /* past end of file */
3752 ZFS_EXIT(zfsvfs);
3753 return (0);
3754 }
3755
3756 len = MIN(len, filesz - off);
3757
3758 for (io_off = off; io_off < off + len; io_off += io_len) {
3759 if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
3760 pp = page_lookup(vp, io_off,
3761 (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED);
3762 } else {
3763 pp = page_lookup_nowait(vp, io_off,
3764 (flags & B_FREE) ? SE_EXCL : SE_SHARED);
3765 }
3766
3767 if (pp != NULL && pvn_getdirty(pp, flags)) {
3768 int err;
3769
3770 /*
3771 * Found a dirty page to push
3772 */
3773 err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr);
3774 if (err)
3775 error = err;
3776 } else {
3777 io_len = PAGESIZE;
3778 }
3779 }
3780 out:
3781 if ((flags & B_ASYNC) == 0)
3782 zil_commit(zfsvfs->z_log, UINT64_MAX, zp->z_id);
3783 ZFS_EXIT(zfsvfs);
3784 return (error);
3785 }
3786
3787 /*ARGSUSED*/
3788 void
3789 zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
3790 {
3791 znode_t *zp = VTOZ(vp);
3792 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
3793 int error;
3794
3795 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
3796 if (zp->z_dbuf == NULL) {
3797 /*
3798 * The fs has been unmounted, or we did a
3799 * suspend/resume and this file no longer exists.
3800 */
3801 if (vn_has_cached_data(vp)) {
3802 (void) pvn_vplist_dirty(vp, 0, zfs_null_putapage,
3803 B_INVAL, cr);
3804 }
3805
3806 mutex_enter(&zp->z_lock);
3807 vp->v_count = 0; /* count arrives as 1 */
3808 mutex_exit(&zp->z_lock);
3809 rw_exit(&zfsvfs->z_teardown_inactive_lock);
3810 zfs_znode_free(zp);
3811 return;
3812 }
3813
3814 /*
3815 * Attempt to push any data in the page cache. If this fails
3816 * we will get kicked out later in zfs_zinactive().
3817 */
3818 if (vn_has_cached_data(vp)) {
3819 (void) pvn_vplist_dirty(vp, 0, zfs_putapage, B_INVAL|B_ASYNC,
3820 cr);
3821 }
3822
3823 if (zp->z_atime_dirty && zp->z_unlinked == 0) {
3824 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
3825
3826 dmu_tx_hold_bonus(tx, zp->z_id);
3827 error = dmu_tx_assign(tx, TXG_WAIT);
3828 if (error) {
3829 dmu_tx_abort(tx);
3830 } else {
3831 dmu_buf_will_dirty(zp->z_dbuf, tx);
3832 mutex_enter(&zp->z_lock);
3833 zp->z_atime_dirty = 0;
3834 mutex_exit(&zp->z_lock);
3835 dmu_tx_commit(tx);
3836 }
3837 }
3838
3839 zfs_zinactive(zp);
3840 rw_exit(&zfsvfs->z_teardown_inactive_lock);
3841 }
3842
3843 /*
3844 * Bounds-check the seek operation.
3845 *
3846 * IN: vp - vnode seeking within
3847 * ooff - old file offset
3848 * noffp - pointer to new file offset
3849 * ct - caller context
3850 *
3851 * RETURN: 0 if success
3852 * EINVAL if new offset invalid
3853 */
3854 /* ARGSUSED */
3855 static int
3856 zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp,
3857 caller_context_t *ct)
3858 {
3859 if (vp->v_type == VDIR)
3860 return (0);
3861 return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
3862 }
3863
3864 /*
3865 * Pre-filter the generic locking function to trap attempts to place
3866 * a mandatory lock on a memory mapped file.
3867 */
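/*
 * Editorial note: MANDMODE() reflects the classic SVR4 convention
 * that mandatory locking is enabled when the setgid bit is set and
 * group-execute is clear (e.g. "chmod 2644 file"); such a file may
 * not be record-locked while it is mapped, and vice versa.
 */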
3868 static int
3869 zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
3870 flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct)
3871 {
3872 znode_t *zp = VTOZ(vp);
3873 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
3874 int error;
3875
3876 ZFS_ENTER(zfsvfs);
3877 ZFS_VERIFY_ZP(zp);
3878
3879 /*
3880 * We are following the UFS semantics with respect to mapcnt
3881 * here: If we see that the file is mapped already, then we will
3882 * return an error, but we don't worry about races between this
3883 * function and zfs_map().
3884 */
3885 if (zp->z_mapcnt > 0 && MANDMODE((mode_t)zp->z_phys->zp_mode)) {
3886 ZFS_EXIT(zfsvfs);
3887 return (EAGAIN);
3888 }
3889 error = fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct);
3890 ZFS_EXIT(zfsvfs);
3891 return (error);
3892 }
3893
3894 /*
3895 * If we can't find a page in the cache, we will create a new page
3896 * and fill it with file data. For efficiency, we may try to fill
3897 * multiple pages at once (klustering).
3898 */
3899 static int
3900 zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
3901 caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw)
3902 {
3903 znode_t *zp = VTOZ(vp);
3904 page_t *pp, *cur_pp;
3905 objset_t *os = zp->z_zfsvfs->z_os;
3906 caddr_t va;
3907 u_offset_t io_off, total;
3908 uint64_t oid = zp->z_id;
3909 size_t io_len;
3910 uint64_t filesz;
3911 int err;
3912
3913 /*
3914 * If we are only asking for a single page don't bother klustering.
3915 */
3916 filesz = zp->z_phys->zp_size; /* get consistent copy of zp_size */
3917 if (off >= filesz)
3918 return (EFAULT);
3919 if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) {
3920 io_off = off;
3921 io_len = PAGESIZE;
3922 pp = page_create_va(vp, io_off, io_len, PG_WAIT, seg, addr);
3923 } else {
3924 /*
3925 * Try to fill a kluster of pages (a block's worth).
3926 */
3927 size_t klen;
3928 u_offset_t koff;
3929
3930 if (!ISP2(zp->z_blksz)) {
3931 /* Only one block in the file. */
3932 klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
3933 koff = 0;
3934 } else {
3935 /*
3936 * It would be ideal to align our offset to the
3937 * blocksize but doing so has resulted in some
3938 * strange application crashes. For now, we
3939 * leave the offset as is and only adjust the
3940 * length if we are off the end of the file.
3941 */
3942 koff = off;
3943 klen = plsz;
3944 }
3945 ASSERT(koff <= filesz);
3946 if (koff + klen > filesz)
3947 klen = P2ROUNDUP(filesz, (uint64_t)PAGESIZE) - koff;
3948 ASSERT3U(off, >=, koff);
3949 ASSERT3U(off, <, koff + klen);
3950 pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
3951 &io_len, koff, klen, 0);
3952 }
3953 if (pp == NULL) {
3954 /*
3955 * Some other thread entered the page before us.
3956 * Return to zfs_getpage to retry the lookup.
3957 */
3958 *pl = NULL;
3959 return (0);
3960 }
3961
3962 /*
3963 * Fill the pages in the kluster.
3964 */
3965 cur_pp = pp;
3966 for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
3967 ASSERT3U(io_off, ==, cur_pp->p_offset);
3968 va = zfs_map_page(cur_pp, S_WRITE);
3969 err = dmu_read(os, oid, io_off, PAGESIZE, va);
3970 zfs_unmap_page(cur_pp, va);
3971 if (err) {
3972 /* On error, toss the entire kluster */
3973 pvn_read_done(pp, B_ERROR);
3974 /* convert checksum errors into IO errors */
3975 if (err == ECKSUM)
3976 err = EIO;
3977 return (err);
3978 }
3979 cur_pp = cur_pp->p_next;
3980 }
3981 out:
3982 /*
3983 * Fill in the page list array from the kluster. If
3984 * there are too many pages in the kluster, return
3985 * as many pages as possible starting from the desired
3986 * offset `off'.
3987 * NOTE: the page list will always be null terminated.
3988 */
3989 pvn_plist_init(pp, pl, plsz, off, io_len, rw);
3990
3991 return (0);
3992 }
3993
3994 /*
3995 * Return pointers to the pages for the file region [off, off + len]
3996 * in the pl array. If plsz is greater than len, this function may
3997 * also return page pointers from before or after the specified
3998 * region (i.e. some region [off', off' + plsz]). These additional
3999 * pages are only returned if they are already in the cache, or were
4000 * created as part of a klustered read.
4001 *
4002 * IN: vp - vnode of file to get data from.
4003 * off - position in file to get data from.
4004 * len - amount of data to retrieve.
4005 * plsz - length of provided page list.
4006 * seg - segment to obtain pages for.
4007 * addr - virtual address of fault.
4008 * rw - mode of created pages.
4009 * cr - credentials of caller.
4010 * ct - caller context.
4011 *
4012 * OUT: protp - protection mode of created pages.
4013 * pl - list of pages created.
4014 *
4015 * RETURN: 0 if success
4016 * error code if failure
4017 *
4018 * Timestamps:
4019 * vp - atime updated
4020 */
4021 /* ARGSUSED */
4022 static int
4023 zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
4024 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
4025 enum seg_rw rw, cred_t *cr, caller_context_t *ct)
4026 {
4027 znode_t *zp = VTOZ(vp);
4028 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4029 page_t *pp, **pl0 = pl;
4030 int need_unlock = 0, err = 0;
4031 offset_t orig_off;
4032
4033 ZFS_ENTER(zfsvfs);
4034 ZFS_VERIFY_ZP(zp);
4035
4036 if (protp)
4037 *protp = PROT_ALL;
4038
4039 /* no faultahead (for now) */
4040 if (pl == NULL) {
4041 ZFS_EXIT(zfsvfs);
4042 return (0);
4043 }
4044
4045 /* can't fault past EOF */
4046 if (off >= zp->z_phys->zp_size) {
4047 ZFS_EXIT(zfsvfs);
4048 return (EFAULT);
4049 }
4050 orig_off = off;
4051
4052 /*
4053 * If we already own the lock, then we must be page faulting
4054 * in the middle of a write to this file (i.e., we are writing
4055 * to this file using data from a mapped region of the file).
4056 */
4057 if (rw_owner(&zp->z_map_lock) != curthread) {
4058 rw_enter(&zp->z_map_lock, RW_WRITER);
4059 need_unlock = TRUE;
4060 }
4061
4062 /*
4063 * Loop through the requested range [off, off + len] looking
4064 * for pages. If we don't find a page, we will need to create
4065 * a new page and fill it with data from the file.
4066 */
4067 while (len > 0) {
4068 if (plsz < PAGESIZE)
4069 break;
4070 if (pp = page_lookup(vp, off, SE_SHARED)) {
4071 *pl++ = pp;
4072 off += PAGESIZE;
4073 addr += PAGESIZE;
4074 len -= PAGESIZE;
4075 plsz -= PAGESIZE;
4076 } else {
4077 err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw);
4078 if (err)
4079 goto out;
4080 /*
4081 * klustering may have changed our region
4082 * to be block aligned.
4083 */
4084 if (((pp = *pl) != 0) && (off != pp->p_offset)) {
4085 int delta = off - pp->p_offset;
4086 len += delta;
4087 off -= delta;
4088 addr -= delta;
4089 }
4090 while (*pl) {
4091 pl++;
4092 off += PAGESIZE;
4093 addr += PAGESIZE;
4094 plsz -= PAGESIZE;
4095 if (len > PAGESIZE)
4096 len -= PAGESIZE;
4097 else
4098 len = 0;
4099 }
4100 }
4101 }
4102
4103 /*
4104 * Fill out the page array with any pages already in the cache.
4105 */
4106 while (plsz > 0) {
4107 pp = page_lookup_nowait(vp, off, SE_SHARED);
4108 if (pp == NULL)
4109 break;
4110 *pl++ = pp;
4111 off += PAGESIZE;
4112 plsz -= PAGESIZE;
4113 }
4114
4115 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4116 out:
4117 /*
4118 * We can't grab the range lock for the page as reader (which
4119 * would stop truncation) because that leads to deadlock. So we
4120 * need to recheck the file size.
4121 */
4122 if (orig_off >= zp->z_phys->zp_size)
4123 err = EFAULT;
4124 if (err) {
4125 /*
4126 * Release any pages we have previously locked.
4127 */
4128 while (pl > pl0)
4129 page_unlock(*--pl);
4130 }
4131
4132 *pl = NULL;
4133
4134 if (need_unlock)
4135 rw_exit(&zp->z_map_lock);
4136
4137 ZFS_EXIT(zfsvfs);
4138 return (err);
4139 }
4140
4141 /*
4142 * Request a memory map for a section of a file. This code interacts
4143 * with common code and the VM system as follows:
4144 *
4145 * common code calls mmap(), which ends up in smmap_common()
4146 *
4147 * this calls VOP_MAP(), which takes you into (say) zfs
4148 *
4149 * zfs_map() calls as_map(), passing segvn_create() as the callback
4150 *
4151 * segvn_create() creates the new segment and calls VOP_ADDMAP()
4152 *
4153 * zfs_addmap() updates z_mapcnt
4154 */
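/*
 * Editorial sketch of the user-level trigger (hypothetical code,
 * not from this file):
 *
 *	int fd = open("/tank/fs/file", O_RDWR);
 *	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *	    MAP_SHARED, fd, 0);
 *
 * That mmap() reaches zfs_map() through smmap_common() and
 * VOP_MAP(); segvn then calls back via VOP_ADDMAP() into
 * zfs_addmap(), which bumps z_mapcnt for the checks made in
 * zfs_frlock() and zfs_delmap().
 */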
/*ARGSUSED*/
static int
zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	segvn_crargs_t vn_a;
	int error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if ((prot & PROT_WRITE) &&
	    (zp->z_phys->zp_flags & (ZFS_IMMUTABLE | ZFS_READONLY |
	    ZFS_APPENDONLY))) {
		ZFS_EXIT(zfsvfs);
		return (EPERM);
	}

	if ((prot & (PROT_READ | PROT_EXEC)) &&
	    (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED)) {
		ZFS_EXIT(zfsvfs);
		return (EACCES);
	}

	if (vp->v_flag & VNOMAP) {
		ZFS_EXIT(zfsvfs);
		return (ENOSYS);
	}

	if (off < 0 || len > MAXOFFSET_T - off) {
		ZFS_EXIT(zfsvfs);
		return (ENXIO);
	}

	if (vp->v_type != VREG) {
		ZFS_EXIT(zfsvfs);
		return (ENODEV);
	}

	/*
	 * If file is locked, disallow mapping.
	 */
	if (MANDMODE((mode_t)zp->z_phys->zp_mode) && vn_has_flocks(vp)) {
		ZFS_EXIT(zfsvfs);
		return (EAGAIN);
	}

	as_rangelock(as);
	error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
	if (error != 0) {
		as_rangeunlock(as);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	vn_a.vp = vp;
	vn_a.offset = (u_offset_t)off;
	vn_a.type = flags & MAP_TYPE;
	vn_a.prot = prot;
	vn_a.maxprot = maxprot;
	vn_a.cred = cr;
	vn_a.amp = NULL;
	vn_a.flags = flags & ~MAP_TYPE;
	vn_a.szc = 0;
	vn_a.lgrp_mem_policy_flags = 0;

	error = as_map(as, *addrp, len, segvn_create, &vn_a);

	as_rangeunlock(as);
	ZFS_EXIT(zfsvfs);
	return (error);
}

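/*
 * Called back from segvn_create() once the segment exists; account for
 * the newly mapped pages in z_mapcnt so other paths can tell whether
 * the file has active mappings.
 */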
/* ARGSUSED */
static int
zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
    caller_context_t *ct)
{
	uint64_t pages = btopr(len);

	atomic_add_64(&VTOZ(vp)->z_mapcnt, pages);
	return (0);
}

/*
 * The reason we push dirty pages as part of zfs_delmap() is so that we get a
 * more accurate mtime for the associated file.  Since we don't have a way of
 * detecting when the data was actually modified, we have to resort to
 * heuristics.  If an explicit msync() is done, then we mark the mtime when
 * the last page is pushed.  The problem occurs when the msync() call is
 * omitted, which is by far the most common case:
 *
 *	open()
 *	mmap()
 *	<modify memory>
 *	munmap()
 *	close()
 *	<time lapse>
 *	putpage() via fsflush
 *
 * If we wait for fsflush to come along, we can have a modification time that
 * is some arbitrary point in the future.  In order to prevent this in the
 * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is
 * torn down.
 */
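/*
 * Illustrative contrast (a sketch, not part of this file): when the
 * application does issue an explicit msync() before unmapping, the dirty
 * pages are pushed -- and the mtime marked -- at that point:
 *
 *	(void) msync(addr, len, MS_SYNC);
 *	(void) munmap(addr, len);
 */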
/* ARGSUSED */
static int
zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
    size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
    caller_context_t *ct)
{
	uint64_t pages = btopr(len);

	ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages);
	atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages);

	if ((flags & MAP_SHARED) && (prot & PROT_WRITE) &&
	    vn_has_cached_data(vp))
		(void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct);

	return (0);
}

/*
 * Free or allocate space in a file.  Currently, this function only
 * supports the `F_FREESP' command.  However, this command is somewhat
 * misnamed, as its functionality includes the ability to allocate as
 * well as free space.
 *
 *	IN:	vp	- vnode of file to free data in.
 *		cmd	- action to take (only F_FREESP supported).
 *		bfp	- section of file to free/alloc.
 *		flag	- current file open mode flags.
 *		offset	- current file offset.
 *		cr	- credentials of caller [UNUSED].
 *		ct	- caller context.
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	vp - ctime|mtime updated
 */
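/*
 * Illustrative userland use (a sketch, not part of this file): freeing a
 * range with F_FREESP through fcntl(2).  The offset and length shown are
 * hypothetical; l_len == 0 means "from l_start to end of file".
 *
 *	struct flock64 fl;
 *	fl.l_whence = SEEK_SET;
 *	fl.l_start = 4096;
 *	fl.l_len = 8192;
 *	(void) fcntl(fd, F_FREESP, &fl);
 */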
/* ARGSUSED */
static int
zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag,
    offset_t offset, cred_t *cr, caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	uint64_t off, len;
	int error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if (cmd != F_FREESP) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	if ((error = convoff(vp, bfp, 0, offset)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (bfp->l_len < 0) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	off = bfp->l_start;
	len = bfp->l_len; /* 0 means from off to end of file */

	error = zfs_freesp(zp, off, len, flag, TRUE);

	ZFS_EXIT(zfsvfs);
	return (error);
}

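/*
 * Generate the NFS-style file identifier for this znode: the object
 * number and generation, plus the objset id when this filesystem is not
 * its own parent (e.g., a snapshot mounted under .zfs), which is the
 * long-FID case below.
 */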
/*ARGSUSED*/
static int
zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	uint32_t gen;
	uint64_t object = zp->z_id;
	zfid_short_t *zfid;
	int size, i;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);
	gen = (uint32_t)zp->z_gen;

	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
	if (fidp->fid_len < size) {
		fidp->fid_len = size;
		ZFS_EXIT(zfsvfs);
		return (ENOSPC);
	}

	zfid = (zfid_short_t *)fidp;

	zfid->zf_len = size;

	for (i = 0; i < sizeof (zfid->zf_object); i++)
		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));

	/* Must have a non-zero generation number to distinguish from .zfs */
	if (gen == 0)
		gen = 1;
	for (i = 0; i < sizeof (zfid->zf_gen); i++)
		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));

	if (size == LONG_FID_LEN) {
		uint64_t objsetid = dmu_objset_id(zfsvfs->z_os);
		zfid_long_t *zlfid;

		zlfid = (zfid_long_t *)fidp;

		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));

		/* XXX - this should be the generation number for the objset */
		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
			zlfid->zf_setgen[i] = 0;
	}

	ZFS_EXIT(zfsvfs);
	return (0);
}

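/*
 * Report filesystem limits and capabilities for pathconf(2) and
 * fpathconf(2); anything not handled here falls through to the generic
 * fs_pathconf().
 */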
static int
zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp, *xzp;
	zfsvfs_t *zfsvfs;
	zfs_dirlock_t *dl;
	int error;

	switch (cmd) {
	case _PC_LINK_MAX:
		*valp = ULONG_MAX;
		return (0);

	case _PC_FILESIZEBITS:
		*valp = 64;
		return (0);

	case _PC_XATTR_EXISTS:
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);
		*valp = 0;
		error = zfs_dirent_lock(&dl, zp, "", &xzp,
		    ZXATTR | ZEXISTS | ZSHARED, NULL, NULL);
		if (error == 0) {
			zfs_dirent_unlock(dl);
			if (!zfs_dirempty(xzp))
				*valp = 1;
			VN_RELE(ZTOV(xzp));
		} else if (error == ENOENT) {
			/*
			 * If there aren't extended attributes, it's the
			 * same as having zero of them.
			 */
			error = 0;
		}
		ZFS_EXIT(zfsvfs);
		return (error);

	case _PC_SATTR_ENABLED:
	case _PC_SATTR_EXISTS:
		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
		    (vp->v_type == VREG || vp->v_type == VDIR);
		return (0);

	case _PC_ACL_ENABLED:
		*valp = _ACL_ACE_ENABLED;
		return (0);

	case _PC_MIN_HOLE_SIZE:
		*valp = (ulong_t)SPA_MINBLOCKSIZE;
		return (0);

	default:
		return (fs_pathconf(vp, cmd, valp, cr, ct));
	}
}

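/*
 * ACL entry points: fetch or replace a file's ACL on behalf of
 * VOP_GETSECATTR()/VOP_SETSECATTR().  ATTR_NOACLCHECK is passed through
 * as skipaclchk so that trusted callers can bypass the usual ACL
 * permission check.
 */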
/*ARGSUSED*/
static int
zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;
	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);
	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
	ZFS_EXIT(zfsvfs);

	return (error);
}

/*ARGSUSED*/
static int
zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;
	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);
	error = zfs_setacl(zp, vsecp, skipaclchk, cr);
	ZFS_EXIT(zfsvfs);
	return (error);
}


/*
 * Predeclare these here so that the compiler assumes that
 * this is an "old style" function declaration that does
 * not include arguments => we won't get type mismatch errors
 * in the initializations that follow.
 */
static int zfs_inval();
static int zfs_isdir();

static int
zfs_inval()
{
	return (EINVAL);
}

static int
zfs_isdir()
{
	return (EISDIR);
}

/*
 * Directory vnode operations template
 */
vnodeops_t *zfs_dvnodeops;
const fs_operation_def_t zfs_dvnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = zfs_open },
	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
	VOPNAME_READ,		{ .error = zfs_isdir },
	VOPNAME_WRITE,		{ .error = zfs_isdir },
	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
	VOPNAME_CREATE,		{ .vop_create = zfs_create },
	VOPNAME_REMOVE,		{ .vop_remove = zfs_remove },
	VOPNAME_LINK,		{ .vop_link = zfs_link },
	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
	VOPNAME_MKDIR,		{ .vop_mkdir = zfs_mkdir },
	VOPNAME_RMDIR,		{ .vop_rmdir = zfs_rmdir },
	VOPNAME_READDIR,	{ .vop_readdir = zfs_readdir },
	VOPNAME_SYMLINK,	{ .vop_symlink = zfs_symlink },
	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
	VOPNAME_FID,		{ .vop_fid = zfs_fid },
	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
	NULL,			NULL
};
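
/*
 * A sketch of how these templates become usable operation tables
 * (assuming the standard vn_make_ops() interface; the actual
 * registration lives elsewhere in the module):
 *
 *	error = vn_make_ops("zfs", zfs_dvnodeops_template,
 *	    &zfs_dvnodeops);
 */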

/*
 * Regular file vnode operations template
 */
vnodeops_t *zfs_fvnodeops;
const fs_operation_def_t zfs_fvnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = zfs_open },
	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
	VOPNAME_READ,		{ .vop_read = zfs_read },
	VOPNAME_WRITE,		{ .vop_write = zfs_write },
	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
	VOPNAME_FID,		{ .vop_fid = zfs_fid },
	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
	VOPNAME_FRLOCK,		{ .vop_frlock = zfs_frlock },
	VOPNAME_SPACE,		{ .vop_space = zfs_space },
	VOPNAME_GETPAGE,	{ .vop_getpage = zfs_getpage },
	VOPNAME_PUTPAGE,	{ .vop_putpage = zfs_putpage },
	VOPNAME_MAP,		{ .vop_map = zfs_map },
	VOPNAME_ADDMAP,		{ .vop_addmap = zfs_addmap },
	VOPNAME_DELMAP,		{ .vop_delmap = zfs_delmap },
	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
	NULL,			NULL
};

/*
 * Symbolic link vnode operations template
 */
vnodeops_t *zfs_symvnodeops;
const fs_operation_def_t zfs_symvnodeops_template[] = {
	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
	VOPNAME_READLINK,	{ .vop_readlink = zfs_readlink },
	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
	VOPNAME_FID,		{ .vop_fid = zfs_fid },
	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
	NULL,			NULL
};

/*
 * Extended attribute directory vnode operations template.
 * This template is identical to the directory vnode operations
 * template except for restricted operations:
 *	VOP_MKDIR()
 *	VOP_SYMLINK()
 * Note that there are other restrictions embedded in:
 *	zfs_create()	- restrict type to VREG
 *	zfs_link()	- no links into/out of attribute space
 *	zfs_rename()	- no moves into/out of attribute space
 */
vnodeops_t *zfs_xdvnodeops;
const fs_operation_def_t zfs_xdvnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = zfs_open },
	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
	VOPNAME_CREATE,		{ .vop_create = zfs_create },
	VOPNAME_REMOVE,		{ .vop_remove = zfs_remove },
	VOPNAME_LINK,		{ .vop_link = zfs_link },
	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
	VOPNAME_MKDIR,		{ .error = zfs_inval },
	VOPNAME_RMDIR,		{ .vop_rmdir = zfs_rmdir },
	VOPNAME_READDIR,	{ .vop_readdir = zfs_readdir },
	VOPNAME_SYMLINK,	{ .error = zfs_inval },
	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
	VOPNAME_FID,		{ .vop_fid = zfs_fid },
	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
	NULL,			NULL
};

/*
 * Error vnode operations template
 */
vnodeops_t *zfs_evnodeops;
const fs_operation_def_t zfs_evnodeops_template[] = {
	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
	NULL,			NULL
};