]> git.proxmox.com Git - mirror_zfs.git/blob - module/zfs/zfs_vnops.c
OpenZFS 8997 - ztest assertion failure in zil_lwb_write_issue
[mirror_zfs.git] / module / zfs / zfs_vnops.c
1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
25 * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
26 * Copyright 2017 Nexenta Systems, Inc.
27 */
28
29 /* Portions Copyright 2007 Jeremy Teo */
30 /* Portions Copyright 2010 Robert Milkowski */
31
32
33 #include <sys/types.h>
34 #include <sys/param.h>
35 #include <sys/time.h>
36 #include <sys/systm.h>
37 #include <sys/sysmacros.h>
38 #include <sys/resource.h>
39 #include <sys/vfs.h>
40 #include <sys/vfs_opreg.h>
41 #include <sys/file.h>
42 #include <sys/stat.h>
43 #include <sys/kmem.h>
44 #include <sys/taskq.h>
45 #include <sys/uio.h>
46 #include <sys/vmsystm.h>
47 #include <sys/atomic.h>
48 #include <vm/pvn.h>
49 #include <sys/pathname.h>
50 #include <sys/cmn_err.h>
51 #include <sys/errno.h>
52 #include <sys/unistd.h>
53 #include <sys/zfs_dir.h>
54 #include <sys/zfs_acl.h>
55 #include <sys/zfs_ioctl.h>
56 #include <sys/fs/zfs.h>
57 #include <sys/dmu.h>
58 #include <sys/dmu_objset.h>
59 #include <sys/spa.h>
60 #include <sys/txg.h>
61 #include <sys/dbuf.h>
62 #include <sys/zap.h>
63 #include <sys/sa.h>
64 #include <sys/dirent.h>
65 #include <sys/policy.h>
66 #include <sys/sunddi.h>
67 #include <sys/sid.h>
68 #include <sys/mode.h>
69 #include "fs/fs_subr.h"
70 #include <sys/zfs_ctldir.h>
71 #include <sys/zfs_fuid.h>
72 #include <sys/zfs_sa.h>
73 #include <sys/zfs_vnops.h>
74 #include <sys/dnlc.h>
75 #include <sys/zfs_rlock.h>
76 #include <sys/extdirent.h>
77 #include <sys/kidmap.h>
78 #include <sys/cred.h>
79 #include <sys/attr.h>
80 #include <sys/zpl.h>
81 #include <sys/zil.h>
82
83 /*
84 * Programming rules.
85 *
86 * Each vnode op performs some logical unit of work. To do this, the ZPL must
87 * properly lock its in-core state, create a DMU transaction, do the work,
88 * record this work in the intent log (ZIL), commit the DMU transaction,
89 * and wait for the intent log to commit if it is a synchronous operation.
90 * Moreover, the vnode ops must work in both normal and log replay context.
91 * The ordering of events is important to avoid deadlocks and references
92 * to freed memory. The example below illustrates the following Big Rules:
93 *
94 * (1) A check must be made in each zfs thread for a mounted file system.
95 * This is done avoiding races using ZFS_ENTER(zfsvfs).
96 * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes
97 * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros
98 * can return EIO from the calling function.
99 *
100 * (2) iput() should always be the last thing except for zil_commit()
101 * (if necessary) and ZFS_EXIT(). This is for 3 reasons:
102 * First, if it's the last reference, the vnode/znode
103 * can be freed, so the zp may point to freed memory. Second, the last
104 * reference will call zfs_zinactive(), which may induce a lot of work --
105 * pushing cached pages (which acquires range locks) and syncing out
106 * cached atime changes. Third, zfs_zinactive() may require a new tx,
107 * which could deadlock the system if you were already holding one.
108 * If you must call iput() within a tx then use zfs_iput_async().
109 *
110 * (3) All range locks must be grabbed before calling dmu_tx_assign(),
111 * as they can span dmu_tx_assign() calls.
112 *
113 * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
114 * dmu_tx_assign(). This is critical because we don't want to block
115 * while holding locks.
116 *
117 * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This
118 * reduces lock contention and CPU usage when we must wait (note that if
119 * throughput is constrained by the storage, nearly every transaction
120 * must wait).
121 *
122 * Note, in particular, that if a lock is sometimes acquired before
123 * the tx assigns, and sometimes after (e.g. z_lock), then failing
124 * to use a non-blocking assign can deadlock the system. The scenario:
125 *
126 * Thread A has grabbed a lock before calling dmu_tx_assign().
127 * Thread B is in an already-assigned tx, and blocks for this lock.
128 * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
129 * forever, because the previous txg can't quiesce until B's tx commits.
130 *
131 * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
132 * then drop all locks, call dmu_tx_wait(), and try again. On subsequent
133 * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
134 * to indicate that this operation has already called dmu_tx_wait().
135 * This will ensure that we don't retry forever, waiting a short bit
136 * each time.
137 *
138 * (5) If the operation succeeded, generate the intent log entry for it
139 * before dropping locks. This ensures that the ordering of events
140 * in the intent log matches the order in which they actually occurred.
141 * During ZIL replay the zfs_log_* functions will update the sequence
142 * number to indicate the zil transaction has replayed.
143 *
144 * (6) At the end of each vnode op, the DMU tx must always commit,
145 * regardless of whether there were any errors.
146 *
147 * (7) After dropping all locks, invoke zil_commit(zilog, foid)
148 * to ensure that synchronous semantics are provided when necessary.
149 *
150 * In general, this is how things should be ordered in each vnode op:
151 *
152 * ZFS_ENTER(zfsvfs); // exit if unmounted
153 * top:
154 * zfs_dirent_lock(&dl, ...) // lock directory entry (may igrab())
155 * rw_enter(...); // grab any other locks you need
156 * tx = dmu_tx_create(...); // get DMU tx
157 * dmu_tx_hold_*(); // hold each object you might modify
158 * error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
159 * if (error) {
160 * rw_exit(...); // drop locks
161 * zfs_dirent_unlock(dl); // unlock directory entry
162 * iput(...); // release held vnodes
163 * if (error == ERESTART) {
164 * waited = B_TRUE;
165 * dmu_tx_wait(tx);
166 * dmu_tx_abort(tx);
167 * goto top;
168 * }
169 * dmu_tx_abort(tx); // abort DMU tx
170 * ZFS_EXIT(zfsvfs); // finished in zfs
171 * return (error); // really out of space
172 * }
173 * error = do_real_work(); // do whatever this VOP does
174 * if (error == 0)
175 * zfs_log_*(...); // on success, make ZIL entry
176 * dmu_tx_commit(tx); // commit DMU tx -- error or not
177 * rw_exit(...); // drop locks
178 * zfs_dirent_unlock(dl); // unlock directory entry
179 * iput(...); // release held vnodes
180 * zil_commit(zilog, foid); // synchronous when necessary
181 * ZFS_EXIT(zfsvfs); // finished in zfs
182 * return (error); // done, report error
183 */
184
185 /*
186 * Virus scanning is unsupported. It would be possible to add a hook
187 * here to performance the required virus scan. This could be done
188 * entirely in the kernel or potentially as an update to invoke a
189 * scanning utility.
190 */
191 static int
192 zfs_vscan(struct inode *ip, cred_t *cr, int async)
193 {
194 return (0);
195 }
196
197 /* ARGSUSED */
198 int
199 zfs_open(struct inode *ip, int mode, int flag, cred_t *cr)
200 {
201 znode_t *zp = ITOZ(ip);
202 zfsvfs_t *zfsvfs = ITOZSB(ip);
203
204 ZFS_ENTER(zfsvfs);
205 ZFS_VERIFY_ZP(zp);
206
207 /* Honor ZFS_APPENDONLY file attribute */
208 if ((mode & FMODE_WRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
209 ((flag & O_APPEND) == 0)) {
210 ZFS_EXIT(zfsvfs);
211 return (SET_ERROR(EPERM));
212 }
213
214 /* Virus scan eligible files on open */
215 if (!zfs_has_ctldir(zp) && zfsvfs->z_vscan && S_ISREG(ip->i_mode) &&
216 !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
217 if (zfs_vscan(ip, cr, 0) != 0) {
218 ZFS_EXIT(zfsvfs);
219 return (SET_ERROR(EACCES));
220 }
221 }
222
223 /* Keep a count of the synchronous opens in the znode */
224 if (flag & O_SYNC)
225 atomic_inc_32(&zp->z_sync_cnt);
226
227 ZFS_EXIT(zfsvfs);
228 return (0);
229 }
230
231 /* ARGSUSED */
232 int
233 zfs_close(struct inode *ip, int flag, cred_t *cr)
234 {
235 znode_t *zp = ITOZ(ip);
236 zfsvfs_t *zfsvfs = ITOZSB(ip);
237
238 ZFS_ENTER(zfsvfs);
239 ZFS_VERIFY_ZP(zp);
240
241 /* Decrement the synchronous opens in the znode */
242 if (flag & O_SYNC)
243 atomic_dec_32(&zp->z_sync_cnt);
244
245 if (!zfs_has_ctldir(zp) && zfsvfs->z_vscan && S_ISREG(ip->i_mode) &&
246 !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
247 VERIFY(zfs_vscan(ip, cr, 1) == 0);
248
249 ZFS_EXIT(zfsvfs);
250 return (0);
251 }
252
253 #if defined(SEEK_HOLE) && defined(SEEK_DATA)
254 /*
255 * Lseek support for finding holes (cmd == SEEK_HOLE) and
256 * data (cmd == SEEK_DATA). "off" is an in/out parameter.
257 */
258 static int
259 zfs_holey_common(struct inode *ip, int cmd, loff_t *off)
260 {
261 znode_t *zp = ITOZ(ip);
262 uint64_t noff = (uint64_t)*off; /* new offset */
263 uint64_t file_sz;
264 int error;
265 boolean_t hole;
266
267 file_sz = zp->z_size;
268 if (noff >= file_sz) {
269 return (SET_ERROR(ENXIO));
270 }
271
272 if (cmd == SEEK_HOLE)
273 hole = B_TRUE;
274 else
275 hole = B_FALSE;
276
277 error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff);
278
279 if (error == ESRCH)
280 return (SET_ERROR(ENXIO));
281
282 /* file was dirty, so fall back to using generic logic */
283 if (error == EBUSY) {
284 if (hole)
285 *off = file_sz;
286
287 return (0);
288 }
289
290 /*
291 * We could find a hole that begins after the logical end-of-file,
292 * because dmu_offset_next() only works on whole blocks. If the
293 * EOF falls mid-block, then indicate that the "virtual hole"
294 * at the end of the file begins at the logical EOF, rather than
295 * at the end of the last block.
296 */
297 if (noff > file_sz) {
298 ASSERT(hole);
299 noff = file_sz;
300 }
301
302 if (noff < *off)
303 return (error);
304 *off = noff;
305 return (error);
306 }
307
308 int
309 zfs_holey(struct inode *ip, int cmd, loff_t *off)
310 {
311 znode_t *zp = ITOZ(ip);
312 zfsvfs_t *zfsvfs = ITOZSB(ip);
313 int error;
314
315 ZFS_ENTER(zfsvfs);
316 ZFS_VERIFY_ZP(zp);
317
318 error = zfs_holey_common(ip, cmd, off);
319
320 ZFS_EXIT(zfsvfs);
321 return (error);
322 }
323 #endif /* SEEK_HOLE && SEEK_DATA */
324
325 #if defined(_KERNEL)
326 /*
327 * When a file is memory mapped, we must keep the IO data synchronized
328 * between the DMU cache and the memory mapped pages. What this means:
329 *
330 * On Write: If we find a memory mapped page, we write to *both*
331 * the page and the dmu buffer.
332 */
333 static void
334 update_pages(struct inode *ip, int64_t start, int len,
335 objset_t *os, uint64_t oid)
336 {
337 struct address_space *mp = ip->i_mapping;
338 struct page *pp;
339 uint64_t nbytes;
340 int64_t off;
341 void *pb;
342
343 off = start & (PAGE_SIZE-1);
344 for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
345 nbytes = MIN(PAGE_SIZE - off, len);
346
347 pp = find_lock_page(mp, start >> PAGE_SHIFT);
348 if (pp) {
349 if (mapping_writably_mapped(mp))
350 flush_dcache_page(pp);
351
352 pb = kmap(pp);
353 (void) dmu_read(os, oid, start+off, nbytes, pb+off,
354 DMU_READ_PREFETCH);
355 kunmap(pp);
356
357 if (mapping_writably_mapped(mp))
358 flush_dcache_page(pp);
359
360 mark_page_accessed(pp);
361 SetPageUptodate(pp);
362 ClearPageError(pp);
363 unlock_page(pp);
364 put_page(pp);
365 }
366
367 len -= nbytes;
368 off = 0;
369 }
370 }
371
372 /*
373 * When a file is memory mapped, we must keep the IO data synchronized
374 * between the DMU cache and the memory mapped pages. What this means:
375 *
376 * On Read: We "read" preferentially from memory mapped pages,
377 * else we default from the dmu buffer.
378 *
379 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
380 * the file is memory mapped.
381 */
382 static int
383 mappedread(struct inode *ip, int nbytes, uio_t *uio)
384 {
385 struct address_space *mp = ip->i_mapping;
386 struct page *pp;
387 znode_t *zp = ITOZ(ip);
388 int64_t start, off;
389 uint64_t bytes;
390 int len = nbytes;
391 int error = 0;
392 void *pb;
393
394 start = uio->uio_loffset;
395 off = start & (PAGE_SIZE-1);
396 for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
397 bytes = MIN(PAGE_SIZE - off, len);
398
399 pp = find_lock_page(mp, start >> PAGE_SHIFT);
400 if (pp) {
401 ASSERT(PageUptodate(pp));
402
403 pb = kmap(pp);
404 error = uiomove(pb + off, bytes, UIO_READ, uio);
405 kunmap(pp);
406
407 if (mapping_writably_mapped(mp))
408 flush_dcache_page(pp);
409
410 mark_page_accessed(pp);
411 unlock_page(pp);
412 put_page(pp);
413 } else {
414 error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
415 uio, bytes);
416 }
417
418 len -= bytes;
419 off = 0;
420 if (error)
421 break;
422 }
423 return (error);
424 }
425 #endif /* _KERNEL */
426
427 unsigned long zfs_read_chunk_size = 1024 * 1024; /* Tunable */
428 unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT;
429
430 /*
431 * Read bytes from specified file into supplied buffer.
432 *
433 * IN: ip - inode of file to be read from.
434 * uio - structure supplying read location, range info,
435 * and return buffer.
436 * ioflag - FSYNC flags; used to provide FRSYNC semantics.
437 * O_DIRECT flag; used to bypass page cache.
438 * cr - credentials of caller.
439 *
440 * OUT: uio - updated offset and range, buffer filled.
441 *
442 * RETURN: 0 on success, error code on failure.
443 *
444 * Side Effects:
445 * inode - atime updated if byte count > 0
446 */
447 /* ARGSUSED */
448 int
449 zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
450 {
451 znode_t *zp = ITOZ(ip);
452 zfsvfs_t *zfsvfs = ITOZSB(ip);
453 ssize_t n, nbytes;
454 int error = 0;
455 rl_t *rl;
456 #ifdef HAVE_UIO_ZEROCOPY
457 xuio_t *xuio = NULL;
458 #endif /* HAVE_UIO_ZEROCOPY */
459
460 ZFS_ENTER(zfsvfs);
461 ZFS_VERIFY_ZP(zp);
462
463 if (zp->z_pflags & ZFS_AV_QUARANTINED) {
464 ZFS_EXIT(zfsvfs);
465 return (SET_ERROR(EACCES));
466 }
467
468 /*
469 * Validate file offset
470 */
471 if (uio->uio_loffset < (offset_t)0) {
472 ZFS_EXIT(zfsvfs);
473 return (SET_ERROR(EINVAL));
474 }
475
476 /*
477 * Fasttrack empty reads
478 */
479 if (uio->uio_resid == 0) {
480 ZFS_EXIT(zfsvfs);
481 return (0);
482 }
483
484 /*
485 * If we're in FRSYNC mode, sync out this znode before reading it.
486 * Only do this for non-snapshots.
487 */
488 if (zfsvfs->z_log &&
489 (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
490 zil_commit(zfsvfs->z_log, zp->z_id);
491
492 /*
493 * Lock the range against changes.
494 */
495 rl = zfs_range_lock(&zp->z_range_lock, uio->uio_loffset, uio->uio_resid,
496 RL_READER);
497
498 /*
499 * If we are reading past end-of-file we can skip
500 * to the end; but we might still need to set atime.
501 */
502 if (uio->uio_loffset >= zp->z_size) {
503 error = 0;
504 goto out;
505 }
506
507 ASSERT(uio->uio_loffset < zp->z_size);
508 n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
509
510 #ifdef HAVE_UIO_ZEROCOPY
511 if ((uio->uio_extflg == UIO_XUIO) &&
512 (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
513 int nblk;
514 int blksz = zp->z_blksz;
515 uint64_t offset = uio->uio_loffset;
516
517 xuio = (xuio_t *)uio;
518 if ((ISP2(blksz))) {
519 nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
520 blksz)) / blksz;
521 } else {
522 ASSERT(offset + n <= blksz);
523 nblk = 1;
524 }
525 (void) dmu_xuio_init(xuio, nblk);
526
527 if (vn_has_cached_data(ip)) {
528 /*
529 * For simplicity, we always allocate a full buffer
530 * even if we only expect to read a portion of a block.
531 */
532 while (--nblk >= 0) {
533 (void) dmu_xuio_add(xuio,
534 dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
535 blksz), 0, blksz);
536 }
537 }
538 }
539 #endif /* HAVE_UIO_ZEROCOPY */
540
541 while (n > 0) {
542 nbytes = MIN(n, zfs_read_chunk_size -
543 P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
544
545 if (zp->z_is_mapped && !(ioflag & O_DIRECT)) {
546 error = mappedread(ip, nbytes, uio);
547 } else {
548 error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
549 uio, nbytes);
550 }
551
552 if (error) {
553 /* convert checksum errors into IO errors */
554 if (error == ECKSUM)
555 error = SET_ERROR(EIO);
556 break;
557 }
558
559 n -= nbytes;
560 }
561 out:
562 zfs_range_unlock(rl);
563
564 ZFS_EXIT(zfsvfs);
565 return (error);
566 }
567
568 /*
569 * Write the bytes to a file.
570 *
571 * IN: ip - inode of file to be written to.
572 * uio - structure supplying write location, range info,
573 * and data buffer.
574 * ioflag - FAPPEND flag set if in append mode.
575 * O_DIRECT flag; used to bypass page cache.
576 * cr - credentials of caller.
577 *
578 * OUT: uio - updated offset and range.
579 *
580 * RETURN: 0 if success
581 * error code if failure
582 *
583 * Timestamps:
584 * ip - ctime|mtime updated if byte count > 0
585 */
586
587 /* ARGSUSED */
588 int
589 zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
590 {
591 znode_t *zp = ITOZ(ip);
592 rlim64_t limit = uio->uio_limit;
593 ssize_t start_resid = uio->uio_resid;
594 ssize_t tx_bytes;
595 uint64_t end_size;
596 dmu_tx_t *tx;
597 zfsvfs_t *zfsvfs = ZTOZSB(zp);
598 zilog_t *zilog;
599 offset_t woff;
600 ssize_t n, nbytes;
601 rl_t *rl;
602 int max_blksz = zfsvfs->z_max_blksz;
603 int error = 0;
604 arc_buf_t *abuf;
605 const iovec_t *aiov = NULL;
606 xuio_t *xuio = NULL;
607 int write_eof;
608 int count = 0;
609 sa_bulk_attr_t bulk[4];
610 uint64_t mtime[2], ctime[2];
611 uint32_t uid;
612 #ifdef HAVE_UIO_ZEROCOPY
613 int i_iov = 0;
614 const iovec_t *iovp = uio->uio_iov;
615 ASSERTV(int iovcnt = uio->uio_iovcnt);
616 #endif
617
618 /*
619 * Fasttrack empty write
620 */
621 n = start_resid;
622 if (n == 0)
623 return (0);
624
625 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
626 limit = MAXOFFSET_T;
627
628 ZFS_ENTER(zfsvfs);
629 ZFS_VERIFY_ZP(zp);
630
631 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
632 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
633 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
634 &zp->z_size, 8);
635 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
636 &zp->z_pflags, 8);
637
638 /*
639 * Callers might not be able to detect properly that we are read-only,
640 * so check it explicitly here.
641 */
642 if (zfs_is_readonly(zfsvfs)) {
643 ZFS_EXIT(zfsvfs);
644 return (SET_ERROR(EROFS));
645 }
646
647 /*
648 * If immutable or not appending then return EPERM
649 */
650 if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
651 ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
652 (uio->uio_loffset < zp->z_size))) {
653 ZFS_EXIT(zfsvfs);
654 return (SET_ERROR(EPERM));
655 }
656
657 zilog = zfsvfs->z_log;
658
659 /*
660 * Validate file offset
661 */
662 woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
663 if (woff < 0) {
664 ZFS_EXIT(zfsvfs);
665 return (SET_ERROR(EINVAL));
666 }
667
668 /*
669 * Pre-fault the pages to ensure slow (eg NFS) pages
670 * don't hold up txg.
671 * Skip this if uio contains loaned arc_buf.
672 */
673 #ifdef HAVE_UIO_ZEROCOPY
674 if ((uio->uio_extflg == UIO_XUIO) &&
675 (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
676 xuio = (xuio_t *)uio;
677 else
678 #endif
679 uio_prefaultpages(MIN(n, max_blksz), uio);
680
681 /*
682 * If in append mode, set the io offset pointer to eof.
683 */
684 if (ioflag & FAPPEND) {
685 /*
686 * Obtain an appending range lock to guarantee file append
687 * semantics. We reset the write offset once we have the lock.
688 */
689 rl = zfs_range_lock(&zp->z_range_lock, 0, n, RL_APPEND);
690 woff = rl->r_off;
691 if (rl->r_len == UINT64_MAX) {
692 /*
693 * We overlocked the file because this write will cause
694 * the file block size to increase.
695 * Note that zp_size cannot change with this lock held.
696 */
697 woff = zp->z_size;
698 }
699 uio->uio_loffset = woff;
700 } else {
701 /*
702 * Note that if the file block size will change as a result of
703 * this write, then this range lock will lock the entire file
704 * so that we can re-write the block safely.
705 */
706 rl = zfs_range_lock(&zp->z_range_lock, woff, n, RL_WRITER);
707 }
708
709 if (woff >= limit) {
710 zfs_range_unlock(rl);
711 ZFS_EXIT(zfsvfs);
712 return (SET_ERROR(EFBIG));
713 }
714
715 if ((woff + n) > limit || woff > (limit - n))
716 n = limit - woff;
717
718 /* Will this write extend the file length? */
719 write_eof = (woff + n > zp->z_size);
720
721 end_size = MAX(zp->z_size, woff + n);
722
723 /*
724 * Write the file in reasonable size chunks. Each chunk is written
725 * in a separate transaction; this keeps the intent log records small
726 * and allows us to do more fine-grained space accounting.
727 */
728 while (n > 0) {
729 abuf = NULL;
730 woff = uio->uio_loffset;
731 if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
732 zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
733 if (abuf != NULL)
734 dmu_return_arcbuf(abuf);
735 error = SET_ERROR(EDQUOT);
736 break;
737 }
738
739 if (xuio && abuf == NULL) {
740 #ifdef HAVE_UIO_ZEROCOPY
741 ASSERT(i_iov < iovcnt);
742 ASSERT3U(uio->uio_segflg, !=, UIO_BVEC);
743 aiov = &iovp[i_iov];
744 abuf = dmu_xuio_arcbuf(xuio, i_iov);
745 dmu_xuio_clear(xuio, i_iov);
746 ASSERT((aiov->iov_base == abuf->b_data) ||
747 ((char *)aiov->iov_base - (char *)abuf->b_data +
748 aiov->iov_len == arc_buf_size(abuf)));
749 i_iov++;
750 #endif
751 } else if (abuf == NULL && n >= max_blksz &&
752 woff >= zp->z_size &&
753 P2PHASE(woff, max_blksz) == 0 &&
754 zp->z_blksz == max_blksz) {
755 /*
756 * This write covers a full block. "Borrow" a buffer
757 * from the dmu so that we can fill it before we enter
758 * a transaction. This avoids the possibility of
759 * holding up the transaction if the data copy hangs
760 * up on a pagefault (e.g., from an NFS server mapping).
761 */
762 size_t cbytes;
763
764 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
765 max_blksz);
766 ASSERT(abuf != NULL);
767 ASSERT(arc_buf_size(abuf) == max_blksz);
768 if ((error = uiocopy(abuf->b_data, max_blksz,
769 UIO_WRITE, uio, &cbytes))) {
770 dmu_return_arcbuf(abuf);
771 break;
772 }
773 ASSERT(cbytes == max_blksz);
774 }
775
776 /*
777 * Start a transaction.
778 */
779 tx = dmu_tx_create(zfsvfs->z_os);
780 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
781 dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
782 zfs_sa_upgrade_txholds(tx, zp);
783 error = dmu_tx_assign(tx, TXG_WAIT);
784 if (error) {
785 dmu_tx_abort(tx);
786 if (abuf != NULL)
787 dmu_return_arcbuf(abuf);
788 break;
789 }
790
791 /*
792 * If zfs_range_lock() over-locked we grow the blocksize
793 * and then reduce the lock range. This will only happen
794 * on the first iteration since zfs_range_reduce() will
795 * shrink down r_len to the appropriate size.
796 */
797 if (rl->r_len == UINT64_MAX) {
798 uint64_t new_blksz;
799
800 if (zp->z_blksz > max_blksz) {
801 /*
802 * File's blocksize is already larger than the
803 * "recordsize" property. Only let it grow to
804 * the next power of 2.
805 */
806 ASSERT(!ISP2(zp->z_blksz));
807 new_blksz = MIN(end_size,
808 1 << highbit64(zp->z_blksz));
809 } else {
810 new_blksz = MIN(end_size, max_blksz);
811 }
812 zfs_grow_blocksize(zp, new_blksz, tx);
813 zfs_range_reduce(rl, woff, n);
814 }
815
816 /*
817 * XXX - should we really limit each write to z_max_blksz?
818 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
819 */
820 nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
821
822 if (abuf == NULL) {
823 tx_bytes = uio->uio_resid;
824 error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
825 uio, nbytes, tx);
826 tx_bytes -= uio->uio_resid;
827 } else {
828 tx_bytes = nbytes;
829 ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
830 /*
831 * If this is not a full block write, but we are
832 * extending the file past EOF and this data starts
833 * block-aligned, use assign_arcbuf(). Otherwise,
834 * write via dmu_write().
835 */
836 if (tx_bytes < max_blksz && (!write_eof ||
837 aiov->iov_base != abuf->b_data)) {
838 ASSERT(xuio);
839 dmu_write(zfsvfs->z_os, zp->z_id, woff,
840 /* cppcheck-suppress nullPointer */
841 aiov->iov_len, aiov->iov_base, tx);
842 dmu_return_arcbuf(abuf);
843 xuio_stat_wbuf_copied();
844 } else {
845 ASSERT(xuio || tx_bytes == max_blksz);
846 dmu_assign_arcbuf_by_dbuf(
847 sa_get_db(zp->z_sa_hdl), woff, abuf, tx);
848 }
849 ASSERT(tx_bytes <= uio->uio_resid);
850 uioskip(uio, tx_bytes);
851 }
852 if (tx_bytes && zp->z_is_mapped && !(ioflag & O_DIRECT)) {
853 update_pages(ip, woff,
854 tx_bytes, zfsvfs->z_os, zp->z_id);
855 }
856
857 /*
858 * If we made no progress, we're done. If we made even
859 * partial progress, update the znode and ZIL accordingly.
860 */
861 if (tx_bytes == 0) {
862 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
863 (void *)&zp->z_size, sizeof (uint64_t), tx);
864 dmu_tx_commit(tx);
865 ASSERT(error != 0);
866 break;
867 }
868
869 /*
870 * Clear Set-UID/Set-GID bits on successful write if not
871 * privileged and at least one of the execute bits is set.
872 *
873 * It would be nice to to this after all writes have
874 * been done, but that would still expose the ISUID/ISGID
875 * to another app after the partial write is committed.
876 *
877 * Note: we don't call zfs_fuid_map_id() here because
878 * user 0 is not an ephemeral uid.
879 */
880 mutex_enter(&zp->z_acl_lock);
881 uid = KUID_TO_SUID(ip->i_uid);
882 if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
883 (S_IXUSR >> 6))) != 0 &&
884 (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
885 secpolicy_vnode_setid_retain(cr,
886 ((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) {
887 uint64_t newmode;
888 zp->z_mode &= ~(S_ISUID | S_ISGID);
889 ip->i_mode = newmode = zp->z_mode;
890 (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
891 (void *)&newmode, sizeof (uint64_t), tx);
892 }
893 mutex_exit(&zp->z_acl_lock);
894
895 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
896
897 /*
898 * Update the file size (zp_size) if it has changed;
899 * account for possible concurrent updates.
900 */
901 while ((end_size = zp->z_size) < uio->uio_loffset) {
902 (void) atomic_cas_64(&zp->z_size, end_size,
903 uio->uio_loffset);
904 ASSERT(error == 0);
905 }
906 /*
907 * If we are replaying and eof is non zero then force
908 * the file size to the specified eof. Note, there's no
909 * concurrency during replay.
910 */
911 if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
912 zp->z_size = zfsvfs->z_replay_eof;
913
914 error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
915
916 zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag,
917 NULL, NULL);
918 dmu_tx_commit(tx);
919
920 if (error != 0)
921 break;
922 ASSERT(tx_bytes == nbytes);
923 n -= nbytes;
924
925 if (!xuio && n > 0)
926 uio_prefaultpages(MIN(n, max_blksz), uio);
927 }
928
929 zfs_inode_update(zp);
930 zfs_range_unlock(rl);
931
932 /*
933 * If we're in replay mode, or we made no progress, return error.
934 * Otherwise, it's at least a partial write, so it's successful.
935 */
936 if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
937 ZFS_EXIT(zfsvfs);
938 return (error);
939 }
940
941 if (ioflag & (FSYNC | FDSYNC) ||
942 zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
943 zil_commit(zilog, zp->z_id);
944
945 ZFS_EXIT(zfsvfs);
946 return (0);
947 }
948
949 /*
950 * Drop a reference on the passed inode asynchronously. This ensures
951 * that the caller will never drop the last reference on an inode in
952 * the current context. Doing so while holding open a tx could result
953 * in a deadlock if iput_final() re-enters the filesystem code.
954 */
955 void
956 zfs_iput_async(struct inode *ip)
957 {
958 objset_t *os = ITOZSB(ip)->z_os;
959
960 ASSERT(atomic_read(&ip->i_count) > 0);
961 ASSERT(os != NULL);
962
963 if (atomic_read(&ip->i_count) == 1)
964 VERIFY(taskq_dispatch(dsl_pool_iput_taskq(dmu_objset_pool(os)),
965 (task_func_t *)iput, ip, TQ_SLEEP) != TASKQID_INVALID);
966 else
967 iput(ip);
968 }
969
970 void
971 zfs_get_done(zgd_t *zgd, int error)
972 {
973 znode_t *zp = zgd->zgd_private;
974
975 if (zgd->zgd_db)
976 dmu_buf_rele(zgd->zgd_db, zgd);
977
978 zfs_range_unlock(zgd->zgd_rl);
979
980 /*
981 * Release the vnode asynchronously as we currently have the
982 * txg stopped from syncing.
983 */
984 zfs_iput_async(ZTOI(zp));
985
986 if (error == 0 && zgd->zgd_bp)
987 zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
988
989 kmem_free(zgd, sizeof (zgd_t));
990 }
991
992 #ifdef DEBUG
993 static int zil_fault_io = 0;
994 #endif
995
996 /*
997 * Get data to generate a TX_WRITE intent log record.
998 */
999 int
1000 zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
1001 {
1002 zfsvfs_t *zfsvfs = arg;
1003 objset_t *os = zfsvfs->z_os;
1004 znode_t *zp;
1005 uint64_t object = lr->lr_foid;
1006 uint64_t offset = lr->lr_offset;
1007 uint64_t size = lr->lr_length;
1008 dmu_buf_t *db;
1009 zgd_t *zgd;
1010 int error = 0;
1011
1012 ASSERT3P(lwb, !=, NULL);
1013 ASSERT3P(zio, !=, NULL);
1014 ASSERT3U(size, !=, 0);
1015
1016 /*
1017 * Nothing to do if the file has been removed
1018 */
1019 if (zfs_zget(zfsvfs, object, &zp) != 0)
1020 return (SET_ERROR(ENOENT));
1021 if (zp->z_unlinked) {
1022 /*
1023 * Release the vnode asynchronously as we currently have the
1024 * txg stopped from syncing.
1025 */
1026 zfs_iput_async(ZTOI(zp));
1027 return (SET_ERROR(ENOENT));
1028 }
1029
1030 zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
1031 zgd->zgd_lwb = lwb;
1032 zgd->zgd_private = zp;
1033
1034 /*
1035 * Write records come in two flavors: immediate and indirect.
1036 * For small writes it's cheaper to store the data with the
1037 * log record (immediate); for large writes it's cheaper to
1038 * sync the data and get a pointer to it (indirect) so that
1039 * we don't have to write the data twice.
1040 */
1041 if (buf != NULL) { /* immediate write */
1042 zgd->zgd_rl = zfs_range_lock(&zp->z_range_lock, offset, size,
1043 RL_READER);
1044 /* test for truncation needs to be done while range locked */
1045 if (offset >= zp->z_size) {
1046 error = SET_ERROR(ENOENT);
1047 } else {
1048 error = dmu_read(os, object, offset, size, buf,
1049 DMU_READ_NO_PREFETCH);
1050 }
1051 ASSERT(error == 0 || error == ENOENT);
1052 } else { /* indirect write */
1053 /*
1054 * Have to lock the whole block to ensure when it's
1055 * written out and its checksum is being calculated
1056 * that no one can change the data. We need to re-check
1057 * blocksize after we get the lock in case it's changed!
1058 */
1059 for (;;) {
1060 uint64_t blkoff;
1061 size = zp->z_blksz;
1062 blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
1063 offset -= blkoff;
1064 zgd->zgd_rl = zfs_range_lock(&zp->z_range_lock, offset,
1065 size, RL_READER);
1066 if (zp->z_blksz == size)
1067 break;
1068 offset += blkoff;
1069 zfs_range_unlock(zgd->zgd_rl);
1070 }
1071 /* test for truncation needs to be done while range locked */
1072 if (lr->lr_offset >= zp->z_size)
1073 error = SET_ERROR(ENOENT);
1074 #ifdef DEBUG
1075 if (zil_fault_io) {
1076 error = SET_ERROR(EIO);
1077 zil_fault_io = 0;
1078 }
1079 #endif
1080 if (error == 0)
1081 error = dmu_buf_hold(os, object, offset, zgd, &db,
1082 DMU_READ_NO_PREFETCH);
1083
1084 if (error == 0) {
1085 blkptr_t *bp = &lr->lr_blkptr;
1086
1087 zgd->zgd_db = db;
1088 zgd->zgd_bp = bp;
1089
1090 ASSERT(db->db_offset == offset);
1091 ASSERT(db->db_size == size);
1092
1093 error = dmu_sync(zio, lr->lr_common.lrc_txg,
1094 zfs_get_done, zgd);
1095 ASSERT(error || lr->lr_length <= size);
1096
1097 /*
1098 * On success, we need to wait for the write I/O
1099 * initiated by dmu_sync() to complete before we can
1100 * release this dbuf. We will finish everything up
1101 * in the zfs_get_done() callback.
1102 */
1103 if (error == 0)
1104 return (0);
1105
1106 if (error == EALREADY) {
1107 lr->lr_common.lrc_txtype = TX_WRITE2;
1108 error = 0;
1109 }
1110 }
1111 }
1112
1113 zfs_get_done(zgd, error);
1114
1115 return (error);
1116 }
1117
1118 /*ARGSUSED*/
1119 int
1120 zfs_access(struct inode *ip, int mode, int flag, cred_t *cr)
1121 {
1122 znode_t *zp = ITOZ(ip);
1123 zfsvfs_t *zfsvfs = ITOZSB(ip);
1124 int error;
1125
1126 ZFS_ENTER(zfsvfs);
1127 ZFS_VERIFY_ZP(zp);
1128
1129 if (flag & V_ACE_MASK)
1130 error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
1131 else
1132 error = zfs_zaccess_rwx(zp, mode, flag, cr);
1133
1134 ZFS_EXIT(zfsvfs);
1135 return (error);
1136 }
1137
1138 /*
1139 * Lookup an entry in a directory, or an extended attribute directory.
1140 * If it exists, return a held inode reference for it.
1141 *
1142 * IN: dip - inode of directory to search.
1143 * nm - name of entry to lookup.
1144 * flags - LOOKUP_XATTR set if looking for an attribute.
1145 * cr - credentials of caller.
1146 * direntflags - directory lookup flags
1147 * realpnp - returned pathname.
1148 *
1149 * OUT: ipp - inode of located entry, NULL if not found.
1150 *
1151 * RETURN: 0 on success, error code on failure.
1152 *
1153 * Timestamps:
1154 * NA
1155 */
1156 /* ARGSUSED */
1157 int
1158 zfs_lookup(struct inode *dip, char *nm, struct inode **ipp, int flags,
1159 cred_t *cr, int *direntflags, pathname_t *realpnp)
1160 {
1161 znode_t *zdp = ITOZ(dip);
1162 zfsvfs_t *zfsvfs = ITOZSB(dip);
1163 int error = 0;
1164
1165 /*
1166 * Fast path lookup, however we must skip DNLC lookup
1167 * for case folding or normalizing lookups because the
1168 * DNLC code only stores the passed in name. This means
1169 * creating 'a' and removing 'A' on a case insensitive
1170 * file system would work, but DNLC still thinks 'a'
1171 * exists and won't let you create it again on the next
1172 * pass through fast path.
1173 */
1174 if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
1175
1176 if (!S_ISDIR(dip->i_mode)) {
1177 return (SET_ERROR(ENOTDIR));
1178 } else if (zdp->z_sa_hdl == NULL) {
1179 return (SET_ERROR(EIO));
1180 }
1181
1182 if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
1183 error = zfs_fastaccesschk_execute(zdp, cr);
1184 if (!error) {
1185 *ipp = dip;
1186 igrab(*ipp);
1187 return (0);
1188 }
1189 return (error);
1190 #ifdef HAVE_DNLC
1191 } else if (!zdp->z_zfsvfs->z_norm &&
1192 (zdp->z_zfsvfs->z_case == ZFS_CASE_SENSITIVE)) {
1193
1194 vnode_t *tvp = dnlc_lookup(dvp, nm);
1195
1196 if (tvp) {
1197 error = zfs_fastaccesschk_execute(zdp, cr);
1198 if (error) {
1199 iput(tvp);
1200 return (error);
1201 }
1202 if (tvp == DNLC_NO_VNODE) {
1203 iput(tvp);
1204 return (SET_ERROR(ENOENT));
1205 } else {
1206 *vpp = tvp;
1207 return (specvp_check(vpp, cr));
1208 }
1209 }
1210 #endif /* HAVE_DNLC */
1211 }
1212 }
1213
1214 ZFS_ENTER(zfsvfs);
1215 ZFS_VERIFY_ZP(zdp);
1216
1217 *ipp = NULL;
1218
1219 if (flags & LOOKUP_XATTR) {
1220 /*
1221 * We don't allow recursive attributes..
1222 * Maybe someday we will.
1223 */
1224 if (zdp->z_pflags & ZFS_XATTR) {
1225 ZFS_EXIT(zfsvfs);
1226 return (SET_ERROR(EINVAL));
1227 }
1228
1229 if ((error = zfs_get_xattrdir(zdp, ipp, cr, flags))) {
1230 ZFS_EXIT(zfsvfs);
1231 return (error);
1232 }
1233
1234 /*
1235 * Do we have permission to get into attribute directory?
1236 */
1237
1238 if ((error = zfs_zaccess(ITOZ(*ipp), ACE_EXECUTE, 0,
1239 B_FALSE, cr))) {
1240 iput(*ipp);
1241 *ipp = NULL;
1242 }
1243
1244 ZFS_EXIT(zfsvfs);
1245 return (error);
1246 }
1247
1248 if (!S_ISDIR(dip->i_mode)) {
1249 ZFS_EXIT(zfsvfs);
1250 return (SET_ERROR(ENOTDIR));
1251 }
1252
1253 /*
1254 * Check accessibility of directory.
1255 */
1256
1257 if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr))) {
1258 ZFS_EXIT(zfsvfs);
1259 return (error);
1260 }
1261
1262 if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
1263 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1264 ZFS_EXIT(zfsvfs);
1265 return (SET_ERROR(EILSEQ));
1266 }
1267
1268 error = zfs_dirlook(zdp, nm, ipp, flags, direntflags, realpnp);
1269 if ((error == 0) && (*ipp))
1270 zfs_inode_update(ITOZ(*ipp));
1271
1272 ZFS_EXIT(zfsvfs);
1273 return (error);
1274 }
1275
1276 /*
1277 * Attempt to create a new entry in a directory. If the entry
1278 * already exists, truncate the file if permissible, else return
1279 * an error. Return the ip of the created or trunc'd file.
1280 *
1281 * IN: dip - inode of directory to put new file entry in.
1282 * name - name of new file entry.
1283 * vap - attributes of new file.
1284 * excl - flag indicating exclusive or non-exclusive mode.
1285 * mode - mode to open file with.
1286 * cr - credentials of caller.
1287 * flag - large file flag [UNUSED].
1288 * vsecp - ACL to be set
1289 *
1290 * OUT: ipp - inode of created or trunc'd entry.
1291 *
1292 * RETURN: 0 on success, error code on failure.
1293 *
1294 * Timestamps:
1295 * dip - ctime|mtime updated if new entry created
1296 * ip - ctime|mtime always, atime if new
1297 */
1298
1299 /* ARGSUSED */
1300 int
1301 zfs_create(struct inode *dip, char *name, vattr_t *vap, int excl,
1302 int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp)
1303 {
1304 znode_t *zp, *dzp = ITOZ(dip);
1305 zfsvfs_t *zfsvfs = ITOZSB(dip);
1306 zilog_t *zilog;
1307 objset_t *os;
1308 zfs_dirlock_t *dl;
1309 dmu_tx_t *tx;
1310 int error;
1311 uid_t uid;
1312 gid_t gid;
1313 zfs_acl_ids_t acl_ids;
1314 boolean_t fuid_dirtied;
1315 boolean_t have_acl = B_FALSE;
1316 boolean_t waited = B_FALSE;
1317
1318 /*
1319 * If we have an ephemeral id, ACL, or XVATTR then
1320 * make sure file system is at proper version
1321 */
1322
1323 gid = crgetgid(cr);
1324 uid = crgetuid(cr);
1325
1326 if (zfsvfs->z_use_fuids == B_FALSE &&
1327 (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1328 return (SET_ERROR(EINVAL));
1329
1330 if (name == NULL)
1331 return (SET_ERROR(EINVAL));
1332
1333 ZFS_ENTER(zfsvfs);
1334 ZFS_VERIFY_ZP(dzp);
1335 os = zfsvfs->z_os;
1336 zilog = zfsvfs->z_log;
1337
1338 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
1339 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1340 ZFS_EXIT(zfsvfs);
1341 return (SET_ERROR(EILSEQ));
1342 }
1343
1344 if (vap->va_mask & ATTR_XVATTR) {
1345 if ((error = secpolicy_xvattr((xvattr_t *)vap,
1346 crgetuid(cr), cr, vap->va_mode)) != 0) {
1347 ZFS_EXIT(zfsvfs);
1348 return (error);
1349 }
1350 }
1351
1352 top:
1353 *ipp = NULL;
1354 if (*name == '\0') {
1355 /*
1356 * Null component name refers to the directory itself.
1357 */
1358 igrab(dip);
1359 zp = dzp;
1360 dl = NULL;
1361 error = 0;
1362 } else {
1363 /* possible igrab(zp) */
1364 int zflg = 0;
1365
1366 if (flag & FIGNORECASE)
1367 zflg |= ZCILOOK;
1368
1369 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1370 NULL, NULL);
1371 if (error) {
1372 if (have_acl)
1373 zfs_acl_ids_free(&acl_ids);
1374 if (strcmp(name, "..") == 0)
1375 error = SET_ERROR(EISDIR);
1376 ZFS_EXIT(zfsvfs);
1377 return (error);
1378 }
1379 }
1380
1381 if (zp == NULL) {
1382 uint64_t txtype;
1383
1384 /*
1385 * Create a new file object and update the directory
1386 * to reference it.
1387 */
1388 if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
1389 if (have_acl)
1390 zfs_acl_ids_free(&acl_ids);
1391 goto out;
1392 }
1393
1394 /*
1395 * We only support the creation of regular files in
1396 * extended attribute directories.
1397 */
1398
1399 if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) {
1400 if (have_acl)
1401 zfs_acl_ids_free(&acl_ids);
1402 error = SET_ERROR(EINVAL);
1403 goto out;
1404 }
1405
1406 if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
1407 cr, vsecp, &acl_ids)) != 0)
1408 goto out;
1409 have_acl = B_TRUE;
1410
1411 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
1412 zfs_acl_ids_free(&acl_ids);
1413 error = SET_ERROR(EDQUOT);
1414 goto out;
1415 }
1416
1417 tx = dmu_tx_create(os);
1418
1419 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1420 ZFS_SA_BASE_ATTR_SIZE);
1421
1422 fuid_dirtied = zfsvfs->z_fuid_dirty;
1423 if (fuid_dirtied)
1424 zfs_fuid_txhold(zfsvfs, tx);
1425 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
1426 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
1427 if (!zfsvfs->z_use_sa &&
1428 acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1429 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1430 0, acl_ids.z_aclp->z_acl_bytes);
1431 }
1432 error = dmu_tx_assign(tx,
1433 (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
1434 if (error) {
1435 zfs_dirent_unlock(dl);
1436 if (error == ERESTART) {
1437 waited = B_TRUE;
1438 dmu_tx_wait(tx);
1439 dmu_tx_abort(tx);
1440 goto top;
1441 }
1442 zfs_acl_ids_free(&acl_ids);
1443 dmu_tx_abort(tx);
1444 ZFS_EXIT(zfsvfs);
1445 return (error);
1446 }
1447 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1448
1449 if (fuid_dirtied)
1450 zfs_fuid_sync(zfsvfs, tx);
1451
1452 (void) zfs_link_create(dl, zp, tx, ZNEW);
1453 txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
1454 if (flag & FIGNORECASE)
1455 txtype |= TX_CI;
1456 zfs_log_create(zilog, tx, txtype, dzp, zp, name,
1457 vsecp, acl_ids.z_fuidp, vap);
1458 zfs_acl_ids_free(&acl_ids);
1459 dmu_tx_commit(tx);
1460 } else {
1461 int aflags = (flag & FAPPEND) ? V_APPEND : 0;
1462
1463 if (have_acl)
1464 zfs_acl_ids_free(&acl_ids);
1465 have_acl = B_FALSE;
1466
1467 /*
1468 * A directory entry already exists for this name.
1469 */
1470 /*
1471 * Can't truncate an existing file if in exclusive mode.
1472 */
1473 if (excl) {
1474 error = SET_ERROR(EEXIST);
1475 goto out;
1476 }
1477 /*
1478 * Can't open a directory for writing.
1479 */
1480 if (S_ISDIR(ZTOI(zp)->i_mode)) {
1481 error = SET_ERROR(EISDIR);
1482 goto out;
1483 }
1484 /*
1485 * Verify requested access to file.
1486 */
1487 if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
1488 goto out;
1489 }
1490
1491 mutex_enter(&dzp->z_lock);
1492 dzp->z_seq++;
1493 mutex_exit(&dzp->z_lock);
1494
1495 /*
1496 * Truncate regular files if requested.
1497 */
1498 if (S_ISREG(ZTOI(zp)->i_mode) &&
1499 (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) {
1500 /* we can't hold any locks when calling zfs_freesp() */
1501 if (dl) {
1502 zfs_dirent_unlock(dl);
1503 dl = NULL;
1504 }
1505 error = zfs_freesp(zp, 0, 0, mode, TRUE);
1506 }
1507 }
1508 out:
1509
1510 if (dl)
1511 zfs_dirent_unlock(dl);
1512
1513 if (error) {
1514 if (zp)
1515 iput(ZTOI(zp));
1516 } else {
1517 zfs_inode_update(dzp);
1518 zfs_inode_update(zp);
1519 *ipp = ZTOI(zp);
1520 }
1521
1522 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1523 zil_commit(zilog, 0);
1524
1525 ZFS_EXIT(zfsvfs);
1526 return (error);
1527 }
1528
1529 /* ARGSUSED */
1530 int
1531 zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl,
1532 int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp)
1533 {
1534 znode_t *zp = NULL, *dzp = ITOZ(dip);
1535 zfsvfs_t *zfsvfs = ITOZSB(dip);
1536 objset_t *os;
1537 dmu_tx_t *tx;
1538 int error;
1539 uid_t uid;
1540 gid_t gid;
1541 zfs_acl_ids_t acl_ids;
1542 boolean_t fuid_dirtied;
1543 boolean_t have_acl = B_FALSE;
1544 boolean_t waited = B_FALSE;
1545
1546 /*
1547 * If we have an ephemeral id, ACL, or XVATTR then
1548 * make sure file system is at proper version
1549 */
1550
1551 gid = crgetgid(cr);
1552 uid = crgetuid(cr);
1553
1554 if (zfsvfs->z_use_fuids == B_FALSE &&
1555 (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1556 return (SET_ERROR(EINVAL));
1557
1558 ZFS_ENTER(zfsvfs);
1559 ZFS_VERIFY_ZP(dzp);
1560 os = zfsvfs->z_os;
1561
1562 if (vap->va_mask & ATTR_XVATTR) {
1563 if ((error = secpolicy_xvattr((xvattr_t *)vap,
1564 crgetuid(cr), cr, vap->va_mode)) != 0) {
1565 ZFS_EXIT(zfsvfs);
1566 return (error);
1567 }
1568 }
1569
1570 top:
1571 *ipp = NULL;
1572
1573 /*
1574 * Create a new file object and update the directory
1575 * to reference it.
1576 */
1577 if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
1578 if (have_acl)
1579 zfs_acl_ids_free(&acl_ids);
1580 goto out;
1581 }
1582
1583 if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
1584 cr, vsecp, &acl_ids)) != 0)
1585 goto out;
1586 have_acl = B_TRUE;
1587
1588 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
1589 zfs_acl_ids_free(&acl_ids);
1590 error = SET_ERROR(EDQUOT);
1591 goto out;
1592 }
1593
1594 tx = dmu_tx_create(os);
1595
1596 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1597 ZFS_SA_BASE_ATTR_SIZE);
1598 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1599
1600 fuid_dirtied = zfsvfs->z_fuid_dirty;
1601 if (fuid_dirtied)
1602 zfs_fuid_txhold(zfsvfs, tx);
1603 if (!zfsvfs->z_use_sa &&
1604 acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1605 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1606 0, acl_ids.z_aclp->z_acl_bytes);
1607 }
1608 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
1609 if (error) {
1610 if (error == ERESTART) {
1611 waited = B_TRUE;
1612 dmu_tx_wait(tx);
1613 dmu_tx_abort(tx);
1614 goto top;
1615 }
1616 zfs_acl_ids_free(&acl_ids);
1617 dmu_tx_abort(tx);
1618 ZFS_EXIT(zfsvfs);
1619 return (error);
1620 }
1621 zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids);
1622
1623 if (fuid_dirtied)
1624 zfs_fuid_sync(zfsvfs, tx);
1625
1626 /* Add to unlinked set */
1627 zp->z_unlinked = 1;
1628 zfs_unlinked_add(zp, tx);
1629 zfs_acl_ids_free(&acl_ids);
1630 dmu_tx_commit(tx);
1631 out:
1632
1633 if (error) {
1634 if (zp)
1635 iput(ZTOI(zp));
1636 } else {
1637 zfs_inode_update(dzp);
1638 zfs_inode_update(zp);
1639 *ipp = ZTOI(zp);
1640 }
1641
1642 ZFS_EXIT(zfsvfs);
1643 return (error);
1644 }
1645
1646 /*
1647 * Remove an entry from a directory.
1648 *
1649 * IN: dip - inode of directory to remove entry from.
1650 * name - name of entry to remove.
1651 * cr - credentials of caller.
1652 *
1653 * RETURN: 0 if success
1654 * error code if failure
1655 *
1656 * Timestamps:
1657 * dip - ctime|mtime
1658 * ip - ctime (if nlink > 0)
1659 */
1660
1661 uint64_t null_xattr = 0;
1662
1663 /*ARGSUSED*/
1664 int
1665 zfs_remove(struct inode *dip, char *name, cred_t *cr, int flags)
1666 {
1667 znode_t *zp, *dzp = ITOZ(dip);
1668 znode_t *xzp;
1669 struct inode *ip;
1670 zfsvfs_t *zfsvfs = ITOZSB(dip);
1671 zilog_t *zilog;
1672 uint64_t acl_obj, xattr_obj;
1673 uint64_t xattr_obj_unlinked = 0;
1674 uint64_t obj = 0;
1675 uint64_t links;
1676 zfs_dirlock_t *dl;
1677 dmu_tx_t *tx;
1678 boolean_t may_delete_now, delete_now = FALSE;
1679 boolean_t unlinked, toobig = FALSE;
1680 uint64_t txtype;
1681 pathname_t *realnmp = NULL;
1682 pathname_t realnm;
1683 int error;
1684 int zflg = ZEXISTS;
1685 boolean_t waited = B_FALSE;
1686
1687 if (name == NULL)
1688 return (SET_ERROR(EINVAL));
1689
1690 ZFS_ENTER(zfsvfs);
1691 ZFS_VERIFY_ZP(dzp);
1692 zilog = zfsvfs->z_log;
1693
1694 if (flags & FIGNORECASE) {
1695 zflg |= ZCILOOK;
1696 pn_alloc(&realnm);
1697 realnmp = &realnm;
1698 }
1699
1700 top:
1701 xattr_obj = 0;
1702 xzp = NULL;
1703 /*
1704 * Attempt to lock directory; fail if entry doesn't exist.
1705 */
1706 if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1707 NULL, realnmp))) {
1708 if (realnmp)
1709 pn_free(realnmp);
1710 ZFS_EXIT(zfsvfs);
1711 return (error);
1712 }
1713
1714 ip = ZTOI(zp);
1715
1716 if ((error = zfs_zaccess_delete(dzp, zp, cr))) {
1717 goto out;
1718 }
1719
1720 /*
1721 * Need to use rmdir for removing directories.
1722 */
1723 if (S_ISDIR(ip->i_mode)) {
1724 error = SET_ERROR(EPERM);
1725 goto out;
1726 }
1727
1728 #ifdef HAVE_DNLC
1729 if (realnmp)
1730 dnlc_remove(dvp, realnmp->pn_buf);
1731 else
1732 dnlc_remove(dvp, name);
1733 #endif /* HAVE_DNLC */
1734
1735 mutex_enter(&zp->z_lock);
1736 may_delete_now = atomic_read(&ip->i_count) == 1 && !(zp->z_is_mapped);
1737 mutex_exit(&zp->z_lock);
1738
1739 /*
1740 * We may delete the znode now, or we may put it in the unlinked set;
1741 * it depends on whether we're the last link, and on whether there are
1742 * other holds on the inode. So we dmu_tx_hold() the right things to
1743 * allow for either case.
1744 */
1745 obj = zp->z_id;
1746 tx = dmu_tx_create(zfsvfs->z_os);
1747 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1748 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1749 zfs_sa_upgrade_txholds(tx, zp);
1750 zfs_sa_upgrade_txholds(tx, dzp);
1751 if (may_delete_now) {
1752 toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks;
1753 /* if the file is too big, only hold_free a token amount */
1754 dmu_tx_hold_free(tx, zp->z_id, 0,
1755 (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
1756 }
1757
1758 /* are there any extended attributes? */
1759 error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1760 &xattr_obj, sizeof (xattr_obj));
1761 if (error == 0 && xattr_obj) {
1762 error = zfs_zget(zfsvfs, xattr_obj, &xzp);
1763 ASSERT0(error);
1764 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1765 dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
1766 }
1767
1768 mutex_enter(&zp->z_lock);
1769 if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
1770 dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
1771 mutex_exit(&zp->z_lock);
1772
1773 /* charge as an update -- would be nice not to charge at all */
1774 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1775
1776 /*
1777 * Mark this transaction as typically resulting in a net free of space
1778 */
1779 dmu_tx_mark_netfree(tx);
1780
1781 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
1782 if (error) {
1783 zfs_dirent_unlock(dl);
1784 if (error == ERESTART) {
1785 waited = B_TRUE;
1786 dmu_tx_wait(tx);
1787 dmu_tx_abort(tx);
1788 iput(ip);
1789 if (xzp)
1790 iput(ZTOI(xzp));
1791 goto top;
1792 }
1793 if (realnmp)
1794 pn_free(realnmp);
1795 dmu_tx_abort(tx);
1796 iput(ip);
1797 if (xzp)
1798 iput(ZTOI(xzp));
1799 ZFS_EXIT(zfsvfs);
1800 return (error);
1801 }
1802
1803 /*
1804 * Remove the directory entry.
1805 */
1806 error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
1807
1808 if (error) {
1809 dmu_tx_commit(tx);
1810 goto out;
1811 }
1812
1813 if (unlinked) {
1814 /*
1815 * Hold z_lock so that we can make sure that the ACL obj
1816 * hasn't changed. Could have been deleted due to
1817 * zfs_sa_upgrade().
1818 */
1819 mutex_enter(&zp->z_lock);
1820 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1821 &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
1822 delete_now = may_delete_now && !toobig &&
1823 atomic_read(&ip->i_count) == 1 && !(zp->z_is_mapped) &&
1824 xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) ==
1825 acl_obj;
1826 }
1827
1828 if (delete_now) {
1829 if (xattr_obj_unlinked) {
1830 ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2);
1831 mutex_enter(&xzp->z_lock);
1832 xzp->z_unlinked = 1;
1833 clear_nlink(ZTOI(xzp));
1834 links = 0;
1835 error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
1836 &links, sizeof (links), tx);
1837 ASSERT3U(error, ==, 0);
1838 mutex_exit(&xzp->z_lock);
1839 zfs_unlinked_add(xzp, tx);
1840
1841 if (zp->z_is_sa)
1842 error = sa_remove(zp->z_sa_hdl,
1843 SA_ZPL_XATTR(zfsvfs), tx);
1844 else
1845 error = sa_update(zp->z_sa_hdl,
1846 SA_ZPL_XATTR(zfsvfs), &null_xattr,
1847 sizeof (uint64_t), tx);
1848 ASSERT0(error);
1849 }
1850 /*
1851 * Add to the unlinked set because a new reference could be
1852 * taken concurrently resulting in a deferred destruction.
1853 */
1854 zfs_unlinked_add(zp, tx);
1855 mutex_exit(&zp->z_lock);
1856 } else if (unlinked) {
1857 mutex_exit(&zp->z_lock);
1858 zfs_unlinked_add(zp, tx);
1859 }
1860
1861 txtype = TX_REMOVE;
1862 if (flags & FIGNORECASE)
1863 txtype |= TX_CI;
1864 zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
1865
1866 dmu_tx_commit(tx);
1867 out:
1868 if (realnmp)
1869 pn_free(realnmp);
1870
1871 zfs_dirent_unlock(dl);
1872 zfs_inode_update(dzp);
1873 zfs_inode_update(zp);
1874
1875 if (delete_now)
1876 iput(ip);
1877 else
1878 zfs_iput_async(ip);
1879
1880 if (xzp) {
1881 zfs_inode_update(xzp);
1882 zfs_iput_async(ZTOI(xzp));
1883 }
1884
1885 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1886 zil_commit(zilog, 0);
1887
1888 ZFS_EXIT(zfsvfs);
1889 return (error);
1890 }
1891
1892 /*
1893 * Create a new directory and insert it into dip using the name
1894 * provided. Return a pointer to the inserted directory.
1895 *
1896 * IN: dip - inode of directory to add subdir to.
1897 * dirname - name of new directory.
1898 * vap - attributes of new directory.
1899 * cr - credentials of caller.
1900 * vsecp - ACL to be set
1901 *
1902 * OUT: ipp - inode of created directory.
1903 *
1904 * RETURN: 0 if success
1905 * error code if failure
1906 *
1907 * Timestamps:
1908 * dip - ctime|mtime updated
1909 * ipp - ctime|mtime|atime updated
1910 */
1911 /*ARGSUSED*/
1912 int
1913 zfs_mkdir(struct inode *dip, char *dirname, vattr_t *vap, struct inode **ipp,
1914 cred_t *cr, int flags, vsecattr_t *vsecp)
1915 {
1916 znode_t *zp, *dzp = ITOZ(dip);
1917 zfsvfs_t *zfsvfs = ITOZSB(dip);
1918 zilog_t *zilog;
1919 zfs_dirlock_t *dl;
1920 uint64_t txtype;
1921 dmu_tx_t *tx;
1922 int error;
1923 int zf = ZNEW;
1924 uid_t uid;
1925 gid_t gid = crgetgid(cr);
1926 zfs_acl_ids_t acl_ids;
1927 boolean_t fuid_dirtied;
1928 boolean_t waited = B_FALSE;
1929
1930 ASSERT(S_ISDIR(vap->va_mode));
1931
1932 /*
1933 * If we have an ephemeral id, ACL, or XVATTR then
1934 * make sure file system is at proper version
1935 */
1936
1937 uid = crgetuid(cr);
1938 if (zfsvfs->z_use_fuids == B_FALSE &&
1939 (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1940 return (SET_ERROR(EINVAL));
1941
1942 if (dirname == NULL)
1943 return (SET_ERROR(EINVAL));
1944
1945 ZFS_ENTER(zfsvfs);
1946 ZFS_VERIFY_ZP(dzp);
1947 zilog = zfsvfs->z_log;
1948
1949 if (dzp->z_pflags & ZFS_XATTR) {
1950 ZFS_EXIT(zfsvfs);
1951 return (SET_ERROR(EINVAL));
1952 }
1953
1954 if (zfsvfs->z_utf8 && u8_validate(dirname,
1955 strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1956 ZFS_EXIT(zfsvfs);
1957 return (SET_ERROR(EILSEQ));
1958 }
1959 if (flags & FIGNORECASE)
1960 zf |= ZCILOOK;
1961
1962 if (vap->va_mask & ATTR_XVATTR) {
1963 if ((error = secpolicy_xvattr((xvattr_t *)vap,
1964 crgetuid(cr), cr, vap->va_mode)) != 0) {
1965 ZFS_EXIT(zfsvfs);
1966 return (error);
1967 }
1968 }
1969
1970 if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
1971 vsecp, &acl_ids)) != 0) {
1972 ZFS_EXIT(zfsvfs);
1973 return (error);
1974 }
1975 /*
1976 * First make sure the new directory doesn't exist.
1977 *
1978 * Existence is checked first to make sure we don't return
1979 * EACCES instead of EEXIST which can cause some applications
1980 * to fail.
1981 */
1982 top:
1983 *ipp = NULL;
1984
1985 if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
1986 NULL, NULL))) {
1987 zfs_acl_ids_free(&acl_ids);
1988 ZFS_EXIT(zfsvfs);
1989 return (error);
1990 }
1991
1992 if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) {
1993 zfs_acl_ids_free(&acl_ids);
1994 zfs_dirent_unlock(dl);
1995 ZFS_EXIT(zfsvfs);
1996 return (error);
1997 }
1998
1999 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
2000 zfs_acl_ids_free(&acl_ids);
2001 zfs_dirent_unlock(dl);
2002 ZFS_EXIT(zfsvfs);
2003 return (SET_ERROR(EDQUOT));
2004 }
2005
2006 /*
2007 * Add a new entry to the directory.
2008 */
2009 tx = dmu_tx_create(zfsvfs->z_os);
2010 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
2011 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
2012 fuid_dirtied = zfsvfs->z_fuid_dirty;
2013 if (fuid_dirtied)
2014 zfs_fuid_txhold(zfsvfs, tx);
2015 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2016 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
2017 acl_ids.z_aclp->z_acl_bytes);
2018 }
2019
2020 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
2021 ZFS_SA_BASE_ATTR_SIZE);
2022
2023 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
2024 if (error) {
2025 zfs_dirent_unlock(dl);
2026 if (error == ERESTART) {
2027 waited = B_TRUE;
2028 dmu_tx_wait(tx);
2029 dmu_tx_abort(tx);
2030 goto top;
2031 }
2032 zfs_acl_ids_free(&acl_ids);
2033 dmu_tx_abort(tx);
2034 ZFS_EXIT(zfsvfs);
2035 return (error);
2036 }
2037
2038 /*
2039 * Create new node.
2040 */
2041 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
2042
2043 if (fuid_dirtied)
2044 zfs_fuid_sync(zfsvfs, tx);
2045
2046 /*
2047 * Now put new name in parent dir.
2048 */
2049 (void) zfs_link_create(dl, zp, tx, ZNEW);
2050
2051 *ipp = ZTOI(zp);
2052
2053 txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
2054 if (flags & FIGNORECASE)
2055 txtype |= TX_CI;
2056 zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
2057 acl_ids.z_fuidp, vap);
2058
2059 zfs_acl_ids_free(&acl_ids);
2060
2061 dmu_tx_commit(tx);
2062
2063 zfs_dirent_unlock(dl);
2064
2065 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2066 zil_commit(zilog, 0);
2067
2068 zfs_inode_update(dzp);
2069 zfs_inode_update(zp);
2070 ZFS_EXIT(zfsvfs);
2071 return (0);
2072 }
2073
2074 /*
2075 * Remove a directory subdir entry. If the current working
2076 * directory is the same as the subdir to be removed, the
2077 * remove will fail.
2078 *
2079 * IN: dip - inode of directory to remove from.
2080 * name - name of directory to be removed.
2081 * cwd - inode of current working directory.
2082 * cr - credentials of caller.
2083 * flags - case flags
2084 *
2085 * RETURN: 0 on success, error code on failure.
2086 *
2087 * Timestamps:
2088 * dip - ctime|mtime updated
2089 */
2090 /*ARGSUSED*/
2091 int
2092 zfs_rmdir(struct inode *dip, char *name, struct inode *cwd, cred_t *cr,
2093 int flags)
2094 {
2095 znode_t *dzp = ITOZ(dip);
2096 znode_t *zp;
2097 struct inode *ip;
2098 zfsvfs_t *zfsvfs = ITOZSB(dip);
2099 zilog_t *zilog;
2100 zfs_dirlock_t *dl;
2101 dmu_tx_t *tx;
2102 int error;
2103 int zflg = ZEXISTS;
2104 boolean_t waited = B_FALSE;
2105
2106 if (name == NULL)
2107 return (SET_ERROR(EINVAL));
2108
2109 ZFS_ENTER(zfsvfs);
2110 ZFS_VERIFY_ZP(dzp);
2111 zilog = zfsvfs->z_log;
2112
2113 if (flags & FIGNORECASE)
2114 zflg |= ZCILOOK;
2115 top:
2116 zp = NULL;
2117
2118 /*
2119 * Attempt to lock directory; fail if entry doesn't exist.
2120 */
2121 if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
2122 NULL, NULL))) {
2123 ZFS_EXIT(zfsvfs);
2124 return (error);
2125 }
2126
2127 ip = ZTOI(zp);
2128
2129 if ((error = zfs_zaccess_delete(dzp, zp, cr))) {
2130 goto out;
2131 }
2132
2133 if (!S_ISDIR(ip->i_mode)) {
2134 error = SET_ERROR(ENOTDIR);
2135 goto out;
2136 }
2137
2138 if (ip == cwd) {
2139 error = SET_ERROR(EINVAL);
2140 goto out;
2141 }
2142
2143 /*
2144 * Grab a lock on the directory to make sure that no one is
2145 * trying to add (or lookup) entries while we are removing it.
2146 */
2147 rw_enter(&zp->z_name_lock, RW_WRITER);
2148
2149 /*
2150 * Grab a lock on the parent pointer to make sure we play well
2151 * with the treewalk and directory rename code.
2152 */
2153 rw_enter(&zp->z_parent_lock, RW_WRITER);
2154
2155 tx = dmu_tx_create(zfsvfs->z_os);
2156 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
2157 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2158 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2159 zfs_sa_upgrade_txholds(tx, zp);
2160 zfs_sa_upgrade_txholds(tx, dzp);
2161 dmu_tx_mark_netfree(tx);
2162 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
2163 if (error) {
2164 rw_exit(&zp->z_parent_lock);
2165 rw_exit(&zp->z_name_lock);
2166 zfs_dirent_unlock(dl);
2167 if (error == ERESTART) {
2168 waited = B_TRUE;
2169 dmu_tx_wait(tx);
2170 dmu_tx_abort(tx);
2171 iput(ip);
2172 goto top;
2173 }
2174 dmu_tx_abort(tx);
2175 iput(ip);
2176 ZFS_EXIT(zfsvfs);
2177 return (error);
2178 }
2179
2180 error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
2181
2182 if (error == 0) {
2183 uint64_t txtype = TX_RMDIR;
2184 if (flags & FIGNORECASE)
2185 txtype |= TX_CI;
2186 zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
2187 }
2188
2189 dmu_tx_commit(tx);
2190
2191 rw_exit(&zp->z_parent_lock);
2192 rw_exit(&zp->z_name_lock);
2193 out:
2194 zfs_dirent_unlock(dl);
2195
2196 zfs_inode_update(dzp);
2197 zfs_inode_update(zp);
2198 iput(ip);
2199
2200 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2201 zil_commit(zilog, 0);
2202
2203 ZFS_EXIT(zfsvfs);
2204 return (error);
2205 }
2206
2207 /*
2208 * Read as many directory entries as will fit into the provided
2209 * dirent buffer from the given directory cursor position.
2210 *
2211 * IN: ip - inode of directory to read.
2212 * dirent - buffer for directory entries.
2213 *
2214 * OUT: dirent - filler buffer of directory entries.
2215 *
2216 * RETURN: 0 if success
2217 * error code if failure
2218 *
2219 * Timestamps:
2220 * ip - atime updated
2221 *
2222 * Note that the low 4 bits of the cookie returned by zap is always zero.
2223 * This allows us to use the low range for "special" directory entries:
2224 * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem,
2225 * we use the offset 2 for the '.zfs' directory.
2226 */
2227 /* ARGSUSED */
2228 int
2229 zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr)
2230 {
2231 znode_t *zp = ITOZ(ip);
2232 zfsvfs_t *zfsvfs = ITOZSB(ip);
2233 objset_t *os;
2234 zap_cursor_t zc;
2235 zap_attribute_t zap;
2236 int error;
2237 uint8_t prefetch;
2238 uint8_t type;
2239 int done = 0;
2240 uint64_t parent;
2241 uint64_t offset; /* must be unsigned; checks for < 1 */
2242
2243 ZFS_ENTER(zfsvfs);
2244 ZFS_VERIFY_ZP(zp);
2245
2246 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
2247 &parent, sizeof (parent))) != 0)
2248 goto out;
2249
2250 /*
2251 * Quit if directory has been removed (posix)
2252 */
2253 if (zp->z_unlinked)
2254 goto out;
2255
2256 error = 0;
2257 os = zfsvfs->z_os;
2258 offset = ctx->pos;
2259 prefetch = zp->z_zn_prefetch;
2260
2261 /*
2262 * Initialize the iterator cursor.
2263 */
2264 if (offset <= 3) {
2265 /*
2266 * Start iteration from the beginning of the directory.
2267 */
2268 zap_cursor_init(&zc, os, zp->z_id);
2269 } else {
2270 /*
2271 * The offset is a serialized cursor.
2272 */
2273 zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
2274 }
2275
2276 /*
2277 * Transform to file-system independent format
2278 */
2279 while (!done) {
2280 uint64_t objnum;
2281 /*
2282 * Special case `.', `..', and `.zfs'.
2283 */
2284 if (offset == 0) {
2285 (void) strcpy(zap.za_name, ".");
2286 zap.za_normalization_conflict = 0;
2287 objnum = zp->z_id;
2288 type = DT_DIR;
2289 } else if (offset == 1) {
2290 (void) strcpy(zap.za_name, "..");
2291 zap.za_normalization_conflict = 0;
2292 objnum = parent;
2293 type = DT_DIR;
2294 } else if (offset == 2 && zfs_show_ctldir(zp)) {
2295 (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
2296 zap.za_normalization_conflict = 0;
2297 objnum = ZFSCTL_INO_ROOT;
2298 type = DT_DIR;
2299 } else {
2300 /*
2301 * Grab next entry.
2302 */
2303 if ((error = zap_cursor_retrieve(&zc, &zap))) {
2304 if (error == ENOENT)
2305 break;
2306 else
2307 goto update;
2308 }
2309
2310 /*
2311 * Allow multiple entries provided the first entry is
2312 * the object id. Non-zpl consumers may safely make
2313 * use of the additional space.
2314 *
2315 * XXX: This should be a feature flag for compatibility
2316 */
2317 if (zap.za_integer_length != 8 ||
2318 zap.za_num_integers == 0) {
2319 cmn_err(CE_WARN, "zap_readdir: bad directory "
2320 "entry, obj = %lld, offset = %lld, "
2321 "length = %d, num = %lld\n",
2322 (u_longlong_t)zp->z_id,
2323 (u_longlong_t)offset,
2324 zap.za_integer_length,
2325 (u_longlong_t)zap.za_num_integers);
2326 error = SET_ERROR(ENXIO);
2327 goto update;
2328 }
2329
2330 objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
2331 type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2332 }
2333
2334 done = !dir_emit(ctx, zap.za_name, strlen(zap.za_name),
2335 objnum, type);
2336 if (done)
2337 break;
2338
2339 /* Prefetch znode */
2340 if (prefetch) {
2341 dmu_prefetch(os, objnum, 0, 0, 0,
2342 ZIO_PRIORITY_SYNC_READ);
2343 }
2344
2345 /*
2346 * Move to the next entry, fill in the previous offset.
2347 */
2348 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
2349 zap_cursor_advance(&zc);
2350 offset = zap_cursor_serialize(&zc);
2351 } else {
2352 offset += 1;
2353 }
2354 ctx->pos = offset;
2355 }
2356 zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
2357
2358 update:
2359 zap_cursor_fini(&zc);
2360 if (error == ENOENT)
2361 error = 0;
2362 out:
2363 ZFS_EXIT(zfsvfs);
2364
2365 return (error);
2366 }
2367
2368 ulong_t zfs_fsync_sync_cnt = 4;
2369
2370 int
2371 zfs_fsync(struct inode *ip, int syncflag, cred_t *cr)
2372 {
2373 znode_t *zp = ITOZ(ip);
2374 zfsvfs_t *zfsvfs = ITOZSB(ip);
2375
2376 (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
2377
2378 if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
2379 ZFS_ENTER(zfsvfs);
2380 ZFS_VERIFY_ZP(zp);
2381 zil_commit(zfsvfs->z_log, zp->z_id);
2382 ZFS_EXIT(zfsvfs);
2383 }
2384 tsd_set(zfs_fsyncer_key, NULL);
2385
2386 return (0);
2387 }
2388
2389
2390 /*
2391 * Get the requested file attributes and place them in the provided
2392 * vattr structure.
2393 *
2394 * IN: ip - inode of file.
2395 * vap - va_mask identifies requested attributes.
2396 * If ATTR_XVATTR set, then optional attrs are requested
2397 * flags - ATTR_NOACLCHECK (CIFS server context)
2398 * cr - credentials of caller.
2399 *
2400 * OUT: vap - attribute values.
2401 *
2402 * RETURN: 0 (always succeeds)
2403 */
2404 /* ARGSUSED */
2405 int
2406 zfs_getattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr)
2407 {
2408 znode_t *zp = ITOZ(ip);
2409 zfsvfs_t *zfsvfs = ITOZSB(ip);
2410 int error = 0;
2411 uint64_t links;
2412 uint64_t atime[2], mtime[2], ctime[2];
2413 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
2414 xoptattr_t *xoap = NULL;
2415 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2416 sa_bulk_attr_t bulk[3];
2417 int count = 0;
2418
2419 ZFS_ENTER(zfsvfs);
2420 ZFS_VERIFY_ZP(zp);
2421
2422 zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
2423
2424 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
2425 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
2426 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
2427
2428 if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
2429 ZFS_EXIT(zfsvfs);
2430 return (error);
2431 }
2432
2433 /*
2434 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
2435 * Also, if we are the owner don't bother, since owner should
2436 * always be allowed to read basic attributes of file.
2437 */
2438 if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
2439 (vap->va_uid != crgetuid(cr))) {
2440 if ((error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
2441 skipaclchk, cr))) {
2442 ZFS_EXIT(zfsvfs);
2443 return (error);
2444 }
2445 }
2446
2447 /*
2448 * Return all attributes. It's cheaper to provide the answer
2449 * than to determine whether we were asked the question.
2450 */
2451
2452 mutex_enter(&zp->z_lock);
2453 vap->va_type = vn_mode_to_vtype(zp->z_mode);
2454 vap->va_mode = zp->z_mode;
2455 vap->va_fsid = ZTOI(zp)->i_sb->s_dev;
2456 vap->va_nodeid = zp->z_id;
2457 if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp))
2458 links = ZTOI(zp)->i_nlink + 1;
2459 else
2460 links = ZTOI(zp)->i_nlink;
2461 vap->va_nlink = MIN(links, ZFS_LINK_MAX);
2462 vap->va_size = i_size_read(ip);
2463 vap->va_rdev = ip->i_rdev;
2464 vap->va_seq = ip->i_generation;
2465
2466 /*
2467 * Add in any requested optional attributes and the create time.
2468 * Also set the corresponding bits in the returned attribute bitmap.
2469 */
2470 if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
2471 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
2472 xoap->xoa_archive =
2473 ((zp->z_pflags & ZFS_ARCHIVE) != 0);
2474 XVA_SET_RTN(xvap, XAT_ARCHIVE);
2475 }
2476
2477 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
2478 xoap->xoa_readonly =
2479 ((zp->z_pflags & ZFS_READONLY) != 0);
2480 XVA_SET_RTN(xvap, XAT_READONLY);
2481 }
2482
2483 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
2484 xoap->xoa_system =
2485 ((zp->z_pflags & ZFS_SYSTEM) != 0);
2486 XVA_SET_RTN(xvap, XAT_SYSTEM);
2487 }
2488
2489 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
2490 xoap->xoa_hidden =
2491 ((zp->z_pflags & ZFS_HIDDEN) != 0);
2492 XVA_SET_RTN(xvap, XAT_HIDDEN);
2493 }
2494
2495 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2496 xoap->xoa_nounlink =
2497 ((zp->z_pflags & ZFS_NOUNLINK) != 0);
2498 XVA_SET_RTN(xvap, XAT_NOUNLINK);
2499 }
2500
2501 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2502 xoap->xoa_immutable =
2503 ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
2504 XVA_SET_RTN(xvap, XAT_IMMUTABLE);
2505 }
2506
2507 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2508 xoap->xoa_appendonly =
2509 ((zp->z_pflags & ZFS_APPENDONLY) != 0);
2510 XVA_SET_RTN(xvap, XAT_APPENDONLY);
2511 }
2512
2513 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2514 xoap->xoa_nodump =
2515 ((zp->z_pflags & ZFS_NODUMP) != 0);
2516 XVA_SET_RTN(xvap, XAT_NODUMP);
2517 }
2518
2519 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
2520 xoap->xoa_opaque =
2521 ((zp->z_pflags & ZFS_OPAQUE) != 0);
2522 XVA_SET_RTN(xvap, XAT_OPAQUE);
2523 }
2524
2525 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2526 xoap->xoa_av_quarantined =
2527 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
2528 XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
2529 }
2530
2531 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2532 xoap->xoa_av_modified =
2533 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
2534 XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
2535 }
2536
2537 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
2538 S_ISREG(ip->i_mode)) {
2539 zfs_sa_get_scanstamp(zp, xvap);
2540 }
2541
2542 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
2543 uint64_t times[2];
2544
2545 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
2546 times, sizeof (times));
2547 ZFS_TIME_DECODE(&xoap->xoa_createtime, times);
2548 XVA_SET_RTN(xvap, XAT_CREATETIME);
2549 }
2550
2551 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2552 xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
2553 XVA_SET_RTN(xvap, XAT_REPARSE);
2554 }
2555 if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
2556 xoap->xoa_generation = ip->i_generation;
2557 XVA_SET_RTN(xvap, XAT_GEN);
2558 }
2559
2560 if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
2561 xoap->xoa_offline =
2562 ((zp->z_pflags & ZFS_OFFLINE) != 0);
2563 XVA_SET_RTN(xvap, XAT_OFFLINE);
2564 }
2565
2566 if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
2567 xoap->xoa_sparse =
2568 ((zp->z_pflags & ZFS_SPARSE) != 0);
2569 XVA_SET_RTN(xvap, XAT_SPARSE);
2570 }
2571 }
2572
2573 ZFS_TIME_DECODE(&vap->va_atime, atime);
2574 ZFS_TIME_DECODE(&vap->va_mtime, mtime);
2575 ZFS_TIME_DECODE(&vap->va_ctime, ctime);
2576
2577 mutex_exit(&zp->z_lock);
2578
2579 sa_object_size(zp->z_sa_hdl, &vap->va_blksize, &vap->va_nblocks);
2580
2581 if (zp->z_blksz == 0) {
2582 /*
2583 * Block size hasn't been set; suggest maximal I/O transfers.
2584 */
2585 vap->va_blksize = zfsvfs->z_max_blksz;
2586 }
2587
2588 ZFS_EXIT(zfsvfs);
2589 return (0);
2590 }
2591
2592 /*
2593 * Get the basic file attributes and place them in the provided kstat
2594 * structure. The inode is assumed to be the authoritative source
2595 * for most of the attributes. However, the znode currently has the
2596 * authoritative atime, blksize, and block count.
2597 *
2598 * IN: ip - inode of file.
2599 *
2600 * OUT: sp - kstat values.
2601 *
2602 * RETURN: 0 (always succeeds)
2603 */
2604 /* ARGSUSED */
2605 int
2606 zfs_getattr_fast(struct inode *ip, struct kstat *sp)
2607 {
2608 znode_t *zp = ITOZ(ip);
2609 zfsvfs_t *zfsvfs = ITOZSB(ip);
2610 uint32_t blksize;
2611 u_longlong_t nblocks;
2612
2613 ZFS_ENTER(zfsvfs);
2614 ZFS_VERIFY_ZP(zp);
2615
2616 mutex_enter(&zp->z_lock);
2617
2618 generic_fillattr(ip, sp);
2619
2620 sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
2621 sp->blksize = blksize;
2622 sp->blocks = nblocks;
2623
2624 if (unlikely(zp->z_blksz == 0)) {
2625 /*
2626 * Block size hasn't been set; suggest maximal I/O transfers.
2627 */
2628 sp->blksize = zfsvfs->z_max_blksz;
2629 }
2630
2631 mutex_exit(&zp->z_lock);
2632
2633 /*
2634 * Required to prevent NFS client from detecting different inode
2635 * numbers of snapshot root dentry before and after snapshot mount.
2636 */
2637 if (zfsvfs->z_issnap) {
2638 if (ip->i_sb->s_root->d_inode == ip)
2639 sp->ino = ZFSCTL_INO_SNAPDIRS -
2640 dmu_objset_id(zfsvfs->z_os);
2641 }
2642
2643 ZFS_EXIT(zfsvfs);
2644
2645 return (0);
2646 }
2647
2648 /*
2649 * Set the file attributes to the values contained in the
2650 * vattr structure.
2651 *
2652 * IN: ip - inode of file to be modified.
2653 * vap - new attribute values.
2654 * If ATTR_XVATTR set, then optional attrs are being set
2655 * flags - ATTR_UTIME set if non-default time values provided.
2656 * - ATTR_NOACLCHECK (CIFS context only).
2657 * cr - credentials of caller.
2658 *
2659 * RETURN: 0 if success
2660 * error code if failure
2661 *
2662 * Timestamps:
2663 * ip - ctime updated, mtime updated if size changed.
2664 */
2665 /* ARGSUSED */
2666 int
2667 zfs_setattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr)
2668 {
2669 znode_t *zp = ITOZ(ip);
2670 zfsvfs_t *zfsvfs = ITOZSB(ip);
2671 zilog_t *zilog;
2672 dmu_tx_t *tx;
2673 vattr_t oldva;
2674 xvattr_t *tmpxvattr;
2675 uint_t mask = vap->va_mask;
2676 uint_t saved_mask = 0;
2677 int trim_mask = 0;
2678 uint64_t new_mode;
2679 uint64_t new_kuid = 0, new_kgid = 0, new_uid, new_gid;
2680 uint64_t xattr_obj;
2681 uint64_t mtime[2], ctime[2], atime[2];
2682 znode_t *attrzp;
2683 int need_policy = FALSE;
2684 int err, err2;
2685 zfs_fuid_info_t *fuidp = NULL;
2686 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
2687 xoptattr_t *xoap;
2688 zfs_acl_t *aclp;
2689 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2690 boolean_t fuid_dirtied = B_FALSE;
2691 sa_bulk_attr_t *bulk, *xattr_bulk;
2692 int count = 0, xattr_count = 0;
2693
2694 if (mask == 0)
2695 return (0);
2696
2697 ZFS_ENTER(zfsvfs);
2698 ZFS_VERIFY_ZP(zp);
2699
2700 zilog = zfsvfs->z_log;
2701
2702 /*
2703 * Make sure that if we have ephemeral uid/gid or xvattr specified
2704 * that file system is at proper version level
2705 */
2706
2707 if (zfsvfs->z_use_fuids == B_FALSE &&
2708 (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2709 ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) ||
2710 (mask & ATTR_XVATTR))) {
2711 ZFS_EXIT(zfsvfs);
2712 return (SET_ERROR(EINVAL));
2713 }
2714
2715 if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) {
2716 ZFS_EXIT(zfsvfs);
2717 return (SET_ERROR(EISDIR));
2718 }
2719
2720 if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) {
2721 ZFS_EXIT(zfsvfs);
2722 return (SET_ERROR(EINVAL));
2723 }
2724
2725 /*
2726 * If this is an xvattr_t, then get a pointer to the structure of
2727 * optional attributes. If this is NULL, then we have a vattr_t.
2728 */
2729 xoap = xva_getxoptattr(xvap);
2730
2731 tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP);
2732 xva_init(tmpxvattr);
2733
2734 bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * 7, KM_SLEEP);
2735 xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * 7, KM_SLEEP);
2736
2737 /*
2738 * Immutable files can only alter immutable bit and atime
2739 */
2740 if ((zp->z_pflags & ZFS_IMMUTABLE) &&
2741 ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) ||
2742 ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
2743 err = SET_ERROR(EPERM);
2744 goto out3;
2745 }
2746
2747 if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
2748 err = SET_ERROR(EPERM);
2749 goto out3;
2750 }
2751
2752 /*
2753 * Verify timestamps doesn't overflow 32 bits.
2754 * ZFS can handle large timestamps, but 32bit syscalls can't
2755 * handle times greater than 2039. This check should be removed
2756 * once large timestamps are fully supported.
2757 */
2758 if (mask & (ATTR_ATIME | ATTR_MTIME)) {
2759 if (((mask & ATTR_ATIME) &&
2760 TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2761 ((mask & ATTR_MTIME) &&
2762 TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2763 err = SET_ERROR(EOVERFLOW);
2764 goto out3;
2765 }
2766 }
2767
2768 top:
2769 attrzp = NULL;
2770 aclp = NULL;
2771
2772 /* Can this be moved to before the top label? */
2773 if (zfs_is_readonly(zfsvfs)) {
2774 err = SET_ERROR(EROFS);
2775 goto out3;
2776 }
2777
2778 /*
2779 * First validate permissions
2780 */
2781
2782 if (mask & ATTR_SIZE) {
2783 err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr);
2784 if (err)
2785 goto out3;
2786
2787 /*
2788 * XXX - Note, we are not providing any open
2789 * mode flags here (like FNDELAY), so we may
2790 * block if there are locks present... this
2791 * should be addressed in openat().
2792 */
2793 /* XXX - would it be OK to generate a log record here? */
2794 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2795 if (err)
2796 goto out3;
2797 }
2798
2799 if (mask & (ATTR_ATIME|ATTR_MTIME) ||
2800 ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
2801 XVA_ISSET_REQ(xvap, XAT_READONLY) ||
2802 XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
2803 XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
2804 XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
2805 XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
2806 XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
2807 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
2808 skipaclchk, cr);
2809 }
2810
2811 if (mask & (ATTR_UID|ATTR_GID)) {
2812 int idmask = (mask & (ATTR_UID|ATTR_GID));
2813 int take_owner;
2814 int take_group;
2815
2816 /*
2817 * NOTE: even if a new mode is being set,
2818 * we may clear S_ISUID/S_ISGID bits.
2819 */
2820
2821 if (!(mask & ATTR_MODE))
2822 vap->va_mode = zp->z_mode;
2823
2824 /*
2825 * Take ownership or chgrp to group we are a member of
2826 */
2827
2828 take_owner = (mask & ATTR_UID) && (vap->va_uid == crgetuid(cr));
2829 take_group = (mask & ATTR_GID) &&
2830 zfs_groupmember(zfsvfs, vap->va_gid, cr);
2831
2832 /*
2833 * If both ATTR_UID and ATTR_GID are set then take_owner and
2834 * take_group must both be set in order to allow taking
2835 * ownership.
2836 *
2837 * Otherwise, send the check through secpolicy_vnode_setattr()
2838 *
2839 */
2840
2841 if (((idmask == (ATTR_UID|ATTR_GID)) &&
2842 take_owner && take_group) ||
2843 ((idmask == ATTR_UID) && take_owner) ||
2844 ((idmask == ATTR_GID) && take_group)) {
2845 if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
2846 skipaclchk, cr) == 0) {
2847 /*
2848 * Remove setuid/setgid for non-privileged users
2849 */
2850 (void) secpolicy_setid_clear(vap, cr);
2851 trim_mask = (mask & (ATTR_UID|ATTR_GID));
2852 } else {
2853 need_policy = TRUE;
2854 }
2855 } else {
2856 need_policy = TRUE;
2857 }
2858 }
2859
2860 mutex_enter(&zp->z_lock);
2861 oldva.va_mode = zp->z_mode;
2862 zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
2863 if (mask & ATTR_XVATTR) {
2864 /*
2865 * Update xvattr mask to include only those attributes
2866 * that are actually changing.
2867 *
2868 * the bits will be restored prior to actually setting
2869 * the attributes so the caller thinks they were set.
2870 */
2871 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2872 if (xoap->xoa_appendonly !=
2873 ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
2874 need_policy = TRUE;
2875 } else {
2876 XVA_CLR_REQ(xvap, XAT_APPENDONLY);
2877 XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY);
2878 }
2879 }
2880
2881 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2882 if (xoap->xoa_nounlink !=
2883 ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
2884 need_policy = TRUE;
2885 } else {
2886 XVA_CLR_REQ(xvap, XAT_NOUNLINK);
2887 XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK);
2888 }
2889 }
2890
2891 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2892 if (xoap->xoa_immutable !=
2893 ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
2894 need_policy = TRUE;
2895 } else {
2896 XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
2897 XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE);
2898 }
2899 }
2900
2901 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2902 if (xoap->xoa_nodump !=
2903 ((zp->z_pflags & ZFS_NODUMP) != 0)) {
2904 need_policy = TRUE;
2905 } else {
2906 XVA_CLR_REQ(xvap, XAT_NODUMP);
2907 XVA_SET_REQ(tmpxvattr, XAT_NODUMP);
2908 }
2909 }
2910
2911 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2912 if (xoap->xoa_av_modified !=
2913 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
2914 need_policy = TRUE;
2915 } else {
2916 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
2917 XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED);
2918 }
2919 }
2920
2921 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2922 if ((!S_ISREG(ip->i_mode) &&
2923 xoap->xoa_av_quarantined) ||
2924 xoap->xoa_av_quarantined !=
2925 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
2926 need_policy = TRUE;
2927 } else {
2928 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
2929 XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED);
2930 }
2931 }
2932
2933 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2934 mutex_exit(&zp->z_lock);
2935 err = SET_ERROR(EPERM);
2936 goto out3;
2937 }
2938
2939 if (need_policy == FALSE &&
2940 (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
2941 XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
2942 need_policy = TRUE;
2943 }
2944 }
2945
2946 mutex_exit(&zp->z_lock);
2947
2948 if (mask & ATTR_MODE) {
2949 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
2950 err = secpolicy_setid_setsticky_clear(ip, vap,
2951 &oldva, cr);
2952 if (err)
2953 goto out3;
2954
2955 trim_mask |= ATTR_MODE;
2956 } else {
2957 need_policy = TRUE;
2958 }
2959 }
2960
2961 if (need_policy) {
2962 /*
2963 * If trim_mask is set then take ownership
2964 * has been granted or write_acl is present and user
2965 * has the ability to modify mode. In that case remove
2966 * UID|GID and or MODE from mask so that
2967 * secpolicy_vnode_setattr() doesn't revoke it.
2968 */
2969
2970 if (trim_mask) {
2971 saved_mask = vap->va_mask;
2972 vap->va_mask &= ~trim_mask;
2973 }
2974 err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags,
2975 (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
2976 if (err)
2977 goto out3;
2978
2979 if (trim_mask)
2980 vap->va_mask |= saved_mask;
2981 }
2982
2983 /*
2984 * secpolicy_vnode_setattr, or take ownership may have
2985 * changed va_mask
2986 */
2987 mask = vap->va_mask;
2988
2989 if ((mask & (ATTR_UID | ATTR_GID))) {
2990 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
2991 &xattr_obj, sizeof (xattr_obj));
2992
2993 if (err == 0 && xattr_obj) {
2994 err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp);
2995 if (err)
2996 goto out2;
2997 }
2998 if (mask & ATTR_UID) {
2999 new_kuid = zfs_fuid_create(zfsvfs,
3000 (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
3001 if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) &&
3002 zfs_fuid_overquota(zfsvfs, B_FALSE, new_kuid)) {
3003 if (attrzp)
3004 iput(ZTOI(attrzp));
3005 err = SET_ERROR(EDQUOT);
3006 goto out2;
3007 }
3008 }
3009
3010 if (mask & ATTR_GID) {
3011 new_kgid = zfs_fuid_create(zfsvfs,
3012 (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp);
3013 if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) &&
3014 zfs_fuid_overquota(zfsvfs, B_TRUE, new_kgid)) {
3015 if (attrzp)
3016 iput(ZTOI(attrzp));
3017 err = SET_ERROR(EDQUOT);
3018 goto out2;
3019 }
3020 }
3021 }
3022 tx = dmu_tx_create(zfsvfs->z_os);
3023
3024 if (mask & ATTR_MODE) {
3025 uint64_t pmode = zp->z_mode;
3026 uint64_t acl_obj;
3027 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
3028
3029 zfs_acl_chmod_setattr(zp, &aclp, new_mode);
3030
3031 mutex_enter(&zp->z_lock);
3032 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
3033 /*
3034 * Are we upgrading ACL from old V0 format
3035 * to V1 format?
3036 */
3037 if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
3038 zfs_znode_acl_version(zp) ==
3039 ZFS_ACL_VERSION_INITIAL) {
3040 dmu_tx_hold_free(tx, acl_obj, 0,
3041 DMU_OBJECT_END);
3042 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3043 0, aclp->z_acl_bytes);
3044 } else {
3045 dmu_tx_hold_write(tx, acl_obj, 0,
3046 aclp->z_acl_bytes);
3047 }
3048 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3049 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3050 0, aclp->z_acl_bytes);
3051 }
3052 mutex_exit(&zp->z_lock);
3053 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3054 } else {
3055 if ((mask & ATTR_XVATTR) &&
3056 XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3057 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3058 else
3059 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3060 }
3061
3062 if (attrzp) {
3063 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
3064 }
3065
3066 fuid_dirtied = zfsvfs->z_fuid_dirty;
3067 if (fuid_dirtied)
3068 zfs_fuid_txhold(zfsvfs, tx);
3069
3070 zfs_sa_upgrade_txholds(tx, zp);
3071
3072 err = dmu_tx_assign(tx, TXG_WAIT);
3073 if (err)
3074 goto out;
3075
3076 count = 0;
3077 /*
3078 * Set each attribute requested.
3079 * We group settings according to the locks they need to acquire.
3080 *
3081 * Note: you cannot set ctime directly, although it will be
3082 * updated as a side-effect of calling this function.
3083 */
3084
3085
3086 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
3087 mutex_enter(&zp->z_acl_lock);
3088 mutex_enter(&zp->z_lock);
3089
3090 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
3091 &zp->z_pflags, sizeof (zp->z_pflags));
3092
3093 if (attrzp) {
3094 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
3095 mutex_enter(&attrzp->z_acl_lock);
3096 mutex_enter(&attrzp->z_lock);
3097 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3098 SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
3099 sizeof (attrzp->z_pflags));
3100 }
3101
3102 if (mask & (ATTR_UID|ATTR_GID)) {
3103
3104 if (mask & ATTR_UID) {
3105 ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid);
3106 new_uid = zfs_uid_read(ZTOI(zp));
3107 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
3108 &new_uid, sizeof (new_uid));
3109 if (attrzp) {
3110 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3111 SA_ZPL_UID(zfsvfs), NULL, &new_uid,
3112 sizeof (new_uid));
3113 ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid);
3114 }
3115 }
3116
3117 if (mask & ATTR_GID) {
3118 ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid);
3119 new_gid = zfs_gid_read(ZTOI(zp));
3120 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
3121 NULL, &new_gid, sizeof (new_gid));
3122 if (attrzp) {
3123 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3124 SA_ZPL_GID(zfsvfs), NULL, &new_gid,
3125 sizeof (new_gid));
3126 ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid);
3127 }
3128 }
3129 if (!(mask & ATTR_MODE)) {
3130 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
3131 NULL, &new_mode, sizeof (new_mode));
3132 new_mode = zp->z_mode;
3133 }
3134 err = zfs_acl_chown_setattr(zp);
3135 ASSERT(err == 0);
3136 if (attrzp) {
3137 err = zfs_acl_chown_setattr(attrzp);
3138 ASSERT(err == 0);
3139 }
3140 }
3141
3142 if (mask & ATTR_MODE) {
3143 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
3144 &new_mode, sizeof (new_mode));
3145 zp->z_mode = ZTOI(zp)->i_mode = new_mode;
3146 ASSERT3P(aclp, !=, NULL);
3147 err = zfs_aclset_common(zp, aclp, cr, tx);
3148 ASSERT0(err);
3149 if (zp->z_acl_cached)
3150 zfs_acl_free(zp->z_acl_cached);
3151 zp->z_acl_cached = aclp;
3152 aclp = NULL;
3153 }
3154
3155 if ((mask & ATTR_ATIME) || zp->z_atime_dirty) {
3156 zp->z_atime_dirty = 0;
3157 ZFS_TIME_ENCODE(&ip->i_atime, atime);
3158 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
3159 &atime, sizeof (atime));
3160 }
3161
3162 if (mask & (ATTR_MTIME | ATTR_SIZE)) {
3163 ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
3164 ZTOI(zp)->i_mtime = timespec_trunc(vap->va_mtime,
3165 ZTOI(zp)->i_sb->s_time_gran);
3166
3167 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
3168 mtime, sizeof (mtime));
3169 }
3170
3171 if (mask & (ATTR_CTIME | ATTR_SIZE)) {
3172 ZFS_TIME_ENCODE(&vap->va_ctime, ctime);
3173 ZTOI(zp)->i_ctime = timespec_trunc(vap->va_ctime,
3174 ZTOI(zp)->i_sb->s_time_gran);
3175 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3176 ctime, sizeof (ctime));
3177 }
3178
3179 if (attrzp && mask) {
3180 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3181 SA_ZPL_CTIME(zfsvfs), NULL, &ctime,
3182 sizeof (ctime));
3183 }
3184
3185 /*
3186 * Do this after setting timestamps to prevent timestamp
3187 * update from toggling bit
3188 */
3189
3190 if (xoap && (mask & ATTR_XVATTR)) {
3191
3192 /*
3193 * restore trimmed off masks
3194 * so that return masks can be set for caller.
3195 */
3196
3197 if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) {
3198 XVA_SET_REQ(xvap, XAT_APPENDONLY);
3199 }
3200 if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) {
3201 XVA_SET_REQ(xvap, XAT_NOUNLINK);
3202 }
3203 if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) {
3204 XVA_SET_REQ(xvap, XAT_IMMUTABLE);
3205 }
3206 if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) {
3207 XVA_SET_REQ(xvap, XAT_NODUMP);
3208 }
3209 if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) {
3210 XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
3211 }
3212 if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) {
3213 XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
3214 }
3215
3216 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3217 ASSERT(S_ISREG(ip->i_mode));
3218
3219 zfs_xvattr_set(zp, xvap, tx);
3220 }
3221
3222 if (fuid_dirtied)
3223 zfs_fuid_sync(zfsvfs, tx);
3224
3225 if (mask != 0)
3226 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
3227
3228 mutex_exit(&zp->z_lock);
3229 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
3230 mutex_exit(&zp->z_acl_lock);
3231
3232 if (attrzp) {
3233 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
3234 mutex_exit(&attrzp->z_acl_lock);
3235 mutex_exit(&attrzp->z_lock);
3236 }
3237 out:
3238 if (err == 0 && attrzp) {
3239 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
3240 xattr_count, tx);
3241 ASSERT(err2 == 0);
3242 }
3243
3244 if (aclp)
3245 zfs_acl_free(aclp);
3246
3247 if (fuidp) {
3248 zfs_fuid_info_free(fuidp);
3249 fuidp = NULL;
3250 }
3251
3252 if (err) {
3253 dmu_tx_abort(tx);
3254 if (attrzp)
3255 iput(ZTOI(attrzp));
3256 if (err == ERESTART)
3257 goto top;
3258 } else {
3259 err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
3260 dmu_tx_commit(tx);
3261 if (attrzp)
3262 iput(ZTOI(attrzp));
3263 zfs_inode_update(zp);
3264 }
3265
3266 out2:
3267 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3268 zil_commit(zilog, 0);
3269
3270 out3:
3271 kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * 7);
3272 kmem_free(bulk, sizeof (sa_bulk_attr_t) * 7);
3273 kmem_free(tmpxvattr, sizeof (xvattr_t));
3274 ZFS_EXIT(zfsvfs);
3275 return (err);
3276 }
3277
3278 typedef struct zfs_zlock {
3279 krwlock_t *zl_rwlock; /* lock we acquired */
3280 znode_t *zl_znode; /* znode we held */
3281 struct zfs_zlock *zl_next; /* next in list */
3282 } zfs_zlock_t;
3283
3284 /*
3285 * Drop locks and release vnodes that were held by zfs_rename_lock().
3286 */
3287 static void
3288 zfs_rename_unlock(zfs_zlock_t **zlpp)
3289 {
3290 zfs_zlock_t *zl;
3291
3292 while ((zl = *zlpp) != NULL) {
3293 if (zl->zl_znode != NULL)
3294 zfs_iput_async(ZTOI(zl->zl_znode));
3295 rw_exit(zl->zl_rwlock);
3296 *zlpp = zl->zl_next;
3297 kmem_free(zl, sizeof (*zl));
3298 }
3299 }
3300
3301 /*
3302 * Search back through the directory tree, using the ".." entries.
3303 * Lock each directory in the chain to prevent concurrent renames.
3304 * Fail any attempt to move a directory into one of its own descendants.
3305 * XXX - z_parent_lock can overlap with map or grow locks
3306 */
3307 static int
3308 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
3309 {
3310 zfs_zlock_t *zl;
3311 znode_t *zp = tdzp;
3312 uint64_t rootid = ZTOZSB(zp)->z_root;
3313 uint64_t oidp = zp->z_id;
3314 krwlock_t *rwlp = &szp->z_parent_lock;
3315 krw_t rw = RW_WRITER;
3316
3317 /*
3318 * First pass write-locks szp and compares to zp->z_id.
3319 * Later passes read-lock zp and compare to zp->z_parent.
3320 */
3321 do {
3322 if (!rw_tryenter(rwlp, rw)) {
3323 /*
3324 * Another thread is renaming in this path.
3325 * Note that if we are a WRITER, we don't have any
3326 * parent_locks held yet.
3327 */
3328 if (rw == RW_READER && zp->z_id > szp->z_id) {
3329 /*
3330 * Drop our locks and restart
3331 */
3332 zfs_rename_unlock(&zl);
3333 *zlpp = NULL;
3334 zp = tdzp;
3335 oidp = zp->z_id;
3336 rwlp = &szp->z_parent_lock;
3337 rw = RW_WRITER;
3338 continue;
3339 } else {
3340 /*
3341 * Wait for other thread to drop its locks
3342 */
3343 rw_enter(rwlp, rw);
3344 }
3345 }
3346
3347 zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
3348 zl->zl_rwlock = rwlp;
3349 zl->zl_znode = NULL;
3350 zl->zl_next = *zlpp;
3351 *zlpp = zl;
3352
3353 if (oidp == szp->z_id) /* We're a descendant of szp */
3354 return (SET_ERROR(EINVAL));
3355
3356 if (oidp == rootid) /* We've hit the top */
3357 return (0);
3358
3359 if (rw == RW_READER) { /* i.e. not the first pass */
3360 int error = zfs_zget(ZTOZSB(zp), oidp, &zp);
3361 if (error)
3362 return (error);
3363 zl->zl_znode = zp;
3364 }
3365 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)),
3366 &oidp, sizeof (oidp));
3367 rwlp = &zp->z_parent_lock;
3368 rw = RW_READER;
3369
3370 } while (zp->z_id != sdzp->z_id);
3371
3372 return (0);
3373 }
3374
3375 /*
3376 * Move an entry from the provided source directory to the target
3377 * directory. Change the entry name as indicated.
3378 *
3379 * IN: sdip - Source directory containing the "old entry".
3380 * snm - Old entry name.
3381 * tdip - Target directory to contain the "new entry".
3382 * tnm - New entry name.
3383 * cr - credentials of caller.
3384 * flags - case flags
3385 *
3386 * RETURN: 0 on success, error code on failure.
3387 *
3388 * Timestamps:
3389 * sdip,tdip - ctime|mtime updated
3390 */
3391 /*ARGSUSED*/
3392 int
3393 zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm,
3394 cred_t *cr, int flags)
3395 {
3396 znode_t *tdzp, *szp, *tzp;
3397 znode_t *sdzp = ITOZ(sdip);
3398 zfsvfs_t *zfsvfs = ITOZSB(sdip);
3399 zilog_t *zilog;
3400 zfs_dirlock_t *sdl, *tdl;
3401 dmu_tx_t *tx;
3402 zfs_zlock_t *zl;
3403 int cmp, serr, terr;
3404 int error = 0;
3405 int zflg = 0;
3406 boolean_t waited = B_FALSE;
3407
3408 if (snm == NULL || tnm == NULL)
3409 return (SET_ERROR(EINVAL));
3410
3411 ZFS_ENTER(zfsvfs);
3412 ZFS_VERIFY_ZP(sdzp);
3413 zilog = zfsvfs->z_log;
3414
3415 tdzp = ITOZ(tdip);
3416 ZFS_VERIFY_ZP(tdzp);
3417
3418 /*
3419 * We check i_sb because snapshots and the ctldir must have different
3420 * super blocks.
3421 */
3422 if (tdip->i_sb != sdip->i_sb || zfsctl_is_node(tdip)) {
3423 ZFS_EXIT(zfsvfs);
3424 return (SET_ERROR(EXDEV));
3425 }
3426
3427 if (zfsvfs->z_utf8 && u8_validate(tnm,
3428 strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3429 ZFS_EXIT(zfsvfs);
3430 return (SET_ERROR(EILSEQ));
3431 }
3432
3433 if (flags & FIGNORECASE)
3434 zflg |= ZCILOOK;
3435
3436 top:
3437 szp = NULL;
3438 tzp = NULL;
3439 zl = NULL;
3440
3441 /*
3442 * This is to prevent the creation of links into attribute space
3443 * by renaming a linked file into/outof an attribute directory.
3444 * See the comment in zfs_link() for why this is considered bad.
3445 */
3446 if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
3447 ZFS_EXIT(zfsvfs);
3448 return (SET_ERROR(EINVAL));
3449 }
3450
3451 /*
3452 * Lock source and target directory entries. To prevent deadlock,
3453 * a lock ordering must be defined. We lock the directory with
3454 * the smallest object id first, or if it's a tie, the one with
3455 * the lexically first name.
3456 */
3457 if (sdzp->z_id < tdzp->z_id) {
3458 cmp = -1;
3459 } else if (sdzp->z_id > tdzp->z_id) {
3460 cmp = 1;
3461 } else {
3462 /*
3463 * First compare the two name arguments without
3464 * considering any case folding.
3465 */
3466 int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
3467
3468 cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
3469 ASSERT(error == 0 || !zfsvfs->z_utf8);
3470 if (cmp == 0) {
3471 /*
3472 * POSIX: "If the old argument and the new argument
3473 * both refer to links to the same existing file,
3474 * the rename() function shall return successfully
3475 * and perform no other action."
3476 */
3477 ZFS_EXIT(zfsvfs);
3478 return (0);
3479 }
3480 /*
3481 * If the file system is case-folding, then we may
3482 * have some more checking to do. A case-folding file
3483 * system is either supporting mixed case sensitivity
3484 * access or is completely case-insensitive. Note
3485 * that the file system is always case preserving.
3486 *
3487 * In mixed sensitivity mode case sensitive behavior
3488 * is the default. FIGNORECASE must be used to
3489 * explicitly request case insensitive behavior.
3490 *
3491 * If the source and target names provided differ only
3492 * by case (e.g., a request to rename 'tim' to 'Tim'),
3493 * we will treat this as a special case in the
3494 * case-insensitive mode: as long as the source name
3495 * is an exact match, we will allow this to proceed as
3496 * a name-change request.
3497 */
3498 if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
3499 (zfsvfs->z_case == ZFS_CASE_MIXED &&
3500 flags & FIGNORECASE)) &&
3501 u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
3502 &error) == 0) {
3503 /*
3504 * case preserving rename request, require exact
3505 * name matches
3506 */
3507 zflg |= ZCIEXACT;
3508 zflg &= ~ZCILOOK;
3509 }
3510 }
3511
3512 /*
3513 * If the source and destination directories are the same, we should
3514 * grab the z_name_lock of that directory only once.
3515 */
3516 if (sdzp == tdzp) {
3517 zflg |= ZHAVELOCK;
3518 rw_enter(&sdzp->z_name_lock, RW_READER);
3519 }
3520
3521 if (cmp < 0) {
3522 serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
3523 ZEXISTS | zflg, NULL, NULL);
3524 terr = zfs_dirent_lock(&tdl,
3525 tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
3526 } else {
3527 terr = zfs_dirent_lock(&tdl,
3528 tdzp, tnm, &tzp, zflg, NULL, NULL);
3529 serr = zfs_dirent_lock(&sdl,
3530 sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
3531 NULL, NULL);
3532 }
3533
3534 if (serr) {
3535 /*
3536 * Source entry invalid or not there.
3537 */
3538 if (!terr) {
3539 zfs_dirent_unlock(tdl);
3540 if (tzp)
3541 iput(ZTOI(tzp));
3542 }
3543
3544 if (sdzp == tdzp)
3545 rw_exit(&sdzp->z_name_lock);
3546
3547 if (strcmp(snm, "..") == 0)
3548 serr = EINVAL;
3549 ZFS_EXIT(zfsvfs);
3550 return (serr);
3551 }
3552 if (terr) {
3553 zfs_dirent_unlock(sdl);
3554 iput(ZTOI(szp));
3555
3556 if (sdzp == tdzp)
3557 rw_exit(&sdzp->z_name_lock);
3558
3559 if (strcmp(tnm, "..") == 0)
3560 terr = EINVAL;
3561 ZFS_EXIT(zfsvfs);
3562 return (terr);
3563 }
3564
3565 /*
3566 * Must have write access at the source to remove the old entry
3567 * and write access at the target to create the new entry.
3568 * Note that if target and source are the same, this can be
3569 * done in a single check.
3570 */
3571
3572 if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)))
3573 goto out;
3574
3575 if (S_ISDIR(ZTOI(szp)->i_mode)) {
3576 /*
3577 * Check to make sure rename is valid.
3578 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
3579 */
3580 if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl)))
3581 goto out;
3582 }
3583
3584 /*
3585 * Does target exist?
3586 */
3587 if (tzp) {
3588 /*
3589 * Source and target must be the same type.
3590 */
3591 if (S_ISDIR(ZTOI(szp)->i_mode)) {
3592 if (!S_ISDIR(ZTOI(tzp)->i_mode)) {
3593 error = SET_ERROR(ENOTDIR);
3594 goto out;
3595 }
3596 } else {
3597 if (S_ISDIR(ZTOI(tzp)->i_mode)) {
3598 error = SET_ERROR(EISDIR);
3599 goto out;
3600 }
3601 }
3602 /*
3603 * POSIX dictates that when the source and target
3604 * entries refer to the same file object, rename
3605 * must do nothing and exit without error.
3606 */
3607 if (szp->z_id == tzp->z_id) {
3608 error = 0;
3609 goto out;
3610 }
3611 }
3612
3613 tx = dmu_tx_create(zfsvfs->z_os);
3614 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3615 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
3616 dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
3617 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
3618 if (sdzp != tdzp) {
3619 dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
3620 zfs_sa_upgrade_txholds(tx, tdzp);
3621 }
3622 if (tzp) {
3623 dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
3624 zfs_sa_upgrade_txholds(tx, tzp);
3625 }
3626
3627 zfs_sa_upgrade_txholds(tx, szp);
3628 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3629 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
3630 if (error) {
3631 if (zl != NULL)
3632 zfs_rename_unlock(&zl);
3633 zfs_dirent_unlock(sdl);
3634 zfs_dirent_unlock(tdl);
3635
3636 if (sdzp == tdzp)
3637 rw_exit(&sdzp->z_name_lock);
3638
3639 if (error == ERESTART) {
3640 waited = B_TRUE;
3641 dmu_tx_wait(tx);
3642 dmu_tx_abort(tx);
3643 iput(ZTOI(szp));
3644 if (tzp)
3645 iput(ZTOI(tzp));
3646 goto top;
3647 }
3648 dmu_tx_abort(tx);
3649 iput(ZTOI(szp));
3650 if (tzp)
3651 iput(ZTOI(tzp));
3652 ZFS_EXIT(zfsvfs);
3653 return (error);
3654 }
3655
3656 if (tzp) /* Attempt to remove the existing target */
3657 error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
3658
3659 if (error == 0) {
3660 error = zfs_link_create(tdl, szp, tx, ZRENAMING);
3661 if (error == 0) {
3662 szp->z_pflags |= ZFS_AV_MODIFIED;
3663
3664 error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
3665 (void *)&szp->z_pflags, sizeof (uint64_t), tx);
3666 ASSERT0(error);
3667
3668 error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
3669 if (error == 0) {
3670 zfs_log_rename(zilog, tx, TX_RENAME |
3671 (flags & FIGNORECASE ? TX_CI : 0), sdzp,
3672 sdl->dl_name, tdzp, tdl->dl_name, szp);
3673 } else {
3674 /*
3675 * At this point, we have successfully created
3676 * the target name, but have failed to remove
3677 * the source name. Since the create was done
3678 * with the ZRENAMING flag, there are
3679 * complications; for one, the link count is
3680 * wrong. The easiest way to deal with this
3681 * is to remove the newly created target, and
3682 * return the original error. This must
3683 * succeed; fortunately, it is very unlikely to
3684 * fail, since we just created it.
3685 */
3686 VERIFY3U(zfs_link_destroy(tdl, szp, tx,
3687 ZRENAMING, NULL), ==, 0);
3688 }
3689 }
3690 }
3691
3692 dmu_tx_commit(tx);
3693 out:
3694 if (zl != NULL)
3695 zfs_rename_unlock(&zl);
3696
3697 zfs_dirent_unlock(sdl);
3698 zfs_dirent_unlock(tdl);
3699
3700 zfs_inode_update(sdzp);
3701 if (sdzp == tdzp)
3702 rw_exit(&sdzp->z_name_lock);
3703
3704 if (sdzp != tdzp)
3705 zfs_inode_update(tdzp);
3706
3707 zfs_inode_update(szp);
3708 iput(ZTOI(szp));
3709 if (tzp) {
3710 zfs_inode_update(tzp);
3711 iput(ZTOI(tzp));
3712 }
3713
3714 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3715 zil_commit(zilog, 0);
3716
3717 ZFS_EXIT(zfsvfs);
3718 return (error);
3719 }
3720
3721 /*
3722 * Insert the indicated symbolic reference entry into the directory.
3723 *
3724 * IN: dip - Directory to contain new symbolic link.
3725 * link - Name for new symlink entry.
3726 * vap - Attributes of new entry.
3727 * target - Target path of new symlink.
3728 *
3729 * cr - credentials of caller.
3730 * flags - case flags
3731 *
3732 * RETURN: 0 on success, error code on failure.
3733 *
3734 * Timestamps:
3735 * dip - ctime|mtime updated
3736 */
3737 /*ARGSUSED*/
3738 int
3739 zfs_symlink(struct inode *dip, char *name, vattr_t *vap, char *link,
3740 struct inode **ipp, cred_t *cr, int flags)
3741 {
3742 znode_t *zp, *dzp = ITOZ(dip);
3743 zfs_dirlock_t *dl;
3744 dmu_tx_t *tx;
3745 zfsvfs_t *zfsvfs = ITOZSB(dip);
3746 zilog_t *zilog;
3747 uint64_t len = strlen(link);
3748 int error;
3749 int zflg = ZNEW;
3750 zfs_acl_ids_t acl_ids;
3751 boolean_t fuid_dirtied;
3752 uint64_t txtype = TX_SYMLINK;
3753 boolean_t waited = B_FALSE;
3754
3755 ASSERT(S_ISLNK(vap->va_mode));
3756
3757 if (name == NULL)
3758 return (SET_ERROR(EINVAL));
3759
3760 ZFS_ENTER(zfsvfs);
3761 ZFS_VERIFY_ZP(dzp);
3762 zilog = zfsvfs->z_log;
3763
3764 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
3765 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3766 ZFS_EXIT(zfsvfs);
3767 return (SET_ERROR(EILSEQ));
3768 }
3769 if (flags & FIGNORECASE)
3770 zflg |= ZCILOOK;
3771
3772 if (len > MAXPATHLEN) {
3773 ZFS_EXIT(zfsvfs);
3774 return (SET_ERROR(ENAMETOOLONG));
3775 }
3776
3777 if ((error = zfs_acl_ids_create(dzp, 0,
3778 vap, cr, NULL, &acl_ids)) != 0) {
3779 ZFS_EXIT(zfsvfs);
3780 return (error);
3781 }
3782 top:
3783 *ipp = NULL;
3784
3785 /*
3786 * Attempt to lock directory; fail if entry already exists.
3787 */
3788 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
3789 if (error) {
3790 zfs_acl_ids_free(&acl_ids);
3791 ZFS_EXIT(zfsvfs);
3792 return (error);
3793 }
3794
3795 if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
3796 zfs_acl_ids_free(&acl_ids);
3797 zfs_dirent_unlock(dl);
3798 ZFS_EXIT(zfsvfs);
3799 return (error);
3800 }
3801
3802 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
3803 zfs_acl_ids_free(&acl_ids);
3804 zfs_dirent_unlock(dl);
3805 ZFS_EXIT(zfsvfs);
3806 return (SET_ERROR(EDQUOT));
3807 }
3808 tx = dmu_tx_create(zfsvfs->z_os);
3809 fuid_dirtied = zfsvfs->z_fuid_dirty;
3810 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
3811 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
3812 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
3813 ZFS_SA_BASE_ATTR_SIZE + len);
3814 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
3815 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3816 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
3817 acl_ids.z_aclp->z_acl_bytes);
3818 }
3819 if (fuid_dirtied)
3820 zfs_fuid_txhold(zfsvfs, tx);
3821 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
3822 if (error) {
3823 zfs_dirent_unlock(dl);
3824 if (error == ERESTART) {
3825 waited = B_TRUE;
3826 dmu_tx_wait(tx);
3827 dmu_tx_abort(tx);
3828 goto top;
3829 }
3830 zfs_acl_ids_free(&acl_ids);
3831 dmu_tx_abort(tx);
3832 ZFS_EXIT(zfsvfs);
3833 return (error);
3834 }
3835
3836 /*
3837 * Create a new object for the symlink.
3838 * for version 4 ZPL datsets the symlink will be an SA attribute
3839 */
3840 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
3841
3842 if (fuid_dirtied)
3843 zfs_fuid_sync(zfsvfs, tx);
3844
3845 mutex_enter(&zp->z_lock);
3846 if (zp->z_is_sa)
3847 error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
3848 link, len, tx);
3849 else
3850 zfs_sa_symlink(zp, link, len, tx);
3851 mutex_exit(&zp->z_lock);
3852
3853 zp->z_size = len;
3854 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
3855 &zp->z_size, sizeof (zp->z_size), tx);
3856 /*
3857 * Insert the new object into the directory.
3858 */
3859 (void) zfs_link_create(dl, zp, tx, ZNEW);
3860
3861 if (flags & FIGNORECASE)
3862 txtype |= TX_CI;
3863 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
3864
3865 zfs_inode_update(dzp);
3866 zfs_inode_update(zp);
3867
3868 zfs_acl_ids_free(&acl_ids);
3869
3870 dmu_tx_commit(tx);
3871
3872 zfs_dirent_unlock(dl);
3873
3874 *ipp = ZTOI(zp);
3875
3876 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3877 zil_commit(zilog, 0);
3878
3879 ZFS_EXIT(zfsvfs);
3880 return (error);
3881 }
3882
3883 /*
3884 * Return, in the buffer contained in the provided uio structure,
3885 * the symbolic path referred to by ip.
3886 *
3887 * IN: ip - inode of symbolic link
3888 * uio - structure to contain the link path.
3889 * cr - credentials of caller.
3890 *
3891 * RETURN: 0 if success
3892 * error code if failure
3893 *
3894 * Timestamps:
3895 * ip - atime updated
3896 */
3897 /* ARGSUSED */
3898 int
3899 zfs_readlink(struct inode *ip, uio_t *uio, cred_t *cr)
3900 {
3901 znode_t *zp = ITOZ(ip);
3902 zfsvfs_t *zfsvfs = ITOZSB(ip);
3903 int error;
3904
3905 ZFS_ENTER(zfsvfs);
3906 ZFS_VERIFY_ZP(zp);
3907
3908 mutex_enter(&zp->z_lock);
3909 if (zp->z_is_sa)
3910 error = sa_lookup_uio(zp->z_sa_hdl,
3911 SA_ZPL_SYMLINK(zfsvfs), uio);
3912 else
3913 error = zfs_sa_readlink(zp, uio);
3914 mutex_exit(&zp->z_lock);
3915
3916 ZFS_EXIT(zfsvfs);
3917 return (error);
3918 }
3919
3920 /*
3921 * Insert a new entry into directory tdip referencing sip.
3922 *
3923 * IN: tdip - Directory to contain new entry.
3924 * sip - inode of new entry.
3925 * name - name of new entry.
3926 * cr - credentials of caller.
3927 *
3928 * RETURN: 0 if success
3929 * error code if failure
3930 *
3931 * Timestamps:
3932 * tdip - ctime|mtime updated
3933 * sip - ctime updated
3934 */
3935 /* ARGSUSED */
3936 int
3937 zfs_link(struct inode *tdip, struct inode *sip, char *name, cred_t *cr,
3938 int flags)
3939 {
3940 znode_t *dzp = ITOZ(tdip);
3941 znode_t *tzp, *szp;
3942 zfsvfs_t *zfsvfs = ITOZSB(tdip);
3943 zilog_t *zilog;
3944 zfs_dirlock_t *dl;
3945 dmu_tx_t *tx;
3946 int error;
3947 int zf = ZNEW;
3948 uint64_t parent;
3949 uid_t owner;
3950 boolean_t waited = B_FALSE;
3951 boolean_t is_tmpfile = 0;
3952 uint64_t txg;
3953 #ifdef HAVE_TMPFILE
3954 is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE));
3955 #endif
3956 ASSERT(S_ISDIR(tdip->i_mode));
3957
3958 if (name == NULL)
3959 return (SET_ERROR(EINVAL));
3960
3961 ZFS_ENTER(zfsvfs);
3962 ZFS_VERIFY_ZP(dzp);
3963 zilog = zfsvfs->z_log;
3964
3965 /*
3966 * POSIX dictates that we return EPERM here.
3967 * Better choices include ENOTSUP or EISDIR.
3968 */
3969 if (S_ISDIR(sip->i_mode)) {
3970 ZFS_EXIT(zfsvfs);
3971 return (SET_ERROR(EPERM));
3972 }
3973
3974 szp = ITOZ(sip);
3975 ZFS_VERIFY_ZP(szp);
3976
3977 /*
3978 * We check i_sb because snapshots and the ctldir must have different
3979 * super blocks.
3980 */
3981 if (sip->i_sb != tdip->i_sb || zfsctl_is_node(sip)) {
3982 ZFS_EXIT(zfsvfs);
3983 return (SET_ERROR(EXDEV));
3984 }
3985
3986 /* Prevent links to .zfs/shares files */
3987
3988 if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
3989 &parent, sizeof (uint64_t))) != 0) {
3990 ZFS_EXIT(zfsvfs);
3991 return (error);
3992 }
3993 if (parent == zfsvfs->z_shares_dir) {
3994 ZFS_EXIT(zfsvfs);
3995 return (SET_ERROR(EPERM));
3996 }
3997
3998 if (zfsvfs->z_utf8 && u8_validate(name,
3999 strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4000 ZFS_EXIT(zfsvfs);
4001 return (SET_ERROR(EILSEQ));
4002 }
4003 if (flags & FIGNORECASE)
4004 zf |= ZCILOOK;
4005
4006 /*
4007 * We do not support links between attributes and non-attributes
4008 * because of the potential security risk of creating links
4009 * into "normal" file space in order to circumvent restrictions
4010 * imposed in attribute space.
4011 */
4012 if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
4013 ZFS_EXIT(zfsvfs);
4014 return (SET_ERROR(EINVAL));
4015 }
4016
4017 owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid),
4018 cr, ZFS_OWNER);
4019 if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
4020 ZFS_EXIT(zfsvfs);
4021 return (SET_ERROR(EPERM));
4022 }
4023
4024 if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
4025 ZFS_EXIT(zfsvfs);
4026 return (error);
4027 }
4028
4029 top:
4030 /*
4031 * Attempt to lock directory; fail if entry already exists.
4032 */
4033 error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
4034 if (error) {
4035 ZFS_EXIT(zfsvfs);
4036 return (error);
4037 }
4038
4039 tx = dmu_tx_create(zfsvfs->z_os);
4040 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
4041 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4042 if (is_tmpfile)
4043 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
4044
4045 zfs_sa_upgrade_txholds(tx, szp);
4046 zfs_sa_upgrade_txholds(tx, dzp);
4047 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
4048 if (error) {
4049 zfs_dirent_unlock(dl);
4050 if (error == ERESTART) {
4051 waited = B_TRUE;
4052 dmu_tx_wait(tx);
4053 dmu_tx_abort(tx);
4054 goto top;
4055 }
4056 dmu_tx_abort(tx);
4057 ZFS_EXIT(zfsvfs);
4058 return (error);
4059 }
4060 /* unmark z_unlinked so zfs_link_create will not reject */
4061 if (is_tmpfile)
4062 szp->z_unlinked = 0;
4063 error = zfs_link_create(dl, szp, tx, 0);
4064
4065 if (error == 0) {
4066 uint64_t txtype = TX_LINK;
4067 /*
4068 * tmpfile is created to be in z_unlinkedobj, so remove it.
4069 * Also, we don't log in ZIL, be cause all previous file
4070 * operation on the tmpfile are ignored by ZIL. Instead we
4071 * always wait for txg to sync to make sure all previous
4072 * operation are sync safe.
4073 */
4074 if (is_tmpfile) {
4075 VERIFY(zap_remove_int(zfsvfs->z_os,
4076 zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0);
4077 } else {
4078 if (flags & FIGNORECASE)
4079 txtype |= TX_CI;
4080 zfs_log_link(zilog, tx, txtype, dzp, szp, name);
4081 }
4082 } else if (is_tmpfile) {
4083 /* restore z_unlinked since when linking failed */
4084 szp->z_unlinked = 1;
4085 }
4086 txg = dmu_tx_get_txg(tx);
4087 dmu_tx_commit(tx);
4088
4089 zfs_dirent_unlock(dl);
4090
4091 if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4092 zil_commit(zilog, 0);
4093
4094 if (is_tmpfile)
4095 txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), txg);
4096
4097 zfs_inode_update(dzp);
4098 zfs_inode_update(szp);
4099 ZFS_EXIT(zfsvfs);
4100 return (error);
4101 }
4102
4103 static void
4104 zfs_putpage_commit_cb(void *arg)
4105 {
4106 struct page *pp = arg;
4107
4108 ClearPageError(pp);
4109 end_page_writeback(pp);
4110 }
4111
4112 /*
4113 * Push a page out to disk, once the page is on stable storage the
4114 * registered commit callback will be run as notification of completion.
4115 *
4116 * IN: ip - page mapped for inode.
4117 * pp - page to push (page is locked)
4118 * wbc - writeback control data
4119 *
4120 * RETURN: 0 if success
4121 * error code if failure
4122 *
4123 * Timestamps:
4124 * ip - ctime|mtime updated
4125 */
4126 /* ARGSUSED */
4127 int
4128 zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
4129 {
4130 znode_t *zp = ITOZ(ip);
4131 zfsvfs_t *zfsvfs = ITOZSB(ip);
4132 loff_t offset;
4133 loff_t pgoff;
4134 unsigned int pglen;
4135 rl_t *rl;
4136 dmu_tx_t *tx;
4137 caddr_t va;
4138 int err = 0;
4139 uint64_t mtime[2], ctime[2];
4140 sa_bulk_attr_t bulk[3];
4141 int cnt = 0;
4142 struct address_space *mapping;
4143
4144 ZFS_ENTER(zfsvfs);
4145 ZFS_VERIFY_ZP(zp);
4146
4147 ASSERT(PageLocked(pp));
4148
4149 pgoff = page_offset(pp); /* Page byte-offset in file */
4150 offset = i_size_read(ip); /* File length in bytes */
4151 pglen = MIN(PAGE_SIZE, /* Page length in bytes */
4152 P2ROUNDUP(offset, PAGE_SIZE)-pgoff);
4153
4154 /* Page is beyond end of file */
4155 if (pgoff >= offset) {
4156 unlock_page(pp);
4157 ZFS_EXIT(zfsvfs);
4158 return (0);
4159 }
4160
4161 /* Truncate page length to end of file */
4162 if (pgoff + pglen > offset)
4163 pglen = offset - pgoff;
4164
4165 #if 0
4166 /*
4167 * FIXME: Allow mmap writes past its quota. The correct fix
4168 * is to register a page_mkwrite() handler to count the page
4169 * against its quota when it is about to be dirtied.
4170 */
4171 if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
4172 zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
4173 err = EDQUOT;
4174 }
4175 #endif
4176
4177 /*
4178 * The ordering here is critical and must adhere to the following
4179 * rules in order to avoid deadlocking in either zfs_read() or
4180 * zfs_free_range() due to a lock inversion.
4181 *
4182 * 1) The page must be unlocked prior to acquiring the range lock.
4183 * This is critical because zfs_read() calls find_lock_page()
4184 * which may block on the page lock while holding the range lock.
4185 *
4186 * 2) Before setting or clearing write back on a page the range lock
4187 * must be held in order to prevent a lock inversion with the
4188 * zfs_free_range() function.
4189 *
4190 * This presents a problem because upon entering this function the
4191 * page lock is already held. To safely acquire the range lock the
4192 * page lock must be dropped. This creates a window where another
4193 * process could truncate, invalidate, dirty, or write out the page.
4194 *
4195 * Therefore, after successfully reacquiring the range and page locks
4196 * the current page state is checked. In the common case everything
4197 * will be as is expected and it can be written out. However, if
4198 * the page state has changed it must be handled accordingly.
4199 */
4200 mapping = pp->mapping;
4201 redirty_page_for_writepage(wbc, pp);
4202 unlock_page(pp);
4203
4204 rl = zfs_range_lock(&zp->z_range_lock, pgoff, pglen, RL_WRITER);
4205 lock_page(pp);
4206
4207 /* Page mapping changed or it was no longer dirty, we're done */
4208 if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) {
4209 unlock_page(pp);
4210 zfs_range_unlock(rl);
4211 ZFS_EXIT(zfsvfs);
4212 return (0);
4213 }
4214
4215 /* Another process started write block if required */
4216 if (PageWriteback(pp)) {
4217 unlock_page(pp);
4218 zfs_range_unlock(rl);
4219
4220 if (wbc->sync_mode != WB_SYNC_NONE)
4221 wait_on_page_writeback(pp);
4222
4223 ZFS_EXIT(zfsvfs);
4224 return (0);
4225 }
4226
4227 /* Clear the dirty flag the required locks are held */
4228 if (!clear_page_dirty_for_io(pp)) {
4229 unlock_page(pp);
4230 zfs_range_unlock(rl);
4231 ZFS_EXIT(zfsvfs);
4232 return (0);
4233 }
4234
4235 /*
4236 * Counterpart for redirty_page_for_writepage() above. This page
4237 * was in fact not skipped and should not be counted as if it were.
4238 */
4239 wbc->pages_skipped--;
4240 set_page_writeback(pp);
4241 unlock_page(pp);
4242
4243 tx = dmu_tx_create(zfsvfs->z_os);
4244 dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen);
4245 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4246 zfs_sa_upgrade_txholds(tx, zp);
4247
4248 err = dmu_tx_assign(tx, TXG_NOWAIT);
4249 if (err != 0) {
4250 if (err == ERESTART)
4251 dmu_tx_wait(tx);
4252
4253 dmu_tx_abort(tx);
4254 __set_page_dirty_nobuffers(pp);
4255 ClearPageError(pp);
4256 end_page_writeback(pp);
4257 zfs_range_unlock(rl);
4258 ZFS_EXIT(zfsvfs);
4259 return (err);
4260 }
4261
4262 va = kmap(pp);
4263 ASSERT3U(pglen, <=, PAGE_SIZE);
4264 dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx);
4265 kunmap(pp);
4266
4267 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
4268 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
4269 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL,
4270 &zp->z_pflags, 8);
4271
4272 /* Preserve the mtime and ctime provided by the inode */
4273 ZFS_TIME_ENCODE(&ip->i_mtime, mtime);
4274 ZFS_TIME_ENCODE(&ip->i_ctime, ctime);
4275 zp->z_atime_dirty = 0;
4276 zp->z_seq++;
4277
4278 err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
4279
4280 zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0,
4281 zfs_putpage_commit_cb, pp);
4282 dmu_tx_commit(tx);
4283
4284 zfs_range_unlock(rl);
4285
4286 if (wbc->sync_mode != WB_SYNC_NONE) {
4287 /*
4288 * Note that this is rarely called under writepages(), because
4289 * writepages() normally handles the entire commit for
4290 * performance reasons.
4291 */
4292 zil_commit(zfsvfs->z_log, zp->z_id);
4293 }
4294
4295 ZFS_EXIT(zfsvfs);
4296 return (err);
4297 }
4298
4299 /*
4300 * Update the system attributes when the inode has been dirtied. For the
4301 * moment we only update the mode, atime, mtime, and ctime.
4302 */
4303 int
4304 zfs_dirty_inode(struct inode *ip, int flags)
4305 {
4306 znode_t *zp = ITOZ(ip);
4307 zfsvfs_t *zfsvfs = ITOZSB(ip);
4308 dmu_tx_t *tx;
4309 uint64_t mode, atime[2], mtime[2], ctime[2];
4310 sa_bulk_attr_t bulk[4];
4311 int error = 0;
4312 int cnt = 0;
4313
4314 if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
4315 return (0);
4316
4317 ZFS_ENTER(zfsvfs);
4318 ZFS_VERIFY_ZP(zp);
4319
4320 #ifdef I_DIRTY_TIME
4321 /*
4322 * This is the lazytime semantic indroduced in Linux 4.0
4323 * This flag will only be called from update_time when lazytime is set.
4324 * (Note, I_DIRTY_SYNC will also set if not lazytime)
4325 * Fortunately mtime and ctime are managed within ZFS itself, so we
4326 * only need to dirty atime.
4327 */
4328 if (flags == I_DIRTY_TIME) {
4329 zp->z_atime_dirty = 1;
4330 goto out;
4331 }
4332 #endif
4333
4334 tx = dmu_tx_create(zfsvfs->z_os);
4335
4336 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4337 zfs_sa_upgrade_txholds(tx, zp);
4338
4339 error = dmu_tx_assign(tx, TXG_WAIT);
4340 if (error) {
4341 dmu_tx_abort(tx);
4342 goto out;
4343 }
4344
4345 mutex_enter(&zp->z_lock);
4346 zp->z_atime_dirty = 0;
4347
4348 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
4349 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
4350 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
4351 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
4352
4353 /* Preserve the mode, mtime and ctime provided by the inode */
4354 ZFS_TIME_ENCODE(&ip->i_atime, atime);
4355 ZFS_TIME_ENCODE(&ip->i_mtime, mtime);
4356 ZFS_TIME_ENCODE(&ip->i_ctime, ctime);
4357 mode = ip->i_mode;
4358
4359 zp->z_mode = mode;
4360
4361 error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
4362 mutex_exit(&zp->z_lock);
4363
4364 dmu_tx_commit(tx);
4365 out:
4366 ZFS_EXIT(zfsvfs);
4367 return (error);
4368 }
4369
4370 /*ARGSUSED*/
4371 void
4372 zfs_inactive(struct inode *ip)
4373 {
4374 znode_t *zp = ITOZ(ip);
4375 zfsvfs_t *zfsvfs = ITOZSB(ip);
4376 uint64_t atime[2];
4377 int error;
4378 int need_unlock = 0;
4379
4380 /* Only read lock if we haven't already write locked, e.g. rollback */
4381 if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) {
4382 need_unlock = 1;
4383 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
4384 }
4385 if (zp->z_sa_hdl == NULL) {
4386 if (need_unlock)
4387 rw_exit(&zfsvfs->z_teardown_inactive_lock);
4388 return;
4389 }
4390
4391 if (zp->z_atime_dirty && zp->z_unlinked == 0) {
4392 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
4393
4394 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4395 zfs_sa_upgrade_txholds(tx, zp);
4396 error = dmu_tx_assign(tx, TXG_WAIT);
4397 if (error) {
4398 dmu_tx_abort(tx);
4399 } else {
4400 ZFS_TIME_ENCODE(&ip->i_atime, atime);
4401 mutex_enter(&zp->z_lock);
4402 (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
4403 (void *)&atime, sizeof (atime), tx);
4404 zp->z_atime_dirty = 0;
4405 mutex_exit(&zp->z_lock);
4406 dmu_tx_commit(tx);
4407 }
4408 }
4409
4410 zfs_zinactive(zp);
4411 if (need_unlock)
4412 rw_exit(&zfsvfs->z_teardown_inactive_lock);
4413 }
4414
4415 /*
4416 * Bounds-check the seek operation.
4417 *
4418 * IN: ip - inode seeking within
4419 * ooff - old file offset
4420 * noffp - pointer to new file offset
4421 * ct - caller context
4422 *
4423 * RETURN: 0 if success
4424 * EINVAL if new offset invalid
4425 */
4426 /* ARGSUSED */
4427 int
4428 zfs_seek(struct inode *ip, offset_t ooff, offset_t *noffp)
4429 {
4430 if (S_ISDIR(ip->i_mode))
4431 return (0);
4432 return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
4433 }
4434
4435 /*
4436 * Fill pages with data from the disk.
4437 */
4438 static int
4439 zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages)
4440 {
4441 znode_t *zp = ITOZ(ip);
4442 zfsvfs_t *zfsvfs = ITOZSB(ip);
4443 objset_t *os;
4444 struct page *cur_pp;
4445 u_offset_t io_off, total;
4446 size_t io_len;
4447 loff_t i_size;
4448 unsigned page_idx;
4449 int err;
4450
4451 os = zfsvfs->z_os;
4452 io_len = nr_pages << PAGE_SHIFT;
4453 i_size = i_size_read(ip);
4454 io_off = page_offset(pl[0]);
4455
4456 if (io_off + io_len > i_size)
4457 io_len = i_size - io_off;
4458
4459 /*
4460 * Iterate over list of pages and read each page individually.
4461 */
4462 page_idx = 0;
4463 for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
4464 caddr_t va;
4465
4466 cur_pp = pl[page_idx++];
4467 va = kmap(cur_pp);
4468 err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
4469 DMU_READ_PREFETCH);
4470 kunmap(cur_pp);
4471 if (err) {
4472 /* convert checksum errors into IO errors */
4473 if (err == ECKSUM)
4474 err = SET_ERROR(EIO);
4475 return (err);
4476 }
4477 }
4478
4479 return (0);
4480 }
4481
4482 /*
4483 * Uses zfs_fillpage to read data from the file and fill the pages.
4484 *
4485 * IN: ip - inode of file to get data from.
4486 * pl - list of pages to read
4487 * nr_pages - number of pages to read
4488 *
4489 * RETURN: 0 on success, error code on failure.
4490 *
4491 * Timestamps:
4492 * vp - atime updated
4493 */
4494 /* ARGSUSED */
4495 int
4496 zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages)
4497 {
4498 znode_t *zp = ITOZ(ip);
4499 zfsvfs_t *zfsvfs = ITOZSB(ip);
4500 int err;
4501
4502 if (pl == NULL)
4503 return (0);
4504
4505 ZFS_ENTER(zfsvfs);
4506 ZFS_VERIFY_ZP(zp);
4507
4508 err = zfs_fillpage(ip, pl, nr_pages);
4509
4510 ZFS_EXIT(zfsvfs);
4511 return (err);
4512 }
4513
4514 /*
4515 * Check ZFS specific permissions to memory map a section of a file.
4516 *
4517 * IN: ip - inode of the file to mmap
4518 * off - file offset
4519 * addrp - start address in memory region
4520 * len - length of memory region
4521 * vm_flags- address flags
4522 *
4523 * RETURN: 0 if success
4524 * error code if failure
4525 */
4526 /*ARGSUSED*/
4527 int
4528 zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len,
4529 unsigned long vm_flags)
4530 {
4531 znode_t *zp = ITOZ(ip);
4532 zfsvfs_t *zfsvfs = ITOZSB(ip);
4533
4534 ZFS_ENTER(zfsvfs);
4535 ZFS_VERIFY_ZP(zp);
4536
4537 if ((vm_flags & VM_WRITE) && (zp->z_pflags &
4538 (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
4539 ZFS_EXIT(zfsvfs);
4540 return (SET_ERROR(EPERM));
4541 }
4542
4543 if ((vm_flags & (VM_READ | VM_EXEC)) &&
4544 (zp->z_pflags & ZFS_AV_QUARANTINED)) {
4545 ZFS_EXIT(zfsvfs);
4546 return (SET_ERROR(EACCES));
4547 }
4548
4549 if (off < 0 || len > MAXOFFSET_T - off) {
4550 ZFS_EXIT(zfsvfs);
4551 return (SET_ERROR(ENXIO));
4552 }
4553
4554 ZFS_EXIT(zfsvfs);
4555 return (0);
4556 }
4557
4558 /*
4559 * convoff - converts the given data (start, whence) to the
4560 * given whence.
4561 */
4562 int
4563 convoff(struct inode *ip, flock64_t *lckdat, int whence, offset_t offset)
4564 {
4565 vattr_t vap;
4566 int error;
4567
4568 if ((lckdat->l_whence == 2) || (whence == 2)) {
4569 if ((error = zfs_getattr(ip, &vap, 0, CRED())))
4570 return (error);
4571 }
4572
4573 switch (lckdat->l_whence) {
4574 case 1:
4575 lckdat->l_start += offset;
4576 break;
4577 case 2:
4578 lckdat->l_start += vap.va_size;
4579 /* FALLTHRU */
4580 case 0:
4581 break;
4582 default:
4583 return (SET_ERROR(EINVAL));
4584 }
4585
4586 if (lckdat->l_start < 0)
4587 return (SET_ERROR(EINVAL));
4588
4589 switch (whence) {
4590 case 1:
4591 lckdat->l_start -= offset;
4592 break;
4593 case 2:
4594 lckdat->l_start -= vap.va_size;
4595 /* FALLTHRU */
4596 case 0:
4597 break;
4598 default:
4599 return (SET_ERROR(EINVAL));
4600 }
4601
4602 lckdat->l_whence = (short)whence;
4603 return (0);
4604 }
4605
4606 /*
4607 * Free or allocate space in a file. Currently, this function only
4608 * supports the `F_FREESP' command. However, this command is somewhat
4609 * misnamed, as its functionality includes the ability to allocate as
4610 * well as free space.
4611 *
4612 * IN: ip - inode of file to free data in.
4613 * cmd - action to take (only F_FREESP supported).
4614 * bfp - section of file to free/alloc.
4615 * flag - current file open mode flags.
4616 * offset - current file offset.
4617 * cr - credentials of caller [UNUSED].
4618 *
4619 * RETURN: 0 on success, error code on failure.
4620 *
4621 * Timestamps:
4622 * ip - ctime|mtime updated
4623 */
4624 /* ARGSUSED */
4625 int
4626 zfs_space(struct inode *ip, int cmd, flock64_t *bfp, int flag,
4627 offset_t offset, cred_t *cr)
4628 {
4629 znode_t *zp = ITOZ(ip);
4630 zfsvfs_t *zfsvfs = ITOZSB(ip);
4631 uint64_t off, len;
4632 int error;
4633
4634 ZFS_ENTER(zfsvfs);
4635 ZFS_VERIFY_ZP(zp);
4636
4637 if (cmd != F_FREESP) {
4638 ZFS_EXIT(zfsvfs);
4639 return (SET_ERROR(EINVAL));
4640 }
4641
4642 /*
4643 * Callers might not be able to detect properly that we are read-only,
4644 * so check it explicitly here.
4645 */
4646 if (zfs_is_readonly(zfsvfs)) {
4647 ZFS_EXIT(zfsvfs);
4648 return (SET_ERROR(EROFS));
4649 }
4650
4651 if ((error = convoff(ip, bfp, 0, offset))) {
4652 ZFS_EXIT(zfsvfs);
4653 return (error);
4654 }
4655
4656 if (bfp->l_len < 0) {
4657 ZFS_EXIT(zfsvfs);
4658 return (SET_ERROR(EINVAL));
4659 }
4660
4661 /*
4662 * Permissions aren't checked on Solaris because on this OS
4663 * zfs_space() can only be called with an opened file handle.
4664 * On Linux we can get here through truncate_range() which
4665 * operates directly on inodes, so we need to check access rights.
4666 */
4667 if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) {
4668 ZFS_EXIT(zfsvfs);
4669 return (error);
4670 }
4671
4672 off = bfp->l_start;
4673 len = bfp->l_len; /* 0 means from off to end of file */
4674
4675 error = zfs_freesp(zp, off, len, flag, TRUE);
4676
4677 ZFS_EXIT(zfsvfs);
4678 return (error);
4679 }
4680
4681 /*ARGSUSED*/
4682 int
4683 zfs_fid(struct inode *ip, fid_t *fidp)
4684 {
4685 znode_t *zp = ITOZ(ip);
4686 zfsvfs_t *zfsvfs = ITOZSB(ip);
4687 uint32_t gen;
4688 uint64_t gen64;
4689 uint64_t object = zp->z_id;
4690 zfid_short_t *zfid;
4691 int size, i, error;
4692
4693 ZFS_ENTER(zfsvfs);
4694 ZFS_VERIFY_ZP(zp);
4695
4696 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
4697 &gen64, sizeof (uint64_t))) != 0) {
4698 ZFS_EXIT(zfsvfs);
4699 return (error);
4700 }
4701
4702 gen = (uint32_t)gen64;
4703
4704 size = SHORT_FID_LEN;
4705
4706 zfid = (zfid_short_t *)fidp;
4707
4708 zfid->zf_len = size;
4709
4710 for (i = 0; i < sizeof (zfid->zf_object); i++)
4711 zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
4712
4713 /* Must have a non-zero generation number to distinguish from .zfs */
4714 if (gen == 0)
4715 gen = 1;
4716 for (i = 0; i < sizeof (zfid->zf_gen); i++)
4717 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
4718
4719 ZFS_EXIT(zfsvfs);
4720 return (0);
4721 }
4722
4723 /*ARGSUSED*/
4724 int
4725 zfs_getsecattr(struct inode *ip, vsecattr_t *vsecp, int flag, cred_t *cr)
4726 {
4727 znode_t *zp = ITOZ(ip);
4728 zfsvfs_t *zfsvfs = ITOZSB(ip);
4729 int error;
4730 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4731
4732 ZFS_ENTER(zfsvfs);
4733 ZFS_VERIFY_ZP(zp);
4734 error = zfs_getacl(zp, vsecp, skipaclchk, cr);
4735 ZFS_EXIT(zfsvfs);
4736
4737 return (error);
4738 }
4739
4740 /*ARGSUSED*/
4741 int
4742 zfs_setsecattr(struct inode *ip, vsecattr_t *vsecp, int flag, cred_t *cr)
4743 {
4744 znode_t *zp = ITOZ(ip);
4745 zfsvfs_t *zfsvfs = ITOZSB(ip);
4746 int error;
4747 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4748 zilog_t *zilog = zfsvfs->z_log;
4749
4750 ZFS_ENTER(zfsvfs);
4751 ZFS_VERIFY_ZP(zp);
4752
4753 error = zfs_setacl(zp, vsecp, skipaclchk, cr);
4754
4755 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4756 zil_commit(zilog, 0);
4757
4758 ZFS_EXIT(zfsvfs);
4759 return (error);
4760 }
4761
4762 #ifdef HAVE_UIO_ZEROCOPY
4763 /*
4764 * Tunable, both must be a power of 2.
4765 *
4766 * zcr_blksz_min: the smallest read we may consider to loan out an arcbuf
4767 * zcr_blksz_max: if set to less than the file block size, allow loaning out of
4768 * an arcbuf for a partial block read
4769 */
4770 int zcr_blksz_min = (1 << 10); /* 1K */
4771 int zcr_blksz_max = (1 << 17); /* 128K */
4772
4773 /*ARGSUSED*/
4774 static int
4775 zfs_reqzcbuf(struct inode *ip, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr)
4776 {
4777 znode_t *zp = ITOZ(ip);
4778 zfsvfs_t *zfsvfs = ITOZSB(ip);
4779 int max_blksz = zfsvfs->z_max_blksz;
4780 uio_t *uio = &xuio->xu_uio;
4781 ssize_t size = uio->uio_resid;
4782 offset_t offset = uio->uio_loffset;
4783 int blksz;
4784 int fullblk, i;
4785 arc_buf_t *abuf;
4786 ssize_t maxsize;
4787 int preamble, postamble;
4788
4789 if (xuio->xu_type != UIOTYPE_ZEROCOPY)
4790 return (SET_ERROR(EINVAL));
4791
4792 ZFS_ENTER(zfsvfs);
4793 ZFS_VERIFY_ZP(zp);
4794 switch (ioflag) {
4795 case UIO_WRITE:
4796 /*
4797 * Loan out an arc_buf for write if write size is bigger than
4798 * max_blksz, and the file's block size is also max_blksz.
4799 */
4800 blksz = max_blksz;
4801 if (size < blksz || zp->z_blksz != blksz) {
4802 ZFS_EXIT(zfsvfs);
4803 return (SET_ERROR(EINVAL));
4804 }
4805 /*
4806 * Caller requests buffers for write before knowing where the
4807 * write offset might be (e.g. NFS TCP write).
4808 */
4809 if (offset == -1) {
4810 preamble = 0;
4811 } else {
4812 preamble = P2PHASE(offset, blksz);
4813 if (preamble) {
4814 preamble = blksz - preamble;
4815 size -= preamble;
4816 }
4817 }
4818
4819 postamble = P2PHASE(size, blksz);
4820 size -= postamble;
4821
4822 fullblk = size / blksz;
4823 (void) dmu_xuio_init(xuio,
4824 (preamble != 0) + fullblk + (postamble != 0));
4825
4826 /*
4827 * Have to fix iov base/len for partial buffers. They
4828 * currently represent full arc_buf's.
4829 */
4830 if (preamble) {
4831 /* data begins in the middle of the arc_buf */
4832 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
4833 blksz);
4834 ASSERT(abuf);
4835 (void) dmu_xuio_add(xuio, abuf,
4836 blksz - preamble, preamble);
4837 }
4838
4839 for (i = 0; i < fullblk; i++) {
4840 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
4841 blksz);
4842 ASSERT(abuf);
4843 (void) dmu_xuio_add(xuio, abuf, 0, blksz);
4844 }
4845
4846 if (postamble) {
4847 /* data ends in the middle of the arc_buf */
4848 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
4849 blksz);
4850 ASSERT(abuf);
4851 (void) dmu_xuio_add(xuio, abuf, 0, postamble);
4852 }
4853 break;
4854 case UIO_READ:
4855 /*
4856 * Loan out an arc_buf for read if the read size is larger than
4857 * the current file block size. Block alignment is not
4858 * considered. Partial arc_buf will be loaned out for read.
4859 */
4860 blksz = zp->z_blksz;
4861 if (blksz < zcr_blksz_min)
4862 blksz = zcr_blksz_min;
4863 if (blksz > zcr_blksz_max)
4864 blksz = zcr_blksz_max;
4865 /* avoid potential complexity of dealing with it */
4866 if (blksz > max_blksz) {
4867 ZFS_EXIT(zfsvfs);
4868 return (SET_ERROR(EINVAL));
4869 }
4870
4871 maxsize = zp->z_size - uio->uio_loffset;
4872 if (size > maxsize)
4873 size = maxsize;
4874
4875 if (size < blksz) {
4876 ZFS_EXIT(zfsvfs);
4877 return (SET_ERROR(EINVAL));
4878 }
4879 break;
4880 default:
4881 ZFS_EXIT(zfsvfs);
4882 return (SET_ERROR(EINVAL));
4883 }
4884
4885 uio->uio_extflg = UIO_XUIO;
4886 XUIO_XUZC_RW(xuio) = ioflag;
4887 ZFS_EXIT(zfsvfs);
4888 return (0);
4889 }
4890
4891 /*ARGSUSED*/
4892 static int
4893 zfs_retzcbuf(struct inode *ip, xuio_t *xuio, cred_t *cr)
4894 {
4895 int i;
4896 arc_buf_t *abuf;
4897 int ioflag = XUIO_XUZC_RW(xuio);
4898
4899 ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY);
4900
4901 i = dmu_xuio_cnt(xuio);
4902 while (i-- > 0) {
4903 abuf = dmu_xuio_arcbuf(xuio, i);
4904 /*
4905 * if abuf == NULL, it must be a write buffer
4906 * that has been returned in zfs_write().
4907 */
4908 if (abuf)
4909 dmu_return_arcbuf(abuf);
4910 ASSERT(abuf || ioflag == UIO_WRITE);
4911 }
4912
4913 dmu_xuio_fini(xuio);
4914 return (0);
4915 }
4916 #endif /* HAVE_UIO_ZEROCOPY */
4917
4918 #if defined(_KERNEL) && defined(HAVE_SPL)
4919 EXPORT_SYMBOL(zfs_open);
4920 EXPORT_SYMBOL(zfs_close);
4921 EXPORT_SYMBOL(zfs_read);
4922 EXPORT_SYMBOL(zfs_write);
4923 EXPORT_SYMBOL(zfs_access);
4924 EXPORT_SYMBOL(zfs_lookup);
4925 EXPORT_SYMBOL(zfs_create);
4926 EXPORT_SYMBOL(zfs_tmpfile);
4927 EXPORT_SYMBOL(zfs_remove);
4928 EXPORT_SYMBOL(zfs_mkdir);
4929 EXPORT_SYMBOL(zfs_rmdir);
4930 EXPORT_SYMBOL(zfs_readdir);
4931 EXPORT_SYMBOL(zfs_fsync);
4932 EXPORT_SYMBOL(zfs_getattr);
4933 EXPORT_SYMBOL(zfs_getattr_fast);
4934 EXPORT_SYMBOL(zfs_setattr);
4935 EXPORT_SYMBOL(zfs_rename);
4936 EXPORT_SYMBOL(zfs_symlink);
4937 EXPORT_SYMBOL(zfs_readlink);
4938 EXPORT_SYMBOL(zfs_link);
4939 EXPORT_SYMBOL(zfs_inactive);
4940 EXPORT_SYMBOL(zfs_space);
4941 EXPORT_SYMBOL(zfs_fid);
4942 EXPORT_SYMBOL(zfs_getsecattr);
4943 EXPORT_SYMBOL(zfs_setsecattr);
4944 EXPORT_SYMBOL(zfs_getpage);
4945 EXPORT_SYMBOL(zfs_putpage);
4946 EXPORT_SYMBOL(zfs_dirty_inode);
4947 EXPORT_SYMBOL(zfs_map);
4948
4949 /* CSTYLED */
4950 module_param(zfs_delete_blocks, ulong, 0644);
4951 MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async");
4952 module_param(zfs_read_chunk_size, long, 0644);
4953 MODULE_PARM_DESC(zfs_read_chunk_size, "Bytes to read per chunk");
4954 #endif