1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
25 * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
26 * Copyright 2017 Nexenta Systems, Inc.
27 */
28
29 /* Portions Copyright 2007 Jeremy Teo */
30 /* Portions Copyright 2010 Robert Milkowski */
31
32
33 #include <sys/types.h>
34 #include <sys/param.h>
35 #include <sys/time.h>
36 #include <sys/sysmacros.h>
37 #include <sys/vfs.h>
38 #include <sys/file.h>
39 #include <sys/stat.h>
40 #include <sys/kmem.h>
41 #include <sys/taskq.h>
42 #include <sys/uio.h>
43 #include <sys/vmsystm.h>
44 #include <sys/atomic.h>
45 #include <sys/pathname.h>
46 #include <sys/cmn_err.h>
47 #include <sys/errno.h>
48 #include <sys/zfs_dir.h>
49 #include <sys/zfs_acl.h>
50 #include <sys/zfs_ioctl.h>
51 #include <sys/fs/zfs.h>
52 #include <sys/dmu.h>
53 #include <sys/dmu_objset.h>
54 #include <sys/spa.h>
55 #include <sys/txg.h>
56 #include <sys/dbuf.h>
57 #include <sys/zap.h>
58 #include <sys/sa.h>
59 #include <sys/policy.h>
60 #include <sys/sunddi.h>
61 #include <sys/sid.h>
62 #include <sys/mode.h>
63 #include <sys/zfs_ctldir.h>
64 #include <sys/zfs_fuid.h>
65 #include <sys/zfs_sa.h>
66 #include <sys/zfs_vnops.h>
67 #include <sys/zfs_rlock.h>
68 #include <sys/cred.h>
69 #include <sys/zpl.h>
70 #include <sys/zil.h>
71 #include <sys/sa_impl.h>
72
73 /*
74 * Programming rules.
75 *
76 * Each vnode op performs some logical unit of work. To do this, the ZPL must
77 * properly lock its in-core state, create a DMU transaction, do the work,
78 * record this work in the intent log (ZIL), commit the DMU transaction,
79 * and wait for the intent log to commit if it is a synchronous operation.
80 * Moreover, the vnode ops must work in both normal and log replay context.
81 * The ordering of events is important to avoid deadlocks and references
82 * to freed memory. The example below illustrates the following Big Rules:
83 *
84 * (1) A check must be made in each zfs thread for a mounted file system.
85 * This is done, while avoiding races, using ZFS_ENTER(zfsvfs).
86 * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes
87 * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros
88 * can cause the calling function to return EIO.
89 *
90 * (2) iput() should always be the last thing except for zil_commit()
91 * (if necessary) and ZFS_EXIT(). This is for 3 reasons:
92 * First, if it's the last reference, the vnode/znode
93 * can be freed, so the zp may point to freed memory. Second, the last
94 * reference will call zfs_zinactive(), which may induce a lot of work --
95 * pushing cached pages (which acquires range locks) and syncing out
96 * cached atime changes. Third, zfs_zinactive() may require a new tx,
97 * which could deadlock the system if you were already holding one.
98 * If you must call iput() within a tx then use zfs_iput_async().
99 *
100 * (3) All range locks must be grabbed before calling dmu_tx_assign(),
101 * as they can span dmu_tx_assign() calls.
102 *
103 * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
104 * dmu_tx_assign(). This is critical because we don't want to block
105 * while holding locks.
106 *
107 * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This
108 * reduces lock contention and CPU usage when we must wait (note that if
109 * throughput is constrained by the storage, nearly every transaction
110 * must wait).
111 *
112 * Note, in particular, that if a lock is sometimes acquired before
113 * the tx assigns, and sometimes after (e.g. z_lock), then failing
114 * to use a non-blocking assign can deadlock the system. The scenario:
115 *
116 * Thread A has grabbed a lock before calling dmu_tx_assign().
117 * Thread B is in an already-assigned tx, and blocks for this lock.
118 * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
119 * forever, because the previous txg can't quiesce until B's tx commits.
120 *
121 * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
122 * then drop all locks, call dmu_tx_wait(), and try again. On subsequent
123 * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
124 * to indicate that this operation has already called dmu_tx_wait().
125 * This will ensure that we don't retry forever, waiting a short bit
126 * each time.
127 *
128 * (5) If the operation succeeded, generate the intent log entry for it
129 * before dropping locks. This ensures that the ordering of events
130 * in the intent log matches the order in which they actually occurred.
131 * During ZIL replay the zfs_log_* functions will update the sequence
132 * number to indicate the zil transaction has replayed.
133 *
134 * (6) At the end of each vnode op, the DMU tx must always commit,
135 * regardless of whether there were any errors.
136 *
137 * (7) After dropping all locks, invoke zil_commit(zilog, foid)
138 * to ensure that synchronous semantics are provided when necessary.
139 *
140 * In general, this is how things should be ordered in each vnode op:
141 *
142 * ZFS_ENTER(zfsvfs); // exit if unmounted
143 * top:
144 * zfs_dirent_lock(&dl, ...) // lock directory entry (may igrab())
145 * rw_enter(...); // grab any other locks you need
146 * tx = dmu_tx_create(...); // get DMU tx
147 * dmu_tx_hold_*(); // hold each object you might modify
148 * error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
149 * if (error) {
150 * rw_exit(...); // drop locks
151 * zfs_dirent_unlock(dl); // unlock directory entry
152 * iput(...); // release held vnodes
153 * if (error == ERESTART) {
154 * waited = B_TRUE;
155 * dmu_tx_wait(tx);
156 * dmu_tx_abort(tx);
157 * goto top;
158 * }
159 * dmu_tx_abort(tx); // abort DMU tx
160 * ZFS_EXIT(zfsvfs); // finished in zfs
161 * return (error); // really out of space
162 * }
163 * error = do_real_work(); // do whatever this VOP does
164 * if (error == 0)
165 * zfs_log_*(...); // on success, make ZIL entry
166 * dmu_tx_commit(tx); // commit DMU tx -- error or not
167 * rw_exit(...); // drop locks
168 * zfs_dirent_unlock(dl); // unlock directory entry
169 * iput(...); // release held vnodes
170 * zil_commit(zilog, foid); // synchronous when necessary
171 * ZFS_EXIT(zfsvfs); // finished in zfs
172 * return (error); // done, report error
173 */
174
175 /*
176 * Virus scanning is unsupported. It would be possible to add a hook
177 * here to perform the required virus scan. This could be done
178 * entirely in the kernel or potentially as an update to invoke a
179 * scanning utility.
180 */
181 static int
182 zfs_vscan(struct inode *ip, cred_t *cr, int async)
183 {
184 return (0);
185 }
186
187 /* ARGSUSED */
188 int
189 zfs_open(struct inode *ip, int mode, int flag, cred_t *cr)
190 {
191 znode_t *zp = ITOZ(ip);
192 zfsvfs_t *zfsvfs = ITOZSB(ip);
193
194 ZFS_ENTER(zfsvfs);
195 ZFS_VERIFY_ZP(zp);
196
197 /* Honor ZFS_APPENDONLY file attribute */
198 if ((mode & FMODE_WRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
199 ((flag & O_APPEND) == 0)) {
200 ZFS_EXIT(zfsvfs);
201 return (SET_ERROR(EPERM));
202 }
203
204 /* Virus scan eligible files on open */
205 if (!zfs_has_ctldir(zp) && zfsvfs->z_vscan && S_ISREG(ip->i_mode) &&
206 !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
207 if (zfs_vscan(ip, cr, 0) != 0) {
208 ZFS_EXIT(zfsvfs);
209 return (SET_ERROR(EACCES));
210 }
211 }
212
213 /* Keep a count of the synchronous opens in the znode */
214 if (flag & O_SYNC)
215 atomic_inc_32(&zp->z_sync_cnt);
216
217 ZFS_EXIT(zfsvfs);
218 return (0);
219 }
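
/*
 * Illustrative sketch, not part of the upstream file: the Linux VFS
 * entry point (zpl_open() in zpl_file.c) wraps zfs_open() roughly as
 * below, holding the credential for the duration of the call and
 * negating the positive errno value the zfs_* functions return.
 * Simplified for illustration and compiled out with #if 0.
 */
#if 0
static int
zpl_open_sketch(struct inode *ip, struct file *filp)
{
        cred_t *cr = CRED();
        int error;

        crhold(cr);
        error = -zfs_open(ip, filp->f_mode, filp->f_flags, cr);
        crfree(cr);

        return (error);
}
#endif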
220
221 /* ARGSUSED */
222 int
223 zfs_close(struct inode *ip, int flag, cred_t *cr)
224 {
225 znode_t *zp = ITOZ(ip);
226 zfsvfs_t *zfsvfs = ITOZSB(ip);
227
228 ZFS_ENTER(zfsvfs);
229 ZFS_VERIFY_ZP(zp);
230
231 /* Decrement the synchronous opens in the znode */
232 if (flag & O_SYNC)
233 atomic_dec_32(&zp->z_sync_cnt);
234
235 if (!zfs_has_ctldir(zp) && zfsvfs->z_vscan && S_ISREG(ip->i_mode) &&
236 !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
237 VERIFY(zfs_vscan(ip, cr, 1) == 0);
238
239 ZFS_EXIT(zfsvfs);
240 return (0);
241 }
242
243 #if defined(SEEK_HOLE) && defined(SEEK_DATA)
244 /*
245 * Lseek support for finding holes (cmd == SEEK_HOLE) and
246 * data (cmd == SEEK_DATA). "off" is an in/out parameter.
247 */
248 static int
249 zfs_holey_common(struct inode *ip, int cmd, loff_t *off)
250 {
251 znode_t *zp = ITOZ(ip);
252 uint64_t noff = (uint64_t)*off; /* new offset */
253 uint64_t file_sz;
254 int error;
255 boolean_t hole;
256
257 file_sz = zp->z_size;
258 if (noff >= file_sz) {
259 return (SET_ERROR(ENXIO));
260 }
261
262 if (cmd == SEEK_HOLE)
263 hole = B_TRUE;
264 else
265 hole = B_FALSE;
266
267 error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff);
268
269 if (error == ESRCH)
270 return (SET_ERROR(ENXIO));
271
272 /* file was dirty, so fall back to using generic logic */
273 if (error == EBUSY) {
274 if (hole)
275 *off = file_sz;
276
277 return (0);
278 }
279
280 /*
281 * We could find a hole that begins after the logical end-of-file,
282 * because dmu_offset_next() only works on whole blocks. If the
283 * EOF falls mid-block, then indicate that the "virtual hole"
284 * at the end of the file begins at the logical EOF, rather than
285 * at the end of the last block.
286 */
287 if (noff > file_sz) {
288 ASSERT(hole);
289 noff = file_sz;
290 }
291
292 if (noff < *off)
293 return (error);
294 *off = noff;
295 return (error);
296 }
297
298 int
299 zfs_holey(struct inode *ip, int cmd, loff_t *off)
300 {
301 znode_t *zp = ITOZ(ip);
302 zfsvfs_t *zfsvfs = ITOZSB(ip);
303 int error;
304
305 ZFS_ENTER(zfsvfs);
306 ZFS_VERIFY_ZP(zp);
307
308 error = zfs_holey_common(ip, cmd, off);
309
310 ZFS_EXIT(zfsvfs);
311 return (error);
312 }
313 #endif /* SEEK_HOLE && SEEK_DATA */
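
/*
 * Illustrative sketch, not part of the upstream file: the Linux llseek
 * entry point (zpl_llseek() in zpl_file.c) dispatches SEEK_HOLE and
 * SEEK_DATA to zfs_holey() roughly as below. Locking and fstrans
 * details are omitted; compiled out with #if 0.
 */
#if 0
static loff_t
zpl_llseek_sketch(struct file *filp, loff_t offset, int whence)
{
        struct inode *ip = filp->f_mapping->host;
        int error;

        if (whence == SEEK_DATA || whence == SEEK_HOLE) {
                /* zfs_holey() updates offset in place on success */
                error = -zfs_holey(ip, whence, &offset);
                if (error)
                        return (error);
                return (vfs_setpos(filp, offset, ip->i_sb->s_maxbytes));
        }

        return (generic_file_llseek(filp, offset, whence));
}
#endif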
314
315 #if defined(_KERNEL)
316 /*
317 * When a file is memory mapped, we must keep the IO data synchronized
318 * between the DMU cache and the memory mapped pages. What this means:
319 *
320 * On Write: If we find a memory mapped page, we write to *both*
321 * the page and the dmu buffer.
322 */
323 static void
324 update_pages(struct inode *ip, int64_t start, int len,
325 objset_t *os, uint64_t oid)
326 {
327 struct address_space *mp = ip->i_mapping;
328 struct page *pp;
329 uint64_t nbytes;
330 int64_t off;
331 void *pb;
332
333 off = start & (PAGE_SIZE-1);
334 for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
335 nbytes = MIN(PAGE_SIZE - off, len);
336
337 pp = find_lock_page(mp, start >> PAGE_SHIFT);
338 if (pp) {
339 if (mapping_writably_mapped(mp))
340 flush_dcache_page(pp);
341
342 pb = kmap(pp);
343 (void) dmu_read(os, oid, start+off, nbytes, pb+off,
344 DMU_READ_PREFETCH);
345 kunmap(pp);
346
347 if (mapping_writably_mapped(mp))
348 flush_dcache_page(pp);
349
350 mark_page_accessed(pp);
351 SetPageUptodate(pp);
352 ClearPageError(pp);
353 unlock_page(pp);
354 put_page(pp);
355 }
356
357 len -= nbytes;
358 off = 0;
359 }
360 }
361
362 /*
363 * When a file is memory mapped, we must keep the IO data synchronized
364 * between the DMU cache and the memory mapped pages. What this means:
365 *
366 * On Read: We "read" preferentially from memory mapped pages,
367 * otherwise we fall back to the dmu buffer.
368 *
369 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
370 * the file is memory mapped.
371 */
372 static int
373 mappedread(struct inode *ip, int nbytes, uio_t *uio)
374 {
375 struct address_space *mp = ip->i_mapping;
376 struct page *pp;
377 znode_t *zp = ITOZ(ip);
378 int64_t start, off;
379 uint64_t bytes;
380 int len = nbytes;
381 int error = 0;
382 void *pb;
383
384 start = uio->uio_loffset;
385 off = start & (PAGE_SIZE-1);
386 for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
387 bytes = MIN(PAGE_SIZE - off, len);
388
389 pp = find_lock_page(mp, start >> PAGE_SHIFT);
390 if (pp) {
391 ASSERT(PageUptodate(pp));
392 unlock_page(pp);
393
394 pb = kmap(pp);
395 error = uiomove(pb + off, bytes, UIO_READ, uio);
396 kunmap(pp);
397
398 if (mapping_writably_mapped(mp))
399 flush_dcache_page(pp);
400
401 mark_page_accessed(pp);
402 put_page(pp);
403 } else {
404 error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
405 uio, bytes);
406 }
407
408 len -= bytes;
409 off = 0;
410 if (error)
411 break;
412 }
413 return (error);
414 }
415 #endif /* _KERNEL */
416
417 unsigned long zfs_read_chunk_size = 1024 * 1024; /* Tunable */
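/*
 * Tunable: delete threshold in blocks. A file larger than
 * zp->z_blksz * zfs_delete_blocks is considered "too big" to free
 * inline and is reclaimed asynchronously via the unlinked set
 * (see the "toobig" handling in zfs_remove()).
 */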
418 unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT;
419
420 /*
421 * Read bytes from specified file into supplied buffer.
422 *
423 * IN: ip - inode of file to be read from.
424 * uio - structure supplying read location, range info,
425 * and return buffer.
426 * ioflag - FSYNC flags; used to provide FRSYNC semantics.
427 * O_DIRECT flag; used to bypass page cache.
428 * cr - credentials of caller.
429 *
430 * OUT: uio - updated offset and range, buffer filled.
431 *
432 * RETURN: 0 on success, error code on failure.
433 *
434 * Side Effects:
435 * inode - atime updated if byte count > 0
436 */
437 /* ARGSUSED */
438 int
439 zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
440 {
441 int error = 0;
442
443 znode_t *zp = ITOZ(ip);
444 zfsvfs_t *zfsvfs = ITOZSB(ip);
445 ZFS_ENTER(zfsvfs);
446 ZFS_VERIFY_ZP(zp);
447
448 if (zp->z_pflags & ZFS_AV_QUARANTINED) {
449 ZFS_EXIT(zfsvfs);
450 return (SET_ERROR(EACCES));
451 }
452
453 /*
454 * Validate file offset
455 */
456 if (uio->uio_loffset < (offset_t)0) {
457 ZFS_EXIT(zfsvfs);
458 return (SET_ERROR(EINVAL));
459 }
460
461 /*
462 * Fasttrack empty reads
463 */
464 if (uio->uio_resid == 0) {
465 ZFS_EXIT(zfsvfs);
466 return (0);
467 }
468
469 /*
470 * If we're in FRSYNC mode, sync out this znode before reading it.
471 * Only do this for non-snapshots.
472 */
473 if (zfsvfs->z_log &&
474 (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
475 zil_commit(zfsvfs->z_log, zp->z_id);
476
477 /*
478 * Lock the range against changes.
479 */
480 locked_range_t *lr = rangelock_enter(&zp->z_rangelock,
481 uio->uio_loffset, uio->uio_resid, RL_READER);
482
483 /*
484 * If we are reading past end-of-file we can skip
485 * to the end; but we might still need to set atime.
486 */
487 if (uio->uio_loffset >= zp->z_size) {
488 error = 0;
489 goto out;
490 }
491
492 ASSERT(uio->uio_loffset < zp->z_size);
493 ssize_t n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
494 ssize_t start_resid = n;
495
496 #ifdef HAVE_UIO_ZEROCOPY
497 xuio_t *xuio = NULL;
498 if ((uio->uio_extflg == UIO_XUIO) &&
499 (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
500 int nblk;
501 int blksz = zp->z_blksz;
502 uint64_t offset = uio->uio_loffset;
503
504 xuio = (xuio_t *)uio;
505 if ((ISP2(blksz))) {
506 nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
507 blksz)) / blksz;
508 } else {
509 ASSERT(offset + n <= blksz);
510 nblk = 1;
511 }
512 (void) dmu_xuio_init(xuio, nblk);
513
514 if (vn_has_cached_data(ip)) {
515 /*
516 * For simplicity, we always allocate a full buffer
517 * even if we only expect to read a portion of a block.
518 */
519 while (--nblk >= 0) {
520 (void) dmu_xuio_add(xuio,
521 dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
522 blksz), 0, blksz);
523 }
524 }
525 }
526 #endif /* HAVE_UIO_ZEROCOPY */
527
528 while (n > 0) {
529 ssize_t nbytes = MIN(n, zfs_read_chunk_size -
530 P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
531
532 if (zp->z_is_mapped && !(ioflag & O_DIRECT)) {
533 error = mappedread(ip, nbytes, uio);
534 } else {
535 error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
536 uio, nbytes);
537 }
538
539 if (error) {
540 /* convert checksum errors into IO errors */
541 if (error == ECKSUM)
542 error = SET_ERROR(EIO);
543 break;
544 }
545
546 n -= nbytes;
547 }
548
549 int64_t nread = start_resid - n;
550 dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread);
551 task_io_account_read(nread);
552 out:
553 rangelock_exit(lr);
554
555 ZFS_EXIT(zfsvfs);
556 return (error);
557 }
558
559 /*
560 * Write the bytes to a file.
561 *
562 * IN: ip - inode of file to be written to.
563 * uio - structure supplying write location, range info,
564 * and data buffer.
565 * ioflag - FAPPEND flag set if in append mode.
566 * O_DIRECT flag; used to bypass page cache.
567 * cr - credentials of caller.
568 *
569 * OUT: uio - updated offset and range.
570 *
571 * RETURN: 0 if success
572 * error code if failure
573 *
574 * Timestamps:
575 * ip - ctime|mtime updated if byte count > 0
576 */
577
578 /* ARGSUSED */
579 int
580 zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
581 {
582 int error = 0;
583 ssize_t start_resid = uio->uio_resid;
584
585 /*
586 * Fasttrack empty write
587 */
588 ssize_t n = start_resid;
589 if (n == 0)
590 return (0);
591
592 rlim64_t limit = uio->uio_limit;
593 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
594 limit = MAXOFFSET_T;
595
596 znode_t *zp = ITOZ(ip);
597 zfsvfs_t *zfsvfs = ZTOZSB(zp);
598 ZFS_ENTER(zfsvfs);
599 ZFS_VERIFY_ZP(zp);
600
601 sa_bulk_attr_t bulk[4];
602 int count = 0;
603 uint64_t mtime[2], ctime[2];
604 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
605 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
606 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
607 &zp->z_size, 8);
608 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
609 &zp->z_pflags, 8);
610
611 /*
612 * Callers might not be able to detect properly that we are read-only,
613 * so check it explicitly here.
614 */
615 if (zfs_is_readonly(zfsvfs)) {
616 ZFS_EXIT(zfsvfs);
617 return (SET_ERROR(EROFS));
618 }
619
620 /*
621 * If immutable or not appending then return EPERM
622 */
623 if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
624 ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
625 (uio->uio_loffset < zp->z_size))) {
626 ZFS_EXIT(zfsvfs);
627 return (SET_ERROR(EPERM));
628 }
629
630 /*
631 * Validate file offset
632 */
633 offset_t woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
634 if (woff < 0) {
635 ZFS_EXIT(zfsvfs);
636 return (SET_ERROR(EINVAL));
637 }
638
639 int max_blksz = zfsvfs->z_max_blksz;
640 xuio_t *xuio = NULL;
641
642 /*
643 * Pre-fault the pages to ensure slow (e.g. NFS) pages
644 * don't hold up txg.
645 * Skip this if uio contains loaned arc_buf.
646 */
647 #ifdef HAVE_UIO_ZEROCOPY
648 if ((uio->uio_extflg == UIO_XUIO) &&
649 (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
650 xuio = (xuio_t *)uio;
651 else
652 #endif
653 if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
654 ZFS_EXIT(zfsvfs);
655 return (SET_ERROR(EFAULT));
656 }
657
658 /*
659 * If in append mode, set the io offset pointer to eof.
660 */
661 locked_range_t *lr;
662 if (ioflag & FAPPEND) {
663 /*
664 * Obtain an appending range lock to guarantee file append
665 * semantics. We reset the write offset once we have the lock.
666 */
667 lr = rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND);
668 woff = lr->lr_offset;
669 if (lr->lr_length == UINT64_MAX) {
670 /*
671 * We overlocked the file because this write will cause
672 * the file block size to increase.
673 * Note that zp_size cannot change with this lock held.
674 */
675 woff = zp->z_size;
676 }
677 uio->uio_loffset = woff;
678 } else {
679 /*
680 * Note that if the file block size will change as a result of
681 * this write, then this range lock will lock the entire file
682 * so that we can re-write the block safely.
683 */
684 lr = rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER);
685 }
686
687 if (woff >= limit) {
688 rangelock_exit(lr);
689 ZFS_EXIT(zfsvfs);
690 return (SET_ERROR(EFBIG));
691 }
692
693 if ((woff + n) > limit || woff > (limit - n))
694 n = limit - woff;
695
696 /* Will this write extend the file length? */
697 int write_eof = (woff + n > zp->z_size);
698
699 uint64_t end_size = MAX(zp->z_size, woff + n);
700 zilog_t *zilog = zfsvfs->z_log;
701 #ifdef HAVE_UIO_ZEROCOPY
702 int i_iov = 0;
703 const iovec_t *iovp = uio->uio_iov;
704 ASSERTV(int iovcnt = uio->uio_iovcnt);
705 #endif
706
707
708 /*
709 * Write the file in reasonable size chunks. Each chunk is written
710 * in a separate transaction; this keeps the intent log records small
711 * and allows us to do more fine-grained space accounting.
712 */
713 while (n > 0) {
714 woff = uio->uio_loffset;
715
716 if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT,
717 KUID_TO_SUID(ip->i_uid)) ||
718 zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT,
719 KGID_TO_SGID(ip->i_gid)) ||
720 (zp->z_projid != ZFS_DEFAULT_PROJID &&
721 zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
722 zp->z_projid))) {
723 error = SET_ERROR(EDQUOT);
724 break;
725 }
726
727 arc_buf_t *abuf = NULL;
728 const iovec_t *aiov = NULL;
729 if (xuio) {
730 #ifdef HAVE_UIO_ZEROCOPY
731 ASSERT(i_iov < iovcnt);
732 ASSERT3U(uio->uio_segflg, !=, UIO_BVEC);
733 aiov = &iovp[i_iov];
734 abuf = dmu_xuio_arcbuf(xuio, i_iov);
735 dmu_xuio_clear(xuio, i_iov);
736 ASSERT((aiov->iov_base == abuf->b_data) ||
737 ((char *)aiov->iov_base - (char *)abuf->b_data +
738 aiov->iov_len == arc_buf_size(abuf)));
739 i_iov++;
740 #endif
741 } else if (n >= max_blksz && woff >= zp->z_size &&
742 P2PHASE(woff, max_blksz) == 0 &&
743 zp->z_blksz == max_blksz) {
744 /*
745 * This write covers a full block. "Borrow" a buffer
746 * from the dmu so that we can fill it before we enter
747 * a transaction. This avoids the possibility of
748 * holding up the transaction if the data copy hangs
749 * up on a pagefault (e.g., from an NFS server mapping).
750 */
751 size_t cbytes;
752
753 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
754 max_blksz);
755 ASSERT(abuf != NULL);
756 ASSERT(arc_buf_size(abuf) == max_blksz);
757 if ((error = uiocopy(abuf->b_data, max_blksz,
758 UIO_WRITE, uio, &cbytes))) {
759 dmu_return_arcbuf(abuf);
760 break;
761 }
762 ASSERT(cbytes == max_blksz);
763 }
764
765 /*
766 * Start a transaction.
767 */
768 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
769 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
770 dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
771 zfs_sa_upgrade_txholds(tx, zp);
772 error = dmu_tx_assign(tx, TXG_WAIT);
773 if (error) {
774 dmu_tx_abort(tx);
775 if (abuf != NULL)
776 dmu_return_arcbuf(abuf);
777 break;
778 }
779
780 /*
781 * If rangelock_enter() over-locked we grow the blocksize
782 * and then reduce the lock range. This will only happen
783 * on the first iteration since rangelock_reduce() will
784 * shrink down lr_length to the appropriate size.
785 */
786 if (lr->lr_length == UINT64_MAX) {
787 uint64_t new_blksz;
788
789 if (zp->z_blksz > max_blksz) {
790 /*
791 * File's blocksize is already larger than the
792 * "recordsize" property. Only let it grow to
793 * the next power of 2.
794 */
795 ASSERT(!ISP2(zp->z_blksz));
796 new_blksz = MIN(end_size,
797 1 << highbit64(zp->z_blksz));
798 } else {
799 new_blksz = MIN(end_size, max_blksz);
800 }
801 zfs_grow_blocksize(zp, new_blksz, tx);
802 rangelock_reduce(lr, woff, n);
803 }
804
805 /*
806 * XXX - should we really limit each write to z_max_blksz?
807 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
808 */
809 ssize_t nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
810
811 ssize_t tx_bytes;
812 if (abuf == NULL) {
813 tx_bytes = uio->uio_resid;
814 uio->uio_fault_disable = B_TRUE;
815 error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
816 uio, nbytes, tx);
817 if (error == EFAULT) {
818 dmu_tx_commit(tx);
819 if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
820 break;
821 }
822 continue;
823 } else if (error != 0) {
824 dmu_tx_commit(tx);
825 break;
826 }
827 tx_bytes -= uio->uio_resid;
828 } else {
829 tx_bytes = nbytes;
830 ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
831 /*
832 * If this is not a full block write, but we are
833 * extending the file past EOF and this data starts
834 * block-aligned, use assign_arcbuf(). Otherwise,
835 * write via dmu_write().
836 */
837 if (tx_bytes < max_blksz && (!write_eof ||
838 aiov->iov_base != abuf->b_data)) {
839 ASSERT(xuio);
840 dmu_write(zfsvfs->z_os, zp->z_id, woff,
841 /* cppcheck-suppress nullPointer */
842 aiov->iov_len, aiov->iov_base, tx);
843 dmu_return_arcbuf(abuf);
844 xuio_stat_wbuf_copied();
845 } else {
846 ASSERT(xuio || tx_bytes == max_blksz);
847 dmu_assign_arcbuf_by_dbuf(
848 sa_get_db(zp->z_sa_hdl), woff, abuf, tx);
849 }
850 ASSERT(tx_bytes <= uio->uio_resid);
851 uioskip(uio, tx_bytes);
852 }
853 if (tx_bytes && zp->z_is_mapped && !(ioflag & O_DIRECT)) {
854 update_pages(ip, woff,
855 tx_bytes, zfsvfs->z_os, zp->z_id);
856 }
857
858 /*
859 * If we made no progress, we're done. If we made even
860 * partial progress, update the znode and ZIL accordingly.
861 */
862 if (tx_bytes == 0) {
863 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
864 (void *)&zp->z_size, sizeof (uint64_t), tx);
865 dmu_tx_commit(tx);
866 ASSERT(error != 0);
867 break;
868 }
869
870 /*
871 * Clear Set-UID/Set-GID bits on successful write if not
872 * privileged and at least one of the execute bits is set.
873 *
874 * It would be nice to do this after all writes have
875 * been done, but that would still expose the ISUID/ISGID
876 * to another app after the partial write is committed.
877 *
878 * Note: we don't call zfs_fuid_map_id() here because
879 * user 0 is not an ephemeral uid.
880 */
881 mutex_enter(&zp->z_acl_lock);
882 uint32_t uid = KUID_TO_SUID(ip->i_uid);
883 if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
884 (S_IXUSR >> 6))) != 0 &&
885 (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
886 secpolicy_vnode_setid_retain(cr,
887 ((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) {
888 uint64_t newmode;
889 zp->z_mode &= ~(S_ISUID | S_ISGID);
890 ip->i_mode = newmode = zp->z_mode;
891 (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
892 (void *)&newmode, sizeof (uint64_t), tx);
893 }
894 mutex_exit(&zp->z_acl_lock);
895
896 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
897
898 /*
899 * Update the file size (zp_size) if it has changed;
900 * account for possible concurrent updates.
901 */
902 while ((end_size = zp->z_size) < uio->uio_loffset) {
903 (void) atomic_cas_64(&zp->z_size, end_size,
904 uio->uio_loffset);
905 ASSERT(error == 0);
906 }
907 /*
908 * If we are replaying and eof is non zero then force
909 * the file size to the specified eof. Note, there's no
910 * concurrency during replay.
911 */
912 if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
913 zp->z_size = zfsvfs->z_replay_eof;
914
915 error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
916
917 zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag,
918 NULL, NULL);
919 dmu_tx_commit(tx);
920
921 if (error != 0)
922 break;
923 ASSERT(tx_bytes == nbytes);
924 n -= nbytes;
925
926 if (!xuio && n > 0) {
927 if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
928 error = EFAULT;
929 break;
930 }
931 }
932 }
933
934 zfs_inode_update(zp);
935 rangelock_exit(lr);
936
937 /*
938 * If we're in replay mode, or we made no progress, return error.
939 * Otherwise, it's at least a partial write, so it's successful.
940 */
941 if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
942 ZFS_EXIT(zfsvfs);
943 return (error);
944 }
945
946 if (ioflag & (FSYNC | FDSYNC) ||
947 zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
948 zil_commit(zilog, zp->z_id);
949
950 int64_t nwritten = start_resid - uio->uio_resid;
951 dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten);
952 task_io_account_write(nwritten);
953
954 ZFS_EXIT(zfsvfs);
955 return (0);
956 }
957
958 /*
959 * Drop a reference on the passed inode asynchronously. This ensures
960 * that the caller will never drop the last reference on an inode in
961 * the current context. Doing so while holding open a tx could result
962 * in a deadlock if iput_final() re-enters the filesystem code.
963 */
964 void
965 zfs_iput_async(struct inode *ip)
966 {
967 objset_t *os = ITOZSB(ip)->z_os;
968
969 ASSERT(atomic_read(&ip->i_count) > 0);
970 ASSERT(os != NULL);
971
972 if (atomic_read(&ip->i_count) == 1)
973 VERIFY(taskq_dispatch(dsl_pool_iput_taskq(dmu_objset_pool(os)),
974 (task_func_t *)iput, ip, TQ_SLEEP) != TASKQID_INVALID);
975 else
976 iput(ip);
977 }
978
979 void
980 zfs_get_done(zgd_t *zgd, int error)
981 {
982 znode_t *zp = zgd->zgd_private;
983
984 if (zgd->zgd_db)
985 dmu_buf_rele(zgd->zgd_db, zgd);
986
987 rangelock_exit(zgd->zgd_lr);
988
989 /*
990 * Release the vnode asynchronously as we currently have the
991 * txg stopped from syncing.
992 */
993 zfs_iput_async(ZTOI(zp));
994
995 if (error == 0 && zgd->zgd_bp)
996 zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
997
998 kmem_free(zgd, sizeof (zgd_t));
999 }
1000
1001 #ifdef DEBUG
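/* When set nonzero (e.g. from a debugger), injects an EIO into zfs_get_data() */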
1002 static int zil_fault_io = 0;
1003 #endif
1004
1005 /*
1006 * Get data to generate a TX_WRITE intent log record.
1007 */
1008 int
1009 zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
1010 {
1011 zfsvfs_t *zfsvfs = arg;
1012 objset_t *os = zfsvfs->z_os;
1013 znode_t *zp;
1014 uint64_t object = lr->lr_foid;
1015 uint64_t offset = lr->lr_offset;
1016 uint64_t size = lr->lr_length;
1017 dmu_buf_t *db;
1018 zgd_t *zgd;
1019 int error = 0;
1020
1021 ASSERT3P(lwb, !=, NULL);
1022 ASSERT3P(zio, !=, NULL);
1023 ASSERT3U(size, !=, 0);
1024
1025 /*
1026 * Nothing to do if the file has been removed
1027 */
1028 if (zfs_zget(zfsvfs, object, &zp) != 0)
1029 return (SET_ERROR(ENOENT));
1030 if (zp->z_unlinked) {
1031 /*
1032 * Release the vnode asynchronously as we currently have the
1033 * txg stopped from syncing.
1034 */
1035 zfs_iput_async(ZTOI(zp));
1036 return (SET_ERROR(ENOENT));
1037 }
1038
1039 zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
1040 zgd->zgd_lwb = lwb;
1041 zgd->zgd_private = zp;
1042
1043 /*
1044 * Write records come in two flavors: immediate and indirect.
1045 * For small writes it's cheaper to store the data with the
1046 * log record (immediate); for large writes it's cheaper to
1047 * sync the data and get a pointer to it (indirect) so that
1048 * we don't have to write the data twice.
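 * (Which flavor a given record uses is decided when the log record is
 * created in zfs_log_write(), based on the write's size, the dataset's
 * logbias property, and the zfs_immediate_write_sz tunable.)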
1049 */
1050 if (buf != NULL) { /* immediate write */
1051 zgd->zgd_lr = rangelock_enter(&zp->z_rangelock,
1052 offset, size, RL_READER);
1053 /* test for truncation needs to be done while range locked */
1054 if (offset >= zp->z_size) {
1055 error = SET_ERROR(ENOENT);
1056 } else {
1057 error = dmu_read(os, object, offset, size, buf,
1058 DMU_READ_NO_PREFETCH);
1059 }
1060 ASSERT(error == 0 || error == ENOENT);
1061 } else { /* indirect write */
1062 /*
1063 * We have to lock the whole block so that no one can change
1064 * the data while it is being written out and its checksum
1065 * is being calculated. We need to re-check the blocksize
1066 * after we get the lock in case it's changed!
1067 */
1068 for (;;) {
1069 uint64_t blkoff;
1070 size = zp->z_blksz;
1071 blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
1072 offset -= blkoff;
1073 zgd->zgd_lr = rangelock_enter(&zp->z_rangelock,
1074 offset, size, RL_READER);
1075 if (zp->z_blksz == size)
1076 break;
1077 offset += blkoff;
1078 rangelock_exit(zgd->zgd_lr);
1079 }
1080 /* test for truncation needs to be done while range locked */
1081 if (lr->lr_offset >= zp->z_size)
1082 error = SET_ERROR(ENOENT);
1083 #ifdef DEBUG
1084 if (zil_fault_io) {
1085 error = SET_ERROR(EIO);
1086 zil_fault_io = 0;
1087 }
1088 #endif
1089 if (error == 0)
1090 error = dmu_buf_hold(os, object, offset, zgd, &db,
1091 DMU_READ_NO_PREFETCH);
1092
1093 if (error == 0) {
1094 blkptr_t *bp = &lr->lr_blkptr;
1095
1096 zgd->zgd_db = db;
1097 zgd->zgd_bp = bp;
1098
1099 ASSERT(db->db_offset == offset);
1100 ASSERT(db->db_size == size);
1101
1102 error = dmu_sync(zio, lr->lr_common.lrc_txg,
1103 zfs_get_done, zgd);
1104 ASSERT(error || lr->lr_length <= size);
1105
1106 /*
1107 * On success, we need to wait for the write I/O
1108 * initiated by dmu_sync() to complete before we can
1109 * release this dbuf. We will finish everything up
1110 * in the zfs_get_done() callback.
1111 */
1112 if (error == 0)
1113 return (0);
1114
1115 if (error == EALREADY) {
1116 lr->lr_common.lrc_txtype = TX_WRITE2;
1117 /*
1118 * TX_WRITE2 relies on the data previously
1119 * written by the TX_WRITE that caused
1120 * EALREADY. We zero out the BP because
1121 * it is the old, currently-on-disk BP,
1122 * so there's no need to zio_flush() its
1123 * vdevs (flushing would needlessly hurt
1124 * performance, and doesn't work on
1125 * indirect vdevs).
1126 */
1127 zgd->zgd_bp = NULL;
1128 BP_ZERO(bp);
1129 error = 0;
1130 }
1131 }
1132 }
1133
1134 zfs_get_done(zgd, error);
1135
1136 return (error);
1137 }
1138
1139 /*ARGSUSED*/
1140 int
1141 zfs_access(struct inode *ip, int mode, int flag, cred_t *cr)
1142 {
1143 znode_t *zp = ITOZ(ip);
1144 zfsvfs_t *zfsvfs = ITOZSB(ip);
1145 int error;
1146
1147 ZFS_ENTER(zfsvfs);
1148 ZFS_VERIFY_ZP(zp);
1149
1150 if (flag & V_ACE_MASK)
1151 error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
1152 else
1153 error = zfs_zaccess_rwx(zp, mode, flag, cr);
1154
1155 ZFS_EXIT(zfsvfs);
1156 return (error);
1157 }
1158
1159 /*
1160 * Lookup an entry in a directory, or an extended attribute directory.
1161 * If it exists, return a held inode reference for it.
1162 *
1163 * IN: dip - inode of directory to search.
1164 * nm - name of entry to lookup.
1165 * flags - LOOKUP_XATTR set if looking for an attribute.
1166 * cr - credentials of caller.
1167 * direntflags - directory lookup flags
1168 * realpnp - returned pathname.
1169 *
1170 * OUT: ipp - inode of located entry, NULL if not found.
1171 *
1172 * RETURN: 0 on success, error code on failure.
1173 *
1174 * Timestamps:
1175 * NA
1176 */
1177 /* ARGSUSED */
1178 int
1179 zfs_lookup(struct inode *dip, char *nm, struct inode **ipp, int flags,
1180 cred_t *cr, int *direntflags, pathname_t *realpnp)
1181 {
1182 znode_t *zdp = ITOZ(dip);
1183 zfsvfs_t *zfsvfs = ITOZSB(dip);
1184 int error = 0;
1185
1186 /*
1187 * Fast path lookup; however, we must skip the DNLC lookup
1188 * for case folding or normalizing lookups because the
1189 * DNLC code only stores the passed in name. This means
1190 * creating 'a' and removing 'A' on a case insensitive
1191 * file system would work, but DNLC still thinks 'a'
1192 * exists and won't let you create it again on the next
1193 * pass through fast path.
1194 */
1195 if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
1196
1197 if (!S_ISDIR(dip->i_mode)) {
1198 return (SET_ERROR(ENOTDIR));
1199 } else if (zdp->z_sa_hdl == NULL) {
1200 return (SET_ERROR(EIO));
1201 }
1202
1203 if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
1204 error = zfs_fastaccesschk_execute(zdp, cr);
1205 if (!error) {
1206 *ipp = dip;
1207 igrab(*ipp);
1208 return (0);
1209 }
1210 return (error);
1211 }
1212 }
1213
1214 ZFS_ENTER(zfsvfs);
1215 ZFS_VERIFY_ZP(zdp);
1216
1217 *ipp = NULL;
1218
1219 if (flags & LOOKUP_XATTR) {
1220 /*
1221 * We don't allow recursive attributes;
1222 * maybe someday we will.
1223 */
1224 if (zdp->z_pflags & ZFS_XATTR) {
1225 ZFS_EXIT(zfsvfs);
1226 return (SET_ERROR(EINVAL));
1227 }
1228
1229 if ((error = zfs_get_xattrdir(zdp, ipp, cr, flags))) {
1230 ZFS_EXIT(zfsvfs);
1231 return (error);
1232 }
1233
1234 /*
1235 * Do we have permission to get into attribute directory?
1236 */
1237
1238 if ((error = zfs_zaccess(ITOZ(*ipp), ACE_EXECUTE, 0,
1239 B_FALSE, cr))) {
1240 iput(*ipp);
1241 *ipp = NULL;
1242 }
1243
1244 ZFS_EXIT(zfsvfs);
1245 return (error);
1246 }
1247
1248 if (!S_ISDIR(dip->i_mode)) {
1249 ZFS_EXIT(zfsvfs);
1250 return (SET_ERROR(ENOTDIR));
1251 }
1252
1253 /*
1254 * Check accessibility of directory.
1255 */
1256
1257 if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr))) {
1258 ZFS_EXIT(zfsvfs);
1259 return (error);
1260 }
1261
1262 if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
1263 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1264 ZFS_EXIT(zfsvfs);
1265 return (SET_ERROR(EILSEQ));
1266 }
1267
1268 error = zfs_dirlook(zdp, nm, ipp, flags, direntflags, realpnp);
1269 if ((error == 0) && (*ipp))
1270 zfs_inode_update(ITOZ(*ipp));
1271
1272 ZFS_EXIT(zfsvfs);
1273 return (error);
1274 }
1275
1276 /*
1277 * Attempt to create a new entry in a directory. If the entry
1278 * already exists, truncate the file if permissible, else return
1279 * an error. Return the ip of the created or trunc'd file.
1280 *
1281 * IN: dip - inode of directory to put new file entry in.
1282 * name - name of new file entry.
1283 * vap - attributes of new file.
1284 * excl - flag indicating exclusive or non-exclusive mode.
1285 * mode - mode to open file with.
1286 * cr - credentials of caller.
1287 * flag - large file flag [UNUSED].
1288 * vsecp - ACL to be set
1289 *
1290 * OUT: ipp - inode of created or trunc'd entry.
1291 *
1292 * RETURN: 0 on success, error code on failure.
1293 *
1294 * Timestamps:
1295 * dip - ctime|mtime updated if new entry created
1296 * ip - ctime|mtime always, atime if new
1297 */
1298
1299 /* ARGSUSED */
1300 int
1301 zfs_create(struct inode *dip, char *name, vattr_t *vap, int excl,
1302 int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp)
1303 {
1304 znode_t *zp, *dzp = ITOZ(dip);
1305 zfsvfs_t *zfsvfs = ITOZSB(dip);
1306 zilog_t *zilog;
1307 objset_t *os;
1308 zfs_dirlock_t *dl;
1309 dmu_tx_t *tx;
1310 int error;
1311 uid_t uid;
1312 gid_t gid;
1313 zfs_acl_ids_t acl_ids;
1314 boolean_t fuid_dirtied;
1315 boolean_t have_acl = B_FALSE;
1316 boolean_t waited = B_FALSE;
1317
1318 /*
1319 * If we have an ephemeral id, ACL, or XVATTR then
1320 * make sure file system is at proper version
1321 */
1322
1323 gid = crgetgid(cr);
1324 uid = crgetuid(cr);
1325
1326 if (zfsvfs->z_use_fuids == B_FALSE &&
1327 (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1328 return (SET_ERROR(EINVAL));
1329
1330 if (name == NULL)
1331 return (SET_ERROR(EINVAL));
1332
1333 ZFS_ENTER(zfsvfs);
1334 ZFS_VERIFY_ZP(dzp);
1335 os = zfsvfs->z_os;
1336 zilog = zfsvfs->z_log;
1337
1338 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
1339 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1340 ZFS_EXIT(zfsvfs);
1341 return (SET_ERROR(EILSEQ));
1342 }
1343
1344 if (vap->va_mask & ATTR_XVATTR) {
1345 if ((error = secpolicy_xvattr((xvattr_t *)vap,
1346 crgetuid(cr), cr, vap->va_mode)) != 0) {
1347 ZFS_EXIT(zfsvfs);
1348 return (error);
1349 }
1350 }
1351
1352 top:
1353 *ipp = NULL;
1354 if (*name == '\0') {
1355 /*
1356 * Null component name refers to the directory itself.
1357 */
1358 igrab(dip);
1359 zp = dzp;
1360 dl = NULL;
1361 error = 0;
1362 } else {
1363 /* possible igrab(zp) */
1364 int zflg = 0;
1365
1366 if (flag & FIGNORECASE)
1367 zflg |= ZCILOOK;
1368
1369 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1370 NULL, NULL);
1371 if (error) {
1372 if (have_acl)
1373 zfs_acl_ids_free(&acl_ids);
1374 if (strcmp(name, "..") == 0)
1375 error = SET_ERROR(EISDIR);
1376 ZFS_EXIT(zfsvfs);
1377 return (error);
1378 }
1379 }
1380
1381 if (zp == NULL) {
1382 uint64_t txtype;
1383 uint64_t projid = ZFS_DEFAULT_PROJID;
1384
1385 /*
1386 * Create a new file object and update the directory
1387 * to reference it.
1388 */
1389 if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
1390 if (have_acl)
1391 zfs_acl_ids_free(&acl_ids);
1392 goto out;
1393 }
1394
1395 /*
1396 * We only support the creation of regular files in
1397 * extended attribute directories.
1398 */
1399
1400 if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) {
1401 if (have_acl)
1402 zfs_acl_ids_free(&acl_ids);
1403 error = SET_ERROR(EINVAL);
1404 goto out;
1405 }
1406
1407 if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
1408 cr, vsecp, &acl_ids)) != 0)
1409 goto out;
1410 have_acl = B_TRUE;
1411
1412 if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
1413 projid = zfs_inherit_projid(dzp);
1414 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
1415 zfs_acl_ids_free(&acl_ids);
1416 error = SET_ERROR(EDQUOT);
1417 goto out;
1418 }
1419
1420 tx = dmu_tx_create(os);
1421
1422 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1423 ZFS_SA_BASE_ATTR_SIZE);
1424
1425 fuid_dirtied = zfsvfs->z_fuid_dirty;
1426 if (fuid_dirtied)
1427 zfs_fuid_txhold(zfsvfs, tx);
1428 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
1429 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
1430 if (!zfsvfs->z_use_sa &&
1431 acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1432 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1433 0, acl_ids.z_aclp->z_acl_bytes);
1434 }
1435
1436 error = dmu_tx_assign(tx,
1437 (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
1438 if (error) {
1439 zfs_dirent_unlock(dl);
1440 if (error == ERESTART) {
1441 waited = B_TRUE;
1442 dmu_tx_wait(tx);
1443 dmu_tx_abort(tx);
1444 goto top;
1445 }
1446 zfs_acl_ids_free(&acl_ids);
1447 dmu_tx_abort(tx);
1448 ZFS_EXIT(zfsvfs);
1449 return (error);
1450 }
1451 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1452
1453 error = zfs_link_create(dl, zp, tx, ZNEW);
1454 if (error != 0) {
1455 /*
1456 * Since we failed to add the directory entry for it,
1457 * delete the newly created dnode.
1458 */
1459 zfs_znode_delete(zp, tx);
1460 remove_inode_hash(ZTOI(zp));
1461 zfs_acl_ids_free(&acl_ids);
1462 dmu_tx_commit(tx);
1463 goto out;
1464 }
1465
1466 if (fuid_dirtied)
1467 zfs_fuid_sync(zfsvfs, tx);
1468
1469 txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
1470 if (flag & FIGNORECASE)
1471 txtype |= TX_CI;
1472 zfs_log_create(zilog, tx, txtype, dzp, zp, name,
1473 vsecp, acl_ids.z_fuidp, vap);
1474 zfs_acl_ids_free(&acl_ids);
1475 dmu_tx_commit(tx);
1476 } else {
1477 int aflags = (flag & FAPPEND) ? V_APPEND : 0;
1478
1479 if (have_acl)
1480 zfs_acl_ids_free(&acl_ids);
1481 have_acl = B_FALSE;
1482
1483 /*
1484 * A directory entry already exists for this name.
1485 */
1486 /*
1487 * Can't truncate an existing file if in exclusive mode.
1488 */
1489 if (excl) {
1490 error = SET_ERROR(EEXIST);
1491 goto out;
1492 }
1493 /*
1494 * Can't open a directory for writing.
1495 */
1496 if (S_ISDIR(ZTOI(zp)->i_mode)) {
1497 error = SET_ERROR(EISDIR);
1498 goto out;
1499 }
1500 /*
1501 * Verify requested access to file.
1502 */
1503 if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
1504 goto out;
1505 }
1506
1507 mutex_enter(&dzp->z_lock);
1508 dzp->z_seq++;
1509 mutex_exit(&dzp->z_lock);
1510
1511 /*
1512 * Truncate regular files if requested.
1513 */
1514 if (S_ISREG(ZTOI(zp)->i_mode) &&
1515 (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) {
1516 /* we can't hold any locks when calling zfs_freesp() */
1517 if (dl) {
1518 zfs_dirent_unlock(dl);
1519 dl = NULL;
1520 }
1521 error = zfs_freesp(zp, 0, 0, mode, TRUE);
1522 }
1523 }
1524 out:
1525
1526 if (dl)
1527 zfs_dirent_unlock(dl);
1528
1529 if (error) {
1530 if (zp)
1531 iput(ZTOI(zp));
1532 } else {
1533 zfs_inode_update(dzp);
1534 zfs_inode_update(zp);
1535 *ipp = ZTOI(zp);
1536 }
1537
1538 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1539 zil_commit(zilog, 0);
1540
1541 ZFS_EXIT(zfsvfs);
1542 return (error);
1543 }
1544
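/*
 * Create a new, unlinked file object, as for O_TMPFILE. The node is
 * placed on the unlinked set and has no directory entry; it becomes
 * visible only if it is subsequently linked into the namespace.
 *
 * IN: dip - inode of directory the tmpfile is created in.
 * vap - attributes of new file.
 * mode - mode to open file with.
 * cr - credentials of caller.
 * vsecp - ACL to be set
 *
 * OUT: ipp - inode of created file.
 *
 * RETURN: 0 on success, error code on failure.
 */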
1545 /* ARGSUSED */
1546 int
1547 zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl,
1548 int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp)
1549 {
1550 znode_t *zp = NULL, *dzp = ITOZ(dip);
1551 zfsvfs_t *zfsvfs = ITOZSB(dip);
1552 objset_t *os;
1553 dmu_tx_t *tx;
1554 int error;
1555 uid_t uid;
1556 gid_t gid;
1557 zfs_acl_ids_t acl_ids;
1558 uint64_t projid = ZFS_DEFAULT_PROJID;
1559 boolean_t fuid_dirtied;
1560 boolean_t have_acl = B_FALSE;
1561 boolean_t waited = B_FALSE;
1562
1563 /*
1564 * If we have an ephemeral id, ACL, or XVATTR then
1565 * make sure file system is at proper version
1566 */
1567
1568 gid = crgetgid(cr);
1569 uid = crgetuid(cr);
1570
1571 if (zfsvfs->z_use_fuids == B_FALSE &&
1572 (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1573 return (SET_ERROR(EINVAL));
1574
1575 ZFS_ENTER(zfsvfs);
1576 ZFS_VERIFY_ZP(dzp);
1577 os = zfsvfs->z_os;
1578
1579 if (vap->va_mask & ATTR_XVATTR) {
1580 if ((error = secpolicy_xvattr((xvattr_t *)vap,
1581 crgetuid(cr), cr, vap->va_mode)) != 0) {
1582 ZFS_EXIT(zfsvfs);
1583 return (error);
1584 }
1585 }
1586
1587 top:
1588 *ipp = NULL;
1589
1590 /*
1591 * Create a new file object and update the directory
1592 * to reference it.
1593 */
1594 if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
1595 if (have_acl)
1596 zfs_acl_ids_free(&acl_ids);
1597 goto out;
1598 }
1599
1600 if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
1601 cr, vsecp, &acl_ids)) != 0)
1602 goto out;
1603 have_acl = B_TRUE;
1604
1605 if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
1606 projid = zfs_inherit_projid(dzp);
1607 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
1608 zfs_acl_ids_free(&acl_ids);
1609 error = SET_ERROR(EDQUOT);
1610 goto out;
1611 }
1612
1613 tx = dmu_tx_create(os);
1614
1615 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1616 ZFS_SA_BASE_ATTR_SIZE);
1617 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1618
1619 fuid_dirtied = zfsvfs->z_fuid_dirty;
1620 if (fuid_dirtied)
1621 zfs_fuid_txhold(zfsvfs, tx);
1622 if (!zfsvfs->z_use_sa &&
1623 acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1624 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1625 0, acl_ids.z_aclp->z_acl_bytes);
1626 }
1627 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
1628 if (error) {
1629 if (error == ERESTART) {
1630 waited = B_TRUE;
1631 dmu_tx_wait(tx);
1632 dmu_tx_abort(tx);
1633 goto top;
1634 }
1635 zfs_acl_ids_free(&acl_ids);
1636 dmu_tx_abort(tx);
1637 ZFS_EXIT(zfsvfs);
1638 return (error);
1639 }
1640 zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids);
1641
1642 if (fuid_dirtied)
1643 zfs_fuid_sync(zfsvfs, tx);
1644
1645 /* Add to unlinked set */
1646 zp->z_unlinked = 1;
1647 zfs_unlinked_add(zp, tx);
1648 zfs_acl_ids_free(&acl_ids);
1649 dmu_tx_commit(tx);
1650 out:
1651
1652 if (error) {
1653 if (zp)
1654 iput(ZTOI(zp));
1655 } else {
1656 zfs_inode_update(dzp);
1657 zfs_inode_update(zp);
1658 *ipp = ZTOI(zp);
1659 }
1660
1661 ZFS_EXIT(zfsvfs);
1662 return (error);
1663 }
1664
1665 /*
1666 * Remove an entry from a directory.
1667 *
1668 * IN: dip - inode of directory to remove entry from.
1669 * name - name of entry to remove.
1670 * cr - credentials of caller.
1671 *
1672 * RETURN: 0 if success
1673 * error code if failure
1674 *
1675 * Timestamps:
1676 * dip - ctime|mtime
1677 * ip - ctime (if nlink > 0)
1678 */
1679
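/* Written to SA_ZPL_XATTR to clear a znode's xattr directory reference */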
1680 uint64_t null_xattr = 0;
1681
1682 /*ARGSUSED*/
1683 int
1684 zfs_remove(struct inode *dip, char *name, cred_t *cr, int flags)
1685 {
1686 znode_t *zp, *dzp = ITOZ(dip);
1687 znode_t *xzp;
1688 struct inode *ip;
1689 zfsvfs_t *zfsvfs = ITOZSB(dip);
1690 zilog_t *zilog;
1691 uint64_t acl_obj, xattr_obj;
1692 uint64_t xattr_obj_unlinked = 0;
1693 uint64_t obj = 0;
1694 uint64_t links;
1695 zfs_dirlock_t *dl;
1696 dmu_tx_t *tx;
1697 boolean_t may_delete_now, delete_now = FALSE;
1698 boolean_t unlinked, toobig = FALSE;
1699 uint64_t txtype;
1700 pathname_t *realnmp = NULL;
1701 pathname_t realnm;
1702 int error;
1703 int zflg = ZEXISTS;
1704 boolean_t waited = B_FALSE;
1705
1706 if (name == NULL)
1707 return (SET_ERROR(EINVAL));
1708
1709 ZFS_ENTER(zfsvfs);
1710 ZFS_VERIFY_ZP(dzp);
1711 zilog = zfsvfs->z_log;
1712
1713 if (flags & FIGNORECASE) {
1714 zflg |= ZCILOOK;
1715 pn_alloc(&realnm);
1716 realnmp = &realnm;
1717 }
1718
1719 top:
1720 xattr_obj = 0;
1721 xzp = NULL;
1722 /*
1723 * Attempt to lock directory; fail if entry doesn't exist.
1724 */
1725 if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1726 NULL, realnmp))) {
1727 if (realnmp)
1728 pn_free(realnmp);
1729 ZFS_EXIT(zfsvfs);
1730 return (error);
1731 }
1732
1733 ip = ZTOI(zp);
1734
1735 if ((error = zfs_zaccess_delete(dzp, zp, cr))) {
1736 goto out;
1737 }
1738
1739 /*
1740 * Need to use rmdir for removing directories.
1741 */
1742 if (S_ISDIR(ip->i_mode)) {
1743 error = SET_ERROR(EPERM);
1744 goto out;
1745 }
1746
1747 mutex_enter(&zp->z_lock);
1748 may_delete_now = atomic_read(&ip->i_count) == 1 && !(zp->z_is_mapped);
1749 mutex_exit(&zp->z_lock);
1750
1751 /*
1752 * We may delete the znode now, or we may put it in the unlinked set;
1753 * it depends on whether we're the last link, and on whether there are
1754 * other holds on the inode. So we dmu_tx_hold() the right things to
1755 * allow for either case.
1756 */
1757 obj = zp->z_id;
1758 tx = dmu_tx_create(zfsvfs->z_os);
1759 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1760 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1761 zfs_sa_upgrade_txholds(tx, zp);
1762 zfs_sa_upgrade_txholds(tx, dzp);
1763 if (may_delete_now) {
1764 toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks;
1765 /* if the file is too big, only hold_free a token amount */
1766 dmu_tx_hold_free(tx, zp->z_id, 0,
1767 (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
1768 }
1769
1770 /* are there any extended attributes? */
1771 error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1772 &xattr_obj, sizeof (xattr_obj));
1773 if (error == 0 && xattr_obj) {
1774 error = zfs_zget(zfsvfs, xattr_obj, &xzp);
1775 ASSERT0(error);
1776 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1777 dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
1778 }
1779
1780 mutex_enter(&zp->z_lock);
1781 if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
1782 dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
1783 mutex_exit(&zp->z_lock);
1784
1785 /* charge as an update -- would be nice not to charge at all */
1786 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1787
1788 /*
1789 * Mark this transaction as typically resulting in a net free of space
1790 */
1791 dmu_tx_mark_netfree(tx);
1792
1793 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
1794 if (error) {
1795 zfs_dirent_unlock(dl);
1796 if (error == ERESTART) {
1797 waited = B_TRUE;
1798 dmu_tx_wait(tx);
1799 dmu_tx_abort(tx);
1800 iput(ip);
1801 if (xzp)
1802 iput(ZTOI(xzp));
1803 goto top;
1804 }
1805 if (realnmp)
1806 pn_free(realnmp);
1807 dmu_tx_abort(tx);
1808 iput(ip);
1809 if (xzp)
1810 iput(ZTOI(xzp));
1811 ZFS_EXIT(zfsvfs);
1812 return (error);
1813 }
1814
1815 /*
1816 * Remove the directory entry.
1817 */
1818 error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
1819
1820 if (error) {
1821 dmu_tx_commit(tx);
1822 goto out;
1823 }
1824
1825 if (unlinked) {
1826 /*
1827 * Hold z_lock so that we can make sure that the ACL obj
1828 * hasn't changed. Could have been deleted due to
1829 * zfs_sa_upgrade().
1830 */
1831 mutex_enter(&zp->z_lock);
1832 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1833 &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
1834 delete_now = may_delete_now && !toobig &&
1835 atomic_read(&ip->i_count) == 1 && !(zp->z_is_mapped) &&
1836 xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) ==
1837 acl_obj;
1838 }
1839
1840 if (delete_now) {
1841 if (xattr_obj_unlinked) {
1842 ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2);
1843 mutex_enter(&xzp->z_lock);
1844 xzp->z_unlinked = 1;
1845 clear_nlink(ZTOI(xzp));
1846 links = 0;
1847 error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
1848 &links, sizeof (links), tx);
1849 ASSERT3U(error, ==, 0);
1850 mutex_exit(&xzp->z_lock);
1851 zfs_unlinked_add(xzp, tx);
1852
1853 if (zp->z_is_sa)
1854 error = sa_remove(zp->z_sa_hdl,
1855 SA_ZPL_XATTR(zfsvfs), tx);
1856 else
1857 error = sa_update(zp->z_sa_hdl,
1858 SA_ZPL_XATTR(zfsvfs), &null_xattr,
1859 sizeof (uint64_t), tx);
1860 ASSERT0(error);
1861 }
1862 /*
1863 * Add to the unlinked set because a new reference could be
1864 * taken concurrently resulting in a deferred destruction.
1865 */
1866 zfs_unlinked_add(zp, tx);
1867 mutex_exit(&zp->z_lock);
1868 } else if (unlinked) {
1869 mutex_exit(&zp->z_lock);
1870 zfs_unlinked_add(zp, tx);
1871 }
1872
1873 txtype = TX_REMOVE;
1874 if (flags & FIGNORECASE)
1875 txtype |= TX_CI;
1876 zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
1877
1878 dmu_tx_commit(tx);
1879 out:
1880 if (realnmp)
1881 pn_free(realnmp);
1882
1883 zfs_dirent_unlock(dl);
1884 zfs_inode_update(dzp);
1885 zfs_inode_update(zp);
1886
1887 if (delete_now)
1888 iput(ip);
1889 else
1890 zfs_iput_async(ip);
1891
1892 if (xzp) {
1893 zfs_inode_update(xzp);
1894 zfs_iput_async(ZTOI(xzp));
1895 }
1896
1897 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1898 zil_commit(zilog, 0);
1899
1900 ZFS_EXIT(zfsvfs);
1901 return (error);
1902 }
1903
1904 /*
1905 * Create a new directory and insert it into dip using the name
1906 * provided. Return a pointer to the inserted directory.
1907 *
1908 * IN: dip - inode of directory to add subdir to.
1909 * dirname - name of new directory.
1910 * vap - attributes of new directory.
1911 * cr - credentials of caller.
1912 * vsecp - ACL to be set
1913 *
1914 * OUT: ipp - inode of created directory.
1915 *
1916 * RETURN: 0 if success
1917 * error code if failure
1918 *
1919 * Timestamps:
1920 * dip - ctime|mtime updated
1921 * ipp - ctime|mtime|atime updated
1922 */
1923 /*ARGSUSED*/
1924 int
1925 zfs_mkdir(struct inode *dip, char *dirname, vattr_t *vap, struct inode **ipp,
1926 cred_t *cr, int flags, vsecattr_t *vsecp)
1927 {
1928 znode_t *zp, *dzp = ITOZ(dip);
1929 zfsvfs_t *zfsvfs = ITOZSB(dip);
1930 zilog_t *zilog;
1931 zfs_dirlock_t *dl;
1932 uint64_t txtype;
1933 dmu_tx_t *tx;
1934 int error;
1935 int zf = ZNEW;
1936 uid_t uid;
1937 gid_t gid = crgetgid(cr);
1938 zfs_acl_ids_t acl_ids;
1939 boolean_t fuid_dirtied;
1940 boolean_t waited = B_FALSE;
1941
1942 ASSERT(S_ISDIR(vap->va_mode));
1943
1944 /*
1945 * If we have an ephemeral id, ACL, or XVATTR then
1946 * make sure file system is at proper version
1947 */
1948
1949 uid = crgetuid(cr);
1950 if (zfsvfs->z_use_fuids == B_FALSE &&
1951 (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1952 return (SET_ERROR(EINVAL));
1953
1954 if (dirname == NULL)
1955 return (SET_ERROR(EINVAL));
1956
1957 ZFS_ENTER(zfsvfs);
1958 ZFS_VERIFY_ZP(dzp);
1959 zilog = zfsvfs->z_log;
1960
1961 if (dzp->z_pflags & ZFS_XATTR) {
1962 ZFS_EXIT(zfsvfs);
1963 return (SET_ERROR(EINVAL));
1964 }
1965
1966 if (zfsvfs->z_utf8 && u8_validate(dirname,
1967 strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1968 ZFS_EXIT(zfsvfs);
1969 return (SET_ERROR(EILSEQ));
1970 }
1971 if (flags & FIGNORECASE)
1972 zf |= ZCILOOK;
1973
1974 if (vap->va_mask & ATTR_XVATTR) {
1975 if ((error = secpolicy_xvattr((xvattr_t *)vap,
1976 crgetuid(cr), cr, vap->va_mode)) != 0) {
1977 ZFS_EXIT(zfsvfs);
1978 return (error);
1979 }
1980 }
1981
1982 if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
1983 vsecp, &acl_ids)) != 0) {
1984 ZFS_EXIT(zfsvfs);
1985 return (error);
1986 }
1987 /*
1988 * First make sure the new directory doesn't exist.
1989 *
1990 * Existence is checked first to make sure we don't return
1991 * EACCES instead of EEXIST which can cause some applications
1992 * to fail.
1993 */
1994 top:
1995 *ipp = NULL;
1996
1997 if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
1998 NULL, NULL))) {
1999 zfs_acl_ids_free(&acl_ids);
2000 ZFS_EXIT(zfsvfs);
2001 return (error);
2002 }
2003
2004 if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) {
2005 zfs_acl_ids_free(&acl_ids);
2006 zfs_dirent_unlock(dl);
2007 ZFS_EXIT(zfsvfs);
2008 return (error);
2009 }
2010
2011 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) {
2012 zfs_acl_ids_free(&acl_ids);
2013 zfs_dirent_unlock(dl);
2014 ZFS_EXIT(zfsvfs);
2015 return (SET_ERROR(EDQUOT));
2016 }
2017
2018 /*
2019 * Add a new entry to the directory.
2020 */
2021 tx = dmu_tx_create(zfsvfs->z_os);
2022 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
2023 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
2024 fuid_dirtied = zfsvfs->z_fuid_dirty;
2025 if (fuid_dirtied)
2026 zfs_fuid_txhold(zfsvfs, tx);
2027 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2028 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
2029 acl_ids.z_aclp->z_acl_bytes);
2030 }
2031
2032 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
2033 ZFS_SA_BASE_ATTR_SIZE);
2034
2035 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
2036 if (error) {
2037 zfs_dirent_unlock(dl);
2038 if (error == ERESTART) {
2039 waited = B_TRUE;
2040 dmu_tx_wait(tx);
2041 dmu_tx_abort(tx);
2042 goto top;
2043 }
2044 zfs_acl_ids_free(&acl_ids);
2045 dmu_tx_abort(tx);
2046 ZFS_EXIT(zfsvfs);
2047 return (error);
2048 }
2049
2050 /*
2051 * Create new node.
2052 */
2053 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
2054
2055 /*
2056 * Now put new name in parent dir.
2057 */
2058 error = zfs_link_create(dl, zp, tx, ZNEW);
2059 if (error != 0) {
2060 zfs_znode_delete(zp, tx);
2061 remove_inode_hash(ZTOI(zp));
2062 goto out;
2063 }
2064
2065 if (fuid_dirtied)
2066 zfs_fuid_sync(zfsvfs, tx);
2067
2068 *ipp = ZTOI(zp);
2069
2070 txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
2071 if (flags & FIGNORECASE)
2072 txtype |= TX_CI;
2073 zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
2074 acl_ids.z_fuidp, vap);
2075
2076 out:
2077 zfs_acl_ids_free(&acl_ids);
2078
2079 dmu_tx_commit(tx);
2080
2081 zfs_dirent_unlock(dl);
2082
2083 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2084 zil_commit(zilog, 0);
2085
2086 if (error != 0) {
2087 iput(ZTOI(zp));
2088 } else {
2089 zfs_inode_update(dzp);
2090 zfs_inode_update(zp);
2091 }
2092 ZFS_EXIT(zfsvfs);
2093 return (error);
2094 }
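
/*
 * The ERESTART handling above is a pattern repeated by most write paths
 * in this file: assign the transaction with TXG_NOWAIT; if the pool is
 * write-throttled, drop the locks, wait for the next txg, and retry with
 * TXG_NOTHROTTLE. A minimal sketch of the idiom follows (the function
 * name is illustrative only, and the dmu_tx_hold_*() calls are elided):
 */
#if 0
static int
example_assign_with_retry(objset_t *os)
{
	dmu_tx_t *tx;
	boolean_t waited = B_FALSE;
	int error;

top:
	tx = dmu_tx_create(os);
	/* ... dmu_tx_hold_*() calls describing the intended writes ... */
	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
	if (error) {
		if (error == ERESTART) {
			/* Throttled: wait out the open txg, then retry. */
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		return (error);
	}
	/* ... perform the writes, zfs_log_*() the change ... */
	dmu_tx_commit(tx);
	return (0);
}
#endif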
2095
2096 /*
2097 * Remove a directory subdir entry. If the current working
2098 * directory is the same as the subdir to be removed, the
2099 * remove will fail.
2100 *
2101 * IN: dip - inode of directory to remove from.
2102 * name - name of directory to be removed.
2103 * cwd - inode of current working directory.
2104 * cr - credentials of caller.
2105 * flags - case flags
2106 *
2107 * RETURN: 0 on success, error code on failure.
2108 *
2109 * Timestamps:
2110 * dip - ctime|mtime updated
2111 */
2112 /*ARGSUSED*/
2113 int
2114 zfs_rmdir(struct inode *dip, char *name, struct inode *cwd, cred_t *cr,
2115 int flags)
2116 {
2117 znode_t *dzp = ITOZ(dip);
2118 znode_t *zp;
2119 struct inode *ip;
2120 zfsvfs_t *zfsvfs = ITOZSB(dip);
2121 zilog_t *zilog;
2122 zfs_dirlock_t *dl;
2123 dmu_tx_t *tx;
2124 int error;
2125 int zflg = ZEXISTS;
2126 boolean_t waited = B_FALSE;
2127
2128 if (name == NULL)
2129 return (SET_ERROR(EINVAL));
2130
2131 ZFS_ENTER(zfsvfs);
2132 ZFS_VERIFY_ZP(dzp);
2133 zilog = zfsvfs->z_log;
2134
2135 if (flags & FIGNORECASE)
2136 zflg |= ZCILOOK;
2137 top:
2138 zp = NULL;
2139
2140 /*
2141 * Attempt to lock directory; fail if entry doesn't exist.
2142 */
2143 if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
2144 NULL, NULL))) {
2145 ZFS_EXIT(zfsvfs);
2146 return (error);
2147 }
2148
2149 ip = ZTOI(zp);
2150
2151 if ((error = zfs_zaccess_delete(dzp, zp, cr))) {
2152 goto out;
2153 }
2154
2155 if (!S_ISDIR(ip->i_mode)) {
2156 error = SET_ERROR(ENOTDIR);
2157 goto out;
2158 }
2159
2160 if (ip == cwd) {
2161 error = SET_ERROR(EINVAL);
2162 goto out;
2163 }
2164
2165 /*
2166 * Grab a lock on the directory to make sure that no one is
2167 * trying to add (or lookup) entries while we are removing it.
2168 */
2169 rw_enter(&zp->z_name_lock, RW_WRITER);
2170
2171 /*
2172 * Grab a lock on the parent pointer to make sure we play well
2173 * with the treewalk and directory rename code.
2174 */
2175 rw_enter(&zp->z_parent_lock, RW_WRITER);
2176
2177 tx = dmu_tx_create(zfsvfs->z_os);
2178 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
2179 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2180 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2181 zfs_sa_upgrade_txholds(tx, zp);
2182 zfs_sa_upgrade_txholds(tx, dzp);
2183 dmu_tx_mark_netfree(tx);
2184 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
2185 if (error) {
2186 rw_exit(&zp->z_parent_lock);
2187 rw_exit(&zp->z_name_lock);
2188 zfs_dirent_unlock(dl);
2189 if (error == ERESTART) {
2190 waited = B_TRUE;
2191 dmu_tx_wait(tx);
2192 dmu_tx_abort(tx);
2193 iput(ip);
2194 goto top;
2195 }
2196 dmu_tx_abort(tx);
2197 iput(ip);
2198 ZFS_EXIT(zfsvfs);
2199 return (error);
2200 }
2201
2202 error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
2203
2204 if (error == 0) {
2205 uint64_t txtype = TX_RMDIR;
2206 if (flags & FIGNORECASE)
2207 txtype |= TX_CI;
2208 zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
2209 }
2210
2211 dmu_tx_commit(tx);
2212
2213 rw_exit(&zp->z_parent_lock);
2214 rw_exit(&zp->z_name_lock);
2215 out:
2216 zfs_dirent_unlock(dl);
2217
2218 zfs_inode_update(dzp);
2219 zfs_inode_update(zp);
2220 iput(ip);
2221
2222 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2223 zil_commit(zilog, 0);
2224
2225 ZFS_EXIT(zfsvfs);
2226 return (error);
2227 }
2228
2229 /*
2230 * Read as many directory entries as will fit into the provided
2231 * dirent buffer from the given directory cursor position.
2232 *
2233 * IN: ip - inode of directory to read.
2234 * dirent - buffer for directory entries.
2235 *
2236 * OUT: dirent - buffer filled in with directory entries.
2237 *
2238 * RETURN: 0 if success
2239 * error code if failure
2240 *
2241 * Timestamps:
2242 * ip - atime updated
2243 *
2244 * Note that the low 4 bits of the cookie returned by zap are always zero.
2245 * This allows us to use the low range for "special" directory entries:
2246 * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem,
2247 * we use the offset 2 for the '.zfs' directory.
2248 */
2249 /* ARGSUSED */
2250 int
2251 zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr)
2252 {
2253 znode_t *zp = ITOZ(ip);
2254 zfsvfs_t *zfsvfs = ITOZSB(ip);
2255 objset_t *os;
2256 zap_cursor_t zc;
2257 zap_attribute_t zap;
2258 int error;
2259 uint8_t prefetch;
2260 uint8_t type;
2261 int done = 0;
2262 uint64_t parent;
2263 uint64_t offset; /* must be unsigned; checks for < 1 */
2264
2265 ZFS_ENTER(zfsvfs);
2266 ZFS_VERIFY_ZP(zp);
2267
2268 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
2269 &parent, sizeof (parent))) != 0)
2270 goto out;
2271
2272 /*
2273 * Quit if the directory has been removed (POSIX).
2274 */
2275 if (zp->z_unlinked)
2276 goto out;
2277
2278 error = 0;
2279 os = zfsvfs->z_os;
2280 offset = ctx->pos;
2281 prefetch = zp->z_zn_prefetch;
2282
2283 /*
2284 * Initialize the iterator cursor.
2285 */
2286 if (offset <= 3) {
2287 /*
2288 * Start iteration from the beginning of the directory.
2289 */
2290 zap_cursor_init(&zc, os, zp->z_id);
2291 } else {
2292 /*
2293 * The offset is a serialized cursor.
2294 */
2295 zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
2296 }
2297
2298 /*
2299 * Transform to file-system independent format
2300 */
2301 while (!done) {
2302 uint64_t objnum;
2303 /*
2304 * Special case `.', `..', and `.zfs'.
2305 */
2306 if (offset == 0) {
2307 (void) strcpy(zap.za_name, ".");
2308 zap.za_normalization_conflict = 0;
2309 objnum = zp->z_id;
2310 type = DT_DIR;
2311 } else if (offset == 1) {
2312 (void) strcpy(zap.za_name, "..");
2313 zap.za_normalization_conflict = 0;
2314 objnum = parent;
2315 type = DT_DIR;
2316 } else if (offset == 2 && zfs_show_ctldir(zp)) {
2317 (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
2318 zap.za_normalization_conflict = 0;
2319 objnum = ZFSCTL_INO_ROOT;
2320 type = DT_DIR;
2321 } else {
2322 /*
2323 * Grab next entry.
2324 */
2325 if ((error = zap_cursor_retrieve(&zc, &zap))) {
2326 if (error == ENOENT)
2327 break;
2328 else
2329 goto update;
2330 }
2331
2332 /*
2333 * Allow multiple entries provided the first entry is
2334 * the object id. Non-zpl consumers may safely make
2335 * use of the additional space.
2336 *
2337 * XXX: This should be a feature flag for compatibility
2338 */
2339 if (zap.za_integer_length != 8 ||
2340 zap.za_num_integers == 0) {
2341 cmn_err(CE_WARN, "zap_readdir: bad directory "
2342 "entry, obj = %lld, offset = %lld, "
2343 "length = %d, num = %lld\n",
2344 (u_longlong_t)zp->z_id,
2345 (u_longlong_t)offset,
2346 zap.za_integer_length,
2347 (u_longlong_t)zap.za_num_integers);
2348 error = SET_ERROR(ENXIO);
2349 goto update;
2350 }
2351
2352 objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
2353 type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2354 }
2355
2356 done = !zpl_dir_emit(ctx, zap.za_name, strlen(zap.za_name),
2357 objnum, type);
2358 if (done)
2359 break;
2360
2361 /* Prefetch znode */
2362 if (prefetch) {
2363 dmu_prefetch(os, objnum, 0, 0, 0,
2364 ZIO_PRIORITY_SYNC_READ);
2365 }
2366
2367 /*
2368 * Move to the next entry, fill in the previous offset.
2369 */
2370 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
2371 zap_cursor_advance(&zc);
2372 offset = zap_cursor_serialize(&zc);
2373 } else {
2374 offset += 1;
2375 }
2376 ctx->pos = offset;
2377 }
2378 zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
2379
2380 update:
2381 zap_cursor_fini(&zc);
2382 if (error == ENOENT)
2383 error = 0;
2384 out:
2385 ZFS_EXIT(zfsvfs);
2386
2387 return (error);
2388 }
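
/*
 * Offset layout used by zfs_readdir() above, in isolation: 0 and 1 are
 * the synthetic '.' and '..' entries, 2 is '.zfs' when the control
 * directory is visible, and any larger value is a serialized ZAP cursor.
 * Because the low 4 bits of a ZAP cookie are always zero, the special
 * values cannot collide with a real cursor. A hypothetical classifier,
 * for illustration only:
 */
#if 0
static const char *
example_classify_offset(znode_t *zp, uint64_t offset)
{
	if (offset == 0)
		return (".");
	if (offset == 1)
		return ("..");
	if (offset == 2 && zfs_show_ctldir(zp))
		return (ZFS_CTLDIR_NAME);
	return ("serialized ZAP cursor");
}
#endif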
2389
2390 ulong_t zfs_fsync_sync_cnt = 4;
2391
2392 int
2393 zfs_fsync(struct inode *ip, int syncflag, cred_t *cr)
2394 {
2395 znode_t *zp = ITOZ(ip);
2396 zfsvfs_t *zfsvfs = ITOZSB(ip);
2397
2398 (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
2399
2400 if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
2401 ZFS_ENTER(zfsvfs);
2402 ZFS_VERIFY_ZP(zp);
2403 zil_commit(zfsvfs->z_log, zp->z_id);
2404 ZFS_EXIT(zfsvfs);
2405 }
2406 tsd_set(zfs_fsyncer_key, NULL);
2407
2408 return (0);
2409 }
2410
2411
2412 /*
2413 * Get the requested file attributes and place them in the provided
2414 * vattr structure.
2415 *
2416 * IN: ip - inode of file.
2417 * vap - va_mask identifies requested attributes.
2418 * If ATTR_XVATTR set, then optional attrs are requested
2419 * flags - ATTR_NOACLCHECK (CIFS server context)
2420 * cr - credentials of caller.
2421 *
2422 * OUT: vap - attribute values.
2423 *
2424 * RETURN: 0 (always succeeds)
2425 */
2426 /* ARGSUSED */
2427 int
2428 zfs_getattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr)
2429 {
2430 znode_t *zp = ITOZ(ip);
2431 zfsvfs_t *zfsvfs = ITOZSB(ip);
2432 int error = 0;
2433 uint64_t links;
2434 uint64_t atime[2], mtime[2], ctime[2];
2435 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
2436 xoptattr_t *xoap = NULL;
2437 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2438 sa_bulk_attr_t bulk[3];
2439 int count = 0;
2440
2441 ZFS_ENTER(zfsvfs);
2442 ZFS_VERIFY_ZP(zp);
2443
2444 zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
2445
2446 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
2447 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
2448 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
2449
2450 if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
2451 ZFS_EXIT(zfsvfs);
2452 return (error);
2453 }
2454
2455 /*
2456 * If the ACL is trivial, don't bother looking for ACE_READ_ATTRIBUTES.
2457 * Also, if we are the owner, don't bother, since the owner should
2458 * always be allowed to read the basic attributes of the file.
2459 */
2460 if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
2461 (vap->va_uid != crgetuid(cr))) {
2462 if ((error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
2463 skipaclchk, cr))) {
2464 ZFS_EXIT(zfsvfs);
2465 return (error);
2466 }
2467 }
2468
2469 /*
2470 * Return all attributes. It's cheaper to provide the answer
2471 * than to determine whether we were asked the question.
2472 */
2473
2474 mutex_enter(&zp->z_lock);
2475 vap->va_type = vn_mode_to_vtype(zp->z_mode);
2476 vap->va_mode = zp->z_mode;
2477 vap->va_fsid = ZTOI(zp)->i_sb->s_dev;
2478 vap->va_nodeid = zp->z_id;
2479 if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp))
2480 links = ZTOI(zp)->i_nlink + 1;
2481 else
2482 links = ZTOI(zp)->i_nlink;
2483 vap->va_nlink = MIN(links, ZFS_LINK_MAX);
2484 vap->va_size = i_size_read(ip);
2485 vap->va_rdev = ip->i_rdev;
2486 vap->va_seq = ip->i_generation;
2487
2488 /*
2489 * Add in any requested optional attributes and the create time.
2490 * Also set the corresponding bits in the returned attribute bitmap.
2491 */
2492 if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
2493 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
2494 xoap->xoa_archive =
2495 ((zp->z_pflags & ZFS_ARCHIVE) != 0);
2496 XVA_SET_RTN(xvap, XAT_ARCHIVE);
2497 }
2498
2499 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
2500 xoap->xoa_readonly =
2501 ((zp->z_pflags & ZFS_READONLY) != 0);
2502 XVA_SET_RTN(xvap, XAT_READONLY);
2503 }
2504
2505 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
2506 xoap->xoa_system =
2507 ((zp->z_pflags & ZFS_SYSTEM) != 0);
2508 XVA_SET_RTN(xvap, XAT_SYSTEM);
2509 }
2510
2511 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
2512 xoap->xoa_hidden =
2513 ((zp->z_pflags & ZFS_HIDDEN) != 0);
2514 XVA_SET_RTN(xvap, XAT_HIDDEN);
2515 }
2516
2517 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2518 xoap->xoa_nounlink =
2519 ((zp->z_pflags & ZFS_NOUNLINK) != 0);
2520 XVA_SET_RTN(xvap, XAT_NOUNLINK);
2521 }
2522
2523 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2524 xoap->xoa_immutable =
2525 ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
2526 XVA_SET_RTN(xvap, XAT_IMMUTABLE);
2527 }
2528
2529 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2530 xoap->xoa_appendonly =
2531 ((zp->z_pflags & ZFS_APPENDONLY) != 0);
2532 XVA_SET_RTN(xvap, XAT_APPENDONLY);
2533 }
2534
2535 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2536 xoap->xoa_nodump =
2537 ((zp->z_pflags & ZFS_NODUMP) != 0);
2538 XVA_SET_RTN(xvap, XAT_NODUMP);
2539 }
2540
2541 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
2542 xoap->xoa_opaque =
2543 ((zp->z_pflags & ZFS_OPAQUE) != 0);
2544 XVA_SET_RTN(xvap, XAT_OPAQUE);
2545 }
2546
2547 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2548 xoap->xoa_av_quarantined =
2549 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
2550 XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
2551 }
2552
2553 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2554 xoap->xoa_av_modified =
2555 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
2556 XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
2557 }
2558
2559 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
2560 S_ISREG(ip->i_mode)) {
2561 zfs_sa_get_scanstamp(zp, xvap);
2562 }
2563
2564 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
2565 uint64_t times[2];
2566
2567 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
2568 times, sizeof (times));
2569 ZFS_TIME_DECODE(&xoap->xoa_createtime, times);
2570 XVA_SET_RTN(xvap, XAT_CREATETIME);
2571 }
2572
2573 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2574 xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
2575 XVA_SET_RTN(xvap, XAT_REPARSE);
2576 }
2577 if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
2578 xoap->xoa_generation = ip->i_generation;
2579 XVA_SET_RTN(xvap, XAT_GEN);
2580 }
2581
2582 if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
2583 xoap->xoa_offline =
2584 ((zp->z_pflags & ZFS_OFFLINE) != 0);
2585 XVA_SET_RTN(xvap, XAT_OFFLINE);
2586 }
2587
2588 if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
2589 xoap->xoa_sparse =
2590 ((zp->z_pflags & ZFS_SPARSE) != 0);
2591 XVA_SET_RTN(xvap, XAT_SPARSE);
2592 }
2593
2594 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
2595 xoap->xoa_projinherit =
2596 ((zp->z_pflags & ZFS_PROJINHERIT) != 0);
2597 XVA_SET_RTN(xvap, XAT_PROJINHERIT);
2598 }
2599
2600 if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
2601 xoap->xoa_projid = zp->z_projid;
2602 XVA_SET_RTN(xvap, XAT_PROJID);
2603 }
2604 }
2605
2606 ZFS_TIME_DECODE(&vap->va_atime, atime);
2607 ZFS_TIME_DECODE(&vap->va_mtime, mtime);
2608 ZFS_TIME_DECODE(&vap->va_ctime, ctime);
2609
2610 mutex_exit(&zp->z_lock);
2611
2612 sa_object_size(zp->z_sa_hdl, &vap->va_blksize, &vap->va_nblocks);
2613
2614 if (zp->z_blksz == 0) {
2615 /*
2616 * Block size hasn't been set; suggest maximal I/O transfers.
2617 */
2618 vap->va_blksize = zfsvfs->z_max_blksz;
2619 }
2620
2621 ZFS_EXIT(zfsvfs);
2622 return (0);
2623 }
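
/*
 * The timestamp fetch at the top of zfs_getattr() uses the SA bulk
 * interface: register (attribute, buffer) pairs with SA_ADD_BULK_ATTR()
 * and fetch them all in a single sa_bulk_lookup() call rather than one
 * lookup per attribute. Reduced to its essentials (the function name is
 * illustrative only):
 */
#if 0
static int
example_lookup_times(zfsvfs_t *zfsvfs, znode_t *zp,
    uint64_t atime[2], uint64_t mtime[2], uint64_t ctime[2])
{
	sa_bulk_attr_t bulk[3];
	int count = 0;

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, atime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
	return (sa_bulk_lookup(zp->z_sa_hdl, bulk, count));
}
#endif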
2624
2625 /*
2626 * Get the basic file attributes and place them in the provided kstat
2627 * structure. The inode is assumed to be the authoritative source
2628 * for most of the attributes. However, the znode currently has the
2629 * authoritative atime, blksize, and block count.
2630 *
2631 * IN: ip - inode of file.
2632 *
2633 * OUT: sp - kstat values.
2634 *
2635 * RETURN: 0 (always succeeds)
2636 */
2637 /* ARGSUSED */
2638 int
2639 zfs_getattr_fast(struct inode *ip, struct kstat *sp)
2640 {
2641 znode_t *zp = ITOZ(ip);
2642 zfsvfs_t *zfsvfs = ITOZSB(ip);
2643 uint32_t blksize;
2644 u_longlong_t nblocks;
2645
2646 ZFS_ENTER(zfsvfs);
2647 ZFS_VERIFY_ZP(zp);
2648
2649 mutex_enter(&zp->z_lock);
2650
2651 generic_fillattr(ip, sp);
2652
2653 sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
2654 sp->blksize = blksize;
2655 sp->blocks = nblocks;
2656
2657 if (unlikely(zp->z_blksz == 0)) {
2658 /*
2659 * Block size hasn't been set; suggest maximal I/O transfers.
2660 */
2661 sp->blksize = zfsvfs->z_max_blksz;
2662 }
2663
2664 mutex_exit(&zp->z_lock);
2665
2666 /*
2667 * Required to prevent the NFS client from detecting different inode
2668 * numbers for the snapshot root dentry before and after snapshot mount.
2669 */
2670 if (zfsvfs->z_issnap) {
2671 if (ip->i_sb->s_root->d_inode == ip)
2672 sp->ino = ZFSCTL_INO_SNAPDIRS -
2673 dmu_objset_id(zfsvfs->z_os);
2674 }
2675
2676 ZFS_EXIT(zfsvfs);
2677
2678 return (0);
2679 }
2680
2681 /*
2682 * When changing a file's user/group/project, we must handle not only the
2683 * main object that is assigned to the file directly, but also the objects
2684 * that the file uses via its hidden xattr directory.
2685 *
2686 * Because the xattr directory may contain many EA entries, it may be
2687 * impossible to change all of them within the single transaction that
2688 * changes the main object's user/group/project attributes. Instead, we
2689 * change them one by one via separate, independent transactions. This is
2690 * not an ideal solution, but we have no better one yet.
2691 */
2692 static int
2693 zfs_setattr_dir(znode_t *dzp)
2694 {
2695 struct inode *dxip = ZTOI(dzp);
2696 struct inode *xip = NULL;
2697 zfsvfs_t *zfsvfs = ITOZSB(dxip);
2698 objset_t *os = zfsvfs->z_os;
2699 zap_cursor_t zc;
2700 zap_attribute_t zap;
2701 zfs_dirlock_t *dl;
2702 znode_t *zp;
2703 dmu_tx_t *tx = NULL;
2704 uint64_t uid, gid;
2705 sa_bulk_attr_t bulk[4];
2706 int count = 0;
2707 int err;
2708
2709 zap_cursor_init(&zc, os, dzp->z_id);
2710 while ((err = zap_cursor_retrieve(&zc, &zap)) == 0) {
2711 if (zap.za_integer_length != 8 || zap.za_num_integers != 1) {
2712 err = ENXIO;
2713 break;
2714 }
2715
2716 err = zfs_dirent_lock(&dl, dzp, (char *)zap.za_name, &zp,
2717 ZEXISTS, NULL, NULL);
2718 if (err == ENOENT)
2719 goto next;
2720 if (err)
2721 break;
2722
2723 xip = ZTOI(zp);
2724 if (KUID_TO_SUID(xip->i_uid) == KUID_TO_SUID(dxip->i_uid) &&
2725 KGID_TO_SGID(xip->i_gid) == KGID_TO_SGID(dxip->i_gid) &&
2726 zp->z_projid == dzp->z_projid)
2727 goto next;
2728
2729 tx = dmu_tx_create(os);
2730 if (!(zp->z_pflags & ZFS_PROJID))
2731 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
2732 else
2733 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2734
2735 err = dmu_tx_assign(tx, TXG_WAIT);
2736 if (err)
2737 break;
2738
2739 mutex_enter(&dzp->z_lock);
2740
2741 if (KUID_TO_SUID(xip->i_uid) != KUID_TO_SUID(dxip->i_uid)) {
2742 xip->i_uid = dxip->i_uid;
2743 uid = zfs_uid_read(dxip);
2744 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
2745 &uid, sizeof (uid));
2746 }
2747
2748 if (KGID_TO_SGID(xip->i_gid) != KGID_TO_SGID(dxip->i_gid)) {
2749 xip->i_gid = dxip->i_gid;
2750 gid = zfs_gid_read(dxip);
2751 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
2752 &gid, sizeof (gid));
2753 }
2754
2755 if (zp->z_projid != dzp->z_projid) {
2756 if (!(zp->z_pflags & ZFS_PROJID)) {
2757 zp->z_pflags |= ZFS_PROJID;
2758 SA_ADD_BULK_ATTR(bulk, count,
2759 SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags,
2760 sizeof (zp->z_pflags));
2761 }
2762
2763 zp->z_projid = dzp->z_projid;
2764 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs),
2765 NULL, &zp->z_projid, sizeof (zp->z_projid));
2766 }
2767
2768 mutex_exit(&dzp->z_lock);
2769
2770 if (likely(count > 0)) {
2771 err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
2772 dmu_tx_commit(tx);
2773 } else {
2774 dmu_tx_abort(tx);
2775 }
2776 tx = NULL;
2777 if (err != 0 && err != ENOENT)
2778 break;
2779
2780 next:
2781 if (xip) {
2782 iput(xip);
2783 xip = NULL;
2784 zfs_dirent_unlock(dl);
2785 }
2786 zap_cursor_advance(&zc);
2787 }
2788
2789 if (tx)
2790 dmu_tx_abort(tx);
2791 if (xip) {
2792 iput(xip);
2793 zfs_dirent_unlock(dl);
2794 }
2795 zap_cursor_fini(&zc);
2796
2797 return (err == ENOENT ? 0 : err);
2798 }
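
/*
 * The loop above follows the standard ZAP iteration idiom used
 * throughout this file: zap_cursor_init(), a retrieve/advance loop, and
 * zap_cursor_fini(), with ENOENT signalling a clean end of the ZAP.
 * The bare skeleton, with the per-entry work factored into a
 * hypothetical callback:
 */
#if 0
static int
example_walk_zap(objset_t *os, uint64_t zapobj,
    int (*visit)(const zap_attribute_t *za))
{
	zap_cursor_t zc;
	zap_attribute_t za;
	int err;

	zap_cursor_init(&zc, os, zapobj);
	while ((err = zap_cursor_retrieve(&zc, &za)) == 0) {
		if ((err = visit(&za)) != 0)
			break;
		zap_cursor_advance(&zc);
	}
	zap_cursor_fini(&zc);
	return (err == ENOENT ? 0 : err);	/* ENOENT means "done" */
}
#endif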
2799
2800 /*
2801 * Set the file attributes to the values contained in the
2802 * vattr structure.
2803 *
2804 * IN: ip - inode of file to be modified.
2805 * vap - new attribute values.
2806 * If ATTR_XVATTR set, then optional attrs are being set
2807 * flags - ATTR_UTIME set if non-default time values provided.
2808 * - ATTR_NOACLCHECK (CIFS context only).
2809 * cr - credentials of caller.
2810 *
2811 * RETURN: 0 if success
2812 * error code if failure
2813 *
2814 * Timestamps:
2815 * ip - ctime updated, mtime updated if size changed.
2816 */
2817 /* ARGSUSED */
2818 int
2819 zfs_setattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr)
2820 {
2821 znode_t *zp = ITOZ(ip);
2822 zfsvfs_t *zfsvfs = ITOZSB(ip);
2823 objset_t *os = zfsvfs->z_os;
2824 zilog_t *zilog;
2825 dmu_tx_t *tx;
2826 vattr_t oldva;
2827 xvattr_t *tmpxvattr;
2828 uint_t mask = vap->va_mask;
2829 uint_t saved_mask = 0;
2830 int trim_mask = 0;
2831 uint64_t new_mode;
2832 uint64_t new_kuid = 0, new_kgid = 0, new_uid, new_gid;
2833 uint64_t xattr_obj;
2834 uint64_t mtime[2], ctime[2], atime[2];
2835 uint64_t projid = ZFS_INVALID_PROJID;
2836 znode_t *attrzp;
2837 int need_policy = FALSE;
2838 int err, err2 = 0;
2839 zfs_fuid_info_t *fuidp = NULL;
2840 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
2841 xoptattr_t *xoap;
2842 zfs_acl_t *aclp;
2843 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2844 boolean_t fuid_dirtied = B_FALSE;
2845 boolean_t handle_eadir = B_FALSE;
2846 sa_bulk_attr_t *bulk, *xattr_bulk;
2847 int count = 0, xattr_count = 0, bulks = 8;
2848
2849 if (mask == 0)
2850 return (0);
2851
2852 ZFS_ENTER(zfsvfs);
2853 ZFS_VERIFY_ZP(zp);
2854
2855 /*
2856 * If this is a xvattr_t, then get a pointer to the structure of
2857 * optional attributes. If this is NULL, then we have a vattr_t.
2858 */
2859 xoap = xva_getxoptattr(xvap);
2860 if (xoap != NULL && (mask & ATTR_XVATTR)) {
2861 if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
2862 if (!dmu_objset_projectquota_enabled(os) ||
2863 (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) {
2864 ZFS_EXIT(zfsvfs);
2865 return (SET_ERROR(ENOTSUP));
2866 }
2867
2868 projid = xoap->xoa_projid;
2869 if (unlikely(projid == ZFS_INVALID_PROJID)) {
2870 ZFS_EXIT(zfsvfs);
2871 return (SET_ERROR(EINVAL));
2872 }
2873
2874 if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID)
2875 projid = ZFS_INVALID_PROJID;
2876 else
2877 need_policy = TRUE;
2878 }
2879
2880 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) &&
2881 (xoap->xoa_projinherit !=
2882 ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) &&
2883 (!dmu_objset_projectquota_enabled(os) ||
2884 (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) {
2885 ZFS_EXIT(zfsvfs);
2886 return (SET_ERROR(ENOTSUP));
2887 }
2888 }
2889
2890 zilog = zfsvfs->z_log;
2891
2892 /*
2893 * Make sure that if an ephemeral uid/gid or an xvattr is specified,
2894 * the file system is at the proper version level.
2895 */
2896
2897 if (zfsvfs->z_use_fuids == B_FALSE &&
2898 (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2899 ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) ||
2900 (mask & ATTR_XVATTR))) {
2901 ZFS_EXIT(zfsvfs);
2902 return (SET_ERROR(EINVAL));
2903 }
2904
2905 if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) {
2906 ZFS_EXIT(zfsvfs);
2907 return (SET_ERROR(EISDIR));
2908 }
2909
2910 if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) {
2911 ZFS_EXIT(zfsvfs);
2912 return (SET_ERROR(EINVAL));
2913 }
2914
2915 tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP);
2916 xva_init(tmpxvattr);
2917
2918 bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
2919 xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
2920
2921 /*
2922 * On immutable files, only the immutable bit and atime may be altered.
2923 */
2924 if ((zp->z_pflags & ZFS_IMMUTABLE) &&
2925 ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) ||
2926 ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
2927 err = SET_ERROR(EPERM);
2928 goto out3;
2929 }
2930
2931 if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
2932 err = SET_ERROR(EPERM);
2933 goto out3;
2934 }
2935
2936 /*
2937 * Verify that the timestamps don't overflow 32 bits.
2938 * ZFS can handle large timestamps, but 32-bit syscalls can't
2939 * handle times greater than 2039. This check should be removed
2940 * once large timestamps are fully supported.
2941 */
2942 if (mask & (ATTR_ATIME | ATTR_MTIME)) {
2943 if (((mask & ATTR_ATIME) &&
2944 TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2945 ((mask & ATTR_MTIME) &&
2946 TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2947 err = SET_ERROR(EOVERFLOW);
2948 goto out3;
2949 }
2950 }
2951
2952 top:
2953 attrzp = NULL;
2954 aclp = NULL;
2955
2956 /* Can this be moved to before the top label? */
2957 if (zfs_is_readonly(zfsvfs)) {
2958 err = SET_ERROR(EROFS);
2959 goto out3;
2960 }
2961
2962 /*
2963 * First validate permissions
2964 */
2965
2966 if (mask & ATTR_SIZE) {
2967 err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr);
2968 if (err)
2969 goto out3;
2970
2971 /*
2972 * XXX - Note, we are not providing any open
2973 * mode flags here (like FNDELAY), so we may
2974 * block if there are locks present... this
2975 * should be addressed in openat().
2976 */
2977 /* XXX - would it be OK to generate a log record here? */
2978 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2979 if (err)
2980 goto out3;
2981 }
2982
2983 if (mask & (ATTR_ATIME|ATTR_MTIME) ||
2984 ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
2985 XVA_ISSET_REQ(xvap, XAT_READONLY) ||
2986 XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
2987 XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
2988 XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
2989 XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
2990 XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
2991 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
2992 skipaclchk, cr);
2993 }
2994
2995 if (mask & (ATTR_UID|ATTR_GID)) {
2996 int idmask = (mask & (ATTR_UID|ATTR_GID));
2997 int take_owner;
2998 int take_group;
2999
3000 /*
3001 * NOTE: even if a new mode is being set,
3002 * we may clear S_ISUID/S_ISGID bits.
3003 */
3004
3005 if (!(mask & ATTR_MODE))
3006 vap->va_mode = zp->z_mode;
3007
3008 /*
3009 * Take ownership or chgrp to group we are a member of
3010 */
3011
3012 take_owner = (mask & ATTR_UID) && (vap->va_uid == crgetuid(cr));
3013 take_group = (mask & ATTR_GID) &&
3014 zfs_groupmember(zfsvfs, vap->va_gid, cr);
3015
3016 /*
3017 * If both ATTR_UID and ATTR_GID are set then take_owner and
3018 * take_group must both be set in order to allow taking
3019 * ownership.
3020 *
3021 * Otherwise, send the check through secpolicy_vnode_setattr()
3022 *
3023 */
3024
3025 if (((idmask == (ATTR_UID|ATTR_GID)) &&
3026 take_owner && take_group) ||
3027 ((idmask == ATTR_UID) && take_owner) ||
3028 ((idmask == ATTR_GID) && take_group)) {
3029 if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
3030 skipaclchk, cr) == 0) {
3031 /*
3032 * Remove setuid/setgid for non-privileged users
3033 */
3034 (void) secpolicy_setid_clear(vap, cr);
3035 trim_mask = (mask & (ATTR_UID|ATTR_GID));
3036 } else {
3037 need_policy = TRUE;
3038 }
3039 } else {
3040 need_policy = TRUE;
3041 }
3042 }
3043
3044 mutex_enter(&zp->z_lock);
3045 oldva.va_mode = zp->z_mode;
3046 zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
3047 if (mask & ATTR_XVATTR) {
3048 /*
3049 * Update xvattr mask to include only those attributes
3050 * that are actually changing.
3051 *
3052 * The bits will be restored prior to actually setting
3053 * the attributes, so the caller thinks they were set.
3054 */
3055 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
3056 if (xoap->xoa_appendonly !=
3057 ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
3058 need_policy = TRUE;
3059 } else {
3060 XVA_CLR_REQ(xvap, XAT_APPENDONLY);
3061 XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY);
3062 }
3063 }
3064
3065 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
3066 if (xoap->xoa_projinherit !=
3067 ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) {
3068 need_policy = TRUE;
3069 } else {
3070 XVA_CLR_REQ(xvap, XAT_PROJINHERIT);
3071 XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT);
3072 }
3073 }
3074
3075 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
3076 if (xoap->xoa_nounlink !=
3077 ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
3078 need_policy = TRUE;
3079 } else {
3080 XVA_CLR_REQ(xvap, XAT_NOUNLINK);
3081 XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK);
3082 }
3083 }
3084
3085 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
3086 if (xoap->xoa_immutable !=
3087 ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
3088 need_policy = TRUE;
3089 } else {
3090 XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
3091 XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE);
3092 }
3093 }
3094
3095 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
3096 if (xoap->xoa_nodump !=
3097 ((zp->z_pflags & ZFS_NODUMP) != 0)) {
3098 need_policy = TRUE;
3099 } else {
3100 XVA_CLR_REQ(xvap, XAT_NODUMP);
3101 XVA_SET_REQ(tmpxvattr, XAT_NODUMP);
3102 }
3103 }
3104
3105 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
3106 if (xoap->xoa_av_modified !=
3107 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
3108 need_policy = TRUE;
3109 } else {
3110 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
3111 XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED);
3112 }
3113 }
3114
3115 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
3116 if ((!S_ISREG(ip->i_mode) &&
3117 xoap->xoa_av_quarantined) ||
3118 xoap->xoa_av_quarantined !=
3119 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
3120 need_policy = TRUE;
3121 } else {
3122 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
3123 XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED);
3124 }
3125 }
3126
3127 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
3128 mutex_exit(&zp->z_lock);
3129 err = SET_ERROR(EPERM);
3130 goto out3;
3131 }
3132
3133 if (need_policy == FALSE &&
3134 (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
3135 XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
3136 need_policy = TRUE;
3137 }
3138 }
3139
3140 mutex_exit(&zp->z_lock);
3141
3142 if (mask & ATTR_MODE) {
3143 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
3144 err = secpolicy_setid_setsticky_clear(ip, vap,
3145 &oldva, cr);
3146 if (err)
3147 goto out3;
3148
3149 trim_mask |= ATTR_MODE;
3150 } else {
3151 need_policy = TRUE;
3152 }
3153 }
3154
3155 if (need_policy) {
3156 /*
3157 * If trim_mask is set then take ownership
3158 * has been granted or write_acl is present and user
3159 * has the ability to modify mode. In that case remove
3160 * UID|GID and or MODE from mask so that
3161 * secpolicy_vnode_setattr() doesn't revoke it.
3162 */
3163
3164 if (trim_mask) {
3165 saved_mask = vap->va_mask;
3166 vap->va_mask &= ~trim_mask;
3167 }
3168 err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags,
3169 (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
3170 if (err)
3171 goto out3;
3172
3173 if (trim_mask)
3174 vap->va_mask |= saved_mask;
3175 }
3176
3177 /*
3178 * secpolicy_vnode_setattr() or take-ownership may have
3179 * changed va_mask.
3180 */
3181 mask = vap->va_mask;
3182
3183 if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) {
3184 handle_eadir = B_TRUE;
3185 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
3186 &xattr_obj, sizeof (xattr_obj));
3187
3188 if (err == 0 && xattr_obj) {
3189 err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp);
3190 if (err)
3191 goto out2;
3192 }
3193 if (mask & ATTR_UID) {
3194 new_kuid = zfs_fuid_create(zfsvfs,
3195 (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
3196 if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) &&
3197 zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT,
3198 new_kuid)) {
3199 if (attrzp)
3200 iput(ZTOI(attrzp));
3201 err = SET_ERROR(EDQUOT);
3202 goto out2;
3203 }
3204 }
3205
3206 if (mask & ATTR_GID) {
3207 new_kgid = zfs_fuid_create(zfsvfs,
3208 (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp);
3209 if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) &&
3210 zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT,
3211 new_kgid)) {
3212 if (attrzp)
3213 iput(ZTOI(attrzp));
3214 err = SET_ERROR(EDQUOT);
3215 goto out2;
3216 }
3217 }
3218
3219 if (projid != ZFS_INVALID_PROJID &&
3220 zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) {
3221 if (attrzp)
3222 iput(ZTOI(attrzp));
3223 err = EDQUOT;
3224 goto out2;
3225 }
3226 }
3227 tx = dmu_tx_create(os);
3228
3229 if (mask & ATTR_MODE) {
3230 uint64_t pmode = zp->z_mode;
3231 uint64_t acl_obj;
3232 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
3233
3234 zfs_acl_chmod_setattr(zp, &aclp, new_mode);
3235
3236 mutex_enter(&zp->z_lock);
3237 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
3238 /*
3239 * Are we upgrading ACL from old V0 format
3240 * to V1 format?
3241 */
3242 if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
3243 zfs_znode_acl_version(zp) ==
3244 ZFS_ACL_VERSION_INITIAL) {
3245 dmu_tx_hold_free(tx, acl_obj, 0,
3246 DMU_OBJECT_END);
3247 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3248 0, aclp->z_acl_bytes);
3249 } else {
3250 dmu_tx_hold_write(tx, acl_obj, 0,
3251 aclp->z_acl_bytes);
3252 }
3253 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3254 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3255 0, aclp->z_acl_bytes);
3256 }
3257 mutex_exit(&zp->z_lock);
3258 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3259 } else {
3260 if (((mask & ATTR_XVATTR) &&
3261 XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ||
3262 (projid != ZFS_INVALID_PROJID &&
3263 !(zp->z_pflags & ZFS_PROJID)))
3264 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3265 else
3266 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3267 }
3268
3269 if (attrzp) {
3270 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
3271 }
3272
3273 fuid_dirtied = zfsvfs->z_fuid_dirty;
3274 if (fuid_dirtied)
3275 zfs_fuid_txhold(zfsvfs, tx);
3276
3277 zfs_sa_upgrade_txholds(tx, zp);
3278
3279 err = dmu_tx_assign(tx, TXG_WAIT);
3280 if (err)
3281 goto out;
3282
3283 count = 0;
3284 /*
3285 * Set each attribute requested.
3286 * We group settings according to the locks they need to acquire.
3287 *
3288 * Note: you cannot set ctime directly, although it will be
3289 * updated as a side-effect of calling this function.
3290 */
3291
3292 if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) {
3293 /*
3294 * For an existing object that was upgraded from an old system,
3295 * the on-disk layout has no slot for the project ID attribute.
3296 * But the quota accounting logic needs to access the related
3297 * slots by offset directly, so we must adjust the old object's
3298 * layout to place the project ID at a unified, fixed offset.
3299 */
3300 if (attrzp)
3301 err = sa_add_projid(attrzp->z_sa_hdl, tx, projid);
3302 if (err == 0)
3303 err = sa_add_projid(zp->z_sa_hdl, tx, projid);
3304
3305 if (unlikely(err == EEXIST))
3306 err = 0;
3307 else if (err != 0)
3308 goto out;
3309 else
3310 projid = ZFS_INVALID_PROJID;
3311 }
3312
3313 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
3314 mutex_enter(&zp->z_acl_lock);
3315 mutex_enter(&zp->z_lock);
3316
3317 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
3318 &zp->z_pflags, sizeof (zp->z_pflags));
3319
3320 if (attrzp) {
3321 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
3322 mutex_enter(&attrzp->z_acl_lock);
3323 mutex_enter(&attrzp->z_lock);
3324 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3325 SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
3326 sizeof (attrzp->z_pflags));
3327 if (projid != ZFS_INVALID_PROJID) {
3328 attrzp->z_projid = projid;
3329 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3330 SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid,
3331 sizeof (attrzp->z_projid));
3332 }
3333 }
3334
3335 if (mask & (ATTR_UID|ATTR_GID)) {
3336
3337 if (mask & ATTR_UID) {
3338 ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid);
3339 new_uid = zfs_uid_read(ZTOI(zp));
3340 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
3341 &new_uid, sizeof (new_uid));
3342 if (attrzp) {
3343 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3344 SA_ZPL_UID(zfsvfs), NULL, &new_uid,
3345 sizeof (new_uid));
3346 ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid);
3347 }
3348 }
3349
3350 if (mask & ATTR_GID) {
3351 ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid);
3352 new_gid = zfs_gid_read(ZTOI(zp));
3353 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
3354 NULL, &new_gid, sizeof (new_gid));
3355 if (attrzp) {
3356 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3357 SA_ZPL_GID(zfsvfs), NULL, &new_gid,
3358 sizeof (new_gid));
3359 ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid);
3360 }
3361 }
3362 if (!(mask & ATTR_MODE)) {
3363 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
3364 NULL, &new_mode, sizeof (new_mode));
3365 new_mode = zp->z_mode;
3366 }
3367 err = zfs_acl_chown_setattr(zp);
3368 ASSERT(err == 0);
3369 if (attrzp) {
3370 err = zfs_acl_chown_setattr(attrzp);
3371 ASSERT(err == 0);
3372 }
3373 }
3374
3375 if (mask & ATTR_MODE) {
3376 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
3377 &new_mode, sizeof (new_mode));
3378 zp->z_mode = ZTOI(zp)->i_mode = new_mode;
3379 ASSERT3P(aclp, !=, NULL);
3380 err = zfs_aclset_common(zp, aclp, cr, tx);
3381 ASSERT0(err);
3382 if (zp->z_acl_cached)
3383 zfs_acl_free(zp->z_acl_cached);
3384 zp->z_acl_cached = aclp;
3385 aclp = NULL;
3386 }
3387
3388 if ((mask & ATTR_ATIME) || zp->z_atime_dirty) {
3389 zp->z_atime_dirty = 0;
3390 ZFS_TIME_ENCODE(&ip->i_atime, atime);
3391 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
3392 &atime, sizeof (atime));
3393 }
3394
3395 if (mask & (ATTR_MTIME | ATTR_SIZE)) {
3396 ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
3397 ZTOI(zp)->i_mtime = zpl_inode_timespec_trunc(vap->va_mtime,
3398 ZTOI(zp)->i_sb->s_time_gran);
3399
3400 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
3401 mtime, sizeof (mtime));
3402 }
3403
3404 if (mask & (ATTR_CTIME | ATTR_SIZE)) {
3405 ZFS_TIME_ENCODE(&vap->va_ctime, ctime);
3406 ZTOI(zp)->i_ctime = zpl_inode_timespec_trunc(vap->va_ctime,
3407 ZTOI(zp)->i_sb->s_time_gran);
3408 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3409 ctime, sizeof (ctime));
3410 }
3411
3412 if (projid != ZFS_INVALID_PROJID) {
3413 zp->z_projid = projid;
3414 SA_ADD_BULK_ATTR(bulk, count,
3415 SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
3416 sizeof (zp->z_projid));
3417 }
3418
3419 if (attrzp && mask) {
3420 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3421 SA_ZPL_CTIME(zfsvfs), NULL, &ctime,
3422 sizeof (ctime));
3423 }
3424
3425 /*
3426 * Do this after setting the timestamps, to prevent a timestamp
3427 * update from toggling the bit.
3428 */
3429
3430 if (xoap && (mask & ATTR_XVATTR)) {
3431
3432 /*
3433 * Restore the trimmed-off masks
3434 * so that the return masks can be set for the caller.
3435 */
3436
3437 if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) {
3438 XVA_SET_REQ(xvap, XAT_APPENDONLY);
3439 }
3440 if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) {
3441 XVA_SET_REQ(xvap, XAT_NOUNLINK);
3442 }
3443 if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) {
3444 XVA_SET_REQ(xvap, XAT_IMMUTABLE);
3445 }
3446 if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) {
3447 XVA_SET_REQ(xvap, XAT_NODUMP);
3448 }
3449 if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) {
3450 XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
3451 }
3452 if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) {
3453 XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
3454 }
3455 if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) {
3456 XVA_SET_REQ(xvap, XAT_PROJINHERIT);
3457 }
3458
3459 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3460 ASSERT(S_ISREG(ip->i_mode));
3461
3462 zfs_xvattr_set(zp, xvap, tx);
3463 }
3464
3465 if (fuid_dirtied)
3466 zfs_fuid_sync(zfsvfs, tx);
3467
3468 if (mask != 0)
3469 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
3470
3471 mutex_exit(&zp->z_lock);
3472 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
3473 mutex_exit(&zp->z_acl_lock);
3474
3475 if (attrzp) {
3476 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
3477 mutex_exit(&attrzp->z_acl_lock);
3478 mutex_exit(&attrzp->z_lock);
3479 }
3480 out:
3481 if (err == 0 && xattr_count > 0) {
3482 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
3483 xattr_count, tx);
3484 ASSERT(err2 == 0);
3485 }
3486
3487 if (aclp)
3488 zfs_acl_free(aclp);
3489
3490 if (fuidp) {
3491 zfs_fuid_info_free(fuidp);
3492 fuidp = NULL;
3493 }
3494
3495 if (err) {
3496 dmu_tx_abort(tx);
3497 if (attrzp)
3498 iput(ZTOI(attrzp));
3499 if (err == ERESTART)
3500 goto top;
3501 } else {
3502 if (count > 0)
3503 err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
3504 dmu_tx_commit(tx);
3505 if (attrzp) {
3506 if (err2 == 0 && handle_eadir)
3507 err2 = zfs_setattr_dir(attrzp);
3508 iput(ZTOI(attrzp));
3509 }
3510 zfs_inode_update(zp);
3511 }
3512
3513 out2:
3514 if (os->os_sync == ZFS_SYNC_ALWAYS)
3515 zil_commit(zilog, 0);
3516
3517 out3:
3518 kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks);
3519 kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks);
3520 kmem_free(tmpxvattr, sizeof (xvattr_t));
3521 ZFS_EXIT(zfsvfs);
3522 return (err);
3523 }
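
/*
 * A hypothetical zfs_setattr() caller, chown-style, for illustration;
 * the real callers live in the ZPL layer. Only the va_mask-selected
 * fields are consulted, so unrelated members may be left unset:
 */
#if 0
static int
example_chown(struct inode *ip, uid_t uid, gid_t gid, cred_t *cr)
{
	vattr_t va;

	bzero(&va, sizeof (va));
	va.va_mask = ATTR_UID | ATTR_GID;	/* only these are applied */
	va.va_uid = uid;
	va.va_gid = gid;
	return (zfs_setattr(ip, &va, 0, cr));
}
#endif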
3524
3525 typedef struct zfs_zlock {
3526 krwlock_t *zl_rwlock; /* lock we acquired */
3527 znode_t *zl_znode; /* znode we held */
3528 struct zfs_zlock *zl_next; /* next in list */
3529 } zfs_zlock_t;
3530
3531 /*
3532 * Drop locks and release vnodes that were held by zfs_rename_lock().
3533 */
3534 static void
3535 zfs_rename_unlock(zfs_zlock_t **zlpp)
3536 {
3537 zfs_zlock_t *zl;
3538
3539 while ((zl = *zlpp) != NULL) {
3540 if (zl->zl_znode != NULL)
3541 zfs_iput_async(ZTOI(zl->zl_znode));
3542 rw_exit(zl->zl_rwlock);
3543 *zlpp = zl->zl_next;
3544 kmem_free(zl, sizeof (*zl));
3545 }
3546 }
3547
3548 /*
3549 * Search back through the directory tree, using the ".." entries.
3550 * Lock each directory in the chain to prevent concurrent renames.
3551 * Fail any attempt to move a directory into one of its own descendants.
3552 * XXX - z_parent_lock can overlap with map or grow locks
3553 */
3554 static int
3555 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
3556 {
3557 zfs_zlock_t *zl;
3558 znode_t *zp = tdzp;
3559 uint64_t rootid = ZTOZSB(zp)->z_root;
3560 uint64_t oidp = zp->z_id;
3561 krwlock_t *rwlp = &szp->z_parent_lock;
3562 krw_t rw = RW_WRITER;
3563
3564 /*
3565 * First pass write-locks szp and compares to zp->z_id.
3566 * Later passes read-lock zp and compare to zp->z_parent.
3567 */
3568 do {
3569 if (!rw_tryenter(rwlp, rw)) {
3570 /*
3571 * Another thread is renaming in this path.
3572 * Note that if we are a WRITER, we don't have any
3573 * parent_locks held yet.
3574 */
3575 if (rw == RW_READER && zp->z_id > szp->z_id) {
3576 /*
3577 * Drop our locks and restart
3578 */
3579 zfs_rename_unlock(&zl);
3580 *zlpp = NULL;
3581 zp = tdzp;
3582 oidp = zp->z_id;
3583 rwlp = &szp->z_parent_lock;
3584 rw = RW_WRITER;
3585 continue;
3586 } else {
3587 /*
3588 * Wait for other thread to drop its locks
3589 */
3590 rw_enter(rwlp, rw);
3591 }
3592 }
3593
3594 zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
3595 zl->zl_rwlock = rwlp;
3596 zl->zl_znode = NULL;
3597 zl->zl_next = *zlpp;
3598 *zlpp = zl;
3599
3600 if (oidp == szp->z_id) /* We're a descendant of szp */
3601 return (SET_ERROR(EINVAL));
3602
3603 if (oidp == rootid) /* We've hit the top */
3604 return (0);
3605
3606 if (rw == RW_READER) { /* i.e. not the first pass */
3607 int error = zfs_zget(ZTOZSB(zp), oidp, &zp);
3608 if (error)
3609 return (error);
3610 zl->zl_znode = zp;
3611 }
3612 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)),
3613 &oidp, sizeof (oidp));
3614 rwlp = &zp->z_parent_lock;
3615 rw = RW_READER;
3616
3617 } while (zp->z_id != sdzp->z_id);
3618
3619 return (0);
3620 }
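
/*
 * The walk above, stripped of its locking: starting at the target
 * directory, follow the ".." (SA_ZPL_PARENT) pointers toward the root;
 * encountering szp on the way up means the rename would move a
 * directory into its own subtree and must fail. A lock-free sketch of
 * just that containment check (illustrative only; the real code must
 * hold z_parent_lock as above):
 */
#if 0
static int
example_is_descendant(zfsvfs_t *zfsvfs, uint64_t tdid, uint64_t szid,
    boolean_t *result)
{
	uint64_t oid = tdid;
	znode_t *zp;
	int error;

	*result = B_FALSE;
	while (oid != zfsvfs->z_root) {
		if (oid == szid) {
			*result = B_TRUE;	/* tdzp lives under szp */
			return (0);
		}
		if ((error = zfs_zget(zfsvfs, oid, &zp)) != 0)
			return (error);
		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
		    &oid, sizeof (oid));
		iput(ZTOI(zp));
	}
	return (0);
}
#endif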
3621
3622 /*
3623 * Move an entry from the provided source directory to the target
3624 * directory. Change the entry name as indicated.
3625 *
3626 * IN: sdip - Source directory containing the "old entry".
3627 * snm - Old entry name.
3628 * tdip - Target directory to contain the "new entry".
3629 * tnm - New entry name.
3630 * cr - credentials of caller.
3631 * flags - case flags
3632 *
3633 * RETURN: 0 on success, error code on failure.
3634 *
3635 * Timestamps:
3636 * sdip,tdip - ctime|mtime updated
3637 */
3638 /*ARGSUSED*/
3639 int
3640 zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm,
3641 cred_t *cr, int flags)
3642 {
3643 znode_t *tdzp, *szp, *tzp;
3644 znode_t *sdzp = ITOZ(sdip);
3645 zfsvfs_t *zfsvfs = ITOZSB(sdip);
3646 zilog_t *zilog;
3647 zfs_dirlock_t *sdl, *tdl;
3648 dmu_tx_t *tx;
3649 zfs_zlock_t *zl;
3650 int cmp, serr, terr;
3651 int error = 0;
3652 int zflg = 0;
3653 boolean_t waited = B_FALSE;
3654
3655 if (snm == NULL || tnm == NULL)
3656 return (SET_ERROR(EINVAL));
3657
3658 ZFS_ENTER(zfsvfs);
3659 ZFS_VERIFY_ZP(sdzp);
3660 zilog = zfsvfs->z_log;
3661
3662 tdzp = ITOZ(tdip);
3663 ZFS_VERIFY_ZP(tdzp);
3664
3665 /*
3666 * We check i_sb because snapshots and the ctldir must have different
3667 * super blocks.
3668 */
3669 if (tdip->i_sb != sdip->i_sb || zfsctl_is_node(tdip)) {
3670 ZFS_EXIT(zfsvfs);
3671 return (SET_ERROR(EXDEV));
3672 }
3673
3674 if (zfsvfs->z_utf8 && u8_validate(tnm,
3675 strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3676 ZFS_EXIT(zfsvfs);
3677 return (SET_ERROR(EILSEQ));
3678 }
3679
3680 if (flags & FIGNORECASE)
3681 zflg |= ZCILOOK;
3682
3683 top:
3684 szp = NULL;
3685 tzp = NULL;
3686 zl = NULL;
3687
3688 /*
3689 * This is to prevent the creation of links into attribute space
3690 * by renaming a linked file into or out of an attribute directory.
3691 * See the comment in zfs_link() for why this is considered bad.
3692 */
3693 if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
3694 ZFS_EXIT(zfsvfs);
3695 return (SET_ERROR(EINVAL));
3696 }
3697
3698 /*
3699 * Lock source and target directory entries. To prevent deadlock,
3700 * a lock ordering must be defined. We lock the directory with
3701 * the smallest object id first, or if it's a tie, the one with
3702 * the lexically first name.
3703 */
3704 if (sdzp->z_id < tdzp->z_id) {
3705 cmp = -1;
3706 } else if (sdzp->z_id > tdzp->z_id) {
3707 cmp = 1;
3708 } else {
3709 /*
3710 * First compare the two name arguments without
3711 * considering any case folding.
3712 */
3713 int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
3714
3715 cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
3716 ASSERT(error == 0 || !zfsvfs->z_utf8);
3717 if (cmp == 0) {
3718 /*
3719 * POSIX: "If the old argument and the new argument
3720 * both refer to links to the same existing file,
3721 * the rename() function shall return successfully
3722 * and perform no other action."
3723 */
3724 ZFS_EXIT(zfsvfs);
3725 return (0);
3726 }
3727 /*
3728 * If the file system is case-folding, then we may
3729 * have some more checking to do. A case-folding file
3730 * system either supports mixed case sensitivity
3731 * access or is completely case-insensitive. Note
3732 * that the file system is always case preserving.
3733 *
3734 * In mixed sensitivity mode case sensitive behavior
3735 * is the default. FIGNORECASE must be used to
3736 * explicitly request case insensitive behavior.
3737 *
3738 * If the source and target names provided differ only
3739 * by case (e.g., a request to rename 'tim' to 'Tim'),
3740 * we will treat this as a special case in the
3741 * case-insensitive mode: as long as the source name
3742 * is an exact match, we will allow this to proceed as
3743 * a name-change request.
3744 */
3745 if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
3746 (zfsvfs->z_case == ZFS_CASE_MIXED &&
3747 flags & FIGNORECASE)) &&
3748 u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
3749 &error) == 0) {
3750 /*
3751 * case preserving rename request, require exact
3752 * name matches
3753 */
3754 zflg |= ZCIEXACT;
3755 zflg &= ~ZCILOOK;
3756 }
3757 }
3758
3759 /*
3760 * If the source and destination directories are the same, we should
3761 * grab the z_name_lock of that directory only once.
3762 */
3763 if (sdzp == tdzp) {
3764 zflg |= ZHAVELOCK;
3765 rw_enter(&sdzp->z_name_lock, RW_READER);
3766 }
3767
3768 if (cmp < 0) {
3769 serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
3770 ZEXISTS | zflg, NULL, NULL);
3771 terr = zfs_dirent_lock(&tdl,
3772 tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
3773 } else {
3774 terr = zfs_dirent_lock(&tdl,
3775 tdzp, tnm, &tzp, zflg, NULL, NULL);
3776 serr = zfs_dirent_lock(&sdl,
3777 sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
3778 NULL, NULL);
3779 }
3780
3781 if (serr) {
3782 /*
3783 * Source entry invalid or not there.
3784 */
3785 if (!terr) {
3786 zfs_dirent_unlock(tdl);
3787 if (tzp)
3788 iput(ZTOI(tzp));
3789 }
3790
3791 if (sdzp == tdzp)
3792 rw_exit(&sdzp->z_name_lock);
3793
3794 if (strcmp(snm, "..") == 0)
3795 serr = EINVAL;
3796 ZFS_EXIT(zfsvfs);
3797 return (serr);
3798 }
3799 if (terr) {
3800 zfs_dirent_unlock(sdl);
3801 iput(ZTOI(szp));
3802
3803 if (sdzp == tdzp)
3804 rw_exit(&sdzp->z_name_lock);
3805
3806 if (strcmp(tnm, "..") == 0)
3807 terr = EINVAL;
3808 ZFS_EXIT(zfsvfs);
3809 return (terr);
3810 }
3811
3812 /*
3813 * If we are using project inheritance, meaning the directory has
3814 * ZFS_PROJINHERIT set, then its descendant directories will inherit
3815 * not only the project ID but also the ZFS_PROJINHERIT flag. In
3816 * that case, we only allow renames into our tree when the project
3817 * IDs are the same.
3818 */
3819 if (tdzp->z_pflags & ZFS_PROJINHERIT &&
3820 tdzp->z_projid != szp->z_projid) {
3821 error = SET_ERROR(EXDEV);
3822 goto out;
3823 }
3824
3825 /*
3826 * Must have write access at the source to remove the old entry
3827 * and write access at the target to create the new entry.
3828 * Note that if target and source are the same, this can be
3829 * done in a single check.
3830 */
3831
3832 if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)))
3833 goto out;
3834
3835 if (S_ISDIR(ZTOI(szp)->i_mode)) {
3836 /*
3837 * Check to make sure rename is valid.
3838 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
3839 */
3840 if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl)))
3841 goto out;
3842 }
3843
3844 /*
3845 * Does target exist?
3846 */
3847 if (tzp) {
3848 /*
3849 * Source and target must be the same type.
3850 */
3851 if (S_ISDIR(ZTOI(szp)->i_mode)) {
3852 if (!S_ISDIR(ZTOI(tzp)->i_mode)) {
3853 error = SET_ERROR(ENOTDIR);
3854 goto out;
3855 }
3856 } else {
3857 if (S_ISDIR(ZTOI(tzp)->i_mode)) {
3858 error = SET_ERROR(EISDIR);
3859 goto out;
3860 }
3861 }
3862 /*
3863 * POSIX dictates that when the source and target
3864 * entries refer to the same file object, rename
3865 * must do nothing and exit without error.
3866 */
3867 if (szp->z_id == tzp->z_id) {
3868 error = 0;
3869 goto out;
3870 }
3871 }
3872
3873 tx = dmu_tx_create(zfsvfs->z_os);
3874 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3875 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
3876 dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
3877 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
3878 if (sdzp != tdzp) {
3879 dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
3880 zfs_sa_upgrade_txholds(tx, tdzp);
3881 }
3882 if (tzp) {
3883 dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
3884 zfs_sa_upgrade_txholds(tx, tzp);
3885 }
3886
3887 zfs_sa_upgrade_txholds(tx, szp);
3888 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3889 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
3890 if (error) {
3891 if (zl != NULL)
3892 zfs_rename_unlock(&zl);
3893 zfs_dirent_unlock(sdl);
3894 zfs_dirent_unlock(tdl);
3895
3896 if (sdzp == tdzp)
3897 rw_exit(&sdzp->z_name_lock);
3898
3899 if (error == ERESTART) {
3900 waited = B_TRUE;
3901 dmu_tx_wait(tx);
3902 dmu_tx_abort(tx);
3903 iput(ZTOI(szp));
3904 if (tzp)
3905 iput(ZTOI(tzp));
3906 goto top;
3907 }
3908 dmu_tx_abort(tx);
3909 iput(ZTOI(szp));
3910 if (tzp)
3911 iput(ZTOI(tzp));
3912 ZFS_EXIT(zfsvfs);
3913 return (error);
3914 }
3915
3916 if (tzp) /* Attempt to remove the existing target */
3917 error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
3918
3919 if (error == 0) {
3920 error = zfs_link_create(tdl, szp, tx, ZRENAMING);
3921 if (error == 0) {
3922 szp->z_pflags |= ZFS_AV_MODIFIED;
3923 if (tdzp->z_pflags & ZFS_PROJINHERIT)
3924 szp->z_pflags |= ZFS_PROJINHERIT;
3925
3926 error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
3927 (void *)&szp->z_pflags, sizeof (uint64_t), tx);
3928 ASSERT0(error);
3929
3930 error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
3931 if (error == 0) {
3932 zfs_log_rename(zilog, tx, TX_RENAME |
3933 (flags & FIGNORECASE ? TX_CI : 0), sdzp,
3934 sdl->dl_name, tdzp, tdl->dl_name, szp);
3935 } else {
3936 /*
3937 * At this point, we have successfully created
3938 * the target name, but have failed to remove
3939 * the source name. Since the create was done
3940 * with the ZRENAMING flag, there are
3941 * complications; for one, the link count is
3942 * wrong. The easiest way to deal with this
3943 * is to remove the newly created target, and
3944 * return the original error. This must
3945 * succeed; fortunately, it is very unlikely to
3946 * fail, since we just created it.
3947 */
3948 VERIFY3U(zfs_link_destroy(tdl, szp, tx,
3949 ZRENAMING, NULL), ==, 0);
3950 }
3951 } else {
3952 /*
3953 * If we had removed the existing target, subsequent
3954 * call to zfs_link_create() to add back the same entry
3955 * but, the new dnode (szp) should not fail.
3956 */
3957 ASSERT(tzp == NULL);
3958 }
3959 }
3960
3961 dmu_tx_commit(tx);
3962 out:
3963 if (zl != NULL)
3964 zfs_rename_unlock(&zl);
3965
3966 zfs_dirent_unlock(sdl);
3967 zfs_dirent_unlock(tdl);
3968
3969 zfs_inode_update(sdzp);
3970 if (sdzp == tdzp)
3971 rw_exit(&sdzp->z_name_lock);
3972
3973 if (sdzp != tdzp)
3974 zfs_inode_update(tdzp);
3975
3976 zfs_inode_update(szp);
3977 iput(ZTOI(szp));
3978 if (tzp) {
3979 zfs_inode_update(tzp);
3980 iput(ZTOI(tzp));
3981 }
3982
3983 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3984 zil_commit(zilog, 0);
3985
3986 ZFS_EXIT(zfsvfs);
3987 return (error);
3988 }
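
/*
 * Failure atomicity in zfs_rename() above, in outline: the new name is
 * created first (with ZRENAMING, so link counts are handled specially);
 * if removing the old name then fails, the just-created entry is
 * destroyed again so the namespace is left as it was. The sequence,
 * with lock and transaction handling elided:
 */
#if 0
	error = zfs_link_create(tdl, szp, tx, ZRENAMING);	/* new name */
	if (error == 0) {
		error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
		if (error != 0) {
			/* Undo the create and return the original error. */
			VERIFY3U(zfs_link_destroy(tdl, szp, tx,
			    ZRENAMING, NULL), ==, 0);
		}
	}
#endif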
3989
3990 /*
3991 * Insert the indicated symbolic reference entry into the directory.
3992 *
3993 * IN: dip - Directory to contain new symbolic link.
3994 * name - Name for new symlink entry.
3995 * vap - Attributes of new entry.
3996 * link - Target path of new symlink.
3998 * cr - credentials of caller.
3999 * flags - case flags
4000 *
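 * OUT: ipp - Inode of the new symlink.
 *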
4001 * RETURN: 0 on success, error code on failure.
4002 *
4003 * Timestamps:
4004 * dip - ctime|mtime updated
4005 */
4006 /*ARGSUSED*/
4007 int
4008 zfs_symlink(struct inode *dip, char *name, vattr_t *vap, char *link,
4009 struct inode **ipp, cred_t *cr, int flags)
4010 {
4011 znode_t *zp, *dzp = ITOZ(dip);
4012 zfs_dirlock_t *dl;
4013 dmu_tx_t *tx;
4014 zfsvfs_t *zfsvfs = ITOZSB(dip);
4015 zilog_t *zilog;
4016 uint64_t len = strlen(link);
4017 int error;
4018 int zflg = ZNEW;
4019 zfs_acl_ids_t acl_ids;
4020 boolean_t fuid_dirtied;
4021 uint64_t txtype = TX_SYMLINK;
4022 boolean_t waited = B_FALSE;
4023
4024 ASSERT(S_ISLNK(vap->va_mode));
4025
4026 if (name == NULL)
4027 return (SET_ERROR(EINVAL));
4028
4029 ZFS_ENTER(zfsvfs);
4030 ZFS_VERIFY_ZP(dzp);
4031 zilog = zfsvfs->z_log;
4032
4033 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
4034 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4035 ZFS_EXIT(zfsvfs);
4036 return (SET_ERROR(EILSEQ));
4037 }
4038 if (flags & FIGNORECASE)
4039 zflg |= ZCILOOK;
4040
4041 if (len > MAXPATHLEN) {
4042 ZFS_EXIT(zfsvfs);
4043 return (SET_ERROR(ENAMETOOLONG));
4044 }
4045
4046 if ((error = zfs_acl_ids_create(dzp, 0,
4047 vap, cr, NULL, &acl_ids)) != 0) {
4048 ZFS_EXIT(zfsvfs);
4049 return (error);
4050 }
4051 top:
4052 *ipp = NULL;
4053
4054 /*
4055 * Attempt to lock directory; fail if entry already exists.
4056 */
4057 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
4058 if (error) {
4059 zfs_acl_ids_free(&acl_ids);
4060 ZFS_EXIT(zfsvfs);
4061 return (error);
4062 }
4063
4064 if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
4065 zfs_acl_ids_free(&acl_ids);
4066 zfs_dirent_unlock(dl);
4067 ZFS_EXIT(zfsvfs);
4068 return (error);
4069 }
4070
4071 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) {
4072 zfs_acl_ids_free(&acl_ids);
4073 zfs_dirent_unlock(dl);
4074 ZFS_EXIT(zfsvfs);
4075 return (SET_ERROR(EDQUOT));
4076 }
4077 tx = dmu_tx_create(zfsvfs->z_os);
4078 fuid_dirtied = zfsvfs->z_fuid_dirty;
4079 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
4080 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4081 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
4082 ZFS_SA_BASE_ATTR_SIZE + len);
4083 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
4084 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
4085 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
4086 acl_ids.z_aclp->z_acl_bytes);
4087 }
4088 if (fuid_dirtied)
4089 zfs_fuid_txhold(zfsvfs, tx);
4090 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
4091 if (error) {
4092 zfs_dirent_unlock(dl);
4093 if (error == ERESTART) {
4094 waited = B_TRUE;
4095 dmu_tx_wait(tx);
4096 dmu_tx_abort(tx);
4097 goto top;
4098 }
4099 zfs_acl_ids_free(&acl_ids);
4100 dmu_tx_abort(tx);
4101 ZFS_EXIT(zfsvfs);
4102 return (error);
4103 }
4104
4105 /*
4106 * Create a new object for the symlink.
4107 * For version 4 ZPL datasets, the symlink will be an SA attribute.
4108 */
4109 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
4110
4111 if (fuid_dirtied)
4112 zfs_fuid_sync(zfsvfs, tx);
4113
4114 mutex_enter(&zp->z_lock);
4115 if (zp->z_is_sa)
4116 error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
4117 link, len, tx);
4118 else
4119 zfs_sa_symlink(zp, link, len, tx);
4120 mutex_exit(&zp->z_lock);
4121
4122 zp->z_size = len;
4123 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
4124 &zp->z_size, sizeof (zp->z_size), tx);
4125 /*
4126 * Insert the new object into the directory.
4127 */
4128 error = zfs_link_create(dl, zp, tx, ZNEW);
4129 if (error != 0) {
4130 zfs_znode_delete(zp, tx);
4131 remove_inode_hash(ZTOI(zp));
4132 } else {
4133 if (flags & FIGNORECASE)
4134 txtype |= TX_CI;
4135 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
4136
4137 zfs_inode_update(dzp);
4138 zfs_inode_update(zp);
4139 }
4140
4141 zfs_acl_ids_free(&acl_ids);
4142
4143 dmu_tx_commit(tx);
4144
4145 zfs_dirent_unlock(dl);
4146
4147 if (error == 0) {
4148 *ipp = ZTOI(zp);
4149
4150 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4151 zil_commit(zilog, 0);
4152 } else {
4153 iput(ZTOI(zp));
4154 }
4155
4156 ZFS_EXIT(zfsvfs);
4157 return (error);
4158 }
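/*
 * Illustrative only: a minimal sketch of how a caller might drive
 * zfs_symlink(), modeled loosely on the ZPL entry point. The vattr_t
 * setup is a hypothetical simplification; the real caller derives it
 * from the VFS-supplied mode and credentials.
 */
#if 0
static int
example_symlink(struct inode *dip, char *name, char *target, cred_t *cr)
{
	vattr_t va;
	struct inode *ip = NULL;
	int error;

	bzero(&va, sizeof (va));
	va.va_mode = S_IFLNK | 0777;	/* symlink modes are always 0777 */

	error = zfs_symlink(dip, name, &va, target, &ip, cr, 0);
	if (error == 0)
		iput(ip);	/* drop the reference returned via ipp */
	return (error);
}
#endif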
4159
4160 /*
4161 * Return, in the buffer contained in the provided uio structure,
4162 * the symbolic path referred to by ip.
4163 *
4164 * IN: ip - inode of symbolic link
4165 * uio - structure to contain the link path.
4166 * cr - credentials of caller.
4167 *
4168 * RETURN: 0 if success
4169 * error code if failure
4170 *
4171 * Timestamps:
4172 * ip - atime updated
4173 */
4174 /* ARGSUSED */
4175 int
4176 zfs_readlink(struct inode *ip, uio_t *uio, cred_t *cr)
4177 {
4178 znode_t *zp = ITOZ(ip);
4179 zfsvfs_t *zfsvfs = ITOZSB(ip);
4180 int error;
4181
4182 ZFS_ENTER(zfsvfs);
4183 ZFS_VERIFY_ZP(zp);
4184
4185 mutex_enter(&zp->z_lock);
4186 if (zp->z_is_sa)
4187 error = sa_lookup_uio(zp->z_sa_hdl,
4188 SA_ZPL_SYMLINK(zfsvfs), uio);
4189 else
4190 error = zfs_sa_readlink(zp, uio);
4191 mutex_exit(&zp->z_lock);
4192
4193 ZFS_EXIT(zfsvfs);
4194 return (error);
4195 }
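/*
 * Illustrative only: reading a link target into a kernel buffer by
 * hand-building a uio, as a hypothetical in-kernel consumer might do.
 * The uio field names follow those used throughout this file; treat
 * the exact initialization as an assumption rather than a reference.
 */
#if 0
static int
example_readlink(struct inode *ip, char *buf, size_t buflen, cred_t *cr)
{
	struct iovec iov = { .iov_base = buf, .iov_len = buflen };
	uio_t uio;

	bzero(&uio, sizeof (uio));
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_resid = buflen;
	uio.uio_loffset = 0;

	return (zfs_readlink(ip, &uio, cr));
}
#endif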
4196
4197 /*
4198 * Insert a new entry into directory tdip referencing sip.
4199 *
4200 * IN: tdip - Directory to contain new entry.
4201 * sip - inode of new entry.
4202 * name - name of new entry.
4203 * cr - credentials of caller.
4204 *
4205 * RETURN: 0 if success
4206 * error code if failure
4207 *
4208 * Timestamps:
4209 * tdip - ctime|mtime updated
4210 * sip - ctime updated
4211 */
4212 /* ARGSUSED */
4213 int
4214 zfs_link(struct inode *tdip, struct inode *sip, char *name, cred_t *cr,
4215 int flags)
4216 {
4217 znode_t *dzp = ITOZ(tdip);
4218 znode_t *tzp, *szp;
4219 zfsvfs_t *zfsvfs = ITOZSB(tdip);
4220 zilog_t *zilog;
4221 zfs_dirlock_t *dl;
4222 dmu_tx_t *tx;
4223 int error;
4224 int zf = ZNEW;
4225 uint64_t parent;
4226 uid_t owner;
4227 boolean_t waited = B_FALSE;
4228 boolean_t is_tmpfile = 0;
4229 uint64_t txg;
4230 #ifdef HAVE_TMPFILE
4231 is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE));
4232 #endif
4233 ASSERT(S_ISDIR(tdip->i_mode));
4234
4235 if (name == NULL)
4236 return (SET_ERROR(EINVAL));
4237
4238 ZFS_ENTER(zfsvfs);
4239 ZFS_VERIFY_ZP(dzp);
4240 zilog = zfsvfs->z_log;
4241
4242 /*
4243 * POSIX dictates that we return EPERM here.
4244 * Better choices include ENOTSUP or EISDIR.
4245 */
4246 if (S_ISDIR(sip->i_mode)) {
4247 ZFS_EXIT(zfsvfs);
4248 return (SET_ERROR(EPERM));
4249 }
4250
4251 szp = ITOZ(sip);
4252 ZFS_VERIFY_ZP(szp);
4253
4254 /*
4255 * If we are using project inheritance, meaning the directory has
4256 * ZFS_PROJINHERIT set, then its descendant directories inherit not
4257 * only the project ID but also the ZFS_PROJINHERIT flag. In that
4258 * case, we only allow hard link creation in our tree when the
4259 * project IDs are the same.
4260 */
4261 if (dzp->z_pflags & ZFS_PROJINHERIT && dzp->z_projid != szp->z_projid) {
4262 ZFS_EXIT(zfsvfs);
4263 return (SET_ERROR(EXDEV));
4264 }
4265
4266 /*
4267 * We check i_sb because snapshots and the ctldir must have different
4268 * super blocks.
4269 */
4270 if (sip->i_sb != tdip->i_sb || zfsctl_is_node(sip)) {
4271 ZFS_EXIT(zfsvfs);
4272 return (SET_ERROR(EXDEV));
4273 }
4274
4275 /* Prevent links to .zfs/shares files */
4276
4277 if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
4278 &parent, sizeof (uint64_t))) != 0) {
4279 ZFS_EXIT(zfsvfs);
4280 return (error);
4281 }
4282 if (parent == zfsvfs->z_shares_dir) {
4283 ZFS_EXIT(zfsvfs);
4284 return (SET_ERROR(EPERM));
4285 }
4286
4287 if (zfsvfs->z_utf8 && u8_validate(name,
4288 strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4289 ZFS_EXIT(zfsvfs);
4290 return (SET_ERROR(EILSEQ));
4291 }
4292 if (flags & FIGNORECASE)
4293 zf |= ZCILOOK;
4294
4295 /*
4296 * We do not support links between attributes and non-attributes
4297 * because of the potential security risk of creating links
4298 * into "normal" file space in order to circumvent restrictions
4299 * imposed in attribute space.
4300 */
4301 if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
4302 ZFS_EXIT(zfsvfs);
4303 return (SET_ERROR(EINVAL));
4304 }
4305
4306 owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid),
4307 cr, ZFS_OWNER);
4308 if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
4309 ZFS_EXIT(zfsvfs);
4310 return (SET_ERROR(EPERM));
4311 }
4312
4313 if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
4314 ZFS_EXIT(zfsvfs);
4315 return (error);
4316 }
4317
4318 top:
4319 /*
4320 * Attempt to lock directory; fail if entry already exists.
4321 */
4322 error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
4323 if (error) {
4324 ZFS_EXIT(zfsvfs);
4325 return (error);
4326 }
4327
4328 tx = dmu_tx_create(zfsvfs->z_os);
4329 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
4330 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4331 if (is_tmpfile)
4332 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
4333
4334 zfs_sa_upgrade_txholds(tx, szp);
4335 zfs_sa_upgrade_txholds(tx, dzp);
4336 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
4337 if (error) {
4338 zfs_dirent_unlock(dl);
4339 if (error == ERESTART) {
4340 waited = B_TRUE;
4341 dmu_tx_wait(tx);
4342 dmu_tx_abort(tx);
4343 goto top;
4344 }
4345 dmu_tx_abort(tx);
4346 ZFS_EXIT(zfsvfs);
4347 return (error);
4348 }
4349 /* Unmark z_unlinked so zfs_link_create() will not reject it. */
4350 if (is_tmpfile)
4351 szp->z_unlinked = 0;
4352 error = zfs_link_create(dl, szp, tx, 0);
4353
4354 if (error == 0) {
4355 uint64_t txtype = TX_LINK;
4356 /*
4357 * A tmpfile is created in z_unlinkedobj, so remove it from there.
4358 * Also, we don't log to the ZIL, because all previous file
4359 * operations on the tmpfile were ignored by the ZIL. Instead we
4360 * always wait for the txg to sync to make sure all previous
4361 * operations are safely on stable storage.
4362 */
4363 if (is_tmpfile) {
4364 VERIFY(zap_remove_int(zfsvfs->z_os,
4365 zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0);
4366 } else {
4367 if (flags & FIGNORECASE)
4368 txtype |= TX_CI;
4369 zfs_log_link(zilog, tx, txtype, dzp, szp, name);
4370 }
4371 } else if (is_tmpfile) {
4372 /* Restore z_unlinked since linking failed. */
4373 szp->z_unlinked = 1;
4374 }
4375 txg = dmu_tx_get_txg(tx);
4376 dmu_tx_commit(tx);
4377
4378 zfs_dirent_unlock(dl);
4379
4380 if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4381 zil_commit(zilog, 0);
4382
4383 if (is_tmpfile)
4384 txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), txg);
4385
4386 zfs_inode_update(dzp);
4387 zfs_inode_update(szp);
4388 ZFS_EXIT(zfsvfs);
4389 return (error);
4390 }
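/*
 * Illustrative only: the is_tmpfile path above implements the Linux
 * O_TMPFILE + linkat(AT_EMPTY_PATH) flow. A hypothetical sketch of
 * that caller, highlighting the durability difference:
 */
#if 0
static int
example_link_tmpfile(struct inode *tdip, struct inode *sip,
    char *name, cred_t *cr)
{
	/*
	 * For a tmpfile (i_nlink == 0 and I_LINKABLE set) nothing was
	 * ever logged to the ZIL, so zfs_link() waits for the txg to
	 * sync instead; expect this call to block longer than a
	 * normal hard link would.
	 */
	return (zfs_link(tdip, sip, name, cr, 0));
}
#endif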
4391
4392 static void
4393 zfs_putpage_commit_cb(void *arg)
4394 {
4395 struct page *pp = arg;
4396
4397 ClearPageError(pp);
4398 end_page_writeback(pp);
4399 }
4400
4401 /*
4402 * Push a page out to disk; once the page is on stable storage the
4403 * registered commit callback will be run as notification of completion.
4404 *
4405 * IN: ip - inode the page belongs to.
4406 * pp - page to push (page is locked)
4407 * wbc - writeback control data
4408 *
4409 * RETURN: 0 if success
4410 * error code if failure
4411 *
4412 * Timestamps:
4413 * ip - ctime|mtime updated
4414 */
4415 /* ARGSUSED */
4416 int
4417 zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
4418 {
4419 znode_t *zp = ITOZ(ip);
4420 zfsvfs_t *zfsvfs = ITOZSB(ip);
4421 loff_t offset;
4422 loff_t pgoff;
4423 unsigned int pglen;
4424 dmu_tx_t *tx;
4425 caddr_t va;
4426 int err = 0;
4427 uint64_t mtime[2], ctime[2];
4428 sa_bulk_attr_t bulk[3];
4429 int cnt = 0;
4430 struct address_space *mapping;
4431
4432 ZFS_ENTER(zfsvfs);
4433 ZFS_VERIFY_ZP(zp);
4434
4435 ASSERT(PageLocked(pp));
4436
4437 pgoff = page_offset(pp); /* Page byte-offset in file */
4438 offset = i_size_read(ip); /* File length in bytes */
4439 pglen = MIN(PAGE_SIZE, /* Page length in bytes */
4440 P2ROUNDUP(offset, PAGE_SIZE)-pgoff);
4441
4442 /* Page is beyond end of file */
4443 if (pgoff >= offset) {
4444 unlock_page(pp);
4445 ZFS_EXIT(zfsvfs);
4446 return (0);
4447 }
4448
4449 /* Truncate page length to end of file */
4450 if (pgoff + pglen > offset)
4451 pglen = offset - pgoff;
4452
4453 #if 0
4454 /*
4455 * FIXME: Allow mmap writes past its quota. The correct fix
4456 * is to register a page_mkwrite() handler to count the page
4457 * against its quota when it is about to be dirtied.
4458 */
4459 if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT,
4460 KUID_TO_SUID(ip->i_uid)) ||
4461 zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT,
4462 KGID_TO_SGID(ip->i_gid)) ||
4463 (zp->z_projid != ZFS_DEFAULT_PROJID &&
4464 zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
4465 zp->z_projid))) {
4466 err = EDQUOT;
4467 }
4468 #endif
4469
4470 /*
4471 * The ordering here is critical and must adhere to the following
4472 * rules in order to avoid deadlocking in either zfs_read() or
4473 * zfs_free_range() due to a lock inversion.
4474 *
4475 * 1) The page must be unlocked prior to acquiring the range lock.
4476 * This is critical because zfs_read() calls find_lock_page()
4477 * which may block on the page lock while holding the range lock.
4478 *
4479 * 2) Before setting or clearing write back on a page the range lock
4480 * must be held in order to prevent a lock inversion with the
4481 * zfs_free_range() function.
4482 *
4483 * This presents a problem because upon entering this function the
4484 * page lock is already held. To safely acquire the range lock the
4485 * page lock must be dropped. This creates a window where another
4486 * process could truncate, invalidate, dirty, or write out the page.
4487 *
4488 * Therefore, after successfully reacquiring the range and page locks
4489 * the current page state is checked. In the common case everything
4490 * will be as is expected and it can be written out. However, if
4491 * the page state has changed it must be handled accordingly.
4492 */
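	/*
	 * A concrete (hypothetical) interleaving of the inversion the
	 * rules above avoid:
	 *
	 *   zfs_putpage()                  zfs_read()
	 *   -------------                  ----------
	 *   holds page lock                holds range lock
	 *   wants range lock               wants page lock
	 *                                  (via find_lock_page())
	 *
	 * Dropping the page lock first, then taking the range lock and
	 * re-taking the page lock, breaks the cycle.
	 */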
4493 mapping = pp->mapping;
4494 redirty_page_for_writepage(wbc, pp);
4495 unlock_page(pp);
4496
4497 locked_range_t *lr = rangelock_enter(&zp->z_rangelock,
4498 pgoff, pglen, RL_WRITER);
4499 lock_page(pp);
4500
4501 /* Page mapping changed or it was no longer dirty, we're done */
4502 if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) {
4503 unlock_page(pp);
4504 rangelock_exit(lr);
4505 ZFS_EXIT(zfsvfs);
4506 return (0);
4507 }
4508
4509 /* Another process started writeback; block if required */
4510 if (PageWriteback(pp)) {
4511 unlock_page(pp);
4512 rangelock_exit(lr);
4513
4514 if (wbc->sync_mode != WB_SYNC_NONE)
4515 wait_on_page_writeback(pp);
4516
4517 ZFS_EXIT(zfsvfs);
4518 return (0);
4519 }
4520
4521 /* Clear the dirty flag now that the required locks are held */
4522 if (!clear_page_dirty_for_io(pp)) {
4523 unlock_page(pp);
4524 rangelock_exit(lr);
4525 ZFS_EXIT(zfsvfs);
4526 return (0);
4527 }
4528
4529 /*
4530 * Counterpart for redirty_page_for_writepage() above. This page
4531 * was in fact not skipped and should not be counted as if it were.
4532 */
4533 wbc->pages_skipped--;
4534 set_page_writeback(pp);
4535 unlock_page(pp);
4536
4537 tx = dmu_tx_create(zfsvfs->z_os);
4538 dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen);
4539 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4540 zfs_sa_upgrade_txholds(tx, zp);
4541
4542 err = dmu_tx_assign(tx, TXG_NOWAIT);
4543 if (err != 0) {
4544 if (err == ERESTART)
4545 dmu_tx_wait(tx);
4546
4547 dmu_tx_abort(tx);
4548 __set_page_dirty_nobuffers(pp);
4549 ClearPageError(pp);
4550 end_page_writeback(pp);
4551 rangelock_exit(lr);
4552 ZFS_EXIT(zfsvfs);
4553 return (err);
4554 }
4555
4556 va = kmap(pp);
4557 ASSERT3U(pglen, <=, PAGE_SIZE);
4558 dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx);
4559 kunmap(pp);
4560
4561 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
4562 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
4563 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL,
4564 &zp->z_pflags, 8);
4565
4566 /* Preserve the mtime and ctime provided by the inode */
4567 ZFS_TIME_ENCODE(&ip->i_mtime, mtime);
4568 ZFS_TIME_ENCODE(&ip->i_ctime, ctime);
4569 zp->z_atime_dirty = 0;
4570 zp->z_seq++;
4571
4572 err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
4573
4574 zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0,
4575 zfs_putpage_commit_cb, pp);
4576 dmu_tx_commit(tx);
4577
4578 rangelock_exit(lr);
4579
4580 if (wbc->sync_mode != WB_SYNC_NONE) {
4581 /*
4582 * Note that this is rarely called under writepages(), because
4583 * writepages() normally handles the entire commit for
4584 * performance reasons.
4585 */
4586 zil_commit(zfsvfs->z_log, zp->z_id);
4587 }
4588
4589 ZFS_EXIT(zfsvfs);
4590 return (err);
4591 }
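/*
 * Illustrative only: writeback completes asynchronously through
 * zfs_putpage_commit_cb() once the log record is on stable storage.
 * A hypothetical caller that needs the page durable before returning
 * might look like this (assuming it already holds a page reference):
 */
#if 0
static int
example_sync_putpage(struct inode *ip, struct page *pp)
{
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_ALL,	/* forces the zil_commit() */
	};
	int error;

	lock_page(pp);		/* zfs_putpage() expects a locked page */
	error = zfs_putpage(ip, pp, &wbc);
	if (error == 0)
		wait_on_page_writeback(pp);
	return (error);
}
#endif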
4592
4593 /*
4594 * Update the system attributes when the inode has been dirtied. For the
4595 * moment we only update the mode, atime, mtime, and ctime.
4596 */
4597 int
4598 zfs_dirty_inode(struct inode *ip, int flags)
4599 {
4600 znode_t *zp = ITOZ(ip);
4601 zfsvfs_t *zfsvfs = ITOZSB(ip);
4602 dmu_tx_t *tx;
4603 uint64_t mode, atime[2], mtime[2], ctime[2];
4604 sa_bulk_attr_t bulk[4];
4605 int error = 0;
4606 int cnt = 0;
4607
4608 if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
4609 return (0);
4610
4611 ZFS_ENTER(zfsvfs);
4612 ZFS_VERIFY_ZP(zp);
4613
4614 #ifdef I_DIRTY_TIME
4615 /*
4616 * This is the lazytime semantic introduced in Linux 4.0. This
4617 * flag is only passed in from update_time() when lazytime is
4618 * enabled. (Note: I_DIRTY_SYNC will also be set if not lazytime.)
4619 * Fortunately mtime and ctime are managed within ZFS itself, so
4620 * we only need to dirty atime.
4621 */
4622 if (flags == I_DIRTY_TIME) {
4623 zp->z_atime_dirty = 1;
4624 goto out;
4625 }
4626 #endif
4627
4628 tx = dmu_tx_create(zfsvfs->z_os);
4629
4630 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4631 zfs_sa_upgrade_txholds(tx, zp);
4632
4633 error = dmu_tx_assign(tx, TXG_WAIT);
4634 if (error) {
4635 dmu_tx_abort(tx);
4636 goto out;
4637 }
4638
4639 mutex_enter(&zp->z_lock);
4640 zp->z_atime_dirty = 0;
4641
4642 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
4643 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
4644 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
4645 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
4646
4647 /* Preserve the mode, mtime and ctime provided by the inode */
4648 ZFS_TIME_ENCODE(&ip->i_atime, atime);
4649 ZFS_TIME_ENCODE(&ip->i_mtime, mtime);
4650 ZFS_TIME_ENCODE(&ip->i_ctime, ctime);
4651 mode = ip->i_mode;
4652
4653 zp->z_mode = mode;
4654
4655 error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
4656 mutex_exit(&zp->z_lock);
4657
4658 dmu_tx_commit(tx);
4659 out:
4660 ZFS_EXIT(zfsvfs);
4661 return (error);
4662 }
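/*
 * Illustrative only: zfs_dirty_inode() is reached through the
 * super_operations->dirty_inode hook. A hypothetical minimal binding
 * (the hook returns void, so errors can only be dropped):
 */
#if 0
static void
example_dirty_inode(struct inode *ip, int flags)
{
	(void) zfs_dirty_inode(ip, flags);
}
#endif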
4663
4664 /*ARGSUSED*/
4665 void
4666 zfs_inactive(struct inode *ip)
4667 {
4668 znode_t *zp = ITOZ(ip);
4669 zfsvfs_t *zfsvfs = ITOZSB(ip);
4670 uint64_t atime[2];
4671 int error;
4672 int need_unlock = 0;
4673
4674 /* Only read lock if we haven't already write locked, e.g. rollback */
4675 if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) {
4676 need_unlock = 1;
4677 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
4678 }
4679 if (zp->z_sa_hdl == NULL) {
4680 if (need_unlock)
4681 rw_exit(&zfsvfs->z_teardown_inactive_lock);
4682 return;
4683 }
4684
4685 if (zp->z_atime_dirty && zp->z_unlinked == 0) {
4686 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
4687
4688 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4689 zfs_sa_upgrade_txholds(tx, zp);
4690 error = dmu_tx_assign(tx, TXG_WAIT);
4691 if (error) {
4692 dmu_tx_abort(tx);
4693 } else {
4694 ZFS_TIME_ENCODE(&ip->i_atime, atime);
4695 mutex_enter(&zp->z_lock);
4696 (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
4697 (void *)&atime, sizeof (atime), tx);
4698 zp->z_atime_dirty = 0;
4699 mutex_exit(&zp->z_lock);
4700 dmu_tx_commit(tx);
4701 }
4702 }
4703
4704 zfs_zinactive(zp);
4705 if (need_unlock)
4706 rw_exit(&zfsvfs->z_teardown_inactive_lock);
4707 }
4708
4709 /*
4710 * Bounds-check the seek operation.
4711 *
4712 * IN: ip - inode seeking within
4713 * ooff - old file offset
4714 * noffp - pointer to new file offset
4715 * ct - caller context
4716 *
4717 * RETURN: 0 if success
4718 * EINVAL if new offset invalid
4719 */
4720 /* ARGSUSED */
4721 int
4722 zfs_seek(struct inode *ip, offset_t ooff, offset_t *noffp)
4723 {
4724 if (S_ISDIR(ip->i_mode))
4725 return (0);
4726 return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
4727 }
4728
4729 /*
4730 * Fill pages with data from the disk.
4731 */
4732 static int
4733 zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages)
4734 {
4735 znode_t *zp = ITOZ(ip);
4736 zfsvfs_t *zfsvfs = ITOZSB(ip);
4737 objset_t *os;
4738 struct page *cur_pp;
4739 u_offset_t io_off, total;
4740 size_t io_len;
4741 loff_t i_size;
4742 unsigned page_idx;
4743 int err;
4744
4745 os = zfsvfs->z_os;
4746 io_len = nr_pages << PAGE_SHIFT;
4747 i_size = i_size_read(ip);
4748 io_off = page_offset(pl[0]);
4749
4750 if (io_off + io_len > i_size)
4751 io_len = i_size - io_off;
4752
4753 /*
4754 * Iterate over list of pages and read each page individually.
4755 */
4756 page_idx = 0;
4757 for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
4758 caddr_t va;
4759
4760 cur_pp = pl[page_idx++];
4761 va = kmap(cur_pp);
4762 err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
4763 DMU_READ_PREFETCH);
4764 kunmap(cur_pp);
4765 if (err) {
4766 /* convert checksum errors into IO errors */
4767 if (err == ECKSUM)
4768 err = SET_ERROR(EIO);
4769 return (err);
4770 }
4771 }
4772
4773 return (0);
4774 }
4775
4776 /*
4777 * Uses zfs_fillpage to read data from the file and fill the pages.
4778 *
4779 * IN: ip - inode of file to get data from.
4780 * pl - list of pages to read
4781 * nr_pages - number of pages to read
4782 *
4783 * RETURN: 0 on success, error code on failure.
4784 *
4785 * Timestamps:
4786 * ip - atime updated
4787 */
4788 /* ARGSUSED */
4789 int
4790 zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages)
4791 {
4792 znode_t *zp = ITOZ(ip);
4793 zfsvfs_t *zfsvfs = ITOZSB(ip);
4794 int err;
4795
4796 if (pl == NULL)
4797 return (0);
4798
4799 ZFS_ENTER(zfsvfs);
4800 ZFS_VERIFY_ZP(zp);
4801
4802 err = zfs_fillpage(ip, pl, nr_pages);
4803
4804 ZFS_EXIT(zfsvfs);
4805 return (err);
4806 }
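/*
 * Illustrative only: a hypothetical single-page ->readpage() style
 * binding over zfs_getpage(), assuming the caller is responsible for
 * the page flags and for unlocking the page afterwards.
 */
#if 0
static int
example_readpage(struct inode *ip, struct page *pp)
{
	struct page *pl[1] = { pp };
	int error;

	error = zfs_getpage(ip, pl, 1);
	if (error == 0)
		SetPageUptodate(pp);
	else
		SetPageError(pp);

	unlock_page(pp);
	return (error);
}
#endif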
4807
4808 /*
4809 * Check ZFS specific permissions to memory map a section of a file.
4810 *
4811 * IN: ip - inode of the file to mmap
4812 * off - file offset
4813 * addrp - start address in memory region
4814 * len - length of memory region
4815 * vm_flags - address flags
4816 *
4817 * RETURN: 0 if success
4818 * error code if failure
4819 */
4820 /*ARGSUSED*/
4821 int
4822 zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len,
4823 unsigned long vm_flags)
4824 {
4825 znode_t *zp = ITOZ(ip);
4826 zfsvfs_t *zfsvfs = ITOZSB(ip);
4827
4828 ZFS_ENTER(zfsvfs);
4829 ZFS_VERIFY_ZP(zp);
4830
4831 if ((vm_flags & VM_WRITE) && (zp->z_pflags &
4832 (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
4833 ZFS_EXIT(zfsvfs);
4834 return (SET_ERROR(EPERM));
4835 }
4836
4837 if ((vm_flags & (VM_READ | VM_EXEC)) &&
4838 (zp->z_pflags & ZFS_AV_QUARANTINED)) {
4839 ZFS_EXIT(zfsvfs);
4840 return (SET_ERROR(EACCES));
4841 }
4842
4843 if (off < 0 || len > MAXOFFSET_T - off) {
4844 ZFS_EXIT(zfsvfs);
4845 return (SET_ERROR(ENXIO));
4846 }
4847
4848 ZFS_EXIT(zfsvfs);
4849 return (0);
4850 }
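/*
 * Illustrative only: how an mmap entry point might run these checks
 * before handing off to the generic code. Note the sign flip: the
 * Linux VFS wants negative errnos while zfs_map() returns positive
 * ones. A hypothetical sketch:
 */
#if 0
static int
example_mmap(struct file *filp, struct vm_area_struct *vma)
{
	struct inode *ip = file_inode(filp);
	int error;

	error = -zfs_map(ip, vma->vm_pgoff << PAGE_SHIFT,
	    (caddr_t *)&vma->vm_start, vma->vm_end - vma->vm_start,
	    vma->vm_flags);
	if (error)
		return (error);

	return (generic_file_mmap(filp, vma));
}
#endif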
4851
4852 /*
4853 * convoff - converts the given flock64 data (l_start, l_whence)
4854 * to be relative to the given whence.
4855 */
4856 int
4857 convoff(struct inode *ip, flock64_t *lckdat, int whence, offset_t offset)
4858 {
4859 vattr_t vap;
4860 int error;
4861
4862 if ((lckdat->l_whence == 2) || (whence == 2)) {
4863 if ((error = zfs_getattr(ip, &vap, 0, CRED())))
4864 return (error);
4865 }
4866
4867 switch (lckdat->l_whence) {
4868 case 1:
4869 lckdat->l_start += offset;
4870 break;
4871 case 2:
4872 lckdat->l_start += vap.va_size;
4873 /* FALLTHRU */
4874 case 0:
4875 break;
4876 default:
4877 return (SET_ERROR(EINVAL));
4878 }
4879
4880 if (lckdat->l_start < 0)
4881 return (SET_ERROR(EINVAL));
4882
4883 switch (whence) {
4884 case 1:
4885 lckdat->l_start -= offset;
4886 break;
4887 case 2:
4888 lckdat->l_start -= vap.va_size;
4889 /* FALLTHRU */
4890 case 0:
4891 break;
4892 default:
4893 return (SET_ERROR(EINVAL));
4894 }
4895
4896 lckdat->l_whence = (short)whence;
4897 return (0);
4898 }
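/*
 * Illustrative only: the numeric whence values above are the classic
 * SEEK_SET (0), SEEK_CUR (1) and SEEK_END (2). A hypothetical helper
 * that rewrites any lock description to be file-absolute:
 */
#if 0
static int
example_normalize_lock(struct inode *ip, flock64_t *fl, offset_t fileoff)
{
	/* On success fl->l_whence == 0 and fl->l_start is absolute. */
	return (convoff(ip, fl, 0 /* SEEK_SET */, fileoff));
}
#endif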
4899
4900 /*
4901 * Free or allocate space in a file. Currently, this function only
4902 * supports the `F_FREESP' command. However, this command is somewhat
4903 * misnamed, as its functionality includes the ability to allocate as
4904 * well as free space.
4905 *
4906 * IN: ip - inode of file to free data in.
4907 * cmd - action to take (only F_FREESP supported).
4908 * bfp - section of file to free/alloc.
4909 * flag - current file open mode flags.
4910 * offset - current file offset.
4911 * cr - credentials of caller [UNUSED].
4912 *
4913 * RETURN: 0 on success, error code on failure.
4914 *
4915 * Timestamps:
4916 * ip - ctime|mtime updated
4917 */
4918 /* ARGSUSED */
4919 int
4920 zfs_space(struct inode *ip, int cmd, flock64_t *bfp, int flag,
4921 offset_t offset, cred_t *cr)
4922 {
4923 znode_t *zp = ITOZ(ip);
4924 zfsvfs_t *zfsvfs = ITOZSB(ip);
4925 uint64_t off, len;
4926 int error;
4927
4928 ZFS_ENTER(zfsvfs);
4929 ZFS_VERIFY_ZP(zp);
4930
4931 if (cmd != F_FREESP) {
4932 ZFS_EXIT(zfsvfs);
4933 return (SET_ERROR(EINVAL));
4934 }
4935
4936 /*
4937 * Callers might not be able to detect properly that we are read-only,
4938 * so check it explicitly here.
4939 */
4940 if (zfs_is_readonly(zfsvfs)) {
4941 ZFS_EXIT(zfsvfs);
4942 return (SET_ERROR(EROFS));
4943 }
4944
4945 if ((error = convoff(ip, bfp, 0, offset))) {
4946 ZFS_EXIT(zfsvfs);
4947 return (error);
4948 }
4949
4950 if (bfp->l_len < 0) {
4951 ZFS_EXIT(zfsvfs);
4952 return (SET_ERROR(EINVAL));
4953 }
4954
4955 /*
4956 * Permissions aren't checked on Solaris because on this OS
4957 * zfs_space() can only be called with an opened file handle.
4958 * On Linux we can get here through truncate_range() which
4959 * operates directly on inodes, so we need to check access rights.
4960 */
4961 if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) {
4962 ZFS_EXIT(zfsvfs);
4963 return (error);
4964 }
4965
4966 off = bfp->l_start;
4967 len = bfp->l_len; /* 0 means from off to end of file */
4968
4969 error = zfs_freesp(zp, off, len, flag, TRUE);
4970
4971 ZFS_EXIT(zfsvfs);
4972 return (error);
4973 }
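/*
 * Illustrative only: freeing a byte range (a hole punch) via F_FREESP.
 * A hypothetical sketch; note that l_len == 0 means "from l_start to
 * end of file", which is how truncation is expressed here.
 */
#if 0
static int
example_punch_hole(struct inode *ip, uint64_t off, uint64_t len, cred_t *cr)
{
	flock64_t bf;

	bzero(&bf, sizeof (bf));
	bf.l_whence = 0;	/* l_start is file-absolute */
	bf.l_start = off;
	bf.l_len = len;

	return (zfs_space(ip, F_FREESP, &bf, FWRITE, 0, cr));
}
#endif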
4974
4975 /*ARGSUSED*/
4976 int
4977 zfs_fid(struct inode *ip, fid_t *fidp)
4978 {
4979 znode_t *zp = ITOZ(ip);
4980 zfsvfs_t *zfsvfs = ITOZSB(ip);
4981 uint32_t gen;
4982 uint64_t gen64;
4983 uint64_t object = zp->z_id;
4984 zfid_short_t *zfid;
4985 int size, i, error;
4986
4987 ZFS_ENTER(zfsvfs);
4988 ZFS_VERIFY_ZP(zp);
4989
4990 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
4991 &gen64, sizeof (uint64_t))) != 0) {
4992 ZFS_EXIT(zfsvfs);
4993 return (error);
4994 }
4995
4996 gen = (uint32_t)gen64;
4997
4998 size = SHORT_FID_LEN;
4999
5000 zfid = (zfid_short_t *)fidp;
5001
5002 zfid->zf_len = size;
5003
5004 for (i = 0; i < sizeof (zfid->zf_object); i++)
5005 zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
5006
5007 /* Must have a non-zero generation number to distinguish from .zfs */
5008 if (gen == 0)
5009 gen = 1;
5010 for (i = 0; i < sizeof (zfid->zf_gen); i++)
5011 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
5012
5013 ZFS_EXIT(zfsvfs);
5014 return (0);
5015 }
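/*
 * Illustrative only: the object and generation numbers are packed
 * little-endian one byte at a time. A hypothetical decoder, the exact
 * inverse of the encoding loops above:
 */
#if 0
static void
example_decode_fid(const zfid_short_t *zfid, uint64_t *objp, uint32_t *genp)
{
	uint64_t object = 0;
	uint32_t gen = 0;
	int i;

	for (i = 0; i < sizeof (zfid->zf_object); i++)
		object |= (uint64_t)zfid->zf_object[i] << (8 * i);
	for (i = 0; i < sizeof (zfid->zf_gen); i++)
		gen |= (uint32_t)zfid->zf_gen[i] << (8 * i);

	*objp = object;
	*genp = gen;
}
#endif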
5016
5017 /*ARGSUSED*/
5018 int
5019 zfs_getsecattr(struct inode *ip, vsecattr_t *vsecp, int flag, cred_t *cr)
5020 {
5021 znode_t *zp = ITOZ(ip);
5022 zfsvfs_t *zfsvfs = ITOZSB(ip);
5023 int error;
5024 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
5025
5026 ZFS_ENTER(zfsvfs);
5027 ZFS_VERIFY_ZP(zp);
5028 error = zfs_getacl(zp, vsecp, skipaclchk, cr);
5029 ZFS_EXIT(zfsvfs);
5030
5031 return (error);
5032 }
5033
5034 /*ARGSUSED*/
5035 int
5036 zfs_setsecattr(struct inode *ip, vsecattr_t *vsecp, int flag, cred_t *cr)
5037 {
5038 znode_t *zp = ITOZ(ip);
5039 zfsvfs_t *zfsvfs = ITOZSB(ip);
5040 int error;
5041 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
5042 zilog_t *zilog = zfsvfs->z_log;
5043
5044 ZFS_ENTER(zfsvfs);
5045 ZFS_VERIFY_ZP(zp);
5046
5047 error = zfs_setacl(zp, vsecp, skipaclchk, cr);
5048
5049 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
5050 zil_commit(zilog, 0);
5051
5052 ZFS_EXIT(zfsvfs);
5053 return (error);
5054 }
5055
5056 #ifdef HAVE_UIO_ZEROCOPY
5057 /*
5058 * Tunables; both must be a power of 2.
5059 *
5060 * zcr_blksz_min: the smallest read we may consider to loan out an arcbuf
5061 * zcr_blksz_max: if set to less than the file block size, allow loaning out of
5062 * an arcbuf for a partial block read
5063 */
5064 int zcr_blksz_min = (1 << 10); /* 1K */
5065 int zcr_blksz_max = (1 << 17); /* 128K */
5066
5067 /*ARGSUSED*/
5068 static int
5069 zfs_reqzcbuf(struct inode *ip, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr)
5070 {
5071 znode_t *zp = ITOZ(ip);
5072 zfsvfs_t *zfsvfs = ITOZSB(ip);
5073 int max_blksz = zfsvfs->z_max_blksz;
5074 uio_t *uio = &xuio->xu_uio;
5075 ssize_t size = uio->uio_resid;
5076 offset_t offset = uio->uio_loffset;
5077 int blksz;
5078 int fullblk, i;
5079 arc_buf_t *abuf;
5080 ssize_t maxsize;
5081 int preamble, postamble;
5082
5083 if (xuio->xu_type != UIOTYPE_ZEROCOPY)
5084 return (SET_ERROR(EINVAL));
5085
5086 ZFS_ENTER(zfsvfs);
5087 ZFS_VERIFY_ZP(zp);
5088 switch (ioflag) {
5089 case UIO_WRITE:
5090 /*
5091 * Loan out an arc_buf for write if write size is bigger than
5092 * max_blksz, and the file's block size is also max_blksz.
5093 */
5094 blksz = max_blksz;
5095 if (size < blksz || zp->z_blksz != blksz) {
5096 ZFS_EXIT(zfsvfs);
5097 return (SET_ERROR(EINVAL));
5098 }
5099 /*
5100 * Caller requests buffers for write before knowing where the
5101 * write offset might be (e.g. NFS TCP write).
5102 */
5103 if (offset == -1) {
5104 preamble = 0;
5105 } else {
5106 preamble = P2PHASE(offset, blksz);
5107 if (preamble) {
5108 preamble = blksz - preamble;
5109 size -= preamble;
5110 }
5111 }
5112
5113 postamble = P2PHASE(size, blksz);
5114 size -= postamble;
5115
5116 fullblk = size / blksz;
5117 (void) dmu_xuio_init(xuio,
5118 (preamble != 0) + fullblk + (postamble != 0));
5119
5120 /*
5121 * Have to fix iov base/len for partial buffers. They
5122 * currently represent full arc_buf's.
5123 */
5124 if (preamble) {
5125 /* data begins in the middle of the arc_buf */
5126 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5127 blksz);
5128 ASSERT(abuf);
5129 (void) dmu_xuio_add(xuio, abuf,
5130 blksz - preamble, preamble);
5131 }
5132
5133 for (i = 0; i < fullblk; i++) {
5134 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5135 blksz);
5136 ASSERT(abuf);
5137 (void) dmu_xuio_add(xuio, abuf, 0, blksz);
5138 }
5139
5140 if (postamble) {
5141 /* data ends in the middle of the arc_buf */
5142 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5143 blksz);
5144 ASSERT(abuf);
5145 (void) dmu_xuio_add(xuio, abuf, 0, postamble);
5146 }
5147 break;
5148 case UIO_READ:
5149 /*
5150 * Loan out an arc_buf for read if the read size is larger than
5151 * the current file block size. Block alignment is not
5152 * considered. Partial arc_buf will be loaned out for read.
5153 */
5154 blksz = zp->z_blksz;
5155 if (blksz < zcr_blksz_min)
5156 blksz = zcr_blksz_min;
5157 if (blksz > zcr_blksz_max)
5158 blksz = zcr_blksz_max;
5159 /* avoid potential complexity of dealing with it */
5160 if (blksz > max_blksz) {
5161 ZFS_EXIT(zfsvfs);
5162 return (SET_ERROR(EINVAL));
5163 }
5164
5165 maxsize = zp->z_size - uio->uio_loffset;
5166 if (size > maxsize)
5167 size = maxsize;
5168
5169 if (size < blksz) {
5170 ZFS_EXIT(zfsvfs);
5171 return (SET_ERROR(EINVAL));
5172 }
5173 break;
5174 default:
5175 ZFS_EXIT(zfsvfs);
5176 return (SET_ERROR(EINVAL));
5177 }
5178
5179 uio->uio_extflg = UIO_XUIO;
5180 XUIO_XUZC_RW(xuio) = ioflag;
5181 ZFS_EXIT(zfsvfs);
5182 return (0);
5183 }
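/*
 * Worked example for the write-side carve-up above, with hypothetical
 * numbers: blksz = 128K, offset = 96K, size = 448K.
 *
 *   preamble  = blksz - P2PHASE(96K, 128K) = 128K - 96K = 32K
 *   size     -= preamble                   -> 416K
 *   postamble = P2PHASE(416K, 128K)        = 32K
 *   size     -= postamble                  -> 384K = 3 full blocks
 *
 * dmu_xuio_init() is therefore asked for 1 + 3 + 1 = 5 arc_bufs.
 */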
5184
5185 /*ARGSUSED*/
5186 static int
5187 zfs_retzcbuf(struct inode *ip, xuio_t *xuio, cred_t *cr)
5188 {
5189 int i;
5190 arc_buf_t *abuf;
5191 int ioflag = XUIO_XUZC_RW(xuio);
5192
5193 ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY);
5194
5195 i = dmu_xuio_cnt(xuio);
5196 while (i-- > 0) {
5197 abuf = dmu_xuio_arcbuf(xuio, i);
5198 /*
5199 * if abuf == NULL, it must be a write buffer
5200 * that has been returned in zfs_write().
5201 */
5202 if (abuf)
5203 dmu_return_arcbuf(abuf);
5204 ASSERT(abuf || ioflag == UIO_WRITE);
5205 }
5206
5207 dmu_xuio_fini(xuio);
5208 return (0);
5209 }
5210 #endif /* HAVE_UIO_ZEROCOPY */
5211
5212 #if defined(_KERNEL)
5213 EXPORT_SYMBOL(zfs_open);
5214 EXPORT_SYMBOL(zfs_close);
5215 EXPORT_SYMBOL(zfs_read);
5216 EXPORT_SYMBOL(zfs_write);
5217 EXPORT_SYMBOL(zfs_access);
5218 EXPORT_SYMBOL(zfs_lookup);
5219 EXPORT_SYMBOL(zfs_create);
5220 EXPORT_SYMBOL(zfs_tmpfile);
5221 EXPORT_SYMBOL(zfs_remove);
5222 EXPORT_SYMBOL(zfs_mkdir);
5223 EXPORT_SYMBOL(zfs_rmdir);
5224 EXPORT_SYMBOL(zfs_readdir);
5225 EXPORT_SYMBOL(zfs_fsync);
5226 EXPORT_SYMBOL(zfs_getattr);
5227 EXPORT_SYMBOL(zfs_getattr_fast);
5228 EXPORT_SYMBOL(zfs_setattr);
5229 EXPORT_SYMBOL(zfs_rename);
5230 EXPORT_SYMBOL(zfs_symlink);
5231 EXPORT_SYMBOL(zfs_readlink);
5232 EXPORT_SYMBOL(zfs_link);
5233 EXPORT_SYMBOL(zfs_inactive);
5234 EXPORT_SYMBOL(zfs_space);
5235 EXPORT_SYMBOL(zfs_fid);
5236 EXPORT_SYMBOL(zfs_getsecattr);
5237 EXPORT_SYMBOL(zfs_setsecattr);
5238 EXPORT_SYMBOL(zfs_getpage);
5239 EXPORT_SYMBOL(zfs_putpage);
5240 EXPORT_SYMBOL(zfs_dirty_inode);
5241 EXPORT_SYMBOL(zfs_map);
5242
5243 /* CSTYLED */
5244 module_param(zfs_delete_blocks, ulong, 0644);
5245 MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async");
5246 module_param(zfs_read_chunk_size, long, 0644);
5247 MODULE_PARM_DESC(zfs_read_chunk_size, "Bytes to read per chunk");
5248 #endif