]> git.proxmox.com Git - mirror_zfs-debian.git/blame - module/zfs/zfs_znode.c
Rebase master to b117
[mirror_zfs-debian.git] / module / zfs / zfs_znode.c
CommitLineData
34dc7c2f
BB
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
d164b209 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
34dc7c2f
BB
23 * Use is subject to license terms.
24 */
25
26/* Portions Copyright 2007 Jeremy Teo */
27
34dc7c2f
BB
28#ifdef _KERNEL
29#include <sys/types.h>
30#include <sys/param.h>
31#include <sys/time.h>
32#include <sys/systm.h>
33#include <sys/sysmacros.h>
34#include <sys/resource.h>
35#include <sys/mntent.h>
36#include <sys/mkdev.h>
37#include <sys/u8_textprep.h>
38#include <sys/dsl_dataset.h>
39#include <sys/vfs.h>
40#include <sys/vfs_opreg.h>
41#include <sys/vnode.h>
42#include <sys/file.h>
43#include <sys/kmem.h>
44#include <sys/errno.h>
45#include <sys/unistd.h>
46#include <sys/mode.h>
47#include <sys/atomic.h>
48#include <vm/pvn.h>
49#include "fs/fs_subr.h"
50#include <sys/zfs_dir.h>
51#include <sys/zfs_acl.h>
52#include <sys/zfs_ioctl.h>
53#include <sys/zfs_rlock.h>
54#include <sys/zfs_fuid.h>
55#include <sys/fs/zfs.h>
56#include <sys/kidmap.h>
57#endif /* _KERNEL */
58
59#include <sys/dmu.h>
60#include <sys/refcount.h>
61#include <sys/stat.h>
62#include <sys/zap.h>
63#include <sys/zfs_znode.h>
64
65#include "zfs_prop.h"
66
b128c09f
BB
67/*
68 * Define ZNODE_STATS to turn on statistic gathering. By default, it is only
69 * turned on when DEBUG is also defined.
70 */
71#ifdef DEBUG
72#define ZNODE_STATS
73#endif /* DEBUG */
74
75#ifdef ZNODE_STATS
76#define ZNODE_STAT_ADD(stat) ((stat)++)
77#else
78#define ZNODE_STAT_ADD(stat) /* nothing */
79#endif /* ZNODE_STATS */
80
81#define POINTER_IS_VALID(p) (!((uintptr_t)(p) & 0x3))
82#define POINTER_INVALIDATE(pp) (*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1))
83
34dc7c2f
BB
84/*
85 * Functions needed for userland (ie: libzpool) are not put under
86 * #ifdef _KERNEL; the rest of the functions have dependencies
87 * (such as VFS logic) that will not compile easily in userland.
88 */
89#ifdef _KERNEL
9babb374
BB
90/*
91 * Needed to close a small window in zfs_znode_move() that allows the zfsvfs to
92 * be freed before it can be safely accessed.
93 */
94krwlock_t zfsvfs_lock;
95
b128c09f 96static kmem_cache_t *znode_cache = NULL;
34dc7c2f
BB
97
98/*ARGSUSED*/
99static void
100znode_evict_error(dmu_buf_t *dbuf, void *user_ptr)
101{
102 /*
103 * We should never drop all dbuf refs without first clearing
104 * the eviction callback.
105 */
106 panic("evicting znode %p\n", user_ptr);
107}
108
109/*ARGSUSED*/
110static int
b128c09f 111zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
34dc7c2f
BB
112{
113 znode_t *zp = buf;
114
b128c09f
BB
115 ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
116
117 zp->z_vnode = vn_alloc(kmflags);
118 if (zp->z_vnode == NULL) {
119 return (-1);
120 }
121 ZTOV(zp)->v_data = zp;
122
123 list_link_init(&zp->z_link_node);
124
34dc7c2f 125 mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
34dc7c2f
BB
126 rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
127 rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL);
128 mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
129
130 mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL);
131 avl_create(&zp->z_range_avl, zfs_range_compare,
132 sizeof (rl_t), offsetof(rl_t, r_node));
133
134 zp->z_dbuf = NULL;
b128c09f 135 zp->z_dirlocks = NULL;
34dc7c2f
BB
136 return (0);
137}
138
139/*ARGSUSED*/
140static void
b128c09f 141zfs_znode_cache_destructor(void *buf, void *arg)
34dc7c2f
BB
142{
143 znode_t *zp = buf;
144
b128c09f
BB
145 ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
146 ASSERT(ZTOV(zp)->v_data == zp);
147 vn_free(ZTOV(zp));
148 ASSERT(!list_link_active(&zp->z_link_node));
34dc7c2f 149 mutex_destroy(&zp->z_lock);
34dc7c2f
BB
150 rw_destroy(&zp->z_parent_lock);
151 rw_destroy(&zp->z_name_lock);
152 mutex_destroy(&zp->z_acl_lock);
153 avl_destroy(&zp->z_range_avl);
154 mutex_destroy(&zp->z_range_lock);
155
156 ASSERT(zp->z_dbuf == NULL);
b128c09f
BB
157 ASSERT(zp->z_dirlocks == NULL);
158}
159
#ifdef ZNODE_STATS
/*
 * Counters bumped through ZNODE_STAT_ADD() by zfs_znode_move(), one for
 * each reason the callback declines to move a znode.
 */
static struct {
	/* vfs pointer failed POINTER_IS_VALID() */
	uint64_t zms_zfsvfs_invalid;
	/* vfs pointer changed before zfsvfs_lock was acquired */
	uint64_t zms_zfsvfs_recheck1;
	/* file system was found unmounted */
	uint64_t zms_zfsvfs_unmounted;
	/* vfs pointer changed before z_znodes_lock was acquired */
	uint64_t zms_zfsvfs_recheck2;
	/* ZFS_OBJ_HOLD_TRYENTER() on the object failed */
	uint64_t zms_obj_held;
	/* mutex_tryenter() on the vnode's v_lock failed */
	uint64_t zms_vnode_locked;
	/* vnode is not referenced solely by the DNLC */
	uint64_t zms_not_only_dnlc;
} znode_move_stats;
#endif /* ZNODE_STATS */
171
172static void
173zfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
174{
175 vnode_t *vp;
176
177 /* Copy fields. */
178 nzp->z_zfsvfs = ozp->z_zfsvfs;
179
180 /* Swap vnodes. */
181 vp = nzp->z_vnode;
182 nzp->z_vnode = ozp->z_vnode;
183 ozp->z_vnode = vp; /* let destructor free the overwritten vnode */
184 ZTOV(ozp)->v_data = ozp;
185 ZTOV(nzp)->v_data = nzp;
186
187 nzp->z_id = ozp->z_id;
188 ASSERT(ozp->z_dirlocks == NULL); /* znode not in use */
189 ASSERT(avl_numnodes(&ozp->z_range_avl) == 0);
190 nzp->z_unlinked = ozp->z_unlinked;
191 nzp->z_atime_dirty = ozp->z_atime_dirty;
192 nzp->z_zn_prefetch = ozp->z_zn_prefetch;
193 nzp->z_blksz = ozp->z_blksz;
194 nzp->z_seq = ozp->z_seq;
195 nzp->z_mapcnt = ozp->z_mapcnt;
196 nzp->z_last_itx = ozp->z_last_itx;
197 nzp->z_gen = ozp->z_gen;
198 nzp->z_sync_cnt = ozp->z_sync_cnt;
199 nzp->z_phys = ozp->z_phys;
200 nzp->z_dbuf = ozp->z_dbuf;
201
202 /* Update back pointers. */
203 (void) dmu_buf_update_user(nzp->z_dbuf, ozp, nzp, &nzp->z_phys,
204 znode_evict_error);
205
206 /*
207 * Invalidate the original znode by clearing fields that provide a
208 * pointer back to the znode. Set the low bit of the vfs pointer to
209 * ensure that zfs_znode_move() recognizes the znode as invalid in any
210 * subsequent callback.
211 */
212 ozp->z_dbuf = NULL;
213 POINTER_INVALIDATE(&ozp->z_zfsvfs);
214}
215
b128c09f
BB
216/*ARGSUSED*/
217static kmem_cbrc_t
218zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg)
219{
220 znode_t *ozp = buf, *nzp = newbuf;
221 zfsvfs_t *zfsvfs;
222 vnode_t *vp;
223
224 /*
225 * The znode is on the file system's list of known znodes if the vfs
226 * pointer is valid. We set the low bit of the vfs pointer when freeing
227 * the znode to invalidate it, and the memory patterns written by kmem
228 * (baddcafe and deadbeef) set at least one of the two low bits. A newly
229 * created znode sets the vfs pointer last of all to indicate that the
230 * znode is known and in a valid state to be moved by this function.
231 */
232 zfsvfs = ozp->z_zfsvfs;
233 if (!POINTER_IS_VALID(zfsvfs)) {
234 ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_invalid);
235 return (KMEM_CBRC_DONT_KNOW);
236 }
237
238 /*
9babb374
BB
239 * Close a small window in which it's possible that the filesystem could
240 * be unmounted and freed, and zfsvfs, though valid in the previous
241 * statement, could point to unrelated memory by the time we try to
242 * prevent the filesystem from being unmounted.
243 */
244 rw_enter(&zfsvfs_lock, RW_WRITER);
245 if (zfsvfs != ozp->z_zfsvfs) {
246 rw_exit(&zfsvfs_lock);
247 ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck1);
248 return (KMEM_CBRC_DONT_KNOW);
249 }
250
251 /*
252 * If the znode is still valid, then so is the file system. We know that
253 * no valid file system can be freed while we hold zfsvfs_lock, so we
254 * can safely ensure that the filesystem is not and will not be
255 * unmounted. The next statement is equivalent to ZFS_ENTER().
b128c09f 256 */
9babb374
BB
257 rrw_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG);
258 if (zfsvfs->z_unmounted) {
259 ZFS_EXIT(zfsvfs);
260 rw_exit(&zfsvfs_lock);
b128c09f
BB
261 ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted);
262 return (KMEM_CBRC_DONT_KNOW);
263 }
9babb374 264 rw_exit(&zfsvfs_lock);
b128c09f
BB
265
266 mutex_enter(&zfsvfs->z_znodes_lock);
267 /*
268 * Recheck the vfs pointer in case the znode was removed just before
269 * acquiring the lock.
270 */
271 if (zfsvfs != ozp->z_zfsvfs) {
272 mutex_exit(&zfsvfs->z_znodes_lock);
273 ZFS_EXIT(zfsvfs);
9babb374 274 ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck2);
b128c09f
BB
275 return (KMEM_CBRC_DONT_KNOW);
276 }
277
278 /*
279 * At this point we know that as long as we hold z_znodes_lock, the
280 * znode cannot be freed and fields within the znode can be safely
281 * accessed. Now, prevent a race with zfs_zget().
282 */
283 if (ZFS_OBJ_HOLD_TRYENTER(zfsvfs, ozp->z_id) == 0) {
284 mutex_exit(&zfsvfs->z_znodes_lock);
285 ZFS_EXIT(zfsvfs);
286 ZNODE_STAT_ADD(znode_move_stats.zms_obj_held);
287 return (KMEM_CBRC_LATER);
288 }
289
290 vp = ZTOV(ozp);
291 if (mutex_tryenter(&vp->v_lock) == 0) {
292 ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
293 mutex_exit(&zfsvfs->z_znodes_lock);
294 ZFS_EXIT(zfsvfs);
295 ZNODE_STAT_ADD(znode_move_stats.zms_vnode_locked);
296 return (KMEM_CBRC_LATER);
297 }
298
299 /* Only move znodes that are referenced _only_ by the DNLC. */
300 if (vp->v_count != 1 || !vn_in_dnlc(vp)) {
301 mutex_exit(&vp->v_lock);
302 ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
303 mutex_exit(&zfsvfs->z_znodes_lock);
304 ZFS_EXIT(zfsvfs);
305 ZNODE_STAT_ADD(znode_move_stats.zms_not_only_dnlc);
306 return (KMEM_CBRC_LATER);
307 }
308
309 /*
310 * The znode is known and in a valid state to move. We're holding the
311 * locks needed to execute the critical section.
312 */
313 zfs_znode_move_impl(ozp, nzp);
314 mutex_exit(&vp->v_lock);
315 ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
316
317 list_link_replace(&ozp->z_link_node, &nzp->z_link_node);
318 mutex_exit(&zfsvfs->z_znodes_lock);
319 ZFS_EXIT(zfsvfs);
320
321 return (KMEM_CBRC_YES);
34dc7c2f
BB
322}
323
324void
325zfs_znode_init(void)
326{
327 /*
328 * Initialize zcache
329 */
9babb374 330 rw_init(&zfsvfs_lock, NULL, RW_DEFAULT, NULL);
34dc7c2f
BB
331 ASSERT(znode_cache == NULL);
332 znode_cache = kmem_cache_create("zfs_znode_cache",
333 sizeof (znode_t), 0, zfs_znode_cache_constructor,
334 zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
b128c09f 335 kmem_cache_set_move(znode_cache, zfs_znode_move);
34dc7c2f
BB
336}
337
338void
339zfs_znode_fini(void)
340{
341 /*
342 * Cleanup vfs & vnode ops
343 */
344 zfs_remove_op_tables();
345
346 /*
347 * Cleanup zcache
348 */
349 if (znode_cache)
350 kmem_cache_destroy(znode_cache);
351 znode_cache = NULL;
9babb374 352 rw_destroy(&zfsvfs_lock);
34dc7c2f
BB
353}
354
/*
 * Vnode operation vectors, built by zfs_create_op_tables() and released by
 * zfs_remove_op_tables().  zfs_znode_alloc() picks one per vnode type.
 */
struct vnodeops *zfs_dvnodeops;		/* directories */
struct vnodeops *zfs_fvnodeops;		/* regular files, devices, etc. */
struct vnodeops *zfs_symvnodeops;	/* symbolic links */
struct vnodeops *zfs_xdvnodeops;	/* extended attribute directories */
struct vnodeops *zfs_evnodeops;		/* any other vnode type */
struct vnodeops *zfs_sharevnodeops;	/* files under the shares dir */
34dc7c2f
BB
361
362void
363zfs_remove_op_tables()
364{
365 /*
366 * Remove vfs ops
367 */
368 ASSERT(zfsfstype);
369 (void) vfs_freevfsops_by_type(zfsfstype);
370 zfsfstype = 0;
371
372 /*
373 * Remove vnode ops
374 */
375 if (zfs_dvnodeops)
376 vn_freevnodeops(zfs_dvnodeops);
377 if (zfs_fvnodeops)
378 vn_freevnodeops(zfs_fvnodeops);
379 if (zfs_symvnodeops)
380 vn_freevnodeops(zfs_symvnodeops);
381 if (zfs_xdvnodeops)
382 vn_freevnodeops(zfs_xdvnodeops);
383 if (zfs_evnodeops)
384 vn_freevnodeops(zfs_evnodeops);
9babb374
BB
385 if (zfs_sharevnodeops)
386 vn_freevnodeops(zfs_sharevnodeops);
34dc7c2f
BB
387
388 zfs_dvnodeops = NULL;
389 zfs_fvnodeops = NULL;
390 zfs_symvnodeops = NULL;
391 zfs_xdvnodeops = NULL;
392 zfs_evnodeops = NULL;
9babb374 393 zfs_sharevnodeops = NULL;
34dc7c2f
BB
394}
395
396extern const fs_operation_def_t zfs_dvnodeops_template[];
397extern const fs_operation_def_t zfs_fvnodeops_template[];
398extern const fs_operation_def_t zfs_xdvnodeops_template[];
399extern const fs_operation_def_t zfs_symvnodeops_template[];
400extern const fs_operation_def_t zfs_evnodeops_template[];
9babb374 401extern const fs_operation_def_t zfs_sharevnodeops_template[];
34dc7c2f
BB
402
403int
404zfs_create_op_tables()
405{
406 int error;
407
408 /*
409 * zfs_dvnodeops can be set if mod_remove() calls mod_installfs()
410 * due to a failure to remove the the 2nd modlinkage (zfs_modldrv).
411 * In this case we just return as the ops vectors are already set up.
412 */
413 if (zfs_dvnodeops)
414 return (0);
415
416 error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template,
417 &zfs_dvnodeops);
418 if (error)
419 return (error);
420
421 error = vn_make_ops(MNTTYPE_ZFS, zfs_fvnodeops_template,
422 &zfs_fvnodeops);
423 if (error)
424 return (error);
425
426 error = vn_make_ops(MNTTYPE_ZFS, zfs_symvnodeops_template,
427 &zfs_symvnodeops);
428 if (error)
429 return (error);
430
431 error = vn_make_ops(MNTTYPE_ZFS, zfs_xdvnodeops_template,
432 &zfs_xdvnodeops);
433 if (error)
434 return (error);
435
436 error = vn_make_ops(MNTTYPE_ZFS, zfs_evnodeops_template,
437 &zfs_evnodeops);
9babb374
BB
438 if (error)
439 return (error);
440
441 error = vn_make_ops(MNTTYPE_ZFS, zfs_sharevnodeops_template,
442 &zfs_sharevnodeops);
34dc7c2f
BB
443
444 return (error);
445}
446
34dc7c2f 447int
9babb374 448zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
34dc7c2f 449{
9babb374
BB
450 zfs_acl_ids_t acl_ids;
451 vattr_t vattr;
452 znode_t *sharezp;
453 vnode_t *vp;
454 znode_t *zp;
455 int error;
34dc7c2f 456
9babb374
BB
457 vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
458 vattr.va_type = VDIR;
459 vattr.va_mode = S_IFDIR|0555;
460 vattr.va_uid = crgetuid(kcred);
461 vattr.va_gid = crgetgid(kcred);
34dc7c2f 462
9babb374
BB
463 sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP);
464 sharezp->z_unlinked = 0;
465 sharezp->z_atime_dirty = 0;
466 sharezp->z_zfsvfs = zfsvfs;
34dc7c2f 467
9babb374
BB
468 vp = ZTOV(sharezp);
469 vn_reinit(vp);
470 vp->v_type = VDIR;
34dc7c2f 471
9babb374
BB
472 VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr,
473 kcred, NULL, &acl_ids));
474 zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE,
475 &zp, 0, &acl_ids);
476 ASSERT3P(zp, ==, sharezp);
477 ASSERT(!vn_in_dnlc(ZTOV(sharezp))); /* not valid to move */
478 POINTER_INVALIDATE(&sharezp->z_zfsvfs);
479 error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
480 ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx);
481 zfsvfs->z_shares_dir = sharezp->z_id;
482
483 zfs_acl_ids_free(&acl_ids);
484 ZTOV(sharezp)->v_count = 0;
485 dmu_buf_rele(sharezp->z_dbuf, NULL);
486 sharezp->z_dbuf = NULL;
487 kmem_cache_free(znode_cache, sharezp);
34dc7c2f 488
9babb374 489 return (error);
34dc7c2f
BB
490}
491
492/*
493 * define a couple of values we need available
494 * for both 64 and 32 bit environments.
495 */
496#ifndef NBITSMINOR64
497#define NBITSMINOR64 32
498#endif
499#ifndef MAXMAJ64
500#define MAXMAJ64 0xffffffffUL
501#endif
502#ifndef MAXMIN64
503#define MAXMIN64 0xffffffffUL
504#endif
505
/*
 * ZFS-private replacement for expldev().
 *
 * The standard expldev() expands a dev32_t to a long dev_t in LP64.  ZFS
 * instead needs the expansion performed in ILP32: the major and minor
 * numbers of a 32-bit dev_t are repacked into the 64-bit layout.  Under
 * _LP64 the value is returned unchanged.
 */
static uint64_t
zfs_expldev(dev_t dev)
{
#ifdef _LP64
	return (dev);
#else
	major_t major = ((major_t)dev >> NBITSMINOR32) & MAXMAJ32;
	minor_t minor = (minor_t)dev & MAXMIN32;

	return (((uint64_t)major << NBITSMINOR64) | minor);
#endif
}
525
/*
 * ZFS-private replacement for cmpldev().
 *
 * The standard cmpldev() compresses a long dev_t to a dev32_t in LP64.
 * ZFS instead needs the compaction performed in ILP32: the 64-bit layout
 * is split into major and minor, and NODEV32 is returned when either does
 * not fit the 32-bit layout.  Under _LP64 the value is returned unchanged.
 */
dev_t
zfs_cmpldev(uint64_t dev)
{
#ifdef _LP64
	return (dev);
#else
	minor_t minor = (minor_t)dev & MAXMIN64;
	major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64;

	if (major <= MAXMAJ32 && minor <= MAXMIN32)
		return (((dev32_t)major << NBITSMINOR32) | minor);

	return (NODEV32);
#endif
}
548
549static void
b128c09f 550zfs_znode_dmu_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db)
34dc7c2f
BB
551{
552 znode_t *nzp;
34dc7c2f 553
b128c09f
BB
554 ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs));
555 ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)));
34dc7c2f
BB
556
557 mutex_enter(&zp->z_lock);
558
559 ASSERT(zp->z_dbuf == NULL);
560 zp->z_dbuf = db;
561 nzp = dmu_buf_set_user_ie(db, zp, &zp->z_phys, znode_evict_error);
562
563 /*
564 * there should be no
565 * concurrent zgets on this object.
566 */
567 if (nzp != NULL)
b128c09f 568 panic("existing znode %p for dbuf %p", (void *)nzp, (void *)db);
34dc7c2f
BB
569
570 /*
571 * Slap on VROOT if we are the root znode
572 */
573 if (zp->z_id == zfsvfs->z_root)
574 ZTOV(zp)->v_flag |= VROOT;
575
576 mutex_exit(&zp->z_lock);
577 vn_exists(ZTOV(zp));
578}
579
580void
581zfs_znode_dmu_fini(znode_t *zp)
582{
583 dmu_buf_t *db = zp->z_dbuf;
b128c09f
BB
584 ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) ||
585 zp->z_unlinked ||
34dc7c2f
BB
586 RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock));
587 ASSERT(zp->z_dbuf != NULL);
588 zp->z_dbuf = NULL;
589 VERIFY(zp == dmu_buf_update_user(db, zp, NULL, NULL, NULL));
590 dmu_buf_rele(db, NULL);
591}
592
593/*
594 * Construct a new znode/vnode and intialize.
595 *
596 * This does not do a call to dmu_set_user() that is
597 * up to the caller to do, in case you don't want to
598 * return the znode
599 */
600static znode_t *
601zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz)
602{
603 znode_t *zp;
604 vnode_t *vp;
605
606 zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
607
608 ASSERT(zp->z_dirlocks == NULL);
609 ASSERT(zp->z_dbuf == NULL);
b128c09f 610 ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
34dc7c2f 611
b128c09f
BB
612 /*
613 * Defer setting z_zfsvfs until the znode is ready to be a candidate for
614 * the zfs_znode_move() callback.
615 */
34dc7c2f 616 zp->z_phys = NULL;
34dc7c2f
BB
617 zp->z_unlinked = 0;
618 zp->z_atime_dirty = 0;
619 zp->z_mapcnt = 0;
620 zp->z_last_itx = 0;
621 zp->z_id = db->db_object;
622 zp->z_blksz = blksz;
623 zp->z_seq = 0x7A4653;
624 zp->z_sync_cnt = 0;
625
626 vp = ZTOV(zp);
627 vn_reinit(vp);
628
b128c09f 629 zfs_znode_dmu_init(zfsvfs, zp, db);
34dc7c2f
BB
630
631 zp->z_gen = zp->z_phys->zp_gen;
632
34dc7c2f
BB
633 vp->v_vfsp = zfsvfs->z_parent->z_vfs;
634 vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);
635
636 switch (vp->v_type) {
637 case VDIR:
638 if (zp->z_phys->zp_flags & ZFS_XATTR) {
639 vn_setops(vp, zfs_xdvnodeops);
640 vp->v_flag |= V_XATTRDIR;
641 } else {
642 vn_setops(vp, zfs_dvnodeops);
643 }
644 zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
645 break;
646 case VBLK:
647 case VCHR:
648 vp->v_rdev = zfs_cmpldev(zp->z_phys->zp_rdev);
649 /*FALLTHROUGH*/
650 case VFIFO:
651 case VSOCK:
652 case VDOOR:
653 vn_setops(vp, zfs_fvnodeops);
654 break;
655 case VREG:
656 vp->v_flag |= VMODSORT;
9babb374
BB
657 if (zp->z_phys->zp_parent == zfsvfs->z_shares_dir)
658 vn_setops(vp, zfs_sharevnodeops);
659 else
660 vn_setops(vp, zfs_fvnodeops);
34dc7c2f
BB
661 break;
662 case VLNK:
663 vn_setops(vp, zfs_symvnodeops);
664 break;
665 default:
666 vn_setops(vp, zfs_evnodeops);
667 break;
668 }
669
b128c09f
BB
670 mutex_enter(&zfsvfs->z_znodes_lock);
671 list_insert_tail(&zfsvfs->z_all_znodes, zp);
672 membar_producer();
673 /*
674 * Everything else must be valid before assigning z_zfsvfs makes the
675 * znode eligible for zfs_znode_move().
676 */
677 zp->z_zfsvfs = zfsvfs;
678 mutex_exit(&zfsvfs->z_znodes_lock);
679
34dc7c2f
BB
680 VFS_HOLD(zfsvfs->z_vfs);
681 return (zp);
682}
683
684/*
685 * Create a new DMU object to hold a zfs znode.
686 *
687 * IN: dzp - parent directory for new znode
688 * vap - file attributes for new znode
689 * tx - dmu transaction id for zap operations
690 * cr - credentials of caller
691 * flag - flags:
692 * IS_ROOT_NODE - new object will be root
693 * IS_XATTR - new object is an attribute
694 * IS_REPLAY - intent log replay
695 * bonuslen - length of bonus buffer
696 * setaclp - File/Dir initial ACL
697 * fuidp - Tracks fuid allocation.
698 *
699 * OUT: zpp - allocated znode
700 *
701 */
702void
703zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
9babb374 704 uint_t flag, znode_t **zpp, int bonuslen, zfs_acl_ids_t *acl_ids)
34dc7c2f
BB
705{
706 dmu_buf_t *db;
707 znode_phys_t *pzp;
708 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
709 timestruc_t now;
710 uint64_t gen, obj;
711 int err;
712
713 ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
714
fb5f0bc8 715 if (zfsvfs->z_replay) {
34dc7c2f
BB
716 obj = vap->va_nodeid;
717 flag |= IS_REPLAY;
718 now = vap->va_ctime; /* see zfs_replay_create() */
719 gen = vap->va_nblocks; /* ditto */
720 } else {
721 obj = 0;
722 gethrestime(&now);
723 gen = dmu_tx_get_txg(tx);
724 }
725
726 /*
727 * Create a new DMU object.
728 */
729 /*
730 * There's currently no mechanism for pre-reading the blocks that will
731 * be to needed allocate a new object, so we accept the small chance
732 * that there will be an i/o error and we will fail one of the
733 * assertions below.
734 */
735 if (vap->va_type == VDIR) {
736 if (flag & IS_REPLAY) {
737 err = zap_create_claim_norm(zfsvfs->z_os, obj,
738 zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
739 DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
740 ASSERT3U(err, ==, 0);
741 } else {
742 obj = zap_create_norm(zfsvfs->z_os,
743 zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
744 DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
745 }
746 } else {
747 if (flag & IS_REPLAY) {
748 err = dmu_object_claim(zfsvfs->z_os, obj,
749 DMU_OT_PLAIN_FILE_CONTENTS, 0,
750 DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
751 ASSERT3U(err, ==, 0);
752 } else {
753 obj = dmu_object_alloc(zfsvfs->z_os,
754 DMU_OT_PLAIN_FILE_CONTENTS, 0,
755 DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
756 }
757 }
758 VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, obj, NULL, &db));
759 dmu_buf_will_dirty(db, tx);
760
761 /*
762 * Initialize the znode physical data to zero.
763 */
764 ASSERT(db->db_size >= sizeof (znode_phys_t));
765 bzero(db->db_data, db->db_size);
766 pzp = db->db_data;
767
768 /*
769 * If this is the root, fix up the half-initialized parent pointer
770 * to reference the just-allocated physical data area.
771 */
772 if (flag & IS_ROOT_NODE) {
773 dzp->z_dbuf = db;
774 dzp->z_phys = pzp;
775 dzp->z_id = obj;
776 }
777
778 /*
779 * If parent is an xattr, so am I.
780 */
781 if (dzp->z_phys->zp_flags & ZFS_XATTR)
782 flag |= IS_XATTR;
783
784 if (vap->va_type == VBLK || vap->va_type == VCHR) {
785 pzp->zp_rdev = zfs_expldev(vap->va_rdev);
786 }
787
788 if (zfsvfs->z_use_fuids)
789 pzp->zp_flags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
790
791 if (vap->va_type == VDIR) {
792 pzp->zp_size = 2; /* contents ("." and "..") */
793 pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
794 }
795
796 pzp->zp_parent = dzp->z_id;
797 if (flag & IS_XATTR)
798 pzp->zp_flags |= ZFS_XATTR;
799
800 pzp->zp_gen = gen;
801
802 ZFS_TIME_ENCODE(&now, pzp->zp_crtime);
803 ZFS_TIME_ENCODE(&now, pzp->zp_ctime);
804
805 if (vap->va_mask & AT_ATIME) {
806 ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
807 } else {
808 ZFS_TIME_ENCODE(&now, pzp->zp_atime);
809 }
810
811 if (vap->va_mask & AT_MTIME) {
812 ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
813 } else {
814 ZFS_TIME_ENCODE(&now, pzp->zp_mtime);
815 }
816
817 pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode);
818 if (!(flag & IS_ROOT_NODE)) {
b128c09f 819 ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
34dc7c2f
BB
820 *zpp = zfs_znode_alloc(zfsvfs, db, 0);
821 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
822 } else {
823 /*
824 * If we are creating the root node, the "parent" we
825 * passed in is the znode for the root.
826 */
827 *zpp = dzp;
828 }
9babb374
BB
829 pzp->zp_uid = acl_ids->z_fuid;
830 pzp->zp_gid = acl_ids->z_fgid;
831 pzp->zp_mode = acl_ids->z_mode;
832 VERIFY(0 == zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
833 if (vap->va_mask & AT_XVATTR)
834 zfs_xvattr_set(*zpp, (xvattr_t *)vap);
34dc7c2f
BB
835}
836
837void
838zfs_xvattr_set(znode_t *zp, xvattr_t *xvap)
839{
840 xoptattr_t *xoap;
841
842 xoap = xva_getxoptattr(xvap);
843 ASSERT(xoap);
844
845 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
846 ZFS_TIME_ENCODE(&xoap->xoa_createtime, zp->z_phys->zp_crtime);
847 XVA_SET_RTN(xvap, XAT_CREATETIME);
848 }
849 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
850 ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly);
851 XVA_SET_RTN(xvap, XAT_READONLY);
852 }
853 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
854 ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden);
855 XVA_SET_RTN(xvap, XAT_HIDDEN);
856 }
857 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
858 ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system);
859 XVA_SET_RTN(xvap, XAT_SYSTEM);
860 }
861 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
862 ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive);
863 XVA_SET_RTN(xvap, XAT_ARCHIVE);
864 }
865 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
866 ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable);
867 XVA_SET_RTN(xvap, XAT_IMMUTABLE);
868 }
869 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
870 ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink);
871 XVA_SET_RTN(xvap, XAT_NOUNLINK);
872 }
873 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
874 ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly);
875 XVA_SET_RTN(xvap, XAT_APPENDONLY);
876 }
877 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
878 ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump);
879 XVA_SET_RTN(xvap, XAT_NODUMP);
880 }
881 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
882 ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque);
883 XVA_SET_RTN(xvap, XAT_OPAQUE);
884 }
885 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
886 ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
887 xoap->xoa_av_quarantined);
888 XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
889 }
890 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
891 ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified);
892 XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
893 }
894 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
895 (void) memcpy(zp->z_phys + 1, xoap->xoa_av_scanstamp,
896 sizeof (xoap->xoa_av_scanstamp));
897 zp->z_phys->zp_flags |= ZFS_BONUS_SCANSTAMP;
898 XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
899 }
900}
901
902int
903zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
904{
905 dmu_object_info_t doi;
906 dmu_buf_t *db;
907 znode_t *zp;
908 int err;
909
910 *zpp = NULL;
911
912 ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
913
914 err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
915 if (err) {
916 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
917 return (err);
918 }
919
920 dmu_object_info_from_db(db, &doi);
921 if (doi.doi_bonus_type != DMU_OT_ZNODE ||
922 doi.doi_bonus_size < sizeof (znode_phys_t)) {
923 dmu_buf_rele(db, NULL);
924 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
925 return (EINVAL);
926 }
927
928 zp = dmu_buf_get_user(db);
929 if (zp != NULL) {
930 mutex_enter(&zp->z_lock);
931
932 /*
933 * Since we do immediate eviction of the z_dbuf, we
934 * should never find a dbuf with a znode that doesn't
935 * know about the dbuf.
936 */
937 ASSERT3P(zp->z_dbuf, ==, db);
938 ASSERT3U(zp->z_id, ==, obj_num);
939 if (zp->z_unlinked) {
940 err = ENOENT;
941 } else {
942 VN_HOLD(ZTOV(zp));
943 *zpp = zp;
944 err = 0;
945 }
946 dmu_buf_rele(db, NULL);
947 mutex_exit(&zp->z_lock);
948 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
949 return (err);
950 }
951
952 /*
953 * Not found create new znode/vnode
954 */
955 zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size);
956 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
957 *zpp = zp;
958 return (0);
959}
960
961int
962zfs_rezget(znode_t *zp)
963{
964 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
965 dmu_object_info_t doi;
966 dmu_buf_t *db;
967 uint64_t obj_num = zp->z_id;
968 int err;
969
970 ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
971
972 err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
973 if (err) {
974 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
975 return (err);
976 }
977
978 dmu_object_info_from_db(db, &doi);
979 if (doi.doi_bonus_type != DMU_OT_ZNODE ||
980 doi.doi_bonus_size < sizeof (znode_phys_t)) {
981 dmu_buf_rele(db, NULL);
982 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
983 return (EINVAL);
984 }
985
986 if (((znode_phys_t *)db->db_data)->zp_gen != zp->z_gen) {
987 dmu_buf_rele(db, NULL);
988 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
989 return (EIO);
990 }
991
b128c09f 992 zfs_znode_dmu_init(zfsvfs, zp, db);
34dc7c2f
BB
993 zp->z_unlinked = (zp->z_phys->zp_links == 0);
994 zp->z_blksz = doi.doi_data_block_size;
995
996 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
997
998 return (0);
999}
1000
1001void
1002zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
1003{
1004 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
b128c09f 1005 objset_t *os = zfsvfs->z_os;
34dc7c2f 1006 uint64_t obj = zp->z_id;
b128c09f 1007 uint64_t acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj;
34dc7c2f
BB
1008
1009 ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
b128c09f
BB
1010 if (acl_obj)
1011 VERIFY(0 == dmu_object_free(os, acl_obj, tx));
1012 VERIFY(0 == dmu_object_free(os, obj, tx));
34dc7c2f
BB
1013 zfs_znode_dmu_fini(zp);
1014 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
1015 zfs_znode_free(zp);
1016}
1017
1018void
1019zfs_zinactive(znode_t *zp)
1020{
1021 vnode_t *vp = ZTOV(zp);
1022 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1023 uint64_t z_id = zp->z_id;
1024
1025 ASSERT(zp->z_dbuf && zp->z_phys);
1026
1027 /*
1028 * Don't allow a zfs_zget() while were trying to release this znode
1029 */
1030 ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);
1031
1032 mutex_enter(&zp->z_lock);
1033 mutex_enter(&vp->v_lock);
1034 vp->v_count--;
1035 if (vp->v_count > 0 || vn_has_cached_data(vp)) {
1036 /*
1037 * If the hold count is greater than zero, somebody has
1038 * obtained a new reference on this znode while we were
1039 * processing it here, so we are done. If we still have
1040 * mapped pages then we are also done, since we don't
1041 * want to inactivate the znode until the pages get pushed.
1042 *
1043 * XXX - if vn_has_cached_data(vp) is true, but count == 0,
1044 * this seems like it would leave the znode hanging with
1045 * no chance to go inactive...
1046 */
1047 mutex_exit(&vp->v_lock);
1048 mutex_exit(&zp->z_lock);
1049 ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
1050 return;
1051 }
1052 mutex_exit(&vp->v_lock);
1053
1054 /*
1055 * If this was the last reference to a file with no links,
1056 * remove the file from the file system.
1057 */
1058 if (zp->z_unlinked) {
1059 mutex_exit(&zp->z_lock);
1060 ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
1061 zfs_rmnode(zp);
1062 return;
1063 }
1064 mutex_exit(&zp->z_lock);
1065 zfs_znode_dmu_fini(zp);
1066 ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
1067 zfs_znode_free(zp);
1068}
1069
1070void
1071zfs_znode_free(znode_t *zp)
1072{
1073 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1074
1075 vn_invalid(ZTOV(zp));
1076
b128c09f
BB
1077 ASSERT(ZTOV(zp)->v_count == 0);
1078
34dc7c2f 1079 mutex_enter(&zfsvfs->z_znodes_lock);
b128c09f 1080 POINTER_INVALIDATE(&zp->z_zfsvfs);
34dc7c2f
BB
1081 list_remove(&zfsvfs->z_all_znodes, zp);
1082 mutex_exit(&zfsvfs->z_znodes_lock);
1083
1084 kmem_cache_free(znode_cache, zp);
1085
1086 VFS_RELE(zfsvfs->z_vfs);
1087}
1088
1089void
1090zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx)
1091{
1092 timestruc_t now;
1093
1094 ASSERT(MUTEX_HELD(&zp->z_lock));
1095
1096 gethrestime(&now);
1097
1098 if (tx) {
1099 dmu_buf_will_dirty(zp->z_dbuf, tx);
1100 zp->z_atime_dirty = 0;
1101 zp->z_seq++;
1102 } else {
1103 zp->z_atime_dirty = 1;
1104 }
1105
1106 if (flag & AT_ATIME)
1107 ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime);
1108
1109 if (flag & AT_MTIME) {
1110 ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime);
1111 if (zp->z_zfsvfs->z_use_fuids)
1112 zp->z_phys->zp_flags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED);
1113 }
1114
1115 if (flag & AT_CTIME) {
1116 ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime);
1117 if (zp->z_zfsvfs->z_use_fuids)
1118 zp->z_phys->zp_flags |= ZFS_ARCHIVE;
1119 }
1120}
1121
1122/*
1123 * Update the requested znode timestamps with the current time.
1124 * If we are in a transaction, then go ahead and mark the znode
1125 * dirty in the transaction so the timestamps will go to disk.
1126 * Otherwise, we will get pushed next time the znode is updated
1127 * in a transaction, or when this znode eventually goes inactive.
1128 *
1129 * Why is this OK?
1130 * 1 - Only the ACCESS time is ever updated outside of a transaction.
1131 * 2 - Multiple consecutive updates will be collapsed into a single
1132 * znode update by the transaction grouping semantics of the DMU.
1133 */
1134void
1135zfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx)
1136{
1137 mutex_enter(&zp->z_lock);
1138 zfs_time_stamper_locked(zp, flag, tx);
1139 mutex_exit(&zp->z_lock);
1140}
1141
1142/*
1143 * Grow the block size for a file.
1144 *
1145 * IN: zp - znode of file to free data in.
1146 * size - requested block size
1147 * tx - open transaction.
1148 *
1149 * NOTE: this function assumes that the znode is write locked.
1150 */
1151void
1152zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
1153{
1154 int error;
1155 u_longlong_t dummy;
1156
1157 if (size <= zp->z_blksz)
1158 return;
1159 /*
1160 * If the file size is already greater than the current blocksize,
1161 * we will not grow. If there is more than one block in a file,
1162 * the blocksize cannot change.
1163 */
1164 if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz)
1165 return;
1166
1167 error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
1168 size, 0, tx);
1169 if (error == ENOTSUP)
1170 return;
1171 ASSERT3U(error, ==, 0);
1172
1173 /* What blocksize did we actually get? */
1174 dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy);
1175}
1176
1177/*
1178 * This is a dummy interface used when pvn_vplist_dirty() should *not*
1179 * be calling back into the fs for a putpage(). E.g.: when truncating
1180 * a file, the pages being "thrown away* don't need to be written out.
1181 */
1182/* ARGSUSED */
1183static int
1184zfs_no_putpage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
1185 int flags, cred_t *cr)
1186{
1187 ASSERT(0);
1188 return (0);
1189}
1190
1191/*
b128c09f 1192 * Increase the file length
34dc7c2f
BB
1193 *
1194 * IN: zp - znode of file to free data in.
b128c09f 1195 * end - new end-of-file
34dc7c2f
BB
1196 *
1197 * RETURN: 0 if success
1198 * error code if failure
1199 */
b128c09f
BB
1200static int
1201zfs_extend(znode_t *zp, uint64_t end)
34dc7c2f 1202{
34dc7c2f 1203 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
b128c09f 1204 dmu_tx_t *tx;
34dc7c2f 1205 rl_t *rl;
b128c09f 1206 uint64_t newblksz;
34dc7c2f
BB
1207 int error;
1208
34dc7c2f 1209 /*
b128c09f 1210 * We will change zp_size, lock the whole file.
34dc7c2f 1211 */
b128c09f 1212 rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
34dc7c2f
BB
1213
1214 /*
1215 * Nothing to do if file already at desired length.
1216 */
b128c09f 1217 if (end <= zp->z_phys->zp_size) {
34dc7c2f
BB
1218 zfs_range_unlock(rl);
1219 return (0);
1220 }
b128c09f 1221top:
34dc7c2f
BB
1222 tx = dmu_tx_create(zfsvfs->z_os);
1223 dmu_tx_hold_bonus(tx, zp->z_id);
b128c09f 1224 if (end > zp->z_blksz &&
34dc7c2f
BB
1225 (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
1226 /*
1227 * We are growing the file past the current block size.
1228 */
1229 if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
1230 ASSERT(!ISP2(zp->z_blksz));
b128c09f 1231 newblksz = MIN(end, SPA_MAXBLOCKSIZE);
34dc7c2f 1232 } else {
b128c09f 1233 newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
34dc7c2f 1234 }
b128c09f
BB
1235 dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
1236 } else {
1237 newblksz = 0;
34dc7c2f
BB
1238 }
1239
fb5f0bc8 1240 error = dmu_tx_assign(tx, TXG_NOWAIT);
34dc7c2f 1241 if (error) {
fb5f0bc8 1242 if (error == ERESTART) {
34dc7c2f 1243 dmu_tx_wait(tx);
b128c09f
BB
1244 dmu_tx_abort(tx);
1245 goto top;
1246 }
34dc7c2f
BB
1247 dmu_tx_abort(tx);
1248 zfs_range_unlock(rl);
1249 return (error);
1250 }
b128c09f 1251 dmu_buf_will_dirty(zp->z_dbuf, tx);
34dc7c2f 1252
b128c09f
BB
1253 if (newblksz)
1254 zfs_grow_blocksize(zp, newblksz, tx);
34dc7c2f 1255
b128c09f 1256 zp->z_phys->zp_size = end;
34dc7c2f 1257
b128c09f 1258 zfs_range_unlock(rl);
34dc7c2f 1259
b128c09f 1260 dmu_tx_commit(tx);
34dc7c2f 1261
b128c09f
BB
1262 return (0);
1263}
1264
1265/*
1266 * Free space in a file.
1267 *
1268 * IN: zp - znode of file to free data in.
1269 * off - start of section to free.
1270 * len - length of section to free.
1271 *
1272 * RETURN: 0 if success
1273 * error code if failure
1274 */
1275static int
1276zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
1277{
1278 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1279 rl_t *rl;
1280 int error;
1281
1282 /*
1283 * Lock the range being freed.
1284 */
1285 rl = zfs_range_lock(zp, off, len, RL_WRITER);
1286
1287 /*
1288 * Nothing to do if file already at desired length.
1289 */
1290 if (off >= zp->z_phys->zp_size) {
1291 zfs_range_unlock(rl);
1292 return (0);
34dc7c2f
BB
1293 }
1294
b128c09f
BB
1295 if (off + len > zp->z_phys->zp_size)
1296 len = zp->z_phys->zp_size - off;
1297
1298 error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);
1299
34dc7c2f
BB
1300 zfs_range_unlock(rl);
1301
b128c09f
BB
1302 return (error);
1303}
1304
1305/*
1306 * Truncate a file
1307 *
1308 * IN: zp - znode of file to free data in.
1309 * end - new end-of-file.
1310 *
1311 * RETURN: 0 if success
1312 * error code if failure
1313 */
1314static int
1315zfs_trunc(znode_t *zp, uint64_t end)
1316{
1317 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1318 vnode_t *vp = ZTOV(zp);
1319 dmu_tx_t *tx;
1320 rl_t *rl;
1321 int error;
1322
1323 /*
1324 * We will change zp_size, lock the whole file.
1325 */
1326 rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
1327
1328 /*
1329 * Nothing to do if file already at desired length.
1330 */
1331 if (end >= zp->z_phys->zp_size) {
1332 zfs_range_unlock(rl);
1333 return (0);
1334 }
1335
1336 error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, -1);
1337 if (error) {
1338 zfs_range_unlock(rl);
1339 return (error);
1340 }
1341top:
1342 tx = dmu_tx_create(zfsvfs->z_os);
1343 dmu_tx_hold_bonus(tx, zp->z_id);
fb5f0bc8 1344 error = dmu_tx_assign(tx, TXG_NOWAIT);
b128c09f 1345 if (error) {
fb5f0bc8 1346 if (error == ERESTART) {
b128c09f
BB
1347 dmu_tx_wait(tx);
1348 dmu_tx_abort(tx);
1349 goto top;
1350 }
1351 dmu_tx_abort(tx);
1352 zfs_range_unlock(rl);
1353 return (error);
1354 }
1355 dmu_buf_will_dirty(zp->z_dbuf, tx);
1356
1357 zp->z_phys->zp_size = end;
1358
34dc7c2f
BB
1359 dmu_tx_commit(tx);
1360
1361 /*
1362 * Clear any mapped pages in the truncated region. This has to
1363 * happen outside of the transaction to avoid the possibility of
1364 * a deadlock with someone trying to push a page that we are
1365 * about to invalidate.
1366 */
b128c09f 1367 if (vn_has_cached_data(vp)) {
34dc7c2f 1368 page_t *pp;
b128c09f
BB
1369 uint64_t start = end & PAGEMASK;
1370 int poff = end & PAGEOFFSET;
34dc7c2f
BB
1371
1372 if (poff != 0 && (pp = page_lookup(vp, start, SE_SHARED))) {
1373 /*
1374 * We need to zero a partial page.
1375 */
1376 pagezero(pp, poff, PAGESIZE - poff);
1377 start += PAGESIZE;
1378 page_unlock(pp);
1379 }
1380 error = pvn_vplist_dirty(vp, start, zfs_no_putpage,
1381 B_INVAL | B_TRUNC, NULL);
1382 ASSERT(error == 0);
1383 }
d164b209
BB
1384
1385 zfs_range_unlock(rl);
34dc7c2f
BB
1386
1387 return (0);
1388}
1389
b128c09f
BB
1390/*
1391 * Free space in a file
1392 *
1393 * IN: zp - znode of file to free data in.
1394 * off - start of range
1395 * len - end of range (0 => EOF)
1396 * flag - current file open mode flags.
1397 * log - TRUE if this action should be logged
1398 *
1399 * RETURN: 0 if success
1400 * error code if failure
1401 */
1402int
1403zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
1404{
1405 vnode_t *vp = ZTOV(zp);
1406 dmu_tx_t *tx;
1407 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1408 zilog_t *zilog = zfsvfs->z_log;
1409 int error;
1410
1411 if (off > zp->z_phys->zp_size) {
1412 error = zfs_extend(zp, off+len);
1413 if (error == 0 && log)
1414 goto log;
1415 else
1416 return (error);
1417 }
1418
1419 /*
1420 * Check for any locks in the region to be freed.
1421 */
1422 if (MANDLOCK(vp, (mode_t)zp->z_phys->zp_mode)) {
1423 uint64_t length = (len ? len : zp->z_phys->zp_size - off);
1424 if (error = chklock(vp, FWRITE, off, length, flag, NULL))
1425 return (error);
1426 }
1427
1428 if (len == 0) {
1429 error = zfs_trunc(zp, off);
1430 } else {
1431 if ((error = zfs_free_range(zp, off, len)) == 0 &&
1432 off + len > zp->z_phys->zp_size)
1433 error = zfs_extend(zp, off+len);
1434 }
1435 if (error || !log)
1436 return (error);
1437log:
1438 tx = dmu_tx_create(zfsvfs->z_os);
1439 dmu_tx_hold_bonus(tx, zp->z_id);
fb5f0bc8 1440 error = dmu_tx_assign(tx, TXG_NOWAIT);
b128c09f 1441 if (error) {
fb5f0bc8 1442 if (error == ERESTART) {
b128c09f
BB
1443 dmu_tx_wait(tx);
1444 dmu_tx_abort(tx);
1445 goto log;
1446 }
1447 dmu_tx_abort(tx);
1448 return (error);
1449 }
1450
1451 zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
1452 zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
1453
1454 dmu_tx_commit(tx);
1455 return (0);
1456}
1457
34dc7c2f
BB
1458void
1459zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
1460{
1461 zfsvfs_t zfsvfs;
9babb374 1462 uint64_t moid, obj, version;
34dc7c2f
BB
1463 uint64_t sense = ZFS_CASE_SENSITIVE;
1464 uint64_t norm = 0;
1465 nvpair_t *elem;
1466 int error;
1467 znode_t *rootzp = NULL;
1468 vnode_t *vp;
1469 vattr_t vattr;
1470 znode_t *zp;
9babb374 1471 zfs_acl_ids_t acl_ids;
34dc7c2f
BB
1472
1473 /*
1474 * First attempt to create master node.
1475 */
1476 /*
1477 * In an empty objset, there are no blocks to read and thus
1478 * there can be no i/o errors (which we assert below).
1479 */
1480 moid = MASTER_NODE_OBJ;
1481 error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
1482 DMU_OT_NONE, 0, tx);
1483 ASSERT(error == 0);
1484
1485 /*
1486 * Set starting attributes.
1487 */
9babb374 1488 if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_USERSPACE)
b128c09f 1489 version = ZPL_VERSION;
9babb374
BB
1490 else if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID)
1491 version = ZPL_VERSION_USERSPACE - 1;
b128c09f
BB
1492 else
1493 version = ZPL_VERSION_FUID - 1;
34dc7c2f
BB
1494 elem = NULL;
1495 while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
1496 /* For the moment we expect all zpl props to be uint64_ts */
1497 uint64_t val;
1498 char *name;
1499
1500 ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
1501 VERIFY(nvpair_value_uint64(elem, &val) == 0);
1502 name = nvpair_name(elem);
1503 if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
9babb374
BB
1504 if (val < version)
1505 version = val;
34dc7c2f
BB
1506 } else {
1507 error = zap_update(os, moid, name, 8, 1, &val, tx);
1508 }
1509 ASSERT(error == 0);
1510 if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
1511 norm = val;
1512 else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
1513 sense = val;
1514 }
1515 ASSERT(version != 0);
9babb374 1516 error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);
34dc7c2f
BB
1517
1518 /*
1519 * Create a delete queue.
1520 */
9babb374 1521 obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
34dc7c2f 1522
9babb374 1523 error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
34dc7c2f
BB
1524 ASSERT(error == 0);
1525
1526 /*
1527 * Create root znode. Create minimal znode/vnode/zfsvfs
1528 * to allow zfs_mknode to work.
1529 */
1530 vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
1531 vattr.va_type = VDIR;
1532 vattr.va_mode = S_IFDIR|0755;
1533 vattr.va_uid = crgetuid(cr);
1534 vattr.va_gid = crgetgid(cr);
1535
1536 rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
34dc7c2f
BB
1537 rootzp->z_unlinked = 0;
1538 rootzp->z_atime_dirty = 0;
1539
1540 vp = ZTOV(rootzp);
1541 vn_reinit(vp);
1542 vp->v_type = VDIR;
1543
1544 bzero(&zfsvfs, sizeof (zfsvfs_t));
1545
1546 zfsvfs.z_os = os;
34dc7c2f
BB
1547 zfsvfs.z_parent = &zfsvfs;
1548 zfsvfs.z_version = version;
1549 zfsvfs.z_use_fuids = USE_FUIDS(version, os);
1550 zfsvfs.z_norm = norm;
1551 /*
1552 * Fold case on file systems that are always or sometimes case
1553 * insensitive.
1554 */
1555 if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
1556 zfsvfs.z_norm |= U8_TEXTPREP_TOUPPER;
1557
1558 mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
1559 list_create(&zfsvfs.z_all_znodes, sizeof (znode_t),
1560 offsetof(znode_t, z_link_node));
1561
b128c09f
BB
1562 ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
1563 rootzp->z_zfsvfs = &zfsvfs;
9babb374
BB
1564 VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
1565 cr, NULL, &acl_ids));
1566 zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, 0, &acl_ids);
34dc7c2f 1567 ASSERT3P(zp, ==, rootzp);
b128c09f 1568 ASSERT(!vn_in_dnlc(ZTOV(rootzp))); /* not valid to move */
34dc7c2f
BB
1569 error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
1570 ASSERT(error == 0);
9babb374 1571 zfs_acl_ids_free(&acl_ids);
b128c09f 1572 POINTER_INVALIDATE(&rootzp->z_zfsvfs);
34dc7c2f
BB
1573
1574 ZTOV(rootzp)->v_count = 0;
1575 dmu_buf_rele(rootzp->z_dbuf, NULL);
1576 rootzp->z_dbuf = NULL;
1577 kmem_cache_free(znode_cache, rootzp);
9babb374
BB
1578
1579 /*
1580 * Create shares directory
1581 */
1582
1583 error = zfs_create_share_dir(&zfsvfs, tx);
1584
1585 ASSERT(error == 0);
34dc7c2f
BB
1586}
1587
1588#endif /* _KERNEL */
1589/*
1590 * Given an object number, return its parent object number and whether
1591 * or not the object is an extended attribute directory.
1592 */
1593static int
1594zfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir)
1595{
1596 dmu_buf_t *db;
1597 dmu_object_info_t doi;
1598 znode_phys_t *zp;
1599 int error;
1600
1601 if ((error = dmu_bonus_hold(osp, obj, FTAG, &db)) != 0)
1602 return (error);
1603
1604 dmu_object_info_from_db(db, &doi);
1605 if (doi.doi_bonus_type != DMU_OT_ZNODE ||
1606 doi.doi_bonus_size < sizeof (znode_phys_t)) {
1607 dmu_buf_rele(db, FTAG);
1608 return (EINVAL);
1609 }
1610
1611 zp = db->db_data;
1612 *pobjp = zp->zp_parent;
1613 *is_xattrdir = ((zp->zp_flags & ZFS_XATTR) != 0) &&
1614 S_ISDIR(zp->zp_mode);
1615 dmu_buf_rele(db, FTAG);
1616
1617 return (0);
1618}
1619
1620int
1621zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
1622{
1623 char *path = buf + len - 1;
1624 int error;
1625
1626 *path = '\0';
1627
1628 for (;;) {
1629 uint64_t pobj;
1630 char component[MAXNAMELEN + 2];
1631 size_t complen;
1632 int is_xattrdir;
1633
1634 if ((error = zfs_obj_to_pobj(osp, obj, &pobj,
1635 &is_xattrdir)) != 0)
1636 break;
1637
1638 if (pobj == obj) {
1639 if (path[0] != '/')
1640 *--path = '/';
1641 break;
1642 }
1643
1644 component[0] = '/';
1645 if (is_xattrdir) {
1646 (void) sprintf(component + 1, "<xattrdir>");
1647 } else {
1648 error = zap_value_search(osp, pobj, obj,
1649 ZFS_DIRENT_OBJ(-1ULL), component + 1);
1650 if (error != 0)
1651 break;
1652 }
1653
1654 complen = strlen(component);
1655 path -= complen;
1656 ASSERT(path >= buf);
1657 bcopy(component, path, complen);
1658 obj = pobj;
1659 }
1660
1661 if (error == 0)
1662 (void) memmove(buf, path, buf + len - path);
1663 return (error);
1664}