]> git.proxmox.com Git - mirror_zfs.git/blame - module/zfs/dsl_dir.c
Optionally skip zil_close during zvol_create_minor_impl
[mirror_zfs.git] / module / zfs / dsl_dir.c
CommitLineData
34dc7c2f
BB
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
1d3ba0bf 9 * or https://opensource.org/licenses/CDDL-1.0.
34dc7c2f
BB
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
428870ff 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
37f03da8 23 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
b1118acb 24 * Copyright (c) 2013 Martin Matuska. All rights reserved.
788eb90c 25 * Copyright (c) 2014 Joyent, Inc. All rights reserved.
0c66c32d 26 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
a0bd735a 27 * Copyright (c) 2016 Actifio, Inc. All rights reserved.
d8d418ff 28 * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
34dc7c2f
BB
29 */
30
34dc7c2f
BB
31#include <sys/dmu.h>
32#include <sys/dmu_objset.h>
33#include <sys/dmu_tx.h>
34#include <sys/dsl_dataset.h>
35#include <sys/dsl_dir.h>
36#include <sys/dsl_prop.h>
37#include <sys/dsl_synctask.h>
38#include <sys/dsl_deleg.h>
fa86b5db 39#include <sys/dmu_impl.h>
34dc7c2f 40#include <sys/spa.h>
ae76f45c 41#include <sys/spa_impl.h>
428870ff 42#include <sys/metaslab.h>
34dc7c2f
BB
43#include <sys/zap.h>
44#include <sys/zio.h>
45#include <sys/arc.h>
46#include <sys/sunddi.h>
788eb90c
JJ
47#include <sys/zfeature.h>
48#include <sys/policy.h>
7b4e2723 49#include <sys/zfs_vfsops.h>
788eb90c 50#include <sys/zfs_znode.h>
ba6a2402 51#include <sys/zvol.h>
37f03da8 52#include <sys/zthr.h>
34dc7c2f 53#include "zfs_namecheck.h"
788eb90c
JJ
54#include "zfs_prop.h"
55
56/*
57 * Filesystem and Snapshot Limits
58 * ------------------------------
59 *
60 * These limits are used to restrict the number of filesystems and/or snapshots
61 * that can be created at a given level in the tree or below. A typical
62 * use-case is with a delegated dataset where the administrator wants to ensure
63 * that a user within the zone is not creating too many additional filesystems
64 * or snapshots, even though they're not exceeding their space quota.
65 *
66 * The filesystem and snapshot counts are stored as extensible properties. This
67 * capability is controlled by a feature flag and must be enabled to be used.
68 * Once enabled, the feature is not active until the first limit is set. At
69 * that point, future operations to create/destroy filesystems or snapshots
70 * will validate and update the counts.
71 *
72 * Because the count properties will not exist before the feature is active,
73 * the counts are updated when a limit is first set on an uninitialized
74 * dsl_dir node in the tree (The filesystem/snapshot count on a node includes
75 * all of the nested filesystems/snapshots. Thus, a new leaf node has a
76 * filesystem count of 0 and a snapshot count of 0. Non-existent filesystem and
77 * snapshot count properties on a node indicate uninitialized counts on that
78 * node.) When first setting a limit on an uninitialized node, the code starts
79 * at the filesystem with the new limit and descends into all sub-filesystems
80 * to add the count properties.
81 *
82 * In practice this is lightweight since a limit is typically set when the
83 * filesystem is created and thus has no children. Once valid, changing the
84 * limit value won't require a re-traversal since the counts are already valid.
85 * When recursively fixing the counts, if a node with a limit is encountered
86 * during the descent, the counts are known to be valid and there is no need to
87 * descend into that filesystem's children. The counts on filesystems above the
88 * one with the new limit will still be uninitialized, unless a limit is
89 * eventually set on one of those filesystems. The counts are always recursively
90 * updated when a limit is set on a dataset, unless there is already a limit.
91 * When a new limit value is set on a filesystem with an existing limit, it is
92 * possible for the new limit to be less than the current count at that level
93 * since a user who can change the limit is also allowed to exceed the limit.
94 *
95 * Once the feature is active, then whenever a filesystem or snapshot is
96 * created, the code recurses up the tree, validating the new count against the
97 * limit at each initialized level. In practice, most levels will not have a
98 * limit set. If there is a limit at any initialized level up the tree, the
99 * check must pass or the creation will fail. Likewise, when a filesystem or
100 * snapshot is destroyed, the counts are recursively adjusted all the way up
e1cfd73f 101 * the initialized nodes in the tree. Renaming a filesystem into a different point
788eb90c
JJ
102 * in the tree will first validate, then update the counts on each branch up to
103 * the common ancestor. A receive will also validate the counts and then update
104 * them.
105 *
106 * An exception to the above behavior is that the limit is not enforced if the
107 * user has permission to modify the limit. This is primarily so that
108 * recursive snapshots in the global zone always work. We want to prevent a
109 * denial-of-service in which a lower level delegated dataset could max out its
110 * limit and thus block recursive snapshots from being taken in the global zone.
111 * Because of this, it is possible for the snapshot count to be over the limit
112 * and snapshots taken in the global zone could cause a lower level dataset to
113 * hit or exceed its limit. The administrator taking the global zone recursive
114 * snapshot should be aware of this side-effect and behave accordingly.
115 * For consistency, the filesystem limit is also not enforced if the user can
116 * modify the limit.
117 *
118 * The filesystem and snapshot limits are validated by dsl_fs_ss_limit_check()
119 * and updated by dsl_fs_ss_count_adjust(). A new limit value is setup in
120 * dsl_dir_activate_fs_ss_limit() and the counts are adjusted, if necessary, by
121 * dsl_dir_init_fs_ss_count().
788eb90c 122 */
34dc7c2f
BB
123
124static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
34dc7c2f 125
a1d477c2
MA
/*
 * Argument bundle pairing a dsl_dir_t with a txg.  The consumer is not
 * visible in this part of the file; going by the tag name it is presumably
 * the "update last remap txg" sync task -- TODO confirm.
 *
 * NOTE(review): the two members use different prefixes (ddulrta_ vs.
 * ddlrta_).  Renaming either one would require touching every user.
 */
typedef struct ddulrt_arg {
	dsl_dir_t *ddulrta_dd;	/* directory the argument refers to */
	uint64_t ddlrta_txg;	/* txg value (presumably the remap txg) */
} ddulrt_arg_t;
130
34dc7c2f 131static void
39efbde7 132dsl_dir_evict_async(void *dbu)
34dc7c2f 133{
0c66c32d 134 dsl_dir_t *dd = dbu;
34dc7c2f 135 int t;
2a8ba608 136 dsl_pool_t *dp __maybe_unused = dd->dd_pool;
34dc7c2f 137
0c66c32d
JG
138 dd->dd_dbuf = NULL;
139
34dc7c2f
BB
140 for (t = 0; t < TXG_SIZE; t++) {
141 ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t));
142 ASSERT(dd->dd_tempreserved[t] == 0);
143 ASSERT(dd->dd_space_towrite[t] == 0);
144 }
145
34dc7c2f 146 if (dd->dd_parent)
0c66c32d 147 dsl_dir_async_rele(dd->dd_parent, dd);
34dc7c2f 148
0c66c32d 149 spa_async_close(dd->dd_pool->dp_spa, dd);
34dc7c2f 150
37f03da8
SH
151 if (dsl_deadlist_is_open(&dd->dd_livelist))
152 dsl_dir_livelist_close(dd);
153
0eb21616 154 dsl_prop_fini(dd);
5a42ef04
PD
155 cv_destroy(&dd->dd_activity_cv);
156 mutex_destroy(&dd->dd_activity_lock);
34dc7c2f
BB
157 mutex_destroy(&dd->dd_lock);
158 kmem_free(dd, sizeof (dsl_dir_t));
159}
160
161int
13fe0198 162dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
dd66857d 163 const char *tail, const void *tag, dsl_dir_t **ddp)
34dc7c2f
BB
164{
165 dmu_buf_t *dbuf;
166 dsl_dir_t *dd;
b5256303 167 dmu_object_info_t doi;
34dc7c2f
BB
168 int err;
169
13fe0198 170 ASSERT(dsl_pool_config_held(dp));
34dc7c2f
BB
171
172 err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf);
13fe0198 173 if (err != 0)
34dc7c2f
BB
174 return (err);
175 dd = dmu_buf_get_user(dbuf);
b5256303
TC
176
177 dmu_object_info_from_db(dbuf, &doi);
178 ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_DSL_DIR);
179 ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t));
180
34dc7c2f
BB
181 if (dd == NULL) {
182 dsl_dir_t *winner;
34dc7c2f 183
79c76d5b 184 dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP);
34dc7c2f
BB
185 dd->dd_object = ddobj;
186 dd->dd_dbuf = dbuf;
187 dd->dd_pool = dp;
b5256303 188
34dc7c2f 189 mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL);
5a42ef04
PD
190 mutex_init(&dd->dd_activity_lock, NULL, MUTEX_DEFAULT, NULL);
191 cv_init(&dd->dd_activity_cv, NULL, CV_DEFAULT, NULL);
0eb21616 192 dsl_prop_init(dd);
34dc7c2f 193
d87676a9
MA
194 if (dsl_dir_is_zapified(dd)) {
195 err = zap_lookup(dp->dp_meta_objset,
196 ddobj, DD_FIELD_CRYPTO_KEY_OBJ,
197 sizeof (uint64_t), 1, &dd->dd_crypto_obj);
198 if (err == 0) {
199 /* check for on-disk format errata */
200 if (dsl_dir_incompatible_encryption_version(
201 dd)) {
202 dp->dp_spa->spa_errata =
203 ZPOOL_ERRATA_ZOL_6845_ENCRYPTION;
204 }
205 } else if (err != ENOENT) {
206 goto errout;
207 }
208 }
209
d683ddbb
JG
210 if (dsl_dir_phys(dd)->dd_parent_obj) {
211 err = dsl_dir_hold_obj(dp,
212 dsl_dir_phys(dd)->dd_parent_obj, NULL, dd,
213 &dd->dd_parent);
13fe0198 214 if (err != 0)
b128c09f 215 goto errout;
34dc7c2f
BB
216 if (tail) {
217#ifdef ZFS_DEBUG
218 uint64_t foundobj;
219
220 err = zap_lookup(dp->dp_meta_objset,
d683ddbb
JG
221 dsl_dir_phys(dd->dd_parent)->
222 dd_child_dir_zapobj, tail,
223 sizeof (foundobj), 1, &foundobj);
34dc7c2f
BB
224 ASSERT(err || foundobj == ddobj);
225#endif
680eada9 226 (void) strlcpy(dd->dd_myname, tail,
227 sizeof (dd->dd_myname));
34dc7c2f
BB
228 } else {
229 err = zap_value_search(dp->dp_meta_objset,
d683ddbb
JG
230 dsl_dir_phys(dd->dd_parent)->
231 dd_child_dir_zapobj,
34dc7c2f
BB
232 ddobj, 0, dd->dd_myname);
233 }
13fe0198 234 if (err != 0)
b128c09f 235 goto errout;
34dc7c2f 236 } else {
c9e319fa
JL
237 (void) strlcpy(dd->dd_myname, spa_name(dp->dp_spa),
238 sizeof (dd->dd_myname));
34dc7c2f
BB
239 }
240
428870ff
BB
241 if (dsl_dir_is_clone(dd)) {
242 dmu_buf_t *origin_bonus;
243 dsl_dataset_phys_t *origin_phys;
244
245 /*
246 * We can't open the origin dataset, because
247 * that would require opening this dsl_dir.
248 * Just look at its phys directly instead.
249 */
250 err = dmu_bonus_hold(dp->dp_meta_objset,
d683ddbb
JG
251 dsl_dir_phys(dd)->dd_origin_obj, FTAG,
252 &origin_bonus);
13fe0198 253 if (err != 0)
428870ff
BB
254 goto errout;
255 origin_phys = origin_bonus->db_data;
256 dd->dd_origin_txg =
257 origin_phys->ds_creation_txg;
258 dmu_buf_rele(origin_bonus, FTAG);
37f03da8
SH
259 if (dsl_dir_is_zapified(dd)) {
260 uint64_t obj;
261 err = zap_lookup(dp->dp_meta_objset,
262 dd->dd_object, DD_FIELD_LIVELIST,
263 sizeof (uint64_t), 1, &obj);
264 if (err == 0)
265 dsl_dir_livelist_open(dd, obj);
266 else if (err != ENOENT)
267 goto errout;
268 }
428870ff
BB
269 }
270
a582d529
US
271 if (dsl_dir_is_zapified(dd)) {
272 inode_timespec_t t = {0};
2a493a4c 273 (void) zap_lookup(dp->dp_meta_objset, ddobj,
59767479 274 DD_FIELD_SNAPSHOTS_CHANGED,
a582d529
US
275 sizeof (uint64_t),
276 sizeof (inode_timespec_t) / sizeof (uint64_t),
277 &t);
278 dd->dd_snap_cmtime = t;
279 }
9681de46 280
39efbde7
GM
281 dmu_buf_init_user(&dd->dd_dbu, NULL, dsl_dir_evict_async,
282 &dd->dd_dbuf);
0c66c32d
JG
283 winner = dmu_buf_set_user_ie(dbuf, &dd->dd_dbu);
284 if (winner != NULL) {
34dc7c2f 285 if (dd->dd_parent)
13fe0198 286 dsl_dir_rele(dd->dd_parent, dd);
37f03da8
SH
287 if (dsl_deadlist_is_open(&dd->dd_livelist))
288 dsl_dir_livelist_close(dd);
0eb21616 289 dsl_prop_fini(dd);
5a42ef04
PD
290 cv_destroy(&dd->dd_activity_cv);
291 mutex_destroy(&dd->dd_activity_lock);
34dc7c2f
BB
292 mutex_destroy(&dd->dd_lock);
293 kmem_free(dd, sizeof (dsl_dir_t));
294 dd = winner;
295 } else {
296 spa_open_ref(dp->dp_spa, dd);
297 }
298 }
299
300 /*
301 * The dsl_dir_t has both open-to-close and instantiate-to-evict
302 * holds on the spa. We need the open-to-close holds because
303 * otherwise the spa_refcnt wouldn't change when we open a
304 * dir which the spa also has open, so we could incorrectly
305 * think it was OK to unload/export/destroy the pool. We need
306 * the instantiate-to-evict hold because the dsl_dir_t has a
307 * pointer to the dd_pool, which has a pointer to the spa_t.
308 */
309 spa_open_ref(dp->dp_spa, tag);
310 ASSERT3P(dd->dd_pool, ==, dp);
311 ASSERT3U(dd->dd_object, ==, ddobj);
312 ASSERT3P(dd->dd_dbuf, ==, dbuf);
313 *ddp = dd;
314 return (0);
b128c09f
BB
315
316errout:
317 if (dd->dd_parent)
13fe0198 318 dsl_dir_rele(dd->dd_parent, dd);
37f03da8
SH
319 if (dsl_deadlist_is_open(&dd->dd_livelist))
320 dsl_dir_livelist_close(dd);
0eb21616 321 dsl_prop_fini(dd);
5a42ef04
PD
322 cv_destroy(&dd->dd_activity_cv);
323 mutex_destroy(&dd->dd_activity_lock);
b128c09f
BB
324 mutex_destroy(&dd->dd_lock);
325 kmem_free(dd, sizeof (dsl_dir_t));
326 dmu_buf_rele(dbuf, tag);
327 return (err);
34dc7c2f
BB
328}
329
330void
dd66857d 331dsl_dir_rele(dsl_dir_t *dd, const void *tag)
34dc7c2f
BB
332{
333 dprintf_dd(dd, "%s\n", "");
334 spa_close(dd->dd_pool->dp_spa, tag);
335 dmu_buf_rele(dd->dd_dbuf, tag);
336}
337
0c66c32d
JG
338/*
339 * Remove a reference to the given dsl dir that is being asynchronously
340 * released. Async releases occur from a taskq performing eviction of
341 * dsl datasets and dirs. This process is identical to a normal release
342 * with the exception of using the async API for releasing the reference on
343 * the spa.
344 */
345void
dd66857d 346dsl_dir_async_rele(dsl_dir_t *dd, const void *tag)
0c66c32d
JG
347{
348 dprintf_dd(dd, "%s\n", "");
349 spa_async_close(dd->dd_pool->dp_spa, tag);
350 dmu_buf_rele(dd->dd_dbuf, tag);
351}
352
eca7b760 353/* buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes */
34dc7c2f
BB
354void
355dsl_dir_name(dsl_dir_t *dd, char *buf)
356{
357 if (dd->dd_parent) {
358 dsl_dir_name(dd->dd_parent, buf);
eca7b760
IK
359 VERIFY3U(strlcat(buf, "/", ZFS_MAX_DATASET_NAME_LEN), <,
360 ZFS_MAX_DATASET_NAME_LEN);
34dc7c2f
BB
361 } else {
362 buf[0] = '\0';
363 }
364 if (!MUTEX_HELD(&dd->dd_lock)) {
365 /*
366 * recursive mutex so that we can use
367 * dprintf_dd() with dd_lock held
368 */
369 mutex_enter(&dd->dd_lock);
eca7b760
IK
370 VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN),
371 <, ZFS_MAX_DATASET_NAME_LEN);
34dc7c2f
BB
372 mutex_exit(&dd->dd_lock);
373 } else {
eca7b760
IK
374 VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN),
375 <, ZFS_MAX_DATASET_NAME_LEN);
34dc7c2f
BB
376 }
377}
378
29809a6c 379/* Calculate name length, avoiding all the strcat calls of dsl_dir_name */
34dc7c2f
BB
380int
381dsl_dir_namelen(dsl_dir_t *dd)
382{
383 int result = 0;
384
385 if (dd->dd_parent) {
386 /* parent's name + 1 for the "/" */
387 result = dsl_dir_namelen(dd->dd_parent) + 1;
388 }
389
390 if (!MUTEX_HELD(&dd->dd_lock)) {
391 /* see dsl_dir_name */
392 mutex_enter(&dd->dd_lock);
393 result += strlen(dd->dd_myname);
394 mutex_exit(&dd->dd_lock);
395 } else {
396 result += strlen(dd->dd_myname);
397 }
398
399 return (result);
400}
401
34dc7c2f
BB
402static int
403getcomponent(const char *path, char *component, const char **nextp)
404{
405 char *p;
13fe0198 406
9babb374 407 if ((path == NULL) || (path[0] == '\0'))
2e528b49 408 return (SET_ERROR(ENOENT));
34dc7c2f
BB
409 /* This would be a good place to reserve some namespace... */
410 p = strpbrk(path, "/@");
411 if (p && (p[1] == '/' || p[1] == '@')) {
412 /* two separators in a row */
2e528b49 413 return (SET_ERROR(EINVAL));
34dc7c2f
BB
414 }
415 if (p == NULL || p == path) {
416 /*
417 * if the first thing is an @ or /, it had better be an
418 * @ and it had better not have any more ats or slashes,
419 * and it had better have something after the @.
420 */
421 if (p != NULL &&
422 (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0'))
2e528b49 423 return (SET_ERROR(EINVAL));
eca7b760 424 if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN)
2e528b49 425 return (SET_ERROR(ENAMETOOLONG));
c9e319fa 426 (void) strlcpy(component, path, ZFS_MAX_DATASET_NAME_LEN);
34dc7c2f
BB
427 p = NULL;
428 } else if (p[0] == '/') {
eca7b760 429 if (p - path >= ZFS_MAX_DATASET_NAME_LEN)
2e528b49 430 return (SET_ERROR(ENAMETOOLONG));
7584fbe8 431 (void) strlcpy(component, path, p - path + 1);
34dc7c2f
BB
432 p++;
433 } else if (p[0] == '@') {
434 /*
435 * if the next separator is an @, there better not be
436 * any more slashes.
437 */
438 if (strchr(path, '/'))
2e528b49 439 return (SET_ERROR(EINVAL));
eca7b760 440 if (p - path >= ZFS_MAX_DATASET_NAME_LEN)
2e528b49 441 return (SET_ERROR(ENAMETOOLONG));
7584fbe8 442 (void) strlcpy(component, path, p - path + 1);
34dc7c2f 443 } else {
13fe0198 444 panic("invalid p=%p", (void *)p);
34dc7c2f
BB
445 }
446 *nextp = p;
447 return (0);
448}
449
450/*
13fe0198
MA
451 * Return the dsl_dir_t, and possibly the last component which couldn't
452 * be found in *tail. The name must be in the specified dsl_pool_t. This
453 * thread must hold the dp_config_rwlock for the pool. Returns NULL if the
454 * path is bogus, or if tail==NULL and we couldn't parse the whole name.
455 * (*tail)[0] == '@' means that the last component is a snapshot.
34dc7c2f
BB
456 */
457int
dd66857d 458dsl_dir_hold(dsl_pool_t *dp, const char *name, const void *tag,
34dc7c2f
BB
459 dsl_dir_t **ddp, const char **tailp)
460{
fcf37ec6 461 char *buf;
13fe0198 462 const char *spaname, *next, *nextnext = NULL;
34dc7c2f
BB
463 int err;
464 dsl_dir_t *dd;
34dc7c2f 465 uint64_t ddobj;
34dc7c2f 466
eca7b760 467 buf = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
34dc7c2f 468 err = getcomponent(name, buf, &next);
13fe0198 469 if (err != 0)
fcf37ec6 470 goto error;
34dc7c2f 471
13fe0198
MA
472 /* Make sure the name is in the specified pool. */
473 spaname = spa_name(dp->dp_spa);
474 if (strcmp(buf, spaname) != 0) {
9063f654 475 err = SET_ERROR(EXDEV);
13fe0198 476 goto error;
34dc7c2f
BB
477 }
478
13fe0198 479 ASSERT(dsl_pool_config_held(dp));
34dc7c2f 480
13fe0198
MA
481 err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd);
482 if (err != 0) {
fcf37ec6 483 goto error;
34dc7c2f
BB
484 }
485
486 while (next != NULL) {
0c66c32d 487 dsl_dir_t *child_dd;
34dc7c2f 488 err = getcomponent(next, buf, &nextnext);
13fe0198 489 if (err != 0)
34dc7c2f
BB
490 break;
491 ASSERT(next[0] != '\0');
492 if (next[0] == '@')
493 break;
494 dprintf("looking up %s in obj%lld\n",
8e739b2c 495 buf, (longlong_t)dsl_dir_phys(dd)->dd_child_dir_zapobj);
34dc7c2f
BB
496
497 err = zap_lookup(dp->dp_meta_objset,
d683ddbb 498 dsl_dir_phys(dd)->dd_child_dir_zapobj,
34dc7c2f 499 buf, sizeof (ddobj), 1, &ddobj);
13fe0198 500 if (err != 0) {
34dc7c2f
BB
501 if (err == ENOENT)
502 err = 0;
503 break;
504 }
505
0c66c32d 506 err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_dd);
13fe0198 507 if (err != 0)
34dc7c2f 508 break;
13fe0198 509 dsl_dir_rele(dd, tag);
0c66c32d 510 dd = child_dd;
34dc7c2f
BB
511 next = nextnext;
512 }
34dc7c2f 513
13fe0198
MA
514 if (err != 0) {
515 dsl_dir_rele(dd, tag);
fcf37ec6 516 goto error;
34dc7c2f
BB
517 }
518
519 /*
520 * It's an error if there's more than one component left, or
521 * tailp==NULL and there's any component left.
522 */
523 if (next != NULL &&
524 (tailp == NULL || (nextnext && nextnext[0] != '\0'))) {
525 /* bad path name */
13fe0198 526 dsl_dir_rele(dd, tag);
34dc7c2f 527 dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp);
2e528b49 528 err = SET_ERROR(ENOENT);
34dc7c2f 529 }
13fe0198 530 if (tailp != NULL)
34dc7c2f 531 *tailp = next;
fc1ecd16
DB
532 if (err == 0)
533 *ddp = dd;
fcf37ec6 534error:
eca7b760 535 kmem_free(buf, ZFS_MAX_DATASET_NAME_LEN);
34dc7c2f
BB
536 return (err);
537}
538
788eb90c
JJ
539/*
540 * If the counts are already initialized for this filesystem and its
541 * descendants then do nothing, otherwise initialize the counts.
542 *
543 * The counts on this filesystem, and those below, may be uninitialized due to
544 * either the use of a pre-existing pool which did not support the
545 * filesystem/snapshot limit feature, or one in which the feature had not yet
546 * been enabled.
547 *
548 * Recursively descend the filesystem tree and update the filesystem/snapshot
549 * counts on each filesystem below, then update the cumulative count on the
550 * current filesystem. If the filesystem already has a count set on it,
551 * then we know that its counts, and the counts on the filesystems below it,
552 * are already correct, so we don't have to update this filesystem.
553 */
554static void
555dsl_dir_init_fs_ss_count(dsl_dir_t *dd, dmu_tx_t *tx)
556{
557 uint64_t my_fs_cnt = 0;
558 uint64_t my_ss_cnt = 0;
559 dsl_pool_t *dp = dd->dd_pool;
560 objset_t *os = dp->dp_meta_objset;
561 zap_cursor_t *zc;
562 zap_attribute_t *za;
563 dsl_dataset_t *ds;
564
a0c9a17a 565 ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT));
788eb90c
JJ
566 ASSERT(dsl_pool_config_held(dp));
567 ASSERT(dmu_tx_is_syncing(tx));
568
569 dsl_dir_zapify(dd, tx);
570
571 /*
572 * If the filesystem count has already been initialized then we
573 * don't need to recurse down any further.
574 */
575 if (zap_contains(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT) == 0)
576 return;
577
578 zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
579 za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
580
581 /* Iterate my child dirs */
d683ddbb 582 for (zap_cursor_init(zc, os, dsl_dir_phys(dd)->dd_child_dir_zapobj);
788eb90c
JJ
583 zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) {
584 dsl_dir_t *chld_dd;
585 uint64_t count;
586
587 VERIFY0(dsl_dir_hold_obj(dp, za->za_first_integer, NULL, FTAG,
588 &chld_dd));
589
590 /*
4aa3b3bd 591 * Ignore hidden ($FREE, $MOS & $ORIGIN) objsets.
788eb90c 592 */
4aa3b3bd 593 if (chld_dd->dd_myname[0] == '$') {
788eb90c
JJ
594 dsl_dir_rele(chld_dd, FTAG);
595 continue;
596 }
597
598 my_fs_cnt++; /* count this child */
599
600 dsl_dir_init_fs_ss_count(chld_dd, tx);
601
602 VERIFY0(zap_lookup(os, chld_dd->dd_object,
603 DD_FIELD_FILESYSTEM_COUNT, sizeof (count), 1, &count));
604 my_fs_cnt += count;
605 VERIFY0(zap_lookup(os, chld_dd->dd_object,
606 DD_FIELD_SNAPSHOT_COUNT, sizeof (count), 1, &count));
607 my_ss_cnt += count;
608
609 dsl_dir_rele(chld_dd, FTAG);
610 }
611 zap_cursor_fini(zc);
612 /* Count my snapshots (we counted children's snapshots above) */
613 VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
d683ddbb 614 dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds));
788eb90c 615
d683ddbb 616 for (zap_cursor_init(zc, os, dsl_dataset_phys(ds)->ds_snapnames_zapobj);
788eb90c
JJ
617 zap_cursor_retrieve(zc, za) == 0;
618 zap_cursor_advance(zc)) {
619 /* Don't count temporary snapshots */
620 if (za->za_name[0] != '%')
621 my_ss_cnt++;
622 }
ca227e54 623 zap_cursor_fini(zc);
788eb90c
JJ
624
625 dsl_dataset_rele(ds, FTAG);
626
627 kmem_free(zc, sizeof (zap_cursor_t));
628 kmem_free(za, sizeof (zap_attribute_t));
629
630 /* we're in a sync task, update counts */
631 dmu_buf_will_dirty(dd->dd_dbuf, tx);
632 VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
633 sizeof (my_fs_cnt), 1, &my_fs_cnt, tx));
634 VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
635 sizeof (my_ss_cnt), 1, &my_ss_cnt, tx));
636}
637
638static int
639dsl_dir_actv_fs_ss_limit_check(void *arg, dmu_tx_t *tx)
640{
641 char *ddname = (char *)arg;
642 dsl_pool_t *dp = dmu_tx_pool(tx);
643 dsl_dataset_t *ds;
644 dsl_dir_t *dd;
645 int error;
646
647 error = dsl_dataset_hold(dp, ddname, FTAG, &ds);
648 if (error != 0)
649 return (error);
650
651 if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
652 dsl_dataset_rele(ds, FTAG);
653 return (SET_ERROR(ENOTSUP));
654 }
655
656 dd = ds->ds_dir;
657 if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT) &&
658 dsl_dir_is_zapified(dd) &&
659 zap_contains(dp->dp_meta_objset, dd->dd_object,
660 DD_FIELD_FILESYSTEM_COUNT) == 0) {
661 dsl_dataset_rele(ds, FTAG);
662 return (SET_ERROR(EALREADY));
663 }
664
665 dsl_dataset_rele(ds, FTAG);
666 return (0);
667}
668
669static void
670dsl_dir_actv_fs_ss_limit_sync(void *arg, dmu_tx_t *tx)
671{
672 char *ddname = (char *)arg;
673 dsl_pool_t *dp = dmu_tx_pool(tx);
674 dsl_dataset_t *ds;
675 spa_t *spa;
676
677 VERIFY0(dsl_dataset_hold(dp, ddname, FTAG, &ds));
678
679 spa = dsl_dataset_get_spa(ds);
680
681 if (!spa_feature_is_active(spa, SPA_FEATURE_FS_SS_LIMIT)) {
682 /*
683 * Since the feature was not active and we're now setting a
684 * limit, increment the feature-active counter so that the
685 * feature becomes active for the first time.
686 *
687 * We are already in a sync task so we can update the MOS.
688 */
689 spa_feature_incr(spa, SPA_FEATURE_FS_SS_LIMIT, tx);
690 }
691
692 /*
693 * Since we are now setting a non-UINT64_MAX limit on the filesystem,
694 * we need to ensure the counts are correct. Descend down the tree from
695 * this point and update all of the counts to be accurate.
696 */
697 dsl_dir_init_fs_ss_count(ds->ds_dir, tx);
698
699 dsl_dataset_rele(ds, FTAG);
700}
701
702/*
703 * Make sure the feature is enabled and activate it if necessary.
704 * Since we're setting a limit, ensure the on-disk counts are valid.
705 * This is only called by the ioctl path when setting a limit value.
706 *
707 * We do not need to validate the new limit, since users who can change the
708 * limit are also allowed to exceed the limit.
709 */
710int
711dsl_dir_activate_fs_ss_limit(const char *ddname)
712{
713 int error;
714
715 error = dsl_sync_task(ddname, dsl_dir_actv_fs_ss_limit_check,
3d45fdd6
MA
716 dsl_dir_actv_fs_ss_limit_sync, (void *)ddname, 0,
717 ZFS_SPACE_CHECK_RESERVED);
788eb90c
JJ
718
719 if (error == EALREADY)
720 error = 0;
721
722 return (error);
723}
724
725/*
726 * Used to determine if the filesystem_limit or snapshot_limit should be
727 * enforced. We allow the limit to be exceeded if the user has permission to
728 * write the property value. We pass in the creds that we got in the open
729 * context since we will always be the GZ root in syncing context. We also have
730 * to handle the case where we are allowed to change the limit on the current
731 * dataset, but there may be another limit in the tree above.
732 *
733 * We can never modify these two properties within a non-global zone. In
734 * addition, the other checks are modeled on zfs_secpolicy_write_perms. We
735 * can't use that function since we are already holding the dp_config_rwlock.
736 * In addition, we already have the dd and dealing with snapshots is simplified
737 * in this code.
738 */
739
740typedef enum {
741 ENFORCE_ALWAYS,
742 ENFORCE_NEVER,
743 ENFORCE_ABOVE
744} enforce_res_t;
745
746static enforce_res_t
e59a377a
MA
747dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop,
748 cred_t *cr, proc_t *proc)
788eb90c
JJ
749{
750 enforce_res_t enforce = ENFORCE_ALWAYS;
751 uint64_t obj;
752 dsl_dataset_t *ds;
753 uint64_t zoned;
4bc72196 754 const char *zonedstr;
788eb90c
JJ
755
756 ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT ||
757 prop == ZFS_PROP_SNAPSHOT_LIMIT);
758
759#ifdef _KERNEL
760 if (crgetzoneid(cr) != GLOBAL_ZONEID)
761 return (ENFORCE_ALWAYS);
762
e59a377a
MA
763 /*
764 * We are checking the saved credentials of the user process, which is
765 * not the current process. Note that we can't use secpolicy_zfs(),
766 * because it only works if the cred is that of the current process (on
767 * Linux).
768 */
769 if (secpolicy_zfs_proc(cr, proc) == 0)
788eb90c 770 return (ENFORCE_NEVER);
14e4e3cb
AZ
771#else
772 (void) proc;
788eb90c
JJ
773#endif
774
d683ddbb 775 if ((obj = dsl_dir_phys(dd)->dd_head_dataset_obj) == 0)
788eb90c
JJ
776 return (ENFORCE_ALWAYS);
777
778 ASSERT(dsl_pool_config_held(dd->dd_pool));
779
780 if (dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds) != 0)
781 return (ENFORCE_ALWAYS);
782
4bc72196
MM
783 zonedstr = zfs_prop_to_name(ZFS_PROP_ZONED);
784 if (dsl_prop_get_ds(ds, zonedstr, 8, 1, &zoned, NULL) || zoned) {
788eb90c
JJ
785 /* Only root can access zoned fs's from the GZ */
786 enforce = ENFORCE_ALWAYS;
787 } else {
788 if (dsl_deleg_access_impl(ds, zfs_prop_to_name(prop), cr) == 0)
789 enforce = ENFORCE_ABOVE;
790 }
791
792 dsl_dataset_rele(ds, FTAG);
793 return (enforce);
794}
795
796/*
797 * Check if adding additional child filesystem(s) would exceed any filesystem
798 * limits or adding additional snapshot(s) would exceed any snapshot limits.
799 * The prop argument indicates which limit to check.
800 *
801 * Note that all filesystem limits up to the root (or the highest
802 * initialized) filesystem or the given ancestor must be satisfied.
803 */
804int
805dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop,
e59a377a 806 dsl_dir_t *ancestor, cred_t *cr, proc_t *proc)
788eb90c
JJ
807{
808 objset_t *os = dd->dd_pool->dp_meta_objset;
809 uint64_t limit, count;
a926aab9 810 const char *count_prop;
788eb90c
JJ
811 enforce_res_t enforce;
812 int err = 0;
813
814 ASSERT(dsl_pool_config_held(dd->dd_pool));
815 ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT ||
816 prop == ZFS_PROP_SNAPSHOT_LIMIT);
817
b37d495e
AJ
818 if (prop == ZFS_PROP_SNAPSHOT_LIMIT) {
819 /*
820 * We don't enforce the limit for temporary snapshots. This is
821 * indicated by a NULL cred_t argument.
822 */
823 if (cr == NULL)
824 return (0);
825
826 count_prop = DD_FIELD_SNAPSHOT_COUNT;
827 } else {
828 count_prop = DD_FIELD_FILESYSTEM_COUNT;
829 }
788eb90c
JJ
830 /*
831 * If we're allowed to change the limit, don't enforce the limit
832 * e.g. this can happen if a snapshot is taken by an administrative
833 * user in the global zone (i.e. a recursive snapshot by root).
834 * However, we must handle the case of delegated permissions where we
835 * are allowed to change the limit on the current dataset, but there
836 * is another limit in the tree above.
837 */
e59a377a 838 enforce = dsl_enforce_ds_ss_limits(dd, prop, cr, proc);
788eb90c
JJ
839 if (enforce == ENFORCE_NEVER)
840 return (0);
841
842 /*
843 * e.g. if renaming a dataset with no snapshots, count adjustment
844 * is 0.
845 */
846 if (delta == 0)
847 return (0);
848
788eb90c
JJ
849 /*
850 * If an ancestor has been provided, stop checking the limit once we
851 * hit that dir. We need this during rename so that we don't overcount
852 * the check once we recurse up to the common ancestor.
853 */
854 if (ancestor == dd)
855 return (0);
856
857 /*
858 * If we hit an uninitialized node while recursing up the tree, we can
859 * stop since we know there is no limit here (or above). The counts are
860 * not valid on this node and we know we won't touch this node's counts.
861 */
d87676a9
MA
862 if (!dsl_dir_is_zapified(dd))
863 return (0);
864 err = zap_lookup(os, dd->dd_object,
865 count_prop, sizeof (count), 1, &count);
866 if (err == ENOENT)
788eb90c 867 return (0);
d87676a9
MA
868 if (err != 0)
869 return (err);
788eb90c
JJ
870
871 err = dsl_prop_get_dd(dd, zfs_prop_to_name(prop), 8, 1, &limit, NULL,
872 B_FALSE);
873 if (err != 0)
874 return (err);
875
876 /* Is there a limit which we've hit? */
877 if (enforce == ENFORCE_ALWAYS && (count + delta) > limit)
878 return (SET_ERROR(EDQUOT));
879
880 if (dd->dd_parent != NULL)
881 err = dsl_fs_ss_limit_check(dd->dd_parent, delta, prop,
e59a377a 882 ancestor, cr, proc);
788eb90c
JJ
883
884 return (err);
885}
886
887/*
888 * Adjust the filesystem or snapshot count for the specified dsl_dir_t and all
889 * parents. When a new filesystem/snapshot is created, increment the count on
890 * all parents, and when a filesystem/snapshot is destroyed, decrement the
891 * count.
892 */
893void
894dsl_fs_ss_count_adjust(dsl_dir_t *dd, int64_t delta, const char *prop,
895 dmu_tx_t *tx)
896{
897 int err;
898 objset_t *os = dd->dd_pool->dp_meta_objset;
899 uint64_t count;
900
901 ASSERT(dsl_pool_config_held(dd->dd_pool));
902 ASSERT(dmu_tx_is_syncing(tx));
903 ASSERT(strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0 ||
904 strcmp(prop, DD_FIELD_SNAPSHOT_COUNT) == 0);
905
906 /*
4aa3b3bd 907 * We don't do accounting for hidden ($FREE, $MOS & $ORIGIN) objsets.
788eb90c 908 */
4aa3b3bd
PD
909 if (dd->dd_myname[0] == '$' && strcmp(prop,
910 DD_FIELD_FILESYSTEM_COUNT) == 0) {
788eb90c 911 return;
4aa3b3bd 912 }
788eb90c
JJ
913
914 /*
915 * e.g. if renaming a dataset with no snapshots, count adjustment is 0
916 */
917 if (delta == 0)
918 return;
919
920 /*
921 * If we hit an uninitialized node while recursing up the tree, we can
922 * stop since we know the counts are not valid on this node and we
923 * know we shouldn't touch this node's counts. An uninitialized count
924 * on the node indicates that either the feature has not yet been
925 * activated or there are no limits on this part of the tree.
926 */
927 if (!dsl_dir_is_zapified(dd) || (err = zap_lookup(os, dd->dd_object,
928 prop, sizeof (count), 1, &count)) == ENOENT)
929 return;
930 VERIFY0(err);
931
932 count += delta;
933 /* Use a signed verify to make sure we're not neg. */
934 VERIFY3S(count, >=, 0);
935
936 VERIFY0(zap_update(os, dd->dd_object, prop, sizeof (count), 1, &count,
937 tx));
938
939 /* Roll up this additional count into our ancestors */
940 if (dd->dd_parent != NULL)
941 dsl_fs_ss_count_adjust(dd->dd_parent, delta, prop, tx);
942}
943
34dc7c2f 944uint64_t
b128c09f
BB
945dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
946 dmu_tx_t *tx)
34dc7c2f 947{
b128c09f 948 objset_t *mos = dp->dp_meta_objset;
34dc7c2f 949 uint64_t ddobj;
428870ff 950 dsl_dir_phys_t *ddphys;
34dc7c2f
BB
951 dmu_buf_t *dbuf;
952
953 ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
954 DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
b128c09f 955 if (pds) {
d2734cce 956 VERIFY0(zap_add(mos, dsl_dir_phys(pds)->dd_child_dir_zapobj,
b128c09f
BB
957 name, sizeof (uint64_t), 1, &ddobj, tx));
958 } else {
959 /* it's the root dir */
d2734cce 960 VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
b128c09f
BB
961 DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx));
962 }
d2734cce 963 VERIFY0(dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
34dc7c2f 964 dmu_buf_will_dirty(dbuf, tx);
428870ff 965 ddphys = dbuf->db_data;
34dc7c2f 966
428870ff 967 ddphys->dd_creation_time = gethrestime_sec();
788eb90c 968 if (pds) {
428870ff 969 ddphys->dd_parent_obj = pds->dd_object;
788eb90c
JJ
970
971 /* update the filesystem counts */
972 dsl_fs_ss_count_adjust(pds, 1, DD_FIELD_FILESYSTEM_COUNT, tx);
973 }
428870ff 974 ddphys->dd_props_zapobj = zap_create(mos,
34dc7c2f 975 DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
428870ff 976 ddphys->dd_child_dir_zapobj = zap_create(mos,
34dc7c2f 977 DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
b128c09f 978 if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN)
428870ff 979 ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN;
b5256303 980
34dc7c2f
BB
981 dmu_buf_rele(dbuf, FTAG);
982
983 return (ddobj);
984}
985
b128c09f
BB
986boolean_t
987dsl_dir_is_clone(dsl_dir_t *dd)
34dc7c2f 988{
d683ddbb 989 return (dsl_dir_phys(dd)->dd_origin_obj &&
b128c09f 990 (dd->dd_pool->dp_origin_snap == NULL ||
d683ddbb 991 dsl_dir_phys(dd)->dd_origin_obj !=
b128c09f 992 dd->dd_pool->dp_origin_snap->ds_object));
34dc7c2f
BB
993}
994
d99a0153
CW
995uint64_t
996dsl_dir_get_used(dsl_dir_t *dd)
997{
998 return (dsl_dir_phys(dd)->dd_used_bytes);
999}
1000
d2734cce
SD
1001uint64_t
1002dsl_dir_get_compressed(dsl_dir_t *dd)
1003{
1004 return (dsl_dir_phys(dd)->dd_compressed_bytes);
1005}
1006
d99a0153
CW
1007uint64_t
1008dsl_dir_get_quota(dsl_dir_t *dd)
1009{
1010 return (dsl_dir_phys(dd)->dd_quota);
1011}
1012
1013uint64_t
1014dsl_dir_get_reservation(dsl_dir_t *dd)
1015{
1016 return (dsl_dir_phys(dd)->dd_reserved);
1017}
1018
1019uint64_t
1020dsl_dir_get_compressratio(dsl_dir_t *dd)
1021{
1022 /* a fixed point number, 100x the ratio */
1023 return (dsl_dir_phys(dd)->dd_compressed_bytes == 0 ? 100 :
1024 (dsl_dir_phys(dd)->dd_uncompressed_bytes * 100 /
1025 dsl_dir_phys(dd)->dd_compressed_bytes));
1026}
1027
1028uint64_t
1029dsl_dir_get_logicalused(dsl_dir_t *dd)
1030{
1031 return (dsl_dir_phys(dd)->dd_uncompressed_bytes);
1032}
1033
1034uint64_t
1035dsl_dir_get_usedsnap(dsl_dir_t *dd)
1036{
1037 return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP]);
1038}
1039
1040uint64_t
1041dsl_dir_get_usedds(dsl_dir_t *dd)
1042{
1043 return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_HEAD]);
1044}
1045
1046uint64_t
1047dsl_dir_get_usedrefreserv(dsl_dir_t *dd)
1048{
1049 return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_REFRSRV]);
1050}
1051
1052uint64_t
1053dsl_dir_get_usedchild(dsl_dir_t *dd)
1054{
1055 return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD] +
1056 dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD_RSRV]);
1057}
1058
34dc7c2f 1059void
d99a0153
CW
1060dsl_dir_get_origin(dsl_dir_t *dd, char *buf)
1061{
1062 dsl_dataset_t *ds;
1063 VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
1064 dsl_dir_phys(dd)->dd_origin_obj, FTAG, &ds));
1065
1066 dsl_dataset_name(ds, buf);
1067
1068 dsl_dataset_rele(ds, FTAG);
1069}
1070
1071int
1072dsl_dir_get_filesystem_count(dsl_dir_t *dd, uint64_t *count)
34dc7c2f 1073{
d99a0153
CW
1074 if (dsl_dir_is_zapified(dd)) {
1075 objset_t *os = dd->dd_pool->dp_meta_objset;
1076 return (zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
1077 sizeof (*count), 1, count));
1078 } else {
28caa74b 1079 return (SET_ERROR(ENOENT));
d99a0153
CW
1080 }
1081}
1082
1083int
1084dsl_dir_get_snapshot_count(dsl_dir_t *dd, uint64_t *count)
1085{
1086 if (dsl_dir_is_zapified(dd)) {
1087 objset_t *os = dd->dd_pool->dp_meta_objset;
1088 return (zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
1089 sizeof (*count), 1, count));
1090 } else {
28caa74b 1091 return (SET_ERROR(ENOENT));
d99a0153
CW
1092 }
1093}
b5256303 1094
d99a0153
CW
1095void
1096dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv)
1097{
34dc7c2f 1098 mutex_enter(&dd->dd_lock);
d683ddbb 1099 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA,
d99a0153 1100 dsl_dir_get_quota(dd));
34dc7c2f 1101 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION,
d99a0153 1102 dsl_dir_get_reservation(dd));
24a64651 1103 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALUSED,
d99a0153 1104 dsl_dir_get_logicalused(dd));
d683ddbb 1105 if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
b128c09f 1106 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP,
d99a0153 1107 dsl_dir_get_usedsnap(dd));
b128c09f 1108 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS,
d99a0153 1109 dsl_dir_get_usedds(dd));
b128c09f 1110 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV,
d99a0153 1111 dsl_dir_get_usedrefreserv(dd));
b128c09f 1112 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD,
d99a0153 1113 dsl_dir_get_usedchild(dd));
b128c09f 1114 }
34dc7c2f
BB
1115 mutex_exit(&dd->dd_lock);
1116
d99a0153
CW
1117 uint64_t count;
1118 if (dsl_dir_get_filesystem_count(dd, &count) == 0) {
1119 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_FILESYSTEM_COUNT,
1120 count);
1121 }
1122 if (dsl_dir_get_snapshot_count(dd, &count) == 0) {
1123 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_SNAPSHOT_COUNT,
1124 count);
788eb90c
JJ
1125 }
1126
b128c09f 1127 if (dsl_dir_is_clone(dd)) {
eca7b760 1128 char buf[ZFS_MAX_DATASET_NAME_LEN];
d99a0153 1129 dsl_dir_get_origin(dd, buf);
34dc7c2f
BB
1130 dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf);
1131 }
d99a0153 1132
34dc7c2f
BB
1133}
1134
1135void
1136dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx)
1137{
1138 dsl_pool_t *dp = dd->dd_pool;
1139
d683ddbb 1140 ASSERT(dsl_dir_phys(dd));
34dc7c2f 1141
13fe0198 1142 if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg)) {
34dc7c2f
BB
1143 /* up the hold count until we can be written out */
1144 dmu_buf_add_ref(dd->dd_dbuf, dd);
1145 }
1146}
1147
1148static int64_t
1149parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta)
1150{
d683ddbb
JG
1151 uint64_t old_accounted = MAX(used, dsl_dir_phys(dd)->dd_reserved);
1152 uint64_t new_accounted =
1153 MAX(used + delta, dsl_dir_phys(dd)->dd_reserved);
34dc7c2f
BB
1154 return (new_accounted - old_accounted);
1155}
1156
1157void
1158dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx)
1159{
1160 ASSERT(dmu_tx_is_syncing(tx));
1161
34dc7c2f 1162 mutex_enter(&dd->dd_lock);
3fa93bb8 1163 ASSERT0(dd->dd_tempreserved[tx->tx_txg & TXG_MASK]);
8e739b2c
RE
1164 dprintf_dd(dd, "txg=%llu towrite=%lluK\n", (u_longlong_t)tx->tx_txg,
1165 (u_longlong_t)dd->dd_space_towrite[tx->tx_txg & TXG_MASK] / 1024);
3fa93bb8 1166 dd->dd_space_towrite[tx->tx_txg & TXG_MASK] = 0;
34dc7c2f
BB
1167 mutex_exit(&dd->dd_lock);
1168
1169 /* release the hold from dsl_dir_dirty */
1170 dmu_buf_rele(dd->dd_dbuf, dd);
1171}
1172
1173static uint64_t
1174dsl_dir_space_towrite(dsl_dir_t *dd)
1175{
1176 uint64_t space = 0;
34dc7c2f
BB
1177
1178 ASSERT(MUTEX_HELD(&dd->dd_lock));
1179
3ec3bc21
BB
1180 for (int i = 0; i < TXG_SIZE; i++) {
1181 space += dd->dd_space_towrite[i & TXG_MASK];
1182 ASSERT3U(dd->dd_space_towrite[i & TXG_MASK], >=, 0);
34dc7c2f
BB
1183 }
1184 return (space);
1185}
1186
1187/*
1188 * How much space would dd have available if ancestor had delta applied
1189 * to it? If ondiskonly is set, we're only interested in what's
1190 * on-disk, not estimated pending changes.
1191 */
1192uint64_t
1193dsl_dir_space_available(dsl_dir_t *dd,
1194 dsl_dir_t *ancestor, int64_t delta, int ondiskonly)
1195{
1196 uint64_t parentspace, myspace, quota, used;
1197
1198 /*
1199 * If there are no restrictions otherwise, assume we have
1200 * unlimited space available.
1201 */
1202 quota = UINT64_MAX;
1203 parentspace = UINT64_MAX;
1204
1205 if (dd->dd_parent != NULL) {
1206 parentspace = dsl_dir_space_available(dd->dd_parent,
1207 ancestor, delta, ondiskonly);
1208 }
1209
1210 mutex_enter(&dd->dd_lock);
d683ddbb
JG
1211 if (dsl_dir_phys(dd)->dd_quota != 0)
1212 quota = dsl_dir_phys(dd)->dd_quota;
1213 used = dsl_dir_phys(dd)->dd_used_bytes;
34dc7c2f
BB
1214 if (!ondiskonly)
1215 used += dsl_dir_space_towrite(dd);
34dc7c2f
BB
1216
1217 if (dd->dd_parent == NULL) {
d2734cce
SD
1218 uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool,
1219 ZFS_SPACE_CHECK_NORMAL);
34dc7c2f
BB
1220 quota = MIN(quota, poolsize);
1221 }
1222
d683ddbb 1223 if (dsl_dir_phys(dd)->dd_reserved > used && parentspace != UINT64_MAX) {
34dc7c2f
BB
1224 /*
1225 * We have some space reserved, in addition to what our
1226 * parent gave us.
1227 */
d683ddbb 1228 parentspace += dsl_dir_phys(dd)->dd_reserved - used;
34dc7c2f
BB
1229 }
1230
b128c09f
BB
1231 if (dd == ancestor) {
1232 ASSERT(delta <= 0);
1233 ASSERT(used >= -delta);
1234 used += delta;
1235 if (parentspace != UINT64_MAX)
1236 parentspace -= delta;
1237 }
1238
34dc7c2f
BB
1239 if (used > quota) {
1240 /* over quota */
1241 myspace = 0;
34dc7c2f
BB
1242 } else {
1243 /*
1244 * the lesser of the space provided by our parent and
1245 * the space left in our quota
1246 */
1247 myspace = MIN(parentspace, quota - used);
1248 }
1249
1250 mutex_exit(&dd->dd_lock);
1251
1252 return (myspace);
1253}
1254
1255struct tempreserve {
1256 list_node_t tr_node;
34dc7c2f
BB
1257 dsl_dir_t *tr_ds;
1258 uint64_t tr_size;
1259};
1260
1261static int
1262dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
3ec3bc21 1263 boolean_t ignorequota, list_t *tr_list,
34dc7c2f
BB
1264 dmu_tx_t *tx, boolean_t first)
1265{
419c80e6 1266 uint64_t txg;
3ec3bc21 1267 uint64_t quota;
34dc7c2f 1268 struct tempreserve *tr;
419c80e6 1269 int retval;
8af08a69 1270 uint64_t ext_quota;
419c80e6
D
1271 uint64_t ref_rsrv;
1272
1273top_of_function:
1274 txg = tx->tx_txg;
1275 retval = EDQUOT;
1276 ref_rsrv = 0;
34dc7c2f
BB
1277
1278 ASSERT3U(txg, !=, 0);
1279 ASSERT3S(asize, >, 0);
1280
1281 mutex_enter(&dd->dd_lock);
1282
1283 /*
1284 * Check against the dsl_dir's quota. We don't add in the delta
1285 * when checking for over-quota because they get one free hit.
1286 */
3ec3bc21
BB
1287 uint64_t est_inflight = dsl_dir_space_towrite(dd);
1288 for (int i = 0; i < TXG_SIZE; i++)
34dc7c2f 1289 est_inflight += dd->dd_tempreserved[i];
3ec3bc21 1290 uint64_t used_on_disk = dsl_dir_phys(dd)->dd_used_bytes;
34dc7c2f
BB
1291
1292 /*
1293 * On the first iteration, fetch the dataset's used-on-disk and
1294 * refreservation values. Also, if checkrefquota is set, test if
1295 * allocating this space would exceed the dataset's refquota.
1296 */
1297 if (first && tx->tx_objset) {
1298 int error;
428870ff 1299 dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset;
34dc7c2f 1300
3ec3bc21 1301 error = dsl_dataset_check_quota(ds, !netfree,
34dc7c2f 1302 asize, est_inflight, &used_on_disk, &ref_rsrv);
3ec3bc21 1303 if (error != 0) {
34dc7c2f 1304 mutex_exit(&dd->dd_lock);
3d920a15 1305 DMU_TX_STAT_BUMP(dmu_tx_quota);
34dc7c2f
BB
1306 return (error);
1307 }
1308 }
1309
1310 /*
1311 * If this transaction will result in a net free of space,
1312 * we want to let it through.
1313 */
d683ddbb 1314 if (ignorequota || netfree || dsl_dir_phys(dd)->dd_quota == 0)
34dc7c2f
BB
1315 quota = UINT64_MAX;
1316 else
d683ddbb 1317 quota = dsl_dir_phys(dd)->dd_quota;
34dc7c2f
BB
1318
1319 /*
428870ff
BB
1320 * Adjust the quota against the actual pool size at the root
1321 * minus any outstanding deferred frees.
34dc7c2f
BB
1322 * To ensure that it's possible to remove files from a full
1323 * pool without inducing transient overcommits, we throttle
1324 * netfree transactions against a quota that is slightly larger,
1325 * but still within the pool's allocation slop. In cases where
1326 * we're very close to full, this will allow a steady trickle of
1327 * removes to get through.
1328 */
1329 if (dd->dd_parent == NULL) {
d2734cce
SD
1330 uint64_t avail = dsl_pool_unreserved_space(dd->dd_pool,
1331 (netfree) ?
1332 ZFS_SPACE_CHECK_RESERVED : ZFS_SPACE_CHECK_NORMAL);
1333
1334 if (avail < quota) {
1335 quota = avail;
28caa74b 1336 retval = SET_ERROR(ENOSPC);
34dc7c2f
BB
1337 }
1338 }
1339
1340 /*
1341 * If they are requesting more space, and our current estimate
1342 * is over quota, they get to try again unless the actual
6df43169
BB
1343 * on-disk is over quota and there are no pending changes
1344 * or deferred frees (which may free up space for us).
34dc7c2f 1345 */
8af08a69
MZ
1346 ext_quota = quota >> 5;
1347 if (quota == UINT64_MAX)
1348 ext_quota = 0;
1349
1350 if (used_on_disk >= quota) {
1351 /* Quota exceeded */
1352 mutex_exit(&dd->dd_lock);
1353 DMU_TX_STAT_BUMP(dmu_tx_quota);
1354 return (retval);
1355 } else if (used_on_disk + est_inflight >= quota + ext_quota) {
6df43169
BB
1356 if (est_inflight > 0 || used_on_disk < quota) {
1357 retval = SET_ERROR(ERESTART);
1358 } else {
1359 ASSERT3U(used_on_disk, >=, quota);
1360
1361 if (retval == ENOSPC && (used_on_disk - quota) <
1362 dsl_pool_deferred_space(dd->dd_pool)) {
1363 retval = SET_ERROR(ERESTART);
1364 }
1365 }
1366
34dc7c2f
BB
1367 dprintf_dd(dd, "failing: used=%lluK inflight = %lluK "
1368 "quota=%lluK tr=%lluK err=%d\n",
8e739b2c
RE
1369 (u_longlong_t)used_on_disk>>10,
1370 (u_longlong_t)est_inflight>>10,
1371 (u_longlong_t)quota>>10, (u_longlong_t)asize>>10, retval);
34dc7c2f 1372 mutex_exit(&dd->dd_lock);
3d920a15 1373 DMU_TX_STAT_BUMP(dmu_tx_quota);
6df43169 1374 return (retval);
34dc7c2f
BB
1375 }
1376
1377 /* We need to up our estimated delta before dropping dd_lock */
3ec3bc21 1378 dd->dd_tempreserved[txg & TXG_MASK] += asize;
34dc7c2f 1379
3ec3bc21 1380 uint64_t parent_rsrv = parent_delta(dd, used_on_disk + est_inflight,
34dc7c2f
BB
1381 asize - ref_rsrv);
1382 mutex_exit(&dd->dd_lock);
1383
79c76d5b 1384 tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
34dc7c2f
BB
1385 tr->tr_ds = dd;
1386 tr->tr_size = asize;
1387 list_insert_tail(tr_list, tr);
1388
1389 /* see if it's OK with our parent */
3ec3bc21 1390 if (dd->dd_parent != NULL && parent_rsrv != 0) {
419c80e6
D
1391 /*
1392 * Recurse on our parent without recursion. This has been
1393 * observed to be potentially large stack usage even within
1394 * the test suite. Largest seen stack was 7632 bytes on linux.
1395 */
1396
1397 dd = dd->dd_parent;
1398 asize = parent_rsrv;
1399 ignorequota = (dsl_dir_phys(dd)->dd_head_dataset_obj == 0);
1400 first = B_FALSE;
1401 goto top_of_function;
34dc7c2f 1402
34dc7c2f
BB
1403 } else {
1404 return (0);
1405 }
1406}
1407
1408/*
1409 * Reserve space in this dsl_dir, to be used in this tx's txg.
1410 * After the space has been dirtied (and dsl_dir_willuse_space()
1411 * has been called), the reservation should be canceled, using
1412 * dsl_dir_tempreserve_clear().
1413 */
1414int
1415dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
3ec3bc21 1416 boolean_t netfree, void **tr_cookiep, dmu_tx_t *tx)
34dc7c2f
BB
1417{
1418 int err;
1419 list_t *tr_list;
1420
1421 if (asize == 0) {
1422 *tr_cookiep = NULL;
1423 return (0);
1424 }
1425
79c76d5b 1426 tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
34dc7c2f
BB
1427 list_create(tr_list, sizeof (struct tempreserve),
1428 offsetof(struct tempreserve, tr_node));
1429 ASSERT3S(asize, >, 0);
34dc7c2f 1430
dae3e9ea 1431 err = arc_tempreserve_space(dd->dd_pool->dp_spa, lsize, tx->tx_txg);
34dc7c2f
BB
1432 if (err == 0) {
1433 struct tempreserve *tr;
1434
79c76d5b 1435 tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
34dc7c2f
BB
1436 tr->tr_size = lsize;
1437 list_insert_tail(tr_list, tr);
34dc7c2f
BB
1438 } else {
1439 if (err == EAGAIN) {
e8b96c60
MA
1440 /*
1441 * If arc_memory_throttle() detected that pageout
1442 * is running and we are low on memory, we delay new
1443 * non-pageout transactions to give pageout an
1444 * advantage.
1445 *
1446 * It is unfortunate to be delaying while the caller's
1447 * locks are held.
1448 */
63fd3c6c
AL
1449 txg_delay(dd->dd_pool, tx->tx_txg,
1450 MSEC2NSEC(10), MSEC2NSEC(10));
2e528b49 1451 err = SET_ERROR(ERESTART);
34dc7c2f 1452 }
34dc7c2f
BB
1453 }
1454
1455 if (err == 0) {
3ec3bc21
BB
1456 err = dsl_dir_tempreserve_impl(dd, asize, netfree,
1457 B_FALSE, tr_list, tx, B_TRUE);
34dc7c2f
BB
1458 }
1459
13fe0198 1460 if (err != 0)
34dc7c2f
BB
1461 dsl_dir_tempreserve_clear(tr_list, tx);
1462 else
1463 *tr_cookiep = tr_list;
1464
1465 return (err);
1466}
1467
1468/*
1469 * Clear a temporary reservation that we previously made with
1470 * dsl_dir_tempreserve_space().
1471 */
1472void
1473dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)
1474{
1475 int txgidx = tx->tx_txg & TXG_MASK;
1476 list_t *tr_list = tr_cookie;
1477 struct tempreserve *tr;
1478
1479 ASSERT3U(tx->tx_txg, !=, 0);
1480
1481 if (tr_cookie == NULL)
1482 return;
1483
e8b96c60
MA
1484 while ((tr = list_head(tr_list)) != NULL) {
1485 if (tr->tr_ds) {
34dc7c2f
BB
1486 mutex_enter(&tr->tr_ds->dd_lock);
1487 ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=,
1488 tr->tr_size);
1489 tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size;
1490 mutex_exit(&tr->tr_ds->dd_lock);
1491 } else {
1492 arc_tempreserve_clear(tr->tr_size);
1493 }
1494 list_remove(tr_list, tr);
1495 kmem_free(tr, sizeof (struct tempreserve));
1496 }
1497
1498 kmem_free(tr_list, sizeof (list_t));
1499}
1500
e8b96c60
MA
1501/*
1502 * This should be called from open context when we think we're going to write
1503 * or free space, for example when dirtying data. Be conservative; it's okay
1504 * to write less space or free more, but we don't want to write more or free
1505 * less than the amount specified.
1ba16159
AB
1506 *
1507 * NOTE: The behavior of this function is identical to the Illumos / FreeBSD
e1cfd73f 1508 * version however it has been adjusted to use an iterative rather than
1ba16159 1509 * recursive algorithm to minimize stack usage.
e8b96c60
MA
1510 */
1511void
1512dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
34dc7c2f
BB
1513{
1514 int64_t parent_space;
1515 uint64_t est_used;
1516
1ba16159
AB
1517 do {
1518 mutex_enter(&dd->dd_lock);
1519 if (space > 0)
1520 dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space;
34dc7c2f 1521
1ba16159 1522 est_used = dsl_dir_space_towrite(dd) +
d683ddbb 1523 dsl_dir_phys(dd)->dd_used_bytes;
1ba16159
AB
1524 parent_space = parent_delta(dd, est_used, space);
1525 mutex_exit(&dd->dd_lock);
34dc7c2f 1526
1ba16159
AB
1527 /* Make sure that we clean up dd_space_to* */
1528 dsl_dir_dirty(dd, tx);
34dc7c2f 1529
1ba16159
AB
1530 dd = dd->dd_parent;
1531 space = parent_space;
1532 } while (space && dd);
34dc7c2f
BB
1533}
1534
1535/* call from syncing context when we actually write/free space for this dd */
1536void
b128c09f 1537dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
34dc7c2f
BB
1538 int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx)
1539{
1540 int64_t accounted_delta;
a169a625 1541
c1b5869b
AM
1542 ASSERT(dmu_tx_is_syncing(tx));
1543 ASSERT(type < DD_USED_NUM);
1544
1545 dmu_buf_will_dirty(dd->dd_dbuf, tx);
1546
a169a625
MA
1547 /*
1548 * dsl_dataset_set_refreservation_sync_impl() calls this with
1549 * dd_lock held, so that it can atomically update
1550 * ds->ds_reserved and the dsl_dir accounting, so that
1551 * dsl_dataset_check_quota() can see dataset and dir accounting
1552 * consistently.
1553 */
b128c09f 1554 boolean_t needlock = !MUTEX_HELD(&dd->dd_lock);
b128c09f
BB
1555 if (needlock)
1556 mutex_enter(&dd->dd_lock);
c1b5869b
AM
1557 dsl_dir_phys_t *ddp = dsl_dir_phys(dd);
1558 accounted_delta = parent_delta(dd, ddp->dd_used_bytes, used);
1559 ASSERT(used >= 0 || ddp->dd_used_bytes >= -used);
1560 ASSERT(compressed >= 0 || ddp->dd_compressed_bytes >= -compressed);
34dc7c2f 1561 ASSERT(uncompressed >= 0 ||
c1b5869b
AM
1562 ddp->dd_uncompressed_bytes >= -uncompressed);
1563 ddp->dd_used_bytes += used;
1564 ddp->dd_uncompressed_bytes += uncompressed;
1565 ddp->dd_compressed_bytes += compressed;
1566
1567 if (ddp->dd_flags & DD_FLAG_USED_BREAKDOWN) {
1568 ASSERT(used >= 0 || ddp->dd_used_breakdown[type] >= -used);
1569 ddp->dd_used_breakdown[type] += used;
6d8da841 1570#ifdef ZFS_DEBUG
d6320ddb
BB
1571 {
1572 dd_used_t t;
1573 uint64_t u = 0;
1574 for (t = 0; t < DD_USED_NUM; t++)
c1b5869b
AM
1575 u += ddp->dd_used_breakdown[t];
1576 ASSERT3U(u, ==, ddp->dd_used_bytes);
d6320ddb 1577 }
b128c09f
BB
1578#endif
1579 }
1580 if (needlock)
1581 mutex_exit(&dd->dd_lock);
34dc7c2f
BB
1582
1583 if (dd->dd_parent != NULL) {
c1b5869b
AM
1584 dsl_dir_diduse_transfer_space(dd->dd_parent,
1585 accounted_delta, compressed, uncompressed,
1586 used, DD_USED_CHILD_RSRV, DD_USED_CHILD, tx);
34dc7c2f
BB
1587 }
1588}
1589
b128c09f
BB
1590void
1591dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
1592 dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx)
1593{
b128c09f
BB
1594 ASSERT(dmu_tx_is_syncing(tx));
1595 ASSERT(oldtype < DD_USED_NUM);
1596 ASSERT(newtype < DD_USED_NUM);
1597
c1b5869b 1598 dsl_dir_phys_t *ddp = dsl_dir_phys(dd);
d683ddbb 1599 if (delta == 0 ||
c1b5869b 1600 !(ddp->dd_flags & DD_FLAG_USED_BREAKDOWN))
b128c09f
BB
1601 return;
1602
a169a625
MA
1603 dmu_buf_will_dirty(dd->dd_dbuf, tx);
1604 mutex_enter(&dd->dd_lock);
b128c09f 1605 ASSERT(delta > 0 ?
c1b5869b
AM
1606 ddp->dd_used_breakdown[oldtype] >= delta :
1607 ddp->dd_used_breakdown[newtype] >= -delta);
1608 ASSERT(ddp->dd_used_bytes >= ABS(delta));
1609 ddp->dd_used_breakdown[oldtype] -= delta;
1610 ddp->dd_used_breakdown[newtype] += delta;
1611 mutex_exit(&dd->dd_lock);
1612}
1613
1614void
1615dsl_dir_diduse_transfer_space(dsl_dir_t *dd, int64_t used,
1616 int64_t compressed, int64_t uncompressed, int64_t tonew,
1617 dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx)
1618{
1619 int64_t accounted_delta;
1620
1621 ASSERT(dmu_tx_is_syncing(tx));
1622 ASSERT(oldtype < DD_USED_NUM);
1623 ASSERT(newtype < DD_USED_NUM);
1624
1625 dmu_buf_will_dirty(dd->dd_dbuf, tx);
1626
1627 mutex_enter(&dd->dd_lock);
1628 dsl_dir_phys_t *ddp = dsl_dir_phys(dd);
1629 accounted_delta = parent_delta(dd, ddp->dd_used_bytes, used);
1630 ASSERT(used >= 0 || ddp->dd_used_bytes >= -used);
1631 ASSERT(compressed >= 0 || ddp->dd_compressed_bytes >= -compressed);
1632 ASSERT(uncompressed >= 0 ||
1633 ddp->dd_uncompressed_bytes >= -uncompressed);
1634 ddp->dd_used_bytes += used;
1635 ddp->dd_uncompressed_bytes += uncompressed;
1636 ddp->dd_compressed_bytes += compressed;
1637
1638 if (ddp->dd_flags & DD_FLAG_USED_BREAKDOWN) {
1639 ASSERT(tonew - used <= 0 ||
1640 ddp->dd_used_breakdown[oldtype] >= tonew - used);
1641 ASSERT(tonew >= 0 ||
1642 ddp->dd_used_breakdown[newtype] >= -tonew);
1643 ddp->dd_used_breakdown[oldtype] -= tonew - used;
1644 ddp->dd_used_breakdown[newtype] += tonew;
1645#ifdef ZFS_DEBUG
1646 {
1647 dd_used_t t;
1648 uint64_t u = 0;
1649 for (t = 0; t < DD_USED_NUM; t++)
1650 u += ddp->dd_used_breakdown[t];
1651 ASSERT3U(u, ==, ddp->dd_used_bytes);
1652 }
1653#endif
1654 }
a169a625 1655 mutex_exit(&dd->dd_lock);
c1b5869b
AM
1656
1657 if (dd->dd_parent != NULL) {
1658 dsl_dir_diduse_transfer_space(dd->dd_parent,
1659 accounted_delta, compressed, uncompressed,
1660 used, DD_USED_CHILD_RSRV, DD_USED_CHILD, tx);
1661 }
b128c09f
BB
1662}
1663
13fe0198
MA
1664typedef struct dsl_dir_set_qr_arg {
1665 const char *ddsqra_name;
1666 zprop_source_t ddsqra_source;
1667 uint64_t ddsqra_value;
1668} dsl_dir_set_qr_arg_t;
1669
34dc7c2f 1670static int
13fe0198 1671dsl_dir_set_quota_check(void *arg, dmu_tx_t *tx)
34dc7c2f 1672{
13fe0198
MA
1673 dsl_dir_set_qr_arg_t *ddsqra = arg;
1674 dsl_pool_t *dp = dmu_tx_pool(tx);
1675 dsl_dataset_t *ds;
1676 int error;
1677 uint64_t towrite, newval;
34dc7c2f 1678
13fe0198
MA
1679 error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
1680 if (error != 0)
1681 return (error);
1682
1683 error = dsl_prop_predict(ds->ds_dir, "quota",
1684 ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
1685 if (error != 0) {
1686 dsl_dataset_rele(ds, FTAG);
1687 return (error);
1688 }
428870ff 1689
13fe0198
MA
1690 if (newval == 0) {
1691 dsl_dataset_rele(ds, FTAG);
34dc7c2f 1692 return (0);
13fe0198 1693 }
34dc7c2f 1694
13fe0198 1695 mutex_enter(&ds->ds_dir->dd_lock);
34dc7c2f
BB
1696 /*
1697 * If we are doing the preliminary check in open context, and
1698 * there are pending changes, then don't fail it, since the
1699 * pending changes could under-estimate the amount of space to be
1700 * freed up.
1701 */
13fe0198 1702 towrite = dsl_dir_space_towrite(ds->ds_dir);
34dc7c2f 1703 if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
d683ddbb
JG
1704 (newval < dsl_dir_phys(ds->ds_dir)->dd_reserved ||
1705 newval < dsl_dir_phys(ds->ds_dir)->dd_used_bytes + towrite)) {
2e528b49 1706 error = SET_ERROR(ENOSPC);
34dc7c2f 1707 }
13fe0198
MA
1708 mutex_exit(&ds->ds_dir->dd_lock);
1709 dsl_dataset_rele(ds, FTAG);
1710 return (error);
34dc7c2f
BB
1711}
1712
34dc7c2f 1713static void
13fe0198 1714dsl_dir_set_quota_sync(void *arg, dmu_tx_t *tx)
34dc7c2f 1715{
13fe0198
MA
1716 dsl_dir_set_qr_arg_t *ddsqra = arg;
1717 dsl_pool_t *dp = dmu_tx_pool(tx);
1718 dsl_dataset_t *ds;
1719 uint64_t newval;
428870ff 1720
13fe0198 1721 VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
34dc7c2f 1722
b1118acb
MM
1723 if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) {
1724 dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_QUOTA),
1725 ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
1726 &ddsqra->ddsqra_value, tx);
34dc7c2f 1727
b1118acb
MM
1728 VERIFY0(dsl_prop_get_int_ds(ds,
1729 zfs_prop_to_name(ZFS_PROP_QUOTA), &newval));
1730 } else {
1731 newval = ddsqra->ddsqra_value;
1732 spa_history_log_internal_ds(ds, "set", tx, "%s=%lld",
1733 zfs_prop_to_name(ZFS_PROP_QUOTA), (longlong_t)newval);
1734 }
6f1ffb06 1735
13fe0198
MA
1736 dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
1737 mutex_enter(&ds->ds_dir->dd_lock);
d683ddbb 1738 dsl_dir_phys(ds->ds_dir)->dd_quota = newval;
13fe0198
MA
1739 mutex_exit(&ds->ds_dir->dd_lock);
1740 dsl_dataset_rele(ds, FTAG);
34dc7c2f
BB
1741}
1742
1743int
428870ff 1744dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota)
34dc7c2f 1745{
13fe0198 1746 dsl_dir_set_qr_arg_t ddsqra;
428870ff 1747
13fe0198
MA
1748 ddsqra.ddsqra_name = ddname;
1749 ddsqra.ddsqra_source = source;
1750 ddsqra.ddsqra_value = quota;
428870ff 1751
13fe0198 1752 return (dsl_sync_task(ddname, dsl_dir_set_quota_check,
d2734cce
SD
1753 dsl_dir_set_quota_sync, &ddsqra, 0,
1754 ZFS_SPACE_CHECK_EXTRA_RESERVED));
34dc7c2f
BB
1755}
1756
65c7cc49 1757static int
13fe0198 1758dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx)
34dc7c2f 1759{
13fe0198
MA
1760 dsl_dir_set_qr_arg_t *ddsqra = arg;
1761 dsl_pool_t *dp = dmu_tx_pool(tx);
1762 dsl_dataset_t *ds;
1763 dsl_dir_t *dd;
1764 uint64_t newval, used, avail;
1765 int error;
428870ff 1766
13fe0198
MA
1767 error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
1768 if (error != 0)
1769 return (error);
1770 dd = ds->ds_dir;
34dc7c2f
BB
1771
1772 /*
1773 * If we are doing the preliminary check in open context, the
1774 * space estimates may be inaccurate.
1775 */
13fe0198
MA
1776 if (!dmu_tx_is_syncing(tx)) {
1777 dsl_dataset_rele(ds, FTAG);
34dc7c2f 1778 return (0);
13fe0198
MA
1779 }
1780
1781 error = dsl_prop_predict(ds->ds_dir,
1782 zfs_prop_to_name(ZFS_PROP_RESERVATION),
1783 ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
1784 if (error != 0) {
1785 dsl_dataset_rele(ds, FTAG);
1786 return (error);
1787 }
34dc7c2f
BB
1788
1789 mutex_enter(&dd->dd_lock);
d683ddbb 1790 used = dsl_dir_phys(dd)->dd_used_bytes;
34dc7c2f
BB
1791 mutex_exit(&dd->dd_lock);
1792
1793 if (dd->dd_parent) {
1794 avail = dsl_dir_space_available(dd->dd_parent,
1795 NULL, 0, FALSE);
1796 } else {
d2734cce
SD
1797 avail = dsl_pool_adjustedsize(dd->dd_pool,
1798 ZFS_SPACE_CHECK_NORMAL) - used;
34dc7c2f
BB
1799 }
1800
d683ddbb 1801 if (MAX(used, newval) > MAX(used, dsl_dir_phys(dd)->dd_reserved)) {
13fe0198 1802 uint64_t delta = MAX(used, newval) -
d683ddbb 1803 MAX(used, dsl_dir_phys(dd)->dd_reserved);
d164b209 1804
13fe0198 1805 if (delta > avail ||
d683ddbb
JG
1806 (dsl_dir_phys(dd)->dd_quota > 0 &&
1807 newval > dsl_dir_phys(dd)->dd_quota))
2e528b49 1808 error = SET_ERROR(ENOSPC);
d164b209
BB
1809 }
1810
13fe0198
MA
1811 dsl_dataset_rele(ds, FTAG);
1812 return (error);
34dc7c2f
BB
1813}
1814
13fe0198 1815void
6f1ffb06 1816dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx)
34dc7c2f 1817{
34dc7c2f
BB
1818 uint64_t used;
1819 int64_t delta;
1820
1821 dmu_buf_will_dirty(dd->dd_dbuf, tx);
1822
1823 mutex_enter(&dd->dd_lock);
d683ddbb
JG
1824 used = dsl_dir_phys(dd)->dd_used_bytes;
1825 delta = MAX(used, value) - MAX(used, dsl_dir_phys(dd)->dd_reserved);
1826 dsl_dir_phys(dd)->dd_reserved = value;
34dc7c2f
BB
1827
1828 if (dd->dd_parent != NULL) {
1829 /* Roll up this additional usage into our ancestors */
b128c09f
BB
1830 dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
1831 delta, 0, 0, tx);
34dc7c2f 1832 }
b128c09f 1833 mutex_exit(&dd->dd_lock);
34dc7c2f
BB
1834}
1835
6f1ffb06 1836static void
13fe0198 1837dsl_dir_set_reservation_sync(void *arg, dmu_tx_t *tx)
6f1ffb06 1838{
13fe0198
MA
1839 dsl_dir_set_qr_arg_t *ddsqra = arg;
1840 dsl_pool_t *dp = dmu_tx_pool(tx);
1841 dsl_dataset_t *ds;
1842 uint64_t newval;
6f1ffb06 1843
13fe0198
MA
1844 VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
1845
b1118acb
MM
1846 if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) {
1847 dsl_prop_set_sync_impl(ds,
1848 zfs_prop_to_name(ZFS_PROP_RESERVATION),
1849 ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
1850 &ddsqra->ddsqra_value, tx);
d1d7e268 1851
b1118acb
MM
1852 VERIFY0(dsl_prop_get_int_ds(ds,
1853 zfs_prop_to_name(ZFS_PROP_RESERVATION), &newval));
1854 } else {
1855 newval = ddsqra->ddsqra_value;
1856 spa_history_log_internal_ds(ds, "set", tx, "%s=%lld",
1857 zfs_prop_to_name(ZFS_PROP_RESERVATION),
1858 (longlong_t)newval);
1859 }
1860
13fe0198
MA
1861 dsl_dir_set_reservation_sync_impl(ds->ds_dir, newval, tx);
1862 dsl_dataset_rele(ds, FTAG);
d1d7e268 1863}
6f1ffb06 1864
34dc7c2f 1865int
428870ff
BB
1866dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
1867 uint64_t reservation)
34dc7c2f 1868{
13fe0198 1869 dsl_dir_set_qr_arg_t ddsqra;
428870ff 1870
13fe0198
MA
1871 ddsqra.ddsqra_name = ddname;
1872 ddsqra.ddsqra_source = source;
1873 ddsqra.ddsqra_value = reservation;
428870ff 1874
13fe0198 1875 return (dsl_sync_task(ddname, dsl_dir_set_reservation_check,
d2734cce
SD
1876 dsl_dir_set_reservation_sync, &ddsqra, 0,
1877 ZFS_SPACE_CHECK_EXTRA_RESERVED));
34dc7c2f
BB
1878}
1879
1880static dsl_dir_t *
1881closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2)
1882{
1883 for (; ds1; ds1 = ds1->dd_parent) {
1884 dsl_dir_t *dd;
1885 for (dd = ds2; dd; dd = dd->dd_parent) {
1886 if (ds1 == dd)
1887 return (dd);
1888 }
1889 }
1890 return (NULL);
1891}
1892
1893/*
1894 * If delta is applied to dd, how much of that delta would be applied to
1895 * ancestor? Syncing context only.
1896 */
1897static int64_t
1898would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor)
1899{
1900 if (dd == ancestor)
1901 return (delta);
1902
1903 mutex_enter(&dd->dd_lock);
d683ddbb 1904 delta = parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, delta);
34dc7c2f
BB
1905 mutex_exit(&dd->dd_lock);
1906 return (would_change(dd->dd_parent, delta, ancestor));
1907}
1908
13fe0198
MA
1909typedef struct dsl_dir_rename_arg {
1910 const char *ddra_oldname;
1911 const char *ddra_newname;
788eb90c 1912 cred_t *ddra_cred;
e59a377a 1913 proc_t *ddra_proc;
13fe0198 1914} dsl_dir_rename_arg_t;
34dc7c2f 1915
a7ed98d8
SD
/*
 * Argument for dsl_valid_rename(): how much a rename changes the length and
 * the nesting depth of every affected dataset name.
 */
typedef struct dsl_valid_rename_arg {
	int char_delta;		/* strlen(newname) - strlen(oldname) */
	int nest_delta;		/* depth(newname) - depth(oldname) */
} dsl_valid_rename_arg_t;
1920
34dc7c2f 1921static int
13fe0198 1922dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
34dc7c2f 1923{
14e4e3cb 1924 (void) dp;
a7ed98d8 1925 dsl_valid_rename_arg_t *dvra = arg;
eca7b760 1926 char namebuf[ZFS_MAX_DATASET_NAME_LEN];
34dc7c2f 1927
13fe0198
MA
1928 dsl_dataset_name(ds, namebuf);
1929
a7ed98d8
SD
1930 ASSERT3U(strnlen(namebuf, ZFS_MAX_DATASET_NAME_LEN),
1931 <, ZFS_MAX_DATASET_NAME_LEN);
1932 int namelen = strlen(namebuf) + dvra->char_delta;
1933 int depth = get_dataset_depth(namebuf) + dvra->nest_delta;
1934
1935 if (namelen >= ZFS_MAX_DATASET_NAME_LEN)
1936 return (SET_ERROR(ENAMETOOLONG));
1937 if (dvra->nest_delta > 0 && depth >= zfs_max_dataset_nesting)
2e528b49 1938 return (SET_ERROR(ENAMETOOLONG));
13fe0198
MA
1939 return (0);
1940}
1941
1942static int
1943dsl_dir_rename_check(void *arg, dmu_tx_t *tx)
1944{
1945 dsl_dir_rename_arg_t *ddra = arg;
1946 dsl_pool_t *dp = dmu_tx_pool(tx);
1947 dsl_dir_t *dd, *newparent;
a7ed98d8 1948 dsl_valid_rename_arg_t dvra;
d8d418ff 1949 dsl_dataset_t *parentds;
1950 objset_t *parentos;
13fe0198
MA
1951 const char *mynewname;
1952 int error;
34dc7c2f 1953
13fe0198
MA
1954 /* target dir should exist */
1955 error = dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL);
1956 if (error != 0)
1957 return (error);
1958
1959 /* new parent should exist */
1960 error = dsl_dir_hold(dp, ddra->ddra_newname, FTAG,
1961 &newparent, &mynewname);
1962 if (error != 0) {
1963 dsl_dir_rele(dd, FTAG);
1964 return (error);
1965 }
1966
1967 /* can't rename to different pool */
1968 if (dd->dd_pool != newparent->dd_pool) {
1969 dsl_dir_rele(newparent, FTAG);
1970 dsl_dir_rele(dd, FTAG);
9063f654 1971 return (SET_ERROR(EXDEV));
13fe0198
MA
1972 }
1973
1974 /* new name should not already exist */
1975 if (mynewname == NULL) {
1976 dsl_dir_rele(newparent, FTAG);
1977 dsl_dir_rele(dd, FTAG);
2e528b49 1978 return (SET_ERROR(EEXIST));
13fe0198
MA
1979 }
1980
d8d418ff 1981 /* can't rename below anything but filesystems (eg. no ZVOLs) */
1982 error = dsl_dataset_hold_obj(newparent->dd_pool,
1983 dsl_dir_phys(newparent)->dd_head_dataset_obj, FTAG, &parentds);
1984 if (error != 0) {
1985 dsl_dir_rele(newparent, FTAG);
1986 dsl_dir_rele(dd, FTAG);
1987 return (error);
1988 }
1989 error = dmu_objset_from_ds(parentds, &parentos);
1990 if (error != 0) {
1991 dsl_dataset_rele(parentds, FTAG);
1992 dsl_dir_rele(newparent, FTAG);
1993 dsl_dir_rele(dd, FTAG);
1994 return (error);
1995 }
1996 if (dmu_objset_type(parentos) != DMU_OST_ZFS) {
1997 dsl_dataset_rele(parentds, FTAG);
1998 dsl_dir_rele(newparent, FTAG);
1999 dsl_dir_rele(dd, FTAG);
2000 return (SET_ERROR(ZFS_ERR_WRONG_PARENT));
2001 }
2002 dsl_dataset_rele(parentds, FTAG);
2003
a7ed98d8
SD
2004 ASSERT3U(strnlen(ddra->ddra_newname, ZFS_MAX_DATASET_NAME_LEN),
2005 <, ZFS_MAX_DATASET_NAME_LEN);
2006 ASSERT3U(strnlen(ddra->ddra_oldname, ZFS_MAX_DATASET_NAME_LEN),
2007 <, ZFS_MAX_DATASET_NAME_LEN);
2008 dvra.char_delta = strlen(ddra->ddra_newname)
2009 - strlen(ddra->ddra_oldname);
2010 dvra.nest_delta = get_dataset_depth(ddra->ddra_newname)
2011 - get_dataset_depth(ddra->ddra_oldname);
2012
13fe0198 2013 /* if the name length is growing, validate child name lengths */
a7ed98d8 2014 if (dvra.char_delta > 0 || dvra.nest_delta > 0) {
13fe0198 2015 error = dmu_objset_find_dp(dp, dd->dd_object, dsl_valid_rename,
a7ed98d8 2016 &dvra, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
13fe0198
MA
2017 if (error != 0) {
2018 dsl_dir_rele(newparent, FTAG);
2019 dsl_dir_rele(dd, FTAG);
2020 return (error);
2021 }
2022 }
34dc7c2f 2023
788eb90c 2024 if (dmu_tx_is_syncing(tx)) {
a0c9a17a 2025 if (spa_feature_is_active(dp->dp_spa,
788eb90c
JJ
2026 SPA_FEATURE_FS_SS_LIMIT)) {
2027 /*
2028 * Although this is the check function and we don't
2029 * normally make on-disk changes in check functions,
2030 * we need to do that here.
2031 *
2032 * Ensure this portion of the tree's counts have been
2033 * initialized in case the new parent has limits set.
2034 */
2035 dsl_dir_init_fs_ss_count(dd, tx);
2036 }
2037 }
2038
13fe0198 2039 if (newparent != dd->dd_parent) {
34dc7c2f
BB
2040 /* is there enough space? */
2041 uint64_t myspace =
d683ddbb
JG
2042 MAX(dsl_dir_phys(dd)->dd_used_bytes,
2043 dsl_dir_phys(dd)->dd_reserved);
788eb90c
JJ
2044 objset_t *os = dd->dd_pool->dp_meta_objset;
2045 uint64_t fs_cnt = 0;
2046 uint64_t ss_cnt = 0;
2047
2048 if (dsl_dir_is_zapified(dd)) {
2049 int err;
2050
2051 err = zap_lookup(os, dd->dd_object,
2052 DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1,
2053 &fs_cnt);
a0c9a17a
JJ
2054 if (err != ENOENT && err != 0) {
2055 dsl_dir_rele(newparent, FTAG);
2056 dsl_dir_rele(dd, FTAG);
788eb90c 2057 return (err);
a0c9a17a 2058 }
788eb90c
JJ
2059
2060 /*
2061 * have to add 1 for the filesystem itself that we're
2062 * moving
2063 */
2064 fs_cnt++;
2065
2066 err = zap_lookup(os, dd->dd_object,
2067 DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1,
2068 &ss_cnt);
a0c9a17a
JJ
2069 if (err != ENOENT && err != 0) {
2070 dsl_dir_rele(newparent, FTAG);
2071 dsl_dir_rele(dd, FTAG);
788eb90c 2072 return (err);
a0c9a17a 2073 }
788eb90c 2074 }
34dc7c2f 2075
b5256303
TC
2076 /* check for encryption errors */
2077 error = dsl_dir_rename_crypt_check(dd, newparent);
2078 if (error != 0) {
2079 dsl_dir_rele(newparent, FTAG);
2080 dsl_dir_rele(dd, FTAG);
2081 return (SET_ERROR(EACCES));
2082 }
2083
34dc7c2f 2084 /* no rename into our descendant */
13fe0198
MA
2085 if (closest_common_ancestor(dd, newparent) == dd) {
2086 dsl_dir_rele(newparent, FTAG);
2087 dsl_dir_rele(dd, FTAG);
2e528b49 2088 return (SET_ERROR(EINVAL));
13fe0198 2089 }
34dc7c2f 2090
13fe0198 2091 error = dsl_dir_transfer_possible(dd->dd_parent,
e59a377a
MA
2092 newparent, fs_cnt, ss_cnt, myspace,
2093 ddra->ddra_cred, ddra->ddra_proc);
13fe0198
MA
2094 if (error != 0) {
2095 dsl_dir_rele(newparent, FTAG);
2096 dsl_dir_rele(dd, FTAG);
2097 return (error);
2098 }
34dc7c2f
BB
2099 }
2100
13fe0198
MA
2101 dsl_dir_rele(newparent, FTAG);
2102 dsl_dir_rele(dd, FTAG);
34dc7c2f
BB
2103 return (0);
2104}
2105
2106static void
13fe0198 2107dsl_dir_rename_sync(void *arg, dmu_tx_t *tx)
34dc7c2f 2108{
13fe0198
MA
2109 dsl_dir_rename_arg_t *ddra = arg;
2110 dsl_pool_t *dp = dmu_tx_pool(tx);
2111 dsl_dir_t *dd, *newparent;
2112 const char *mynewname;
34dc7c2f 2113 objset_t *mos = dp->dp_meta_objset;
34dc7c2f 2114
13fe0198
MA
2115 VERIFY0(dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL));
2116 VERIFY0(dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent,
2117 &mynewname));
34dc7c2f 2118
6f1ffb06 2119 /* Log this before we change the name. */
6f1ffb06 2120 spa_history_log_internal_dd(dd, "rename", tx,
13fe0198 2121 "-> %s", ddra->ddra_newname);
6f1ffb06 2122
13fe0198 2123 if (newparent != dd->dd_parent) {
788eb90c
JJ
2124 objset_t *os = dd->dd_pool->dp_meta_objset;
2125 uint64_t fs_cnt = 0;
2126 uint64_t ss_cnt = 0;
2127
2128 /*
2129 * We already made sure the dd counts were initialized in the
2130 * check function.
2131 */
a0c9a17a 2132 if (spa_feature_is_active(dp->dp_spa,
788eb90c
JJ
2133 SPA_FEATURE_FS_SS_LIMIT)) {
2134 VERIFY0(zap_lookup(os, dd->dd_object,
2135 DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1,
2136 &fs_cnt));
2137 /* add 1 for the filesystem itself that we're moving */
2138 fs_cnt++;
2139
2140 VERIFY0(zap_lookup(os, dd->dd_object,
2141 DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1,
2142 &ss_cnt));
2143 }
2144
2145 dsl_fs_ss_count_adjust(dd->dd_parent, -fs_cnt,
2146 DD_FIELD_FILESYSTEM_COUNT, tx);
2147 dsl_fs_ss_count_adjust(newparent, fs_cnt,
2148 DD_FIELD_FILESYSTEM_COUNT, tx);
2149
2150 dsl_fs_ss_count_adjust(dd->dd_parent, -ss_cnt,
2151 DD_FIELD_SNAPSHOT_COUNT, tx);
2152 dsl_fs_ss_count_adjust(newparent, ss_cnt,
2153 DD_FIELD_SNAPSHOT_COUNT, tx);
2154
b128c09f 2155 dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
d683ddbb
JG
2156 -dsl_dir_phys(dd)->dd_used_bytes,
2157 -dsl_dir_phys(dd)->dd_compressed_bytes,
2158 -dsl_dir_phys(dd)->dd_uncompressed_bytes, tx);
13fe0198 2159 dsl_dir_diduse_space(newparent, DD_USED_CHILD,
d683ddbb
JG
2160 dsl_dir_phys(dd)->dd_used_bytes,
2161 dsl_dir_phys(dd)->dd_compressed_bytes,
2162 dsl_dir_phys(dd)->dd_uncompressed_bytes, tx);
b128c09f 2163
d683ddbb
JG
2164 if (dsl_dir_phys(dd)->dd_reserved >
2165 dsl_dir_phys(dd)->dd_used_bytes) {
2166 uint64_t unused_rsrv = dsl_dir_phys(dd)->dd_reserved -
2167 dsl_dir_phys(dd)->dd_used_bytes;
b128c09f
BB
2168
2169 dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
2170 -unused_rsrv, 0, 0, tx);
13fe0198 2171 dsl_dir_diduse_space(newparent, DD_USED_CHILD_RSRV,
b128c09f
BB
2172 unused_rsrv, 0, 0, tx);
2173 }
34dc7c2f
BB
2174 }
2175
2176 dmu_buf_will_dirty(dd->dd_dbuf, tx);
2177
2178 /* remove from old parent zapobj */
d87676a9 2179 VERIFY0(zap_remove(mos,
d683ddbb 2180 dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj,
d87676a9 2181 dd->dd_myname, tx));
34dc7c2f 2182
c9d61adb 2183 (void) strlcpy(dd->dd_myname, mynewname,
2184 sizeof (dd->dd_myname));
13fe0198 2185 dsl_dir_rele(dd->dd_parent, dd);
d683ddbb 2186 dsl_dir_phys(dd)->dd_parent_obj = newparent->dd_object;
13fe0198
MA
2187 VERIFY0(dsl_dir_hold_obj(dp,
2188 newparent->dd_object, NULL, dd, &dd->dd_parent));
34dc7c2f
BB
2189
2190 /* add to new parent zapobj */
d683ddbb 2191 VERIFY0(zap_add(mos, dsl_dir_phys(newparent)->dd_child_dir_zapobj,
13fe0198
MA
2192 dd->dd_myname, 8, 1, &dd->dd_object, tx));
2193
7b4e2723
RM
2194 /* TODO: A rename callback to avoid these layering violations. */
2195 zfsvfs_update_fromname(ddra->ddra_oldname, ddra->ddra_newname);
a0bd735a
BP
2196 zvol_rename_minors(dp->dp_spa, ddra->ddra_oldname,
2197 ddra->ddra_newname, B_TRUE);
ba6a2402 2198
13fe0198 2199 dsl_prop_notify_all(dd);
34dc7c2f 2200
13fe0198
MA
2201 dsl_dir_rele(newparent, FTAG);
2202 dsl_dir_rele(dd, FTAG);
34dc7c2f
BB
2203}
2204
2205int
13fe0198 2206dsl_dir_rename(const char *oldname, const char *newname)
34dc7c2f 2207{
13fe0198 2208 dsl_dir_rename_arg_t ddra;
34dc7c2f 2209
13fe0198
MA
2210 ddra.ddra_oldname = oldname;
2211 ddra.ddra_newname = newname;
788eb90c 2212 ddra.ddra_cred = CRED();
e59a377a 2213 ddra.ddra_proc = curproc;
34dc7c2f 2214
13fe0198 2215 return (dsl_sync_task(oldname,
3d45fdd6
MA
2216 dsl_dir_rename_check, dsl_dir_rename_sync, &ddra,
2217 3, ZFS_SPACE_CHECK_RESERVED));
34dc7c2f
BB
2218}
2219
2220int
788eb90c 2221dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd,
e59a377a
MA
2222 uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space,
2223 cred_t *cr, proc_t *proc)
34dc7c2f
BB
2224{
2225 dsl_dir_t *ancestor;
2226 int64_t adelta;
2227 uint64_t avail;
788eb90c 2228 int err;
34dc7c2f
BB
2229
2230 ancestor = closest_common_ancestor(sdd, tdd);
2231 adelta = would_change(sdd, -space, ancestor);
2232 avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE);
2233 if (avail < space)
2e528b49 2234 return (SET_ERROR(ENOSPC));
34dc7c2f 2235
788eb90c 2236 err = dsl_fs_ss_limit_check(tdd, fs_cnt, ZFS_PROP_FILESYSTEM_LIMIT,
e59a377a 2237 ancestor, cr, proc);
788eb90c
JJ
2238 if (err != 0)
2239 return (err);
2240 err = dsl_fs_ss_limit_check(tdd, ss_cnt, ZFS_PROP_SNAPSHOT_LIMIT,
e59a377a 2241 ancestor, cr, proc);
788eb90c
JJ
2242 if (err != 0)
2243 return (err);
2244
34dc7c2f
BB
2245 return (0);
2246}
428870ff 2247
6413c95f 2248inode_timespec_t
428870ff
BB
2249dsl_dir_snap_cmtime(dsl_dir_t *dd)
2250{
6413c95f 2251 inode_timespec_t t;
428870ff
BB
2252
2253 mutex_enter(&dd->dd_lock);
2254 t = dd->dd_snap_cmtime;
2255 mutex_exit(&dd->dd_lock);
2256
2257 return (t);
2258}
2259
2260void
9681de46 2261dsl_dir_snap_cmtime_update(dsl_dir_t *dd, dmu_tx_t *tx)
428870ff 2262{
a582d529 2263 dsl_pool_t *dp = dmu_tx_pool(tx);
6413c95f 2264 inode_timespec_t t;
428870ff 2265 gethrestime(&t);
a582d529 2266
428870ff
BB
2267 mutex_enter(&dd->dd_lock);
2268 dd->dd_snap_cmtime = t;
a582d529
US
2269 if (spa_feature_is_enabled(dp->dp_spa,
2270 SPA_FEATURE_EXTENSIBLE_DATASET)) {
2271 objset_t *mos = dd->dd_pool->dp_meta_objset;
2272 uint64_t ddobj = dd->dd_object;
2273 dsl_dir_zapify(dd, tx);
2274 VERIFY0(zap_update(mos, ddobj,
59767479 2275 DD_FIELD_SNAPSHOTS_CHANGED,
a582d529
US
2276 sizeof (uint64_t),
2277 sizeof (inode_timespec_t) / sizeof (uint64_t),
2278 &t, tx));
2279 }
428870ff
BB
2280 mutex_exit(&dd->dd_lock);
2281}
c28b2279 2282
fa86b5db
MA
2283void
2284dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx)
2285{
2286 objset_t *mos = dd->dd_pool->dp_meta_objset;
2287 dmu_object_zapify(mos, dd->dd_object, DMU_OT_DSL_DIR, tx);
2288}
2289
788eb90c
JJ
2290boolean_t
2291dsl_dir_is_zapified(dsl_dir_t *dd)
2292{
2293 dmu_object_info_t doi;
2294
2295 dmu_object_info_from_db(dd->dd_dbuf, &doi);
2296 return (doi.doi_type == DMU_OTN_ZAP_METADATA);
2297}
2298
37f03da8
SH
2299void
2300dsl_dir_livelist_open(dsl_dir_t *dd, uint64_t obj)
2301{
2302 objset_t *mos = dd->dd_pool->dp_meta_objset;
2303 ASSERT(spa_feature_is_active(dd->dd_pool->dp_spa,
2304 SPA_FEATURE_LIVELIST));
2305 dsl_deadlist_open(&dd->dd_livelist, mos, obj);
2306 bplist_create(&dd->dd_pending_allocs);
2307 bplist_create(&dd->dd_pending_frees);
2308}
2309
2310void
2311dsl_dir_livelist_close(dsl_dir_t *dd)
2312{
2313 dsl_deadlist_close(&dd->dd_livelist);
2314 bplist_destroy(&dd->dd_pending_allocs);
2315 bplist_destroy(&dd->dd_pending_frees);
2316}
2317
2318void
2319dsl_dir_remove_livelist(dsl_dir_t *dd, dmu_tx_t *tx, boolean_t total)
2320{
2321 uint64_t obj;
2322 dsl_pool_t *dp = dmu_tx_pool(tx);
2323 spa_t *spa = dp->dp_spa;
2324 livelist_condense_entry_t to_condense = spa->spa_to_condense;
2325
2326 if (!dsl_deadlist_is_open(&dd->dd_livelist))
2327 return;
2328
2329 /*
2330 * If the livelist being removed is set to be condensed, stop the
2331 * condense zthr and indicate the cancellation in the spa_to_condense
2332 * struct in case the condense no-wait synctask has already started
2333 */
2334 zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr;
2335 if (ll_condense_thread != NULL &&
2336 (to_condense.ds != NULL) && (to_condense.ds->ds_dir == dd)) {
d87676a9
MA
2337 /*
2338 * We use zthr_wait_cycle_done instead of zthr_cancel
2339 * because we don't want to destroy the zthr, just have
2340 * it skip its current task.
2341 */
2342 spa->spa_to_condense.cancelled = B_TRUE;
2343 zthr_wait_cycle_done(ll_condense_thread);
2344 /*
2345 * If we've returned from zthr_wait_cycle_done without
2346 * clearing the to_condense data structure it's either
2347 * because the no-wait synctask has started (which is
2348 * indicated by 'syncing' field of to_condense) and we
2349 * can expect it to clear to_condense on its own.
2350 * Otherwise, we returned before the zthr ran. The
2351 * checkfunc will now fail as cancelled == B_TRUE so we
2352 * can safely NULL out ds, allowing a different dir's
2353 * livelist to be condensed.
2354 *
2355 * We can be sure that the to_condense struct will not
2356 * be repopulated at this stage because both this
2357 * function and dsl_livelist_try_condense execute in
2358 * syncing context.
2359 */
2360 if ((spa->spa_to_condense.ds != NULL) &&
2361 !spa->spa_to_condense.syncing) {
2362 dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf,
2363 spa);
2364 spa->spa_to_condense.ds = NULL;
2365 }
37f03da8
SH
2366 }
2367
2368 dsl_dir_livelist_close(dd);
d87676a9
MA
2369 VERIFY0(zap_lookup(dp->dp_meta_objset, dd->dd_object,
2370 DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &obj));
2371 VERIFY0(zap_remove(dp->dp_meta_objset, dd->dd_object,
2372 DD_FIELD_LIVELIST, tx));
2373 if (total) {
2374 dsl_deadlist_free(dp->dp_meta_objset, obj, tx);
2375 spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx);
37f03da8
SH
2376 }
2377}
2378
5a42ef04
PD
2379static int
2380dsl_dir_activity_in_progress(dsl_dir_t *dd, dsl_dataset_t *ds,
2381 zfs_wait_activity_t activity, boolean_t *in_progress)
2382{
2383 int error = 0;
2384
2385 ASSERT(MUTEX_HELD(&dd->dd_activity_lock));
2386
2387 switch (activity) {
2388 case ZFS_WAIT_DELETEQ: {
2389#ifdef _KERNEL
2390 objset_t *os;
2391 error = dmu_objset_from_ds(ds, &os);
2392 if (error != 0)
2393 break;
2394
2395 mutex_enter(&os->os_user_ptr_lock);
2396 void *user = dmu_objset_get_user(os);
2397 mutex_exit(&os->os_user_ptr_lock);
2398 if (dmu_objset_type(os) != DMU_OST_ZFS ||
2399 user == NULL || zfs_get_vfs_flag_unmounted(os)) {
2400 *in_progress = B_FALSE;
2401 return (0);
2402 }
2403
2404 uint64_t readonly = B_FALSE;
2405 error = zfs_get_temporary_prop(ds, ZFS_PROP_READONLY, &readonly,
2406 NULL);
2407
2408 if (error != 0)
2409 break;
2410
2411 if (readonly || !spa_writeable(dd->dd_pool->dp_spa)) {
2412 *in_progress = B_FALSE;
2413 return (0);
2414 }
2415
2416 uint64_t count, unlinked_obj;
2417 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
2418 &unlinked_obj);
2419 if (error != 0) {
2420 dsl_dataset_rele(ds, FTAG);
2421 break;
2422 }
2423 error = zap_count(os, unlinked_obj, &count);
2424
2425 if (error == 0)
2426 *in_progress = (count != 0);
2427 break;
2428#else
2429 /*
2430 * The delete queue is ZPL specific, and libzpool doesn't have
2431 * it. It doesn't make sense to wait for it.
2432 */
14e4e3cb 2433 (void) ds;
5a42ef04
PD
2434 *in_progress = B_FALSE;
2435 break;
2436#endif
2437 }
2438 default:
2439 panic("unrecognized value for activity %d", activity);
2440 }
2441
2442 return (error);
2443}
2444
2445int
2446dsl_dir_wait(dsl_dir_t *dd, dsl_dataset_t *ds, zfs_wait_activity_t activity,
2447 boolean_t *waited)
2448{
2449 int error = 0;
2450 boolean_t in_progress;
2451 dsl_pool_t *dp = dd->dd_pool;
2452 for (;;) {
2453 dsl_pool_config_enter(dp, FTAG);
2454 error = dsl_dir_activity_in_progress(dd, ds, activity,
2455 &in_progress);
2456 dsl_pool_config_exit(dp, FTAG);
2457 if (error != 0 || !in_progress)
2458 break;
2459
2460 *waited = B_TRUE;
2461
2462 if (cv_wait_sig(&dd->dd_activity_cv, &dd->dd_activity_lock) ==
2463 0 || dd->dd_activity_cancelled) {
2464 error = SET_ERROR(EINTR);
2465 break;
2466 }
2467 }
2468 return (error);
2469}
2470
2471void
2472dsl_dir_cancel_waiters(dsl_dir_t *dd)
2473{
2474 mutex_enter(&dd->dd_activity_lock);
2475 dd->dd_activity_cancelled = B_TRUE;
2476 cv_broadcast(&dd->dd_activity_cv);
2477 while (dd->dd_activity_waiters > 0)
2478 cv_wait(&dd->dd_activity_cv, &dd->dd_activity_lock);
2479 mutex_exit(&dd->dd_activity_lock);
2480}
2481
#ifdef _KERNEL
/* Exported only when _KERNEL is defined. */
EXPORT_SYMBOL(dsl_dir_set_quota);
EXPORT_SYMBOL(dsl_dir_set_reservation);
#endif /* _KERNEL */