module/zfs/dsl_pool.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
 * Copyright (c) 2013 Steven Hartland. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
 */

#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_synctask.h>
#include <sys/dsl_scan.h>
#include <sys/dnode.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/zap.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_znode.h>
#include <sys/spa_impl.h>
#include <sys/dsl_deadlist.h>
#include <sys/bptree.h>
#include <sys/zfeature.h>
#include <sys/zil_impl.h>
#include <sys/dsl_userhold.h>
#include <sys/trace_txg.h>

/*
 * ZFS Write Throttle
 * ------------------
 *
 * ZFS must limit the rate of incoming writes to the rate at which it is able
 * to sync data modifications to the backend storage. Throttling by too much
 * creates an artificial limit; throttling by too little can only be sustained
 * for short periods and would lead to highly lumpy performance. On a per-pool
 * basis, ZFS tracks the amount of modified (dirty) data. As operations change
 * data, the amount of dirty data increases; as ZFS syncs out data, the amount
 * of dirty data decreases. When the amount of dirty data exceeds a
 * predetermined threshold, further modifications are blocked until the amount
 * of dirty data decreases (as data is synced out).
 *
 * The limit on dirty data is tunable, and should be adjusted according to
 * both the IO capacity and available memory of the system. The larger the
 * window, the more ZFS is able to aggregate and amortize metadata (and data)
 * changes. However, memory is a limited resource, and allowing for more dirty
 * data comes at the cost of keeping other useful data in memory (for example
 * ZFS data cached by the ARC).
 *
 * Implementation
 *
 * As buffers are modified dsl_pool_dirty_space() increments both the per-
 * txg (dp_dirty_pertxg[]) and poolwide (dp_dirty_total) accounting of
 * dirty space used; dsl_pool_undirty_space() decrements those values as data
 * is synced out from dsl_pool_sync(). While only the poolwide value is
 * relevant, the per-txg value is useful for debugging. The tunable
 * zfs_dirty_data_max determines the dirty space limit. Once that value is
 * exceeded, new writes are halted until space frees up.
 *
 * The zfs_dirty_data_sync tunable dictates the threshold at which we
 * ensure that there is a txg syncing (see the comment in txg.c for a full
 * description of transaction group stages).
 *
 * The IO scheduler uses both the dirty space limit and current amount of
 * dirty data as inputs. Those values affect the number of concurrent IOs ZFS
 * issues. See the comment in vdev_queue.c for details of the IO scheduler.
 *
 * The delay is also calculated based on the amount of dirty data. See the
 * comment above dmu_tx_delay() for details.
 */

/*
 * zfs_dirty_data_max will be set to zfs_dirty_data_max_percent% of all memory,
 * capped at zfs_dirty_data_max_max. It can also be overridden with a module
 * parameter.
 */
unsigned long zfs_dirty_data_max = 0;
unsigned long zfs_dirty_data_max_max = 0;
int zfs_dirty_data_max_percent = 10;
int zfs_dirty_data_max_max_percent = 25;
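/*
 * For example, with the defaults above a system with 16 GiB of RAM starts
 * with a dirty limit of roughly 1.6 GiB (10% of RAM), which can be raised
 * to at most 4 GiB (25% of RAM).
 */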

/*
 * If there is at least this much dirty data, push out a txg.
 */
unsigned long zfs_dirty_data_sync = 64 * 1024 * 1024;

/*
 * Once this percentage of zfs_dirty_data_max is dirty, dmu_tx_delay() will
 * kick in and delay each transaction.
 * This value should be >= zfs_vdev_async_write_active_max_dirty_percent.
 */
int zfs_delay_min_dirty_percent = 60;

/*
 * This controls how quickly the delay approaches infinity.
 * Larger values cause it to delay more for a given amount of dirty data.
 * Therefore larger values will cause there to be less dirty data for a
 * given throughput.
 *
 * For the smoothest delay, this value should be about 1 billion divided
 * by the maximum number of operations per second. This will smoothly
 * handle between 10x and 1/10th this number.
 *
 * Note: zfs_delay_scale * zfs_dirty_data_max must be < 2^64, due to the
 * multiply in dmu_tx_delay().
 */
unsigned long zfs_delay_scale = 1000 * 1000 * 1000 / 2000;

hrtime_t zfs_throttle_delay = MSEC2NSEC(10);
hrtime_t zfs_throttle_resolution = MSEC2NSEC(10);

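/*
 * Look up one of the pool's special directories (MOS_DIR_NAME,
 * FREE_DIR_NAME, ORIGIN_DIR_NAME, LEAK_DIR_NAME) by name in the root
 * dsl_dir's child zap and return a hold on it in *ddp.
 */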
int
dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
{
	uint64_t obj;
	int err;

	err = zap_lookup(dp->dp_meta_objset,
	    dsl_dir_phys(dp->dp_root_dir)->dd_child_dir_zapobj,
	    name, sizeof (obj), 1, &obj);
	if (err)
		return (err);

	return (dsl_dir_hold_obj(dp, obj, name, dp, ddp));
}

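/*
 * Allocate and initialize the in-memory dsl_pool_t: locks, the per-txg
 * dirty lists, the txg machinery, and the iput taskq. Nothing is read
 * from disk here beyond the cached root block pointer.
 */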
static dsl_pool_t *
dsl_pool_open_impl(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp;
	blkptr_t *bp = spa_get_rootblkptr(spa);

	dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP);
	dp->dp_spa = spa;
	dp->dp_meta_rootbp = *bp;
	rrw_init(&dp->dp_config_rwlock, B_TRUE);
	txg_init(dp, txg);

	txg_list_create(&dp->dp_dirty_datasets,
	    offsetof(dsl_dataset_t, ds_dirty_link));
	txg_list_create(&dp->dp_dirty_zilogs,
	    offsetof(zilog_t, zl_dirty_link));
	txg_list_create(&dp->dp_dirty_dirs,
	    offsetof(dsl_dir_t, dd_dirty_link));
	txg_list_create(&dp->dp_sync_tasks,
	    offsetof(dsl_sync_task_t, dst_node));

	mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);

	dp->dp_iput_taskq = taskq_create("z_iput", max_ncpus, defclsyspri,
	    max_ncpus * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);

	return (dp);
}

int
dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
{
	int err;
	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);

	/*
	 * Initialize the caller's dsl_pool_t structure before we actually open
	 * the meta objset. This is done because a self-healing write zio may
	 * be issued as part of dmu_objset_open_impl() and the spa needs its
	 * dsl_pool_t initialized in order to handle the write.
	 */
	*dpp = dp;

	err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
	    &dp->dp_meta_objset);
	if (err != 0) {
		dsl_pool_close(dp);
		*dpp = NULL;
	}

	return (err);
}

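/*
 * Open the on-disk state of an existing pool: the root and special
 * dsl_dirs, the $ORIGIN snapshot, the free bpobj, feature-specific
 * objects (async-destroy bptree, empty bpobj), the temporary user-hold
 * zap, and the scan (scrub/resilver) state.
 */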
int
dsl_pool_open(dsl_pool_t *dp)
{
	int err;
	dsl_dir_t *dd;
	dsl_dataset_t *ds;
	uint64_t obj;

	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
	    &dp->dp_root_dir_obj);
	if (err)
		goto out;

	err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
	    NULL, dp, &dp->dp_root_dir);
	if (err)
		goto out;

	err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir);
	if (err)
		goto out;

	if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) {
		err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
		if (err)
			goto out;
		err = dsl_dataset_hold_obj(dp,
		    dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds);
		if (err == 0) {
			err = dsl_dataset_hold_obj(dp,
			    dsl_dataset_phys(ds)->ds_prev_snap_obj, dp,
			    &dp->dp_origin_snap);
			dsl_dataset_rele(ds, FTAG);
		}
		dsl_dir_rele(dd, dp);
		if (err)
			goto out;
	}

	if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
		err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME,
		    &dp->dp_free_dir);
		if (err)
			goto out;

		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj);
		if (err)
			goto out;
		VERIFY0(bpobj_open(&dp->dp_free_bpobj,
		    dp->dp_meta_objset, obj));
	}

	/*
	 * Note: errors ignored, because the leak dir will not exist if we
	 * have not encountered a leak yet.
	 */
	(void) dsl_pool_open_special_dir(dp, LEAK_DIR_NAME,
	    &dp->dp_leak_dir);

	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
		    &dp->dp_bptree_obj);
		if (err != 0)
			goto out;
	}

	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMPTY_BPOBJ)) {
		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
		    &dp->dp_empty_bpobj);
		if (err != 0)
			goto out;
	}

	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
	    &dp->dp_tmp_userrefs_obj);
	if (err == ENOENT)
		err = 0;
	if (err)
		goto out;

	err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg);

out:
	rrw_exit(&dp->dp_config_rwlock, FTAG);
	return (err);
}

void
dsl_pool_close(dsl_pool_t *dp)
{
	/*
	 * Drop our references from dsl_pool_open().
	 *
	 * Since we held the origin_snap from "syncing" context (which
	 * includes pool-opening context), it actually only got a "ref"
	 * and not a hold, so just drop that here.
	 */
	if (dp->dp_origin_snap)
		dsl_dataset_rele(dp->dp_origin_snap, dp);
	if (dp->dp_mos_dir)
		dsl_dir_rele(dp->dp_mos_dir, dp);
	if (dp->dp_free_dir)
		dsl_dir_rele(dp->dp_free_dir, dp);
	if (dp->dp_leak_dir)
		dsl_dir_rele(dp->dp_leak_dir, dp);
	if (dp->dp_root_dir)
		dsl_dir_rele(dp->dp_root_dir, dp);

	bpobj_close(&dp->dp_free_bpobj);

	/* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
	if (dp->dp_meta_objset)
		dmu_objset_evict(dp->dp_meta_objset);

	txg_list_destroy(&dp->dp_dirty_datasets);
	txg_list_destroy(&dp->dp_dirty_zilogs);
	txg_list_destroy(&dp->dp_sync_tasks);
	txg_list_destroy(&dp->dp_dirty_dirs);

	/*
	 * We can't set retry to TRUE since we're explicitly specifying
	 * a spa to flush. This is good enough; any missed buffers for
	 * this spa won't cause trouble, and they'll eventually fall
	 * out of the ARC just like any other unused buffer.
	 */
	arc_flush(dp->dp_spa, FALSE);

	txg_fini(dp);
	dsl_scan_fini(dp);
	dmu_buf_user_evict_wait();

	rrw_destroy(&dp->dp_config_rwlock);
	mutex_destroy(&dp->dp_lock);
	taskq_destroy(dp->dp_iput_taskq);
	if (dp->dp_blkstats)
		vmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
	kmem_free(dp, sizeof (dsl_pool_t));
}

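/*
 * Build the initial DSL state for a newly created pool: the MOS and its
 * object directory, the root and special dsl_dirs, the $ORIGIN snapshot
 * (if the version supports it), and the root dataset with an empty ZPL
 * objset.
 */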
dsl_pool_t *
dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
{
	int err;
	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
	dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
	objset_t *os;
	dsl_dataset_t *ds;
	uint64_t obj;

	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);

	/* create and open the MOS (meta-objset) */
	dp->dp_meta_objset = dmu_objset_create_impl(spa,
	    NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx);

	/* create the pool directory */
	err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
	ASSERT0(err);

	/* Initialize scan structures */
	VERIFY0(dsl_scan_init(dp, txg));

	/* create and open the root dir */
	dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx);
	VERIFY0(dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
	    NULL, dp, &dp->dp_root_dir));

	/* create and open the meta-objset dir */
	(void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx);
	VERIFY0(dsl_pool_open_special_dir(dp,
	    MOS_DIR_NAME, &dp->dp_mos_dir));

	if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
		/* create and open the free dir */
		(void) dsl_dir_create_sync(dp, dp->dp_root_dir,
		    FREE_DIR_NAME, tx);
		VERIFY0(dsl_pool_open_special_dir(dp,
		    FREE_DIR_NAME, &dp->dp_free_dir));

		/* create and open the free_bplist */
		obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx);
		VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0);
		VERIFY0(bpobj_open(&dp->dp_free_bpobj,
		    dp->dp_meta_objset, obj));
	}

	if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
		dsl_pool_create_origin(dp, tx);

	/* create the root dataset */
	obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);

	/* create the root objset */
	VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
	VERIFY(NULL != (os = dmu_objset_create_impl(dp->dp_spa, ds,
	    dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx)));
	rrw_exit(&ds->ds_bp_rwlock, FTAG);
#ifdef _KERNEL
	zfs_create_fs(os, kcred, zplprops, tx);
#endif
	dsl_dataset_rele(ds, FTAG);

	dmu_tx_commit(tx);

	rrw_exit(&dp->dp_config_rwlock, FTAG);

	return (dp);
}

/*
 * Account for the meta-objset space in its placeholder dsl_dir.
 */
void
dsl_pool_mos_diduse_space(dsl_pool_t *dp,
    int64_t used, int64_t comp, int64_t uncomp)
{
	ASSERT3U(comp, ==, uncomp);	/* it's all metadata */
	mutex_enter(&dp->dp_lock);
	dp->dp_mos_used_delta += used;
	dp->dp_mos_compressed_delta += comp;
	dp->dp_mos_uncompressed_delta += uncomp;
	mutex_exit(&dp->dp_lock);
}

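/*
 * Write out the dirty MOS and hand its new root block pointer to the
 * spa, so it can be recorded in the uberblock for this txg.
 */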
static void
dsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx)
{
	zio_t *zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
	dmu_objset_sync(dp->dp_meta_objset, zio, tx);
	VERIFY0(zio_wait(zio));
	dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
	spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
}

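/*
 * Apply a signed delta to dp_dirty_total and wake one throttled writer
 * if the total is now at or below zfs_dirty_data_max. The caller must
 * hold dp_lock.
 */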
static void
dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta)
{
	ASSERT(MUTEX_HELD(&dp->dp_lock));

	if (delta < 0)
		ASSERT3U(-delta, <=, dp->dp_dirty_total);

	dp->dp_dirty_total += delta;

	/*
	 * Note: we signal even when increasing dp_dirty_total.
	 * This ensures forward progress -- each thread wakes the next waiter.
	 */
	if (dp->dp_dirty_total <= zfs_dirty_data_max)
		cv_signal(&dp->dp_spaceavail_cv);
}

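/*
 * Sync out one txg's worth of dirty pool state. This proceeds in passes:
 * write the dirty datasets, apply user/group space accounting, write the
 * datasets again to pick up the accounting changes, sync the dirty
 * dsl_dirs, fold the accumulated MOS space deltas into $MOS, sync the
 * MOS itself, and finally run any sync tasks queued for this txg.
 */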
void
dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
{
	zio_t *zio;
	dmu_tx_t *tx;
	dsl_dir_t *dd;
	dsl_dataset_t *ds;
	objset_t *mos = dp->dp_meta_objset;
	list_t synced_datasets;

	list_create(&synced_datasets, sizeof (dsl_dataset_t),
	    offsetof(dsl_dataset_t, ds_synced_link));

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * Write out all dirty blocks of dirty datasets.
	 */
	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
	while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
		/*
		 * We must not sync any non-MOS datasets twice, because
		 * we may have taken a snapshot of them. However, we
		 * may sync newly-created datasets on pass 2.
		 */
		ASSERT(!list_link_active(&ds->ds_synced_link));
		list_insert_tail(&synced_datasets, ds);
		dsl_dataset_sync(ds, zio, tx);
	}
	VERIFY0(zio_wait(zio));

	/*
	 * We have written all of the accounted dirty data, so our
	 * dp_space_towrite should now be zero. However, some seldom-used
	 * code paths do not adhere to this (e.g. dbuf_undirty(); there is
	 * also rounding error in dbuf_write_physdone()).
	 * Shore up the accounting of any dirtied space now.
	 */
	dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg);

	/*
	 * Update the long range free counter after
	 * we're done syncing user data
	 */
	mutex_enter(&dp->dp_lock);
	ASSERT(spa_sync_pass(dp->dp_spa) == 1 ||
	    dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] == 0);
	dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] = 0;
	mutex_exit(&dp->dp_lock);

	/*
	 * After the data blocks have been written (ensured by the zio_wait()
	 * above), update the user/group space accounting.
	 */
	for (ds = list_head(&synced_datasets); ds != NULL;
	    ds = list_next(&synced_datasets, ds)) {
		dmu_objset_do_userquota_updates(ds->ds_objset, tx);
	}

	/*
	 * Sync the datasets again to push out the changes due to
	 * userspace updates. This must be done before we process the
	 * sync tasks, so that any snapshots will have the correct
	 * user accounting information (and we won't get confused
	 * about which blocks are part of the snapshot).
	 */
	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
	while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
		ASSERT(list_link_active(&ds->ds_synced_link));
		dmu_buf_rele(ds->ds_dbuf, ds);
		dsl_dataset_sync(ds, zio, tx);
	}
	VERIFY0(zio_wait(zio));

	/*
	 * Now that the datasets have been completely synced, we can
	 * clean up our in-memory structures accumulated while syncing:
	 *
	 *  - move dead blocks from the pending deadlist to the on-disk deadlist
	 *  - release hold from dsl_dataset_dirty()
	 */
	while ((ds = list_remove_head(&synced_datasets)) != NULL) {
		dsl_dataset_sync_done(ds, tx);
	}

	while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) != NULL) {
		dsl_dir_sync(dd, tx);
	}

	/*
	 * The MOS's space is accounted for in the pool/$MOS
	 * (dp_mos_dir). We can't modify the mos while we're syncing
	 * it, so we remember the deltas and apply them here.
	 */
	if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 ||
	    dp->dp_mos_uncompressed_delta != 0) {
		dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD,
		    dp->dp_mos_used_delta,
		    dp->dp_mos_compressed_delta,
		    dp->dp_mos_uncompressed_delta, tx);
		dp->dp_mos_used_delta = 0;
		dp->dp_mos_compressed_delta = 0;
		dp->dp_mos_uncompressed_delta = 0;
	}

	if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
	    list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) {
		dsl_pool_sync_mos(dp, tx);
	}

	/*
	 * If we modify a dataset in the same txg that we want to destroy it,
	 * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it.
	 * dsl_dir_destroy_check() will fail if there are unexpected holds.
	 * Therefore, we want to sync the MOS (thus syncing the dd_dbuf
	 * and clearing the hold on it) before we process the sync_tasks.
	 * The MOS data dirtied by the sync_tasks will be synced on the next
	 * pass.
	 */
	if (!txg_list_empty(&dp->dp_sync_tasks, txg)) {
		dsl_sync_task_t *dst;
		/*
		 * No more sync tasks should have been added while we
		 * were syncing.
		 */
		ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1);
		while ((dst = txg_list_remove(&dp->dp_sync_tasks, txg)) != NULL)
			dsl_sync_task_sync(dst, tx);
	}

	dmu_tx_commit(tx);

	DTRACE_PROBE2(dsl_pool_sync__done, dsl_pool_t *dp, dp, uint64_t, txg);
}

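/*
 * Per-txg cleanup that runs once the txg's data has been synced: clean
 * each zilog dirtied in this txg and drop the dataset hold that was
 * taken when the zilog was dirtied.
 */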
void
dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
{
	zilog_t *zilog;

	while ((zilog = txg_list_remove(&dp->dp_dirty_zilogs, txg))) {
		dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
		zil_clean(zilog, txg);
		ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg));
		dmu_buf_rele(ds->ds_dbuf, zilog);
	}
	ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
}

/*
 * TRUE if the current thread is the tx_sync_thread or if we
 * are being called from SPA context during pool initialization.
 */
int
dsl_pool_sync_context(dsl_pool_t *dp)
{
	return (curthread == dp->dp_tx.tx_sync_thread ||
	    spa_is_initializing(dp->dp_spa));
}

uint64_t
dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
{
	uint64_t space, resv;

	/*
	 * If we're trying to assess whether it's OK to do a free,
	 * cut the reservation in half to allow forward progress
	 * (e.g. make it possible to rm(1) files from a full pool).
	 */
	space = spa_get_dspace(dp->dp_spa);
	resv = spa_get_slop_space(dp->dp_spa);
	if (netfree)
		resv >>= 1;

	return (space - resv);
}

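/*
 * Returns B_TRUE once enough data is dirty that dmu_tx_delay() should
 * start delaying writers (more than zfs_delay_min_dirty_percent of
 * zfs_dirty_data_max is dirty). As a side effect, kick off a txg sync
 * if the dirty total has crossed zfs_dirty_data_sync.
 */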
boolean_t
dsl_pool_need_dirty_delay(dsl_pool_t *dp)
{
	uint64_t delay_min_bytes =
	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
	boolean_t rv;

	mutex_enter(&dp->dp_lock);
	if (dp->dp_dirty_total > zfs_dirty_data_sync)
		txg_kick(dp);
	rv = (dp->dp_dirty_total > delay_min_bytes);
	mutex_exit(&dp->dp_lock);
	return (rv);
}

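/*
 * Charge newly dirtied space against the open txg as buffers are
 * modified; dsl_pool_undirty_space() below is the matching decrement,
 * applied as the data is written out.
 */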
void
dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
{
	if (space > 0) {
		mutex_enter(&dp->dp_lock);
		dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space;
		dsl_pool_dirty_delta(dp, space);
		mutex_exit(&dp->dp_lock);
	}
}

void
dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg)
{
	ASSERT3S(space, >=, 0);
	if (space == 0)
		return;

	mutex_enter(&dp->dp_lock);
	if (dp->dp_dirty_pertxg[txg & TXG_MASK] < space) {
		/* XXX writing something we didn't dirty? */
		space = dp->dp_dirty_pertxg[txg & TXG_MASK];
	}
	ASSERT3U(dp->dp_dirty_pertxg[txg & TXG_MASK], >=, space);
	dp->dp_dirty_pertxg[txg & TXG_MASK] -= space;
	ASSERT3U(dp->dp_dirty_total, >=, space);
	dsl_pool_dirty_delta(dp, -space);
	mutex_exit(&dp->dp_lock);
}

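/*
 * dmu_objset_find_dp() callback for dsl_pool_upgrade_clones(): walk back
 * through each filesystem's snapshots and, for filesystems that predate
 * SPA_VERSION_ORIGIN, attach them as clones of the $ORIGIN snapshot so
 * clone accounting is consistent on upgraded pools.
 */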
/* ARGSUSED */
static int
upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
{
	dmu_tx_t *tx = arg;
	dsl_dataset_t *ds, *prev = NULL;
	int err;

	err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
	if (err)
		return (err);

	while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
		err = dsl_dataset_hold_obj(dp,
		    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
		if (err) {
			dsl_dataset_rele(ds, FTAG);
			return (err);
		}

		if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object)
			break;
		dsl_dataset_rele(ds, FTAG);
		ds = prev;
		prev = NULL;
	}

	if (prev == NULL) {
		prev = dp->dp_origin_snap;

		/*
		 * The $ORIGIN can't have any data, or the accounting
		 * will be wrong.
		 */
		rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
		ASSERT0(dsl_dataset_phys(prev)->ds_bp.blk_birth);
		rrw_exit(&ds->ds_bp_rwlock, FTAG);

		/* The origin doesn't get attached to itself */
		if (ds->ds_object == prev->ds_object) {
			dsl_dataset_rele(ds, FTAG);
			return (0);
		}

		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		dsl_dataset_phys(ds)->ds_prev_snap_obj = prev->ds_object;
		dsl_dataset_phys(ds)->ds_prev_snap_txg =
		    dsl_dataset_phys(prev)->ds_creation_txg;

		dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
		dsl_dir_phys(ds->ds_dir)->dd_origin_obj = prev->ds_object;

		dmu_buf_will_dirty(prev->ds_dbuf, tx);
		dsl_dataset_phys(prev)->ds_num_children++;

		if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0) {
			ASSERT(ds->ds_prev == NULL);
			VERIFY0(dsl_dataset_hold_obj(dp,
			    dsl_dataset_phys(ds)->ds_prev_snap_obj,
			    ds, &ds->ds_prev));
		}
	}

	ASSERT3U(dsl_dir_phys(ds->ds_dir)->dd_origin_obj, ==, prev->ds_object);
	ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_obj, ==, prev->ds_object);

	if (dsl_dataset_phys(prev)->ds_next_clones_obj == 0) {
		dmu_buf_will_dirty(prev->ds_dbuf, tx);
		dsl_dataset_phys(prev)->ds_next_clones_obj =
		    zap_create(dp->dp_meta_objset,
		    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
	}
	VERIFY0(zap_add_int(dp->dp_meta_objset,
	    dsl_dataset_phys(prev)->ds_next_clones_obj, ds->ds_object, tx));

	dsl_dataset_rele(ds, FTAG);
	if (prev != dp->dp_origin_snap)
		dsl_dataset_rele(prev, FTAG);
	return (0);
}

void
dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
{
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(dp->dp_origin_snap != NULL);

	VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb,
	    tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
}

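/*
 * dmu_objset_find_dp() callback for dsl_pool_upgrade_dir_clones(): record
 * each clone in its origin's dd_clones zap object, creating that zap if
 * it does not yet exist.
 */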
/* ARGSUSED */
static int
upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
{
	dmu_tx_t *tx = arg;
	objset_t *mos = dp->dp_meta_objset;

	if (dsl_dir_phys(ds->ds_dir)->dd_origin_obj != 0) {
		dsl_dataset_t *origin;

		VERIFY0(dsl_dataset_hold_obj(dp,
		    dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &origin));

		if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {
			dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
			dsl_dir_phys(origin->ds_dir)->dd_clones =
			    zap_create(mos, DMU_OT_DSL_CLONES, DMU_OT_NONE,
			    0, tx);
		}

		VERIFY0(zap_add_int(dp->dp_meta_objset,
		    dsl_dir_phys(origin->ds_dir)->dd_clones,
		    ds->ds_object, tx));

		dsl_dataset_rele(origin, FTAG);
	}
	return (0);
}

void
dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
{
	uint64_t obj;

	ASSERT(dmu_tx_is_syncing(tx));

	(void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx);
	VERIFY0(dsl_pool_open_special_dir(dp,
	    FREE_DIR_NAME, &dp->dp_free_dir));

	/*
	 * We can't use bpobj_alloc(), because spa_version() still
	 * returns the old version, and we need a new-version bpobj with
	 * subobj support. So call dmu_object_alloc() directly.
	 */
	obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
	    SPA_OLD_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
	VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
	VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj));

	VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
	    upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
}

void
dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
{
	uint64_t dsobj;
	dsl_dataset_t *ds;

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(dp->dp_origin_snap == NULL);
	ASSERT(rrw_held(&dp->dp_config_rwlock, RW_WRITER));

	/* create the origin dir, ds, & snap-ds */
	dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
	    NULL, 0, kcred, tx);
	VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
	dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx);
	VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj,
	    dp, &dp->dp_origin_snap));
	dsl_dataset_rele(ds, FTAG);
}

taskq_t *
dsl_pool_iput_taskq(dsl_pool_t *dp)
{
	return (dp->dp_iput_taskq);
}

/*
 * Walk through the pool-wide zap object of temporary snapshot user holds
 * and release them.
 */
void
dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp)
{
	zap_attribute_t za;
	zap_cursor_t zc;
	objset_t *mos = dp->dp_meta_objset;
	uint64_t zapobj = dp->dp_tmp_userrefs_obj;
	nvlist_t *holds;

	if (zapobj == 0)
		return;
	ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);

	holds = fnvlist_alloc();

	for (zap_cursor_init(&zc, mos, zapobj);
	    zap_cursor_retrieve(&zc, &za) == 0;
	    zap_cursor_advance(&zc)) {
		char *htag;
		nvlist_t *tags;

		htag = strchr(za.za_name, '-');
		*htag = '\0';
		++htag;
		if (nvlist_lookup_nvlist(holds, za.za_name, &tags) != 0) {
			tags = fnvlist_alloc();
			fnvlist_add_boolean(tags, htag);
			fnvlist_add_nvlist(holds, za.za_name, tags);
			fnvlist_free(tags);
		} else {
			fnvlist_add_boolean(tags, htag);
		}
	}
	dsl_dataset_user_release_tmp(dp, holds);
	fnvlist_free(holds);
	zap_cursor_fini(&zc);
}

/*
 * Create the pool-wide zap object for storing temporary snapshot holds.
 */
void
dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx)
{
	objset_t *mos = dp->dp_meta_objset;

	ASSERT(dp->dp_tmp_userrefs_obj == 0);
	ASSERT(dmu_tx_is_syncing(tx));

	dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx);
}

static int
dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj,
    const char *tag, uint64_t now, dmu_tx_t *tx, boolean_t holding)
{
	objset_t *mos = dp->dp_meta_objset;
	uint64_t zapobj = dp->dp_tmp_userrefs_obj;
	char *name;
	int error;

	ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
	ASSERT(dmu_tx_is_syncing(tx));

	/*
	 * If the pool was created prior to SPA_VERSION_USERREFS, the
	 * zap object for temporary holds might not exist yet.
	 */
	if (zapobj == 0) {
		if (holding) {
			dsl_pool_user_hold_create_obj(dp, tx);
			zapobj = dp->dp_tmp_userrefs_obj;
		} else {
			return (SET_ERROR(ENOENT));
		}
	}

	name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag);
	if (holding)
		error = zap_add(mos, zapobj, name, 8, 1, &now, tx);
	else
		error = zap_remove(mos, zapobj, name, tx);
	strfree(name);

	return (error);
}

/*
 * Add a temporary hold for the given dataset object and tag.
 */
int
dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
    uint64_t now, dmu_tx_t *tx)
{
	return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE));
}

/*
 * Release a temporary hold for the given dataset object and tag.
 */
int
dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
    dmu_tx_t *tx)
{
	return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, 0,
	    tx, B_FALSE));
}

/*
 * DSL Pool Configuration Lock
 *
 * The dp_config_rwlock protects against changes to DSL state (e.g. dataset
 * creation / destruction / rename / property setting). It must be held for
 * read to hold a dataset or dsl_dir. I.e. you must call
 * dsl_pool_config_enter() or dsl_pool_hold() before calling
 * dsl_{dataset,dir}_hold{_obj}. In most circumstances, the dp_config_rwlock
 * must be held continuously until all datasets and dsl_dirs are released.
 *
 * The only exception to this rule is that if a "long hold" is placed on
 * a dataset, then the dp_config_rwlock may be dropped while the dataset
 * is still held. The long hold will prevent the dataset from being
 * destroyed -- the destroy will fail with EBUSY. A long hold can be
 * obtained by calling dsl_dataset_long_hold(), or by "owning" a dataset
 * (by calling dsl_{dataset,objset}_{try}own{_obj}).
 *
 * Legitimate long-holders (including owners) should be long-running, cancelable
 * tasks that should cause "zfs destroy" to fail. This includes DMU
 * consumers (i.e. a ZPL filesystem being mounted or ZVOL being open),
 * "zfs send", and "zfs diff". There are several other long-holders whose
 * uses are suboptimal (e.g. "zfs promote", and zil_suspend()).
 *
 * The usual formula for long-holding would be:
 * dsl_pool_hold()
 * dsl_dataset_hold()
 * ... perform checks ...
 * dsl_dataset_long_hold()
 * dsl_pool_rele()
 * ... perform long-running task ...
 * dsl_dataset_long_rele()
 * dsl_dataset_rele()
 *
 * Note that when the long hold is released, the dataset is still held but
 * the pool is not held. The dataset may change arbitrarily during this time
 * (e.g. it could be destroyed). Therefore you shouldn't do anything to the
 * dataset except release it.
 *
 * User-initiated operations (e.g. ioctls, zfs_ioc_*()) are either read-only
 * or modifying operations.
 *
 * Modifying operations should generally use dsl_sync_task(). The synctask
 * infrastructure enforces proper locking strategy with respect to the
 * dp_config_rwlock. See the comment above dsl_sync_task() for details.
 *
 * Read-only operations will manually hold the pool, then the dataset, obtain
 * information from the dataset, then release the pool and dataset.
 * dmu_objset_{hold,rele}() are convenience routines that also do the pool
 * hold/rele.
 */

int
dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp)
{
	spa_t *spa;
	int error;

	error = spa_open(name, &spa, tag);
	if (error == 0) {
		*dp = spa_get_dsl(spa);
		dsl_pool_config_enter(*dp, tag);
	}
	return (error);
}

void
dsl_pool_rele(dsl_pool_t *dp, void *tag)
{
	dsl_pool_config_exit(dp, tag);
	spa_close(dp->dp_spa, tag);
}

void
dsl_pool_config_enter(dsl_pool_t *dp, void *tag)
{
	/*
	 * We use a "reentrant" reader-writer lock, but not reentrantly.
	 *
	 * The rrwlock can (with the track_all flag) track all reading threads,
	 * which is very useful for debugging which code path failed to release
	 * the lock, and for verifying that the *current* thread does hold
	 * the lock.
	 *
	 * (Unlike a rwlock, which knows that N threads hold it for
	 * read, but not *which* threads, so rw_held(RW_READER) returns TRUE
	 * if any thread holds it for read, even if this thread doesn't).
	 */
	ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
	rrw_enter(&dp->dp_config_rwlock, RW_READER, tag);
}

void
dsl_pool_config_enter_prio(dsl_pool_t *dp, void *tag)
{
	ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
	rrw_enter_read_prio(&dp->dp_config_rwlock, tag);
}

void
dsl_pool_config_exit(dsl_pool_t *dp, void *tag)
{
	rrw_exit(&dp->dp_config_rwlock, tag);
}

boolean_t
dsl_pool_config_held(dsl_pool_t *dp)
{
	return (RRW_LOCK_HELD(&dp->dp_config_rwlock));
}

boolean_t
dsl_pool_config_held_writer(dsl_pool_t *dp)
{
	return (RRW_WRITE_HELD(&dp->dp_config_rwlock));
}

#if defined(_KERNEL) && defined(HAVE_SPL)
EXPORT_SYMBOL(dsl_pool_config_enter);
EXPORT_SYMBOL(dsl_pool_config_exit);

/* BEGIN CSTYLED */
/* zfs_dirty_data_max_percent only applied at module load in arc_init(). */
module_param(zfs_dirty_data_max_percent, int, 0444);
MODULE_PARM_DESC(zfs_dirty_data_max_percent, "percent of ram can be dirty");

/* zfs_dirty_data_max_max_percent only applied at module load in arc_init(). */
module_param(zfs_dirty_data_max_max_percent, int, 0444);
MODULE_PARM_DESC(zfs_dirty_data_max_max_percent,
	"zfs_dirty_data_max upper bound as % of RAM");

module_param(zfs_delay_min_dirty_percent, int, 0644);
MODULE_PARM_DESC(zfs_delay_min_dirty_percent, "transaction delay threshold");

module_param(zfs_dirty_data_max, ulong, 0644);
MODULE_PARM_DESC(zfs_dirty_data_max, "determines the dirty space limit");

/* zfs_dirty_data_max_max only applied at module load in arc_init(). */
module_param(zfs_dirty_data_max_max, ulong, 0444);
MODULE_PARM_DESC(zfs_dirty_data_max_max,
	"zfs_dirty_data_max upper bound in bytes");

module_param(zfs_dirty_data_sync, ulong, 0644);
MODULE_PARM_DESC(zfs_dirty_data_sync, "sync txg when this much dirty data");

module_param(zfs_delay_scale, ulong, 0644);
MODULE_PARM_DESC(zfs_delay_scale, "how quickly delay approaches infinity");
/* END CSTYLED */
#endif