X-Git-Url: https://git.proxmox.com/?a=blobdiff_plain;f=module%2Fzfs%2Fdsl_pool.c;h=10e967ab91ed9218415c2cfa5db3c3628b20cfd4;hb=dcec0a12c8e480262288ac76f6350652079ea173;hp=1bb49c13ae15cced8c3b625235ee5002c8f8c926;hpb=93ce2b4ca5a40c41ac945cd3aaf4a4a22bb751e1;p=mirror_zfs.git

diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c
index 1bb49c13a..10e967ab9 100644
--- a/module/zfs/dsl_pool.c
+++ b/module/zfs/dsl_pool.c
@@ -43,6 +43,8 @@
 #include <sys/zfs_znode.h>
 #include <sys/spa_impl.h>
 #include <sys/dsl_deadlist.h>
+#include <sys/vdev_impl.h>
+#include <sys/metaslab_impl.h>
 #include <sys/bptree.h>
 #include <sys/zfeature.h>
 #include <sys/zil_impl.h>
@@ -81,7 +83,7 @@
  * zfs_dirty_data_max determines the dirty space limit. Once that value is
  * exceeded, new writes are halted until space frees up.
  *
- * The zfs_dirty_data_sync tunable dictates the threshold at which we
+ * The zfs_dirty_data_sync_percent tunable dictates the threshold at which we
  * ensure that there is a txg syncing (see the comment in txg.c for a full
  * description of transaction group stages).
  *
@@ -104,9 +106,11 @@ int zfs_dirty_data_max_percent = 10;
 int zfs_dirty_data_max_max_percent = 25;
 
 /*
- * If there is at least this much dirty data, push out a txg.
+ * If there's at least this much dirty data (as a percentage of
+ * zfs_dirty_data_max), push out a txg. This should be less than
+ * zfs_vdev_async_write_active_min_dirty_percent.
  */
-unsigned long zfs_dirty_data_sync = 64 * 1024 * 1024;
+int zfs_dirty_data_sync_percent = 20;
 
 /*
  * Once there is this amount of dirty data, the dmu_tx_delay() will kick in
@@ -201,6 +205,8 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
 	    offsetof(dsl_dir_t, dd_dirty_link));
 	txg_list_create(&dp->dp_sync_tasks, spa,
 	    offsetof(dsl_sync_task_t, dst_node));
+	txg_list_create(&dp->dp_early_sync_tasks, spa,
+	    offsetof(dsl_sync_task_t, dst_node));
 
 	dp->dp_sync_taskq = taskq_create("dp_sync_taskq",
 	    zfs_sync_taskq_batch_pct, minclsyspri, 1, INT_MAX,
@@ -217,6 +223,9 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
 	dp->dp_iput_taskq = taskq_create("z_iput", max_ncpus, defclsyspri,
 	    max_ncpus * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+	dp->dp_unlinked_drain_taskq = taskq_create("z_unlinked_drain",
+	    max_ncpus, defclsyspri, max_ncpus, INT_MAX,
+	    TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
 
 	return (dp);
 }
 
@@ -385,6 +394,7 @@ dsl_pool_close(dsl_pool_t *dp)
 	txg_list_destroy(&dp->dp_dirty_datasets);
 	txg_list_destroy(&dp->dp_dirty_zilogs);
 	txg_list_destroy(&dp->dp_sync_tasks);
+	txg_list_destroy(&dp->dp_early_sync_tasks);
 	txg_list_destroy(&dp->dp_dirty_dirs);
 
 	taskq_destroy(dp->dp_zil_clean_taskq);
@@ -406,6 +416,7 @@ dsl_pool_close(dsl_pool_t *dp)
 	rrw_destroy(&dp->dp_config_rwlock);
 	mutex_destroy(&dp->dp_lock);
 	cv_destroy(&dp->dp_spaceavail_cv);
+	taskq_destroy(dp->dp_unlinked_drain_taskq);
 	taskq_destroy(dp->dp_iput_taskq);
 	if (dp->dp_blkstats != NULL) {
 		mutex_destroy(&dp->dp_blkstats->zab_lock);
@@ -450,6 +461,11 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, dsl_crypto_params_t *dcp,
 	int err;
 	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
 	dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
+#ifdef _KERNEL
+	objset_t *os;
+#else
+	objset_t *os __attribute__((unused));
+#endif
 	dsl_dataset_t *ds;
 	uint64_t obj;
 
@@ -511,18 +527,16 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, dsl_crypto_params_t *dcp,
 	obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, dcp, 0, tx);
 
 	/* create the root objset */
-	VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
+	VERIFY0(dsl_dataset_hold_obj_flags(dp, obj,
+	    DS_HOLD_FLAG_DECRYPT, FTAG, &ds));
+	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+	os = dmu_objset_create_impl(dp->dp_spa, ds,
+	    dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx);
+	rrw_exit(&ds->ds_bp_rwlock, FTAG);
 #ifdef _KERNEL
-	{
-		objset_t *os;
-		rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
-		os = dmu_objset_create_impl(dp->dp_spa, ds,
-		    dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx);
-		rrw_exit(&ds->ds_bp_rwlock, FTAG);
-		zfs_create_fs(os, kcred, zplprops, tx);
-	}
+	zfs_create_fs(os, kcred, zplprops, tx);
 #endif
-	dsl_dataset_rele(ds, FTAG);
+	dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
 
 	dmu_tx_commit(tx);
 
@@ -574,6 +588,29 @@ dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta)
 	cv_signal(&dp->dp_spaceavail_cv);
 }
 
+#ifdef ZFS_DEBUG
+static boolean_t
+dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg)
+{
+	spa_t *spa = dp->dp_spa;
+	vdev_t *rvd = spa->spa_root_vdev;
+
+	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+		vdev_t *vd = rvd->vdev_child[c];
+		txg_list_t *tl = &vd->vdev_ms_list;
+		metaslab_t *ms;
+
+		for (ms = txg_list_head(tl, TXG_CLEAN(txg)); ms;
+		    ms = txg_list_next(tl, ms, TXG_CLEAN(txg))) {
+			VERIFY(range_tree_is_empty(ms->ms_freeing));
+			VERIFY(range_tree_is_empty(ms->ms_checkpointing));
+		}
+	}
+
+	return (B_TRUE);
+}
+#endif
+
 void
 dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 {
@@ -589,6 +626,23 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 
 	tx = dmu_tx_create_assigned(dp, txg);
 
+	/*
+	 * Run all early sync tasks before writing out any dirty blocks.
+	 * For more info on early sync tasks see block comment in
+	 * dsl_early_sync_task().
+	 */
+	if (!txg_list_empty(&dp->dp_early_sync_tasks, txg)) {
+		dsl_sync_task_t *dst;
+
+		ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1);
+		while ((dst =
+		    txg_list_remove(&dp->dp_early_sync_tasks, txg)) != NULL) {
+			ASSERT(dsl_early_sync_task_verify(dp, txg));
+			dsl_sync_task_sync(dst, tx);
+		}
+		ASSERT(dsl_early_sync_task_verify(dp, txg));
+	}
+
 	/*
 	 * Write out all dirty blocks of dirty datasets.
 	 */
@@ -645,9 +699,22 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 	 */
 	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 	while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
+		objset_t *os = ds->ds_objset;
+
 		ASSERT(list_link_active(&ds->ds_synced_link));
 		dmu_buf_rele(ds->ds_dbuf, ds);
 		dsl_dataset_sync(ds, zio, tx);
+
+		/*
+		 * Release any key mappings created by calls to
+		 * dsl_dataset_dirty() from the userquota accounting
+		 * code paths.
+		 */
+		if (os->os_encrypted && !os->os_raw_receive &&
+		    !os->os_next_write_raw[txg & TXG_MASK]) {
+			ASSERT3P(ds->ds_key_mapping, !=, NULL);
+			key_mapping_rele(dp->dp_spa, ds->ds_key_mapping, ds);
+		}
 	}
 	VERIFY0(zio_wait(zio));
 
@@ -657,8 +724,17 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 	 *
 	 * - move dead blocks from the pending deadlist to the on-disk deadlist
 	 * - release hold from dsl_dataset_dirty()
+	 * - release key mapping hold from dsl_dataset_dirty()
 	 */
 	while ((ds = list_remove_head(&synced_datasets)) != NULL) {
+		objset_t *os = ds->ds_objset;
+
+		if (os->os_encrypted && !os->os_raw_receive &&
+		    !os->os_next_write_raw[txg & TXG_MASK]) {
+			ASSERT3P(ds->ds_key_mapping, !=, NULL);
+			key_mapping_rele(dp->dp_spa, ds->ds_key_mapping, ds);
+		}
+
 		dsl_dataset_sync_done(ds, tx);
 	}
 
@@ -744,22 +820,66 @@ dsl_pool_sync_context(dsl_pool_t *dp)
 	    taskq_member(dp->dp_sync_taskq, curthread));
 }
 
+/*
+ * This function returns the amount of allocatable space in the pool
+ * minus whatever space is currently reserved by ZFS for specific
+ * purposes. Specifically:
+ *
+ * 1] Any reserved SLOP space
+ * 2] Any space used by the checkpoint
+ * 3] Any space used for deferred frees
+ *
+ * The latter 2 are especially important because they are needed to
+ * rectify the SPA's and DMU's different understanding of how much space
+ * is used. Now the DMU is aware of that extra space tracked by the SPA
+ * without having to maintain a separate special dir (e.g. similar to
+ * $MOS, $FREEING, and $LEAKED).
+ *
+ * Note: By deferred frees here, we mean the frees that were deferred
+ * in spa_sync() after sync pass 1 (spa_deferred_bpobj), and not the
+ * segments placed in ms_defer trees during metaslab_sync_done().
+ */
 uint64_t
-dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
+dsl_pool_adjustedsize(dsl_pool_t *dp, zfs_space_check_t slop_policy)
 {
-	uint64_t space, resv;
-
-	/*
-	 * If we're trying to assess whether it's OK to do a free,
-	 * cut the reservation in half to allow forward progress
-	 * (e.g. make it possible to rm(1) files from a full pool).
-	 */
-	space = spa_get_dspace(dp->dp_spa);
-	resv = spa_get_slop_space(dp->dp_spa);
-	if (netfree)
+	spa_t *spa = dp->dp_spa;
+	uint64_t space, resv, adjustedsize;
+	uint64_t spa_deferred_frees =
+	    spa->spa_deferred_bpobj.bpo_phys->bpo_bytes;
+
+	space = spa_get_dspace(spa)
+	    - spa_get_checkpoint_space(spa) - spa_deferred_frees;
+	resv = spa_get_slop_space(spa);
+
+	switch (slop_policy) {
+	case ZFS_SPACE_CHECK_NORMAL:
+		break;
+	case ZFS_SPACE_CHECK_RESERVED:
 		resv >>= 1;
+		break;
+	case ZFS_SPACE_CHECK_EXTRA_RESERVED:
+		resv >>= 2;
+		break;
+	case ZFS_SPACE_CHECK_NONE:
+		resv = 0;
+		break;
+	default:
+		panic("invalid slop policy value: %d", slop_policy);
+		break;
+	}
+	adjustedsize = (space >= resv) ? (space - resv) : 0;
+
+	return (adjustedsize);
+}
 
-	return (space - resv);
+uint64_t
+dsl_pool_unreserved_space(dsl_pool_t *dp, zfs_space_check_t slop_policy)
+{
+	uint64_t poolsize = dsl_pool_adjustedsize(dp, slop_policy);
+	uint64_t deferred =
+	    metaslab_class_get_deferred(spa_normal_class(dp->dp_spa));
+	uint64_t quota = (poolsize >= deferred) ? (poolsize - deferred) : 0;
+	return (quota);
 }
 
 boolean_t
@@ -767,10 +887,12 @@ dsl_pool_need_dirty_delay(dsl_pool_t *dp)
 {
 	uint64_t delay_min_bytes =
 	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
+	uint64_t dirty_min_bytes =
+	    zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100;
 	boolean_t rv;
 
 	mutex_enter(&dp->dp_lock);
-	if (dp->dp_dirty_total > zfs_dirty_data_sync)
+	if (dp->dp_dirty_total > dirty_min_bytes)
 		txg_kick(dp);
 	rv = (dp->dp_dirty_total > delay_min_bytes);
 	mutex_exit(&dp->dp_lock);
@@ -979,6 +1101,12 @@ dsl_pool_iput_taskq(dsl_pool_t *dp)
 	return (dp->dp_iput_taskq);
 }
 
+taskq_t *
+dsl_pool_unlinked_drain_taskq(dsl_pool_t *dp)
+{
+	return (dp->dp_unlinked_drain_taskq);
+}
+
 /*
  * Walk through the pool-wide zap object of temporary snapshot user holds
  * and release them.
@@ -1233,8 +1361,9 @@ module_param(zfs_dirty_data_max_max, ulong, 0444);
 MODULE_PARM_DESC(zfs_dirty_data_max_max,
 	"zfs_dirty_data_max upper bound in bytes");
 
-module_param(zfs_dirty_data_sync, ulong, 0644);
-MODULE_PARM_DESC(zfs_dirty_data_sync, "sync txg when this much dirty data");
+module_param(zfs_dirty_data_sync_percent, int, 0644);
+MODULE_PARM_DESC(zfs_dirty_data_sync_percent,
+	"dirty data txg sync threshold as a percentage of zfs_dirty_data_max");
 
 module_param(zfs_delay_scale, ulong, 0644);
 MODULE_PARM_DESC(zfs_delay_scale, "how quickly delay approaches infinity");
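
The tunable change above replaces a fixed byte threshold (the old zfs_dirty_data_sync default of 64 MiB) with one that scales with zfs_dirty_data_max. A minimal standalone userspace sketch of the resulting arithmetic follows; it is demo code, not part of the patch, and the 4 GiB zfs_dirty_data_max figure is purely illustrative:

#include <stdint.h>
#include <stdio.h>

/* Illustrative values; in-kernel these are module parameters. */
static uint64_t zfs_dirty_data_max = 4ULL << 30;	/* assume 4 GiB */
static int zfs_dirty_data_sync_percent = 20;		/* new default */
static int zfs_delay_min_dirty_percent = 60;		/* delay default */

int
main(void)
{
	/* Old behavior: a txg was kicked at a fixed 64 MiB of dirty data. */
	uint64_t old_sync_bytes = 64ULL * 1024 * 1024;

	/* New behavior: the kick threshold scales with zfs_dirty_data_max. */
	uint64_t dirty_min_bytes =
	    zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100;
	uint64_t delay_min_bytes =
	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;

	printf("old fixed sync threshold:  %llu MiB\n",
	    (unsigned long long)(old_sync_bytes >> 20));
	printf("new scaled sync threshold: %llu MiB\n",
	    (unsigned long long)(dirty_min_bytes >> 20));
	printf("write delay begins at:     %llu MiB\n",
	    (unsigned long long)(delay_min_bytes >> 20));
	return (0);
}

With these assumed numbers the txg kick moves from a flat 64 MiB to roughly 819 MiB, and the patch's comment explains the constraint: zfs_dirty_data_sync_percent should stay below zfs_vdev_async_write_active_min_dirty_percent (which defaults to 30) so a txg is pushed out before async write scheduling ramps up.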
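The slop handling in the new dsl_pool_adjustedsize() is also easy to model in isolation. The sketch below is a standalone approximation: the stand-in enum mirrors the values named in the patch's switch (the real zfs_space_check_t is defined elsewhere in the tree), the helper name adjusted_size and the space/resv figures are made up for illustration:

#include <stdint.h>
#include <stdio.h>

/* Stand-in for zfs_space_check_t; cases mirror the patch's switch. */
typedef enum {
	ZFS_SPACE_CHECK_NORMAL,		/* full slop reserved */
	ZFS_SPACE_CHECK_RESERVED,	/* half slop, e.g. to allow rm(1) */
	ZFS_SPACE_CHECK_EXTRA_RESERVED,	/* quarter slop, critical ops */
	ZFS_SPACE_CHECK_NONE		/* no reservation at all */
} zfs_space_check_t;

/* Mirrors dsl_pool_adjustedsize(): scale the reservation, then clamp. */
static uint64_t
adjusted_size(uint64_t space, uint64_t resv, zfs_space_check_t slop_policy)
{
	switch (slop_policy) {
	case ZFS_SPACE_CHECK_NORMAL:
		break;
	case ZFS_SPACE_CHECK_RESERVED:
		resv >>= 1;
		break;
	case ZFS_SPACE_CHECK_EXTRA_RESERVED:
		resv >>= 2;
		break;
	case ZFS_SPACE_CHECK_NONE:
		resv = 0;
		break;
	}
	return ((space >= resv) ? (space - resv) : 0);
}

int
main(void)
{
	uint64_t space = 100ULL << 30;	/* 100 GiB allocatable, made up */
	uint64_t resv = 3200ULL << 20;	/* ~3.2 GiB slop, made up */

	for (int p = ZFS_SPACE_CHECK_NORMAL; p <= ZFS_SPACE_CHECK_NONE; p++)
		printf("policy %d -> %llu MiB usable\n", p,
		    (unsigned long long)
		    (adjusted_size(space, resv, (zfs_space_check_t)p) >> 20));
	return (0);
}

The tiers grant progressively more headroom: operations that net-free space may dip into half the slop (preserving the old netfree behavior, so files can still be removed from a full pool), while dsl_pool_unreserved_space() further subtracts the deferred-free total so callers are not handed space that is already spoken for.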