#include <sys/zvol.h>
#include <sys/zfs_ratelimit.h>
+/*
+ * One metaslab from each (normal-class) vdev is used by the ZIL. These are
+ * called "embedded slog metaslabs", are referenced by vdev_log_mg, and are
+ * part of the spa_embedded_log_class. The metaslab with the most free space
+ * in each vdev is selected for this purpose when the pool is opened (or a
+ * vdev is added). See vdev_metaslab_init().
+ *
+ * Log blocks can be allocated from the following locations. Each one is tried
+ * in order until the allocation succeeds:
+ * 1. dedicated log vdevs, aka "slog" (spa_log_class)
+ * 2. embedded slog metaslabs (spa_embedded_log_class)
+ * 3. other metaslabs in normal vdevs (spa_normal_class), sketched below
+ *
+ * zfs_embedded_slog_min_ms disables the embedded slog if there are fewer
+ * than this number of metaslabs in the vdev. This ensures that we don't set
+ * aside an unreasonable amount of space for the ZIL. If set to less than
+ * 1 << (spa_slop_shift + 1), then on small pools the usable space may be
+ * reduced by more than the slop space (poolsize >> spa_slop_shift) due to
+ * the embedded slog metaslab.
+ */
+int zfs_embedded_slog_min_ms = 64;
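
A minimal sketch (not part of this patch) of the fallback order described
above, as it might look at the ZIL allocation site. The hypothetical
metaslab_alloc_sketch() stands in for metaslab_alloc(), whose real argument
list (DVA count, txg, flags, allocator, ...) is elided; zio_alloc_zil() in
zio.c is the actual caller:

	/* Try each class in turn until one can satisfy the allocation. */
	static int
	zil_alloc_fallback(spa_t *spa, uint64_t size, blkptr_t *bp,
	    boolean_t *slog)
	{
		/* 1. Dedicated log vdevs ("slog"), if the pool has any. */
		int error = metaslab_alloc_sketch(spa, spa_log_class(spa),
		    size, bp);
		*slog = (error == 0);
		/* 2. The embedded slog metaslabs set aside by this change. */
		if (error != 0) {
			error = metaslab_alloc_sketch(spa,
			    spa_embedded_log_class(spa), size, bp);
		}
		/* 3. Any other metaslab in the normal class. */
		if (error != 0) {
			error = metaslab_alloc_sketch(spa,
			    spa_normal_class(spa), size, bp);
		}
		return (error);
	}

Note that the default of 64 lines up with the guidance above: spa_slop_shift
defaults to 5, and 1 << (5 + 1) = 64.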
+
/* default target for number of metaslabs per top-level vdev */
int zfs_vdev_default_ms_count = 200;
return (ops);
}
+/*
+ * Given a vdev and a metaslab class, find which metaslab group we're
+ * interested in. A vdev may belong to two different metaslab classes.
+ * Dedicated slog devices use only the primary metaslab group, rather than a
+ * separate log group. For embedded slogs, the vdev_log_mg will be non-NULL.
+ */
+metaslab_group_t *
+vdev_get_mg(vdev_t *vd, metaslab_class_t *mc)
+{
+ if (mc == spa_embedded_log_class(vd->vdev_spa) &&
+ vd->vdev_log_mg != NULL)
+ return (vd->vdev_log_mg);
+ else
+ return (vd->vdev_mg);
+}
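
A hypothetical usage sketch (not from this patch), where tvd is a top-level
vdev: resolving the embedded log class returns vdev_log_mg when one exists,
and falls back to the primary group on dedicated slog vdevs:

	metaslab_group_t *mg =
	    vdev_get_mg(tvd, spa_embedded_log_class(tvd->vdev_spa));
	ASSERT(mg == tvd->vdev_log_mg || mg == tvd->vdev_mg);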
+
/* ARGSUSED */
void
vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
metaslab_group_destroy(vd->vdev_mg);
vd->vdev_mg = NULL;
}
+ if (vd->vdev_log_mg != NULL) {
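+		/* vdev_metaslab_fini() has already torn down the metaslabs. */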
+ ASSERT0(vd->vdev_ms_count);
+ metaslab_group_destroy(vd->vdev_log_mg);
+ vd->vdev_log_mg = NULL;
+ }
ASSERT0(vd->vdev_stat.vs_space);
ASSERT0(vd->vdev_stat.vs_dspace);
if (tvd->vdev_mg)
ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg);
+ if (tvd->vdev_log_mg)
+ ASSERT3P(tvd->vdev_log_mg, ==, svd->vdev_log_mg);
tvd->vdev_mg = svd->vdev_mg;
+ tvd->vdev_log_mg = svd->vdev_log_mg;
tvd->vdev_ms = svd->vdev_ms;
svd->vdev_mg = NULL;
+ svd->vdev_log_mg = NULL;
svd->vdev_ms = NULL;
if (tvd->vdev_mg != NULL)
tvd->vdev_mg->mg_vd = tvd;
+ if (tvd->vdev_log_mg != NULL)
+ tvd->vdev_log_mg->mg_vd = tvd;
tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm;
svd->vdev_checkpoint_sm = NULL;
vdev_free(mvd);
}
-static void
+void
vdev_metaslab_group_create(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
vd->vdev_mg = metaslab_group_create(mc, vd,
spa->spa_alloc_count);
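+	/*
+	 * Unlike the primary group (spa_alloc_count allocators), the
+	 * embedded log group is created with a single allocator, since
+	 * each vdev contributes at most one embedded slog metaslab.
+	 */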
+ if (!vd->vdev_islog) {
+ vd->vdev_log_mg = metaslab_group_create(
+ spa_embedded_log_class(spa), vd, 1);
+ }
+
/*
* The spa ashift min/max only apply for the normal metaslab
 * class. Class destination is late binding so ashift boundary
vdev_metaslab_init(vdev_t *vd, uint64_t txg)
{
spa_t *spa = vd->vdev_spa;
- objset_t *mos = spa->spa_meta_objset;
- uint64_t m;
uint64_t oldc = vd->vdev_ms_count;
uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
metaslab_t **mspp;
vd->vdev_ms = mspp;
vd->vdev_ms_count = newc;
- for (m = oldc; m < newc; m++) {
- uint64_t object = 0;
+ for (uint64_t m = oldc; m < newc; m++) {
+ uint64_t object = 0;
/*
* vdev_ms_array may be 0 if we are creating the "fake"
* metaslabs for an indirect vdev for zdb's leak detection.
* See zdb_leak_init().
*/
if (txg == 0 && vd->vdev_ms_array != 0) {
- error = dmu_read(mos, vd->vdev_ms_array,
+ error = dmu_read(spa->spa_meta_objset,
+ vd->vdev_ms_array,
m * sizeof (uint64_t), sizeof (uint64_t), &object,
DMU_READ_PREFETCH);
if (error != 0) {
}
}
-#ifndef _KERNEL
- /*
- * To accommodate zdb_leak_init() fake indirect
- * metaslabs, we allocate a metaslab group for
- * indirect vdevs which normally don't have one.
- */
- if (vd->vdev_mg == NULL) {
- ASSERT0(vdev_is_concrete(vd));
- vdev_metaslab_group_create(vd);
- }
-#endif
error = metaslab_init(vd->vdev_mg, m, object, txg,
&(vd->vdev_ms[m]));
if (error != 0) {
}
}
+	/*
+	 * Find the emptiest metaslab on the vdev and mark it for use for
+	 * embedded slog by moving it from the regular to the log metaslab
+	 * group. The avl_is_empty() check ensures that we only select one
+	 * if the log group is not already populated (e.g. when an existing
+	 * vdev is expanded).
+	 */
+ if (vd->vdev_mg->mg_class == spa_normal_class(spa) &&
+ vd->vdev_ms_count > zfs_embedded_slog_min_ms &&
+ avl_is_empty(&vd->vdev_log_mg->mg_metaslab_tree)) {
+ uint64_t slog_msid = 0;
+ uint64_t smallest = UINT64_MAX;
+
+ /*
+		 * Note that we only search the new metaslabs, because the
+		 * old (pre-existing) ones may be active (e.g. have non-empty
+		 * range trees), and we don't move them to the new
+		 * metaslab_t.
+ */
+ for (uint64_t m = oldc; m < newc; m++) {
+ uint64_t alloc =
+ space_map_allocated(vd->vdev_ms[m]->ms_sm);
+ if (alloc < smallest) {
+ slog_msid = m;
+ smallest = alloc;
+ }
+ }
+ metaslab_t *slog_ms = vd->vdev_ms[slog_msid];
+ /*
+ * The metaslab was marked as dirty at the end of
+ * metaslab_init(). Remove it from the dirty list so that we
+ * can uninitialize and reinitialize it to the new class.
+ */
+ if (txg != 0) {
+ (void) txg_list_remove_this(&vd->vdev_ms_list,
+ slog_ms, txg);
+ }
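+		/*
+		 * Reinitialize the metaslab with its existing space map
+		 * object, but in the log metaslab group; only in-core state
+		 * changes, the on-disk space map is untouched.
+		 */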
+ uint64_t sm_obj = space_map_object(slog_ms->ms_sm);
+ metaslab_fini(slog_ms);
+ VERIFY0(metaslab_init(vd->vdev_log_mg, slog_msid, sm_obj, txg,
+ &vd->vdev_ms[slog_msid]));
+ }
+
if (txg == 0)
spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);
*/
if (!expanding && !vd->vdev_removing) {
metaslab_group_activate(vd->vdev_mg);
+ if (vd->vdev_log_mg != NULL)
+ metaslab_group_activate(vd->vdev_log_mg);
}
if (txg == 0)
if (vd->vdev_ms != NULL) {
metaslab_group_t *mg = vd->vdev_mg;
+
metaslab_group_passivate(mg);
+ if (vd->vdev_log_mg != NULL) {
+ ASSERT(!vd->vdev_islog);
+ metaslab_group_passivate(vd->vdev_log_mg);
+ }
uint64_t count = vd->vdev_ms_count;
for (uint64_t m = 0; m < count; m++) {
}
vmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
vd->vdev_ms = NULL;
-
vd->vdev_ms_count = 0;
- for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
+ for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
ASSERT0(mg->mg_histogram[i]);
+ if (vd->vdev_log_mg != NULL)
+ ASSERT0(vd->vdev_log_mg->mg_histogram[i]);
+ }
}
ASSERT0(vd->vdev_ms_count);
ASSERT3U(vd->vdev_pending_fastwrite, ==, 0);
!= NULL)
metaslab_sync_done(msp, txg);
- if (reassess)
+ if (reassess) {
metaslab_sync_reassess(vd->vdev_mg);
+ if (vd->vdev_log_mg != NULL)
+ metaslab_sync_reassess(vd->vdev_log_mg);
+ }
}
void
/*
* Prevent any future allocations.
*/
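+	/* Dedicated log vdevs never have an embedded slog group. */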
+ ASSERT3P(tvd->vdev_log_mg, ==, NULL);
metaslab_group_passivate(mg);
(void) spa_vdev_state_exit(spa, vd, 0);
*/
if (vd->vdev_aux == NULL && vd == vd->vdev_top &&
vdev_is_concrete(vd)) {
+ /*
+ * The vdev fragmentation rating doesn't take into
+ * account the embedded slog metaslab (vdev_log_mg).
+			 * Since it's only one metaslab (about 0.5% of the
+			 * vdev with the default zfs_vdev_default_ms_count of
+			 * 200), it would have a tiny impact on the overall
+			 * fragmentation.
+ */
vs->vs_fragmentation = (vd->vdev_mg != NULL) ?
vd->vdev_mg->mg_fragmentation : 0;
}
ZFS_MODULE_PARAM(zfs, zfs_, nocacheflush, INT, ZMOD_RW,
"Disable cache flushes");
+ZFS_MODULE_PARAM(zfs, zfs_, embedded_slog_min_ms, INT, ZMOD_RW,
+ "Minimum number of metaslabs required to dedicate one for log blocks");
+
ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, min_auto_ashift,
param_set_min_auto_ashift, param_get_ulong, ZMOD_RW,
"Minimum ashift used when creating new top-level vdevs");