/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright 2013 Nexenta Systems, Inc. All rights reserved.
*/
/*
+ * SPA: Storage Pool Allocator
+ *
* This file contains all the routines used when modifying on-disk SPA state.
* This includes opening, importing, destroying, exporting a pool, and syncing a
* pool.
typedef enum zti_modes {
ZTI_MODE_FIXED, /* value is # of threads (min 1) */
- ZTI_MODE_ONLINE_PERCENT, /* value is % of online CPUs */
ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */
ZTI_MODE_NULL, /* don't create a taskq */
ZTI_NMODES
char **ereport);
static void spa_vdev_resilver_done(spa_t *spa);
-uint_t zio_taskq_batch_pct = 100; /* 1 thread per cpu in pset */
+uint_t zio_taskq_batch_pct = 75; /* 1 thread per cpu in pset */
id_t zio_taskq_psrset_bind = PS_NONE;
boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */
uint_t zio_taskq_basedc = 80; /* base duty cycle */
err = nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_PUSHPAGE);
if (err)
- return err;
+ return (err);
mutex_enter(&spa->spa_props_lock);
switch ((int)prop) {
case ZPROP_INVAL:
if (!zpool_prop_feature(propname)) {
- error = EINVAL;
+ error = SET_ERROR(EINVAL);
break;
}
* Sanitize the input.
*/
if (nvpair_type(elem) != DATA_TYPE_UINT64) {
- error = EINVAL;
+ error = SET_ERROR(EINVAL);
break;
}
if (nvpair_value_uint64(elem, &intval) != 0) {
- error = EINVAL;
+ error = SET_ERROR(EINVAL);
break;
}
if (intval != 0) {
- error = EINVAL;
+ error = SET_ERROR(EINVAL);
break;
}
fname = strchr(propname, '@') + 1;
if (zfeature_lookup_name(fname, NULL) != 0) {
- error = EINVAL;
+ error = SET_ERROR(EINVAL);
break;
}
(intval < spa_version(spa) ||
intval > SPA_VERSION_BEFORE_FEATURES ||
has_feature))
- error = EINVAL;
+ error = SET_ERROR(EINVAL);
break;
case ZPOOL_PROP_DELEGATION:
case ZPOOL_PROP_AUTOEXPAND:
error = nvpair_value_uint64(elem, &intval);
if (!error && intval > 1)
- error = EINVAL;
+ error = SET_ERROR(EINVAL);
break;
case ZPOOL_PROP_BOOTFS:
* the bootfs property cannot be set.
*/
if (spa_version(spa) < SPA_VERSION_BOOTFS) {
- error = ENOTSUP;
+ error = SET_ERROR(ENOTSUP);
break;
}
* Make sure the vdev config is bootable
*/
if (!vdev_is_bootable(spa->spa_root_vdev)) {
- error = ENOTSUP;
+ error = SET_ERROR(ENOTSUP);
break;
}
break;
}
- if ((error = dmu_objset_hold(strval,FTAG,&os)))
+ error = dmu_objset_hold(strval, FTAG, &os);
+ if (error)
break;
/* Must be ZPL and not gzip compressed. */
if (dmu_objset_type(os) != DMU_OST_ZFS) {
- error = ENOTSUP;
+ error = SET_ERROR(ENOTSUP);
} else if ((error =
dsl_prop_get_int_ds(dmu_objset_ds(os),
zfs_prop_to_name(ZFS_PROP_COMPRESSION),
&compress)) == 0 &&
!BOOTFS_COMPRESS_VALID(compress)) {
- error = ENOTSUP;
+ error = SET_ERROR(ENOTSUP);
} else {
objnum = dmu_objset_id(os);
}
error = nvpair_value_uint64(elem, &intval);
if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
intval > ZIO_FAILURE_MODE_PANIC))
- error = EINVAL;
+ error = SET_ERROR(EINVAL);
/*
* This is a special case which only occurs when
*/
if (!error && spa_suspended(spa)) {
spa->spa_failmode = intval;
- error = EIO;
+ error = SET_ERROR(EIO);
}
break;
break;
if (strval[0] != '/') {
- error = EINVAL;
+ error = SET_ERROR(EINVAL);
break;
}
if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
strcmp(slash, "/..") == 0)
- error = EINVAL;
+ error = SET_ERROR(EINVAL);
break;
case ZPOOL_PROP_COMMENT:
break;
for (check = strval; *check != '\0'; check++) {
if (!isprint(*check)) {
- error = EINVAL;
+ error = SET_ERROR(EINVAL);
break;
}
check++;
}
if (strlen(strval) > ZPROP_MAX_COMMENT)
- error = E2BIG;
+ error = SET_ERROR(E2BIG);
break;
case ZPOOL_PROP_DEDUPDITTO:
if (spa_version(spa) < SPA_VERSION_DEDUP)
- error = ENOTSUP;
+ error = SET_ERROR(ENOTSUP);
else
error = nvpair_value_uint64(elem, &intval);
if (error == 0 &&
intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
- error = EINVAL;
+ error = SET_ERROR(EINVAL);
break;
default:
spa_config_exit(spa, SCL_STATE, FTAG);
if (vdev_state != VDEV_STATE_HEALTHY)
- return (ENXIO);
+ return (SET_ERROR(ENXIO));
ASSERT3U(spa_guid(spa), !=, *newguid);
int error;
uint64_t guid;
+ mutex_enter(&spa->spa_vdev_top_lock);
mutex_enter(&spa_namespace_lock);
guid = spa_generate_guid(NULL);
}
mutex_exit(&spa_namespace_lock);
+ mutex_exit(&spa->spa_vdev_top_lock);
return (error);
}
tqs->stqs_count = count;
tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);
- for (i = 0; i < count; i++) {
- taskq_t *tq;
-
- switch (mode) {
- case ZTI_MODE_FIXED:
- ASSERT3U(value, >=, 1);
- value = MAX(value, 1);
- break;
+ switch (mode) {
+ case ZTI_MODE_FIXED:
+ ASSERT3U(value, >=, 1);
+ value = MAX(value, 1);
+ break;
- case ZTI_MODE_BATCH:
- batch = B_TRUE;
- flags |= TASKQ_THREADS_CPU_PCT;
- value = zio_taskq_batch_pct;
- break;
+ case ZTI_MODE_BATCH:
+ batch = B_TRUE;
+ flags |= TASKQ_THREADS_CPU_PCT;
+ value = zio_taskq_batch_pct;
+ break;
- case ZTI_MODE_ONLINE_PERCENT:
- flags |= TASKQ_THREADS_CPU_PCT;
- break;
+ default:
+ panic("unrecognized mode for %s_%s taskq (%u:%u) in "
+ "spa_activate()",
+ zio_type_name[t], zio_taskq_types[q], mode, value);
+ break;
+ }
- default:
- panic("unrecognized mode for %s_%s taskq (%u:%u) in "
- "spa_activate()",
- zio_type_name[t], zio_taskq_types[q], mode, value);
- break;
- }
+ for (i = 0; i < count; i++) {
+ taskq_t *tq;
if (count > 1) {
(void) snprintf(name, sizeof (name), "%s_%s_%u",
tq = taskq_create_sysdc(name, value, 50, INT_MAX,
spa->spa_proc, zio_taskq_basedc, flags);
} else {
- tq = taskq_create_proc(name, value, maxclsyspri, 50,
+ pri_t pri = maxclsyspri;
+ /*
+ * The write issue taskq can be extremely CPU
+ * intensive. Run it at slightly lower priority
+ * than the other taskqs.
+ */
+ if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE)
+ pri--;
+
+ tq = taskq_create_proc(name, value, pri, 50,
INT_MAX, spa->spa_proc, flags);
}
if (error) {
vdev_free(*vdp);
*vdp = NULL;
- return (EINVAL);
+ return (SET_ERROR(EINVAL));
}
for (c = 0; c < children; c++) {
if (error) {
if (error != ENXIO && error != EIO)
- error = EIO;
+ error = SET_ERROR(EIO);
return (error);
}
nvlist_t *nvl;
if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
- return (EINVAL);
+ return (SET_ERROR(EINVAL));
ASSERT(spa->spa_comment == NULL);
if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
spa_guid_exists(pool_guid, 0)) {
- error = EEXIST;
+ error = SET_ERROR(EEXIST);
} else {
spa->spa_config_guid = pool_guid;
spa->spa_load_state = state;
if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot))
- return (EINVAL);
+ return (SET_ERROR(EINVAL));
parse = (type == SPA_IMPORT_EXISTING ?
VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
return (error);
if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
- return (ENXIO);
+ return (SET_ERROR(ENXIO));
}
/*
hostid != myhostid) {
nvlist_free(nvconfig);
cmn_err(CE_WARN, "pool '%s' could not be "
- "loaded as it was last accessed by "
- "another system (host: %s hostid: 0x%lx). "
- "See: http://zfsonlinux.org/msg/ZFS-8000-EY",
+ "loaded as it was last accessed by another "
+ "system (host: %s hostid: 0x%lx). See: "
+ "http://zfsonlinux.org/msg/ZFS-8000-EY",
spa_name(spa), hostname,
(unsigned long)hostid);
- return (EBADF);
+ return (SET_ERROR(EBADF));
}
}
if (nvlist_lookup_nvlist(spa->spa_config,
* more toplevel vdevs are faulted.
*/
if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
- return (ENXIO);
+ return (SET_ERROR(ENXIO));
if (spa_check_logs(spa)) {
*ereport = FM_EREPORT_ZFS_LOG_REPLAY;
if ((spa = spa_lookup(pool)) == NULL) {
if (locked)
mutex_exit(&spa_namespace_lock);
- return (ENOENT);
+ return (SET_ERROR(ENOENT));
}
if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
spa_remove(spa);
if (locked)
mutex_exit(&spa_namespace_lock);
- return (ENOENT);
+ return (SET_ERROR(ENOENT));
}
if (error) {
return (0);
if (ndev == 0)
- return (EINVAL);
+ return (SET_ERROR(EINVAL));
/*
* Make sure the pool is formatted with a version that supports this
* device type.
*/
if (spa_version(spa) < version)
- return (ENOTSUP);
+ return (SET_ERROR(ENOTSUP));
/*
* Set the pending device list so we correctly handle device in-use
if (!vd->vdev_ops->vdev_op_leaf) {
vdev_free(vd);
- error = EINVAL;
+ error = SET_ERROR(EINVAL);
goto out;
}
#ifdef _KERNEL
if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
- error = ENOTBLK;
+ error = SET_ERROR(ENOTBLK);
vdev_free(vd);
goto out;
}
mutex_enter(&spa_namespace_lock);
if (spa_lookup(pool) != NULL) {
mutex_exit(&spa_namespace_lock);
- return (EEXIST);
+ return (SET_ERROR(EEXIST));
}
/*
ASSERT(error != 0 || spa->spa_root_vdev == rvd);
if (error == 0 && !zfs_allocatable_devs(nvroot))
- error = EINVAL;
+ error = SET_ERROR(EINVAL);
if (error == 0 &&
(error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
if (config == NULL) {
cmn_err(CE_NOTE, "Cannot read the pool label from '%s'",
devpath);
- return (EIO);
+ return (SET_ERROR(EIO));
}
VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu",
(u_longlong_t)guid);
- error = ENOENT;
+ error = SET_ERROR(ENOENT);
goto out;
}
if (avd != bvd) {
cmn_err(CE_NOTE, "The boot device is 'degraded'. Please "
"try booting from '%s'", avd->vdev_path);
- error = EINVAL;
+ error = SET_ERROR(EINVAL);
goto out;
}
"try booting from '%s'",
bvd->vdev_parent->
vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path);
- error = EINVAL;
+ error = SET_ERROR(EINVAL);
goto out;
}
mutex_enter(&spa_namespace_lock);
if (spa_lookup(pool) != NULL) {
mutex_exit(&spa_namespace_lock);
- return (EEXIST);
+ return (SET_ERROR(EEXIST));
}
/*
if (dsl_dsobj_to_dsname(spa_name(spa),
spa->spa_bootfs, tmpname) == 0) {
char *cp;
- char *dsname = kmem_alloc(MAXPATHLEN, KM_PUSHPAGE);
+ char *dsname;
+
+ dsname = kmem_alloc(MAXPATHLEN, KM_PUSHPAGE);
cp = strchr(tmpname, '/');
if (cp == NULL) {
*oldconfig = NULL;
if (!(spa_mode_global & FWRITE))
- return (EROFS);
+ return (SET_ERROR(EROFS));
mutex_enter(&spa_namespace_lock);
if ((spa = spa_lookup(pool)) == NULL) {
mutex_exit(&spa_namespace_lock);
- return (ENOENT);
+ return (SET_ERROR(ENOENT));
}
/*
new_state != POOL_STATE_UNINITIALIZED)) {
spa_async_resume(spa);
mutex_exit(&spa_namespace_lock);
- return (EBUSY);
+ return (SET_ERROR(EBUSY));
}
/*
spa_has_active_shared_spare(spa)) {
spa_async_resume(spa);
mutex_exit(&spa_namespace_lock);
- return (EXDEV);
+ return (SET_ERROR(EXDEV));
}
/*
spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
{
uint64_t txg, dtl_max_txg;
- ASSERTV(vdev_t *rvd = spa->spa_root_vdev;)
vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
vdev_ops_t *pvops;
char *oldvdpath, *newvdpath;
int newvd_isspare;
int error;
+ ASSERTV(vdev_t *rvd = spa->spa_root_vdev);
ASSERT(spa_writeable(spa));
}
/* mark the device being resilvered */
- newvd->vdev_resilvering = B_TRUE;
+ newvd->vdev_resilver_txg = txg;
/*
* If the parent is not a mirror, or if we're replacing, insert the new
/*
* Detach a device from a mirror or replacing vdev.
+ *
* If 'replace_done' is specified, only detach if the parent
* is a replacing vdev.
*/
{
uint64_t txg;
int error;
- ASSERTV(vdev_t *rvd = spa->spa_root_vdev;)
vdev_t *vd, *pvd, *cvd, *tvd;
boolean_t unspare = B_FALSE;
uint64_t unspare_guid = 0;
char *vdpath;
int c, t;
-
+ ASSERTV(vdev_t *rvd = spa->spa_root_vdev);
ASSERT(spa_writeable(spa));
txg = spa_vdev_enter(spa);
if (pvd->vdev_ops == &vdev_spare_ops)
cvd->vdev_unspare = B_FALSE;
vdev_remove_parent(cvd);
- cvd->vdev_resilvering = B_FALSE;
}
spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
continue;
} else {
- error = EINVAL;
+ error = SET_ERROR(EINVAL);
break;
}
}
/* which disk is going to be split? */
if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
&glist[c]) != 0) {
- error = EINVAL;
+ error = SET_ERROR(EINVAL);
break;
}
/* look it up in the spa */
vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
if (vml[c] == NULL) {
- error = ENODEV;
+ error = SET_ERROR(ENODEV);
break;
}
vml[c]->vdev_children != 0 ||
vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
- error = EINVAL;
+ error = SET_ERROR(EINVAL);
break;
}
if (vdev_dtl_required(vml[c])) {
- error = EBUSY;
+ error = SET_ERROR(EBUSY);
break;
}
if (vd->vdev_stat.vs_alloc != 0)
error = spa_offline_log(spa);
} else {
- error = ENOTSUP;
+ error = SET_ERROR(ENOTSUP);
}
if (error)
* the spa_vdev_config_[enter/exit] functions which allow us to
* grab and release the spa_config_lock while still holding the namespace
* lock. During each step the configuration is synced out.
- */
-
-/*
- * Remove a device from the pool. Currently, this supports removing only hot
- * spares, slogs, and level 2 ARC devices.
+ *
+ * Currently, this supports removing only hot spares, slogs, and level 2 ARC
+ * devices.
*/
int
spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
spa_load_spares(spa);
spa->spa_spares.sav_sync = B_TRUE;
} else {
- error = EBUSY;
+ error = SET_ERROR(EBUSY);
}
} else if (spa->spa_l2cache.sav_vdevs != NULL &&
nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
/*
* Normal vdevs cannot be removed (yet).
*/
- error = ENOTSUP;
+ error = SET_ERROR(ENOTSUP);
} else {
/*
* There is no vdev of any kind with the specified guid.
*/
- error = ENOENT;
+ error = SET_ERROR(ENOENT);
}
if (!locked)
/*
* Find any device that's done replacing, or a vdev marked 'unspare' that's
- * current spared, so we can detach it.
+ * currently spared, so we can detach it.
*/
static vdev_t *
spa_vdev_resilver_done_hunt(vdev_t *vd)
ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
sguid = ppvd->vdev_child[1]->vdev_guid;
}
+ ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd));
+
spa_config_exit(spa, SCL_ALL, FTAG);
if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
return;
{
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
if (dsl_scan_resilvering(spa->spa_dsl_pool))
- return (EBUSY);
+ return (SET_ERROR(EBUSY));
return (dsl_scan_cancel(spa->spa_dsl_pool));
}
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
- return (ENOTSUP);
+ return (SET_ERROR(ENOTSUP));
/*
* If a resilver was requested, but there is no DTL on a
return (0);
}
+/*
+ * Note: this simple function is not inlined to make it easier to dtrace the
+ * amount of time spent syncing frees.
+ */
+static void
+spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx)
+{
+ zio_t *zio = zio_root(spa, NULL, NULL, 0);
+ bplist_iterate(bpl, spa_free_sync_cb, zio, tx);
+ VERIFY(zio_wait(zio) == 0);
+}
+
+/*
+ * Note: this simple function is not inlined to make it easier to dtrace the
+ * amount of time spent syncing deferred frees.
+ */
+static void
+spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
+{
+ zio_t *zio = zio_root(spa, NULL, NULL, 0);
+ VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
+ spa_free_sync_cb, zio, tx), ==, 0);
+ VERIFY0(zio_wait(zio));
+}
+
static void
spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
{
if (sav->sav_count == 0) {
VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
} else {
- list = kmem_alloc(sav->sav_count * sizeof (void *), KM_PUSHPAGE);
+ list = kmem_alloc(sav->sav_count*sizeof (void *), KM_PUSHPAGE);
for (i = 0; i < sav->sav_count; i++)
list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
B_FALSE, VDEV_CONFIG_L2CACHE);
{
dsl_pool_t *dp = spa->spa_dsl_pool;
objset_t *mos = spa->spa_meta_objset;
- bpobj_t *defer_bpo = &spa->spa_deferred_bpobj;
bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *vd;
!txg_list_empty(&dp->dp_sync_tasks, txg) ||
((dsl_scan_active(dp->dp_scan) ||
txg_sync_waiting(dp)) && !spa_shutting_down(spa))) {
- zio_t *zio = zio_root(spa, NULL, NULL, 0);
- VERIFY3U(bpobj_iterate(defer_bpo,
- spa_free_sync_cb, zio, tx), ==, 0);
- VERIFY0(zio_wait(zio));
+ spa_sync_deferred_frees(spa, tx);
}
/*
dsl_pool_sync(dp, txg);
if (pass < zfs_sync_pass_deferred_free) {
- zio_t *zio = zio_root(spa, NULL, NULL, 0);
- bplist_iterate(free_bpl, spa_free_sync_cb,
- zio, tx);
- VERIFY(zio_wait(zio) == 0);
+ spa_sync_frees(spa, free_bpl, tx);
} else {
bplist_iterate(free_bpl, bpobj_enqueue_cb,
- defer_bpo, tx);
+ &spa->spa_deferred_bpobj, tx);
}
ddt_sync(spa, txg);