From 4589f3ae4c1bb435777da8640eb915f3c713b14d Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Thu, 29 Mar 2018 14:50:40 -0700 Subject: [PATCH] Optimize possible split block search space Remove duplicate segment copies to minimize the possible search space for reconstruction. Once reduced an accurate assessment can be made regarding the difficulty in reconstructing the block. Also, ztest will now run zdb with zfs_reconstruct_indirect_combinations_max set to 1000000 in an attempt to avoid checksum errors. Reviewed-by: Matthew Ahrens Reviewed-by: Tim Chase Signed-off-by: Brian Behlendorf Closes #6900 --- cmd/ztest/ztest.c | 3 +- man/man5/zfs-module-parameters.5 | 24 +++--- module/zfs/vdev_indirect.c | 124 ++++++++++++++++++++++--------- 3 files changed, 103 insertions(+), 48 deletions(-) diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index c81d446a5..315169571 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -6221,7 +6221,8 @@ ztest_run_zdb(char *pool) ztest_get_zdb_bin(bin, len); (void) sprintf(zdb, - "%s -bcc%s%s -G -d -U %s %s", + "%s -bcc%s%s -G -d -U %s " + "-o zfs_reconstruct_indirect_combinations_max=1000000 %s", bin, ztest_opts.zo_verbose >= 3 ? "s" : "", ztest_opts.zo_verbose >= 4 ? "v" : "", diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index 6e670facb..8b007ea17 100644 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -1742,18 +1742,18 @@ Use \fB1\fR for yes and \fB0\fR for no (default). .sp .ne 2 .na -\fBzfs_reconstruct_indirect_segments_max\fR (int) -.ad -.RS 12n -When a split block which is part of a indirect vdev contains more than this -many segments, consider it too computationally expensive to check all possible -combinations. Instead, operate under the assumption that only a few segment -copies are damaged and the majority of segment copies are good, in which case -it is reasonable to randomly select sample combinations. This allows all the -segment copies to participate fairly in the reconstruction and prevents the -repeated use of one bad copy. -.sp -Default value: \fB10\fR. +\fBzfs_reconstruct_indirect_combinations_max\fR (int) +.ad +.RS 12na +If an indirect split block contains more than this many possible unique +combinations when being reconstructed, consider it too computationally +expensive to check them all. Instead, try at most +\fBzfs_reconstruct_indirect_combinations_max\fR randomly-selected +combinations each time the block is accessed. This allows all segment +copies to participate fairly in the reconstruction when all combinations +cannot be checked and prevents repeated use of one bad copy. +.sp +Default value: \fB100\fR. .RE .sp diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c index 3ccdfee3b..b30ddaf26 100644 --- a/module/zfs/vdev_indirect.c +++ b/module/zfs/vdev_indirect.c @@ -204,17 +204,14 @@ unsigned long zfs_condense_min_mapping_bytes = 128 * 1024; int zfs_condense_indirect_commit_entry_delay_ms = 0; /* - * If a split block contains more than this many segments, consider it too - * computationally expensive to check all (2^num_segments) possible - * combinations. Instead, try at most 2^_segments_max randomly-selected - * combinations. - * - * This is reasonable if only a few segment copies are damaged and the - * majority of segment copies are good. It allows all segment copies to - * participate fairly in the reconstruction and prevents repeated use of - * one bad copy. + * If an indirect split block contains more than this many possible unique + * combinations when being reconstructed, consider it too computationally + * expensive to check them all. Instead, try at most 100 randomly-selected + * combinations each time the block is accessed. This allows all segment + * copies to participate fairly in the reconstruction when all combinations + * cannot be checked and prevents repeated use of one bad copy. */ -int zfs_reconstruct_indirect_segments_max = 10; +int zfs_reconstruct_indirect_combinations_max = 100; /* * The indirect_child_t represents the vdev that we will read from, when we @@ -226,6 +223,12 @@ int zfs_reconstruct_indirect_segments_max = 10; typedef struct indirect_child { abd_t *ic_data; vdev_t *ic_vdev; + + /* + * ic_duplicate is -1 when the ic_data contents are unique, when it + * is determined to be a duplicate it refers to the primary child. + */ + int ic_duplicate; } indirect_child_t; /* @@ -1165,6 +1168,7 @@ vdev_indirect_read_all(zio_t *zio) ic->ic_data = abd_alloc_sametype(zio->io_abd, is->is_size); + ic->ic_duplicate = -1; zio_nowait(zio_vdev_child_io(zio, NULL, ic->ic_vdev, is->is_target_offset, ic->ic_data, @@ -1314,7 +1318,7 @@ vdev_indirect_repair(zio_t *zio) continue; if (ic->ic_data == NULL) continue; - if (abd_cmp(good_child->ic_data, ic->ic_data) == 0) + if (ic->ic_duplicate == is->is_good_child) continue; zio_nowait(zio_vdev_child_io(zio, NULL, @@ -1367,15 +1371,16 @@ vdev_indirect_all_checksum_errors(zio_t *zio) * * If we pointed to any mirror vdevs, this effectively does the job of the * mirror. The mirror vdev code can't do its own job because we don't know - * the checksum of each split segment individually. We have to try every - * combination of copies of split segments, until we find one that checksums - * correctly. (Or until we have tried all combinations, or have tried - * 2^zfs_reconstruct_indirect_segments_max combinations. In these cases we - * set io_error to ECKSUM to propagate the error up to the user.) + * the checksum of each split segment individually. * - * For example, if we have 3 segments in the split, - * and each points to a 2-way mirror, we will have the following pieces of - * data: + * We have to try every unique combination of copies of split segments, until + * we find one that checksums correctly. Duplicate segment copies are first + * discarded as an optimization to reduce the search space. After pruning + * there will exist at most one valid combination. + * + * When the total number of combinations is small they can all be checked. + * For example, if we have 3 segments in the split, and each points to a + * 2-way mirror with unique copies, we will have the following pieces of data: * * | mirror child * split | [0] [1] @@ -1414,12 +1419,48 @@ vdev_indirect_reconstruct_io_done(zio_t *zio) { indirect_vsd_t *iv = zio->io_vsd; uint64_t attempts = 0; - uint64_t attempts_max = 1ULL << zfs_reconstruct_indirect_segments_max; - int segments = 0; + uint64_t attempts_max = UINT64_MAX; + uint64_t combinations = 1; + + if (zfs_reconstruct_indirect_combinations_max > 0) + attempts_max = zfs_reconstruct_indirect_combinations_max; + /* + * Discard duplicate copies of split segments to minimize the + * number of unique combinations when attempting reconstruction. + */ for (indirect_split_t *is = list_head(&iv->iv_splits); - is != NULL; is = list_next(&iv->iv_splits, is)) - segments++; + is != NULL; is = list_next(&iv->iv_splits, is)) { + uint64_t is_copies = 0; + + for (int i = 0; i < is->is_children; i++) { + if (is->is_child[i].ic_data == NULL) + continue; + + for (int j = i + 1; j < is->is_children; j++) { + if (is->is_child[j].ic_data == NULL) + continue; + + if (is->is_child[j].ic_duplicate == -1 && + abd_cmp(is->is_child[i].ic_data, + is->is_child[j].ic_data) == 0) { + is->is_child[j].ic_duplicate = i; + } + } + + is_copies++; + } + + /* Reconstruction is impossible, no valid is->is_child[] */ + if (is_copies == 0) { + zio->io_error = EIO; + vdev_indirect_all_checksum_errors(zio); + zio_checksum_verified(zio); + return; + } + + combinations *= is_copies; + } for (;;) { /* copy data from splits to main zio */ @@ -1436,6 +1477,15 @@ vdev_indirect_reconstruct_io_done(zio_t *zio) goto next; } + /* + * If this child is a duplicate, its is_duplicate will + * refer to the primary copy. Skip this combination. + */ + if (is->is_child[is->is_good_child].ic_duplicate >= 0) { + ret = ECKSUM; + goto next; + } + abd_copy_off(zio->io_abd, is->is_child[is->is_good_child].ic_data, is->is_split_offset, 0, is->is_size); @@ -1458,14 +1508,14 @@ vdev_indirect_reconstruct_io_done(zio_t *zio) boolean_t more; next: more = B_FALSE; - if (segments <= zfs_reconstruct_indirect_segments_max) { + if (combinations <= attempts_max) { /* - * There are relatively few segments, so - * deterministically check all combinations. We do - * this by by adding one to the first split's - * good_child. If it overflows, then "carry over" to - * the next split (like counting in base is_children, - * but each digit can have a different base). + * There are relatively few possible combinations, so + * deterministically check them all. We do this by + * adding one to the first split's good_child. If it + * overflows, then "carry over" to the next split + * (like counting in base is_children, but each + * digit can have a different base). */ for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { @@ -1485,8 +1535,12 @@ next: */ for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { - is->is_good_child = - spa_get_random(is->is_children); + int c = spa_get_random(is->is_children); + + while (is->is_child[c].ic_duplicate >= 0) + c = (c + 1) % is->is_children; + + is->is_good_child = c; } more = B_TRUE; } @@ -1577,7 +1631,7 @@ module_param(zfs_condense_indirect_commit_entry_delay_ms, int, 0644); MODULE_PARM_DESC(zfs_condense_indirect_commit_entry_delay_ms, "Delay while condensing vdev mapping"); -module_param(zfs_reconstruct_indirect_segments_max, int, 0644); -MODULE_PARM_DESC(zfs_reconstruct_indirect_segments_max, - "Maximum number of split segments check all combinations"); +module_param(zfs_reconstruct_indirect_combinations_max, int, 0644); +MODULE_PARM_DESC(zfs_reconstruct_indirect_combinations_max, + "Maximum number of combinations when reconstructing split segments"); #endif -- 2.39.2