]> git.proxmox.com Git - ceph.git/blob - patches/0008-os-bluestore-handle-spurious-read-errors.patch
cherry-pick - os/bluestore: handle spurious read errors
[ceph.git] / patches / 0008-os-bluestore-handle-spurious-read-errors.patch
1 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2 From: Alwin Antreich <a.antreich@proxmox.com>
3 Date: Thu, 6 Dec 2018 12:14:04 +0100
4 Subject: [PATCH] os/bluestore: handle spurious read errors
5
6 Some kernels (4.9+) sometime fail to return data when reading
7 from a block device under memory pressure. This patch retries
8 the read if the checksum verification fails, tests show that
9 the first retried read succeeds in ~99.5% of the cases, so
10 3 attempts are made by default before giving up on the data.
11
12 Works-around: http://tracker.ceph.com/issues/22464
13 Signed-off-by: Paul Emmerich <paul.emmerich@croit.io>
14 (cherry picked from commit cffcbc73aaaa874829d5fc9091af3042b887f9a7)
15
16 Conflicts:
17 src/common/legacy_config_opts.h
18 - adjacent options
19 src/common/options.cc
20 - no RUNTIME flag in luminous
21 src/os/bluestore/BlueStore.cc
22 src/os/bluestore/BlueStore.h
23 - adjacent perfcounter
24 src/test/objectstore/store_test.cc
25 - adjacent tests, no #ifdef
26 - g_conf, not g_conf()
27 - no create_new_collection
28 - queue_transaction etc take osr, not ch
29
30 Signed-off-by: Alwin Antreich <a.antreich@proxmox.com>
31 ---
32 src/common/legacy_config_opts.h | 2 +
33 src/common/options.cc | 10 +++++
34 src/os/bluestore/BlueStore.cc | 35 ++++++++++++++--
35 src/os/bluestore/BlueStore.h | 4 +-
36 src/test/objectstore/store_test.cc | 67 ++++++++++++++++++++++++++++++
37 5 files changed, 114 insertions(+), 4 deletions(-)
38
39 diff --git a/src/common/legacy_config_opts.h b/src/common/legacy_config_opts.h
40 index a51870ef6..54d2b578b 100644
41 --- a/src/common/legacy_config_opts.h
42 +++ b/src/common/legacy_config_opts.h
43 @@ -1031,6 +1031,7 @@ OPTION(bluestore_block_preallocate_file, OPT_BOOL) //whether preallocate space i
44 OPTION(bluestore_csum_type, OPT_STR) // none|xxhash32|xxhash64|crc32c|crc32c_16|crc32c_8
45 OPTION(bluestore_csum_min_block, OPT_U32)
46 OPTION(bluestore_csum_max_block, OPT_U32)
47 +OPTION(bluestore_retry_disk_reads, OPT_U64)
48 OPTION(bluestore_min_alloc_size, OPT_U32)
49 OPTION(bluestore_min_alloc_size_hdd, OPT_U32)
50 OPTION(bluestore_min_alloc_size_ssd, OPT_U32)
51 @@ -1124,6 +1125,7 @@ OPTION(bluestore_debug_omit_kv_commit, OPT_BOOL)
52 OPTION(bluestore_debug_permit_any_bdev_label, OPT_BOOL)
53 OPTION(bluestore_shard_finishers, OPT_BOOL)
54 OPTION(bluestore_debug_random_read_err, OPT_DOUBLE)
55 +OPTION(bluestore_debug_inject_csum_err_probability, OPT_FLOAT)
56
57 OPTION(kstore_max_ops, OPT_U64)
58 OPTION(kstore_max_bytes, OPT_U64)
59 diff --git a/src/common/options.cc b/src/common/options.cc
60 index ff3bb1a1b..8983210d9 100644
61 --- a/src/common/options.cc
62 +++ b/src/common/options.cc
63 @@ -3422,6 +3422,12 @@ std::vector<Option> get_global_options() {
64 .set_description("Maximum block size to checksum")
65 .add_see_also("bluestore_csum_min_block"),
66
67 + Option("bluestore_retry_disk_reads", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
68 + .set_default(3)
69 + .set_min_max(0, 255)
70 + .set_description("Number of read retries on checksum validation error")
71 + .set_long_description("Retries to read data from the disk this many times when checksum validation fails to handle spurious read errors gracefully."),
72 +
73 Option("bluestore_min_alloc_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
74 .set_default(0)
75 .add_tag("mkfs")
76 @@ -3791,6 +3797,10 @@ std::vector<Option> get_global_options() {
77 .set_default(0)
78 .set_description(""),
79
80 + Option("bluestore_debug_inject_csum_err_probability", Option::TYPE_FLOAT, Option::LEVEL_DEV)
81 + .set_default(0.0)
82 + .set_description("inject crc verification errors into bluestore device reads"),
83 +
84 // -----------------------------------------
85 // kstore
86
87 diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc
88 index b326d39bc..28ababbe5 100644
89 --- a/src/os/bluestore/BlueStore.cc
90 +++ b/src/os/bluestore/BlueStore.cc
91 @@ -4171,6 +4171,8 @@ void BlueStore::_init_logger()
92 "collection");
93 b.add_u64_counter(l_bluestore_read_eio, "bluestore_read_eio",
94 "Read EIO errors propagated to high level callers");
95 + b.add_u64_counter(l_bluestore_reads_with_retries, "bluestore_reads_with_retries",
96 + "Read operations that required at least one retry due to failed checksum validation");
97 logger = b.create_perf_counters();
98 cct->get_perfcounters_collection()->add(logger);
99 }
100 @@ -6698,7 +6700,8 @@ int BlueStore::_do_read(
101 uint64_t offset,
102 size_t length,
103 bufferlist& bl,
104 - uint32_t op_flags)
105 + uint32_t op_flags,
106 + uint64_t retry_count)
107 {
108 FUNCTRACE();
109 int r = 0;
110 @@ -6919,7 +6922,14 @@ int BlueStore::_do_read(
111 bufferlist& compressed_bl = *p++;
112 if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
113 b2r_it->second.front().logical_offset) < 0) {
114 - return -EIO;
115 + // Handles spurious read errors caused by a kernel bug.
116 + // We sometimes get all-zero pages as a result of the read under
117 + // high memory pressure. Retrying the failing read succeeds in most cases.
118 + // See also: http://tracker.ceph.com/issues/22464
119 + if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
120 + return -EIO;
121 + }
122 + return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
123 }
124 bufferlist raw_bl;
125 r = _decompress(compressed_bl, &raw_bl);
126 @@ -6937,7 +6947,14 @@ int BlueStore::_do_read(
127 for (auto& reg : b2r_it->second) {
128 if (_verify_csum(o, &bptr->get_blob(), reg.r_off, reg.bl,
129 reg.logical_offset) < 0) {
130 - return -EIO;
131 + // Handles spurious read errors caused by a kernel bug.
132 + // We sometimes get all-zero pages as a result of the read under
133 + // high memory pressure. Retrying the failing read succeeds in most cases.
134 + // See also: http://tracker.ceph.com/issues/22464
135 + if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
136 + return -EIO;
137 + }
138 + return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
139 }
140 if (buffered) {
141 bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
142 @@ -6981,6 +6998,11 @@ int BlueStore::_do_read(
143 assert(pos == length);
144 assert(pr == pr_end);
145 r = bl.length();
146 + if (retry_count) {
147 + logger->inc(l_bluestore_reads_with_retries);
148 + dout(5) << __func__ << " read at 0x" << std::hex << offset << "~" << length
149 + << " failed " << std::dec << retry_count << " times before succeeding" << dendl;
150 + }
151 return r;
152 }
153
154 @@ -6993,6 +7015,13 @@ int BlueStore::_verify_csum(OnodeRef& o,
155 uint64_t bad_csum;
156 utime_t start = ceph_clock_now();
157 int r = blob->verify_csum(blob_xoffset, bl, &bad, &bad_csum);
158 + if (cct->_conf->bluestore_debug_inject_csum_err_probability > 0 &&
159 + (rand() % 10000) < cct->_conf->bluestore_debug_inject_csum_err_probability * 10000.0) {
160 + derr << __func__ << " injecting bluestore checksum verifcation error" << dendl;
161 + bad = blob_xoffset;
162 + r = -1;
163 + bad_csum = 0xDEADBEEF;
164 + }
165 if (r < 0) {
166 if (r == -1) {
167 PExtentVector pex;
168 diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h
169 index 6a14042df..70050a567 100644
170 --- a/src/os/bluestore/BlueStore.h
171 +++ b/src/os/bluestore/BlueStore.h
172 @@ -118,6 +118,7 @@ enum {
173 l_bluestore_extent_compress,
174 l_bluestore_gc_merged,
175 l_bluestore_read_eio,
176 + l_bluestore_reads_with_retries,
177 l_bluestore_last
178 };
179
180 @@ -2366,7 +2367,8 @@ public:
181 uint64_t offset,
182 size_t len,
183 bufferlist& bl,
184 - uint32_t op_flags = 0);
185 + uint32_t op_flags = 0,
186 + uint64_t retry_count = 0);
187
188 private:
189 int _fiemap(CollectionHandle &c_, const ghobject_t& oid,
190 diff --git a/src/test/objectstore/store_test.cc b/src/test/objectstore/store_test.cc
191 index d2152d456..326c9785d 100644
192 --- a/src/test/objectstore/store_test.cc
193 +++ b/src/test/objectstore/store_test.cc
194 @@ -6855,6 +6855,73 @@ TEST_P(StoreTestSpecificAUSize, fsckOnUnalignedDevice2) {
195 g_conf->apply_changes(NULL);
196 }
197
198 +TEST_P(StoreTest, SpuriousReadErrorTest) {
199 + if (string(GetParam()) != "bluestore")
200 + return;
201 +
202 + ObjectStore::Sequencer osr("test");
203 + int r;
204 + auto logger = store->get_perf_counters();
205 + coll_t cid;
206 + ghobject_t hoid(hobject_t(sobject_t("foo", CEPH_NOSNAP)));
207 + {
208 + ObjectStore::Transaction t;
209 + t.create_collection(cid, 0);
210 + cerr << "Creating collection " << cid << std::endl;
211 + r = store->queue_transaction(&osr, std::move(t), nullptr);
212 + ASSERT_EQ(r, 0);
213 + }
214 + bufferlist test_data;
215 + bufferptr ap(0x2000);
216 + memset(ap.c_str(), 'a', 0x2000);
217 + test_data.append(ap);
218 + {
219 + ObjectStore::Transaction t;
220 + t.write(cid, hoid, 0, 0x2000, test_data);
221 + r = store->queue_transaction(&osr, std::move(t), nullptr);
222 + ASSERT_EQ(r, 0);
223 + // force cache clear
224 + EXPECT_EQ(store->umount(), 0);
225 + EXPECT_EQ(store->mount(), 0);
226 + }
227 +
228 + cerr << "Injecting CRC error with no retry, expecting EIO" << std::endl;
229 + g_conf->set_val("bluestore_retry_disk_reads", "0");
230 + g_conf->set_val("bluestore_debug_inject_csum_err_probability", "1");
231 + g_ceph_context->_conf->apply_changes(nullptr);
232 + {
233 + bufferlist in;
234 + r = store->read(cid, hoid, 0, 0x2000, in, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
235 + ASSERT_EQ(-EIO, r);
236 + ASSERT_EQ(logger->get(l_bluestore_read_eio), 1u);
237 + ASSERT_EQ(logger->get(l_bluestore_reads_with_retries), 0u);
238 + }
239 +
240 + cerr << "Injecting CRC error with retries, expecting success after several retries" << std::endl;
241 + g_conf->set_val("bluestore_retry_disk_reads", "255");
242 + g_conf->set_val("bluestore_debug_inject_csum_err_probability", "0.8");
243 + /**
244 + * Probabilistic test: 25 reads, each has a 80% chance of failing with 255 retries
245 + * Probability of at least one retried read: 1 - (0.2 ** 25) = 100% - 3e-18
246 + * Probability of a random test failure: 1 - ((1 - (0.8 ** 255)) ** 25) ~= 5e-24
247 + */
248 + g_ceph_context->_conf->apply_changes(nullptr);
249 + {
250 + for (int i = 0; i < 25; ++i) {
251 + bufferlist in;
252 + r = store->read(cid, hoid, 0, 0x2000, in, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
253 + ASSERT_EQ(0x2000, r);
254 + ASSERT_TRUE(bl_eq(test_data, in));
255 + }
256 + ASSERT_GE(logger->get(l_bluestore_reads_with_retries), 1u);
257 + }
258 +
259 + // revert
260 + g_conf->set_val("bluestore_retry_disk_reads", "3");
261 + g_conf->set_val("bluestore_debug_inject_csum_err_probability", "0");
262 + g_ceph_context->_conf->apply_changes(nullptr);
263 +}
264 +
265 int main(int argc, char **argv) {
266 vector<const char*> args;
267 argv_to_vec(argc, (const char **)argv, args);
268 --
269 2.19.2
270