]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2013,2014 Inktank Storage, Inc. | |
7 | * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com> | |
8 | * | |
9 | * Author: Loic Dachary <loic@dachary.org> | |
10 | * | |
11 | * This is free software; you can redistribute it and/or | |
12 | * modify it under the terms of the GNU Lesser General Public | |
13 | * License version 2.1, as published by the Free Software | |
14 | * Foundation. See file COPYING. | |
15 | * | |
16 | */ | |
17 | ||
18 | ||
19 | #include "common/errno.h" | |
20 | #include "common/scrub_types.h" | |
21 | #include "ReplicatedBackend.h" | |
22 | #include "ScrubStore.h" | |
23 | #include "ECBackend.h" | |
24 | #include "PGBackend.h" | |
25 | #include "OSD.h" | |
26 | #include "erasure-code/ErasureCodePlugin.h" | |
27 | #include "OSDMap.h" | |
28 | #include "PGLog.h" | |
29 | #include "common/LogClient.h" | |
30 | ||
31 | #define dout_context cct | |
32 | #define dout_subsys ceph_subsys_osd | |
33 | #define DOUT_PREFIX_ARGS this | |
34 | #undef dout_prefix | |
35 | #define dout_prefix _prefix(_dout, this) | |
36 | static ostream& _prefix(std::ostream *_dout, PGBackend *pgb) { | |
37 | return *_dout << pgb->get_parent()->gen_dbg_prefix(); | |
38 | } | |
39 | ||
40 | void PGBackend::rollback( | |
41 | const pg_log_entry_t &entry, | |
42 | ObjectStore::Transaction *t) | |
43 | { | |
44 | ||
45 | struct RollbackVisitor : public ObjectModDesc::Visitor { | |
46 | const hobject_t &hoid; | |
47 | PGBackend *pg; | |
48 | ObjectStore::Transaction t; | |
49 | RollbackVisitor( | |
50 | const hobject_t &hoid, | |
51 | PGBackend *pg) : hoid(hoid), pg(pg) {} | |
52 | void append(uint64_t old_size) override { | |
53 | ObjectStore::Transaction temp; | |
54 | pg->rollback_append(hoid, old_size, &temp); | |
55 | temp.append(t); | |
56 | temp.swap(t); | |
57 | } | |
58 | void setattrs(map<string, boost::optional<bufferlist> > &attrs) override { | |
59 | ObjectStore::Transaction temp; | |
60 | pg->rollback_setattrs(hoid, attrs, &temp); | |
61 | temp.append(t); | |
62 | temp.swap(t); | |
63 | } | |
64 | void rmobject(version_t old_version) override { | |
65 | ObjectStore::Transaction temp; | |
66 | pg->rollback_stash(hoid, old_version, &temp); | |
67 | temp.append(t); | |
68 | temp.swap(t); | |
69 | } | |
70 | void try_rmobject(version_t old_version) override { | |
71 | ObjectStore::Transaction temp; | |
72 | pg->rollback_try_stash(hoid, old_version, &temp); | |
73 | temp.append(t); | |
74 | temp.swap(t); | |
75 | } | |
76 | void create() override { | |
77 | ObjectStore::Transaction temp; | |
78 | pg->rollback_create(hoid, &temp); | |
79 | temp.append(t); | |
80 | temp.swap(t); | |
81 | } | |
82 | void update_snaps(const set<snapid_t> &snaps) override { | |
83 | ObjectStore::Transaction temp; | |
84 | pg->get_parent()->pgb_set_object_snap_mapping(hoid, snaps, &temp); | |
85 | temp.append(t); | |
86 | temp.swap(t); | |
87 | } | |
88 | void rollback_extents( | |
89 | version_t gen, | |
90 | const vector<pair<uint64_t, uint64_t> > &extents) override { | |
91 | ObjectStore::Transaction temp; | |
92 | pg->rollback_extents(gen, extents, hoid, &temp); | |
93 | temp.append(t); | |
94 | temp.swap(t); | |
95 | } | |
96 | }; | |
97 | ||
98 | assert(entry.mod_desc.can_rollback()); | |
99 | RollbackVisitor vis(entry.soid, this); | |
100 | entry.mod_desc.visit(&vis); | |
101 | t->append(vis.t); | |
102 | } | |
103 | ||
// Visitor used by rollforward()/trim(): once a log entry can no longer
// be rolled back, remove the stashed object generations that were kept
// around only to make rollback possible.
struct Trimmer : public ObjectModDesc::Visitor {
  const hobject_t &soid;
  PGBackend *pg;
  ObjectStore::Transaction *t;
  Trimmer(
    const hobject_t &soid,
    PGBackend *pg,
    ObjectStore::Transaction *t)
    : soid(soid), pg(pg), t(t) {}
  // Drop the object generation stashed at old_version.
  void rmobject(version_t old_version) override {
    pg->trim_rollback_object(
      soid,
      old_version,
      t);
  }
  // try_rmobject defaults to rmobject
  // Drop the clone stashed at generation `gen` for extent rollback
  // (the extent list itself is not needed for trimming).
  void rollback_extents(
    version_t gen,
    const vector<pair<uint64_t, uint64_t> > &extents) override {
    pg->trim_rollback_object(
      soid,
      gen,
      t);
  }
};
129 | ||
130 | void PGBackend::rollforward( | |
131 | const pg_log_entry_t &entry, | |
132 | ObjectStore::Transaction *t) | |
133 | { | |
134 | auto dpp = get_parent()->get_dpp(); | |
135 | ldpp_dout(dpp, 20) << __func__ << ": entry=" << entry << dendl; | |
136 | if (!entry.can_rollback()) | |
137 | return; | |
138 | Trimmer trimmer(entry.soid, this, t); | |
139 | entry.mod_desc.visit(&trimmer); | |
140 | } | |
141 | ||
142 | void PGBackend::trim( | |
143 | const pg_log_entry_t &entry, | |
144 | ObjectStore::Transaction *t) | |
145 | { | |
146 | if (!entry.can_rollback()) | |
147 | return; | |
148 | Trimmer trimmer(entry.soid, this, t); | |
149 | entry.mod_desc.visit(&trimmer); | |
150 | } | |
151 | ||
152 | void PGBackend::try_stash( | |
153 | const hobject_t &hoid, | |
154 | version_t v, | |
155 | ObjectStore::Transaction *t) | |
156 | { | |
157 | t->try_rename( | |
158 | coll, | |
159 | ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), | |
160 | ghobject_t(hoid, v, get_parent()->whoami_shard().shard)); | |
161 | } | |
162 | ||
163 | void PGBackend::remove( | |
164 | const hobject_t &hoid, | |
165 | ObjectStore::Transaction *t) { | |
166 | assert(!hoid.is_temp()); | |
167 | t->remove( | |
168 | coll, | |
169 | ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard)); | |
170 | get_parent()->pgb_clear_object_snap_mapping(hoid, t); | |
171 | } | |
172 | ||
173 | void PGBackend::on_change_cleanup(ObjectStore::Transaction *t) | |
174 | { | |
175 | dout(10) << __func__ << dendl; | |
176 | // clear temp | |
177 | for (set<hobject_t>::iterator i = temp_contents.begin(); | |
178 | i != temp_contents.end(); | |
179 | ++i) { | |
180 | dout(10) << __func__ << ": Removing oid " | |
181 | << *i << " from the temp collection" << dendl; | |
182 | t->remove( | |
183 | coll, | |
184 | ghobject_t(*i, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard)); | |
185 | } | |
186 | temp_contents.clear(); | |
187 | } | |
188 | ||
189 | int PGBackend::objects_list_partial( | |
190 | const hobject_t &begin, | |
191 | int min, | |
192 | int max, | |
193 | vector<hobject_t> *ls, | |
194 | hobject_t *next) | |
195 | { | |
196 | assert(ls); | |
197 | // Starts with the smallest generation to make sure the result list | |
198 | // has the marker object (it might have multiple generations | |
199 | // though, which would be filtered). | |
200 | ghobject_t _next; | |
201 | if (!begin.is_min()) | |
202 | _next = ghobject_t(begin, 0, get_parent()->whoami_shard().shard); | |
203 | ls->reserve(max); | |
204 | int r = 0; | |
205 | ||
206 | if (min > max) | |
207 | min = max; | |
208 | ||
209 | while (!_next.is_max() && ls->size() < (unsigned)min) { | |
210 | vector<ghobject_t> objects; | |
211 | r = store->collection_list( | |
212 | ch, | |
213 | _next, | |
214 | ghobject_t::get_max(), | |
215 | max - ls->size(), | |
216 | &objects, | |
217 | &_next); | |
218 | if (r != 0) { | |
219 | derr << __func__ << " list collection " << ch << " got: " << cpp_strerror(r) << dendl; | |
220 | break; | |
221 | } | |
222 | for (vector<ghobject_t>::iterator i = objects.begin(); | |
223 | i != objects.end(); | |
224 | ++i) { | |
225 | if (i->is_pgmeta() || i->hobj.is_temp()) { | |
226 | continue; | |
227 | } | |
228 | if (i->is_no_gen()) { | |
229 | ls->push_back(i->hobj); | |
230 | } | |
231 | } | |
232 | } | |
233 | if (r == 0) | |
234 | *next = _next.hobj; | |
235 | return r; | |
236 | } | |
237 | ||
238 | int PGBackend::objects_list_range( | |
239 | const hobject_t &start, | |
240 | const hobject_t &end, | |
241 | snapid_t seq, | |
242 | vector<hobject_t> *ls, | |
243 | vector<ghobject_t> *gen_obs) | |
244 | { | |
245 | assert(ls); | |
246 | vector<ghobject_t> objects; | |
247 | int r = store->collection_list( | |
248 | ch, | |
249 | ghobject_t(start, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), | |
250 | ghobject_t(end, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), | |
251 | INT_MAX, | |
252 | &objects, | |
253 | NULL); | |
254 | ls->reserve(objects.size()); | |
255 | for (vector<ghobject_t>::iterator i = objects.begin(); | |
256 | i != objects.end(); | |
257 | ++i) { | |
258 | if (i->is_pgmeta() || i->hobj.is_temp()) { | |
259 | continue; | |
260 | } | |
261 | if (i->is_no_gen()) { | |
262 | ls->push_back(i->hobj); | |
263 | } else if (gen_obs) { | |
264 | gen_obs->push_back(*i); | |
265 | } | |
266 | } | |
267 | return r; | |
268 | } | |
269 | ||
270 | int PGBackend::objects_get_attr( | |
271 | const hobject_t &hoid, | |
272 | const string &attr, | |
273 | bufferlist *out) | |
274 | { | |
275 | bufferptr bp; | |
276 | int r = store->getattr( | |
277 | ch, | |
278 | ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), | |
279 | attr.c_str(), | |
280 | bp); | |
281 | if (r >= 0 && out) { | |
282 | out->clear(); | |
283 | out->push_back(std::move(bp)); | |
284 | } | |
285 | return r; | |
286 | } | |
287 | ||
288 | int PGBackend::objects_get_attrs( | |
289 | const hobject_t &hoid, | |
290 | map<string, bufferlist> *out) | |
291 | { | |
292 | return store->getattrs( | |
293 | ch, | |
294 | ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), | |
295 | *out); | |
296 | } | |
297 | ||
298 | void PGBackend::rollback_setattrs( | |
299 | const hobject_t &hoid, | |
300 | map<string, boost::optional<bufferlist> > &old_attrs, | |
301 | ObjectStore::Transaction *t) { | |
302 | map<string, bufferlist> to_set; | |
303 | assert(!hoid.is_temp()); | |
304 | for (map<string, boost::optional<bufferlist> >::iterator i = old_attrs.begin(); | |
305 | i != old_attrs.end(); | |
306 | ++i) { | |
307 | if (i->second) { | |
308 | to_set[i->first] = i->second.get(); | |
309 | } else { | |
310 | t->rmattr( | |
311 | coll, | |
312 | ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), | |
313 | i->first); | |
314 | } | |
315 | } | |
316 | t->setattrs( | |
317 | coll, | |
318 | ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), | |
319 | to_set); | |
320 | } | |
321 | ||
322 | void PGBackend::rollback_append( | |
323 | const hobject_t &hoid, | |
324 | uint64_t old_size, | |
325 | ObjectStore::Transaction *t) { | |
326 | assert(!hoid.is_temp()); | |
327 | t->truncate( | |
328 | coll, | |
329 | ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), | |
330 | old_size); | |
331 | } | |
332 | ||
333 | void PGBackend::rollback_stash( | |
334 | const hobject_t &hoid, | |
335 | version_t old_version, | |
336 | ObjectStore::Transaction *t) { | |
337 | assert(!hoid.is_temp()); | |
338 | t->remove( | |
339 | coll, | |
340 | ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard)); | |
341 | t->collection_move_rename( | |
342 | coll, | |
343 | ghobject_t(hoid, old_version, get_parent()->whoami_shard().shard), | |
344 | coll, | |
345 | ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard)); | |
346 | } | |
347 | ||
348 | void PGBackend::rollback_try_stash( | |
349 | const hobject_t &hoid, | |
350 | version_t old_version, | |
351 | ObjectStore::Transaction *t) { | |
352 | assert(!hoid.is_temp()); | |
353 | t->remove( | |
354 | coll, | |
355 | ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard)); | |
356 | t->try_rename( | |
357 | coll, | |
358 | ghobject_t(hoid, old_version, get_parent()->whoami_shard().shard), | |
359 | ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard)); | |
360 | } | |
361 | ||
362 | void PGBackend::rollback_extents( | |
363 | version_t gen, | |
364 | const vector<pair<uint64_t, uint64_t> > &extents, | |
365 | const hobject_t &hoid, | |
366 | ObjectStore::Transaction *t) { | |
367 | auto shard = get_parent()->whoami_shard().shard; | |
368 | for (auto &&extent: extents) { | |
369 | t->clone_range( | |
370 | coll, | |
371 | ghobject_t(hoid, gen, shard), | |
372 | ghobject_t(hoid, ghobject_t::NO_GEN, shard), | |
373 | extent.first, | |
374 | extent.second, | |
375 | extent.first); | |
376 | } | |
377 | t->remove( | |
378 | coll, | |
379 | ghobject_t(hoid, gen, shard)); | |
380 | } | |
381 | ||
382 | void PGBackend::trim_rollback_object( | |
383 | const hobject_t &hoid, | |
384 | version_t old_version, | |
385 | ObjectStore::Transaction *t) { | |
386 | assert(!hoid.is_temp()); | |
387 | t->remove( | |
388 | coll, ghobject_t(hoid, old_version, get_parent()->whoami_shard().shard)); | |
389 | } | |
390 | ||
391 | PGBackend *PGBackend::build_pg_backend( | |
392 | const pg_pool_t &pool, | |
393 | const OSDMapRef curmap, | |
394 | Listener *l, | |
395 | coll_t coll, | |
396 | ObjectStore::CollectionHandle &ch, | |
397 | ObjectStore *store, | |
398 | CephContext *cct) | |
399 | { | |
400 | switch (pool.type) { | |
401 | case pg_pool_t::TYPE_REPLICATED: { | |
402 | return new ReplicatedBackend(l, coll, ch, store, cct); | |
403 | } | |
404 | case pg_pool_t::TYPE_ERASURE: { | |
405 | ErasureCodeInterfaceRef ec_impl; | |
406 | ErasureCodeProfile profile = curmap->get_erasure_code_profile(pool.erasure_code_profile); | |
407 | assert(profile.count("plugin")); | |
408 | stringstream ss; | |
409 | ceph::ErasureCodePluginRegistry::instance().factory( | |
410 | profile.find("plugin")->second, | |
411 | cct->_conf->get_val<std::string>("erasure_code_dir"), | |
412 | profile, | |
413 | &ec_impl, | |
414 | &ss); | |
415 | assert(ec_impl); | |
416 | return new ECBackend( | |
417 | l, | |
418 | coll, | |
419 | ch, | |
420 | store, | |
421 | cct, | |
422 | ec_impl, | |
423 | pool.stripe_width); | |
424 | } | |
425 | default: | |
426 | ceph_abort(); | |
427 | return NULL; | |
428 | } | |
429 | } | |
430 | ||
431 | /* | |
432 | * pg lock may or may not be held | |
433 | */ | |
434 | void PGBackend::be_scan_list( | |
435 | ScrubMap &map, const vector<hobject_t> &ls, bool deep, uint32_t seed, | |
436 | ThreadPool::TPHandle &handle) | |
437 | { | |
438 | dout(10) << __func__ << " scanning " << ls.size() << " objects" | |
439 | << (deep ? " deeply" : "") << dendl; | |
440 | int i = 0; | |
441 | for (vector<hobject_t>::const_iterator p = ls.begin(); | |
442 | p != ls.end(); | |
443 | ++p, i++) { | |
444 | handle.reset_tp_timeout(); | |
445 | hobject_t poid = *p; | |
446 | ||
447 | struct stat st; | |
448 | int r = store->stat( | |
449 | ch, | |
450 | ghobject_t( | |
451 | poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), | |
452 | &st, | |
453 | true); | |
454 | if (r == 0) { | |
455 | ScrubMap::object &o = map.objects[poid]; | |
456 | o.size = st.st_size; | |
457 | assert(!o.negative); | |
458 | store->getattrs( | |
459 | ch, | |
460 | ghobject_t( | |
461 | poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), | |
462 | o.attrs); | |
463 | ||
464 | // calculate the CRC32 on deep scrubs | |
465 | if (deep) { | |
466 | be_deep_scrub(*p, seed, o, handle); | |
467 | } | |
468 | ||
469 | dout(25) << __func__ << " " << poid << dendl; | |
470 | } else if (r == -ENOENT) { | |
471 | dout(25) << __func__ << " " << poid << " got " << r | |
472 | << ", skipping" << dendl; | |
473 | } else if (r == -EIO) { | |
474 | dout(25) << __func__ << " " << poid << " got " << r | |
475 | << ", stat_error" << dendl; | |
476 | ScrubMap::object &o = map.objects[poid]; | |
477 | o.stat_error = true; | |
478 | } else { | |
479 | derr << __func__ << " got: " << cpp_strerror(r) << dendl; | |
480 | ceph_abort(); | |
481 | } | |
482 | } | |
483 | } | |
484 | ||
485 | bool PGBackend::be_compare_scrub_objects( | |
486 | pg_shard_t auth_shard, | |
487 | const ScrubMap::object &auth, | |
488 | const object_info_t& auth_oi, | |
489 | const ScrubMap::object &candidate, | |
490 | shard_info_wrapper &shard_result, | |
491 | inconsistent_obj_wrapper &obj_result, | |
492 | ostream &errorstream) | |
493 | { | |
494 | enum { CLEAN, FOUND_ERROR } error = CLEAN; | |
495 | if (candidate.stat_error) { | |
496 | assert(shard_result.has_stat_error()); | |
497 | error = FOUND_ERROR; | |
498 | errorstream << "candidate had a stat error"; | |
499 | } | |
500 | if (candidate.read_error || candidate.ec_hash_mismatch || candidate.ec_size_mismatch) { | |
501 | error = FOUND_ERROR; | |
502 | errorstream << "candidate had a read error"; | |
503 | } | |
504 | if (auth.digest_present && candidate.digest_present) { | |
505 | if (auth.digest != candidate.digest) { | |
506 | if (error != CLEAN) | |
507 | errorstream << ", "; | |
508 | error = FOUND_ERROR; | |
509 | errorstream << "data_digest 0x" << std::hex << candidate.digest | |
510 | << " != data_digest 0x" << auth.digest << std::dec | |
511 | << " from shard " << auth_shard; | |
512 | obj_result.set_data_digest_mismatch(); | |
513 | } | |
514 | } | |
515 | if (auth.omap_digest_present && candidate.omap_digest_present) { | |
516 | if (auth.omap_digest != candidate.omap_digest) { | |
517 | if (error != CLEAN) | |
518 | errorstream << ", "; | |
519 | error = FOUND_ERROR; | |
520 | errorstream << "omap_digest 0x" << std::hex << candidate.omap_digest | |
521 | << " != omap_digest 0x" << auth.omap_digest << std::dec | |
522 | << " from shard " << auth_shard; | |
523 | obj_result.set_omap_digest_mismatch(); | |
524 | } | |
525 | } | |
526 | if (parent->get_pool().is_replicated()) { | |
527 | if (auth_oi.is_data_digest() && candidate.digest_present) { | |
528 | if (auth_oi.data_digest != candidate.digest) { | |
529 | if (error != CLEAN) | |
530 | errorstream << ", "; | |
531 | error = FOUND_ERROR; | |
532 | errorstream << "data_digest 0x" << std::hex << candidate.digest | |
533 | << " != data_digest 0x" << auth_oi.data_digest << std::dec | |
534 | << " from auth oi " << auth_oi; | |
535 | shard_result.set_data_digest_mismatch_oi(); | |
536 | } | |
537 | } | |
538 | if (auth_oi.is_omap_digest() && candidate.omap_digest_present) { | |
539 | if (auth_oi.omap_digest != candidate.omap_digest) { | |
540 | if (error != CLEAN) | |
541 | errorstream << ", "; | |
542 | error = FOUND_ERROR; | |
543 | errorstream << "omap_digest 0x" << std::hex << candidate.omap_digest | |
544 | << " != omap_digest 0x" << auth_oi.omap_digest << std::dec | |
545 | << " from auth oi " << auth_oi; | |
546 | shard_result.set_omap_digest_mismatch_oi(); | |
547 | } | |
548 | } | |
549 | } | |
550 | if (candidate.stat_error) | |
551 | return error == FOUND_ERROR; | |
552 | uint64_t oi_size = be_get_ondisk_size(auth_oi.size); | |
553 | if (oi_size != candidate.size) { | |
554 | if (error != CLEAN) | |
555 | errorstream << ", "; | |
556 | error = FOUND_ERROR; | |
557 | errorstream << "size " << candidate.size | |
558 | << " != size " << oi_size | |
559 | << " from auth oi " << auth_oi; | |
560 | shard_result.set_size_mismatch_oi(); | |
561 | } | |
562 | if (auth.size != candidate.size) { | |
563 | if (error != CLEAN) | |
564 | errorstream << ", "; | |
565 | error = FOUND_ERROR; | |
566 | errorstream << "size " << candidate.size | |
567 | << " != size " << auth.size | |
568 | << " from shard " << auth_shard; | |
569 | obj_result.set_size_mismatch(); | |
570 | } | |
571 | for (map<string,bufferptr>::const_iterator i = auth.attrs.begin(); | |
572 | i != auth.attrs.end(); | |
573 | ++i) { | |
574 | if (!candidate.attrs.count(i->first)) { | |
575 | if (error != CLEAN) | |
576 | errorstream << ", "; | |
577 | error = FOUND_ERROR; | |
578 | errorstream << "attr name mismatch '" << i->first << "'"; | |
579 | obj_result.set_attr_name_mismatch(); | |
580 | } else if (candidate.attrs.find(i->first)->second.cmp(i->second)) { | |
581 | if (error != CLEAN) | |
582 | errorstream << ", "; | |
583 | error = FOUND_ERROR; | |
584 | errorstream << "attr value mismatch '" << i->first << "'"; | |
585 | obj_result.set_attr_value_mismatch(); | |
586 | } | |
587 | } | |
588 | for (map<string,bufferptr>::const_iterator i = candidate.attrs.begin(); | |
589 | i != candidate.attrs.end(); | |
590 | ++i) { | |
591 | if (!auth.attrs.count(i->first)) { | |
592 | if (error != CLEAN) | |
593 | errorstream << ", "; | |
594 | error = FOUND_ERROR; | |
595 | errorstream << "attr name mismatch '" << i->first << "'"; | |
596 | obj_result.set_attr_name_mismatch(); | |
597 | } | |
598 | } | |
599 | return error == FOUND_ERROR; | |
600 | } | |
601 | ||
602 | static int dcount(const object_info_t &oi) | |
603 | { | |
604 | int count = 0; | |
605 | if (oi.is_data_digest()) | |
606 | count++; | |
607 | if (oi.is_omap_digest()) | |
608 | count++; | |
609 | return count; | |
610 | } | |
611 | ||
612 | map<pg_shard_t, ScrubMap *>::const_iterator | |
613 | PGBackend::be_select_auth_object( | |
614 | const hobject_t &obj, | |
615 | const map<pg_shard_t,ScrubMap*> &maps, | |
616 | object_info_t *auth_oi, | |
617 | map<pg_shard_t, shard_info_wrapper> &shard_map, | |
618 | inconsistent_obj_wrapper &object_error) | |
619 | { | |
620 | eversion_t auth_version; | |
621 | bufferlist auth_bl; | |
622 | ||
31f18b77 FG |
623 | // Create list of shards with primary last so it will be auth copy all |
624 | // other things being equal. | |
625 | list<pg_shard_t> shards; | |
7c673cae FG |
626 | for (map<pg_shard_t, ScrubMap *>::const_iterator j = maps.begin(); |
627 | j != maps.end(); | |
628 | ++j) { | |
31f18b77 FG |
629 | if (j->first == get_parent()->whoami_shard()) |
630 | continue; | |
631 | shards.push_back(j->first); | |
632 | } | |
633 | shards.push_back(get_parent()->whoami_shard()); | |
634 | ||
635 | map<pg_shard_t, ScrubMap *>::const_iterator auth = maps.end(); | |
636 | for (auto &l : shards) { | |
637 | map<pg_shard_t, ScrubMap *>::const_iterator j = maps.find(l); | |
7c673cae FG |
638 | map<hobject_t, ScrubMap::object>::iterator i = |
639 | j->second->objects.find(obj); | |
640 | if (i == j->second->objects.end()) { | |
641 | continue; | |
642 | } | |
643 | string error_string; | |
644 | auto& shard_info = shard_map[j->first]; | |
645 | if (i->second.read_error) { | |
646 | shard_info.set_read_error(); | |
647 | error_string += " read_error"; | |
648 | } | |
649 | if (i->second.ec_hash_mismatch) { | |
650 | shard_info.set_ec_hash_mismatch(); | |
651 | error_string += " ec_hash_mismatch"; | |
652 | } | |
653 | if (i->second.ec_size_mismatch) { | |
654 | shard_info.set_ec_size_mismatch(); | |
655 | error_string += " ec_size_mismatch"; | |
656 | } | |
657 | ||
658 | object_info_t oi; | |
659 | bufferlist bl; | |
660 | map<string, bufferptr>::iterator k; | |
31f18b77 FG |
661 | SnapSet ss; |
662 | bufferlist ss_bl; | |
7c673cae FG |
663 | |
664 | if (i->second.stat_error) { | |
665 | shard_info.set_stat_error(); | |
666 | error_string += " stat_error"; | |
667 | // With stat_error no further checking | |
668 | // We don't need to also see a missing_object_info_attr | |
669 | goto out; | |
670 | } | |
671 | ||
672 | k = i->second.attrs.find(OI_ATTR); | |
673 | if (k == i->second.attrs.end()) { | |
674 | // no object info on object, probably corrupt | |
675 | shard_info.set_oi_attr_missing(); | |
676 | error_string += " oi_attr_missing"; | |
677 | goto out; | |
678 | } | |
679 | bl.push_back(k->second); | |
680 | try { | |
681 | bufferlist::iterator bliter = bl.begin(); | |
682 | ::decode(oi, bliter); | |
683 | } catch (...) { | |
684 | // invalid object info, probably corrupt | |
685 | shard_info.set_oi_attr_corrupted(); | |
686 | error_string += " oi_attr_corrupted"; | |
687 | goto out; | |
688 | } | |
689 | ||
31f18b77 FG |
690 | if (oi.soid != obj) { |
691 | shard_info.set_oi_attr_corrupted(); | |
692 | error_string += " oi_attr_corrupted"; | |
693 | goto out; | |
694 | } | |
695 | ||
7c673cae FG |
696 | if (auth_version != eversion_t()) { |
697 | if (!object_error.has_object_info_inconsistency() && !(bl == auth_bl)) { | |
698 | object_error.set_object_info_inconsistency(); | |
699 | error_string += " object_info_inconsistency"; | |
700 | } | |
701 | } | |
702 | ||
703 | // Don't use this particular shard because it won't be able to repair data | |
704 | // XXX: For now we can't pick one shard for repair and another's object info | |
705 | if (i->second.read_error || i->second.ec_hash_mismatch || i->second.ec_size_mismatch) | |
706 | goto out; | |
707 | ||
31f18b77 FG |
708 | // We don't set errors here for snapset, but we won't pick an auth copy if the |
709 | // snapset is missing or won't decode. | |
710 | if (obj.is_head() || obj.is_snapdir()) { | |
711 | k = i->second.attrs.find(SS_ATTR); | |
712 | if (k == i->second.attrs.end()) { | |
713 | goto out; | |
714 | } | |
715 | ss_bl.push_back(k->second); | |
716 | try { | |
717 | bufferlist::iterator bliter = ss_bl.begin(); | |
718 | ::decode(ss, bliter); | |
719 | } catch (...) { | |
720 | // invalid snapset, probably corrupt | |
721 | goto out; | |
722 | } | |
723 | } | |
724 | ||
7c673cae FG |
725 | if (auth_version == eversion_t() || oi.version > auth_version || |
726 | (oi.version == auth_version && dcount(oi) > dcount(*auth_oi))) { | |
727 | auth = j; | |
728 | *auth_oi = oi; | |
729 | auth_version = oi.version; | |
730 | auth_bl.clear(); | |
731 | auth_bl.append(bl); | |
732 | } | |
733 | ||
734 | out: | |
735 | // Check error_string because some errors already generated messages | |
736 | if (error_string != "") { | |
737 | dout(10) << __func__ << ": error(s) osd " << j->first | |
738 | << " for obj " << obj | |
739 | << "," << error_string | |
740 | << dendl; | |
741 | } | |
742 | // Keep scanning other shards | |
743 | } | |
744 | dout(10) << __func__ << ": selecting osd " << auth->first | |
745 | << " for obj " << obj | |
746 | << " with oi " << *auth_oi | |
747 | << dendl; | |
748 | return auth; | |
749 | } | |
750 | ||
// Cross-check all shards' scrub maps: for every object seen on any
// shard, pick an authoritative copy, compare every other copy against
// it, and fill in the missing/inconsistent/authoritative out-params.
// Error counters and the scrub error store are updated as a side
// effect; human-readable messages accumulate in errorstream.
void PGBackend::be_compare_scrubmaps(
  const map<pg_shard_t,ScrubMap*> &maps,
  bool repair,
  map<hobject_t, set<pg_shard_t>> &missing,
  map<hobject_t, set<pg_shard_t>> &inconsistent,
  map<hobject_t, list<pg_shard_t>> &authoritative,
  map<hobject_t, pair<uint32_t,uint32_t>> &missing_digest,
  int &shallow_errors, int &deep_errors,
  Scrub::Store *store,
  const spg_t& pgid,
  const vector<int> &acting,
  ostream &errorstream)
{
  map<hobject_t,ScrubMap::object>::const_iterator i;
  map<pg_shard_t, ScrubMap *>::const_iterator j;
  set<hobject_t> master_set;
  utime_t now = ceph_clock_now();

  // Construct master set: the union of objects seen by any shard.
  for (j = maps.begin(); j != maps.end(); ++j) {
    for (i = j->second->objects.begin(); i != j->second->objects.end(); ++i) {
      master_set.insert(i->first);
    }
  }

  // Check maps against master set and each other
  for (set<hobject_t>::const_iterator k = master_set.begin();
       k != master_set.end();
       ++k) {
    object_info_t auth_oi;
    map<pg_shard_t, shard_info_wrapper> shard_map;

    inconsistent_obj_wrapper object_error{*k};

    map<pg_shard_t, ScrubMap *>::const_iterator auth =
      be_select_auth_object(*k, maps, &auth_oi, shard_map, object_error);

    list<pg_shard_t> auth_list;
    set<pg_shard_t> object_errors;
    // No usable object info anywhere: record the failure and move on.
    if (auth == maps.end()) {
      object_error.set_version(0);
      object_error.set_auth_missing(*k, maps, shard_map, shallow_errors, deep_errors);
      if (object_error.has_deep_errors())
	++deep_errors;
      else if (object_error.has_shallow_errors())
	++shallow_errors;
      store->add_object_error(k->pool, object_error);
      errorstream << pgid.pgid << " soid " << *k
		  << ": failed to pick suitable object info\n";
      continue;
    }
    object_error.set_version(auth_oi.user_version);
    ScrubMap::object& auth_object = auth->second->objects[*k];
    set<pg_shard_t> cur_missing;
    set<pg_shard_t> cur_inconsistent;

    // Compare every shard's copy against the authoritative one.
    for (j = maps.begin(); j != maps.end(); ++j) {
      if (j == auth)
	shard_map[auth->first].selected_oi = true;
      if (j->second->objects.count(*k)) {
	shard_map[j->first].set_object(j->second->objects[*k]);
	// Compare
	stringstream ss;
	bool found = be_compare_scrub_objects(auth->first,
				   auth_object,
				   auth_oi,
				   j->second->objects[*k],
				   shard_map[j->first],
				   object_error,
				   ss);
	// Some errors might have already been set in be_select_auth_object()
	if (shard_map[j->first].errors != 0) {
	  cur_inconsistent.insert(j->first);
	  if (shard_map[j->first].has_deep_errors())
	    ++deep_errors;
	  else
	    ++shallow_errors;
	  // Only true if be_compare_scrub_objects() found errors and put something
	  // in ss.
	  if (found)
	    errorstream << pgid << " shard " << j->first << ": soid " << *k
			<< " " << ss.str() << "\n";
	} else if (found) {
	  // Track possible shard to use as authoritative, if needed
	  // There are errors, without identifying the shard
	  object_errors.insert(j->first);
	} else {
	  // XXX: The auth shard might get here that we don't know
	  // that it has the "correct" data.
	  auth_list.push_back(j->first);
	}
      } else {
	cur_missing.insert(j->first);
	shard_map[j->first].set_missing();
	// Can't have any other errors if there is no information available
	++shallow_errors;
	errorstream << pgid << " shard " << j->first << " missing " << *k
		    << "\n";
      }
      object_error.add_shard(j->first, shard_map[j->first]);
    }

    if (auth_list.empty()) {
      if (object_errors.empty()) {
        errorstream << pgid.pgid << " soid " << *k
		  << ": failed to pick suitable auth object\n";
        goto out;
      }
      // Object errors exist and nothing in auth_list
      // Prefer the auth shard otherwise take first from list.
      pg_shard_t shard;
      if (object_errors.count(auth->first)) {
	shard = auth->first;
      } else {
	shard = *(object_errors.begin());
      }
      auth_list.push_back(shard);
      object_errors.erase(shard);
    }
    // At this point auth_list is populated, so we add the object errors shards
    // as inconsistent.
    cur_inconsistent.insert(object_errors.begin(), object_errors.end());
    if (!cur_missing.empty()) {
      missing[*k] = cur_missing;
    }
    if (!cur_inconsistent.empty()) {
      inconsistent[*k] = cur_inconsistent;
    }
    if (!cur_inconsistent.empty() || !cur_missing.empty()) {
      authoritative[*k] = auth_list;
    } else if (parent->get_pool().is_replicated()) {
      // Object is consistent everywhere; decide whether the recorded
      // digests in the object info should be (re)written.
      enum {
	NO = 0,
	MAYBE = 1,
	FORCE = 2,
      } update = NO;

      if (auth_object.digest_present && auth_object.omap_digest_present &&
	  (!auth_oi.is_data_digest() || !auth_oi.is_omap_digest())) {
	dout(20) << __func__ << " missing digest on " << *k << dendl;
	update = MAYBE;
      }
      // Debug knob: occasionally rewrite digests at random.
      if (auth_object.digest_present && auth_object.omap_digest_present &&
	  cct->_conf->osd_debug_scrub_chance_rewrite_digest &&
	  (((unsigned)rand() % 100) >
	   cct->_conf->osd_debug_scrub_chance_rewrite_digest)) {
	dout(20) << __func__ << " randomly updating digest on " << *k << dendl;
	update = MAYBE;
      }

      // recorded digest != actual digest?
      if (auth_oi.is_data_digest() && auth_object.digest_present &&
	  auth_oi.data_digest != auth_object.digest) {
	assert(shard_map[auth->first].has_data_digest_mismatch_oi());
	errorstream << pgid << " recorded data digest 0x"
		    << std::hex << auth_oi.data_digest << " != on disk 0x"
		    << auth_object.digest << std::dec << " on " << auth_oi.soid
		    << "\n";
	if (repair)
	  update = FORCE;
      }
      if (auth_oi.is_omap_digest() && auth_object.omap_digest_present &&
	  auth_oi.omap_digest != auth_object.omap_digest) {
	assert(shard_map[auth->first].has_omap_digest_mismatch_oi());
	errorstream << pgid << " recorded omap digest 0x"
		    << std::hex << auth_oi.omap_digest << " != on disk 0x"
		    << auth_object.omap_digest << std::dec
		    << " on " << auth_oi.soid << "\n";
	if (repair)
	  update = FORCE;
      }

      if (update != NO) {
	// Only age-gated updates for MAYBE; FORCE (repair) always wins.
	utime_t age = now - auth_oi.local_mtime;
	if (update == FORCE ||
	    age > cct->_conf->osd_deep_scrub_update_digest_min_age) {
	  dout(20) << __func__ << " will update digest on " << *k << dendl;
	  missing_digest[*k] = make_pair(auth_object.digest,
					 auth_object.omap_digest);
	} else {
	  dout(20) << __func__ << " missing digest but age " << age
		   << " < " << cct->_conf->osd_deep_scrub_update_digest_min_age
		   << " on " << *k << dendl;
	}
      }
    }
out:
    // Tally this object's accumulated errors and persist them.
    if (object_error.has_deep_errors())
      ++deep_errors;
    else if (object_error.has_shallow_errors())
      ++shallow_errors;
    if (object_error.errors || object_error.union_shards.errors) {
      store->add_object_error(k->pool, object_error);
    }
  }
}
946 | } |