]> git.proxmox.com Git - ceph.git/blame - ceph/src/osd/PGBackend.cc
bump version to 12.1.1-pve1 while rebasing patches
[ceph.git] / ceph / src / osd / PGBackend.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2013,2014 Inktank Storage, Inc.
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
18
19#include "common/errno.h"
20#include "common/scrub_types.h"
21#include "ReplicatedBackend.h"
22#include "ScrubStore.h"
23#include "ECBackend.h"
24#include "PGBackend.h"
25#include "OSD.h"
26#include "erasure-code/ErasureCodePlugin.h"
27#include "OSDMap.h"
28#include "PGLog.h"
29#include "common/LogClient.h"
30
31#define dout_context cct
32#define dout_subsys ceph_subsys_osd
33#define DOUT_PREFIX_ARGS this
34#undef dout_prefix
35#define dout_prefix _prefix(_dout, this)
36static ostream& _prefix(std::ostream *_dout, PGBackend *pgb) {
37 return *_dout << pgb->get_parent()->gen_dbg_prefix();
38}
39
40void PGBackend::rollback(
41 const pg_log_entry_t &entry,
42 ObjectStore::Transaction *t)
43{
44
45 struct RollbackVisitor : public ObjectModDesc::Visitor {
46 const hobject_t &hoid;
47 PGBackend *pg;
48 ObjectStore::Transaction t;
49 RollbackVisitor(
50 const hobject_t &hoid,
51 PGBackend *pg) : hoid(hoid), pg(pg) {}
52 void append(uint64_t old_size) override {
53 ObjectStore::Transaction temp;
54 pg->rollback_append(hoid, old_size, &temp);
55 temp.append(t);
56 temp.swap(t);
57 }
58 void setattrs(map<string, boost::optional<bufferlist> > &attrs) override {
59 ObjectStore::Transaction temp;
60 pg->rollback_setattrs(hoid, attrs, &temp);
61 temp.append(t);
62 temp.swap(t);
63 }
64 void rmobject(version_t old_version) override {
65 ObjectStore::Transaction temp;
66 pg->rollback_stash(hoid, old_version, &temp);
67 temp.append(t);
68 temp.swap(t);
69 }
70 void try_rmobject(version_t old_version) override {
71 ObjectStore::Transaction temp;
72 pg->rollback_try_stash(hoid, old_version, &temp);
73 temp.append(t);
74 temp.swap(t);
75 }
76 void create() override {
77 ObjectStore::Transaction temp;
78 pg->rollback_create(hoid, &temp);
79 temp.append(t);
80 temp.swap(t);
81 }
82 void update_snaps(const set<snapid_t> &snaps) override {
83 ObjectStore::Transaction temp;
84 pg->get_parent()->pgb_set_object_snap_mapping(hoid, snaps, &temp);
85 temp.append(t);
86 temp.swap(t);
87 }
88 void rollback_extents(
89 version_t gen,
90 const vector<pair<uint64_t, uint64_t> > &extents) override {
91 ObjectStore::Transaction temp;
92 pg->rollback_extents(gen, extents, hoid, &temp);
93 temp.append(t);
94 temp.swap(t);
95 }
96 };
97
98 assert(entry.mod_desc.can_rollback());
99 RollbackVisitor vis(entry.soid, this);
100 entry.mod_desc.visit(&vis);
101 t->append(vis.t);
102}
103
104struct Trimmer : public ObjectModDesc::Visitor {
105 const hobject_t &soid;
106 PGBackend *pg;
107 ObjectStore::Transaction *t;
108 Trimmer(
109 const hobject_t &soid,
110 PGBackend *pg,
111 ObjectStore::Transaction *t)
112 : soid(soid), pg(pg), t(t) {}
113 void rmobject(version_t old_version) override {
114 pg->trim_rollback_object(
115 soid,
116 old_version,
117 t);
118 }
119 // try_rmobject defaults to rmobject
120 void rollback_extents(
121 version_t gen,
122 const vector<pair<uint64_t, uint64_t> > &extents) override {
123 pg->trim_rollback_object(
124 soid,
125 gen,
126 t);
127 }
128};
129
130void PGBackend::rollforward(
131 const pg_log_entry_t &entry,
132 ObjectStore::Transaction *t)
133{
134 auto dpp = get_parent()->get_dpp();
135 ldpp_dout(dpp, 20) << __func__ << ": entry=" << entry << dendl;
136 if (!entry.can_rollback())
137 return;
138 Trimmer trimmer(entry.soid, this, t);
139 entry.mod_desc.visit(&trimmer);
140}
141
142void PGBackend::trim(
143 const pg_log_entry_t &entry,
144 ObjectStore::Transaction *t)
145{
146 if (!entry.can_rollback())
147 return;
148 Trimmer trimmer(entry.soid, this, t);
149 entry.mod_desc.visit(&trimmer);
150}
151
152void PGBackend::try_stash(
153 const hobject_t &hoid,
154 version_t v,
155 ObjectStore::Transaction *t)
156{
157 t->try_rename(
158 coll,
159 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
160 ghobject_t(hoid, v, get_parent()->whoami_shard().shard));
161}
162
163void PGBackend::remove(
164 const hobject_t &hoid,
165 ObjectStore::Transaction *t) {
166 assert(!hoid.is_temp());
167 t->remove(
168 coll,
169 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
170 get_parent()->pgb_clear_object_snap_mapping(hoid, t);
171}
172
173void PGBackend::on_change_cleanup(ObjectStore::Transaction *t)
174{
175 dout(10) << __func__ << dendl;
176 // clear temp
177 for (set<hobject_t>::iterator i = temp_contents.begin();
178 i != temp_contents.end();
179 ++i) {
180 dout(10) << __func__ << ": Removing oid "
181 << *i << " from the temp collection" << dendl;
182 t->remove(
183 coll,
184 ghobject_t(*i, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
185 }
186 temp_contents.clear();
187}
188
189int PGBackend::objects_list_partial(
190 const hobject_t &begin,
191 int min,
192 int max,
193 vector<hobject_t> *ls,
194 hobject_t *next)
195{
196 assert(ls);
197 // Starts with the smallest generation to make sure the result list
198 // has the marker object (it might have multiple generations
199 // though, which would be filtered).
200 ghobject_t _next;
201 if (!begin.is_min())
202 _next = ghobject_t(begin, 0, get_parent()->whoami_shard().shard);
203 ls->reserve(max);
204 int r = 0;
205
206 if (min > max)
207 min = max;
208
209 while (!_next.is_max() && ls->size() < (unsigned)min) {
210 vector<ghobject_t> objects;
211 r = store->collection_list(
212 ch,
213 _next,
214 ghobject_t::get_max(),
215 max - ls->size(),
216 &objects,
217 &_next);
218 if (r != 0) {
219 derr << __func__ << " list collection " << ch << " got: " << cpp_strerror(r) << dendl;
220 break;
221 }
222 for (vector<ghobject_t>::iterator i = objects.begin();
223 i != objects.end();
224 ++i) {
225 if (i->is_pgmeta() || i->hobj.is_temp()) {
226 continue;
227 }
228 if (i->is_no_gen()) {
229 ls->push_back(i->hobj);
230 }
231 }
232 }
233 if (r == 0)
234 *next = _next.hobj;
235 return r;
236}
237
238int PGBackend::objects_list_range(
239 const hobject_t &start,
240 const hobject_t &end,
241 snapid_t seq,
242 vector<hobject_t> *ls,
243 vector<ghobject_t> *gen_obs)
244{
245 assert(ls);
246 vector<ghobject_t> objects;
247 int r = store->collection_list(
248 ch,
249 ghobject_t(start, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
250 ghobject_t(end, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
251 INT_MAX,
252 &objects,
253 NULL);
254 ls->reserve(objects.size());
255 for (vector<ghobject_t>::iterator i = objects.begin();
256 i != objects.end();
257 ++i) {
258 if (i->is_pgmeta() || i->hobj.is_temp()) {
259 continue;
260 }
261 if (i->is_no_gen()) {
262 ls->push_back(i->hobj);
263 } else if (gen_obs) {
264 gen_obs->push_back(*i);
265 }
266 }
267 return r;
268}
269
270int PGBackend::objects_get_attr(
271 const hobject_t &hoid,
272 const string &attr,
273 bufferlist *out)
274{
275 bufferptr bp;
276 int r = store->getattr(
277 ch,
278 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
279 attr.c_str(),
280 bp);
281 if (r >= 0 && out) {
282 out->clear();
283 out->push_back(std::move(bp));
284 }
285 return r;
286}
287
288int PGBackend::objects_get_attrs(
289 const hobject_t &hoid,
290 map<string, bufferlist> *out)
291{
292 return store->getattrs(
293 ch,
294 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
295 *out);
296}
297
298void PGBackend::rollback_setattrs(
299 const hobject_t &hoid,
300 map<string, boost::optional<bufferlist> > &old_attrs,
301 ObjectStore::Transaction *t) {
302 map<string, bufferlist> to_set;
303 assert(!hoid.is_temp());
304 for (map<string, boost::optional<bufferlist> >::iterator i = old_attrs.begin();
305 i != old_attrs.end();
306 ++i) {
307 if (i->second) {
308 to_set[i->first] = i->second.get();
309 } else {
310 t->rmattr(
311 coll,
312 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
313 i->first);
314 }
315 }
316 t->setattrs(
317 coll,
318 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
319 to_set);
320}
321
322void PGBackend::rollback_append(
323 const hobject_t &hoid,
324 uint64_t old_size,
325 ObjectStore::Transaction *t) {
326 assert(!hoid.is_temp());
327 t->truncate(
328 coll,
329 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
330 old_size);
331}
332
333void PGBackend::rollback_stash(
334 const hobject_t &hoid,
335 version_t old_version,
336 ObjectStore::Transaction *t) {
337 assert(!hoid.is_temp());
338 t->remove(
339 coll,
340 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
341 t->collection_move_rename(
342 coll,
343 ghobject_t(hoid, old_version, get_parent()->whoami_shard().shard),
344 coll,
345 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
346}
347
348void PGBackend::rollback_try_stash(
349 const hobject_t &hoid,
350 version_t old_version,
351 ObjectStore::Transaction *t) {
352 assert(!hoid.is_temp());
353 t->remove(
354 coll,
355 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
356 t->try_rename(
357 coll,
358 ghobject_t(hoid, old_version, get_parent()->whoami_shard().shard),
359 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
360}
361
362void PGBackend::rollback_extents(
363 version_t gen,
364 const vector<pair<uint64_t, uint64_t> > &extents,
365 const hobject_t &hoid,
366 ObjectStore::Transaction *t) {
367 auto shard = get_parent()->whoami_shard().shard;
368 for (auto &&extent: extents) {
369 t->clone_range(
370 coll,
371 ghobject_t(hoid, gen, shard),
372 ghobject_t(hoid, ghobject_t::NO_GEN, shard),
373 extent.first,
374 extent.second,
375 extent.first);
376 }
377 t->remove(
378 coll,
379 ghobject_t(hoid, gen, shard));
380}
381
382void PGBackend::trim_rollback_object(
383 const hobject_t &hoid,
384 version_t old_version,
385 ObjectStore::Transaction *t) {
386 assert(!hoid.is_temp());
387 t->remove(
388 coll, ghobject_t(hoid, old_version, get_parent()->whoami_shard().shard));
389}
390
391PGBackend *PGBackend::build_pg_backend(
392 const pg_pool_t &pool,
393 const OSDMapRef curmap,
394 Listener *l,
395 coll_t coll,
396 ObjectStore::CollectionHandle &ch,
397 ObjectStore *store,
398 CephContext *cct)
399{
400 switch (pool.type) {
401 case pg_pool_t::TYPE_REPLICATED: {
402 return new ReplicatedBackend(l, coll, ch, store, cct);
403 }
404 case pg_pool_t::TYPE_ERASURE: {
405 ErasureCodeInterfaceRef ec_impl;
406 ErasureCodeProfile profile = curmap->get_erasure_code_profile(pool.erasure_code_profile);
407 assert(profile.count("plugin"));
408 stringstream ss;
409 ceph::ErasureCodePluginRegistry::instance().factory(
410 profile.find("plugin")->second,
411 cct->_conf->get_val<std::string>("erasure_code_dir"),
412 profile,
413 &ec_impl,
414 &ss);
415 assert(ec_impl);
416 return new ECBackend(
417 l,
418 coll,
419 ch,
420 store,
421 cct,
422 ec_impl,
423 pool.stripe_width);
424 }
425 default:
426 ceph_abort();
427 return NULL;
428 }
429}
430
431/*
432 * pg lock may or may not be held
433 */
434void PGBackend::be_scan_list(
435 ScrubMap &map, const vector<hobject_t> &ls, bool deep, uint32_t seed,
436 ThreadPool::TPHandle &handle)
437{
438 dout(10) << __func__ << " scanning " << ls.size() << " objects"
439 << (deep ? " deeply" : "") << dendl;
440 int i = 0;
441 for (vector<hobject_t>::const_iterator p = ls.begin();
442 p != ls.end();
443 ++p, i++) {
444 handle.reset_tp_timeout();
445 hobject_t poid = *p;
446
447 struct stat st;
448 int r = store->stat(
449 ch,
450 ghobject_t(
451 poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
452 &st,
453 true);
454 if (r == 0) {
455 ScrubMap::object &o = map.objects[poid];
456 o.size = st.st_size;
457 assert(!o.negative);
458 store->getattrs(
459 ch,
460 ghobject_t(
461 poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
462 o.attrs);
463
464 // calculate the CRC32 on deep scrubs
465 if (deep) {
466 be_deep_scrub(*p, seed, o, handle);
467 }
468
469 dout(25) << __func__ << " " << poid << dendl;
470 } else if (r == -ENOENT) {
471 dout(25) << __func__ << " " << poid << " got " << r
472 << ", skipping" << dendl;
473 } else if (r == -EIO) {
474 dout(25) << __func__ << " " << poid << " got " << r
475 << ", stat_error" << dendl;
476 ScrubMap::object &o = map.objects[poid];
477 o.stat_error = true;
478 } else {
479 derr << __func__ << " got: " << cpp_strerror(r) << dendl;
480 ceph_abort();
481 }
482 }
483}
484
485bool PGBackend::be_compare_scrub_objects(
486 pg_shard_t auth_shard,
487 const ScrubMap::object &auth,
488 const object_info_t& auth_oi,
489 const ScrubMap::object &candidate,
490 shard_info_wrapper &shard_result,
491 inconsistent_obj_wrapper &obj_result,
492 ostream &errorstream)
493{
494 enum { CLEAN, FOUND_ERROR } error = CLEAN;
495 if (candidate.stat_error) {
496 assert(shard_result.has_stat_error());
497 error = FOUND_ERROR;
498 errorstream << "candidate had a stat error";
499 }
500 if (candidate.read_error || candidate.ec_hash_mismatch || candidate.ec_size_mismatch) {
501 error = FOUND_ERROR;
502 errorstream << "candidate had a read error";
503 }
504 if (auth.digest_present && candidate.digest_present) {
505 if (auth.digest != candidate.digest) {
506 if (error != CLEAN)
507 errorstream << ", ";
508 error = FOUND_ERROR;
509 errorstream << "data_digest 0x" << std::hex << candidate.digest
510 << " != data_digest 0x" << auth.digest << std::dec
511 << " from shard " << auth_shard;
512 obj_result.set_data_digest_mismatch();
513 }
514 }
515 if (auth.omap_digest_present && candidate.omap_digest_present) {
516 if (auth.omap_digest != candidate.omap_digest) {
517 if (error != CLEAN)
518 errorstream << ", ";
519 error = FOUND_ERROR;
520 errorstream << "omap_digest 0x" << std::hex << candidate.omap_digest
521 << " != omap_digest 0x" << auth.omap_digest << std::dec
522 << " from shard " << auth_shard;
523 obj_result.set_omap_digest_mismatch();
524 }
525 }
526 if (parent->get_pool().is_replicated()) {
527 if (auth_oi.is_data_digest() && candidate.digest_present) {
528 if (auth_oi.data_digest != candidate.digest) {
529 if (error != CLEAN)
530 errorstream << ", ";
531 error = FOUND_ERROR;
532 errorstream << "data_digest 0x" << std::hex << candidate.digest
533 << " != data_digest 0x" << auth_oi.data_digest << std::dec
534 << " from auth oi " << auth_oi;
535 shard_result.set_data_digest_mismatch_oi();
536 }
537 }
538 if (auth_oi.is_omap_digest() && candidate.omap_digest_present) {
539 if (auth_oi.omap_digest != candidate.omap_digest) {
540 if (error != CLEAN)
541 errorstream << ", ";
542 error = FOUND_ERROR;
543 errorstream << "omap_digest 0x" << std::hex << candidate.omap_digest
544 << " != omap_digest 0x" << auth_oi.omap_digest << std::dec
545 << " from auth oi " << auth_oi;
546 shard_result.set_omap_digest_mismatch_oi();
547 }
548 }
549 }
550 if (candidate.stat_error)
551 return error == FOUND_ERROR;
552 uint64_t oi_size = be_get_ondisk_size(auth_oi.size);
553 if (oi_size != candidate.size) {
554 if (error != CLEAN)
555 errorstream << ", ";
556 error = FOUND_ERROR;
557 errorstream << "size " << candidate.size
558 << " != size " << oi_size
559 << " from auth oi " << auth_oi;
560 shard_result.set_size_mismatch_oi();
561 }
562 if (auth.size != candidate.size) {
563 if (error != CLEAN)
564 errorstream << ", ";
565 error = FOUND_ERROR;
566 errorstream << "size " << candidate.size
567 << " != size " << auth.size
568 << " from shard " << auth_shard;
569 obj_result.set_size_mismatch();
570 }
571 for (map<string,bufferptr>::const_iterator i = auth.attrs.begin();
572 i != auth.attrs.end();
573 ++i) {
574 if (!candidate.attrs.count(i->first)) {
575 if (error != CLEAN)
576 errorstream << ", ";
577 error = FOUND_ERROR;
578 errorstream << "attr name mismatch '" << i->first << "'";
579 obj_result.set_attr_name_mismatch();
580 } else if (candidate.attrs.find(i->first)->second.cmp(i->second)) {
581 if (error != CLEAN)
582 errorstream << ", ";
583 error = FOUND_ERROR;
584 errorstream << "attr value mismatch '" << i->first << "'";
585 obj_result.set_attr_value_mismatch();
586 }
587 }
588 for (map<string,bufferptr>::const_iterator i = candidate.attrs.begin();
589 i != candidate.attrs.end();
590 ++i) {
591 if (!auth.attrs.count(i->first)) {
592 if (error != CLEAN)
593 errorstream << ", ";
594 error = FOUND_ERROR;
595 errorstream << "attr name mismatch '" << i->first << "'";
596 obj_result.set_attr_name_mismatch();
597 }
598 }
599 return error == FOUND_ERROR;
600}
601
602static int dcount(const object_info_t &oi)
603{
604 int count = 0;
605 if (oi.is_data_digest())
606 count++;
607 if (oi.is_omap_digest())
608 count++;
609 return count;
610}
611
612map<pg_shard_t, ScrubMap *>::const_iterator
613 PGBackend::be_select_auth_object(
614 const hobject_t &obj,
615 const map<pg_shard_t,ScrubMap*> &maps,
616 object_info_t *auth_oi,
617 map<pg_shard_t, shard_info_wrapper> &shard_map,
618 inconsistent_obj_wrapper &object_error)
619{
620 eversion_t auth_version;
621 bufferlist auth_bl;
622
31f18b77
FG
623 // Create list of shards with primary last so it will be auth copy all
624 // other things being equal.
625 list<pg_shard_t> shards;
7c673cae
FG
626 for (map<pg_shard_t, ScrubMap *>::const_iterator j = maps.begin();
627 j != maps.end();
628 ++j) {
31f18b77
FG
629 if (j->first == get_parent()->whoami_shard())
630 continue;
631 shards.push_back(j->first);
632 }
633 shards.push_back(get_parent()->whoami_shard());
634
635 map<pg_shard_t, ScrubMap *>::const_iterator auth = maps.end();
636 for (auto &l : shards) {
637 map<pg_shard_t, ScrubMap *>::const_iterator j = maps.find(l);
7c673cae
FG
638 map<hobject_t, ScrubMap::object>::iterator i =
639 j->second->objects.find(obj);
640 if (i == j->second->objects.end()) {
641 continue;
642 }
643 string error_string;
644 auto& shard_info = shard_map[j->first];
645 if (i->second.read_error) {
646 shard_info.set_read_error();
647 error_string += " read_error";
648 }
649 if (i->second.ec_hash_mismatch) {
650 shard_info.set_ec_hash_mismatch();
651 error_string += " ec_hash_mismatch";
652 }
653 if (i->second.ec_size_mismatch) {
654 shard_info.set_ec_size_mismatch();
655 error_string += " ec_size_mismatch";
656 }
657
658 object_info_t oi;
659 bufferlist bl;
660 map<string, bufferptr>::iterator k;
31f18b77
FG
661 SnapSet ss;
662 bufferlist ss_bl;
7c673cae
FG
663
664 if (i->second.stat_error) {
665 shard_info.set_stat_error();
666 error_string += " stat_error";
667 // With stat_error no further checking
668 // We don't need to also see a missing_object_info_attr
669 goto out;
670 }
671
672 k = i->second.attrs.find(OI_ATTR);
673 if (k == i->second.attrs.end()) {
674 // no object info on object, probably corrupt
675 shard_info.set_oi_attr_missing();
676 error_string += " oi_attr_missing";
677 goto out;
678 }
679 bl.push_back(k->second);
680 try {
681 bufferlist::iterator bliter = bl.begin();
682 ::decode(oi, bliter);
683 } catch (...) {
684 // invalid object info, probably corrupt
685 shard_info.set_oi_attr_corrupted();
686 error_string += " oi_attr_corrupted";
687 goto out;
688 }
689
31f18b77
FG
690 if (oi.soid != obj) {
691 shard_info.set_oi_attr_corrupted();
692 error_string += " oi_attr_corrupted";
693 goto out;
694 }
695
7c673cae
FG
696 if (auth_version != eversion_t()) {
697 if (!object_error.has_object_info_inconsistency() && !(bl == auth_bl)) {
698 object_error.set_object_info_inconsistency();
699 error_string += " object_info_inconsistency";
700 }
701 }
702
703 // Don't use this particular shard because it won't be able to repair data
704 // XXX: For now we can't pick one shard for repair and another's object info
705 if (i->second.read_error || i->second.ec_hash_mismatch || i->second.ec_size_mismatch)
706 goto out;
707
31f18b77
FG
708 // We don't set errors here for snapset, but we won't pick an auth copy if the
709 // snapset is missing or won't decode.
710 if (obj.is_head() || obj.is_snapdir()) {
711 k = i->second.attrs.find(SS_ATTR);
712 if (k == i->second.attrs.end()) {
713 goto out;
714 }
715 ss_bl.push_back(k->second);
716 try {
717 bufferlist::iterator bliter = ss_bl.begin();
718 ::decode(ss, bliter);
719 } catch (...) {
720 // invalid snapset, probably corrupt
721 goto out;
722 }
723 }
724
7c673cae
FG
725 if (auth_version == eversion_t() || oi.version > auth_version ||
726 (oi.version == auth_version && dcount(oi) > dcount(*auth_oi))) {
727 auth = j;
728 *auth_oi = oi;
729 auth_version = oi.version;
730 auth_bl.clear();
731 auth_bl.append(bl);
732 }
733
734out:
735 // Check error_string because some errors already generated messages
736 if (error_string != "") {
737 dout(10) << __func__ << ": error(s) osd " << j->first
738 << " for obj " << obj
739 << "," << error_string
740 << dendl;
741 }
742 // Keep scanning other shards
743 }
744 dout(10) << __func__ << ": selecting osd " << auth->first
745 << " for obj " << obj
746 << " with oi " << *auth_oi
747 << dendl;
748 return auth;
749}
750
751void PGBackend::be_compare_scrubmaps(
752 const map<pg_shard_t,ScrubMap*> &maps,
753 bool repair,
754 map<hobject_t, set<pg_shard_t>> &missing,
755 map<hobject_t, set<pg_shard_t>> &inconsistent,
756 map<hobject_t, list<pg_shard_t>> &authoritative,
757 map<hobject_t, pair<uint32_t,uint32_t>> &missing_digest,
758 int &shallow_errors, int &deep_errors,
759 Scrub::Store *store,
760 const spg_t& pgid,
761 const vector<int> &acting,
762 ostream &errorstream)
763{
764 map<hobject_t,ScrubMap::object>::const_iterator i;
765 map<pg_shard_t, ScrubMap *>::const_iterator j;
766 set<hobject_t> master_set;
767 utime_t now = ceph_clock_now();
768
769 // Construct master set
770 for (j = maps.begin(); j != maps.end(); ++j) {
771 for (i = j->second->objects.begin(); i != j->second->objects.end(); ++i) {
772 master_set.insert(i->first);
773 }
774 }
775
776 // Check maps against master set and each other
777 for (set<hobject_t>::const_iterator k = master_set.begin();
778 k != master_set.end();
779 ++k) {
780 object_info_t auth_oi;
781 map<pg_shard_t, shard_info_wrapper> shard_map;
782
783 inconsistent_obj_wrapper object_error{*k};
784
785 map<pg_shard_t, ScrubMap *>::const_iterator auth =
786 be_select_auth_object(*k, maps, &auth_oi, shard_map, object_error);
787
788 list<pg_shard_t> auth_list;
31f18b77 789 set<pg_shard_t> object_errors;
7c673cae
FG
790 if (auth == maps.end()) {
791 object_error.set_version(0);
792 object_error.set_auth_missing(*k, maps, shard_map, shallow_errors, deep_errors);
793 if (object_error.has_deep_errors())
794 ++deep_errors;
795 else if (object_error.has_shallow_errors())
796 ++shallow_errors;
797 store->add_object_error(k->pool, object_error);
798 errorstream << pgid.pgid << " soid " << *k
799 << ": failed to pick suitable object info\n";
800 continue;
801 }
802 object_error.set_version(auth_oi.user_version);
803 ScrubMap::object& auth_object = auth->second->objects[*k];
804 set<pg_shard_t> cur_missing;
805 set<pg_shard_t> cur_inconsistent;
806
807 for (j = maps.begin(); j != maps.end(); ++j) {
808 if (j == auth)
809 shard_map[auth->first].selected_oi = true;
810 if (j->second->objects.count(*k)) {
811 shard_map[j->first].set_object(j->second->objects[*k]);
812 // Compare
813 stringstream ss;
814 bool found = be_compare_scrub_objects(auth->first,
815 auth_object,
816 auth_oi,
817 j->second->objects[*k],
818 shard_map[j->first],
819 object_error,
820 ss);
821 // Some errors might have already been set in be_select_auth_object()
822 if (shard_map[j->first].errors != 0) {
823 cur_inconsistent.insert(j->first);
824 if (shard_map[j->first].has_deep_errors())
825 ++deep_errors;
826 else
827 ++shallow_errors;
828 // Only true if be_compare_scrub_objects() found errors and put something
829 // in ss.
830 if (found)
831 errorstream << pgid << " shard " << j->first << ": soid " << *k
832 << " " << ss.str() << "\n";
31f18b77
FG
833 } else if (found) {
834 // Track possible shard to use as authoritative, if needed
835 // There are errors, without identifying the shard
836 object_errors.insert(j->first);
7c673cae
FG
837 } else {
838 // XXX: The auth shard might get here that we don't know
839 // that it has the "correct" data.
840 auth_list.push_back(j->first);
841 }
842 } else {
843 cur_missing.insert(j->first);
844 shard_map[j->first].set_missing();
845 // Can't have any other errors if there is no information available
846 ++shallow_errors;
847 errorstream << pgid << " shard " << j->first << " missing " << *k
848 << "\n";
849 }
850 object_error.add_shard(j->first, shard_map[j->first]);
851 }
852
853 if (auth_list.empty()) {
31f18b77
FG
854 if (object_errors.empty()) {
855 errorstream << pgid.pgid << " soid " << *k
7c673cae 856 << ": failed to pick suitable auth object\n";
31f18b77
FG
857 goto out;
858 }
859 // Object errors exist and nothing in auth_list
860 // Prefer the auth shard otherwise take first from list.
861 pg_shard_t shard;
862 if (object_errors.count(auth->first)) {
863 shard = auth->first;
864 } else {
865 shard = *(object_errors.begin());
866 }
867 auth_list.push_back(shard);
868 object_errors.erase(shard);
7c673cae 869 }
31f18b77
FG
870 // At this point auth_list is populated, so we add the object errors shards
871 // as inconsistent.
872 cur_inconsistent.insert(object_errors.begin(), object_errors.end());
7c673cae
FG
873 if (!cur_missing.empty()) {
874 missing[*k] = cur_missing;
875 }
876 if (!cur_inconsistent.empty()) {
877 inconsistent[*k] = cur_inconsistent;
878 }
879 if (!cur_inconsistent.empty() || !cur_missing.empty()) {
880 authoritative[*k] = auth_list;
881 } else if (parent->get_pool().is_replicated()) {
882 enum {
883 NO = 0,
884 MAYBE = 1,
885 FORCE = 2,
886 } update = NO;
887
888 if (auth_object.digest_present && auth_object.omap_digest_present &&
889 (!auth_oi.is_data_digest() || !auth_oi.is_omap_digest())) {
890 dout(20) << __func__ << " missing digest on " << *k << dendl;
891 update = MAYBE;
892 }
893 if (auth_object.digest_present && auth_object.omap_digest_present &&
894 cct->_conf->osd_debug_scrub_chance_rewrite_digest &&
895 (((unsigned)rand() % 100) >
896 cct->_conf->osd_debug_scrub_chance_rewrite_digest)) {
897 dout(20) << __func__ << " randomly updating digest on " << *k << dendl;
898 update = MAYBE;
899 }
900
901 // recorded digest != actual digest?
902 if (auth_oi.is_data_digest() && auth_object.digest_present &&
903 auth_oi.data_digest != auth_object.digest) {
904 assert(shard_map[auth->first].has_data_digest_mismatch_oi());
905 errorstream << pgid << " recorded data digest 0x"
906 << std::hex << auth_oi.data_digest << " != on disk 0x"
907 << auth_object.digest << std::dec << " on " << auth_oi.soid
908 << "\n";
909 if (repair)
910 update = FORCE;
911 }
912 if (auth_oi.is_omap_digest() && auth_object.omap_digest_present &&
913 auth_oi.omap_digest != auth_object.omap_digest) {
914 assert(shard_map[auth->first].has_omap_digest_mismatch_oi());
915 errorstream << pgid << " recorded omap digest 0x"
916 << std::hex << auth_oi.omap_digest << " != on disk 0x"
917 << auth_object.omap_digest << std::dec
918 << " on " << auth_oi.soid << "\n";
919 if (repair)
920 update = FORCE;
921 }
922
923 if (update != NO) {
924 utime_t age = now - auth_oi.local_mtime;
925 if (update == FORCE ||
926 age > cct->_conf->osd_deep_scrub_update_digest_min_age) {
927 dout(20) << __func__ << " will update digest on " << *k << dendl;
928 missing_digest[*k] = make_pair(auth_object.digest,
929 auth_object.omap_digest);
930 } else {
931 dout(20) << __func__ << " missing digest but age " << age
932 << " < " << cct->_conf->osd_deep_scrub_update_digest_min_age
933 << " on " << *k << dendl;
934 }
935 }
936 }
937out:
938 if (object_error.has_deep_errors())
939 ++deep_errors;
940 else if (object_error.has_shallow_errors())
941 ++shallow_errors;
942 if (object_error.errors || object_error.union_shards.errors) {
943 store->add_object_error(k->pool, object_error);
944 }
945 }
946}