// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2013,2014 Inktank Storage, Inc.
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 */
19 #include "common/errno.h"
20 #include "common/scrub_types.h"
21 #include "ReplicatedBackend.h"
22 #include "ScrubStore.h"
23 #include "ECBackend.h"
24 #include "PGBackend.h"
26 #include "erasure-code/ErasureCodePlugin.h"
29 #include "common/LogClient.h"
30 #include "messages/MOSDPGRecoveryDelete.h"
31 #include "messages/MOSDPGRecoveryDeleteReply.h"
33 #define dout_context cct
34 #define dout_subsys ceph_subsys_osd
35 #define DOUT_PREFIX_ARGS this
37 #define dout_prefix _prefix(_dout, this)
38 static ostream
& _prefix(std::ostream
*_dout
, PGBackend
*pgb
) {
39 return *_dout
<< pgb
->get_parent()->gen_dbg_prefix();
42 void PGBackend::recover_delete_object(const hobject_t
&oid
, eversion_t v
,
45 assert(get_parent()->get_actingbackfill_shards().size() > 0);
46 for (const auto& shard
: get_parent()->get_actingbackfill_shards()) {
47 if (shard
== get_parent()->whoami_shard())
49 if (get_parent()->get_shard_missing(shard
).is_missing(oid
)) {
50 dout(20) << __func__
<< " will remove " << oid
<< " " << v
<< " from "
52 h
->deletes
[shard
].push_back(make_pair(oid
, v
));
53 get_parent()->begin_peer_recover(shard
, oid
);
58 void PGBackend::send_recovery_deletes(int prio
,
59 const map
<pg_shard_t
, vector
<pair
<hobject_t
, eversion_t
> > > &deletes
)
61 epoch_t min_epoch
= get_parent()->get_last_peering_reset_epoch();
62 for (const auto& p
: deletes
) {
63 const auto& shard
= p
.first
;
64 const auto& objects
= p
.second
;
65 ConnectionRef con
= get_parent()->get_con_osd_cluster(
67 get_osdmap()->get_epoch());
70 auto it
= objects
.begin();
71 while (it
!= objects
.end()) {
74 spg_t target_pg
= spg_t(get_parent()->get_info().pgid
.pgid
, shard
.shard
);
75 MOSDPGRecoveryDelete
*msg
=
76 new MOSDPGRecoveryDelete(get_parent()->whoami_shard(),
78 get_osdmap()->get_epoch(),
80 msg
->set_priority(prio
);
82 while (it
!= objects
.end() &&
83 cost
< cct
->_conf
->osd_max_push_cost
&&
84 deletes
< cct
->_conf
->osd_max_push_objects
) {
85 dout(20) << __func__
<< ": sending recovery delete << " << it
->first
86 << " " << it
->second
<< " to osd." << shard
<< dendl
;
87 msg
->objects
.push_back(*it
);
88 cost
+= cct
->_conf
->osd_push_per_object_cost
;
94 get_parent()->send_message_osd_cluster(msg
, con
);
99 bool PGBackend::handle_message(OpRequestRef op
)
101 switch (op
->get_req()->get_type()) {
102 case MSG_OSD_PG_RECOVERY_DELETE
:
103 handle_recovery_delete(op
);
106 case MSG_OSD_PG_RECOVERY_DELETE_REPLY
:
107 handle_recovery_delete_reply(op
);
114 return _handle_message(op
);
117 void PGBackend::handle_recovery_delete(OpRequestRef op
)
119 const MOSDPGRecoveryDelete
*m
= static_cast<const MOSDPGRecoveryDelete
*>(op
->get_req());
120 assert(m
->get_type() == MSG_OSD_PG_RECOVERY_DELETE
);
121 dout(20) << __func__
<< " " << op
<< dendl
;
125 C_GatherBuilder
gather(cct
);
126 for (const auto &p
: m
->objects
) {
127 get_parent()->remove_missing_object(p
.first
, p
.second
, gather
.new_sub());
130 MOSDPGRecoveryDeleteReply
*reply
= new MOSDPGRecoveryDeleteReply
;
131 reply
->from
= get_parent()->whoami_shard();
132 reply
->set_priority(m
->get_priority());
133 reply
->pgid
= spg_t(get_parent()->get_info().pgid
.pgid
, m
->from
.shard
);
134 reply
->map_epoch
= m
->map_epoch
;
135 reply
->min_epoch
= m
->min_epoch
;
136 reply
->objects
= m
->objects
;
137 ConnectionRef conn
= m
->get_connection();
139 gather
.set_finisher(new FunctionContext(
142 get_parent()->send_message_osd_cluster(reply
, conn
.get());
148 void PGBackend::handle_recovery_delete_reply(OpRequestRef op
)
150 const MOSDPGRecoveryDeleteReply
*m
= static_cast<const MOSDPGRecoveryDeleteReply
*>(op
->get_req());
151 assert(m
->get_type() == MSG_OSD_PG_RECOVERY_DELETE_REPLY
);
152 dout(20) << __func__
<< " " << op
<< dendl
;
154 for (const auto &p
: m
->objects
) {
155 ObjectRecoveryInfo recovery_info
;
156 hobject_t oid
= p
.first
;
157 recovery_info
.version
= p
.second
;
158 get_parent()->on_peer_recover(m
->from
, oid
, recovery_info
);
159 bool peers_recovered
= true;
160 for (const auto& shard
: get_parent()->get_actingbackfill_shards()) {
161 if (shard
== get_parent()->whoami_shard())
163 if (get_parent()->get_shard_missing(shard
).is_missing(oid
)) {
164 dout(20) << __func__
<< " " << oid
<< " still missing on at least "
166 peers_recovered
= false;
170 if (peers_recovered
&& !get_parent()->get_local_missing().is_missing(oid
)) {
171 dout(20) << __func__
<< " completed recovery, local_missing = "
172 << get_parent()->get_local_missing() << dendl
;
173 object_stat_sum_t stat_diff
;
174 stat_diff
.num_objects_recovered
= 1;
175 get_parent()->on_global_recover(p
.first
, stat_diff
, true);
180 void PGBackend::rollback(
181 const pg_log_entry_t
&entry
,
182 ObjectStore::Transaction
*t
)
185 struct RollbackVisitor
: public ObjectModDesc::Visitor
{
186 const hobject_t
&hoid
;
188 ObjectStore::Transaction t
;
190 const hobject_t
&hoid
,
191 PGBackend
*pg
) : hoid(hoid
), pg(pg
) {}
192 void append(uint64_t old_size
) override
{
193 ObjectStore::Transaction temp
;
194 pg
->rollback_append(hoid
, old_size
, &temp
);
198 void setattrs(map
<string
, boost::optional
<bufferlist
> > &attrs
) override
{
199 ObjectStore::Transaction temp
;
200 pg
->rollback_setattrs(hoid
, attrs
, &temp
);
204 void rmobject(version_t old_version
) override
{
205 ObjectStore::Transaction temp
;
206 pg
->rollback_stash(hoid
, old_version
, &temp
);
210 void try_rmobject(version_t old_version
) override
{
211 ObjectStore::Transaction temp
;
212 pg
->rollback_try_stash(hoid
, old_version
, &temp
);
216 void create() override
{
217 ObjectStore::Transaction temp
;
218 pg
->rollback_create(hoid
, &temp
);
222 void update_snaps(const set
<snapid_t
> &snaps
) override
{
223 ObjectStore::Transaction temp
;
224 pg
->get_parent()->pgb_set_object_snap_mapping(hoid
, snaps
, &temp
);
228 void rollback_extents(
230 const vector
<pair
<uint64_t, uint64_t> > &extents
) override
{
231 ObjectStore::Transaction temp
;
232 pg
->rollback_extents(gen
, extents
, hoid
, &temp
);
238 assert(entry
.mod_desc
.can_rollback());
239 RollbackVisitor
vis(entry
.soid
, this);
240 entry
.mod_desc
.visit(&vis
);
244 struct Trimmer
: public ObjectModDesc::Visitor
{
245 const hobject_t
&soid
;
247 ObjectStore::Transaction
*t
;
249 const hobject_t
&soid
,
251 ObjectStore::Transaction
*t
)
252 : soid(soid
), pg(pg
), t(t
) {}
253 void rmobject(version_t old_version
) override
{
254 pg
->trim_rollback_object(
259 // try_rmobject defaults to rmobject
260 void rollback_extents(
262 const vector
<pair
<uint64_t, uint64_t> > &extents
) override
{
263 pg
->trim_rollback_object(
270 void PGBackend::rollforward(
271 const pg_log_entry_t
&entry
,
272 ObjectStore::Transaction
*t
)
274 auto dpp
= get_parent()->get_dpp();
275 ldpp_dout(dpp
, 20) << __func__
<< ": entry=" << entry
<< dendl
;
276 if (!entry
.can_rollback())
278 Trimmer
trimmer(entry
.soid
, this, t
);
279 entry
.mod_desc
.visit(&trimmer
);
282 void PGBackend::trim(
283 const pg_log_entry_t
&entry
,
284 ObjectStore::Transaction
*t
)
286 if (!entry
.can_rollback())
288 Trimmer
trimmer(entry
.soid
, this, t
);
289 entry
.mod_desc
.visit(&trimmer
);
292 void PGBackend::try_stash(
293 const hobject_t
&hoid
,
295 ObjectStore::Transaction
*t
)
299 ghobject_t(hoid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
),
300 ghobject_t(hoid
, v
, get_parent()->whoami_shard().shard
));
303 void PGBackend::remove(
304 const hobject_t
&hoid
,
305 ObjectStore::Transaction
*t
) {
306 assert(!hoid
.is_temp());
309 ghobject_t(hoid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
));
310 get_parent()->pgb_clear_object_snap_mapping(hoid
, t
);
313 void PGBackend::on_change_cleanup(ObjectStore::Transaction
*t
)
315 dout(10) << __func__
<< dendl
;
317 for (set
<hobject_t
>::iterator i
= temp_contents
.begin();
318 i
!= temp_contents
.end();
320 dout(10) << __func__
<< ": Removing oid "
321 << *i
<< " from the temp collection" << dendl
;
324 ghobject_t(*i
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
));
326 temp_contents
.clear();
329 int PGBackend::objects_list_partial(
330 const hobject_t
&begin
,
333 vector
<hobject_t
> *ls
,
337 // Starts with the smallest generation to make sure the result list
338 // has the marker object (it might have multiple generations
339 // though, which would be filtered).
342 _next
= ghobject_t(begin
, 0, get_parent()->whoami_shard().shard
);
349 while (!_next
.is_max() && ls
->size() < (unsigned)min
) {
350 vector
<ghobject_t
> objects
;
351 r
= store
->collection_list(
354 ghobject_t::get_max(),
359 derr
<< __func__
<< " list collection " << ch
<< " got: " << cpp_strerror(r
) << dendl
;
362 for (vector
<ghobject_t
>::iterator i
= objects
.begin();
365 if (i
->is_pgmeta() || i
->hobj
.is_temp()) {
368 if (i
->is_no_gen()) {
369 ls
->push_back(i
->hobj
);
378 int PGBackend::objects_list_range(
379 const hobject_t
&start
,
380 const hobject_t
&end
,
382 vector
<hobject_t
> *ls
,
383 vector
<ghobject_t
> *gen_obs
)
386 vector
<ghobject_t
> objects
;
387 int r
= store
->collection_list(
389 ghobject_t(start
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
),
390 ghobject_t(end
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
),
394 ls
->reserve(objects
.size());
395 for (vector
<ghobject_t
>::iterator i
= objects
.begin();
398 if (i
->is_pgmeta() || i
->hobj
.is_temp()) {
401 if (i
->is_no_gen()) {
402 ls
->push_back(i
->hobj
);
403 } else if (gen_obs
) {
404 gen_obs
->push_back(*i
);
410 int PGBackend::objects_get_attr(
411 const hobject_t
&hoid
,
416 int r
= store
->getattr(
418 ghobject_t(hoid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
),
423 out
->push_back(std::move(bp
));
428 int PGBackend::objects_get_attrs(
429 const hobject_t
&hoid
,
430 map
<string
, bufferlist
> *out
)
432 return store
->getattrs(
434 ghobject_t(hoid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
),
438 void PGBackend::rollback_setattrs(
439 const hobject_t
&hoid
,
440 map
<string
, boost::optional
<bufferlist
> > &old_attrs
,
441 ObjectStore::Transaction
*t
) {
442 map
<string
, bufferlist
> to_set
;
443 assert(!hoid
.is_temp());
444 for (map
<string
, boost::optional
<bufferlist
> >::iterator i
= old_attrs
.begin();
445 i
!= old_attrs
.end();
448 to_set
[i
->first
] = i
->second
.get();
452 ghobject_t(hoid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
),
458 ghobject_t(hoid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
),
462 void PGBackend::rollback_append(
463 const hobject_t
&hoid
,
465 ObjectStore::Transaction
*t
) {
466 assert(!hoid
.is_temp());
469 ghobject_t(hoid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
),
473 void PGBackend::rollback_stash(
474 const hobject_t
&hoid
,
475 version_t old_version
,
476 ObjectStore::Transaction
*t
) {
477 assert(!hoid
.is_temp());
480 ghobject_t(hoid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
));
481 t
->collection_move_rename(
483 ghobject_t(hoid
, old_version
, get_parent()->whoami_shard().shard
),
485 ghobject_t(hoid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
));
488 void PGBackend::rollback_try_stash(
489 const hobject_t
&hoid
,
490 version_t old_version
,
491 ObjectStore::Transaction
*t
) {
492 assert(!hoid
.is_temp());
495 ghobject_t(hoid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
));
498 ghobject_t(hoid
, old_version
, get_parent()->whoami_shard().shard
),
499 ghobject_t(hoid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
));
502 void PGBackend::rollback_extents(
504 const vector
<pair
<uint64_t, uint64_t> > &extents
,
505 const hobject_t
&hoid
,
506 ObjectStore::Transaction
*t
) {
507 auto shard
= get_parent()->whoami_shard().shard
;
508 for (auto &&extent
: extents
) {
511 ghobject_t(hoid
, gen
, shard
),
512 ghobject_t(hoid
, ghobject_t::NO_GEN
, shard
),
519 ghobject_t(hoid
, gen
, shard
));
522 void PGBackend::trim_rollback_object(
523 const hobject_t
&hoid
,
524 version_t old_version
,
525 ObjectStore::Transaction
*t
) {
526 assert(!hoid
.is_temp());
528 coll
, ghobject_t(hoid
, old_version
, get_parent()->whoami_shard().shard
));
531 PGBackend
*PGBackend::build_pg_backend(
532 const pg_pool_t
&pool
,
533 const OSDMapRef curmap
,
536 ObjectStore::CollectionHandle
&ch
,
541 case pg_pool_t::TYPE_REPLICATED
: {
542 return new ReplicatedBackend(l
, coll
, ch
, store
, cct
);
544 case pg_pool_t::TYPE_ERASURE
: {
545 ErasureCodeInterfaceRef ec_impl
;
546 ErasureCodeProfile profile
= curmap
->get_erasure_code_profile(pool
.erasure_code_profile
);
547 assert(profile
.count("plugin"));
549 ceph::ErasureCodePluginRegistry::instance().factory(
550 profile
.find("plugin")->second
,
551 cct
->_conf
->get_val
<std::string
>("erasure_code_dir"),
556 return new ECBackend(
572 * pg lock may or may not be held
574 void PGBackend::be_scan_list(
575 ScrubMap
&map
, const vector
<hobject_t
> &ls
, bool deep
, uint32_t seed
,
576 ThreadPool::TPHandle
&handle
)
578 dout(10) << __func__
<< " scanning " << ls
.size() << " objects"
579 << (deep
? " deeply" : "") << dendl
;
581 for (vector
<hobject_t
>::const_iterator p
= ls
.begin();
584 handle
.reset_tp_timeout();
591 poid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
),
595 ScrubMap::object
&o
= map
.objects
[poid
];
601 poid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
),
604 // calculate the CRC32 on deep scrubs
606 be_deep_scrub(*p
, seed
, o
, handle
);
609 dout(25) << __func__
<< " " << poid
<< dendl
;
610 } else if (r
== -ENOENT
) {
611 dout(25) << __func__
<< " " << poid
<< " got " << r
612 << ", skipping" << dendl
;
613 } else if (r
== -EIO
) {
614 dout(25) << __func__
<< " " << poid
<< " got " << r
615 << ", stat_error" << dendl
;
616 ScrubMap::object
&o
= map
.objects
[poid
];
619 derr
<< __func__
<< " got: " << cpp_strerror(r
) << dendl
;
625 bool PGBackend::be_compare_scrub_objects(
626 pg_shard_t auth_shard
,
627 const ScrubMap::object
&auth
,
628 const object_info_t
& auth_oi
,
629 const ScrubMap::object
&candidate
,
630 shard_info_wrapper
&shard_result
,
631 inconsistent_obj_wrapper
&obj_result
,
632 ostream
&errorstream
)
634 enum { CLEAN
, FOUND_ERROR
} error
= CLEAN
;
635 if (candidate
.stat_error
) {
636 assert(shard_result
.has_stat_error());
638 errorstream
<< "candidate had a stat error";
640 if (candidate
.read_error
|| candidate
.ec_hash_mismatch
|| candidate
.ec_size_mismatch
) {
642 errorstream
<< "candidate had a read error";
644 if (auth
.digest_present
&& candidate
.digest_present
) {
645 if (auth
.digest
!= candidate
.digest
) {
649 errorstream
<< "data_digest 0x" << std::hex
<< candidate
.digest
650 << " != data_digest 0x" << auth
.digest
<< std::dec
651 << " from shard " << auth_shard
;
652 obj_result
.set_data_digest_mismatch();
655 if (auth
.omap_digest_present
&& candidate
.omap_digest_present
) {
656 if (auth
.omap_digest
!= candidate
.omap_digest
) {
660 errorstream
<< "omap_digest 0x" << std::hex
<< candidate
.omap_digest
661 << " != omap_digest 0x" << auth
.omap_digest
<< std::dec
662 << " from shard " << auth_shard
;
663 obj_result
.set_omap_digest_mismatch();
666 if (parent
->get_pool().is_replicated()) {
667 if (auth_oi
.is_data_digest() && candidate
.digest_present
) {
668 if (auth_oi
.data_digest
!= candidate
.digest
) {
672 errorstream
<< "data_digest 0x" << std::hex
<< candidate
.digest
673 << " != data_digest 0x" << auth_oi
.data_digest
<< std::dec
674 << " from auth oi " << auth_oi
;
675 shard_result
.set_data_digest_mismatch_oi();
678 if (auth_oi
.is_omap_digest() && candidate
.omap_digest_present
) {
679 if (auth_oi
.omap_digest
!= candidate
.omap_digest
) {
683 errorstream
<< "omap_digest 0x" << std::hex
<< candidate
.omap_digest
684 << " != omap_digest 0x" << auth_oi
.omap_digest
<< std::dec
685 << " from auth oi " << auth_oi
;
686 shard_result
.set_omap_digest_mismatch_oi();
690 if (candidate
.stat_error
)
691 return error
== FOUND_ERROR
;
692 uint64_t oi_size
= be_get_ondisk_size(auth_oi
.size
);
693 if (oi_size
!= candidate
.size
) {
697 errorstream
<< "size " << candidate
.size
698 << " != size " << oi_size
699 << " from auth oi " << auth_oi
;
700 shard_result
.set_size_mismatch_oi();
702 if (auth
.size
!= candidate
.size
) {
706 errorstream
<< "size " << candidate
.size
707 << " != size " << auth
.size
708 << " from shard " << auth_shard
;
709 obj_result
.set_size_mismatch();
711 for (map
<string
,bufferptr
>::const_iterator i
= auth
.attrs
.begin();
712 i
!= auth
.attrs
.end();
714 if (!candidate
.attrs
.count(i
->first
)) {
718 errorstream
<< "attr name mismatch '" << i
->first
<< "'";
719 obj_result
.set_attr_name_mismatch();
720 } else if (candidate
.attrs
.find(i
->first
)->second
.cmp(i
->second
)) {
724 errorstream
<< "attr value mismatch '" << i
->first
<< "'";
725 obj_result
.set_attr_value_mismatch();
728 for (map
<string
,bufferptr
>::const_iterator i
= candidate
.attrs
.begin();
729 i
!= candidate
.attrs
.end();
731 if (!auth
.attrs
.count(i
->first
)) {
735 errorstream
<< "attr name mismatch '" << i
->first
<< "'";
736 obj_result
.set_attr_name_mismatch();
739 return error
== FOUND_ERROR
;
742 static int dcount(const object_info_t
&oi
)
745 if (oi
.is_data_digest())
747 if (oi
.is_omap_digest())
752 map
<pg_shard_t
, ScrubMap
*>::const_iterator
753 PGBackend::be_select_auth_object(
754 const hobject_t
&obj
,
755 const map
<pg_shard_t
,ScrubMap
*> &maps
,
756 object_info_t
*auth_oi
,
757 map
<pg_shard_t
, shard_info_wrapper
> &shard_map
,
758 inconsistent_obj_wrapper
&object_error
)
760 eversion_t auth_version
;
763 // Create list of shards with primary last so it will be auth copy all
764 // other things being equal.
765 list
<pg_shard_t
> shards
;
766 for (map
<pg_shard_t
, ScrubMap
*>::const_iterator j
= maps
.begin();
769 if (j
->first
== get_parent()->whoami_shard())
771 shards
.push_back(j
->first
);
773 shards
.push_back(get_parent()->whoami_shard());
775 map
<pg_shard_t
, ScrubMap
*>::const_iterator auth
= maps
.end();
776 for (auto &l
: shards
) {
777 map
<pg_shard_t
, ScrubMap
*>::const_iterator j
= maps
.find(l
);
778 map
<hobject_t
, ScrubMap::object
>::iterator i
=
779 j
->second
->objects
.find(obj
);
780 if (i
== j
->second
->objects
.end()) {
784 auto& shard_info
= shard_map
[j
->first
];
785 if (i
->second
.read_error
) {
786 shard_info
.set_read_error();
787 error_string
+= " read_error";
789 if (i
->second
.ec_hash_mismatch
) {
790 shard_info
.set_ec_hash_mismatch();
791 error_string
+= " ec_hash_mismatch";
793 if (i
->second
.ec_size_mismatch
) {
794 shard_info
.set_ec_size_mismatch();
795 error_string
+= " ec_size_mismatch";
800 map
<string
, bufferptr
>::iterator k
;
804 if (i
->second
.stat_error
) {
805 shard_info
.set_stat_error();
806 error_string
+= " stat_error";
807 // With stat_error no further checking
808 // We don't need to also see a missing_object_info_attr
812 k
= i
->second
.attrs
.find(OI_ATTR
);
813 if (k
== i
->second
.attrs
.end()) {
814 // no object info on object, probably corrupt
815 shard_info
.set_oi_attr_missing();
816 error_string
+= " oi_attr_missing";
819 bl
.push_back(k
->second
);
821 bufferlist::iterator bliter
= bl
.begin();
822 ::decode(oi
, bliter
);
824 // invalid object info, probably corrupt
825 shard_info
.set_oi_attr_corrupted();
826 error_string
+= " oi_attr_corrupted";
830 if (oi
.soid
!= obj
) {
831 shard_info
.set_oi_attr_corrupted();
832 error_string
+= " oi_attr_corrupted";
836 if (auth_version
!= eversion_t()) {
837 if (!object_error
.has_object_info_inconsistency() && !(bl
== auth_bl
)) {
838 object_error
.set_object_info_inconsistency();
839 error_string
+= " object_info_inconsistency";
843 // Don't use this particular shard because it won't be able to repair data
844 // XXX: For now we can't pick one shard for repair and another's object info
845 if (i
->second
.read_error
|| i
->second
.ec_hash_mismatch
|| i
->second
.ec_size_mismatch
)
848 // We don't set errors here for snapset, but we won't pick an auth copy if the
849 // snapset is missing or won't decode.
850 if (obj
.is_head() || obj
.is_snapdir()) {
851 k
= i
->second
.attrs
.find(SS_ATTR
);
852 if (k
== i
->second
.attrs
.end()) {
855 ss_bl
.push_back(k
->second
);
857 bufferlist::iterator bliter
= ss_bl
.begin();
858 ::decode(ss
, bliter
);
860 // invalid snapset, probably corrupt
865 if (auth_version
== eversion_t() || oi
.version
> auth_version
||
866 (oi
.version
== auth_version
&& dcount(oi
) > dcount(*auth_oi
))) {
869 auth_version
= oi
.version
;
875 // Check error_string because some errors already generated messages
876 if (error_string
!= "") {
877 dout(10) << __func__
<< ": error(s) osd " << j
->first
878 << " for obj " << obj
879 << "," << error_string
882 // Keep scanning other shards
884 dout(10) << __func__
<< ": selecting osd " << auth
->first
885 << " for obj " << obj
886 << " with oi " << *auth_oi
891 void PGBackend::be_compare_scrubmaps(
892 const map
<pg_shard_t
,ScrubMap
*> &maps
,
894 map
<hobject_t
, set
<pg_shard_t
>> &missing
,
895 map
<hobject_t
, set
<pg_shard_t
>> &inconsistent
,
896 map
<hobject_t
, list
<pg_shard_t
>> &authoritative
,
897 map
<hobject_t
, pair
<uint32_t,uint32_t>> &missing_digest
,
898 int &shallow_errors
, int &deep_errors
,
901 const vector
<int> &acting
,
902 ostream
&errorstream
)
904 map
<hobject_t
,ScrubMap::object
>::const_iterator i
;
905 map
<pg_shard_t
, ScrubMap
*>::const_iterator j
;
906 set
<hobject_t
> master_set
;
907 utime_t now
= ceph_clock_now();
909 // Construct master set
910 for (j
= maps
.begin(); j
!= maps
.end(); ++j
) {
911 for (i
= j
->second
->objects
.begin(); i
!= j
->second
->objects
.end(); ++i
) {
912 master_set
.insert(i
->first
);
916 // Check maps against master set and each other
917 for (set
<hobject_t
>::const_iterator k
= master_set
.begin();
918 k
!= master_set
.end();
920 object_info_t auth_oi
;
921 map
<pg_shard_t
, shard_info_wrapper
> shard_map
;
923 inconsistent_obj_wrapper object_error
{*k
};
925 map
<pg_shard_t
, ScrubMap
*>::const_iterator auth
=
926 be_select_auth_object(*k
, maps
, &auth_oi
, shard_map
, object_error
);
928 list
<pg_shard_t
> auth_list
;
929 set
<pg_shard_t
> object_errors
;
930 if (auth
== maps
.end()) {
931 object_error
.set_version(0);
932 object_error
.set_auth_missing(*k
, maps
, shard_map
, shallow_errors
, deep_errors
);
933 if (object_error
.has_deep_errors())
935 else if (object_error
.has_shallow_errors())
937 store
->add_object_error(k
->pool
, object_error
);
938 errorstream
<< pgid
.pgid
<< " soid " << *k
939 << ": failed to pick suitable object info\n";
942 object_error
.set_version(auth_oi
.user_version
);
943 ScrubMap::object
& auth_object
= auth
->second
->objects
[*k
];
944 set
<pg_shard_t
> cur_missing
;
945 set
<pg_shard_t
> cur_inconsistent
;
947 for (j
= maps
.begin(); j
!= maps
.end(); ++j
) {
949 shard_map
[auth
->first
].selected_oi
= true;
950 if (j
->second
->objects
.count(*k
)) {
951 shard_map
[j
->first
].set_object(j
->second
->objects
[*k
]);
954 bool found
= be_compare_scrub_objects(auth
->first
,
957 j
->second
->objects
[*k
],
961 // Some errors might have already been set in be_select_auth_object()
962 if (shard_map
[j
->first
].errors
!= 0) {
963 cur_inconsistent
.insert(j
->first
);
964 if (shard_map
[j
->first
].has_deep_errors())
968 // Only true if be_compare_scrub_objects() found errors and put something
971 errorstream
<< pgid
<< " shard " << j
->first
<< ": soid " << *k
972 << " " << ss
.str() << "\n";
974 // Track possible shard to use as authoritative, if needed
975 // There are errors, without identifying the shard
976 object_errors
.insert(j
->first
);
978 // XXX: The auth shard might get here that we don't know
979 // that it has the "correct" data.
980 auth_list
.push_back(j
->first
);
983 cur_missing
.insert(j
->first
);
984 shard_map
[j
->first
].set_missing();
985 // Can't have any other errors if there is no information available
987 errorstream
<< pgid
<< " shard " << j
->first
<< " missing " << *k
990 object_error
.add_shard(j
->first
, shard_map
[j
->first
]);
993 if (auth_list
.empty()) {
994 if (object_errors
.empty()) {
995 errorstream
<< pgid
.pgid
<< " soid " << *k
996 << ": failed to pick suitable auth object\n";
999 // Object errors exist and nothing in auth_list
1000 // Prefer the auth shard otherwise take first from list.
1002 if (object_errors
.count(auth
->first
)) {
1003 shard
= auth
->first
;
1005 shard
= *(object_errors
.begin());
1007 auth_list
.push_back(shard
);
1008 object_errors
.erase(shard
);
1010 // At this point auth_list is populated, so we add the object errors shards
1012 cur_inconsistent
.insert(object_errors
.begin(), object_errors
.end());
1013 if (!cur_missing
.empty()) {
1014 missing
[*k
] = cur_missing
;
1016 if (!cur_inconsistent
.empty()) {
1017 inconsistent
[*k
] = cur_inconsistent
;
1019 if (!cur_inconsistent
.empty() || !cur_missing
.empty()) {
1020 authoritative
[*k
] = auth_list
;
1021 } else if (parent
->get_pool().is_replicated()) {
1028 if (auth_object
.digest_present
&& auth_object
.omap_digest_present
&&
1029 (!auth_oi
.is_data_digest() || !auth_oi
.is_omap_digest())) {
1030 dout(20) << __func__
<< " missing digest on " << *k
<< dendl
;
1033 if (auth_object
.digest_present
&& auth_object
.omap_digest_present
&&
1034 cct
->_conf
->osd_debug_scrub_chance_rewrite_digest
&&
1035 (((unsigned)rand() % 100) >
1036 cct
->_conf
->osd_debug_scrub_chance_rewrite_digest
)) {
1037 dout(20) << __func__
<< " randomly updating digest on " << *k
<< dendl
;
1041 // recorded digest != actual digest?
1042 if (auth_oi
.is_data_digest() && auth_object
.digest_present
&&
1043 auth_oi
.data_digest
!= auth_object
.digest
) {
1044 assert(shard_map
[auth
->first
].has_data_digest_mismatch_oi());
1045 errorstream
<< pgid
<< " recorded data digest 0x"
1046 << std::hex
<< auth_oi
.data_digest
<< " != on disk 0x"
1047 << auth_object
.digest
<< std::dec
<< " on " << auth_oi
.soid
1052 if (auth_oi
.is_omap_digest() && auth_object
.omap_digest_present
&&
1053 auth_oi
.omap_digest
!= auth_object
.omap_digest
) {
1054 assert(shard_map
[auth
->first
].has_omap_digest_mismatch_oi());
1055 errorstream
<< pgid
<< " recorded omap digest 0x"
1056 << std::hex
<< auth_oi
.omap_digest
<< " != on disk 0x"
1057 << auth_object
.omap_digest
<< std::dec
1058 << " on " << auth_oi
.soid
<< "\n";
1064 utime_t age
= now
- auth_oi
.local_mtime
;
1065 if (update
== FORCE
||
1066 age
> cct
->_conf
->osd_deep_scrub_update_digest_min_age
) {
1067 dout(20) << __func__
<< " will update digest on " << *k
<< dendl
;
1068 missing_digest
[*k
] = make_pair(auth_object
.digest
,
1069 auth_object
.omap_digest
);
1071 dout(20) << __func__
<< " missing digest but age " << age
1072 << " < " << cct
->_conf
->osd_deep_scrub_update_digest_min_age
1073 << " on " << *k
<< dendl
;
1078 if (object_error
.has_deep_errors())
1080 else if (object_error
.has_shallow_errors())
1082 if (object_error
.errors
|| object_error
.union_shards
.errors
) {
1083 store
->add_object_error(k
->pool
, object_error
);