1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2013,2014 Inktank Storage, Inc.
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 */
19 #include "common/errno.h"
20 #include "common/scrub_types.h"
21 #include "ReplicatedBackend.h"
22 #include "ScrubStore.h"
23 #include "ECBackend.h"
24 #include "PGBackend.h"
26 #include "erasure-code/ErasureCodePlugin.h"
29 #include "common/LogClient.h"
30 #include "messages/MOSDPGRecoveryDelete.h"
31 #include "messages/MOSDPGRecoveryDeleteReply.h"
33 #define dout_context cct
34 #define dout_subsys ceph_subsys_osd
35 #define DOUT_PREFIX_ARGS this
37 #define dout_prefix _prefix(_dout, this)
38 static ostream
& _prefix(std::ostream
*_dout
, PGBackend
*pgb
) {
39 return *_dout
<< pgb
->get_parent()->gen_dbg_prefix();
42 void PGBackend::recover_delete_object(const hobject_t
&oid
, eversion_t v
,
45 assert(get_parent()->get_actingbackfill_shards().size() > 0);
46 for (const auto& shard
: get_parent()->get_actingbackfill_shards()) {
47 if (shard
== get_parent()->whoami_shard())
49 if (get_parent()->get_shard_missing(shard
).is_missing(oid
)) {
50 dout(20) << __func__
<< " will remove " << oid
<< " " << v
<< " from "
52 h
->deletes
[shard
].push_back(make_pair(oid
, v
));
53 get_parent()->begin_peer_recover(shard
, oid
);
58 void PGBackend::send_recovery_deletes(int prio
,
59 const map
<pg_shard_t
, vector
<pair
<hobject_t
, eversion_t
> > > &deletes
)
61 epoch_t min_epoch
= get_parent()->get_last_peering_reset_epoch();
62 for (const auto& p
: deletes
) {
63 const auto& shard
= p
.first
;
64 const auto& objects
= p
.second
;
65 ConnectionRef con
= get_parent()->get_con_osd_cluster(
67 get_osdmap()->get_epoch());
70 auto it
= objects
.begin();
71 while (it
!= objects
.end()) {
74 spg_t target_pg
= spg_t(get_parent()->get_info().pgid
.pgid
, shard
.shard
);
75 MOSDPGRecoveryDelete
*msg
=
76 new MOSDPGRecoveryDelete(get_parent()->whoami_shard(),
78 get_osdmap()->get_epoch(),
80 msg
->set_priority(prio
);
82 while (it
!= objects
.end() &&
83 cost
< cct
->_conf
->osd_max_push_cost
&&
84 deletes
< cct
->_conf
->osd_max_push_objects
) {
85 dout(20) << __func__
<< ": sending recovery delete << " << it
->first
86 << " " << it
->second
<< " to osd." << shard
<< dendl
;
87 msg
->objects
.push_back(*it
);
88 cost
+= cct
->_conf
->osd_push_per_object_cost
;
94 get_parent()->send_message_osd_cluster(msg
, con
);
99 bool PGBackend::handle_message(OpRequestRef op
)
101 switch (op
->get_req()->get_type()) {
102 case MSG_OSD_PG_RECOVERY_DELETE
:
103 handle_recovery_delete(op
);
106 case MSG_OSD_PG_RECOVERY_DELETE_REPLY
:
107 handle_recovery_delete_reply(op
);
114 return _handle_message(op
);
117 void PGBackend::handle_recovery_delete(OpRequestRef op
)
119 const MOSDPGRecoveryDelete
*m
= static_cast<const MOSDPGRecoveryDelete
*>(op
->get_req());
120 assert(m
->get_type() == MSG_OSD_PG_RECOVERY_DELETE
);
121 dout(20) << __func__
<< " " << op
<< dendl
;
125 C_GatherBuilder
gather(cct
);
126 for (const auto &p
: m
->objects
) {
127 get_parent()->remove_missing_object(p
.first
, p
.second
, gather
.new_sub());
130 MOSDPGRecoveryDeleteReply
*reply
= new MOSDPGRecoveryDeleteReply
;
131 reply
->from
= get_parent()->whoami_shard();
132 reply
->set_priority(m
->get_priority());
133 reply
->pgid
= spg_t(get_parent()->get_info().pgid
.pgid
, m
->from
.shard
);
134 reply
->map_epoch
= m
->map_epoch
;
135 reply
->min_epoch
= m
->min_epoch
;
136 reply
->objects
= m
->objects
;
137 ConnectionRef conn
= m
->get_connection();
139 gather
.set_finisher(new FunctionContext(
142 get_parent()->send_message_osd_cluster(reply
, conn
.get());
150 void PGBackend::handle_recovery_delete_reply(OpRequestRef op
)
152 const MOSDPGRecoveryDeleteReply
*m
= static_cast<const MOSDPGRecoveryDeleteReply
*>(op
->get_req());
153 assert(m
->get_type() == MSG_OSD_PG_RECOVERY_DELETE_REPLY
);
154 dout(20) << __func__
<< " " << op
<< dendl
;
156 for (const auto &p
: m
->objects
) {
157 ObjectRecoveryInfo recovery_info
;
158 hobject_t oid
= p
.first
;
159 recovery_info
.version
= p
.second
;
160 get_parent()->on_peer_recover(m
->from
, oid
, recovery_info
);
161 bool peers_recovered
= true;
162 for (const auto& shard
: get_parent()->get_actingbackfill_shards()) {
163 if (shard
== get_parent()->whoami_shard())
165 if (get_parent()->get_shard_missing(shard
).is_missing(oid
)) {
166 dout(20) << __func__
<< " " << oid
<< " still missing on at least "
168 peers_recovered
= false;
172 if (peers_recovered
&& !get_parent()->get_local_missing().is_missing(oid
)) {
173 dout(20) << __func__
<< " completed recovery, local_missing = "
174 << get_parent()->get_local_missing() << dendl
;
175 object_stat_sum_t stat_diff
;
176 stat_diff
.num_objects_recovered
= 1;
177 get_parent()->on_global_recover(p
.first
, stat_diff
, true);
182 void PGBackend::rollback(
183 const pg_log_entry_t
&entry
,
184 ObjectStore::Transaction
*t
)
187 struct RollbackVisitor
: public ObjectModDesc::Visitor
{
188 const hobject_t
&hoid
;
190 ObjectStore::Transaction t
;
192 const hobject_t
&hoid
,
193 PGBackend
*pg
) : hoid(hoid
), pg(pg
) {}
194 void append(uint64_t old_size
) override
{
195 ObjectStore::Transaction temp
;
196 pg
->rollback_append(hoid
, old_size
, &temp
);
200 void setattrs(map
<string
, boost::optional
<bufferlist
> > &attrs
) override
{
201 ObjectStore::Transaction temp
;
202 pg
->rollback_setattrs(hoid
, attrs
, &temp
);
206 void rmobject(version_t old_version
) override
{
207 ObjectStore::Transaction temp
;
208 pg
->rollback_stash(hoid
, old_version
, &temp
);
212 void try_rmobject(version_t old_version
) override
{
213 ObjectStore::Transaction temp
;
214 pg
->rollback_try_stash(hoid
, old_version
, &temp
);
218 void create() override
{
219 ObjectStore::Transaction temp
;
220 pg
->rollback_create(hoid
, &temp
);
224 void update_snaps(const set
<snapid_t
> &snaps
) override
{
225 ObjectStore::Transaction temp
;
226 pg
->get_parent()->pgb_set_object_snap_mapping(hoid
, snaps
, &temp
);
230 void rollback_extents(
232 const vector
<pair
<uint64_t, uint64_t> > &extents
) override
{
233 ObjectStore::Transaction temp
;
234 pg
->rollback_extents(gen
, extents
, hoid
, &temp
);
240 assert(entry
.mod_desc
.can_rollback());
241 RollbackVisitor
vis(entry
.soid
, this);
242 entry
.mod_desc
.visit(&vis
);
246 struct Trimmer
: public ObjectModDesc::Visitor
{
247 const hobject_t
&soid
;
249 ObjectStore::Transaction
*t
;
251 const hobject_t
&soid
,
253 ObjectStore::Transaction
*t
)
254 : soid(soid
), pg(pg
), t(t
) {}
255 void rmobject(version_t old_version
) override
{
256 pg
->trim_rollback_object(
261 // try_rmobject defaults to rmobject
262 void rollback_extents(
264 const vector
<pair
<uint64_t, uint64_t> > &extents
) override
{
265 pg
->trim_rollback_object(
272 void PGBackend::rollforward(
273 const pg_log_entry_t
&entry
,
274 ObjectStore::Transaction
*t
)
276 auto dpp
= get_parent()->get_dpp();
277 ldpp_dout(dpp
, 20) << __func__
<< ": entry=" << entry
<< dendl
;
278 if (!entry
.can_rollback())
280 Trimmer
trimmer(entry
.soid
, this, t
);
281 entry
.mod_desc
.visit(&trimmer
);
284 void PGBackend::trim(
285 const pg_log_entry_t
&entry
,
286 ObjectStore::Transaction
*t
)
288 if (!entry
.can_rollback())
290 Trimmer
trimmer(entry
.soid
, this, t
);
291 entry
.mod_desc
.visit(&trimmer
);
294 void PGBackend::try_stash(
295 const hobject_t
&hoid
,
297 ObjectStore::Transaction
*t
)
301 ghobject_t(hoid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
),
302 ghobject_t(hoid
, v
, get_parent()->whoami_shard().shard
));
305 void PGBackend::remove(
306 const hobject_t
&hoid
,
307 ObjectStore::Transaction
*t
) {
308 assert(!hoid
.is_temp());
311 ghobject_t(hoid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
));
312 get_parent()->pgb_clear_object_snap_mapping(hoid
, t
);
315 void PGBackend::on_change_cleanup(ObjectStore::Transaction
*t
)
317 dout(10) << __func__
<< dendl
;
319 for (set
<hobject_t
>::iterator i
= temp_contents
.begin();
320 i
!= temp_contents
.end();
322 dout(10) << __func__
<< ": Removing oid "
323 << *i
<< " from the temp collection" << dendl
;
326 ghobject_t(*i
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
));
328 temp_contents
.clear();
331 int PGBackend::objects_list_partial(
332 const hobject_t
&begin
,
335 vector
<hobject_t
> *ls
,
339 // Starts with the smallest generation to make sure the result list
340 // has the marker object (it might have multiple generations
341 // though, which would be filtered).
344 _next
= ghobject_t(begin
, 0, get_parent()->whoami_shard().shard
);
351 while (!_next
.is_max() && ls
->size() < (unsigned)min
) {
352 vector
<ghobject_t
> objects
;
353 r
= store
->collection_list(
356 ghobject_t::get_max(),
361 derr
<< __func__
<< " list collection " << ch
<< " got: " << cpp_strerror(r
) << dendl
;
364 for (vector
<ghobject_t
>::iterator i
= objects
.begin();
367 if (i
->is_pgmeta() || i
->hobj
.is_temp()) {
370 if (i
->is_no_gen()) {
371 ls
->push_back(i
->hobj
);
380 int PGBackend::objects_list_range(
381 const hobject_t
&start
,
382 const hobject_t
&end
,
384 vector
<hobject_t
> *ls
,
385 vector
<ghobject_t
> *gen_obs
)
388 vector
<ghobject_t
> objects
;
389 int r
= store
->collection_list(
391 ghobject_t(start
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
),
392 ghobject_t(end
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
),
396 ls
->reserve(objects
.size());
397 for (vector
<ghobject_t
>::iterator i
= objects
.begin();
400 if (i
->is_pgmeta() || i
->hobj
.is_temp()) {
403 if (i
->is_no_gen()) {
404 ls
->push_back(i
->hobj
);
405 } else if (gen_obs
) {
406 gen_obs
->push_back(*i
);
412 int PGBackend::objects_get_attr(
413 const hobject_t
&hoid
,
418 int r
= store
->getattr(
420 ghobject_t(hoid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
),
425 out
->push_back(std::move(bp
));
430 int PGBackend::objects_get_attrs(
431 const hobject_t
&hoid
,
432 map
<string
, bufferlist
> *out
)
434 return store
->getattrs(
436 ghobject_t(hoid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
),
440 void PGBackend::rollback_setattrs(
441 const hobject_t
&hoid
,
442 map
<string
, boost::optional
<bufferlist
> > &old_attrs
,
443 ObjectStore::Transaction
*t
) {
444 map
<string
, bufferlist
> to_set
;
445 assert(!hoid
.is_temp());
446 for (map
<string
, boost::optional
<bufferlist
> >::iterator i
= old_attrs
.begin();
447 i
!= old_attrs
.end();
450 to_set
[i
->first
] = i
->second
.get();
454 ghobject_t(hoid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
),
460 ghobject_t(hoid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
),
464 void PGBackend::rollback_append(
465 const hobject_t
&hoid
,
467 ObjectStore::Transaction
*t
) {
468 assert(!hoid
.is_temp());
471 ghobject_t(hoid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
),
475 void PGBackend::rollback_stash(
476 const hobject_t
&hoid
,
477 version_t old_version
,
478 ObjectStore::Transaction
*t
) {
479 assert(!hoid
.is_temp());
482 ghobject_t(hoid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
));
483 t
->collection_move_rename(
485 ghobject_t(hoid
, old_version
, get_parent()->whoami_shard().shard
),
487 ghobject_t(hoid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
));
490 void PGBackend::rollback_try_stash(
491 const hobject_t
&hoid
,
492 version_t old_version
,
493 ObjectStore::Transaction
*t
) {
494 assert(!hoid
.is_temp());
497 ghobject_t(hoid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
));
500 ghobject_t(hoid
, old_version
, get_parent()->whoami_shard().shard
),
501 ghobject_t(hoid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
));
504 void PGBackend::rollback_extents(
506 const vector
<pair
<uint64_t, uint64_t> > &extents
,
507 const hobject_t
&hoid
,
508 ObjectStore::Transaction
*t
) {
509 auto shard
= get_parent()->whoami_shard().shard
;
510 for (auto &&extent
: extents
) {
513 ghobject_t(hoid
, gen
, shard
),
514 ghobject_t(hoid
, ghobject_t::NO_GEN
, shard
),
521 ghobject_t(hoid
, gen
, shard
));
524 void PGBackend::trim_rollback_object(
525 const hobject_t
&hoid
,
526 version_t old_version
,
527 ObjectStore::Transaction
*t
) {
528 assert(!hoid
.is_temp());
530 coll
, ghobject_t(hoid
, old_version
, get_parent()->whoami_shard().shard
));
533 PGBackend
*PGBackend::build_pg_backend(
534 const pg_pool_t
&pool
,
535 const OSDMapRef curmap
,
538 ObjectStore::CollectionHandle
&ch
,
543 case pg_pool_t::TYPE_REPLICATED
: {
544 return new ReplicatedBackend(l
, coll
, ch
, store
, cct
);
546 case pg_pool_t::TYPE_ERASURE
: {
547 ErasureCodeInterfaceRef ec_impl
;
548 ErasureCodeProfile profile
= curmap
->get_erasure_code_profile(pool
.erasure_code_profile
);
549 assert(profile
.count("plugin"));
551 ceph::ErasureCodePluginRegistry::instance().factory(
552 profile
.find("plugin")->second
,
553 cct
->_conf
->get_val
<std::string
>("erasure_code_dir"),
558 return new ECBackend(
573 int PGBackend::be_scan_list(
575 ScrubMapBuilder
&pos
)
577 dout(10) << __func__
<< " " << pos
<< dendl
;
579 assert(pos
.pos
< pos
.ls
.size());
580 hobject_t
& poid
= pos
.ls
[pos
.pos
];
586 poid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
),
590 ScrubMap::object
&o
= map
.objects
[poid
];
596 poid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
),
600 r
= be_deep_scrub(poid
, map
, pos
, o
);
602 dout(25) << __func__
<< " " << poid
<< dendl
;
603 } else if (r
== -ENOENT
) {
604 dout(25) << __func__
<< " " << poid
<< " got " << r
605 << ", skipping" << dendl
;
606 } else if (r
== -EIO
) {
607 dout(25) << __func__
<< " " << poid
<< " got " << r
608 << ", stat_error" << dendl
;
609 ScrubMap::object
&o
= map
.objects
[poid
];
612 derr
<< __func__
<< " got: " << cpp_strerror(r
) << dendl
;
615 if (r
== -EINPROGRESS
) {
622 bool PGBackend::be_compare_scrub_objects(
623 pg_shard_t auth_shard
,
624 const ScrubMap::object
&auth
,
625 const object_info_t
& auth_oi
,
626 const ScrubMap::object
&candidate
,
627 shard_info_wrapper
&shard_result
,
628 inconsistent_obj_wrapper
&obj_result
,
629 ostream
&errorstream
)
631 enum { CLEAN
, FOUND_ERROR
} error
= CLEAN
;
632 if (candidate
.stat_error
) {
633 assert(shard_result
.has_stat_error());
635 errorstream
<< "candidate had a stat error";
637 if (candidate
.read_error
|| candidate
.ec_hash_mismatch
|| candidate
.ec_size_mismatch
) {
639 errorstream
<< "candidate had a read error";
641 if (auth
.digest_present
&& candidate
.digest_present
) {
642 if (auth
.digest
!= candidate
.digest
) {
646 errorstream
<< "data_digest 0x" << std::hex
<< candidate
.digest
647 << " != data_digest 0x" << auth
.digest
<< std::dec
648 << " from shard " << auth_shard
;
649 obj_result
.set_data_digest_mismatch();
652 if (auth
.omap_digest_present
&& candidate
.omap_digest_present
) {
653 if (auth
.omap_digest
!= candidate
.omap_digest
) {
657 errorstream
<< "omap_digest 0x" << std::hex
<< candidate
.omap_digest
658 << " != omap_digest 0x" << auth
.omap_digest
<< std::dec
659 << " from shard " << auth_shard
;
660 obj_result
.set_omap_digest_mismatch();
663 if (parent
->get_pool().is_replicated()) {
664 if (auth_oi
.is_data_digest() && candidate
.digest_present
) {
665 if (auth_oi
.data_digest
!= candidate
.digest
) {
669 errorstream
<< "data_digest 0x" << std::hex
<< candidate
.digest
670 << " != data_digest 0x" << auth_oi
.data_digest
<< std::dec
671 << " from auth oi " << auth_oi
;
672 shard_result
.set_data_digest_mismatch_info();
675 if (auth_oi
.is_omap_digest() && candidate
.omap_digest_present
) {
676 if (auth_oi
.omap_digest
!= candidate
.omap_digest
) {
680 errorstream
<< "omap_digest 0x" << std::hex
<< candidate
.omap_digest
681 << " != omap_digest 0x" << auth_oi
.omap_digest
<< std::dec
682 << " from auth oi " << auth_oi
;
683 shard_result
.set_omap_digest_mismatch_info();
687 if (candidate
.stat_error
)
688 return error
== FOUND_ERROR
;
689 uint64_t oi_size
= be_get_ondisk_size(auth_oi
.size
);
690 if (oi_size
!= candidate
.size
) {
694 errorstream
<< "size " << candidate
.size
695 << " != size " << oi_size
696 << " from auth oi " << auth_oi
;
697 shard_result
.set_size_mismatch_info();
699 if (auth
.size
!= candidate
.size
) {
703 errorstream
<< "size " << candidate
.size
704 << " != size " << auth
.size
705 << " from shard " << auth_shard
;
706 obj_result
.set_size_mismatch();
708 for (map
<string
,bufferptr
>::const_iterator i
= auth
.attrs
.begin();
709 i
!= auth
.attrs
.end();
711 // We check system keys seperately
712 if (i
->first
== OI_ATTR
|| i
->first
[0] != '_')
714 if (!candidate
.attrs
.count(i
->first
)) {
718 errorstream
<< "attr name mismatch '" << i
->first
<< "'";
719 obj_result
.set_attr_name_mismatch();
720 } else if (candidate
.attrs
.find(i
->first
)->second
.cmp(i
->second
)) {
724 errorstream
<< "attr value mismatch '" << i
->first
<< "'";
725 obj_result
.set_attr_value_mismatch();
728 for (map
<string
,bufferptr
>::const_iterator i
= candidate
.attrs
.begin();
729 i
!= candidate
.attrs
.end();
731 // We check system keys seperately
732 if (i
->first
== OI_ATTR
|| i
->first
[0] != '_')
734 if (!auth
.attrs
.count(i
->first
)) {
738 errorstream
<< "attr name mismatch '" << i
->first
<< "'";
739 obj_result
.set_attr_name_mismatch();
742 return error
== FOUND_ERROR
;
745 static int dcount(const object_info_t
&oi
, bool prioritize
)
748 // Prioritize bluestore objects when osd_distrust_data_digest is set
751 if (oi
.is_data_digest())
753 if (oi
.is_omap_digest())
758 map
<pg_shard_t
, ScrubMap
*>::const_iterator
759 PGBackend::be_select_auth_object(
760 const hobject_t
&obj
,
761 const map
<pg_shard_t
,ScrubMap
*> &maps
,
762 object_info_t
*auth_oi
,
763 map
<pg_shard_t
, shard_info_wrapper
> &shard_map
,
764 inconsistent_obj_wrapper
&object_error
,
767 eversion_t auth_version
;
768 bool auth_prio
= false;
769 bufferlist first_oi_bl
, first_ss_bl
, first_hk_bl
;
771 // Create list of shards with primary first so it will be auth copy all
772 // other things being equal.
773 list
<pg_shard_t
> shards
;
774 for (map
<pg_shard_t
, ScrubMap
*>::const_iterator j
= maps
.begin();
777 if (j
->first
== get_parent()->whoami_shard())
779 shards
.push_back(j
->first
);
781 shards
.push_front(get_parent()->whoami_shard());
783 map
<pg_shard_t
, ScrubMap
*>::const_iterator auth
= maps
.end();
785 for (auto &l
: shards
) {
786 bool oi_prio
= false;
787 map
<pg_shard_t
, ScrubMap
*>::const_iterator j
= maps
.find(l
);
788 map
<hobject_t
, ScrubMap::object
>::iterator i
=
789 j
->second
->objects
.find(obj
);
790 if (i
== j
->second
->objects
.end()) {
794 auto& shard_info
= shard_map
[j
->first
];
795 if (j
->first
== get_parent()->whoami_shard())
796 shard_info
.primary
= true;
797 if (i
->second
.read_error
) {
798 shard_info
.set_read_error();
799 error_string
+= " read_error";
801 if (i
->second
.ec_hash_mismatch
) {
802 shard_info
.set_ec_hash_mismatch();
803 error_string
+= " ec_hash_mismatch";
805 if (i
->second
.ec_size_mismatch
) {
806 shard_info
.set_ec_size_mismatch();
807 error_string
+= " ec_size_mismatch";
812 map
<string
, bufferptr
>::iterator k
;
814 bufferlist ss_bl
, hk_bl
;
816 if (i
->second
.stat_error
) {
817 shard_info
.set_stat_error();
818 error_string
+= " stat_error";
819 // With stat_error no further checking
820 // We don't need to also see a missing_object_info_attr
824 // We won't pick an auth copy if the snapset is missing or won't decode.
825 if (obj
.is_head() || obj
.is_snapdir()) {
826 k
= i
->second
.attrs
.find(SS_ATTR
);
827 if (k
== i
->second
.attrs
.end()) {
828 shard_info
.set_snapset_missing();
829 error_string
+= " snapset_missing";
831 ss_bl
.push_back(k
->second
);
833 bufferlist::iterator bliter
= ss_bl
.begin();
834 ::decode(ss
, bliter
);
835 if (first_ss_bl
.length() == 0) {
836 first_ss_bl
.append(ss_bl
);
837 } else if (!object_error
.has_snapset_inconsistency() && !ss_bl
.contents_equal(first_ss_bl
)) {
838 object_error
.set_snapset_inconsistency();
839 error_string
+= " snapset_inconsistency";
842 // invalid snapset, probably corrupt
843 shard_info
.set_snapset_corrupted();
844 error_string
+= " snapset_corrupted";
849 if (parent
->get_pool().is_erasure()) {
851 k
= i
->second
.attrs
.find(ECUtil::get_hinfo_key());
852 if (k
== i
->second
.attrs
.end()) {
853 shard_info
.set_hinfo_missing();
854 error_string
+= " hinfo_key_missing";
856 hk_bl
.push_back(k
->second
);
858 bufferlist::iterator bliter
= hk_bl
.begin();
860 if (first_hk_bl
.length() == 0) {
861 first_hk_bl
.append(hk_bl
);
862 } else if (!object_error
.has_hinfo_inconsistency() && !hk_bl
.contents_equal(first_hk_bl
)) {
863 object_error
.set_hinfo_inconsistency();
864 error_string
+= " hinfo_inconsistency";
867 // invalid snapset, probably corrupt
868 shard_info
.set_hinfo_corrupted();
869 error_string
+= " hinfo_corrupted";
874 k
= i
->second
.attrs
.find(OI_ATTR
);
875 if (k
== i
->second
.attrs
.end()) {
876 // no object info on object, probably corrupt
877 shard_info
.set_info_missing();
878 error_string
+= " info_missing";
881 bl
.push_back(k
->second
);
883 bufferlist::iterator bliter
= bl
.begin();
884 ::decode(oi
, bliter
);
886 // invalid object info, probably corrupt
887 shard_info
.set_info_corrupted();
888 error_string
+= " info_corrupted";
892 // This is automatically corrected in PG::_repair_oinfo_oid()
893 assert(oi
.soid
== obj
);
895 if (first_oi_bl
.length() == 0) {
896 first_oi_bl
.append(bl
);
897 } else if (!object_error
.has_object_info_inconsistency() && !bl
.contents_equal(first_oi_bl
)) {
898 object_error
.set_object_info_inconsistency();
899 error_string
+= " object_info_inconsistency";
902 if (i
->second
.size
!= be_get_ondisk_size(oi
.size
)) {
903 dout(5) << __func__
<< " size " << i
->second
.size
<< " oi size " << oi
.size
<< dendl
;
904 shard_info
.set_obj_size_info_mismatch();
905 error_string
+= " obj_size_info_mismatch";
908 // digest_match will only be true if computed digests are the same
909 if (auth_version
!= eversion_t()
910 && auth
->second
->objects
[obj
].digest_present
911 && i
->second
.digest_present
912 && auth
->second
->objects
[obj
].digest
!= i
->second
.digest
) {
913 digest_match
= false;
914 dout(10) << __func__
<< " digest_match = false, " << obj
<< " data_digest 0x" << std::hex
<< i
->second
.digest
915 << " != data_digest 0x" << auth
->second
->objects
[obj
].digest
<< std::dec
919 // Don't use this particular shard due to previous errors
920 // XXX: For now we can't pick one shard for repair and another's object info or snapset
921 if (shard_info
.errors
)
924 // XXX: Do I want replicated only?
925 if (parent
->get_pool().is_replicated() && cct
->_conf
->osd_distrust_data_digest
) {
926 // This is a boost::optional<bool> so see if option set AND it has the value true
927 // We give priority to a replica where the ObjectStore like BlueStore has builtin checksum
928 if (j
->second
->has_builtin_csum
&& j
->second
->has_builtin_csum
== true) {
933 if (auth_version
== eversion_t() || oi
.version
> auth_version
||
934 (oi
.version
== auth_version
&& dcount(oi
, oi_prio
) > dcount(*auth_oi
, auth_prio
))) {
937 auth_version
= oi
.version
;
942 // Check error_string because some errors already generated messages
943 if (error_string
!= "") {
944 dout(10) << __func__
<< ": error(s) osd " << j
->first
945 << " for obj " << obj
946 << "," << error_string
949 // Keep scanning other shards
951 dout(10) << __func__
<< ": selecting osd " << auth
->first
952 << " for obj " << obj
953 << " with oi " << *auth_oi
958 void PGBackend::be_compare_scrubmaps(
959 const map
<pg_shard_t
,ScrubMap
*> &maps
,
960 const set
<hobject_t
> &master_set
,
962 map
<hobject_t
, set
<pg_shard_t
>> &missing
,
963 map
<hobject_t
, set
<pg_shard_t
>> &inconsistent
,
964 map
<hobject_t
, list
<pg_shard_t
>> &authoritative
,
965 map
<hobject_t
, pair
<boost::optional
<uint32_t>,
966 boost::optional
<uint32_t>>> &missing_digest
,
967 int &shallow_errors
, int &deep_errors
,
970 const vector
<int> &acting
,
971 ostream
&errorstream
)
973 utime_t now
= ceph_clock_now();
975 // Check maps against master set and each other
976 for (set
<hobject_t
>::const_iterator k
= master_set
.begin();
977 k
!= master_set
.end();
979 object_info_t auth_oi
;
980 map
<pg_shard_t
, shard_info_wrapper
> shard_map
;
982 inconsistent_obj_wrapper object_error
{*k
};
985 map
<pg_shard_t
, ScrubMap
*>::const_iterator auth
=
986 be_select_auth_object(*k
, maps
, &auth_oi
, shard_map
, object_error
,
989 list
<pg_shard_t
> auth_list
;
990 set
<pg_shard_t
> object_errors
;
991 if (auth
== maps
.end()) {
992 object_error
.set_version(0);
993 object_error
.set_auth_missing(*k
, maps
, shard_map
, shallow_errors
,
994 deep_errors
, get_parent()->whoami_shard());
995 if (object_error
.has_deep_errors())
997 else if (object_error
.has_shallow_errors())
999 store
->add_object_error(k
->pool
, object_error
);
1000 errorstream
<< pgid
.pgid
<< " soid " << *k
1001 << ": failed to pick suitable object info\n";
1004 object_error
.set_version(auth_oi
.user_version
);
1005 ScrubMap::object
& auth_object
= auth
->second
->objects
[*k
];
1006 set
<pg_shard_t
> cur_missing
;
1007 set
<pg_shard_t
> cur_inconsistent
;
1008 bool fix_digest
= false;
1010 for (auto j
= maps
.cbegin(); j
!= maps
.cend(); ++j
) {
1012 shard_map
[auth
->first
].selected_oi
= true;
1013 if (j
->second
->objects
.count(*k
)) {
1014 shard_map
[j
->first
].set_object(j
->second
->objects
[*k
]);
1017 bool found
= be_compare_scrub_objects(auth
->first
,
1020 j
->second
->objects
[*k
],
1021 shard_map
[j
->first
],
1025 dout(20) << __func__
<< (repair
? " repair " : " ") << (parent
->get_pool().is_replicated() ? "replicated " : "")
1026 << (j
== auth
? "auth " : "") << "shards " << shard_map
.size() << (digest_match
? " digest_match " : " ")
1027 << (shard_map
[j
->first
].has_data_digest_mismatch_info() ? "info_mismatch " : "")
1028 << (shard_map
[j
->first
].only_data_digest_mismatch_info() ? "only" : "")
1031 if (cct
->_conf
->osd_distrust_data_digest
) {
1032 if (digest_match
&& parent
->get_pool().is_replicated()
1033 && shard_map
[j
->first
].has_data_digest_mismatch_info()) {
1036 shard_map
[j
->first
].clear_data_digest_mismatch_info();
1037 // If all replicas match, but they don't match object_info we can
1038 // repair it by using missing_digest mechanism
1039 } else if (repair
&& parent
->get_pool().is_replicated() && j
== auth
&& shard_map
.size() > 1
1040 && digest_match
&& shard_map
[j
->first
].only_data_digest_mismatch_info()
1041 && auth_object
.digest_present
) {
1042 // Set in missing_digests
1045 shard_map
[j
->first
].clear_data_digest_mismatch_info();
1046 errorstream
<< pgid
<< " : soid " << *k
<< " repairing object info data_digest" << "\n";
1048 // Some errors might have already been set in be_select_auth_object()
1049 if (shard_map
[j
->first
].errors
!= 0) {
1050 cur_inconsistent
.insert(j
->first
);
1051 if (shard_map
[j
->first
].has_deep_errors())
1055 // Only true if be_compare_scrub_objects() found errors and put something
1058 errorstream
<< pgid
<< " shard " << j
->first
<< ": soid " << *k
1059 << " " << ss
.str() << "\n";
1060 } else if (object_error
.errors
!= 0) {
1061 // Track possible shard to use as authoritative, if needed
1062 // There are errors, without identifying the shard
1063 object_errors
.insert(j
->first
);
1065 errorstream
<< pgid
<< " : soid " << *k
<< " " << ss
.str() << "\n";
1067 // XXX: The auth shard might get here that we don't know
1068 // that it has the "correct" data.
1069 auth_list
.push_back(j
->first
);
1072 cur_missing
.insert(j
->first
);
1073 shard_map
[j
->first
].set_missing();
1074 shard_map
[j
->first
].primary
= (j
->first
== get_parent()->whoami_shard());
1075 // Can't have any other errors if there is no information available
1077 errorstream
<< pgid
<< " shard " << j
->first
<< " missing " << *k
1080 object_error
.add_shard(j
->first
, shard_map
[j
->first
]);
1083 if (auth_list
.empty()) {
1084 if (object_errors
.empty()) {
1085 errorstream
<< pgid
.pgid
<< " soid " << *k
1086 << ": failed to pick suitable auth object\n";
1089 // Object errors exist and nothing in auth_list
1090 // Prefer the auth shard otherwise take first from list.
1092 if (object_errors
.count(auth
->first
)) {
1093 shard
= auth
->first
;
1095 shard
= *(object_errors
.begin());
1097 auth_list
.push_back(shard
);
1098 object_errors
.erase(shard
);
1100 // At this point auth_list is populated, so we add the object errors shards
1102 cur_inconsistent
.insert(object_errors
.begin(), object_errors
.end());
1103 if (!cur_missing
.empty()) {
1104 missing
[*k
] = cur_missing
;
1106 if (!cur_inconsistent
.empty()) {
1107 inconsistent
[*k
] = cur_inconsistent
;
1111 boost::optional
<uint32_t> data_digest
, omap_digest
;
1112 assert(auth_object
.digest_present
);
1113 data_digest
= auth_object
.digest
;
1114 if (auth_object
.omap_digest_present
) {
1115 omap_digest
= auth_object
.omap_digest
;
1117 missing_digest
[*k
] = make_pair(data_digest
, omap_digest
);
1119 // Special handling of this particular type of inconsistency
1120 // This can over-ride a data_digest or set an omap_digest
1121 // when all replicas match but the object info is wrong.
1122 if (!cur_inconsistent
.empty() || !cur_missing
.empty()) {
1123 authoritative
[*k
] = auth_list
;
1124 } else if (!fix_digest
&& parent
->get_pool().is_replicated()) {
1131 if (auth_object
.digest_present
&& !auth_oi
.is_data_digest()) {
1132 dout(20) << __func__
<< " missing data digest on " << *k
<< dendl
;
1135 if (auth_object
.omap_digest_present
&& !auth_oi
.is_omap_digest()) {
1136 dout(20) << __func__
<< " missing omap digest on " << *k
<< dendl
;
1140 // recorded digest != actual digest?
1141 if (auth_oi
.is_data_digest() && auth_object
.digest_present
&&
1142 auth_oi
.data_digest
!= auth_object
.digest
) {
1143 assert(cct
->_conf
->osd_distrust_data_digest
1144 || shard_map
[auth
->first
].has_data_digest_mismatch_info());
1145 errorstream
<< pgid
<< " recorded data digest 0x"
1146 << std::hex
<< auth_oi
.data_digest
<< " != on disk 0x"
1147 << auth_object
.digest
<< std::dec
<< " on " << auth_oi
.soid
1152 if (auth_oi
.is_omap_digest() && auth_object
.omap_digest_present
&&
1153 auth_oi
.omap_digest
!= auth_object
.omap_digest
) {
1154 assert(shard_map
[auth
->first
].has_omap_digest_mismatch_info());
1155 errorstream
<< pgid
<< " recorded omap digest 0x"
1156 << std::hex
<< auth_oi
.omap_digest
<< " != on disk 0x"
1157 << auth_object
.omap_digest
<< std::dec
1158 << " on " << auth_oi
.soid
<< "\n";
1164 utime_t age
= now
- auth_oi
.local_mtime
;
1165 if (update
== FORCE
||
1166 age
> cct
->_conf
->osd_deep_scrub_update_digest_min_age
) {
1167 boost::optional
<uint32_t> data_digest
, omap_digest
;
1168 if (auth_object
.digest_present
) {
1169 data_digest
= auth_object
.digest
;
1170 dout(20) << __func__
<< " will update data digest on " << *k
<< dendl
;
1172 if (auth_object
.omap_digest_present
) {
1173 omap_digest
= auth_object
.omap_digest
;
1174 dout(20) << __func__
<< " will update omap digest on " << *k
<< dendl
;
1176 missing_digest
[*k
] = make_pair(data_digest
, omap_digest
);
1178 dout(20) << __func__
<< " missing digest but age " << age
1179 << " < " << cct
->_conf
->osd_deep_scrub_update_digest_min_age
1180 << " on " << *k
<< dendl
;
1185 if (object_error
.has_deep_errors())
1187 else if (object_error
.has_shallow_errors())
1189 if (object_error
.errors
|| object_error
.union_shards
.errors
) {
1190 store
->add_object_error(k
->pool
, object_error
);
1195 void PGBackend::be_large_omap_check(const map
<pg_shard_t
,ScrubMap
*> &maps
,
1196 const set
<hobject_t
> &master_set
,
1197 int& large_omap_objects
,
1198 ostream
&warnstream
) const
1200 bool needs_check
= false;
1201 for (const auto& map
: maps
) {
1202 if (map
.second
->has_large_omap_object_errors
) {
1212 // Iterate through objects and check large omap object flag
1213 for (const auto& k
: master_set
) {
1214 for (const auto& map
: maps
) {
1215 ScrubMap::object
& obj
= map
.second
->objects
[k
];
1216 if (obj
.large_omap_object_found
) {
1217 large_omap_objects
++;
1218 warnstream
<< "Large omap object found. Object: " << k
<< " Key count: "
1219 << obj
.large_omap_object_key_count
<< " Size (bytes): "
1220 << obj
.large_omap_object_value_size
<< '\n';