// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2013,2014 Inktank Storage, Inc.
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */
19 #include "common/errno.h"
20 #include "common/scrub_types.h"
21 #include "ReplicatedBackend.h"
22 #include "osd/scrubber/ScrubStore.h"
23 #include "ECBackend.h"
24 #include "PGBackend.h"
26 #include "erasure-code/ErasureCodePlugin.h"
29 #include "common/LogClient.h"
30 #include "messages/MOSDPGRecoveryDelete.h"
31 #include "messages/MOSDPGRecoveryDeleteReply.h"
38 using std::ostringstream
;
42 using std::stringstream
;
45 using ceph::bufferlist
;
46 using ceph::bufferptr
;
47 using ceph::ErasureCodeProfile
;
48 using ceph::ErasureCodeInterfaceRef
;
50 #define dout_context cct
51 #define dout_subsys ceph_subsys_osd
52 #define DOUT_PREFIX_ARGS this
54 #define dout_prefix _prefix(_dout, this)
55 static ostream
& _prefix(std::ostream
*_dout
, PGBackend
*pgb
) {
56 return pgb
->get_parent()->gen_dbg_prefix(*_dout
);
59 void PGBackend::recover_delete_object(const hobject_t
&oid
, eversion_t v
,
62 ceph_assert(get_parent()->get_acting_recovery_backfill_shards().size() > 0);
63 for (const auto& shard
: get_parent()->get_acting_recovery_backfill_shards()) {
64 if (shard
== get_parent()->whoami_shard())
66 if (get_parent()->get_shard_missing(shard
).is_missing(oid
)) {
67 dout(20) << __func__
<< " will remove " << oid
<< " " << v
<< " from "
69 h
->deletes
[shard
].push_back(make_pair(oid
, v
));
70 get_parent()->begin_peer_recover(shard
, oid
);
75 void PGBackend::send_recovery_deletes(int prio
,
76 const map
<pg_shard_t
, vector
<pair
<hobject_t
, eversion_t
> > > &deletes
)
78 epoch_t min_epoch
= get_parent()->get_last_peering_reset_epoch();
79 for (const auto& p
: deletes
) {
80 const auto& shard
= p
.first
;
81 const auto& objects
= p
.second
;
82 ConnectionRef con
= get_parent()->get_con_osd_cluster(
87 auto it
= objects
.begin();
88 while (it
!= objects
.end()) {
91 spg_t target_pg
= spg_t(get_parent()->get_info().pgid
.pgid
, shard
.shard
);
92 MOSDPGRecoveryDelete
*msg
=
93 new MOSDPGRecoveryDelete(get_parent()->whoami_shard(),
97 msg
->set_priority(prio
);
99 while (it
!= objects
.end() &&
100 cost
< cct
->_conf
->osd_max_push_cost
&&
101 deletes
< cct
->_conf
->osd_max_push_objects
) {
102 dout(20) << __func__
<< ": sending recovery delete << " << it
->first
103 << " " << it
->second
<< " to osd." << shard
<< dendl
;
104 msg
->objects
.push_back(*it
);
105 cost
+= cct
->_conf
->osd_push_per_object_cost
;
111 get_parent()->send_message_osd_cluster(msg
, con
);
116 bool PGBackend::handle_message(OpRequestRef op
)
118 switch (op
->get_req()->get_type()) {
119 case MSG_OSD_PG_RECOVERY_DELETE
:
120 handle_recovery_delete(op
);
123 case MSG_OSD_PG_RECOVERY_DELETE_REPLY
:
124 handle_recovery_delete_reply(op
);
131 return _handle_message(op
);
134 void PGBackend::handle_recovery_delete(OpRequestRef op
)
136 auto m
= op
->get_req
<MOSDPGRecoveryDelete
>();
137 ceph_assert(m
->get_type() == MSG_OSD_PG_RECOVERY_DELETE
);
138 dout(20) << __func__
<< " " << op
<< dendl
;
142 C_GatherBuilder
gather(cct
);
143 for (const auto &p
: m
->objects
) {
144 get_parent()->remove_missing_object(p
.first
, p
.second
, gather
.new_sub());
147 auto reply
= make_message
<MOSDPGRecoveryDeleteReply
>();
148 reply
->from
= get_parent()->whoami_shard();
149 reply
->set_priority(m
->get_priority());
150 reply
->pgid
= spg_t(get_parent()->get_info().pgid
.pgid
, m
->from
.shard
);
151 reply
->map_epoch
= m
->map_epoch
;
152 reply
->min_epoch
= m
->min_epoch
;
153 reply
->objects
= m
->objects
;
154 ConnectionRef conn
= m
->get_connection();
156 gather
.set_finisher(new LambdaContext(
159 get_parent()->send_message_osd_cluster(reply
, conn
.get());
167 void PGBackend::handle_recovery_delete_reply(OpRequestRef op
)
169 auto m
= op
->get_req
<MOSDPGRecoveryDeleteReply
>();
170 ceph_assert(m
->get_type() == MSG_OSD_PG_RECOVERY_DELETE_REPLY
);
171 dout(20) << __func__
<< " " << op
<< dendl
;
173 for (const auto &p
: m
->objects
) {
174 ObjectRecoveryInfo recovery_info
;
175 hobject_t oid
= p
.first
;
176 recovery_info
.version
= p
.second
;
177 get_parent()->on_peer_recover(m
->from
, oid
, recovery_info
);
178 bool peers_recovered
= true;
179 for (const auto& shard
: get_parent()->get_acting_recovery_backfill_shards()) {
180 if (shard
== get_parent()->whoami_shard())
182 if (get_parent()->get_shard_missing(shard
).is_missing(oid
)) {
183 dout(20) << __func__
<< " " << oid
<< " still missing on at least "
185 peers_recovered
= false;
189 if (peers_recovered
&& !get_parent()->get_local_missing().is_missing(oid
)) {
190 dout(20) << __func__
<< " completed recovery, local_missing = "
191 << get_parent()->get_local_missing() << dendl
;
192 object_stat_sum_t stat_diff
;
193 stat_diff
.num_objects_recovered
= 1;
194 get_parent()->on_global_recover(p
.first
, stat_diff
, true);
199 void PGBackend::rollback(
200 const pg_log_entry_t
&entry
,
201 ObjectStore::Transaction
*t
)
204 struct RollbackVisitor
: public ObjectModDesc::Visitor
{
205 const hobject_t
&hoid
;
207 ObjectStore::Transaction t
;
209 const hobject_t
&hoid
,
210 PGBackend
*pg
) : hoid(hoid
), pg(pg
) {}
211 void append(uint64_t old_size
) override
{
212 ObjectStore::Transaction temp
;
213 pg
->rollback_append(hoid
, old_size
, &temp
);
217 void setattrs(map
<string
, std::optional
<bufferlist
> > &attrs
) override
{
218 ObjectStore::Transaction temp
;
219 pg
->rollback_setattrs(hoid
, attrs
, &temp
);
223 void rmobject(version_t old_version
) override
{
224 ObjectStore::Transaction temp
;
225 pg
->rollback_stash(hoid
, old_version
, &temp
);
229 void try_rmobject(version_t old_version
) override
{
230 ObjectStore::Transaction temp
;
231 pg
->rollback_try_stash(hoid
, old_version
, &temp
);
235 void create() override
{
236 ObjectStore::Transaction temp
;
237 pg
->rollback_create(hoid
, &temp
);
241 void update_snaps(const set
<snapid_t
> &snaps
) override
{
242 ObjectStore::Transaction temp
;
243 pg
->get_parent()->pgb_set_object_snap_mapping(hoid
, snaps
, &temp
);
247 void rollback_extents(
249 const vector
<pair
<uint64_t, uint64_t> > &extents
) override
{
250 ObjectStore::Transaction temp
;
251 pg
->rollback_extents(gen
, extents
, hoid
, &temp
);
257 ceph_assert(entry
.mod_desc
.can_rollback());
258 RollbackVisitor
vis(entry
.soid
, this);
259 entry
.mod_desc
.visit(&vis
);
263 struct Trimmer
: public ObjectModDesc::Visitor
{
264 const hobject_t
&soid
;
266 ObjectStore::Transaction
*t
;
268 const hobject_t
&soid
,
270 ObjectStore::Transaction
*t
)
271 : soid(soid
), pg(pg
), t(t
) {}
272 void rmobject(version_t old_version
) override
{
273 pg
->trim_rollback_object(
278 // try_rmobject defaults to rmobject
279 void rollback_extents(
281 const vector
<pair
<uint64_t, uint64_t> > &extents
) override
{
282 pg
->trim_rollback_object(
289 void PGBackend::rollforward(
290 const pg_log_entry_t
&entry
,
291 ObjectStore::Transaction
*t
)
293 auto dpp
= get_parent()->get_dpp();
294 ldpp_dout(dpp
, 20) << __func__
<< ": entry=" << entry
<< dendl
;
295 if (!entry
.can_rollback())
297 Trimmer
trimmer(entry
.soid
, this, t
);
298 entry
.mod_desc
.visit(&trimmer
);
301 void PGBackend::trim(
302 const pg_log_entry_t
&entry
,
303 ObjectStore::Transaction
*t
)
305 if (!entry
.can_rollback())
307 Trimmer
trimmer(entry
.soid
, this, t
);
308 entry
.mod_desc
.visit(&trimmer
);
311 void PGBackend::try_stash(
312 const hobject_t
&hoid
,
314 ObjectStore::Transaction
*t
)
318 ghobject_t(hoid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
),
319 ghobject_t(hoid
, v
, get_parent()->whoami_shard().shard
));
322 void PGBackend::remove(
323 const hobject_t
&hoid
,
324 ObjectStore::Transaction
*t
) {
325 ceph_assert(!hoid
.is_temp());
328 ghobject_t(hoid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
));
329 get_parent()->pgb_clear_object_snap_mapping(hoid
, t
);
332 void PGBackend::on_change_cleanup(ObjectStore::Transaction
*t
)
334 dout(10) << __func__
<< dendl
;
336 for (set
<hobject_t
>::iterator i
= temp_contents
.begin();
337 i
!= temp_contents
.end();
339 dout(10) << __func__
<< ": Removing oid "
340 << *i
<< " from the temp collection" << dendl
;
343 ghobject_t(*i
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
));
345 temp_contents
.clear();
348 int PGBackend::objects_list_partial(
349 const hobject_t
&begin
,
352 vector
<hobject_t
> *ls
,
356 // Starts with the smallest generation to make sure the result list
357 // has the marker object (it might have multiple generations
358 // though, which would be filtered).
361 _next
= ghobject_t(begin
, 0, get_parent()->whoami_shard().shard
);
368 while (!_next
.is_max() && ls
->size() < (unsigned)min
) {
369 vector
<ghobject_t
> objects
;
370 if (HAVE_FEATURE(parent
->min_upacting_features(),
371 OSD_FIXED_COLLECTION_LIST
)) {
372 r
= store
->collection_list(
375 ghobject_t::get_max(),
380 r
= store
->collection_list_legacy(
383 ghobject_t::get_max(),
389 derr
<< __func__
<< " list collection " << ch
<< " got: " << cpp_strerror(r
) << dendl
;
392 for (vector
<ghobject_t
>::iterator i
= objects
.begin();
395 if (i
->is_pgmeta() || i
->hobj
.is_temp()) {
398 if (i
->is_no_gen()) {
399 ls
->push_back(i
->hobj
);
408 int PGBackend::objects_list_range(
409 const hobject_t
&start
,
410 const hobject_t
&end
,
411 vector
<hobject_t
> *ls
,
412 vector
<ghobject_t
> *gen_obs
)
415 vector
<ghobject_t
> objects
;
417 if (HAVE_FEATURE(parent
->min_upacting_features(),
418 OSD_FIXED_COLLECTION_LIST
)) {
419 r
= store
->collection_list(
421 ghobject_t(start
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
),
422 ghobject_t(end
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
),
427 r
= store
->collection_list_legacy(
429 ghobject_t(start
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
),
430 ghobject_t(end
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
),
435 ls
->reserve(objects
.size());
436 for (vector
<ghobject_t
>::iterator i
= objects
.begin();
439 if (i
->is_pgmeta() || i
->hobj
.is_temp()) {
442 if (i
->is_no_gen()) {
443 ls
->push_back(i
->hobj
);
444 } else if (gen_obs
) {
445 gen_obs
->push_back(*i
);
451 int PGBackend::objects_get_attr(
452 const hobject_t
&hoid
,
457 int r
= store
->getattr(
459 ghobject_t(hoid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
),
464 out
->push_back(std::move(bp
));
469 int PGBackend::objects_get_attrs(
470 const hobject_t
&hoid
,
471 map
<string
, bufferlist
, less
<>> *out
)
473 return store
->getattrs(
475 ghobject_t(hoid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
),
479 void PGBackend::rollback_setattrs(
480 const hobject_t
&hoid
,
481 map
<string
, std::optional
<bufferlist
> > &old_attrs
,
482 ObjectStore::Transaction
*t
) {
483 map
<string
, bufferlist
, less
<>> to_set
;
484 ceph_assert(!hoid
.is_temp());
485 for (map
<string
, std::optional
<bufferlist
> >::iterator i
= old_attrs
.begin();
486 i
!= old_attrs
.end();
489 to_set
[i
->first
] = *(i
->second
);
493 ghobject_t(hoid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
),
499 ghobject_t(hoid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
),
503 void PGBackend::rollback_append(
504 const hobject_t
&hoid
,
506 ObjectStore::Transaction
*t
) {
507 ceph_assert(!hoid
.is_temp());
510 ghobject_t(hoid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
),
514 void PGBackend::rollback_stash(
515 const hobject_t
&hoid
,
516 version_t old_version
,
517 ObjectStore::Transaction
*t
) {
518 ceph_assert(!hoid
.is_temp());
521 ghobject_t(hoid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
));
522 t
->collection_move_rename(
524 ghobject_t(hoid
, old_version
, get_parent()->whoami_shard().shard
),
526 ghobject_t(hoid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
));
529 void PGBackend::rollback_try_stash(
530 const hobject_t
&hoid
,
531 version_t old_version
,
532 ObjectStore::Transaction
*t
) {
533 ceph_assert(!hoid
.is_temp());
536 ghobject_t(hoid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
));
539 ghobject_t(hoid
, old_version
, get_parent()->whoami_shard().shard
),
540 ghobject_t(hoid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
));
543 void PGBackend::rollback_extents(
545 const vector
<pair
<uint64_t, uint64_t> > &extents
,
546 const hobject_t
&hoid
,
547 ObjectStore::Transaction
*t
) {
548 auto shard
= get_parent()->whoami_shard().shard
;
549 for (auto &&extent
: extents
) {
552 ghobject_t(hoid
, gen
, shard
),
553 ghobject_t(hoid
, ghobject_t::NO_GEN
, shard
),
560 ghobject_t(hoid
, gen
, shard
));
563 void PGBackend::trim_rollback_object(
564 const hobject_t
&hoid
,
565 version_t old_version
,
566 ObjectStore::Transaction
*t
) {
567 ceph_assert(!hoid
.is_temp());
569 coll
, ghobject_t(hoid
, old_version
, get_parent()->whoami_shard().shard
));
572 PGBackend
*PGBackend::build_pg_backend(
573 const pg_pool_t
&pool
,
574 const map
<string
,string
>& profile
,
577 ObjectStore::CollectionHandle
&ch
,
581 ErasureCodeProfile ec_profile
= profile
;
583 case pg_pool_t::TYPE_REPLICATED
: {
584 return new ReplicatedBackend(l
, coll
, ch
, store
, cct
);
586 case pg_pool_t::TYPE_ERASURE
: {
587 ErasureCodeInterfaceRef ec_impl
;
589 ceph::ErasureCodePluginRegistry::instance().factory(
590 profile
.find("plugin")->second
,
591 cct
->_conf
.get_val
<std::string
>("erasure_code_dir"),
595 ceph_assert(ec_impl
);
596 return new ECBackend(
611 int PGBackend::be_scan_list(
613 ScrubMapBuilder
&pos
)
615 dout(10) << __func__
<< " " << pos
<< dendl
;
616 ceph_assert(!pos
.done());
617 ceph_assert(pos
.pos
< pos
.ls
.size());
618 hobject_t
& poid
= pos
.ls
[pos
.pos
];
624 poid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
),
628 ScrubMap::object
&o
= map
.objects
[poid
];
630 ceph_assert(!o
.negative
);
634 poid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
),
638 r
= be_deep_scrub(poid
, map
, pos
, o
);
640 dout(25) << __func__
<< " " << poid
<< dendl
;
641 } else if (r
== -ENOENT
) {
642 dout(25) << __func__
<< " " << poid
<< " got " << r
643 << ", skipping" << dendl
;
644 } else if (r
== -EIO
) {
645 dout(25) << __func__
<< " " << poid
<< " got " << r
646 << ", stat_error" << dendl
;
647 ScrubMap::object
&o
= map
.objects
[poid
];
650 derr
<< __func__
<< " got: " << cpp_strerror(r
) << dendl
;
653 if (r
== -EINPROGRESS
) {
660 bool PGBackend::be_compare_scrub_objects(
661 pg_shard_t auth_shard
,
662 const ScrubMap::object
&auth
,
663 const object_info_t
& auth_oi
,
664 const ScrubMap::object
&candidate
,
665 shard_info_wrapper
&shard_result
,
666 inconsistent_obj_wrapper
&obj_result
,
667 ostream
&errorstream
,
670 enum { CLEAN
, FOUND_ERROR
} error
= CLEAN
;
671 if (auth
.digest_present
&& candidate
.digest_present
) {
672 if (auth
.digest
!= candidate
.digest
) {
676 errorstream
<< "data_digest 0x" << std::hex
<< candidate
.digest
677 << " != data_digest 0x" << auth
.digest
<< std::dec
678 << " from shard " << auth_shard
;
679 obj_result
.set_data_digest_mismatch();
682 if (auth
.omap_digest_present
&& candidate
.omap_digest_present
) {
683 if (auth
.omap_digest
!= candidate
.omap_digest
) {
687 errorstream
<< "omap_digest 0x" << std::hex
<< candidate
.omap_digest
688 << " != omap_digest 0x" << auth
.omap_digest
<< std::dec
689 << " from shard " << auth_shard
;
690 obj_result
.set_omap_digest_mismatch();
693 if (parent
->get_pool().is_replicated()) {
694 if (auth_oi
.is_data_digest() && candidate
.digest_present
) {
695 if (auth_oi
.data_digest
!= candidate
.digest
) {
699 errorstream
<< "data_digest 0x" << std::hex
<< candidate
.digest
700 << " != data_digest 0x" << auth_oi
.data_digest
<< std::dec
701 << " from auth oi " << auth_oi
;
702 shard_result
.set_data_digest_mismatch_info();
705 if (auth_oi
.is_omap_digest() && candidate
.omap_digest_present
) {
706 if (auth_oi
.omap_digest
!= candidate
.omap_digest
) {
710 errorstream
<< "omap_digest 0x" << std::hex
<< candidate
.omap_digest
711 << " != omap_digest 0x" << auth_oi
.omap_digest
<< std::dec
712 << " from auth oi " << auth_oi
;
713 shard_result
.set_omap_digest_mismatch_info();
717 if (candidate
.stat_error
)
718 return error
== FOUND_ERROR
;
719 if (!shard_result
.has_info_missing()
720 && !shard_result
.has_info_corrupted()) {
721 bufferlist can_bl
, auth_bl
;
722 auto can_attr
= candidate
.attrs
.find(OI_ATTR
);
723 auto auth_attr
= auth
.attrs
.find(OI_ATTR
);
725 ceph_assert(auth_attr
!= auth
.attrs
.end());
726 ceph_assert(can_attr
!= candidate
.attrs
.end());
728 can_bl
.push_back(can_attr
->second
);
729 auth_bl
.push_back(auth_attr
->second
);
730 if (!can_bl
.contents_equal(auth_bl
)) {
734 obj_result
.set_object_info_inconsistency();
735 errorstream
<< "object info inconsistent ";
739 if (!shard_result
.has_snapset_missing()
740 && !shard_result
.has_snapset_corrupted()) {
741 bufferlist can_bl
, auth_bl
;
742 auto can_attr
= candidate
.attrs
.find(SS_ATTR
);
743 auto auth_attr
= auth
.attrs
.find(SS_ATTR
);
745 ceph_assert(auth_attr
!= auth
.attrs
.end());
746 ceph_assert(can_attr
!= candidate
.attrs
.end());
748 can_bl
.push_back(can_attr
->second
);
749 auth_bl
.push_back(auth_attr
->second
);
750 if (!can_bl
.contents_equal(auth_bl
)) {
754 obj_result
.set_snapset_inconsistency();
755 errorstream
<< "snapset inconsistent ";
759 if (parent
->get_pool().is_erasure()) {
760 if (!shard_result
.has_hinfo_missing()
761 && !shard_result
.has_hinfo_corrupted()) {
762 bufferlist can_bl
, auth_bl
;
763 auto can_hi
= candidate
.attrs
.find(ECUtil::get_hinfo_key());
764 auto auth_hi
= auth
.attrs
.find(ECUtil::get_hinfo_key());
766 ceph_assert(auth_hi
!= auth
.attrs
.end());
767 ceph_assert(can_hi
!= candidate
.attrs
.end());
769 can_bl
.push_back(can_hi
->second
);
770 auth_bl
.push_back(auth_hi
->second
);
771 if (!can_bl
.contents_equal(auth_bl
)) {
775 obj_result
.set_hinfo_inconsistency();
776 errorstream
<< "hinfo inconsistent ";
780 uint64_t oi_size
= be_get_ondisk_size(auth_oi
.size
);
781 if (oi_size
!= candidate
.size
) {
785 errorstream
<< "size " << candidate
.size
786 << " != size " << oi_size
787 << " from auth oi " << auth_oi
;
788 shard_result
.set_size_mismatch_info();
790 if (auth
.size
!= candidate
.size
) {
794 errorstream
<< "size " << candidate
.size
795 << " != size " << auth
.size
796 << " from shard " << auth_shard
;
797 obj_result
.set_size_mismatch();
799 // If the replica is too large and we didn't already count it for this object
801 if (candidate
.size
> cct
->_conf
->osd_max_object_size
802 && !obj_result
.has_size_too_large()) {
806 errorstream
<< "size " << candidate
.size
807 << " > " << cct
->_conf
->osd_max_object_size
809 obj_result
.set_size_too_large();
811 for (map
<string
,bufferptr
>::const_iterator i
= auth
.attrs
.begin();
812 i
!= auth
.attrs
.end();
814 // We check system keys seperately
815 if (i
->first
== OI_ATTR
|| i
->first
[0] != '_')
817 if (!candidate
.attrs
.count(i
->first
)) {
821 errorstream
<< "attr name mismatch '" << i
->first
<< "'";
822 obj_result
.set_attr_name_mismatch();
823 } else if (candidate
.attrs
.find(i
->first
)->second
.cmp(i
->second
)) {
827 errorstream
<< "attr value mismatch '" << i
->first
<< "'";
828 obj_result
.set_attr_value_mismatch();
831 for (map
<string
,bufferptr
>::const_iterator i
= candidate
.attrs
.begin();
832 i
!= candidate
.attrs
.end();
834 // We check system keys seperately
835 if (i
->first
== OI_ATTR
|| i
->first
[0] != '_')
837 if (!auth
.attrs
.count(i
->first
)) {
841 errorstream
<< "attr name mismatch '" << i
->first
<< "'";
842 obj_result
.set_attr_name_mismatch();
845 return error
== FOUND_ERROR
;
848 static int dcount(const object_info_t
&oi
)
851 if (oi
.is_data_digest())
853 if (oi
.is_omap_digest())
858 map
<pg_shard_t
, ScrubMap
*>::const_iterator
859 PGBackend::be_select_auth_object(
860 const hobject_t
&obj
,
861 const map
<pg_shard_t
,ScrubMap
*> &maps
,
862 object_info_t
*auth_oi
,
863 map
<pg_shard_t
, shard_info_wrapper
> &shard_map
,
866 ostream
&errorstream
)
868 eversion_t auth_version
;
870 // Create list of shards with primary first so it will be auth copy all
871 // other things being equal.
872 list
<pg_shard_t
> shards
;
873 for (map
<pg_shard_t
, ScrubMap
*>::const_iterator j
= maps
.begin();
876 if (j
->first
== get_parent()->whoami_shard())
878 shards
.push_back(j
->first
);
880 shards
.push_front(get_parent()->whoami_shard());
882 map
<pg_shard_t
, ScrubMap
*>::const_iterator auth
= maps
.end();
884 for (auto &l
: shards
) {
885 ostringstream shard_errorstream
;
887 map
<pg_shard_t
, ScrubMap
*>::const_iterator j
= maps
.find(l
);
888 map
<hobject_t
, ScrubMap::object
>::iterator i
=
889 j
->second
->objects
.find(obj
);
890 if (i
== j
->second
->objects
.end()) {
893 auto& shard_info
= shard_map
[j
->first
];
894 if (j
->first
== get_parent()->whoami_shard())
895 shard_info
.primary
= true;
896 if (i
->second
.read_error
) {
897 shard_info
.set_read_error();
899 shard_errorstream
<< ", ";
901 shard_errorstream
<< "candidate had a read error";
903 if (i
->second
.ec_hash_mismatch
) {
904 shard_info
.set_ec_hash_mismatch();
906 shard_errorstream
<< ", ";
908 shard_errorstream
<< "candidate had an ec hash mismatch";
910 if (i
->second
.ec_size_mismatch
) {
911 shard_info
.set_ec_size_mismatch();
913 shard_errorstream
<< ", ";
915 shard_errorstream
<< "candidate had an ec size mismatch";
920 map
<string
, bufferptr
>::iterator k
;
922 bufferlist ss_bl
, hk_bl
;
924 if (i
->second
.stat_error
) {
925 shard_info
.set_stat_error();
927 shard_errorstream
<< ", ";
929 shard_errorstream
<< "candidate had a stat error";
930 // With stat_error no further checking
931 // We don't need to also see a missing_object_info_attr
935 // We won't pick an auth copy if the snapset is missing or won't decode.
936 ceph_assert(!obj
.is_snapdir());
938 k
= i
->second
.attrs
.find(SS_ATTR
);
939 if (k
== i
->second
.attrs
.end()) {
940 shard_info
.set_snapset_missing();
942 shard_errorstream
<< ", ";
944 shard_errorstream
<< "candidate had a missing snapset key";
946 ss_bl
.push_back(k
->second
);
948 auto bliter
= ss_bl
.cbegin();
951 // invalid snapset, probably corrupt
952 shard_info
.set_snapset_corrupted();
954 shard_errorstream
<< ", ";
956 shard_errorstream
<< "candidate had a corrupt snapset";
961 if (parent
->get_pool().is_erasure()) {
963 k
= i
->second
.attrs
.find(ECUtil::get_hinfo_key());
964 if (k
== i
->second
.attrs
.end()) {
965 shard_info
.set_hinfo_missing();
967 shard_errorstream
<< ", ";
969 shard_errorstream
<< "candidate had a missing hinfo key";
971 hk_bl
.push_back(k
->second
);
973 auto bliter
= hk_bl
.cbegin();
976 // invalid snapset, probably corrupt
977 shard_info
.set_hinfo_corrupted();
979 shard_errorstream
<< ", ";
981 shard_errorstream
<< "candidate had a corrupt hinfo";
986 k
= i
->second
.attrs
.find(OI_ATTR
);
987 if (k
== i
->second
.attrs
.end()) {
988 // no object info on object, probably corrupt
989 shard_info
.set_info_missing();
991 shard_errorstream
<< ", ";
993 shard_errorstream
<< "candidate had a missing info key";
996 bl
.push_back(k
->second
);
998 auto bliter
= bl
.cbegin();
1001 // invalid object info, probably corrupt
1002 shard_info
.set_info_corrupted();
1004 shard_errorstream
<< ", ";
1006 shard_errorstream
<< "candidate had a corrupt info";
1010 // This is automatically corrected in PG::_repair_oinfo_oid()
1011 ceph_assert(oi
.soid
== obj
);
1013 if (i
->second
.size
!= be_get_ondisk_size(oi
.size
)) {
1014 shard_info
.set_obj_size_info_mismatch();
1016 shard_errorstream
<< ", ";
1018 shard_errorstream
<< "candidate size " << i
->second
.size
<< " info size "
1019 << oi
.size
<< " mismatch";
1022 // digest_match will only be true if computed digests are the same
1023 if (auth_version
!= eversion_t()
1024 && auth
->second
->objects
[obj
].digest_present
1025 && i
->second
.digest_present
1026 && auth
->second
->objects
[obj
].digest
!= i
->second
.digest
) {
1027 digest_match
= false;
1028 dout(10) << __func__
<< " digest_match = false, " << obj
<< " data_digest 0x" << std::hex
<< i
->second
.digest
1029 << " != data_digest 0x" << auth
->second
->objects
[obj
].digest
<< std::dec
1033 // Don't use this particular shard due to previous errors
1034 // XXX: For now we can't pick one shard for repair and another's object info or snapset
1035 if (shard_info
.errors
)
1038 if (auth_version
== eversion_t() || oi
.version
> auth_version
||
1039 (oi
.version
== auth_version
&& dcount(oi
) > dcount(*auth_oi
))) {
1042 auth_version
= oi
.version
;
1047 errorstream
<< pgid
.pgid
<< " shard " << l
<< " soid " << obj
1048 << " : " << shard_errorstream
.str() << "\n";
1049 // Keep scanning other shards
1051 dout(10) << __func__
<< ": selecting osd " << auth
->first
1052 << " for obj " << obj
1053 << " with oi " << *auth_oi
1058 void PGBackend::be_compare_scrubmaps(
1059 const map
<pg_shard_t
,ScrubMap
*> &maps
,
1060 const set
<hobject_t
> &master_set
,
1062 map
<hobject_t
, set
<pg_shard_t
>> &missing
,
1063 map
<hobject_t
, set
<pg_shard_t
>> &inconsistent
,
1064 map
<hobject_t
, list
<pg_shard_t
>> &authoritative
,
1065 map
<hobject_t
, pair
<std::optional
<uint32_t>,
1066 std::optional
<uint32_t>>> &missing_digest
,
1067 int &shallow_errors
, int &deep_errors
,
1068 Scrub::Store
*store
,
1070 const vector
<int> &acting
,
1071 ostream
&errorstream
)
1073 utime_t now
= ceph_clock_now();
1075 // Check maps against master set and each other
1076 for (set
<hobject_t
>::const_iterator k
= master_set
.begin();
1077 k
!= master_set
.end();
1079 object_info_t auth_oi
;
1080 map
<pg_shard_t
, shard_info_wrapper
> shard_map
;
1082 inconsistent_obj_wrapper object_error
{*k
};
1085 map
<pg_shard_t
, ScrubMap
*>::const_iterator auth
=
1086 be_select_auth_object(*k
, maps
, &auth_oi
, shard_map
, digest_match
,
1089 list
<pg_shard_t
> auth_list
;
1090 set
<pg_shard_t
> object_errors
;
1091 if (auth
== maps
.end()) {
1092 object_error
.set_version(0);
1093 object_error
.set_auth_missing(*k
, maps
, shard_map
, shallow_errors
,
1094 deep_errors
, get_parent()->whoami_shard());
1095 if (object_error
.has_deep_errors())
1097 else if (object_error
.has_shallow_errors())
1099 store
->add_object_error(k
->pool
, object_error
);
1100 errorstream
<< pgid
.pgid
<< " soid " << *k
1101 << " : failed to pick suitable object info\n";
1104 object_error
.set_version(auth_oi
.user_version
);
1105 ScrubMap::object
& auth_object
= auth
->second
->objects
[*k
];
1106 set
<pg_shard_t
> cur_missing
;
1107 set
<pg_shard_t
> cur_inconsistent
;
1108 bool fix_digest
= false;
1110 for (auto j
= maps
.cbegin(); j
!= maps
.cend(); ++j
) {
1112 shard_map
[auth
->first
].selected_oi
= true;
1113 if (j
->second
->objects
.count(*k
)) {
1114 shard_map
[j
->first
].set_object(j
->second
->objects
[*k
]);
1117 bool found
= be_compare_scrub_objects(auth
->first
,
1120 j
->second
->objects
[*k
],
1121 shard_map
[j
->first
],
1126 dout(20) << __func__
<< (repair
? " repair " : " ") << (parent
->get_pool().is_replicated() ? "replicated " : "")
1127 << (j
== auth
? "auth" : "") << "shards " << shard_map
.size() << (digest_match
? " digest_match " : " ")
1128 << (shard_map
[j
->first
].only_data_digest_mismatch_info() ? "'info mismatch info'" : "")
1130 // If all replicas match, but they don't match object_info we can
1131 // repair it by using missing_digest mechanism
1132 if (repair
&& parent
->get_pool().is_replicated() && j
== auth
&& shard_map
.size() > 1
1133 && digest_match
&& shard_map
[j
->first
].only_data_digest_mismatch_info()
1134 && auth_object
.digest_present
) {
1135 // Set in missing_digests
1138 shard_map
[j
->first
].clear_data_digest_mismatch_info();
1139 errorstream
<< pgid
<< " soid " << *k
<< " : repairing object info data_digest" << "\n";
1141 // Some errors might have already been set in be_select_auth_object()
1142 if (shard_map
[j
->first
].errors
!= 0) {
1143 cur_inconsistent
.insert(j
->first
);
1144 if (shard_map
[j
->first
].has_deep_errors())
1148 // Only true if be_compare_scrub_objects() found errors and put something
1151 errorstream
<< pgid
<< " shard " << j
->first
<< " soid " << *k
1152 << " : " << ss
.str() << "\n";
1154 // Track possible shard to use as authoritative, if needed
1155 // There are errors, without identifying the shard
1156 object_errors
.insert(j
->first
);
1157 errorstream
<< pgid
<< " soid " << *k
<< " : " << ss
.str() << "\n";
1159 // XXX: The auth shard might get here that we don't know
1160 // that it has the "correct" data.
1161 auth_list
.push_back(j
->first
);
1164 cur_missing
.insert(j
->first
);
1165 shard_map
[j
->first
].set_missing();
1166 shard_map
[j
->first
].primary
= (j
->first
== get_parent()->whoami_shard());
1167 // Can't have any other errors if there is no information available
1169 errorstream
<< pgid
<< " shard " << j
->first
<< " " << *k
<< " : missing\n";
1171 object_error
.add_shard(j
->first
, shard_map
[j
->first
]);
1174 if (auth_list
.empty()) {
1175 if (object_errors
.empty()) {
1176 errorstream
<< pgid
.pgid
<< " soid " << *k
1177 << " : failed to pick suitable auth object\n";
1180 // Object errors exist and nothing in auth_list
1181 // Prefer the auth shard otherwise take first from list.
1183 if (object_errors
.count(auth
->first
)) {
1184 shard
= auth
->first
;
1186 shard
= *(object_errors
.begin());
1188 auth_list
.push_back(shard
);
1189 object_errors
.erase(shard
);
1191 // At this point auth_list is populated, so we add the object errors shards
1193 cur_inconsistent
.insert(object_errors
.begin(), object_errors
.end());
1194 if (!cur_missing
.empty()) {
1195 missing
[*k
] = cur_missing
;
1197 if (!cur_inconsistent
.empty()) {
1198 inconsistent
[*k
] = cur_inconsistent
;
1202 std::optional
<uint32_t> data_digest
, omap_digest
;
1203 ceph_assert(auth_object
.digest_present
);
1204 data_digest
= auth_object
.digest
;
1205 if (auth_object
.omap_digest_present
) {
1206 omap_digest
= auth_object
.omap_digest
;
1208 missing_digest
[*k
] = make_pair(data_digest
, omap_digest
);
1210 if (!cur_inconsistent
.empty() || !cur_missing
.empty()) {
1211 authoritative
[*k
] = auth_list
;
1212 } else if (!fix_digest
&& parent
->get_pool().is_replicated()) {
1219 if (auth_object
.digest_present
&& !auth_oi
.is_data_digest()) {
1220 dout(20) << __func__
<< " missing data digest on " << *k
<< dendl
;
1223 if (auth_object
.omap_digest_present
&& !auth_oi
.is_omap_digest()) {
1224 dout(20) << __func__
<< " missing omap digest on " << *k
<< dendl
;
1228 // recorded digest != actual digest?
1229 if (auth_oi
.is_data_digest() && auth_object
.digest_present
&&
1230 auth_oi
.data_digest
!= auth_object
.digest
) {
1231 ceph_assert(shard_map
[auth
->first
].has_data_digest_mismatch_info());
1232 errorstream
<< pgid
<< " recorded data digest 0x"
1233 << std::hex
<< auth_oi
.data_digest
<< " != on disk 0x"
1234 << auth_object
.digest
<< std::dec
<< " on " << auth_oi
.soid
1239 if (auth_oi
.is_omap_digest() && auth_object
.omap_digest_present
&&
1240 auth_oi
.omap_digest
!= auth_object
.omap_digest
) {
1241 ceph_assert(shard_map
[auth
->first
].has_omap_digest_mismatch_info());
1242 errorstream
<< pgid
<< " recorded omap digest 0x"
1243 << std::hex
<< auth_oi
.omap_digest
<< " != on disk 0x"
1244 << auth_object
.omap_digest
<< std::dec
1245 << " on " << auth_oi
.soid
<< "\n";
1251 utime_t age
= now
- auth_oi
.local_mtime
;
1252 if (update
== FORCE
||
1253 age
> cct
->_conf
->osd_deep_scrub_update_digest_min_age
) {
1254 std::optional
<uint32_t> data_digest
, omap_digest
;
1255 if (auth_object
.digest_present
) {
1256 data_digest
= auth_object
.digest
;
1257 dout(20) << __func__
<< " will update data digest on " << *k
<< dendl
;
1259 if (auth_object
.omap_digest_present
) {
1260 omap_digest
= auth_object
.omap_digest
;
1261 dout(20) << __func__
<< " will update omap digest on " << *k
<< dendl
;
1263 missing_digest
[*k
] = make_pair(data_digest
, omap_digest
);
1265 dout(20) << __func__
<< " missing digest but age " << age
1266 << " < " << cct
->_conf
->osd_deep_scrub_update_digest_min_age
1267 << " on " << *k
<< dendl
;
1272 if (object_error
.has_deep_errors())
1274 else if (object_error
.has_shallow_errors())
1276 if (object_error
.errors
|| object_error
.union_shards
.errors
) {
1277 store
->add_object_error(k
->pool
, object_error
);
1282 void PGBackend::be_omap_checks(const map
<pg_shard_t
,ScrubMap
*> &maps
,
1283 const set
<hobject_t
> &master_set
,
1284 omap_stat_t
& omap_stats
,
1285 ostream
&warnstream
) const
1287 bool needs_omap_check
= false;
1288 for (const auto& map
: maps
) {
1289 if (map
.second
->has_large_omap_object_errors
|| map
.second
->has_omap_keys
) {
1290 needs_omap_check
= true;
1295 if (!needs_omap_check
) {
1296 return; // Nothing to do
1299 // Iterate through objects and update omap stats
1300 for (const auto& k
: master_set
) {
1301 for (const auto& map
: maps
) {
1302 if (map
.first
!= get_parent()->primary_shard()) {
1303 // Only set omap stats for the primary
1306 auto it
= map
.second
->objects
.find(k
);
1307 if (it
== map
.second
->objects
.end())
1309 ScrubMap::object
& obj
= it
->second
;
1310 omap_stats
.omap_bytes
+= obj
.object_omap_bytes
;
1311 omap_stats
.omap_keys
+= obj
.object_omap_keys
;
1312 if (obj
.large_omap_object_found
) {
1314 auto osdmap
= get_osdmap();
1315 osdmap
->map_to_pg(k
.pool
, k
.oid
.name
, k
.get_key(), k
.nspace
, &pg
);
1316 pg_t mpg
= osdmap
->raw_pg_to_pg(pg
);
1317 omap_stats
.large_omap_objects
++;
1318 warnstream
<< "Large omap object found. Object: " << k
1319 << " PG: " << pg
<< " (" << mpg
<< ")"
1320 << " Key count: " << obj
.large_omap_object_key_count
1321 << " Size (bytes): " << obj
.large_omap_object_value_size