1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 #include "PrimaryLogScrub.h"
6 #include "common/scrub_types.h"
8 #include "PeeringState.h"
9 #include "PrimaryLogPG.h"
10 #include "scrub_machine.h"
12 #define dout_context (m_pg->cct)
13 #define dout_subsys ceph_subsys_osd
15 #define dout_prefix _prefix(_dout, this->m_pg)
/// Writes the log-line prefix used by every dout() in this file.
///
/// The prefix is whatever `t->gen_prefix()` emits, followed by the literal
/// " PrimaryLog scrubber pg(", the value of `t->pg_id`, and ") ".
///
/// @param _dout  stream the log macros are writing to (dereferenced as-is)
/// @param t      object providing `gen_prefix(std::ostream&)` and `pg_id`
/// @return       the stream returned by the chained insertions, so the caller
///               can keep appending the actual message
///
/// The return type is spelled `std::ostream&` to match the parameter, so the
/// helper does not depend on a using-directive being in effect.
template <class T>
static std::ostream& _prefix(std::ostream* _dout, T* t)
{
  return t->gen_prefix(*_dout) << " PrimaryLog scrubber pg(" << t->pg_id << ") ";
}
22 using namespace Scrub
;
23 using Scrub::ScrubMachine
;
// Queries m_store for scrub errors recorded against this PG's pool
// (m_pg->get_pgid().pool()).
//
// arg.get_snapsets selects the query: get_snap_errors() when it is set;
// the get_object_errors() call further down stores its result in
// res_inout.vals. Both queries resume after arg.start_after.
//
// NOTE(review): how an unset m_store is handled, and which value is returned
// to the caller, cannot be established from the statements shown - confirm.
25 bool PrimaryLogScrub::get_store_errors(const scrub_ls_arg_t
& arg
,
26 scrub_ls_result_t
& res_inout
) const
// Snapset-level errors were requested.
32 if (arg
.get_snapsets
) {
34 m_store
->get_snap_errors(m_pg
->get_pgid().pool(), arg
.start_after
, arg
.max_return
);
// Object-level errors; the result lands in res_inout.vals.
36 res_inout
.vals
= m_store
->get_object_errors(m_pg
->get_pgid().pool(), arg
.start_after
,
// Reconciles the statistics gathered by the scrub (m_scrub_cstat) with the
// statistics recorded for the PG (m_pg->info.stats):
//  - when the recorded stats are flagged stats_invalid, they are overwritten
//    with the scrubbed values through recovery_state.update_stats();
//  - both sets of counters are logged at debug level 10;
//  - when the counters disagree, a "stat mismatch" error is sent to the
//    cluster log (m_osds->clog).
// NOTE(review): whether the stats rewrite / publish sequence that follows the
// mismatch report, and the final object_contexts.clear(), are conditional
// (e.g. on a repair mode) cannot be told from the statements shown - confirm.
42 void PrimaryLogScrub::_scrub_finish()
44 auto& info
= m_pg
->info
; ///< a temporary alias
47 << " info stats: " << (info
.stats
.stats_invalid
? "invalid" : "valid")
// Recorded stats are marked invalid: take the scrubbed values as the truth.
// NOTE(review): [=] captures `this` implicitly here (deprecated since C++20);
// the second update_stats() call below spells the capture as [this].
50 if (info
.stats
.stats_invalid
) {
51 m_pl_pg
->recovery_state
.update_stats([=](auto& history
, auto& stats
) {
52 stats
.stats
= m_scrub_cstat
;
53 stats
.stats_invalid
= false;
// Let the tiering agent (if one is attached) re-pick its mode.
57 if (m_pl_pg
->agent_state
)
58 m_pl_pg
->agent_choose_mode();
// Debug dump; each pair is "<scrubbed>/<recorded>".
61 dout(10) << m_mode_desc
<< " got " << m_scrub_cstat
.sum
.num_objects
<< "/"
62 << info
.stats
.stats
.sum
.num_objects
<< " objects, "
63 << m_scrub_cstat
.sum
.num_object_clones
<< "/"
64 << info
.stats
.stats
.sum
.num_object_clones
<< " clones, "
65 << m_scrub_cstat
.sum
.num_objects_dirty
<< "/"
66 << info
.stats
.stats
.sum
.num_objects_dirty
<< " dirty, "
67 << m_scrub_cstat
.sum
.num_objects_omap
<< "/"
68 << info
.stats
.stats
.sum
.num_objects_omap
<< " omap, "
69 << m_scrub_cstat
.sum
.num_objects_pinned
<< "/"
70 << info
.stats
.stats
.sum
.num_objects_pinned
<< " pinned, "
71 << m_scrub_cstat
.sum
.num_objects_hit_set_archive
<< "/"
72 << info
.stats
.stats
.sum
.num_objects_hit_set_archive
<< " hit_set_archive, "
73 << m_scrub_cstat
.sum
.num_bytes
<< "/" << info
.stats
.stats
.sum
.num_bytes
74 << " bytes, " << m_scrub_cstat
.sum
.num_objects_manifest
<< "/"
75 << info
.stats
.stats
.sum
.num_objects_manifest
<< " manifest objects, "
76 << m_scrub_cstat
.sum
.num_bytes_hit_set_archive
<< "/"
77 << info
.stats
.stats
.sum
.num_bytes_hit_set_archive
<< " hit_set_archive bytes."
// Mismatch test. num_objects, num_object_clones, num_whiteouts and num_bytes
// are always compared; the dirty / omap / pinned / hit_set / manifest
// counters only count as a mismatch when their corresponding
// *_stats_invalid flag is clear.
80 if (m_scrub_cstat
.sum
.num_objects
!= info
.stats
.stats
.sum
.num_objects
||
81 m_scrub_cstat
.sum
.num_object_clones
!= info
.stats
.stats
.sum
.num_object_clones
||
82 (m_scrub_cstat
.sum
.num_objects_dirty
!= info
.stats
.stats
.sum
.num_objects_dirty
&&
83 !info
.stats
.dirty_stats_invalid
) ||
84 (m_scrub_cstat
.sum
.num_objects_omap
!= info
.stats
.stats
.sum
.num_objects_omap
&&
85 !info
.stats
.omap_stats_invalid
) ||
86 (m_scrub_cstat
.sum
.num_objects_pinned
!= info
.stats
.stats
.sum
.num_objects_pinned
&&
87 !info
.stats
.pin_stats_invalid
) ||
88 (m_scrub_cstat
.sum
.num_objects_hit_set_archive
!=
89 info
.stats
.stats
.sum
.num_objects_hit_set_archive
&&
90 !info
.stats
.hitset_stats_invalid
) ||
91 (m_scrub_cstat
.sum
.num_bytes_hit_set_archive
!=
92 info
.stats
.stats
.sum
.num_bytes_hit_set_archive
&&
93 !info
.stats
.hitset_bytes_stats_invalid
) ||
94 (m_scrub_cstat
.sum
.num_objects_manifest
!=
95 info
.stats
.stats
.sum
.num_objects_manifest
&&
96 !info
.stats
.manifest_stats_invalid
) ||
97 m_scrub_cstat
.sum
.num_whiteouts
!= info
.stats
.stats
.sum
.num_whiteouts
||
98 m_scrub_cstat
.sum
.num_bytes
!= info
.stats
.stats
.sum
.num_bytes
) {
// Report the mismatch to the cluster log, again as "<scrubbed>/<recorded>".
99 m_osds
->clog
->error() << info
.pgid
<< " " << m_mode_desc
<< " : stat mismatch, got "
100 << m_scrub_cstat
.sum
.num_objects
<< "/"
101 << info
.stats
.stats
.sum
.num_objects
<< " objects, "
102 << m_scrub_cstat
.sum
.num_object_clones
<< "/"
103 << info
.stats
.stats
.sum
.num_object_clones
<< " clones, "
104 << m_scrub_cstat
.sum
.num_objects_dirty
<< "/"
105 << info
.stats
.stats
.sum
.num_objects_dirty
<< " dirty, "
106 << m_scrub_cstat
.sum
.num_objects_omap
<< "/"
107 << info
.stats
.stats
.sum
.num_objects_omap
<< " omap, "
108 << m_scrub_cstat
.sum
.num_objects_pinned
<< "/"
109 << info
.stats
.stats
.sum
.num_objects_pinned
<< " pinned, "
110 << m_scrub_cstat
.sum
.num_objects_hit_set_archive
<< "/"
111 << info
.stats
.stats
.sum
.num_objects_hit_set_archive
112 << " hit_set_archive, " << m_scrub_cstat
.sum
.num_whiteouts
113 << "/" << info
.stats
.stats
.sum
.num_whiteouts
<< " whiteouts, "
114 << m_scrub_cstat
.sum
.num_bytes
<< "/"
115 << info
.stats
.stats
.sum
.num_bytes
<< " bytes, "
116 << m_scrub_cstat
.sum
.num_objects_manifest
<< "/"
117 << info
.stats
.stats
.sum
.num_objects_manifest
118 << " manifest objects, "
119 << m_scrub_cstat
.sum
.num_bytes_hit_set_archive
<< "/"
120 << info
.stats
.stats
.sum
.num_bytes_hit_set_archive
121 << " hit_set_archive bytes.";
// Overwrite the recorded stats with the scrubbed ones and clear every
// per-category "invalid" flag, since all categories were just recounted.
126 m_pl_pg
->recovery_state
.update_stats([this](auto& history
, auto& stats
) {
127 stats
.stats
= m_scrub_cstat
;
128 stats
.dirty_stats_invalid
= false;
129 stats
.omap_stats_invalid
= false;
130 stats
.hitset_stats_invalid
= false;
131 stats
.hitset_bytes_stats_invalid
= false;
132 stats
.pin_stats_invalid
= false;
133 stats
.manifest_stats_invalid
= false;
// Make the corrected stats visible: publish to the OSD and share the pg info.
136 m_pl_pg
->publish_stats_to_osd();
137 m_pl_pg
->recovery_state
.share_pg_info();
140 // Clear object context cache to get repair information
142 m_pl_pg
->object_contexts
.clear();
145 static bool doing_clones(const std::optional
<SnapSet
>& snapset
,
146 const vector
<snapid_t
>::reverse_iterator
& curclone
)
148 return snapset
&& curclone
!= snapset
->clones
.rend();
// Reports that `missing` clone(s) of `*head` were not found.
//
// Two messages are visible below:
//  - a debug (level 20) line "skipped N clone(s) in cache tier", guarded by
//    allow_incomplete_clones;
//  - a cluster-log info() entry "N missing clone(s)".
// NOTE(review): presumably the second message is the else-branch of the
// first, and `head` is expected to hold a value (it is dereferenced in both
// messages) - confirm. The clog / pgid / func arguments used below are
// declared on parameter lines that are not shown here.
151 void PrimaryLogScrub::log_missing(int missing
,
152 const std::optional
<hobject_t
>& head
,
156 bool allow_incomplete_clones
)
// Incomplete clone sets are tolerated: only leave a debug trace.
159 if (allow_incomplete_clones
) {
160 dout(20) << func
<< " " << m_mode_desc
<< " " << pgid
<< " " << *head
<< " skipped "
161 << missing
<< " clone(s) in cache tier" << dendl
;
// Cluster-log report of the missing clones.
163 clog
->info() << m_mode_desc
<< " " << pgid
<< " " << *head
<< " : " << missing
164 << " missing clone(s)";
// Walks the expected clones of `*head` (through *curclone, a reverse iterator
// over snapset->clones) for as long as doing_clones() holds and either no
// target is given or the current clone id is greater than *target.
//
// When allow_incomplete_clones is false, each clone visited is reported to
// the cluster log as an expected-but-missing clone and recorded on `e` via
// set_clone_missing().
//
// Returns missing_count.
// NOTE(review): the statements that increment missing_count and advance
// *curclone are not among the lines shown here - presumably one increment of
// each per loop iteration; confirm. `snapset` must hold a value (asserted),
// and `head` is dereferenced unconditionally.
168 int PrimaryLogScrub::process_clones_to(const std::optional
<hobject_t
>& head
,
169 const std::optional
<SnapSet
>& snapset
,
172 bool allow_incomplete_clones
,
173 std::optional
<snapid_t
> target
,
174 vector
<snapid_t
>::reverse_iterator
* curclone
,
175 inconsistent_snapset_wrapper
& e
)
178 ceph_assert(snapset
);
179 int missing_count
= 0;
181 // NOTE: clones are in descending order, thus **curclone > target test here
// next_clone starts as a copy of the head; only its snap field is rewritten.
182 hobject_t
next_clone(*head
);
183 while (doing_clones(snapset
, *curclone
) && (!target
|| **curclone
> *target
)) {
186 // it is okay to be missing one or more clones in a cache tier.
187 // skip higher-numbered clones in the list.
188 if (!allow_incomplete_clones
) {
189 next_clone
.snap
= **curclone
;
190 clog
->error() << m_mode_desc
<< " " << pgid
<< " " << *head
<< " : expected clone "
191 << next_clone
<< " " << m_missing
<< " missing";
193 e
.set_clone_missing(next_clone
.snap
);
195 // Clones are descending
198 return missing_count
;
202 * Validate consistency of the object info and snap sets.
204 * We are sort of comparing 2 lists. The main loop is on objmap.objects. But
205 * the comparison of the objects is against multiple snapset.clones. There are
206 * multiple clone lists and in between lists we expect head.
212 * obj1 snap 1 head, unexpected obj1 snap 1
213 * obj2 head head, match
214 * [SnapSet clones 6 4 2 1]
215 * obj2 snap 7 obj2 snap 6, unexpected obj2 snap 7
216 * obj2 snap 6 obj2 snap 6, match
217 * obj2 snap 4 obj2 snap 4, match
218 * obj3 head obj2 snap 2 (expected), obj2 snap 1 (expected), match
219 * [SnapSet clones 3 1]
220 * obj3 snap 3 obj3 snap 3 match
221 * obj3 snap 1 obj3 snap 1 match
222 * obj4 head head, match
224 * EOL obj4 snap 4, (expected)
// Validates object-info and snapset metadata for every object in `scrubmap`,
// then records the digests listed in `missing_digest`.
//
// First part: scrubmap.objects is traversed in reverse order. For each
// object the code
//  - decodes its object-info attribute (OI_ATTR) and checks the recorded
//    size against the on-disk size;
//  - accumulates per-object counters into a local object_stat_sum_t that is
//    added to m_scrub_cstat;
//  - for heads, decodes the snapset attribute (SS_ATTR) and starts a new
//    clone walk (curclone = snapset->clones.rbegin());
//  - for clones, matches the object against the clone expected next and
//    cross-checks clone_size / clone_overlap.
// Problems are written to the cluster log (m_osds->clog) and recorded in
// m_store through add_snap_error().
//
// Second part: for every entry of `missing_digest` an op context is created
// that sets or clears the object's data / omap digests and is submitted via
// simple_opc_submit().
//
// NOTE(review): several declarations used below (e.g. `missing`, `expected`,
// `bv`, `bl`) and the else / try keywords separating some branches are not
// among the lines shown here; the branch structure described in the comments
// is the presumed one - confirm against the full function.
226 void PrimaryLogScrub::scrub_snapshot_metadata(ScrubMap
& scrubmap
,
227 const missing_map_t
& missing_digest
)
229 dout(10) << __func__
<< " num stat obj " << m_pl_pg
->info
.stats
.stats
.sum
.num_objects
232 auto& info
= m_pl_pg
->info
;
233 const PGPool
& pool
= m_pl_pg
->pool
;
// Cached once; passed to process_clones_to() / log_missing() further down.
234 bool allow_incomplete_clones
= pool
.info
.allow_incomplete_clones();
236 std::optional
<snapid_t
> all_clones
; // Unspecified snapid_t or std::nullopt
238 // traverse in reverse order.
239 std::optional
<hobject_t
> head
;
240 std::optional
<SnapSet
> snapset
; // If initialized so will head (above)
241 vector
<snapid_t
>::reverse_iterator curclone
; // Defined only if snapset initialized
// soid_error describes the object being visited; head_error collects what is
// found for the current head (including missing / unexpected clones).
243 inconsistent_snapset_wrapper soid_error
, head_error
;
244 int soid_error_count
= 0;
246 for (auto p
= scrubmap
.objects
.rbegin(); p
!= scrubmap
.objects
.rend(); ++p
) {
248 const hobject_t
& soid
= p
->first
;
249 ceph_assert(!soid
.is_snapdir());
// Start with a clean error record for this object.
250 soid_error
= inconsistent_snapset_wrapper
{soid
};
251 object_stat_sum_t stat
;
252 std::optional
<object_info_t
> oi
;
// Objects living in the configured hit-set namespace are counted separately.
256 if (soid
.nspace
== m_pl_pg
->cct
->_conf
->osd_hit_set_namespace
)
257 stat
.num_objects_hit_set_archive
++;
259 if (soid
.is_snap()) {
261 stat
.num_object_clones
++;
// --- object-info attribute: presence, decodability, size ---
265 if (p
->second
.attrs
.count(OI_ATTR
) == 0) {
267 m_osds
->clog
->error() << m_mode_desc
<< " " << info
.pgid
<< " " << soid
<< " : no '"
268 << OI_ATTR
<< "' attr";
270 soid_error
.set_info_missing();
273 bv
.push_back(p
->second
.attrs
[OI_ATTR
]);
275 oi
= object_info_t(); // Initialize optional<> before decode into it
// Decoding failed: flag the info as both corrupted and missing.
277 } catch (ceph::buffer::error
& e
) {
279 m_osds
->clog
->error() << m_mode_desc
<< " " << info
.pgid
<< " " << soid
280 << " : can't decode '" << OI_ATTR
<< "' attr " << e
.what();
282 soid_error
.set_info_corrupted();
283 soid_error
.set_info_missing(); // Not available too
// The size recorded in the object info, converted by be_get_ondisk_size(),
// must equal the size found by the scrub.
// NOTE(review): `oi` is dereferenced here; presumably this is only reached
// when the decode above succeeded - confirm.
288 if (m_pl_pg
->pgbackend
->be_get_ondisk_size(oi
->size
) != p
->second
.size
) {
289 m_osds
->clog
->error() << m_mode_desc
<< " " << info
.pgid
<< " " << soid
290 << " : on disk size (" << p
->second
.size
291 << ") does not match object info size (" << oi
->size
292 << ") adjusted for ondisk to ("
293 << m_pl_pg
->pgbackend
->be_get_ondisk_size(oi
->size
) << ")";
294 soid_error
.set_size_mismatch();
298 dout(20) << m_mode_desc
<< " " << soid
<< " " << *oi
<< dendl
;
300 // A clone num_bytes will be added later when we have snapset
301 if (!soid
.is_snap()) {
302 stat
.num_bytes
+= oi
->size
;
304 if (soid
.nspace
== m_pl_pg
->cct
->_conf
->osd_hit_set_namespace
)
305 stat
.num_bytes_hit_set_archive
+= oi
->size
;
// Per-flag counters derived from the object info.
// NOTE(review): the conditions guarding the dirty and omap increments are not
// among the lines shown here.
308 ++stat
.num_objects_dirty
;
309 if (oi
->is_whiteout())
310 ++stat
.num_whiteouts
;
312 ++stat
.num_objects_omap
;
313 if (oi
->is_cache_pinned())
314 ++stat
.num_objects_pinned
;
315 if (oi
->has_manifest())
316 ++stat
.num_objects_manifest
;
319 // Check for any problems while processing clones
320 if (doing_clones(snapset
, curclone
)) {
321 std::optional
<snapid_t
> target
;
322 // Expecting an object with snap for current head
323 if (soid
.has_snapset() || soid
.get_head() != head
->get_head()) {
325 dout(10) << __func__
<< " " << m_mode_desc
<< " " << info
.pgid
<< " new object " << soid
326 << " while processing " << *head
<< dendl
;
330 ceph_assert(soid
.is_snap());
334 // Log any clones we were expecting to be there up to target
335 // This will set missing, but will be a no-op if snap.soid == *curclone.
337 process_clones_to(head
, snapset
, m_osds
->clog
, info
.pgid
,
338 allow_incomplete_clones
, target
, &curclone
, head_error
);
342 // Check doing_clones() again in case we ran process_clones_to()
343 if (doing_clones(snapset
, curclone
)) {
344 // A head would have processed all clones above
345 // or all greater than *curclone.
346 ceph_assert(soid
.is_snap() && *curclone
<= soid
.snap
);
348 // After processing above clone snap should match the expected curclone
349 expected
= (*curclone
== soid
.snap
);
351 // If we aren't doing clones any longer, then expecting head
352 expected
= soid
.has_snapset();
// --- the object is not the one expected next ---
355 // If we couldn't read the head's snapset, just ignore clones
356 if (head
&& !snapset
) {
357 m_osds
->clog
->error() << m_mode_desc
<< " " << info
.pgid
<< " " << soid
358 << " : clone ignored due to missing snapset";
360 m_osds
->clog
->error() << m_mode_desc
<< " " << info
.pgid
<< " " << soid
361 << " : is an unexpected clone";
// Record the stray clone in the store, and on its head when they match.
364 soid_error
.set_headless();
365 m_store
->add_snap_error(pool
.id
, soid_error
);
367 if (head
&& soid
.get_head() == head
->get_head())
368 head_error
.set_clone(soid
.snap
);
// --- a head object: close the previous head, open a new clone walk ---
373 if (soid
.has_snapset()) {
// NOTE(review): this call re-evaluates pool.info.allow_incomplete_clones(),
// whereas the log_missing() call after the loop passes the cached local.
376 log_missing(missing
, head
, m_osds
->clog
, info
.pgid
, __func__
,
377 pool
.info
.allow_incomplete_clones());
380 // Save previous head error information
381 if (head
&& (head_error
.errors
|| soid_error_count
))
382 m_store
->add_snap_error(pool
.id
, head_error
);
383 // Set this as a new head object
386 head_error
= soid_error
;
387 soid_error_count
= 0;
389 dout(20) << __func__
<< " " << m_mode_desc
<< " new head " << head
<< dendl
;
// Snapset attribute: presence and decodability. Either failure leaves
// `snapset` disengaged and flags head_error.
391 if (p
->second
.attrs
.count(SS_ATTR
) == 0) {
392 m_osds
->clog
->error() << m_mode_desc
<< " " << info
.pgid
<< " " << soid
<< " : no '"
393 << SS_ATTR
<< "' attr";
395 snapset
= std::nullopt
;
396 head_error
.set_snapset_missing();
399 bl
.push_back(p
->second
.attrs
[SS_ATTR
]);
400 auto blp
= bl
.cbegin();
402 snapset
= SnapSet(); // Initialize optional<> before decoding into it
403 decode(*snapset
, blp
);
// Keep the raw snapset buffer alongside the head's error record.
404 head_error
.ss_bl
.push_back(p
->second
.attrs
[SS_ATTR
]);
405 } catch (ceph::buffer::error
& e
) {
406 snapset
= std::nullopt
;
407 m_osds
->clog
->error()
408 << m_mode_desc
<< " " << info
.pgid
<< " " << soid
<< " : can't decode '" << SS_ATTR
409 << "' attr " << e
.what();
411 head_error
.set_snapset_corrupted();
416 // what will be next?
// NOTE(review): dereferences `snapset`; presumably guarded by a check that it
// holds a value - confirm.
417 curclone
= snapset
->clones
.rbegin();
419 if (!snapset
->clones
.empty()) {
420 dout(20) << " snapset " << *snapset
<< dendl
;
// A snapset that lists clones but has seq == 0 is reported as an error.
421 if (snapset
->seq
== 0) {
422 m_osds
->clog
->error()
423 << m_mode_desc
<< " " << info
.pgid
<< " " << soid
<< " : snaps.seq not set";
425 head_error
.set_snapset_error();
// --- a clone object that matched the expected position in the walk ---
430 ceph_assert(soid
.is_snap());
432 ceph_assert(snapset
);
433 ceph_assert(soid
.snap
== *curclone
);
435 dout(20) << __func__
<< " " << m_mode_desc
<< " matched clone " << soid
<< dendl
;
// The clone must have an entry in clone_size ...
437 if (snapset
->clone_size
.count(soid
.snap
) == 0) {
438 m_osds
->clog
->error() << m_mode_desc
<< " " << info
.pgid
<< " " << soid
439 << " : is missing in clone_size";
441 soid_error
.set_size_mismatch();
// ... and that entry must agree with the size in the object info.
// NOTE(review): clone_size[...] uses operator[]; presumably this is only
// reached when the count() check above found the key - confirm.
443 if (oi
&& oi
->size
!= snapset
->clone_size
[soid
.snap
]) {
444 m_osds
->clog
->error()
445 << m_mode_desc
<< " " << info
.pgid
<< " " << soid
<< " : size " << oi
->size
446 << " != clone_size " << snapset
->clone_size
[*curclone
];
448 soid_error
.set_size_mismatch();
// The clone must also have an entry in clone_overlap.
451 if (snapset
->clone_overlap
.count(soid
.snap
) == 0) {
452 m_osds
->clog
->error() << m_mode_desc
<< " " << info
.pgid
<< " " << soid
453 << " : is missing in clone_overlap";
455 soid_error
.set_size_mismatch();
457 // This checking is based on get_clone_bytes(). The first 2 asserts
458 // can't happen because we know we have a clone_size and
459 // a clone_overlap. Now we check that the interval_set won't
460 // cause the last assert.
461 uint64_t size
= snapset
->clone_size
.find(soid
.snap
)->second
;
462 const interval_set
<uint64_t>& overlap
=
463 snapset
->clone_overlap
.find(soid
.snap
)->second
;
// An overlap interval longer than the clone's size marks the set as bad.
464 bool bad_interval_set
= false;
465 for (interval_set
<uint64_t>::const_iterator i
= overlap
.begin();
466 i
!= overlap
.end(); ++i
) {
467 if (size
< i
.get_len()) {
468 bad_interval_set
= true;
474 if (bad_interval_set
) {
475 m_osds
->clog
->error() << m_mode_desc
<< " " << info
.pgid
<< " " << soid
476 << " : bad interval_set in clone_overlap";
478 soid_error
.set_size_mismatch();
// Clone bytes are taken from the snapset rather than from the object info.
480 stat
.num_bytes
+= snapset
->get_clone_bytes(soid
.snap
);
// Persist whatever was found wrong with this object.
487 if (soid_error
.errors
) {
488 m_store
->add_snap_error(pool
.id
, soid_error
);
// Fold this object's counters into the scrub-wide statistics.
492 m_scrub_cstat
.add(stat
);
// --- after the loop: the last head may still have clones outstanding ---
495 if (doing_clones(snapset
, curclone
)) {
496 dout(10) << __func__
<< " " << m_mode_desc
<< " " << info
.pgid
497 << " No more objects while processing " << *head
<< dendl
;
// all_clones is left disengaged, so every remaining clone is processed.
500 process_clones_to(head
, snapset
, m_osds
->clog
, info
.pgid
,
501 allow_incomplete_clones
, all_clones
, &curclone
, head_error
);
504 // There could be missing found by the test above or even
505 // before dropping out of the loop for the last head.
507 log_missing(missing
, head
, m_osds
->clog
, info
.pgid
, __func__
,
508 allow_incomplete_clones
);
510 if (head
&& (head_error
.errors
|| soid_error_count
))
511 m_store
->add_snap_error(pool
.id
, head_error
);
513 dout(20) << __func__
<< " - " << missing
<< " (" << missing_digest
.size() << ") missing"
// --- second part: record digests for the objects in missing_digest ---
515 for (auto p
= missing_digest
.begin(); p
!= missing_digest
.end(); ++p
) {
517 ceph_assert(!p
->first
.is_snapdir());
518 dout(10) << __func__
<< " recording digests for " << p
->first
<< dendl
;
520 ObjectContextRef obc
= m_pl_pg
->get_object_context(p
->first
, false);
522 m_osds
->clog
->error() << info
.pgid
<< " " << m_mode_desc
523 << " cannot get object context for object " << p
->first
;
// The context's own object id must match the key it was looked up with.
526 if (obc
->obs
.oi
.soid
!= p
->first
) {
527 m_osds
->clog
->error() << info
.pgid
<< " " << m_mode_desc
<< " " << p
->first
528 << " : object has a valid oi attr with a mismatched name, "
529 << " obc->obs.oi.soid: " << obc
->obs
.oi
.soid
;
// Build an op context that only rewrites the digests in the object info.
532 PrimaryLogPG::OpContextUPtr ctx
= m_pl_pg
->simple_opc_create(obc
);
533 ctx
->at_version
= m_pl_pg
->get_next_version();
534 ctx
->mtime
= utime_t(); // do not update mtime
// p->second.first is the data digest (set when present, otherwise cleared).
535 if (p
->second
.first
) {
536 ctx
->new_obs
.oi
.set_data_digest(*p
->second
.first
);
538 ctx
->new_obs
.oi
.clear_data_digest();
// p->second.second is the omap digest, handled the same way.
540 if (p
->second
.second
) {
541 ctx
->new_obs
.oi
.set_omap_digest(*p
->second
.second
);
543 ctx
->new_obs
.oi
.clear_omap_digest();
545 m_pl_pg
->finish_ctx(ctx
.get(), pg_log_entry_t::MODIFY
);
// num_digest_updates_pending is raised per submitted update and lowered in
// the success callback; when it drops to zero (or below) the scrub digest
// update is queued on the OSD.
// NOTE(review): the callback captures `this`; it relies on the scrubber
// outliving the op context - confirm.
547 ++num_digest_updates_pending
;
548 ctx
->register_on_success([this]() {
549 dout(20) << "updating scrub digest " << num_digest_updates_pending
<< dendl
;
550 if (--num_digest_updates_pending
<= 0) {
551 m_osds
->queue_scrub_digest_update(m_pl_pg
, m_pl_pg
->is_scrub_blocking_ops());
555 m_pl_pg
->simple_opc_submit(std::move(ctx
));
558 dout(10) << __func__
<< " (" << m_mode_desc
<< ") finish" << dendl
;
561 PrimaryLogScrub::PrimaryLogScrub(PrimaryLogPG
* pg
) : PgScrubber
{pg
}, m_pl_pg
{pg
} {}
563 void PrimaryLogScrub::_scrub_clear_state()
565 dout(15) << __func__
<< dendl
;
566 m_scrub_cstat
= object_stat_collection_t();
// Accounts for a stats delta on an object while a scrub may be running.
//
// When this is the primary and a scrub is active, `delta_stats` is added to
// m_scrub_cstat for objects ordered before m_start (soid < m_start).
//
// NOTE(review): the second dout(20) below (" >= [") presumably sits in the
// else-branch, where the delta is left out of the scrubber's accounting -
// the else itself is not among the lines shown here; confirm.
569 void PrimaryLogScrub::stats_of_handled_objects(const object_stat_sum_t
& delta_stats
,
570 const hobject_t
& soid
)
572 // We scrub objects in hobject_t order, so objects before m_start have already been
573 // scrubbed and their stats have already been added to the scrubber. Objects after that
574 // point haven't been included in the scrubber's stats accounting yet, so they will be
575 // included when the scrubber gets to that object.
576 dout(15) << __func__
<< " soid: " << soid
<< " scrub is active? " << is_scrub_active()
578 if (is_primary() && is_scrub_active()) {
// Already-scrubbed object: keep the scrubber's running totals in step.
579 if (soid
< m_start
) {
580 dout(20) << __func__
<< " " << soid
<< " < [" << m_start
<< "," << m_end
<< ")"
582 m_scrub_cstat
.add(delta_stats
);
// Object at or beyond m_start: only traced.
584 dout(20) << __func__
<< " " << soid
<< " >= [" << m_start
<< "," << m_end
<< ")"
// Answers whether `last_recovery_applied` has reached m_subset_last_update
// (the comparison in the final statement).
//
// When no scrub is active, the code first asserts that the scrubbed range is
// empty (m_start == m_end).
// NOTE(review): the value returned on the not-active path is not among the
// lines shown here - confirm.
590 bool PrimaryLogScrub::should_requeue_blocked_ops(eversion_t last_recovery_applied
) const
592 if (!is_scrub_active()) {
593 // just verify that things indeed are quiet
594 ceph_assert(m_start
== m_end
);
598 return last_recovery_applied
>= m_subset_last_update
;