1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
9 * Author: Loic Dachary <loic@dachary.org>
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
19 // re-include our assert to clobber boost's
20 #include "include/ceph_assert.h"
21 #include "include/common_fwd.h"
22 #include "osd_types.h"
23 #include "os/ObjectStore.h"
27 #include <seastar/core/future.hh>
28 #include "crimson/os/futurized_store.h"
29 #include "crimson/os/cyanstore/cyan_collection.h"
// Bit flags selecting which in-memory indexes of the PG log are (to be)
// built; stored/ORed together in IndexedLog::indexed_data.
constexpr auto PGLOG_INDEXED_OBJECTS          = 1 << 0;  // by-object index
constexpr auto PGLOG_INDEXED_CALLER_OPS       = 1 << 1;  // by-reqid index
constexpr auto PGLOG_INDEXED_EXTRA_CALLER_OPS = 1 << 2;  // extra_reqids index
constexpr auto PGLOG_INDEXED_DUPS             = 1 << 3;  // dup-entry index
36 constexpr auto PGLOG_INDEXED_ALL
= PGLOG_INDEXED_OBJECTS
37 | PGLOG_INDEXED_CALLER_OPS
38 | PGLOG_INDEXED_EXTRA_CALLER_OPS
41 struct PGLog
: DoutPrefixProvider
{
42 std::ostream
& gen_prefix(std::ostream
& out
) const override
{
45 unsigned get_subsys() const override
{
46 return static_cast<unsigned>(ceph_subsys_osd
);
48 CephContext
*get_cct() const override
{
52 ////////////////////////////// sub classes //////////////////////////////
53 struct LogEntryHandler
{
54 virtual void rollback(
55 const pg_log_entry_t
&entry
) = 0;
56 virtual void rollforward(
57 const pg_log_entry_t
&entry
) = 0;
59 const pg_log_entry_t
&entry
) = 0;
61 const hobject_t
&hoid
) = 0;
62 virtual void try_stash(
63 const hobject_t
&hoid
,
65 virtual ~LogEntryHandler() {}
67 using LogEntryHandlerRef
= unique_ptr
<LogEntryHandler
>;
71 * IndexLog - adds in-memory index of the log, by oid.
72 * plus some methods to manipulate it all.
74 struct IndexedLog
: public pg_log_t
{
75 mutable ceph::unordered_map
<hobject_t
,pg_log_entry_t
*> objects
; // ptrs into log. be careful!
76 mutable ceph::unordered_map
<osd_reqid_t
,pg_log_entry_t
*> caller_ops
;
77 mutable ceph::unordered_multimap
<osd_reqid_t
,pg_log_entry_t
*> extra_caller_ops
;
78 mutable ceph::unordered_map
<osd_reqid_t
,pg_log_dup_t
*> dup_index
;
81 list
<pg_log_entry_t
>::iterator complete_to
; // not inclusive of referenced item
82 version_t last_requested
= 0; // last object requested by primary
86 mutable __u16 indexed_data
= 0;
88 * rollback_info_trimmed_to_riter points to the first log entry <=
89 * rollback_info_trimmed_to
91 * It's a reverse_iterator because rend() is a natural representation for
92 * tail, and rbegin() works nicely for head.
94 mempool::osd_pglog::list
<pg_log_entry_t
>::reverse_iterator
95 rollback_info_trimmed_to_riter
;
98 * return true if we need to mark the pglog as dirty
100 template <typename F
>
101 bool advance_can_rollback_to(eversion_t to
, F
&&f
) {
102 bool dirty_log
= to
> can_rollback_to
|| to
> rollback_info_trimmed_to
;
104 if (to
> can_rollback_to
)
105 can_rollback_to
= to
;
107 if (to
> rollback_info_trimmed_to
)
108 rollback_info_trimmed_to
= to
;
111 while (rollback_info_trimmed_to_riter
!= log
.rbegin()) {
112 --rollback_info_trimmed_to_riter
;
113 if (rollback_info_trimmed_to_riter
->version
> rollback_info_trimmed_to
) {
114 ++rollback_info_trimmed_to_riter
;
117 f(*rollback_info_trimmed_to_riter
);
123 void reset_rollback_info_trimmed_to_riter() {
124 rollback_info_trimmed_to_riter
= log
.rbegin();
125 while (rollback_info_trimmed_to_riter
!= log
.rend() &&
126 rollback_info_trimmed_to_riter
->version
> rollback_info_trimmed_to
)
127 ++rollback_info_trimmed_to_riter
;
130 // indexes objects, caller ops and extra caller ops
133 complete_to(log
.end()),
136 rollback_info_trimmed_to_riter(log
.rbegin())
139 template <typename
... Args
>
140 explicit IndexedLog(Args
&&... args
) :
141 pg_log_t(std::forward
<Args
>(args
)...),
142 complete_to(log
.end()),
145 rollback_info_trimmed_to_riter(log
.rbegin())
147 reset_rollback_info_trimmed_to_riter();
151 IndexedLog(const IndexedLog
&rhs
) :
153 complete_to(log
.end()),
154 last_requested(rhs
.last_requested
),
156 rollback_info_trimmed_to_riter(log
.rbegin())
158 reset_rollback_info_trimmed_to_riter();
159 index(rhs
.indexed_data
);
162 IndexedLog
&operator=(const IndexedLog
&rhs
) {
164 new (this) IndexedLog(rhs
);
168 void trim_rollback_info_to(eversion_t to
, LogEntryHandler
*h
) {
169 advance_can_rollback_to(
171 [&](pg_log_entry_t
&entry
) {
175 bool roll_forward_to(eversion_t to
, LogEntryHandler
*h
) {
176 return advance_can_rollback_to(
178 [&](pg_log_entry_t
&entry
) {
179 h
->rollforward(entry
);
183 void skip_can_rollback_to_to_head() {
184 advance_can_rollback_to(head
, [&](const pg_log_entry_t
&entry
) {});
187 mempool::osd_pglog::list
<pg_log_entry_t
> rewind_from_head(eversion_t newhead
) {
188 auto divergent
= pg_log_t::rewind_from_head(newhead
);
190 reset_rollback_info_trimmed_to_riter();
194 template <typename T
>
196 const eversion_t
&bound
, ///< [in] scan entries > bound
198 auto iter
= log
.rbegin();
199 while (iter
!= log
.rend() && iter
->version
> bound
)
203 if (iter
== log
.rbegin())
210 void claim_log_and_clear_rollback_info(const pg_log_t
& o
) {
211 // we must have already trimmed the old entries
212 ceph_assert(rollback_info_trimmed_to
== head
);
213 ceph_assert(rollback_info_trimmed_to_riter
== log
.rbegin());
215 *this = IndexedLog(o
);
217 skip_can_rollback_to_to_head();
221 void split_out_child(
227 // we must have already trimmed the old entries
228 ceph_assert(rollback_info_trimmed_to
== head
);
229 ceph_assert(rollback_info_trimmed_to_riter
== log
.rbegin());
233 rollback_info_trimmed_to_riter
= log
.rbegin();
234 reset_recovery_pointers();
237 skip_can_rollback_to_to_head();
240 void reset_recovery_pointers() {
241 complete_to
= log
.end();
245 bool logged_object(const hobject_t
& oid
) const {
246 if (!(indexed_data
& PGLOG_INDEXED_OBJECTS
)) {
249 return objects
.count(oid
);
252 bool logged_req(const osd_reqid_t
&r
) const {
253 if (!(indexed_data
& PGLOG_INDEXED_CALLER_OPS
)) {
256 if (!caller_ops
.count(r
)) {
257 if (!(indexed_data
& PGLOG_INDEXED_EXTRA_CALLER_OPS
)) {
258 index_extra_caller_ops();
260 return extra_caller_ops
.count(r
);
266 const osd_reqid_t
&r
,
268 version_t
*user_version
,
270 vector
<pg_log_op_return_item_t
> *op_returns
) const
272 ceph_assert(version
);
273 ceph_assert(user_version
);
274 ceph_assert(return_code
);
275 ceph::unordered_map
<osd_reqid_t
,pg_log_entry_t
*>::const_iterator p
;
276 if (!(indexed_data
& PGLOG_INDEXED_CALLER_OPS
)) {
279 p
= caller_ops
.find(r
);
280 if (p
!= caller_ops
.end()) {
281 *version
= p
->second
->version
;
282 *user_version
= p
->second
->user_version
;
283 *return_code
= p
->second
->return_code
;
284 *op_returns
= p
->second
->op_returns
;
288 // warning: we will return *a* request for this reqid, but not
289 // necessarily the most recent.
290 if (!(indexed_data
& PGLOG_INDEXED_EXTRA_CALLER_OPS
)) {
291 index_extra_caller_ops();
293 p
= extra_caller_ops
.find(r
);
294 if (p
!= extra_caller_ops
.end()) {
296 for (auto i
= p
->second
->extra_reqids
.begin();
297 i
!= p
->second
->extra_reqids
.end();
300 *version
= p
->second
->version
;
301 *user_version
= i
->second
;
302 *return_code
= p
->second
->return_code
;
303 *op_returns
= p
->second
->op_returns
;
304 if (*return_code
>= 0) {
305 auto it
= p
->second
->extra_reqid_return_codes
.find(idx
);
306 if (it
!= p
->second
->extra_reqid_return_codes
.end()) {
307 *return_code
= it
->second
;
313 ceph_abort_msg("in extra_caller_ops but not extra_reqids");
316 if (!(indexed_data
& PGLOG_INDEXED_DUPS
)) {
319 auto q
= dup_index
.find(r
);
320 if (q
!= dup_index
.end()) {
321 *version
= q
->second
->version
;
322 *user_version
= q
->second
->user_version
;
323 *return_code
= q
->second
->return_code
;
324 *op_returns
= q
->second
->op_returns
;
331 bool has_write_since(const hobject_t
&oid
, const eversion_t
&bound
) const {
332 for (auto i
= log
.rbegin(); i
!= log
.rend(); ++i
) {
333 if (i
->version
<= bound
)
335 if (i
->soid
.get_head() == oid
.get_head())
341 /// get a (bounded) list of recent reqids for the given object
342 void get_object_reqids(const hobject_t
& oid
, unsigned max
,
343 mempool::osd_pglog::vector
<pair
<osd_reqid_t
, version_t
> > *pls
,
344 mempool::osd_pglog::map
<uint32_t, int> *return_codes
) const {
345 // make sure object is present at least once before we do an
347 if (!(indexed_data
& PGLOG_INDEXED_OBJECTS
)) {
350 if (objects
.count(oid
) == 0)
353 for (list
<pg_log_entry_t
>::const_reverse_iterator i
= log
.rbegin();
356 if (i
->soid
== oid
) {
357 if (i
->reqid_is_indexed()) {
358 if (i
->op
== pg_log_entry_t::ERROR
) {
359 // propagate op errors to the cache tier's PG log
360 return_codes
->emplace(pls
->size(), i
->return_code
);
362 pls
->push_back(make_pair(i
->reqid
, i
->user_version
));
365 pls
->insert(pls
->end(), i
->extra_reqids
.begin(), i
->extra_reqids
.end());
366 if (pls
->size() >= max
) {
367 if (pls
->size() > max
) {
376 void index(__u16 to_index
= PGLOG_INDEXED_ALL
) const {
377 // if to_index is 0, no need to run any of this code, especially
378 // loop below; this can happen with copy constructor for
379 // IndexedLog (and indirectly through assignment operator)
380 if (!to_index
) return;
382 if (to_index
& PGLOG_INDEXED_OBJECTS
)
384 if (to_index
& PGLOG_INDEXED_CALLER_OPS
)
386 if (to_index
& PGLOG_INDEXED_EXTRA_CALLER_OPS
)
387 extra_caller_ops
.clear();
388 if (to_index
& PGLOG_INDEXED_DUPS
) {
390 for (auto& i
: dups
) {
391 dup_index
[i
.reqid
] = const_cast<pg_log_dup_t
*>(&i
);
395 constexpr __u16 any_log_entry_index
=
396 PGLOG_INDEXED_OBJECTS
|
397 PGLOG_INDEXED_CALLER_OPS
|
398 PGLOG_INDEXED_EXTRA_CALLER_OPS
;
400 if (to_index
& any_log_entry_index
) {
401 for (list
<pg_log_entry_t
>::const_iterator i
= log
.begin();
404 if (to_index
& PGLOG_INDEXED_OBJECTS
) {
405 if (i
->object_is_indexed()) {
406 objects
[i
->soid
] = const_cast<pg_log_entry_t
*>(&(*i
));
410 if (to_index
& PGLOG_INDEXED_CALLER_OPS
) {
411 if (i
->reqid_is_indexed()) {
412 caller_ops
[i
->reqid
] = const_cast<pg_log_entry_t
*>(&(*i
));
416 if (to_index
& PGLOG_INDEXED_EXTRA_CALLER_OPS
) {
417 for (auto j
= i
->extra_reqids
.begin();
418 j
!= i
->extra_reqids
.end();
420 extra_caller_ops
.insert(
421 make_pair(j
->first
, const_cast<pg_log_entry_t
*>(&(*i
))));
427 indexed_data
|= to_index
;
430 void index_objects() const {
431 index(PGLOG_INDEXED_OBJECTS
);
434 void index_caller_ops() const {
435 index(PGLOG_INDEXED_CALLER_OPS
);
438 void index_extra_caller_ops() const {
439 index(PGLOG_INDEXED_EXTRA_CALLER_OPS
);
442 void index_dups() const {
443 index(PGLOG_INDEXED_DUPS
);
446 void index(pg_log_entry_t
& e
) {
447 if ((indexed_data
& PGLOG_INDEXED_OBJECTS
) && e
.object_is_indexed()) {
448 if (objects
.count(e
.soid
) == 0 ||
449 objects
[e
.soid
]->version
< e
.version
)
450 objects
[e
.soid
] = &e
;
452 if (indexed_data
& PGLOG_INDEXED_CALLER_OPS
) {
453 // divergent merge_log indexes new before unindexing old
454 if (e
.reqid_is_indexed()) {
455 caller_ops
[e
.reqid
] = &e
;
458 if (indexed_data
& PGLOG_INDEXED_EXTRA_CALLER_OPS
) {
459 for (auto j
= e
.extra_reqids
.begin();
460 j
!= e
.extra_reqids
.end();
462 extra_caller_ops
.insert(make_pair(j
->first
, &e
));
470 extra_caller_ops
.clear();
475 void unindex(const pg_log_entry_t
& e
) {
476 // NOTE: this only works if we remove from the _tail_ of the log!
477 if (indexed_data
& PGLOG_INDEXED_OBJECTS
) {
478 auto it
= objects
.find(e
.soid
);
479 if (it
!= objects
.end() && it
->second
->version
== e
.version
)
482 if (e
.reqid_is_indexed()) {
483 if (indexed_data
& PGLOG_INDEXED_CALLER_OPS
) {
484 auto it
= caller_ops
.find(e
.reqid
);
485 // divergent merge_log indexes new before unindexing old
486 if (it
!= caller_ops
.end() && it
->second
== &e
)
487 caller_ops
.erase(it
);
490 if (indexed_data
& PGLOG_INDEXED_EXTRA_CALLER_OPS
) {
491 for (auto j
= e
.extra_reqids
.begin();
492 j
!= e
.extra_reqids
.end();
494 for (ceph::unordered_multimap
<osd_reqid_t
,pg_log_entry_t
*>::iterator k
=
495 extra_caller_ops
.find(j
->first
);
496 k
!= extra_caller_ops
.end() && k
->first
== j
->first
;
498 if (k
->second
== &e
) {
499 extra_caller_ops
.erase(k
);
507 void index(pg_log_dup_t
& e
) {
508 if (indexed_data
& PGLOG_INDEXED_DUPS
) {
509 dup_index
[e
.reqid
] = &e
;
513 void unindex(const pg_log_dup_t
& e
) {
514 if (indexed_data
& PGLOG_INDEXED_DUPS
) {
515 auto i
= dup_index
.find(e
.reqid
);
516 if (i
!= dup_index
.end()) {
523 void add(const pg_log_entry_t
& e
, bool applied
= true) {
525 ceph_assert(get_can_rollback_to() == head
);
528 // make sure our buffers don't pin bigger buffers
529 e
.mod_desc
.trim_bl();
534 // riter previously pointed to the previous entry
535 if (rollback_info_trimmed_to_riter
== log
.rbegin())
536 ++rollback_info_trimmed_to_riter
;
538 ceph_assert(e
.version
> head
);
539 ceph_assert(head
.version
== 0 || e
.version
.version
> head
.version
);
543 if ((indexed_data
& PGLOG_INDEXED_OBJECTS
) && e
.object_is_indexed()) {
544 objects
[e
.soid
] = &(log
.back());
546 if (indexed_data
& PGLOG_INDEXED_CALLER_OPS
) {
547 if (e
.reqid_is_indexed()) {
548 caller_ops
[e
.reqid
] = &(log
.back());
552 if (indexed_data
& PGLOG_INDEXED_EXTRA_CALLER_OPS
) {
553 for (auto j
= e
.extra_reqids
.begin();
554 j
!= e
.extra_reqids
.end();
556 extra_caller_ops
.insert(make_pair(j
->first
, &(log
.back())));
561 skip_can_rollback_to_to_head();
568 set
<eversion_t
> *trimmed
,
569 set
<string
>* trimmed_dups
,
570 eversion_t
*write_from_dups
);
572 ostream
& print(ostream
& out
) const;
577 //////////////////// data members ////////////////////
579 pg_missing_tracker_t missing
;
582 eversion_t dirty_to
; ///< must clear/writeout all keys <= dirty_to
583 eversion_t dirty_from
; ///< must clear/writeout all keys >= dirty_from
584 eversion_t writeout_from
; ///< must write out keys >= writeout_from
585 set
<eversion_t
> trimmed
; ///< must clear keys in trimmed
586 eversion_t dirty_to_dups
; ///< must clear/writeout all dups <= dirty_to_dups
587 eversion_t dirty_from_dups
; ///< must clear/writeout all dups >= dirty_from_dups
588 eversion_t write_from_dups
; ///< must write keys >= write_from_dups
589 set
<string
> trimmed_dups
; ///< must clear keys in trimmed_dups
592 /// Log is clean on [dirty_to, dirty_from)
595 bool clear_divergent_priors
;
596 bool may_include_deletes_in_missing_dirty
= false;
598 void mark_dirty_to(eversion_t to
) {
602 void mark_dirty_from(eversion_t from
) {
603 if (from
< dirty_from
)
606 void mark_writeout_from(eversion_t from
) {
607 if (from
< writeout_from
)
608 writeout_from
= from
;
610 void mark_dirty_to_dups(eversion_t to
) {
611 if (to
> dirty_to_dups
)
614 void mark_dirty_from_dups(eversion_t from
) {
615 if (from
< dirty_from_dups
)
616 dirty_from_dups
= from
;
619 bool needs_write() const {
620 return !touched_log
|| is_dirty();
623 bool is_dirty() const {
625 (dirty_to
!= eversion_t()) ||
626 (dirty_from
!= eversion_t::max()) ||
627 (writeout_from
!= eversion_t::max()) ||
628 !(trimmed
.empty()) ||
629 !missing
.is_clean() ||
630 !(trimmed_dups
.empty()) ||
631 (dirty_to_dups
!= eversion_t()) ||
632 (dirty_from_dups
!= eversion_t::max()) ||
633 (write_from_dups
!= eversion_t::max()) ||
634 may_include_deletes_in_missing_dirty
;
637 void mark_log_for_rewrite() {
638 mark_dirty_to(eversion_t::max());
639 mark_dirty_from(eversion_t());
640 mark_dirty_to_dups(eversion_t::max());
641 mark_dirty_from_dups(eversion_t());
644 bool get_may_include_deletes_in_missing_dirty() const {
645 return may_include_deletes_in_missing_dirty
;
650 set
<string
> log_keys_debug
;
651 static void clear_after(set
<string
> *log_keys_debug
, const string
&lb
) {
654 for (set
<string
>::iterator i
= log_keys_debug
->lower_bound(lb
);
655 i
!= log_keys_debug
->end();
656 log_keys_debug
->erase(i
++));
658 static void clear_up_to(set
<string
> *log_keys_debug
, const string
&ub
) {
661 for (set
<string
>::iterator i
= log_keys_debug
->begin();
662 i
!= log_keys_debug
->end() && *i
< ub
;
663 log_keys_debug
->erase(i
++));
668 dirty_to
= eversion_t();
669 dirty_from
= eversion_t::max();
673 trimmed_dups
.clear();
674 writeout_from
= eversion_t::max();
677 dirty_to_dups
= eversion_t();
678 dirty_from_dups
= eversion_t::max();
679 write_from_dups
= eversion_t::max();
683 // cppcheck-suppress noExplicitConstructor
684 PGLog(CephContext
*cct
) :
685 dirty_from(eversion_t::max()),
686 writeout_from(eversion_t::max()),
687 dirty_from_dups(eversion_t::max()),
688 write_from_dups(eversion_t::max()),
690 pg_log_debug(!(cct
&& !(cct
->_conf
->osd_debug_pg_log_writeout
))),
693 clear_divergent_priors(false)
696 void reset_backfill();
700 //////////////////// get or set missing ////////////////////
702 const pg_missing_tracker_t
& get_missing() const { return missing
; }
704 void missing_add(const hobject_t
& oid
, eversion_t need
, eversion_t have
, bool is_delete
=false) {
705 missing
.add(oid
, need
, have
, is_delete
);
708 void missing_add_next_entry(const pg_log_entry_t
& e
) {
709 missing
.add_next_event(e
);
712 //////////////////// get or set log ////////////////////
714 const IndexedLog
&get_log() const { return log
; }
716 const eversion_t
&get_tail() const { return log
.tail
; }
718 void set_tail(eversion_t tail
) { log
.tail
= tail
; }
720 const eversion_t
&get_head() const { return log
.head
; }
722 void set_head(eversion_t head
) { log
.head
= head
; }
724 void set_last_requested(version_t last_requested
) {
725 log
.last_requested
= last_requested
;
728 void index() { log
.index(); }
730 void unindex() { log
.unindex(); }
732 void add(const pg_log_entry_t
& e
, bool applied
= true) {
733 mark_writeout_from(e
.version
);
/// Forward to IndexedLog: reset complete_to / recovery bookkeeping.
void reset_recovery_pointers() { log.reset_recovery_pointers(); }
739 static void clear_info_log(
741 ObjectStore::Transaction
*t
);
746 bool transaction_applied
= true,
749 void roll_forward_to(
750 eversion_t roll_forward_to
,
751 LogEntryHandler
*h
) {
752 if (log
.roll_forward_to(
758 eversion_t
get_can_rollback_to() const {
759 return log
.get_can_rollback_to();
762 void roll_forward(LogEntryHandler
*h
) {
768 void skip_rollforward() {
769 log
.skip_can_rollback_to_to_head();
772 //////////////////// get or set log & missing ////////////////////
774 void reset_backfill_claim_log(const pg_log_t
&o
, LogEntryHandler
*h
) {
775 log
.trim_rollback_info_to(log
.head
, h
);
776 log
.claim_log_and_clear_rollback_info(o
);
778 mark_dirty_to(eversion_t::max());
779 mark_dirty_to_dups(eversion_t::max());
786 log
.split_out_child(child_pgid
, split_bits
, &opg_log
->log
);
787 missing
.split_into(child_pgid
, split_bits
, &(opg_log
->missing
));
788 opg_log
->mark_dirty_to(eversion_t::max());
789 opg_log
->mark_dirty_to_dups(eversion_t::max());
790 mark_dirty_to(eversion_t::max());
791 mark_dirty_to_dups(eversion_t::max());
792 if (missing
.may_include_deletes
) {
793 opg_log
->set_missing_may_contain_deletes();
798 const vector
<PGLog
*>& sources
,
799 eversion_t last_update
) {
803 vector
<pg_log_t
*> slogs
;
804 for (auto s
: sources
) {
805 slogs
.push_back(&s
->log
);
807 log
.merge_from(slogs
, last_update
);
811 mark_log_for_rewrite();
814 void recover_got(hobject_t oid
, eversion_t v
, pg_info_t
&info
) {
815 if (missing
.is_missing(oid
, v
)) {
817 info
.stats
.stats
.sum
.num_objects_missing
= missing
.num_missing();
819 // raise last_complete?
820 if (missing
.get_items().empty()) {
821 log
.complete_to
= log
.log
.end();
822 info
.last_complete
= info
.last_update
;
824 auto oldest_need
= missing
.get_oldest_need();
825 while (log
.complete_to
!= log
.log
.end()) {
826 if (oldest_need
<= log
.complete_to
->version
)
828 if (info
.last_complete
< log
.complete_to
->version
)
829 info
.last_complete
= log
.complete_to
->version
;
834 ceph_assert(log
.get_can_rollback_to() >= v
);
837 void reset_complete_to(pg_info_t
*info
) {
838 if (log
.log
.empty()) // caller is split_into()
840 log
.complete_to
= log
.log
.begin();
841 ceph_assert(log
.complete_to
!= log
.log
.end());
842 auto oldest_need
= missing
.get_oldest_need();
843 if (oldest_need
!= eversion_t()) {
844 while (log
.complete_to
->version
< oldest_need
) {
846 ceph_assert(log
.complete_to
!= log
.log
.end());
851 if (log
.complete_to
== log
.log
.begin()) {
852 info
->last_complete
= eversion_t();
855 info
->last_complete
= log
.complete_to
->version
;
860 void activate_not_complete(pg_info_t
&info
) {
861 reset_complete_to(&info
);
862 log
.last_requested
= 0;
865 void proc_replica_log(pg_info_t
&oinfo
,
866 const pg_log_t
&olog
,
867 pg_missing_t
& omissing
, pg_shard_t from
) const;
869 void set_missing_may_contain_deletes() {
870 missing
.may_include_deletes
= true;
871 may_include_deletes_in_missing_dirty
= true;
874 void rebuild_missing_set_with_deletes(ObjectStore
*store
,
875 ObjectStore::CollectionHandle
& ch
,
876 const pg_info_t
&info
);
879 static void split_by_object(
880 mempool::osd_pglog::list
<pg_log_entry_t
> &entries
,
881 map
<hobject_t
, mempool::osd_pglog::list
<pg_log_entry_t
>> *out_entries
) {
882 while (!entries
.empty()) {
883 auto &out_list
= (*out_entries
)[entries
.front().soid
];
884 out_list
.splice(out_list
.end(), entries
, entries
.begin());
889 * _merge_object_divergent_entries
891 * There are 5 distinct cases:
892 * 1) There is a more recent update: in this case we assume we adjusted the
893 * store and missing during merge_log
894 * 2) The first entry in the divergent sequence is a create. This might
895 * either be because the object is a clone or because prior_version is
896 * eversion_t(). In this case the object does not exist and we must
897 * adjust missing and the store to match.
898 * 3) We are currently missing the object. In this case, we adjust the
899 * missing to our prior_version taking care to add a divergent_prior
901 * 4) We can rollback all of the entries. In this case, we do so using
902 * the rollbacker and return -- the object does not go into missing.
903 * 5) We cannot rollback at least 1 of the entries. In this case, we
904 * clear the object out of the store and add a missing entry at
905 * prior_version taking care to add a divergent_prior if
908 template <typename missing_type
>
909 static void _merge_object_divergent_entries(
910 const IndexedLog
&log
, ///< [in] log to merge against
911 const hobject_t
&hoid
, ///< [in] object we are merging
912 const mempool::osd_pglog::list
<pg_log_entry_t
> &orig_entries
, ///< [in] entries for hoid to merge
913 const pg_info_t
&info
, ///< [in] info for merging entries
914 eversion_t olog_can_rollback_to
, ///< [in] rollback boundary of input IndexedLog
915 missing_type
&missing
, ///< [in,out] missing to adjust, use
916 LogEntryHandler
*rollbacker
, ///< [in] optional rollbacker object
917 const DoutPrefixProvider
*dpp
///< [in] logging provider
919 ldpp_dout(dpp
, 20) << __func__
<< ": merging hoid " << hoid
920 << " entries: " << orig_entries
<< dendl
;
922 if (hoid
> info
.last_backfill
) {
923 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
<< " after last_backfill"
928 // entries is non-empty
929 ceph_assert(!orig_entries
.empty());
930 // strip out and ignore ERROR entries
931 mempool::osd_pglog::list
<pg_log_entry_t
> entries
;
933 bool seen_non_error
= false;
934 for (list
<pg_log_entry_t
>::const_iterator i
= orig_entries
.begin();
935 i
!= orig_entries
.end();
937 // all entries are on hoid
938 ceph_assert(i
->soid
== hoid
);
939 // did not see error entries before this entry and this entry is not error
940 // then this entry is the first non error entry
941 bool first_non_error
= ! seen_non_error
&& ! i
->is_error();
942 if (! i
->is_error() ) {
943 // see a non error entry now
944 seen_non_error
= true;
947 // No need to check the first entry since it prior_version is unavailable
949 // No need to check if the prior_version is the minimal version
950 // No need to check the first non-error entry since the leading error
951 // entries are not its prior version
952 if (i
!= orig_entries
.begin() && i
->prior_version
!= eversion_t() &&
954 // in increasing order of version
955 ceph_assert(i
->version
> last
);
956 // prior_version correct (unless it is an ERROR entry)
957 ceph_assert(i
->prior_version
== last
|| i
->is_error());
960 ldpp_dout(dpp
, 20) << __func__
<< ": ignoring " << *i
<< dendl
;
962 ldpp_dout(dpp
, 20) << __func__
<< ": keeping " << *i
<< dendl
;
963 entries
.push_back(*i
);
967 if (entries
.empty()) {
968 ldpp_dout(dpp
, 10) << __func__
<< ": no non-ERROR entries" << dendl
;
972 const eversion_t prior_version
= entries
.begin()->prior_version
;
973 const eversion_t first_divergent_update
= entries
.begin()->version
;
974 const eversion_t last_divergent_update
= entries
.rbegin()->version
;
975 const bool object_not_in_store
=
976 !missing
.is_missing(hoid
) &&
977 entries
.rbegin()->is_delete();
978 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << " object_not_in_store: "
979 << object_not_in_store
<< dendl
;
980 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
981 << " prior_version: " << prior_version
982 << " first_divergent_update: " << first_divergent_update
983 << " last_divergent_update: " << last_divergent_update
986 ceph::unordered_map
<hobject_t
, pg_log_entry_t
*>::const_iterator objiter
=
987 log
.objects
.find(hoid
);
988 if (objiter
!= log
.objects
.end() &&
989 objiter
->second
->version
>= first_divergent_update
) {
991 ldpp_dout(dpp
, 10) << __func__
<< ": more recent entry found: "
992 << *objiter
->second
<< ", already merged" << dendl
;
994 ceph_assert(objiter
->second
->version
> last_divergent_update
);
996 // ensure missing has been updated appropriately
997 if (objiter
->second
->is_update() ||
998 (missing
.may_include_deletes
&& objiter
->second
->is_delete())) {
999 ceph_assert(missing
.is_missing(hoid
) &&
1000 missing
.get_items().at(hoid
).need
== objiter
->second
->version
);
1002 ceph_assert(!missing
.is_missing(hoid
));
1004 missing
.revise_have(hoid
, eversion_t());
1005 missing
.mark_fully_dirty(hoid
);
1007 if (!object_not_in_store
) {
1008 rollbacker
->remove(hoid
);
1010 for (auto &&i
: entries
) {
1011 rollbacker
->trim(i
);
1017 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
1018 <<" has no more recent entries in log" << dendl
;
1019 if (prior_version
== eversion_t() || entries
.front().is_clone()) {
1021 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
1022 << " prior_version or op type indicates creation,"
1025 if (missing
.is_missing(hoid
))
1026 missing
.rm(missing
.get_items().find(hoid
));
1028 if (!object_not_in_store
) {
1029 rollbacker
->remove(hoid
);
1031 for (auto &&i
: entries
) {
1032 rollbacker
->trim(i
);
1038 if (missing
.is_missing(hoid
)) {
1040 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
1041 << " missing, " << missing
.get_items().at(hoid
)
1042 << " adjusting" << dendl
;
1044 if (missing
.get_items().at(hoid
).have
== prior_version
) {
1045 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
1046 << " missing.have is prior_version " << prior_version
1047 << " removing from missing" << dendl
;
1048 missing
.rm(missing
.get_items().find(hoid
));
1050 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
1051 << " missing.have is " << missing
.get_items().at(hoid
).have
1052 << ", adjusting" << dendl
;
1053 missing
.revise_need(hoid
, prior_version
, false);
1054 if (prior_version
<= info
.log_tail
) {
1055 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
1056 << " prior_version " << prior_version
1057 << " <= info.log_tail "
1058 << info
.log_tail
<< dendl
;
1062 for (auto &&i
: entries
) {
1063 rollbacker
->trim(i
);
1069 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
1070 << " must be rolled back or recovered,"
1071 << " attempting to rollback"
1073 bool can_rollback
= true;
1074 // We are going to make an important decision based on the
1075 // olog_can_rollback_to value we have received, better known it.
1076 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
1077 << " olog_can_rollback_to: "
1078 << olog_can_rollback_to
<< dendl
;
1079 /// Distinguish between 4) and 5)
1080 for (list
<pg_log_entry_t
>::const_reverse_iterator i
= entries
.rbegin();
1081 i
!= entries
.rend();
1083 if (!i
->can_rollback() || i
->version
<= olog_can_rollback_to
) {
1084 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
<< " cannot rollback "
1086 can_rollback
= false;
1093 for (list
<pg_log_entry_t
>::const_reverse_iterator i
= entries
.rbegin();
1094 i
!= entries
.rend();
1096 ceph_assert(i
->can_rollback() && i
->version
> olog_can_rollback_to
);
1097 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
1098 << " rolling back " << *i
<< dendl
;
1100 rollbacker
->rollback(*i
);
1102 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
1103 << " rolled back" << dendl
;
1107 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
<< " cannot roll back, "
1108 << "removing and adding to missing" << dendl
;
1110 if (!object_not_in_store
)
1111 rollbacker
->remove(hoid
);
1112 for (auto &&i
: entries
) {
1113 rollbacker
->trim(i
);
1116 missing
.add(hoid
, prior_version
, eversion_t(), false);
1117 if (prior_version
<= info
.log_tail
) {
1118 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
1119 << " prior_version " << prior_version
1120 << " <= info.log_tail "
1121 << info
.log_tail
<< dendl
;
1126 /// Merge all entries using above
1127 template <typename missing_type
>
1128 static void _merge_divergent_entries(
1129 const IndexedLog
&log
, ///< [in] log to merge against
1130 mempool::osd_pglog::list
<pg_log_entry_t
> &entries
, ///< [in] entries to merge
1131 const pg_info_t
&oinfo
, ///< [in] info for merging entries
1132 eversion_t olog_can_rollback_to
, ///< [in] rollback boundary of input IndexedLog
1133 missing_type
&omissing
, ///< [in,out] missing to adjust, use
1134 LogEntryHandler
*rollbacker
, ///< [in] optional rollbacker object
1135 const DoutPrefixProvider
*dpp
///< [in] logging provider
1137 map
<hobject_t
, mempool::osd_pglog::list
<pg_log_entry_t
> > split
;
1138 split_by_object(entries
, &split
);
1139 for (map
<hobject_t
, mempool::osd_pglog::list
<pg_log_entry_t
>>::iterator i
= split
.begin();
1142 _merge_object_divergent_entries(
1147 olog_can_rollback_to
,
1155 * Exists for use in TestPGLog for simply testing single divergent log
1158 void merge_old_entry(
1159 ObjectStore::Transaction
& t
,
1160 const pg_log_entry_t
& oe
,
1161 const pg_info_t
& info
,
1162 LogEntryHandler
*rollbacker
) {
1163 mempool::osd_pglog::list
<pg_log_entry_t
> entries
;
1164 entries
.push_back(oe
);
1165 _merge_object_divergent_entries(
1170 log
.get_can_rollback_to(),
1176 bool merge_log_dups(const pg_log_t
& olog
);
1180 void rewind_divergent_log(eversion_t newhead
,
1182 LogEntryHandler
*rollbacker
,
1184 bool &dirty_big_info
);
1186 void merge_log(pg_info_t
&oinfo
,
1189 pg_info_t
&info
, LogEntryHandler
*rollbacker
,
1190 bool &dirty_info
, bool &dirty_big_info
);
1192 template <typename missing_type
>
1193 static bool append_log_entries_update_missing(
1194 const hobject_t
&last_backfill
,
1195 const mempool::osd_pglog::list
<pg_log_entry_t
> &entries
,
1196 bool maintain_rollback
,
1198 missing_type
&missing
,
1199 LogEntryHandler
*rollbacker
,
1200 const DoutPrefixProvider
*dpp
) {
1201 bool invalidate_stats
= false;
1202 if (log
&& !entries
.empty()) {
1203 ceph_assert(log
->head
< entries
.begin()->version
);
1205 for (list
<pg_log_entry_t
>::const_iterator p
= entries
.begin();
1208 invalidate_stats
= invalidate_stats
|| !p
->is_error();
1210 ldpp_dout(dpp
, 20) << "update missing, append " << *p
<< dendl
;
1213 if (p
->soid
<= last_backfill
&&
1215 if (missing
.may_include_deletes
) {
1216 missing
.add_next_event(*p
);
1218 if (p
->is_delete()) {
1219 missing
.rm(p
->soid
, p
->version
);
1221 missing
.add_next_event(*p
);
1224 // hack to match PG::mark_all_unfound_lost
1225 if (maintain_rollback
&& p
->is_lost_delete() && p
->can_rollback()) {
1226 rollbacker
->try_stash(p
->soid
, p
->version
.version
);
1227 } else if (p
->is_delete()) {
1228 rollbacker
->remove(p
->soid
);
1234 return invalidate_stats
;
1236 bool append_new_log_entries(
1237 const hobject_t
&last_backfill
,
1238 const mempool::osd_pglog::list
<pg_log_entry_t
> &entries
,
1239 LogEntryHandler
*rollbacker
) {
1240 bool invalidate_stats
= append_log_entries_update_missing(
1248 if (!entries
.empty()) {
1249 mark_writeout_from(entries
.begin()->version
);
1250 if (entries
.begin()->is_lost_delete()) {
1251 // hack: since lost deletes queue recovery directly, and don't
1252 // go through activate_not_complete() again, our complete_to
1253 // iterator may still point at log.end(). Reset it to point
1254 // before these new lost_delete entries. This only occurs
1255 // when lost+delete entries are initially added, which is
1256 // always in a list of solely lost_delete entries, so it is
1257 // sufficient to check whether the first entry is a
1259 reset_complete_to(nullptr);
1262 return invalidate_stats
;
1265 void write_log_and_missing(
1266 ObjectStore::Transaction
& t
,
1267 map
<string
,bufferlist
> *km
,
1269 const ghobject_t
&log_oid
,
1270 bool require_rollback
);
1272 static void write_log_and_missing_wo_missing(
1273 ObjectStore::Transaction
& t
,
1274 map
<string
,bufferlist
>* km
,
1277 const ghobject_t
&log_oid
, map
<eversion_t
, hobject_t
> &divergent_priors
,
1278 bool require_rollback
);
1280 static void write_log_and_missing(
1281 ObjectStore::Transaction
& t
,
1282 map
<string
,bufferlist
>* km
,
1285 const ghobject_t
&log_oid
,
1286 const pg_missing_tracker_t
&missing
,
1287 bool require_rollback
,
1288 bool *rebuilt_missing_set_with_deletes
);
1290 static void _write_log_and_missing_wo_missing(
1291 ObjectStore::Transaction
& t
,
1292 map
<string
,bufferlist
>* km
,
1294 const coll_t
& coll
, const ghobject_t
&log_oid
,
1295 map
<eversion_t
, hobject_t
> &divergent_priors
,
1296 eversion_t dirty_to
,
1297 eversion_t dirty_from
,
1298 eversion_t writeout_from
,
1299 bool dirty_divergent_priors
,
1301 bool require_rollback
,
1302 eversion_t dirty_to_dups
,
1303 eversion_t dirty_from_dups
,
1304 eversion_t write_from_dups
,
1305 set
<string
> *log_keys_debug
1308 static void _write_log_and_missing(
1309 ObjectStore::Transaction
& t
,
1310 map
<string
,bufferlist
>* km
,
1312 const coll_t
& coll
, const ghobject_t
&log_oid
,
1313 eversion_t dirty_to
,
1314 eversion_t dirty_from
,
1315 eversion_t writeout_from
,
1316 set
<eversion_t
> &&trimmed
,
1317 set
<string
> &&trimmed_dups
,
1318 const pg_missing_tracker_t
&missing
,
1320 bool require_rollback
,
1321 bool clear_divergent_priors
,
1322 eversion_t dirty_to_dups
,
1323 eversion_t dirty_from_dups
,
1324 eversion_t write_from_dups
,
1325 bool *may_include_deletes_in_missing_dirty
,
1326 set
<string
> *log_keys_debug
1329 void read_log_and_missing(
1331 ObjectStore::CollectionHandle
& ch
,
1332 ghobject_t pgmeta_oid
,
1333 const pg_info_t
&info
,
1335 bool tolerate_divergent_missing_log
,
1336 bool debug_verify_stored_missing
= false
1338 return read_log_and_missing(
1339 store
, ch
, pgmeta_oid
, info
,
1341 tolerate_divergent_missing_log
,
1342 &clear_divergent_priors
,
1344 (pg_log_debug
? &log_keys_debug
: nullptr),
1345 debug_verify_stored_missing
);
1348 template <typename missing_type
>
1349 static void read_log_and_missing(
1351 ObjectStore::CollectionHandle
&ch
,
1352 ghobject_t pgmeta_oid
,
1353 const pg_info_t
&info
,
1355 missing_type
&missing
,
1357 bool tolerate_divergent_missing_log
,
1358 bool *clear_divergent_priors
= nullptr,
1359 const DoutPrefixProvider
*dpp
= nullptr,
1360 set
<string
> *log_keys_debug
= nullptr,
1361 bool debug_verify_stored_missing
= false
1363 ldpp_dout(dpp
, 20) << "read_log_and_missing coll " << ch
->cid
1364 << " " << pgmeta_oid
<< dendl
;
1368 int r
= store
->stat(ch
, pgmeta_oid
, &st
);
1369 ceph_assert(r
== 0);
1370 ceph_assert(st
.st_size
== 0);
1372 // will get overridden below if it had been recorded
1373 eversion_t on_disk_can_rollback_to
= info
.last_update
;
1374 eversion_t on_disk_rollback_info_trimmed_to
= eversion_t();
1375 ObjectMap::ObjectMapIterator p
= store
->get_omap_iterator(ch
,
1377 map
<eversion_t
, hobject_t
> divergent_priors
;
1378 bool must_rebuild
= false;
1379 missing
.may_include_deletes
= false;
1380 list
<pg_log_entry_t
> entries
;
1381 list
<pg_log_dup_t
> dups
;
1383 for (p
->seek_to_first(); p
->valid() ; p
->next()) {
1384 // non-log pgmeta_oid keys are prefixed with _; skip those
1385 if (p
->key()[0] == '_')
1387 bufferlist bl
= p
->value();//Copy bufferlist before creating iterator
1388 auto bp
= bl
.cbegin();
1389 if (p
->key() == "divergent_priors") {
1390 decode(divergent_priors
, bp
);
1391 ldpp_dout(dpp
, 20) << "read_log_and_missing " << divergent_priors
.size()
1392 << " divergent_priors" << dendl
;
1393 must_rebuild
= true;
1394 debug_verify_stored_missing
= false;
1395 } else if (p
->key() == "can_rollback_to") {
1396 decode(on_disk_can_rollback_to
, bp
);
1397 } else if (p
->key() == "rollback_info_trimmed_to") {
1398 decode(on_disk_rollback_info_trimmed_to
, bp
);
1399 } else if (p
->key() == "may_include_deletes_in_missing") {
1400 missing
.may_include_deletes
= true;
1401 } else if (p
->key().substr(0, 7) == string("missing")) {
1403 pg_missing_item item
;
1406 ldpp_dout(dpp
, 20) << "read_log_and_missing " << item
<< dendl
;
1407 if (item
.is_delete()) {
1408 ceph_assert(missing
.may_include_deletes
);
1410 missing
.add(oid
, std::move(item
));
1411 } else if (p
->key().substr(0, 4) == string("dup_")) {
1414 if (!dups
.empty()) {
1415 ceph_assert(dups
.back().version
< dup
.version
);
1417 dups
.push_back(dup
);
1420 e
.decode_with_checksum(bp
);
1421 ldpp_dout(dpp
, 20) << "read_log_and_missing " << e
<< dendl
;
1422 if (!entries
.empty()) {
1423 pg_log_entry_t
last_e(entries
.back());
1424 ceph_assert(last_e
.version
.version
< e
.version
.version
);
1425 ceph_assert(last_e
.version
.epoch
<= e
.version
.epoch
);
1427 entries
.push_back(e
);
1429 log_keys_debug
->insert(e
.get_key_name());
1436 on_disk_can_rollback_to
,
1437 on_disk_rollback_info_trimmed_to
,
1441 if (must_rebuild
|| debug_verify_stored_missing
) {
1443 if (debug_verify_stored_missing
|| info
.last_complete
< info
.last_update
) {
1445 << "read_log_and_missing checking for missing items over interval ("
1446 << info
.last_complete
1447 << "," << info
.last_update
<< "]" << dendl
;
1450 set
<hobject_t
> checked
;
1451 set
<hobject_t
> skipped
;
1452 for (list
<pg_log_entry_t
>::reverse_iterator i
= log
.log
.rbegin();
1453 i
!= log
.log
.rend();
1455 if (i
->soid
> info
.last_backfill
)
1459 if (did
.count(i
->soid
)) continue;
1460 did
.insert(i
->soid
);
1462 if (!missing
.may_include_deletes
&& i
->is_delete())
1466 int r
= store
->getattr(
1468 ghobject_t(i
->soid
, ghobject_t::NO_GEN
, info
.pgid
.shard
),
1472 object_info_t
oi(bv
);
1473 if (oi
.version
< i
->version
) {
1474 ldpp_dout(dpp
, 15) << "read_log_and_missing missing " << *i
1475 << " (have " << oi
.version
<< ")"
1476 << " clean_regions " << i
->clean_regions
<< dendl
;
1478 if (debug_verify_stored_missing
) {
1479 auto miter
= missing
.get_items().find(i
->soid
);
1480 ceph_assert(miter
!= missing
.get_items().end());
1481 ceph_assert(miter
->second
.need
== i
->version
);
1482 // the 'have' version is reset if an object is deleted,
1483 // then created again
1484 ceph_assert(miter
->second
.have
== oi
.version
|| miter
->second
.have
== eversion_t());
1485 checked
.insert(i
->soid
);
1487 missing
.add(i
->soid
, i
->version
, oi
.version
, i
->is_delete());
1491 ldpp_dout(dpp
, 15) << "read_log_and_missing missing " << *i
<< dendl
;
1492 if (debug_verify_stored_missing
) {
1493 auto miter
= missing
.get_items().find(i
->soid
);
1494 if (i
->is_delete()) {
1495 ceph_assert(miter
== missing
.get_items().end() ||
1496 (miter
->second
.need
== i
->version
&&
1497 miter
->second
.have
== eversion_t()));
1499 ceph_assert(miter
!= missing
.get_items().end());
1500 ceph_assert(miter
->second
.need
== i
->version
);
1501 ceph_assert(miter
->second
.have
== eversion_t());
1503 checked
.insert(i
->soid
);
1505 missing
.add(i
->soid
, i
->version
, eversion_t(), i
->is_delete());
1509 if (debug_verify_stored_missing
) {
1510 for (auto &&i
: missing
.get_items()) {
1511 if (checked
.count(i
.first
))
1513 if (i
.first
> info
.last_backfill
) {
1514 ldpp_dout(dpp
, -1) << __func__
<< ": invalid missing set entry "
1515 << "found before last_backfill: "
1516 << i
.first
<< " " << i
.second
1517 << " last_backfill = " << info
.last_backfill
1519 ceph_abort_msg("invalid missing set entry found");
1522 int r
= store
->getattr(
1524 ghobject_t(i
.first
, ghobject_t::NO_GEN
, info
.pgid
.shard
),
1528 object_info_t
oi(bv
);
1529 ceph_assert(oi
.version
== i
.second
.have
|| eversion_t() == i
.second
.have
);
1531 ceph_assert(i
.second
.is_delete() || eversion_t() == i
.second
.have
);
1535 ceph_assert(must_rebuild
);
1536 for (map
<eversion_t
, hobject_t
>::reverse_iterator i
=
1537 divergent_priors
.rbegin();
1538 i
!= divergent_priors
.rend();
1540 if (i
->first
<= info
.last_complete
) break;
1541 if (i
->second
> info
.last_backfill
)
1543 if (did
.count(i
->second
)) continue;
1544 did
.insert(i
->second
);
1546 int r
= store
->getattr(
1548 ghobject_t(i
->second
, ghobject_t::NO_GEN
, info
.pgid
.shard
),
1552 object_info_t
oi(bv
);
1554 * 1) we see this entry in the divergent priors mapping
1555 * 2) we didn't see an entry for this object in the log
1557 * From 1 & 2 we know that either the object does not exist
1558 * or it is at the version specified in the divergent_priors
1559 * map since the object would have been deleted atomically
1560 * with the addition of the divergent_priors entry, an older
1561 * version would not have been recovered, and a newer version
1562 * would show up in the log above.
1565 * Unfortunately the assessment above is incorrect because of
1566 * http://tracker.ceph.com/issues/17916 (we were incorrectly
1567 * not removing the divergent_priors set from disk state!),
1568 * so let's check that.
1570 if (oi
.version
> i
->first
&& tolerate_divergent_missing_log
) {
1571 ldpp_dout(dpp
, 0) << "read_log divergent_priors entry (" << *i
1572 << ") inconsistent with disk state (" << oi
1573 << "), assuming it is tracker.ceph.com/issues/17916"
1576 ceph_assert(oi
.version
== i
->first
);
1579 ldpp_dout(dpp
, 15) << "read_log_and_missing missing " << *i
<< dendl
;
1580 missing
.add(i
->second
, i
->first
, eversion_t(), false);
1584 if (clear_divergent_priors
)
1585 (*clear_divergent_priors
) = true;
1589 if (!must_rebuild
) {
1590 if (clear_divergent_priors
)
1591 (*clear_divergent_priors
) = false;
1594 ldpp_dout(dpp
, 10) << "read_log_and_missing done" << dendl
;
1595 } // static read_log_and_missing
1598 seastar::future
<> read_log_and_missing_crimson(
1599 crimson::os::FuturizedStore
&store
,
1600 crimson::os::CollectionRef ch
,
1601 const pg_info_t
&info
,
1602 ghobject_t pgmeta_oid
1604 return read_log_and_missing_crimson(
1606 log
, missing
, pgmeta_oid
,
1610 template <typename missing_type
>
1611 struct FuturizedStoreLogReader
{
1612 crimson::os::FuturizedStore
&store
;
1613 crimson::os::CollectionRef ch
;
1614 const pg_info_t
&info
;
1616 missing_type
&missing
;
1617 ghobject_t pgmeta_oid
;
1618 const DoutPrefixProvider
*dpp
;
1620 eversion_t on_disk_can_rollback_to
;
1621 eversion_t on_disk_rollback_info_trimmed_to
;
1623 std::map
<eversion_t
, hobject_t
> divergent_priors
;
1624 bool must_rebuild
= false;
1625 std::list
<pg_log_entry_t
> entries
;
1626 std::list
<pg_log_dup_t
> dups
;
1628 std::optional
<std::string
> next
;
1630 void process_entry(const std::pair
<std::string
, ceph::bufferlist
> &p
) {
1631 if (p
.first
[0] == '_')
1633 ceph::bufferlist bl
= p
.second
;//Copy bufferlist before creating iterator
1634 auto bp
= bl
.cbegin();
1635 if (p
.first
== "divergent_priors") {
1636 decode(divergent_priors
, bp
);
1637 ldpp_dout(dpp
, 20) << "read_log_and_missing " << divergent_priors
.size()
1638 << " divergent_priors" << dendl
;
1639 ceph_assert("crimson shouldn't have had divergent_priors" == 0);
1640 } else if (p
.first
== "can_rollback_to") {
1641 decode(on_disk_can_rollback_to
, bp
);
1642 } else if (p
.first
== "rollback_info_trimmed_to") {
1643 decode(on_disk_rollback_info_trimmed_to
, bp
);
1644 } else if (p
.first
== "may_include_deletes_in_missing") {
1645 missing
.may_include_deletes
= true;
1646 } else if (p
.first
.substr(0, 7) == string("missing")) {
1648 pg_missing_item item
;
1651 if (item
.is_delete()) {
1652 ceph_assert(missing
.may_include_deletes
);
1654 missing
.add(oid
, std::move(item
));
1655 } else if (p
.first
.substr(0, 4) == string("dup_")) {
1658 if (!dups
.empty()) {
1659 ceph_assert(dups
.back().version
< dup
.version
);
1661 dups
.push_back(dup
);
1664 e
.decode_with_checksum(bp
);
1665 ldpp_dout(dpp
, 20) << "read_log_and_missing " << e
<< dendl
;
1666 if (!entries
.empty()) {
1667 pg_log_entry_t
last_e(entries
.back());
1668 ceph_assert(last_e
.version
.version
< e
.version
.version
);
1669 ceph_assert(last_e
.version
.epoch
<= e
.version
.epoch
);
1671 entries
.push_back(e
);
1676 seastar::future
<> start() {
1677 // will get overridden if recorded
1678 on_disk_can_rollback_to
= info
.last_update
;
1679 missing
.may_include_deletes
= false;
1681 auto reader
= std::unique_ptr
<FuturizedStoreLogReader
>(this);
1682 return seastar::repeat(
1684 return store
.omap_get_values(ch
, pgmeta_oid
, next
).then(
1686 bool done
, crimson::os::FuturizedStore::omap_values_t values
) {
1687 for (auto &&p
: values
) {
1690 return done
? seastar::stop_iteration::yes
1691 : seastar::stop_iteration::no
;
1693 }).then([this, reader
{std::move(reader
)}]() {
1697 on_disk_can_rollback_to
,
1698 on_disk_rollback_info_trimmed_to
,
1701 return seastar::now();
1706 template <typename missing_type
>
1707 static seastar::future
<> read_log_and_missing_crimson(
1708 crimson::os::FuturizedStore
&store
,
1709 crimson::os::CollectionRef ch
,
1710 const pg_info_t
&info
,
1712 missing_type
&missing
,
1713 ghobject_t pgmeta_oid
,
1714 const DoutPrefixProvider
*dpp
= nullptr
1716 ldpp_dout(dpp
, 20) << "read_log_and_missing coll "
1718 << " " << pgmeta_oid
<< dendl
;
1719 return (new FuturizedStoreLogReader
<missing_type
>{
1720 store
, ch
, info
, log
, missing
, pgmeta_oid
, dpp
})->start();