// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */
19 // re-include our assert to clobber boost's
20 #include "include/ceph_assert.h"
21 #include "include/common_fwd.h"
22 #include "osd_types.h"
23 #include "os/ObjectStore.h"
27 #include <seastar/core/future.hh>
28 #include "crimson/os/futurized_store.h"
29 #include "crimson/os/cyanstore/cyan_collection.h"
/*
 * The pg log serves three primary purposes:
 *
 * 1) improving recovery speed
 *
 * 2) detecting duplicate ops
 *
 * 3) making erasure coded updates safe
 *
 * For (1), the main data type is pg_log_entry_t.  This is indexed in
 * memory by the IndexedLog class - this is where most of the logic
 * surrounding pg log is kept, even though the low level types are in
 * osd_types.h.
 *
 * (2) uses a type which is a subset of the full log entry, containing
 * just the pieces we need to identify and respond to a duplicate
 * request: pg_log_dup_t.
 *
 * As we trim the log, we convert pg_log_entry_t to smaller
 * pg_log_dup_t, and finally remove them once we reach a higher
 * limit.  This is controlled by a few options:
 *
 * osd_min_pg_log_entries osd_max_pg_log_entries
 * osd_pg_log_dups_tracked
 *
 * For example, with a min of 100, max of 1000, and dups tracked of
 * 3000, the log entries and dups stored would span the following
 * versions, assuming the current earliest is version 1:
 *
 *   version: 3000 2001 2000 1     [ pg log entries ] [ pg log dups ]
 *
 * after osd_pg_log_trim_min subsequent writes to this PG, the log
 * would be trimmed to look like:
 *
 *   version: 3100 2101 2100 101   [ pg log entries ] [ pg log dups ]
 *
 * (3) means tracking the previous state of an object, so that we can
 * rollback to that prior state if necessary.  It's only used for
 * erasure coding.  Consider an erasure code of 4+2, for example.
 *
 * This means we split the object into 4 pieces (called shards) and
 * compute 2 parity shards.  Each of these shards is stored on a
 * separate OSD.  As long as 4 shards are the same version, we can
 * recover the remaining 2 by computation.  Imagine during a write, 3
 * of the osds go down and restart, resulting in shards 0,1,2
 * reflecting version A and shards 3,4,5 reflecting version B, after
 * the write.
 *
 * If we had no way to reconstruct version A for another shard, we
 * would have lost the object.
 *
 * The actual data for rollback is stored in a look-aside object and
 * is removed once the EC write commits on all shards.  The pg log just
 * stores the versions so we can tell how far we can rollback, and a
 * description of the type of operation for each log entry.  Beyond
 * the pg log, see PGBackend::Trimmer and PGBackend::RollbackVisitor
 * for more details on this.
 *
 * An important implication of this is that although the pg log length
 * is normally bounded, under extreme conditions, with many EC I/Os
 * outstanding, the log may grow beyond that point because we need to
 * keep the rollback information for all outstanding EC I/O.
 *
 * For more on pg log bounds, see where it is calculated in
 * PeeringState::calc_trim_to_aggressive().
 *
 * For more details on how peering uses the pg log, and architectural
 * reasons for its existence, see:
 *
 *   doc/dev/osd_internals/log_based_pg.rst
 */
// Bitmask flags recording which of IndexedLog's in-memory indexes
// (objects, caller_ops, extra_caller_ops, dup_index) have been built
// and are currently valid; stored/tested via IndexedLog::indexed_data.
// The stray line-number tokens that had been pasted into these
// declarations are removed so the flags compile again.
constexpr auto PGLOG_INDEXED_OBJECTS = 1 << 0;
constexpr auto PGLOG_INDEXED_CALLER_OPS = 1 << 1;
constexpr auto PGLOG_INDEXED_EXTRA_CALLER_OPS = 1 << 2;
constexpr auto PGLOG_INDEXED_DUPS = 1 << 3;
// Convenience mask: every index at once (the default for index()).
constexpr auto PGLOG_INDEXED_ALL = PGLOG_INDEXED_OBJECTS
  | PGLOG_INDEXED_CALLER_OPS
  | PGLOG_INDEXED_EXTRA_CALLER_OPS
  | PGLOG_INDEXED_DUPS;
115 struct PGLog
: DoutPrefixProvider
{
116 std::ostream
& gen_prefix(std::ostream
& out
) const override
{
119 unsigned get_subsys() const override
{
120 return static_cast<unsigned>(ceph_subsys_osd
);
122 CephContext
*get_cct() const override
{
126 ////////////////////////////// sub classes //////////////////////////////
127 struct LogEntryHandler
{
128 virtual void rollback(
129 const pg_log_entry_t
&entry
) = 0;
130 virtual void rollforward(
131 const pg_log_entry_t
&entry
) = 0;
133 const pg_log_entry_t
&entry
) = 0;
135 const hobject_t
&hoid
) = 0;
136 virtual void try_stash(
137 const hobject_t
&hoid
,
139 virtual ~LogEntryHandler() {}
141 using LogEntryHandlerRef
= std::unique_ptr
<LogEntryHandler
>;
145 * IndexLog - adds in-memory index of the log, by oid.
146 * plus some methods to manipulate it all.
148 struct IndexedLog
: public pg_log_t
{
149 mutable ceph::unordered_map
<hobject_t
,pg_log_entry_t
*> objects
; // ptrs into log. be careful!
150 mutable ceph::unordered_map
<osd_reqid_t
,pg_log_entry_t
*> caller_ops
;
151 mutable ceph::unordered_multimap
<osd_reqid_t
,pg_log_entry_t
*> extra_caller_ops
;
152 mutable ceph::unordered_map
<osd_reqid_t
,pg_log_dup_t
*> dup_index
;
155 std::list
<pg_log_entry_t
>::iterator complete_to
; // not inclusive of referenced item
156 version_t last_requested
= 0; // last object requested by primary
160 mutable __u16 indexed_data
= 0;
162 * rollback_info_trimmed_to_riter points to the first log entry <=
163 * rollback_info_trimmed_to
165 * It's a reverse_iterator because rend() is a natural representation for
166 * tail, and rbegin() works nicely for head.
168 mempool::osd_pglog::list
<pg_log_entry_t
>::reverse_iterator
169 rollback_info_trimmed_to_riter
;
172 * return true if we need to mark the pglog as dirty
174 template <typename F
>
175 bool advance_can_rollback_to(eversion_t to
, F
&&f
) {
176 bool dirty_log
= to
> can_rollback_to
|| to
> rollback_info_trimmed_to
;
178 if (to
> can_rollback_to
)
179 can_rollback_to
= to
;
181 if (to
> rollback_info_trimmed_to
)
182 rollback_info_trimmed_to
= to
;
185 while (rollback_info_trimmed_to_riter
!= log
.rbegin()) {
186 --rollback_info_trimmed_to_riter
;
187 if (rollback_info_trimmed_to_riter
->version
> rollback_info_trimmed_to
) {
188 ++rollback_info_trimmed_to_riter
;
191 f(*rollback_info_trimmed_to_riter
);
197 void reset_rollback_info_trimmed_to_riter() {
198 rollback_info_trimmed_to_riter
= log
.rbegin();
199 while (rollback_info_trimmed_to_riter
!= log
.rend() &&
200 rollback_info_trimmed_to_riter
->version
> rollback_info_trimmed_to
)
201 ++rollback_info_trimmed_to_riter
;
204 // indexes objects, caller ops and extra caller ops
207 complete_to(log
.end()),
210 rollback_info_trimmed_to_riter(log
.rbegin())
213 template <typename
... Args
>
214 explicit IndexedLog(Args
&&... args
) :
215 pg_log_t(std::forward
<Args
>(args
)...),
216 complete_to(log
.end()),
219 rollback_info_trimmed_to_riter(log
.rbegin())
221 reset_rollback_info_trimmed_to_riter();
225 IndexedLog(const IndexedLog
&rhs
) :
227 complete_to(log
.end()),
228 last_requested(rhs
.last_requested
),
230 rollback_info_trimmed_to_riter(log
.rbegin())
232 reset_rollback_info_trimmed_to_riter();
233 index(rhs
.indexed_data
);
236 IndexedLog
&operator=(const IndexedLog
&rhs
) {
238 new (this) IndexedLog(rhs
);
242 void trim_rollback_info_to(eversion_t to
, LogEntryHandler
*h
) {
243 advance_can_rollback_to(
245 [&](pg_log_entry_t
&entry
) {
249 bool roll_forward_to(eversion_t to
, LogEntryHandler
*h
) {
250 return advance_can_rollback_to(
252 [&](pg_log_entry_t
&entry
) {
253 h
->rollforward(entry
);
257 void skip_can_rollback_to_to_head() {
258 advance_can_rollback_to(head
, [&](const pg_log_entry_t
&entry
) {});
261 mempool::osd_pglog::list
<pg_log_entry_t
> rewind_from_head(eversion_t newhead
) {
262 auto divergent
= pg_log_t::rewind_from_head(newhead
);
264 reset_rollback_info_trimmed_to_riter();
268 template <typename T
>
270 const eversion_t
&bound
, ///< [in] scan entries > bound
272 auto iter
= log
.rbegin();
273 while (iter
!= log
.rend() && iter
->version
> bound
)
277 if (iter
== log
.rbegin())
284 void claim_log_and_clear_rollback_info(const pg_log_t
& o
) {
285 // we must have already trimmed the old entries
286 ceph_assert(rollback_info_trimmed_to
== head
);
287 ceph_assert(rollback_info_trimmed_to_riter
== log
.rbegin());
289 *this = IndexedLog(o
);
291 skip_can_rollback_to_to_head();
295 void split_out_child(
301 // we must have already trimmed the old entries
302 ceph_assert(rollback_info_trimmed_to
== head
);
303 ceph_assert(rollback_info_trimmed_to_riter
== log
.rbegin());
307 rollback_info_trimmed_to_riter
= log
.rbegin();
308 reset_recovery_pointers();
311 skip_can_rollback_to_to_head();
314 void reset_recovery_pointers() {
315 complete_to
= log
.end();
319 bool logged_object(const hobject_t
& oid
) const {
320 if (!(indexed_data
& PGLOG_INDEXED_OBJECTS
)) {
323 return objects
.count(oid
);
326 bool logged_req(const osd_reqid_t
&r
) const {
327 if (!(indexed_data
& PGLOG_INDEXED_CALLER_OPS
)) {
330 if (!caller_ops
.count(r
)) {
331 if (!(indexed_data
& PGLOG_INDEXED_EXTRA_CALLER_OPS
)) {
332 index_extra_caller_ops();
334 return extra_caller_ops
.count(r
);
340 const osd_reqid_t
&r
,
342 version_t
*user_version
,
344 std::vector
<pg_log_op_return_item_t
> *op_returns
) const
346 ceph_assert(version
);
347 ceph_assert(user_version
);
348 ceph_assert(return_code
);
349 if (!(indexed_data
& PGLOG_INDEXED_CALLER_OPS
)) {
352 auto p
= caller_ops
.find(r
);
353 if (p
!= caller_ops
.end()) {
354 *version
= p
->second
->version
;
355 *user_version
= p
->second
->user_version
;
356 *return_code
= p
->second
->return_code
;
357 *op_returns
= p
->second
->op_returns
;
361 // warning: we will return *a* request for this reqid, but not
362 // necessarily the most recent.
363 if (!(indexed_data
& PGLOG_INDEXED_EXTRA_CALLER_OPS
)) {
364 index_extra_caller_ops();
366 p
= extra_caller_ops
.find(r
);
367 if (p
!= extra_caller_ops
.end()) {
369 for (auto i
= p
->second
->extra_reqids
.begin();
370 i
!= p
->second
->extra_reqids
.end();
373 *version
= p
->second
->version
;
374 *user_version
= i
->second
;
375 *return_code
= p
->second
->return_code
;
376 *op_returns
= p
->second
->op_returns
;
377 if (*return_code
>= 0) {
378 auto it
= p
->second
->extra_reqid_return_codes
.find(idx
);
379 if (it
!= p
->second
->extra_reqid_return_codes
.end()) {
380 *return_code
= it
->second
;
386 ceph_abort_msg("in extra_caller_ops but not extra_reqids");
389 if (!(indexed_data
& PGLOG_INDEXED_DUPS
)) {
392 auto q
= dup_index
.find(r
);
393 if (q
!= dup_index
.end()) {
394 *version
= q
->second
->version
;
395 *user_version
= q
->second
->user_version
;
396 *return_code
= q
->second
->return_code
;
397 *op_returns
= q
->second
->op_returns
;
404 bool has_write_since(const hobject_t
&oid
, const eversion_t
&bound
) const {
405 for (auto i
= log
.rbegin(); i
!= log
.rend(); ++i
) {
406 if (i
->version
<= bound
)
408 if (i
->soid
.get_head() == oid
.get_head())
414 /// get a (bounded) std::list of recent reqids for the given object
415 void get_object_reqids(const hobject_t
& oid
, unsigned max
,
416 mempool::osd_pglog::vector
<std::pair
<osd_reqid_t
, version_t
> > *pls
,
417 mempool::osd_pglog::map
<uint32_t, int> *return_codes
) const {
418 // make sure object is present at least once before we do an
420 if (!(indexed_data
& PGLOG_INDEXED_OBJECTS
)) {
423 if (objects
.count(oid
) == 0)
426 for (auto i
= log
.rbegin(); i
!= log
.rend(); ++i
) {
427 if (i
->soid
== oid
) {
428 if (i
->reqid_is_indexed()) {
429 if (i
->op
== pg_log_entry_t::ERROR
) {
430 // propagate op errors to the cache tier's PG log
431 return_codes
->emplace(pls
->size(), i
->return_code
);
433 pls
->push_back(std::make_pair(i
->reqid
, i
->user_version
));
436 pls
->insert(pls
->end(), i
->extra_reqids
.begin(), i
->extra_reqids
.end());
437 if (pls
->size() >= max
) {
438 if (pls
->size() > max
) {
447 void index(__u16 to_index
= PGLOG_INDEXED_ALL
) const {
448 // if to_index is 0, no need to run any of this code, especially
449 // loop below; this can happen with copy constructor for
450 // IndexedLog (and indirectly through assignment operator)
451 if (!to_index
) return;
453 if (to_index
& PGLOG_INDEXED_OBJECTS
)
455 if (to_index
& PGLOG_INDEXED_CALLER_OPS
)
457 if (to_index
& PGLOG_INDEXED_EXTRA_CALLER_OPS
)
458 extra_caller_ops
.clear();
459 if (to_index
& PGLOG_INDEXED_DUPS
) {
461 for (auto& i
: dups
) {
462 dup_index
[i
.reqid
] = const_cast<pg_log_dup_t
*>(&i
);
466 constexpr __u16 any_log_entry_index
=
467 PGLOG_INDEXED_OBJECTS
|
468 PGLOG_INDEXED_CALLER_OPS
|
469 PGLOG_INDEXED_EXTRA_CALLER_OPS
;
471 if (to_index
& any_log_entry_index
) {
472 for (auto i
= log
.begin(); i
!= log
.end(); ++i
) {
473 if (to_index
& PGLOG_INDEXED_OBJECTS
) {
474 if (i
->object_is_indexed()) {
475 objects
[i
->soid
] = const_cast<pg_log_entry_t
*>(&(*i
));
479 if (to_index
& PGLOG_INDEXED_CALLER_OPS
) {
480 if (i
->reqid_is_indexed()) {
481 caller_ops
[i
->reqid
] = const_cast<pg_log_entry_t
*>(&(*i
));
485 if (to_index
& PGLOG_INDEXED_EXTRA_CALLER_OPS
) {
486 for (auto j
= i
->extra_reqids
.begin();
487 j
!= i
->extra_reqids
.end();
489 extra_caller_ops
.insert(
490 std::make_pair(j
->first
, const_cast<pg_log_entry_t
*>(&(*i
))));
496 indexed_data
|= to_index
;
499 void index_objects() const {
500 index(PGLOG_INDEXED_OBJECTS
);
503 void index_caller_ops() const {
504 index(PGLOG_INDEXED_CALLER_OPS
);
507 void index_extra_caller_ops() const {
508 index(PGLOG_INDEXED_EXTRA_CALLER_OPS
);
511 void index_dups() const {
512 index(PGLOG_INDEXED_DUPS
);
515 void index(pg_log_entry_t
& e
) {
516 if ((indexed_data
& PGLOG_INDEXED_OBJECTS
) && e
.object_is_indexed()) {
517 if (objects
.count(e
.soid
) == 0 ||
518 objects
[e
.soid
]->version
< e
.version
)
519 objects
[e
.soid
] = &e
;
521 if (indexed_data
& PGLOG_INDEXED_CALLER_OPS
) {
522 // divergent merge_log indexes new before unindexing old
523 if (e
.reqid_is_indexed()) {
524 caller_ops
[e
.reqid
] = &e
;
527 if (indexed_data
& PGLOG_INDEXED_EXTRA_CALLER_OPS
) {
528 for (auto j
= e
.extra_reqids
.begin();
529 j
!= e
.extra_reqids
.end();
531 extra_caller_ops
.insert(std::make_pair(j
->first
, &e
));
539 extra_caller_ops
.clear();
544 void unindex(const pg_log_entry_t
& e
) {
545 // NOTE: this only works if we remove from the _tail_ of the log!
546 if (indexed_data
& PGLOG_INDEXED_OBJECTS
) {
547 auto it
= objects
.find(e
.soid
);
548 if (it
!= objects
.end() && it
->second
->version
== e
.version
)
551 if (e
.reqid_is_indexed()) {
552 if (indexed_data
& PGLOG_INDEXED_CALLER_OPS
) {
553 auto it
= caller_ops
.find(e
.reqid
);
554 // divergent merge_log indexes new before unindexing old
555 if (it
!= caller_ops
.end() && it
->second
== &e
)
556 caller_ops
.erase(it
);
559 if (indexed_data
& PGLOG_INDEXED_EXTRA_CALLER_OPS
) {
560 for (auto j
= e
.extra_reqids
.begin();
561 j
!= e
.extra_reqids
.end();
563 for (auto k
= extra_caller_ops
.find(j
->first
);
564 k
!= extra_caller_ops
.end() && k
->first
== j
->first
;
566 if (k
->second
== &e
) {
567 extra_caller_ops
.erase(k
);
575 void index(pg_log_dup_t
& e
) {
576 if (indexed_data
& PGLOG_INDEXED_DUPS
) {
577 dup_index
[e
.reqid
] = &e
;
581 void unindex(const pg_log_dup_t
& e
) {
582 if (indexed_data
& PGLOG_INDEXED_DUPS
) {
583 auto i
= dup_index
.find(e
.reqid
);
584 if (i
!= dup_index
.end()) {
591 void add(const pg_log_entry_t
& e
, bool applied
= true) {
593 ceph_assert(get_can_rollback_to() == head
);
596 // make sure our buffers don't pin bigger buffers
597 e
.mod_desc
.trim_bl();
602 // riter previously pointed to the previous entry
603 if (rollback_info_trimmed_to_riter
== log
.rbegin())
604 ++rollback_info_trimmed_to_riter
;
606 ceph_assert(e
.version
> head
);
607 ceph_assert(head
.version
== 0 || e
.version
.version
> head
.version
);
611 if ((indexed_data
& PGLOG_INDEXED_OBJECTS
) && e
.object_is_indexed()) {
612 objects
[e
.soid
] = &(log
.back());
614 if (indexed_data
& PGLOG_INDEXED_CALLER_OPS
) {
615 if (e
.reqid_is_indexed()) {
616 caller_ops
[e
.reqid
] = &(log
.back());
620 if (indexed_data
& PGLOG_INDEXED_EXTRA_CALLER_OPS
) {
621 for (auto j
= e
.extra_reqids
.begin();
622 j
!= e
.extra_reqids
.end();
624 extra_caller_ops
.insert(std::make_pair(j
->first
, &(log
.back())));
629 skip_can_rollback_to_to_head();
636 std::set
<eversion_t
> *trimmed
,
637 std::set
<std::string
>* trimmed_dups
,
638 eversion_t
*write_from_dups
);
640 std::ostream
& print(std::ostream
& out
) const;
645 //////////////////// data members ////////////////////
647 pg_missing_tracker_t missing
;
650 eversion_t dirty_to
; ///< must clear/writeout all keys <= dirty_to
651 eversion_t dirty_from
; ///< must clear/writeout all keys >= dirty_from
652 eversion_t writeout_from
; ///< must writout keys >= writeout_from
653 std::set
<eversion_t
> trimmed
; ///< must clear keys in trimmed
654 eversion_t dirty_to_dups
; ///< must clear/writeout all dups <= dirty_to_dups
655 eversion_t dirty_from_dups
; ///< must clear/writeout all dups >= dirty_from_dups
656 eversion_t write_from_dups
; ///< must write keys >= write_from_dups
657 std::set
<std::string
> trimmed_dups
; ///< must clear keys in trimmed_dups
660 /// Log is clean on [dirty_to, dirty_from)
663 bool clear_divergent_priors
;
664 bool may_include_deletes_in_missing_dirty
= false;
666 void mark_dirty_to(eversion_t to
) {
670 void mark_dirty_from(eversion_t from
) {
671 if (from
< dirty_from
)
674 void mark_writeout_from(eversion_t from
) {
675 if (from
< writeout_from
)
676 writeout_from
= from
;
678 void mark_dirty_to_dups(eversion_t to
) {
679 if (to
> dirty_to_dups
)
682 void mark_dirty_from_dups(eversion_t from
) {
683 if (from
< dirty_from_dups
)
684 dirty_from_dups
= from
;
687 bool needs_write() const {
688 return !touched_log
|| is_dirty();
691 bool is_dirty() const {
693 (dirty_to
!= eversion_t()) ||
694 (dirty_from
!= eversion_t::max()) ||
695 (writeout_from
!= eversion_t::max()) ||
696 !(trimmed
.empty()) ||
697 !missing
.is_clean() ||
698 !(trimmed_dups
.empty()) ||
699 (dirty_to_dups
!= eversion_t()) ||
700 (dirty_from_dups
!= eversion_t::max()) ||
701 (write_from_dups
!= eversion_t::max()) ||
702 may_include_deletes_in_missing_dirty
;
705 void mark_log_for_rewrite() {
706 mark_dirty_to(eversion_t::max());
707 mark_dirty_from(eversion_t());
708 mark_dirty_to_dups(eversion_t::max());
709 mark_dirty_from_dups(eversion_t());
712 bool get_may_include_deletes_in_missing_dirty() const {
713 return may_include_deletes_in_missing_dirty
;
718 std::set
<std::string
> log_keys_debug
;
719 static void clear_after(std::set
<std::string
> *log_keys_debug
, const std::string
&lb
) {
722 for (auto i
= log_keys_debug
->lower_bound(lb
);
723 i
!= log_keys_debug
->end();
724 log_keys_debug
->erase(i
++));
726 static void clear_up_to(std::set
<std::string
> *log_keys_debug
, const std::string
&ub
) {
729 for (auto i
= log_keys_debug
->begin();
730 i
!= log_keys_debug
->end() && *i
< ub
;
731 log_keys_debug
->erase(i
++));
736 dirty_to
= eversion_t();
737 dirty_from
= eversion_t::max();
741 trimmed_dups
.clear();
742 writeout_from
= eversion_t::max();
745 dirty_to_dups
= eversion_t();
746 dirty_from_dups
= eversion_t::max();
747 write_from_dups
= eversion_t::max();
751 // cppcheck-suppress noExplicitConstructor
752 PGLog(CephContext
*cct
) :
753 dirty_from(eversion_t::max()),
754 writeout_from(eversion_t::max()),
755 dirty_from_dups(eversion_t::max()),
756 write_from_dups(eversion_t::max()),
758 pg_log_debug(!(cct
&& !(cct
->_conf
->osd_debug_pg_log_writeout
))),
761 clear_divergent_priors(false)
764 void reset_backfill();
768 //////////////////// get or std::set missing ////////////////////
770 const pg_missing_tracker_t
& get_missing() const { return missing
; }
772 void missing_add(const hobject_t
& oid
, eversion_t need
, eversion_t have
, bool is_delete
=false) {
773 missing
.add(oid
, need
, have
, is_delete
);
776 void missing_add_next_entry(const pg_log_entry_t
& e
) {
777 missing
.add_next_event(e
);
780 //////////////////// get or std::set log ////////////////////
782 const IndexedLog
&get_log() const { return log
; }
784 const eversion_t
&get_tail() const { return log
.tail
; }
786 void set_tail(eversion_t tail
) { log
.tail
= tail
; }
788 const eversion_t
&get_head() const { return log
.head
; }
790 void set_head(eversion_t head
) { log
.head
= head
; }
792 void set_last_requested(version_t last_requested
) {
793 log
.last_requested
= last_requested
;
796 void index() { log
.index(); }
798 void unindex() { log
.unindex(); }
800 void add(const pg_log_entry_t
& e
, bool applied
= true) {
801 mark_writeout_from(e
.version
);
805 void reset_recovery_pointers() { log
.reset_recovery_pointers(); }
807 static void clear_info_log(
809 ObjectStore::Transaction
*t
);
814 bool transaction_applied
= true,
817 void roll_forward_to(
818 eversion_t roll_forward_to
,
819 LogEntryHandler
*h
) {
820 if (log
.roll_forward_to(
826 eversion_t
get_can_rollback_to() const {
827 return log
.get_can_rollback_to();
830 void roll_forward(LogEntryHandler
*h
) {
836 void skip_rollforward() {
837 log
.skip_can_rollback_to_to_head();
840 //////////////////// get or std::set log & missing ////////////////////
842 void reset_backfill_claim_log(const pg_log_t
&o
, LogEntryHandler
*h
) {
843 log
.trim_rollback_info_to(log
.head
, h
);
844 log
.claim_log_and_clear_rollback_info(o
);
846 mark_dirty_to(eversion_t::max());
847 mark_dirty_to_dups(eversion_t::max());
854 log
.split_out_child(child_pgid
, split_bits
, &opg_log
->log
);
855 missing
.split_into(child_pgid
, split_bits
, &(opg_log
->missing
));
856 opg_log
->mark_dirty_to(eversion_t::max());
857 opg_log
->mark_dirty_to_dups(eversion_t::max());
858 mark_dirty_to(eversion_t::max());
859 mark_dirty_to_dups(eversion_t::max());
860 if (missing
.may_include_deletes
) {
861 opg_log
->set_missing_may_contain_deletes();
866 const std::vector
<PGLog
*>& sources
,
867 eversion_t last_update
) {
871 std::vector
<pg_log_t
*> slogs
;
872 for (auto s
: sources
) {
873 slogs
.push_back(&s
->log
);
875 log
.merge_from(slogs
, last_update
);
879 mark_log_for_rewrite();
882 void recover_got(hobject_t oid
, eversion_t v
, pg_info_t
&info
) {
883 if (missing
.is_missing(oid
, v
)) {
885 info
.stats
.stats
.sum
.num_objects_missing
= missing
.num_missing();
887 // raise last_complete?
888 if (missing
.get_items().empty()) {
889 log
.complete_to
= log
.log
.end();
890 info
.last_complete
= info
.last_update
;
892 auto oldest_need
= missing
.get_oldest_need();
893 while (log
.complete_to
!= log
.log
.end()) {
894 if (oldest_need
<= log
.complete_to
->version
)
896 if (info
.last_complete
< log
.complete_to
->version
)
897 info
.last_complete
= log
.complete_to
->version
;
902 ceph_assert(log
.get_can_rollback_to() >= v
);
905 void reset_complete_to(pg_info_t
*info
) {
906 if (log
.log
.empty()) // caller is split_into()
908 log
.complete_to
= log
.log
.begin();
909 ceph_assert(log
.complete_to
!= log
.log
.end());
910 auto oldest_need
= missing
.get_oldest_need();
911 if (oldest_need
!= eversion_t()) {
912 while (log
.complete_to
->version
< oldest_need
) {
914 ceph_assert(log
.complete_to
!= log
.log
.end());
919 if (log
.complete_to
== log
.log
.begin()) {
920 info
->last_complete
= eversion_t();
923 info
->last_complete
= log
.complete_to
->version
;
928 void activate_not_complete(pg_info_t
&info
) {
929 reset_complete_to(&info
);
930 log
.last_requested
= 0;
933 void proc_replica_log(pg_info_t
&oinfo
,
934 const pg_log_t
&olog
,
935 pg_missing_t
& omissing
, pg_shard_t from
) const;
937 void set_missing_may_contain_deletes() {
938 missing
.may_include_deletes
= true;
939 may_include_deletes_in_missing_dirty
= true;
942 void rebuild_missing_set_with_deletes(ObjectStore
*store
,
943 ObjectStore::CollectionHandle
& ch
,
944 const pg_info_t
&info
);
947 seastar::future
<> rebuild_missing_set_with_deletes_crimson(
948 crimson::os::FuturizedStore::Shard
&store
,
949 crimson::os::CollectionRef ch
,
950 const pg_info_t
&info
);
954 static void split_by_object(
955 mempool::osd_pglog::list
<pg_log_entry_t
> &entries
,
956 std::map
<hobject_t
, mempool::osd_pglog::list
<pg_log_entry_t
>> *out_entries
) {
957 while (!entries
.empty()) {
958 auto &out_list
= (*out_entries
)[entries
.front().soid
];
959 out_list
.splice(out_list
.end(), entries
, entries
.begin());
964 * _merge_object_divergent_entries
966 * There are 5 distinct cases:
967 * 1) There is a more recent update: in this case we assume we adjusted the
968 * store and missing during merge_log
969 * 2) The first entry in the divergent sequence is a create. This might
970 * either be because the object is a clone or because prior_version is
971 * eversion_t(). In this case the object does not exist and we must
972 * adjust missing and the store to match.
973 * 3) We are currently missing the object. In this case, we adjust the
974 * missing to our prior_version taking care to add a divergent_prior
976 * 4) We can rollback all of the entries. In this case, we do so using
977 * the rollbacker and return -- the object does not go into missing.
978 * 5) We cannot rollback at least 1 of the entries. In this case, we
979 * clear the object out of the store and add a missing entry at
980 * prior_version taking care to add a divergent_prior if
983 template <typename missing_type
>
984 static void _merge_object_divergent_entries(
985 const IndexedLog
&log
, ///< [in] log to merge against
986 const hobject_t
&hoid
, ///< [in] object we are merging
987 const mempool::osd_pglog::list
<pg_log_entry_t
> &orig_entries
, ///< [in] entries for hoid to merge
988 const pg_info_t
&info
, ///< [in] info for merging entries
989 eversion_t olog_can_rollback_to
, ///< [in] rollback boundary of input InedexedLog
990 missing_type
&missing
, ///< [in,out] missing to adjust, use
991 LogEntryHandler
*rollbacker
, ///< [in] optional rollbacker object
992 const DoutPrefixProvider
*dpp
///< [in] logging provider
994 ldpp_dout(dpp
, 20) << __func__
<< ": merging hoid " << hoid
995 << " entries: " << orig_entries
<< dendl
;
997 if (hoid
> info
.last_backfill
) {
998 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
<< " after last_backfill"
1003 // entries is non-empty
1004 ceph_assert(!orig_entries
.empty());
1005 // strip out and ignore ERROR entries
1006 mempool::osd_pglog::list
<pg_log_entry_t
> entries
;
1008 bool seen_non_error
= false;
1009 for (auto i
= orig_entries
.begin();
1010 i
!= orig_entries
.end();
1012 // all entries are on hoid
1013 ceph_assert(i
->soid
== hoid
);
1014 // did not see error entries before this entry and this entry is not error
1015 // then this entry is the first non error entry
1016 bool first_non_error
= ! seen_non_error
&& ! i
->is_error();
1017 if (! i
->is_error() ) {
1018 // see a non error entry now
1019 seen_non_error
= true;
1022 // No need to check the first entry since it prior_version is unavailable
1024 // No need to check if the prior_version is the minimal version
1025 // No need to check the first non-error entry since the leading error
1026 // entries are not its prior version
1027 if (i
!= orig_entries
.begin() && i
->prior_version
!= eversion_t() &&
1028 ! first_non_error
) {
1029 // in increasing order of version
1030 ceph_assert(i
->version
> last
);
1031 // prior_version correct (unless it is an ERROR entry)
1032 ceph_assert(i
->prior_version
== last
|| i
->is_error());
1034 if (i
->is_error()) {
1035 ldpp_dout(dpp
, 20) << __func__
<< ": ignoring " << *i
<< dendl
;
1037 ldpp_dout(dpp
, 20) << __func__
<< ": keeping " << *i
<< dendl
;
1038 entries
.push_back(*i
);
1042 if (entries
.empty()) {
1043 ldpp_dout(dpp
, 10) << __func__
<< ": no non-ERROR entries" << dendl
;
1047 const eversion_t prior_version
= entries
.begin()->prior_version
;
1048 const eversion_t first_divergent_update
= entries
.begin()->version
;
1049 const eversion_t last_divergent_update
= entries
.rbegin()->version
;
1050 const bool object_not_in_store
=
1051 !missing
.is_missing(hoid
) &&
1052 entries
.rbegin()->is_delete();
1053 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << " object_not_in_store: "
1054 << object_not_in_store
<< dendl
;
1055 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
1056 << " prior_version: " << prior_version
1057 << " first_divergent_update: " << first_divergent_update
1058 << " last_divergent_update: " << last_divergent_update
1061 auto objiter
= log
.objects
.find(hoid
);
1062 if (objiter
!= log
.objects
.end() &&
1063 objiter
->second
->version
>= first_divergent_update
) {
1065 ldpp_dout(dpp
, 10) << __func__
<< ": more recent entry found: "
1066 << *objiter
->second
<< ", already merged" << dendl
;
1068 ceph_assert(objiter
->second
->version
> last_divergent_update
);
1070 // ensure missing has been updated appropriately
1071 if (objiter
->second
->is_update() ||
1072 (missing
.may_include_deletes
&& objiter
->second
->is_delete())) {
1073 ceph_assert(missing
.is_missing(hoid
) &&
1074 missing
.get_items().at(hoid
).need
== objiter
->second
->version
);
1076 ceph_assert(!missing
.is_missing(hoid
));
1078 missing
.revise_have(hoid
, eversion_t());
1079 missing
.mark_fully_dirty(hoid
);
1081 if (!object_not_in_store
) {
1082 rollbacker
->remove(hoid
);
1084 for (auto &&i
: entries
) {
1085 rollbacker
->trim(i
);
1091 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
1092 <<" has no more recent entries in log" << dendl
;
1093 if (prior_version
== eversion_t() || entries
.front().is_clone()) {
1095 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
1096 << " prior_version or op type indicates creation,"
1099 if (missing
.is_missing(hoid
))
1100 missing
.rm(missing
.get_items().find(hoid
));
1102 if (!object_not_in_store
) {
1103 rollbacker
->remove(hoid
);
1105 for (auto &&i
: entries
) {
1106 rollbacker
->trim(i
);
1112 if (missing
.is_missing(hoid
)) {
1114 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
1115 << " missing, " << missing
.get_items().at(hoid
)
1116 << " adjusting" << dendl
;
1118 if (missing
.get_items().at(hoid
).have
== prior_version
) {
1119 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
1120 << " missing.have is prior_version " << prior_version
1121 << " removing from missing" << dendl
;
1122 missing
.rm(missing
.get_items().find(hoid
));
1124 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
1125 << " missing.have is " << missing
.get_items().at(hoid
).have
1126 << ", adjusting" << dendl
;
1127 missing
.revise_need(hoid
, prior_version
, false);
1128 if (prior_version
<= info
.log_tail
) {
1129 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
1130 << " prior_version " << prior_version
1131 << " <= info.log_tail "
1132 << info
.log_tail
<< dendl
;
1136 for (auto &&i
: entries
) {
1137 rollbacker
->trim(i
);
1143 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
1144 << " must be rolled back or recovered,"
1145 << " attempting to rollback"
1147 bool can_rollback
= true;
1148 // We are going to make an important decision based on the
1149 // olog_can_rollback_to value we have received, better known it.
1150 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
1151 << " olog_can_rollback_to: "
1152 << olog_can_rollback_to
<< dendl
;
1153 /// Distinguish between 4) and 5)
1154 for (auto i
= entries
.rbegin(); i
!= entries
.rend(); ++i
) {
1155 if (!i
->can_rollback() || i
->version
<= olog_can_rollback_to
) {
1156 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
<< " cannot rollback "
1158 can_rollback
= false;
1165 for (auto i
= entries
.rbegin(); i
!= entries
.rend(); ++i
) {
1166 ceph_assert(i
->can_rollback() && i
->version
> olog_can_rollback_to
);
1167 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
1168 << " rolling back " << *i
<< dendl
;
1170 rollbacker
->rollback(*i
);
1172 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
1173 << " rolled back" << dendl
;
1177 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
<< " cannot roll back, "
1178 << "removing and adding to missing" << dendl
;
1180 if (!object_not_in_store
)
1181 rollbacker
->remove(hoid
);
1182 for (auto &&i
: entries
) {
1183 rollbacker
->trim(i
);
1186 missing
.add(hoid
, prior_version
, eversion_t(), false);
1187 if (prior_version
<= info
.log_tail
) {
1188 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
1189 << " prior_version " << prior_version
1190 << " <= info.log_tail "
1191 << info
.log_tail
<< dendl
;
1196 /// Merge all entries using above
1197 template <typename missing_type
>
1198 static void _merge_divergent_entries(
1199 const IndexedLog
&log
, ///< [in] log to merge against
1200 mempool::osd_pglog::list
<pg_log_entry_t
> &entries
, ///< [in] entries to merge
1201 const pg_info_t
&oinfo
, ///< [in] info for merging entries
1202 eversion_t olog_can_rollback_to
, ///< [in] rollback boundary of input IndexedLog
1203 missing_type
&omissing
, ///< [in,out] missing to adjust, use
1204 LogEntryHandler
*rollbacker
, ///< [in] optional rollbacker object
1205 const DoutPrefixProvider
*dpp
///< [in] logging provider
1207 std::map
<hobject_t
, mempool::osd_pglog::list
<pg_log_entry_t
> > split
;
1208 split_by_object(entries
, &split
);
1209 for (auto i
= split
.begin(); i
!= split
.end(); ++i
) {
1210 _merge_object_divergent_entries(
1215 olog_can_rollback_to
,
1223 * Exists for use in TestPGLog for simply testing single divergent log
1226 void merge_old_entry(
1227 ObjectStore::Transaction
& t
,
1228 const pg_log_entry_t
& oe
,
1229 const pg_info_t
& info
,
1230 LogEntryHandler
*rollbacker
) {
1231 mempool::osd_pglog::list
<pg_log_entry_t
> entries
;
1232 entries
.push_back(oe
);
1233 _merge_object_divergent_entries(
1238 log
.get_can_rollback_to(),
1244 bool merge_log_dups(const pg_log_t
& olog
);
1248 void rewind_divergent_log(eversion_t newhead
,
1250 LogEntryHandler
*rollbacker
,
1252 bool &dirty_big_info
);
1254 void merge_log(pg_info_t
&oinfo
,
1257 pg_info_t
&info
, LogEntryHandler
*rollbacker
,
1258 bool &dirty_info
, bool &dirty_big_info
);
1260 template <typename missing_type
>
1261 static bool append_log_entries_update_missing(
1262 const hobject_t
&last_backfill
,
1263 const mempool::osd_pglog::list
<pg_log_entry_t
> &entries
,
1264 bool maintain_rollback
,
1266 missing_type
&missing
,
1267 LogEntryHandler
*rollbacker
,
1268 const DoutPrefixProvider
*dpp
) {
1269 bool invalidate_stats
= false;
1270 if (log
&& !entries
.empty()) {
1271 ceph_assert(log
->head
< entries
.begin()->version
);
1273 for (auto p
= entries
.begin(); p
!= entries
.end(); ++p
) {
1274 invalidate_stats
= invalidate_stats
|| !p
->is_error();
1276 ldpp_dout(dpp
, 20) << "update missing, append " << *p
<< dendl
;
1279 if (p
->soid
<= last_backfill
&&
1281 if (missing
.may_include_deletes
) {
1282 missing
.add_next_event(*p
);
1284 if (p
->is_delete()) {
1285 missing
.rm(p
->soid
, p
->version
);
1287 missing
.add_next_event(*p
);
1290 // hack to match PG::mark_all_unfound_lost
1291 if (maintain_rollback
&& p
->is_lost_delete() && p
->can_rollback()) {
1292 rollbacker
->try_stash(p
->soid
, p
->version
.version
);
1293 } else if (p
->is_delete()) {
1294 rollbacker
->remove(p
->soid
);
1300 return invalidate_stats
;
1302 bool append_new_log_entries(
1303 const hobject_t
&last_backfill
,
1304 const mempool::osd_pglog::list
<pg_log_entry_t
> &entries
,
1305 LogEntryHandler
*rollbacker
) {
1306 bool invalidate_stats
= append_log_entries_update_missing(
1314 if (!entries
.empty()) {
1315 mark_writeout_from(entries
.begin()->version
);
1316 if (entries
.begin()->is_lost_delete()) {
1317 // hack: since lost deletes queue recovery directly, and don't
1318 // go through activate_not_complete() again, our complete_to
1319 // iterator may still point at log.end(). Reset it to point
1320 // before these new lost_delete entries. This only occurs
1321 // when lost+delete entries are initially added, which is
1322 // always in a std::list of solely lost_delete entries, so it is
1323 // sufficient to check whether the first entry is a
1325 reset_complete_to(nullptr);
1328 return invalidate_stats
;
1331 void write_log_and_missing(
1332 ObjectStore::Transaction
& t
,
1333 std::map
<std::string
,ceph::buffer::list
> *km
,
1335 const ghobject_t
&log_oid
,
1336 bool require_rollback
);
1338 static void write_log_and_missing_wo_missing(
1339 ObjectStore::Transaction
& t
,
1340 std::map
<std::string
,ceph::buffer::list
>* km
,
1343 const ghobject_t
&log_oid
, std::map
<eversion_t
, hobject_t
> &divergent_priors
,
1344 bool require_rollback
,
1345 const DoutPrefixProvider
*dpp
= nullptr);
1347 static void write_log_and_missing(
1348 ObjectStore::Transaction
& t
,
1349 std::map
<std::string
,ceph::buffer::list
>* km
,
1352 const ghobject_t
&log_oid
,
1353 const pg_missing_tracker_t
&missing
,
1354 bool require_rollback
,
1355 bool *rebuilt_missing_set_with_deletes
,
1356 const DoutPrefixProvider
*dpp
= nullptr);
1358 static void _write_log_and_missing_wo_missing(
1359 ObjectStore::Transaction
& t
,
1360 std::map
<std::string
,ceph::buffer::list
>* km
,
1362 const coll_t
& coll
, const ghobject_t
&log_oid
,
1363 std::map
<eversion_t
, hobject_t
> &divergent_priors
,
1364 eversion_t dirty_to
,
1365 eversion_t dirty_from
,
1366 eversion_t writeout_from
,
1367 bool dirty_divergent_priors
,
1369 bool require_rollback
,
1370 eversion_t dirty_to_dups
,
1371 eversion_t dirty_from_dups
,
1372 eversion_t write_from_dups
,
1373 std::set
<std::string
> *log_keys_debug
,
1374 const DoutPrefixProvider
*dpp
= nullptr
1377 static void _write_log_and_missing(
1378 ObjectStore::Transaction
& t
,
1379 std::map
<std::string
,ceph::buffer::list
>* km
,
1381 const coll_t
& coll
, const ghobject_t
&log_oid
,
1382 eversion_t dirty_to
,
1383 eversion_t dirty_from
,
1384 eversion_t writeout_from
,
1385 std::set
<eversion_t
> &&trimmed
,
1386 std::set
<std::string
> &&trimmed_dups
,
1387 const pg_missing_tracker_t
&missing
,
1389 bool require_rollback
,
1390 bool clear_divergent_priors
,
1391 eversion_t dirty_to_dups
,
1392 eversion_t dirty_from_dups
,
1393 eversion_t write_from_dups
,
1394 bool *may_include_deletes_in_missing_dirty
,
1395 std::set
<std::string
> *log_keys_debug
,
1396 const DoutPrefixProvider
*dpp
= nullptr
1399 void read_log_and_missing(
1401 ObjectStore::CollectionHandle
& ch
,
1402 ghobject_t pgmeta_oid
,
1403 const pg_info_t
&info
,
1404 std::ostringstream
&oss
,
1405 bool tolerate_divergent_missing_log
,
1406 bool debug_verify_stored_missing
= false
1408 return read_log_and_missing(
1409 cct
, store
, ch
, pgmeta_oid
, info
,
1411 tolerate_divergent_missing_log
,
1412 &clear_divergent_priors
,
1414 (pg_log_debug
? &log_keys_debug
: nullptr),
1415 debug_verify_stored_missing
);
// Reads a PG's log entries, dup ops, and missing set out of the pgmeta
// object's omap: each omap key is dispatched by name ("divergent_priors",
// "can_rollback_to", "rollback_info_trimmed_to",
// "may_include_deletes_in_missing", "missing*", "dup_*", anything else
// decodes as a plain log entry), after which the missing set is rebuilt
// or, in debug mode, verified against on-disk object_info_t versions.
// NOTE(review): the text below is an extraction artifact — statements are
// hard-wrapped and original line numbers are fused into the content, and
// some original lines are absent; code kept byte-identical, comments only
// added.
1418 template <typename missing_type
>
1419 static void read_log_and_missing(
1422 ObjectStore::CollectionHandle
&ch
,
1423 ghobject_t pgmeta_oid
,
1424 const pg_info_t
&info
,
1426 missing_type
&missing
,
1427 std::ostringstream
&oss
,
1428 bool tolerate_divergent_missing_log
,
1429 bool *clear_divergent_priors
= nullptr,
1430 const DoutPrefixProvider
*dpp
= nullptr,
1431 std::set
<std::string
> *log_keys_debug
= nullptr,
1432 bool debug_verify_stored_missing
= false
1434 ldpp_dout(dpp
, 10) << "read_log_and_missing coll " << ch
->cid
1435 << " " << pgmeta_oid
<< dendl
;
// total_dups: count of dup_* keys seen, reported in the closing debug
// line (the per-key increment is not visible in this extraction).
1436 size_t total_dups
= 0;
// the pgmeta object must exist and be empty — its payload is presumably
// omap-only (see the get_omap_iterator use below).
1440 int r
= store
->stat(ch
, pgmeta_oid
, &st
);
1441 ceph_assert(r
== 0);
1442 ceph_assert(st
.st_size
== 0);
1444 // will get overridden below if it had been recorded
1445 eversion_t on_disk_can_rollback_to
= info
.last_update
;
1446 eversion_t on_disk_rollback_info_trimmed_to
= eversion_t();
1447 ObjectMap::ObjectMapIterator p
= store
->get_omap_iterator(ch
,
1449 std::map
<eversion_t
, hobject_t
> divergent_priors
;
1450 bool must_rebuild
= false;
1451 missing
.may_include_deletes
= false;
1452 std::list
<pg_log_entry_t
> entries
;
1453 std::list
<pg_log_dup_t
> dups
;
1454 const auto NUM_DUPS_WARN_THRESHOLD
= 2*cct
->_conf
->osd_pg_log_dups_tracked
;
// walk every omap key, dispatching on the key name
1457 for (p
->seek_to_first(); p
->valid() ; p
->next()) {
1458 // non-log pgmeta_oid keys are prefixed with _; skip those
1459 if (p
->key()[0] == '_')
1461 auto bl
= p
->value();//Copy ceph::buffer::list before creating iterator
1462 auto bp
= bl
.cbegin();
1463 if (p
->key() == "divergent_priors") {
1464 decode(divergent_priors
, bp
);
1465 ldpp_dout(dpp
, 20) << "read_log_and_missing " << divergent_priors
.size()
1466 << " divergent_priors" << dendl
;
// legacy divergent_priors on disk force a missing-set rebuild below
1467 must_rebuild
= true;
1468 debug_verify_stored_missing
= false;
1469 } else if (p
->key() == "can_rollback_to") {
1470 decode(on_disk_can_rollback_to
, bp
);
1471 } else if (p
->key() == "rollback_info_trimmed_to") {
1472 decode(on_disk_rollback_info_trimmed_to
, bp
);
1473 } else if (p
->key() == "may_include_deletes_in_missing") {
1474 missing
.may_include_deletes
= true;
1475 } else if (p
->key().substr(0, 7) == std::string("missing")) {
1477 pg_missing_item item
;
1480 ldpp_dout(dpp
, 20) << "read_log_and_missing " << item
<< dendl
;
1481 if (item
.is_delete()) {
// delete items may only appear when the may-include-deletes flag is set
1482 ceph_assert(missing
.may_include_deletes
);
1484 missing
.add(oid
, std::move(item
));
1485 } else if (p
->key().substr(0, 4) == std::string("dup_")) {
// dup entries must arrive in strictly increasing version order
1489 if (!dups
.empty()) {
1490 ceph_assert(dups
.back().version
< dup
.version
);
1492 if (dups
.size() == NUM_DUPS_WARN_THRESHOLD
) {
1493 ldpp_dout(dpp
, 0) << "read_log_and_missing WARN num of dups exceeded "
1494 << NUM_DUPS_WARN_THRESHOLD
<< "."
1495 << " You can be hit by THE DUPS BUG"
1496 << " https://tracker.ceph.com/issues/53729."
1497 << " Consider ceph-objectstore-tool --op trim-pg-log-dups"
1500 dups
.push_back(dup
);
// fallthrough: any remaining key decodes as a full log entry
1503 e
.decode_with_checksum(bp
);
1504 ldpp_dout(dpp
, 20) << "read_log_and_missing " << e
<< dendl
;
// entries must be strictly increasing by version, non-decreasing epoch
1505 if (!entries
.empty()) {
1506 pg_log_entry_t
last_e(entries
.back());
1507 ceph_assert(last_e
.version
.version
< e
.version
.version
);
1508 ceph_assert(last_e
.version
.epoch
<= e
.version
.epoch
);
1510 entries
.push_back(e
);
1512 log_keys_debug
->insert(e
.get_key_name());
1516 if (info
.pgid
.is_no_shard()) {
1517 // replicated pool pg does not persist this key
1518 assert(on_disk_rollback_info_trimmed_to
== eversion_t());
1519 on_disk_rollback_info_trimmed_to
= info
.last_update
;
1524 on_disk_can_rollback_to
,
1525 on_disk_rollback_info_trimmed_to
,
// rebuild the missing set from the log (or verify the stored one when
// debug_verify_stored_missing is set)
1529 if (must_rebuild
|| debug_verify_stored_missing
) {
1531 if (debug_verify_stored_missing
|| info
.last_complete
< info
.last_update
) {
1533 << "read_log_and_missing checking for missing items over interval ("
1534 << info
.last_complete
1535 << "," << info
.last_update
<< "]" << dendl
;
1537 std::set
<hobject_t
> did
;
1538 std::set
<hobject_t
> checked
;
1539 std::set
<hobject_t
> skipped
;
// newest-first scan: only the most recent entry per object matters
1540 for (auto i
= log
.log
.rbegin(); i
!= log
.log
.rend(); ++i
) {
1541 if (i
->soid
> info
.last_backfill
)
1545 if (did
.count(i
->soid
)) continue;
1546 did
.insert(i
->soid
);
1548 if (!missing
.may_include_deletes
&& i
->is_delete())
// compare the on-disk object_info_t version with the log entry's
1551 ceph::buffer::list bv
;
1552 int r
= store
->getattr(
1554 ghobject_t(i
->soid
, ghobject_t::NO_GEN
, info
.pgid
.shard
),
1558 object_info_t
oi(bv
);
1559 if (oi
.version
< i
->version
) {
1560 ldpp_dout(dpp
, 15) << "read_log_and_missing missing " << *i
1561 << " (have " << oi
.version
<< ")"
1562 << " clean_regions " << i
->clean_regions
<< dendl
;
1564 if (debug_verify_stored_missing
) {
1565 auto miter
= missing
.get_items().find(i
->soid
);
1566 ceph_assert(miter
!= missing
.get_items().end());
1567 ceph_assert(miter
->second
.need
== i
->version
);
1568 // the 'have' version is reset if an object is deleted,
1569 // then created again
1570 ceph_assert(miter
->second
.have
== oi
.version
|| miter
->second
.have
== eversion_t());
1571 checked
.insert(i
->soid
);
1573 missing
.add(i
->soid
, i
->version
, oi
.version
, i
->is_delete());
// no on-disk object_info at all: fully missing
1577 ldpp_dout(dpp
, 15) << "read_log_and_missing missing " << *i
<< dendl
;
1578 if (debug_verify_stored_missing
) {
1579 auto miter
= missing
.get_items().find(i
->soid
);
1580 if (i
->is_delete()) {
1581 ceph_assert(miter
== missing
.get_items().end() ||
1582 (miter
->second
.need
== i
->version
&&
1583 miter
->second
.have
== eversion_t()));
1585 ceph_assert(miter
!= missing
.get_items().end());
1586 ceph_assert(miter
->second
.need
== i
->version
);
1587 ceph_assert(miter
->second
.have
== eversion_t());
1589 checked
.insert(i
->soid
);
1591 missing
.add(i
->soid
, i
->version
, eversion_t(), i
->is_delete());
// verify no stored missing item escaped the log scan above
1595 if (debug_verify_stored_missing
) {
1596 for (auto &&i
: missing
.get_items()) {
1597 if (checked
.count(i
.first
))
1599 if (i
.first
> info
.last_backfill
) {
1600 ldpp_dout(dpp
, -1) << __func__
<< ": invalid missing std::set entry "
1601 << "found before last_backfill: "
1602 << i
.first
<< " " << i
.second
1603 << " last_backfill = " << info
.last_backfill
1605 ceph_abort_msg("invalid missing std::set entry found");
1607 ceph::buffer::list bv
;
1608 int r
= store
->getattr(
1610 ghobject_t(i
.first
, ghobject_t::NO_GEN
, info
.pgid
.shard
),
1614 object_info_t
oi(bv
);
1615 ceph_assert(oi
.version
== i
.second
.have
|| eversion_t() == i
.second
.have
);
1617 ceph_assert(i
.second
.is_delete() || eversion_t() == i
.second
.have
);
// legacy path: fold on-disk divergent_priors into the missing set
// NOTE(review): several guard/brace lines are absent from this extraction
1621 ceph_assert(must_rebuild
);
1622 for (auto i
= divergent_priors
.rbegin();
1623 i
!= divergent_priors
.rend();
1625 if (i
->first
<= info
.last_complete
) break;
1626 if (i
->second
> info
.last_backfill
)
1628 if (did
.count(i
->second
)) continue;
1629 did
.insert(i
->second
);
1630 ceph::buffer::list bv
;
1631 int r
= store
->getattr(
1633 ghobject_t(i
->second
, ghobject_t::NO_GEN
, info
.pgid
.shard
),
1637 object_info_t
oi(bv
);
1639 * 1) we see this entry in the divergent priors mapping
1640 * 2) we didn't see an entry for this object in the log
1642 * From 1 & 2 we know that either the object does not exist
1643 * or it is at the version specified in the divergent_priors
1644 * map since the object would have been deleted atomically
1645 * with the addition of the divergent_priors entry, an older
1646 * version would not have been recovered, and a newer version
1647 * would show up in the log above.
1650 * Unfortunately the assessment above is incorrect because of
1651 * http://tracker.ceph.com/issues/17916 (we were incorrectly
1652 * not removing the divergent_priors std::set from disk state!),
1653 * so let's check that.
1655 if (oi
.version
> i
->first
&& tolerate_divergent_missing_log
) {
1656 ldpp_dout(dpp
, 0) << "read_log divergent_priors entry (" << *i
1657 << ") inconsistent with disk state (" << oi
1658 << "), assuming it is tracker.ceph.com/issues/17916"
1661 ceph_assert(oi
.version
== i
->first
);
1664 ldpp_dout(dpp
, 15) << "read_log_and_missing missing " << *i
<< dendl
;
1665 missing
.add(i
->second
, i
->first
, eversion_t(), false);
// the legacy divergent_priors key can now be removed from disk
1669 if (clear_divergent_priors
)
1670 (*clear_divergent_priors
) = true;
// stored missing set was presumably usable as-is; nothing to clear
1674 if (!must_rebuild
) {
1675 if (clear_divergent_priors
)
1676 (*clear_divergent_priors
) = false;
1679 ldpp_dout(dpp
, 10) << "read_log_and_missing done coll " << ch
->cid
1680 << " total_dups=" << total_dups
1681 << " log.dups.size()=" << log
.dups
.size() << dendl
;
1682 } // static read_log_and_missing
1685 seastar::future
<> read_log_and_missing_crimson(
1686 crimson::os::FuturizedStore::Shard
&store
,
1687 crimson::os::CollectionRef ch
,
1688 const pg_info_t
&info
,
1689 ghobject_t pgmeta_oid
1691 return read_log_and_missing_crimson(
1693 log
, (pg_log_debug
? &log_keys_debug
: nullptr),
1694 missing
, pgmeta_oid
, this);
1697 static seastar::future
<> read_log_and_missing_crimson(
1698 crimson::os::FuturizedStore::Shard
&store
,
1699 crimson::os::CollectionRef ch
,
1700 const pg_info_t
&info
,
1702 std::set
<std::string
>* log_keys_debug
,
1703 pg_missing_tracker_t
&missing
,
1704 ghobject_t pgmeta_oid
,
1705 const DoutPrefixProvider
*dpp
= nullptr);