1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
9 * Author: Loic Dachary <loic@dachary.org>
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
19 // re-include our assert to clobber boost's
20 #include "include/assert.h"
21 #include "osd_types.h"
22 #include "os/ObjectStore.h"
26 #define PGLOG_INDEXED_OBJECTS (1 << 0)
27 #define PGLOG_INDEXED_CALLER_OPS (1 << 1)
28 #define PGLOG_INDEXED_EXTRA_CALLER_OPS (1 << 2)
29 #define PGLOG_INDEXED_DUPS (1 << 3)
30 #define PGLOG_INDEXED_ALL (PGLOG_INDEXED_OBJECTS | \
31 PGLOG_INDEXED_CALLER_OPS | \
32 PGLOG_INDEXED_EXTRA_CALLER_OPS | \
37 struct PGLog
: DoutPrefixProvider
{
38 DoutPrefixProvider
*prefix_provider
;
39 string
gen_prefix() const override
{
40 return prefix_provider
? prefix_provider
->gen_prefix() : "";
42 unsigned get_subsys() const override
{
43 return prefix_provider
? prefix_provider
->get_subsys() :
44 (unsigned)ceph_subsys_osd
;
46 CephContext
*get_cct() const override
{
50 ////////////////////////////// sub classes //////////////////////////////
51 struct LogEntryHandler
{
52 virtual void rollback(
53 const pg_log_entry_t
&entry
) = 0;
54 virtual void rollforward(
55 const pg_log_entry_t
&entry
) = 0;
57 const pg_log_entry_t
&entry
) = 0;
59 const hobject_t
&hoid
) = 0;
60 virtual void try_stash(
61 const hobject_t
&hoid
,
63 virtual ~LogEntryHandler() {}
67 class read_log_and_missing_error
: public buffer::error
{
69 explicit read_log_and_missing_error(const char *what
) {
70 snprintf(buf
, sizeof(buf
), "read_log_and_missing_error: %s", what
);
72 const char *what() const throw () override
{
81 * IndexedLog - adds in-memory index of the log, by oid.
82 * plus some methods to manipulate it all.
84 struct IndexedLog
: public pg_log_t
{
85 mutable ceph::unordered_map
<hobject_t
,pg_log_entry_t
*> objects
; // ptrs into log. be careful!
86 mutable ceph::unordered_map
<osd_reqid_t
,pg_log_entry_t
*> caller_ops
;
87 mutable ceph::unordered_multimap
<osd_reqid_t
,pg_log_entry_t
*> extra_caller_ops
;
88 mutable ceph::unordered_map
<osd_reqid_t
,pg_log_dup_t
*> dup_index
;
91 list
<pg_log_entry_t
>::iterator complete_to
; // not inclusive of referenced item
92 version_t last_requested
= 0; // last object requested by primary
96 mutable __u16 indexed_data
= 0;
98 * rollback_info_trimmed_to_riter points to the first log entry <=
99 * rollback_info_trimmed_to
101 * It's a reverse_iterator because rend() is a natural representation for
102 * tail, and rbegin() works nicely for head.
104 mempool::osd_pglog::list
<pg_log_entry_t
>::reverse_iterator
105 rollback_info_trimmed_to_riter
;
// Advance can_rollback_to (and rollback_info_trimmed_to) forward to 'to',
// invoking f(entry) on each log entry newly covered by the trimmed bound
// via rollback_info_trimmed_to_riter.
// NOTE(review): this view is missing interior lines of the loop (the
// original 'break;' and closing braces) — confirm against the full file
// before editing the loop body.
107 template <typename F
>
108 void advance_can_rollback_to(eversion_t to
, F
&&f
) {
// Bounds only ever move forward.
109 if (to
> can_rollback_to
)
110 can_rollback_to
= to
;
112 if (to
> rollback_info_trimmed_to
)
113 rollback_info_trimmed_to
= to
;
// Walk the reverse iterator back toward older entries; stop once an
// entry's version exceeds the (new) trimmed bound.
115 while (rollback_info_trimmed_to_riter
!= log
.rbegin()) {
116 --rollback_info_trimmed_to_riter
;
117 if (rollback_info_trimmed_to_riter
->version
> rollback_info_trimmed_to
) {
118 ++rollback_info_trimmed_to_riter
;
121 f(*rollback_info_trimmed_to_riter
);
125 void reset_rollback_info_trimmed_to_riter() {
126 rollback_info_trimmed_to_riter
= log
.rbegin();
127 while (rollback_info_trimmed_to_riter
!= log
.rend() &&
128 rollback_info_trimmed_to_riter
->version
> rollback_info_trimmed_to
)
129 ++rollback_info_trimmed_to_riter
;
132 // indexes objects, caller ops and extra caller ops
135 complete_to(log
.end()),
138 rollback_info_trimmed_to_riter(log
.rbegin())
141 template <typename
... Args
>
142 IndexedLog(Args
&&... args
) :
143 pg_log_t(std::forward
<Args
>(args
)...),
144 complete_to(log
.end()),
147 rollback_info_trimmed_to_riter(log
.rbegin())
149 reset_rollback_info_trimmed_to_riter();
153 IndexedLog(const IndexedLog
&rhs
) :
155 complete_to(log
.end()),
156 last_requested(rhs
.last_requested
),
158 rollback_info_trimmed_to_riter(log
.rbegin())
160 reset_rollback_info_trimmed_to_riter();
161 index(rhs
.indexed_data
);
164 IndexedLog
&operator=(const IndexedLog
&rhs
) {
166 new (this) IndexedLog(rhs
);
170 void trim_rollback_info_to(eversion_t to
, LogEntryHandler
*h
) {
171 advance_can_rollback_to(
173 [&](pg_log_entry_t
&entry
) {
177 void roll_forward_to(eversion_t to
, LogEntryHandler
*h
) {
178 advance_can_rollback_to(
180 [&](pg_log_entry_t
&entry
) {
181 h
->rollforward(entry
);
185 void skip_can_rollback_to_to_head() {
186 advance_can_rollback_to(head
, [&](const pg_log_entry_t
&entry
) {});
189 mempool::osd_pglog::list
<pg_log_entry_t
> rewind_from_head(eversion_t newhead
) {
190 auto divergent
= pg_log_t::rewind_from_head(newhead
);
192 reset_rollback_info_trimmed_to_riter();
196 template <typename T
>
198 const eversion_t
&bound
, ///< [in] scan entries > bound
200 auto iter
= log
.rbegin();
201 while (iter
!= log
.rend() && iter
->version
> bound
)
205 if (iter
== log
.rbegin())
212 void claim_log_and_clear_rollback_info(const pg_log_t
& o
) {
213 // we must have already trimmed the old entries
214 assert(rollback_info_trimmed_to
== head
);
215 assert(rollback_info_trimmed_to_riter
== log
.rbegin());
217 *this = IndexedLog(o
);
219 skip_can_rollback_to_to_head();
223 void split_out_child(
229 // we must have already trimmed the old entries
230 assert(rollback_info_trimmed_to
== head
);
231 assert(rollback_info_trimmed_to_riter
== log
.rbegin());
235 rollback_info_trimmed_to_riter
= log
.rbegin();
236 reset_recovery_pointers();
239 skip_can_rollback_to_to_head();
242 void reset_recovery_pointers() {
243 complete_to
= log
.end();
// Whether the log contains any entry for 'oid', per the object index.
// NOTE(review): the branch body that lazily (re)builds the object index
// is missing from this view (original lines 249-250, presumably
// index_objects()) — confirm against the full file before editing.
247 bool logged_object(const hobject_t
& oid
) const {
248 if (!(indexed_data
& PGLOG_INDEXED_OBJECTS
)) {
251 return objects
.count(oid
);
254 bool logged_req(const osd_reqid_t
&r
) const {
255 if (!(indexed_data
& PGLOG_INDEXED_CALLER_OPS
)) {
258 if (!caller_ops
.count(r
)) {
259 if (!(indexed_data
& PGLOG_INDEXED_EXTRA_CALLER_OPS
)) {
260 index_extra_caller_ops();
262 return extra_caller_ops
.count(r
);
268 const osd_reqid_t
&r
,
270 version_t
*user_version
,
271 int *return_code
) const
274 assert(user_version
);
276 ceph::unordered_map
<osd_reqid_t
,pg_log_entry_t
*>::const_iterator p
;
277 if (!(indexed_data
& PGLOG_INDEXED_CALLER_OPS
)) {
280 p
= caller_ops
.find(r
);
281 if (p
!= caller_ops
.end()) {
282 *version
= p
->second
->version
;
283 *user_version
= p
->second
->user_version
;
284 *return_code
= p
->second
->return_code
;
288 // warning: we will return *a* request for this reqid, but not
289 // necessarily the most recent.
290 if (!(indexed_data
& PGLOG_INDEXED_EXTRA_CALLER_OPS
)) {
291 index_extra_caller_ops();
293 p
= extra_caller_ops
.find(r
);
294 if (p
!= extra_caller_ops
.end()) {
295 for (auto i
= p
->second
->extra_reqids
.begin();
296 i
!= p
->second
->extra_reqids
.end();
299 *version
= p
->second
->version
;
300 *user_version
= i
->second
;
301 *return_code
= p
->second
->return_code
;
305 assert(0 == "in extra_caller_ops but not extra_reqids");
308 if (!(indexed_data
& PGLOG_INDEXED_DUPS
)) {
311 auto q
= dup_index
.find(r
);
312 if (q
!= dup_index
.end()) {
313 *version
= q
->second
->version
;
314 *user_version
= q
->second
->user_version
;
315 *return_code
= q
->second
->return_code
;
322 /// get a (bounded) list of recent reqids for the given object
323 void get_object_reqids(const hobject_t
& oid
, unsigned max
,
324 mempool::osd_pglog::vector
<pair
<osd_reqid_t
, version_t
> > *pls
) const {
325 // make sure object is present at least once before we do an
327 if (!(indexed_data
& PGLOG_INDEXED_OBJECTS
)) {
330 if (objects
.count(oid
) == 0)
332 for (list
<pg_log_entry_t
>::const_reverse_iterator i
= log
.rbegin();
335 if (i
->soid
== oid
) {
336 if (i
->reqid_is_indexed())
337 pls
->push_back(make_pair(i
->reqid
, i
->user_version
));
338 pls
->insert(pls
->end(), i
->extra_reqids
.begin(), i
->extra_reqids
.end());
339 if (pls
->size() >= max
) {
340 if (pls
->size() > max
) {
349 void index(__u16 to_index
= PGLOG_INDEXED_ALL
) const {
350 // if to_index is 0, no need to run any of this code, especially
351 // loop below; this can happen with copy constructor for
352 // IndexedLog (and indirectly through assignment operator)
353 if (!to_index
) return;
355 if (to_index
& PGLOG_INDEXED_OBJECTS
)
357 if (to_index
& PGLOG_INDEXED_CALLER_OPS
)
359 if (to_index
& PGLOG_INDEXED_EXTRA_CALLER_OPS
)
360 extra_caller_ops
.clear();
361 if (to_index
& PGLOG_INDEXED_DUPS
) {
363 for (auto& i
: dups
) {
364 dup_index
[i
.reqid
] = const_cast<pg_log_dup_t
*>(&i
);
368 constexpr __u16 any_log_entry_index
=
369 PGLOG_INDEXED_OBJECTS
|
370 PGLOG_INDEXED_CALLER_OPS
|
371 PGLOG_INDEXED_EXTRA_CALLER_OPS
;
373 if (to_index
& any_log_entry_index
) {
374 for (list
<pg_log_entry_t
>::const_iterator i
= log
.begin();
377 if (to_index
& PGLOG_INDEXED_OBJECTS
) {
378 if (i
->object_is_indexed()) {
379 objects
[i
->soid
] = const_cast<pg_log_entry_t
*>(&(*i
));
383 if (to_index
& PGLOG_INDEXED_CALLER_OPS
) {
384 if (i
->reqid_is_indexed()) {
385 caller_ops
[i
->reqid
] = const_cast<pg_log_entry_t
*>(&(*i
));
389 if (to_index
& PGLOG_INDEXED_EXTRA_CALLER_OPS
) {
390 for (auto j
= i
->extra_reqids
.begin();
391 j
!= i
->extra_reqids
.end();
393 extra_caller_ops
.insert(
394 make_pair(j
->first
, const_cast<pg_log_entry_t
*>(&(*i
))));
400 indexed_data
|= to_index
;
// Convenience wrappers: (re)build one specific in-memory index each, by
// forwarding the matching flag to index(__u16).
// NOTE(review): the closing brace of each wrapper is missing from this
// view — garbling artifact, not a code change.
403 void index_objects() const {
404 index(PGLOG_INDEXED_OBJECTS
);
407 void index_caller_ops() const {
408 index(PGLOG_INDEXED_CALLER_OPS
);
411 void index_extra_caller_ops() const {
412 index(PGLOG_INDEXED_EXTRA_CALLER_OPS
);
415 void index_dups() const {
416 index(PGLOG_INDEXED_DUPS
);
419 void index(pg_log_entry_t
& e
) {
420 if ((indexed_data
& PGLOG_INDEXED_OBJECTS
) && e
.object_is_indexed()) {
421 if (objects
.count(e
.soid
) == 0 ||
422 objects
[e
.soid
]->version
< e
.version
)
423 objects
[e
.soid
] = &e
;
425 if (indexed_data
& PGLOG_INDEXED_CALLER_OPS
) {
426 // divergent merge_log indexes new before unindexing old
427 if (e
.reqid_is_indexed()) {
428 caller_ops
[e
.reqid
] = &e
;
431 if (indexed_data
& PGLOG_INDEXED_EXTRA_CALLER_OPS
) {
432 for (auto j
= e
.extra_reqids
.begin();
433 j
!= e
.extra_reqids
.end();
435 extra_caller_ops
.insert(make_pair(j
->first
, &e
));
443 extra_caller_ops
.clear();
448 void unindex(const pg_log_entry_t
& e
) {
449 // NOTE: this only works if we remove from the _tail_ of the log!
450 if (indexed_data
& PGLOG_INDEXED_OBJECTS
) {
451 if (objects
.count(e
.soid
) && objects
[e
.soid
]->version
== e
.version
)
452 objects
.erase(e
.soid
);
454 if (e
.reqid_is_indexed()) {
455 if (indexed_data
& PGLOG_INDEXED_CALLER_OPS
) {
456 // divergent merge_log indexes new before unindexing old
457 if (caller_ops
.count(e
.reqid
) && caller_ops
[e
.reqid
] == &e
)
458 caller_ops
.erase(e
.reqid
);
461 if (indexed_data
& PGLOG_INDEXED_EXTRA_CALLER_OPS
) {
462 for (auto j
= e
.extra_reqids
.begin();
463 j
!= e
.extra_reqids
.end();
465 for (ceph::unordered_multimap
<osd_reqid_t
,pg_log_entry_t
*>::iterator k
=
466 extra_caller_ops
.find(j
->first
);
467 k
!= extra_caller_ops
.end() && k
->first
== j
->first
;
469 if (k
->second
== &e
) {
470 extra_caller_ops
.erase(k
);
478 void index(pg_log_dup_t
& e
) {
479 if (indexed_data
& PGLOG_INDEXED_DUPS
) {
480 dup_index
[e
.reqid
] = &e
;
484 void unindex(const pg_log_dup_t
& e
) {
485 if (indexed_data
& PGLOG_INDEXED_DUPS
) {
486 auto i
= dup_index
.find(e
.reqid
);
487 if (i
!= dup_index
.end()) {
494 void add(const pg_log_entry_t
& e
, bool applied
= true) {
496 assert(get_can_rollback_to() == head
);
499 // make sure our buffers don't pin bigger buffers
500 e
.mod_desc
.trim_bl();
505 // riter previously pointed to the previous entry
506 if (rollback_info_trimmed_to_riter
== log
.rbegin())
507 ++rollback_info_trimmed_to_riter
;
509 assert(e
.version
> head
);
510 assert(head
.version
== 0 || e
.version
.version
> head
.version
);
514 if ((indexed_data
& PGLOG_INDEXED_OBJECTS
) && e
.object_is_indexed()) {
515 objects
[e
.soid
] = &(log
.back());
517 if (indexed_data
& PGLOG_INDEXED_CALLER_OPS
) {
518 if (e
.reqid_is_indexed()) {
519 caller_ops
[e
.reqid
] = &(log
.back());
523 if (indexed_data
& PGLOG_INDEXED_EXTRA_CALLER_OPS
) {
524 for (auto j
= e
.extra_reqids
.begin();
525 j
!= e
.extra_reqids
.end();
527 extra_caller_ops
.insert(make_pair(j
->first
, &(log
.back())));
532 skip_can_rollback_to_to_head();
539 set
<eversion_t
> *trimmed
,
540 set
<string
>* trimmed_dups
,
541 eversion_t
*write_from_dups
);
543 ostream
& print(ostream
& out
) const;
548 //////////////////// data members ////////////////////
550 pg_missing_tracker_t missing
;
553 eversion_t dirty_to
; ///< must clear/writeout all keys <= dirty_to
554 eversion_t dirty_from
; ///< must clear/writeout all keys >= dirty_from
555 eversion_t writeout_from
; ///< must writout keys >= writeout_from
556 set
<eversion_t
> trimmed
; ///< must clear keys in trimmed
557 eversion_t dirty_to_dups
; ///< must clear/writeout all dups <= dirty_to_dups
558 eversion_t dirty_from_dups
; ///< must clear/writeout all dups >= dirty_from_dups
559 eversion_t write_from_dups
; ///< must write keys >= write_from_dups
560 set
<string
> trimmed_dups
; ///< must clear keys in trimmed_dups
563 /// Log is clean on [dirty_to, dirty_from)
565 bool clear_divergent_priors
;
566 bool rebuilt_missing_with_deletes
= false;
// Widen the dirty upper bound: keys <= dirty_to must be cleared/written.
// NOTE(review): mark_dirty_to's body is missing from this view (original
// lines 569-570), and mark_dirty_from's assignment (line 574) is also
// absent — confirm against the full file before editing.
568 void mark_dirty_to(eversion_t to
) {
// Widen the dirty lower bound: keys >= dirty_from must be cleared/written.
572 void mark_dirty_from(eversion_t from
) {
573 if (from
< dirty_from
)
576 void mark_writeout_from(eversion_t from
) {
577 if (from
< writeout_from
)
578 writeout_from
= from
;
// Widen the dup dirty upper bound: dups <= dirty_to_dups must be
// cleared/written.
// NOTE(review): the assignment body (original lines 582-583) is missing
// from this view — confirm against the full file before editing.
580 void mark_dirty_to_dups(eversion_t to
) {
581 if (to
> dirty_to_dups
)
584 void mark_dirty_from_dups(eversion_t from
) {
585 if (from
< dirty_from_dups
)
586 dirty_from_dups
= from
;
589 bool is_dirty() const {
590 return !touched_log
||
591 (dirty_to
!= eversion_t()) ||
592 (dirty_from
!= eversion_t::max()) ||
593 (writeout_from
!= eversion_t::max()) ||
594 !(trimmed
.empty()) ||
595 !missing
.is_clean() ||
596 !(trimmed_dups
.empty()) ||
597 (dirty_to_dups
!= eversion_t()) ||
598 (dirty_from_dups
!= eversion_t::max()) ||
599 (write_from_dups
!= eversion_t::max()) ||
600 rebuilt_missing_with_deletes
;
// Mark the entire log and dup range dirty so the next write persists
// everything: dirty_to/dirty_to_dups go to max(), dirty_from/
// dirty_from_dups go to the minimum.
// NOTE(review): the tail of this body (original lines 607-608) is
// missing from this view — confirm against the full file before editing.
602 void mark_log_for_rewrite() {
603 mark_dirty_to(eversion_t::max());
604 mark_dirty_from(eversion_t());
605 mark_dirty_to_dups(eversion_t::max());
606 mark_dirty_from_dups(eversion_t());
// Accessor: whether the missing set was rebuilt including delete entries.
609 bool get_rebuilt_missing_with_deletes() const {
610 return rebuilt_missing_with_deletes
;
615 set
<string
> log_keys_debug
;
// Debug-key bookkeeping: erase from *log_keys_debug every key >= lb.
// Uses the erase(i++) idiom so the iterator stays valid across erasure.
// NOTE(review): the guard lines original 617-618 (presumably an early
// return when log_keys_debug is null) are missing from this view —
// confirm against the full file before editing.
616 static void clear_after(set
<string
> *log_keys_debug
, const string
&lb
) {
619 for (set
<string
>::iterator i
= log_keys_debug
->lower_bound(lb
);
620 i
!= log_keys_debug
->end();
621 log_keys_debug
->erase(i
++));
// Debug-key bookkeeping: erase from *log_keys_debug every key < ub.
// NOTE(review): guard lines original 624-625 are likewise missing here.
623 static void clear_up_to(set
<string
> *log_keys_debug
, const string
&ub
) {
626 for (set
<string
>::iterator i
= log_keys_debug
->begin();
627 i
!= log_keys_debug
->end() && *i
< ub
;
628 log_keys_debug
->erase(i
++));
633 dirty_to
= eversion_t();
634 dirty_from
= eversion_t::max();
637 trimmed_dups
.clear();
638 writeout_from
= eversion_t::max();
641 dirty_to_dups
= eversion_t();
642 dirty_from_dups
= eversion_t::max();
643 write_from_dups
= eversion_t::max();
647 // cppcheck-suppress noExplicitConstructor
648 PGLog(CephContext
*cct
, DoutPrefixProvider
*dpp
= nullptr) :
649 prefix_provider(dpp
),
650 dirty_from(eversion_t::max()),
651 writeout_from(eversion_t::max()),
652 dirty_from_dups(eversion_t::max()),
653 write_from_dups(eversion_t::max()),
655 pg_log_debug(!(cct
&& !(cct
->_conf
->osd_debug_pg_log_writeout
))),
657 clear_divergent_priors(false)
660 void reset_backfill();
664 //////////////////// get or set missing ////////////////////
666 const pg_missing_tracker_t
& get_missing() const { return missing
; }
667 void revise_have(hobject_t oid
, eversion_t have
) {
668 missing
.revise_have(oid
, have
);
671 void missing_add(const hobject_t
& oid
, eversion_t need
, eversion_t have
) {
672 missing
.add(oid
, need
, have
, false);
675 //////////////////// get or set log ////////////////////
677 const IndexedLog
&get_log() const { return log
; }
679 const eversion_t
&get_tail() const { return log
.tail
; }
681 void set_tail(eversion_t tail
) { log
.tail
= tail
; }
683 const eversion_t
&get_head() const { return log
.head
; }
685 void set_head(eversion_t head
) { log
.head
= head
; }
687 void set_last_requested(version_t last_requested
) {
688 log
.last_requested
= last_requested
;
691 void index() { log
.index(); }
693 void unindex() { log
.unindex(); }
695 void add(const pg_log_entry_t
& e
, bool applied
= true) {
696 mark_writeout_from(e
.version
);
700 void reset_recovery_pointers() { log
.reset_recovery_pointers(); }
702 static void clear_info_log(
704 ObjectStore::Transaction
*t
);
709 bool transaction_applied
= true);
711 void roll_forward_to(
712 eversion_t roll_forward_to
,
713 LogEntryHandler
*h
) {
719 eversion_t
get_can_rollback_to() const {
720 return log
.get_can_rollback_to();
723 void roll_forward(LogEntryHandler
*h
) {
729 //////////////////// get or set log & missing ////////////////////
731 void reset_backfill_claim_log(const pg_log_t
&o
, LogEntryHandler
*h
) {
732 log
.trim_rollback_info_to(log
.head
, h
);
733 log
.claim_log_and_clear_rollback_info(o
);
735 mark_dirty_to(eversion_t::max());
736 mark_dirty_to_dups(eversion_t::max());
743 log
.split_out_child(child_pgid
, split_bits
, &opg_log
->log
);
744 missing
.split_into(child_pgid
, split_bits
, &(opg_log
->missing
));
745 opg_log
->mark_dirty_to(eversion_t::max());
746 opg_log
->mark_dirty_to_dups(eversion_t::max());
747 mark_dirty_to(eversion_t::max());
748 mark_dirty_to_dups(eversion_t::max());
749 if (missing
.may_include_deletes
)
750 opg_log
->rebuilt_missing_with_deletes
= true;
753 void recover_got(hobject_t oid
, eversion_t v
, pg_info_t
&info
) {
754 if (missing
.is_missing(oid
, v
)) {
757 // raise last_complete?
758 if (missing
.get_items().empty()) {
759 log
.complete_to
= log
.log
.end();
760 info
.last_complete
= info
.last_update
;
762 while (log
.complete_to
!= log
.log
.end()) {
763 if (missing
.get_items().at(
764 missing
.get_rmissing().begin()->second
765 ).need
<= log
.complete_to
->version
)
767 if (info
.last_complete
< log
.complete_to
->version
)
768 info
.last_complete
= log
.complete_to
->version
;
773 assert(log
.get_can_rollback_to() >= v
);
776 void reset_complete_to(pg_info_t
*info
) {
777 log
.complete_to
= log
.log
.begin();
778 while (!missing
.get_items().empty() && log
.complete_to
->version
<
779 missing
.get_items().at(
780 missing
.get_rmissing().begin()->second
782 assert(log
.complete_to
!= log
.log
.end());
785 assert(log
.complete_to
!= log
.log
.end());
786 if (log
.complete_to
== log
.log
.begin()) {
788 info
->last_complete
= eversion_t();
792 info
->last_complete
= log
.complete_to
->version
;
797 void activate_not_complete(pg_info_t
&info
) {
798 reset_complete_to(&info
);
799 log
.last_requested
= 0;
802 void proc_replica_log(pg_info_t
&oinfo
,
803 const pg_log_t
&olog
,
804 pg_missing_t
& omissing
, pg_shard_t from
) const;
806 void rebuild_missing_set_with_deletes(ObjectStore
*store
,
808 const pg_info_t
&info
);
811 static void split_by_object(
812 mempool::osd_pglog::list
<pg_log_entry_t
> &entries
,
813 map
<hobject_t
, mempool::osd_pglog::list
<pg_log_entry_t
>> *out_entries
) {
814 while (!entries
.empty()) {
815 auto &out_list
= (*out_entries
)[entries
.front().soid
];
816 out_list
.splice(out_list
.end(), entries
, entries
.begin());
821 * _merge_object_divergent_entries
823 * There are 5 distinct cases:
824 * 1) There is a more recent update: in this case we assume we adjusted the
825 * store and missing during merge_log
826 * 2) The first entry in the divergent sequence is a create. This might
827 * either be because the object is a clone or because prior_version is
828 * eversion_t(). In this case the object does not exist and we must
829 * adjust missing and the store to match.
830 * 3) We are currently missing the object. In this case, we adjust the
831 * missing to our prior_version taking care to add a divergent_prior
833 * 4) We can rollback all of the entries. In this case, we do so using
834 * the rollbacker and return -- the object does not go into missing.
835 * 5) We cannot rollback at least 1 of the entries. In this case, we
836 * clear the object out of the store and add a missing entry at
837 * prior_version taking care to add a divergent_prior if
840 template <typename missing_type
>
841 static void _merge_object_divergent_entries(
842 const IndexedLog
&log
, ///< [in] log to merge against
843 const hobject_t
&hoid
, ///< [in] object we are merging
844 const mempool::osd_pglog::list
<pg_log_entry_t
> &orig_entries
, ///< [in] entries for hoid to merge
845 const pg_info_t
&info
, ///< [in] info for merging entries
846 eversion_t olog_can_rollback_to
, ///< [in] rollback boundary
847 missing_type
&missing
, ///< [in,out] missing to adjust, use
848 LogEntryHandler
*rollbacker
, ///< [in] optional rollbacker object
849 const DoutPrefixProvider
*dpp
///< [in] logging provider
851 ldpp_dout(dpp
, 20) << __func__
<< ": merging hoid " << hoid
852 << " entries: " << orig_entries
<< dendl
;
854 if (hoid
> info
.last_backfill
) {
855 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
<< " after last_backfill"
860 // entries is non-empty
861 assert(!orig_entries
.empty());
862 // strip out and ignore ERROR entries
863 mempool::osd_pglog::list
<pg_log_entry_t
> entries
;
865 bool seen_non_error
= false;
866 for (list
<pg_log_entry_t
>::const_iterator i
= orig_entries
.begin();
867 i
!= orig_entries
.end();
869 // all entries are on hoid
870 assert(i
->soid
== hoid
);
871 // did not see error entries before this entry and this entry is not error
872 // then this entry is the first non error entry
873 bool first_non_error
= ! seen_non_error
&& ! i
->is_error();
874 if (! i
->is_error() ) {
875 // see a non error entry now
876 seen_non_error
= true;
879 // No need to check the first entry since it prior_version is unavailable
881 // No need to check if the prior_version is the minimal version
882 // No need to check the first non-error entry since the leading error
883 // entries are not its prior version
884 if (i
!= orig_entries
.begin() && i
->prior_version
!= eversion_t() &&
886 // in increasing order of version
887 assert(i
->version
> last
);
888 // prior_version correct (unless it is an ERROR entry)
889 assert(i
->prior_version
== last
|| i
->is_error());
892 ldpp_dout(dpp
, 20) << __func__
<< ": ignoring " << *i
<< dendl
;
894 ldpp_dout(dpp
, 20) << __func__
<< ": keeping " << *i
<< dendl
;
895 entries
.push_back(*i
);
899 if (entries
.empty()) {
900 ldpp_dout(dpp
, 10) << __func__
<< ": no non-ERROR entries" << dendl
;
904 const eversion_t prior_version
= entries
.begin()->prior_version
;
905 const eversion_t first_divergent_update
= entries
.begin()->version
;
906 const eversion_t last_divergent_update
= entries
.rbegin()->version
;
907 const bool object_not_in_store
=
908 !missing
.is_missing(hoid
) &&
909 entries
.rbegin()->is_delete();
910 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
911 << " prior_version: " << prior_version
912 << " first_divergent_update: " << first_divergent_update
913 << " last_divergent_update: " << last_divergent_update
916 ceph::unordered_map
<hobject_t
, pg_log_entry_t
*>::const_iterator objiter
=
917 log
.objects
.find(hoid
);
918 if (objiter
!= log
.objects
.end() &&
919 objiter
->second
->version
>= first_divergent_update
) {
921 ldpp_dout(dpp
, 10) << __func__
<< ": more recent entry found: "
922 << *objiter
->second
<< ", already merged" << dendl
;
924 assert(objiter
->second
->version
> last_divergent_update
);
926 // ensure missing has been updated appropriately
927 if (objiter
->second
->is_update() ||
928 (missing
.may_include_deletes
&& objiter
->second
->is_delete())) {
929 assert(missing
.is_missing(hoid
) &&
930 missing
.get_items().at(hoid
).need
== objiter
->second
->version
);
932 assert(!missing
.is_missing(hoid
));
934 missing
.revise_have(hoid
, eversion_t());
936 if (!object_not_in_store
) {
937 rollbacker
->remove(hoid
);
939 for (auto &&i
: entries
) {
946 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
947 <<" has no more recent entries in log" << dendl
;
948 if (prior_version
== eversion_t() || entries
.front().is_clone()) {
950 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
951 << " prior_version or op type indicates creation,"
954 if (missing
.is_missing(hoid
))
955 missing
.rm(missing
.get_items().find(hoid
));
957 if (!object_not_in_store
) {
958 rollbacker
->remove(hoid
);
960 for (auto &&i
: entries
) {
967 if (missing
.is_missing(hoid
)) {
969 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
970 << " missing, " << missing
.get_items().at(hoid
)
971 << " adjusting" << dendl
;
973 if (missing
.get_items().at(hoid
).have
== prior_version
) {
974 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
975 << " missing.have is prior_version " << prior_version
976 << " removing from missing" << dendl
;
977 missing
.rm(missing
.get_items().find(hoid
));
979 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
980 << " missing.have is " << missing
.get_items().at(hoid
).have
981 << ", adjusting" << dendl
;
982 missing
.revise_need(hoid
, prior_version
, false);
983 if (prior_version
<= info
.log_tail
) {
984 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
985 << " prior_version " << prior_version
986 << " <= info.log_tail "
987 << info
.log_tail
<< dendl
;
991 for (auto &&i
: entries
) {
998 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
999 << " must be rolled back or recovered,"
1000 << " attempting to rollback"
1002 bool can_rollback
= true;
1003 /// Distinguish between 4) and 5)
1004 for (list
<pg_log_entry_t
>::const_reverse_iterator i
= entries
.rbegin();
1005 i
!= entries
.rend();
1007 if (!i
->can_rollback() || i
->version
<= olog_can_rollback_to
) {
1008 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
<< " cannot rollback "
1010 can_rollback
= false;
1017 for (list
<pg_log_entry_t
>::const_reverse_iterator i
= entries
.rbegin();
1018 i
!= entries
.rend();
1020 assert(i
->can_rollback() && i
->version
> olog_can_rollback_to
);
1021 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
1022 << " rolling back " << *i
<< dendl
;
1024 rollbacker
->rollback(*i
);
1026 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
1027 << " rolled back" << dendl
;
1031 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
<< " cannot roll back, "
1032 << "removing and adding to missing" << dendl
;
1034 if (!object_not_in_store
)
1035 rollbacker
->remove(hoid
);
1036 for (auto &&i
: entries
) {
1037 rollbacker
->trim(i
);
1040 missing
.add(hoid
, prior_version
, eversion_t(), false);
1041 if (prior_version
<= info
.log_tail
) {
1042 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
1043 << " prior_version " << prior_version
1044 << " <= info.log_tail "
1045 << info
.log_tail
<< dendl
;
// Merge a batch of divergent log entries against the authoritative log.
// Entries are first grouped per object (split_by_object), then each
// object's run of entries is merged via _merge_object_divergent_entries.
1050 /// Merge all entries using above
1051 template <typename missing_type
>
1052 static void _merge_divergent_entries(
1053 const IndexedLog
&log
, ///< [in] log to merge against
1054 mempool::osd_pglog::list
<pg_log_entry_t
> &entries
, ///< [in] entries to merge
1055 const pg_info_t
&oinfo
, ///< [in] info for merging entries
1056 eversion_t olog_can_rollback_to
, ///< [in] rollback boundary
1057 missing_type
&omissing
, ///< [in,out] missing to adjust, use
1058 LogEntryHandler
*rollbacker
, ///< [in] optional rollbacker object
1059 const DoutPrefixProvider
*dpp
///< [in] logging provider
// Bucket the incoming entries by the object (hobject_t) they touch.
1061 map
<hobject_t
, mempool::osd_pglog::list
<pg_log_entry_t
> > split
;
1062 split_by_object(entries
, &split
);
// Handle each object's divergent entries independently.
1063 for (map
<hobject_t
, mempool::osd_pglog::list
<pg_log_entry_t
>>::iterator i
= split
.begin();
1066 _merge_object_divergent_entries(
1071 olog_can_rollback_to
,
// Merge a single old (divergent) log entry: wraps it in a one-element
// list and funnels it through _merge_object_divergent_entries using this
// log's own rollback boundary (log.get_can_rollback_to()).
1079 * Exists for use in TestPGLog for simply testing single divergent log
1082 void merge_old_entry(
1083 ObjectStore::Transaction
& t
,
1084 const pg_log_entry_t
& oe
,
1085 const pg_info_t
& info
,
1086 LogEntryHandler
*rollbacker
) {
// Single-entry list so the multi-entry merge machinery can be reused.
1087 mempool::osd_pglog::list
<pg_log_entry_t
> entries
;
1088 entries
.push_back(oe
);
1089 _merge_object_divergent_entries(
1094 log
.get_can_rollback_to(),
/// Merge the dup-op history from another osd's log (olog) into ours.
/// Returns whether anything changed — presumably whether the dup index
/// became dirty; confirm against the implementation.
1100 bool merge_log_dups(const pg_log_t
& olog
);
/// Rewind our log to newhead, handing entries past the new head to
/// rollbacker for rollback/removal; sets dirty_big_info when the
/// persisted big info needs rewriting.
1104 void rewind_divergent_log(eversion_t newhead
,
1106 LogEntryHandler
*rollbacker
,
1108 bool &dirty_big_info
);
/// Merge an authoritative log/info (oinfo/olog — some parameters elided
/// here) into our own log and info, using rollbacker for divergent
/// entries; sets dirty_info/dirty_big_info when persisted state changed.
1110 void merge_log(pg_info_t
&oinfo
,
1113 pg_info_t
&info
, LogEntryHandler
*rollbacker
,
1114 bool &dirty_info
, bool &dirty_big_info
);
// Append already-authoritative log entries and update the missing set to
// match.  Returns true if pg stats should be invalidated (set whenever
// any appended entry is a non-error entry).
1116 template <typename missing_type
>
1117 static bool append_log_entries_update_missing(
1118 const hobject_t
&last_backfill
,
1119 bool last_backfill_bitwise
,
1120 const mempool::osd_pglog::list
<pg_log_entry_t
> &entries
,
1121 bool maintain_rollback
,
1123 missing_type
&missing
,
1124 LogEntryHandler
*rollbacker
,
1125 const DoutPrefixProvider
*dpp
) {
1126 bool invalidate_stats
= false;
// New entries must strictly follow the current log head.
1127 if (log
&& !entries
.empty()) {
1128 assert(log
->head
< entries
.begin()->version
);
1130 for (list
<pg_log_entry_t
>::const_iterator p
= entries
.begin();
// Error entries do not perturb stats; anything else invalidates them.
1133 invalidate_stats
= invalidate_stats
|| !p
->is_error();
1135 ldpp_dout(dpp
, 20) << "update missing, append " << *p
<< dendl
;
// Only objects at or before last_backfill are tracked in missing
// (additional condition elided in this excerpt).
1138 if (p
->soid
<= last_backfill
&&
1140 if (missing
.may_include_deletes
) {
// Delete-aware missing set: record the event as-is.
1141 missing
.add_next_event(*p
);
1143 if (p
->is_delete()) {
// Legacy missing set: a delete simply drops the object from missing.
1144 missing
.rm(p
->soid
, p
->version
);
1146 missing
.add_next_event(*p
);
1149 // hack to match PG::mark_all_unfound_lost
1150 if (maintain_rollback
&& p
->is_lost_delete() && p
->can_rollback()) {
// Stash rather than remove so the lost+delete can still be rolled back.
1151 rollbacker
->try_stash(p
->soid
, p
->version
.version
);
1152 } else if (p
->is_delete()) {
1153 rollbacker
->remove(p
->soid
);
1159 return invalidate_stats
;
// Append new log entries via append_log_entries_update_missing, mark the
// appended range for writeout, and reset complete_to when the batch
// begins with a lost_delete (see the original comment below).
1161 bool append_new_log_entries(
1162 const hobject_t
&last_backfill
,
1163 bool last_backfill_bitwise
,
1164 const mempool::osd_pglog::list
<pg_log_entry_t
> &entries
,
1165 LogEntryHandler
*rollbacker
) {
1166 bool invalidate_stats
= append_log_entries_update_missing(
1168 last_backfill_bitwise
,
1175 if (!entries
.empty()) {
// Persist everything from the first appended version onward.
1176 mark_writeout_from(entries
.begin()->version
);
1177 if (entries
.begin()->is_lost_delete()) {
1178 // hack: since lost deletes queue recovery directly, and don't
1179 // go through activate_not_complete() again, our complete_to
1180 // iterator may still point at log.end(). Reset it to point
1181 // before these new lost_delete entries. This only occurs
1182 // when lost+delete entries are initially added, which is
1183 // always in a list of solely lost_delete entries, so it is
1184 // sufficient to check whether the first entry is a
1186 reset_complete_to(nullptr);
1189 return invalidate_stats
;
/// Serialize this PGLog's log and missing set into km (omap key/value
/// updates) and transaction t against log_oid.
1192 void write_log_and_missing(
1193 ObjectStore::Transaction
& t
,
1194 map
<string
,bufferlist
> *km
,
1196 const ghobject_t
&log_oid
,
1197 bool require_rollback
);
/// Write the log plus a legacy divergent_priors map, without writing a
/// missing set (legacy/tool path).
1199 static void write_log_and_missing_wo_missing(
1200 ObjectStore::Transaction
& t
,
1201 map
<string
,bufferlist
>* km
,
1204 const ghobject_t
&log_oid
, map
<eversion_t
, hobject_t
> &divergent_priors
,
1205 bool require_rollback
);
/// Static variant: write an explicit log and missing tracker;
/// rebuilt_missing_set_with_deletes reports whether the missing set was
/// rebuilt in the delete-aware format.
1207 static void write_log_and_missing(
1208 ObjectStore::Transaction
& t
,
1209 map
<string
,bufferlist
>* km
,
1212 const ghobject_t
&log_oid
,
1213 const pg_missing_tracker_t
&missing
,
1214 bool require_rollback
,
1215 bool *rebuilt_missing_set_with_deletes
);
/// Low-level writer (no missing set): emits omap updates for the log
/// entries in [dirty_from, dirty_to) / from writeout_from, removes
/// trimmed entry and dup keys, and optionally rewrites divergent_priors.
/// The dup_* key range is governed by the *_dups versions.
/// log_keys_debug, when non-null, mirrors the keys written (debugging).
1217 static void _write_log_and_missing_wo_missing(
1218 ObjectStore::Transaction
& t
,
1219 map
<string
,bufferlist
>* km
,
1221 const coll_t
& coll
, const ghobject_t
&log_oid
,
1222 map
<eversion_t
, hobject_t
> &divergent_priors
,
1223 eversion_t dirty_to
,
1224 eversion_t dirty_from
,
1225 eversion_t writeout_from
,
1226 const set
<eversion_t
> &trimmed
,
1227 const set
<string
> &trimmed_dups
,
1228 bool dirty_divergent_priors
,
1230 bool require_rollback
,
1231 eversion_t dirty_to_dups
,
1232 eversion_t dirty_from_dups
,
1233 eversion_t write_from_dups
,
1234 set
<string
> *log_keys_debug
/// Low-level writer including the missing tracker; can also clear the
/// legacy divergent_priors key (clear_divergent_priors) and report via
/// rebuilt_missing_with_deletes when the delete-aware format was written.
1237 static void _write_log_and_missing(
1238 ObjectStore::Transaction
& t
,
1239 map
<string
,bufferlist
>* km
,
1241 const coll_t
& coll
, const ghobject_t
&log_oid
,
1242 eversion_t dirty_to
,
1243 eversion_t dirty_from
,
1244 eversion_t writeout_from
,
1245 const set
<eversion_t
> &trimmed
,
1246 const set
<string
> &trimmed_dups
,
1247 const pg_missing_tracker_t
&missing
,
1249 bool require_rollback
,
1250 bool clear_divergent_priors
,
1251 eversion_t dirty_to_dups
,
1252 eversion_t dirty_from_dups
,
1253 eversion_t write_from_dups
,
1254 bool *rebuilt_missing_with_deletes
,
1255 set
<string
> *log_keys_debug
// Non-static convenience wrapper: forwards to the static template
// read_log_and_missing() below, supplying this PGLog's own log, missing
// set, clear_divergent_priors flag, and (when pg_log_debug) key-debug set.
1258 void read_log_and_missing(
1263 const pg_info_t
&info
,
1264 bool force_rebuild_missing
,
1266 bool tolerate_divergent_missing_log
,
1267 bool debug_verify_stored_missing
= false
1269 return read_log_and_missing(
1270 store
, pg_coll
, log_coll
, log_oid
, info
,
1271 log
, missing
, force_rebuild_missing
, oss
,
1272 tolerate_divergent_missing_log
,
1273 &clear_divergent_priors
,
1275 (pg_log_debug
? &log_keys_debug
: nullptr),
1276 debug_verify_stored_missing
);
// Load the pg log, dup entries, and missing set from the log object's
// omap.  Rebuilds the missing set when forced or when a legacy on-disk
// "divergent_priors" key is found, and can cross-check the stored missing
// set against on-disk object_info (debug_verify_stored_missing).
1279 template <typename missing_type
>
1280 static void read_log_and_missing(
1285 const pg_info_t
&info
,
1287 missing_type
&missing
,
1288 bool force_rebuild_missing
,
1290 bool tolerate_divergent_missing_log
,
1291 bool *clear_divergent_priors
= nullptr,
1292 const DoutPrefixProvider
*dpp
= nullptr,
1293 set
<string
> *log_keys_debug
= nullptr,
1294 bool debug_verify_stored_missing
= false
1296 ldpp_dout(dpp
, 20) << "read_log_and_missing coll " << pg_coll
1297 << " log_oid " << log_oid
<< dendl
;
// Log state lives entirely in omap; the object's data payload is empty.
1301 int r
= store
->stat(log_coll
, log_oid
, &st
);
1303 assert(st
.st_size
== 0);
1305 // will get overridden below if it had been recorded
1306 eversion_t on_disk_can_rollback_to
= info
.last_update
;
1307 eversion_t on_disk_rollback_info_trimmed_to
= eversion_t();
1308 ObjectMap::ObjectMapIterator p
= store
->get_omap_iterator(log_coll
, log_oid
);
1309 map
<eversion_t
, hobject_t
> divergent_priors
;
1310 bool must_rebuild
= force_rebuild_missing
;
// Assume legacy (no-deletes) missing format until the marker key says otherwise.
1311 missing
.may_include_deletes
= false;
1312 list
<pg_log_entry_t
> entries
;
1313 list
<pg_log_dup_t
> dups
;
// Single pass over all omap keys, dispatching on the key name.
1315 for (p
->seek_to_first(); p
->valid() ; p
->next(false)) {
1316 // non-log pgmeta_oid keys are prefixed with _; skip those
1317 if (p
->key()[0] == '_')
1319 bufferlist bl
= p
->value();//Copy bufferlist before creating iterator
1320 bufferlist::iterator bp
= bl
.begin();
1321 if (p
->key() == "divergent_priors") {
// Legacy key: its presence forces a missing-set rebuild and disables
// the stored-missing verification (there is no stored missing to verify).
1322 ::decode(divergent_priors
, bp
);
1323 ldpp_dout(dpp
, 20) << "read_log_and_missing " << divergent_priors
.size()
1324 << " divergent_priors" << dendl
;
1325 must_rebuild
= true;
1326 debug_verify_stored_missing
= false;
1327 } else if (p
->key() == "can_rollback_to") {
1328 ::decode(on_disk_can_rollback_to
, bp
);
1329 } else if (p
->key() == "rollback_info_trimmed_to") {
1330 ::decode(on_disk_rollback_info_trimmed_to
, bp
);
1331 } else if (p
->key() == "may_include_deletes_in_missing") {
1332 missing
.may_include_deletes
= true;
1333 } else if (p
->key().substr(0, 7) == string("missing")) {
// Persisted missing-set item (decode of oid/item elided in this excerpt).
1335 pg_missing_item item
;
1338 if (item
.is_delete()) {
// Delete items are only valid in the delete-aware format.
1339 assert(missing
.may_include_deletes
);
1341 missing
.add(oid
, item
.need
, item
.have
, item
.is_delete());
1342 } else if (p
->key().substr(0, 4) == string("dup_")) {
// Dup entries must arrive in strictly increasing version order.
1345 if (!dups
.empty()) {
1346 assert(dups
.back().version
< dup
.version
);
1348 dups
.push_back(dup
);
// Any other key is a log entry, stored with a checksum.
1351 e
.decode_with_checksum(bp
);
1352 ldpp_dout(dpp
, 20) << "read_log_and_missing " << e
<< dendl
;
1353 if (!entries
.empty()) {
1354 pg_log_entry_t
last_e(entries
.back());
// Versions strictly increase; epochs are non-decreasing.
1355 assert(last_e
.version
.version
< e
.version
.version
);
1356 assert(last_e
.version
.epoch
<= e
.version
.epoch
);
1358 entries
.push_back(e
);
1360 log_keys_debug
->insert(e
.get_key_name());
1367 on_disk_can_rollback_to
,
1368 on_disk_rollback_info_trimmed_to
,
// Rebuild (or verify) the missing set by walking the log backwards over
// the interval (last_complete, last_update] and consulting on-disk
// object_info for each object.
1372 if (must_rebuild
|| debug_verify_stored_missing
) {
1374 if (debug_verify_stored_missing
|| info
.last_complete
< info
.last_update
) {
1376 << "read_log_and_missing checking for missing items over interval ("
1377 << info
.last_complete
1378 << "," << info
.last_update
<< "]" << dendl
;
1381 set
<hobject_t
> checked
;
1382 set
<hobject_t
> skipped
;
1383 for (list
<pg_log_entry_t
>::reverse_iterator i
= log
.log
.rbegin();
1384 i
!= log
.log
.rend();
// When only rebuilding, stop once we reach entries we already have.
1386 if (!debug_verify_stored_missing
&& i
->version
<= info
.last_complete
) break;
// Objects past last_backfill are not ours to track.
1387 if (i
->soid
> info
.last_backfill
)
// Only the newest entry per object matters (reverse iteration).
1391 if (did
.count(i
->soid
)) continue;
1392 did
.insert(i
->soid
);
1394 if (!missing
.may_include_deletes
&& i
->is_delete())
1398 int r
= store
->getattr(
1400 ghobject_t(i
->soid
, ghobject_t::NO_GEN
, info
.pgid
.shard
),
1404 object_info_t
oi(bv
);
// Object exists but is older than the log says: it is missing.
1405 if (oi
.version
< i
->version
) {
1406 ldpp_dout(dpp
, 15) << "read_log_and_missing missing " << *i
1407 << " (have " << oi
.version
<< ")" << dendl
;
1408 if (debug_verify_stored_missing
) {
1409 auto miter
= missing
.get_items().find(i
->soid
);
1410 assert(miter
!= missing
.get_items().end());
1411 assert(miter
->second
.need
== i
->version
);
1412 // the 'have' version is reset if an object is deleted,
1413 // then created again
1414 assert(miter
->second
.have
== oi
.version
|| miter
->second
.have
== eversion_t());
1415 checked
.insert(i
->soid
);
1417 missing
.add(i
->soid
, i
->version
, oi
.version
, i
->is_delete());
// Object absent on disk (getattr failed path): missing with have == 0'0.
1421 ldpp_dout(dpp
, 15) << "read_log_and_missing missing " << *i
<< dendl
;
1422 if (debug_verify_stored_missing
) {
1423 auto miter
= missing
.get_items().find(i
->soid
);
1424 if (i
->is_delete()) {
// A delete may legitimately be absent from the stored missing set.
1425 assert(miter
== missing
.get_items().end() ||
1426 (miter
->second
.need
== i
->version
&&
1427 miter
->second
.have
== eversion_t()));
1429 assert(miter
!= missing
.get_items().end());
1430 assert(miter
->second
.need
== i
->version
);
1431 assert(miter
->second
.have
== eversion_t());
1433 checked
.insert(i
->soid
);
1435 missing
.add(i
->soid
, i
->version
, eversion_t(), i
->is_delete());
// Verification pass: every stored missing item not covered by the log
// walk above must still be consistent with on-disk state.
1439 if (debug_verify_stored_missing
) {
1440 for (auto &&i
: missing
.get_items()) {
1441 if (checked
.count(i
.first
))
1443 if (i
.first
> info
.last_backfill
) {
1444 ldpp_dout(dpp
, -1) << __func__
<< ": invalid missing set entry "
1445 << "found before last_backfill: "
1446 << i
.first
<< " " << i
.second
1447 << " last_backfill = " << info
.last_backfill
1449 assert(0 == "invalid missing set entry found");
1452 int r
= store
->getattr(
1454 ghobject_t(i
.first
, ghobject_t::NO_GEN
, info
.pgid
.shard
),
1458 object_info_t
oi(bv
);
1459 assert(oi
.version
== i
.second
.have
|| eversion_t() == i
.second
.have
);
1461 assert(i
.second
.is_delete() || eversion_t() == i
.second
.have
);
// Legacy path: fold divergent_priors entries (newest first) into missing.
1465 assert(must_rebuild
);
1466 for (map
<eversion_t
, hobject_t
>::reverse_iterator i
=
1467 divergent_priors
.rbegin();
1468 i
!= divergent_priors
.rend();
1470 if (i
->first
<= info
.last_complete
) break;
1471 if (i
->second
> info
.last_backfill
)
1473 if (did
.count(i
->second
)) continue;
1474 did
.insert(i
->second
);
1476 int r
= store
->getattr(
1478 ghobject_t(i
->second
, ghobject_t::NO_GEN
, info
.pgid
.shard
),
1482 object_info_t
oi(bv
);
1484 * 1) we see this entry in the divergent priors mapping
1485 * 2) we didn't see an entry for this object in the log
1487 * From 1 & 2 we know that either the object does not exist
1488 * or it is at the version specified in the divergent_priors
1489 * map since the object would have been deleted atomically
1490 * with the addition of the divergent_priors entry, an older
1491 * version would not have been recovered, and a newer version
1492 * would show up in the log above.
1495 * Unfortunately the assessment above is incorrect because of
1496 * http://tracker.ceph.com/issues/17916 (we were incorrectly
1497 * not removing the divergent_priors set from disk state!),
1498 * so let's check that.
1500 if (oi
.version
> i
->first
&& tolerate_divergent_missing_log
) {
1501 ldpp_dout(dpp
, 0) << "read_log divergent_priors entry (" << *i
1502 << ") inconsistent with disk state (" << oi
1503 << "), assuming it is tracker.ceph.com/issues/17916"
1506 assert(oi
.version
== i
->first
);
1509 ldpp_dout(dpp
, 15) << "read_log_and_missing missing " << *i
<< dendl
;
1510 missing
.add(i
->second
, i
->first
, eversion_t(), false);
// We rebuilt from divergent_priors, so the on-disk key can be cleared.
1514 if (clear_divergent_priors
)
1515 (*clear_divergent_priors
) = true;
// No rebuild happened: leave the on-disk divergent_priors state alone.
1519 if (!must_rebuild
) {
1520 if (clear_divergent_priors
)
1521 (*clear_divergent_priors
) = false;
1524 ldpp_dout(dpp
, 10) << "read_log_and_missing done" << dendl
;
1525 } // static read_log_and_missing