1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */
20 // re-include our assert to clobber boost's
21 #include "include/assert.h"
22 #include "osd_types.h"
23 #include "os/ObjectStore.h"
27 #define PGLOG_INDEXED_OBJECTS (1 << 0)
28 #define PGLOG_INDEXED_CALLER_OPS (1 << 1)
29 #define PGLOG_INDEXED_EXTRA_CALLER_OPS (1 << 2)
30 #define PGLOG_INDEXED_ALL (PGLOG_INDEXED_OBJECTS | PGLOG_INDEXED_CALLER_OPS | PGLOG_INDEXED_EXTRA_CALLER_OPS)
34 struct PGLog
: DoutPrefixProvider
{
35 DoutPrefixProvider
*prefix_provider
;
36 string
gen_prefix() const override
{
37 return prefix_provider
? prefix_provider
->gen_prefix() : "";
39 unsigned get_subsys() const override
{
40 return prefix_provider
? prefix_provider
->get_subsys() :
41 (unsigned)ceph_subsys_osd
;
43 CephContext
*get_cct() const override
{
47 ////////////////////////////// sub classes //////////////////////////////
48 struct LogEntryHandler
{
49 virtual void rollback(
50 const pg_log_entry_t
&entry
) = 0;
51 virtual void rollforward(
52 const pg_log_entry_t
&entry
) = 0;
54 const pg_log_entry_t
&entry
) = 0;
56 const hobject_t
&hoid
) = 0;
57 virtual void try_stash(
58 const hobject_t
&hoid
,
60 virtual ~LogEntryHandler() {}
64 class read_log_and_missing_error
: public buffer::error
{
66 explicit read_log_and_missing_error(const char *what
) {
67 snprintf(buf
, sizeof(buf
), "read_log_and_missing_error: %s", what
);
69 const char *what() const throw () override
{
78 * IndexLog - adds in-memory index of the log, by oid.
79 * plus some methods to manipulate it all.
81 struct IndexedLog
: public pg_log_t
{
82 mutable ceph::unordered_map
<hobject_t
,pg_log_entry_t
*> objects
; // ptrs into log. be careful!
83 mutable ceph::unordered_map
<osd_reqid_t
,pg_log_entry_t
*> caller_ops
;
84 mutable ceph::unordered_multimap
<osd_reqid_t
,pg_log_entry_t
*> extra_caller_ops
;
87 list
<pg_log_entry_t
>::iterator complete_to
; // not inclusive of referenced item
88 version_t last_requested
= 0; // last object requested by primary
92 mutable __u16 indexed_data
= 0;
94 * rollback_info_trimmed_to_riter points to the first log entry <=
95 * rollback_info_trimmed_to
97 * It's a reverse_iterator because rend() is a natural representation for
98 * tail, and rbegin() works nicely for head.
100 mempool::osd_pglog::list
<pg_log_entry_t
>::reverse_iterator
101 rollback_info_trimmed_to_riter
;
103 template <typename F
>
104 void advance_can_rollback_to(eversion_t to
, F
&&f
) {
105 if (to
> can_rollback_to
)
106 can_rollback_to
= to
;
108 if (to
> rollback_info_trimmed_to
)
109 rollback_info_trimmed_to
= to
;
111 while (rollback_info_trimmed_to_riter
!= log
.rbegin()) {
112 --rollback_info_trimmed_to_riter
;
113 if (rollback_info_trimmed_to_riter
->version
> rollback_info_trimmed_to
) {
114 ++rollback_info_trimmed_to_riter
;
117 f(*rollback_info_trimmed_to_riter
);
121 void reset_rollback_info_trimmed_to_riter() {
122 rollback_info_trimmed_to_riter
= log
.rbegin();
123 while (rollback_info_trimmed_to_riter
!= log
.rend() &&
124 rollback_info_trimmed_to_riter
->version
> rollback_info_trimmed_to
)
125 ++rollback_info_trimmed_to_riter
;
128 // indexes objects, caller ops and extra caller ops
131 complete_to(log
.end()),
134 rollback_info_trimmed_to_riter(log
.rbegin())
137 template <typename
... Args
>
138 IndexedLog(Args
&&... args
) :
139 pg_log_t(std::forward
<Args
>(args
)...),
140 complete_to(log
.end()),
143 rollback_info_trimmed_to_riter(log
.rbegin()) {
144 reset_rollback_info_trimmed_to_riter();
148 IndexedLog(const IndexedLog
&rhs
) :
150 complete_to(log
.end()),
151 last_requested(rhs
.last_requested
),
153 rollback_info_trimmed_to_riter(log
.rbegin()) {
154 reset_rollback_info_trimmed_to_riter();
155 index(rhs
.indexed_data
);
157 IndexedLog
&operator=(const IndexedLog
&rhs
) {
159 new (this) IndexedLog(rhs
);
163 void trim_rollback_info_to(eversion_t to
, LogEntryHandler
*h
) {
164 advance_can_rollback_to(
166 [&](pg_log_entry_t
&entry
) {
170 void roll_forward_to(eversion_t to
, LogEntryHandler
*h
) {
171 advance_can_rollback_to(
173 [&](pg_log_entry_t
&entry
) {
174 h
->rollforward(entry
);
178 void skip_can_rollback_to_to_head() {
179 advance_can_rollback_to(head
, [&](const pg_log_entry_t
&entry
) {});
182 mempool::osd_pglog::list
<pg_log_entry_t
> rewind_from_head(eversion_t newhead
) {
183 auto divergent
= pg_log_t::rewind_from_head(newhead
);
185 reset_rollback_info_trimmed_to_riter();
189 template <typename T
>
191 const eversion_t
&bound
, ///< [in] scan entries > bound
193 auto iter
= log
.rbegin();
194 while (iter
!= log
.rend() && iter
->version
> bound
)
198 if (iter
== log
.rbegin())
205 void claim_log_and_clear_rollback_info(const pg_log_t
& o
) {
206 // we must have already trimmed the old entries
207 assert(rollback_info_trimmed_to
== head
);
208 assert(rollback_info_trimmed_to_riter
== log
.rbegin());
210 *this = IndexedLog(o
);
212 skip_can_rollback_to_to_head();
216 void split_out_child(
222 // we must have already trimmed the old entries
223 assert(rollback_info_trimmed_to
== head
);
224 assert(rollback_info_trimmed_to_riter
== log
.rbegin());
228 rollback_info_trimmed_to_riter
= log
.rbegin();
229 reset_recovery_pointers();
232 skip_can_rollback_to_to_head();
235 void reset_recovery_pointers() {
236 complete_to
= log
.end();
240 bool logged_object(const hobject_t
& oid
) const {
241 if (!(indexed_data
& PGLOG_INDEXED_OBJECTS
)) {
244 return objects
.count(oid
);
247 bool logged_req(const osd_reqid_t
&r
) const {
248 if (!(indexed_data
& PGLOG_INDEXED_CALLER_OPS
)) {
251 if (!caller_ops
.count(r
)) {
252 if (!(indexed_data
& PGLOG_INDEXED_EXTRA_CALLER_OPS
)) {
253 index_extra_caller_ops();
255 return extra_caller_ops
.count(r
);
261 const osd_reqid_t
&r
,
263 version_t
*user_version
,
264 int *return_code
) const {
266 assert(user_version
);
268 ceph::unordered_map
<osd_reqid_t
,pg_log_entry_t
*>::const_iterator p
;
269 if (!(indexed_data
& PGLOG_INDEXED_CALLER_OPS
)) {
272 p
= caller_ops
.find(r
);
273 if (p
!= caller_ops
.end()) {
274 *version
= p
->second
->version
;
275 *user_version
= p
->second
->user_version
;
276 *return_code
= p
->second
->return_code
;
280 // warning: we will return *a* request for this reqid, but not
281 // necessarily the most recent.
282 if (!(indexed_data
& PGLOG_INDEXED_EXTRA_CALLER_OPS
)) {
283 index_extra_caller_ops();
285 p
= extra_caller_ops
.find(r
);
286 if (p
!= extra_caller_ops
.end()) {
287 for (auto i
= p
->second
->extra_reqids
.begin();
288 i
!= p
->second
->extra_reqids
.end();
291 *version
= p
->second
->version
;
292 *user_version
= i
->second
;
293 *return_code
= p
->second
->return_code
;
297 assert(0 == "in extra_caller_ops but not extra_reqids");
302 /// get a (bounded) list of recent reqids for the given object
303 void get_object_reqids(const hobject_t
& oid
, unsigned max
,
304 mempool::osd_pglog::vector
<pair
<osd_reqid_t
, version_t
> > *pls
) const {
305 // make sure object is present at least once before we do an
307 if (!(indexed_data
& PGLOG_INDEXED_OBJECTS
)) {
310 if (objects
.count(oid
) == 0)
312 for (list
<pg_log_entry_t
>::const_reverse_iterator i
= log
.rbegin();
315 if (i
->soid
== oid
) {
316 if (i
->reqid_is_indexed())
317 pls
->push_back(make_pair(i
->reqid
, i
->user_version
));
318 pls
->insert(pls
->end(), i
->extra_reqids
.begin(), i
->extra_reqids
.end());
319 if (pls
->size() >= max
) {
320 if (pls
->size() > max
) {
329 void index(__u16 to_index
= PGLOG_INDEXED_ALL
) const {
330 if (to_index
& PGLOG_INDEXED_OBJECTS
)
332 if (to_index
& PGLOG_INDEXED_CALLER_OPS
)
334 if (to_index
& PGLOG_INDEXED_EXTRA_CALLER_OPS
)
335 extra_caller_ops
.clear();
337 for (list
<pg_log_entry_t
>::const_iterator i
= log
.begin();
340 if (to_index
& PGLOG_INDEXED_OBJECTS
) {
341 if (i
->object_is_indexed()) {
342 objects
[i
->soid
] = const_cast<pg_log_entry_t
*>(&(*i
));
346 if (to_index
& PGLOG_INDEXED_CALLER_OPS
) {
347 if (i
->reqid_is_indexed()) {
348 caller_ops
[i
->reqid
] = const_cast<pg_log_entry_t
*>(&(*i
));
352 if (to_index
& PGLOG_INDEXED_EXTRA_CALLER_OPS
) {
353 for (auto j
= i
->extra_reqids
.begin();
354 j
!= i
->extra_reqids
.end();
356 extra_caller_ops
.insert(
357 make_pair(j
->first
, const_cast<pg_log_entry_t
*>(&(*i
))));
362 indexed_data
|= to_index
;
365 void index_objects() const {
366 index(PGLOG_INDEXED_OBJECTS
);
369 void index_caller_ops() const {
370 index(PGLOG_INDEXED_CALLER_OPS
);
373 void index_extra_caller_ops() const {
374 index(PGLOG_INDEXED_EXTRA_CALLER_OPS
);
377 void index(pg_log_entry_t
& e
) {
378 if ((indexed_data
& PGLOG_INDEXED_OBJECTS
) && e
.object_is_indexed()) {
379 if (objects
.count(e
.soid
) == 0 ||
380 objects
[e
.soid
]->version
< e
.version
)
381 objects
[e
.soid
] = &e
;
383 if (indexed_data
& PGLOG_INDEXED_CALLER_OPS
) {
384 // divergent merge_log indexes new before unindexing old
385 if (e
.reqid_is_indexed()) {
386 caller_ops
[e
.reqid
] = &e
;
389 if (indexed_data
& PGLOG_INDEXED_EXTRA_CALLER_OPS
) {
390 for (auto j
= e
.extra_reqids
.begin();
391 j
!= e
.extra_reqids
.end();
393 extra_caller_ops
.insert(make_pair(j
->first
, &e
));
400 extra_caller_ops
.clear();
403 void unindex(pg_log_entry_t
& e
) {
404 // NOTE: this only works if we remove from the _tail_ of the log!
405 if (indexed_data
& PGLOG_INDEXED_OBJECTS
) {
406 if (objects
.count(e
.soid
) && objects
[e
.soid
]->version
== e
.version
)
407 objects
.erase(e
.soid
);
409 if (e
.reqid_is_indexed()) {
410 if (indexed_data
& PGLOG_INDEXED_CALLER_OPS
) {
411 // divergent merge_log indexes new before unindexing old
412 if (caller_ops
.count(e
.reqid
) && caller_ops
[e
.reqid
] == &e
)
413 caller_ops
.erase(e
.reqid
);
416 if (indexed_data
& PGLOG_INDEXED_EXTRA_CALLER_OPS
) {
417 for (auto j
= e
.extra_reqids
.begin();
418 j
!= e
.extra_reqids
.end();
420 for (ceph::unordered_multimap
<osd_reqid_t
,pg_log_entry_t
*>::iterator k
=
421 extra_caller_ops
.find(j
->first
);
422 k
!= extra_caller_ops
.end() && k
->first
== j
->first
;
424 if (k
->second
== &e
) {
425 extra_caller_ops
.erase(k
);
434 void add(const pg_log_entry_t
& e
, bool applied
= true) {
436 assert(get_can_rollback_to() == head
);
439 // make sure our buffers don't pin bigger buffers
440 e
.mod_desc
.trim_bl();
445 // riter previously pointed to the previous entry
446 if (rollback_info_trimmed_to_riter
== log
.rbegin())
447 ++rollback_info_trimmed_to_riter
;
449 assert(e
.version
> head
);
450 assert(head
.version
== 0 || e
.version
.version
> head
.version
);
454 if ((indexed_data
& PGLOG_INDEXED_OBJECTS
) && e
.object_is_indexed()) {
455 objects
[e
.soid
] = &(log
.back());
457 if (indexed_data
& PGLOG_INDEXED_CALLER_OPS
) {
458 if (e
.reqid_is_indexed()) {
459 caller_ops
[e
.reqid
] = &(log
.back());
463 if (indexed_data
& PGLOG_INDEXED_EXTRA_CALLER_OPS
) {
464 for (auto j
= e
.extra_reqids
.begin();
465 j
!= e
.extra_reqids
.end();
467 extra_caller_ops
.insert(make_pair(j
->first
, &(log
.back())));
472 skip_can_rollback_to_to_head();
479 set
<eversion_t
> *trimmed
);
481 ostream
& print(ostream
& out
) const;
486 //////////////////// data members ////////////////////
488 pg_missing_tracker_t missing
;
491 eversion_t dirty_to
; ///< must clear/writeout all keys <= dirty_to
492 eversion_t dirty_from
; ///< must clear/writeout all keys >= dirty_from
493 eversion_t writeout_from
; ///< must writout keys >= writeout_from
494 set
<eversion_t
> trimmed
; ///< must clear keys in trimmed
497 /// Log is clean on [dirty_to, dirty_from)
499 bool clear_divergent_priors
;
501 void mark_dirty_to(eversion_t to
) {
505 void mark_dirty_from(eversion_t from
) {
506 if (from
< dirty_from
)
509 void mark_writeout_from(eversion_t from
) {
510 if (from
< writeout_from
)
511 writeout_from
= from
;
514 bool is_dirty() const {
515 return !touched_log
||
516 (dirty_to
!= eversion_t()) ||
517 (dirty_from
!= eversion_t::max()) ||
518 (writeout_from
!= eversion_t::max()) ||
519 !(trimmed
.empty()) ||
522 void mark_log_for_rewrite() {
523 mark_dirty_to(eversion_t::max());
524 mark_dirty_from(eversion_t());
530 set
<string
> log_keys_debug
;
531 static void clear_after(set
<string
> *log_keys_debug
, const string
&lb
) {
534 for (set
<string
>::iterator i
= log_keys_debug
->lower_bound(lb
);
535 i
!= log_keys_debug
->end();
536 log_keys_debug
->erase(i
++));
538 static void clear_up_to(set
<string
> *log_keys_debug
, const string
&ub
) {
541 for (set
<string
>::iterator i
= log_keys_debug
->begin();
542 i
!= log_keys_debug
->end() && *i
< ub
;
543 log_keys_debug
->erase(i
++));
548 dirty_to
= eversion_t();
549 dirty_from
= eversion_t::max();
552 writeout_from
= eversion_t::max();
557 // cppcheck-suppress noExplicitConstructor
558 PGLog(CephContext
*cct
, DoutPrefixProvider
*dpp
= 0) :
559 prefix_provider(dpp
),
560 dirty_from(eversion_t::max()),
561 writeout_from(eversion_t::max()),
563 pg_log_debug(!(cct
&& !(cct
->_conf
->osd_debug_pg_log_writeout
))),
565 clear_divergent_priors(false) {}
568 void reset_backfill();
572 //////////////////// get or set missing ////////////////////
574 const pg_missing_tracker_t
& get_missing() const { return missing
; }
575 void revise_have(hobject_t oid
, eversion_t have
) {
576 missing
.revise_have(oid
, have
);
579 void revise_need(hobject_t oid
, eversion_t need
) {
580 missing
.revise_need(oid
, need
);
583 void missing_add(const hobject_t
& oid
, eversion_t need
, eversion_t have
) {
584 missing
.add(oid
, need
, have
);
587 //////////////////// get or set log ////////////////////
589 const IndexedLog
&get_log() const { return log
; }
591 const eversion_t
&get_tail() const { return log
.tail
; }
593 void set_tail(eversion_t tail
) { log
.tail
= tail
; }
595 const eversion_t
&get_head() const { return log
.head
; }
597 void set_head(eversion_t head
) { log
.head
= head
; }
599 void set_last_requested(version_t last_requested
) {
600 log
.last_requested
= last_requested
;
603 void index() { log
.index(); }
605 void unindex() { log
.unindex(); }
607 void add(const pg_log_entry_t
& e
, bool applied
= true) {
608 mark_writeout_from(e
.version
);
612 void reset_recovery_pointers() { log
.reset_recovery_pointers(); }
614 static void clear_info_log(
616 ObjectStore::Transaction
*t
);
622 void roll_forward_to(
623 eversion_t roll_forward_to
,
624 LogEntryHandler
*h
) {
630 eversion_t
get_can_rollback_to() const {
631 return log
.get_can_rollback_to();
634 void roll_forward(LogEntryHandler
*h
) {
640 //////////////////// get or set log & missing ////////////////////
642 void reset_backfill_claim_log(const pg_log_t
&o
, LogEntryHandler
*h
) {
643 log
.trim_rollback_info_to(log
.head
, h
);
644 log
.claim_log_and_clear_rollback_info(o
);
646 mark_dirty_to(eversion_t::max());
653 log
.split_out_child(child_pgid
, split_bits
, &opg_log
->log
);
654 missing
.split_into(child_pgid
, split_bits
, &(opg_log
->missing
));
655 opg_log
->mark_dirty_to(eversion_t::max());
656 mark_dirty_to(eversion_t::max());
659 void recover_got(hobject_t oid
, eversion_t v
, pg_info_t
&info
) {
660 if (missing
.is_missing(oid
, v
)) {
663 // raise last_complete?
664 if (missing
.get_items().empty()) {
665 log
.complete_to
= log
.log
.end();
666 info
.last_complete
= info
.last_update
;
668 while (log
.complete_to
!= log
.log
.end()) {
669 if (missing
.get_items().at(
670 missing
.get_rmissing().begin()->second
671 ).need
<= log
.complete_to
->version
)
673 if (info
.last_complete
< log
.complete_to
->version
)
674 info
.last_complete
= log
.complete_to
->version
;
679 assert(log
.get_can_rollback_to() >= v
);
682 void activate_not_complete(pg_info_t
&info
) {
683 log
.complete_to
= log
.log
.begin();
684 while (log
.complete_to
->version
<
685 missing
.get_items().at(
686 missing
.get_rmissing().begin()->second
689 assert(log
.complete_to
!= log
.log
.end());
690 if (log
.complete_to
== log
.log
.begin()) {
691 info
.last_complete
= eversion_t();
694 info
.last_complete
= log
.complete_to
->version
;
697 log
.last_requested
= 0;
700 void proc_replica_log(pg_info_t
&oinfo
,
701 const pg_log_t
&olog
,
702 pg_missing_t
& omissing
, pg_shard_t from
) const;
705 static void split_by_object(
706 mempool::osd_pglog::list
<pg_log_entry_t
> &entries
,
707 map
<hobject_t
, mempool::osd_pglog::list
<pg_log_entry_t
>> *out_entries
) {
708 while (!entries
.empty()) {
709 auto &out_list
= (*out_entries
)[entries
.front().soid
];
710 out_list
.splice(out_list
.end(), entries
, entries
.begin());
715 * _merge_object_divergent_entries
717 * There are 5 distinct cases:
718 * 1) There is a more recent update: in this case we assume we adjusted the
719 * store and missing during merge_log
720 * 2) The first entry in the divergent sequence is a create. This might
721 * either be because the object is a clone or because prior_version is
722 * eversion_t(). In this case the object does not exist and we must
723 * adjust missing and the store to match.
724 * 3) We are currently missing the object. In this case, we adjust the
725 * missing to our prior_version taking care to add a divergent_prior
727 * 4) We can rollback all of the entries. In this case, we do so using
728 * the rollbacker and return -- the object does not go into missing.
729 * 5) We cannot rollback at least 1 of the entries. In this case, we
730 * clear the object out of the store and add a missing entry at
731 * prior_version taking care to add a divergent_prior if
734 template <typename missing_type
>
735 static void _merge_object_divergent_entries(
736 const IndexedLog
&log
, ///< [in] log to merge against
737 const hobject_t
&hoid
, ///< [in] object we are merging
738 const mempool::osd_pglog::list
<pg_log_entry_t
> &orig_entries
, ///< [in] entries for hoid to merge
739 const pg_info_t
&info
, ///< [in] info for merging entries
740 eversion_t olog_can_rollback_to
, ///< [in] rollback boundary
741 missing_type
&missing
, ///< [in,out] missing to adjust, use
742 LogEntryHandler
*rollbacker
, ///< [in] optional rollbacker object
743 const DoutPrefixProvider
*dpp
///< [in] logging provider
745 ldpp_dout(dpp
, 20) << __func__
<< ": merging hoid " << hoid
746 << " entries: " << orig_entries
<< dendl
;
748 if (hoid
> info
.last_backfill
) {
749 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
<< " after last_backfill"
754 // entries is non-empty
755 assert(!orig_entries
.empty());
756 // strip out and ignore ERROR entries
757 mempool::osd_pglog::list
<pg_log_entry_t
> entries
;
759 for (list
<pg_log_entry_t
>::const_iterator i
= orig_entries
.begin();
760 i
!= orig_entries
.end();
762 // all entries are on hoid
763 assert(i
->soid
== hoid
);
764 if (i
!= orig_entries
.begin() && i
->prior_version
!= eversion_t()) {
765 // in increasing order of version
766 assert(i
->version
> last
);
767 // prior_version correct (unless it is an ERROR entry)
768 assert(i
->prior_version
== last
|| i
->is_error());
772 ldpp_dout(dpp
, 20) << __func__
<< ": ignoring " << *i
<< dendl
;
774 ldpp_dout(dpp
, 20) << __func__
<< ": keeping " << *i
<< dendl
;
775 entries
.push_back(*i
);
778 if (entries
.empty()) {
779 ldpp_dout(dpp
, 10) << __func__
<< ": no non-ERROR entries" << dendl
;
783 const eversion_t prior_version
= entries
.begin()->prior_version
;
784 const eversion_t first_divergent_update
= entries
.begin()->version
;
785 const eversion_t last_divergent_update
= entries
.rbegin()->version
;
786 const bool object_not_in_store
=
787 !missing
.is_missing(hoid
) &&
788 entries
.rbegin()->is_delete();
789 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
790 << " prior_version: " << prior_version
791 << " first_divergent_update: " << first_divergent_update
792 << " last_divergent_update: " << last_divergent_update
795 ceph::unordered_map
<hobject_t
, pg_log_entry_t
*>::const_iterator objiter
=
796 log
.objects
.find(hoid
);
797 if (objiter
!= log
.objects
.end() &&
798 objiter
->second
->version
>= first_divergent_update
) {
800 ldpp_dout(dpp
, 10) << __func__
<< ": more recent entry found: "
801 << *objiter
->second
<< ", already merged" << dendl
;
803 assert(objiter
->second
->version
> last_divergent_update
);
805 // ensure missing has been updated appropriately
806 if (objiter
->second
->is_update()) {
807 assert(missing
.is_missing(hoid
) &&
808 missing
.get_items().at(hoid
).need
== objiter
->second
->version
);
810 assert(!missing
.is_missing(hoid
));
812 missing
.revise_have(hoid
, eversion_t());
814 if (!object_not_in_store
) {
815 rollbacker
->remove(hoid
);
817 for (auto &&i
: entries
) {
824 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
825 <<" has no more recent entries in log" << dendl
;
826 if (prior_version
== eversion_t() || entries
.front().is_clone()) {
828 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
829 << " prior_version or op type indicates creation,"
832 if (missing
.is_missing(hoid
))
833 missing
.rm(missing
.get_items().find(hoid
));
835 if (!object_not_in_store
) {
836 rollbacker
->remove(hoid
);
838 for (auto &&i
: entries
) {
845 if (missing
.is_missing(hoid
)) {
847 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
848 << " missing, " << missing
.get_items().at(hoid
)
849 << " adjusting" << dendl
;
851 if (missing
.get_items().at(hoid
).have
== prior_version
) {
852 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
853 << " missing.have is prior_version " << prior_version
854 << " removing from missing" << dendl
;
855 missing
.rm(missing
.get_items().find(hoid
));
857 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
858 << " missing.have is " << missing
.get_items().at(hoid
).have
859 << ", adjusting" << dendl
;
860 missing
.revise_need(hoid
, prior_version
);
861 if (prior_version
<= info
.log_tail
) {
862 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
863 << " prior_version " << prior_version
864 << " <= info.log_tail "
865 << info
.log_tail
<< dendl
;
869 for (auto &&i
: entries
) {
876 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
877 << " must be rolled back or recovered,"
878 << " attempting to rollback"
880 bool can_rollback
= true;
881 /// Distinguish between 4) and 5)
882 for (list
<pg_log_entry_t
>::const_reverse_iterator i
= entries
.rbegin();
885 if (!i
->can_rollback() || i
->version
<= olog_can_rollback_to
) {
886 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
<< " cannot rollback "
888 can_rollback
= false;
895 for (list
<pg_log_entry_t
>::const_reverse_iterator i
= entries
.rbegin();
898 assert(i
->can_rollback() && i
->version
> olog_can_rollback_to
);
899 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
900 << " rolling back " << *i
<< dendl
;
902 rollbacker
->rollback(*i
);
904 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
905 << " rolled back" << dendl
;
909 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
<< " cannot roll back, "
910 << "removing and adding to missing" << dendl
;
912 if (!object_not_in_store
)
913 rollbacker
->remove(hoid
);
914 for (auto &&i
: entries
) {
918 missing
.add(hoid
, prior_version
, eversion_t());
919 if (prior_version
<= info
.log_tail
) {
920 ldpp_dout(dpp
, 10) << __func__
<< ": hoid " << hoid
921 << " prior_version " << prior_version
922 << " <= info.log_tail "
923 << info
.log_tail
<< dendl
;
928 /// Merge all entries using above
929 template <typename missing_type
>
930 static void _merge_divergent_entries(
931 const IndexedLog
&log
, ///< [in] log to merge against
932 mempool::osd_pglog::list
<pg_log_entry_t
> &entries
, ///< [in] entries to merge
933 const pg_info_t
&oinfo
, ///< [in] info for merging entries
934 eversion_t olog_can_rollback_to
, ///< [in] rollback boundary
935 missing_type
&omissing
, ///< [in,out] missing to adjust, use
936 LogEntryHandler
*rollbacker
, ///< [in] optional rollbacker object
937 const DoutPrefixProvider
*dpp
///< [in] logging provider
939 map
<hobject_t
, mempool::osd_pglog::list
<pg_log_entry_t
> > split
;
940 split_by_object(entries
, &split
);
941 for (map
<hobject_t
, mempool::osd_pglog::list
<pg_log_entry_t
>>::iterator i
= split
.begin();
944 _merge_object_divergent_entries(
949 olog_can_rollback_to
,
957 * Exists for use in TestPGLog for simply testing single divergent log
960 void merge_old_entry(
961 ObjectStore::Transaction
& t
,
962 const pg_log_entry_t
& oe
,
963 const pg_info_t
& info
,
964 LogEntryHandler
*rollbacker
) {
965 mempool::osd_pglog::list
<pg_log_entry_t
> entries
;
966 entries
.push_back(oe
);
967 _merge_object_divergent_entries(
972 log
.get_can_rollback_to(),
978 void rewind_divergent_log(eversion_t newhead
,
980 LogEntryHandler
*rollbacker
,
982 bool &dirty_big_info
);
984 void merge_log(pg_info_t
&oinfo
,
987 pg_info_t
&info
, LogEntryHandler
*rollbacker
,
988 bool &dirty_info
, bool &dirty_big_info
);
990 template <typename missing_type
>
991 static bool append_log_entries_update_missing(
992 const hobject_t
&last_backfill
,
993 bool last_backfill_bitwise
,
994 const mempool::osd_pglog::list
<pg_log_entry_t
> &entries
,
995 bool maintain_rollback
,
997 missing_type
&missing
,
998 LogEntryHandler
*rollbacker
,
999 const DoutPrefixProvider
*dpp
) {
1000 bool invalidate_stats
= false;
1001 if (log
&& !entries
.empty()) {
1002 assert(log
->head
< entries
.begin()->version
);
1004 for (list
<pg_log_entry_t
>::const_iterator p
= entries
.begin();
1007 invalidate_stats
= invalidate_stats
|| !p
->is_error();
1009 ldpp_dout(dpp
, 20) << "update missing, append " << *p
<< dendl
;
1012 if (p
->soid
<= last_backfill
&&
1014 missing
.add_next_event(*p
);
1016 // hack to match PG::mark_all_unfound_lost
1017 if (maintain_rollback
&& p
->is_lost_delete() && p
->can_rollback()) {
1018 rollbacker
->try_stash(p
->soid
, p
->version
.version
);
1019 } else if (p
->is_delete()) {
1020 rollbacker
->remove(p
->soid
);
1025 return invalidate_stats
;
1027 bool append_new_log_entries(
1028 const hobject_t
&last_backfill
,
1029 bool last_backfill_bitwise
,
1030 const mempool::osd_pglog::list
<pg_log_entry_t
> &entries
,
1031 LogEntryHandler
*rollbacker
) {
1032 bool invalidate_stats
= append_log_entries_update_missing(
1034 last_backfill_bitwise
,
1041 if (!entries
.empty()) {
1042 mark_writeout_from(entries
.begin()->version
);
1044 return invalidate_stats
;
1047 void write_log_and_missing(ObjectStore::Transaction
& t
,
1048 map
<string
,bufferlist
> *km
,
1050 const ghobject_t
&log_oid
,
1051 bool require_rollback
);
1053 static void write_log_and_missing_wo_missing(
1054 ObjectStore::Transaction
& t
,
1055 map
<string
,bufferlist
>* km
,
1058 const ghobject_t
&log_oid
, map
<eversion_t
, hobject_t
> &divergent_priors
,
1059 bool require_rollback
);
1061 static void write_log_and_missing(
1062 ObjectStore::Transaction
& t
,
1063 map
<string
,bufferlist
>* km
,
1066 const ghobject_t
&log_oid
,
1067 const pg_missing_tracker_t
&missing
,
1068 bool require_rollback
);
1070 static void _write_log_and_missing_wo_missing(
1071 ObjectStore::Transaction
& t
,
1072 map
<string
,bufferlist
>* km
,
1074 const coll_t
& coll
, const ghobject_t
&log_oid
,
1075 map
<eversion_t
, hobject_t
> &divergent_priors
,
1076 eversion_t dirty_to
,
1077 eversion_t dirty_from
,
1078 eversion_t writeout_from
,
1079 const set
<eversion_t
> &trimmed
,
1080 bool dirty_divergent_priors
,
1082 bool require_rollback
,
1083 set
<string
> *log_keys_debug
1086 static void _write_log_and_missing(
1087 ObjectStore::Transaction
& t
,
1088 map
<string
,bufferlist
>* km
,
1090 const coll_t
& coll
, const ghobject_t
&log_oid
,
1091 eversion_t dirty_to
,
1092 eversion_t dirty_from
,
1093 eversion_t writeout_from
,
1094 const set
<eversion_t
> &trimmed
,
1095 const pg_missing_tracker_t
&missing
,
1097 bool require_rollback
,
1098 bool clear_divergent_priors
,
1099 set
<string
> *log_keys_debug
1102 void read_log_and_missing(
1103 ObjectStore
*store
, coll_t pg_coll
,
1104 coll_t log_coll
, ghobject_t log_oid
,
1105 const pg_info_t
&info
,
1107 bool tolerate_divergent_missing_log
,
1108 bool debug_verify_stored_missing
= false
1110 return read_log_and_missing(
1111 store
, pg_coll
, log_coll
, log_oid
, info
,
1113 tolerate_divergent_missing_log
,
1114 &clear_divergent_priors
,
1116 (pg_log_debug
? &log_keys_debug
: 0),
1117 debug_verify_stored_missing
);
1120 template <typename missing_type
>
1121 static void read_log_and_missing(ObjectStore
*store
, coll_t pg_coll
,
1122 coll_t log_coll
, ghobject_t log_oid
,
1123 const pg_info_t
&info
,
1125 missing_type
&missing
, ostringstream
&oss
,
1126 bool tolerate_divergent_missing_log
,
1127 bool *clear_divergent_priors
= NULL
,
1128 const DoutPrefixProvider
*dpp
= NULL
,
1129 set
<string
> *log_keys_debug
= 0,
1130 bool debug_verify_stored_missing
= false
1132 ldpp_dout(dpp
, 20) << "read_log_and_missing coll " << pg_coll
1133 << " log_oid " << log_oid
<< dendl
;
1137 int r
= store
->stat(log_coll
, log_oid
, &st
);
1139 assert(st
.st_size
== 0);
1141 // will get overridden below if it had been recorded
1142 eversion_t on_disk_can_rollback_to
= info
.last_update
;
1143 eversion_t on_disk_rollback_info_trimmed_to
= eversion_t();
1144 ObjectMap::ObjectMapIterator p
= store
->get_omap_iterator(log_coll
, log_oid
);
1145 map
<eversion_t
, hobject_t
> divergent_priors
;
1146 bool has_divergent_priors
= false;
1147 list
<pg_log_entry_t
> entries
;
1149 for (p
->seek_to_first(); p
->valid() ; p
->next(false)) {
1150 // non-log pgmeta_oid keys are prefixed with _; skip those
1151 if (p
->key()[0] == '_')
1153 bufferlist bl
= p
->value();//Copy bufferlist before creating iterator
1154 bufferlist::iterator bp
= bl
.begin();
1155 if (p
->key() == "divergent_priors") {
1156 ::decode(divergent_priors
, bp
);
1157 ldpp_dout(dpp
, 20) << "read_log_and_missing " << divergent_priors
.size()
1158 << " divergent_priors" << dendl
;
1159 has_divergent_priors
= true;
1160 debug_verify_stored_missing
= false;
1161 } else if (p
->key() == "can_rollback_to") {
1162 ::decode(on_disk_can_rollback_to
, bp
);
1163 } else if (p
->key() == "rollback_info_trimmed_to") {
1164 ::decode(on_disk_rollback_info_trimmed_to
, bp
);
1165 } else if (p
->key().substr(0, 7) == string("missing")) {
1166 pair
<hobject_t
, pg_missing_item
> p
;
1168 missing
.add(p
.first
, p
.second
.need
, p
.second
.have
);
1171 e
.decode_with_checksum(bp
);
1172 ldpp_dout(dpp
, 20) << "read_log_and_missing " << e
<< dendl
;
1173 if (!entries
.empty()) {
1174 pg_log_entry_t
last_e(entries
.back());
1175 assert(last_e
.version
.version
< e
.version
.version
);
1176 assert(last_e
.version
.epoch
<= e
.version
.epoch
);
1178 entries
.push_back(e
);
1180 log_keys_debug
->insert(e
.get_key_name());
1187 on_disk_can_rollback_to
,
1188 on_disk_rollback_info_trimmed_to
,
1189 std::move(entries
));
1191 if (has_divergent_priors
|| debug_verify_stored_missing
) {
1193 if (debug_verify_stored_missing
|| info
.last_complete
< info
.last_update
) {
1194 ldpp_dout(dpp
, 10) << "read_log_and_missing checking for missing items over interval ("
1195 << info
.last_complete
1196 << "," << info
.last_update
<< "]" << dendl
;
1199 set
<hobject_t
> checked
;
1200 set
<hobject_t
> skipped
;
1201 for (list
<pg_log_entry_t
>::reverse_iterator i
= log
.log
.rbegin();
1202 i
!= log
.log
.rend();
1204 if (!debug_verify_stored_missing
&& i
->version
<= info
.last_complete
) break;
1205 if (i
->soid
> info
.last_backfill
)
1209 if (did
.count(i
->soid
)) continue;
1210 did
.insert(i
->soid
);
1212 if (i
->is_delete()) continue;
1215 int r
= store
->getattr(
1217 ghobject_t(i
->soid
, ghobject_t::NO_GEN
, info
.pgid
.shard
),
1221 object_info_t
oi(bv
);
1222 if (oi
.version
< i
->version
) {
1223 ldpp_dout(dpp
, 15) << "read_log_and_missing missing " << *i
1224 << " (have " << oi
.version
<< ")" << dendl
;
1225 if (debug_verify_stored_missing
) {
1226 auto miter
= missing
.get_items().find(i
->soid
);
1227 assert(miter
!= missing
.get_items().end());
1228 assert(miter
->second
.need
== i
->version
);
1229 assert(miter
->second
.have
== oi
.version
);
1230 checked
.insert(i
->soid
);
1232 missing
.add(i
->soid
, i
->version
, oi
.version
);
1236 ldpp_dout(dpp
, 15) << "read_log_and_missing missing " << *i
<< dendl
;
1237 if (debug_verify_stored_missing
) {
1238 auto miter
= missing
.get_items().find(i
->soid
);
1239 assert(miter
!= missing
.get_items().end());
1240 assert(miter
->second
.need
== i
->version
);
1241 assert(miter
->second
.have
== eversion_t());
1242 checked
.insert(i
->soid
);
1244 missing
.add(i
->soid
, i
->version
, eversion_t());
1248 if (debug_verify_stored_missing
) {
1249 for (auto &&i
: missing
.get_items()) {
1250 if (checked
.count(i
.first
))
1252 if (i
.second
.need
> log
.tail
||
1253 i
.first
> info
.last_backfill
) {
1254 ldpp_dout(dpp
, -1) << __func__
<< ": invalid missing set entry found "
1257 assert(0 == "invalid missing set entry found");
1260 int r
= store
->getattr(
1262 ghobject_t(i
.first
, ghobject_t::NO_GEN
, info
.pgid
.shard
),
1266 object_info_t
oi(bv
);
1267 assert(oi
.version
== i
.second
.have
);
1269 assert(eversion_t() == i
.second
.have
);
1273 assert(has_divergent_priors
);
1274 for (map
<eversion_t
, hobject_t
>::reverse_iterator i
=
1275 divergent_priors
.rbegin();
1276 i
!= divergent_priors
.rend();
1278 if (i
->first
<= info
.last_complete
) break;
1279 if (i
->second
> info
.last_backfill
)
1281 if (did
.count(i
->second
)) continue;
1282 did
.insert(i
->second
);
1284 int r
= store
->getattr(
1286 ghobject_t(i
->second
, ghobject_t::NO_GEN
, info
.pgid
.shard
),
1290 object_info_t
oi(bv
);
1292 * 1) we see this entry in the divergent priors mapping
1293 * 2) we didn't see an entry for this object in the log
1295 * From 1 & 2 we know that either the object does not exist
1296 * or it is at the version specified in the divergent_priors
1297 * map since the object would have been deleted atomically
1298 * with the addition of the divergent_priors entry, an older
1299 * version would not have been recovered, and a newer version
1300 * would show up in the log above.
1303 * Unfortunately the assessment above is incorrect because of
1304 * http://tracker.ceph.com/issues/17916 (we were incorrectly
1305 * not removing the divergent_priors set from disk state!),
1306 * so let's check that.
1308 if (oi
.version
> i
->first
&& tolerate_divergent_missing_log
) {
1309 ldpp_dout(dpp
, 0) << "read_log divergent_priors entry (" << *i
1310 << ") inconsistent with disk state (" << oi
1311 << "), assuming it is tracker.ceph.com/issues/17916"
1314 assert(oi
.version
== i
->first
);
1317 ldpp_dout(dpp
, 15) << "read_log_and_missing missing " << *i
<< dendl
;
1318 missing
.add(i
->second
, i
->first
, eversion_t());
1322 if (clear_divergent_priors
)
1323 (*clear_divergent_priors
) = true;
1327 if (!has_divergent_priors
) {
1328 if (clear_divergent_priors
)
1329 (*clear_divergent_priors
) = false;
1332 ldpp_dout(dpp
, 10) << "read_log_and_missing done" << dendl
;
1336 #endif // CEPH_PG_LOG_H