1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
9 * Author: Loic Dachary <loic@dachary.org>
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
19 #include "include/unordered_map.h"
20 #include "common/ceph_context.h"
22 #define dout_context cct
23 #define dout_subsys ceph_subsys_osd
25 #define dout_prefix _prefix(_dout, this)
// Build the per-line debug prefix used by the dout_prefix macro above by
// delegating to PGLog::gen_prefix() on the stream.
// NOTE(review): extraction lost the function's brace lines (upstream 28/30);
// this span is not compilable as-is.
27 static ostream
& _prefix(std::ostream
*_dout
, const PGLog
*pglog
)
29 return pglog
->gen_prefix(*_dout
);
32 //////////////////// PGLog::IndexedLog ////////////////////
// Split entries belonging to a child PG out of this log: delegate the actual
// partitioning to pg_log_t::split_out_child(), wrap the result in a fresh
// IndexedLog assigned to *target, then re-establish this log's
// rollback_info_trimmed_to reverse iterator (invalidated by the split).
// NOTE(review): extraction lost the child_pgid/split_bits parameter lines
// (upstream 35-36) and the surrounding braces; recover from upstream before
// editing.
34 void PGLog::IndexedLog::split_out_child(
37 PGLog::IndexedLog
*target
)
40 *target
= IndexedLog(pg_log_t::split_out_child(child_pgid
, split_bits
));
43 reset_rollback_info_trimmed_to_riter();
46 void PGLog::IndexedLog::trim(
49 set
<eversion_t
> *trimmed
,
50 set
<string
>* trimmed_dups
,
51 eversion_t
*write_from_dups
)
53 ceph_assert(s
<= can_rollback_to
);
54 if (complete_to
!= log
.end())
55 lgeneric_subdout(cct
, osd
, 20) << " complete_to " << complete_to
->version
<< dendl
;
57 auto earliest_dup_version
=
58 log
.rbegin()->version
.version
< cct
->_conf
->osd_pg_log_dups_tracked
60 : log
.rbegin()->version
.version
- cct
->_conf
->osd_pg_log_dups_tracked
+ 1;
62 lgeneric_subdout(cct
, osd
, 20) << "earliest_dup_version = " << earliest_dup_version
<< dendl
;
63 while (!log
.empty()) {
64 const pg_log_entry_t
&e
= *log
.begin();
67 lgeneric_subdout(cct
, osd
, 20) << "trim " << e
<< dendl
;
69 trimmed
->emplace(e
.version
);
71 unindex(e
); // remove from index,
74 if (e
.version
.version
>= earliest_dup_version
) {
75 if (write_from_dups
!= nullptr && *write_from_dups
> e
.version
) {
76 lgeneric_subdout(cct
, osd
, 20) << "updating write_from_dups from " << *write_from_dups
<< " to " << e
.version
<< dendl
;
77 *write_from_dups
= e
.version
;
79 dups
.push_back(pg_log_dup_t(e
));
82 for (const auto& extra
: e
.extra_reqids
) {
83 int return_code
= e
.return_code
;
84 if (return_code
>= 0) {
85 auto it
= e
.extra_reqid_return_codes
.find(idx
);
86 if (it
!= e
.extra_reqid_return_codes
.end()) {
87 return_code
= it
->second
;
92 // note: extras have the same version as outer op
93 dups
.push_back(pg_log_dup_t(e
.version
, extra
.second
,
94 extra
.first
, return_code
));
99 bool reset_complete_to
= false;
100 // we are trimming past complete_to, so reset complete_to
101 if (complete_to
!= log
.end() && e
.version
>= complete_to
->version
)
102 reset_complete_to
= true;
103 if (rollback_info_trimmed_to_riter
== log
.rend() ||
104 e
.version
== rollback_info_trimmed_to_riter
->version
) {
106 rollback_info_trimmed_to_riter
= log
.rend();
111 // reset complete_to to the beginning of the log
112 if (reset_complete_to
) {
113 complete_to
= log
.begin();
114 if (complete_to
!= log
.end()) {
115 lgeneric_subdout(cct
, osd
, 20) << " moving complete_to to "
116 << log
.begin()->version
<< dendl
;
118 lgeneric_subdout(cct
, osd
, 20) << " log is now empty" << dendl
;
123 while (!dups
.empty()) {
124 const auto& e
= *dups
.begin();
125 if (e
.version
.version
>= earliest_dup_version
)
127 lgeneric_subdout(cct
, osd
, 20) << "trim dup " << e
<< dendl
;
129 trimmed_dups
->insert(e
.get_key_name());
// Debug dump: stream the log summary, then every entry (annotating whether
// its object/reqid is present in the index) and every dup record to `out`.
// NOTE(review): loop bodies/continuation lines were lost in extraction.
139 ostream
& PGLog::IndexedLog::print(ostream
& out
) const
141 out
<< *this << std::endl
;
142 for (list
<pg_log_entry_t
>::const_iterator p
= log
.begin();
146 (logged_object(p
->soid
) ? "indexed" : "NOT INDEXED") <<
// Sanity check: any entry claiming an indexed reqid must be in the index.
148 ceph_assert(!p
->reqid_is_indexed() || logged_req(p
->reqid
));
151 for (list
<pg_log_dup_t
>::const_iterator p
= dups
.begin();
154 out
<< *p
<< std::endl
;
160 //////////////////// PGLog ////////////////////
// NOTE(review): only the signature of reset_backfill() survives here; its
// body (upstream lines 163-166) was lost in extraction.
162 void PGLog::reset_backfill()
// Reset PGLog state; the surviving text clears the debug key-tracking set
// (other member resets from upstream lines 168-169 were lost in extraction).
167 void PGLog::clear() {
170 log_keys_debug
.clear();
// Queue removal of this PG's pgmeta object (which stores the log/info omap)
// in the given ObjectStore transaction.
// NOTE(review): earlier parameter lines (upstream 175) were lost.
174 void PGLog::clear_info_log(
176 ObjectStore::Transaction
*t
) {
178 t
->remove(coll
, pgid
.make_pgmeta_oid());
// Tail of PGLog::trim()'s parameter list and its body; the function head
// (upstream lines 181-183, including the trim_to/info parameters) was lost
// in extraction. Trims the in-memory log to trim_to, recording trimmed keys
// for the next on-disk write, and updates info.log_tail to match.
184 bool transaction_applied
,
187 dout(10) << __func__
<< " proposed trim_to = " << trim_to
<< dendl
;
// Only do work when the request actually moves the tail forward.
189 if (trim_to
> log
.tail
) {
190 dout(10) << __func__
<< " missing = " << missing
.num_missing() << dendl
;
191 // Don't assert for async_recovery_targets or backfill_targets
192 // or whenever there are missing items
193 if (transaction_applied
&& !async
&& (missing
.num_missing() == 0))
194 ceph_assert(trim_to
<= info
.last_complete
);
196 dout(10) << "trim " << log
<< " to " << trim_to
<< dendl
;
// Delegate to IndexedLog::trim, accumulating trimmed keys/dups and the
// write_from_dups low-water mark for the next write_log_and_missing().
197 log
.trim(cct
, trim_to
, &trimmed
, &trimmed_dups
, &write_from_dups
);
198 info
.log_tail
= log
.tail
;
199 if (log
.complete_to
!= log
.log
.end())
200 dout(10) << " after trim complete_to " << log
.complete_to
->version
<< dendl
;
// Process a replica's log against the authoritative (local) log: rewind the
// replica's log past divergent entries to the last shared point, merge the
// divergent entries into the replica's missing set, and adjust the replica's
// last_update/last_complete accordingly.
// NOTE(review): several upstream lines (oinfo parameter, loop headers,
// braces, argument lines of _merge_divergent_entries) are missing from this
// span; it is documentation of the surviving text only.
204 void PGLog::proc_replica_log(
206 const pg_log_t
&olog
,
207 pg_missing_t
& omissing
,
208 pg_shard_t from
) const
210 dout(10) << "proc_replica_log for osd." << from
<< ": "
211 << oinfo
<< " " << olog
<< " " << omissing
<< dendl
;
// Fast-outs: disjoint logs or identical heads need no divergence scan.
213 if (olog
.head
< log
.tail
) {
214 dout(10) << __func__
<< ": osd." << from
<< " does not overlap, not looking "
215 << "for divergent objects" << dendl
;
218 if (olog
.head
== log
.head
) {
219 dout(10) << __func__
<< ": osd." << from
<< " same log head, not looking "
220 << "for divergent objects" << dendl
;
225 basically what we're doing here is rewinding the remote log,
226 dropping divergent entries, until we find something that matches
227 our master log. we then reset last_update to reflect the new
228 point up to which missing is accurate.
230 later, in activate(), missing will get wound forward again and
231 we will send the peer enough log to arrive at the same state.
// Debug dump of the replica's missing set before adjustment.
234 for (map
<hobject_t
, pg_missing_item
>::const_iterator i
= omissing
.get_items().begin();
235 i
!= omissing
.get_items().end();
237 dout(20) << " before missing " << i
->first
<< " need " << i
->second
.need
238 << " have " << i
->second
.have
<< dendl
;
// Walk our log backwards to the newest entry also present in the replica's
// log (the last shared point).
241 list
<pg_log_entry_t
>::const_reverse_iterator first_non_divergent
=
244 if (first_non_divergent
== log
.log
.rend())
246 if (first_non_divergent
->version
<= olog
.head
) {
247 dout(20) << "merge_log point (usually last shared) is "
248 << *first_non_divergent
<< dendl
;
251 ++first_non_divergent
;
254 /* Because olog.head >= log.tail, we know that both pgs must at least have
255 * the event represented by log.tail. Similarly, because log.head >= olog.tail,
256 * we know that the even represented by olog.tail must be common to both logs.
257 * Furthermore, the event represented by a log tail was necessarily trimmed,
258 * thus neither olog.tail nor log.tail can be divergent. It's
259 * possible that olog/log contain no actual events between olog.head and
260 * max(log.tail, olog.tail), however, since they might have been split out.
261 * Thus, if we cannot find an event e such that
262 * log.tail <= e.version <= log.head, the last_update must actually be
263 * max(log.tail, olog.tail).
265 eversion_t limit
= std::max(olog
.tail
, log
.tail
);
267 (first_non_divergent
== log
.log
.rend() ||
268 first_non_divergent
->version
< limit
) ?
270 first_non_divergent
->version
;
272 // we merge and adjust the replica's log, rollback the rollbackable divergent entry,
273 // remove the unrollbackable divergent entry and mark the according object as missing.
274 // the rollback boundary must choose crt of the olog which going to be merged.
275 // The replica log's(olog) crt will not be modified, so it could get passed
276 // to _merge_divergent_entries() directly.
277 IndexedLog
folog(olog
);
278 auto divergent
= folog
.rewind_from_head(lu
);
279 _merge_divergent_entries(
283 olog
.get_can_rollback_to(),
// Pull the replica's last_update back to the last shared point.
288 if (lu
< oinfo
.last_update
) {
289 dout(10) << " peer osd." << from
<< " last_update now " << lu
<< dendl
;
290 oinfo
.last_update
= lu
;
// Recompute the replica's last_complete: walk forward until the first
// missing object's need version; with nothing missing it equals last_update.
293 if (omissing
.have_missing()) {
294 eversion_t first_missing
=
295 omissing
.get_items().at(omissing
.get_rmissing().begin()->second
).need
;
296 oinfo
.last_complete
= eversion_t();
297 list
<pg_log_entry_t
>::const_iterator i
= olog
.log
.begin();
301 if (i
->version
< first_missing
)
302 oinfo
.last_complete
= i
->version
;
307 oinfo
.last_complete
= oinfo
.last_update
;
309 } // proc_replica_log
312 * rewind divergent entries at the head of the log
314 * This rewinds entries off the head of our log that are divergent.
315 * This is used by replicas during activation.
317 * @param newhead new head to rewind to
// Rewind this PG's log to `newhead`, handing the stripped divergent entries
// to _merge_divergent_entries() (which uses `rollbacker` to roll back or
// mark objects missing), capping info.last_complete/last_update at newhead,
// and flagging dirty state for the next persist.
// NOTE(review): some upstream lines (e.g. the _merge_divergent_entries
// argument list, closing braces) are missing from this span.
319 void PGLog::rewind_divergent_log(eversion_t newhead
,
320 pg_info_t
&info
, LogEntryHandler
*rollbacker
,
321 bool &dirty_info
, bool &dirty_big_info
)
323 dout(10) << "rewind_divergent_log truncate divergent future " <<
326 // We need to preserve the original crt before it gets updated in rewind_from_head().
327 // Later, in merge_object_divergent_entries(), we use it to check whether we can rollback
328 // a divergent entry or not.
329 eversion_t original_crt
= log
.get_can_rollback_to();
330 dout(20) << __func__
<< " original_crt = " << original_crt
<< dendl
;
// last_complete can never exceed the (lower) new head.
331 if (info
.last_complete
> newhead
)
332 info
.last_complete
= newhead
;
334 auto divergent
= log
.rewind_from_head(newhead
);
335 if (!divergent
.empty()) {
// Everything from the first divergent entry onward must be rewritten.
336 mark_dirty_from(divergent
.front().version
);
338 for (auto &&entry
: divergent
) {
339 dout(10) << "rewind_divergent_log future divergent " << entry
<< dendl
;
341 info
.last_update
= newhead
;
343 _merge_divergent_entries(
353 dirty_big_info
= true;
// Merge an authoritative log (olog, from osd `fromosd`) into our own:
// optionally extend our tail backwards, rewind our divergent head entries,
// splice in new entries beyond our head (updating the missing set), merge
// dup records, and update info/stat fields -- marking dirty flags so the
// result is persisted.
// NOTE(review): numerous upstream lines (loop headers, brace lines, the
// full argument lists of splice/append_log_entries_update_missing/
// _merge_divergent_entries) are missing from this span; this is
// documentation of the surviving text only.
356 void PGLog::merge_log(pg_info_t
&oinfo
, pg_log_t
&olog
, pg_shard_t fromosd
,
357 pg_info_t
&info
, LogEntryHandler
*rollbacker
,
358 bool &dirty_info
, bool &dirty_big_info
)
360 dout(10) << "merge_log " << olog
<< " from osd." << fromosd
361 << " into " << log
<< dendl
;
363 // Check preconditions
365 // If our log is empty, the incoming log needs to have not been trimmed.
366 ceph_assert(!log
.null() || olog
.tail
== eversion_t());
367 // The logs must overlap.
368 ceph_assert(log
.head
>= olog
.tail
&& olog
.head
>= log
.tail
);
// Debug dump of our current missing set.
370 for (map
<hobject_t
, pg_missing_item
>::const_iterator i
= missing
.get_items().begin();
371 i
!= missing
.get_items().end();
373 dout(20) << "pg_missing_t sobject: " << i
->first
<< dendl
;
376 bool changed
= false;
// Step 1: extend our tail backwards with the older entries from olog.
379 // this is just filling in history. it does not affect our
380 // missing set, as that should already be consistent with our
382 eversion_t orig_tail
= log
.tail
;
383 if (olog
.tail
< log
.tail
) {
384 dout(10) << "merge_log extending tail to " << olog
.tail
<< dendl
;
385 list
<pg_log_entry_t
>::iterator from
= olog
.log
.begin();
386 list
<pg_log_entry_t
>::iterator to
;
389 to
!= olog
.log
.end();
391 if (to
->version
> log
.tail
)
394 dout(15) << *to
<< dendl
;
399 // splice into our log.
400 log
.log
.splice(log
.log
.begin(),
403 info
.log_tail
= log
.tail
= olog
.tail
;
// Keep reported stats monotonic across the info swap.
407 if (oinfo
.stats
.reported_seq
< info
.stats
.reported_seq
|| // make sure reported always increases
408 oinfo
.stats
.reported_epoch
< info
.stats
.reported_epoch
) {
409 oinfo
.stats
.reported_seq
= info
.stats
.reported_seq
;
410 oinfo
.stats
.reported_epoch
= info
.stats
.reported_epoch
;
412 if (info
.last_backfill
.is_max())
413 info
.stats
= oinfo
.stats
;
414 info
.hit_set
= oinfo
.hit_set
;
// Step 2: if the authoritative head is behind ours, rewind our divergent
// head entries.
416 // do we have divergent entries to throw out?
417 if (olog
.head
< log
.head
) {
418 rewind_divergent_log(olog
.head
, info
, rollbacker
, dirty_info
, dirty_big_info
);
// Step 3: extend our head with the newer entries from olog.
423 if (olog
.head
> log
.head
) {
424 dout(10) << "merge_log extending head to " << olog
.head
<< dendl
;
426 // find start point in olog
427 list
<pg_log_entry_t
>::iterator to
= olog
.log
.end();
428 list
<pg_log_entry_t
>::iterator from
= olog
.log
.end();
429 eversion_t lower_bound
= std::max(olog
.tail
, orig_tail
);
431 if (from
== olog
.log
.begin())
434 dout(20) << " ? " << *from
<< dendl
;
435 if (from
->version
<= log
.head
) {
436 lower_bound
= std::max(lower_bound
, from
->version
);
441 dout(20) << "merge_log cut point (usually last shared) is "
442 << lower_bound
<< dendl
;
443 mark_dirty_from(lower_bound
);
445 // We need to preserve the original crt before it gets updated in rewind_from_head().
446 // Later, in merge_object_divergent_entries(), we use it to check whether we can rollback
447 // a divergent entry or not.
448 eversion_t original_crt
= log
.get_can_rollback_to();
449 dout(20) << __func__
<< " original_crt = " << original_crt
<< dendl
;
450 auto divergent
= log
.rewind_from_head(lower_bound
);
451 // move aside divergent items
452 for (auto &&oe
: divergent
) {
453 dout(10) << "merge_log divergent " << oe
<< dendl
;
455 log
.roll_forward_to(log
.head
, rollbacker
);
// Splice the shared-onward slice of olog into our log and fold its effects
// into the missing set.
457 mempool::osd_pglog::list
<pg_log_entry_t
> new_entries
;
458 new_entries
.splice(new_entries
.end(), olog
.log
, from
, to
);
459 append_log_entries_update_missing(
461 info
.last_backfill_bitwise
,
469 _merge_divergent_entries(
478 info
.last_update
= log
.head
= olog
.head
;
480 // We cannot rollback into the new log entries
481 log
.skip_can_rollback_to_to_head();
483 info
.last_user_version
= oinfo
.last_user_version
;
484 info
.purged_snaps
= oinfo
.purged_snaps
;
485 // update num_missing too
486 // we might have appended some more missing objects above
487 info
.stats
.stats
.sum
.num_objects_missing
= missing
.num_missing();
// Step 4: merge dup records from olog.
493 if (merge_log_dups(olog
)) {
497 dout(10) << "merge_log result " << log
<< " " << missing
<<
498 " changed=" << changed
<< dendl
;
502 dirty_big_info
= true;
507 // returns true if any changes were made to log.dups
// Merge the dup-entry list from olog into ours: copy wholesale when ours is
// empty, otherwise extend at the newer (tail) and older (head) ends without
// duplicating the shared middle, marking the affected dup key ranges dirty;
// finally drop any dups whose version overlaps the live log (> log.tail).
// NOTE(review): several upstream lines (index() calls on inserted copies,
// closing braces, the computation of `last`) are missing from this span.
508 bool PGLog::merge_log_dups(const pg_log_t
& olog
) {
509 bool changed
= false;
511 if (!olog
.dups
.empty()) {
512 if (log
.dups
.empty()) {
513 dout(10) << "merge_log copying olog dups to log " <<
514 olog
.dups
.front().version
<< " to " <<
515 olog
.dups
.back().version
<< dendl
;
517 dirty_from_dups
= eversion_t();
518 dirty_to_dups
= eversion_t::max();
519 // since our log.dups is empty just copy them
520 for (const auto& i
: olog
.dups
) {
521 log
.dups
.push_back(i
);
522 log
.index(log
.dups
.back());
525 // since our log.dups is not empty try to extend on each end
527 if (olog
.dups
.back().version
> log
.dups
.back().version
) {
528 // extend the dups's tail (i.e., newer dups)
529 dout(10) << "merge_log extending dups tail to " <<
530 olog
.dups
.back().version
<< dendl
;
533 auto log_tail_version
= log
.dups
.back().version
;
535 auto insert_cursor
= log
.dups
.end();
536 eversion_t last_shared
= eversion_t::max();
// Walk olog's dups newest-first, inserting before the cursor so the final
// order stays version-ascending; stop once versions are already ours.
537 for (auto i
= olog
.dups
.crbegin(); i
!= olog
.dups
.crend(); ++i
) {
538 if (i
->version
<= log_tail_version
) break;
539 log
.dups
.insert(insert_cursor
, *i
);
540 last_shared
= i
->version
;
542 auto prev
= insert_cursor
;
544 // be sure to pass reference of copy in log.dups
547 --insert_cursor
; // make sure we insert in reverse order
549 mark_dirty_from_dups(last_shared
);
552 if (olog
.dups
.front().version
< log
.dups
.front().version
) {
553 // extend the dups's head (i.e., older dups)
554 dout(10) << "merge_log extending dups head to " <<
555 olog
.dups
.front().version
<< dendl
;
559 auto insert_cursor
= log
.dups
.begin();
// Walk olog's dups oldest-first, inserting before our current front until
// versions reach ours.
560 for (auto i
= olog
.dups
.cbegin(); i
!= olog
.dups
.cend(); ++i
) {
561 if (i
->version
>= insert_cursor
->version
) break;
562 log
.dups
.insert(insert_cursor
, *i
);
564 auto prev
= insert_cursor
;
566 // be sure to pass address of copy in log.dups
569 mark_dirty_to_dups(last
);
574 // remove any dup entries that overlap with pglog
575 if (!log
.dups
.empty() && log
.dups
.back().version
> log
.tail
) {
576 dout(10) << "merge_log removed dups overlapping log entries (" <<
577 log
.tail
<< "," << log
.dups
.back().version
<< "]" << dendl
;
580 while (!log
.dups
.empty() && log
.dups
.back().version
> log
.tail
) {
581 log
.unindex(log
.dups
.back());
582 mark_dirty_from_dups(log
.dups
.back().version
);
// Debug self-check (active only in pg_log_debug builds -- the guard line was
// lost in extraction): verify that log_keys_debug mirrors the in-memory log,
// dumping both and asserting on mismatch, then checking every entry's key.
590 void PGLog::check() {
593 if (log
.log
.size() != log_keys_debug
.size()) {
594 derr
<< "log.log.size() != log_keys_debug.size()" << dendl
;
595 derr
<< "actual log:" << dendl
;
596 for (list
<pg_log_entry_t
>::iterator i
= log
.log
.begin();
599 derr
<< " " << *i
<< dendl
;
601 derr
<< "log_keys_debug:" << dendl
;
602 for (set
<string
>::const_iterator i
= log_keys_debug
.begin();
603 i
!= log_keys_debug
.end();
605 derr
<< " " << *i
<< dendl
;
608 ceph_assert(log
.log
.size() == log_keys_debug
.size());
// Every log entry's omap key must be tracked.
609 for (list
<pg_log_entry_t
>::iterator i
= log
.log
.begin();
612 ceph_assert(log_keys_debug
.count(i
->get_key_name()));
// Persist this PGLog's accumulated dirty state (dirty ranges, trimmed keys,
// dup ranges, divergent_priors flag) by delegating to the static
// _write_log_and_missing(), which fills *km with omap writes and adds
// removals to the transaction. No-op path logs "log is not dirty".
// NOTE(review): the is_dirty() guard and several forwarded-argument lines
// were lost in extraction.
617 void PGLog::write_log_and_missing(
618 ObjectStore::Transaction
& t
,
619 map
<string
,bufferlist
> *km
,
621 const ghobject_t
&log_oid
,
622 bool require_rollback
)
625 dout(6) << "write_log_and_missing with: "
626 << "dirty_to: " << dirty_to
627 << ", dirty_from: " << dirty_from
628 << ", writeout_from: " << writeout_from
629 << ", trimmed: " << trimmed
630 << ", trimmed_dups: " << trimmed_dups
631 << ", clear_divergent_priors: " << clear_divergent_priors
633 _write_log_and_missing(
634 t
, km
, log
, coll
, log_oid
,
639 std::move(trimmed_dups
),
643 clear_divergent_priors
,
647 &rebuilt_missing_with_deletes
,
648 (pg_log_debug
? &log_keys_debug
: nullptr));
651 dout(10) << "log is not dirty" << dendl
;
// Write a full log (no missing set) plus divergent_priors: forwards to
// _write_log_and_missing_wo_missing() with full-range sentinels
// (eversion_t::max()/eversion_t()) so every entry and dup is (re)written.
656 void PGLog::write_log_and_missing_wo_missing(
657 ObjectStore::Transaction
& t
,
658 map
<string
,bufferlist
> *km
,
660 const coll_t
& coll
, const ghobject_t
&log_oid
,
661 map
<eversion_t
, hobject_t
> &divergent_priors
,
662 bool require_rollback
665 _write_log_and_missing_wo_missing(
666 t
, km
, log
, coll
, log_oid
,
667 divergent_priors
, eversion_t::max(), eversion_t(), eversion_t(),
668 true, true, require_rollback
,
669 eversion_t::max(), eversion_t(), eversion_t(), nullptr);
// Write a full log together with the given missing set: forwards to
// _write_log_and_missing() with full-range sentinels (lost lines 685-694
// carried them) so the entire log, dups, and missing set are rewritten.
673 void PGLog::write_log_and_missing(
674 ObjectStore::Transaction
& t
,
675 map
<string
,bufferlist
> *km
,
678 const ghobject_t
&log_oid
,
679 const pg_missing_tracker_t
&missing
,
680 bool require_rollback
,
681 bool *rebuilt_missing_with_deletes
)
683 _write_log_and_missing(
684 t
, km
, log
, coll
, log_oid
,
691 true, require_rollback
, false,
695 rebuilt_missing_with_deletes
, nullptr);
// Serialize log entries, dup records, divergent_priors, and rollback markers
// into *km (omap key -> encoded value) and queue key-range removals on the
// transaction, covering the [dirty_to, dirty_from/writeout_from] windows.
// Variant without a missing set.
// NOTE(review): the IndexedLog parameter, omap_rmkeyrange calls, loop
// increments, and several braces were lost in extraction; documentation of
// the surviving text only.
699 void PGLog::_write_log_and_missing_wo_missing(
700 ObjectStore::Transaction
& t
,
701 map
<string
,bufferlist
> *km
,
703 const coll_t
& coll
, const ghobject_t
&log_oid
,
704 map
<eversion_t
, hobject_t
> &divergent_priors
,
706 eversion_t dirty_from
,
707 eversion_t writeout_from
,
708 bool dirty_divergent_priors
,
710 bool require_rollback
,
711 eversion_t dirty_to_dups
,
712 eversion_t dirty_from_dups
,
713 eversion_t write_from_dups
,
714 set
<string
> *log_keys_debug
717 // dout(10) << "write_log_and_missing, clearing up to " << dirty_to << dendl;
// Ensure the log object exists, then clear the stale key ranges on disk
// (range-remove calls lost in extraction) and in log_keys_debug.
719 t
.touch(coll
, log_oid
);
720 if (dirty_to
!= eversion_t()) {
723 eversion_t().get_key_name(), dirty_to
.get_key_name());
724 clear_up_to(log_keys_debug
, dirty_to
.get_key_name());
726 if (dirty_to
!= eversion_t::max() && dirty_from
!= eversion_t::max()) {
727 // dout(10) << "write_log_and_missing, clearing from " << dirty_from << dendl;
730 dirty_from
.get_key_name(), eversion_t::max().get_key_name());
731 clear_after(log_keys_debug
, dirty_from
.get_key_name());
// Re-encode entries up to dirty_to (forward pass)...
734 for (list
<pg_log_entry_t
>::iterator p
= log
.log
.begin();
735 p
!= log
.log
.end() && p
->version
<= dirty_to
;
737 bufferlist
bl(sizeof(*p
) * 2);
738 p
->encode_with_checksum(bl
);
739 (*km
)[p
->get_key_name()].claim(bl
);
// ...and entries from dirty_from/writeout_from to the head (reverse pass).
742 for (list
<pg_log_entry_t
>::reverse_iterator p
= log
.log
.rbegin();
743 p
!= log
.log
.rend() &&
744 (p
->version
>= dirty_from
|| p
->version
>= writeout_from
) &&
745 p
->version
>= dirty_to
;
747 bufferlist
bl(sizeof(*p
) * 2);
748 p
->encode_with_checksum(bl
);
749 (*km
)[p
->get_key_name()].claim(bl
);
// Track the newly written entry keys (skipping '_'-prefixed metadata keys).
752 if (log_keys_debug
) {
753 for (map
<string
, bufferlist
>::iterator i
= (*km
).begin();
756 if (i
->first
[0] == '_')
758 ceph_assert(!log_keys_debug
->count(i
->first
));
759 log_keys_debug
->insert(i
->first
);
763 // process dups after log_keys_debug is filled, so dups do not
764 // end up in that set
765 if (dirty_to_dups
!= eversion_t()) {
766 pg_log_dup_t min
, dirty_to_dup
;
767 dirty_to_dup
.version
= dirty_to_dups
;
770 min
.get_key_name(), dirty_to_dup
.get_key_name());
772 if (dirty_to_dups
!= eversion_t::max() && dirty_from_dups
!= eversion_t::max()) {
773 pg_log_dup_t max
, dirty_from_dup
;
774 max
.version
= eversion_t::max();
775 dirty_from_dup
.version
= dirty_from_dups
;
778 dirty_from_dup
.get_key_name(), max
.get_key_name());
// Re-encode dup records over the same dirty windows (forward then reverse).
781 for (const auto& entry
: log
.dups
) {
782 if (entry
.version
> dirty_to_dups
)
786 (*km
)[entry
.get_key_name()].claim(bl
);
789 for (list
<pg_log_dup_t
>::reverse_iterator p
= log
.dups
.rbegin();
790 p
!= log
.dups
.rend() &&
791 (p
->version
>= dirty_from_dups
|| p
->version
>= write_from_dups
) &&
792 p
->version
>= dirty_to_dups
;
796 (*km
)[p
->get_key_name()].claim(bl
);
// Persist divergent_priors and, when rollback is required, the rollback
// boundary markers.
799 if (dirty_divergent_priors
) {
800 //dout(10) << "write_log_and_missing: writing divergent_priors" << dendl;
801 encode(divergent_priors
, (*km
)["divergent_priors"]);
803 if (require_rollback
) {
805 log
.get_can_rollback_to(),
806 (*km
)["can_rollback_to"]);
808 log
.get_rollback_info_trimmed_to(),
809 (*km
)["rollback_info_trimmed_to"]);
// Serialize log entries, dup records, per-object missing items, and rollback
// markers into *km, and collect keys to delete (trimmed entries, trimmed
// dups, optionally "divergent_priors", stale missing/ keys) into to_remove,
// issuing a single omap_rmkeys at the end. Variant that also persists the
// missing set item-by-item.
// NOTE(review): the IndexedLog/dirty_to parameters, omap_rmkeyrange calls,
// the missing.get_changed() call that drives the missing/ lambda, loop
// increments, and several braces were lost in extraction.
814 void PGLog::_write_log_and_missing(
815 ObjectStore::Transaction
& t
,
816 map
<string
,bufferlist
>* km
,
818 const coll_t
& coll
, const ghobject_t
&log_oid
,
820 eversion_t dirty_from
,
821 eversion_t writeout_from
,
822 set
<eversion_t
> &&trimmed
,
823 set
<string
> &&trimmed_dups
,
824 const pg_missing_tracker_t
&missing
,
826 bool require_rollback
,
827 bool clear_divergent_priors
,
828 eversion_t dirty_to_dups
,
829 eversion_t dirty_from_dups
,
830 eversion_t write_from_dups
,
831 bool *rebuilt_missing_with_deletes
, // in/out param
832 set
<string
> *log_keys_debug
// Start the removal set from the trimmed dup keys, then add each trimmed
// entry's key (dropping it from log_keys_debug as we go).
834 set
<string
> to_remove
;
835 to_remove
.swap(trimmed_dups
);
836 for (auto& t
: trimmed
) {
837 string key
= t
.get_key_name();
838 if (log_keys_debug
) {
839 auto it
= log_keys_debug
->find(key
);
840 ceph_assert(it
!= log_keys_debug
->end());
841 log_keys_debug
->erase(it
);
843 to_remove
.emplace(std::move(key
));
// Ensure the log object exists and clear stale key ranges (range-remove
// calls lost in extraction).
848 t
.touch(coll
, log_oid
);
849 if (dirty_to
!= eversion_t()) {
852 eversion_t().get_key_name(), dirty_to
.get_key_name());
853 clear_up_to(log_keys_debug
, dirty_to
.get_key_name());
855 if (dirty_to
!= eversion_t::max() && dirty_from
!= eversion_t::max()) {
856 // dout(10) << "write_log_and_missing, clearing from " << dirty_from << dendl;
859 dirty_from
.get_key_name(), eversion_t::max().get_key_name());
860 clear_after(log_keys_debug
, dirty_from
.get_key_name());
// Re-encode entries up to dirty_to (forward pass)...
863 for (list
<pg_log_entry_t
>::iterator p
= log
.log
.begin();
864 p
!= log
.log
.end() && p
->version
<= dirty_to
;
866 bufferlist
bl(sizeof(*p
) * 2);
867 p
->encode_with_checksum(bl
);
868 (*km
)[p
->get_key_name()].claim(bl
);
// ...and entries from dirty_from/writeout_from to the head (reverse pass).
871 for (list
<pg_log_entry_t
>::reverse_iterator p
= log
.log
.rbegin();
872 p
!= log
.log
.rend() &&
873 (p
->version
>= dirty_from
|| p
->version
>= writeout_from
) &&
874 p
->version
>= dirty_to
;
876 bufferlist
bl(sizeof(*p
) * 2);
877 p
->encode_with_checksum(bl
);
878 (*km
)[p
->get_key_name()].claim(bl
);
// Track the newly written entry keys (skipping '_'-prefixed metadata keys).
881 if (log_keys_debug
) {
882 for (map
<string
, bufferlist
>::iterator i
= (*km
).begin();
885 if (i
->first
[0] == '_')
887 ceph_assert(!log_keys_debug
->count(i
->first
));
888 log_keys_debug
->insert(i
->first
);
892 // process dups after log_keys_debug is filled, so dups do not
893 // end up in that set
894 if (dirty_to_dups
!= eversion_t()) {
895 pg_log_dup_t min
, dirty_to_dup
;
896 dirty_to_dup
.version
= dirty_to_dups
;
899 min
.get_key_name(), dirty_to_dup
.get_key_name());
901 if (dirty_to_dups
!= eversion_t::max() && dirty_from_dups
!= eversion_t::max()) {
902 pg_log_dup_t max
, dirty_from_dup
;
903 max
.version
= eversion_t::max();
904 dirty_from_dup
.version
= dirty_from_dups
;
907 dirty_from_dup
.get_key_name(), max
.get_key_name());
// Re-encode dup records over the same dirty windows (forward then reverse).
910 for (const auto& entry
: log
.dups
) {
911 if (entry
.version
> dirty_to_dups
)
915 (*km
)[entry
.get_key_name()].claim(bl
);
918 for (list
<pg_log_dup_t
>::reverse_iterator p
= log
.dups
.rbegin();
919 p
!= log
.dups
.rend() &&
920 (p
->version
>= dirty_from_dups
|| p
->version
>= write_from_dups
) &&
921 p
->version
>= dirty_to_dups
;
925 (*km
)[p
->get_key_name()].claim(bl
);
// Divergent priors are stored as one key here; clearing means removing it.
928 if (clear_divergent_priors
) {
929 //dout(10) << "write_log_and_missing: writing divergent_priors" << dendl;
930 to_remove
.insert("divergent_priors");
932 // since we encode individual missing items instead of a whole
933 // missing set, we need another key to store this bit of state
934 if (*rebuilt_missing_with_deletes
) {
935 (*km
)["may_include_deletes_in_missing"] = bufferlist();
936 *rebuilt_missing_with_deletes
= false;
// Per-object missing updates: write "missing/<obj>" keys for objects still
// missing, remove keys for objects no longer missing (lambda applied to the
// changed set -- the driving call was lost in extraction).
939 [&](const hobject_t
&obj
) {
940 string key
= string("missing/") + obj
.to_str();
941 pg_missing_item item
;
942 if (!missing
.is_missing(obj
, &item
)) {
943 to_remove
.insert(key
);
945 uint64_t features
= missing
.may_include_deletes
? CEPH_FEATURE_OSD_RECOVERY_DELETES
: 0;
946 encode(make_pair(obj
, item
), (*km
)[key
], features
);
949 if (require_rollback
) {
951 log
.get_can_rollback_to(),
952 (*km
)["can_rollback_to"]);
954 log
.get_rollback_info_trimmed_to(),
955 (*km
)["rollback_info_trimmed_to"]);
// Single batched key removal for everything collected above.
958 if (!to_remove
.empty())
959 t
.omap_rmkeys(coll
, log_oid
, to_remove
);
// Rebuild the missing set so it may include delete entries: preserve
// missing items not explained by the log, enable may_include_deletes, then
// replay the log newest-first comparing each object's on-disk object_info_t
// version (via store->getattr) against the log version, re-adding items that
// are absent or stale on disk; finally restore the preserved extras and set
// rebuilt_missing_with_deletes so the flag key is persisted on next write.
// NOTE(review): the ObjectStore* parameter, missing.clear(), the `did` set
// declaration, getattr's attr-name/bufferlist arguments, and several braces
// were lost in extraction.
962 void PGLog::rebuild_missing_set_with_deletes(
964 ObjectStore::CollectionHandle
& ch
,
965 const pg_info_t
&info
)
967 // save entries not generated from the current log (e.g. added due
968 // to repair, EIO handling, or divergent_priors).
969 map
<hobject_t
, pg_missing_item
> extra_missing
;
970 for (const auto& p
: missing
.get_items()) {
971 if (!log
.logged_object(p
.first
)) {
972 dout(20) << __func__
<< " extra missing entry: " << p
.first
973 << " " << p
.second
<< dendl
;
974 extra_missing
[p
.first
] = p
.second
;
978 missing
.may_include_deletes
= true;
980 // go through the log and add items that are not present or older
981 // versions on disk, just as if we were reading the log + metadata
982 // off disk originally
984 for (list
<pg_log_entry_t
>::reverse_iterator i
= log
.log
.rbegin();
// Stop at last_complete; skip objects beyond backfill or already handled
// (newest entry wins since we iterate newest-first via `did`).
987 if (i
->version
<= info
.last_complete
)
989 if (i
->soid
> info
.last_backfill
||
991 did
.find(i
->soid
) != did
.end())
// Compare the on-disk object version with the log's version for this
// object.
996 int r
= store
->getattr(
998 ghobject_t(i
->soid
, ghobject_t::NO_GEN
, info
.pgid
.shard
),
1001 dout(20) << __func__
<< " check for log entry: " << *i
<< " = " << r
<< dendl
;
1004 object_info_t
oi(bv
);
1005 dout(20) << __func__
<< " store version = " << oi
.version
<< dendl
;
1006 if (oi
.version
< i
->version
) {
1007 missing
.add(i
->soid
, i
->version
, oi
.version
, i
->is_delete());
1010 missing
.add(i
->soid
, i
->version
, eversion_t(), i
->is_delete());
// Restore the preserved extra items verbatim.
1014 for (const auto& p
: extra_missing
) {
1015 missing
.add(p
.first
, p
.second
.need
, p
.second
.have
, p
.second
.is_delete());
1017 rebuilt_missing_with_deletes
= true;