1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */
19 #include "include/unordered_map.h"
20 #include "common/ceph_context.h"
22 #define dout_context cct
23 #define dout_subsys ceph_subsys_osd
25 #define dout_prefix _prefix(_dout, this)
27 static ostream
& _prefix(std::ostream
*_dout
, const PGLog
*pglog
)
29 return *_dout
<< pglog
->gen_prefix();
32 //////////////////// PGLog::IndexedLog ////////////////////
// PGLog::IndexedLog::split_out_child — move the entries belonging to a
// child pg out of this log into *target, then rebuild the rollback-trim
// iterator (reset_rollback_info_trimmed_to_riter) which the underlying
// pg_log_t::split_out_child invalidated.
// NOTE(review): garbled extraction — the parameter lines declaring
// `child_pgid` and `split_bits` (both used below) and the surrounding
// braces are missing, and original line numbers are fused into the
// text. Recover the exact signature/body from upstream PGLog.cc before
// compiling.
34 void PGLog::IndexedLog::split_out_child(
37 PGLog::IndexedLog
*target
)
40 *target
= pg_log_t::split_out_child(child_pgid
, split_bits
);
43 reset_rollback_info_trimmed_to_riter();
// PGLog::IndexedLog::trim — remove entries with version <= s from the
// front of the log, recording each trimmed version into *trimmed,
// un-indexing each entry, and keeping rollback_info_trimmed_to_riter
// consistent. Complains (generic_dout(0)) if asked to trim past
// complete_to, and asserts s <= can_rollback_to (cannot trim what we
// may still need to roll back).
// NOTE(review): garbled extraction — physical lines are split, original
// line numbers are fused into the text, and several source lines
// (the cct/s parameters, the loop-exit test, closing braces) are
// missing. Not compilable as-is; recover from upstream PGLog.cc.
46 void PGLog::IndexedLog::trim(
49 set
<eversion_t
> *trimmed
)
51 if (complete_to
!= log
.end() &&
52 complete_to
->version
<= s
) {
53 generic_dout(0) << " bad trim to " << s
<< " when complete_to is "
54 << complete_to
->version
55 << " on " << *this << dendl
;
58 assert(s
<= can_rollback_to
);
60 while (!log
.empty()) {
61 pg_log_entry_t
&e
= *log
.begin();
64 generic_dout(20) << "trim " << e
<< dendl
;
66 trimmed
->insert(e
.version
);
68 unindex(e
); // remove from index,
70 if (rollback_info_trimmed_to_riter
== log
.rend() ||
71 e
.version
== rollback_info_trimmed_to_riter
->version
) {
73 rollback_info_trimmed_to_riter
= log
.rend();
84 ostream
& PGLog::IndexedLog::print(ostream
& out
) const
86 out
<< *this << std::endl
;
87 for (list
<pg_log_entry_t
>::const_iterator p
= log
.begin();
90 out
<< *p
<< " " << (logged_object(p
->soid
) ? "indexed":"NOT INDEXED") << std::endl
;
91 assert(!p
->reqid_is_indexed() || logged_req(p
->reqid
));
96 //////////////////// PGLog ////////////////////
// PGLog::reset_backfill — reset backfill-related log state.
// NOTE(review): only the signature survived extraction here; the body
// is absent. Recover it from upstream PGLog.cc.
98 void PGLog::reset_backfill()
// PGLog::clear — drop all in-memory log state; the visible remnant
// clears the debug key set.
// NOTE(review): garbled extraction — interior lines (clearing of the
// other members) and the closing brace are missing.
103 void PGLog::clear() {
106 log_keys_debug
.clear();
// PGLog::clear_info_log — queue removal of the pg's pgmeta object
// (which holds the persisted info/log omap) in transaction *t.
// NOTE(review): garbled extraction — the parameter lines declaring
// `pgid`/`coll` (both used below) and the closing brace are missing.
110 void PGLog::clear_info_log(
112 ObjectStore::Transaction
*t
) {
114 t
->remove(coll
, pgid
.make_pgmeta_oid());
// Interior of PGLog::trim (the signature line was lost in extraction):
// if trim_to is past the current tail, trim the in-memory log up to
// trim_to (collecting trimmed versions into `trimmed`) and advance
// info.log_tail to the new tail. Asserts we never trim past
// info.last_complete.
122 if (trim_to
> log
.tail
) {
123 // We shouldn't be trimming the log past last_complete
124 assert(trim_to
<= info
.last_complete
);
126 dout(10) << "trim " << log
<< " to " << trim_to
<< dendl
;
127 log
.trim(cct
, trim_to
, &trimmed
);
128 info
.log_tail
= log
.tail
;
// PGLog::proc_replica_log — examine a peer's log/missing (osd `from`)
// against the authoritative local log: rewind the remote log past
// divergent entries to the last shared point, merge the divergent
// entries into the peer's missing set, lower the peer's last_update to
// the rewind point, and recompute its last_complete from the first
// missing object. Early-outs when the logs do not overlap or the heads
// already match.
// NOTE(review): garbled extraction — physical lines are split, original
// line numbers are fused into the text, and many source lines (early
// `return`s, loop increments, argument lists of
// _merge_divergent_entries, braces) are missing. Not compilable as-is;
// recover from upstream PGLog.cc.
132 void PGLog::proc_replica_log(
134 const pg_log_t
&olog
,
135 pg_missing_t
& omissing
,
136 pg_shard_t from
) const
138 dout(10) << "proc_replica_log for osd." << from
<< ": "
139 << oinfo
<< " " << olog
<< " " << omissing
<< dendl
;
141 if (olog
.head
< log
.tail
) {
142 dout(10) << __func__
<< ": osd." << from
<< " does not overlap, not looking "
143 << "for divergent objects" << dendl
;
146 if (olog
.head
== log
.head
) {
147 dout(10) << __func__
<< ": osd." << from
<< " same log head, not looking "
148 << "for divergent objects" << dendl
;
151 assert(olog
.head
>= log
.tail
);
154 basically what we're doing here is rewinding the remote log,
155 dropping divergent entries, until we find something that matches
156 our master log. we then reset last_update to reflect the new
157 point up to which missing is accurate.
159 later, in activate(), missing will get wound forward again and
160 we will send the peer enough log to arrive at the same state.
163 for (map
<hobject_t
, pg_missing_item
>::const_iterator i
= omissing
.get_items().begin();
164 i
!= omissing
.get_items().end();
166 dout(20) << " before missing " << i
->first
<< " need " << i
->second
.need
167 << " have " << i
->second
.have
<< dendl
;
170 list
<pg_log_entry_t
>::const_reverse_iterator first_non_divergent
=
173 if (first_non_divergent
== log
.log
.rend())
175 if (first_non_divergent
->version
<= olog
.head
) {
176 dout(20) << "merge_log point (usually last shared) is "
177 << *first_non_divergent
<< dendl
;
180 ++first_non_divergent
;
183 /* Because olog.head >= log.tail, we know that both pgs must at least have
184 * the event represented by log.tail. Similarly, because log.head >= olog.tail,
185 * we know that the even represented by olog.tail must be common to both logs.
186 * Furthermore, the event represented by a log tail was necessarily trimmed,
187 * thus neither olog.tail nor log.tail can be divergent. It's
188 * possible that olog/log contain no actual events between olog.head and
189 * MAX(log.tail, olog.tail), however, since they might have been split out.
190 * Thus, if we cannot find an event e such that
191 * log.tail <= e.version <= log.head, the last_update must actually be
192 * MAX(log.tail, olog.tail).
194 eversion_t limit
= MAX(olog
.tail
, log
.tail
);
196 (first_non_divergent
== log
.log
.rend() ||
197 first_non_divergent
->version
< limit
) ?
199 first_non_divergent
->version
;
201 IndexedLog
folog(olog
);
202 auto divergent
= folog
.rewind_from_head(lu
);
203 _merge_divergent_entries(
207 olog
.get_can_rollback_to(),
212 if (lu
< oinfo
.last_update
) {
213 dout(10) << " peer osd." << from
<< " last_update now " << lu
<< dendl
;
214 oinfo
.last_update
= lu
;
217 if (omissing
.have_missing()) {
218 eversion_t first_missing
=
219 omissing
.get_items().at(omissing
.get_rmissing().begin()->second
).need
;
220 oinfo
.last_complete
= eversion_t();
221 list
<pg_log_entry_t
>::const_iterator i
= olog
.log
.begin();
225 if (i
->version
< first_missing
)
226 oinfo
.last_complete
= i
->version
;
231 oinfo
.last_complete
= oinfo
.last_update
;
// (doxygen remnant below lost its /** */ delimiters in extraction)
236 * rewind divergent entries at the head of the log
238 * This rewinds entries off the head of our log that are divergent.
239 * This is used by replicas during activation.
241 * @param newhead new head to rewind to
// PGLog::rewind_divergent_log — truncate log entries newer than
// `newhead`: lower last_complete if needed, rewind the log, mark the
// on-disk range dirty from the first divergent entry, merge the
// divergent entries back via _merge_divergent_entries, set
// info.last_update = newhead, and flag dirty_big_info.
// NOTE(review): garbled extraction — fused line numbers, split lines,
// and missing braces/argument lists; recover from upstream PGLog.cc.
243 void PGLog::rewind_divergent_log(eversion_t newhead
,
244 pg_info_t
&info
, LogEntryHandler
*rollbacker
,
245 bool &dirty_info
, bool &dirty_big_info
)
247 dout(10) << "rewind_divergent_log truncate divergent future " << newhead
<< dendl
;
250 if (info
.last_complete
> newhead
)
251 info
.last_complete
= newhead
;
253 auto divergent
= log
.rewind_from_head(newhead
);
254 if (!divergent
.empty()) {
255 mark_dirty_from(divergent
.front().version
);
257 for (auto &&entry
: divergent
) {
258 dout(10) << "rewind_divergent_log future divergent " << entry
<< dendl
;
260 info
.last_update
= newhead
;
262 _merge_divergent_entries(
266 log
.get_can_rollback_to(),
272 dirty_big_info
= true;
// PGLog::merge_log — merge the authoritative log `olog` (from osd
// `fromosd`) into our own: after checking overlap preconditions it
// (1) extends our tail backwards by splicing in older olog entries,
// (2) copies peer stats/hit_set when appropriate, (3) rewinds our
// divergent head via rewind_divergent_log when olog.head < log.head,
// (4) extends our head forward — finds the cut point, rewinds to it,
// splices the new olog entries in and updates missing — and finally
// updates last_update/last_user_version/purged_snaps and marks
// dirty_big_info.
// NOTE(review): garbled extraction — fused line numbers, split lines,
// and many missing source lines (loop headers/increments, splice and
// call argument lists, braces). Not compilable as-is; recover from
// upstream PGLog.cc.
275 void PGLog::merge_log(pg_info_t
&oinfo
, pg_log_t
&olog
, pg_shard_t fromosd
,
276 pg_info_t
&info
, LogEntryHandler
*rollbacker
,
277 bool &dirty_info
, bool &dirty_big_info
)
279 dout(10) << "merge_log " << olog
<< " from osd." << fromosd
280 << " into " << log
<< dendl
;
282 // Check preconditions
284 // If our log is empty, the incoming log needs to have not been trimmed.
285 assert(!log
.null() || olog
.tail
== eversion_t());
286 // The logs must overlap.
287 assert(log
.head
>= olog
.tail
&& olog
.head
>= log
.tail
);
289 for (map
<hobject_t
, pg_missing_item
>::const_iterator i
= missing
.get_items().begin();
290 i
!= missing
.get_items().end();
292 dout(20) << "pg_missing_t sobject: " << i
->first
<< dendl
;
295 bool changed
= false;
298 // this is just filling in history. it does not affect our
299 // missing set, as that should already be consistent with our
301 eversion_t orig_tail
= log
.tail
;
302 if (olog
.tail
< log
.tail
) {
303 dout(10) << "merge_log extending tail to " << olog
.tail
<< dendl
;
304 list
<pg_log_entry_t
>::iterator from
= olog
.log
.begin();
305 list
<pg_log_entry_t
>::iterator to
;
308 to
!= olog
.log
.end();
310 if (to
->version
> log
.tail
)
313 dout(15) << *to
<< dendl
;
318 // splice into our log.
319 log
.log
.splice(log
.log
.begin(),
322 info
.log_tail
= log
.tail
= olog
.tail
;
326 if (oinfo
.stats
.reported_seq
< info
.stats
.reported_seq
|| // make sure reported always increases
327 oinfo
.stats
.reported_epoch
< info
.stats
.reported_epoch
) {
328 oinfo
.stats
.reported_seq
= info
.stats
.reported_seq
;
329 oinfo
.stats
.reported_epoch
= info
.stats
.reported_epoch
;
331 if (info
.last_backfill
.is_max())
332 info
.stats
= oinfo
.stats
;
333 info
.hit_set
= oinfo
.hit_set
;
335 // do we have divergent entries to throw out?
336 if (olog
.head
< log
.head
) {
337 rewind_divergent_log(olog
.head
, info
, rollbacker
, dirty_info
, dirty_big_info
);
342 if (olog
.head
> log
.head
) {
343 dout(10) << "merge_log extending head to " << olog
.head
<< dendl
;
345 // find start point in olog
346 list
<pg_log_entry_t
>::iterator to
= olog
.log
.end();
347 list
<pg_log_entry_t
>::iterator from
= olog
.log
.end();
348 eversion_t lower_bound
= MAX(olog
.tail
, orig_tail
);
350 if (from
== olog
.log
.begin())
353 dout(20) << " ? " << *from
<< dendl
;
354 if (from
->version
<= log
.head
) {
355 lower_bound
= MAX(lower_bound
, from
->version
);
360 dout(20) << "merge_log cut point (usually last shared) is "
361 << lower_bound
<< dendl
;
362 mark_dirty_from(lower_bound
);
364 auto divergent
= log
.rewind_from_head(lower_bound
);
365 // move aside divergent items
366 for (auto &&oe
: divergent
) {
367 dout(10) << "merge_log divergent " << oe
<< dendl
;
369 log
.roll_forward_to(log
.head
, rollbacker
);
371 mempool::osd::list
<pg_log_entry_t
> new_entries
;
372 new_entries
.splice(new_entries
.end(), olog
.log
, from
, to
);
373 append_log_entries_update_missing(
375 info
.last_backfill_bitwise
,
383 _merge_divergent_entries(
387 log
.get_can_rollback_to(),
392 info
.last_update
= log
.head
= olog
.head
;
394 // We cannot rollback into the new log entries
395 log
.skip_can_rollback_to_to_head();
397 info
.last_user_version
= oinfo
.last_user_version
;
398 info
.purged_snaps
= oinfo
.purged_snaps
;
403 dout(10) << "merge_log result " << log
<< " " << missing
<< " changed=" << changed
<< dendl
;
407 dirty_big_info
= true;
// PGLog::check — debug-only consistency check: the set of omap keys we
// believe are on disk (log_keys_debug) must correspond 1:1 with the
// in-memory log entries; dumps both and asserts on mismatch.
// NOTE(review): garbled extraction — missing loop increments/braces and
// (per upstream) an early return when pg_log_debug is off. Recover from
// upstream PGLog.cc.
411 void PGLog::check() {
414 if (log
.log
.size() != log_keys_debug
.size()) {
415 derr
<< "log.log.size() != log_keys_debug.size()" << dendl
;
416 derr
<< "actual log:" << dendl
;
417 for (list
<pg_log_entry_t
>::iterator i
= log
.log
.begin();
420 derr
<< " " << *i
<< dendl
;
422 derr
<< "log_keys_debug:" << dendl
;
423 for (set
<string
>::const_iterator i
= log_keys_debug
.begin();
424 i
!= log_keys_debug
.end();
426 derr
<< " " << *i
<< dendl
;
429 assert(log
.log
.size() == log_keys_debug
.size());
430 for (list
<pg_log_entry_t
>::iterator i
= log
.log
.begin();
433 assert(log_keys_debug
.count(i
->get_key_name()));
// PGLog::write_log_and_missing (member overload) — when the log is
// dirty, serialize the dirty range plus missing set into *km /
// transaction t via _write_log_and_missing, passing the member dirty
// bounds and the pg_log_debug key-tracking set; otherwise just logs
// "log is not dirty".
// NOTE(review): garbled extraction — the is_dirty() guard, part of the
// _write_log_and_missing argument list, and braces are missing.
437 void PGLog::write_log_and_missing(
438 ObjectStore::Transaction
& t
,
439 map
<string
,bufferlist
> *km
,
440 const coll_t
& coll
, const ghobject_t
&log_oid
,
441 bool require_rollback
)
444 dout(5) << "write_log_and_missing with: "
445 << "dirty_to: " << dirty_to
446 << ", dirty_from: " << dirty_from
447 << ", writeout_from: " << writeout_from
448 << ", trimmed: " << trimmed
449 << ", clear_divergent_priors: " << clear_divergent_priors
451 _write_log_and_missing(
452 t
, km
, log
, coll
, log_oid
,
460 clear_divergent_priors
,
461 (pg_log_debug
? &log_keys_debug
: 0));
464 dout(10) << "log is not dirty" << dendl
;
// PGLog::write_log_and_missing_wo_missing (static helper entry) —
// unconditionally rewrite the whole log (dirty_to = max, full rewrite)
// with divergent_priors but without a missing set; used when rebuilding
// the persisted log from scratch.
// NOTE(review): garbled extraction — parameter/argument lines are split
// and some are missing; recover from upstream PGLog.cc.
468 void PGLog::write_log_and_missing_wo_missing(
469 ObjectStore::Transaction
& t
,
470 map
<string
,bufferlist
> *km
,
472 const coll_t
& coll
, const ghobject_t
&log_oid
,
473 map
<eversion_t
, hobject_t
> &divergent_priors
,
474 bool require_rollback
)
476 _write_log_and_missing_wo_missing(
477 t
, km
, log
, coll
, log_oid
,
478 divergent_priors
, eversion_t::max(), eversion_t(), eversion_t(),
480 true, true, require_rollback
, 0);
// PGLog::write_log_and_missing (static overload) — unconditionally
// rewrite the whole persisted log plus the given missing tracker; the
// full-rewrite bounds (eversion_t::max(), ...) were lost from the
// argument list in extraction.
// NOTE(review): garbled — missing parameter lines (pg_log_t&, coll_t&)
// and parts of the forwarded argument list; recover from upstream.
483 void PGLog::write_log_and_missing(
484 ObjectStore::Transaction
& t
,
485 map
<string
,bufferlist
> *km
,
488 const ghobject_t
&log_oid
,
489 const pg_missing_tracker_t
&missing
,
490 bool require_rollback
)
492 _write_log_and_missing(
493 t
, km
, log
, coll
, log_oid
,
499 true, require_rollback
, false, 0);
// PGLog::_write_log_and_missing_wo_missing — serialize the log (without
// a missing set) into the key/value map *km and transaction t:
// collect trimmed-entry keys for removal, clear the dirty omap ranges,
// re-encode every entry in [dirty range] forward and the
// dirty_from/writeout_from tail backward, maintain the optional
// log_keys_debug tracking set, write divergent_priors when dirty, and
// persist can_rollback_to / rollback_info_trimmed_to when
// require_rollback. Finally queues omap_rmkeys for to_remove.
// NOTE(review): garbled extraction — loop headers/increments,
// omap_rmkeyrange calls, braces and the eversion_t dirty_to parameter
// are missing, and original line numbers are fused into the text. Not
// compilable as-is; recover from upstream PGLog.cc.
502 void PGLog::_write_log_and_missing_wo_missing(
503 ObjectStore::Transaction
& t
,
504 map
<string
,bufferlist
> *km
,
506 const coll_t
& coll
, const ghobject_t
&log_oid
,
507 map
<eversion_t
, hobject_t
> &divergent_priors
,
509 eversion_t dirty_from
,
510 eversion_t writeout_from
,
511 const set
<eversion_t
> &trimmed
,
512 bool dirty_divergent_priors
,
514 bool require_rollback
,
515 set
<string
> *log_keys_debug
518 set
<string
> to_remove
;
519 for (set
<eversion_t
>::const_iterator i
= trimmed
.begin();
522 to_remove
.insert(i
->get_key_name());
523 if (log_keys_debug
) {
524 assert(log_keys_debug
->count(i
->get_key_name()));
525 log_keys_debug
->erase(i
->get_key_name());
529 //dout(10) << "write_log_and_missing, clearing up to " << dirty_to << dendl;
531 t
.touch(coll
, log_oid
);
532 if (dirty_to
!= eversion_t()) {
535 eversion_t().get_key_name(), dirty_to
.get_key_name());
536 clear_up_to(log_keys_debug
, dirty_to
.get_key_name());
538 if (dirty_to
!= eversion_t::max() && dirty_from
!= eversion_t::max()) {
539 // dout(10) << "write_log_and_missing, clearing from " << dirty_from << dendl;
542 dirty_from
.get_key_name(), eversion_t::max().get_key_name());
543 clear_after(log_keys_debug
, dirty_from
.get_key_name());
546 for (list
<pg_log_entry_t
>::iterator p
= log
.log
.begin();
547 p
!= log
.log
.end() && p
->version
<= dirty_to
;
549 bufferlist
bl(sizeof(*p
) * 2);
550 p
->encode_with_checksum(bl
);
551 (*km
)[p
->get_key_name()].claim(bl
);
554 for (list
<pg_log_entry_t
>::reverse_iterator p
= log
.log
.rbegin();
555 p
!= log
.log
.rend() &&
556 (p
->version
>= dirty_from
|| p
->version
>= writeout_from
) &&
557 p
->version
>= dirty_to
;
559 bufferlist
bl(sizeof(*p
) * 2);
560 p
->encode_with_checksum(bl
);
561 (*km
)[p
->get_key_name()].claim(bl
);
564 if (log_keys_debug
) {
565 for (map
<string
, bufferlist
>::iterator i
= (*km
).begin();
568 if (i
->first
[0] == '_')
570 assert(!log_keys_debug
->count(i
->first
));
571 log_keys_debug
->insert(i
->first
);
575 if (dirty_divergent_priors
) {
576 //dout(10) << "write_log_and_missing: writing divergent_priors" << dendl;
577 ::encode(divergent_priors
, (*km
)["divergent_priors"]);
579 if (require_rollback
) {
581 log
.get_can_rollback_to(),
582 (*km
)["can_rollback_to"]);
584 log
.get_rollback_info_trimmed_to(),
585 (*km
)["rollback_info_trimmed_to"]);
588 if (!to_remove
.empty())
589 t
.omap_rmkeys(coll
, log_oid
, to_remove
);
// PGLog::_write_log_and_missing — like the _wo_missing variant but also
// persists the pg_missing_tracker_t: same trimmed-key removal, dirty
// omap-range clearing, forward/backward entry re-encoding and
// log_keys_debug maintenance, then removes "divergent_priors" when
// clear_divergent_priors, writes/removes per-object "missing/<obj>"
// keys via the lambda over changed missing items, and persists rollback
// bounds when require_rollback. Finishes with omap_rmkeys(to_remove).
// NOTE(review): garbled extraction — loop headers/increments, the
// eversion_t dirty_to parameter, omap_rmkeyrange calls, the
// get_changed() invocation that the lambda below is passed to, and
// braces are all missing; original line numbers are fused into the
// text. Not compilable as-is; recover from upstream PGLog.cc.
592 void PGLog::_write_log_and_missing(
593 ObjectStore::Transaction
& t
,
594 map
<string
,bufferlist
>* km
,
596 const coll_t
& coll
, const ghobject_t
&log_oid
,
598 eversion_t dirty_from
,
599 eversion_t writeout_from
,
600 const set
<eversion_t
> &trimmed
,
601 const pg_missing_tracker_t
&missing
,
603 bool require_rollback
,
604 bool clear_divergent_priors
,
605 set
<string
> *log_keys_debug
607 set
<string
> to_remove
;
608 for (set
<eversion_t
>::const_iterator i
= trimmed
.begin();
611 to_remove
.insert(i
->get_key_name());
612 if (log_keys_debug
) {
613 assert(log_keys_debug
->count(i
->get_key_name()));
614 log_keys_debug
->erase(i
->get_key_name());
619 t
.touch(coll
, log_oid
);
620 if (dirty_to
!= eversion_t()) {
623 eversion_t().get_key_name(), dirty_to
.get_key_name());
624 clear_up_to(log_keys_debug
, dirty_to
.get_key_name());
626 if (dirty_to
!= eversion_t::max() && dirty_from
!= eversion_t::max()) {
627 // dout(10) << "write_log_and_missing, clearing from " << dirty_from << dendl;
630 dirty_from
.get_key_name(), eversion_t::max().get_key_name());
631 clear_after(log_keys_debug
, dirty_from
.get_key_name());
634 for (list
<pg_log_entry_t
>::iterator p
= log
.log
.begin();
635 p
!= log
.log
.end() && p
->version
<= dirty_to
;
637 bufferlist
bl(sizeof(*p
) * 2);
638 p
->encode_with_checksum(bl
);
639 (*km
)[p
->get_key_name()].claim(bl
);
642 for (list
<pg_log_entry_t
>::reverse_iterator p
= log
.log
.rbegin();
643 p
!= log
.log
.rend() &&
644 (p
->version
>= dirty_from
|| p
->version
>= writeout_from
) &&
645 p
->version
>= dirty_to
;
647 bufferlist
bl(sizeof(*p
) * 2);
648 p
->encode_with_checksum(bl
);
649 (*km
)[p
->get_key_name()].claim(bl
);
652 if (log_keys_debug
) {
653 for (map
<string
, bufferlist
>::iterator i
= (*km
).begin();
656 if (i
->first
[0] == '_')
658 assert(!log_keys_debug
->count(i
->first
));
659 log_keys_debug
->insert(i
->first
);
663 if (clear_divergent_priors
) {
664 //dout(10) << "write_log_and_missing: writing divergent_priors" << dendl;
665 to_remove
.insert("divergent_priors");
668 [&](const hobject_t
&obj
) {
669 string key
= string("missing/") + obj
.to_str();
670 pg_missing_item item
;
671 if (!missing
.is_missing(obj
, &item
)) {
672 to_remove
.insert(key
);
674 ::encode(make_pair(obj
, item
), (*km
)[key
]);
677 if (require_rollback
) {
679 log
.get_can_rollback_to(),
680 (*km
)["can_rollback_to"]);
682 log
.get_rollback_info_trimmed_to(),
683 (*km
)["rollback_info_trimmed_to"]);
686 if (!to_remove
.empty())
687 t
.omap_rmkeys(coll
, log_oid
, to_remove
);