git.proxmox.com Git - ceph.git/blob - ceph/src/osd/PGLog.h
import ceph nautilus 14.2.2
[ceph.git] / ceph / src / osd / PGLog.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17 #pragma once
18
19 // re-include our assert to clobber boost's
20 #include "include/ceph_assert.h"
21 #include "osd_types.h"
22 #include "os/ObjectStore.h"
23 #include <list>
24
// Bitmask flags recording which of IndexedLog's lazily-built indexes
// (objects / caller_ops / extra_caller_ops / dup_index) are currently valid;
// see IndexedLog::indexed_data and IndexedLog::index().
constexpr auto PGLOG_INDEXED_OBJECTS = 1 << 0;
constexpr auto PGLOG_INDEXED_CALLER_OPS = 1 << 1;
constexpr auto PGLOG_INDEXED_EXTRA_CALLER_OPS = 1 << 2;
constexpr auto PGLOG_INDEXED_DUPS = 1 << 3;
constexpr auto PGLOG_INDEXED_ALL = PGLOG_INDEXED_OBJECTS
  | PGLOG_INDEXED_CALLER_OPS
  | PGLOG_INDEXED_EXTRA_CALLER_OPS
  | PGLOG_INDEXED_DUPS;
33
34 class CephContext;
35
36 struct PGLog : DoutPrefixProvider {
  // DoutPrefixProvider interface, so PGLog itself can be passed to
  // ldpp_dout()-style logging macros.
  std::ostream& gen_prefix(std::ostream& out) const override {
    return out;  // no per-instance prefix
  }
  unsigned get_subsys() const override {
    return static_cast<unsigned>(ceph_subsys_osd);
  }
  CephContext *get_cct() const override {
    return cct;
  }
46
47 ////////////////////////////// sub classes //////////////////////////////
  // Callback interface used while advancing/trimming the log; the
  // implementation (provided by the caller, elsewhere) reacts to each
  // affected entry or object.
  struct LogEntryHandler {
    /// undo the effects of entry
    virtual void rollback(
      const pg_log_entry_t &entry) = 0;
    /// make entry's effects permanent (its rollback info can be dropped)
    virtual void rollforward(
      const pg_log_entry_t &entry) = 0;
    /// entry is being trimmed from the log
    virtual void trim(
      const pg_log_entry_t &entry) = 0;
    /// remove hoid entirely
    virtual void remove(
      const hobject_t &hoid) = 0;
    /// stash version v of hoid instead of removing it outright
    virtual void try_stash(
      const hobject_t &hoid,
      version_t v) = 0;
    virtual ~LogEntryHandler() {}
  };
62
63 public:
64 /**
65 * IndexLog - adds in-memory index of the log, by oid.
66 * plus some methods to manipulate it all.
67 */
68 struct IndexedLog : public pg_log_t {
69 mutable ceph::unordered_map<hobject_t,pg_log_entry_t*> objects; // ptrs into log. be careful!
70 mutable ceph::unordered_map<osd_reqid_t,pg_log_entry_t*> caller_ops;
71 mutable ceph::unordered_multimap<osd_reqid_t,pg_log_entry_t*> extra_caller_ops;
72 mutable ceph::unordered_map<osd_reqid_t,pg_log_dup_t*> dup_index;
73
74 // recovery pointers
75 list<pg_log_entry_t>::iterator complete_to; // not inclusive of referenced item
76 version_t last_requested = 0; // last object requested by primary
77
78 //
79 private:
80 mutable __u16 indexed_data = 0;
81 /**
82 * rollback_info_trimmed_to_riter points to the first log entry <=
83 * rollback_info_trimmed_to
84 *
85 * It's a reverse_iterator because rend() is a natural representation for
86 * tail, and rbegin() works nicely for head.
87 */
88 mempool::osd_pglog::list<pg_log_entry_t>::reverse_iterator
89 rollback_info_trimmed_to_riter;
90
91 template <typename F>
92 void advance_can_rollback_to(eversion_t to, F &&f) {
93 if (to > can_rollback_to)
94 can_rollback_to = to;
95
96 if (to > rollback_info_trimmed_to)
97 rollback_info_trimmed_to = to;
98
99 while (rollback_info_trimmed_to_riter != log.rbegin()) {
100 --rollback_info_trimmed_to_riter;
101 if (rollback_info_trimmed_to_riter->version > rollback_info_trimmed_to) {
102 ++rollback_info_trimmed_to_riter;
103 break;
104 }
105 f(*rollback_info_trimmed_to_riter);
106 }
107 }
108
109 void reset_rollback_info_trimmed_to_riter() {
110 rollback_info_trimmed_to_riter = log.rbegin();
111 while (rollback_info_trimmed_to_riter != log.rend() &&
112 rollback_info_trimmed_to_riter->version > rollback_info_trimmed_to)
113 ++rollback_info_trimmed_to_riter;
114 }
115
116 // indexes objects, caller ops and extra caller ops
117 public:
118 IndexedLog() :
119 complete_to(log.end()),
120 last_requested(0),
121 indexed_data(0),
122 rollback_info_trimmed_to_riter(log.rbegin())
123 { }
124
125 template <typename... Args>
126 explicit IndexedLog(Args&&... args) :
127 pg_log_t(std::forward<Args>(args)...),
128 complete_to(log.end()),
129 last_requested(0),
130 indexed_data(0),
131 rollback_info_trimmed_to_riter(log.rbegin())
132 {
133 reset_rollback_info_trimmed_to_riter();
134 index();
135 }
136
137 IndexedLog(const IndexedLog &rhs) :
138 pg_log_t(rhs),
139 complete_to(log.end()),
140 last_requested(rhs.last_requested),
141 indexed_data(0),
142 rollback_info_trimmed_to_riter(log.rbegin())
143 {
144 reset_rollback_info_trimmed_to_riter();
145 index(rhs.indexed_data);
146 }
147
148 IndexedLog &operator=(const IndexedLog &rhs) {
149 this->~IndexedLog();
150 new (this) IndexedLog(rhs);
151 return *this;
152 }
153
154 void trim_rollback_info_to(eversion_t to, LogEntryHandler *h) {
155 advance_can_rollback_to(
156 to,
157 [&](pg_log_entry_t &entry) {
158 h->trim(entry);
159 });
160 }
161 void roll_forward_to(eversion_t to, LogEntryHandler *h) {
162 advance_can_rollback_to(
163 to,
164 [&](pg_log_entry_t &entry) {
165 h->rollforward(entry);
166 });
167 }
168
169 void skip_can_rollback_to_to_head() {
170 advance_can_rollback_to(head, [&](const pg_log_entry_t &entry) {});
171 }
172
173 mempool::osd_pglog::list<pg_log_entry_t> rewind_from_head(eversion_t newhead) {
174 auto divergent = pg_log_t::rewind_from_head(newhead);
175 index();
176 reset_rollback_info_trimmed_to_riter();
177 return divergent;
178 }
179
180 template <typename T>
181 void scan_log_after(
182 const eversion_t &bound, ///< [in] scan entries > bound
183 T &&f) const {
184 auto iter = log.rbegin();
185 while (iter != log.rend() && iter->version > bound)
186 ++iter;
187
188 while (true) {
189 if (iter == log.rbegin())
190 break;
191 f(*(--iter));
192 }
193 }
194
195 /****/
196 void claim_log_and_clear_rollback_info(const pg_log_t& o) {
197 // we must have already trimmed the old entries
198 ceph_assert(rollback_info_trimmed_to == head);
199 ceph_assert(rollback_info_trimmed_to_riter == log.rbegin());
200
201 *this = IndexedLog(o);
202
203 skip_can_rollback_to_to_head();
204 index();
205 }
206
207 void split_out_child(
208 pg_t child_pgid,
209 unsigned split_bits,
210 IndexedLog *target);
211
212 void zero() {
213 // we must have already trimmed the old entries
214 ceph_assert(rollback_info_trimmed_to == head);
215 ceph_assert(rollback_info_trimmed_to_riter == log.rbegin());
216
217 unindex();
218 pg_log_t::clear();
219 rollback_info_trimmed_to_riter = log.rbegin();
220 reset_recovery_pointers();
221 }
222 void clear() {
223 skip_can_rollback_to_to_head();
224 zero();
225 }
226 void reset_recovery_pointers() {
227 complete_to = log.end();
228 last_requested = 0;
229 }
230
231 bool logged_object(const hobject_t& oid) const {
232 if (!(indexed_data & PGLOG_INDEXED_OBJECTS)) {
233 index_objects();
234 }
235 return objects.count(oid);
236 }
237
238 bool logged_req(const osd_reqid_t &r) const {
239 if (!(indexed_data & PGLOG_INDEXED_CALLER_OPS)) {
240 index_caller_ops();
241 }
242 if (!caller_ops.count(r)) {
243 if (!(indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS)) {
244 index_extra_caller_ops();
245 }
246 return extra_caller_ops.count(r);
247 }
248 return true;
249 }
250
251 bool get_request(
252 const osd_reqid_t &r,
253 eversion_t *version,
254 version_t *user_version,
255 int *return_code) const
256 {
257 ceph_assert(version);
258 ceph_assert(user_version);
259 ceph_assert(return_code);
260 ceph::unordered_map<osd_reqid_t,pg_log_entry_t*>::const_iterator p;
261 if (!(indexed_data & PGLOG_INDEXED_CALLER_OPS)) {
262 index_caller_ops();
263 }
264 p = caller_ops.find(r);
265 if (p != caller_ops.end()) {
266 *version = p->second->version;
267 *user_version = p->second->user_version;
268 *return_code = p->second->return_code;
269 return true;
270 }
271
272 // warning: we will return *a* request for this reqid, but not
273 // necessarily the most recent.
274 if (!(indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS)) {
275 index_extra_caller_ops();
276 }
277 p = extra_caller_ops.find(r);
278 if (p != extra_caller_ops.end()) {
279 uint32_t idx = 0;
280 for (auto i = p->second->extra_reqids.begin();
281 i != p->second->extra_reqids.end();
282 ++idx, ++i) {
283 if (i->first == r) {
284 *version = p->second->version;
285 *user_version = i->second;
286 *return_code = p->second->return_code;
287 if (*return_code >= 0) {
288 auto it = p->second->extra_reqid_return_codes.find(idx);
289 if (it != p->second->extra_reqid_return_codes.end()) {
290 *return_code = it->second;
291 }
292 }
293 return true;
294 }
295 }
296 ceph_abort_msg("in extra_caller_ops but not extra_reqids");
297 }
298
299 if (!(indexed_data & PGLOG_INDEXED_DUPS)) {
300 index_dups();
301 }
302 auto q = dup_index.find(r);
303 if (q != dup_index.end()) {
304 *version = q->second->version;
305 *user_version = q->second->user_version;
306 *return_code = q->second->return_code;
307 return true;
308 }
309
310 return false;
311 }
312
313 /// get a (bounded) list of recent reqids for the given object
314 void get_object_reqids(const hobject_t& oid, unsigned max,
315 mempool::osd_pglog::vector<pair<osd_reqid_t, version_t> > *pls,
316 mempool::osd_pglog::map<uint32_t, int> *return_codes) const {
317 // make sure object is present at least once before we do an
318 // O(n) search.
319 if (!(indexed_data & PGLOG_INDEXED_OBJECTS)) {
320 index_objects();
321 }
322 if (objects.count(oid) == 0)
323 return;
324
325 for (list<pg_log_entry_t>::const_reverse_iterator i = log.rbegin();
326 i != log.rend();
327 ++i) {
328 if (i->soid == oid) {
329 if (i->reqid_is_indexed()) {
330 if (i->op == pg_log_entry_t::ERROR) {
331 // propagate op errors to the cache tier's PG log
332 return_codes->emplace(pls->size(), i->return_code);
333 }
334 pls->push_back(make_pair(i->reqid, i->user_version));
335 }
336
337 pls->insert(pls->end(), i->extra_reqids.begin(), i->extra_reqids.end());
338 if (pls->size() >= max) {
339 if (pls->size() > max) {
340 pls->resize(max);
341 }
342 return;
343 }
344 }
345 }
346 }
347
348 void index(__u16 to_index = PGLOG_INDEXED_ALL) const {
349 // if to_index is 0, no need to run any of this code, especially
350 // loop below; this can happen with copy constructor for
351 // IndexedLog (and indirectly through assignment operator)
352 if (!to_index) return;
353
354 if (to_index & PGLOG_INDEXED_OBJECTS)
355 objects.clear();
356 if (to_index & PGLOG_INDEXED_CALLER_OPS)
357 caller_ops.clear();
358 if (to_index & PGLOG_INDEXED_EXTRA_CALLER_OPS)
359 extra_caller_ops.clear();
360 if (to_index & PGLOG_INDEXED_DUPS) {
361 dup_index.clear();
362 for (auto& i : dups) {
363 dup_index[i.reqid] = const_cast<pg_log_dup_t*>(&i);
364 }
365 }
366
367 constexpr __u16 any_log_entry_index =
368 PGLOG_INDEXED_OBJECTS |
369 PGLOG_INDEXED_CALLER_OPS |
370 PGLOG_INDEXED_EXTRA_CALLER_OPS;
371
372 if (to_index & any_log_entry_index) {
373 for (list<pg_log_entry_t>::const_iterator i = log.begin();
374 i != log.end();
375 ++i) {
376 if (to_index & PGLOG_INDEXED_OBJECTS) {
377 if (i->object_is_indexed()) {
378 objects[i->soid] = const_cast<pg_log_entry_t*>(&(*i));
379 }
380 }
381
382 if (to_index & PGLOG_INDEXED_CALLER_OPS) {
383 if (i->reqid_is_indexed()) {
384 caller_ops[i->reqid] = const_cast<pg_log_entry_t*>(&(*i));
385 }
386 }
387
388 if (to_index & PGLOG_INDEXED_EXTRA_CALLER_OPS) {
389 for (auto j = i->extra_reqids.begin();
390 j != i->extra_reqids.end();
391 ++j) {
392 extra_caller_ops.insert(
393 make_pair(j->first, const_cast<pg_log_entry_t*>(&(*i))));
394 }
395 }
396 }
397 }
398
399 indexed_data |= to_index;
400 }
401
402 void index_objects() const {
403 index(PGLOG_INDEXED_OBJECTS);
404 }
405
406 void index_caller_ops() const {
407 index(PGLOG_INDEXED_CALLER_OPS);
408 }
409
410 void index_extra_caller_ops() const {
411 index(PGLOG_INDEXED_EXTRA_CALLER_OPS);
412 }
413
414 void index_dups() const {
415 index(PGLOG_INDEXED_DUPS);
416 }
417
418 void index(pg_log_entry_t& e) {
419 if ((indexed_data & PGLOG_INDEXED_OBJECTS) && e.object_is_indexed()) {
420 if (objects.count(e.soid) == 0 ||
421 objects[e.soid]->version < e.version)
422 objects[e.soid] = &e;
423 }
424 if (indexed_data & PGLOG_INDEXED_CALLER_OPS) {
425 // divergent merge_log indexes new before unindexing old
426 if (e.reqid_is_indexed()) {
427 caller_ops[e.reqid] = &e;
428 }
429 }
430 if (indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS) {
431 for (auto j = e.extra_reqids.begin();
432 j != e.extra_reqids.end();
433 ++j) {
434 extra_caller_ops.insert(make_pair(j->first, &e));
435 }
436 }
437 }
438
439 void unindex() {
440 objects.clear();
441 caller_ops.clear();
442 extra_caller_ops.clear();
443 dup_index.clear();
444 indexed_data = 0;
445 }
446
447 void unindex(const pg_log_entry_t& e) {
448 // NOTE: this only works if we remove from the _tail_ of the log!
449 if (indexed_data & PGLOG_INDEXED_OBJECTS) {
450 auto it = objects.find(e.soid);
451 if (it != objects.end() && it->second->version == e.version)
452 objects.erase(it);
453 }
454 if (e.reqid_is_indexed()) {
455 if (indexed_data & PGLOG_INDEXED_CALLER_OPS) {
456 auto it = caller_ops.find(e.reqid);
457 // divergent merge_log indexes new before unindexing old
458 if (it != caller_ops.end() && it->second == &e)
459 caller_ops.erase(it);
460 }
461 }
462 if (indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS) {
463 for (auto j = e.extra_reqids.begin();
464 j != e.extra_reqids.end();
465 ++j) {
466 for (ceph::unordered_multimap<osd_reqid_t,pg_log_entry_t*>::iterator k =
467 extra_caller_ops.find(j->first);
468 k != extra_caller_ops.end() && k->first == j->first;
469 ++k) {
470 if (k->second == &e) {
471 extra_caller_ops.erase(k);
472 break;
473 }
474 }
475 }
476 }
477 }
478
479 void index(pg_log_dup_t& e) {
480 if (indexed_data & PGLOG_INDEXED_DUPS) {
481 dup_index[e.reqid] = &e;
482 }
483 }
484
485 void unindex(const pg_log_dup_t& e) {
486 if (indexed_data & PGLOG_INDEXED_DUPS) {
487 auto i = dup_index.find(e.reqid);
488 if (i != dup_index.end()) {
489 dup_index.erase(i);
490 }
491 }
492 }
493
494 // actors
495 void add(const pg_log_entry_t& e, bool applied = true) {
496 if (!applied) {
497 ceph_assert(get_can_rollback_to() == head);
498 }
499
500 // make sure our buffers don't pin bigger buffers
501 e.mod_desc.trim_bl();
502
503 // add to log
504 log.push_back(e);
505
506 // riter previously pointed to the previous entry
507 if (rollback_info_trimmed_to_riter == log.rbegin())
508 ++rollback_info_trimmed_to_riter;
509
510 ceph_assert(e.version > head);
511 ceph_assert(head.version == 0 || e.version.version > head.version);
512 head = e.version;
513
514 // to our index
515 if ((indexed_data & PGLOG_INDEXED_OBJECTS) && e.object_is_indexed()) {
516 objects[e.soid] = &(log.back());
517 }
518 if (indexed_data & PGLOG_INDEXED_CALLER_OPS) {
519 if (e.reqid_is_indexed()) {
520 caller_ops[e.reqid] = &(log.back());
521 }
522 }
523
524 if (indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS) {
525 for (auto j = e.extra_reqids.begin();
526 j != e.extra_reqids.end();
527 ++j) {
528 extra_caller_ops.insert(make_pair(j->first, &(log.back())));
529 }
530 }
531
532 if (!applied) {
533 skip_can_rollback_to_to_head();
534 }
535 } // add
536
537 void trim(
538 CephContext* cct,
539 eversion_t s,
540 set<eversion_t> *trimmed,
541 set<string>* trimmed_dups,
542 eversion_t *write_from_dups);
543
544 ostream& print(ostream& out) const;
545 }; // IndexedLog
546
547
protected:
  //////////////////// data members ////////////////////

  pg_missing_tracker_t missing;
  IndexedLog log;

  eversion_t dirty_to;         ///< must clear/writeout all keys <= dirty_to
  eversion_t dirty_from;       ///< must clear/writeout all keys >= dirty_from
  eversion_t writeout_from;    ///< must writout keys >= writeout_from
  set<eversion_t> trimmed;     ///< must clear keys in trimmed
  eversion_t dirty_to_dups;    ///< must clear/writeout all dups <= dirty_to_dups
  eversion_t dirty_from_dups;  ///< must clear/writeout all dups >= dirty_from_dups
  eversion_t write_from_dups;  ///< must write keys >= write_from_dups
  set<string> trimmed_dups;    ///< must clear keys in trimmed_dups
  CephContext *cct;
  bool pg_log_debug;           ///< derived from osd_debug_pg_log_writeout (see ctor)
  /// Log is clean on [dirty_to, dirty_from)
  bool touched_log;
  bool clear_divergent_priors;
  bool rebuilt_missing_with_deletes = false;
568
569 void mark_dirty_to(eversion_t to) {
570 if (to > dirty_to)
571 dirty_to = to;
572 }
573 void mark_dirty_from(eversion_t from) {
574 if (from < dirty_from)
575 dirty_from = from;
576 }
577 void mark_writeout_from(eversion_t from) {
578 if (from < writeout_from)
579 writeout_from = from;
580 }
581 void mark_dirty_to_dups(eversion_t to) {
582 if (to > dirty_to_dups)
583 dirty_to_dups = to;
584 }
585 void mark_dirty_from_dups(eversion_t from) {
586 if (from < dirty_from_dups)
587 dirty_from_dups = from;
588 }
589 public:
  /// true if any in-memory log/missing state still needs to be persisted
  bool is_dirty() const {
    return !touched_log ||
      (dirty_to != eversion_t()) ||
      (dirty_from != eversion_t::max()) ||
      (writeout_from != eversion_t::max()) ||
      !(trimmed.empty()) ||
      !missing.is_clean() ||
      !(trimmed_dups.empty()) ||
      (dirty_to_dups != eversion_t()) ||
      (dirty_from_dups != eversion_t::max()) ||
      (write_from_dups != eversion_t::max()) ||
      rebuilt_missing_with_deletes;
  }
603 void mark_log_for_rewrite() {
604 mark_dirty_to(eversion_t::max());
605 mark_dirty_from(eversion_t());
606 mark_dirty_to_dups(eversion_t::max());
607 mark_dirty_from_dups(eversion_t());
608 touched_log = false;
609 }
610 bool get_rebuilt_missing_with_deletes() const {
611 return rebuilt_missing_with_deletes;
612 }
613 protected:
614
  /// DEBUG
  // NOTE(review): appears to mirror the set of persisted log keys for the
  // debug checks in check()/clear_after()/clear_up_to() — confirm in PGLog.cc
  set<string> log_keys_debug;
617 static void clear_after(set<string> *log_keys_debug, const string &lb) {
618 if (!log_keys_debug)
619 return;
620 for (set<string>::iterator i = log_keys_debug->lower_bound(lb);
621 i != log_keys_debug->end();
622 log_keys_debug->erase(i++));
623 }
624 static void clear_up_to(set<string> *log_keys_debug, const string &ub) {
625 if (!log_keys_debug)
626 return;
627 for (set<string>::iterator i = log_keys_debug->begin();
628 i != log_keys_debug->end() && *i < ub;
629 log_keys_debug->erase(i++));
630 }
631
  void check();  // debug consistency check (implementation not in this header)
  /// reset all dirty/trim bookkeeping after a successful write-out
  void undirty() {
    dirty_to = eversion_t();
    dirty_from = eversion_t::max();
    touched_log = true;
    trimmed.clear();
    trimmed_dups.clear();
    writeout_from = eversion_t::max();
    check();
    missing.flush();
    dirty_to_dups = eversion_t();
    dirty_from_dups = eversion_t::max();
    write_from_dups = eversion_t::max();
  }
646 public:
647
  // 'from'-style markers start at max() (== nothing dirty); the mark_*
  // helpers only move them outward.
  // cppcheck-suppress noExplicitConstructor
  PGLog(CephContext *cct) :
    dirty_from(eversion_t::max()),
    writeout_from(eversion_t::max()),
    dirty_from_dups(eversion_t::max()),
    write_from_dups(eversion_t::max()),
    cct(cct),
    // debug writeout checks enabled unless osd_debug_pg_log_writeout is set
    pg_log_debug(!(cct && !(cct->_conf->osd_debug_pg_log_writeout))),
    touched_log(false),
    clear_divergent_priors(false)
  { }
659
  void reset_backfill();

  void clear();

  //////////////////// get or set missing ////////////////////

  const pg_missing_tracker_t& get_missing() const { return missing; }

  /// record oid as missing: we need 'need' and have (at most) 'have'
  void missing_add(const hobject_t& oid, eversion_t need, eversion_t have, bool is_delete=false) {
    missing.add(oid, need, have, is_delete);
  }

  /// update the missing set from a newly observed log entry
  void missing_add_next_entry(const pg_log_entry_t& e) {
    missing.add_next_event(e);
  }
675
676 //////////////////// get or set log ////////////////////
677
  // Thin accessors/wrappers over the embedded IndexedLog.
  const IndexedLog &get_log() const { return log; }

  const eversion_t &get_tail() const { return log.tail; }

  void set_tail(eversion_t tail) { log.tail = tail; }

  const eversion_t &get_head() const { return log.head; }

  void set_head(eversion_t head) { log.head = head; }

  void set_last_requested(version_t last_requested) {
    log.last_requested = last_requested;
  }

  void index() { log.index(); }

  void unindex() { log.unindex(); }

  /// append e; widens the writeout window first so the entry is persisted
  void add(const pg_log_entry_t& e, bool applied = true) {
    mark_writeout_from(e.version);
    log.add(e, applied);
  }

  void reset_recovery_pointers() { log.reset_recovery_pointers(); }

  static void clear_info_log(
    spg_t pgid,
    ObjectStore::Transaction *t);

  void trim(
    eversion_t trim_to,
    pg_info_t &info,
    bool transaction_applied = true,
    bool async = false);

  void roll_forward_to(
    eversion_t roll_forward_to,
    LogEntryHandler *h) {
    log.roll_forward_to(
      roll_forward_to,
      h);
  }

  eversion_t get_can_rollback_to() const {
    return log.get_can_rollback_to();
  }

  /// roll everything up to head forward
  void roll_forward(LogEntryHandler *h) {
    roll_forward_to(
      log.head,
      h);
  }

  void skip_rollforward() {
    log.skip_can_rollback_to_to_head();
  }
734
735 //////////////////// get or set log & missing ////////////////////
736
  /// adopt log o wholesale (trimming our rollback info), clear missing,
  /// and mark both the entry and dup key ranges for full rewrite
  void reset_backfill_claim_log(const pg_log_t &o, LogEntryHandler *h) {
    log.trim_rollback_info_to(log.head, h);
    log.claim_log_and_clear_rollback_info(o);
    missing.clear();
    mark_dirty_to(eversion_t::max());
    mark_dirty_to_dups(eversion_t::max());
  }
744
  /// split child PG state out of this log/missing into opg_log;
  /// both sides are marked for full rewrite
  void split_into(
    pg_t child_pgid,
    unsigned split_bits,
    PGLog *opg_log) {
    log.split_out_child(child_pgid, split_bits, &opg_log->log);
    missing.split_into(child_pgid, split_bits, &(opg_log->missing));
    opg_log->mark_dirty_to(eversion_t::max());
    opg_log->mark_dirty_to_dups(eversion_t::max());
    mark_dirty_to(eversion_t::max());
    mark_dirty_to_dups(eversion_t::max());
    // child inherits delete-awareness of the parent's missing set
    if (missing.may_include_deletes)
      opg_log->rebuilt_missing_with_deletes = true;
  }
758
  /// merge source logs into ours up to last_update (PG merge); the
  /// indexes are rebuilt and the whole log is marked for rewrite
  void merge_from(
    const vector<PGLog*>& sources,
    eversion_t last_update) {
    unindex();
    missing.clear();

    vector<pg_log_t*> slogs;
    for (auto s : sources) {
      slogs.push_back(&s->log);
    }
    log.merge_from(slogs, last_update);

    index();

    mark_log_for_rewrite();
  }
775
  /// note that we recovered version v of oid; updates missing stats and
  /// advances complete_to / info.last_complete as far as possible
  void recover_got(hobject_t oid, eversion_t v, pg_info_t &info) {
    if (missing.is_missing(oid, v)) {
      missing.got(oid, v);
      info.stats.stats.sum.num_objects_missing = missing.num_missing();

      // raise last_complete?
      if (missing.get_items().empty()) {
        log.complete_to = log.log.end();
        info.last_complete = info.last_update;
      }
      // advance complete_to up to (not including) the oldest still-needed
      // version, raising last_complete as we pass each entry
      auto oldest_need = missing.get_oldest_need();
      while (log.complete_to != log.log.end()) {
        if (oldest_need <= log.complete_to->version)
          break;
        if (info.last_complete < log.complete_to->version)
          info.last_complete = log.complete_to->version;
        ++log.complete_to;
      }
    }

    ceph_assert(log.get_can_rollback_to() >= v);
  }
798
  /// recompute complete_to from the missing set; if info is non-null,
  /// also set info->last_complete to the entry just before complete_to
  void reset_complete_to(pg_info_t *info) {
    if (log.log.empty()) // caller is split_into()
      return;
    log.complete_to = log.log.begin();
    ceph_assert(log.complete_to != log.log.end());
    auto oldest_need = missing.get_oldest_need();
    if (oldest_need != eversion_t()) {
      // position complete_to at the first entry we still need
      while (log.complete_to->version < oldest_need) {
        ++log.complete_to;
        ceph_assert(log.complete_to != log.log.end());
      }
    }
    if (!info)
      return;
    if (log.complete_to == log.log.begin()) {
      info->last_complete = eversion_t();
    } else {
      // peek at the previous entry without permanently moving complete_to
      --log.complete_to;
      info->last_complete = log.complete_to->version;
      ++log.complete_to;
    }
  }
821
  /// activation with outstanding missing objects: reposition complete_to
  /// and restart recovery requests from scratch
  void activate_not_complete(pg_info_t &info) {
    reset_complete_to(&info);
    log.last_requested = 0;
  }
826
  /// examine a replica's log and adjust its info/missing accordingly
  void proc_replica_log(pg_info_t &oinfo,
                        const pg_log_t &olog,
                        pg_missing_t& omissing, pg_shard_t from) const;

  /// rebuild the missing set by scanning the store (delete-aware)
  void rebuild_missing_set_with_deletes(ObjectStore *store,
                                        ObjectStore::CollectionHandle& ch,
                                        const pg_info_t &info);
834
835 protected:
  /// drain 'entries' into per-object lists; splice preserves each object's
  /// relative entry order and copies nothing
  static void split_by_object(
    mempool::osd_pglog::list<pg_log_entry_t> &entries,
    map<hobject_t, mempool::osd_pglog::list<pg_log_entry_t>> *out_entries) {
    while (!entries.empty()) {
      auto &out_list = (*out_entries)[entries.front().soid];
      out_list.splice(out_list.end(), entries, entries.begin());
    }
  }
844
845 /**
846 * _merge_object_divergent_entries
847 *
848 * There are 5 distinct cases:
849 * 1) There is a more recent update: in this case we assume we adjusted the
850 * store and missing during merge_log
851 * 2) The first entry in the divergent sequence is a create. This might
852 * either be because the object is a clone or because prior_version is
853 * eversion_t(). In this case the object does not exist and we must
854 * adjust missing and the store to match.
855 * 3) We are currently missing the object. In this case, we adjust the
856 * missing to our prior_version taking care to add a divergent_prior
857 * if necessary
858 * 4) We can rollback all of the entries. In this case, we do so using
859 * the rollbacker and return -- the object does not go into missing.
860 * 5) We cannot rollback at least 1 of the entries. In this case, we
861 * clear the object out of the store and add a missing entry at
862 * prior_version taking care to add a divergent_prior if
863 * necessary.
864 */
865 template <typename missing_type>
866 static void _merge_object_divergent_entries(
867 const IndexedLog &log, ///< [in] log to merge against
868 const hobject_t &hoid, ///< [in] object we are merging
869 const mempool::osd_pglog::list<pg_log_entry_t> &orig_entries, ///< [in] entries for hoid to merge
870 const pg_info_t &info, ///< [in] info for merging entries
871 eversion_t olog_can_rollback_to, ///< [in] rollback boundary
872 eversion_t original_can_rollback_to, ///< [in] original rollback boundary
873 missing_type &missing, ///< [in,out] missing to adjust, use
874 LogEntryHandler *rollbacker, ///< [in] optional rollbacker object
875 const DoutPrefixProvider *dpp ///< [in] logging provider
876 ) {
877 ldpp_dout(dpp, 20) << __func__ << ": merging hoid " << hoid
878 << " entries: " << orig_entries << dendl;
879
880 if (hoid > info.last_backfill) {
881 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid << " after last_backfill"
882 << dendl;
883 return;
884 }
885
886 // entries is non-empty
887 ceph_assert(!orig_entries.empty());
888 // strip out and ignore ERROR entries
889 mempool::osd_pglog::list<pg_log_entry_t> entries;
890 eversion_t last;
891 bool seen_non_error = false;
892 for (list<pg_log_entry_t>::const_iterator i = orig_entries.begin();
893 i != orig_entries.end();
894 ++i) {
895 // all entries are on hoid
896 ceph_assert(i->soid == hoid);
897 // did not see error entries before this entry and this entry is not error
898 // then this entry is the first non error entry
899 bool first_non_error = ! seen_non_error && ! i->is_error();
900 if (! i->is_error() ) {
901 // see a non error entry now
902 seen_non_error = true;
903 }
904
905 // No need to check the first entry since it prior_version is unavailable
906 // in the list
907 // No need to check if the prior_version is the minimal version
908 // No need to check the first non-error entry since the leading error
909 // entries are not its prior version
910 if (i != orig_entries.begin() && i->prior_version != eversion_t() &&
911 ! first_non_error) {
912 // in increasing order of version
913 ceph_assert(i->version > last);
914 // prior_version correct (unless it is an ERROR entry)
915 ceph_assert(i->prior_version == last || i->is_error());
916 }
917 if (i->is_error()) {
918 ldpp_dout(dpp, 20) << __func__ << ": ignoring " << *i << dendl;
919 } else {
920 ldpp_dout(dpp, 20) << __func__ << ": keeping " << *i << dendl;
921 entries.push_back(*i);
922 last = i->version;
923 }
924 }
925 if (entries.empty()) {
926 ldpp_dout(dpp, 10) << __func__ << ": no non-ERROR entries" << dendl;
927 return;
928 }
929
930 const eversion_t prior_version = entries.begin()->prior_version;
931 const eversion_t first_divergent_update = entries.begin()->version;
932 const eversion_t last_divergent_update = entries.rbegin()->version;
933 const bool object_not_in_store =
934 !missing.is_missing(hoid) &&
935 entries.rbegin()->is_delete();
936 ldpp_dout(dpp, 10) << __func__ << ": hoid " << " object_not_in_store: "
937 << object_not_in_store << dendl;
938 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
939 << " prior_version: " << prior_version
940 << " first_divergent_update: " << first_divergent_update
941 << " last_divergent_update: " << last_divergent_update
942 << dendl;
943
944 ceph::unordered_map<hobject_t, pg_log_entry_t*>::const_iterator objiter =
945 log.objects.find(hoid);
946 if (objiter != log.objects.end() &&
947 objiter->second->version >= first_divergent_update) {
948 /// Case 1)
949 ldpp_dout(dpp, 10) << __func__ << ": more recent entry found: "
950 << *objiter->second << ", already merged" << dendl;
951
952 ceph_assert(objiter->second->version > last_divergent_update);
953
954 // ensure missing has been updated appropriately
955 if (objiter->second->is_update() ||
956 (missing.may_include_deletes && objiter->second->is_delete())) {
957 ceph_assert(missing.is_missing(hoid) &&
958 missing.get_items().at(hoid).need == objiter->second->version);
959 } else {
960 ceph_assert(!missing.is_missing(hoid));
961 }
962 missing.revise_have(hoid, eversion_t());
963 if (rollbacker) {
964 if (!object_not_in_store) {
965 rollbacker->remove(hoid);
966 }
967 for (auto &&i: entries) {
968 rollbacker->trim(i);
969 }
970 }
971 return;
972 }
973
974 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
975 <<" has no more recent entries in log" << dendl;
976 if (prior_version == eversion_t() || entries.front().is_clone()) {
977 /// Case 2)
978 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
979 << " prior_version or op type indicates creation,"
980 << " deleting"
981 << dendl;
982 if (missing.is_missing(hoid))
983 missing.rm(missing.get_items().find(hoid));
984 if (rollbacker) {
985 if (!object_not_in_store) {
986 rollbacker->remove(hoid);
987 }
988 for (auto &&i: entries) {
989 rollbacker->trim(i);
990 }
991 }
992 return;
993 }
994
995 if (missing.is_missing(hoid)) {
996 /// Case 3)
997 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
998 << " missing, " << missing.get_items().at(hoid)
999 << " adjusting" << dendl;
1000
1001 if (missing.get_items().at(hoid).have == prior_version) {
1002 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
1003 << " missing.have is prior_version " << prior_version
1004 << " removing from missing" << dendl;
1005 missing.rm(missing.get_items().find(hoid));
1006 } else {
1007 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
1008 << " missing.have is " << missing.get_items().at(hoid).have
1009 << ", adjusting" << dendl;
1010 missing.revise_need(hoid, prior_version, false);
1011 if (prior_version <= info.log_tail) {
1012 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
1013 << " prior_version " << prior_version
1014 << " <= info.log_tail "
1015 << info.log_tail << dendl;
1016 }
1017 }
1018 if (rollbacker) {
1019 for (auto &&i: entries) {
1020 rollbacker->trim(i);
1021 }
1022 }
1023 return;
1024 }
1025
1026 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
1027 << " must be rolled back or recovered,"
1028 << " attempting to rollback"
1029 << dendl;
1030 bool can_rollback = true;
1031 // We are going to make an important decision based on the
1032 // olog_can_rollback_to value we have received, better known it.
1033 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
1034 << " olog_can_rollback_to: "
1035 << olog_can_rollback_to << dendl;
1036 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
1037 << " original_crt: "
1038 << original_can_rollback_to << dendl;
1039 /// Distinguish between 4) and 5)
1040 for (list<pg_log_entry_t>::const_reverse_iterator i = entries.rbegin();
1041 i != entries.rend();
1042 ++i) {
1043 /// Use original_can_rollback_to instead of olog_can_rollback_to to check
1044 // if we can rollback or not. This is to ensure that we don't try to rollback
1045 // to an object that has been deleted and doesn't exist.
1046 if (!i->can_rollback() || i->version <= original_can_rollback_to) {
1047 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid << " cannot rollback "
1048 << *i << dendl;
1049 can_rollback = false;
1050 break;
1051 }
1052 }
1053
1054 if (can_rollback) {
1055 /// Case 4)
1056 for (list<pg_log_entry_t>::const_reverse_iterator i = entries.rbegin();
1057 i != entries.rend();
1058 ++i) {
1059 ceph_assert(i->can_rollback() && i->version > original_can_rollback_to);
1060 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
1061 << " rolling back " << *i << dendl;
1062 if (rollbacker)
1063 rollbacker->rollback(*i);
1064 }
1065 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
1066 << " rolled back" << dendl;
1067 return;
1068 } else {
1069 /// Case 5)
1070 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid << " cannot roll back, "
1071 << "removing and adding to missing" << dendl;
1072 if (rollbacker) {
1073 if (!object_not_in_store)
1074 rollbacker->remove(hoid);
1075 for (auto &&i: entries) {
1076 rollbacker->trim(i);
1077 }
1078 }
1079 missing.add(hoid, prior_version, eversion_t(), false);
1080 if (prior_version <= info.log_tail) {
1081 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
1082 << " prior_version " << prior_version
1083 << " <= info.log_tail "
1084 << info.log_tail << dendl;
1085 }
1086 }
1087 }
1088
1089 /// Merge all entries using above
1090 template <typename missing_type>
1091 static void _merge_divergent_entries(
1092 const IndexedLog &log, ///< [in] log to merge against
1093 mempool::osd_pglog::list<pg_log_entry_t> &entries, ///< [in] entries to merge
1094 const pg_info_t &oinfo, ///< [in] info for merging entries
1095 eversion_t olog_can_rollback_to, ///< [in] rollback boundary
1096 eversion_t original_can_rollback_to, ///< [in] original rollback boundary
1097 missing_type &omissing, ///< [in,out] missing to adjust, use
1098 LogEntryHandler *rollbacker, ///< [in] optional rollbacker object
1099 const DoutPrefixProvider *dpp ///< [in] logging provider
1100 ) {
1101 map<hobject_t, mempool::osd_pglog::list<pg_log_entry_t> > split;
1102 split_by_object(entries, &split);
1103 for (map<hobject_t, mempool::osd_pglog::list<pg_log_entry_t>>::iterator i = split.begin();
1104 i != split.end();
1105 ++i) {
1106 _merge_object_divergent_entries(
1107 log,
1108 i->first,
1109 i->second,
1110 oinfo,
1111 olog_can_rollback_to,
1112 original_can_rollback_to,
1113 omissing,
1114 rollbacker,
1115 dpp);
1116 }
1117 }
1118
1119 /**
1120 * Exists for use in TestPGLog for simply testing single divergent log
1121 * cases
1122 */
1123 void merge_old_entry(
1124 ObjectStore::Transaction& t,
1125 const pg_log_entry_t& oe,
1126 const pg_info_t& info,
1127 LogEntryHandler *rollbacker) {
1128 mempool::osd_pglog::list<pg_log_entry_t> entries;
1129 entries.push_back(oe);
1130 _merge_object_divergent_entries(
1131 log,
1132 oe.soid,
1133 entries,
1134 info,
1135 log.get_can_rollback_to(),
1136 log.get_can_rollback_to(),
1137 missing,
1138 rollbacker,
1139 this);
1140 }
1141
  // Merge dup (duplicate-op) entries from olog into our log; returns
  // whether anything changed (declaration only — body is elsewhere).
  bool merge_log_dups(const pg_log_t& olog);

public:

  // Rewind our log to newhead, presumably routing divergent entries
  // through rollbacker; dirty_info/dirty_big_info flag what needs to be
  // persisted afterwards (declaration only).
  void rewind_divergent_log(eversion_t newhead,
			    pg_info_t &info,
			    LogEntryHandler *rollbacker,
			    bool &dirty_info,
			    bool &dirty_big_info);

  // Merge the log olog received from peer shard 'from' into ours,
  // updating oinfo/info and flagging dirty state (declaration only).
  void merge_log(pg_info_t &oinfo,
		 pg_log_t &olog,
		 pg_shard_t from,
		 pg_info_t &info, LogEntryHandler *rollbacker,
		 bool &dirty_info, bool &dirty_big_info);
1157
1158 template <typename missing_type>
1159 static bool append_log_entries_update_missing(
1160 const hobject_t &last_backfill,
1161 bool last_backfill_bitwise,
1162 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
1163 bool maintain_rollback,
1164 IndexedLog *log,
1165 missing_type &missing,
1166 LogEntryHandler *rollbacker,
1167 const DoutPrefixProvider *dpp) {
1168 bool invalidate_stats = false;
1169 if (log && !entries.empty()) {
1170 ceph_assert(log->head < entries.begin()->version);
1171 }
1172 for (list<pg_log_entry_t>::const_iterator p = entries.begin();
1173 p != entries.end();
1174 ++p) {
1175 invalidate_stats = invalidate_stats || !p->is_error();
1176 if (log) {
1177 ldpp_dout(dpp, 20) << "update missing, append " << *p << dendl;
1178 log->add(*p);
1179 }
1180 if (p->soid <= last_backfill &&
1181 !p->is_error()) {
1182 if (missing.may_include_deletes) {
1183 missing.add_next_event(*p);
1184 } else {
1185 if (p->is_delete()) {
1186 missing.rm(p->soid, p->version);
1187 } else {
1188 missing.add_next_event(*p);
1189 }
1190 if (rollbacker) {
1191 // hack to match PG::mark_all_unfound_lost
1192 if (maintain_rollback && p->is_lost_delete() && p->can_rollback()) {
1193 rollbacker->try_stash(p->soid, p->version.version);
1194 } else if (p->is_delete()) {
1195 rollbacker->remove(p->soid);
1196 }
1197 }
1198 }
1199 }
1200 }
1201 return invalidate_stats;
1202 }
1203 bool append_new_log_entries(
1204 const hobject_t &last_backfill,
1205 bool last_backfill_bitwise,
1206 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
1207 LogEntryHandler *rollbacker) {
1208 bool invalidate_stats = append_log_entries_update_missing(
1209 last_backfill,
1210 last_backfill_bitwise,
1211 entries,
1212 true,
1213 &log,
1214 missing,
1215 rollbacker,
1216 this);
1217 if (!entries.empty()) {
1218 mark_writeout_from(entries.begin()->version);
1219 if (entries.begin()->is_lost_delete()) {
1220 // hack: since lost deletes queue recovery directly, and don't
1221 // go through activate_not_complete() again, our complete_to
1222 // iterator may still point at log.end(). Reset it to point
1223 // before these new lost_delete entries. This only occurs
1224 // when lost+delete entries are initially added, which is
1225 // always in a list of solely lost_delete entries, so it is
1226 // sufficient to check whether the first entry is a
1227 // lost_delete
1228 reset_complete_to(nullptr);
1229 }
1230 }
1231 return invalidate_stats;
1232 }
1233
  // Persist this PGLog's own log and missing set: key/value pairs are
  // accumulated into *km and recorded against transaction t for the
  // given collection/log object (declaration only).
  void write_log_and_missing(
    ObjectStore::Transaction& t,
    map<string,bufferlist> *km,
    const coll_t& coll,
    const ghobject_t &log_oid,
    bool require_rollback);

  // Static variant: write an arbitrary log (no missing set) together
  // with a legacy divergent_priors map (declaration only).
  static void write_log_and_missing_wo_missing(
    ObjectStore::Transaction& t,
    map<string,bufferlist>* km,
    pg_log_t &log,
    const coll_t& coll,
    const ghobject_t &log_oid, map<eversion_t, hobject_t> &divergent_priors,
    bool require_rollback);

  // Static variant: write an arbitrary log plus the given missing
  // tracker (declaration only).
  static void write_log_and_missing(
    ObjectStore::Transaction& t,
    map<string,bufferlist>* km,
    pg_log_t &log,
    const coll_t& coll,
    const ghobject_t &log_oid,
    const pg_missing_tracker_t &missing,
    bool require_rollback,
    bool *rebuilt_missing_set_with_deletes);

  // Low-level writer, divergent_priors variant. Judging by the
  // parameter names, dirty_to/dirty_from/writeout_from bound the range
  // of log entries to rewrite and the *_dups parameters bound the dup
  // range — confirm against the definition (declaration only).
  static void _write_log_and_missing_wo_missing(
    ObjectStore::Transaction& t,
    map<string,bufferlist>* km,
    pg_log_t &log,
    const coll_t& coll, const ghobject_t &log_oid,
    map<eversion_t, hobject_t> &divergent_priors,
    eversion_t dirty_to,
    eversion_t dirty_from,
    eversion_t writeout_from,
    bool dirty_divergent_priors,
    bool touch_log,
    bool require_rollback,
    eversion_t dirty_to_dups,
    eversion_t dirty_from_dups,
    eversion_t write_from_dups,
    set<string> *log_keys_debug
    );

  // Low-level writer, missing-tracker variant; also takes the trimmed
  // entry/dup key sets and a flag to clear divergent_priors
  // (declaration only).
  static void _write_log_and_missing(
    ObjectStore::Transaction& t,
    map<string,bufferlist>* km,
    pg_log_t &log,
    const coll_t& coll, const ghobject_t &log_oid,
    eversion_t dirty_to,
    eversion_t dirty_from,
    eversion_t writeout_from,
    set<eversion_t> &&trimmed,
    set<string> &&trimmed_dups,
    const pg_missing_tracker_t &missing,
    bool touch_log,
    bool require_rollback,
    bool clear_divergent_priors,
    eversion_t dirty_to_dups,
    eversion_t dirty_from_dups,
    eversion_t write_from_dups,
    bool *rebuilt_missing_with_deletes,
    set<string> *log_keys_debug
    );
1297
1298 void read_log_and_missing(
1299 ObjectStore *store,
1300 ObjectStore::CollectionHandle& ch,
1301 ghobject_t pgmeta_oid,
1302 const pg_info_t &info,
1303 ostringstream &oss,
1304 bool tolerate_divergent_missing_log,
1305 bool debug_verify_stored_missing = false
1306 ) {
1307 return read_log_and_missing(
1308 store, ch, pgmeta_oid, info,
1309 log, missing, oss,
1310 tolerate_divergent_missing_log,
1311 &clear_divergent_priors,
1312 this,
1313 (pg_log_debug ? &log_keys_debug : nullptr),
1314 debug_verify_stored_missing);
1315 }
1316
  /**
   * Read a PG's log, dup entries, and missing set from the pgmeta
   * object's omap and rebuild the in-memory IndexedLog and missing set.
   *
   * Omap keys are dispatched by name/prefix:
   *  - keys starting with '_' are non-log pgmeta keys and are skipped
   *  - "divergent_priors": legacy map; forces a missing-set rebuild
   *  - "can_rollback_to" / "rollback_info_trimmed_to": rollback bounds
   *  - "may_include_deletes_in_missing": flag on the missing set
   *  - keys starting with "missing": serialized missing items
   *  - keys starting with "dup_": dup entries
   *  - anything else: a checksummed log entry
   *
   * If legacy divergent_priors were found — or verification was
   * requested — the missing set is rebuilt/checked against each
   * object's OI_ATTR in the store.
   */
  template <typename missing_type>
  static void read_log_and_missing(
    ObjectStore *store,
    ObjectStore::CollectionHandle &ch,
    ghobject_t pgmeta_oid,
    const pg_info_t &info,
    IndexedLog &log,
    missing_type &missing,
    ostringstream &oss,
    bool tolerate_divergent_missing_log,
    bool *clear_divergent_priors = nullptr,
    const DoutPrefixProvider *dpp = nullptr,
    set<string> *log_keys_debug = nullptr,
    bool debug_verify_stored_missing = false
    ) {
    ldpp_dout(dpp, 20) << "read_log_and_missing coll " << ch->cid
		       << " " << pgmeta_oid << dendl;

    // legacy?
    struct stat st;
    int r = store->stat(ch, pgmeta_oid, &st);
    ceph_assert(r == 0);
    // pgmeta object is expected to carry data only in its omap
    ceph_assert(st.st_size == 0);

    // will get overridden below if it had been recorded
    eversion_t on_disk_can_rollback_to = info.last_update;
    eversion_t on_disk_rollback_info_trimmed_to = eversion_t();
    ObjectMap::ObjectMapIterator p = store->get_omap_iterator(ch,
							      pgmeta_oid);
    map<eversion_t, hobject_t> divergent_priors;
    bool must_rebuild = false;
    missing.may_include_deletes = false;
    list<pg_log_entry_t> entries;
    list<pg_log_dup_t> dups;
    if (p) {
      for (p->seek_to_first(); p->valid() ; p->next()) {
	// non-log pgmeta_oid keys are prefixed with _; skip those
	if (p->key()[0] == '_')
	  continue;
	bufferlist bl = p->value();//Copy bufferlist before creating iterator
	auto bp = bl.cbegin();
	if (p->key() == "divergent_priors") {
	  decode(divergent_priors, bp);
	  ldpp_dout(dpp, 20) << "read_log_and_missing " << divergent_priors.size()
			     << " divergent_priors" << dendl;
	  // legacy state: the missing set must be rebuilt below, and the
	  // stored missing cannot be trusted for verification
	  must_rebuild = true;
	  debug_verify_stored_missing = false;
	} else if (p->key() == "can_rollback_to") {
	  decode(on_disk_can_rollback_to, bp);
	} else if (p->key() == "rollback_info_trimmed_to") {
	  decode(on_disk_rollback_info_trimmed_to, bp);
	} else if (p->key() == "may_include_deletes_in_missing") {
	  missing.may_include_deletes = true;
	} else if (p->key().substr(0, 7) == string("missing")) {
	  hobject_t oid;
	  pg_missing_item item;
	  decode(oid, bp);
	  decode(item, bp);
	  if (item.is_delete()) {
	    // delete items are only legal when the flag key was seen
	    ceph_assert(missing.may_include_deletes);
	  }
	  missing.add(oid, item.need, item.have, item.is_delete());
	} else if (p->key().substr(0, 4) == string("dup_")) {
	  pg_log_dup_t dup;
	  decode(dup, bp);
	  if (!dups.empty()) {
	    // dups must arrive in strictly increasing version order
	    ceph_assert(dups.back().version < dup.version);
	  }
	  dups.push_back(dup);
	} else {
	  pg_log_entry_t e;
	  e.decode_with_checksum(bp);
	  ldpp_dout(dpp, 20) << "read_log_and_missing " << e << dendl;
	  if (!entries.empty()) {
	    pg_log_entry_t last_e(entries.back());
	    // log entries must arrive in strictly increasing version
	    // order, with non-decreasing epochs
	    ceph_assert(last_e.version.version < e.version.version);
	    ceph_assert(last_e.version.epoch <= e.version.epoch);
	  }
	  entries.push_back(e);
	  if (log_keys_debug)
	    log_keys_debug->insert(e.get_key_name());
	}
      }
    }
    log = IndexedLog(
      info.last_update,
      info.log_tail,
      on_disk_can_rollback_to,
      on_disk_rollback_info_trimmed_to,
      std::move(entries),
      std::move(dups));

    if (must_rebuild || debug_verify_stored_missing) {
      // build missing
      if (debug_verify_stored_missing || info.last_complete < info.last_update) {
	ldpp_dout(dpp, 10)
	  << "read_log_and_missing checking for missing items over interval ("
	  << info.last_complete
	  << "," << info.last_update << "]" << dendl;

	set<hobject_t> did;
	set<hobject_t> checked;
	set<hobject_t> skipped;
	// walk newest-to-oldest so the first entry seen per object is
	// its most recent one; 'did' dedupes per object
	for (list<pg_log_entry_t>::reverse_iterator i = log.log.rbegin();
	     i != log.log.rend();
	     ++i) {
	  if (!debug_verify_stored_missing && i->version <= info.last_complete) break;
	  if (i->soid > info.last_backfill)
	    continue;
	  if (i->is_error())
	    continue;
	  if (did.count(i->soid)) continue;
	  did.insert(i->soid);

	  if (!missing.may_include_deletes && i->is_delete())
	    continue;

	  bufferlist bv;
	  int r = store->getattr(
	    ch,
	    ghobject_t(i->soid, ghobject_t::NO_GEN, info.pgid.shard),
	    OI_ATTR,
	    bv);
	  if (r >= 0) {
	    // object exists on disk; compare its recorded version
	    object_info_t oi(bv);
	    if (oi.version < i->version) {
	      ldpp_dout(dpp, 15) << "read_log_and_missing missing " << *i
				 << " (have " << oi.version << ")" << dendl;
	      if (debug_verify_stored_missing) {
		auto miter = missing.get_items().find(i->soid);
		ceph_assert(miter != missing.get_items().end());
		ceph_assert(miter->second.need == i->version);
		// the 'have' version is reset if an object is deleted,
		// then created again
		ceph_assert(miter->second.have == oi.version || miter->second.have == eversion_t());
		checked.insert(i->soid);
	      } else {
		missing.add(i->soid, i->version, oi.version, i->is_delete());
	      }
	    }
	  } else {
	    // getattr failed: object (or its OI_ATTR) is absent
	    ldpp_dout(dpp, 15) << "read_log_and_missing missing " << *i << dendl;
	    if (debug_verify_stored_missing) {
	      auto miter = missing.get_items().find(i->soid);
	      if (i->is_delete()) {
		ceph_assert(miter == missing.get_items().end() ||
		       (miter->second.need == i->version &&
			miter->second.have == eversion_t()));
	      } else {
		ceph_assert(miter != missing.get_items().end());
		ceph_assert(miter->second.need == i->version);
		ceph_assert(miter->second.have == eversion_t());
	      }
	      checked.insert(i->soid);
	    } else {
	      missing.add(i->soid, i->version, eversion_t(), i->is_delete());
	    }
	  }
	}
	if (debug_verify_stored_missing) {
	  // every stored missing item must have been accounted for above
	  for (auto &&i: missing.get_items()) {
	    if (checked.count(i.first))
	      continue;
	    if (i.first > info.last_backfill) {
	      ldpp_dout(dpp, -1) << __func__ << ": invalid missing set entry "
				 << "found before last_backfill: "
				 << i.first << " " << i.second
				 << " last_backfill = " << info.last_backfill
				 << dendl;
	      ceph_abort_msg("invalid missing set entry found");
	    }
	    bufferlist bv;
	    int r = store->getattr(
	      ch,
	      ghobject_t(i.first, ghobject_t::NO_GEN, info.pgid.shard),
	      OI_ATTR,
	      bv);
	    if (r >= 0) {
	      object_info_t oi(bv);
	      ceph_assert(oi.version == i.second.have || eversion_t() == i.second.have);
	    } else {
	      ceph_assert(i.second.is_delete() || eversion_t() == i.second.have);
	    }
	  }
	} else {
	  ceph_assert(must_rebuild);
	  // legacy path: fold divergent_priors entries (newest first)
	  // into the rebuilt missing set
	  for (map<eversion_t, hobject_t>::reverse_iterator i =
		 divergent_priors.rbegin();
	       i != divergent_priors.rend();
	       ++i) {
	    if (i->first <= info.last_complete) break;
	    if (i->second > info.last_backfill)
	      continue;
	    if (did.count(i->second)) continue;
	    did.insert(i->second);
	    bufferlist bv;
	    int r = store->getattr(
	      ch,
	      ghobject_t(i->second, ghobject_t::NO_GEN, info.pgid.shard),
	      OI_ATTR,
	      bv);
	    if (r >= 0) {
	      object_info_t oi(bv);
	      /**
	       * 1) we see this entry in the divergent priors mapping
	       * 2) we didn't see an entry for this object in the log
	       *
	       * From 1 & 2 we know that either the object does not exist
	       * or it is at the version specified in the divergent_priors
	       * map since the object would have been deleted atomically
	       * with the addition of the divergent_priors entry, an older
	       * version would not have been recovered, and a newer version
	       * would show up in the log above.
	       */
	      /**
	       * Unfortunately the assessment above is incorrect because of
	       * http://tracker.ceph.com/issues/17916 (we were incorrectly
	       * not removing the divergent_priors set from disk state!),
	       * so let's check that.
	       */
	      if (oi.version > i->first && tolerate_divergent_missing_log) {
		ldpp_dout(dpp, 0) << "read_log divergent_priors entry (" << *i
				  << ") inconsistent with disk state (" << oi
				  << "), assuming it is tracker.ceph.com/issues/17916"
				  << dendl;
	      } else {
		ceph_assert(oi.version == i->first);
	      }
	    } else {
	      ldpp_dout(dpp, 15) << "read_log_and_missing missing " << *i << dendl;
	      missing.add(i->second, i->first, eversion_t(), false);
	    }
	  }
	}
	if (clear_divergent_priors)
	  (*clear_divergent_priors) = true;
      }
    }

    if (!must_rebuild) {
      if (clear_divergent_priors)
	(*clear_divergent_priors) = false;
      missing.flush();
    }
    ldpp_dout(dpp, 10) << "read_log_and_missing done" << dendl;
  } // static read_log_and_missing
1563 }; // struct PGLog