]> git.proxmox.com Git - ceph.git/blame - ceph/src/osd/PGLog.h
update sources to v12.1.3
[ceph.git] / ceph / src / osd / PGLog.h
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
c07f9fc5 17#pragma once
7c673cae
FG
18
19// re-include our assert to clobber boost's
20#include "include/assert.h"
21#include "osd_types.h"
22#include "os/ObjectStore.h"
23#include <list>
24using namespace std;
25
26#define PGLOG_INDEXED_OBJECTS (1 << 0)
27#define PGLOG_INDEXED_CALLER_OPS (1 << 1)
28#define PGLOG_INDEXED_EXTRA_CALLER_OPS (1 << 2)
c07f9fc5
FG
29#define PGLOG_INDEXED_DUPS (1 << 3)
30#define PGLOG_INDEXED_ALL (PGLOG_INDEXED_OBJECTS | \
31 PGLOG_INDEXED_CALLER_OPS | \
32 PGLOG_INDEXED_EXTRA_CALLER_OPS | \
33 PGLOG_INDEXED_DUPS)
7c673cae
FG
34
35class CephContext;
36
37struct PGLog : DoutPrefixProvider {
38 DoutPrefixProvider *prefix_provider;
39 string gen_prefix() const override {
40 return prefix_provider ? prefix_provider->gen_prefix() : "";
41 }
42 unsigned get_subsys() const override {
43 return prefix_provider ? prefix_provider->get_subsys() :
44 (unsigned)ceph_subsys_osd;
45 }
// Returns the CephContext pointer held in the cct member.
46 CephContext *get_cct() const override {
47 return cct;
48 }
49
50 ////////////////////////////// sub classes //////////////////////////////
51 struct LogEntryHandler {
52 virtual void rollback(
53 const pg_log_entry_t &entry) = 0;
54 virtual void rollforward(
55 const pg_log_entry_t &entry) = 0;
56 virtual void trim(
57 const pg_log_entry_t &entry) = 0;
58 virtual void remove(
59 const hobject_t &hoid) = 0;
60 virtual void try_stash(
61 const hobject_t &hoid,
62 version_t v) = 0;
63 virtual ~LogEntryHandler() {}
64 };
65
66 /* Exceptions */
67 class read_log_and_missing_error : public buffer::error {
68 public:
69 explicit read_log_and_missing_error(const char *what) {
70 snprintf(buf, sizeof(buf), "read_log_and_missing_error: %s", what);
71 }
72 const char *what() const throw () override {
73 return buf;
74 }
75 private:
76 char buf[512];
77 };
78
79public:
80 /**
81 * IndexedLog - adds in-memory index of the log, by oid.
82 * plus some methods to manipulate it all.
83 */
84 struct IndexedLog : public pg_log_t {
85 mutable ceph::unordered_map<hobject_t,pg_log_entry_t*> objects; // ptrs into log. be careful!
86 mutable ceph::unordered_map<osd_reqid_t,pg_log_entry_t*> caller_ops;
87 mutable ceph::unordered_multimap<osd_reqid_t,pg_log_entry_t*> extra_caller_ops;
c07f9fc5 88 mutable ceph::unordered_map<osd_reqid_t,pg_log_dup_t*> dup_index;
7c673cae
FG
89
90 // recovery pointers
91 list<pg_log_entry_t>::iterator complete_to; // not inclusive of referenced item
92 version_t last_requested = 0; // last object requested by primary
93
94 //
95 private:
96 mutable __u16 indexed_data = 0;
97 /**
98 * rollback_info_trimmed_to_riter points to the first log entry <=
99 * rollback_info_trimmed_to
100 *
101 * It's a reverse_iterator because rend() is a natural representation for
102 * tail, and rbegin() works nicely for head.
103 */
31f18b77 104 mempool::osd_pglog::list<pg_log_entry_t>::reverse_iterator
7c673cae
FG
105 rollback_info_trimmed_to_riter;
106
107 template <typename F>
108 void advance_can_rollback_to(eversion_t to, F &&f) {
109 if (to > can_rollback_to)
110 can_rollback_to = to;
111
112 if (to > rollback_info_trimmed_to)
113 rollback_info_trimmed_to = to;
114
115 while (rollback_info_trimmed_to_riter != log.rbegin()) {
116 --rollback_info_trimmed_to_riter;
117 if (rollback_info_trimmed_to_riter->version > rollback_info_trimmed_to) {
118 ++rollback_info_trimmed_to_riter;
119 break;
120 }
121 f(*rollback_info_trimmed_to_riter);
122 }
123 }
124
125 void reset_rollback_info_trimmed_to_riter() {
126 rollback_info_trimmed_to_riter = log.rbegin();
127 while (rollback_info_trimmed_to_riter != log.rend() &&
128 rollback_info_trimmed_to_riter->version > rollback_info_trimmed_to)
129 ++rollback_info_trimmed_to_riter;
130 }
131
132 // indexes objects, caller ops and extra caller ops
133 public:
// Default constructor: complete_to starts at log.end(), no index has been
// built (indexed_data == 0) and the rollback-trim cursor starts at
// log.rbegin().
134 IndexedLog() :
135 complete_to(log.end()),
136 last_requested(0),
137 indexed_data(0),
138 rollback_info_trimmed_to_riter(log.rbegin())
c07f9fc5 139 { }
7c673cae
FG
140
// Forwarding constructor: passes all arguments to the pg_log_t base, then
// positions the rollback-trim cursor and builds every index
// (index() defaults to PGLOG_INDEXED_ALL).
// NOTE(review): the template is unconstrained and not explicit, so for a
// non-const lvalue IndexedLog argument overload resolution prefers it over
// the copy constructor (last_requested is then reset to 0 and all indexes
// are built) -- confirm that no caller relies on copy semantics there.
141 template <typename... Args>
142 IndexedLog(Args&&... args) :
143 pg_log_t(std::forward<Args>(args)...),
144 complete_to(log.end()),
145 last_requested(0),
146 indexed_data(0),
c07f9fc5
FG
147 rollback_info_trimmed_to_riter(log.rbegin())
148 {
7c673cae
FG
149 reset_rollback_info_trimmed_to_riter();
150 index();
151 }
152
// Copy constructor: copies the pg_log_t base and last_requested, but does
// not copy iterators or index maps (they would point into rhs).
// complete_to is reset to log.end(), the rollback-trim cursor is
// recomputed, and only the indexes rhs had built are rebuilt.
153 IndexedLog(const IndexedLog &rhs) :
154 pg_log_t(rhs),
155 complete_to(log.end()),
156 last_requested(rhs.last_requested),
157 indexed_data(0),
c07f9fc5
FG
158 rollback_info_trimmed_to_riter(log.rbegin())
159 {
7c673cae
FG
160 reset_rollback_info_trimmed_to_riter();
161 index(rhs.indexed_data);
162 }
c07f9fc5 163
7c673cae
FG
164 IndexedLog &operator=(const IndexedLog &rhs) {
165 this->~IndexedLog();
166 new (this) IndexedLog(rhs);
167 return *this;
168 }
169
170 void trim_rollback_info_to(eversion_t to, LogEntryHandler *h) {
171 advance_can_rollback_to(
172 to,
173 [&](pg_log_entry_t &entry) {
174 h->trim(entry);
175 });
176 }
177 void roll_forward_to(eversion_t to, LogEntryHandler *h) {
178 advance_can_rollback_to(
179 to,
180 [&](pg_log_entry_t &entry) {
181 h->rollforward(entry);
182 });
183 }
184
185 void skip_can_rollback_to_to_head() {
186 advance_can_rollback_to(head, [&](const pg_log_entry_t &entry) {});
187 }
188
// Delegates to pg_log_t::rewind_from_head(newhead), then rebuilds all
// indexes and re-seats the rollback-trim cursor because the underlying log
// has changed. Returns whatever the base-class call returned.
31f18b77 189 mempool::osd_pglog::list<pg_log_entry_t> rewind_from_head(eversion_t newhead) {
7c673cae
FG
190 auto divergent = pg_log_t::rewind_from_head(newhead);
191 index();
192 reset_rollback_info_trimmed_to_riter();
193 return divergent;
194 }
195
196 template <typename T>
197 void scan_log_after(
198 const eversion_t &bound, ///< [in] scan entries > bound
199 T &&f) const {
200 auto iter = log.rbegin();
201 while (iter != log.rend() && iter->version > bound)
202 ++iter;
203
204 while (true) {
205 if (iter == log.rbegin())
206 break;
207 f(*(--iter));
208 }
209 }
210
211 /****/
// Replace the whole log with a copy of o. Precondition (asserted): all
// rollback info has already been trimmed up to head. Afterwards the
// rollback boundary is moved to the new head without invoking any handler
// and all indexes are rebuilt.
212 void claim_log_and_clear_rollback_info(const pg_log_t& o) {
213 // we must have already trimmed the old entries
214 assert(rollback_info_trimmed_to == head);
215 assert(rollback_info_trimmed_to_riter == log.rbegin());
216
217 *this = IndexedLog(o);
218
219 skip_can_rollback_to_to_head();
220 index();
221 }
222
223 void split_out_child(
224 pg_t child_pgid,
225 unsigned split_bits,
226 IndexedLog *target);
227
// Drop all indexes and log contents. Precondition (asserted): rollback
// info is already trimmed up to head. Also resets the rollback-trim cursor
// and the recovery pointers.
228 void zero() {
229 // we must have already trimmed the old entries
230 assert(rollback_info_trimmed_to == head);
231 assert(rollback_info_trimmed_to_riter == log.rbegin());
232
233 unindex();
234 pg_log_t::clear();
235 rollback_info_trimmed_to_riter = log.rbegin();
236 reset_recovery_pointers();
237 }
// Clear the log: first move the rollback boundary to head (which satisfies
// the precondition asserted by zero()), then zero everything.
238 void clear() {
239 skip_can_rollback_to_to_head();
240 zero();
241 }
// Reset the recovery pointers: complete_to to log.end(), last_requested to 0.
242 void reset_recovery_pointers() {
243 complete_to = log.end();
244 last_requested = 0;
245 }
246
247 bool logged_object(const hobject_t& oid) const {
248 if (!(indexed_data & PGLOG_INDEXED_OBJECTS)) {
249 index_objects();
250 }
251 return objects.count(oid);
252 }
253
254 bool logged_req(const osd_reqid_t &r) const {
255 if (!(indexed_data & PGLOG_INDEXED_CALLER_OPS)) {
256 index_caller_ops();
257 }
258 if (!caller_ops.count(r)) {
259 if (!(indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS)) {
260 index_extra_caller_ops();
261 }
262 return extra_caller_ops.count(r);
263 }
264 return true;
265 }
266
267 bool get_request(
268 const osd_reqid_t &r,
269 eversion_t *version,
270 version_t *user_version,
c07f9fc5
FG
271 int *return_code) const
272 {
7c673cae
FG
273 assert(version);
274 assert(user_version);
275 assert(return_code);
276 ceph::unordered_map<osd_reqid_t,pg_log_entry_t*>::const_iterator p;
277 if (!(indexed_data & PGLOG_INDEXED_CALLER_OPS)) {
278 index_caller_ops();
279 }
280 p = caller_ops.find(r);
281 if (p != caller_ops.end()) {
282 *version = p->second->version;
283 *user_version = p->second->user_version;
284 *return_code = p->second->return_code;
285 return true;
286 }
287
288 // warning: we will return *a* request for this reqid, but not
289 // necessarily the most recent.
290 if (!(indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS)) {
291 index_extra_caller_ops();
292 }
293 p = extra_caller_ops.find(r);
294 if (p != extra_caller_ops.end()) {
31f18b77 295 for (auto i = p->second->extra_reqids.begin();
7c673cae
FG
296 i != p->second->extra_reqids.end();
297 ++i) {
298 if (i->first == r) {
299 *version = p->second->version;
300 *user_version = i->second;
301 *return_code = p->second->return_code;
302 return true;
303 }
304 }
305 assert(0 == "in extra_caller_ops but not extra_reqids");
306 }
c07f9fc5
FG
307
308 if (!(indexed_data & PGLOG_INDEXED_DUPS)) {
309 index_dups();
310 }
311 auto q = dup_index.find(r);
312 if (q != dup_index.end()) {
313 *version = q->second->version;
314 *user_version = q->second->user_version;
315 *return_code = q->second->return_code;
316 return true;
317 }
318
7c673cae
FG
319 return false;
320 }
321
322 /// get a (bounded) list of recent reqids for the given object
323 void get_object_reqids(const hobject_t& oid, unsigned max,
31f18b77 324 mempool::osd_pglog::vector<pair<osd_reqid_t, version_t> > *pls) const {
7c673cae
FG
325 // make sure object is present at least once before we do an
326 // O(n) search.
327 if (!(indexed_data & PGLOG_INDEXED_OBJECTS)) {
328 index_objects();
329 }
330 if (objects.count(oid) == 0)
331 return;
332 for (list<pg_log_entry_t>::const_reverse_iterator i = log.rbegin();
333 i != log.rend();
334 ++i) {
335 if (i->soid == oid) {
336 if (i->reqid_is_indexed())
337 pls->push_back(make_pair(i->reqid, i->user_version));
338 pls->insert(pls->end(), i->extra_reqids.begin(), i->extra_reqids.end());
339 if (pls->size() >= max) {
340 if (pls->size() > max) {
341 pls->resize(max);
342 }
343 return;
344 }
345 }
346 }
347 }
c07f9fc5 348
7c673cae 349 void index(__u16 to_index = PGLOG_INDEXED_ALL) const {
c07f9fc5
FG
350 // if to_index is 0, no need to run any of this code, especially
351 // loop below; this can happen with copy constructor for
352 // IndexedLog (and indirectly through assignment operator)
353 if (!to_index) return;
354
7c673cae
FG
355 if (to_index & PGLOG_INDEXED_OBJECTS)
356 objects.clear();
357 if (to_index & PGLOG_INDEXED_CALLER_OPS)
358 caller_ops.clear();
359 if (to_index & PGLOG_INDEXED_EXTRA_CALLER_OPS)
360 extra_caller_ops.clear();
c07f9fc5
FG
361 if (to_index & PGLOG_INDEXED_DUPS) {
362 dup_index.clear();
363 for (auto& i : dups) {
364 dup_index[i.reqid] = const_cast<pg_log_dup_t*>(&i);
365 }
366 }
7c673cae 367
c07f9fc5
FG
368 constexpr __u16 any_log_entry_index =
369 PGLOG_INDEXED_OBJECTS |
370 PGLOG_INDEXED_CALLER_OPS |
371 PGLOG_INDEXED_EXTRA_CALLER_OPS;
372
373 if (to_index & any_log_entry_index) {
374 for (list<pg_log_entry_t>::const_iterator i = log.begin();
375 i != log.end();
376 ++i) {
377 if (to_index & PGLOG_INDEXED_OBJECTS) {
378 if (i->object_is_indexed()) {
379 objects[i->soid] = const_cast<pg_log_entry_t*>(&(*i));
380 }
7c673cae 381 }
7c673cae 382
c07f9fc5
FG
383 if (to_index & PGLOG_INDEXED_CALLER_OPS) {
384 if (i->reqid_is_indexed()) {
385 caller_ops[i->reqid] = const_cast<pg_log_entry_t*>(&(*i));
386 }
7c673cae 387 }
c07f9fc5
FG
388
389 if (to_index & PGLOG_INDEXED_EXTRA_CALLER_OPS) {
390 for (auto j = i->extra_reqids.begin();
391 j != i->extra_reqids.end();
392 ++j) {
393 extra_caller_ops.insert(
394 make_pair(j->first, const_cast<pg_log_entry_t*>(&(*i))));
395 }
7c673cae
FG
396 }
397 }
398 }
c07f9fc5 399
7c673cae
FG
400 indexed_data |= to_index;
401 }
402
// (Re)build only the objects index.
403 void index_objects() const {
404 index(PGLOG_INDEXED_OBJECTS);
405 }
406
// (Re)build only the caller_ops index.
407 void index_caller_ops() const {
408 index(PGLOG_INDEXED_CALLER_OPS);
409 }
410
// (Re)build only the extra_caller_ops index.
411 void index_extra_caller_ops() const {
412 index(PGLOG_INDEXED_EXTRA_CALLER_OPS);
413 }
414
c07f9fc5
FG
// (Re)build only the dup_index.
415 void index_dups() const {
416 index(PGLOG_INDEXED_DUPS);
417 }
418
7c673cae
FG
419 void index(pg_log_entry_t& e) {
420 if ((indexed_data & PGLOG_INDEXED_OBJECTS) && e.object_is_indexed()) {
421 if (objects.count(e.soid) == 0 ||
422 objects[e.soid]->version < e.version)
423 objects[e.soid] = &e;
424 }
425 if (indexed_data & PGLOG_INDEXED_CALLER_OPS) {
426 // divergent merge_log indexes new before unindexing old
427 if (e.reqid_is_indexed()) {
428 caller_ops[e.reqid] = &e;
429 }
430 }
431 if (indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS) {
31f18b77 432 for (auto j = e.extra_reqids.begin();
7c673cae
FG
433 j != e.extra_reqids.end();
434 ++j) {
435 extra_caller_ops.insert(make_pair(j->first, &e));
436 }
437 }
438 }
c07f9fc5 439
7c673cae
FG
// Drop every index and mark nothing as indexed; indexes are rebuilt later
// by index() or lazily by the lookup methods.
440 void unindex() {
441 objects.clear();
442 caller_ops.clear();
443 extra_caller_ops.clear();
c07f9fc5 444 dup_index.clear();
7c673cae
FG
445 indexed_data = 0;
446 }
c07f9fc5
FG
447
448 void unindex(const pg_log_entry_t& e) {
7c673cae
FG
449 // NOTE: this only works if we remove from the _tail_ of the log!
450 if (indexed_data & PGLOG_INDEXED_OBJECTS) {
451 if (objects.count(e.soid) && objects[e.soid]->version == e.version)
452 objects.erase(e.soid);
453 }
454 if (e.reqid_is_indexed()) {
455 if (indexed_data & PGLOG_INDEXED_CALLER_OPS) {
456 // divergent merge_log indexes new before unindexing old
457 if (caller_ops.count(e.reqid) && caller_ops[e.reqid] == &e)
c07f9fc5 458 caller_ops.erase(e.reqid);
7c673cae
FG
459 }
460 }
461 if (indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS) {
31f18b77 462 for (auto j = e.extra_reqids.begin();
7c673cae
FG
463 j != e.extra_reqids.end();
464 ++j) {
465 for (ceph::unordered_multimap<osd_reqid_t,pg_log_entry_t*>::iterator k =
466 extra_caller_ops.find(j->first);
467 k != extra_caller_ops.end() && k->first == j->first;
468 ++k) {
469 if (k->second == &e) {
470 extra_caller_ops.erase(k);
471 break;
472 }
473 }
474 }
475 }
476 }
477
c07f9fc5 478 void index(pg_log_dup_t& e) {
d2e6a577 479 if (indexed_data & PGLOG_INDEXED_DUPS) {
c07f9fc5
FG
480 dup_index[e.reqid] = &e;
481 }
482 }
483
484 void unindex(const pg_log_dup_t& e) {
d2e6a577 485 if (indexed_data & PGLOG_INDEXED_DUPS) {
c07f9fc5
FG
486 auto i = dup_index.find(e.reqid);
487 if (i != dup_index.end()) {
488 dup_index.erase(i);
489 }
490 }
491 }
492
7c673cae
FG
493 // actors
494 void add(const pg_log_entry_t& e, bool applied = true) {
495 if (!applied) {
496 assert(get_can_rollback_to() == head);
497 }
498
31f18b77
FG
499 // make sure our buffers don't pin bigger buffers
500 e.mod_desc.trim_bl();
501
7c673cae
FG
502 // add to log
503 log.push_back(e);
504
505 // riter previously pointed to the previous entry
506 if (rollback_info_trimmed_to_riter == log.rbegin())
507 ++rollback_info_trimmed_to_riter;
508
509 assert(e.version > head);
510 assert(head.version == 0 || e.version.version > head.version);
511 head = e.version;
512
513 // to our index
514 if ((indexed_data & PGLOG_INDEXED_OBJECTS) && e.object_is_indexed()) {
515 objects[e.soid] = &(log.back());
516 }
517 if (indexed_data & PGLOG_INDEXED_CALLER_OPS) {
518 if (e.reqid_is_indexed()) {
519 caller_ops[e.reqid] = &(log.back());
520 }
521 }
c07f9fc5 522
7c673cae 523 if (indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS) {
31f18b77 524 for (auto j = e.extra_reqids.begin();
7c673cae
FG
525 j != e.extra_reqids.end();
526 ++j) {
527 extra_caller_ops.insert(make_pair(j->first, &(log.back())));
528 }
529 }
530
531 if (!applied) {
532 skip_can_rollback_to_to_head();
533 }
c07f9fc5 534 } // add
7c673cae
FG
535
536 void trim(
537 CephContext* cct,
538 eversion_t s,
c07f9fc5
FG
539 set<eversion_t> *trimmed,
540 set<string>* trimmed_dups,
541 bool* dirty_dups);
7c673cae
FG
542
543 ostream& print(ostream& out) const;
c07f9fc5 544 }; // IndexedLog
7c673cae
FG
545
546
547protected:
548 //////////////////// data members ////////////////////
549
550 pg_missing_tracker_t missing;
551 IndexedLog log;
552
553 eversion_t dirty_to; ///< must clear/writeout all keys <= dirty_to
554 eversion_t dirty_from; ///< must clear/writeout all keys >= dirty_from
555 eversion_t writeout_from; ///< must write out keys >= writeout_from
556 set<eversion_t> trimmed; ///< must clear keys in trimmed
c07f9fc5 557 set<string> trimmed_dups; ///< must clear keys in trimmed_dups
7c673cae
FG
558 CephContext *cct;
559 bool pg_log_debug;
560 /// Log is clean on [dirty_to, dirty_from)
561 bool touched_log;
562 bool clear_divergent_priors;
c07f9fc5
FG
563 bool dirty_dups; /// log.dups is updated
564 bool rebuilt_missing_with_deletes = false;
7c673cae
FG
565
// Raise dirty_to to `to`; never lowers it.
566 void mark_dirty_to(eversion_t to) {
567 if (to > dirty_to)
568 dirty_to = to;
569 }
// Lower dirty_from to `from`; never raises it.
570 void mark_dirty_from(eversion_t from) {
571 if (from < dirty_from)
572 dirty_from = from;
573 }
// Lower writeout_from to `from`; never raises it.
574 void mark_writeout_from(eversion_t from) {
575 if (from < writeout_from)
576 writeout_from = from;
577 }
578public:
579 bool is_dirty() const {
580 return !touched_log ||
581 (dirty_to != eversion_t()) ||
582 (dirty_from != eversion_t::max()) ||
583 (writeout_from != eversion_t::max()) ||
584 !(trimmed.empty()) ||
c07f9fc5
FG
585 !missing.is_clean() ||
586 !(trimmed_dups.empty()) ||
587 dirty_dups ||
588 rebuilt_missing_with_deletes;
7c673cae
FG
589 }
// Mark the whole version range dirty (dirty_to = max, dirty_from = min)
// and clear touched_log, so is_dirty() reports true.
590 void mark_log_for_rewrite() {
591 mark_dirty_to(eversion_t::max());
592 mark_dirty_from(eversion_t());
593 touched_log = false;
594 }
c07f9fc5
FG
// Returns the rebuilt_missing_with_deletes flag.
595 bool get_rebuilt_missing_with_deletes() const {
596 return rebuilt_missing_with_deletes;
597 }
7c673cae
FG
598protected:
599
600 /// DEBUG
601 set<string> log_keys_debug;
/// Erase every key >= lb from *log_keys_debug. A null set pointer is a no-op.
static void clear_after(std::set<std::string> *log_keys_debug, const std::string &lb) {
  if (!log_keys_debug)
    return;
  log_keys_debug->erase(log_keys_debug->lower_bound(lb),
                        log_keys_debug->end());
}
/// Erase every key < ub from *log_keys_debug. A null set pointer is a no-op.
static void clear_up_to(std::set<std::string> *log_keys_debug, const std::string &ub) {
  if (!log_keys_debug)
    return;
  // The set is ordered, so the keys below ub are exactly [begin, lower_bound).
  log_keys_debug->erase(log_keys_debug->begin(),
                        log_keys_debug->lower_bound(ub));
}
616
617 void check();
// Reset every dirty marker to its clean value (the state in which
// is_dirty() finds nothing to report for these fields). check() is called
// after the version/trim markers are reset and before missing is flushed.
// Note: rebuilt_missing_with_deletes is not reset here.
618 void undirty() {
619 dirty_to = eversion_t();
620 dirty_from = eversion_t::max();
621 touched_log = true;
622 trimmed.clear();
c07f9fc5 623 trimmed_dups.clear();
7c673cae
FG
624 writeout_from = eversion_t::max();
625 check();
626 missing.flush();
c07f9fc5 627 dirty_dups = false;
7c673cae
FG
628 }
629public:
c07f9fc5 630
// Construct an empty PGLog.
//   cct - context returned by get_cct(); may be null
//   dpp - optional prefix provider used by gen_prefix()/get_subsys()
// dirty_from and writeout_from start at eversion_t::max() (nothing dirty);
// touched_log starts false, so is_dirty() is true until undirty() runs.
// pg_log_debug is !(cct && !flag), i.e. (!cct || flag): enabled when there
// is no context or when osd_debug_pg_log_writeout is set.
7c673cae 631 // cppcheck-suppress noExplicitConstructor
c07f9fc5 632 PGLog(CephContext *cct, DoutPrefixProvider *dpp = nullptr) :
7c673cae
FG
633 prefix_provider(dpp),
634 dirty_from(eversion_t::max()),
635 writeout_from(eversion_t::max()),
636 cct(cct),
637 pg_log_debug(!(cct && !(cct->_conf->osd_debug_pg_log_writeout))),
638 touched_log(false),
c07f9fc5
FG
639 clear_divergent_priors(false),
640 dirty_dups(false)
641 { }
7c673cae
FG
642
643 void reset_backfill();
644
645 void clear();
646
647 //////////////////// get or set missing ////////////////////
648
// Read-only access to the missing-object tracker.
649 const pg_missing_tracker_t& get_missing() const { return missing; }
// Forwards to missing.revise_have(oid, have).
650 void revise_have(hobject_t oid, eversion_t have) {
651 missing.revise_have(oid, have);
652 }
653
// Forwards to missing.add(oid, need, have, false).
// NOTE(review): the meaning of the trailing `false` is defined by
// pg_missing_tracker_t::add, which is not visible here.
7c673cae 654 void missing_add(const hobject_t& oid, eversion_t need, eversion_t have) {
c07f9fc5 655 missing.add(oid, need, have, false);
7c673cae
FG
656 }
657
7c673cae
FG
658 //////////////////// get or set log ////////////////////
659
// Read-only access to the indexed log.
660 const IndexedLog &get_log() const { return log; }
661
// Tail version of the log (log.tail).
662 const eversion_t &get_tail() const { return log.tail; }
663
// Overwrites log.tail directly; no dirty marker is touched here.
664 void set_tail(eversion_t tail) { log.tail = tail; }
665
// Head version of the log (log.head).
666 const eversion_t &get_head() const { return log.head; }
667
// Overwrites log.head directly; no dirty marker is touched here.
668 void set_head(eversion_t head) { log.head = head; }
669
// Sets the log's last_requested recovery pointer.
670 void set_last_requested(version_t last_requested) {
671 log.last_requested = last_requested;
672 }
673
// Rebuild all in-memory indexes of the log.
674 void index() { log.index(); }
675
// Drop all in-memory indexes of the log.
676 void unindex() { log.unindex(); }
677
// Append an entry to the log, first lowering writeout_from to the entry's
// version so that is_dirty() reports it. `applied` is forwarded to
// IndexedLog::add.
678 void add(const pg_log_entry_t& e, bool applied = true) {
679 mark_writeout_from(e.version);
680 log.add(e, applied);
681 }
682
// Forwards to IndexedLog::reset_recovery_pointers().
683 void reset_recovery_pointers() { log.reset_recovery_pointers(); }
684
685 static void clear_info_log(
686 spg_t pgid,
687 ObjectStore::Transaction *t);
688
689 void trim(
690 eversion_t trim_to,
691 pg_info_t &info);
692
693 void roll_forward_to(
694 eversion_t roll_forward_to,
695 LogEntryHandler *h) {
696 log.roll_forward_to(
697 roll_forward_to,
698 h);
699 }
700
// Returns the log's current can_rollback_to boundary.
701 eversion_t get_can_rollback_to() const {
702 return log.get_can_rollback_to();
703 }
704
705 void roll_forward(LogEntryHandler *h) {
706 roll_forward_to(
707 log.head,
708 h);
709 }
710
711 //////////////////// get or set log & missing ////////////////////
712
// Replace our log with o: first trim all rollback info up to head via h
// (this establishes the precondition asserted by
// claim_log_and_clear_rollback_info), then claim o, clear the missing set
// and mark every key dirty up to eversion_t::max().
713 void reset_backfill_claim_log(const pg_log_t &o, LogEntryHandler *h) {
714 log.trim_rollback_info_to(log.head, h);
715 log.claim_log_and_clear_rollback_info(o);
716 missing.clear();
717 mark_dirty_to(eversion_t::max());
718 }
719
// Split log and missing set into the child PG's PGLog (*opg_log), then mark
// both parent and child fully dirty. If our missing set may include
// deletes, the child's rebuilt_missing_with_deletes flag is set.
// opg_log must be non-null (it is dereferenced unconditionally).
720 void split_into(
721 pg_t child_pgid,
722 unsigned split_bits,
c07f9fc5 723 PGLog *opg_log) {
7c673cae
FG
724 log.split_out_child(child_pgid, split_bits, &opg_log->log);
725 missing.split_into(child_pgid, split_bits, &(opg_log->missing));
726 opg_log->mark_dirty_to(eversion_t::max());
727 mark_dirty_to(eversion_t::max());
c07f9fc5
FG
728 if (missing.may_include_deletes)
729 opg_log->rebuilt_missing_with_deletes = true;
7c673cae
FG
730 }
731
732 void recover_got(hobject_t oid, eversion_t v, pg_info_t &info) {
733 if (missing.is_missing(oid, v)) {
734 missing.got(oid, v);
c07f9fc5 735
7c673cae
FG
736 // raise last_complete?
737 if (missing.get_items().empty()) {
738 log.complete_to = log.log.end();
739 info.last_complete = info.last_update;
740 }
741 while (log.complete_to != log.log.end()) {
742 if (missing.get_items().at(
743 missing.get_rmissing().begin()->second
744 ).need <= log.complete_to->version)
745 break;
746 if (info.last_complete < log.complete_to->version)
747 info.last_complete = log.complete_to->version;
748 ++log.complete_to;
749 }
750 }
751
752 assert(log.get_can_rollback_to() >= v);
753 }
754
c07f9fc5 755 void reset_complete_to(pg_info_t *info) {
7c673cae 756 log.complete_to = log.log.begin();
c07f9fc5 757 while (!missing.get_items().empty() && log.complete_to->version <
7c673cae
FG
758 missing.get_items().at(
759 missing.get_rmissing().begin()->second
c07f9fc5 760 ).need) {
d2e6a577 761 assert(log.complete_to != log.log.end());
7c673cae 762 ++log.complete_to;
c07f9fc5 763 }
7c673cae
FG
764 assert(log.complete_to != log.log.end());
765 if (log.complete_to == log.log.begin()) {
c07f9fc5
FG
766 if (info)
767 info->last_complete = eversion_t();
7c673cae
FG
768 } else {
769 --log.complete_to;
c07f9fc5
FG
770 if (info)
771 info->last_complete = log.complete_to->version;
7c673cae
FG
772 ++log.complete_to;
773 }
c07f9fc5
FG
774 }
775
// Recompute complete_to and info.last_complete via reset_complete_to(),
// then reset log.last_requested to 0.
776 void activate_not_complete(pg_info_t &info) {
777 reset_complete_to(&info);
7c673cae
FG
778 log.last_requested = 0;
779 }
780
781 void proc_replica_log(pg_info_t &oinfo,
782 const pg_log_t &olog,
783 pg_missing_t& omissing, pg_shard_t from) const;
784
c07f9fc5
FG
785 void rebuild_missing_set_with_deletes(ObjectStore *store,
786 coll_t pg_coll,
787 const pg_info_t &info);
788
7c673cae
FG
789protected:
790 static void split_by_object(
31f18b77
FG
791 mempool::osd_pglog::list<pg_log_entry_t> &entries,
792 map<hobject_t, mempool::osd_pglog::list<pg_log_entry_t>> *out_entries) {
7c673cae 793 while (!entries.empty()) {
31f18b77 794 auto &out_list = (*out_entries)[entries.front().soid];
7c673cae
FG
795 out_list.splice(out_list.end(), entries, entries.begin());
796 }
797 }
798
799 /**
800 * _merge_object_divergent_entries
801 *
802 * There are 5 distinct cases:
803 * 1) There is a more recent update: in this case we assume we adjusted the
804 * store and missing during merge_log
805 * 2) The first entry in the divergent sequence is a create. This might
806 * either be because the object is a clone or because prior_version is
807 * eversion_t(). In this case the object does not exist and we must
808 * adjust missing and the store to match.
809 * 3) We are currently missing the object. In this case, we adjust the
810 * missing to our prior_version taking care to add a divergent_prior
811 * if necessary
812 * 4) We can rollback all of the entries. In this case, we do so using
813 * the rollbacker and return -- the object does not go into missing.
814 * 5) We cannot rollback at least 1 of the entries. In this case, we
815 * clear the object out of the store and add a missing entry at
816 * prior_version taking care to add a divergent_prior if
817 * necessary.
818 */
// Reconcile the divergent entries of a single object (hoid) against `log`,
// adjusting `missing` and, when a rollbacker is supplied, driving it to
// trim / remove / roll back. Returns early without doing anything when hoid
// is past info.last_backfill or when every entry is an ERROR entry.
// The "Case N)" markers below refer to the numbered cases in the comment
// preceding this function.
819 template <typename missing_type>
820 static void _merge_object_divergent_entries(
821 const IndexedLog &log, ///< [in] log to merge against
822 const hobject_t &hoid, ///< [in] object we are merging
31f18b77 823 const mempool::osd_pglog::list<pg_log_entry_t> &orig_entries, ///< [in] entries for hoid to merge
7c673cae
FG
824 const pg_info_t &info, ///< [in] info for merging entries
825 eversion_t olog_can_rollback_to, ///< [in] rollback boundary
c07f9fc5 826 missing_type &missing, ///< [in,out] missing to adjust, use
7c673cae
FG
827 LogEntryHandler *rollbacker, ///< [in] optional rollbacker object
828 const DoutPrefixProvider *dpp ///< [in] logging provider
829 ) {
830 ldpp_dout(dpp, 20) << __func__ << ": merging hoid " << hoid
31f18b77 831 << " entries: " << orig_entries << dendl;
7c673cae
FG
832
833 if (hoid > info.last_backfill) {
834 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid << " after last_backfill"
835 << dendl;
836 return;
837 }
838
839 // entries is non-empty
31f18b77
FG
840 assert(!orig_entries.empty());
841 // strip out and ignore ERROR entries
842 mempool::osd_pglog::list<pg_log_entry_t> entries;
7c673cae 843 eversion_t last;
d2e6a577 844 bool seen_non_error = false;
31f18b77
FG
845 for (list<pg_log_entry_t>::const_iterator i = orig_entries.begin();
846 i != orig_entries.end();
7c673cae
FG
847 ++i) {
848 // all entries are on hoid
849 assert(i->soid == hoid);
d2e6a577
FG
850 // no non-error entry was seen before this entry and this entry is not an
851 // error: this entry is the first non-error entry
852 bool first_non_error = ! seen_non_error && ! i->is_error();
853 if (! i->is_error() ) {
854 // see a non error entry now
855 seen_non_error = true;
856 }
857
858 // No need to check the first entry since its prior_version is unavailable
859 // in the list
860 // No need to check if the prior_version is the minimal version
861 // No need to check the first non-error entry since the leading error
862 // entries are not its prior version
863 if (i != orig_entries.begin() && i->prior_version != eversion_t() &&
864 ! first_non_error) {
7c673cae
FG
865 // in increasing order of version
866 assert(i->version > last);
31f18b77
FG
867 // prior_version correct (unless it is an ERROR entry)
868 assert(i->prior_version == last || i->is_error());
7c673cae 869 }
31f18b77
FG
870 if (i->is_error()) {
871 ldpp_dout(dpp, 20) << __func__ << ": ignoring " << *i << dendl;
872 } else {
873 ldpp_dout(dpp, 20) << __func__ << ": keeping " << *i << dendl;
874 entries.push_back(*i);
d2e6a577 875 last = i->version;
31f18b77
FG
876 }
877 }
878 if (entries.empty()) {
879 ldpp_dout(dpp, 10) << __func__ << ": no non-ERROR entries" << dendl;
880 return;
7c673cae
FG
881 }
882
// Summary of the surviving (non-ERROR) divergent sequence.
883 const eversion_t prior_version = entries.begin()->prior_version;
884 const eversion_t first_divergent_update = entries.begin()->version;
885 const eversion_t last_divergent_update = entries.rbegin()->version;
// True when hoid is not currently missing and the last divergent entry is
// a delete; in that case the rollbacker is not asked to remove the object.
886 const bool object_not_in_store =
887 !missing.is_missing(hoid) &&
888 entries.rbegin()->is_delete();
889 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
890 << " prior_version: " << prior_version
891 << " first_divergent_update: " << first_divergent_update
892 << " last_divergent_update: " << last_divergent_update
893 << dendl;
894
// NOTE: reads log.objects directly; unlike logged_object() this does not
// build the objects index on demand.
895 ceph::unordered_map<hobject_t, pg_log_entry_t*>::const_iterator objiter =
896 log.objects.find(hoid);
897 if (objiter != log.objects.end() &&
898 objiter->second->version >= first_divergent_update) {
899 /// Case 1)
900 ldpp_dout(dpp, 10) << __func__ << ": more recent entry found: "
901 << *objiter->second << ", already merged" << dendl;
902
903 assert(objiter->second->version > last_divergent_update);
904
905 // ensure missing has been updated appropriately
c07f9fc5
FG
906 if (objiter->second->is_update() ||
907 (missing.may_include_deletes && objiter->second->is_delete())) {
7c673cae
FG
908 assert(missing.is_missing(hoid) &&
909 missing.get_items().at(hoid).need == objiter->second->version);
910 } else {
911 assert(!missing.is_missing(hoid));
912 }
913 missing.revise_have(hoid, eversion_t());
914 if (rollbacker) {
915 if (!object_not_in_store) {
916 rollbacker->remove(hoid);
917 }
918 for (auto &&i: entries) {
919 rollbacker->trim(i);
920 }
921 }
922 return;
923 }
924
925 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
926 <<" has no more recent entries in log" << dendl;
927 if (prior_version == eversion_t() || entries.front().is_clone()) {
928 /// Case 2)
929 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
930 << " prior_version or op type indicates creation,"
931 << " deleting"
932 << dendl;
933 if (missing.is_missing(hoid))
934 missing.rm(missing.get_items().find(hoid));
935 if (rollbacker) {
936 if (!object_not_in_store) {
937 rollbacker->remove(hoid);
938 }
939 for (auto &&i: entries) {
940 rollbacker->trim(i);
941 }
942 }
943 return;
944 }
945
946 if (missing.is_missing(hoid)) {
947 /// Case 3)
948 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
949 << " missing, " << missing.get_items().at(hoid)
950 << " adjusting" << dendl;
951
952 if (missing.get_items().at(hoid).have == prior_version) {
953 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
954 << " missing.have is prior_version " << prior_version
955 << " removing from missing" << dendl;
956 missing.rm(missing.get_items().find(hoid));
957 } else {
958 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
959 << " missing.have is " << missing.get_items().at(hoid).have
960 << ", adjusting" << dendl;
c07f9fc5 961 missing.revise_need(hoid, prior_version, false);
7c673cae
FG
962 if (prior_version <= info.log_tail) {
963 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
964 << " prior_version " << prior_version
965 << " <= info.log_tail "
966 << info.log_tail << dendl;
967 }
968 }
969 if (rollbacker) {
970 for (auto &&i: entries) {
971 rollbacker->trim(i);
972 }
973 }
974 return;
975 }
976
977 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
978 << " must be rolled back or recovered,"
979 << " attempting to rollback"
980 << dendl;
981 bool can_rollback = true;
982 /// Distinguish between 4) and 5)
983 for (list<pg_log_entry_t>::const_reverse_iterator i = entries.rbegin();
984 i != entries.rend();
985 ++i) {
986 if (!i->can_rollback() || i->version <= olog_can_rollback_to) {
987 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid << " cannot rollback "
988 << *i << dendl;
989 can_rollback = false;
990 break;
991 }
992 }
993
994 if (can_rollback) {
995 /// Case 4)
// Roll back newest-first (reverse iteration over entries).
996 for (list<pg_log_entry_t>::const_reverse_iterator i = entries.rbegin();
997 i != entries.rend();
998 ++i) {
999 assert(i->can_rollback() && i->version > olog_can_rollback_to);
1000 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
1001 << " rolling back " << *i << dendl;
1002 if (rollbacker)
1003 rollbacker->rollback(*i);
1004 }
1005 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
1006 << " rolled back" << dendl;
1007 return;
1008 } else {
1009 /// Case 5)
1010 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid << " cannot roll back, "
1011 << "removing and adding to missing" << dendl;
1012 if (rollbacker) {
1013 if (!object_not_in_store)
1014 rollbacker->remove(hoid);
1015 for (auto &&i: entries) {
1016 rollbacker->trim(i);
1017 }
1018 }
c07f9fc5 1019 missing.add(hoid, prior_version, eversion_t(), false);
7c673cae
FG
1020 if (prior_version <= info.log_tail) {
1021 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
1022 << " prior_version " << prior_version
1023 << " <= info.log_tail "
1024 << info.log_tail << dendl;
1025 }
1026 }
1027 }
1028
1029 /// Merge all entries using above
1030 template <typename missing_type>
1031 static void _merge_divergent_entries(
1032 const IndexedLog &log, ///< [in] log to merge against
31f18b77 1033 mempool::osd_pglog::list<pg_log_entry_t> &entries, ///< [in] entries to merge
7c673cae
FG
1034 const pg_info_t &oinfo, ///< [in] info for merging entries
1035 eversion_t olog_can_rollback_to, ///< [in] rollback boundary
1036 missing_type &omissing, ///< [in,out] missing to adjust, use
1037 LogEntryHandler *rollbacker, ///< [in] optional rollbacker object
1038 const DoutPrefixProvider *dpp ///< [in] logging provider
1039 ) {
31f18b77 1040 map<hobject_t, mempool::osd_pglog::list<pg_log_entry_t> > split;
7c673cae 1041 split_by_object(entries, &split);
31f18b77 1042 for (map<hobject_t, mempool::osd_pglog::list<pg_log_entry_t>>::iterator i = split.begin();
7c673cae
FG
1043 i != split.end();
1044 ++i) {
1045 _merge_object_divergent_entries(
1046 log,
1047 i->first,
1048 i->second,
1049 oinfo,
1050 olog_can_rollback_to,
1051 omissing,
1052 rollbacker,
1053 dpp);
1054 }
1055 }
1056
1057 /**
1058 * Exists for use in TestPGLog for simply testing single divergent log
1059 * cases
1060 */
1061 void merge_old_entry(
1062 ObjectStore::Transaction& t,
1063 const pg_log_entry_t& oe,
1064 const pg_info_t& info,
1065 LogEntryHandler *rollbacker) {
31f18b77 1066 mempool::osd_pglog::list<pg_log_entry_t> entries;
7c673cae
FG
1067 entries.push_back(oe);
1068 _merge_object_divergent_entries(
1069 log,
1070 oe.soid,
1071 entries,
1072 info,
1073 log.get_can_rollback_to(),
1074 missing,
1075 rollbacker,
1076 this);
1077 }
c07f9fc5
FG
1078
1079 bool merge_log_dups(const pg_log_t& olog);
1080
7c673cae 1081public:
c07f9fc5 1082
7c673cae
FG
1083 void rewind_divergent_log(eversion_t newhead,
1084 pg_info_t &info,
1085 LogEntryHandler *rollbacker,
1086 bool &dirty_info,
1087 bool &dirty_big_info);
1088
1089 void merge_log(pg_info_t &oinfo,
1090 pg_log_t &olog,
1091 pg_shard_t from,
1092 pg_info_t &info, LogEntryHandler *rollbacker,
1093 bool &dirty_info, bool &dirty_big_info);
1094
1095 template <typename missing_type>
1096 static bool append_log_entries_update_missing(
1097 const hobject_t &last_backfill,
1098 bool last_backfill_bitwise,
31f18b77 1099 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
7c673cae
FG
1100 bool maintain_rollback,
1101 IndexedLog *log,
1102 missing_type &missing,
1103 LogEntryHandler *rollbacker,
1104 const DoutPrefixProvider *dpp) {
1105 bool invalidate_stats = false;
1106 if (log && !entries.empty()) {
1107 assert(log->head < entries.begin()->version);
1108 }
1109 for (list<pg_log_entry_t>::const_iterator p = entries.begin();
1110 p != entries.end();
1111 ++p) {
1112 invalidate_stats = invalidate_stats || !p->is_error();
1113 if (log) {
1114 ldpp_dout(dpp, 20) << "update missing, append " << *p << dendl;
1115 log->add(*p);
1116 }
1117 if (p->soid <= last_backfill &&
1118 !p->is_error()) {
c07f9fc5
FG
1119 if (missing.may_include_deletes) {
1120 missing.add_next_event(*p);
1121 } else {
1122 if (p->is_delete()) {
1123 missing.rm(p->soid, p->version);
1124 } else {
1125 missing.add_next_event(*p);
1126 }
1127 if (rollbacker) {
1128 // hack to match PG::mark_all_unfound_lost
1129 if (maintain_rollback && p->is_lost_delete() && p->can_rollback()) {
1130 rollbacker->try_stash(p->soid, p->version.version);
1131 } else if (p->is_delete()) {
1132 rollbacker->remove(p->soid);
1133 }
7c673cae
FG
1134 }
1135 }
1136 }
1137 }
1138 return invalidate_stats;
1139 }
1140 bool append_new_log_entries(
1141 const hobject_t &last_backfill,
1142 bool last_backfill_bitwise,
31f18b77 1143 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
7c673cae
FG
1144 LogEntryHandler *rollbacker) {
1145 bool invalidate_stats = append_log_entries_update_missing(
1146 last_backfill,
1147 last_backfill_bitwise,
1148 entries,
1149 true,
1150 &log,
1151 missing,
1152 rollbacker,
1153 this);
1154 if (!entries.empty()) {
1155 mark_writeout_from(entries.begin()->version);
c07f9fc5
FG
1156 if (entries.begin()->is_lost_delete()) {
1157 // hack: since lost deletes queue recovery directly, and don't
1158 // go through activate_not_complete() again, our complete_to
1159 // iterator may still point at log.end(). Reset it to point
1160 // before these new lost_delete entries. This only occurs
1161 // when lost+delete entries are initially added, which is
1162 // always in a list of solely lost_delete entries, so it is
1163 // sufficient to check whether the first entry is a
1164 // lost_delete
1165 reset_complete_to(nullptr);
1166 }
7c673cae
FG
1167 }
1168 return invalidate_stats;
1169 }
1170
c07f9fc5
FG
1171 void write_log_and_missing(
1172 ObjectStore::Transaction& t,
1173 map<string,bufferlist> *km,
1174 const coll_t& coll,
1175 const ghobject_t &log_oid,
1176 bool require_rollback);
7c673cae
FG
1177
1178 static void write_log_and_missing_wo_missing(
1179 ObjectStore::Transaction& t,
1180 map<string,bufferlist>* km,
1181 pg_log_t &log,
1182 const coll_t& coll,
1183 const ghobject_t &log_oid, map<eversion_t, hobject_t> &divergent_priors,
c07f9fc5
FG
1184 bool require_rollback,
1185 bool dirty_dups);
7c673cae
FG
1186
1187 static void write_log_and_missing(
1188 ObjectStore::Transaction& t,
1189 map<string,bufferlist>* km,
1190 pg_log_t &log,
1191 const coll_t& coll,
1192 const ghobject_t &log_oid,
1193 const pg_missing_tracker_t &missing,
c07f9fc5
FG
1194 bool require_rollback,
1195 bool dirty_dups,
1196 bool *rebuilt_missing_set_with_deletes);
7c673cae
FG
1197
1198 static void _write_log_and_missing_wo_missing(
1199 ObjectStore::Transaction& t,
1200 map<string,bufferlist>* km,
1201 pg_log_t &log,
1202 const coll_t& coll, const ghobject_t &log_oid,
1203 map<eversion_t, hobject_t> &divergent_priors,
1204 eversion_t dirty_to,
1205 eversion_t dirty_from,
1206 eversion_t writeout_from,
1207 const set<eversion_t> &trimmed,
c07f9fc5 1208 const set<string> &trimmed_dups,
7c673cae
FG
1209 bool dirty_divergent_priors,
1210 bool touch_log,
1211 bool require_rollback,
c07f9fc5 1212 bool dirty_dups,
7c673cae
FG
1213 set<string> *log_keys_debug
1214 );
1215
1216 static void _write_log_and_missing(
1217 ObjectStore::Transaction& t,
1218 map<string,bufferlist>* km,
1219 pg_log_t &log,
1220 const coll_t& coll, const ghobject_t &log_oid,
1221 eversion_t dirty_to,
1222 eversion_t dirty_from,
1223 eversion_t writeout_from,
1224 const set<eversion_t> &trimmed,
c07f9fc5 1225 const set<string> &trimmed_dups,
7c673cae
FG
1226 const pg_missing_tracker_t &missing,
1227 bool touch_log,
1228 bool require_rollback,
1229 bool clear_divergent_priors,
c07f9fc5
FG
1230 bool dirty_dups,
1231 bool *rebuilt_missing_with_deletes,
7c673cae
FG
1232 set<string> *log_keys_debug
1233 );
1234
1235 void read_log_and_missing(
c07f9fc5
FG
1236 ObjectStore *store,
1237 coll_t pg_coll,
1238 coll_t log_coll,
1239 ghobject_t log_oid,
7c673cae 1240 const pg_info_t &info,
d2e6a577 1241 bool force_rebuild_missing,
7c673cae
FG
1242 ostringstream &oss,
1243 bool tolerate_divergent_missing_log,
1244 bool debug_verify_stored_missing = false
1245 ) {
1246 return read_log_and_missing(
1247 store, pg_coll, log_coll, log_oid, info,
d2e6a577 1248 log, missing, force_rebuild_missing, oss,
7c673cae
FG
1249 tolerate_divergent_missing_log,
1250 &clear_divergent_priors,
1251 this,
c07f9fc5 1252 (pg_log_debug ? &log_keys_debug : nullptr),
7c673cae
FG
1253 debug_verify_stored_missing);
1254 }
1255
1256 template <typename missing_type>
c07f9fc5
FG
1257 static void read_log_and_missing(
1258 ObjectStore *store,
1259 coll_t pg_coll,
1260 coll_t log_coll,
1261 ghobject_t log_oid,
7c673cae
FG
1262 const pg_info_t &info,
1263 IndexedLog &log,
c07f9fc5 1264 missing_type &missing,
d2e6a577 1265 bool force_rebuild_missing,
c07f9fc5 1266 ostringstream &oss,
7c673cae 1267 bool tolerate_divergent_missing_log,
c07f9fc5
FG
1268 bool *clear_divergent_priors = nullptr,
1269 const DoutPrefixProvider *dpp = nullptr,
1270 set<string> *log_keys_debug = nullptr,
7c673cae
FG
1271 bool debug_verify_stored_missing = false
1272 ) {
1273 ldpp_dout(dpp, 20) << "read_log_and_missing coll " << pg_coll
1274 << " log_oid " << log_oid << dendl;
1275
1276 // legacy?
1277 struct stat st;
1278 int r = store->stat(log_coll, log_oid, &st);
1279 assert(r == 0);
1280 assert(st.st_size == 0);
1281
1282 // will get overridden below if it had been recorded
1283 eversion_t on_disk_can_rollback_to = info.last_update;
1284 eversion_t on_disk_rollback_info_trimmed_to = eversion_t();
1285 ObjectMap::ObjectMapIterator p = store->get_omap_iterator(log_coll, log_oid);
1286 map<eversion_t, hobject_t> divergent_priors;
c07f9fc5 1287 missing.may_include_deletes = false;
7c673cae 1288 list<pg_log_entry_t> entries;
c07f9fc5 1289 list<pg_log_dup_t> dups;
7c673cae
FG
1290 if (p) {
1291 for (p->seek_to_first(); p->valid() ; p->next(false)) {
1292 // non-log pgmeta_oid keys are prefixed with _; skip those
1293 if (p->key()[0] == '_')
1294 continue;
1295 bufferlist bl = p->value();//Copy bufferlist before creating iterator
1296 bufferlist::iterator bp = bl.begin();
1297 if (p->key() == "divergent_priors") {
1298 ::decode(divergent_priors, bp);
1299 ldpp_dout(dpp, 20) << "read_log_and_missing " << divergent_priors.size()
1300 << " divergent_priors" << dendl;
d2e6a577 1301 assert(force_rebuild_missing);
7c673cae
FG
1302 debug_verify_stored_missing = false;
1303 } else if (p->key() == "can_rollback_to") {
1304 ::decode(on_disk_can_rollback_to, bp);
1305 } else if (p->key() == "rollback_info_trimmed_to") {
1306 ::decode(on_disk_rollback_info_trimmed_to, bp);
c07f9fc5
FG
1307 } else if (p->key() == "may_include_deletes_in_missing") {
1308 missing.may_include_deletes = true;
7c673cae 1309 } else if (p->key().substr(0, 7) == string("missing")) {
c07f9fc5
FG
1310 hobject_t oid;
1311 pg_missing_item item;
1312 ::decode(oid, bp);
1313 ::decode(item, bp);
1314 if (item.is_delete()) {
1315 assert(missing.may_include_deletes);
1316 }
1317 missing.add(oid, item.need, item.have, item.is_delete());
1318 } else if (p->key().substr(0, 4) == string("dup_")) {
1319 pg_log_dup_t dup;
1320 ::decode(dup, bp);
1321 if (!dups.empty()) {
1322 assert(dups.back().version < dup.version);
1323 }
1324 dups.push_back(dup);
7c673cae
FG
1325 } else {
1326 pg_log_entry_t e;
1327 e.decode_with_checksum(bp);
1328 ldpp_dout(dpp, 20) << "read_log_and_missing " << e << dendl;
1329 if (!entries.empty()) {
1330 pg_log_entry_t last_e(entries.back());
1331 assert(last_e.version.version < e.version.version);
1332 assert(last_e.version.epoch <= e.version.epoch);
1333 }
1334 entries.push_back(e);
1335 if (log_keys_debug)
1336 log_keys_debug->insert(e.get_key_name());
1337 }
1338 }
1339 }
1340 log = IndexedLog(
1341 info.last_update,
1342 info.log_tail,
1343 on_disk_can_rollback_to,
1344 on_disk_rollback_info_trimmed_to,
c07f9fc5
FG
1345 std::move(entries),
1346 std::move(dups));
7c673cae 1347
d2e6a577 1348 if (force_rebuild_missing || debug_verify_stored_missing) {
7c673cae
FG
1349 // build missing
1350 if (debug_verify_stored_missing || info.last_complete < info.last_update) {
c07f9fc5
FG
1351 ldpp_dout(dpp, 10)
1352 << "read_log_and_missing checking for missing items over interval ("
1353 << info.last_complete
1354 << "," << info.last_update << "]" << dendl;
7c673cae
FG
1355
1356 set<hobject_t> did;
1357 set<hobject_t> checked;
1358 set<hobject_t> skipped;
1359 for (list<pg_log_entry_t>::reverse_iterator i = log.log.rbegin();
1360 i != log.log.rend();
1361 ++i) {
1362 if (!debug_verify_stored_missing && i->version <= info.last_complete) break;
1363 if (i->soid > info.last_backfill)
1364 continue;
1365 if (i->is_error())
1366 continue;
1367 if (did.count(i->soid)) continue;
1368 did.insert(i->soid);
1369
c07f9fc5
FG
1370 if (!missing.may_include_deletes && i->is_delete())
1371 continue;
7c673cae
FG
1372
1373 bufferlist bv;
1374 int r = store->getattr(
1375 pg_coll,
1376 ghobject_t(i->soid, ghobject_t::NO_GEN, info.pgid.shard),
1377 OI_ATTR,
1378 bv);
1379 if (r >= 0) {
1380 object_info_t oi(bv);
1381 if (oi.version < i->version) {
1382 ldpp_dout(dpp, 15) << "read_log_and_missing missing " << *i
1383 << " (have " << oi.version << ")" << dendl;
1384 if (debug_verify_stored_missing) {
1385 auto miter = missing.get_items().find(i->soid);
1386 assert(miter != missing.get_items().end());
1387 assert(miter->second.need == i->version);
c07f9fc5
FG
1388 // the 'have' version is reset if an object is deleted,
1389 // then created again
1390 assert(miter->second.have == oi.version || miter->second.have == eversion_t());
7c673cae
FG
1391 checked.insert(i->soid);
1392 } else {
c07f9fc5 1393 missing.add(i->soid, i->version, oi.version, i->is_delete());
7c673cae
FG
1394 }
1395 }
1396 } else {
1397 ldpp_dout(dpp, 15) << "read_log_and_missing missing " << *i << dendl;
1398 if (debug_verify_stored_missing) {
1399 auto miter = missing.get_items().find(i->soid);
c07f9fc5
FG
1400 if (i->is_delete()) {
1401 assert(miter == missing.get_items().end() ||
1402 (miter->second.need == i->version &&
1403 miter->second.have == eversion_t()));
1404 } else {
1405 assert(miter != missing.get_items().end());
1406 assert(miter->second.need == i->version);
1407 assert(miter->second.have == eversion_t());
1408 }
7c673cae
FG
1409 checked.insert(i->soid);
1410 } else {
c07f9fc5 1411 missing.add(i->soid, i->version, eversion_t(), i->is_delete());
7c673cae
FG
1412 }
1413 }
1414 }
1415 if (debug_verify_stored_missing) {
1416 for (auto &&i: missing.get_items()) {
1417 if (checked.count(i.first))
1418 continue;
c07f9fc5
FG
1419 if (i.first > info.last_backfill) {
1420 ldpp_dout(dpp, -1) << __func__ << ": invalid missing set entry "
1421 << "found before last_backfill: "
1422 << i.first << " " << i.second
1423 << " last_backfill = " << info.last_backfill
1424 << dendl;
7c673cae
FG
1425 assert(0 == "invalid missing set entry found");
1426 }
1427 bufferlist bv;
1428 int r = store->getattr(
1429 pg_coll,
1430 ghobject_t(i.first, ghobject_t::NO_GEN, info.pgid.shard),
1431 OI_ATTR,
1432 bv);
1433 if (r >= 0) {
1434 object_info_t oi(bv);
1435 assert(oi.version == i.second.have);
1436 } else {
c07f9fc5 1437 assert(i.second.is_delete() || eversion_t() == i.second.have);
7c673cae
FG
1438 }
1439 }
1440 } else {
d2e6a577 1441 assert(force_rebuild_missing);
7c673cae
FG
1442 for (map<eversion_t, hobject_t>::reverse_iterator i =
1443 divergent_priors.rbegin();
1444 i != divergent_priors.rend();
1445 ++i) {
1446 if (i->first <= info.last_complete) break;
1447 if (i->second > info.last_backfill)
1448 continue;
1449 if (did.count(i->second)) continue;
1450 did.insert(i->second);
1451 bufferlist bv;
1452 int r = store->getattr(
1453 pg_coll,
1454 ghobject_t(i->second, ghobject_t::NO_GEN, info.pgid.shard),
1455 OI_ATTR,
1456 bv);
1457 if (r >= 0) {
1458 object_info_t oi(bv);
1459 /**
1460 * 1) we see this entry in the divergent priors mapping
1461 * 2) we didn't see an entry for this object in the log
1462 *
1463 * From 1 & 2 we know that either the object does not exist
1464 * or it is at the version specified in the divergent_priors
1465 * map since the object would have been deleted atomically
1466 * with the addition of the divergent_priors entry, an older
1467 * version would not have been recovered, and a newer version
1468 * would show up in the log above.
1469 */
1470 /**
1471 * Unfortunately the assessment above is incorrect because of
1472 * http://tracker.ceph.com/issues/17916 (we were incorrectly
1473 * not removing the divergent_priors set from disk state!),
1474 * so let's check that.
1475 */
1476 if (oi.version > i->first && tolerate_divergent_missing_log) {
1477 ldpp_dout(dpp, 0) << "read_log divergent_priors entry (" << *i
1478 << ") inconsistent with disk state (" << oi
1479 << "), assuming it is tracker.ceph.com/issues/17916"
1480 << dendl;
1481 } else {
1482 assert(oi.version == i->first);
1483 }
1484 } else {
1485 ldpp_dout(dpp, 15) << "read_log_and_missing missing " << *i << dendl;
c07f9fc5 1486 missing.add(i->second, i->first, eversion_t(), false);
7c673cae
FG
1487 }
1488 }
1489 }
1490 if (clear_divergent_priors)
1491 (*clear_divergent_priors) = true;
1492 }
1493 }
1494
d2e6a577 1495 if (!force_rebuild_missing) {
7c673cae
FG
1496 if (clear_divergent_priors)
1497 (*clear_divergent_priors) = false;
1498 missing.flush();
1499 }
1500 ldpp_dout(dpp, 10) << "read_log_and_missing done" << dendl;
c07f9fc5
FG
1501 } // static read_log_and_missing
1502}; // struct PGLog