]> git.proxmox.com Git - ceph.git/blob - ceph/src/osd/PGLog.h
Import ceph 15.2.8
[ceph.git] / ceph / src / osd / PGLog.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17 #pragma once
18
19 // re-include our assert to clobber boost's
20 #include "include/ceph_assert.h"
21 #include "include/common_fwd.h"
22 #include "osd_types.h"
23 #include "os/ObjectStore.h"
24 #include <list>
25
26 #ifdef WITH_SEASTAR
27 #include <seastar/core/future.hh>
28 #include "crimson/os/futurized_store.h"
29 #include "crimson/os/cyanstore/cyan_collection.h"
30 #endif
31
// Bit flags describing which of IndexedLog's lazily-built in-memory
// indexes (objects / caller_ops / extra_caller_ops / dup_index) are
// currently valid; see IndexedLog::index(__u16).
constexpr auto PGLOG_INDEXED_OBJECTS = 1 << 0;
constexpr auto PGLOG_INDEXED_CALLER_OPS = 1 << 1;
constexpr auto PGLOG_INDEXED_EXTRA_CALLER_OPS = 1 << 2;
constexpr auto PGLOG_INDEXED_DUPS = 1 << 3;
constexpr auto PGLOG_INDEXED_ALL = PGLOG_INDEXED_OBJECTS
  | PGLOG_INDEXED_CALLER_OPS
  | PGLOG_INDEXED_EXTRA_CALLER_OPS
  | PGLOG_INDEXED_DUPS;
40
41 struct PGLog : DoutPrefixProvider {
  // DoutPrefixProvider: this class adds no per-object prefix to log lines
  std::ostream& gen_prefix(std::ostream& out) const override {
    return out;
  }
  // DoutPrefixProvider: log under the OSD debug subsystem
  unsigned get_subsys() const override {
    return static_cast<unsigned>(ceph_subsys_osd);
  }
  // DoutPrefixProvider: context the logging macros use
  CephContext *get_cct() const override {
    return cct;
  }
51
52 ////////////////////////////// sub classes //////////////////////////////
  /// Callback interface the log invokes while trimming entries or moving
  /// the can_rollback_to / rollback_info_trimmed_to boundaries; the
  /// implementation applies the corresponding store-side effects.
  struct LogEntryHandler {
    /// undo the update recorded by entry
    virtual void rollback(
      const pg_log_entry_t &entry) = 0;
    /// called when entry passes the roll-forward boundary
    virtual void rollforward(
      const pg_log_entry_t &entry) = 0;
    /// called when entry's rollback info is being trimmed
    virtual void trim(
      const pg_log_entry_t &entry) = 0;
    /// remove hoid from the store
    virtual void remove(
      const hobject_t &hoid) = 0;
    /// stash version v of hoid instead of removing it outright
    virtual void try_stash(
      const hobject_t &hoid,
      version_t v) = 0;
    virtual ~LogEntryHandler() {}
  };
67 using LogEntryHandlerRef = unique_ptr<LogEntryHandler>;
68
69 public:
70 /**
71 * IndexLog - adds in-memory index of the log, by oid.
72 * plus some methods to manipulate it all.
73 */
74 struct IndexedLog : public pg_log_t {
75 mutable ceph::unordered_map<hobject_t,pg_log_entry_t*> objects; // ptrs into log. be careful!
76 mutable ceph::unordered_map<osd_reqid_t,pg_log_entry_t*> caller_ops;
77 mutable ceph::unordered_multimap<osd_reqid_t,pg_log_entry_t*> extra_caller_ops;
78 mutable ceph::unordered_map<osd_reqid_t,pg_log_dup_t*> dup_index;
79
80 // recovery pointers
81 list<pg_log_entry_t>::iterator complete_to; // not inclusive of referenced item
82 version_t last_requested = 0; // last object requested by primary
83
84 //
85 private:
86 mutable __u16 indexed_data = 0;
87 /**
88 * rollback_info_trimmed_to_riter points to the first log entry <=
89 * rollback_info_trimmed_to
90 *
91 * It's a reverse_iterator because rend() is a natural representation for
92 * tail, and rbegin() works nicely for head.
93 */
94 mempool::osd_pglog::list<pg_log_entry_t>::reverse_iterator
95 rollback_info_trimmed_to_riter;
96
97 /*
98 * return true if we need to mark the pglog as dirty
99 */
100 template <typename F>
101 bool advance_can_rollback_to(eversion_t to, F &&f) {
102 bool dirty_log = to > can_rollback_to || to > rollback_info_trimmed_to;
103 if (dirty_log) {
104 if (to > can_rollback_to)
105 can_rollback_to = to;
106
107 if (to > rollback_info_trimmed_to)
108 rollback_info_trimmed_to = to;
109 }
110
111 while (rollback_info_trimmed_to_riter != log.rbegin()) {
112 --rollback_info_trimmed_to_riter;
113 if (rollback_info_trimmed_to_riter->version > rollback_info_trimmed_to) {
114 ++rollback_info_trimmed_to_riter;
115 break;
116 }
117 f(*rollback_info_trimmed_to_riter);
118 }
119
120 return dirty_log;
121 }
122
123 void reset_rollback_info_trimmed_to_riter() {
124 rollback_info_trimmed_to_riter = log.rbegin();
125 while (rollback_info_trimmed_to_riter != log.rend() &&
126 rollback_info_trimmed_to_riter->version > rollback_info_trimmed_to)
127 ++rollback_info_trimmed_to_riter;
128 }
129
130 // indexes objects, caller ops and extra caller ops
131 public:
132 IndexedLog() :
133 complete_to(log.end()),
134 last_requested(0),
135 indexed_data(0),
136 rollback_info_trimmed_to_riter(log.rbegin())
137 { }
138
139 template <typename... Args>
140 explicit IndexedLog(Args&&... args) :
141 pg_log_t(std::forward<Args>(args)...),
142 complete_to(log.end()),
143 last_requested(0),
144 indexed_data(0),
145 rollback_info_trimmed_to_riter(log.rbegin())
146 {
147 reset_rollback_info_trimmed_to_riter();
148 index();
149 }
150
151 IndexedLog(const IndexedLog &rhs) :
152 pg_log_t(rhs),
153 complete_to(log.end()),
154 last_requested(rhs.last_requested),
155 indexed_data(0),
156 rollback_info_trimmed_to_riter(log.rbegin())
157 {
158 reset_rollback_info_trimmed_to_riter();
159 index(rhs.indexed_data);
160 }
161
162 IndexedLog &operator=(const IndexedLog &rhs) {
163 this->~IndexedLog();
164 new (this) IndexedLog(rhs);
165 return *this;
166 }
167
168 void trim_rollback_info_to(eversion_t to, LogEntryHandler *h) {
169 advance_can_rollback_to(
170 to,
171 [&](pg_log_entry_t &entry) {
172 h->trim(entry);
173 });
174 }
175 bool roll_forward_to(eversion_t to, LogEntryHandler *h) {
176 return advance_can_rollback_to(
177 to,
178 [&](pg_log_entry_t &entry) {
179 h->rollforward(entry);
180 });
181 }
182
183 void skip_can_rollback_to_to_head() {
184 advance_can_rollback_to(head, [&](const pg_log_entry_t &entry) {});
185 }
186
187 mempool::osd_pglog::list<pg_log_entry_t> rewind_from_head(eversion_t newhead) {
188 auto divergent = pg_log_t::rewind_from_head(newhead);
189 index();
190 reset_rollback_info_trimmed_to_riter();
191 return divergent;
192 }
193
194 template <typename T>
195 void scan_log_after(
196 const eversion_t &bound, ///< [in] scan entries > bound
197 T &&f) const {
198 auto iter = log.rbegin();
199 while (iter != log.rend() && iter->version > bound)
200 ++iter;
201
202 while (true) {
203 if (iter == log.rbegin())
204 break;
205 f(*(--iter));
206 }
207 }
208
209 /****/
210 void claim_log_and_clear_rollback_info(const pg_log_t& o) {
211 // we must have already trimmed the old entries
212 ceph_assert(rollback_info_trimmed_to == head);
213 ceph_assert(rollback_info_trimmed_to_riter == log.rbegin());
214
215 *this = IndexedLog(o);
216
217 skip_can_rollback_to_to_head();
218 index();
219 }
220
221 void split_out_child(
222 pg_t child_pgid,
223 unsigned split_bits,
224 IndexedLog *target);
225
226 void zero() {
227 // we must have already trimmed the old entries
228 ceph_assert(rollback_info_trimmed_to == head);
229 ceph_assert(rollback_info_trimmed_to_riter == log.rbegin());
230
231 unindex();
232 pg_log_t::clear();
233 rollback_info_trimmed_to_riter = log.rbegin();
234 reset_recovery_pointers();
235 }
236 void clear() {
237 skip_can_rollback_to_to_head();
238 zero();
239 }
240 void reset_recovery_pointers() {
241 complete_to = log.end();
242 last_requested = 0;
243 }
244
245 bool logged_object(const hobject_t& oid) const {
246 if (!(indexed_data & PGLOG_INDEXED_OBJECTS)) {
247 index_objects();
248 }
249 return objects.count(oid);
250 }
251
252 bool logged_req(const osd_reqid_t &r) const {
253 if (!(indexed_data & PGLOG_INDEXED_CALLER_OPS)) {
254 index_caller_ops();
255 }
256 if (!caller_ops.count(r)) {
257 if (!(indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS)) {
258 index_extra_caller_ops();
259 }
260 return extra_caller_ops.count(r);
261 }
262 return true;
263 }
264
265 bool get_request(
266 const osd_reqid_t &r,
267 eversion_t *version,
268 version_t *user_version,
269 int *return_code,
270 vector<pg_log_op_return_item_t> *op_returns) const
271 {
272 ceph_assert(version);
273 ceph_assert(user_version);
274 ceph_assert(return_code);
275 ceph::unordered_map<osd_reqid_t,pg_log_entry_t*>::const_iterator p;
276 if (!(indexed_data & PGLOG_INDEXED_CALLER_OPS)) {
277 index_caller_ops();
278 }
279 p = caller_ops.find(r);
280 if (p != caller_ops.end()) {
281 *version = p->second->version;
282 *user_version = p->second->user_version;
283 *return_code = p->second->return_code;
284 *op_returns = p->second->op_returns;
285 return true;
286 }
287
288 // warning: we will return *a* request for this reqid, but not
289 // necessarily the most recent.
290 if (!(indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS)) {
291 index_extra_caller_ops();
292 }
293 p = extra_caller_ops.find(r);
294 if (p != extra_caller_ops.end()) {
295 uint32_t idx = 0;
296 for (auto i = p->second->extra_reqids.begin();
297 i != p->second->extra_reqids.end();
298 ++idx, ++i) {
299 if (i->first == r) {
300 *version = p->second->version;
301 *user_version = i->second;
302 *return_code = p->second->return_code;
303 *op_returns = p->second->op_returns;
304 if (*return_code >= 0) {
305 auto it = p->second->extra_reqid_return_codes.find(idx);
306 if (it != p->second->extra_reqid_return_codes.end()) {
307 *return_code = it->second;
308 }
309 }
310 return true;
311 }
312 }
313 ceph_abort_msg("in extra_caller_ops but not extra_reqids");
314 }
315
316 if (!(indexed_data & PGLOG_INDEXED_DUPS)) {
317 index_dups();
318 }
319 auto q = dup_index.find(r);
320 if (q != dup_index.end()) {
321 *version = q->second->version;
322 *user_version = q->second->user_version;
323 *return_code = q->second->return_code;
324 *op_returns = q->second->op_returns;
325 return true;
326 }
327
328 return false;
329 }
330
331 bool has_write_since(const hobject_t &oid, const eversion_t &bound) const {
332 for (auto i = log.rbegin(); i != log.rend(); ++i) {
333 if (i->version <= bound)
334 return false;
335 if (i->soid.get_head() == oid.get_head())
336 return true;
337 }
338 return false;
339 }
340
341 /// get a (bounded) list of recent reqids for the given object
342 void get_object_reqids(const hobject_t& oid, unsigned max,
343 mempool::osd_pglog::vector<pair<osd_reqid_t, version_t> > *pls,
344 mempool::osd_pglog::map<uint32_t, int> *return_codes) const {
345 // make sure object is present at least once before we do an
346 // O(n) search.
347 if (!(indexed_data & PGLOG_INDEXED_OBJECTS)) {
348 index_objects();
349 }
350 if (objects.count(oid) == 0)
351 return;
352
353 for (list<pg_log_entry_t>::const_reverse_iterator i = log.rbegin();
354 i != log.rend();
355 ++i) {
356 if (i->soid == oid) {
357 if (i->reqid_is_indexed()) {
358 if (i->op == pg_log_entry_t::ERROR) {
359 // propagate op errors to the cache tier's PG log
360 return_codes->emplace(pls->size(), i->return_code);
361 }
362 pls->push_back(make_pair(i->reqid, i->user_version));
363 }
364
365 pls->insert(pls->end(), i->extra_reqids.begin(), i->extra_reqids.end());
366 if (pls->size() >= max) {
367 if (pls->size() > max) {
368 pls->resize(max);
369 }
370 return;
371 }
372 }
373 }
374 }
375
376 void index(__u16 to_index = PGLOG_INDEXED_ALL) const {
377 // if to_index is 0, no need to run any of this code, especially
378 // loop below; this can happen with copy constructor for
379 // IndexedLog (and indirectly through assignment operator)
380 if (!to_index) return;
381
382 if (to_index & PGLOG_INDEXED_OBJECTS)
383 objects.clear();
384 if (to_index & PGLOG_INDEXED_CALLER_OPS)
385 caller_ops.clear();
386 if (to_index & PGLOG_INDEXED_EXTRA_CALLER_OPS)
387 extra_caller_ops.clear();
388 if (to_index & PGLOG_INDEXED_DUPS) {
389 dup_index.clear();
390 for (auto& i : dups) {
391 dup_index[i.reqid] = const_cast<pg_log_dup_t*>(&i);
392 }
393 }
394
395 constexpr __u16 any_log_entry_index =
396 PGLOG_INDEXED_OBJECTS |
397 PGLOG_INDEXED_CALLER_OPS |
398 PGLOG_INDEXED_EXTRA_CALLER_OPS;
399
400 if (to_index & any_log_entry_index) {
401 for (list<pg_log_entry_t>::const_iterator i = log.begin();
402 i != log.end();
403 ++i) {
404 if (to_index & PGLOG_INDEXED_OBJECTS) {
405 if (i->object_is_indexed()) {
406 objects[i->soid] = const_cast<pg_log_entry_t*>(&(*i));
407 }
408 }
409
410 if (to_index & PGLOG_INDEXED_CALLER_OPS) {
411 if (i->reqid_is_indexed()) {
412 caller_ops[i->reqid] = const_cast<pg_log_entry_t*>(&(*i));
413 }
414 }
415
416 if (to_index & PGLOG_INDEXED_EXTRA_CALLER_OPS) {
417 for (auto j = i->extra_reqids.begin();
418 j != i->extra_reqids.end();
419 ++j) {
420 extra_caller_ops.insert(
421 make_pair(j->first, const_cast<pg_log_entry_t*>(&(*i))));
422 }
423 }
424 }
425 }
426
427 indexed_data |= to_index;
428 }
429
430 void index_objects() const {
431 index(PGLOG_INDEXED_OBJECTS);
432 }
433
434 void index_caller_ops() const {
435 index(PGLOG_INDEXED_CALLER_OPS);
436 }
437
438 void index_extra_caller_ops() const {
439 index(PGLOG_INDEXED_EXTRA_CALLER_OPS);
440 }
441
442 void index_dups() const {
443 index(PGLOG_INDEXED_DUPS);
444 }
445
446 void index(pg_log_entry_t& e) {
447 if ((indexed_data & PGLOG_INDEXED_OBJECTS) && e.object_is_indexed()) {
448 if (objects.count(e.soid) == 0 ||
449 objects[e.soid]->version < e.version)
450 objects[e.soid] = &e;
451 }
452 if (indexed_data & PGLOG_INDEXED_CALLER_OPS) {
453 // divergent merge_log indexes new before unindexing old
454 if (e.reqid_is_indexed()) {
455 caller_ops[e.reqid] = &e;
456 }
457 }
458 if (indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS) {
459 for (auto j = e.extra_reqids.begin();
460 j != e.extra_reqids.end();
461 ++j) {
462 extra_caller_ops.insert(make_pair(j->first, &e));
463 }
464 }
465 }
466
467 void unindex() {
468 objects.clear();
469 caller_ops.clear();
470 extra_caller_ops.clear();
471 dup_index.clear();
472 indexed_data = 0;
473 }
474
475 void unindex(const pg_log_entry_t& e) {
476 // NOTE: this only works if we remove from the _tail_ of the log!
477 if (indexed_data & PGLOG_INDEXED_OBJECTS) {
478 auto it = objects.find(e.soid);
479 if (it != objects.end() && it->second->version == e.version)
480 objects.erase(it);
481 }
482 if (e.reqid_is_indexed()) {
483 if (indexed_data & PGLOG_INDEXED_CALLER_OPS) {
484 auto it = caller_ops.find(e.reqid);
485 // divergent merge_log indexes new before unindexing old
486 if (it != caller_ops.end() && it->second == &e)
487 caller_ops.erase(it);
488 }
489 }
490 if (indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS) {
491 for (auto j = e.extra_reqids.begin();
492 j != e.extra_reqids.end();
493 ++j) {
494 for (ceph::unordered_multimap<osd_reqid_t,pg_log_entry_t*>::iterator k =
495 extra_caller_ops.find(j->first);
496 k != extra_caller_ops.end() && k->first == j->first;
497 ++k) {
498 if (k->second == &e) {
499 extra_caller_ops.erase(k);
500 break;
501 }
502 }
503 }
504 }
505 }
506
507 void index(pg_log_dup_t& e) {
508 if (indexed_data & PGLOG_INDEXED_DUPS) {
509 dup_index[e.reqid] = &e;
510 }
511 }
512
513 void unindex(const pg_log_dup_t& e) {
514 if (indexed_data & PGLOG_INDEXED_DUPS) {
515 auto i = dup_index.find(e.reqid);
516 if (i != dup_index.end()) {
517 dup_index.erase(i);
518 }
519 }
520 }
521
522 // actors
523 void add(const pg_log_entry_t& e, bool applied = true) {
524 if (!applied) {
525 ceph_assert(get_can_rollback_to() == head);
526 }
527
528 // make sure our buffers don't pin bigger buffers
529 e.mod_desc.trim_bl();
530
531 // add to log
532 log.push_back(e);
533
534 // riter previously pointed to the previous entry
535 if (rollback_info_trimmed_to_riter == log.rbegin())
536 ++rollback_info_trimmed_to_riter;
537
538 ceph_assert(e.version > head);
539 ceph_assert(head.version == 0 || e.version.version > head.version);
540 head = e.version;
541
542 // to our index
543 if ((indexed_data & PGLOG_INDEXED_OBJECTS) && e.object_is_indexed()) {
544 objects[e.soid] = &(log.back());
545 }
546 if (indexed_data & PGLOG_INDEXED_CALLER_OPS) {
547 if (e.reqid_is_indexed()) {
548 caller_ops[e.reqid] = &(log.back());
549 }
550 }
551
552 if (indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS) {
553 for (auto j = e.extra_reqids.begin();
554 j != e.extra_reqids.end();
555 ++j) {
556 extra_caller_ops.insert(make_pair(j->first, &(log.back())));
557 }
558 }
559
560 if (!applied) {
561 skip_can_rollback_to_to_head();
562 }
563 } // add
564
565 void trim(
566 CephContext* cct,
567 eversion_t s,
568 set<eversion_t> *trimmed,
569 set<string>* trimmed_dups,
570 eversion_t *write_from_dups);
571
572 ostream& print(ostream& out) const;
573 }; // IndexedLog
574
575
576 protected:
577 //////////////////// data members ////////////////////
578
579 pg_missing_tracker_t missing;
580 IndexedLog log;
581
582 eversion_t dirty_to; ///< must clear/writeout all keys <= dirty_to
583 eversion_t dirty_from; ///< must clear/writeout all keys >= dirty_from
584 eversion_t writeout_from; ///< must writout keys >= writeout_from
585 set<eversion_t> trimmed; ///< must clear keys in trimmed
586 eversion_t dirty_to_dups; ///< must clear/writeout all dups <= dirty_to_dups
587 eversion_t dirty_from_dups; ///< must clear/writeout all dups >= dirty_from_dups
588 eversion_t write_from_dups; ///< must write keys >= write_from_dups
589 set<string> trimmed_dups; ///< must clear keys in trimmed_dups
590 CephContext *cct;
591 bool pg_log_debug;
592 /// Log is clean on [dirty_to, dirty_from)
593 bool touched_log;
594 bool dirty_log;
595 bool clear_divergent_priors;
596 bool may_include_deletes_in_missing_dirty = false;
597
  // widen the dirty range so all keys <= to will be rewritten
  void mark_dirty_to(eversion_t to) {
    if (to > dirty_to)
      dirty_to = to;
  }
  // widen the dirty range so all keys >= from will be rewritten
  void mark_dirty_from(eversion_t from) {
    if (from < dirty_from)
      dirty_from = from;
  }
  // lower the writeout boundary so all keys >= from will be written
  void mark_writeout_from(eversion_t from) {
    if (from < writeout_from)
      writeout_from = from;
  }
  // widen the dup-entry dirty range so all dups <= to will be rewritten
  void mark_dirty_to_dups(eversion_t to) {
    if (to > dirty_to_dups)
      dirty_to_dups = to;
  }
  // widen the dup-entry dirty range so all dups >= from will be rewritten
  void mark_dirty_from_dups(eversion_t from) {
    if (from < dirty_from_dups)
      dirty_from_dups = from;
  }
618 public:
  /// true if the log must be persisted (never written yet, or dirty)
  bool needs_write() const {
    return !touched_log || is_dirty();
  }
622
  /// true if any in-memory log/missing/dup state differs from disk
  /// (any dirty marker moved off its neutral value, or pending trims)
  bool is_dirty() const {
    return dirty_log ||
      (dirty_to != eversion_t()) ||
      (dirty_from != eversion_t::max()) ||
      (writeout_from != eversion_t::max()) ||
      !(trimmed.empty()) ||
      !missing.is_clean() ||
      !(trimmed_dups.empty()) ||
      (dirty_to_dups != eversion_t()) ||
      (dirty_from_dups != eversion_t::max()) ||
      (write_from_dups != eversion_t::max()) ||
      may_include_deletes_in_missing_dirty;
  }
636
  // mark the entire key range (log and dups) dirty and clear touched_log
  // so needs_write() reports true: the next writeout rewrites everything
  void mark_log_for_rewrite() {
    mark_dirty_to(eversion_t::max());
    mark_dirty_from(eversion_t());
    mark_dirty_to_dups(eversion_t::max());
    mark_dirty_from_dups(eversion_t());
    touched_log = false;
  }
  /// whether the may_include_deletes flag still needs to be persisted
  bool get_may_include_deletes_in_missing_dirty() const {
    return may_include_deletes_in_missing_dirty;
  }
647 protected:
648
649 /// DEBUG
650 set<string> log_keys_debug;
651 static void clear_after(set<string> *log_keys_debug, const string &lb) {
652 if (!log_keys_debug)
653 return;
654 for (set<string>::iterator i = log_keys_debug->lower_bound(lb);
655 i != log_keys_debug->end();
656 log_keys_debug->erase(i++));
657 }
658 static void clear_up_to(set<string> *log_keys_debug, const string &ub) {
659 if (!log_keys_debug)
660 return;
661 for (set<string>::iterator i = log_keys_debug->begin();
662 i != log_keys_debug->end() && *i < ub;
663 log_keys_debug->erase(i++));
664 }
665
666 void check();
  // reset every dirty marker to its neutral value after a writeout;
  // check() is a debug consistency pass (defined elsewhere), and
  // missing.flush() clears the missing tracker's pending-change state
  void undirty() {
    dirty_to = eversion_t();
    dirty_from = eversion_t::max();
    touched_log = true;
    dirty_log = false;
    trimmed.clear();
    trimmed_dups.clear();
    writeout_from = eversion_t::max();
    check();
    missing.flush();
    dirty_to_dups = eversion_t();
    dirty_from_dups = eversion_t::max();
    write_from_dups = eversion_t::max();
  }
681 public:
682
683 // cppcheck-suppress noExplicitConstructor
  /// Construct with all dirty markers neutral. pg_log_debug is
  /// equivalent to (!cct || osd_debug_pg_log_writeout): debug checking
  /// is enabled when there is no context or the conf option is set.
  PGLog(CephContext *cct) :
    dirty_from(eversion_t::max()),
    writeout_from(eversion_t::max()),
    dirty_from_dups(eversion_t::max()),
    write_from_dups(eversion_t::max()),
    cct(cct),
    pg_log_debug(!(cct && !(cct->_conf->osd_debug_pg_log_writeout))),
    touched_log(false),
    dirty_log(false),
    clear_divergent_priors(false)
  { }
695
696 void reset_backfill();
697
698 void clear();
699
700 //////////////////// get or set missing ////////////////////
701
  /// read-only access to the missing set
  const pg_missing_tracker_t& get_missing() const { return missing; }
703
  /// record oid as missing: we need `need` and have `have`
  void missing_add(const hobject_t& oid, eversion_t need, eversion_t have, bool is_delete=false) {
    missing.add(oid, need, have, is_delete);
  }
707
  /// update the missing set from the next log entry e
  void missing_add_next_entry(const pg_log_entry_t& e) {
    missing.add_next_event(e);
  }
711
712 //////////////////// get or set log ////////////////////
713
  /// read-only access to the indexed log
  const IndexedLog &get_log() const { return log; }
715
  /// oldest version covered by the log
  const eversion_t &get_tail() const { return log.tail; }
717
  /// overwrite the log tail (caller is responsible for consistency)
  void set_tail(eversion_t tail) { log.tail = tail; }
719
  /// newest version in the log
  const eversion_t &get_head() const { return log.head; }
721
  /// overwrite the log head (caller is responsible for consistency)
  void set_head(eversion_t head) { log.head = head; }
723
  /// set the recovery cursor: last object version requested by primary
  void set_last_requested(version_t last_requested) {
    log.last_requested = last_requested;
  }
727
  /// (re)build all in-memory log indexes
  void index() { log.index(); }
729
  /// drop all in-memory log indexes
  void unindex() { log.unindex(); }
731
  /// append entry e to the log and mark its version for writeout
  void add(const pg_log_entry_t& e, bool applied = true) {
    mark_writeout_from(e.version);
    log.add(e, applied);
  }
736
  /// reset complete_to / last_requested recovery cursors
  void reset_recovery_pointers() { log.reset_recovery_pointers(); }
738
739 static void clear_info_log(
740 spg_t pgid,
741 ObjectStore::Transaction *t);
742
743 void trim(
744 eversion_t trim_to,
745 pg_info_t &info,
746 bool transaction_applied = true,
747 bool async = false);
748
  // advance the rollback boundary to roll_forward_to, notifying h of
  // each entry passed; marks the log dirty if the boundary moved
  void roll_forward_to(
    eversion_t roll_forward_to,
    LogEntryHandler *h) {
    if (log.roll_forward_to(
          roll_forward_to,
          h))
      dirty_log = true;
  }
757
  /// current rollback boundary of the log
  eversion_t get_can_rollback_to() const {
    return log.get_can_rollback_to();
  }
761
  /// roll forward all the way to the log head
  void roll_forward(LogEntryHandler *h) {
    roll_forward_to(
      log.head,
      h);
  }
767
  /// move the rollback boundary to head without handler callbacks
  void skip_rollforward() {
    log.skip_can_rollback_to_to_head();
  }
771
772 //////////////////// get or set log & missing ////////////////////
773
  // adopt o as our log: trim all rollback info first (via h), clear the
  // missing set, and mark the whole key range dirty for rewrite
  void reset_backfill_claim_log(const pg_log_t &o, LogEntryHandler *h) {
    log.trim_rollback_info_to(log.head, h);
    log.claim_log_and_clear_rollback_info(o);
    missing.clear();
    mark_dirty_to(eversion_t::max());
    mark_dirty_to_dups(eversion_t::max());
  }
781
  // split the child PG's share of log and missing into opg_log; both
  // logs are marked for a full rewrite, and the may-include-deletes
  // property is propagated to the child
  void split_into(
    pg_t child_pgid,
    unsigned split_bits,
    PGLog *opg_log) {
    log.split_out_child(child_pgid, split_bits, &opg_log->log);
    missing.split_into(child_pgid, split_bits, &(opg_log->missing));
    opg_log->mark_dirty_to(eversion_t::max());
    opg_log->mark_dirty_to_dups(eversion_t::max());
    mark_dirty_to(eversion_t::max());
    mark_dirty_to_dups(eversion_t::max());
    if (missing.may_include_deletes) {
      opg_log->set_missing_may_contain_deletes();
    }
  }
796
  // merge the logs of sources into this one up to last_update; indexes
  // and missing are rebuilt/cleared and a full rewrite is scheduled
  void merge_from(
    const vector<PGLog*>& sources,
    eversion_t last_update) {
    unindex();
    missing.clear();

    vector<pg_log_t*> slogs;
    for (auto s : sources) {
      slogs.push_back(&s->log);
    }
    log.merge_from(slogs, last_update);

    index();

    mark_log_for_rewrite();
  }
813
  // record that oid was recovered at version v: drop it from missing,
  // update the missing-object count, and advance complete_to /
  // info.last_complete past entries the oldest remaining need allows
  void recover_got(hobject_t oid, eversion_t v, pg_info_t &info) {
    if (missing.is_missing(oid, v)) {
      missing.got(oid, v);
      info.stats.stats.sum.num_objects_missing = missing.num_missing();

      // raise last_complete?
      if (missing.get_items().empty()) {
        log.complete_to = log.log.end();
        info.last_complete = info.last_update;
      }
      auto oldest_need = missing.get_oldest_need();
      while (log.complete_to != log.log.end()) {
        if (oldest_need <= log.complete_to->version)
          break;
        if (info.last_complete < log.complete_to->version)
          info.last_complete = log.complete_to->version;
        ++log.complete_to;
      }
    }

    ceph_assert(log.get_can_rollback_to() >= v);
  }
836
  // point log.complete_to at the first entry the oldest missing item
  // still needs, and (when info is non-null) set info->last_complete to
  // the version of the entry just before it, or zero if none precede it
  void reset_complete_to(pg_info_t *info) {
    if (log.log.empty()) // caller is split_into()
      return;
    log.complete_to = log.log.begin();
    ceph_assert(log.complete_to != log.log.end());
    auto oldest_need = missing.get_oldest_need();
    if (oldest_need != eversion_t()) {
      while (log.complete_to->version < oldest_need) {
        ++log.complete_to;
        ceph_assert(log.complete_to != log.log.end());
      }
    }
    if (!info)
      return;
    if (log.complete_to == log.log.begin()) {
      info->last_complete = eversion_t();
    } else {
      --log.complete_to;
      info->last_complete = log.complete_to->version;
      ++log.complete_to;
    }
  }
859
  // restart recovery bookkeeping: recompute complete_to/last_complete
  // and reset the last-requested cursor
  void activate_not_complete(pg_info_t &info) {
    reset_complete_to(&info);
    log.last_requested = 0;
  }
864
865 void proc_replica_log(pg_info_t &oinfo,
866 const pg_log_t &olog,
867 pg_missing_t& omissing, pg_shard_t from) const;
868
  // allow delete entries in the missing set, and remember that this
  // flag change still needs to be persisted
  void set_missing_may_contain_deletes() {
    missing.may_include_deletes = true;
    may_include_deletes_in_missing_dirty = true;
  }
873
874 void rebuild_missing_set_with_deletes(ObjectStore *store,
875 ObjectStore::CollectionHandle& ch,
876 const pg_info_t &info);
877
878 protected:
  // partition entries into per-object lists keyed by soid; entries is
  // drained via splice (no copies) and is empty on return
  static void split_by_object(
    mempool::osd_pglog::list<pg_log_entry_t> &entries,
    map<hobject_t, mempool::osd_pglog::list<pg_log_entry_t>> *out_entries) {
    while (!entries.empty()) {
      auto &out_list = (*out_entries)[entries.front().soid];
      out_list.splice(out_list.end(), entries, entries.begin());
    }
  }
887
888 /**
889 * _merge_object_divergent_entries
890 *
891 * There are 5 distinct cases:
892 * 1) There is a more recent update: in this case we assume we adjusted the
893 * store and missing during merge_log
894 * 2) The first entry in the divergent sequence is a create. This might
895 * either be because the object is a clone or because prior_version is
896 * eversion_t(). In this case the object does not exist and we must
897 * adjust missing and the store to match.
898 * 3) We are currently missing the object. In this case, we adjust the
899 * missing to our prior_version taking care to add a divergent_prior
900 * if necessary
901 * 4) We can rollback all of the entries. In this case, we do so using
902 * the rollbacker and return -- the object does not go into missing.
903 * 5) We cannot rollback at least 1 of the entries. In this case, we
904 * clear the object out of the store and add a missing entry at
905 * prior_version taking care to add a divergent_prior if
906 * necessary.
907 */
908 template <typename missing_type>
909 static void _merge_object_divergent_entries(
910 const IndexedLog &log, ///< [in] log to merge against
911 const hobject_t &hoid, ///< [in] object we are merging
912 const mempool::osd_pglog::list<pg_log_entry_t> &orig_entries, ///< [in] entries for hoid to merge
913 const pg_info_t &info, ///< [in] info for merging entries
914 eversion_t olog_can_rollback_to, ///< [in] rollback boundary of input InedexedLog
915 missing_type &missing, ///< [in,out] missing to adjust, use
916 LogEntryHandler *rollbacker, ///< [in] optional rollbacker object
917 const DoutPrefixProvider *dpp ///< [in] logging provider
918 ) {
919 ldpp_dout(dpp, 20) << __func__ << ": merging hoid " << hoid
920 << " entries: " << orig_entries << dendl;
921
922 if (hoid > info.last_backfill) {
923 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid << " after last_backfill"
924 << dendl;
925 return;
926 }
927
928 // entries is non-empty
929 ceph_assert(!orig_entries.empty());
930 // strip out and ignore ERROR entries
931 mempool::osd_pglog::list<pg_log_entry_t> entries;
932 eversion_t last;
933 bool seen_non_error = false;
934 for (list<pg_log_entry_t>::const_iterator i = orig_entries.begin();
935 i != orig_entries.end();
936 ++i) {
937 // all entries are on hoid
938 ceph_assert(i->soid == hoid);
939 // did not see error entries before this entry and this entry is not error
940 // then this entry is the first non error entry
941 bool first_non_error = ! seen_non_error && ! i->is_error();
942 if (! i->is_error() ) {
943 // see a non error entry now
944 seen_non_error = true;
945 }
946
947 // No need to check the first entry since it prior_version is unavailable
948 // in the list
949 // No need to check if the prior_version is the minimal version
950 // No need to check the first non-error entry since the leading error
951 // entries are not its prior version
952 if (i != orig_entries.begin() && i->prior_version != eversion_t() &&
953 ! first_non_error) {
954 // in increasing order of version
955 ceph_assert(i->version > last);
956 // prior_version correct (unless it is an ERROR entry)
957 ceph_assert(i->prior_version == last || i->is_error());
958 }
959 if (i->is_error()) {
960 ldpp_dout(dpp, 20) << __func__ << ": ignoring " << *i << dendl;
961 } else {
962 ldpp_dout(dpp, 20) << __func__ << ": keeping " << *i << dendl;
963 entries.push_back(*i);
964 last = i->version;
965 }
966 }
967 if (entries.empty()) {
968 ldpp_dout(dpp, 10) << __func__ << ": no non-ERROR entries" << dendl;
969 return;
970 }
971
972 const eversion_t prior_version = entries.begin()->prior_version;
973 const eversion_t first_divergent_update = entries.begin()->version;
974 const eversion_t last_divergent_update = entries.rbegin()->version;
975 const bool object_not_in_store =
976 !missing.is_missing(hoid) &&
977 entries.rbegin()->is_delete();
978 ldpp_dout(dpp, 10) << __func__ << ": hoid " << " object_not_in_store: "
979 << object_not_in_store << dendl;
980 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
981 << " prior_version: " << prior_version
982 << " first_divergent_update: " << first_divergent_update
983 << " last_divergent_update: " << last_divergent_update
984 << dendl;
985
986 ceph::unordered_map<hobject_t, pg_log_entry_t*>::const_iterator objiter =
987 log.objects.find(hoid);
988 if (objiter != log.objects.end() &&
989 objiter->second->version >= first_divergent_update) {
990 /// Case 1)
991 ldpp_dout(dpp, 10) << __func__ << ": more recent entry found: "
992 << *objiter->second << ", already merged" << dendl;
993
994 ceph_assert(objiter->second->version > last_divergent_update);
995
996 // ensure missing has been updated appropriately
997 if (objiter->second->is_update() ||
998 (missing.may_include_deletes && objiter->second->is_delete())) {
999 ceph_assert(missing.is_missing(hoid) &&
1000 missing.get_items().at(hoid).need == objiter->second->version);
1001 } else {
1002 ceph_assert(!missing.is_missing(hoid));
1003 }
1004 missing.revise_have(hoid, eversion_t());
1005 missing.mark_fully_dirty(hoid);
1006 if (rollbacker) {
1007 if (!object_not_in_store) {
1008 rollbacker->remove(hoid);
1009 }
1010 for (auto &&i: entries) {
1011 rollbacker->trim(i);
1012 }
1013 }
1014 return;
1015 }
1016
1017 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
1018 <<" has no more recent entries in log" << dendl;
1019 if (prior_version == eversion_t() || entries.front().is_clone()) {
1020 /// Case 2)
1021 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
1022 << " prior_version or op type indicates creation,"
1023 << " deleting"
1024 << dendl;
1025 if (missing.is_missing(hoid))
1026 missing.rm(missing.get_items().find(hoid));
1027 if (rollbacker) {
1028 if (!object_not_in_store) {
1029 rollbacker->remove(hoid);
1030 }
1031 for (auto &&i: entries) {
1032 rollbacker->trim(i);
1033 }
1034 }
1035 return;
1036 }
1037
1038 if (missing.is_missing(hoid)) {
1039 /// Case 3)
1040 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
1041 << " missing, " << missing.get_items().at(hoid)
1042 << " adjusting" << dendl;
1043
1044 if (missing.get_items().at(hoid).have == prior_version) {
1045 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
1046 << " missing.have is prior_version " << prior_version
1047 << " removing from missing" << dendl;
1048 missing.rm(missing.get_items().find(hoid));
1049 } else {
1050 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
1051 << " missing.have is " << missing.get_items().at(hoid).have
1052 << ", adjusting" << dendl;
1053 missing.revise_need(hoid, prior_version, false);
1054 if (prior_version <= info.log_tail) {
1055 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
1056 << " prior_version " << prior_version
1057 << " <= info.log_tail "
1058 << info.log_tail << dendl;
1059 }
1060 }
1061 if (rollbacker) {
1062 for (auto &&i: entries) {
1063 rollbacker->trim(i);
1064 }
1065 }
1066 return;
1067 }
1068
1069 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
1070 << " must be rolled back or recovered,"
1071 << " attempting to rollback"
1072 << dendl;
1073 bool can_rollback = true;
1074 // We are going to make an important decision based on the
1075 // olog_can_rollback_to value we have received, better known it.
1076 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
1077 << " olog_can_rollback_to: "
1078 << olog_can_rollback_to << dendl;
1079 /// Distinguish between 4) and 5)
1080 for (list<pg_log_entry_t>::const_reverse_iterator i = entries.rbegin();
1081 i != entries.rend();
1082 ++i) {
1083 if (!i->can_rollback() || i->version <= olog_can_rollback_to) {
1084 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid << " cannot rollback "
1085 << *i << dendl;
1086 can_rollback = false;
1087 break;
1088 }
1089 }
1090
1091 if (can_rollback) {
1092 /// Case 4)
1093 for (list<pg_log_entry_t>::const_reverse_iterator i = entries.rbegin();
1094 i != entries.rend();
1095 ++i) {
1096 ceph_assert(i->can_rollback() && i->version > olog_can_rollback_to);
1097 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
1098 << " rolling back " << *i << dendl;
1099 if (rollbacker)
1100 rollbacker->rollback(*i);
1101 }
1102 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
1103 << " rolled back" << dendl;
1104 return;
1105 } else {
1106 /// Case 5)
1107 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid << " cannot roll back, "
1108 << "removing and adding to missing" << dendl;
1109 if (rollbacker) {
1110 if (!object_not_in_store)
1111 rollbacker->remove(hoid);
1112 for (auto &&i: entries) {
1113 rollbacker->trim(i);
1114 }
1115 }
1116 missing.add(hoid, prior_version, eversion_t(), false);
1117 if (prior_version <= info.log_tail) {
1118 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
1119 << " prior_version " << prior_version
1120 << " <= info.log_tail "
1121 << info.log_tail << dendl;
1122 }
1123 }
1124 }
1125
1126 /// Merge all entries using above
1127 template <typename missing_type>
1128 static void _merge_divergent_entries(
1129 const IndexedLog &log, ///< [in] log to merge against
1130 mempool::osd_pglog::list<pg_log_entry_t> &entries, ///< [in] entries to merge
1131 const pg_info_t &oinfo, ///< [in] info for merging entries
1132 eversion_t olog_can_rollback_to, ///< [in] rollback boundary of input IndexedLog
1133 missing_type &omissing, ///< [in,out] missing to adjust, use
1134 LogEntryHandler *rollbacker, ///< [in] optional rollbacker object
1135 const DoutPrefixProvider *dpp ///< [in] logging provider
1136 ) {
1137 map<hobject_t, mempool::osd_pglog::list<pg_log_entry_t> > split;
1138 split_by_object(entries, &split);
1139 for (map<hobject_t, mempool::osd_pglog::list<pg_log_entry_t>>::iterator i = split.begin();
1140 i != split.end();
1141 ++i) {
1142 _merge_object_divergent_entries(
1143 log,
1144 i->first,
1145 i->second,
1146 oinfo,
1147 olog_can_rollback_to,
1148 omissing,
1149 rollbacker,
1150 dpp);
1151 }
1152 }
1153
1154 /**
1155 * Exists for use in TestPGLog for simply testing single divergent log
1156 * cases
1157 */
1158 void merge_old_entry(
1159 ObjectStore::Transaction& t,
1160 const pg_log_entry_t& oe,
1161 const pg_info_t& info,
1162 LogEntryHandler *rollbacker) {
1163 mempool::osd_pglog::list<pg_log_entry_t> entries;
1164 entries.push_back(oe);
1165 _merge_object_divergent_entries(
1166 log,
1167 oe.soid,
1168 entries,
1169 info,
1170 log.get_can_rollback_to(),
1171 missing,
1172 rollbacker,
1173 this);
1174 }
1175
  /// Merge olog's dup entries into our dup list (defined in PGLog.cc).
  /// NOTE(review): the bool return presumably reports whether our dups
  /// changed (i.e. need persisting) — confirm against the definition.
  bool merge_log_dups(const pg_log_t& olog);

public:

  /// Rewind our log so newhead becomes the head, handling the discarded
  /// (divergent) entries via rollbacker; defined in PGLog.cc.
  void rewind_divergent_log(eversion_t newhead,
			    pg_info_t &info,
			    LogEntryHandler *rollbacker,
			    bool &dirty_info,
			    bool &dirty_big_info);

  /// Merge an authoritative peer log (olog, from 'from') into ours,
  /// updating info and flagging what must be persisted; defined in PGLog.cc.
  void merge_log(pg_info_t &oinfo,
		 pg_log_t &olog,
		 pg_shard_t from,
		 pg_info_t &info, LogEntryHandler *rollbacker,
		 bool &dirty_info, bool &dirty_big_info);
1191
1192 template <typename missing_type>
1193 static bool append_log_entries_update_missing(
1194 const hobject_t &last_backfill,
1195 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
1196 bool maintain_rollback,
1197 IndexedLog *log,
1198 missing_type &missing,
1199 LogEntryHandler *rollbacker,
1200 const DoutPrefixProvider *dpp) {
1201 bool invalidate_stats = false;
1202 if (log && !entries.empty()) {
1203 ceph_assert(log->head < entries.begin()->version);
1204 }
1205 for (list<pg_log_entry_t>::const_iterator p = entries.begin();
1206 p != entries.end();
1207 ++p) {
1208 invalidate_stats = invalidate_stats || !p->is_error();
1209 if (log) {
1210 ldpp_dout(dpp, 20) << "update missing, append " << *p << dendl;
1211 log->add(*p);
1212 }
1213 if (p->soid <= last_backfill &&
1214 !p->is_error()) {
1215 if (missing.may_include_deletes) {
1216 missing.add_next_event(*p);
1217 } else {
1218 if (p->is_delete()) {
1219 missing.rm(p->soid, p->version);
1220 } else {
1221 missing.add_next_event(*p);
1222 }
1223 if (rollbacker) {
1224 // hack to match PG::mark_all_unfound_lost
1225 if (maintain_rollback && p->is_lost_delete() && p->can_rollback()) {
1226 rollbacker->try_stash(p->soid, p->version.version);
1227 } else if (p->is_delete()) {
1228 rollbacker->remove(p->soid);
1229 }
1230 }
1231 }
1232 }
1233 }
1234 return invalidate_stats;
1235 }
1236 bool append_new_log_entries(
1237 const hobject_t &last_backfill,
1238 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
1239 LogEntryHandler *rollbacker) {
1240 bool invalidate_stats = append_log_entries_update_missing(
1241 last_backfill,
1242 entries,
1243 true,
1244 &log,
1245 missing,
1246 rollbacker,
1247 this);
1248 if (!entries.empty()) {
1249 mark_writeout_from(entries.begin()->version);
1250 if (entries.begin()->is_lost_delete()) {
1251 // hack: since lost deletes queue recovery directly, and don't
1252 // go through activate_not_complete() again, our complete_to
1253 // iterator may still point at log.end(). Reset it to point
1254 // before these new lost_delete entries. This only occurs
1255 // when lost+delete entries are initially added, which is
1256 // always in a list of solely lost_delete entries, so it is
1257 // sufficient to check whether the first entry is a
1258 // lost_delete
1259 reset_complete_to(nullptr);
1260 }
1261 }
1262 return invalidate_stats;
1263 }
1264
  /// Serialize our log + missing into t/km for persistence (member
  /// convenience form; defined in PGLog.cc).
  void write_log_and_missing(
    ObjectStore::Transaction& t,
    map<string,bufferlist> *km,
    const coll_t& coll,
    const ghobject_t &log_oid,
    bool require_rollback);

  /// Serialize a log without a missing set, carrying legacy
  /// divergent_priors instead; defined in PGLog.cc.
  static void write_log_and_missing_wo_missing(
    ObjectStore::Transaction& t,
    map<string,bufferlist>* km,
    pg_log_t &log,
    const coll_t& coll,
    const ghobject_t &log_oid, map<eversion_t, hobject_t> &divergent_priors,
    bool require_rollback);

  /// Serialize an arbitrary log together with its missing tracker;
  /// defined in PGLog.cc.
  static void write_log_and_missing(
    ObjectStore::Transaction& t,
    map<string,bufferlist>* km,
    pg_log_t &log,
    const coll_t& coll,
    const ghobject_t &log_oid,
    const pg_missing_tracker_t &missing,
    bool require_rollback,
    bool *rebuilt_missing_set_with_deletes);

  /// Low-level worker for the wo_missing path: the dirty_*/writeout_*
  /// eversions bound which key ranges are rewritten; defined in PGLog.cc.
  static void _write_log_and_missing_wo_missing(
    ObjectStore::Transaction& t,
    map<string,bufferlist>* km,
    pg_log_t &log,
    const coll_t& coll, const ghobject_t &log_oid,
    map<eversion_t, hobject_t> &divergent_priors,
    eversion_t dirty_to,
    eversion_t dirty_from,
    eversion_t writeout_from,
    bool dirty_divergent_priors,
    bool touch_log,
    bool require_rollback,
    eversion_t dirty_to_dups,
    eversion_t dirty_from_dups,
    eversion_t write_from_dups,
    set<string> *log_keys_debug
    );

  /// Low-level worker for the missing-tracker path; also removes
  /// trimmed/trimmed_dups keys and can clear legacy divergent_priors;
  /// defined in PGLog.cc.
  static void _write_log_and_missing(
    ObjectStore::Transaction& t,
    map<string,bufferlist>* km,
    pg_log_t &log,
    const coll_t& coll, const ghobject_t &log_oid,
    eversion_t dirty_to,
    eversion_t dirty_from,
    eversion_t writeout_from,
    set<eversion_t> &&trimmed,
    set<string> &&trimmed_dups,
    const pg_missing_tracker_t &missing,
    bool touch_log,
    bool require_rollback,
    bool clear_divergent_priors,
    eversion_t dirty_to_dups,
    eversion_t dirty_from_dups,
    eversion_t write_from_dups,
    bool *may_include_deletes_in_missing_dirty,
    set<string> *log_keys_debug
    );
1328
1329 void read_log_and_missing(
1330 ObjectStore *store,
1331 ObjectStore::CollectionHandle& ch,
1332 ghobject_t pgmeta_oid,
1333 const pg_info_t &info,
1334 ostringstream &oss,
1335 bool tolerate_divergent_missing_log,
1336 bool debug_verify_stored_missing = false
1337 ) {
1338 return read_log_and_missing(
1339 store, ch, pgmeta_oid, info,
1340 log, missing, oss,
1341 tolerate_divergent_missing_log,
1342 &clear_divergent_priors,
1343 this,
1344 (pg_log_debug ? &log_keys_debug : nullptr),
1345 debug_verify_stored_missing);
1346 }
1347
  /**
   * Read a PG's log, dup entries, and missing set out of the omap of its
   * pgmeta object, rebuilding the in-memory IndexedLog and missing set.
   *
   * If legacy "divergent_priors" state is found on disk, the missing set
   * is rebuilt from object attrs instead and *clear_divergent_priors is
   * set so the caller can remove the legacy keys.  With
   * debug_verify_stored_missing, the stored missing set is cross-checked
   * against object state rather than rebuilt.
   */
  template <typename missing_type>
  static void read_log_and_missing(
    ObjectStore *store,
    ObjectStore::CollectionHandle &ch,
    ghobject_t pgmeta_oid,
    const pg_info_t &info,
    IndexedLog &log,              ///< [out] rebuilt log
    missing_type &missing,        ///< [out] rebuilt/verified missing set
    ostringstream &oss,           ///< not used by this implementation
    bool tolerate_divergent_missing_log,
    bool *clear_divergent_priors = nullptr,
    const DoutPrefixProvider *dpp = nullptr,
    set<string> *log_keys_debug = nullptr,
    bool debug_verify_stored_missing = false
    ) {
    ldpp_dout(dpp, 20) << "read_log_and_missing coll " << ch->cid
		       << " " << pgmeta_oid << dendl;

    // legacy?
    struct stat st;
    int r = store->stat(ch, pgmeta_oid, &st);
    ceph_assert(r == 0);
    ceph_assert(st.st_size == 0);

    // will get overridden below if it had been recorded
    eversion_t on_disk_can_rollback_to = info.last_update;
    eversion_t on_disk_rollback_info_trimmed_to = eversion_t();
    ObjectMap::ObjectMapIterator p = store->get_omap_iterator(ch,
							      pgmeta_oid);
    map<eversion_t, hobject_t> divergent_priors;
    bool must_rebuild = false;
    missing.may_include_deletes = false;
    list<pg_log_entry_t> entries;
    list<pg_log_dup_t> dups;
    if (p) {
      // Single pass over the omap: dispatch each key to the matching
      // decoded structure (metadata keys, missing_*, dup_*, log entries).
      for (p->seek_to_first(); p->valid() ; p->next()) {
	// non-log pgmeta_oid keys are prefixed with _; skip those
	if (p->key()[0] == '_')
	  continue;
	bufferlist bl = p->value();//Copy bufferlist before creating iterator
	auto bp = bl.cbegin();
	if (p->key() == "divergent_priors") {
	  decode(divergent_priors, bp);
	  ldpp_dout(dpp, 20) << "read_log_and_missing " << divergent_priors.size()
			     << " divergent_priors" << dendl;
	  // Legacy on-disk format: the missing set must be rebuilt below,
	  // and verification against a stored missing set is meaningless.
	  must_rebuild = true;
	  debug_verify_stored_missing = false;
	} else if (p->key() == "can_rollback_to") {
	  decode(on_disk_can_rollback_to, bp);
	} else if (p->key() == "rollback_info_trimmed_to") {
	  decode(on_disk_rollback_info_trimmed_to, bp);
	} else if (p->key() == "may_include_deletes_in_missing") {
	  missing.may_include_deletes = true;
	} else if (p->key().substr(0, 7) == string("missing")) {
	  hobject_t oid;
	  pg_missing_item item;
	  decode(oid, bp);
	  decode(item, bp);
	  ldpp_dout(dpp, 20) << "read_log_and_missing " << item << dendl;
	  if (item.is_delete()) {
	    // delete items are only valid when the format advertises them
	    ceph_assert(missing.may_include_deletes);
	  }
	  missing.add(oid, std::move(item));
	} else if (p->key().substr(0, 4) == string("dup_")) {
	  pg_log_dup_t dup;
	  decode(dup, bp);
	  if (!dups.empty()) {
	    // dup keys must come back in increasing version order
	    ceph_assert(dups.back().version < dup.version);
	  }
	  dups.push_back(dup);
	} else {
	  pg_log_entry_t e;
	  e.decode_with_checksum(bp);
	  ldpp_dout(dpp, 20) << "read_log_and_missing " << e << dendl;
	  if (!entries.empty()) {
	    // log entries must be strictly ordered on disk
	    pg_log_entry_t last_e(entries.back());
	    ceph_assert(last_e.version.version < e.version.version);
	    ceph_assert(last_e.version.epoch <= e.version.epoch);
	  }
	  entries.push_back(e);
	  if (log_keys_debug)
	    log_keys_debug->insert(e.get_key_name());
	}
      }
    }
    log = IndexedLog(
      info.last_update,
      info.log_tail,
      on_disk_can_rollback_to,
      on_disk_rollback_info_trimmed_to,
      std::move(entries),
      std::move(dups));

    if (must_rebuild || debug_verify_stored_missing) {
      // build missing
      if (debug_verify_stored_missing || info.last_complete < info.last_update) {
	ldpp_dout(dpp, 10)
	  << "read_log_and_missing checking for missing items over interval ("
	  << info.last_complete
	  << "," << info.last_update << "]" << dendl;

	set<hobject_t> did;
	set<hobject_t> checked;
	set<hobject_t> skipped;
	// Walk the log newest-first; the first (most recent) entry per
	// object decides whether that object is missing.
	for (list<pg_log_entry_t>::reverse_iterator i = log.log.rbegin();
	     i != log.log.rend();
	     ++i) {
	  if (i->soid > info.last_backfill)
	    continue;
	  if (i->is_error())
	    continue;
	  if (did.count(i->soid)) continue;
	  did.insert(i->soid);

	  if (!missing.may_include_deletes && i->is_delete())
	    continue;

	  bufferlist bv;
	  int r = store->getattr(
	    ch,
	    ghobject_t(i->soid, ghobject_t::NO_GEN, info.pgid.shard),
	    OI_ATTR,
	    bv);
	  if (r >= 0) {
	    // Object exists on disk: missing iff its version is older
	    // than the log's most recent entry for it.
	    object_info_t oi(bv);
	    if (oi.version < i->version) {
	      ldpp_dout(dpp, 15) << "read_log_and_missing missing " << *i
			   << " (have " << oi.version << ")"
			   << " clean_regions " << i->clean_regions << dendl;

	      if (debug_verify_stored_missing) {
		auto miter = missing.get_items().find(i->soid);
		ceph_assert(miter != missing.get_items().end());
		ceph_assert(miter->second.need == i->version);
		// the 'have' version is reset if an object is deleted,
		// then created again
		ceph_assert(miter->second.have == oi.version || miter->second.have == eversion_t());
		checked.insert(i->soid);
	      } else {
		missing.add(i->soid, i->version, oi.version, i->is_delete());
	      }
	    }
	  } else {
	    // Object absent on disk entirely.
	    ldpp_dout(dpp, 15) << "read_log_and_missing missing " << *i << dendl;
	    if (debug_verify_stored_missing) {
	      auto miter = missing.get_items().find(i->soid);
	      if (i->is_delete()) {
		ceph_assert(miter == missing.get_items().end() ||
		       (miter->second.need == i->version &&
			miter->second.have == eversion_t()));
	      } else {
		ceph_assert(miter != missing.get_items().end());
		ceph_assert(miter->second.need == i->version);
		ceph_assert(miter->second.have == eversion_t());
	      }
	      checked.insert(i->soid);
	    } else {
	      missing.add(i->soid, i->version, eversion_t(), i->is_delete());
	    }
	  }
	}
	if (debug_verify_stored_missing) {
	  // Every stored missing item not confirmed by the log walk must
	  // still be consistent with on-disk object state.
	  for (auto &&i: missing.get_items()) {
	    if (checked.count(i.first))
	      continue;
	    if (i.first > info.last_backfill) {
	      ldpp_dout(dpp, -1) << __func__ << ": invalid missing set entry "
				 << "found before last_backfill: "
				 << i.first << " " << i.second
				 << " last_backfill = " << info.last_backfill
				 << dendl;
	      ceph_abort_msg("invalid missing set entry found");
	    }
	    bufferlist bv;
	    int r = store->getattr(
	      ch,
	      ghobject_t(i.first, ghobject_t::NO_GEN, info.pgid.shard),
	      OI_ATTR,
	      bv);
	    if (r >= 0) {
	      object_info_t oi(bv);
	      ceph_assert(oi.version == i.second.have || eversion_t() == i.second.have);
	    } else {
	      ceph_assert(i.second.is_delete() || eversion_t() == i.second.have);
	    }
	  }
	} else {
	  ceph_assert(must_rebuild);
	  // Legacy path: fold divergent_priors (newest-first) into the
	  // missing set for objects the log walk did not cover.
	  for (map<eversion_t, hobject_t>::reverse_iterator i =
		 divergent_priors.rbegin();
	       i != divergent_priors.rend();
	       ++i) {
	    if (i->first <= info.last_complete) break;
	    if (i->second > info.last_backfill)
	      continue;
	    if (did.count(i->second)) continue;
	    did.insert(i->second);
	    bufferlist bv;
	    int r = store->getattr(
	      ch,
	      ghobject_t(i->second, ghobject_t::NO_GEN, info.pgid.shard),
	      OI_ATTR,
	      bv);
	    if (r >= 0) {
	      object_info_t oi(bv);
	      /**
	       * 1) we see this entry in the divergent priors mapping
	       * 2) we didn't see an entry for this object in the log
	       *
	       * From 1 & 2 we know that either the object does not exist
	       * or it is at the version specified in the divergent_priors
	       * map since the object would have been deleted atomically
	       * with the addition of the divergent_priors entry, an older
	       * version would not have been recovered, and a newer version
	       * would show up in the log above.
	       */
	      /**
	       * Unfortunately the assessment above is incorrect because of
	       * http://tracker.ceph.com/issues/17916 (we were incorrectly
	       * not removing the divergent_priors set from disk state!),
	       * so let's check that.
	       */
	      if (oi.version > i->first && tolerate_divergent_missing_log) {
		ldpp_dout(dpp, 0) << "read_log divergent_priors entry (" << *i
				  << ") inconsistent with disk state (" << oi
				  << "), assuming it is tracker.ceph.com/issues/17916"
				  << dendl;
	      } else {
		ceph_assert(oi.version == i->first);
	      }
	    } else {
	      ldpp_dout(dpp, 15) << "read_log_and_missing missing " << *i << dendl;
	      missing.add(i->second, i->first, eversion_t(), false);
	    }
	  }
	}
	if (clear_divergent_priors)
	  (*clear_divergent_priors) = true;
      }
    }

    if (!must_rebuild) {
      if (clear_divergent_priors)
	(*clear_divergent_priors) = false;
      missing.flush();
    }
    ldpp_dout(dpp, 10) << "read_log_and_missing done" << dendl;
  } // static read_log_and_missing
1596
1597 #ifdef WITH_SEASTAR
  /// Member convenience wrapper: load this PG's log and missing set via
  /// the crimson (seastar) store interface, forwarding our own
  /// log/missing members to the static overload below.
  seastar::future<> read_log_and_missing_crimson(
    crimson::os::FuturizedStore &store,
    crimson::os::CollectionRef ch,
    const pg_info_t &info,
    ghobject_t pgmeta_oid
    ) {
    return read_log_and_missing_crimson(
      store, ch, info,
      log, missing, pgmeta_oid,
      this);
  }
1609
  /**
   * Asynchronous reader of a PG's log/dups/missing from the pgmeta
   * object's omap via the crimson FuturizedStore interface.
   *
   * Lifetime: instances are expected to be heap-allocated by the caller
   * (see read_log_and_missing_crimson below); start() takes ownership of
   * `this` via a unique_ptr captured in the final continuation, so the
   * object lives until the returned future resolves and is then deleted.
   */
  template <typename missing_type>
  struct FuturizedStoreLogReader {
    crimson::os::FuturizedStore &store;
    crimson::os::CollectionRef ch;
    const pg_info_t &info;
    IndexedLog &log;          // populated at the end of start()
    missing_type &missing;    // populated incrementally by process_entry()
    ghobject_t pgmeta_oid;
    const DoutPrefixProvider *dpp;

    // will be overridden from omap keys if they were recorded
    eversion_t on_disk_can_rollback_to;
    eversion_t on_disk_rollback_info_trimmed_to;

    std::map<eversion_t, hobject_t> divergent_priors;
    bool must_rebuild = false;
    // decoded log entries / dup entries, moved into `log` at the end
    std::list<pg_log_entry_t> entries;
    std::list<pg_log_dup_t> dups;

    // resume cursor for paged omap_get_values
    // NOTE(review): `next` is never advanced here — paging appears to rely
    // on the store returning done==true (or all remaining keys) in one
    // batch; verify against FuturizedStore::omap_get_values semantics.
    std::optional<std::string> next;

    // Dispatch a single omap key/value pair to the structure it encodes;
    // mirrors the key handling in the classic read_log_and_missing above.
    void process_entry(const std::pair<std::string, ceph::bufferlist> &p) {
      // non-log pgmeta keys are prefixed with _; skip those
      if (p.first[0] == '_')
	return;
      ceph::bufferlist bl = p.second;//Copy bufferlist before creating iterator
      auto bp = bl.cbegin();
      if (p.first == "divergent_priors") {
	decode(divergent_priors, bp);
	ldpp_dout(dpp, 20) << "read_log_and_missing " << divergent_priors.size()
			   << " divergent_priors" << dendl;
	// legacy format must never appear in a crimson-managed store
	ceph_assert("crimson shouldn't have had divergent_priors" == 0);
      } else if (p.first == "can_rollback_to") {
	decode(on_disk_can_rollback_to, bp);
      } else if (p.first == "rollback_info_trimmed_to") {
	decode(on_disk_rollback_info_trimmed_to, bp);
      } else if (p.first == "may_include_deletes_in_missing") {
	missing.may_include_deletes = true;
      } else if (p.first.substr(0, 7) == string("missing")) {
	hobject_t oid;
	pg_missing_item item;
	decode(oid, bp);
	decode(item, bp);
	if (item.is_delete()) {
	  // delete items only valid when the format advertises them
	  ceph_assert(missing.may_include_deletes);
	}
	missing.add(oid, std::move(item));
      } else if (p.first.substr(0, 4) == string("dup_")) {
	pg_log_dup_t dup;
	decode(dup, bp);
	if (!dups.empty()) {
	  // dup keys must come back in increasing version order
	  ceph_assert(dups.back().version < dup.version);
	}
	dups.push_back(dup);
      } else {
	pg_log_entry_t e;
	e.decode_with_checksum(bp);
	ldpp_dout(dpp, 20) << "read_log_and_missing " << e << dendl;
	if (!entries.empty()) {
	  // log entries must be strictly ordered on disk
	  pg_log_entry_t last_e(entries.back());
	  ceph_assert(last_e.version.version < e.version.version);
	  ceph_assert(last_e.version.epoch <= e.version.epoch);
	}
	entries.push_back(e);
      }
    }


    /// Drive the paged omap read; on completion, assemble the IndexedLog.
    /// Takes ownership of `this` (must be heap-allocated — see struct doc).
    seastar::future<> start() {
      // will get overridden if recorded
      on_disk_can_rollback_to = info.last_update;
      missing.may_include_deletes = false;

      // Own ourselves until the final continuation runs, then delete.
      auto reader = std::unique_ptr<FuturizedStoreLogReader>(this);
      return seastar::repeat(
	[this]() {
	  return store.omap_get_values(ch, pgmeta_oid, next).then(
	    [this](
	      bool done, crimson::os::FuturizedStore::omap_values_t values) {
	      for (auto &&p : values) {
		process_entry(p);
	      }
	      return done ? seastar::stop_iteration::yes
			  : seastar::stop_iteration::no;
	    });
	}).then([this, reader{std::move(reader)}]() {
	  log = IndexedLog(
	    info.last_update,
	    info.log_tail,
	    on_disk_can_rollback_to,
	    on_disk_rollback_info_trimmed_to,
	    std::move(entries),
	    std::move(dups));
	  return seastar::now();
	});
    }
  };
1705
  /// Static crimson entry point: read log/dups/missing for a PG from its
  /// pgmeta object.  The FuturizedStoreLogReader is heap-allocated here
  /// and deletes itself when the read completes — start() wraps `this` in
  /// a unique_ptr captured by the final continuation, so the raw `new`
  /// below is not a leak.
  template <typename missing_type>
  static seastar::future<> read_log_and_missing_crimson(
    crimson::os::FuturizedStore &store,
    crimson::os::CollectionRef ch,
    const pg_info_t &info,
    IndexedLog &log,
    missing_type &missing,
    ghobject_t pgmeta_oid,
    const DoutPrefixProvider *dpp = nullptr
    ) {
    ldpp_dout(dpp, 20) << "read_log_and_missing coll "
		       << ch->get_cid()
		       << " " << pgmeta_oid << dendl;
    return (new FuturizedStoreLogReader<missing_type>{
	store, ch, info, log, missing, pgmeta_oid, dpp})->start();
  }
1722
1723 #endif
1724
1725 }; // struct PGLog