]> git.proxmox.com Git - ceph.git/blob - ceph/src/osd/PGLog.h
update sources to v12.1.0
[ceph.git] / ceph / src / osd / PGLog.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17 #ifndef CEPH_PG_LOG_H
18 #define CEPH_PG_LOG_H
19
20 // re-include our assert to clobber boost's
21 #include "include/assert.h"
22 #include "osd_types.h"
23 #include "os/ObjectStore.h"
24 #include <list>
25 using namespace std;
26
27 #define PGLOG_INDEXED_OBJECTS (1 << 0)
28 #define PGLOG_INDEXED_CALLER_OPS (1 << 1)
29 #define PGLOG_INDEXED_EXTRA_CALLER_OPS (1 << 2)
30 #define PGLOG_INDEXED_ALL (PGLOG_INDEXED_OBJECTS | PGLOG_INDEXED_CALLER_OPS | PGLOG_INDEXED_EXTRA_CALLER_OPS)
31
32 class CephContext;
33
34 struct PGLog : DoutPrefixProvider {
35 DoutPrefixProvider *prefix_provider;
36 string gen_prefix() const override {
37 return prefix_provider ? prefix_provider->gen_prefix() : "";
38 }
39 unsigned get_subsys() const override {
40 return prefix_provider ? prefix_provider->get_subsys() :
41 (unsigned)ceph_subsys_osd;
42 }
43 CephContext *get_cct() const override {
44 return cct;
45 }
46
47 ////////////////////////////// sub classes //////////////////////////////
48 struct LogEntryHandler {
49 virtual void rollback(
50 const pg_log_entry_t &entry) = 0;
51 virtual void rollforward(
52 const pg_log_entry_t &entry) = 0;
53 virtual void trim(
54 const pg_log_entry_t &entry) = 0;
55 virtual void remove(
56 const hobject_t &hoid) = 0;
57 virtual void try_stash(
58 const hobject_t &hoid,
59 version_t v) = 0;
60 virtual ~LogEntryHandler() {}
61 };
62
63 /* Exceptions */
64 class read_log_and_missing_error : public buffer::error {
65 public:
66 explicit read_log_and_missing_error(const char *what) {
67 snprintf(buf, sizeof(buf), "read_log_and_missing_error: %s", what);
68 }
69 const char *what() const throw () override {
70 return buf;
71 }
72 private:
73 char buf[512];
74 };
75
76 public:
77 /**
78 * IndexedLog - adds an in-memory index of the log, by oid,
79 * plus some methods to manipulate it all.
80 */
81 struct IndexedLog : public pg_log_t {
82 mutable ceph::unordered_map<hobject_t,pg_log_entry_t*> objects; // ptrs into log. be careful!
83 mutable ceph::unordered_map<osd_reqid_t,pg_log_entry_t*> caller_ops;
84 mutable ceph::unordered_multimap<osd_reqid_t,pg_log_entry_t*> extra_caller_ops;
85
86 // recovery pointers
87 list<pg_log_entry_t>::iterator complete_to; // not inclusive of referenced item
88 version_t last_requested = 0; // last object requested by primary
89
90 //
91 private:
92 mutable __u16 indexed_data = 0;
93 /**
94 * rollback_info_trimmed_to_riter points to the first log entry <=
95 * rollback_info_trimmed_to
96 *
97 * It's a reverse_iterator because rend() is a natural representation for
98 * tail, and rbegin() works nicely for head.
99 */
100 mempool::osd_pglog::list<pg_log_entry_t>::reverse_iterator
101 rollback_info_trimmed_to_riter;
102
103 template <typename F>
104 void advance_can_rollback_to(eversion_t to, F &&f) {
105 if (to > can_rollback_to)
106 can_rollback_to = to;
107
108 if (to > rollback_info_trimmed_to)
109 rollback_info_trimmed_to = to;
110
111 while (rollback_info_trimmed_to_riter != log.rbegin()) {
112 --rollback_info_trimmed_to_riter;
113 if (rollback_info_trimmed_to_riter->version > rollback_info_trimmed_to) {
114 ++rollback_info_trimmed_to_riter;
115 break;
116 }
117 f(*rollback_info_trimmed_to_riter);
118 }
119 }
120
121 void reset_rollback_info_trimmed_to_riter() {
122 rollback_info_trimmed_to_riter = log.rbegin();
123 while (rollback_info_trimmed_to_riter != log.rend() &&
124 rollback_info_trimmed_to_riter->version > rollback_info_trimmed_to)
125 ++rollback_info_trimmed_to_riter;
126 }
127
128 // indexes objects, caller ops and extra caller ops
129 public:
130 IndexedLog() :
131 complete_to(log.end()),
132 last_requested(0),
133 indexed_data(0),
134 rollback_info_trimmed_to_riter(log.rbegin())
135 {}
136
137 template <typename... Args>
138 IndexedLog(Args&&... args) :
139 pg_log_t(std::forward<Args>(args)...),
140 complete_to(log.end()),
141 last_requested(0),
142 indexed_data(0),
143 rollback_info_trimmed_to_riter(log.rbegin()) {
144 reset_rollback_info_trimmed_to_riter();
145 index();
146 }
147
148 IndexedLog(const IndexedLog &rhs) :
149 pg_log_t(rhs),
150 complete_to(log.end()),
151 last_requested(rhs.last_requested),
152 indexed_data(0),
153 rollback_info_trimmed_to_riter(log.rbegin()) {
154 reset_rollback_info_trimmed_to_riter();
155 index(rhs.indexed_data);
156 }
157 IndexedLog &operator=(const IndexedLog &rhs) {
158 this->~IndexedLog();
159 new (this) IndexedLog(rhs);
160 return *this;
161 }
162
163 void trim_rollback_info_to(eversion_t to, LogEntryHandler *h) {
164 advance_can_rollback_to(
165 to,
166 [&](pg_log_entry_t &entry) {
167 h->trim(entry);
168 });
169 }
170 void roll_forward_to(eversion_t to, LogEntryHandler *h) {
171 advance_can_rollback_to(
172 to,
173 [&](pg_log_entry_t &entry) {
174 h->rollforward(entry);
175 });
176 }
177
178 void skip_can_rollback_to_to_head() {
179 advance_can_rollback_to(head, [&](const pg_log_entry_t &entry) {});
180 }
181
182 mempool::osd_pglog::list<pg_log_entry_t> rewind_from_head(eversion_t newhead) {
183 auto divergent = pg_log_t::rewind_from_head(newhead);
184 index();
185 reset_rollback_info_trimmed_to_riter();
186 return divergent;
187 }
188
189 template <typename T>
190 void scan_log_after(
191 const eversion_t &bound, ///< [in] scan entries > bound
192 T &&f) const {
193 auto iter = log.rbegin();
194 while (iter != log.rend() && iter->version > bound)
195 ++iter;
196
197 while (true) {
198 if (iter == log.rbegin())
199 break;
200 f(*(--iter));
201 }
202 }
203
204 /****/
205 void claim_log_and_clear_rollback_info(const pg_log_t& o) {
206 // we must have already trimmed the old entries
207 assert(rollback_info_trimmed_to == head);
208 assert(rollback_info_trimmed_to_riter == log.rbegin());
209
210 *this = IndexedLog(o);
211
212 skip_can_rollback_to_to_head();
213 index();
214 }
215
216 void split_out_child(
217 pg_t child_pgid,
218 unsigned split_bits,
219 IndexedLog *target);
220
221 void zero() {
222 // we must have already trimmed the old entries
223 assert(rollback_info_trimmed_to == head);
224 assert(rollback_info_trimmed_to_riter == log.rbegin());
225
226 unindex();
227 pg_log_t::clear();
228 rollback_info_trimmed_to_riter = log.rbegin();
229 reset_recovery_pointers();
230 }
231 void clear() {
232 skip_can_rollback_to_to_head();
233 zero();
234 }
235 void reset_recovery_pointers() {
236 complete_to = log.end();
237 last_requested = 0;
238 }
239
240 bool logged_object(const hobject_t& oid) const {
241 if (!(indexed_data & PGLOG_INDEXED_OBJECTS)) {
242 index_objects();
243 }
244 return objects.count(oid);
245 }
246
247 bool logged_req(const osd_reqid_t &r) const {
248 if (!(indexed_data & PGLOG_INDEXED_CALLER_OPS)) {
249 index_caller_ops();
250 }
251 if (!caller_ops.count(r)) {
252 if (!(indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS)) {
253 index_extra_caller_ops();
254 }
255 return extra_caller_ops.count(r);
256 }
257 return true;
258 }
259
260 bool get_request(
261 const osd_reqid_t &r,
262 eversion_t *version,
263 version_t *user_version,
264 int *return_code) const {
265 assert(version);
266 assert(user_version);
267 assert(return_code);
268 ceph::unordered_map<osd_reqid_t,pg_log_entry_t*>::const_iterator p;
269 if (!(indexed_data & PGLOG_INDEXED_CALLER_OPS)) {
270 index_caller_ops();
271 }
272 p = caller_ops.find(r);
273 if (p != caller_ops.end()) {
274 *version = p->second->version;
275 *user_version = p->second->user_version;
276 *return_code = p->second->return_code;
277 return true;
278 }
279
280 // warning: we will return *a* request for this reqid, but not
281 // necessarily the most recent.
282 if (!(indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS)) {
283 index_extra_caller_ops();
284 }
285 p = extra_caller_ops.find(r);
286 if (p != extra_caller_ops.end()) {
287 for (auto i = p->second->extra_reqids.begin();
288 i != p->second->extra_reqids.end();
289 ++i) {
290 if (i->first == r) {
291 *version = p->second->version;
292 *user_version = i->second;
293 *return_code = p->second->return_code;
294 return true;
295 }
296 }
297 assert(0 == "in extra_caller_ops but not extra_reqids");
298 }
299 return false;
300 }
301
302 /// get a (bounded) list of recent reqids for the given object
303 void get_object_reqids(const hobject_t& oid, unsigned max,
304 mempool::osd_pglog::vector<pair<osd_reqid_t, version_t> > *pls) const {
305 // make sure object is present at least once before we do an
306 // O(n) search.
307 if (!(indexed_data & PGLOG_INDEXED_OBJECTS)) {
308 index_objects();
309 }
310 if (objects.count(oid) == 0)
311 return;
312 for (list<pg_log_entry_t>::const_reverse_iterator i = log.rbegin();
313 i != log.rend();
314 ++i) {
315 if (i->soid == oid) {
316 if (i->reqid_is_indexed())
317 pls->push_back(make_pair(i->reqid, i->user_version));
318 pls->insert(pls->end(), i->extra_reqids.begin(), i->extra_reqids.end());
319 if (pls->size() >= max) {
320 if (pls->size() > max) {
321 pls->resize(max);
322 }
323 return;
324 }
325 }
326 }
327 }
328
329 void index(__u16 to_index = PGLOG_INDEXED_ALL) const {
330 if (to_index & PGLOG_INDEXED_OBJECTS)
331 objects.clear();
332 if (to_index & PGLOG_INDEXED_CALLER_OPS)
333 caller_ops.clear();
334 if (to_index & PGLOG_INDEXED_EXTRA_CALLER_OPS)
335 extra_caller_ops.clear();
336
337 for (list<pg_log_entry_t>::const_iterator i = log.begin();
338 i != log.end();
339 ++i) {
340 if (to_index & PGLOG_INDEXED_OBJECTS) {
341 if (i->object_is_indexed()) {
342 objects[i->soid] = const_cast<pg_log_entry_t*>(&(*i));
343 }
344 }
345
346 if (to_index & PGLOG_INDEXED_CALLER_OPS) {
347 if (i->reqid_is_indexed()) {
348 caller_ops[i->reqid] = const_cast<pg_log_entry_t*>(&(*i));
349 }
350 }
351
352 if (to_index & PGLOG_INDEXED_EXTRA_CALLER_OPS) {
353 for (auto j = i->extra_reqids.begin();
354 j != i->extra_reqids.end();
355 ++j) {
356 extra_caller_ops.insert(
357 make_pair(j->first, const_cast<pg_log_entry_t*>(&(*i))));
358 }
359 }
360 }
361
362 indexed_data |= to_index;
363 }
364
365 void index_objects() const {
366 index(PGLOG_INDEXED_OBJECTS);
367 }
368
369 void index_caller_ops() const {
370 index(PGLOG_INDEXED_CALLER_OPS);
371 }
372
373 void index_extra_caller_ops() const {
374 index(PGLOG_INDEXED_EXTRA_CALLER_OPS);
375 }
376
377 void index(pg_log_entry_t& e) {
378 if ((indexed_data & PGLOG_INDEXED_OBJECTS) && e.object_is_indexed()) {
379 if (objects.count(e.soid) == 0 ||
380 objects[e.soid]->version < e.version)
381 objects[e.soid] = &e;
382 }
383 if (indexed_data & PGLOG_INDEXED_CALLER_OPS) {
384 // divergent merge_log indexes new before unindexing old
385 if (e.reqid_is_indexed()) {
386 caller_ops[e.reqid] = &e;
387 }
388 }
389 if (indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS) {
390 for (auto j = e.extra_reqids.begin();
391 j != e.extra_reqids.end();
392 ++j) {
393 extra_caller_ops.insert(make_pair(j->first, &e));
394 }
395 }
396 }
397 void unindex() {
398 objects.clear();
399 caller_ops.clear();
400 extra_caller_ops.clear();
401 indexed_data = 0;
402 }
403 void unindex(pg_log_entry_t& e) {
404 // NOTE: this only works if we remove from the _tail_ of the log!
405 if (indexed_data & PGLOG_INDEXED_OBJECTS) {
406 if (objects.count(e.soid) && objects[e.soid]->version == e.version)
407 objects.erase(e.soid);
408 }
409 if (e.reqid_is_indexed()) {
410 if (indexed_data & PGLOG_INDEXED_CALLER_OPS) {
411 // divergent merge_log indexes new before unindexing old
412 if (caller_ops.count(e.reqid) && caller_ops[e.reqid] == &e)
413 caller_ops.erase(e.reqid);
414 }
415 }
416 if (indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS) {
417 for (auto j = e.extra_reqids.begin();
418 j != e.extra_reqids.end();
419 ++j) {
420 for (ceph::unordered_multimap<osd_reqid_t,pg_log_entry_t*>::iterator k =
421 extra_caller_ops.find(j->first);
422 k != extra_caller_ops.end() && k->first == j->first;
423 ++k) {
424 if (k->second == &e) {
425 extra_caller_ops.erase(k);
426 break;
427 }
428 }
429 }
430 }
431 }
432
433 // actors
434 void add(const pg_log_entry_t& e, bool applied = true) {
435 if (!applied) {
436 assert(get_can_rollback_to() == head);
437 }
438
439 // make sure our buffers don't pin bigger buffers
440 e.mod_desc.trim_bl();
441
442 // add to log
443 log.push_back(e);
444
445 // riter previously pointed to the previous entry
446 if (rollback_info_trimmed_to_riter == log.rbegin())
447 ++rollback_info_trimmed_to_riter;
448
449 assert(e.version > head);
450 assert(head.version == 0 || e.version.version > head.version);
451 head = e.version;
452
453 // to our index
454 if ((indexed_data & PGLOG_INDEXED_OBJECTS) && e.object_is_indexed()) {
455 objects[e.soid] = &(log.back());
456 }
457 if (indexed_data & PGLOG_INDEXED_CALLER_OPS) {
458 if (e.reqid_is_indexed()) {
459 caller_ops[e.reqid] = &(log.back());
460 }
461 }
462
463 if (indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS) {
464 for (auto j = e.extra_reqids.begin();
465 j != e.extra_reqids.end();
466 ++j) {
467 extra_caller_ops.insert(make_pair(j->first, &(log.back())));
468 }
469 }
470
471 if (!applied) {
472 skip_can_rollback_to_to_head();
473 }
474 }
475
476 void trim(
477 CephContext* cct,
478 eversion_t s,
479 set<eversion_t> *trimmed);
480
481 ostream& print(ostream& out) const;
482 };
483
484
485 protected:
486 //////////////////// data members ////////////////////
487
488 pg_missing_tracker_t missing;
489 IndexedLog log;
490
491 eversion_t dirty_to; ///< must clear/writeout all keys <= dirty_to
492 eversion_t dirty_from; ///< must clear/writeout all keys >= dirty_from
493 eversion_t writeout_from; ///< must writout keys >= writeout_from
494 set<eversion_t> trimmed; ///< must clear keys in trimmed
495 CephContext *cct;
496 bool pg_log_debug;
497 /// Log is clean on [dirty_to, dirty_from)
498 bool touched_log;
499 bool clear_divergent_priors;
500
501 void mark_dirty_to(eversion_t to) {
502 if (to > dirty_to)
503 dirty_to = to;
504 }
505 void mark_dirty_from(eversion_t from) {
506 if (from < dirty_from)
507 dirty_from = from;
508 }
509 void mark_writeout_from(eversion_t from) {
510 if (from < writeout_from)
511 writeout_from = from;
512 }
513 public:
514 bool is_dirty() const {
515 return !touched_log ||
516 (dirty_to != eversion_t()) ||
517 (dirty_from != eversion_t::max()) ||
518 (writeout_from != eversion_t::max()) ||
519 !(trimmed.empty()) ||
520 !missing.is_clean();
521 }
522 void mark_log_for_rewrite() {
523 mark_dirty_to(eversion_t::max());
524 mark_dirty_from(eversion_t());
525 touched_log = false;
526 }
527 protected:
528
529 /// DEBUG
530 set<string> log_keys_debug;
/// Erase every key >= lb from *log_keys_debug.  A null set is ignored.
static void clear_after(set<string> *log_keys_debug, const string &lb) {
  if (log_keys_debug == nullptr)
    return;
  log_keys_debug->erase(log_keys_debug->lower_bound(lb),
                        log_keys_debug->end());
}
/// Erase every key < ub from *log_keys_debug.  A null set is ignored.
static void clear_up_to(set<string> *log_keys_debug, const string &ub) {
  if (log_keys_debug == nullptr)
    return;
  // The set is ordered, so the keys below ub are exactly the prefix that
  // ends at lower_bound(ub).
  log_keys_debug->erase(log_keys_debug->begin(),
                        log_keys_debug->lower_bound(ub));
}
545
546 void check();
547 void undirty() {
548 dirty_to = eversion_t();
549 dirty_from = eversion_t::max();
550 touched_log = true;
551 trimmed.clear();
552 writeout_from = eversion_t::max();
553 check();
554 missing.flush();
555 }
556 public:
557 // cppcheck-suppress noExplicitConstructor
558 PGLog(CephContext *cct, DoutPrefixProvider *dpp = 0) :
559 prefix_provider(dpp),
560 dirty_from(eversion_t::max()),
561 writeout_from(eversion_t::max()),
562 cct(cct),
563 pg_log_debug(!(cct && !(cct->_conf->osd_debug_pg_log_writeout))),
564 touched_log(false),
565 clear_divergent_priors(false) {}
566
567
568 void reset_backfill();
569
570 void clear();
571
572 //////////////////// get or set missing ////////////////////
573
574 const pg_missing_tracker_t& get_missing() const { return missing; }
575 void revise_have(hobject_t oid, eversion_t have) {
576 missing.revise_have(oid, have);
577 }
578
579 void revise_need(hobject_t oid, eversion_t need) {
580 missing.revise_need(oid, need);
581 }
582
583 void missing_add(const hobject_t& oid, eversion_t need, eversion_t have) {
584 missing.add(oid, need, have);
585 }
586
587 //////////////////// get or set log ////////////////////
588
589 const IndexedLog &get_log() const { return log; }
590
591 const eversion_t &get_tail() const { return log.tail; }
592
593 void set_tail(eversion_t tail) { log.tail = tail; }
594
595 const eversion_t &get_head() const { return log.head; }
596
597 void set_head(eversion_t head) { log.head = head; }
598
599 void set_last_requested(version_t last_requested) {
600 log.last_requested = last_requested;
601 }
602
603 void index() { log.index(); }
604
605 void unindex() { log.unindex(); }
606
607 void add(const pg_log_entry_t& e, bool applied = true) {
608 mark_writeout_from(e.version);
609 log.add(e, applied);
610 }
611
612 void reset_recovery_pointers() { log.reset_recovery_pointers(); }
613
614 static void clear_info_log(
615 spg_t pgid,
616 ObjectStore::Transaction *t);
617
618 void trim(
619 eversion_t trim_to,
620 pg_info_t &info);
621
622 void roll_forward_to(
623 eversion_t roll_forward_to,
624 LogEntryHandler *h) {
625 log.roll_forward_to(
626 roll_forward_to,
627 h);
628 }
629
630 eversion_t get_can_rollback_to() const {
631 return log.get_can_rollback_to();
632 }
633
634 void roll_forward(LogEntryHandler *h) {
635 roll_forward_to(
636 log.head,
637 h);
638 }
639
640 //////////////////// get or set log & missing ////////////////////
641
642 void reset_backfill_claim_log(const pg_log_t &o, LogEntryHandler *h) {
643 log.trim_rollback_info_to(log.head, h);
644 log.claim_log_and_clear_rollback_info(o);
645 missing.clear();
646 mark_dirty_to(eversion_t::max());
647 }
648
649 void split_into(
650 pg_t child_pgid,
651 unsigned split_bits,
652 PGLog *opg_log) {
653 log.split_out_child(child_pgid, split_bits, &opg_log->log);
654 missing.split_into(child_pgid, split_bits, &(opg_log->missing));
655 opg_log->mark_dirty_to(eversion_t::max());
656 mark_dirty_to(eversion_t::max());
657 }
658
659 void recover_got(hobject_t oid, eversion_t v, pg_info_t &info) {
660 if (missing.is_missing(oid, v)) {
661 missing.got(oid, v);
662
663 // raise last_complete?
664 if (missing.get_items().empty()) {
665 log.complete_to = log.log.end();
666 info.last_complete = info.last_update;
667 }
668 while (log.complete_to != log.log.end()) {
669 if (missing.get_items().at(
670 missing.get_rmissing().begin()->second
671 ).need <= log.complete_to->version)
672 break;
673 if (info.last_complete < log.complete_to->version)
674 info.last_complete = log.complete_to->version;
675 ++log.complete_to;
676 }
677 }
678
679 assert(log.get_can_rollback_to() >= v);
680 }
681
682 void activate_not_complete(pg_info_t &info) {
683 log.complete_to = log.log.begin();
684 while (log.complete_to->version <
685 missing.get_items().at(
686 missing.get_rmissing().begin()->second
687 ).need)
688 ++log.complete_to;
689 assert(log.complete_to != log.log.end());
690 if (log.complete_to == log.log.begin()) {
691 info.last_complete = eversion_t();
692 } else {
693 --log.complete_to;
694 info.last_complete = log.complete_to->version;
695 ++log.complete_to;
696 }
697 log.last_requested = 0;
698 }
699
700 void proc_replica_log(pg_info_t &oinfo,
701 const pg_log_t &olog,
702 pg_missing_t& omissing, pg_shard_t from) const;
703
704 protected:
705 static void split_by_object(
706 mempool::osd_pglog::list<pg_log_entry_t> &entries,
707 map<hobject_t, mempool::osd_pglog::list<pg_log_entry_t>> *out_entries) {
708 while (!entries.empty()) {
709 auto &out_list = (*out_entries)[entries.front().soid];
710 out_list.splice(out_list.end(), entries, entries.begin());
711 }
712 }
713
714 /**
715 * _merge_object_divergent_entries
716 *
717 * There are 5 distinct cases:
718 * 1) There is a more recent update: in this case we assume we adjusted the
719 * store and missing during merge_log
720 * 2) The first entry in the divergent sequence is a create. This might
721 * either be because the object is a clone or because prior_version is
722 * eversion_t(). In this case the object does not exist and we must
723 * adjust missing and the store to match.
724 * 3) We are currently missing the object. In this case, we adjust the
725 * missing to our prior_version taking care to add a divergent_prior
726 * if necessary
727 * 4) We can rollback all of the entries. In this case, we do so using
728 * the rollbacker and return -- the object does not go into missing.
729 * 5) We cannot rollback at least 1 of the entries. In this case, we
730 * clear the object out of the store and add a missing entry at
731 * prior_version taking care to add a divergent_prior if
732 * necessary.
733 */
734 template <typename missing_type>
735 static void _merge_object_divergent_entries(
736 const IndexedLog &log, ///< [in] log to merge against
737 const hobject_t &hoid, ///< [in] object we are merging
738 const mempool::osd_pglog::list<pg_log_entry_t> &orig_entries, ///< [in] entries for hoid to merge
739 const pg_info_t &info, ///< [in] info for merging entries
740 eversion_t olog_can_rollback_to, ///< [in] rollback boundary
741 missing_type &missing, ///< [in,out] missing to adjust, use
742 LogEntryHandler *rollbacker, ///< [in] optional rollbacker object
743 const DoutPrefixProvider *dpp ///< [in] logging provider
744 ) {
745 ldpp_dout(dpp, 20) << __func__ << ": merging hoid " << hoid
746 << " entries: " << orig_entries << dendl;
747
748 if (hoid > info.last_backfill) {
749 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid << " after last_backfill"
750 << dendl;
751 return;
752 }
753
754 // entries is non-empty
755 assert(!orig_entries.empty());
756 // strip out and ignore ERROR entries
757 mempool::osd_pglog::list<pg_log_entry_t> entries;
758 eversion_t last;
759 for (list<pg_log_entry_t>::const_iterator i = orig_entries.begin();
760 i != orig_entries.end();
761 ++i) {
762 // all entries are on hoid
763 assert(i->soid == hoid);
764 if (i != orig_entries.begin() && i->prior_version != eversion_t()) {
765 // in increasing order of version
766 assert(i->version > last);
767 // prior_version correct (unless it is an ERROR entry)
768 assert(i->prior_version == last || i->is_error());
769 }
770 last = i->version;
771 if (i->is_error()) {
772 ldpp_dout(dpp, 20) << __func__ << ": ignoring " << *i << dendl;
773 } else {
774 ldpp_dout(dpp, 20) << __func__ << ": keeping " << *i << dendl;
775 entries.push_back(*i);
776 }
777 }
778 if (entries.empty()) {
779 ldpp_dout(dpp, 10) << __func__ << ": no non-ERROR entries" << dendl;
780 return;
781 }
782
783 const eversion_t prior_version = entries.begin()->prior_version;
784 const eversion_t first_divergent_update = entries.begin()->version;
785 const eversion_t last_divergent_update = entries.rbegin()->version;
786 const bool object_not_in_store =
787 !missing.is_missing(hoid) &&
788 entries.rbegin()->is_delete();
789 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
790 << " prior_version: " << prior_version
791 << " first_divergent_update: " << first_divergent_update
792 << " last_divergent_update: " << last_divergent_update
793 << dendl;
794
795 ceph::unordered_map<hobject_t, pg_log_entry_t*>::const_iterator objiter =
796 log.objects.find(hoid);
797 if (objiter != log.objects.end() &&
798 objiter->second->version >= first_divergent_update) {
799 /// Case 1)
800 ldpp_dout(dpp, 10) << __func__ << ": more recent entry found: "
801 << *objiter->second << ", already merged" << dendl;
802
803 assert(objiter->second->version > last_divergent_update);
804
805 // ensure missing has been updated appropriately
806 if (objiter->second->is_update()) {
807 assert(missing.is_missing(hoid) &&
808 missing.get_items().at(hoid).need == objiter->second->version);
809 } else {
810 assert(!missing.is_missing(hoid));
811 }
812 missing.revise_have(hoid, eversion_t());
813 if (rollbacker) {
814 if (!object_not_in_store) {
815 rollbacker->remove(hoid);
816 }
817 for (auto &&i: entries) {
818 rollbacker->trim(i);
819 }
820 }
821 return;
822 }
823
824 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
825 <<" has no more recent entries in log" << dendl;
826 if (prior_version == eversion_t() || entries.front().is_clone()) {
827 /// Case 2)
828 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
829 << " prior_version or op type indicates creation,"
830 << " deleting"
831 << dendl;
832 if (missing.is_missing(hoid))
833 missing.rm(missing.get_items().find(hoid));
834 if (rollbacker) {
835 if (!object_not_in_store) {
836 rollbacker->remove(hoid);
837 }
838 for (auto &&i: entries) {
839 rollbacker->trim(i);
840 }
841 }
842 return;
843 }
844
845 if (missing.is_missing(hoid)) {
846 /// Case 3)
847 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
848 << " missing, " << missing.get_items().at(hoid)
849 << " adjusting" << dendl;
850
851 if (missing.get_items().at(hoid).have == prior_version) {
852 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
853 << " missing.have is prior_version " << prior_version
854 << " removing from missing" << dendl;
855 missing.rm(missing.get_items().find(hoid));
856 } else {
857 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
858 << " missing.have is " << missing.get_items().at(hoid).have
859 << ", adjusting" << dendl;
860 missing.revise_need(hoid, prior_version);
861 if (prior_version <= info.log_tail) {
862 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
863 << " prior_version " << prior_version
864 << " <= info.log_tail "
865 << info.log_tail << dendl;
866 }
867 }
868 if (rollbacker) {
869 for (auto &&i: entries) {
870 rollbacker->trim(i);
871 }
872 }
873 return;
874 }
875
876 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
877 << " must be rolled back or recovered,"
878 << " attempting to rollback"
879 << dendl;
880 bool can_rollback = true;
881 /// Distinguish between 4) and 5)
882 for (list<pg_log_entry_t>::const_reverse_iterator i = entries.rbegin();
883 i != entries.rend();
884 ++i) {
885 if (!i->can_rollback() || i->version <= olog_can_rollback_to) {
886 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid << " cannot rollback "
887 << *i << dendl;
888 can_rollback = false;
889 break;
890 }
891 }
892
893 if (can_rollback) {
894 /// Case 4)
895 for (list<pg_log_entry_t>::const_reverse_iterator i = entries.rbegin();
896 i != entries.rend();
897 ++i) {
898 assert(i->can_rollback() && i->version > olog_can_rollback_to);
899 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
900 << " rolling back " << *i << dendl;
901 if (rollbacker)
902 rollbacker->rollback(*i);
903 }
904 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
905 << " rolled back" << dendl;
906 return;
907 } else {
908 /// Case 5)
909 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid << " cannot roll back, "
910 << "removing and adding to missing" << dendl;
911 if (rollbacker) {
912 if (!object_not_in_store)
913 rollbacker->remove(hoid);
914 for (auto &&i: entries) {
915 rollbacker->trim(i);
916 }
917 }
918 missing.add(hoid, prior_version, eversion_t());
919 if (prior_version <= info.log_tail) {
920 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
921 << " prior_version " << prior_version
922 << " <= info.log_tail "
923 << info.log_tail << dendl;
924 }
925 }
926 }
927
928 /// Merge all entries using above
929 template <typename missing_type>
930 static void _merge_divergent_entries(
931 const IndexedLog &log, ///< [in] log to merge against
932 mempool::osd_pglog::list<pg_log_entry_t> &entries, ///< [in] entries to merge
933 const pg_info_t &oinfo, ///< [in] info for merging entries
934 eversion_t olog_can_rollback_to, ///< [in] rollback boundary
935 missing_type &omissing, ///< [in,out] missing to adjust, use
936 LogEntryHandler *rollbacker, ///< [in] optional rollbacker object
937 const DoutPrefixProvider *dpp ///< [in] logging provider
938 ) {
939 map<hobject_t, mempool::osd_pglog::list<pg_log_entry_t> > split;
940 split_by_object(entries, &split);
941 for (map<hobject_t, mempool::osd_pglog::list<pg_log_entry_t>>::iterator i = split.begin();
942 i != split.end();
943 ++i) {
944 _merge_object_divergent_entries(
945 log,
946 i->first,
947 i->second,
948 oinfo,
949 olog_can_rollback_to,
950 omissing,
951 rollbacker,
952 dpp);
953 }
954 }
955
956 /**
957 * Exists for use in TestPGLog for simply testing single divergent log
958 * cases
959 */
960 void merge_old_entry(
961 ObjectStore::Transaction& t,
962 const pg_log_entry_t& oe,
963 const pg_info_t& info,
964 LogEntryHandler *rollbacker) {
965 mempool::osd_pglog::list<pg_log_entry_t> entries;
966 entries.push_back(oe);
967 _merge_object_divergent_entries(
968 log,
969 oe.soid,
970 entries,
971 info,
972 log.get_can_rollback_to(),
973 missing,
974 rollbacker,
975 this);
976 }
977 public:
978 void rewind_divergent_log(eversion_t newhead,
979 pg_info_t &info,
980 LogEntryHandler *rollbacker,
981 bool &dirty_info,
982 bool &dirty_big_info);
983
984 void merge_log(pg_info_t &oinfo,
985 pg_log_t &olog,
986 pg_shard_t from,
987 pg_info_t &info, LogEntryHandler *rollbacker,
988 bool &dirty_info, bool &dirty_big_info);
989
990 template <typename missing_type>
991 static bool append_log_entries_update_missing(
992 const hobject_t &last_backfill,
993 bool last_backfill_bitwise,
994 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
995 bool maintain_rollback,
996 IndexedLog *log,
997 missing_type &missing,
998 LogEntryHandler *rollbacker,
999 const DoutPrefixProvider *dpp) {
1000 bool invalidate_stats = false;
1001 if (log && !entries.empty()) {
1002 assert(log->head < entries.begin()->version);
1003 }
1004 for (list<pg_log_entry_t>::const_iterator p = entries.begin();
1005 p != entries.end();
1006 ++p) {
1007 invalidate_stats = invalidate_stats || !p->is_error();
1008 if (log) {
1009 ldpp_dout(dpp, 20) << "update missing, append " << *p << dendl;
1010 log->add(*p);
1011 }
1012 if (p->soid <= last_backfill &&
1013 !p->is_error()) {
1014 missing.add_next_event(*p);
1015 if (rollbacker) {
1016 // hack to match PG::mark_all_unfound_lost
1017 if (maintain_rollback && p->is_lost_delete() && p->can_rollback()) {
1018 rollbacker->try_stash(p->soid, p->version.version);
1019 } else if (p->is_delete()) {
1020 rollbacker->remove(p->soid);
1021 }
1022 }
1023 }
1024 }
1025 return invalidate_stats;
1026 }
1027 bool append_new_log_entries(
1028 const hobject_t &last_backfill,
1029 bool last_backfill_bitwise,
1030 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
1031 LogEntryHandler *rollbacker) {
1032 bool invalidate_stats = append_log_entries_update_missing(
1033 last_backfill,
1034 last_backfill_bitwise,
1035 entries,
1036 true,
1037 &log,
1038 missing,
1039 rollbacker,
1040 this);
1041 if (!entries.empty()) {
1042 mark_writeout_from(entries.begin()->version);
1043 }
1044 return invalidate_stats;
1045 }
1046
1047 void write_log_and_missing(ObjectStore::Transaction& t,
1048 map<string,bufferlist> *km,
1049 const coll_t& coll,
1050 const ghobject_t &log_oid,
1051 bool require_rollback);
1052
1053 static void write_log_and_missing_wo_missing(
1054 ObjectStore::Transaction& t,
1055 map<string,bufferlist>* km,
1056 pg_log_t &log,
1057 const coll_t& coll,
1058 const ghobject_t &log_oid, map<eversion_t, hobject_t> &divergent_priors,
1059 bool require_rollback);
1060
1061 static void write_log_and_missing(
1062 ObjectStore::Transaction& t,
1063 map<string,bufferlist>* km,
1064 pg_log_t &log,
1065 const coll_t& coll,
1066 const ghobject_t &log_oid,
1067 const pg_missing_tracker_t &missing,
1068 bool require_rollback);
1069
// Static helper declaration (definition not in this part of the header).
// Compared with write_log_and_missing_wo_missing it additionally takes:
//   dirty_to, dirty_from, writeout_from - eversion_t bounds, by value
//   trimmed                - set of eversion_t, by const reference
//   dirty_divergent_priors - boolean flag
//   touch_log              - boolean flag
//   log_keys_debug         - pointer to a set of strings
// NOTE(review): presumably the eversion_t bounds select which part of the
// log is rewritten, and log_keys_debug may be null when key debugging is
// off -- confirm against the definition.
1070 static void _write_log_and_missing_wo_missing(
1071 ObjectStore::Transaction& t,
1072 map<string,bufferlist>* km,
1073 pg_log_t &log,
1074 const coll_t& coll, const ghobject_t &log_oid,
1075 map<eversion_t, hobject_t> &divergent_priors,
1076 eversion_t dirty_to,
1077 eversion_t dirty_from,
1078 eversion_t writeout_from,
1079 const set<eversion_t> &trimmed,
1080 bool dirty_divergent_priors,
1081 bool touch_log,
1082 bool require_rollback,
1083 set<string> *log_keys_debug
1084 );
1085
// Static helper declaration (definition not in this part of the header).
// Compared with the static write_log_and_missing overload it additionally
// takes:
//   dirty_to, dirty_from, writeout_from - eversion_t bounds, by value
//   trimmed                 - set of eversion_t, by const reference
//   touch_log               - boolean flag
//   clear_divergent_priors  - boolean flag
//   log_keys_debug          - pointer to a set of strings
// NOTE(review): presumably clear_divergent_priors asks the writer to drop a
// stale "divergent_priors" key, and log_keys_debug may be null when key
// debugging is off -- confirm against the definition.
1086 static void _write_log_and_missing(
1087 ObjectStore::Transaction& t,
1088 map<string,bufferlist>* km,
1089 pg_log_t &log,
1090 const coll_t& coll, const ghobject_t &log_oid,
1091 eversion_t dirty_to,
1092 eversion_t dirty_from,
1093 eversion_t writeout_from,
1094 const set<eversion_t> &trimmed,
1095 const pg_missing_tracker_t &missing,
1096 bool touch_log,
1097 bool require_rollback,
1098 bool clear_divergent_priors,
1099 set<string> *log_keys_debug
1100 );
1101
1102 void read_log_and_missing(
1103 ObjectStore *store, coll_t pg_coll,
1104 coll_t log_coll, ghobject_t log_oid,
1105 const pg_info_t &info,
1106 ostringstream &oss,
1107 bool tolerate_divergent_missing_log,
1108 bool debug_verify_stored_missing = false
1109 ) {
1110 return read_log_and_missing(
1111 store, pg_coll, log_coll, log_oid, info,
1112 log, missing, oss,
1113 tolerate_divergent_missing_log,
1114 &clear_divergent_priors,
1115 this,
1116 (pg_log_debug ? &log_keys_debug : 0),
1117 debug_verify_stored_missing);
1118 }
1119
1120 template <typename missing_type>
1121 static void read_log_and_missing(ObjectStore *store, coll_t pg_coll,
1122 coll_t log_coll, ghobject_t log_oid,
1123 const pg_info_t &info,
1124 IndexedLog &log,
1125 missing_type &missing, ostringstream &oss,
1126 bool tolerate_divergent_missing_log,
1127 bool *clear_divergent_priors = NULL,
1128 const DoutPrefixProvider *dpp = NULL,
1129 set<string> *log_keys_debug = 0,
1130 bool debug_verify_stored_missing = false
1131 ) {
1132 ldpp_dout(dpp, 20) << "read_log_and_missing coll " << pg_coll
1133 << " log_oid " << log_oid << dendl;
1134
1135 // legacy?
1136 struct stat st;
1137 int r = store->stat(log_coll, log_oid, &st);
1138 assert(r == 0);
1139 assert(st.st_size == 0);
1140
1141 // will get overridden below if it had been recorded
1142 eversion_t on_disk_can_rollback_to = info.last_update;
1143 eversion_t on_disk_rollback_info_trimmed_to = eversion_t();
1144 ObjectMap::ObjectMapIterator p = store->get_omap_iterator(log_coll, log_oid);
1145 map<eversion_t, hobject_t> divergent_priors;
1146 bool has_divergent_priors = false;
1147 list<pg_log_entry_t> entries;
1148 if (p) {
1149 for (p->seek_to_first(); p->valid() ; p->next(false)) {
1150 // non-log pgmeta_oid keys are prefixed with _; skip those
1151 if (p->key()[0] == '_')
1152 continue;
1153 bufferlist bl = p->value();//Copy bufferlist before creating iterator
1154 bufferlist::iterator bp = bl.begin();
1155 if (p->key() == "divergent_priors") {
1156 ::decode(divergent_priors, bp);
1157 ldpp_dout(dpp, 20) << "read_log_and_missing " << divergent_priors.size()
1158 << " divergent_priors" << dendl;
1159 has_divergent_priors = true;
1160 debug_verify_stored_missing = false;
1161 } else if (p->key() == "can_rollback_to") {
1162 ::decode(on_disk_can_rollback_to, bp);
1163 } else if (p->key() == "rollback_info_trimmed_to") {
1164 ::decode(on_disk_rollback_info_trimmed_to, bp);
1165 } else if (p->key().substr(0, 7) == string("missing")) {
1166 pair<hobject_t, pg_missing_item> p;
1167 ::decode(p, bp);
1168 missing.add(p.first, p.second.need, p.second.have);
1169 } else {
1170 pg_log_entry_t e;
1171 e.decode_with_checksum(bp);
1172 ldpp_dout(dpp, 20) << "read_log_and_missing " << e << dendl;
1173 if (!entries.empty()) {
1174 pg_log_entry_t last_e(entries.back());
1175 assert(last_e.version.version < e.version.version);
1176 assert(last_e.version.epoch <= e.version.epoch);
1177 }
1178 entries.push_back(e);
1179 if (log_keys_debug)
1180 log_keys_debug->insert(e.get_key_name());
1181 }
1182 }
1183 }
1184 log = IndexedLog(
1185 info.last_update,
1186 info.log_tail,
1187 on_disk_can_rollback_to,
1188 on_disk_rollback_info_trimmed_to,
1189 std::move(entries));
1190
1191 if (has_divergent_priors || debug_verify_stored_missing) {
1192 // build missing
1193 if (debug_verify_stored_missing || info.last_complete < info.last_update) {
1194 ldpp_dout(dpp, 10) << "read_log_and_missing checking for missing items over interval ("
1195 << info.last_complete
1196 << "," << info.last_update << "]" << dendl;
1197
1198 set<hobject_t> did;
1199 set<hobject_t> checked;
1200 set<hobject_t> skipped;
1201 for (list<pg_log_entry_t>::reverse_iterator i = log.log.rbegin();
1202 i != log.log.rend();
1203 ++i) {
1204 if (!debug_verify_stored_missing && i->version <= info.last_complete) break;
1205 if (i->soid > info.last_backfill)
1206 continue;
1207 if (i->is_error())
1208 continue;
1209 if (did.count(i->soid)) continue;
1210 did.insert(i->soid);
1211
1212 if (i->is_delete()) continue;
1213
1214 bufferlist bv;
1215 int r = store->getattr(
1216 pg_coll,
1217 ghobject_t(i->soid, ghobject_t::NO_GEN, info.pgid.shard),
1218 OI_ATTR,
1219 bv);
1220 if (r >= 0) {
1221 object_info_t oi(bv);
1222 if (oi.version < i->version) {
1223 ldpp_dout(dpp, 15) << "read_log_and_missing missing " << *i
1224 << " (have " << oi.version << ")" << dendl;
1225 if (debug_verify_stored_missing) {
1226 auto miter = missing.get_items().find(i->soid);
1227 assert(miter != missing.get_items().end());
1228 assert(miter->second.need == i->version);
1229 assert(miter->second.have == oi.version);
1230 checked.insert(i->soid);
1231 } else {
1232 missing.add(i->soid, i->version, oi.version);
1233 }
1234 }
1235 } else {
1236 ldpp_dout(dpp, 15) << "read_log_and_missing missing " << *i << dendl;
1237 if (debug_verify_stored_missing) {
1238 auto miter = missing.get_items().find(i->soid);
1239 assert(miter != missing.get_items().end());
1240 assert(miter->second.need == i->version);
1241 assert(miter->second.have == eversion_t());
1242 checked.insert(i->soid);
1243 } else {
1244 missing.add(i->soid, i->version, eversion_t());
1245 }
1246 }
1247 }
1248 if (debug_verify_stored_missing) {
1249 for (auto &&i: missing.get_items()) {
1250 if (checked.count(i.first))
1251 continue;
1252 if (i.second.need > log.tail ||
1253 i.first > info.last_backfill) {
1254 ldpp_dout(dpp, -1) << __func__ << ": invalid missing set entry found "
1255 << i.first
1256 << dendl;
1257 assert(0 == "invalid missing set entry found");
1258 }
1259 bufferlist bv;
1260 int r = store->getattr(
1261 pg_coll,
1262 ghobject_t(i.first, ghobject_t::NO_GEN, info.pgid.shard),
1263 OI_ATTR,
1264 bv);
1265 if (r >= 0) {
1266 object_info_t oi(bv);
1267 assert(oi.version == i.second.have);
1268 } else {
1269 assert(eversion_t() == i.second.have);
1270 }
1271 }
1272 } else {
1273 assert(has_divergent_priors);
1274 for (map<eversion_t, hobject_t>::reverse_iterator i =
1275 divergent_priors.rbegin();
1276 i != divergent_priors.rend();
1277 ++i) {
1278 if (i->first <= info.last_complete) break;
1279 if (i->second > info.last_backfill)
1280 continue;
1281 if (did.count(i->second)) continue;
1282 did.insert(i->second);
1283 bufferlist bv;
1284 int r = store->getattr(
1285 pg_coll,
1286 ghobject_t(i->second, ghobject_t::NO_GEN, info.pgid.shard),
1287 OI_ATTR,
1288 bv);
1289 if (r >= 0) {
1290 object_info_t oi(bv);
1291 /**
1292 * 1) we see this entry in the divergent priors mapping
1293 * 2) we didn't see an entry for this object in the log
1294 *
1295 * From 1 & 2 we know that either the object does not exist
1296 * or it is at the version specified in the divergent_priors
1297 * map since the object would have been deleted atomically
1298 * with the addition of the divergent_priors entry, an older
1299 * version would not have been recovered, and a newer version
1300 * would show up in the log above.
1301 */
1302 /**
1303 * Unfortunately the assessment above is incorrect because of
1304 * http://tracker.ceph.com/issues/17916 (we were incorrectly
1305 * not removing the divergent_priors set from disk state!),
1306 * so let's check that.
1307 */
1308 if (oi.version > i->first && tolerate_divergent_missing_log) {
1309 ldpp_dout(dpp, 0) << "read_log divergent_priors entry (" << *i
1310 << ") inconsistent with disk state (" << oi
1311 << "), assuming it is tracker.ceph.com/issues/17916"
1312 << dendl;
1313 } else {
1314 assert(oi.version == i->first);
1315 }
1316 } else {
1317 ldpp_dout(dpp, 15) << "read_log_and_missing missing " << *i << dendl;
1318 missing.add(i->second, i->first, eversion_t());
1319 }
1320 }
1321 }
1322 if (clear_divergent_priors)
1323 (*clear_divergent_priors) = true;
1324 }
1325 }
1326
1327 if (!has_divergent_priors) {
1328 if (clear_divergent_priors)
1329 (*clear_divergent_priors) = false;
1330 missing.flush();
1331 }
1332 ldpp_dout(dpp, 10) << "read_log_and_missing done" << dendl;
1333 }
1334 };
1335
1336 #endif // CEPH_PG_LOG_H