]> git.proxmox.com Git - ceph.git/blame - ceph/src/osd/PGLog.h
update source to Ceph Pacific 16.2.2
[ceph.git] / ceph / src / osd / PGLog.h
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
c07f9fc5 17#pragma once
7c673cae
FG
18
19// re-include our assert to clobber boost's
11fdf7f2 20#include "include/ceph_assert.h"
9f95a23c 21#include "include/common_fwd.h"
7c673cae
FG
22#include "osd_types.h"
23#include "os/ObjectStore.h"
24#include <list>
7c673cae 25
9f95a23c
TL
26#ifdef WITH_SEASTAR
27#include <seastar/core/future.hh>
28#include "crimson/os/futurized_store.h"
29#include "crimson/os/cyanstore/cyan_collection.h"
30#endif
31
f67539c2
TL
32/** @name PG Log
33 *
34 * The pg log serves three primary purposes:
35 *
36 * 1) improving recovery speed
37 *
38 * 2) detecting duplicate ops
39 *
40 * 3) making erasure coded updates safe
41 *
42 * For (1), the main data type is pg_log_entry_t. this is indexed in
43 * memory by the IndexedLog class - this is where most of the logic
44 * surrounding pg log is kept, even though the low level types are in
45 * src/osd/osd_types.h
46 *
47 * (2) uses a type which is a subset of the full log entry, containing
48 * just the pieces we need to identify and respond to a duplicate
49 * request.
50 *
51 * As we trim the log, we convert pg_log_entry_t to smaller
52 * pg_log_dup_t, and finally remove them once we reach a higher
53 * limit. This is controlled by a few options:
54 *
55 * osd_min_pg_log_entries osd_max_pg_log_entries
56 * osd_pg_log_dups_tracked
57 *
58 * For example, with a min of 100, max of 1000, and dups tracked of
59 * 3000, the log entries and dups stored would span the following
60 * versions, assuming the current earliest is version 1:
61 *
62 * version: 3000 .. 2001 [ pg log entries ] | 2000 .. 1 [ pg log dups ]
63 *
64 * after osd_pg_log_trim_min subsequent writes to this PG, the log
65 * would be trimmed to look like:
66 *
67 * version: 3100 .. 2101 [ pg log entries ] | 2100 .. 101 [ pg log dups ]
68 *
69 * (3) means tracking the previous state of an object, so that we can
70 * rollback to that prior state if necessary. It's only used for
71 * erasure coding. Consider an erasure code of 4+2, for example.
72 *
73 * This means we split the object into 4 pieces (called shards) and
74 * compute 2 parity shards. Each of these shards is stored on a
75 * separate OSD. As long as 4 shards are the same version, we can
76 * recover the remaining 2 by computation. Imagine during a write, 3
77 * of the osds go down and restart, resulting in shards 0,1,2
78 * reflecting version A and shards 3,4,5 reflecting version B, after
79 * the write.
80 *
81 * If we had no way to reconstruct version A for another shard, we
82 * would have lost the object.
83 *
84 * The actual data for rollback is stored in a look-aside object and
85 * is removed once the EC write commits on all shards. The pg log just
86 * stores the versions so we can tell how far we can rollback, and a
87 * description of the type of operation for each log entry. Beyond
88 * the pg log, see PGBackend::Trimmer and PGBackend::RollbackVisitor
89 * for more details on this.
90 *
91 * An important implication of this is that although the pg log length
92 * is normally bounded, under extreme conditions, with many EC I/Os
93 * outstanding, the log may grow beyond that point because we need to
94 * keep the rollback information for all outstanding EC I/O.
95 *
96 * For more on pg log bounds, see where it is calculated in
97 * PeeringState::calc_trim_to_aggressive().
98 *
99 * For more details on how peering uses the pg log, and architectural
100 * reasons for its existence, see:
101 *
102 * doc/dev/osd_internals/log_based_pg.rst
103 *
104 */
105
11fdf7f2
TL
// Bit flags naming the lookup tables IndexedLog can build; IndexedLog keeps
// the set of currently built tables in its indexed_data mask.
constexpr int PGLOG_INDEXED_OBJECTS = 0x1;
constexpr int PGLOG_INDEXED_CALLER_OPS = 0x2;
constexpr int PGLOG_INDEXED_EXTRA_CALLER_OPS = 0x4;
constexpr int PGLOG_INDEXED_DUPS = 0x8;
// Every table at once.
constexpr int PGLOG_INDEXED_ALL =
  PGLOG_INDEXED_OBJECTS |
  PGLOG_INDEXED_CALLER_OPS |
  PGLOG_INDEXED_EXTRA_CALLER_OPS |
  PGLOG_INDEXED_DUPS;
7c673cae 114
7c673cae 115struct PGLog : DoutPrefixProvider {
11fdf7f2
TL
116 std::ostream& gen_prefix(std::ostream& out) const override {
117 return out;
7c673cae
FG
118 }
119 unsigned get_subsys() const override {
11fdf7f2 120 return static_cast<unsigned>(ceph_subsys_osd);
7c673cae
FG
121 }
122 CephContext *get_cct() const override {
123 return cct;
124 }
125
126 ////////////////////////////// sub classes //////////////////////////////
127 struct LogEntryHandler {
128 virtual void rollback(
129 const pg_log_entry_t &entry) = 0;
130 virtual void rollforward(
131 const pg_log_entry_t &entry) = 0;
132 virtual void trim(
133 const pg_log_entry_t &entry) = 0;
134 virtual void remove(
135 const hobject_t &hoid) = 0;
136 virtual void try_stash(
137 const hobject_t &hoid,
138 version_t v) = 0;
139 virtual ~LogEntryHandler() {}
140 };
f67539c2 141 using LogEntryHandlerRef = std::unique_ptr<LogEntryHandler>;
7c673cae 142
7c673cae
FG
143public:
144 /**
145 * IndexedLog - adds an in-memory index of the log, by oid,
146 * plus some methods to manipulate it all.
147 */
148 struct IndexedLog : public pg_log_t {
149 mutable ceph::unordered_map<hobject_t,pg_log_entry_t*> objects; // ptrs into log. be careful!
150 mutable ceph::unordered_map<osd_reqid_t,pg_log_entry_t*> caller_ops;
151 mutable ceph::unordered_multimap<osd_reqid_t,pg_log_entry_t*> extra_caller_ops;
c07f9fc5 152 mutable ceph::unordered_map<osd_reqid_t,pg_log_dup_t*> dup_index;
7c673cae
FG
153
154 // recovery pointers
f67539c2 155 std::list<pg_log_entry_t>::iterator complete_to; // not inclusive of referenced item
7c673cae
FG
156 version_t last_requested = 0; // last object requested by primary
157
158 //
159 private:
160 mutable __u16 indexed_data = 0;
161 /**
162 * rollback_info_trimmed_to_riter points to the first log entry <=
163 * rollback_info_trimmed_to
164 *
165 * It's a reverse_iterator because rend() is a natural representation for
166 * tail, and rbegin() works nicely for head.
167 */
31f18b77 168 mempool::osd_pglog::list<pg_log_entry_t>::reverse_iterator
7c673cae
FG
169 rollback_info_trimmed_to_riter;
170
eafe8130
TL
171 /*
172 * return true if we need to mark the pglog as dirty
173 */
7c673cae 174 template <typename F>
eafe8130
TL
175 bool advance_can_rollback_to(eversion_t to, F &&f) {
176 bool dirty_log = to > can_rollback_to || to > rollback_info_trimmed_to;
177 if (dirty_log) {
178 if (to > can_rollback_to)
179 can_rollback_to = to;
180
181 if (to > rollback_info_trimmed_to)
182 rollback_info_trimmed_to = to;
183 }
7c673cae
FG
184
185 while (rollback_info_trimmed_to_riter != log.rbegin()) {
186 --rollback_info_trimmed_to_riter;
187 if (rollback_info_trimmed_to_riter->version > rollback_info_trimmed_to) {
188 ++rollback_info_trimmed_to_riter;
189 break;
190 }
191 f(*rollback_info_trimmed_to_riter);
192 }
eafe8130
TL
193
194 return dirty_log;
7c673cae
FG
195 }
196
197 void reset_rollback_info_trimmed_to_riter() {
198 rollback_info_trimmed_to_riter = log.rbegin();
199 while (rollback_info_trimmed_to_riter != log.rend() &&
200 rollback_info_trimmed_to_riter->version > rollback_info_trimmed_to)
201 ++rollback_info_trimmed_to_riter;
202 }
203
204 // indexes objects, caller ops and extra caller ops
205 public:
206 IndexedLog() :
207 complete_to(log.end()),
208 last_requested(0),
209 indexed_data(0),
210 rollback_info_trimmed_to_riter(log.rbegin())
c07f9fc5 211 { }
7c673cae
FG
212
213 template <typename... Args>
11fdf7f2 214 explicit IndexedLog(Args&&... args) :
7c673cae
FG
215 pg_log_t(std::forward<Args>(args)...),
216 complete_to(log.end()),
217 last_requested(0),
218 indexed_data(0),
c07f9fc5
FG
219 rollback_info_trimmed_to_riter(log.rbegin())
220 {
7c673cae
FG
221 reset_rollback_info_trimmed_to_riter();
222 index();
223 }
224
225 IndexedLog(const IndexedLog &rhs) :
226 pg_log_t(rhs),
227 complete_to(log.end()),
228 last_requested(rhs.last_requested),
229 indexed_data(0),
c07f9fc5
FG
230 rollback_info_trimmed_to_riter(log.rbegin())
231 {
7c673cae
FG
232 reset_rollback_info_trimmed_to_riter();
233 index(rhs.indexed_data);
234 }
c07f9fc5 235
7c673cae
FG
236 IndexedLog &operator=(const IndexedLog &rhs) {
237 this->~IndexedLog();
238 new (this) IndexedLog(rhs);
239 return *this;
240 }
241
242 void trim_rollback_info_to(eversion_t to, LogEntryHandler *h) {
243 advance_can_rollback_to(
244 to,
245 [&](pg_log_entry_t &entry) {
246 h->trim(entry);
247 });
248 }
eafe8130
TL
249 bool roll_forward_to(eversion_t to, LogEntryHandler *h) {
250 return advance_can_rollback_to(
7c673cae
FG
251 to,
252 [&](pg_log_entry_t &entry) {
253 h->rollforward(entry);
254 });
255 }
256
257 void skip_can_rollback_to_to_head() {
258 advance_can_rollback_to(head, [&](const pg_log_entry_t &entry) {});
259 }
260
31f18b77 261 mempool::osd_pglog::list<pg_log_entry_t> rewind_from_head(eversion_t newhead) {
7c673cae
FG
262 auto divergent = pg_log_t::rewind_from_head(newhead);
263 index();
264 reset_rollback_info_trimmed_to_riter();
265 return divergent;
266 }
267
268 template <typename T>
269 void scan_log_after(
270 const eversion_t &bound, ///< [in] scan entries > bound
271 T &&f) const {
272 auto iter = log.rbegin();
273 while (iter != log.rend() && iter->version > bound)
274 ++iter;
275
276 while (true) {
277 if (iter == log.rbegin())
278 break;
279 f(*(--iter));
280 }
281 }
282
283 /****/
284 void claim_log_and_clear_rollback_info(const pg_log_t& o) {
285 // we must have already trimmed the old entries
11fdf7f2
TL
286 ceph_assert(rollback_info_trimmed_to == head);
287 ceph_assert(rollback_info_trimmed_to_riter == log.rbegin());
7c673cae
FG
288
289 *this = IndexedLog(o);
290
291 skip_can_rollback_to_to_head();
292 index();
293 }
294
295 void split_out_child(
296 pg_t child_pgid,
297 unsigned split_bits,
298 IndexedLog *target);
299
300 void zero() {
301 // we must have already trimmed the old entries
11fdf7f2
TL
302 ceph_assert(rollback_info_trimmed_to == head);
303 ceph_assert(rollback_info_trimmed_to_riter == log.rbegin());
7c673cae
FG
304
305 unindex();
306 pg_log_t::clear();
307 rollback_info_trimmed_to_riter = log.rbegin();
308 reset_recovery_pointers();
309 }
310 void clear() {
311 skip_can_rollback_to_to_head();
312 zero();
313 }
314 void reset_recovery_pointers() {
315 complete_to = log.end();
316 last_requested = 0;
317 }
318
319 bool logged_object(const hobject_t& oid) const {
320 if (!(indexed_data & PGLOG_INDEXED_OBJECTS)) {
321 index_objects();
322 }
323 return objects.count(oid);
324 }
325
326 bool logged_req(const osd_reqid_t &r) const {
327 if (!(indexed_data & PGLOG_INDEXED_CALLER_OPS)) {
328 index_caller_ops();
329 }
330 if (!caller_ops.count(r)) {
331 if (!(indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS)) {
332 index_extra_caller_ops();
333 }
334 return extra_caller_ops.count(r);
335 }
336 return true;
337 }
338
339 bool get_request(
340 const osd_reqid_t &r,
341 eversion_t *version,
342 version_t *user_version,
9f95a23c 343 int *return_code,
f67539c2 344 std::vector<pg_log_op_return_item_t> *op_returns) const
c07f9fc5 345 {
11fdf7f2
TL
346 ceph_assert(version);
347 ceph_assert(user_version);
348 ceph_assert(return_code);
7c673cae
FG
349 if (!(indexed_data & PGLOG_INDEXED_CALLER_OPS)) {
350 index_caller_ops();
351 }
f67539c2 352 auto p = caller_ops.find(r);
7c673cae
FG
353 if (p != caller_ops.end()) {
354 *version = p->second->version;
355 *user_version = p->second->user_version;
356 *return_code = p->second->return_code;
9f95a23c 357 *op_returns = p->second->op_returns;
7c673cae
FG
358 return true;
359 }
360
361 // warning: we will return *a* request for this reqid, but not
362 // necessarily the most recent.
363 if (!(indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS)) {
364 index_extra_caller_ops();
365 }
366 p = extra_caller_ops.find(r);
367 if (p != extra_caller_ops.end()) {
11fdf7f2 368 uint32_t idx = 0;
31f18b77 369 for (auto i = p->second->extra_reqids.begin();
7c673cae 370 i != p->second->extra_reqids.end();
11fdf7f2 371 ++idx, ++i) {
7c673cae
FG
372 if (i->first == r) {
373 *version = p->second->version;
374 *user_version = i->second;
375 *return_code = p->second->return_code;
9f95a23c 376 *op_returns = p->second->op_returns;
11fdf7f2
TL
377 if (*return_code >= 0) {
378 auto it = p->second->extra_reqid_return_codes.find(idx);
379 if (it != p->second->extra_reqid_return_codes.end()) {
380 *return_code = it->second;
381 }
382 }
7c673cae
FG
383 return true;
384 }
385 }
11fdf7f2 386 ceph_abort_msg("in extra_caller_ops but not extra_reqids");
7c673cae 387 }
c07f9fc5
FG
388
389 if (!(indexed_data & PGLOG_INDEXED_DUPS)) {
390 index_dups();
391 }
392 auto q = dup_index.find(r);
393 if (q != dup_index.end()) {
394 *version = q->second->version;
395 *user_version = q->second->user_version;
396 *return_code = q->second->return_code;
9f95a23c 397 *op_returns = q->second->op_returns;
c07f9fc5
FG
398 return true;
399 }
400
7c673cae
FG
401 return false;
402 }
403
9f95a23c
TL
404 bool has_write_since(const hobject_t &oid, const eversion_t &bound) const {
405 for (auto i = log.rbegin(); i != log.rend(); ++i) {
406 if (i->version <= bound)
407 return false;
408 if (i->soid.get_head() == oid.get_head())
409 return true;
410 }
411 return false;
412 }
413
f67539c2 414 /// get a (bounded) std::list of recent reqids for the given object
7c673cae 415 void get_object_reqids(const hobject_t& oid, unsigned max,
f67539c2 416 mempool::osd_pglog::vector<std::pair<osd_reqid_t, version_t> > *pls,
11fdf7f2 417 mempool::osd_pglog::map<uint32_t, int> *return_codes) const {
7c673cae
FG
418 // make sure object is present at least once before we do an
419 // O(n) search.
420 if (!(indexed_data & PGLOG_INDEXED_OBJECTS)) {
421 index_objects();
422 }
423 if (objects.count(oid) == 0)
424 return;
11fdf7f2 425
f67539c2 426 for (auto i = log.rbegin(); i != log.rend(); ++i) {
7c673cae 427 if (i->soid == oid) {
11fdf7f2
TL
428 if (i->reqid_is_indexed()) {
429 if (i->op == pg_log_entry_t::ERROR) {
430 // propagate op errors to the cache tier's PG log
431 return_codes->emplace(pls->size(), i->return_code);
432 }
f67539c2 433 pls->push_back(std::make_pair(i->reqid, i->user_version));
11fdf7f2
TL
434 }
435
7c673cae
FG
436 pls->insert(pls->end(), i->extra_reqids.begin(), i->extra_reqids.end());
437 if (pls->size() >= max) {
438 if (pls->size() > max) {
439 pls->resize(max);
440 }
441 return;
442 }
443 }
444 }
445 }
c07f9fc5 446
7c673cae 447 void index(__u16 to_index = PGLOG_INDEXED_ALL) const {
c07f9fc5
FG
448 // if to_index is 0, no need to run any of this code, especially
449 // loop below; this can happen with copy constructor for
450 // IndexedLog (and indirectly through assignment operator)
451 if (!to_index) return;
452
7c673cae
FG
453 if (to_index & PGLOG_INDEXED_OBJECTS)
454 objects.clear();
455 if (to_index & PGLOG_INDEXED_CALLER_OPS)
456 caller_ops.clear();
457 if (to_index & PGLOG_INDEXED_EXTRA_CALLER_OPS)
458 extra_caller_ops.clear();
c07f9fc5
FG
459 if (to_index & PGLOG_INDEXED_DUPS) {
460 dup_index.clear();
461 for (auto& i : dups) {
462 dup_index[i.reqid] = const_cast<pg_log_dup_t*>(&i);
463 }
464 }
7c673cae 465
c07f9fc5
FG
466 constexpr __u16 any_log_entry_index =
467 PGLOG_INDEXED_OBJECTS |
468 PGLOG_INDEXED_CALLER_OPS |
469 PGLOG_INDEXED_EXTRA_CALLER_OPS;
470
471 if (to_index & any_log_entry_index) {
f67539c2 472 for (auto i = log.begin(); i != log.end(); ++i) {
c07f9fc5
FG
473 if (to_index & PGLOG_INDEXED_OBJECTS) {
474 if (i->object_is_indexed()) {
475 objects[i->soid] = const_cast<pg_log_entry_t*>(&(*i));
476 }
7c673cae 477 }
7c673cae 478
c07f9fc5
FG
479 if (to_index & PGLOG_INDEXED_CALLER_OPS) {
480 if (i->reqid_is_indexed()) {
481 caller_ops[i->reqid] = const_cast<pg_log_entry_t*>(&(*i));
482 }
7c673cae 483 }
c07f9fc5
FG
484
485 if (to_index & PGLOG_INDEXED_EXTRA_CALLER_OPS) {
486 for (auto j = i->extra_reqids.begin();
487 j != i->extra_reqids.end();
488 ++j) {
489 extra_caller_ops.insert(
f67539c2 490 std::make_pair(j->first, const_cast<pg_log_entry_t*>(&(*i))));
c07f9fc5 491 }
7c673cae
FG
492 }
493 }
494 }
c07f9fc5 495
7c673cae
FG
496 indexed_data |= to_index;
497 }
498
499 void index_objects() const {
500 index(PGLOG_INDEXED_OBJECTS);
501 }
502
503 void index_caller_ops() const {
504 index(PGLOG_INDEXED_CALLER_OPS);
505 }
506
507 void index_extra_caller_ops() const {
508 index(PGLOG_INDEXED_EXTRA_CALLER_OPS);
509 }
510
c07f9fc5
FG
511 void index_dups() const {
512 index(PGLOG_INDEXED_DUPS);
513 }
514
7c673cae
FG
515 void index(pg_log_entry_t& e) {
516 if ((indexed_data & PGLOG_INDEXED_OBJECTS) && e.object_is_indexed()) {
517 if (objects.count(e.soid) == 0 ||
518 objects[e.soid]->version < e.version)
519 objects[e.soid] = &e;
520 }
521 if (indexed_data & PGLOG_INDEXED_CALLER_OPS) {
522 // divergent merge_log indexes new before unindexing old
523 if (e.reqid_is_indexed()) {
524 caller_ops[e.reqid] = &e;
525 }
526 }
527 if (indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS) {
31f18b77 528 for (auto j = e.extra_reqids.begin();
7c673cae
FG
529 j != e.extra_reqids.end();
530 ++j) {
f67539c2 531 extra_caller_ops.insert(std::make_pair(j->first, &e));
7c673cae
FG
532 }
533 }
534 }
c07f9fc5 535
7c673cae
FG
536 void unindex() {
537 objects.clear();
538 caller_ops.clear();
539 extra_caller_ops.clear();
c07f9fc5 540 dup_index.clear();
7c673cae
FG
541 indexed_data = 0;
542 }
c07f9fc5
FG
543
544 void unindex(const pg_log_entry_t& e) {
7c673cae
FG
545 // NOTE: this only works if we remove from the _tail_ of the log!
546 if (indexed_data & PGLOG_INDEXED_OBJECTS) {
11fdf7f2
TL
547 auto it = objects.find(e.soid);
548 if (it != objects.end() && it->second->version == e.version)
549 objects.erase(it);
7c673cae
FG
550 }
551 if (e.reqid_is_indexed()) {
552 if (indexed_data & PGLOG_INDEXED_CALLER_OPS) {
11fdf7f2 553 auto it = caller_ops.find(e.reqid);
7c673cae 554 // divergent merge_log indexes new before unindexing old
11fdf7f2
TL
555 if (it != caller_ops.end() && it->second == &e)
556 caller_ops.erase(it);
7c673cae
FG
557 }
558 }
559 if (indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS) {
31f18b77 560 for (auto j = e.extra_reqids.begin();
7c673cae
FG
561 j != e.extra_reqids.end();
562 ++j) {
f67539c2 563 for (auto k = extra_caller_ops.find(j->first);
7c673cae
FG
564 k != extra_caller_ops.end() && k->first == j->first;
565 ++k) {
566 if (k->second == &e) {
567 extra_caller_ops.erase(k);
568 break;
569 }
570 }
571 }
572 }
573 }
574
c07f9fc5 575 void index(pg_log_dup_t& e) {
d2e6a577 576 if (indexed_data & PGLOG_INDEXED_DUPS) {
c07f9fc5
FG
577 dup_index[e.reqid] = &e;
578 }
579 }
580
581 void unindex(const pg_log_dup_t& e) {
d2e6a577 582 if (indexed_data & PGLOG_INDEXED_DUPS) {
c07f9fc5
FG
583 auto i = dup_index.find(e.reqid);
584 if (i != dup_index.end()) {
585 dup_index.erase(i);
586 }
587 }
588 }
589
7c673cae
FG
590 // actors
591 void add(const pg_log_entry_t& e, bool applied = true) {
592 if (!applied) {
11fdf7f2 593 ceph_assert(get_can_rollback_to() == head);
7c673cae
FG
594 }
595
31f18b77
FG
596 // make sure our buffers don't pin bigger buffers
597 e.mod_desc.trim_bl();
598
7c673cae
FG
599 // add to log
600 log.push_back(e);
601
602 // riter previously pointed to the previous entry
603 if (rollback_info_trimmed_to_riter == log.rbegin())
604 ++rollback_info_trimmed_to_riter;
605
11fdf7f2
TL
606 ceph_assert(e.version > head);
607 ceph_assert(head.version == 0 || e.version.version > head.version);
7c673cae
FG
608 head = e.version;
609
610 // to our index
611 if ((indexed_data & PGLOG_INDEXED_OBJECTS) && e.object_is_indexed()) {
612 objects[e.soid] = &(log.back());
613 }
614 if (indexed_data & PGLOG_INDEXED_CALLER_OPS) {
615 if (e.reqid_is_indexed()) {
616 caller_ops[e.reqid] = &(log.back());
617 }
618 }
c07f9fc5 619
7c673cae 620 if (indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS) {
31f18b77 621 for (auto j = e.extra_reqids.begin();
7c673cae
FG
622 j != e.extra_reqids.end();
623 ++j) {
f67539c2 624 extra_caller_ops.insert(std::make_pair(j->first, &(log.back())));
7c673cae
FG
625 }
626 }
627
628 if (!applied) {
629 skip_can_rollback_to_to_head();
630 }
c07f9fc5 631 } // add
7c673cae
FG
632
633 void trim(
634 CephContext* cct,
635 eversion_t s,
f67539c2
TL
636 std::set<eversion_t> *trimmed,
637 std::set<std::string>* trimmed_dups,
181888fb 638 eversion_t *write_from_dups);
7c673cae 639
f67539c2 640 std::ostream& print(std::ostream& out) const;
c07f9fc5 641 }; // IndexedLog
7c673cae
FG
642
643
644protected:
645 //////////////////// data members ////////////////////
646
// State owned by PGLog: the missing set, the indexed log, and the
// bookkeeping that is_dirty()/undirty() use to track what needs writing.
647 pg_missing_tracker_t missing;
648 IndexedLog log;
649
650 eversion_t dirty_to; ///< must clear/writeout all keys <= dirty_to
651 eversion_t dirty_from; ///< must clear/writeout all keys >= dirty_from
652 eversion_t writeout_from; ///< must write out keys >= writeout_from
f67539c2 653 std::set<eversion_t> trimmed; ///< must clear keys in trimmed
181888fb
FG
654 eversion_t dirty_to_dups; ///< must clear/writeout all dups <= dirty_to_dups
655 eversion_t dirty_from_dups; ///< must clear/writeout all dups >= dirty_from_dups
656 eversion_t write_from_dups; ///< must write keys >= write_from_dups
f67539c2 657 std::set<std::string> trimmed_dups; ///< must clear keys in trimmed_dups
7c673cae
FG
658 CephContext *cct;
659 bool pg_log_debug;
660 /// Log is clean on [dirty_to, dirty_from)
// touched_log: set to true by undirty(), to false by mark_log_for_rewrite();
// needs_write() returns true while it is false.
661 bool touched_log;
// dirty_log: set by roll_forward_to() when a rollback boundary moved.
eafe8130 662 bool dirty_log;
7c673cae 663 bool clear_divergent_priors;
// Set by set_missing_may_contain_deletes(); counts as dirty in is_dirty().
9f95a23c 664 bool may_include_deletes_in_missing_dirty = false;
7c673cae
FG
665
666 void mark_dirty_to(eversion_t to) {
667 if (to > dirty_to)
668 dirty_to = to;
669 }
670 void mark_dirty_from(eversion_t from) {
671 if (from < dirty_from)
672 dirty_from = from;
673 }
674 void mark_writeout_from(eversion_t from) {
675 if (from < writeout_from)
676 writeout_from = from;
677 }
181888fb
FG
678 void mark_dirty_to_dups(eversion_t to) {
679 if (to > dirty_to_dups)
680 dirty_to_dups = to;
681 }
682 void mark_dirty_from_dups(eversion_t from) {
683 if (from < dirty_from_dups)
684 dirty_from_dups = from;
685 }
7c673cae 686public:
9f95a23c
TL
687 bool needs_write() const {
688 return !touched_log || is_dirty();
689 }
690
7c673cae 691 bool is_dirty() const {
9f95a23c 692 return dirty_log ||
7c673cae
FG
693 (dirty_to != eversion_t()) ||
694 (dirty_from != eversion_t::max()) ||
695 (writeout_from != eversion_t::max()) ||
696 !(trimmed.empty()) ||
c07f9fc5
FG
697 !missing.is_clean() ||
698 !(trimmed_dups.empty()) ||
181888fb
FG
699 (dirty_to_dups != eversion_t()) ||
700 (dirty_from_dups != eversion_t::max()) ||
701 (write_from_dups != eversion_t::max()) ||
9f95a23c 702 may_include_deletes_in_missing_dirty;
7c673cae 703 }
9f95a23c 704
7c673cae
FG
705 void mark_log_for_rewrite() {
706 mark_dirty_to(eversion_t::max());
707 mark_dirty_from(eversion_t());
181888fb
FG
708 mark_dirty_to_dups(eversion_t::max());
709 mark_dirty_from_dups(eversion_t());
7c673cae
FG
710 touched_log = false;
711 }
9f95a23c
TL
712 bool get_may_include_deletes_in_missing_dirty() const {
713 return may_include_deletes_in_missing_dirty;
c07f9fc5 714 }
7c673cae
FG
715protected:
716
717 /// DEBUG
f67539c2
TL
718 std::set<std::string> log_keys_debug;
/// Erase every key >= lb from *log_keys_debug. A null set pointer is a no-op.
static void clear_after(std::set<std::string> *log_keys_debug, const std::string &lb) {
  if (log_keys_debug == nullptr)
    return;
  log_keys_debug->erase(log_keys_debug->lower_bound(lb),
                        log_keys_debug->end());
}
/// Erase every key < ub from *log_keys_debug. A null set pointer is a no-op.
static void clear_up_to(std::set<std::string> *log_keys_debug, const std::string &ub) {
  if (log_keys_debug == nullptr)
    return;
  log_keys_debug->erase(log_keys_debug->begin(),
                        log_keys_debug->lower_bound(ub));
}
733
734 void check();
735 void undirty() {
736 dirty_to = eversion_t();
737 dirty_from = eversion_t::max();
738 touched_log = true;
eafe8130 739 dirty_log = false;
7c673cae 740 trimmed.clear();
c07f9fc5 741 trimmed_dups.clear();
7c673cae
FG
742 writeout_from = eversion_t::max();
743 check();
744 missing.flush();
181888fb
FG
745 dirty_to_dups = eversion_t();
746 dirty_from_dups = eversion_t::max();
747 write_from_dups = eversion_t::max();
7c673cae
FG
748 }
749public:
c07f9fc5 750
7c673cae 751 // cppcheck-suppress noExplicitConstructor
11fdf7f2 752 PGLog(CephContext *cct) :
7c673cae
FG
753 dirty_from(eversion_t::max()),
754 writeout_from(eversion_t::max()),
181888fb
FG
755 dirty_from_dups(eversion_t::max()),
756 write_from_dups(eversion_t::max()),
7c673cae
FG
757 cct(cct),
758 pg_log_debug(!(cct && !(cct->_conf->osd_debug_pg_log_writeout))),
759 touched_log(false),
eafe8130 760 dirty_log(false),
181888fb 761 clear_divergent_priors(false)
c07f9fc5 762 { }
7c673cae
FG
763
764 void reset_backfill();
765
766 void clear();
767
f67539c2 768 //////////////////// get or set missing ////////////////////
7c673cae
FG
769
770 const pg_missing_tracker_t& get_missing() const { return missing; }
11fdf7f2
TL
771
772 void missing_add(const hobject_t& oid, eversion_t need, eversion_t have, bool is_delete=false) {
773 missing.add(oid, need, have, is_delete);
7c673cae
FG
774 }
775
11fdf7f2
TL
776 void missing_add_next_entry(const pg_log_entry_t& e) {
777 missing.add_next_event(e);
7c673cae
FG
778 }
779
f67539c2 780 //////////////////// get or set log ////////////////////
7c673cae
FG
781
782 const IndexedLog &get_log() const { return log; }
783
784 const eversion_t &get_tail() const { return log.tail; }
785
786 void set_tail(eversion_t tail) { log.tail = tail; }
787
788 const eversion_t &get_head() const { return log.head; }
789
790 void set_head(eversion_t head) { log.head = head; }
791
792 void set_last_requested(version_t last_requested) {
793 log.last_requested = last_requested;
794 }
795
796 void index() { log.index(); }
797
798 void unindex() { log.unindex(); }
799
800 void add(const pg_log_entry_t& e, bool applied = true) {
801 mark_writeout_from(e.version);
802 log.add(e, applied);
803 }
804
805 void reset_recovery_pointers() { log.reset_recovery_pointers(); }
806
807 static void clear_info_log(
808 spg_t pgid,
809 ObjectStore::Transaction *t);
810
811 void trim(
812 eversion_t trim_to,
f64942e4 813 pg_info_t &info,
11fdf7f2
TL
814 bool transaction_applied = true,
815 bool async = false);
7c673cae
FG
816
817 void roll_forward_to(
818 eversion_t roll_forward_to,
819 LogEntryHandler *h) {
eafe8130
TL
820 if (log.roll_forward_to(
821 roll_forward_to,
822 h))
823 dirty_log = true;
7c673cae
FG
824 }
825
826 eversion_t get_can_rollback_to() const {
827 return log.get_can_rollback_to();
828 }
829
830 void roll_forward(LogEntryHandler *h) {
831 roll_forward_to(
832 log.head,
833 h);
834 }
835
81eedcae
TL
836 void skip_rollforward() {
837 log.skip_can_rollback_to_to_head();
838 }
839
f67539c2 840 //////////////////// get or set log & missing ////////////////////
7c673cae
FG
841
842 void reset_backfill_claim_log(const pg_log_t &o, LogEntryHandler *h) {
843 log.trim_rollback_info_to(log.head, h);
844 log.claim_log_and_clear_rollback_info(o);
845 missing.clear();
846 mark_dirty_to(eversion_t::max());
181888fb 847 mark_dirty_to_dups(eversion_t::max());
7c673cae
FG
848 }
849
850 void split_into(
851 pg_t child_pgid,
852 unsigned split_bits,
c07f9fc5 853 PGLog *opg_log) {
7c673cae
FG
854 log.split_out_child(child_pgid, split_bits, &opg_log->log);
855 missing.split_into(child_pgid, split_bits, &(opg_log->missing));
856 opg_log->mark_dirty_to(eversion_t::max());
181888fb 857 opg_log->mark_dirty_to_dups(eversion_t::max());
7c673cae 858 mark_dirty_to(eversion_t::max());
181888fb 859 mark_dirty_to_dups(eversion_t::max());
9f95a23c
TL
860 if (missing.may_include_deletes) {
861 opg_log->set_missing_may_contain_deletes();
862 }
7c673cae
FG
863 }
864
11fdf7f2 865 void merge_from(
f67539c2 866 const std::vector<PGLog*>& sources,
11fdf7f2
TL
867 eversion_t last_update) {
868 unindex();
869 missing.clear();
870
f67539c2 871 std::vector<pg_log_t*> slogs;
11fdf7f2
TL
872 for (auto s : sources) {
873 slogs.push_back(&s->log);
874 }
875 log.merge_from(slogs, last_update);
876
877 index();
878
879 mark_log_for_rewrite();
880 }
881
7c673cae
FG
882 void recover_got(hobject_t oid, eversion_t v, pg_info_t &info) {
883 if (missing.is_missing(oid, v)) {
884 missing.got(oid, v);
11fdf7f2 885 info.stats.stats.sum.num_objects_missing = missing.num_missing();
c07f9fc5 886
7c673cae
FG
887 // raise last_complete?
888 if (missing.get_items().empty()) {
889 log.complete_to = log.log.end();
890 info.last_complete = info.last_update;
891 }
11fdf7f2 892 auto oldest_need = missing.get_oldest_need();
7c673cae 893 while (log.complete_to != log.log.end()) {
11fdf7f2 894 if (oldest_need <= log.complete_to->version)
7c673cae
FG
895 break;
896 if (info.last_complete < log.complete_to->version)
897 info.last_complete = log.complete_to->version;
898 ++log.complete_to;
899 }
900 }
901
11fdf7f2 902 ceph_assert(log.get_can_rollback_to() >= v);
7c673cae
FG
903 }
904
c07f9fc5 905 void reset_complete_to(pg_info_t *info) {
81eedcae
TL
906 if (log.log.empty()) // caller is split_into()
907 return;
7c673cae 908 log.complete_to = log.log.begin();
11fdf7f2
TL
909 ceph_assert(log.complete_to != log.log.end());
910 auto oldest_need = missing.get_oldest_need();
911 if (oldest_need != eversion_t()) {
912 while (log.complete_to->version < oldest_need) {
913 ++log.complete_to;
914 ceph_assert(log.complete_to != log.log.end());
915 }
c07f9fc5 916 }
11fdf7f2
TL
917 if (!info)
918 return;
7c673cae 919 if (log.complete_to == log.log.begin()) {
11fdf7f2 920 info->last_complete = eversion_t();
7c673cae
FG
921 } else {
922 --log.complete_to;
11fdf7f2 923 info->last_complete = log.complete_to->version;
7c673cae
FG
924 ++log.complete_to;
925 }
c07f9fc5
FG
926 }
927
928 void activate_not_complete(pg_info_t &info) {
929 reset_complete_to(&info);
7c673cae
FG
930 log.last_requested = 0;
931 }
932
933 void proc_replica_log(pg_info_t &oinfo,
934 const pg_log_t &olog,
935 pg_missing_t& omissing, pg_shard_t from) const;
936
9f95a23c
TL
937 void set_missing_may_contain_deletes() {
938 missing.may_include_deletes = true;
939 may_include_deletes_in_missing_dirty = true;
940 }
941
c07f9fc5 942 void rebuild_missing_set_with_deletes(ObjectStore *store,
11fdf7f2 943 ObjectStore::CollectionHandle& ch,
c07f9fc5
FG
944 const pg_info_t &info);
945
7c673cae
FG
946protected:
947 static void split_by_object(
31f18b77 948 mempool::osd_pglog::list<pg_log_entry_t> &entries,
f67539c2 949 std::map<hobject_t, mempool::osd_pglog::list<pg_log_entry_t>> *out_entries) {
7c673cae 950 while (!entries.empty()) {
31f18b77 951 auto &out_list = (*out_entries)[entries.front().soid];
7c673cae
FG
952 out_list.splice(out_list.end(), entries, entries.begin());
953 }
954 }
955
956 /**
957 * _merge_object_divergent_entries
958 *
959 * There are 5 distinct cases:
960 * 1) There is a more recent update: in this case we assume we adjusted the
961 * store and missing during merge_log
962 * 2) The first entry in the divergent sequence is a create. This might
963 * either be because the object is a clone or because prior_version is
964 * eversion_t(). In this case the object does not exist and we must
965 * adjust missing and the store to match.
966 * 3) We are currently missing the object. In this case, we adjust the
967 * missing to our prior_version taking care to add a divergent_prior
968 * if necessary
969 * 4) We can rollback all of the entries. In this case, we do so using
970 * the rollbacker and return -- the object does not go into missing.
971 * 5) We cannot rollback at least 1 of the entries. In this case, we
972 * clear the object out of the store and add a missing entry at
973 * prior_version taking care to add a divergent_prior if
974 * necessary.
975 */
976 template <typename missing_type>
977 static void _merge_object_divergent_entries(
978 const IndexedLog &log, ///< [in] log to merge against
979 const hobject_t &hoid, ///< [in] object we are merging
31f18b77 980 const mempool::osd_pglog::list<pg_log_entry_t> &orig_entries, ///< [in] entries for hoid to merge
7c673cae 981 const pg_info_t &info, ///< [in] info for merging entries
eafe8130 982 eversion_t olog_can_rollback_to, ///< [in] rollback boundary of input InedexedLog
c07f9fc5 983 missing_type &missing, ///< [in,out] missing to adjust, use
7c673cae
FG
984 LogEntryHandler *rollbacker, ///< [in] optional rollbacker object
985 const DoutPrefixProvider *dpp ///< [in] logging provider
986 ) {
987 ldpp_dout(dpp, 20) << __func__ << ": merging hoid " << hoid
31f18b77 988 << " entries: " << orig_entries << dendl;
7c673cae
FG
989
990 if (hoid > info.last_backfill) {
991 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid << " after last_backfill"
992 << dendl;
993 return;
994 }
995
996 // entries is non-empty
11fdf7f2 997 ceph_assert(!orig_entries.empty());
31f18b77
FG
998 // strip out and ignore ERROR entries
999 mempool::osd_pglog::list<pg_log_entry_t> entries;
7c673cae 1000 eversion_t last;
d2e6a577 1001 bool seen_non_error = false;
f67539c2 1002 for (auto i = orig_entries.begin();
31f18b77 1003 i != orig_entries.end();
7c673cae
FG
1004 ++i) {
1005 // all entries are on hoid
11fdf7f2 1006 ceph_assert(i->soid == hoid);
d2e6a577
FG
1007 // did not see error entries before this entry and this entry is not error
1008 // then this entry is the first non error entry
1009 bool first_non_error = ! seen_non_error && ! i->is_error();
1010 if (! i->is_error() ) {
1011 // see a non error entry now
1012 seen_non_error = true;
1013 }
1014
1015 // No need to check the first entry since it prior_version is unavailable
f67539c2 1016 // in the std::list
d2e6a577
FG
1017 // No need to check if the prior_version is the minimal version
1018 // No need to check the first non-error entry since the leading error
1019 // entries are not its prior version
1020 if (i != orig_entries.begin() && i->prior_version != eversion_t() &&
1021 ! first_non_error) {
7c673cae 1022 // in increasing order of version
11fdf7f2 1023 ceph_assert(i->version > last);
31f18b77 1024 // prior_version correct (unless it is an ERROR entry)
11fdf7f2 1025 ceph_assert(i->prior_version == last || i->is_error());
7c673cae 1026 }
31f18b77
FG
1027 if (i->is_error()) {
1028 ldpp_dout(dpp, 20) << __func__ << ": ignoring " << *i << dendl;
1029 } else {
1030 ldpp_dout(dpp, 20) << __func__ << ": keeping " << *i << dendl;
1031 entries.push_back(*i);
d2e6a577 1032 last = i->version;
31f18b77
FG
1033 }
1034 }
1035 if (entries.empty()) {
1036 ldpp_dout(dpp, 10) << __func__ << ": no non-ERROR entries" << dendl;
1037 return;
7c673cae
FG
1038 }
1039
1040 const eversion_t prior_version = entries.begin()->prior_version;
1041 const eversion_t first_divergent_update = entries.begin()->version;
1042 const eversion_t last_divergent_update = entries.rbegin()->version;
1043 const bool object_not_in_store =
1044 !missing.is_missing(hoid) &&
1045 entries.rbegin()->is_delete();
81eedcae
TL
1046 ldpp_dout(dpp, 10) << __func__ << ": hoid " << " object_not_in_store: "
1047 << object_not_in_store << dendl;
7c673cae
FG
1048 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
1049 << " prior_version: " << prior_version
1050 << " first_divergent_update: " << first_divergent_update
1051 << " last_divergent_update: " << last_divergent_update
1052 << dendl;
1053
f67539c2 1054 auto objiter = log.objects.find(hoid);
7c673cae
FG
1055 if (objiter != log.objects.end() &&
1056 objiter->second->version >= first_divergent_update) {
1057 /// Case 1)
1058 ldpp_dout(dpp, 10) << __func__ << ": more recent entry found: "
1059 << *objiter->second << ", already merged" << dendl;
1060
11fdf7f2 1061 ceph_assert(objiter->second->version > last_divergent_update);
7c673cae
FG
1062
1063 // ensure missing has been updated appropriately
c07f9fc5
FG
1064 if (objiter->second->is_update() ||
1065 (missing.may_include_deletes && objiter->second->is_delete())) {
11fdf7f2 1066 ceph_assert(missing.is_missing(hoid) &&
7c673cae
FG
1067 missing.get_items().at(hoid).need == objiter->second->version);
1068 } else {
11fdf7f2 1069 ceph_assert(!missing.is_missing(hoid));
7c673cae
FG
1070 }
1071 missing.revise_have(hoid, eversion_t());
9f95a23c 1072 missing.mark_fully_dirty(hoid);
7c673cae
FG
1073 if (rollbacker) {
1074 if (!object_not_in_store) {
1075 rollbacker->remove(hoid);
1076 }
1077 for (auto &&i: entries) {
1078 rollbacker->trim(i);
1079 }
1080 }
1081 return;
1082 }
1083
1084 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
1085 <<" has no more recent entries in log" << dendl;
1086 if (prior_version == eversion_t() || entries.front().is_clone()) {
1087 /// Case 2)
1088 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
1089 << " prior_version or op type indicates creation,"
1090 << " deleting"
1091 << dendl;
1092 if (missing.is_missing(hoid))
1093 missing.rm(missing.get_items().find(hoid));
1094 if (rollbacker) {
1095 if (!object_not_in_store) {
1096 rollbacker->remove(hoid);
1097 }
1098 for (auto &&i: entries) {
1099 rollbacker->trim(i);
1100 }
1101 }
1102 return;
1103 }
1104
1105 if (missing.is_missing(hoid)) {
1106 /// Case 3)
1107 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
1108 << " missing, " << missing.get_items().at(hoid)
1109 << " adjusting" << dendl;
1110
1111 if (missing.get_items().at(hoid).have == prior_version) {
1112 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
1113 << " missing.have is prior_version " << prior_version
1114 << " removing from missing" << dendl;
1115 missing.rm(missing.get_items().find(hoid));
1116 } else {
1117 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
1118 << " missing.have is " << missing.get_items().at(hoid).have
1119 << ", adjusting" << dendl;
c07f9fc5 1120 missing.revise_need(hoid, prior_version, false);
7c673cae
FG
1121 if (prior_version <= info.log_tail) {
1122 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
1123 << " prior_version " << prior_version
1124 << " <= info.log_tail "
1125 << info.log_tail << dendl;
1126 }
1127 }
1128 if (rollbacker) {
1129 for (auto &&i: entries) {
1130 rollbacker->trim(i);
1131 }
1132 }
1133 return;
1134 }
1135
1136 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
1137 << " must be rolled back or recovered,"
1138 << " attempting to rollback"
1139 << dendl;
1140 bool can_rollback = true;
11fdf7f2
TL
1141 // We are going to make an important decision based on the
1142 // olog_can_rollback_to value we have received, better known it.
1143 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
1144 << " olog_can_rollback_to: "
1145 << olog_can_rollback_to << dendl;
7c673cae 1146 /// Distinguish between 4) and 5)
f67539c2 1147 for (auto i = entries.rbegin(); i != entries.rend(); ++i) {
eafe8130 1148 if (!i->can_rollback() || i->version <= olog_can_rollback_to) {
7c673cae
FG
1149 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid << " cannot rollback "
1150 << *i << dendl;
1151 can_rollback = false;
1152 break;
1153 }
1154 }
1155
1156 if (can_rollback) {
1157 /// Case 4)
f67539c2 1158 for (auto i = entries.rbegin(); i != entries.rend(); ++i) {
eafe8130 1159 ceph_assert(i->can_rollback() && i->version > olog_can_rollback_to);
7c673cae
FG
1160 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
1161 << " rolling back " << *i << dendl;
1162 if (rollbacker)
1163 rollbacker->rollback(*i);
1164 }
1165 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
1166 << " rolled back" << dendl;
1167 return;
1168 } else {
1169 /// Case 5)
1170 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid << " cannot roll back, "
1171 << "removing and adding to missing" << dendl;
1172 if (rollbacker) {
1173 if (!object_not_in_store)
1174 rollbacker->remove(hoid);
1175 for (auto &&i: entries) {
1176 rollbacker->trim(i);
1177 }
1178 }
c07f9fc5 1179 missing.add(hoid, prior_version, eversion_t(), false);
7c673cae
FG
1180 if (prior_version <= info.log_tail) {
1181 ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
1182 << " prior_version " << prior_version
1183 << " <= info.log_tail "
1184 << info.log_tail << dendl;
1185 }
1186 }
1187 }
1188
1189 /// Merge all entries using above
1190 template <typename missing_type>
1191 static void _merge_divergent_entries(
1192 const IndexedLog &log, ///< [in] log to merge against
31f18b77 1193 mempool::osd_pglog::list<pg_log_entry_t> &entries, ///< [in] entries to merge
7c673cae 1194 const pg_info_t &oinfo, ///< [in] info for merging entries
eafe8130 1195 eversion_t olog_can_rollback_to, ///< [in] rollback boundary of input IndexedLog
7c673cae
FG
1196 missing_type &omissing, ///< [in,out] missing to adjust, use
1197 LogEntryHandler *rollbacker, ///< [in] optional rollbacker object
1198 const DoutPrefixProvider *dpp ///< [in] logging provider
1199 ) {
f67539c2 1200 std::map<hobject_t, mempool::osd_pglog::list<pg_log_entry_t> > split;
7c673cae 1201 split_by_object(entries, &split);
f67539c2 1202 for (auto i = split.begin(); i != split.end(); ++i) {
7c673cae
FG
1203 _merge_object_divergent_entries(
1204 log,
1205 i->first,
1206 i->second,
1207 oinfo,
1208 olog_can_rollback_to,
1209 omissing,
1210 rollbacker,
1211 dpp);
1212 }
1213 }
1214
1215 /**
1216 * Exists for use in TestPGLog for simply testing single divergent log
1217 * cases
1218 */
1219 void merge_old_entry(
1220 ObjectStore::Transaction& t,
1221 const pg_log_entry_t& oe,
1222 const pg_info_t& info,
1223 LogEntryHandler *rollbacker) {
31f18b77 1224 mempool::osd_pglog::list<pg_log_entry_t> entries;
7c673cae
FG
1225 entries.push_back(oe);
1226 _merge_object_divergent_entries(
1227 log,
1228 oe.soid,
1229 entries,
1230 info,
1231 log.get_can_rollback_to(),
1232 missing,
1233 rollbacker,
1234 this);
1235 }
c07f9fc5
FG
1236
1237 bool merge_log_dups(const pg_log_t& olog);
1238
7c673cae 1239public:
c07f9fc5 1240
7c673cae
FG
1241 void rewind_divergent_log(eversion_t newhead,
1242 pg_info_t &info,
1243 LogEntryHandler *rollbacker,
1244 bool &dirty_info,
1245 bool &dirty_big_info);
1246
1247 void merge_log(pg_info_t &oinfo,
f67539c2 1248 pg_log_t&& olog,
7c673cae
FG
1249 pg_shard_t from,
1250 pg_info_t &info, LogEntryHandler *rollbacker,
1251 bool &dirty_info, bool &dirty_big_info);
1252
1253 template <typename missing_type>
1254 static bool append_log_entries_update_missing(
1255 const hobject_t &last_backfill,
31f18b77 1256 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
7c673cae
FG
1257 bool maintain_rollback,
1258 IndexedLog *log,
1259 missing_type &missing,
1260 LogEntryHandler *rollbacker,
1261 const DoutPrefixProvider *dpp) {
1262 bool invalidate_stats = false;
1263 if (log && !entries.empty()) {
11fdf7f2 1264 ceph_assert(log->head < entries.begin()->version);
7c673cae 1265 }
f67539c2 1266 for (auto p = entries.begin(); p != entries.end(); ++p) {
7c673cae
FG
1267 invalidate_stats = invalidate_stats || !p->is_error();
1268 if (log) {
1269 ldpp_dout(dpp, 20) << "update missing, append " << *p << dendl;
1270 log->add(*p);
1271 }
1272 if (p->soid <= last_backfill &&
1273 !p->is_error()) {
c07f9fc5
FG
1274 if (missing.may_include_deletes) {
1275 missing.add_next_event(*p);
1276 } else {
1277 if (p->is_delete()) {
1278 missing.rm(p->soid, p->version);
1279 } else {
1280 missing.add_next_event(*p);
1281 }
1282 if (rollbacker) {
1283 // hack to match PG::mark_all_unfound_lost
1284 if (maintain_rollback && p->is_lost_delete() && p->can_rollback()) {
1285 rollbacker->try_stash(p->soid, p->version.version);
1286 } else if (p->is_delete()) {
1287 rollbacker->remove(p->soid);
1288 }
7c673cae
FG
1289 }
1290 }
1291 }
1292 }
1293 return invalidate_stats;
1294 }
1295 bool append_new_log_entries(
1296 const hobject_t &last_backfill,
31f18b77 1297 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
7c673cae
FG
1298 LogEntryHandler *rollbacker) {
1299 bool invalidate_stats = append_log_entries_update_missing(
1300 last_backfill,
7c673cae
FG
1301 entries,
1302 true,
1303 &log,
1304 missing,
1305 rollbacker,
1306 this);
1307 if (!entries.empty()) {
1308 mark_writeout_from(entries.begin()->version);
c07f9fc5
FG
1309 if (entries.begin()->is_lost_delete()) {
1310 // hack: since lost deletes queue recovery directly, and don't
1311 // go through activate_not_complete() again, our complete_to
1312 // iterator may still point at log.end(). Reset it to point
1313 // before these new lost_delete entries. This only occurs
1314 // when lost+delete entries are initially added, which is
f67539c2 1315 // always in a std::list of solely lost_delete entries, so it is
c07f9fc5
FG
1316 // sufficient to check whether the first entry is a
1317 // lost_delete
1318 reset_complete_to(nullptr);
1319 }
7c673cae
FG
1320 }
1321 return invalidate_stats;
1322 }
1323
c07f9fc5
FG
1324 void write_log_and_missing(
1325 ObjectStore::Transaction& t,
f67539c2 1326 std::map<std::string,ceph::buffer::list> *km,
c07f9fc5
FG
1327 const coll_t& coll,
1328 const ghobject_t &log_oid,
1329 bool require_rollback);
7c673cae
FG
1330
1331 static void write_log_and_missing_wo_missing(
1332 ObjectStore::Transaction& t,
f67539c2 1333 std::map<std::string,ceph::buffer::list>* km,
7c673cae
FG
1334 pg_log_t &log,
1335 const coll_t& coll,
f67539c2 1336 const ghobject_t &log_oid, std::map<eversion_t, hobject_t> &divergent_priors,
181888fb 1337 bool require_rollback);
7c673cae
FG
1338
1339 static void write_log_and_missing(
1340 ObjectStore::Transaction& t,
f67539c2 1341 std::map<std::string,ceph::buffer::list>* km,
7c673cae
FG
1342 pg_log_t &log,
1343 const coll_t& coll,
1344 const ghobject_t &log_oid,
1345 const pg_missing_tracker_t &missing,
c07f9fc5 1346 bool require_rollback,
c07f9fc5 1347 bool *rebuilt_missing_set_with_deletes);
7c673cae
FG
1348
1349 static void _write_log_and_missing_wo_missing(
1350 ObjectStore::Transaction& t,
f67539c2 1351 std::map<std::string,ceph::buffer::list>* km,
7c673cae
FG
1352 pg_log_t &log,
1353 const coll_t& coll, const ghobject_t &log_oid,
f67539c2 1354 std::map<eversion_t, hobject_t> &divergent_priors,
7c673cae
FG
1355 eversion_t dirty_to,
1356 eversion_t dirty_from,
1357 eversion_t writeout_from,
7c673cae
FG
1358 bool dirty_divergent_priors,
1359 bool touch_log,
1360 bool require_rollback,
181888fb
FG
1361 eversion_t dirty_to_dups,
1362 eversion_t dirty_from_dups,
1363 eversion_t write_from_dups,
f67539c2 1364 std::set<std::string> *log_keys_debug
7c673cae
FG
1365 );
1366
1367 static void _write_log_and_missing(
1368 ObjectStore::Transaction& t,
f67539c2 1369 std::map<std::string,ceph::buffer::list>* km,
7c673cae
FG
1370 pg_log_t &log,
1371 const coll_t& coll, const ghobject_t &log_oid,
1372 eversion_t dirty_to,
1373 eversion_t dirty_from,
1374 eversion_t writeout_from,
f67539c2
TL
1375 std::set<eversion_t> &&trimmed,
1376 std::set<std::string> &&trimmed_dups,
7c673cae
FG
1377 const pg_missing_tracker_t &missing,
1378 bool touch_log,
1379 bool require_rollback,
1380 bool clear_divergent_priors,
181888fb
FG
1381 eversion_t dirty_to_dups,
1382 eversion_t dirty_from_dups,
1383 eversion_t write_from_dups,
9f95a23c 1384 bool *may_include_deletes_in_missing_dirty,
f67539c2 1385 std::set<std::string> *log_keys_debug
7c673cae
FG
1386 );
1387
1388 void read_log_and_missing(
c07f9fc5 1389 ObjectStore *store,
11fdf7f2
TL
1390 ObjectStore::CollectionHandle& ch,
1391 ghobject_t pgmeta_oid,
7c673cae 1392 const pg_info_t &info,
f67539c2 1393 std::ostringstream &oss,
7c673cae
FG
1394 bool tolerate_divergent_missing_log,
1395 bool debug_verify_stored_missing = false
1396 ) {
1397 return read_log_and_missing(
11fdf7f2
TL
1398 store, ch, pgmeta_oid, info,
1399 log, missing, oss,
7c673cae
FG
1400 tolerate_divergent_missing_log,
1401 &clear_divergent_priors,
1402 this,
c07f9fc5 1403 (pg_log_debug ? &log_keys_debug : nullptr),
7c673cae
FG
1404 debug_verify_stored_missing);
1405 }
1406
1407 template <typename missing_type>
c07f9fc5
FG
1408 static void read_log_and_missing(
1409 ObjectStore *store,
11fdf7f2
TL
1410 ObjectStore::CollectionHandle &ch,
1411 ghobject_t pgmeta_oid,
7c673cae
FG
1412 const pg_info_t &info,
1413 IndexedLog &log,
c07f9fc5 1414 missing_type &missing,
f67539c2 1415 std::ostringstream &oss,
7c673cae 1416 bool tolerate_divergent_missing_log,
c07f9fc5
FG
1417 bool *clear_divergent_priors = nullptr,
1418 const DoutPrefixProvider *dpp = nullptr,
f67539c2 1419 std::set<std::string> *log_keys_debug = nullptr,
7c673cae
FG
1420 bool debug_verify_stored_missing = false
1421 ) {
11fdf7f2
TL
1422 ldpp_dout(dpp, 20) << "read_log_and_missing coll " << ch->cid
1423 << " " << pgmeta_oid << dendl;
7c673cae
FG
1424
1425 // legacy?
1426 struct stat st;
11fdf7f2
TL
1427 int r = store->stat(ch, pgmeta_oid, &st);
1428 ceph_assert(r == 0);
1429 ceph_assert(st.st_size == 0);
7c673cae
FG
1430
1431 // will get overridden below if it had been recorded
1432 eversion_t on_disk_can_rollback_to = info.last_update;
1433 eversion_t on_disk_rollback_info_trimmed_to = eversion_t();
11fdf7f2
TL
1434 ObjectMap::ObjectMapIterator p = store->get_omap_iterator(ch,
1435 pgmeta_oid);
f67539c2 1436 std::map<eversion_t, hobject_t> divergent_priors;
11fdf7f2 1437 bool must_rebuild = false;
c07f9fc5 1438 missing.may_include_deletes = false;
f67539c2
TL
1439 std::list<pg_log_entry_t> entries;
1440 std::list<pg_log_dup_t> dups;
7c673cae 1441 if (p) {
f67539c2 1442 using ceph::decode;
11fdf7f2 1443 for (p->seek_to_first(); p->valid() ; p->next()) {
7c673cae
FG
1444 // non-log pgmeta_oid keys are prefixed with _; skip those
1445 if (p->key()[0] == '_')
1446 continue;
f67539c2 1447 auto bl = p->value();//Copy ceph::buffer::list before creating iterator
11fdf7f2 1448 auto bp = bl.cbegin();
7c673cae 1449 if (p->key() == "divergent_priors") {
11fdf7f2 1450 decode(divergent_priors, bp);
7c673cae
FG
1451 ldpp_dout(dpp, 20) << "read_log_and_missing " << divergent_priors.size()
1452 << " divergent_priors" << dendl;
35e4c445 1453 must_rebuild = true;
7c673cae
FG
1454 debug_verify_stored_missing = false;
1455 } else if (p->key() == "can_rollback_to") {
11fdf7f2 1456 decode(on_disk_can_rollback_to, bp);
7c673cae 1457 } else if (p->key() == "rollback_info_trimmed_to") {
11fdf7f2 1458 decode(on_disk_rollback_info_trimmed_to, bp);
c07f9fc5
FG
1459 } else if (p->key() == "may_include_deletes_in_missing") {
1460 missing.may_include_deletes = true;
f67539c2 1461 } else if (p->key().substr(0, 7) == std::string("missing")) {
c07f9fc5
FG
1462 hobject_t oid;
1463 pg_missing_item item;
11fdf7f2
TL
1464 decode(oid, bp);
1465 decode(item, bp);
9f95a23c 1466 ldpp_dout(dpp, 20) << "read_log_and_missing " << item << dendl;
c07f9fc5 1467 if (item.is_delete()) {
11fdf7f2 1468 ceph_assert(missing.may_include_deletes);
c07f9fc5 1469 }
9f95a23c 1470 missing.add(oid, std::move(item));
f67539c2 1471 } else if (p->key().substr(0, 4) == std::string("dup_")) {
c07f9fc5 1472 pg_log_dup_t dup;
11fdf7f2 1473 decode(dup, bp);
c07f9fc5 1474 if (!dups.empty()) {
11fdf7f2 1475 ceph_assert(dups.back().version < dup.version);
c07f9fc5
FG
1476 }
1477 dups.push_back(dup);
7c673cae
FG
1478 } else {
1479 pg_log_entry_t e;
1480 e.decode_with_checksum(bp);
1481 ldpp_dout(dpp, 20) << "read_log_and_missing " << e << dendl;
1482 if (!entries.empty()) {
1483 pg_log_entry_t last_e(entries.back());
11fdf7f2
TL
1484 ceph_assert(last_e.version.version < e.version.version);
1485 ceph_assert(last_e.version.epoch <= e.version.epoch);
7c673cae
FG
1486 }
1487 entries.push_back(e);
1488 if (log_keys_debug)
1489 log_keys_debug->insert(e.get_key_name());
1490 }
1491 }
1492 }
1493 log = IndexedLog(
1494 info.last_update,
1495 info.log_tail,
1496 on_disk_can_rollback_to,
1497 on_disk_rollback_info_trimmed_to,
c07f9fc5
FG
1498 std::move(entries),
1499 std::move(dups));
7c673cae 1500
35e4c445 1501 if (must_rebuild || debug_verify_stored_missing) {
7c673cae
FG
1502 // build missing
1503 if (debug_verify_stored_missing || info.last_complete < info.last_update) {
c07f9fc5
FG
1504 ldpp_dout(dpp, 10)
1505 << "read_log_and_missing checking for missing items over interval ("
1506 << info.last_complete
1507 << "," << info.last_update << "]" << dendl;
7c673cae 1508
f67539c2
TL
1509 std::set<hobject_t> did;
1510 std::set<hobject_t> checked;
1511 std::set<hobject_t> skipped;
1512 for (auto i = log.log.rbegin(); i != log.log.rend(); ++i) {
7c673cae
FG
1513 if (i->soid > info.last_backfill)
1514 continue;
1515 if (i->is_error())
1516 continue;
1517 if (did.count(i->soid)) continue;
1518 did.insert(i->soid);
1519
c07f9fc5
FG
1520 if (!missing.may_include_deletes && i->is_delete())
1521 continue;
7c673cae 1522
f67539c2 1523 ceph::buffer::list bv;
7c673cae 1524 int r = store->getattr(
11fdf7f2 1525 ch,
7c673cae
FG
1526 ghobject_t(i->soid, ghobject_t::NO_GEN, info.pgid.shard),
1527 OI_ATTR,
1528 bv);
1529 if (r >= 0) {
1530 object_info_t oi(bv);
1531 if (oi.version < i->version) {
1532 ldpp_dout(dpp, 15) << "read_log_and_missing missing " << *i
9f95a23c
TL
1533 << " (have " << oi.version << ")"
1534 << " clean_regions " << i->clean_regions << dendl;
1535
7c673cae
FG
1536 if (debug_verify_stored_missing) {
1537 auto miter = missing.get_items().find(i->soid);
11fdf7f2
TL
1538 ceph_assert(miter != missing.get_items().end());
1539 ceph_assert(miter->second.need == i->version);
c07f9fc5
FG
1540 // the 'have' version is reset if an object is deleted,
1541 // then created again
11fdf7f2 1542 ceph_assert(miter->second.have == oi.version || miter->second.have == eversion_t());
7c673cae
FG
1543 checked.insert(i->soid);
1544 } else {
c07f9fc5 1545 missing.add(i->soid, i->version, oi.version, i->is_delete());
7c673cae
FG
1546 }
1547 }
1548 } else {
1549 ldpp_dout(dpp, 15) << "read_log_and_missing missing " << *i << dendl;
1550 if (debug_verify_stored_missing) {
1551 auto miter = missing.get_items().find(i->soid);
c07f9fc5 1552 if (i->is_delete()) {
11fdf7f2 1553 ceph_assert(miter == missing.get_items().end() ||
c07f9fc5
FG
1554 (miter->second.need == i->version &&
1555 miter->second.have == eversion_t()));
1556 } else {
11fdf7f2
TL
1557 ceph_assert(miter != missing.get_items().end());
1558 ceph_assert(miter->second.need == i->version);
1559 ceph_assert(miter->second.have == eversion_t());
c07f9fc5 1560 }
7c673cae
FG
1561 checked.insert(i->soid);
1562 } else {
c07f9fc5 1563 missing.add(i->soid, i->version, eversion_t(), i->is_delete());
7c673cae
FG
1564 }
1565 }
1566 }
1567 if (debug_verify_stored_missing) {
1568 for (auto &&i: missing.get_items()) {
1569 if (checked.count(i.first))
1570 continue;
c07f9fc5 1571 if (i.first > info.last_backfill) {
f67539c2 1572 ldpp_dout(dpp, -1) << __func__ << ": invalid missing std::set entry "
c07f9fc5
FG
1573 << "found before last_backfill: "
1574 << i.first << " " << i.second
1575 << " last_backfill = " << info.last_backfill
1576 << dendl;
f67539c2 1577 ceph_abort_msg("invalid missing std::set entry found");
7c673cae 1578 }
f67539c2 1579 ceph::buffer::list bv;
7c673cae 1580 int r = store->getattr(
11fdf7f2 1581 ch,
7c673cae
FG
1582 ghobject_t(i.first, ghobject_t::NO_GEN, info.pgid.shard),
1583 OI_ATTR,
1584 bv);
1585 if (r >= 0) {
1586 object_info_t oi(bv);
11fdf7f2 1587 ceph_assert(oi.version == i.second.have || eversion_t() == i.second.have);
7c673cae 1588 } else {
11fdf7f2 1589 ceph_assert(i.second.is_delete() || eversion_t() == i.second.have);
7c673cae
FG
1590 }
1591 }
1592 } else {
11fdf7f2 1593 ceph_assert(must_rebuild);
f67539c2 1594 for (auto i = divergent_priors.rbegin();
7c673cae
FG
1595 i != divergent_priors.rend();
1596 ++i) {
1597 if (i->first <= info.last_complete) break;
1598 if (i->second > info.last_backfill)
1599 continue;
1600 if (did.count(i->second)) continue;
1601 did.insert(i->second);
f67539c2 1602 ceph::buffer::list bv;
7c673cae 1603 int r = store->getattr(
11fdf7f2 1604 ch,
7c673cae
FG
1605 ghobject_t(i->second, ghobject_t::NO_GEN, info.pgid.shard),
1606 OI_ATTR,
1607 bv);
1608 if (r >= 0) {
1609 object_info_t oi(bv);
1610 /**
1611 * 1) we see this entry in the divergent priors mapping
1612 * 2) we didn't see an entry for this object in the log
1613 *
1614 * From 1 & 2 we know that either the object does not exist
1615 * or it is at the version specified in the divergent_priors
1616 * map since the object would have been deleted atomically
1617 * with the addition of the divergent_priors entry, an older
1618 * version would not have been recovered, and a newer version
1619 * would show up in the log above.
1620 */
f67539c2 1621 /**
7c673cae
FG
1622 * Unfortunately the assessment above is incorrect because of
1623 * http://tracker.ceph.com/issues/17916 (we were incorrectly
f67539c2 1624 * not removing the divergent_priors std::set from disk state!),
7c673cae
FG
1625 * so let's check that.
1626 */
1627 if (oi.version > i->first && tolerate_divergent_missing_log) {
1628 ldpp_dout(dpp, 0) << "read_log divergent_priors entry (" << *i
1629 << ") inconsistent with disk state (" << oi
1630 << "), assuming it is tracker.ceph.com/issues/17916"
1631 << dendl;
1632 } else {
11fdf7f2 1633 ceph_assert(oi.version == i->first);
7c673cae
FG
1634 }
1635 } else {
1636 ldpp_dout(dpp, 15) << "read_log_and_missing missing " << *i << dendl;
c07f9fc5 1637 missing.add(i->second, i->first, eversion_t(), false);
7c673cae
FG
1638 }
1639 }
1640 }
1641 if (clear_divergent_priors)
1642 (*clear_divergent_priors) = true;
1643 }
1644 }
1645
35e4c445 1646 if (!must_rebuild) {
7c673cae
FG
1647 if (clear_divergent_priors)
1648 (*clear_divergent_priors) = false;
1649 missing.flush();
1650 }
1651 ldpp_dout(dpp, 10) << "read_log_and_missing done" << dendl;
c07f9fc5 1652 } // static read_log_and_missing
9f95a23c
TL
1653
1654#ifdef WITH_SEASTAR
1655 seastar::future<> read_log_and_missing_crimson(
1656 crimson::os::FuturizedStore &store,
1657 crimson::os::CollectionRef ch,
1658 const pg_info_t &info,
1659 ghobject_t pgmeta_oid
1660 ) {
1661 return read_log_and_missing_crimson(
1662 store, ch, info,
f67539c2
TL
1663 log, (pg_log_debug ? &log_keys_debug : nullptr),
1664 missing, pgmeta_oid, this);
9f95a23c
TL
1665 }
1666
9f95a23c
TL
1667 static seastar::future<> read_log_and_missing_crimson(
1668 crimson::os::FuturizedStore &store,
1669 crimson::os::CollectionRef ch,
1670 const pg_info_t &info,
1671 IndexedLog &log,
f67539c2
TL
1672 std::set<std::string>* log_keys_debug,
1673 pg_missing_tracker_t &missing,
9f95a23c 1674 ghobject_t pgmeta_oid,
f67539c2 1675 const DoutPrefixProvider *dpp = nullptr);
9f95a23c
TL
1676
1677#endif
1678
c07f9fc5 1679}; // struct PGLog