// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#include "PGLog.h"
#include "include/unordered_map.h"
#include "common/ceph_context.h"

#define dout_context cct
#define dout_subsys ceph_subsys_osd
#undef dout_prefix
#define dout_prefix _prefix(_dout, this)

static ostream& _prefix(std::ostream *_dout, const PGLog *pglog)
{
  return pglog->gen_prefix(*_dout);
}

//////////////////// PGLog::IndexedLog ////////////////////

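// Split the entries that belong to child_pgid out of this log into
// *target after a PG split: drop the indexes, split the underlying
// pg_log_t on (child_pgid, split_bits), rebuild the indexes on both
// logs, and reset the rollback-info trim iterator that the split may
// have invalidated.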
void PGLog::IndexedLog::split_out_child(
  pg_t child_pgid,
  unsigned split_bits,
  PGLog::IndexedLog *target)
{
  unindex();
  *target = IndexedLog(pg_log_t::split_out_child(child_pgid, split_bits));
  index();
  target->index();
  reset_rollback_info_trimmed_to_riter();
}

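// Trim log entries up to and including version s.  Versions removed from
// the log are reported via *trimmed and trimmed dup keys via
// *trimmed_dups so the caller can delete the matching omap keys;
// *write_from_dups is lowered to the oldest version newly added to the
// dup list.  Recently trimmed entries are retained as dup entries, and
// the log tail is raised to s.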
void PGLog::IndexedLog::trim(
  CephContext* cct,
  eversion_t s,
  set<eversion_t> *trimmed,
  set<string>* trimmed_dups,
  eversion_t *write_from_dups)
{
  ceph_assert(s <= can_rollback_to);
  if (complete_to != log.end())
    lgeneric_subdout(cct, osd, 20) << " complete_to " << complete_to->version << dendl;

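  // Keep only the newest osd_pg_log_dups_tracked versions as dup entries.
  // Worked example (assuming osd_pg_log_dups_tracked = 3000): if the newest
  // log entry has version.version == 10000, earliest_dup_version is 7001;
  // trimmed entries with version >= 7001 are moved onto the dup list, and
  // dups older than 7001 are dropped further below.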
  auto earliest_dup_version =
    log.rbegin()->version.version < cct->_conf->osd_pg_log_dups_tracked
    ? 0u
    : log.rbegin()->version.version - cct->_conf->osd_pg_log_dups_tracked + 1;

  lgeneric_subdout(cct, osd, 20) << "earliest_dup_version = " << earliest_dup_version << dendl;
  while (!log.empty()) {
    const pg_log_entry_t &e = *log.begin();
    if (e.version > s)
      break;
    lgeneric_subdout(cct, osd, 20) << "trim " << e << dendl;
    if (trimmed)
      trimmed->emplace(e.version);

    unindex(e);         // remove from index,

    // add to dup list
    if (e.version.version >= earliest_dup_version) {
      if (write_from_dups != nullptr && *write_from_dups > e.version) {
        lgeneric_subdout(cct, osd, 20) << "updating write_from_dups from " << *write_from_dups << " to " << e.version << dendl;
        *write_from_dups = e.version;
      }
      dups.push_back(pg_log_dup_t(e));
      index(dups.back());
      uint32_t idx = 0;
      for (const auto& extra : e.extra_reqids) {
        int return_code = e.return_code;
        if (return_code >= 0) {
          auto it = e.extra_reqid_return_codes.find(idx);
          if (it != e.extra_reqid_return_codes.end()) {
            return_code = it->second;
            // FIXME: we aren't setting op_returns for these extra_reqids
          }
        }
        ++idx;

        // note: extras have the same version as outer op
        dups.push_back(pg_log_dup_t(e.version, extra.second,
                                    extra.first, return_code));
        index(dups.back());
      }
    }

    bool reset_complete_to = false;
    // we are trimming past complete_to, so reset complete_to
    if (complete_to != log.end() && e.version >= complete_to->version)
      reset_complete_to = true;
    if (rollback_info_trimmed_to_riter == log.rend() ||
        e.version == rollback_info_trimmed_to_riter->version) {
      log.pop_front();
      rollback_info_trimmed_to_riter = log.rend();
    } else {
      log.pop_front();
    }

    // reset complete_to to the beginning of the log
    if (reset_complete_to) {
      complete_to = log.begin();
      if (complete_to != log.end()) {
        lgeneric_subdout(cct, osd, 20) << " moving complete_to to "
                                       << log.begin()->version << dendl;
      } else {
        lgeneric_subdout(cct, osd, 20) << " log is now empty" << dendl;
      }
    }
  }

  while (!dups.empty()) {
    const auto& e = *dups.begin();
    if (e.version.version >= earliest_dup_version)
      break;
    lgeneric_subdout(cct, osd, 20) << "trim dup " << e << dendl;
    if (trimmed_dups)
      trimmed_dups->insert(e.get_key_name());
    unindex(e);
    dups.pop_front();
  }

  // raise tail?
  if (tail < s)
    tail = s;
}

ostream& PGLog::IndexedLog::print(ostream& out) const
{
  out << *this << std::endl;
  for (list<pg_log_entry_t>::const_iterator p = log.begin();
       p != log.end();
       ++p) {
    out << *p << " " <<
      (logged_object(p->soid) ? "indexed" : "NOT INDEXED") <<
      std::endl;
    ceph_assert(!p->reqid_is_indexed() || logged_req(p->reqid));
  }

  for (list<pg_log_dup_t>::const_iterator p = dups.begin();
       p != dups.end();
       ++p) {
    out << *p << std::endl;
  }

  return out;
}

//////////////////// PGLog ////////////////////

void PGLog::reset_backfill()
{
  missing.clear();
}

void PGLog::clear() {
  missing.clear();
  log.clear();
  log_keys_debug.clear();
  undirty();
}

void PGLog::clear_info_log(
  spg_t pgid,
  ObjectStore::Transaction *t) {
  coll_t coll(pgid);
  t->remove(coll, pgid.make_pgmeta_oid());
}

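// Trim the in-memory log up to trim_to and remember what was trimmed
// (trimmed, trimmed_dups, write_from_dups) so the next
// write_log_and_missing() can remove the corresponding omap keys.
// trim_to is expected not to pass info.last_complete unless the target
// is an async recovery or backfill target, or still has missing objects.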
void PGLog::trim(
  eversion_t trim_to,
  pg_info_t &info,
  bool transaction_applied,
  bool async)
{
  dout(10) << __func__ << " proposed trim_to = " << trim_to << dendl;
  // trim?
  if (trim_to > log.tail) {
    dout(10) << __func__ << " missing = " << missing.num_missing() << dendl;
    // Don't assert for async_recovery_targets or backfill_targets
    // or whenever there are missing items
    if (transaction_applied && !async && (missing.num_missing() == 0))
      ceph_assert(trim_to <= info.last_complete);

    dout(10) << "trim " << log << " to " << trim_to << dendl;
    log.trim(cct, trim_to, &trimmed, &trimmed_dups, &write_from_dups);
    info.log_tail = log.tail;
    if (log.complete_to != log.log.end())
      dout(10) << " after trim complete_to " << log.complete_to->version << dendl;
  }
}

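// Compare a replica's log (olog), info (oinfo) and missing set
// (omissing) against our authoritative log: rewind the replica's
// divergent head entries on a local copy of olog and adjust
// oinfo.last_update and oinfo.last_complete so that omissing accurately
// reflects what the replica still needs.  Our own log and missing set
// are not modified.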
void PGLog::proc_replica_log(
  pg_info_t &oinfo,
  const pg_log_t &olog,
  pg_missing_t& omissing,
  pg_shard_t from) const
{
  dout(10) << "proc_replica_log for osd." << from << ": "
           << oinfo << " " << olog << " " << omissing << dendl;

  if (olog.head < log.tail) {
    dout(10) << __func__ << ": osd." << from << " does not overlap, not looking "
             << "for divergent objects" << dendl;
    return;
  }
  if (olog.head == log.head) {
    dout(10) << __func__ << ": osd." << from << " same log head, not looking "
             << "for divergent objects" << dendl;
    return;
  }

  /*
    basically what we're doing here is rewinding the remote log,
    dropping divergent entries, until we find something that matches
    our master log.  we then reset last_update to reflect the new
    point up to which missing is accurate.

    later, in activate(), missing will get wound forward again and
    we will send the peer enough log to arrive at the same state.
  */

  for (map<hobject_t, pg_missing_item>::const_iterator i = omissing.get_items().begin();
       i != omissing.get_items().end();
       ++i) {
    dout(20) << " before missing " << i->first << " need " << i->second.need
             << " have " << i->second.have << dendl;
  }

  list<pg_log_entry_t>::const_reverse_iterator first_non_divergent =
    log.log.rbegin();
  while (1) {
    if (first_non_divergent == log.log.rend())
      break;
    if (first_non_divergent->version <= olog.head) {
      dout(20) << "merge_log point (usually last shared) is "
               << *first_non_divergent << dendl;
      break;
    }
    ++first_non_divergent;
  }

  /* Because olog.head >= log.tail, we know that both pgs must at least have
   * the event represented by log.tail.  Similarly, because log.head >= olog.tail,
   * we know that the event represented by olog.tail must be common to both logs.
   * Furthermore, the event represented by a log tail was necessarily trimmed,
   * thus neither olog.tail nor log.tail can be divergent. It's
   * possible that olog/log contain no actual events between olog.head and
   * max(log.tail, olog.tail), however, since they might have been split out.
   * Thus, if we cannot find an event e such that
   * log.tail <= e.version <= log.head, the last_update must actually be
   * max(log.tail, olog.tail).
   */
  eversion_t limit = std::max(olog.tail, log.tail);
  eversion_t lu =
    (first_non_divergent == log.log.rend() ||
     first_non_divergent->version < limit) ?
    limit :
    first_non_divergent->version;

  // We merge and adjust the replica's log: divergent entries that can be
  // rolled back are rolled back, and unrollbackable divergent entries are
  // removed, with the corresponding objects marked missing.
  // The rollback boundary must be the crt of the olog that is being merged.
  // The replica log's (olog) crt is not modified, so it can be passed
  // to _merge_divergent_entries() directly.
  IndexedLog folog(olog);
  auto divergent = folog.rewind_from_head(lu);
  _merge_divergent_entries(
    folog,
    divergent,
    oinfo,
    olog.get_can_rollback_to(),
    omissing,
    0,
    this);

  if (lu < oinfo.last_update) {
    dout(10) << " peer osd." << from << " last_update now " << lu << dendl;
    oinfo.last_update = lu;
  }

  if (omissing.have_missing()) {
    eversion_t first_missing =
      omissing.get_items().at(omissing.get_rmissing().begin()->second).need;
    oinfo.last_complete = eversion_t();
    list<pg_log_entry_t>::const_iterator i = olog.log.begin();
    for (;
         i != olog.log.end();
         ++i) {
      if (i->version < first_missing)
        oinfo.last_complete = i->version;
      else
        break;
    }
  } else {
    oinfo.last_complete = oinfo.last_update;
  }
} // proc_replica_log

/**
 * rewind divergent entries at the head of the log
 *
 * This rewinds entries off the head of our log that are divergent.
 * This is used by replicas during activation.
 *
 * @param newhead new head to rewind to
 */
void PGLog::rewind_divergent_log(eversion_t newhead,
                                 pg_info_t &info, LogEntryHandler *rollbacker,
                                 bool &dirty_info, bool &dirty_big_info)
{
  dout(10) << "rewind_divergent_log truncate divergent future " <<
    newhead << dendl;

  // We need to preserve the original crt before it gets updated in rewind_from_head().
  // Later, in merge_object_divergent_entries(), we use it to check whether we can rollback
  // a divergent entry or not.
  eversion_t original_crt = log.get_can_rollback_to();
  dout(20) << __func__ << " original_crt = " << original_crt << dendl;
  if (info.last_complete > newhead)
    info.last_complete = newhead;

  auto divergent = log.rewind_from_head(newhead);
  if (!divergent.empty()) {
    mark_dirty_from(divergent.front().version);
  }
  for (auto &&entry: divergent) {
    dout(10) << "rewind_divergent_log future divergent " << entry << dendl;
  }
  info.last_update = newhead;

  _merge_divergent_entries(
    log,
    divergent,
    info,
    original_crt,
    missing,
    rollbacker,
    this);

  dirty_info = true;
  dirty_big_info = true;
}

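// Merge the authoritative log olog, received from fromosd, into our own
// log: extend our tail backwards with older history, rewind or roll back
// our divergent head entries, append the newer entries (updating the
// missing set), and merge the dup entry lists.  dirty_info /
// dirty_big_info are set whenever anything changed so the caller
// persists the result.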
void PGLog::merge_log(pg_info_t &oinfo, pg_log_t &olog, pg_shard_t fromosd,
                      pg_info_t &info, LogEntryHandler *rollbacker,
                      bool &dirty_info, bool &dirty_big_info)
{
  dout(10) << "merge_log " << olog << " from osd." << fromosd
           << " into " << log << dendl;

  // Check preconditions

  // If our log is empty, the incoming log needs to have not been trimmed.
  ceph_assert(!log.null() || olog.tail == eversion_t());
  // The logs must overlap.
  ceph_assert(log.head >= olog.tail && olog.head >= log.tail);

  for (map<hobject_t, pg_missing_item>::const_iterator i = missing.get_items().begin();
       i != missing.get_items().end();
       ++i) {
    dout(20) << "pg_missing_t sobject: " << i->first << dendl;
  }

  bool changed = false;

  // extend on tail?
  //  this is just filling in history.  it does not affect our
  //  missing set, as that should already be consistent with our
  //  current log.
  eversion_t orig_tail = log.tail;
  if (olog.tail < log.tail) {
    dout(10) << "merge_log extending tail to " << olog.tail << dendl;
    list<pg_log_entry_t>::iterator from = olog.log.begin();
    list<pg_log_entry_t>::iterator to;
    eversion_t last;
    for (to = from;
         to != olog.log.end();
         ++to) {
      if (to->version > log.tail)
        break;
      log.index(*to);
      dout(15) << *to << dendl;
      last = to->version;
    }
    mark_dirty_to(last);

    // splice into our log.
    log.log.splice(log.log.begin(),
                   olog.log, from, to);

    info.log_tail = log.tail = olog.tail;
    changed = true;
  }

  if (oinfo.stats.reported_seq < info.stats.reported_seq ||   // make sure reported always increases
      oinfo.stats.reported_epoch < info.stats.reported_epoch) {
    oinfo.stats.reported_seq = info.stats.reported_seq;
    oinfo.stats.reported_epoch = info.stats.reported_epoch;
  }
  if (info.last_backfill.is_max())
    info.stats = oinfo.stats;
  info.hit_set = oinfo.hit_set;

  // do we have divergent entries to throw out?
  if (olog.head < log.head) {
    rewind_divergent_log(olog.head, info, rollbacker, dirty_info, dirty_big_info);
    changed = true;
  }

  // extend on head?
  if (olog.head > log.head) {
    dout(10) << "merge_log extending head to " << olog.head << dendl;

    // find start point in olog
    list<pg_log_entry_t>::iterator to = olog.log.end();
    list<pg_log_entry_t>::iterator from = olog.log.end();
    eversion_t lower_bound = std::max(olog.tail, orig_tail);
    while (1) {
      if (from == olog.log.begin())
        break;
      --from;
      dout(20) << "  ? " << *from << dendl;
      if (from->version <= log.head) {
        lower_bound = std::max(lower_bound, from->version);
        ++from;
        break;
      }
    }
    dout(20) << "merge_log cut point (usually last shared) is "
             << lower_bound << dendl;
    mark_dirty_from(lower_bound);

    // We need to preserve the original crt before it gets updated in rewind_from_head().
    // Later, in merge_object_divergent_entries(), we use it to check whether we can rollback
    // a divergent entry or not.
    eversion_t original_crt = log.get_can_rollback_to();
    dout(20) << __func__ << " original_crt = " << original_crt << dendl;
    auto divergent = log.rewind_from_head(lower_bound);
    // move aside divergent items
    for (auto &&oe: divergent) {
      dout(10) << "merge_log divergent " << oe << dendl;
    }
    log.roll_forward_to(log.head, rollbacker);

    mempool::osd_pglog::list<pg_log_entry_t> new_entries;
    new_entries.splice(new_entries.end(), olog.log, from, to);
    append_log_entries_update_missing(
      info.last_backfill,
      new_entries,
      false,
      &log,
      missing,
      rollbacker,
      this);

    _merge_divergent_entries(
      log,
      divergent,
      info,
      original_crt,
      missing,
      rollbacker,
      this);

    info.last_update = log.head = olog.head;

    // We cannot rollback into the new log entries
    log.skip_can_rollback_to_to_head();

    info.last_user_version = oinfo.last_user_version;
    info.purged_snaps = oinfo.purged_snaps;
    // update num_missing too
    // we might have appended some more missing objects above
    info.stats.stats.sum.num_objects_missing = missing.num_missing();

    changed = true;
  }

  // now handle dups
  if (merge_log_dups(olog)) {
    changed = true;
  }

  dout(10) << "merge_log result " << log << " " << missing <<
    " changed=" << changed << dendl;

  if (changed) {
    dirty_info = true;
    dirty_big_info = true;
  }
}

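// Merge olog's dup entries into log.dups: copy them wholesale if ours
// are empty, otherwise extend our list at both ends, then drop any dups
// that overlap the range still covered by the pg log (version >
// log.tail).  The dirty *_dups markers are widened so the changed key
// range gets rewritten on the next log write.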
// returns true if any changes were made to log.dups
bool PGLog::merge_log_dups(const pg_log_t& olog) {
  bool changed = false;

  if (!olog.dups.empty()) {
    if (log.dups.empty()) {
      dout(10) << "merge_log copying olog dups to log " <<
        olog.dups.front().version << " to " <<
        olog.dups.back().version << dendl;
      changed = true;
      dirty_from_dups = eversion_t();
      dirty_to_dups = eversion_t::max();
      // since our log.dups is empty just copy them
      for (const auto& i : olog.dups) {
        log.dups.push_back(i);
        log.index(log.dups.back());
      }
    } else {
      // since our log.dups is not empty try to extend on each end

      if (olog.dups.back().version > log.dups.back().version) {
        // extend the dups's tail (i.e., newer dups)
        dout(10) << "merge_log extending dups tail to " <<
          olog.dups.back().version << dendl;
        changed = true;

        auto log_tail_version = log.dups.back().version;

        auto insert_cursor = log.dups.end();
        eversion_t last_shared = eversion_t::max();
        for (auto i = olog.dups.crbegin(); i != olog.dups.crend(); ++i) {
          if (i->version <= log_tail_version) break;
          log.dups.insert(insert_cursor, *i);
          last_shared = i->version;

          auto prev = insert_cursor;
          --prev;
          // be sure to pass reference of copy in log.dups
          log.index(*prev);

          --insert_cursor; // make sure we insert in reverse order
        }
        mark_dirty_from_dups(last_shared);
      }

      if (olog.dups.front().version < log.dups.front().version) {
        // extend the dups's head (i.e., older dups)
        dout(10) << "merge_log extending dups head to " <<
          olog.dups.front().version << dendl;
        changed = true;

        eversion_t last;
        auto insert_cursor = log.dups.begin();
        for (auto i = olog.dups.cbegin(); i != olog.dups.cend(); ++i) {
          if (i->version >= insert_cursor->version) break;
          log.dups.insert(insert_cursor, *i);
          last = i->version;
          auto prev = insert_cursor;
          --prev;
          // be sure to pass address of copy in log.dups
          log.index(*prev);
        }
        mark_dirty_to_dups(last);
      }
    }
  }

  // remove any dup entries that overlap with pglog
  if (!log.dups.empty() && log.dups.back().version > log.tail) {
    dout(10) << "merge_log removed dups overlapping log entries (" <<
      log.tail << "," << log.dups.back().version << "]" << dendl;
    changed = true;

    while (!log.dups.empty() && log.dups.back().version > log.tail) {
      log.unindex(log.dups.back());
      mark_dirty_from_dups(log.dups.back().version);
      log.dups.pop_back();
    }
  }

  return changed;
}

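// Debug-only consistency check (enabled via pg_log_debug): verify that
// log_keys_debug contains exactly one key per in-memory log entry.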
void PGLog::check() {
  if (!pg_log_debug)
    return;
  if (log.log.size() != log_keys_debug.size()) {
    derr << "log.log.size() != log_keys_debug.size()" << dendl;
    derr << "actual log:" << dendl;
    for (list<pg_log_entry_t>::iterator i = log.log.begin();
         i != log.log.end();
         ++i) {
      derr << "   " << *i << dendl;
    }
    derr << "log_keys_debug:" << dendl;
    for (set<string>::const_iterator i = log_keys_debug.begin();
         i != log_keys_debug.end();
         ++i) {
      derr << "   " << *i << dendl;
    }
  }
  ceph_assert(log.log.size() == log_keys_debug.size());
  for (list<pg_log_entry_t>::iterator i = log.log.begin();
       i != log.log.end();
       ++i) {
    ceph_assert(log_keys_debug.count(i->get_key_name()));
  }
}

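// Persist the dirty portions of the log, the dup list and the missing
// set as omap updates on log_oid (key/value pairs are accumulated in
// *km, removals go directly into t), then clear the dirty state.  Does
// nothing if the log is not dirty.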
// non-static
void PGLog::write_log_and_missing(
  ObjectStore::Transaction& t,
  map<string,bufferlist> *km,
  const coll_t& coll,
  const ghobject_t &log_oid,
  bool require_rollback)
{
  if (needs_write()) {
    dout(6) << "write_log_and_missing with: "
            << "dirty_to: " << dirty_to
            << ", dirty_from: " << dirty_from
            << ", writeout_from: " << writeout_from
            << ", trimmed: " << trimmed
            << ", trimmed_dups: " << trimmed_dups
            << ", clear_divergent_priors: " << clear_divergent_priors
            << dendl;
    _write_log_and_missing(
      t, km, log, coll, log_oid,
      dirty_to,
      dirty_from,
      writeout_from,
      std::move(trimmed),
      std::move(trimmed_dups),
      missing,
      !touched_log,
      require_rollback,
      clear_divergent_priors,
      dirty_to_dups,
      dirty_from_dups,
      write_from_dups,
      &may_include_deletes_in_missing_dirty,
      (pg_log_debug ? &log_keys_debug : nullptr));
    undirty();
  } else {
    dout(10) << "log is not dirty" << dendl;
  }
}

// static
void PGLog::write_log_and_missing_wo_missing(
  ObjectStore::Transaction& t,
  map<string,bufferlist> *km,
  pg_log_t &log,
  const coll_t& coll, const ghobject_t &log_oid,
  map<eversion_t, hobject_t> &divergent_priors,
  bool require_rollback
  )
{
  _write_log_and_missing_wo_missing(
    t, km, log, coll, log_oid,
    divergent_priors, eversion_t::max(), eversion_t(), eversion_t(),
    true, true, require_rollback,
    eversion_t::max(), eversion_t(), eversion_t(), nullptr);
}

// static
void PGLog::write_log_and_missing(
  ObjectStore::Transaction& t,
  map<string,bufferlist> *km,
  pg_log_t &log,
  const coll_t& coll,
  const ghobject_t &log_oid,
  const pg_missing_tracker_t &missing,
  bool require_rollback,
  bool *may_include_deletes_in_missing_dirty)
{
  _write_log_and_missing(
    t, km, log, coll, log_oid,
    eversion_t::max(),
    eversion_t(),
    eversion_t(),
    set<eversion_t>(),
    set<string>(),
    missing,
    true, require_rollback, false,
    eversion_t::max(),
    eversion_t(),
    eversion_t(),
    may_include_deletes_in_missing_dirty, nullptr);
}

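// Write-path helper for callers that do not track a missing set.  Log
// and dup entries are encoded into *km under their eversion key names,
// divergent_priors under "divergent_priors", and the rollback bounds
// under "can_rollback_to" / "rollback_info_trimmed_to"; stale key
// ranges are removed via t.omap_rmkeyrange().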
// static
void PGLog::_write_log_and_missing_wo_missing(
  ObjectStore::Transaction& t,
  map<string,bufferlist> *km,
  pg_log_t &log,
  const coll_t& coll, const ghobject_t &log_oid,
  map<eversion_t, hobject_t> &divergent_priors,
  eversion_t dirty_to,
  eversion_t dirty_from,
  eversion_t writeout_from,
  bool dirty_divergent_priors,
  bool touch_log,
  bool require_rollback,
  eversion_t dirty_to_dups,
  eversion_t dirty_from_dups,
  eversion_t write_from_dups,
  set<string> *log_keys_debug
  )
{
  // dout(10) << "write_log_and_missing, clearing up to " << dirty_to << dendl;
  if (touch_log)
    t.touch(coll, log_oid);
  if (dirty_to != eversion_t()) {
    t.omap_rmkeyrange(
      coll, log_oid,
      eversion_t().get_key_name(), dirty_to.get_key_name());
    clear_up_to(log_keys_debug, dirty_to.get_key_name());
  }
  if (dirty_to != eversion_t::max() && dirty_from != eversion_t::max()) {
    // dout(10) << "write_log_and_missing, clearing from " << dirty_from << dendl;
    t.omap_rmkeyrange(
      coll, log_oid,
      dirty_from.get_key_name(), eversion_t::max().get_key_name());
    clear_after(log_keys_debug, dirty_from.get_key_name());
  }

  for (list<pg_log_entry_t>::iterator p = log.log.begin();
       p != log.log.end() && p->version <= dirty_to;
       ++p) {
    bufferlist bl(sizeof(*p) * 2);
    p->encode_with_checksum(bl);
    (*km)[p->get_key_name()].claim(bl);
  }

  for (list<pg_log_entry_t>::reverse_iterator p = log.log.rbegin();
       p != log.log.rend() &&
         (p->version >= dirty_from || p->version >= writeout_from) &&
         p->version >= dirty_to;
       ++p) {
    bufferlist bl(sizeof(*p) * 2);
    p->encode_with_checksum(bl);
    (*km)[p->get_key_name()].claim(bl);
  }

  if (log_keys_debug) {
    for (map<string, bufferlist>::iterator i = (*km).begin();
         i != (*km).end();
         ++i) {
      if (i->first[0] == '_')
        continue;
      ceph_assert(!log_keys_debug->count(i->first));
      log_keys_debug->insert(i->first);
    }
  }

  // process dups after log_keys_debug is filled, so dups do not
  // end up in that set
  if (dirty_to_dups != eversion_t()) {
    pg_log_dup_t min, dirty_to_dup;
    dirty_to_dup.version = dirty_to_dups;
    t.omap_rmkeyrange(
      coll, log_oid,
      min.get_key_name(), dirty_to_dup.get_key_name());
  }
  if (dirty_to_dups != eversion_t::max() && dirty_from_dups != eversion_t::max()) {
    pg_log_dup_t max, dirty_from_dup;
    max.version = eversion_t::max();
    dirty_from_dup.version = dirty_from_dups;
    t.omap_rmkeyrange(
      coll, log_oid,
      dirty_from_dup.get_key_name(), max.get_key_name());
  }

  for (const auto& entry : log.dups) {
    if (entry.version > dirty_to_dups)
      break;
    bufferlist bl;
    encode(entry, bl);
    (*km)[entry.get_key_name()].claim(bl);
  }

  for (list<pg_log_dup_t>::reverse_iterator p = log.dups.rbegin();
       p != log.dups.rend() &&
         (p->version >= dirty_from_dups || p->version >= write_from_dups) &&
         p->version >= dirty_to_dups;
       ++p) {
    bufferlist bl;
    encode(*p, bl);
    (*km)[p->get_key_name()].claim(bl);
  }

  if (dirty_divergent_priors) {
    //dout(10) << "write_log_and_missing: writing divergent_priors" << dendl;
    encode(divergent_priors, (*km)["divergent_priors"]);
  }
  if (require_rollback) {
    encode(
      log.get_can_rollback_to(),
      (*km)["can_rollback_to"]);
    encode(
      log.get_rollback_info_trimmed_to(),
      (*km)["rollback_info_trimmed_to"]);
  }
}

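// Core write-path helper: queues omap key removals for trimmed entries
// and trimmed dups, rewrites the dirty ranges of log and dup entries,
// and encodes changed missing-set items under "missing/<oid>" keys
// (queueing removals for objects that are no longer missing).  A
// "may_include_deletes_in_missing" marker key is written once when that
// state is dirty.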
// static
void PGLog::_write_log_and_missing(
  ObjectStore::Transaction& t,
  map<string,bufferlist>* km,
  pg_log_t &log,
  const coll_t& coll, const ghobject_t &log_oid,
  eversion_t dirty_to,
  eversion_t dirty_from,
  eversion_t writeout_from,
  set<eversion_t> &&trimmed,
  set<string> &&trimmed_dups,
  const pg_missing_tracker_t &missing,
  bool touch_log,
  bool require_rollback,
  bool clear_divergent_priors,
  eversion_t dirty_to_dups,
  eversion_t dirty_from_dups,
  eversion_t write_from_dups,
  bool *may_include_deletes_in_missing_dirty, // in/out param
  set<string> *log_keys_debug
  ) {
  set<string> to_remove;
  to_remove.swap(trimmed_dups);
  for (auto& t : trimmed) {
    string key = t.get_key_name();
    if (log_keys_debug) {
      auto it = log_keys_debug->find(key);
      ceph_assert(it != log_keys_debug->end());
      log_keys_debug->erase(it);
    }
    to_remove.emplace(std::move(key));
  }
  trimmed.clear();

  if (touch_log)
    t.touch(coll, log_oid);
  if (dirty_to != eversion_t()) {
    t.omap_rmkeyrange(
      coll, log_oid,
      eversion_t().get_key_name(), dirty_to.get_key_name());
    clear_up_to(log_keys_debug, dirty_to.get_key_name());
  }
  if (dirty_to != eversion_t::max() && dirty_from != eversion_t::max()) {
    // dout(10) << "write_log_and_missing, clearing from " << dirty_from << dendl;
    t.omap_rmkeyrange(
      coll, log_oid,
      dirty_from.get_key_name(), eversion_t::max().get_key_name());
    clear_after(log_keys_debug, dirty_from.get_key_name());
  }

  for (list<pg_log_entry_t>::iterator p = log.log.begin();
       p != log.log.end() && p->version <= dirty_to;
       ++p) {
    bufferlist bl(sizeof(*p) * 2);
    p->encode_with_checksum(bl);
    (*km)[p->get_key_name()].claim(bl);
  }

  for (list<pg_log_entry_t>::reverse_iterator p = log.log.rbegin();
       p != log.log.rend() &&
         (p->version >= dirty_from || p->version >= writeout_from) &&
         p->version >= dirty_to;
       ++p) {
    bufferlist bl(sizeof(*p) * 2);
    p->encode_with_checksum(bl);
    (*km)[p->get_key_name()].claim(bl);
  }

  if (log_keys_debug) {
    for (map<string, bufferlist>::iterator i = (*km).begin();
         i != (*km).end();
         ++i) {
      if (i->first[0] == '_')
        continue;
      ceph_assert(!log_keys_debug->count(i->first));
      log_keys_debug->insert(i->first);
    }
  }

  // process dups after log_keys_debug is filled, so dups do not
  // end up in that set
  if (dirty_to_dups != eversion_t()) {
    pg_log_dup_t min, dirty_to_dup;
    dirty_to_dup.version = dirty_to_dups;
    t.omap_rmkeyrange(
      coll, log_oid,
      min.get_key_name(), dirty_to_dup.get_key_name());
  }
  if (dirty_to_dups != eversion_t::max() && dirty_from_dups != eversion_t::max()) {
    pg_log_dup_t max, dirty_from_dup;
    max.version = eversion_t::max();
    dirty_from_dup.version = dirty_from_dups;
    t.omap_rmkeyrange(
      coll, log_oid,
      dirty_from_dup.get_key_name(), max.get_key_name());
  }

  for (const auto& entry : log.dups) {
    if (entry.version > dirty_to_dups)
      break;
    bufferlist bl;
    encode(entry, bl);
    (*km)[entry.get_key_name()].claim(bl);
  }

  for (list<pg_log_dup_t>::reverse_iterator p = log.dups.rbegin();
       p != log.dups.rend() &&
         (p->version >= dirty_from_dups || p->version >= write_from_dups) &&
         p->version >= dirty_to_dups;
       ++p) {
    bufferlist bl;
    encode(*p, bl);
    (*km)[p->get_key_name()].claim(bl);
  }

  if (clear_divergent_priors) {
    //dout(10) << "write_log_and_missing: writing divergent_priors" << dendl;
    to_remove.insert("divergent_priors");
  }
  // since we encode individual missing items instead of a whole
  // missing set, we need another key to store this bit of state
  if (*may_include_deletes_in_missing_dirty) {
    (*km)["may_include_deletes_in_missing"] = bufferlist();
    *may_include_deletes_in_missing_dirty = false;
  }
  missing.get_changed(
    [&](const hobject_t &obj) {
      string key = string("missing/") + obj.to_str();
      pg_missing_item item;
      if (!missing.is_missing(obj, &item)) {
        to_remove.insert(key);
      } else {
        uint64_t features = missing.may_include_deletes ? CEPH_FEATURE_OSD_RECOVERY_DELETES : 0;
        encode(make_pair(obj, item), (*km)[key], features);
      }
    });
  if (require_rollback) {
    encode(
      log.get_can_rollback_to(),
      (*km)["can_rollback_to"]);
    encode(
      log.get_rollback_info_trimmed_to(),
      (*km)["rollback_info_trimmed_to"]);
  }

  if (!to_remove.empty())
    t.omap_rmkeys(coll, log_oid, to_remove);
}

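// Rebuild the missing set by scanning the log against the on-disk
// object_info versions, this time recording delete entries as well.
// Missing items that were not generated from the log (e.g. repair, EIO
// handling, or divergent_priors) are preserved, and the missing set is
// flagged as possibly containing deletes.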
void PGLog::rebuild_missing_set_with_deletes(
  ObjectStore *store,
  ObjectStore::CollectionHandle& ch,
  const pg_info_t &info)
{
  // save entries not generated from the current log (e.g. added due
  // to repair, EIO handling, or divergent_priors).
  map<hobject_t, pg_missing_item> extra_missing;
  for (const auto& p : missing.get_items()) {
    if (!log.logged_object(p.first)) {
      dout(20) << __func__ << " extra missing entry: " << p.first
               << " " << p.second << dendl;
      extra_missing[p.first] = p.second;
    }
  }
  missing.clear();

  // go through the log and add items that are not present or older
  // versions on disk, just as if we were reading the log + metadata
  // off disk originally
  set<hobject_t> did;
  for (list<pg_log_entry_t>::reverse_iterator i = log.log.rbegin();
       i != log.log.rend();
       ++i) {
    if (i->version <= info.last_complete)
      break;
    if (i->soid > info.last_backfill ||
        i->is_error() ||
        did.find(i->soid) != did.end())
      continue;
    did.insert(i->soid);

    bufferlist bv;
    int r = store->getattr(
      ch,
      ghobject_t(i->soid, ghobject_t::NO_GEN, info.pgid.shard),
      OI_ATTR,
      bv);
    dout(20) << __func__ << " check for log entry: " << *i << " = " << r << dendl;

    if (r >= 0) {
      object_info_t oi(bv);
      dout(20) << __func__ << " store version = " << oi.version << dendl;
      if (oi.version < i->version) {
        missing.add(i->soid, i->version, oi.version, i->is_delete());
      }
    } else {
      missing.add(i->soid, i->version, eversion_t(), i->is_delete());
    }
  }

  for (const auto& p : extra_missing) {
    missing.add(p.first, p.second.need, p.second.have, p.second.is_delete());
  }

  set_missing_may_contain_deletes();
}