]> git.proxmox.com Git - ceph.git/blame - ceph/src/osd/PGLog.cc
import 12.2.13 release
[ceph.git] / ceph / src / osd / PGLog.cc
CommitLineData
c07f9fc5 1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
7c673cae
FG
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
c07f9fc5 13 * License version 2.1, as published by the Free Software
7c673cae 14 * Foundation. See file COPYING.
c07f9fc5 15 *
7c673cae
FG
16 */
17
18#include "PGLog.h"
19#include "include/unordered_map.h"
20#include "common/ceph_context.h"
21
22#define dout_context cct
23#define dout_subsys ceph_subsys_osd
24#undef dout_prefix
25#define dout_prefix _prefix(_dout, this)
26
27static ostream& _prefix(std::ostream *_dout, const PGLog *pglog)
28{
29 return *_dout << pglog->gen_prefix();
30}
31
32//////////////////// PGLog::IndexedLog ////////////////////
33
// Split the entries belonging to a child pg (per split_bits) out of this
// log and into *target, then rebuild both indexes.
void PGLog::IndexedLog::split_out_child(
  pg_t child_pgid,
  unsigned split_bits,
  PGLog::IndexedLog *target)
{
  // Drop our index first: pg_log_t::split_out_child moves entries away,
  // which would leave stale index pointers behind.
  unindex();
  *target = IndexedLog(pg_log_t::split_out_child(child_pgid, split_bits));
  // Reindex both halves over their (now disjoint) entry lists.
  index();
  target->index();
  // The riter caches an iterator into our list; it must be recomputed
  // after the list was modified.
  reset_rollback_info_trimmed_to_riter();
}
45
46void PGLog::IndexedLog::trim(
47 CephContext* cct,
48 eversion_t s,
c07f9fc5
FG
49 set<eversion_t> *trimmed,
50 set<string>* trimmed_dups,
181888fb 51 eversion_t *write_from_dups)
7c673cae 52{
7c673cae 53 assert(s <= can_rollback_to);
f64942e4
AA
54 if (complete_to != log.end())
55 lgeneric_subdout(cct, osd, 20) << " complete_to " << complete_to->version << dendl;
7c673cae 56
c07f9fc5
FG
57 auto earliest_dup_version =
58 log.rbegin()->version.version < cct->_conf->osd_pg_log_dups_tracked
59 ? 0u
37b3c998 60 : log.rbegin()->version.version - cct->_conf->osd_pg_log_dups_tracked + 1;
c07f9fc5 61
37b3c998 62 lgeneric_subdout(cct, osd, 20) << "earliest_dup_version = " << earliest_dup_version << dendl;
7c673cae 63 while (!log.empty()) {
c07f9fc5 64 const pg_log_entry_t &e = *log.begin();
7c673cae
FG
65 if (e.version > s)
66 break;
f64942e4 67 lgeneric_subdout(cct, osd, 20) << "trim " << e << dendl;
7c673cae
FG
68 if (trimmed)
69 trimmed->insert(e.version);
70
71 unindex(e); // remove from index,
72
c07f9fc5
FG
73 // add to dup list
74 if (e.version.version >= earliest_dup_version) {
181888fb 75 if (write_from_dups != nullptr && *write_from_dups > e.version) {
f64942e4 76 lgeneric_subdout(cct, osd, 20) << "updating write_from_dups from " << *write_from_dups << " to " << e.version << dendl;
181888fb
FG
77 *write_from_dups = e.version;
78 }
c07f9fc5
FG
79 dups.push_back(pg_log_dup_t(e));
80 index(dups.back());
81 for (const auto& extra : e.extra_reqids) {
82 // note: extras have the same version as outer op
83 dups.push_back(pg_log_dup_t(e.version, extra.second,
84 extra.first, e.return_code));
85 index(dups.back());
86 }
87 }
88
f64942e4
AA
89 bool reset_complete_to = false;
90 // we are trimming past complete_to, so reset complete_to
91 if (complete_to != log.end() && e.version >= complete_to->version)
92 reset_complete_to = true;
7c673cae
FG
93 if (rollback_info_trimmed_to_riter == log.rend() ||
94 e.version == rollback_info_trimmed_to_riter->version) {
95 log.pop_front();
96 rollback_info_trimmed_to_riter = log.rend();
97 } else {
98 log.pop_front();
99 }
f64942e4
AA
100
101 // reset complete_to to the beginning of the log
102 if (reset_complete_to) {
103 lgeneric_subdout(cct, osd, 20) << " moving complete_to " << " to "
104 << log.begin()->version << dendl;
105 complete_to = log.begin();
106 }
7c673cae
FG
107 }
108
c07f9fc5
FG
109 while (!dups.empty()) {
110 const auto& e = *dups.begin();
111 if (e.version.version >= earliest_dup_version)
112 break;
f64942e4 113 lgeneric_subdout(cct, osd, 20) << "trim dup " << e << dendl;
c07f9fc5
FG
114 if (trimmed_dups)
115 trimmed_dups->insert(e.get_key_name());
116 if (indexed_data & PGLOG_INDEXED_DUPS) {
117 dup_index.erase(e.reqid);
118 }
119 dups.pop_front();
120 }
121
7c673cae
FG
122 // raise tail?
123 if (tail < s)
124 tail = s;
125}
126
127ostream& PGLog::IndexedLog::print(ostream& out) const
128{
129 out << *this << std::endl;
130 for (list<pg_log_entry_t>::const_iterator p = log.begin();
131 p != log.end();
132 ++p) {
c07f9fc5
FG
133 out << *p << " " <<
134 (logged_object(p->soid) ? "indexed" : "NOT INDEXED") <<
135 std::endl;
7c673cae
FG
136 assert(!p->reqid_is_indexed() || logged_req(p->reqid));
137 }
c07f9fc5
FG
138
139 for (list<pg_log_dup_t>::const_iterator p = dups.begin();
140 p != dups.end();
141 ++p) {
142 out << *p << std::endl;
143 }
144
7c673cae
FG
145 return out;
146}
147
148//////////////////// PGLog ////////////////////
149
// Reset state for backfill: only the missing set is cleared; the log
// itself is left intact.
void PGLog::reset_backfill()
{
  missing.clear();
}
154
155void PGLog::clear() {
156 missing.clear();
157 log.clear();
158 log_keys_debug.clear();
159 undirty();
160}
161
162void PGLog::clear_info_log(
163 spg_t pgid,
164 ObjectStore::Transaction *t) {
165 coll_t coll(pgid);
166 t->remove(coll, pgid.make_pgmeta_oid());
167}
168
// Trim the pg log up to trim_to, recording trimmed keys/dups for the next
// on-disk write and keeping info.log_tail in sync.
// transaction_applied=false means a pending transaction may still raise
// last_complete, so the consistency assert is skipped.
void PGLog::trim(
  eversion_t trim_to,
  pg_info_t &info,
  bool transaction_applied)
{
  dout(10) << __func__ << " proposed trim_to = " << trim_to << dendl;
  // trim?
  if (trim_to > log.tail) {
    dout(10) << __func__ << " missing = " << missing.num_missing() << dendl;
    // Don't assert for backfill_targets
    // or whenever there are missing items
    if (transaction_applied && (missing.num_missing() == 0))
      assert(trim_to <= info.last_complete);

    dout(10) << "trim " << log << " to " << trim_to << dendl;
    // the trimmed/trimmed_dups/write_from_dups members accumulate what the
    // next write_log_and_missing() must persist/remove
    log.trim(cct, trim_to, &trimmed, &trimmed_dups, &write_from_dups);
    info.log_tail = log.tail;
    if (log.complete_to != log.log.end())
      dout(10) << " after trim complete_to " << log.complete_to->version << dendl;
  }
}
190
// Examine a replica's log/info/missing against our authoritative log,
// rewinding the replica's divergent head entries and adjusting its
// last_update/last_complete so its missing set is accurate.
// Note: oinfo/omissing are mutated; our own state is not (method is const).
void PGLog::proc_replica_log(
  pg_info_t &oinfo,
  const pg_log_t &olog,
  pg_missing_t& omissing,
  pg_shard_t from) const
{
  dout(10) << "proc_replica_log for osd." << from << ": "
	   << oinfo << " " << olog << " " << omissing << dendl;

  // no overlap: nothing we can say about divergence
  if (olog.head < log.tail) {
    dout(10) << __func__ << ": osd." << from << " does not overlap, not looking "
	     << "for divergent objects" << dendl;
    return;
  }
  // identical heads: replica cannot be divergent
  if (olog.head == log.head) {
    dout(10) << __func__ << ": osd." << from << " same log head, not looking "
	     << "for divergent objects" << dendl;
    return;
  }
  assert(olog.head >= log.tail);

  /*
    basically what we're doing here is rewinding the remote log,
    dropping divergent entries, until we find something that matches
    our master log.  we then reset last_update to reflect the new
    point up to which missing is accurate.

    later, in activate(), missing will get wound forward again and
    we will send the peer enough log to arrive at the same state.
  */

  for (map<hobject_t, pg_missing_item>::const_iterator i = omissing.get_items().begin();
       i != omissing.get_items().end();
       ++i) {
    dout(20) << " before missing " << i->first << " need " << i->second.need
	     << " have " << i->second.have << dendl;
  }

  // walk our log backwards to the newest entry the replica also has
  list<pg_log_entry_t>::const_reverse_iterator first_non_divergent =
    log.log.rbegin();
  while (1) {
    if (first_non_divergent == log.log.rend())
      break;
    if (first_non_divergent->version <= olog.head) {
      dout(20) << "merge_log point (usually last shared) is "
	       << *first_non_divergent << dendl;
      break;
    }
    ++first_non_divergent;
  }

  /* Because olog.head >= log.tail, we know that both pgs must at least have
   * the event represented by log.tail.  Similarly, because log.head >= olog.tail,
   * we know that the even represented by olog.tail must be common to both logs.
   * Furthermore, the event represented by a log tail was necessarily trimmed,
   * thus neither olog.tail nor log.tail can be divergent. It's
   * possible that olog/log contain no actual events between olog.head and
   * MAX(log.tail, olog.tail), however, since they might have been split out.
   * Thus, if we cannot find an event e such that
   * log.tail <= e.version <= log.head, the last_update must actually be
   * MAX(log.tail, olog.tail).
   */
  eversion_t limit = MAX(olog.tail, log.tail);
  eversion_t lu =
    (first_non_divergent == log.log.rend() ||
     first_non_divergent->version < limit) ?
    limit :
    first_non_divergent->version;

  // we merge and adjust the replica's log, rollback the rollbackable divergent entry,
  // remove the unrollbackable divergent entry and mark the according object as missing.
  // the rollback boundary must choose crt of the olog which going to be merged.
  // The replica log's(olog) crt will not be modified, so it could get passed
  // to _merge_divergent_entries() directly.
  IndexedLog folog(olog);
  auto divergent = folog.rewind_from_head(lu);
  _merge_divergent_entries(
    folog,
    divergent,
    oinfo,
    olog.get_can_rollback_to(),
    omissing,
    0,
    this);

  if (lu < oinfo.last_update) {
    dout(10) << " peer osd." << from << " last_update now " << lu << dendl;
    oinfo.last_update = lu;
  }

  if (omissing.have_missing()) {
    // last_complete is the newest version strictly below the first miss
    eversion_t first_missing =
      omissing.get_items().at(omissing.get_rmissing().begin()->second).need;
    oinfo.last_complete = eversion_t();
    list<pg_log_entry_t>::const_iterator i = olog.log.begin();
    for (;
	 i != olog.log.end();
	 ++i) {
      if (i->version < first_missing)
	oinfo.last_complete = i->version;
      else
	break;
    }
  } else {
    oinfo.last_complete = oinfo.last_update;
  }
} // proc_replica_log
7c673cae
FG
298
299/**
300 * rewind divergent entries at the head of the log
301 *
302 * This rewinds entries off the head of our log that are divergent.
303 * This is used by replicas during activation.
304 *
305 * @param newhead new head to rewind to
306 */
// Rewind entries newer than newhead off our log head and merge them back
// as divergent entries (rolling back or marking missing as appropriate).
// Used by replicas during activation; see the header comment above.
void PGLog::rewind_divergent_log(eversion_t newhead,
                                 pg_info_t &info, LogEntryHandler *rollbacker,
                                 bool &dirty_info, bool &dirty_big_info)
{
  dout(10) << "rewind_divergent_log truncate divergent future " <<
    newhead << dendl;

  // We need to preserve the original crt before it gets updated in rewind_from_head().
  // Later, in merge_object_divergent_entries(), we use it to check whether we can rollback
  // a divergent entry or not.
  eversion_t original_crt = log.get_can_rollback_to();
  dout(20) << __func__ << " original_crt = " << original_crt << dendl;
  if (info.last_complete > newhead)
    info.last_complete = newhead;

  auto divergent = log.rewind_from_head(newhead);
  if (!divergent.empty()) {
    // everything from the first divergent version on must be rewritten
    mark_dirty_from(divergent.front().version);
  }
  for (auto &&entry: divergent) {
    dout(10) << "rewind_divergent_log future divergent " << entry << dendl;
  }
  info.last_update = newhead;

  _merge_divergent_entries(
    log,
    divergent,
    info,
    original_crt,
    missing,
    rollbacker,
    this);

  dirty_info = true;
  dirty_big_info = true;
}
343
// Merge an authoritative log (olog, from osd fromosd) into ours: extend
// our tail with older history, rewind/merge any divergent head entries,
// append newer entries (updating missing), and merge dup entries.
// olog is consumed (entries are spliced out of it); oinfo/info and the
// dirty flags are updated to match.
void PGLog::merge_log(pg_info_t &oinfo, pg_log_t &olog, pg_shard_t fromosd,
                      pg_info_t &info, LogEntryHandler *rollbacker,
                      bool &dirty_info, bool &dirty_big_info)
{
  dout(10) << "merge_log " << olog << " from osd." << fromosd
           << " into " << log << dendl;

  // Check preconditions

  // If our log is empty, the incoming log needs to have not been trimmed.
  assert(!log.null() || olog.tail == eversion_t());
  // The logs must overlap.
  assert(log.head >= olog.tail && olog.head >= log.tail);

  for (map<hobject_t, pg_missing_item>::const_iterator i = missing.get_items().begin();
       i != missing.get_items().end();
       ++i) {
    dout(20) << "pg_missing_t sobject: " << i->first << dendl;
  }

  bool changed = false;

  // extend on tail?
  //  this is just filling in history.  it does not affect our
  //  missing set, as that should already be consistent with our
  //  current log.
  eversion_t orig_tail = log.tail;
  if (olog.tail < log.tail) {
    dout(10) << "merge_log extending tail to " << olog.tail << dendl;
    list<pg_log_entry_t>::iterator from = olog.log.begin();
    list<pg_log_entry_t>::iterator to;
    eversion_t last;
    for (to = from;
	 to != olog.log.end();
	 ++to) {
      if (to->version > log.tail)
	break;
      log.index(*to);
      dout(15) << *to << dendl;
      last = to->version;
    }
    mark_dirty_to(last);

    // splice into our log.
    log.log.splice(log.log.begin(),
		   olog.log, from, to);

    info.log_tail = log.tail = olog.tail;
    changed = true;
  }

  if (oinfo.stats.reported_seq < info.stats.reported_seq ||   // make sure reported always increases
      oinfo.stats.reported_epoch < info.stats.reported_epoch) {
    oinfo.stats.reported_seq = info.stats.reported_seq;
    oinfo.stats.reported_epoch = info.stats.reported_epoch;
  }
  if (info.last_backfill.is_max())
    info.stats = oinfo.stats;
  info.hit_set = oinfo.hit_set;

  // do we have divergent entries to throw out?
  if (olog.head < log.head) {
    rewind_divergent_log(olog.head, info, rollbacker, dirty_info, dirty_big_info);
    changed = true;
  }

  // extend on head?
  if (olog.head > log.head) {
    dout(10) << "merge_log extending head to " << olog.head << dendl;

    // find start point in olog
    list<pg_log_entry_t>::iterator to = olog.log.end();
    list<pg_log_entry_t>::iterator from = olog.log.end();
    eversion_t lower_bound = MAX(olog.tail, orig_tail);
    while (1) {
      if (from == olog.log.begin())
	break;
      --from;
      dout(20) << " ? " << *from << dendl;
      if (from->version <= log.head) {
	lower_bound = MAX(lower_bound, from->version);
	++from;
	break;
      }
    }
    dout(20) << "merge_log cut point (usually last shared) is "
	     << lower_bound << dendl;
    mark_dirty_from(lower_bound);

    // We need to preserve the original crt before it gets updated in rewind_from_head().
    // Later, in merge_object_divergent_entries(), we use it to check whether we can rollback
    // a divergent entry or not.
    eversion_t original_crt = log.get_can_rollback_to();
    dout(20) << __func__ << " original_crt = " << original_crt << dendl;
    auto divergent = log.rewind_from_head(lower_bound);
    // move aside divergent items
    for (auto &&oe: divergent) {
      dout(10) << "merge_log divergent " << oe << dendl;
    }
    log.roll_forward_to(log.head, rollbacker);

    // take ownership of the new entries and apply them, updating missing
    mempool::osd_pglog::list<pg_log_entry_t> new_entries;
    new_entries.splice(new_entries.end(), olog.log, from, to);
    append_log_entries_update_missing(
      info.last_backfill,
      info.last_backfill_bitwise,
      new_entries,
      false,
      &log,
      missing,
      rollbacker,
      this);

    _merge_divergent_entries(
      log,
      divergent,
      info,
      original_crt,
      missing,
      rollbacker,
      this);

    info.last_update = log.head = olog.head;

    // We cannot rollback into the new log entries
    log.skip_can_rollback_to_to_head();

    info.last_user_version = oinfo.last_user_version;
    info.purged_snaps = oinfo.purged_snaps;

    changed = true;
  }

  // now handle dups
  if (merge_log_dups(olog)) {
    changed = true;
  }

  dout(10) << "merge_log result " << log << " " << missing <<
    " changed=" << changed << dendl;

  if (changed) {
    dirty_info = true;
    dirty_big_info = true;
  }
}
490
c07f9fc5
FG
491
492// returns true if any changes were made to log.dups
// Merge olog's dup entries into ours: copy wholesale if we have none,
// otherwise extend on each end; finally drop any dups that now overlap
// the live log.  Returns true if log.dups was modified.
bool PGLog::merge_log_dups(const pg_log_t& olog) {
  bool changed = false;

  if (!olog.dups.empty()) {
    if (log.dups.empty()) {
      dout(10) << "merge_log copying olog dups to log " <<
	olog.dups.front().version << " to " <<
	olog.dups.back().version << dendl;
      changed = true;
      // everything must be (re)written
      dirty_from_dups = eversion_t();
      dirty_to_dups = eversion_t::max();
      // since our log.dups is empty just copy them
      for (const auto& i : olog.dups) {
	log.dups.push_back(i);
	log.index(log.dups.back());
      }
    } else {
      // since our log.dups is not empty try to extend on each end

      if (olog.dups.back().version > log.dups.back().version) {
	// extend the dups's tail (i.e., newer dups)
	dout(10) << "merge_log extending dups tail to " <<
	  olog.dups.back().version << dendl;
	changed = true;

	auto log_tail_version = log.dups.back().version;

	auto insert_cursor = log.dups.end();
	eversion_t last_shared = eversion_t::max();
	// walk olog's dups newest-first, inserting before the cursor so
	// the copied entries end up in ascending order
	for (auto i = olog.dups.crbegin(); i != olog.dups.crend(); ++i) {
	  if (i->version <= log_tail_version) break;
	  log.dups.insert(insert_cursor, *i);
	  last_shared = i->version;

	  auto prev = insert_cursor;
	  --prev;
	  // be sure to pass reference of copy in log.dups
	  log.index(*prev);

	  --insert_cursor; // make sure we insert in reverse order
	}
	mark_dirty_from_dups(last_shared);
      }

      if (olog.dups.front().version < log.dups.front().version) {
	// extend the dups's head (i.e., older dups)
	dout(10) << "merge_log extending dups head to " <<
	  olog.dups.front().version << dendl;
	changed = true;

	eversion_t last;
	auto insert_cursor = log.dups.begin();
	for (auto i = olog.dups.cbegin(); i != olog.dups.cend(); ++i) {
	  if (i->version >= insert_cursor->version) break;
	  log.dups.insert(insert_cursor, *i);
	  last = i->version;
	  auto prev = insert_cursor;
	  --prev;
	  // be sure to pass address of copy in log.dups
	  log.index(*prev);
	}
	mark_dirty_to_dups(last);
      }
    }
  }

  // remove any dup entries that overlap with pglog
  if (!log.dups.empty() && log.dups.back().version > log.tail) {
    dout(10) << "merge_log removed dups overlapping log entries (" <<
      log.tail << "," << log.dups.back().version << "]" << dendl;
    changed = true;

    while (!log.dups.empty() && log.dups.back().version > log.tail) {
      log.unindex(log.dups.back());
      mark_dirty_from_dups(log.dups.back().version);
      log.dups.pop_back();
    }
  }

  return changed;
}
574
7c673cae
FG
575void PGLog::check() {
576 if (!pg_log_debug)
577 return;
578 if (log.log.size() != log_keys_debug.size()) {
579 derr << "log.log.size() != log_keys_debug.size()" << dendl;
580 derr << "actual log:" << dendl;
581 for (list<pg_log_entry_t>::iterator i = log.log.begin();
582 i != log.log.end();
583 ++i) {
584 derr << " " << *i << dendl;
585 }
586 derr << "log_keys_debug:" << dendl;
587 for (set<string>::const_iterator i = log_keys_debug.begin();
588 i != log_keys_debug.end();
589 ++i) {
590 derr << " " << *i << dendl;
591 }
592 }
593 assert(log.log.size() == log_keys_debug.size());
594 for (list<pg_log_entry_t>::iterator i = log.log.begin();
595 i != log.log.end();
596 ++i) {
597 assert(log_keys_debug.count(i->get_key_name()));
598 }
599}
600
c07f9fc5 601// non-static
7c673cae
FG
// non-static
// Persist whatever log/missing state is currently marked dirty into the
// transaction (via *km key/value updates and key removals on log_oid),
// then clear the dirty markers.  No-op if nothing is dirty.
void PGLog::write_log_and_missing(
  ObjectStore::Transaction& t,
  map<string,bufferlist> *km,
  const coll_t& coll,
  const ghobject_t &log_oid,
  bool require_rollback)
{
  if (is_dirty()) {
    dout(5) << "write_log_and_missing with: "
	     << "dirty_to: " << dirty_to
	     << ", dirty_from: " << dirty_from
	     << ", writeout_from: " << writeout_from
	     << ", trimmed: " << trimmed
	     << ", trimmed_dups: " << trimmed_dups
	     << ", clear_divergent_priors: " << clear_divergent_priors
	     << dendl;
    _write_log_and_missing(
      t, km, log, coll, log_oid,
      dirty_to,
      dirty_from,
      writeout_from,
      trimmed,
      trimmed_dups,
      missing,
      !touched_log,
      require_rollback,
      clear_divergent_priors,
      dirty_to_dups,
      dirty_from_dups,
      write_from_dups,
      &rebuilt_missing_with_deletes,
      (pg_log_debug ? &log_keys_debug : nullptr));
    // dirty state has been captured in the transaction; reset the markers
    undirty();
  } else {
    dout(10) << "log is not dirty" << dendl;
  }
}
639
c07f9fc5 640// static
7c673cae
FG
// static
// Convenience wrapper: rewrite the entire log (no incremental dirty
// ranges) with divergent_priors but without a missing set.  The
// eversion_t::max()/eversion_t() arguments select "write everything".
void PGLog::write_log_and_missing_wo_missing(
  ObjectStore::Transaction& t,
  map<string,bufferlist> *km,
  pg_log_t &log,
  const coll_t& coll, const ghobject_t &log_oid,
  map<eversion_t, hobject_t> &divergent_priors,
  bool require_rollback
  )
{
  _write_log_and_missing_wo_missing(
    t, km, log, coll, log_oid,
    divergent_priors, eversion_t::max(), eversion_t(), eversion_t(),
    set<eversion_t>(),
    set<string>(),
    true, true, require_rollback,
    eversion_t::max(), eversion_t(), eversion_t(), nullptr);
}
658
c07f9fc5 659// static
7c673cae
FG
// static
// Convenience wrapper: rewrite the entire log plus the given missing set
// (no incremental dirty ranges, nothing trimmed).
void PGLog::write_log_and_missing(
  ObjectStore::Transaction& t,
  map<string,bufferlist> *km,
  pg_log_t &log,
  const coll_t& coll,
  const ghobject_t &log_oid,
  const pg_missing_tracker_t &missing,
  bool require_rollback,
  bool *rebuilt_missing_with_deletes)
{
  _write_log_and_missing(
    t, km, log, coll, log_oid,
    eversion_t::max(),
    eversion_t(),
    eversion_t(),
    set<eversion_t>(),
    set<string>(),
    missing,
    true, require_rollback, false,
    eversion_t::max(),
    eversion_t(),
    eversion_t(),
    rebuilt_missing_with_deletes, nullptr);
}
684
c07f9fc5 685// static
7c673cae
FG
// static
// Build the omap updates that persist the log (without a missing set):
// remove trimmed keys, clear the dirty key ranges, re-encode entries in
// [dirty ranges], write dup entries, divergent_priors and rollback
// markers.  km receives key->value writes; removals go on the transaction.
void PGLog::_write_log_and_missing_wo_missing(
  ObjectStore::Transaction& t,
  map<string,bufferlist> *km,
  pg_log_t &log,
  const coll_t& coll, const ghobject_t &log_oid,
  map<eversion_t, hobject_t> &divergent_priors,
  eversion_t dirty_to,
  eversion_t dirty_from,
  eversion_t writeout_from,
  const set<eversion_t> &trimmed,
  const set<string> &trimmed_dups,
  bool dirty_divergent_priors,
  bool touch_log,
  bool require_rollback,
  eversion_t dirty_to_dups,
  eversion_t dirty_from_dups,
  eversion_t write_from_dups,
  set<string> *log_keys_debug
  )
{
  // keys to delete: expired dups plus trimmed log entries
  set<string> to_remove(trimmed_dups);
  for (set<eversion_t>::const_iterator i = trimmed.begin();
       i != trimmed.end();
       ++i) {
    to_remove.insert(i->get_key_name());
    if (log_keys_debug) {
      assert(log_keys_debug->count(i->get_key_name()));
      log_keys_debug->erase(i->get_key_name());
    }
  }

  // dout(10) << "write_log_and_missing, clearing up to " << dirty_to << dendl;
  if (touch_log)
    t.touch(coll, log_oid);
  // clear the dirty prefix [begin, dirty_to) of on-disk keys
  if (dirty_to != eversion_t()) {
    t.omap_rmkeyrange(
      coll, log_oid,
      eversion_t().get_key_name(), dirty_to.get_key_name());
    clear_up_to(log_keys_debug, dirty_to.get_key_name());
  }
  // clear the dirty suffix [dirty_from, max) of on-disk keys
  if (dirty_to != eversion_t::max() && dirty_from != eversion_t::max()) {
    // dout(10) << "write_log_and_missing, clearing from " << dirty_from << dendl;
    t.omap_rmkeyrange(
      coll, log_oid,
      dirty_from.get_key_name(), eversion_t::max().get_key_name());
    clear_after(log_keys_debug, dirty_from.get_key_name());
  }

  // rewrite entries in the dirty prefix
  for (list<pg_log_entry_t>::iterator p = log.log.begin();
       p != log.log.end() && p->version <= dirty_to;
       ++p) {
    bufferlist bl(sizeof(*p) * 2);
    p->encode_with_checksum(bl);
    (*km)[p->get_key_name()].claim(bl);
  }

  // rewrite entries in the dirty suffix (walked newest-first)
  for (list<pg_log_entry_t>::reverse_iterator p = log.log.rbegin();
       p != log.log.rend() &&
	 (p->version >= dirty_from || p->version >= writeout_from) &&
	 p->version >= dirty_to;
       ++p) {
    bufferlist bl(sizeof(*p) * 2);
    p->encode_with_checksum(bl);
    (*km)[p->get_key_name()].claim(bl);
  }

  if (log_keys_debug) {
    for (map<string, bufferlist>::iterator i = (*km).begin();
	 i != (*km).end();
	 ++i) {
      if (i->first[0] == '_')
	continue;
      assert(!log_keys_debug->count(i->first));
      log_keys_debug->insert(i->first);
    }
  }

  // process dups after log_keys_debug is filled, so dups do not
  // end up in that set
  if (dirty_to_dups != eversion_t()) {
    pg_log_dup_t min, dirty_to_dup;
    dirty_to_dup.version = dirty_to_dups;
    t.omap_rmkeyrange(
      coll, log_oid,
      min.get_key_name(), dirty_to_dup.get_key_name());
  }
  if (dirty_to_dups != eversion_t::max() && dirty_from_dups != eversion_t::max()) {
    pg_log_dup_t max, dirty_from_dup;
    max.version = eversion_t::max();
    dirty_from_dup.version = dirty_from_dups;
    t.omap_rmkeyrange(
      coll, log_oid,
      dirty_from_dup.get_key_name(), max.get_key_name());
  }

  // rewrite dup entries in the dirty prefix
  for (const auto& entry : log.dups) {
    if (entry.version > dirty_to_dups)
      break;
    bufferlist bl;
    ::encode(entry, bl);
    (*km)[entry.get_key_name()].claim(bl);
  }

  // rewrite dup entries in the dirty suffix (walked newest-first)
  for (list<pg_log_dup_t>::reverse_iterator p = log.dups.rbegin();
       p != log.dups.rend() &&
	 (p->version >= dirty_from_dups || p->version >= write_from_dups) &&
	 p->version >= dirty_to_dups;
       ++p) {
    bufferlist bl;
    ::encode(*p, bl);
    (*km)[p->get_key_name()].claim(bl);
  }

  if (dirty_divergent_priors) {
    //dout(10) << "write_log_and_missing: writing divergent_priors" << dendl;
    ::encode(divergent_priors, (*km)["divergent_priors"]);
  }
  if (require_rollback) {
    ::encode(
      log.get_can_rollback_to(),
      (*km)["can_rollback_to"]);
    ::encode(
      log.get_rollback_info_trimmed_to(),
      (*km)["rollback_info_trimmed_to"]);
  }

  if (!to_remove.empty())
    t.omap_rmkeys(coll, log_oid, to_remove);
}
815
c07f9fc5 816// static
7c673cae
FG
// static
// Build the omap updates that persist the log AND the missing set:
// same key-range protocol as _write_log_and_missing_wo_missing, plus
// per-object "missing/<obj>" keys driven by missing.get_changed(), the
// optional may_include_deletes_in_missing marker, and optional removal of
// the legacy divergent_priors key.
void PGLog::_write_log_and_missing(
  ObjectStore::Transaction& t,
  map<string,bufferlist>* km,
  pg_log_t &log,
  const coll_t& coll, const ghobject_t &log_oid,
  eversion_t dirty_to,
  eversion_t dirty_from,
  eversion_t writeout_from,
  const set<eversion_t> &trimmed,
  const set<string> &trimmed_dups,
  const pg_missing_tracker_t &missing,
  bool touch_log,
  bool require_rollback,
  bool clear_divergent_priors,
  eversion_t dirty_to_dups,
  eversion_t dirty_from_dups,
  eversion_t write_from_dups,
  bool *rebuilt_missing_with_deletes, // in/out param
  set<string> *log_keys_debug
  ) {
  // keys to delete: expired dups plus trimmed log entries
  set<string> to_remove(trimmed_dups);
  for (set<eversion_t>::const_iterator i = trimmed.begin();
       i != trimmed.end();
       ++i) {
    to_remove.insert(i->get_key_name());
    if (log_keys_debug) {
      assert(log_keys_debug->count(i->get_key_name()));
      log_keys_debug->erase(i->get_key_name());
    }
  }

  if (touch_log)
    t.touch(coll, log_oid);
  // clear the dirty prefix [begin, dirty_to) of on-disk keys
  if (dirty_to != eversion_t()) {
    t.omap_rmkeyrange(
      coll, log_oid,
      eversion_t().get_key_name(), dirty_to.get_key_name());
    clear_up_to(log_keys_debug, dirty_to.get_key_name());
  }
  // clear the dirty suffix [dirty_from, max) of on-disk keys
  if (dirty_to != eversion_t::max() && dirty_from != eversion_t::max()) {
    // dout(10) << "write_log_and_missing, clearing from " << dirty_from << dendl;
    t.omap_rmkeyrange(
      coll, log_oid,
      dirty_from.get_key_name(), eversion_t::max().get_key_name());
    clear_after(log_keys_debug, dirty_from.get_key_name());
  }

  // rewrite entries in the dirty prefix
  for (list<pg_log_entry_t>::iterator p = log.log.begin();
       p != log.log.end() && p->version <= dirty_to;
       ++p) {
    bufferlist bl(sizeof(*p) * 2);
    p->encode_with_checksum(bl);
    (*km)[p->get_key_name()].claim(bl);
  }

  // rewrite entries in the dirty suffix (walked newest-first)
  for (list<pg_log_entry_t>::reverse_iterator p = log.log.rbegin();
       p != log.log.rend() &&
	 (p->version >= dirty_from || p->version >= writeout_from) &&
	 p->version >= dirty_to;
       ++p) {
    bufferlist bl(sizeof(*p) * 2);
    p->encode_with_checksum(bl);
    (*km)[p->get_key_name()].claim(bl);
  }

  if (log_keys_debug) {
    for (map<string, bufferlist>::iterator i = (*km).begin();
	 i != (*km).end();
	 ++i) {
      if (i->first[0] == '_')
	continue;
      assert(!log_keys_debug->count(i->first));
      log_keys_debug->insert(i->first);
    }
  }

  // process dups after log_keys_debug is filled, so dups do not
  // end up in that set
  if (dirty_to_dups != eversion_t()) {
    pg_log_dup_t min, dirty_to_dup;
    dirty_to_dup.version = dirty_to_dups;
    t.omap_rmkeyrange(
      coll, log_oid,
      min.get_key_name(), dirty_to_dup.get_key_name());
  }
  if (dirty_to_dups != eversion_t::max() && dirty_from_dups != eversion_t::max()) {
    pg_log_dup_t max, dirty_from_dup;
    max.version = eversion_t::max();
    dirty_from_dup.version = dirty_from_dups;
    t.omap_rmkeyrange(
      coll, log_oid,
      dirty_from_dup.get_key_name(), max.get_key_name());
  }

  // rewrite dup entries in the dirty prefix
  for (const auto& entry : log.dups) {
    if (entry.version > dirty_to_dups)
      break;
    bufferlist bl;
    ::encode(entry, bl);
    (*km)[entry.get_key_name()].claim(bl);
  }

  // rewrite dup entries in the dirty suffix (walked newest-first)
  for (list<pg_log_dup_t>::reverse_iterator p = log.dups.rbegin();
       p != log.dups.rend() &&
	 (p->version >= dirty_from_dups || p->version >= write_from_dups) &&
	 p->version >= dirty_to_dups;
       ++p) {
    bufferlist bl;
    ::encode(*p, bl);
    (*km)[p->get_key_name()].claim(bl);
  }

  if (clear_divergent_priors) {
    //dout(10) << "write_log_and_missing: writing divergent_priors" << dendl;
    to_remove.insert("divergent_priors");
  }
  // since we encode individual missing items instead of a whole
  // missing set, we need another key to store this bit of state
  if (*rebuilt_missing_with_deletes) {
    (*km)["may_include_deletes_in_missing"] = bufferlist();
    *rebuilt_missing_with_deletes = false;
  }
  missing.get_changed(
    [&](const hobject_t &obj) {
      string key = string("missing/") + obj.to_str();
      pg_missing_item item;
      if (!missing.is_missing(obj, &item)) {
	to_remove.insert(key);
      } else {
	uint64_t features = missing.may_include_deletes ? CEPH_FEATURE_OSD_RECOVERY_DELETES : 0;
	::encode(make_pair(obj, item), (*km)[key], features);
      }
    });
  if (require_rollback) {
    ::encode(
      log.get_can_rollback_to(),
      (*km)["can_rollback_to"]);
    ::encode(
      log.get_rollback_info_trimmed_to(),
      (*km)["rollback_info_trimmed_to"]);
  }

  if (!to_remove.empty())
    t.omap_rmkeys(coll, log_oid, to_remove);
}
c07f9fc5
FG
962
// Rebuild the missing set from the log and the on-disk object versions,
// this time allowing delete entries to be recorded as missing
// (may_include_deletes).  Entries not derivable from the log (repair,
// EIO, divergent_priors) are preserved.
void PGLog::rebuild_missing_set_with_deletes(ObjectStore *store,
					     coll_t pg_coll,
					     const pg_info_t &info)
{
  // save entries not generated from the current log (e.g. added due
  // to repair, EIO handling, or divergent_priors).
  map<hobject_t, pg_missing_item> extra_missing;
  for (const auto& p : missing.get_items()) {
    if (!log.logged_object(p.first)) {
      dout(20) << __func__ << " extra missing entry: " << p.first
	       << " " << p.second << dendl;
      extra_missing[p.first] = p.second;
    }
  }
  missing.clear();
  missing.may_include_deletes = true;

  // go through the log and add items that are not present or older
  // versions on disk, just as if we were reading the log + metadata
  // off disk originally
  set<hobject_t> did;
  for (list<pg_log_entry_t>::reverse_iterator i = log.log.rbegin();
       i != log.log.rend();
       ++i) {
    // entries at or below last_complete are known consistent on disk
    if (i->version <= info.last_complete)
      break;
    // only the newest entry per object matters; skip objects beyond the
    // backfill boundary and error entries
    if (i->soid > info.last_backfill ||
	i->is_error() ||
	did.find(i->soid) != did.end())
      continue;
    did.insert(i->soid);

    bufferlist bv;
    int r = store->getattr(
      pg_coll,
      ghobject_t(i->soid, ghobject_t::NO_GEN, info.pgid.shard),
      OI_ATTR,
      bv);
    dout(20) << __func__ << " check for log entry: " << *i << " = " << r << dendl;

    if (r >= 0) {
      // object exists on disk: missing only if the stored version is older
      object_info_t oi(bv);
      dout(20) << __func__ << " store version = " << oi.version << dendl;
      if (oi.version < i->version) {
	missing.add(i->soid, i->version, oi.version, i->is_delete());
      }
    } else {
      // object absent on disk: missing with no known "have" version
      missing.add(i->soid, i->version, eversion_t(), i->is_delete());
    }
  }

  for (const auto& p : extra_missing) {
    missing.add(p.first, p.second.need, p.second.have, p.second.is_delete());
  }
  // remember to persist the may_include_deletes_in_missing marker on the
  // next write_log_and_missing()
  rebuilt_missing_with_deletes = true;
}