]> git.proxmox.com Git - ceph.git/blame - ceph/src/osd/PGLog.cc
update sources to v12.1.2
[ceph.git] / ceph / src / osd / PGLog.cc
CommitLineData
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */
17
18#include "PGLog.h"
19#include "include/unordered_map.h"
20#include "common/ceph_context.h"
21
22#define dout_context cct
23#define dout_subsys ceph_subsys_osd
24#undef dout_prefix
25#define dout_prefix _prefix(_dout, this)
26
27static ostream& _prefix(std::ostream *_dout, const PGLog *pglog)
28{
29 return *_dout << pglog->gen_prefix();
30}
31
32//////////////////// PGLog::IndexedLog ////////////////////
33
34void PGLog::IndexedLog::split_out_child(
35 pg_t child_pgid,
36 unsigned split_bits,
37 PGLog::IndexedLog *target)
38{
39 unindex();
c07f9fc5 40 *target = IndexedLog(pg_log_t::split_out_child(child_pgid, split_bits));
7c673cae
FG
41 index();
42 target->index();
43 reset_rollback_info_trimmed_to_riter();
44}
45
46void PGLog::IndexedLog::trim(
47 CephContext* cct,
48 eversion_t s,
c07f9fc5
FG
49 set<eversion_t> *trimmed,
50 set<string>* trimmed_dups,
51 bool* dirty_dups)
7c673cae
FG
52{
53 if (complete_to != log.end() &&
54 complete_to->version <= s) {
55 generic_dout(0) << " bad trim to " << s << " when complete_to is "
56 << complete_to->version
57 << " on " << *this << dendl;
58 }
59
60 assert(s <= can_rollback_to);
61
c07f9fc5
FG
62 auto earliest_dup_version =
63 log.rbegin()->version.version < cct->_conf->osd_pg_log_dups_tracked
64 ? 0u
65 : log.rbegin()->version.version - cct->_conf->osd_pg_log_dups_tracked;
66
7c673cae 67 while (!log.empty()) {
c07f9fc5 68 const pg_log_entry_t &e = *log.begin();
7c673cae
FG
69 if (e.version > s)
70 break;
71 generic_dout(20) << "trim " << e << dendl;
72 if (trimmed)
73 trimmed->insert(e.version);
74
75 unindex(e); // remove from index,
76
c07f9fc5
FG
77 // add to dup list
78 if (e.version.version >= earliest_dup_version) {
79 if (dirty_dups) *dirty_dups = true;
80 dups.push_back(pg_log_dup_t(e));
81 index(dups.back());
82 for (const auto& extra : e.extra_reqids) {
83 // note: extras have the same version as outer op
84 dups.push_back(pg_log_dup_t(e.version, extra.second,
85 extra.first, e.return_code));
86 index(dups.back());
87 }
88 }
89
7c673cae
FG
90 if (rollback_info_trimmed_to_riter == log.rend() ||
91 e.version == rollback_info_trimmed_to_riter->version) {
92 log.pop_front();
93 rollback_info_trimmed_to_riter = log.rend();
94 } else {
95 log.pop_front();
96 }
97 }
98
c07f9fc5
FG
99 while (!dups.empty()) {
100 const auto& e = *dups.begin();
101 if (e.version.version >= earliest_dup_version)
102 break;
103 generic_dout(20) << "trim dup " << e << dendl;
104 if (trimmed_dups)
105 trimmed_dups->insert(e.get_key_name());
106 if (indexed_data & PGLOG_INDEXED_DUPS) {
107 dup_index.erase(e.reqid);
108 }
109 dups.pop_front();
110 }
111
7c673cae
FG
112 // raise tail?
113 if (tail < s)
114 tail = s;
115}
116
117ostream& PGLog::IndexedLog::print(ostream& out) const
118{
119 out << *this << std::endl;
120 for (list<pg_log_entry_t>::const_iterator p = log.begin();
121 p != log.end();
122 ++p) {
c07f9fc5
FG
123 out << *p << " " <<
124 (logged_object(p->soid) ? "indexed" : "NOT INDEXED") <<
125 std::endl;
7c673cae
FG
126 assert(!p->reqid_is_indexed() || logged_req(p->reqid));
127 }
c07f9fc5
FG
128
129 for (list<pg_log_dup_t>::const_iterator p = dups.begin();
130 p != dups.end();
131 ++p) {
132 out << *p << std::endl;
133 }
134
7c673cae
FG
135 return out;
136}
137
138//////////////////// PGLog ////////////////////
139
140void PGLog::reset_backfill()
141{
142 missing.clear();
143}
144
145void PGLog::clear() {
146 missing.clear();
147 log.clear();
148 log_keys_debug.clear();
149 undirty();
150}
151
152void PGLog::clear_info_log(
153 spg_t pgid,
154 ObjectStore::Transaction *t) {
155 coll_t coll(pgid);
156 t->remove(coll, pgid.make_pgmeta_oid());
157}
158
159void PGLog::trim(
160 eversion_t trim_to,
161 pg_info_t &info)
162{
163 // trim?
164 if (trim_to > log.tail) {
165 // We shouldn't be trimming the log past last_complete
166 assert(trim_to <= info.last_complete);
167
168 dout(10) << "trim " << log << " to " << trim_to << dendl;
c07f9fc5 169 log.trim(cct, trim_to, &trimmed, &trimmed_dups, &dirty_dups);
7c673cae
FG
170 info.log_tail = log.tail;
171 }
172}
173
174void PGLog::proc_replica_log(
175 pg_info_t &oinfo,
176 const pg_log_t &olog,
177 pg_missing_t& omissing,
178 pg_shard_t from) const
179{
180 dout(10) << "proc_replica_log for osd." << from << ": "
181 << oinfo << " " << olog << " " << omissing << dendl;
182
183 if (olog.head < log.tail) {
184 dout(10) << __func__ << ": osd." << from << " does not overlap, not looking "
185 << "for divergent objects" << dendl;
186 return;
187 }
188 if (olog.head == log.head) {
189 dout(10) << __func__ << ": osd." << from << " same log head, not looking "
190 << "for divergent objects" << dendl;
191 return;
192 }
193 assert(olog.head >= log.tail);
194
195 /*
196 basically what we're doing here is rewinding the remote log,
197 dropping divergent entries, until we find something that matches
198 our master log. we then reset last_update to reflect the new
199 point up to which missing is accurate.
200
201 later, in activate(), missing will get wound forward again and
202 we will send the peer enough log to arrive at the same state.
203 */
204
205 for (map<hobject_t, pg_missing_item>::const_iterator i = omissing.get_items().begin();
206 i != omissing.get_items().end();
207 ++i) {
208 dout(20) << " before missing " << i->first << " need " << i->second.need
209 << " have " << i->second.have << dendl;
210 }
211
212 list<pg_log_entry_t>::const_reverse_iterator first_non_divergent =
213 log.log.rbegin();
214 while (1) {
215 if (first_non_divergent == log.log.rend())
216 break;
217 if (first_non_divergent->version <= olog.head) {
218 dout(20) << "merge_log point (usually last shared) is "
219 << *first_non_divergent << dendl;
220 break;
221 }
222 ++first_non_divergent;
223 }
224
225 /* Because olog.head >= log.tail, we know that both pgs must at least have
226 * the event represented by log.tail. Similarly, because log.head >= olog.tail,
227 * we know that the even represented by olog.tail must be common to both logs.
228 * Furthermore, the event represented by a log tail was necessarily trimmed,
229 * thus neither olog.tail nor log.tail can be divergent. It's
230 * possible that olog/log contain no actual events between olog.head and
231 * MAX(log.tail, olog.tail), however, since they might have been split out.
232 * Thus, if we cannot find an event e such that
233 * log.tail <= e.version <= log.head, the last_update must actually be
234 * MAX(log.tail, olog.tail).
235 */
236 eversion_t limit = MAX(olog.tail, log.tail);
237 eversion_t lu =
238 (first_non_divergent == log.log.rend() ||
239 first_non_divergent->version < limit) ?
240 limit :
241 first_non_divergent->version;
242
243 IndexedLog folog(olog);
244 auto divergent = folog.rewind_from_head(lu);
245 _merge_divergent_entries(
246 folog,
247 divergent,
248 oinfo,
249 olog.get_can_rollback_to(),
250 omissing,
251 0,
252 this);
253
254 if (lu < oinfo.last_update) {
255 dout(10) << " peer osd." << from << " last_update now " << lu << dendl;
256 oinfo.last_update = lu;
257 }
258
259 if (omissing.have_missing()) {
260 eversion_t first_missing =
261 omissing.get_items().at(omissing.get_rmissing().begin()->second).need;
262 oinfo.last_complete = eversion_t();
263 list<pg_log_entry_t>::const_iterator i = olog.log.begin();
264 for (;
265 i != olog.log.end();
266 ++i) {
267 if (i->version < first_missing)
268 oinfo.last_complete = i->version;
269 else
270 break;
271 }
272 } else {
273 oinfo.last_complete = oinfo.last_update;
274 }
c07f9fc5 275} // proc_replica_log
7c673cae
FG
276
277/**
278 * rewind divergent entries at the head of the log
279 *
280 * This rewinds entries off the head of our log that are divergent.
281 * This is used by replicas during activation.
282 *
283 * @param newhead new head to rewind to
284 */
285void PGLog::rewind_divergent_log(eversion_t newhead,
286 pg_info_t &info, LogEntryHandler *rollbacker,
287 bool &dirty_info, bool &dirty_big_info)
288{
c07f9fc5
FG
289 dout(10) << "rewind_divergent_log truncate divergent future " <<
290 newhead << dendl;
7c673cae
FG
291
292
293 if (info.last_complete > newhead)
294 info.last_complete = newhead;
295
296 auto divergent = log.rewind_from_head(newhead);
297 if (!divergent.empty()) {
298 mark_dirty_from(divergent.front().version);
299 }
300 for (auto &&entry: divergent) {
301 dout(10) << "rewind_divergent_log future divergent " << entry << dendl;
302 }
303 info.last_update = newhead;
304
305 _merge_divergent_entries(
306 log,
307 divergent,
308 info,
309 log.get_can_rollback_to(),
310 missing,
311 rollbacker,
312 this);
313
314 dirty_info = true;
315 dirty_big_info = true;
316}
317
318void PGLog::merge_log(pg_info_t &oinfo, pg_log_t &olog, pg_shard_t fromosd,
319 pg_info_t &info, LogEntryHandler *rollbacker,
320 bool &dirty_info, bool &dirty_big_info)
321{
322 dout(10) << "merge_log " << olog << " from osd." << fromosd
323 << " into " << log << dendl;
324
325 // Check preconditions
326
327 // If our log is empty, the incoming log needs to have not been trimmed.
328 assert(!log.null() || olog.tail == eversion_t());
329 // The logs must overlap.
330 assert(log.head >= olog.tail && olog.head >= log.tail);
331
332 for (map<hobject_t, pg_missing_item>::const_iterator i = missing.get_items().begin();
333 i != missing.get_items().end();
334 ++i) {
335 dout(20) << "pg_missing_t sobject: " << i->first << dendl;
336 }
337
338 bool changed = false;
339
340 // extend on tail?
341 // this is just filling in history. it does not affect our
342 // missing set, as that should already be consistent with our
343 // current log.
344 eversion_t orig_tail = log.tail;
345 if (olog.tail < log.tail) {
346 dout(10) << "merge_log extending tail to " << olog.tail << dendl;
347 list<pg_log_entry_t>::iterator from = olog.log.begin();
348 list<pg_log_entry_t>::iterator to;
349 eversion_t last;
350 for (to = from;
351 to != olog.log.end();
352 ++to) {
353 if (to->version > log.tail)
354 break;
355 log.index(*to);
356 dout(15) << *to << dendl;
357 last = to->version;
358 }
359 mark_dirty_to(last);
360
361 // splice into our log.
362 log.log.splice(log.log.begin(),
363 olog.log, from, to);
c07f9fc5 364
7c673cae
FG
365 info.log_tail = log.tail = olog.tail;
366 changed = true;
367 }
368
369 if (oinfo.stats.reported_seq < info.stats.reported_seq || // make sure reported always increases
370 oinfo.stats.reported_epoch < info.stats.reported_epoch) {
371 oinfo.stats.reported_seq = info.stats.reported_seq;
372 oinfo.stats.reported_epoch = info.stats.reported_epoch;
373 }
374 if (info.last_backfill.is_max())
375 info.stats = oinfo.stats;
376 info.hit_set = oinfo.hit_set;
377
378 // do we have divergent entries to throw out?
379 if (olog.head < log.head) {
380 rewind_divergent_log(olog.head, info, rollbacker, dirty_info, dirty_big_info);
381 changed = true;
382 }
383
384 // extend on head?
385 if (olog.head > log.head) {
386 dout(10) << "merge_log extending head to " << olog.head << dendl;
c07f9fc5 387
7c673cae
FG
388 // find start point in olog
389 list<pg_log_entry_t>::iterator to = olog.log.end();
390 list<pg_log_entry_t>::iterator from = olog.log.end();
391 eversion_t lower_bound = MAX(olog.tail, orig_tail);
392 while (1) {
393 if (from == olog.log.begin())
394 break;
395 --from;
396 dout(20) << " ? " << *from << dendl;
397 if (from->version <= log.head) {
398 lower_bound = MAX(lower_bound, from->version);
399 ++from;
400 break;
401 }
402 }
403 dout(20) << "merge_log cut point (usually last shared) is "
404 << lower_bound << dendl;
405 mark_dirty_from(lower_bound);
406
407 auto divergent = log.rewind_from_head(lower_bound);
408 // move aside divergent items
409 for (auto &&oe: divergent) {
410 dout(10) << "merge_log divergent " << oe << dendl;
411 }
412 log.roll_forward_to(log.head, rollbacker);
413
31f18b77 414 mempool::osd_pglog::list<pg_log_entry_t> new_entries;
7c673cae
FG
415 new_entries.splice(new_entries.end(), olog.log, from, to);
416 append_log_entries_update_missing(
417 info.last_backfill,
418 info.last_backfill_bitwise,
419 new_entries,
420 false,
421 &log,
422 missing,
423 rollbacker,
424 this);
425
426 _merge_divergent_entries(
427 log,
428 divergent,
429 info,
430 log.get_can_rollback_to(),
431 missing,
432 rollbacker,
433 this);
434
435 info.last_update = log.head = olog.head;
436
437 // We cannot rollback into the new log entries
438 log.skip_can_rollback_to_to_head();
439
440 info.last_user_version = oinfo.last_user_version;
441 info.purged_snaps = oinfo.purged_snaps;
442
443 changed = true;
444 }
c07f9fc5
FG
445
446 // now handle dups
447 if (merge_log_dups(olog)) {
448 dirty_dups = true;
449 changed = true;
450 }
451
452 dout(10) << "merge_log result " << log << " " << missing <<
453 " changed=" << changed << dendl;
7c673cae
FG
454
455 if (changed) {
456 dirty_info = true;
457 dirty_big_info = true;
458 }
459}
460
c07f9fc5
FG
461
462// returns true if any changes were made to log.dups
463bool PGLog::merge_log_dups(const pg_log_t& olog) {
464 bool changed = false;
465
466 if (!olog.dups.empty()) {
467 if (log.dups.empty()) {
468 dout(10) << "merge_log copying olog dups to log " <<
469 olog.dups.front().version << " to " <<
470 olog.dups.back().version << dendl;
471 changed = true;
472 // since our log.dups is empty just copy them
473 for (const auto& i : olog.dups) {
474 log.dups.push_back(i);
475 log.index(log.dups.back());
476 }
477 } else {
478 // since our log.dups is not empty try to extend on each end
479
480 if (olog.dups.back().version > log.dups.back().version) {
481 // extend the dups's tail (i.e., newer dups)
482 dout(10) << "merge_log extending dups tail to " <<
483 olog.dups.back().version << dendl;
484 changed = true;
485
486 auto log_tail_version = log.dups.back().version;
487
488 auto insert_cursor = log.dups.end();
489 for (auto i = olog.dups.crbegin(); i != olog.dups.crend(); ++i) {
490 if (i->version <= log_tail_version) break;
491 log.dups.insert(insert_cursor, *i);
492
493 auto prev = insert_cursor;
494 --prev;
495 // be sure to pass reference of copy in log.dups
496 log.index(*prev);
497
498 --insert_cursor; // make sure we insert in reverse order
499 }
500 }
501
502 if (olog.dups.front().version < log.dups.front().version) {
503 // extend the dups's head (i.e., older dups)
504 dout(10) << "merge_log extending dups head to " <<
505 olog.dups.front().version << dendl;
506 changed = true;
507
508 auto insert_cursor = log.dups.begin();
509 for (auto i = olog.dups.cbegin(); i != olog.dups.cend(); ++i) {
510 if (i->version >= insert_cursor->version) break;
511 log.dups.insert(insert_cursor, *i);
512 auto prev = insert_cursor;
513 --prev;
514 // be sure to pass address of copy in log.dups
515 log.index(*prev);
516 }
517 }
518 }
519 }
520
521 // remove any dup entries that overlap with pglog
522 if (!log.dups.empty() && log.dups.back().version >= log.tail) {
523 dout(10) << "merge_log removed dups overlapping log entries [" <<
524 log.tail << "," << log.dups.back().version << "]" << dendl;
525 changed = true;
526
527 while (!log.dups.empty() && log.dups.back().version >= log.tail) {
528 log.unindex(log.dups.back());
529 log.dups.pop_back();
530 }
531 }
532
533 return changed;
534}
535
7c673cae
FG
536void PGLog::check() {
537 if (!pg_log_debug)
538 return;
539 if (log.log.size() != log_keys_debug.size()) {
540 derr << "log.log.size() != log_keys_debug.size()" << dendl;
541 derr << "actual log:" << dendl;
542 for (list<pg_log_entry_t>::iterator i = log.log.begin();
543 i != log.log.end();
544 ++i) {
545 derr << " " << *i << dendl;
546 }
547 derr << "log_keys_debug:" << dendl;
548 for (set<string>::const_iterator i = log_keys_debug.begin();
549 i != log_keys_debug.end();
550 ++i) {
551 derr << " " << *i << dendl;
552 }
553 }
554 assert(log.log.size() == log_keys_debug.size());
555 for (list<pg_log_entry_t>::iterator i = log.log.begin();
556 i != log.log.end();
557 ++i) {
558 assert(log_keys_debug.count(i->get_key_name()));
559 }
560}
561
c07f9fc5 562// non-static
7c673cae
FG
563void PGLog::write_log_and_missing(
564 ObjectStore::Transaction& t,
565 map<string,bufferlist> *km,
c07f9fc5
FG
566 const coll_t& coll,
567 const ghobject_t &log_oid,
7c673cae
FG
568 bool require_rollback)
569{
570 if (is_dirty()) {
571 dout(5) << "write_log_and_missing with: "
572 << "dirty_to: " << dirty_to
573 << ", dirty_from: " << dirty_from
574 << ", writeout_from: " << writeout_from
575 << ", trimmed: " << trimmed
c07f9fc5 576 << ", trimmed_dups: " << trimmed_dups
7c673cae
FG
577 << ", clear_divergent_priors: " << clear_divergent_priors
578 << dendl;
579 _write_log_and_missing(
580 t, km, log, coll, log_oid,
581 dirty_to,
582 dirty_from,
583 writeout_from,
584 trimmed,
c07f9fc5 585 trimmed_dups,
7c673cae
FG
586 missing,
587 !touched_log,
588 require_rollback,
589 clear_divergent_priors,
c07f9fc5
FG
590 dirty_dups,
591 &rebuilt_missing_with_deletes,
592 (pg_log_debug ? &log_keys_debug : nullptr));
7c673cae
FG
593 undirty();
594 } else {
595 dout(10) << "log is not dirty" << dendl;
596 }
597}
598
c07f9fc5 599// static
7c673cae
FG
600void PGLog::write_log_and_missing_wo_missing(
601 ObjectStore::Transaction& t,
602 map<string,bufferlist> *km,
603 pg_log_t &log,
604 const coll_t& coll, const ghobject_t &log_oid,
605 map<eversion_t, hobject_t> &divergent_priors,
c07f9fc5
FG
606 bool require_rollback,
607 bool dirty_dups)
7c673cae
FG
608{
609 _write_log_and_missing_wo_missing(
610 t, km, log, coll, log_oid,
611 divergent_priors, eversion_t::max(), eversion_t(), eversion_t(),
612 set<eversion_t>(),
c07f9fc5
FG
613 set<string>(),
614 true, true, require_rollback, dirty_dups, nullptr);
7c673cae
FG
615}
616
c07f9fc5 617// static
7c673cae
FG
618void PGLog::write_log_and_missing(
619 ObjectStore::Transaction& t,
620 map<string,bufferlist> *km,
621 pg_log_t &log,
622 const coll_t& coll,
623 const ghobject_t &log_oid,
624 const pg_missing_tracker_t &missing,
c07f9fc5
FG
625 bool require_rollback,
626 bool dirty_dups,
627 bool *rebuilt_missing_with_deletes)
7c673cae
FG
628{
629 _write_log_and_missing(
630 t, km, log, coll, log_oid,
631 eversion_t::max(),
632 eversion_t(),
633 eversion_t(),
634 set<eversion_t>(),
c07f9fc5 635 set<string>(),
7c673cae 636 missing,
c07f9fc5 637 true, require_rollback, false, dirty_dups, rebuilt_missing_with_deletes, nullptr);
7c673cae
FG
638}
639
c07f9fc5 640// static
7c673cae
FG
641void PGLog::_write_log_and_missing_wo_missing(
642 ObjectStore::Transaction& t,
643 map<string,bufferlist> *km,
644 pg_log_t &log,
645 const coll_t& coll, const ghobject_t &log_oid,
646 map<eversion_t, hobject_t> &divergent_priors,
647 eversion_t dirty_to,
648 eversion_t dirty_from,
649 eversion_t writeout_from,
650 const set<eversion_t> &trimmed,
c07f9fc5 651 const set<string> &trimmed_dups,
7c673cae
FG
652 bool dirty_divergent_priors,
653 bool touch_log,
654 bool require_rollback,
c07f9fc5 655 bool dirty_dups,
7c673cae
FG
656 set<string> *log_keys_debug
657 )
658{
c07f9fc5 659 set<string> to_remove(trimmed_dups);
7c673cae
FG
660 for (set<eversion_t>::const_iterator i = trimmed.begin();
661 i != trimmed.end();
662 ++i) {
663 to_remove.insert(i->get_key_name());
664 if (log_keys_debug) {
665 assert(log_keys_debug->count(i->get_key_name()));
666 log_keys_debug->erase(i->get_key_name());
667 }
668 }
669
c07f9fc5 670 // dout(10) << "write_log_and_missing, clearing up to " << dirty_to << dendl;
7c673cae
FG
671 if (touch_log)
672 t.touch(coll, log_oid);
673 if (dirty_to != eversion_t()) {
674 t.omap_rmkeyrange(
675 coll, log_oid,
676 eversion_t().get_key_name(), dirty_to.get_key_name());
677 clear_up_to(log_keys_debug, dirty_to.get_key_name());
678 }
679 if (dirty_to != eversion_t::max() && dirty_from != eversion_t::max()) {
c07f9fc5 680 // dout(10) << "write_log_and_missing, clearing from " << dirty_from << dendl;
7c673cae
FG
681 t.omap_rmkeyrange(
682 coll, log_oid,
683 dirty_from.get_key_name(), eversion_t::max().get_key_name());
684 clear_after(log_keys_debug, dirty_from.get_key_name());
685 }
686
687 for (list<pg_log_entry_t>::iterator p = log.log.begin();
688 p != log.log.end() && p->version <= dirty_to;
689 ++p) {
690 bufferlist bl(sizeof(*p) * 2);
691 p->encode_with_checksum(bl);
692 (*km)[p->get_key_name()].claim(bl);
693 }
694
695 for (list<pg_log_entry_t>::reverse_iterator p = log.log.rbegin();
696 p != log.log.rend() &&
697 (p->version >= dirty_from || p->version >= writeout_from) &&
698 p->version >= dirty_to;
699 ++p) {
700 bufferlist bl(sizeof(*p) * 2);
701 p->encode_with_checksum(bl);
702 (*km)[p->get_key_name()].claim(bl);
703 }
704
705 if (log_keys_debug) {
706 for (map<string, bufferlist>::iterator i = (*km).begin();
707 i != (*km).end();
708 ++i) {
709 if (i->first[0] == '_')
710 continue;
711 assert(!log_keys_debug->count(i->first));
712 log_keys_debug->insert(i->first);
713 }
714 }
715
c07f9fc5
FG
716 // process dirty_dups after log_keys_debug is filled, so dups do not
717 // end up in that set
718 if (dirty_dups) {
719 pg_log_dup_t min;
720 t.omap_rmkeyrange(
721 coll, log_oid,
722 min.get_key_name(), log.dups.begin()->get_key_name());
723 for (const auto& entry : log.dups) {
724 bufferlist bl;
725 ::encode(entry, bl);
726 (*km)[entry.get_key_name()].claim(bl);
727 }
728 }
729
7c673cae
FG
730 if (dirty_divergent_priors) {
731 //dout(10) << "write_log_and_missing: writing divergent_priors" << dendl;
732 ::encode(divergent_priors, (*km)["divergent_priors"]);
733 }
734 if (require_rollback) {
735 ::encode(
736 log.get_can_rollback_to(),
737 (*km)["can_rollback_to"]);
738 ::encode(
739 log.get_rollback_info_trimmed_to(),
740 (*km)["rollback_info_trimmed_to"]);
741 }
742
743 if (!to_remove.empty())
744 t.omap_rmkeys(coll, log_oid, to_remove);
745}
746
c07f9fc5 747// static
7c673cae
FG
748void PGLog::_write_log_and_missing(
749 ObjectStore::Transaction& t,
750 map<string,bufferlist>* km,
751 pg_log_t &log,
752 const coll_t& coll, const ghobject_t &log_oid,
753 eversion_t dirty_to,
754 eversion_t dirty_from,
755 eversion_t writeout_from,
756 const set<eversion_t> &trimmed,
c07f9fc5 757 const set<string> &trimmed_dups,
7c673cae
FG
758 const pg_missing_tracker_t &missing,
759 bool touch_log,
760 bool require_rollback,
761 bool clear_divergent_priors,
c07f9fc5
FG
762 bool dirty_dups,
763 bool *rebuilt_missing_with_deletes, // in/out param
7c673cae
FG
764 set<string> *log_keys_debug
765 ) {
c07f9fc5 766 set<string> to_remove(trimmed_dups);
7c673cae
FG
767 for (set<eversion_t>::const_iterator i = trimmed.begin();
768 i != trimmed.end();
769 ++i) {
770 to_remove.insert(i->get_key_name());
771 if (log_keys_debug) {
772 assert(log_keys_debug->count(i->get_key_name()));
773 log_keys_debug->erase(i->get_key_name());
774 }
775 }
776
777 if (touch_log)
778 t.touch(coll, log_oid);
779 if (dirty_to != eversion_t()) {
780 t.omap_rmkeyrange(
781 coll, log_oid,
782 eversion_t().get_key_name(), dirty_to.get_key_name());
783 clear_up_to(log_keys_debug, dirty_to.get_key_name());
784 }
785 if (dirty_to != eversion_t::max() && dirty_from != eversion_t::max()) {
786 // dout(10) << "write_log_and_missing, clearing from " << dirty_from << dendl;
787 t.omap_rmkeyrange(
788 coll, log_oid,
789 dirty_from.get_key_name(), eversion_t::max().get_key_name());
790 clear_after(log_keys_debug, dirty_from.get_key_name());
791 }
792
793 for (list<pg_log_entry_t>::iterator p = log.log.begin();
794 p != log.log.end() && p->version <= dirty_to;
795 ++p) {
796 bufferlist bl(sizeof(*p) * 2);
797 p->encode_with_checksum(bl);
798 (*km)[p->get_key_name()].claim(bl);
799 }
800
801 for (list<pg_log_entry_t>::reverse_iterator p = log.log.rbegin();
802 p != log.log.rend() &&
803 (p->version >= dirty_from || p->version >= writeout_from) &&
804 p->version >= dirty_to;
805 ++p) {
806 bufferlist bl(sizeof(*p) * 2);
807 p->encode_with_checksum(bl);
808 (*km)[p->get_key_name()].claim(bl);
809 }
810
811 if (log_keys_debug) {
812 for (map<string, bufferlist>::iterator i = (*km).begin();
813 i != (*km).end();
814 ++i) {
815 if (i->first[0] == '_')
816 continue;
817 assert(!log_keys_debug->count(i->first));
818 log_keys_debug->insert(i->first);
819 }
820 }
821
c07f9fc5
FG
822 // process dirty_dups after log_keys_debug is filled, so dups do not
823 // end up in that set
824 if (dirty_dups) {
825 pg_log_dup_t min;
826 t.omap_rmkeyrange(
827 coll, log_oid,
828 min.get_key_name(), log.dups.begin()->get_key_name());
829 for (const auto& entry : log.dups) {
830 bufferlist bl;
831 ::encode(entry, bl);
832 (*km)[entry.get_key_name()].claim(bl);
833 }
834 }
835
7c673cae
FG
836 if (clear_divergent_priors) {
837 //dout(10) << "write_log_and_missing: writing divergent_priors" << dendl;
838 to_remove.insert("divergent_priors");
839 }
c07f9fc5
FG
840 // since we encode individual missing items instead of a whole
841 // missing set, we need another key to store this bit of state
842 if (*rebuilt_missing_with_deletes) {
843 (*km)["may_include_deletes_in_missing"] = bufferlist();
844 *rebuilt_missing_with_deletes = false;
845 }
7c673cae
FG
846 missing.get_changed(
847 [&](const hobject_t &obj) {
848 string key = string("missing/") + obj.to_str();
849 pg_missing_item item;
850 if (!missing.is_missing(obj, &item)) {
851 to_remove.insert(key);
852 } else {
c07f9fc5
FG
853 uint64_t features = missing.may_include_deletes ? CEPH_FEATURE_OSD_RECOVERY_DELETES : 0;
854 ::encode(make_pair(obj, item), (*km)[key], features);
7c673cae
FG
855 }
856 });
857 if (require_rollback) {
858 ::encode(
859 log.get_can_rollback_to(),
860 (*km)["can_rollback_to"]);
861 ::encode(
862 log.get_rollback_info_trimmed_to(),
863 (*km)["rollback_info_trimmed_to"]);
864 }
865
866 if (!to_remove.empty())
867 t.omap_rmkeys(coll, log_oid, to_remove);
868}
c07f9fc5
FG
869
870void PGLog::rebuild_missing_set_with_deletes(ObjectStore *store,
871 coll_t pg_coll,
872 const pg_info_t &info)
873{
874 // save entries not generated from the current log (e.g. added due
875 // to repair, EIO handling, or divergent_priors).
876 map<hobject_t, pg_missing_item> extra_missing;
877 for (const auto& p : missing.get_items()) {
878 if (!log.logged_object(p.first)) {
879 dout(20) << __func__ << " extra missing entry: " << p.first
880 << " " << p.second << dendl;
881 extra_missing[p.first] = p.second;
882 }
883 }
884 missing.clear();
885 missing.may_include_deletes = true;
886
887 // go through the log and add items that are not present or older
888 // versions on disk, just as if we were reading the log + metadata
889 // off disk originally
890 set<hobject_t> did;
891 for (list<pg_log_entry_t>::reverse_iterator i = log.log.rbegin();
892 i != log.log.rend();
893 ++i) {
894 if (i->version <= info.last_complete)
895 break;
896 if (i->soid > info.last_backfill ||
897 i->is_error() ||
898 did.find(i->soid) != did.end())
899 continue;
900 did.insert(i->soid);
901
902 bufferlist bv;
903 int r = store->getattr(
904 pg_coll,
905 ghobject_t(i->soid, ghobject_t::NO_GEN, info.pgid.shard),
906 OI_ATTR,
907 bv);
908 dout(20) << __func__ << " check for log entry: " << *i << " = " << r << dendl;
909
910 if (r >= 0) {
911 object_info_t oi(bv);
912 dout(20) << __func__ << " store version = " << oi.version << dendl;
913 if (oi.version < i->version) {
914 missing.add(i->soid, i->version, oi.version, i->is_delete());
915 }
916 } else {
917 missing.add(i->soid, i->version, eversion_t(), i->is_delete());
918 }
919 }
920
921 for (const auto& p : extra_missing) {
922 missing.add(p.first, p.second.need, p.second.have, p.second.is_delete());
923 }
924 rebuilt_missing_with_deletes = true;
925}