]> git.proxmox.com Git - ceph.git/blob - ceph/src/osd/PGLog.cc
import ceph 14.2.5
[ceph.git] / ceph / src / osd / PGLog.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
18 #include "PGLog.h"
19 #include "include/unordered_map.h"
20 #include "common/ceph_context.h"
21
22 #define dout_context cct
23 #define dout_subsys ceph_subsys_osd
24 #undef dout_prefix
25 #define dout_prefix _prefix(_dout, this)
26
27 static ostream& _prefix(std::ostream *_dout, const PGLog *pglog)
28 {
29 return pglog->gen_prefix(*_dout);
30 }
31
32 //////////////////// PGLog::IndexedLog ////////////////////
33
// Split the entries that belong to a child PG out of this log.
//
// pg_log_t::split_out_child() partitions the entry list; the result is
// wrapped in a fresh IndexedLog for *target.  Because both entry lists
// change, the lookup indexes of both logs must be rebuilt, and our
// rollback_info_trimmed_to_riter (a reverse iterator into log) must be
// recomputed since the split invalidated it.
void PGLog::IndexedLog::split_out_child(
  pg_t child_pgid,            // id of the child PG being split out
  unsigned split_bits,        // hash bits used to route objects to the child
  PGLog::IndexedLog *target)  // out: receives the child's log
{
  unindex();  // drop index first: it references entries about to move
  *target = IndexedLog(pg_log_t::split_out_child(child_pgid, split_bits));
  index();          // rebuild our index over the remaining entries
  target->index();  // build the child's index
  reset_rollback_info_trimmed_to_riter();
}
45
// Trim log entries with version <= s off the (oldest) front of the log.
//
// Trimmed entries whose version is still within the dup-tracking window
// (the newest osd_pg_log_dups_tracked versions, measured against the
// newest log entry) are converted to pg_log_dup_t records and appended
// to dups; dups older than that window are expired afterwards.
//
// @param cct              context (config + debug output)
// @param s                trim everything up to and including this version
// @param trimmed          out (may be null): versions of trimmed log entries
// @param trimmed_dups     out (may be null): key names of expired dup entries
// @param write_from_dups  in/out (may be null): lowered to the oldest version
//                         newly copied into the dup list, so the caller knows
//                         from where dups must be rewritten to disk
void PGLog::IndexedLog::trim(
  CephContext* cct,
  eversion_t s,
  set<eversion_t> *trimmed,
  set<string>* trimmed_dups,
  eversion_t *write_from_dups)
{
  // Trimming past can_rollback_to would discard entries that may still be
  // needed to roll back uncommitted operations.
  ceph_assert(s <= can_rollback_to);
  if (complete_to != log.end())
    lgeneric_subdout(cct, osd, 20) << " complete_to " << complete_to->version << dendl;

  // Oldest version for which dup records are retained.
  // NOTE(review): log.rbegin() is dereferenced without an empty check here —
  // callers apparently never trim an empty log; confirm before reuse.
  auto earliest_dup_version =
    log.rbegin()->version.version < cct->_conf->osd_pg_log_dups_tracked
    ? 0u
    : log.rbegin()->version.version - cct->_conf->osd_pg_log_dups_tracked + 1;

  lgeneric_subdout(cct, osd, 20) << "earliest_dup_version = " << earliest_dup_version << dendl;
  while (!log.empty()) {
    const pg_log_entry_t &e = *log.begin();
    if (e.version > s)
      break;
    lgeneric_subdout(cct, osd, 20) << "trim " << e << dendl;
    if (trimmed)
      trimmed->emplace(e.version);

    unindex(e); // remove from index,

    // add to dup list
    if (e.version.version >= earliest_dup_version) {
      if (write_from_dups != nullptr && *write_from_dups > e.version) {
        lgeneric_subdout(cct, osd, 20) << "updating write_from_dups from " << *write_from_dups << " to " << e.version << dendl;
        *write_from_dups = e.version;
      }
      dups.push_back(pg_log_dup_t(e));
      index(dups.back());
      // Each extra reqid piggybacked on this entry becomes its own dup
      // record, using the per-reqid return code when one was recorded.
      uint32_t idx = 0;
      for (const auto& extra : e.extra_reqids) {
        int return_code = e.return_code;
        if (return_code >= 0) {
          auto it = e.extra_reqid_return_codes.find(idx);
          if (it != e.extra_reqid_return_codes.end()) {
            return_code = it->second;
          }
        }
        ++idx;

        // note: extras have the same version as outer op
        dups.push_back(pg_log_dup_t(e.version, extra.second,
                                    extra.first, return_code));
        index(dups.back());
      }
    }

    bool reset_complete_to = false;
    // we are trimming past complete_to, so reset complete_to
    if (complete_to != log.end() && e.version >= complete_to->version)
      reset_complete_to = true;
    // rollback_info_trimmed_to_riter is a reverse iterator into log; if it
    // is rend() or points at the entry being popped, it must be reset to
    // rend() so it remains valid after pop_front().
    if (rollback_info_trimmed_to_riter == log.rend() ||
        e.version == rollback_info_trimmed_to_riter->version) {
      log.pop_front();
      rollback_info_trimmed_to_riter = log.rend();
    } else {
      log.pop_front();
    }

    // reset complete_to to the beginning of the log
    if (reset_complete_to) {
      complete_to = log.begin();
      if (complete_to != log.end()) {
        lgeneric_subdout(cct, osd, 20) << " moving complete_to to "
                                       << log.begin()->version << dendl;
      } else {
        lgeneric_subdout(cct, osd, 20) << " log is now empty" << dendl;
      }
    }
  }

  // Expire dup records that fell out of the tracked window.
  while (!dups.empty()) {
    const auto& e = *dups.begin();
    if (e.version.version >= earliest_dup_version)
      break;
    lgeneric_subdout(cct, osd, 20) << "trim dup " << e << dendl;
    if (trimmed_dups)
      trimmed_dups->insert(e.get_key_name());
    unindex(e);
    dups.pop_front();
  }

  // raise tail?
  if (tail < s)
    tail = s;
}
138
139 ostream& PGLog::IndexedLog::print(ostream& out) const
140 {
141 out << *this << std::endl;
142 for (list<pg_log_entry_t>::const_iterator p = log.begin();
143 p != log.end();
144 ++p) {
145 out << *p << " " <<
146 (logged_object(p->soid) ? "indexed" : "NOT INDEXED") <<
147 std::endl;
148 ceph_assert(!p->reqid_is_indexed() || logged_req(p->reqid));
149 }
150
151 for (list<pg_log_dup_t>::const_iterator p = dups.begin();
152 p != dups.end();
153 ++p) {
154 out << *p << std::endl;
155 }
156
157 return out;
158 }
159
160 //////////////////// PGLog ////////////////////
161
// Drop the entire missing set; the log itself is left untouched.
// (Presumably backfill repopulates objects without consulting missing —
// confirm against callers.)
void PGLog::reset_backfill()
{
  missing.clear();
}
166
// Reset in-memory PGLog state: missing set, indexed log, debug key set,
// and (via undirty()) the dirty/trim bookkeeping, so nothing pending is
// written out.
void PGLog::clear() {
  missing.clear();
  log.clear();
  log_keys_debug.clear();
  undirty();
}
173
174 void PGLog::clear_info_log(
175 spg_t pgid,
176 ObjectStore::Transaction *t) {
177 coll_t coll(pgid);
178 t->remove(coll, pgid.make_pgmeta_oid());
179 }
180
181 void PGLog::trim(
182 eversion_t trim_to,
183 pg_info_t &info,
184 bool transaction_applied,
185 bool async)
186 {
187 dout(10) << __func__ << " proposed trim_to = " << trim_to << dendl;
188 // trim?
189 if (trim_to > log.tail) {
190 dout(10) << __func__ << " missing = " << missing.num_missing() << dendl;
191 // Don't assert for async_recovery_targets or backfill_targets
192 // or whenever there are missing items
193 if (transaction_applied && !async && (missing.num_missing() == 0))
194 ceph_assert(trim_to <= info.last_complete);
195
196 dout(10) << "trim " << log << " to " << trim_to << dendl;
197 log.trim(cct, trim_to, &trimmed, &trimmed_dups, &write_from_dups);
198 info.log_tail = log.tail;
199 if (log.complete_to != log.log.end())
200 dout(10) << " after trim complete_to " << log.complete_to->version << dendl;
201 }
202 }
203
// Examine a replica's log/info/missing and rewind its divergent head.
//
// Finds the newest entry in our log that the replica also has
// (last shared point), treats everything newer in the replica's log as
// divergent, merges those divergent entries into the replica's missing
// set, and lowers oinfo.last_update / recomputes oinfo.last_complete
// accordingly.  Only oinfo/omissing are modified; our own state is const.
void PGLog::proc_replica_log(
  pg_info_t &oinfo,
  const pg_log_t &olog,
  pg_missing_t& omissing,
  pg_shard_t from) const
{
  dout(10) << "proc_replica_log for osd." << from << ": "
           << oinfo << " " << olog << " " << omissing << dendl;

  if (olog.head < log.tail) {
    dout(10) << __func__ << ": osd." << from << " does not overlap, not looking "
             << "for divergent objects" << dendl;
    return;
  }
  if (olog.head == log.head) {
    dout(10) << __func__ << ": osd." << from << " same log head, not looking "
             << "for divergent objects" << dendl;
    return;
  }

  /*
    basically what we're doing here is rewinding the remote log,
    dropping divergent entries, until we find something that matches
    our master log.  we then reset last_update to reflect the new
    point up to which missing is accurate.

    later, in activate(), missing will get wound forward again and
    we will send the peer enough log to arrive at the same state.
  */

  for (map<hobject_t, pg_missing_item>::const_iterator i = omissing.get_items().begin();
       i != omissing.get_items().end();
       ++i) {
    dout(20) << " before missing " << i->first << " need " << i->second.need
             << " have " << i->second.have << dendl;
  }

  // Walk our log backwards to the newest entry with version <= olog.head:
  // the last point the replica can share with us.
  list<pg_log_entry_t>::const_reverse_iterator first_non_divergent =
    log.log.rbegin();
  while (1) {
    if (first_non_divergent == log.log.rend())
      break;
    if (first_non_divergent->version <= olog.head) {
      dout(20) << "merge_log point (usually last shared) is "
               << *first_non_divergent << dendl;
      break;
    }
    ++first_non_divergent;
  }

  /* Because olog.head >= log.tail, we know that both pgs must at least have
   * the event represented by log.tail.  Similarly, because log.head >= olog.tail,
   * we know that the event represented by olog.tail must be common to both logs.
   * Furthermore, the event represented by a log tail was necessarily trimmed,
   * thus neither olog.tail nor log.tail can be divergent. It's
   * possible that olog/log contain no actual events between olog.head and
   * max(log.tail, olog.tail), however, since they might have been split out.
   * Thus, if we cannot find an event e such that
   * log.tail <= e.version <= log.head, the last_update must actually be
   * max(log.tail, olog.tail).
   */
  eversion_t limit = std::max(olog.tail, log.tail);
  eversion_t lu =
    (first_non_divergent == log.log.rend() ||
     first_non_divergent->version < limit) ?
    limit :
    first_non_divergent->version;

  // we merge and adjust the replica's log, rollback the rollbackable divergent entry,
  // remove the unrollbackable divergent entry and mark the according object as missing.
  // the rollback boundary must choose crt of the olog which going to be merged.
  // The replica log's(olog) crt will not be modified, so it could get passed
  // to _merge_divergent_entries() directly.
  IndexedLog folog(olog);
  auto divergent = folog.rewind_from_head(lu);
  _merge_divergent_entries(
    folog,
    divergent,
    oinfo,
    olog.get_can_rollback_to(),
    omissing,
    0,
    this);

  if (lu < oinfo.last_update) {
    dout(10) << " peer osd." << from << " last_update now " << lu << dendl;
    oinfo.last_update = lu;
  }

  if (omissing.have_missing()) {
    // last_complete is the version just before the replica's first missing
    // object: scan its log forward until we hit that need version.
    eversion_t first_missing =
      omissing.get_items().at(omissing.get_rmissing().begin()->second).need;
    oinfo.last_complete = eversion_t();
    list<pg_log_entry_t>::const_iterator i = olog.log.begin();
    for (;
         i != olog.log.end();
         ++i) {
      if (i->version < first_missing)
        oinfo.last_complete = i->version;
      else
        break;
    }
  } else {
    // nothing missing: the replica is complete through its last_update
    oinfo.last_complete = oinfo.last_update;
  }
} // proc_replica_log
310
/**
 * rewind divergent entries at the head of the log
 *
 * This rewinds entries off the head of our log that are divergent.
 * This is used by replicas during activation.
 *
 * Entries newer than newhead are stripped off the log and handed to
 * _merge_divergent_entries(), which rolls back rollbackable ones and
 * marks the rest missing.  info.last_update/last_complete are lowered
 * to newhead as needed, and the log is marked dirty.
 *
 * @param newhead new head to rewind to
 * @param info PG info to adjust (last_update / last_complete)
 * @param rollbacker handler used to roll back / remove divergent objects
 * @param dirty_info / dirty_big_info set to true — caller must persist
 */
void PGLog::rewind_divergent_log(eversion_t newhead,
                                 pg_info_t &info, LogEntryHandler *rollbacker,
                                 bool &dirty_info, bool &dirty_big_info)
{
  dout(10) << "rewind_divergent_log truncate divergent future " <<
    newhead << dendl;

  // We need to preserve the original crt before it gets updated in rewind_from_head().
  // Later, in merge_object_divergent_entries(), we use it to check whether we can rollback
  // a divergent entry or not.
  eversion_t original_crt = log.get_can_rollback_to();
  dout(20) << __func__ << " original_crt = " << original_crt << dendl;
  if (info.last_complete > newhead)
    info.last_complete = newhead;

  auto divergent = log.rewind_from_head(newhead);
  if (!divergent.empty()) {
    // everything from the first divergent entry onward must be rewritten
    mark_dirty_from(divergent.front().version);
  }
  for (auto &&entry: divergent) {
    dout(10) << "rewind_divergent_log future divergent " << entry << dendl;
  }
  info.last_update = newhead;

  _merge_divergent_entries(
    log,
    divergent,
    info,
    original_crt,
    missing,
    rollbacker,
    this);

  dirty_info = true;
  dirty_big_info = true;
}
355
// Merge an authoritative log (olog, from osd `fromosd`) into our own.
//
// In order: possibly extend our log on the tail with older entries,
// propagate stats/hit_set from oinfo, rewind our divergent head if the
// authoritative head is older, extend our head with newer entries
// (updating the missing set via append_log_entries_update_missing), and
// finally merge the dup lists.  olog is consumed destructively (entries
// are spliced out of it); oinfo.stats may be raised to keep reported_*
// monotonic.  dirty_info/dirty_big_info are set if anything changed.
void PGLog::merge_log(pg_info_t &oinfo, pg_log_t &olog, pg_shard_t fromosd,
                      pg_info_t &info, LogEntryHandler *rollbacker,
                      bool &dirty_info, bool &dirty_big_info)
{
  dout(10) << "merge_log " << olog << " from osd." << fromosd
           << " into " << log << dendl;

  // Check preconditions

  // If our log is empty, the incoming log needs to have not been trimmed.
  ceph_assert(!log.null() || olog.tail == eversion_t());
  // The logs must overlap.
  ceph_assert(log.head >= olog.tail && olog.head >= log.tail);

  for (map<hobject_t, pg_missing_item>::const_iterator i = missing.get_items().begin();
       i != missing.get_items().end();
       ++i) {
    dout(20) << "pg_missing_t sobject: " << i->first << dendl;
  }

  bool changed = false;

  // extend on tail?
  //  this is just filling in history.  it does not affect our
  //  missing set, as that should already be consistent with our
  //  current log.
  eversion_t orig_tail = log.tail;
  if (olog.tail < log.tail) {
    dout(10) << "merge_log extending tail to " << olog.tail << dendl;
    list<pg_log_entry_t>::iterator from = olog.log.begin();
    list<pg_log_entry_t>::iterator to;
    eversion_t last;
    // index (and remember) every olog entry up to our current tail
    for (to = from;
         to != olog.log.end();
         ++to) {
      if (to->version > log.tail)
        break;
      log.index(*to);
      dout(15) << *to << dendl;
      last = to->version;
    }
    mark_dirty_to(last);

    // splice into our log.
    log.log.splice(log.log.begin(),
                   olog.log, from, to);

    info.log_tail = log.tail = olog.tail;
    changed = true;
  }

  // make sure reported always increases
  if (oinfo.stats.reported_seq < info.stats.reported_seq ||
      oinfo.stats.reported_epoch < info.stats.reported_epoch) {
    oinfo.stats.reported_seq = info.stats.reported_seq;
    oinfo.stats.reported_epoch = info.stats.reported_epoch;
  }
  if (info.last_backfill.is_max())
    info.stats = oinfo.stats;
  info.hit_set = oinfo.hit_set;

  // do we have divergent entries to throw out?
  if (olog.head < log.head) {
    rewind_divergent_log(olog.head, info, rollbacker, dirty_info, dirty_big_info);
    changed = true;
  }

  // extend on head?
  if (olog.head > log.head) {
    dout(10) << "merge_log extending head to " << olog.head << dendl;

    // find start point in olog: walk backwards to the first entry we
    // already have (version <= log.head); everything after it is new.
    list<pg_log_entry_t>::iterator to = olog.log.end();
    list<pg_log_entry_t>::iterator from = olog.log.end();
    eversion_t lower_bound = std::max(olog.tail, orig_tail);
    while (1) {
      if (from == olog.log.begin())
        break;
      --from;
      dout(20) << "  ? " << *from << dendl;
      if (from->version <= log.head) {
        lower_bound = std::max(lower_bound, from->version);
        ++from;
        break;
      }
    }
    dout(20) << "merge_log cut point (usually last shared) is "
             << lower_bound << dendl;
    mark_dirty_from(lower_bound);

    // We need to preserve the original crt before it gets updated in rewind_from_head().
    // Later, in merge_object_divergent_entries(), we use it to check whether we can rollback
    // a divergent entry or not.
    eversion_t original_crt = log.get_can_rollback_to();
    dout(20) << __func__ << " original_crt = " << original_crt << dendl;
    auto divergent = log.rewind_from_head(lower_bound);
    // move aside divergent items
    for (auto &&oe: divergent) {
      dout(10) << "merge_log divergent " << oe << dendl;
    }
    log.roll_forward_to(log.head, rollbacker);

    // splice the new entries out of olog and append them, updating missing
    mempool::osd_pglog::list<pg_log_entry_t> new_entries;
    new_entries.splice(new_entries.end(), olog.log, from, to);
    append_log_entries_update_missing(
      info.last_backfill,
      info.last_backfill_bitwise,
      new_entries,
      false,
      &log,
      missing,
      rollbacker,
      this);

    _merge_divergent_entries(
      log,
      divergent,
      info,
      original_crt,
      missing,
      rollbacker,
      this);

    info.last_update = log.head = olog.head;

    // We cannot rollback into the new log entries
    log.skip_can_rollback_to_to_head();

    info.last_user_version = oinfo.last_user_version;
    info.purged_snaps = oinfo.purged_snaps;
    // update num_missing too
    // we might have appended some more missing objects above
    info.stats.stats.sum.num_objects_missing = missing.num_missing();

    changed = true;
  }

  // now handle dups
  if (merge_log_dups(olog)) {
    changed = true;
  }

  dout(10) << "merge_log result " << log << " " << missing <<
    " changed=" << changed << dendl;

  if (changed) {
    dirty_info = true;
    dirty_big_info = true;
  }
}
505
506
// Merge olog's dup list into ours: copy wholesale if ours is empty,
// otherwise extend ours on both ends with dups we don't have, then drop
// any dups that now overlap the live log (version > log.tail).  Dirty
// ranges are marked so the changed keys get rewritten to disk.
// returns true if any changes were made to log.dups
bool PGLog::merge_log_dups(const pg_log_t& olog) {
  bool changed = false;

  if (!olog.dups.empty()) {
    if (log.dups.empty()) {
      dout(10) << "merge_log copying olog dups to log " <<
        olog.dups.front().version << " to " <<
        olog.dups.back().version << dendl;
      changed = true;
      dirty_from_dups = eversion_t();
      dirty_to_dups = eversion_t::max();
      // since our log.dups is empty just copy them
      for (const auto& i : olog.dups) {
        log.dups.push_back(i);
        log.index(log.dups.back());
      }
    } else {
      // since our log.dups is not empty try to extend on each end

      if (olog.dups.back().version > log.dups.back().version) {
        // extend the dups's tail (i.e., newer dups)
        dout(10) << "merge_log extending dups tail to " <<
          olog.dups.back().version << dendl;
        changed = true;

        auto log_tail_version = log.dups.back().version;

        // Walk olog's dups newest-to-oldest, inserting each before the
        // previous insertion point so the newer-than-ours suffix ends up
        // appended in ascending order.
        auto insert_cursor = log.dups.end();
        eversion_t last_shared = eversion_t::max();
        for (auto i = olog.dups.crbegin(); i != olog.dups.crend(); ++i) {
          if (i->version <= log_tail_version) break;
          log.dups.insert(insert_cursor, *i);
          last_shared = i->version;

          auto prev = insert_cursor;
          --prev;
          // be sure to pass reference of copy in log.dups
          log.index(*prev);

          --insert_cursor; // make sure we insert in reverse order
        }
        mark_dirty_from_dups(last_shared);
      }

      if (olog.dups.front().version < log.dups.front().version) {
        // extend the dups's head (i.e., older dups)
        dout(10) << "merge_log extending dups head to " <<
          olog.dups.front().version << dendl;
        changed = true;

        // Walk olog's dups oldest-to-newest, inserting each before our
        // original first element until versions meet.
        eversion_t last;
        auto insert_cursor = log.dups.begin();
        for (auto i = olog.dups.cbegin(); i != olog.dups.cend(); ++i) {
          if (i->version >= insert_cursor->version) break;
          log.dups.insert(insert_cursor, *i);
          last = i->version;
          auto prev = insert_cursor;
          --prev;
          // be sure to pass address of copy in log.dups
          log.index(*prev);
        }
        mark_dirty_to_dups(last);
      }
    }
  }

  // remove any dup entries that overlap with pglog
  if (!log.dups.empty() && log.dups.back().version > log.tail) {
    dout(10) << "merge_log removed dups overlapping log entries (" <<
      log.tail << "," << log.dups.back().version << "]" << dendl;
    changed = true;

    while (!log.dups.empty() && log.dups.back().version > log.tail) {
      log.unindex(log.dups.back());
      mark_dirty_from_dups(log.dups.back().version);
      log.dups.pop_back();
    }
  }

  return changed;
}
589
590 void PGLog::check() {
591 if (!pg_log_debug)
592 return;
593 if (log.log.size() != log_keys_debug.size()) {
594 derr << "log.log.size() != log_keys_debug.size()" << dendl;
595 derr << "actual log:" << dendl;
596 for (list<pg_log_entry_t>::iterator i = log.log.begin();
597 i != log.log.end();
598 ++i) {
599 derr << " " << *i << dendl;
600 }
601 derr << "log_keys_debug:" << dendl;
602 for (set<string>::const_iterator i = log_keys_debug.begin();
603 i != log_keys_debug.end();
604 ++i) {
605 derr << " " << *i << dendl;
606 }
607 }
608 ceph_assert(log.log.size() == log_keys_debug.size());
609 for (list<pg_log_entry_t>::iterator i = log.log.begin();
610 i != log.log.end();
611 ++i) {
612 ceph_assert(log_keys_debug.count(i->get_key_name()));
613 }
614 }
615
// non-static
//
// Persist whatever parts of the log/missing are dirty: delegates to
// _write_log_and_missing() with this PGLog's dirty bounds and trim sets
// (which are moved out and consumed), then clears all dirty state.
// No-op when nothing is dirty.
void PGLog::write_log_and_missing(
  ObjectStore::Transaction& t,
  map<string,bufferlist> *km,   // out: omap keys to set on the pgmeta object
  const coll_t& coll,
  const ghobject_t &log_oid,
  bool require_rollback)        // also persist can_rollback_to bounds
{
  if (is_dirty()) {
    dout(6) << "write_log_and_missing with: "
            << "dirty_to: " << dirty_to
            << ", dirty_from: " << dirty_from
            << ", writeout_from: " << writeout_from
            << ", trimmed: " << trimmed
            << ", trimmed_dups: " << trimmed_dups
            << ", clear_divergent_priors: " << clear_divergent_priors
            << dendl;
    _write_log_and_missing(
      t, km, log, coll, log_oid,
      dirty_to,
      dirty_from,
      writeout_from,
      std::move(trimmed),       // consumed; undirty() resets the members
      std::move(trimmed_dups),
      missing,
      !touched_log,
      require_rollback,
      clear_divergent_priors,
      dirty_to_dups,
      dirty_from_dups,
      write_from_dups,
      &rebuilt_missing_with_deletes,
      (pg_log_debug ? &log_keys_debug : nullptr));
    undirty();
  } else {
    dout(10) << "log is not dirty" << dendl;
  }
}
654
// static
//
// Convenience wrapper: rewrite the entire log (full dirty range,
// eversion_t() .. eversion_t::max()) plus divergent_priors, with no
// missing set and no debug key tracking.
void PGLog::write_log_and_missing_wo_missing(
  ObjectStore::Transaction& t,
  map<string,bufferlist> *km,
  pg_log_t &log,
  const coll_t& coll, const ghobject_t &log_oid,
  map<eversion_t, hobject_t> &divergent_priors,
  bool require_rollback
  )
{
  _write_log_and_missing_wo_missing(
    t, km, log, coll, log_oid,
    divergent_priors, eversion_t::max(), eversion_t(), eversion_t(),
    true, true, require_rollback,
    eversion_t::max(), eversion_t(), eversion_t(), nullptr);
}
671
// static
//
// Convenience wrapper: rewrite the entire log and the given missing set
// (full dirty ranges, empty trim sets), without divergent_priors
// clearing or debug key tracking.
void PGLog::write_log_and_missing(
  ObjectStore::Transaction& t,
  map<string,bufferlist> *km,
  pg_log_t &log,
  const coll_t& coll,
  const ghobject_t &log_oid,
  const pg_missing_tracker_t &missing,
  bool require_rollback,
  bool *rebuilt_missing_with_deletes)
{
  _write_log_and_missing(
    t, km, log, coll, log_oid,
    eversion_t::max(),   // dirty_to: everything
    eversion_t(),        // dirty_from
    eversion_t(),        // writeout_from
    set<eversion_t>(),   // nothing trimmed
    set<string>(),
    missing,
    true, require_rollback, false,
    eversion_t::max(),   // dirty_to_dups: everything
    eversion_t(),
    eversion_t(),
    rebuilt_missing_with_deletes, nullptr);
}
697
// static
//
// Serialize the dirty portions of the log (and divergent_priors) into
// omap updates: stale key ranges are removed via t, and re-encoded
// entries are collected into *km for the caller to apply.
//
// Log entries and dup entries live in distinct omap key namespaces
// (entry get_key_name() vs pg_log_dup_t get_key_name()); each is
// cleared/rewritten over [start, dirty_to) and [dirty_from, max)
// independently, with writeout_from/write_from_dups widening the
// rewrite range.
void PGLog::_write_log_and_missing_wo_missing(
  ObjectStore::Transaction& t,
  map<string,bufferlist> *km,
  pg_log_t &log,
  const coll_t& coll, const ghobject_t &log_oid,
  map<eversion_t, hobject_t> &divergent_priors,
  eversion_t dirty_to,
  eversion_t dirty_from,
  eversion_t writeout_from,
  bool dirty_divergent_priors,
  bool touch_log,
  bool require_rollback,
  eversion_t dirty_to_dups,
  eversion_t dirty_from_dups,
  eversion_t write_from_dups,
  set<string> *log_keys_debug
  )
{
  // dout(10) << "write_log_and_missing, clearing up to " << dirty_to << dendl;
  if (touch_log)
    t.touch(coll, log_oid);
  // clear stale keys below dirty_to ...
  if (dirty_to != eversion_t()) {
    t.omap_rmkeyrange(
      coll, log_oid,
      eversion_t().get_key_name(), dirty_to.get_key_name());
    clear_up_to(log_keys_debug, dirty_to.get_key_name());
  }
  // ... and above dirty_from (skip if both bounds say "rewrite everything")
  if (dirty_to != eversion_t::max() && dirty_from != eversion_t::max()) {
    //   dout(10) << "write_log_and_missing, clearing from " << dirty_from << dendl;
    t.omap_rmkeyrange(
      coll, log_oid,
      dirty_from.get_key_name(), eversion_t::max().get_key_name());
    clear_after(log_keys_debug, dirty_from.get_key_name());
  }

  // re-encode entries up to dirty_to (from the old end of the log) ...
  for (list<pg_log_entry_t>::iterator p = log.log.begin();
       p != log.log.end() && p->version <= dirty_to;
       ++p) {
    bufferlist bl(sizeof(*p) * 2);
    p->encode_with_checksum(bl);
    (*km)[p->get_key_name()].claim(bl);
  }

  // ... and from min(dirty_from, writeout_from) to the head
  for (list<pg_log_entry_t>::reverse_iterator p = log.log.rbegin();
       p != log.log.rend() &&
         (p->version >= dirty_from || p->version >= writeout_from) &&
         p->version >= dirty_to;
       ++p) {
    bufferlist bl(sizeof(*p) * 2);
    p->encode_with_checksum(bl);
    (*km)[p->get_key_name()].claim(bl);
  }

  if (log_keys_debug) {
    for (map<string, bufferlist>::iterator i = (*km).begin();
         i != (*km).end();
         ++i) {
      if (i->first[0] == '_')
        continue;  // meta keys (e.g. can_rollback_to) are not log entries
      ceph_assert(!log_keys_debug->count(i->first));
      log_keys_debug->insert(i->first);
    }
  }

  // process dups after log_keys_debug is filled, so dups do not
  // end up in that set
  if (dirty_to_dups != eversion_t()) {
    pg_log_dup_t min, dirty_to_dup;
    dirty_to_dup.version = dirty_to_dups;
    t.omap_rmkeyrange(
      coll, log_oid,
      min.get_key_name(), dirty_to_dup.get_key_name());
  }
  if (dirty_to_dups != eversion_t::max() && dirty_from_dups != eversion_t::max()) {
    pg_log_dup_t max, dirty_from_dup;
    max.version = eversion_t::max();
    dirty_from_dup.version = dirty_from_dups;
    t.omap_rmkeyrange(
      coll, log_oid,
      dirty_from_dup.get_key_name(), max.get_key_name());
  }

  for (const auto& entry : log.dups) {
    if (entry.version > dirty_to_dups)
      break;
    bufferlist bl;
    encode(entry, bl);
    (*km)[entry.get_key_name()].claim(bl);
  }

  for (list<pg_log_dup_t>::reverse_iterator p = log.dups.rbegin();
       p != log.dups.rend() &&
         (p->version >= dirty_from_dups || p->version >= write_from_dups) &&
         p->version >= dirty_to_dups;
       ++p) {
    bufferlist bl;
    encode(*p, bl);
    (*km)[p->get_key_name()].claim(bl);
  }

  if (dirty_divergent_priors) {
    //dout(10) << "write_log_and_missing: writing divergent_priors" << dendl;
    encode(divergent_priors, (*km)["divergent_priors"]);
  }
  if (require_rollback) {
    encode(
      log.get_can_rollback_to(),
      (*km)["can_rollback_to"]);
    encode(
      log.get_rollback_info_trimmed_to(),
      (*km)["rollback_info_trimmed_to"]);
  }
}
812
// static
//
// Serialize the dirty portions of the log, dup list, and per-object
// missing items into omap updates: stale/trimmed keys are queued for
// removal on t, re-encoded entries are collected into *km.  Mirrors
// _write_log_and_missing_wo_missing() but tracks missing items
// individually (keys "missing/<obj>") instead of divergent_priors.
void PGLog::_write_log_and_missing(
  ObjectStore::Transaction& t,
  map<string,bufferlist>* km,
  pg_log_t &log,
  const coll_t& coll, const ghobject_t &log_oid,
  eversion_t dirty_to,
  eversion_t dirty_from,
  eversion_t writeout_from,
  set<eversion_t> &&trimmed,       // consumed: versions whose keys get removed
  set<string> &&trimmed_dups,      // consumed: dup keys to remove
  const pg_missing_tracker_t &missing,
  bool touch_log,
  bool require_rollback,
  bool clear_divergent_priors,
  eversion_t dirty_to_dups,
  eversion_t dirty_from_dups,
  eversion_t write_from_dups,
  bool *rebuilt_missing_with_deletes, // in/out param
  set<string> *log_keys_debug
  ) {
  // collect all keys to delete; removal happens in one batch at the end
  set<string> to_remove;
  to_remove.swap(trimmed_dups);
  for (auto& t : trimmed) {
    string key = t.get_key_name();
    if (log_keys_debug) {
      auto it = log_keys_debug->find(key);
      ceph_assert(it != log_keys_debug->end());
      log_keys_debug->erase(it);
    }
    to_remove.emplace(std::move(key));
  }
  trimmed.clear();

  if (touch_log)
    t.touch(coll, log_oid);
  // clear stale log-entry keys below dirty_to ...
  if (dirty_to != eversion_t()) {
    t.omap_rmkeyrange(
      coll, log_oid,
      eversion_t().get_key_name(), dirty_to.get_key_name());
    clear_up_to(log_keys_debug, dirty_to.get_key_name());
  }
  // ... and above dirty_from (skip if both bounds say "rewrite everything")
  if (dirty_to != eversion_t::max() && dirty_from != eversion_t::max()) {
    //   dout(10) << "write_log_and_missing, clearing from " << dirty_from << dendl;
    t.omap_rmkeyrange(
      coll, log_oid,
      dirty_from.get_key_name(), eversion_t::max().get_key_name());
    clear_after(log_keys_debug, dirty_from.get_key_name());
  }

  // re-encode entries up to dirty_to (from the old end of the log) ...
  for (list<pg_log_entry_t>::iterator p = log.log.begin();
       p != log.log.end() && p->version <= dirty_to;
       ++p) {
    bufferlist bl(sizeof(*p) * 2);
    p->encode_with_checksum(bl);
    (*km)[p->get_key_name()].claim(bl);
  }

  // ... and from min(dirty_from, writeout_from) to the head
  for (list<pg_log_entry_t>::reverse_iterator p = log.log.rbegin();
       p != log.log.rend() &&
         (p->version >= dirty_from || p->version >= writeout_from) &&
         p->version >= dirty_to;
       ++p) {
    bufferlist bl(sizeof(*p) * 2);
    p->encode_with_checksum(bl);
    (*km)[p->get_key_name()].claim(bl);
  }

  if (log_keys_debug) {
    for (map<string, bufferlist>::iterator i = (*km).begin();
         i != (*km).end();
         ++i) {
      if (i->first[0] == '_')
        continue;  // meta keys (e.g. can_rollback_to) are not log entries
      ceph_assert(!log_keys_debug->count(i->first));
      log_keys_debug->insert(i->first);
    }
  }

  // process dups after log_keys_debug is filled, so dups do not
  // end up in that set
  if (dirty_to_dups != eversion_t()) {
    pg_log_dup_t min, dirty_to_dup;
    dirty_to_dup.version = dirty_to_dups;
    t.omap_rmkeyrange(
      coll, log_oid,
      min.get_key_name(), dirty_to_dup.get_key_name());
  }
  if (dirty_to_dups != eversion_t::max() && dirty_from_dups != eversion_t::max()) {
    pg_log_dup_t max, dirty_from_dup;
    max.version = eversion_t::max();
    dirty_from_dup.version = dirty_from_dups;
    t.omap_rmkeyrange(
      coll, log_oid,
      dirty_from_dup.get_key_name(), max.get_key_name());
  }

  for (const auto& entry : log.dups) {
    if (entry.version > dirty_to_dups)
      break;
    bufferlist bl;
    encode(entry, bl);
    (*km)[entry.get_key_name()].claim(bl);
  }

  for (list<pg_log_dup_t>::reverse_iterator p = log.dups.rbegin();
       p != log.dups.rend() &&
         (p->version >= dirty_from_dups || p->version >= write_from_dups) &&
         p->version >= dirty_to_dups;
       ++p) {
    bufferlist bl;
    encode(*p, bl);
    (*km)[p->get_key_name()].claim(bl);
  }

  if (clear_divergent_priors) {
    //dout(10) << "write_log_and_missing: writing divergent_priors" << dendl;
    to_remove.insert("divergent_priors");
  }
  // since we encode individual missing items instead of a whole
  // missing set, we need another key to store this bit of state
  if (*rebuilt_missing_with_deletes) {
    (*km)["may_include_deletes_in_missing"] = bufferlist();
    *rebuilt_missing_with_deletes = false;
  }
  // write/remove only the missing items that changed since the last flush
  missing.get_changed(
    [&](const hobject_t &obj) {
      string key = string("missing/") + obj.to_str();
      pg_missing_item item;
      if (!missing.is_missing(obj, &item)) {
        to_remove.insert(key);
      } else {
        uint64_t features = missing.may_include_deletes ? CEPH_FEATURE_OSD_RECOVERY_DELETES : 0;
        encode(make_pair(obj, item), (*km)[key], features);
      }
    });
  if (require_rollback) {
    encode(
      log.get_can_rollback_to(),
      (*km)["can_rollback_to"]);
    encode(
      log.get_rollback_info_trimmed_to(),
      (*km)["rollback_info_trimmed_to"]);
  }

  if (!to_remove.empty())
    t.omap_rmkeys(coll, log_oid, to_remove);
}
961
// Rebuild the missing set from the log plus on-disk object versions,
// enabling may_include_deletes (so delete entries are representable).
// Missing entries not derivable from the log (repair/EIO/divergent
// priors) are preserved.  Reads each logged object's OI_ATTR from the
// store to decide whether (and from which version) it is missing.
void PGLog::rebuild_missing_set_with_deletes(
  ObjectStore *store,
  ObjectStore::CollectionHandle& ch,
  const pg_info_t &info)
{
  // save entries not generated from the current log (e.g. added due
  // to repair, EIO handling, or divergent_priors).
  map<hobject_t, pg_missing_item> extra_missing;
  for (const auto& p : missing.get_items()) {
    if (!log.logged_object(p.first)) {
      dout(20) << __func__ << " extra missing entry: " << p.first
               << " " << p.second << dendl;
      extra_missing[p.first] = p.second;
    }
  }
  missing.clear();
  missing.may_include_deletes = true;

  // go through the log and add items that are not present or older
  // versions on disk, just as if we were reading the log + metadata
  // off disk originally
  set<hobject_t> did;
  for (list<pg_log_entry_t>::reverse_iterator i = log.log.rbegin();
       i != log.log.rend();
       ++i) {
    if (i->version <= info.last_complete)
      break;  // everything at or below last_complete is known complete
    // skip objects past the backfill boundary, error entries, and
    // objects already handled by a newer log entry
    if (i->soid > info.last_backfill ||
        i->is_error() ||
        did.find(i->soid) != did.end())
      continue;
    did.insert(i->soid);

    bufferlist bv;
    int r = store->getattr(
      ch,
      ghobject_t(i->soid, ghobject_t::NO_GEN, info.pgid.shard),
      OI_ATTR,
      bv);
    dout(20) << __func__ << " check for log entry: " << *i << " = " << r << dendl;

    if (r >= 0) {
      // object exists on disk: missing only if its stored version is older
      object_info_t oi(bv);
      dout(20) << __func__ << " store version = " << oi.version << dendl;
      if (oi.version < i->version) {
        missing.add(i->soid, i->version, oi.version, i->is_delete());
      }
    } else {
      // object absent on disk: missing from scratch
      missing.add(i->soid, i->version, eversion_t(), i->is_delete());
    }
  }

  // re-add the preserved non-log-derived entries
  for (const auto& p : extra_missing) {
    missing.add(p.first, p.second.need, p.second.have, p.second.is_delete());
  }
  rebuilt_missing_with_deletes = true;
}