// ceph/src/osd/PGLog.cc
// (mirrored from ceph.git, commit 7b086eb30a08999f2aae2ed47e0058c414abdafe)
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
18 #include "PGLog.h"
19 #include "include/unordered_map.h"
20 #include "common/ceph_context.h"
21
22 #define dout_context cct
23 #define dout_subsys ceph_subsys_osd
24 #undef dout_prefix
25 #define dout_prefix _prefix(_dout, this)
26
// Prefix generator used by the dout/dendl macros above: every debug line
// emitted from this file is prefixed with the owning PGLog's identity.
static ostream& _prefix(std::ostream *_dout, const PGLog *pglog)
{
  return *_dout << pglog->gen_prefix();
}
31
32 //////////////////// PGLog::IndexedLog ////////////////////
33
// Split the entries that belong to a child PG (selected by child_pgid and
// split_bits) out of this log and into *target.
//
// The order here matters: our in-memory index refers to entries by
// address, so it must be dropped before pg_log_t::split_out_child moves
// entries out, and both logs must be re-indexed afterwards.
void PGLog::IndexedLog::split_out_child(
  pg_t child_pgid,
  unsigned split_bits,
  PGLog::IndexedLog *target)
{
  unindex();
  *target = IndexedLog(pg_log_t::split_out_child(child_pgid, split_bits));
  index();
  target->index();
  // the cached riter points into log.log, which split may have changed
  reset_rollback_info_trimmed_to_riter();
}
45
46 void PGLog::IndexedLog::trim(
47 CephContext* cct,
48 eversion_t s,
49 set<eversion_t> *trimmed,
50 set<string>* trimmed_dups,
51 bool* dirty_dups)
52 {
53 if (complete_to != log.end() &&
54 complete_to->version <= s) {
55 generic_dout(0) << " bad trim to " << s << " when complete_to is "
56 << complete_to->version
57 << " on " << *this << dendl;
58 }
59
60 assert(s <= can_rollback_to);
61
62 auto earliest_dup_version =
63 log.rbegin()->version.version < cct->_conf->osd_pg_log_dups_tracked
64 ? 0u
65 : log.rbegin()->version.version - cct->_conf->osd_pg_log_dups_tracked;
66
67 while (!log.empty()) {
68 const pg_log_entry_t &e = *log.begin();
69 if (e.version > s)
70 break;
71 generic_dout(20) << "trim " << e << dendl;
72 if (trimmed)
73 trimmed->insert(e.version);
74
75 unindex(e); // remove from index,
76
77 // add to dup list
78 if (e.version.version >= earliest_dup_version) {
79 if (dirty_dups) *dirty_dups = true;
80 dups.push_back(pg_log_dup_t(e));
81 index(dups.back());
82 for (const auto& extra : e.extra_reqids) {
83 // note: extras have the same version as outer op
84 dups.push_back(pg_log_dup_t(e.version, extra.second,
85 extra.first, e.return_code));
86 index(dups.back());
87 }
88 }
89
90 if (rollback_info_trimmed_to_riter == log.rend() ||
91 e.version == rollback_info_trimmed_to_riter->version) {
92 log.pop_front();
93 rollback_info_trimmed_to_riter = log.rend();
94 } else {
95 log.pop_front();
96 }
97 }
98
99 while (!dups.empty()) {
100 const auto& e = *dups.begin();
101 if (e.version.version >= earliest_dup_version)
102 break;
103 generic_dout(20) << "trim dup " << e << dendl;
104 if (trimmed_dups)
105 trimmed_dups->insert(e.get_key_name());
106 if (indexed_data & PGLOG_INDEXED_DUPS) {
107 dup_index.erase(e.reqid);
108 }
109 dups.pop_front();
110 }
111
112 // raise tail?
113 if (tail < s)
114 tail = s;
115 }
116
117 ostream& PGLog::IndexedLog::print(ostream& out) const
118 {
119 out << *this << std::endl;
120 for (list<pg_log_entry_t>::const_iterator p = log.begin();
121 p != log.end();
122 ++p) {
123 out << *p << " " <<
124 (logged_object(p->soid) ? "indexed" : "NOT INDEXED") <<
125 std::endl;
126 assert(!p->reqid_is_indexed() || logged_req(p->reqid));
127 }
128
129 for (list<pg_log_dup_t>::const_iterator p = dups.begin();
130 p != dups.end();
131 ++p) {
132 out << *p << std::endl;
133 }
134
135 return out;
136 }
137
138 //////////////////// PGLog ////////////////////
139
// Forget all missing objects; backfill will repopulate the objects
// themselves, so the missing set no longer applies.
void PGLog::reset_backfill()
{
  missing.clear();
}
144
// Drop all in-memory log state (missing set, log entries, debug key set)
// and clear the dirty flags so nothing stale gets written out.
void PGLog::clear() {
  missing.clear();
  log.clear();
  log_keys_debug.clear();
  undirty();
}
151
152 void PGLog::clear_info_log(
153 spg_t pgid,
154 ObjectStore::Transaction *t) {
155 coll_t coll(pgid);
156 t->remove(coll, pgid.make_pgmeta_oid());
157 }
158
159 void PGLog::trim(
160 eversion_t trim_to,
161 pg_info_t &info)
162 {
163 // trim?
164 if (trim_to > log.tail) {
165 // We shouldn't be trimming the log past last_complete
166 assert(trim_to <= info.last_complete);
167
168 dout(10) << "trim " << log << " to " << trim_to << dendl;
169 log.trim(cct, trim_to, &trimmed, &trimmed_dups, &dirty_dups);
170 info.log_tail = log.tail;
171 }
172 }
173
// Process a replica's log/info/missing during peering.
//
// Rewinds (a working copy of) the replica's log back to the newest entry
// it shares with our authoritative log, folds the divergent entries into
// the replica's missing set, and lowers oinfo.last_update and
// last_complete accordingly.  The replica's state itself is not touched
// here; missing gets wound forward again later in activate().
void PGLog::proc_replica_log(
  pg_info_t &oinfo,
  const pg_log_t &olog,
  pg_missing_t& omissing,
  pg_shard_t from) const
{
  dout(10) << "proc_replica_log for osd." << from << ": "
	   << oinfo << " " << olog << " " << omissing << dendl;

  if (olog.head < log.tail) {
    dout(10) << __func__ << ": osd." << from << " does not overlap, not looking "
	     << "for divergent objects" << dendl;
    return;
  }
  if (olog.head == log.head) {
    dout(10) << __func__ << ": osd." << from << " same log head, not looking "
	     << "for divergent objects" << dendl;
    return;
  }
  assert(olog.head >= log.tail);

  /*
    basically what we're doing here is rewinding the remote log,
    dropping divergent entries, until we find something that matches
    our master log. we then reset last_update to reflect the new
    point up to which missing is accurate.

    later, in activate(), missing will get wound forward again and
    we will send the peer enough log to arrive at the same state.
  */

  for (map<hobject_t, pg_missing_item>::const_iterator i = omissing.get_items().begin();
       i != omissing.get_items().end();
       ++i) {
    dout(20) << " before missing " << i->first << " need " << i->second.need
	     << " have " << i->second.have << dendl;
  }

  // walk backwards from our newest entry to the newest entry the replica
  // could also have (version <= olog.head): the last non-divergent point
  list<pg_log_entry_t>::const_reverse_iterator first_non_divergent =
    log.log.rbegin();
  while (1) {
    if (first_non_divergent == log.log.rend())
      break;
    if (first_non_divergent->version <= olog.head) {
      dout(20) << "merge_log point (usually last shared) is "
	       << *first_non_divergent << dendl;
      break;
    }
    ++first_non_divergent;
  }

  /* Because olog.head >= log.tail, we know that both pgs must at least have
   * the event represented by log.tail. Similarly, because log.head >= olog.tail,
   * we know that the even represented by olog.tail must be common to both logs.
   * Furthermore, the event represented by a log tail was necessarily trimmed,
   * thus neither olog.tail nor log.tail can be divergent. It's
   * possible that olog/log contain no actual events between olog.head and
   * MAX(log.tail, olog.tail), however, since they might have been split out.
   * Thus, if we cannot find an event e such that
   * log.tail <= e.version <= log.head, the last_update must actually be
   * MAX(log.tail, olog.tail).
   */
  eversion_t limit = MAX(olog.tail, log.tail);
  eversion_t lu =
    (first_non_divergent == log.log.rend() ||
     first_non_divergent->version < limit) ?
    limit :
    first_non_divergent->version;

  // rewind a working copy of the replica's log and merge the divergent
  // entries into its missing set (rollbacker is 0: we cannot roll back
  // objects on the remote OSD from here)
  IndexedLog folog(olog);
  auto divergent = folog.rewind_from_head(lu);
  _merge_divergent_entries(
    folog,
    divergent,
    oinfo,
    olog.get_can_rollback_to(),
    omissing,
    0,
    this);

  if (lu < oinfo.last_update) {
    dout(10) << " peer osd." << from << " last_update now " << lu << dendl;
    oinfo.last_update = lu;
  }

  // last_complete is the version just before the replica's first missing
  // object (or equal to last_update when nothing is missing)
  if (omissing.have_missing()) {
    eversion_t first_missing =
      omissing.get_items().at(omissing.get_rmissing().begin()->second).need;
    oinfo.last_complete = eversion_t();
    list<pg_log_entry_t>::const_iterator i = olog.log.begin();
    for (;
	 i != olog.log.end();
	 ++i) {
      if (i->version < first_missing)
	oinfo.last_complete = i->version;
      else
	break;
    }
  } else {
    oinfo.last_complete = oinfo.last_update;
  }
} // proc_replica_log
276
/**
 * rewind divergent entries at the head of the log
 *
 * This rewinds entries off the head of our log that are divergent.
 * This is used by replicas during activation.
 *
 * @param newhead new head to rewind to
 * @param info pg info; last_update/last_complete are lowered to newhead
 * @param rollbacker handler used to roll back or remove divergent objects
 * @param dirty_info set to true so the caller persists info
 * @param dirty_big_info set to true so the caller persists big info
 */
void PGLog::rewind_divergent_log(eversion_t newhead,
				 pg_info_t &info, LogEntryHandler *rollbacker,
				 bool &dirty_info, bool &dirty_big_info)
{
  dout(10) << "rewind_divergent_log truncate divergent future " <<
    newhead << dendl;


  if (info.last_complete > newhead)
    info.last_complete = newhead;

  auto divergent = log.rewind_from_head(newhead);
  if (!divergent.empty()) {
    // everything from the first divergent entry on must be rewritten
    mark_dirty_from(divergent.front().version);
  }
  for (auto &&entry: divergent) {
    dout(10) << "rewind_divergent_log future divergent " << entry << dendl;
  }
  info.last_update = newhead;

  // fold the divergent entries back in: roll objects back where possible,
  // otherwise mark them missing
  _merge_divergent_entries(
    log,
    divergent,
    info,
    log.get_can_rollback_to(),
    missing,
    rollbacker,
    this);

  dirty_info = true;
  dirty_big_info = true;
}
317
// Merge an authoritative log (olog, from osd fromosd) into our own.
//
// Four phases: (1) extend our log on the tail with older history from
// olog (does not affect missing); (2) rewind and discard our divergent
// head entries if olog's head is older than ours; (3) extend our head
// with olog's newer entries, updating the missing set as we go and
// merging any of our entries that diverge from them; (4) merge the dup
// entry lists.  Dirty flags/bounds are set so the result is persisted.
void PGLog::merge_log(pg_info_t &oinfo, pg_log_t &olog, pg_shard_t fromosd,
                      pg_info_t &info, LogEntryHandler *rollbacker,
                      bool &dirty_info, bool &dirty_big_info)
{
  dout(10) << "merge_log " << olog << " from osd." << fromosd
           << " into " << log << dendl;

  // Check preconditions

  // If our log is empty, the incoming log needs to have not been trimmed.
  assert(!log.null() || olog.tail == eversion_t());
  // The logs must overlap.
  assert(log.head >= olog.tail && olog.head >= log.tail);

  for (map<hobject_t, pg_missing_item>::const_iterator i = missing.get_items().begin();
       i != missing.get_items().end();
       ++i) {
    dout(20) << "pg_missing_t sobject: " << i->first << dendl;
  }

  bool changed = false;

  // extend on tail?
  // this is just filling in history. it does not affect our
  // missing set, as that should already be consistent with our
  // current log.
  eversion_t orig_tail = log.tail;
  if (olog.tail < log.tail) {
    dout(10) << "merge_log extending tail to " << olog.tail << dendl;
    list<pg_log_entry_t>::iterator from = olog.log.begin();
    list<pg_log_entry_t>::iterator to;
    eversion_t last;
    for (to = from;
	 to != olog.log.end();
	 ++to) {
      if (to->version > log.tail)
	break;
      log.index(*to);
      dout(15) << *to << dendl;
      last = to->version;
    }
    mark_dirty_to(last);

    // splice into our log.
    log.log.splice(log.log.begin(),
		   olog.log, from, to);

    info.log_tail = log.tail = olog.tail;
    changed = true;
  }

  if (oinfo.stats.reported_seq < info.stats.reported_seq || // make sure reported always increases
      oinfo.stats.reported_epoch < info.stats.reported_epoch) {
    oinfo.stats.reported_seq = info.stats.reported_seq;
    oinfo.stats.reported_epoch = info.stats.reported_epoch;
  }
  if (info.last_backfill.is_max())
    info.stats = oinfo.stats;
  info.hit_set = oinfo.hit_set;

  // do we have divergent entries to throw out?
  if (olog.head < log.head) {
    rewind_divergent_log(olog.head, info, rollbacker, dirty_info, dirty_big_info);
    changed = true;
  }

  // extend on head?
  if (olog.head > log.head) {
    dout(10) << "merge_log extending head to " << olog.head << dendl;

    // find start point in olog: walk back to the newest entry we already
    // have (version <= log.head); everything after it is new to us
    list<pg_log_entry_t>::iterator to = olog.log.end();
    list<pg_log_entry_t>::iterator from = olog.log.end();
    eversion_t lower_bound = MAX(olog.tail, orig_tail);
    while (1) {
      if (from == olog.log.begin())
	break;
      --from;
      dout(20) << " ? " << *from << dendl;
      if (from->version <= log.head) {
	lower_bound = MAX(lower_bound, from->version);
	++from;
	break;
      }
    }
    dout(20) << "merge_log cut point (usually last shared) is "
	     << lower_bound << dendl;
    mark_dirty_from(lower_bound);

    // our entries above the cut point diverge from the authoritative log
    auto divergent = log.rewind_from_head(lower_bound);
    // move aside divergent items
    for (auto &&oe: divergent) {
      dout(10) << "merge_log divergent " << oe << dendl;
    }
    log.roll_forward_to(log.head, rollbacker);

    // splice the new (authoritative) entries onto our head, updating the
    // missing set for each one
    mempool::osd_pglog::list<pg_log_entry_t> new_entries;
    new_entries.splice(new_entries.end(), olog.log, from, to);
    append_log_entries_update_missing(
      info.last_backfill,
      info.last_backfill_bitwise,
      new_entries,
      false,
      &log,
      missing,
      rollbacker,
      this);

    // then reconcile our divergent entries against the merged log
    _merge_divergent_entries(
      log,
      divergent,
      info,
      log.get_can_rollback_to(),
      missing,
      rollbacker,
      this);

    info.last_update = log.head = olog.head;

    // We cannot rollback into the new log entries
    log.skip_can_rollback_to_to_head();

    info.last_user_version = oinfo.last_user_version;
    info.purged_snaps = oinfo.purged_snaps;

    changed = true;
  }

  // now handle dups
  if (merge_log_dups(olog)) {
    dirty_dups = true;
    changed = true;
  }

  dout(10) << "merge_log result " << log << " " << missing <<
    " changed=" << changed << dendl;

  if (changed) {
    dirty_info = true;
    dirty_big_info = true;
  }
}
460
461
// Merge olog's dup entries into log.dups.
//
// If our dup list is empty we copy olog's wholesale; otherwise we extend
// on the newer end and on the older end with whatever olog has beyond
// ours, keeping log.dups sorted by version and indexing each inserted
// copy.  Finally, dups whose versions overlap the live log (>= log.tail)
// are removed, since the log entries themselves cover them.
//
// returns true if any changes were made to log.dups
bool PGLog::merge_log_dups(const pg_log_t& olog) {
  bool changed = false;

  if (!olog.dups.empty()) {
    if (log.dups.empty()) {
      dout(10) << "merge_log copying olog dups to log " <<
	olog.dups.front().version << " to " <<
	olog.dups.back().version << dendl;
      changed = true;
      // since our log.dups is empty just copy them
      for (const auto& i : olog.dups) {
	log.dups.push_back(i);
	log.index(log.dups.back());
      }
    } else {
      // since our log.dups is not empty try to extend on each end

      if (olog.dups.back().version > log.dups.back().version) {
	// extend the dups's tail (i.e., newer dups)
	dout(10) << "merge_log extending dups tail to " <<
	  olog.dups.back().version << dendl;
	changed = true;

	auto log_tail_version = log.dups.back().version;

	// walk olog's dups newest-first, inserting each one just before
	// the previously-inserted one so the final order is ascending
	auto insert_cursor = log.dups.end();
	for (auto i = olog.dups.crbegin(); i != olog.dups.crend(); ++i) {
	  if (i->version <= log_tail_version) break;
	  log.dups.insert(insert_cursor, *i);

	  auto prev = insert_cursor;
	  --prev;
	  // be sure to pass reference of copy in log.dups
	  log.index(*prev);

	  --insert_cursor; // make sure we insert in reverse order
	}
      }

      if (olog.dups.front().version < log.dups.front().version) {
	// extend the dups's head (i.e., older dups)
	dout(10) << "merge_log extending dups head to " <<
	  olog.dups.front().version << dendl;
	changed = true;

	// insert_cursor stays at our original oldest dup; olog dups older
	// than it are inserted before it in ascending order
	auto insert_cursor = log.dups.begin();
	for (auto i = olog.dups.cbegin(); i != olog.dups.cend(); ++i) {
	  if (i->version >= insert_cursor->version) break;
	  log.dups.insert(insert_cursor, *i);
	  auto prev = insert_cursor;
	  --prev;
	  // be sure to pass address of copy in log.dups
	  log.index(*prev);
	}
      }
    }
  }

  // remove any dup entries that overlap with pglog
  if (!log.dups.empty() && log.dups.back().version >= log.tail) {
    dout(10) << "merge_log removed dups overlapping log entries [" <<
      log.tail << "," << log.dups.back().version << "]" << dendl;
    changed = true;

    while (!log.dups.empty() && log.dups.back().version >= log.tail) {
      log.unindex(log.dups.back());
      log.dups.pop_back();
    }
  }

  return changed;
}
535
// Debug-only invariant check: the in-memory log and the set of log entry
// keys we believe are persisted (log_keys_debug) must match one-to-one.
// No-op unless pg_log_debug is enabled.
void PGLog::check() {
  if (!pg_log_debug)
    return;
  if (log.log.size() != log_keys_debug.size()) {
    // dump both sides before the assert below fires
    derr << "log.log.size() != log_keys_debug.size()" << dendl;
    derr << "actual log:" << dendl;
    for (list<pg_log_entry_t>::iterator i = log.log.begin();
	 i != log.log.end();
	 ++i) {
      derr << " " << *i << dendl;
    }
    derr << "log_keys_debug:" << dendl;
    for (set<string>::const_iterator i = log_keys_debug.begin();
	 i != log_keys_debug.end();
	 ++i) {
      derr << " " << *i << dendl;
    }
  }
  assert(log.log.size() == log_keys_debug.size());
  for (list<pg_log_entry_t>::iterator i = log.log.begin();
       i != log.log.end();
       ++i) {
    assert(log_keys_debug.count(i->get_key_name()));
  }
}
561
// non-static
// Flush this PG's dirty log/missing state: encode the affected keys into
// *km and queue removals on t, targeting the pgmeta object log_oid in
// coll.  No-op when nothing is dirty; dirty flags are cleared once the
// keys have been queued.
void PGLog::write_log_and_missing(
  ObjectStore::Transaction& t,
  map<string,bufferlist> *km,
  const coll_t& coll,
  const ghobject_t &log_oid,
  bool require_rollback)
{
  if (is_dirty()) {
    dout(5) << "write_log_and_missing with: "
	    << "dirty_to: " << dirty_to
	    << ", dirty_from: " << dirty_from
	    << ", writeout_from: " << writeout_from
	    << ", trimmed: " << trimmed
	    << ", trimmed_dups: " << trimmed_dups
	    << ", clear_divergent_priors: " << clear_divergent_priors
	    << dendl;
    _write_log_and_missing(
      t, km, log, coll, log_oid,
      dirty_to,
      dirty_from,
      writeout_from,
      trimmed,
      trimmed_dups,
      missing,
      !touched_log,
      require_rollback,
      clear_divergent_priors,
      dirty_dups,
      &rebuilt_missing_with_deletes,
      (pg_log_debug ? &log_keys_debug : nullptr));
    undirty();
  } else {
    dout(10) << "log is not dirty" << dendl;
  }
}
598
// static
// Convenience wrapper: write out the entire log (no missing set, with
// explicit divergent_priors) using maximal dirty bounds, i.e. rewrite
// every key.
void PGLog::write_log_and_missing_wo_missing(
  ObjectStore::Transaction& t,
  map<string,bufferlist> *km,
  pg_log_t &log,
  const coll_t& coll, const ghobject_t &log_oid,
  map<eversion_t, hobject_t> &divergent_priors,
  bool require_rollback,
  bool dirty_dups)
{
  _write_log_and_missing_wo_missing(
    t, km, log, coll, log_oid,
    divergent_priors, eversion_t::max(), eversion_t(), eversion_t(),
    set<eversion_t>(),
    set<string>(),
    true, true, require_rollback, dirty_dups, nullptr);
}
616
// static
// Convenience wrapper: write out the entire log and missing set using
// maximal dirty bounds, i.e. rewrite every key.
void PGLog::write_log_and_missing(
  ObjectStore::Transaction& t,
  map<string,bufferlist> *km,
  pg_log_t &log,
  const coll_t& coll,
  const ghobject_t &log_oid,
  const pg_missing_tracker_t &missing,
  bool require_rollback,
  bool dirty_dups,
  bool *rebuilt_missing_with_deletes)
{
  _write_log_and_missing(
    t, km, log, coll, log_oid,
    eversion_t::max(),
    eversion_t(),
    eversion_t(),
    set<eversion_t>(),
    set<string>(),
    missing,
    true, require_rollback, false, dirty_dups, rebuilt_missing_with_deletes, nullptr);
}
639
640 // static
641 void PGLog::_write_log_and_missing_wo_missing(
642 ObjectStore::Transaction& t,
643 map<string,bufferlist> *km,
644 pg_log_t &log,
645 const coll_t& coll, const ghobject_t &log_oid,
646 map<eversion_t, hobject_t> &divergent_priors,
647 eversion_t dirty_to,
648 eversion_t dirty_from,
649 eversion_t writeout_from,
650 const set<eversion_t> &trimmed,
651 const set<string> &trimmed_dups,
652 bool dirty_divergent_priors,
653 bool touch_log,
654 bool require_rollback,
655 bool dirty_dups,
656 set<string> *log_keys_debug
657 )
658 {
659 set<string> to_remove(trimmed_dups);
660 for (set<eversion_t>::const_iterator i = trimmed.begin();
661 i != trimmed.end();
662 ++i) {
663 to_remove.insert(i->get_key_name());
664 if (log_keys_debug) {
665 assert(log_keys_debug->count(i->get_key_name()));
666 log_keys_debug->erase(i->get_key_name());
667 }
668 }
669
670 // dout(10) << "write_log_and_missing, clearing up to " << dirty_to << dendl;
671 if (touch_log)
672 t.touch(coll, log_oid);
673 if (dirty_to != eversion_t()) {
674 t.omap_rmkeyrange(
675 coll, log_oid,
676 eversion_t().get_key_name(), dirty_to.get_key_name());
677 clear_up_to(log_keys_debug, dirty_to.get_key_name());
678 }
679 if (dirty_to != eversion_t::max() && dirty_from != eversion_t::max()) {
680 // dout(10) << "write_log_and_missing, clearing from " << dirty_from << dendl;
681 t.omap_rmkeyrange(
682 coll, log_oid,
683 dirty_from.get_key_name(), eversion_t::max().get_key_name());
684 clear_after(log_keys_debug, dirty_from.get_key_name());
685 }
686
687 for (list<pg_log_entry_t>::iterator p = log.log.begin();
688 p != log.log.end() && p->version <= dirty_to;
689 ++p) {
690 bufferlist bl(sizeof(*p) * 2);
691 p->encode_with_checksum(bl);
692 (*km)[p->get_key_name()].claim(bl);
693 }
694
695 for (list<pg_log_entry_t>::reverse_iterator p = log.log.rbegin();
696 p != log.log.rend() &&
697 (p->version >= dirty_from || p->version >= writeout_from) &&
698 p->version >= dirty_to;
699 ++p) {
700 bufferlist bl(sizeof(*p) * 2);
701 p->encode_with_checksum(bl);
702 (*km)[p->get_key_name()].claim(bl);
703 }
704
705 if (log_keys_debug) {
706 for (map<string, bufferlist>::iterator i = (*km).begin();
707 i != (*km).end();
708 ++i) {
709 if (i->first[0] == '_')
710 continue;
711 assert(!log_keys_debug->count(i->first));
712 log_keys_debug->insert(i->first);
713 }
714 }
715
716 // process dirty_dups after log_keys_debug is filled, so dups do not
717 // end up in that set
718 if (dirty_dups) {
719 pg_log_dup_t min;
720 t.omap_rmkeyrange(
721 coll, log_oid,
722 min.get_key_name(), log.dups.begin()->get_key_name());
723 for (const auto& entry : log.dups) {
724 bufferlist bl;
725 ::encode(entry, bl);
726 (*km)[entry.get_key_name()].claim(bl);
727 }
728 }
729
730 if (dirty_divergent_priors) {
731 //dout(10) << "write_log_and_missing: writing divergent_priors" << dendl;
732 ::encode(divergent_priors, (*km)["divergent_priors"]);
733 }
734 if (require_rollback) {
735 ::encode(
736 log.get_can_rollback_to(),
737 (*km)["can_rollback_to"]);
738 ::encode(
739 log.get_rollback_info_trimmed_to(),
740 (*km)["rollback_info_trimmed_to"]);
741 }
742
743 if (!to_remove.empty())
744 t.omap_rmkeys(coll, log_oid, to_remove);
745 }
746
747 // static
748 void PGLog::_write_log_and_missing(
749 ObjectStore::Transaction& t,
750 map<string,bufferlist>* km,
751 pg_log_t &log,
752 const coll_t& coll, const ghobject_t &log_oid,
753 eversion_t dirty_to,
754 eversion_t dirty_from,
755 eversion_t writeout_from,
756 const set<eversion_t> &trimmed,
757 const set<string> &trimmed_dups,
758 const pg_missing_tracker_t &missing,
759 bool touch_log,
760 bool require_rollback,
761 bool clear_divergent_priors,
762 bool dirty_dups,
763 bool *rebuilt_missing_with_deletes, // in/out param
764 set<string> *log_keys_debug
765 ) {
766 set<string> to_remove(trimmed_dups);
767 for (set<eversion_t>::const_iterator i = trimmed.begin();
768 i != trimmed.end();
769 ++i) {
770 to_remove.insert(i->get_key_name());
771 if (log_keys_debug) {
772 assert(log_keys_debug->count(i->get_key_name()));
773 log_keys_debug->erase(i->get_key_name());
774 }
775 }
776
777 if (touch_log)
778 t.touch(coll, log_oid);
779 if (dirty_to != eversion_t()) {
780 t.omap_rmkeyrange(
781 coll, log_oid,
782 eversion_t().get_key_name(), dirty_to.get_key_name());
783 clear_up_to(log_keys_debug, dirty_to.get_key_name());
784 }
785 if (dirty_to != eversion_t::max() && dirty_from != eversion_t::max()) {
786 // dout(10) << "write_log_and_missing, clearing from " << dirty_from << dendl;
787 t.omap_rmkeyrange(
788 coll, log_oid,
789 dirty_from.get_key_name(), eversion_t::max().get_key_name());
790 clear_after(log_keys_debug, dirty_from.get_key_name());
791 }
792
793 for (list<pg_log_entry_t>::iterator p = log.log.begin();
794 p != log.log.end() && p->version <= dirty_to;
795 ++p) {
796 bufferlist bl(sizeof(*p) * 2);
797 p->encode_with_checksum(bl);
798 (*km)[p->get_key_name()].claim(bl);
799 }
800
801 for (list<pg_log_entry_t>::reverse_iterator p = log.log.rbegin();
802 p != log.log.rend() &&
803 (p->version >= dirty_from || p->version >= writeout_from) &&
804 p->version >= dirty_to;
805 ++p) {
806 bufferlist bl(sizeof(*p) * 2);
807 p->encode_with_checksum(bl);
808 (*km)[p->get_key_name()].claim(bl);
809 }
810
811 if (log_keys_debug) {
812 for (map<string, bufferlist>::iterator i = (*km).begin();
813 i != (*km).end();
814 ++i) {
815 if (i->first[0] == '_')
816 continue;
817 assert(!log_keys_debug->count(i->first));
818 log_keys_debug->insert(i->first);
819 }
820 }
821
822 // process dirty_dups after log_keys_debug is filled, so dups do not
823 // end up in that set
824 if (dirty_dups) {
825 pg_log_dup_t min;
826 t.omap_rmkeyrange(
827 coll, log_oid,
828 min.get_key_name(), log.dups.begin()->get_key_name());
829 for (const auto& entry : log.dups) {
830 bufferlist bl;
831 ::encode(entry, bl);
832 (*km)[entry.get_key_name()].claim(bl);
833 }
834 }
835
836 if (clear_divergent_priors) {
837 //dout(10) << "write_log_and_missing: writing divergent_priors" << dendl;
838 to_remove.insert("divergent_priors");
839 }
840 // since we encode individual missing items instead of a whole
841 // missing set, we need another key to store this bit of state
842 if (*rebuilt_missing_with_deletes) {
843 (*km)["may_include_deletes_in_missing"] = bufferlist();
844 *rebuilt_missing_with_deletes = false;
845 }
846 missing.get_changed(
847 [&](const hobject_t &obj) {
848 string key = string("missing/") + obj.to_str();
849 pg_missing_item item;
850 if (!missing.is_missing(obj, &item)) {
851 to_remove.insert(key);
852 } else {
853 uint64_t features = missing.may_include_deletes ? CEPH_FEATURE_OSD_RECOVERY_DELETES : 0;
854 ::encode(make_pair(obj, item), (*km)[key], features);
855 }
856 });
857 if (require_rollback) {
858 ::encode(
859 log.get_can_rollback_to(),
860 (*km)["can_rollback_to"]);
861 ::encode(
862 log.get_rollback_info_trimmed_to(),
863 (*km)["rollback_info_trimmed_to"]);
864 }
865
866 if (!to_remove.empty())
867 t.omap_rmkeys(coll, log_oid, to_remove);
868 }
869
// Rebuild the missing set from the log plus on-disk object_info versions,
// switching it to a representation that may include delete entries
// (sets missing.may_include_deletes).  Missing entries that did not come
// from the log (repair, EIO handling, divergent_priors) are preserved.
void PGLog::rebuild_missing_set_with_deletes(ObjectStore *store,
					     coll_t pg_coll,
					     const pg_info_t &info)
{
  // save entries not generated from the current log (e.g. added due
  // to repair, EIO handling, or divergent_priors).
  map<hobject_t, pg_missing_item> extra_missing;
  for (const auto& p : missing.get_items()) {
    if (!log.logged_object(p.first)) {
      dout(20) << __func__ << " extra missing entry: " << p.first
	       << " " << p.second << dendl;
      extra_missing[p.first] = p.second;
    }
  }
  missing.clear();
  missing.may_include_deletes = true;

  // go through the log and add items that are not present or older
  // versions on disk, just as if we were reading the log + metadata
  // off disk originally
  set<hobject_t> did;
  for (list<pg_log_entry_t>::reverse_iterator i = log.log.rbegin();
       i != log.log.rend();
       ++i) {
    // anything at or below last_complete is known to be on disk
    if (i->version <= info.last_complete)
      break;
    // only the newest entry per object matters; skip objects beyond the
    // backfill line and error entries
    if (i->soid > info.last_backfill ||
	i->is_error() ||
	did.find(i->soid) != did.end())
      continue;
    did.insert(i->soid);

    // compare the log's version against the on-disk object_info version
    bufferlist bv;
    int r = store->getattr(
      pg_coll,
      ghobject_t(i->soid, ghobject_t::NO_GEN, info.pgid.shard),
      OI_ATTR,
      bv);
    dout(20) << __func__ << " check for log entry: " << *i << " = " << r << dendl;

    if (r >= 0) {
      object_info_t oi(bv);
      dout(20) << __func__ << " store version = " << oi.version << dendl;
      if (oi.version < i->version) {
	missing.add(i->soid, i->version, oi.version, i->is_delete());
      }
    } else {
      // object not on disk at all
      missing.add(i->soid, i->version, eversion_t(), i->is_delete());
    }
  }

  // restore the entries that didn't come from the log
  for (const auto& p : extra_missing) {
    missing.add(p.first, p.second.need, p.second.have, p.second.is_delete());
  }
  rebuilt_missing_with_deletes = true;
}