// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#include "PGLog.h"
#include "include/unordered_map.h"
#include "common/ceph_context.h"

#define dout_context cct
#define dout_subsys ceph_subsys_osd
#undef dout_prefix
#define dout_prefix _prefix(_dout, this)

static ostream& _prefix(std::ostream *_dout, const PGLog *pglog)
{
  return *_dout << pglog->gen_prefix();
}

//////////////////// PGLog::IndexedLog ////////////////////

void PGLog::IndexedLog::split_out_child(
  pg_t child_pgid,
  unsigned split_bits,
  PGLog::IndexedLog *target)
{
  unindex();
  *target = pg_log_t::split_out_child(child_pgid, split_bits);
  index();
  target->index();
  reset_rollback_info_trimmed_to_riter();
}

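/*
 * Trim the in-memory log: drop every entry with version <= s, record the
 * removed versions in *trimmed (if non-null) so the caller can later drop
 * the matching on-disk keys, and raise the tail to s.  The caller must not
 * trim past can_rollback_to (asserted below).  A rough usage sketch, with
 * purely illustrative names and versions:
 *
 *   set<eversion_t> trimmed;
 *   ilog.trim(cct, eversion_t(4, 10), &trimmed);  // drop entries <= 4'10
 *   // ilog.tail is now at least 4'10; trimmed holds the removed versions
 */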
void PGLog::IndexedLog::trim(
  CephContext* cct,
  eversion_t s,
  set<eversion_t> *trimmed)
{
  if (complete_to != log.end() &&
      complete_to->version <= s) {
    generic_dout(0) << " bad trim to " << s << " when complete_to is "
                    << complete_to->version
                    << " on " << *this << dendl;
  }

  assert(s <= can_rollback_to);

  while (!log.empty()) {
    pg_log_entry_t &e = *log.begin();
    if (e.version > s)
      break;
    generic_dout(20) << "trim " << e << dendl;
    if (trimmed)
      trimmed->insert(e.version);

    unindex(e); // remove from index,

    if (rollback_info_trimmed_to_riter == log.rend() ||
        e.version == rollback_info_trimmed_to_riter->version) {
      log.pop_front();
      rollback_info_trimmed_to_riter = log.rend();
    } else {
      log.pop_front();
    }
  }

  // raise tail?
  if (tail < s)
    tail = s;
}

ostream& PGLog::IndexedLog::print(ostream& out) const
{
  out << *this << std::endl;
  for (list<pg_log_entry_t>::const_iterator p = log.begin();
       p != log.end();
       ++p) {
    out << *p << " " << (logged_object(p->soid) ? "indexed":"NOT INDEXED") << std::endl;
    assert(!p->reqid_is_indexed() || logged_req(p->reqid));
  }
  return out;
}

//////////////////// PGLog ////////////////////

void PGLog::reset_backfill()
{
  missing.clear();
}

void PGLog::clear() {
  missing.clear();
  log.clear();
  log_keys_debug.clear();
  undirty();
}

void PGLog::clear_info_log(
  spg_t pgid,
  ObjectStore::Transaction *t) {
  coll_t coll(pgid);
  t->remove(coll, pgid.make_pgmeta_oid());
}

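// Trim the in-memory log up to trim_to (never past info.last_complete).
// The removed versions accumulate in the 'trimmed' member so that a later
// write_log_and_missing() call can delete the matching omap keys on disk.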
void PGLog::trim(
  eversion_t trim_to,
  pg_info_t &info)
{
  // trim?
  if (trim_to > log.tail) {
    // We shouldn't be trimming the log past last_complete
    assert(trim_to <= info.last_complete);

    dout(10) << "trim " << log << " to " << trim_to << dendl;
    log.trim(cct, trim_to, &trimmed);
    info.log_tail = log.tail;
  }
}

void PGLog::proc_replica_log(
  pg_info_t &oinfo,
  const pg_log_t &olog,
  pg_missing_t& omissing,
  pg_shard_t from) const
{
  dout(10) << "proc_replica_log for osd." << from << ": "
           << oinfo << " " << olog << " " << omissing << dendl;

  if (olog.head < log.tail) {
    dout(10) << __func__ << ": osd." << from << " does not overlap, not looking "
             << "for divergent objects" << dendl;
    return;
  }
  if (olog.head == log.head) {
    dout(10) << __func__ << ": osd." << from << " same log head, not looking "
             << "for divergent objects" << dendl;
    return;
  }
  assert(olog.head >= log.tail);

  /*
    basically what we're doing here is rewinding the remote log,
    dropping divergent entries, until we find something that matches
    our master log. we then reset last_update to reflect the new
    point up to which missing is accurate.

    later, in activate(), missing will get wound forward again and
    we will send the peer enough log to arrive at the same state.
  */

  for (map<hobject_t, pg_missing_item>::const_iterator i = omissing.get_items().begin();
       i != omissing.get_items().end();
       ++i) {
    dout(20) << " before missing " << i->first << " need " << i->second.need
             << " have " << i->second.have << dendl;
  }

  list<pg_log_entry_t>::const_reverse_iterator first_non_divergent =
    log.log.rbegin();
  while (1) {
    if (first_non_divergent == log.log.rend())
      break;
    if (first_non_divergent->version <= olog.head) {
      dout(20) << "merge_log point (usually last shared) is "
               << *first_non_divergent << dendl;
      break;
    }
    ++first_non_divergent;
  }

  /* Because olog.head >= log.tail, we know that both pgs must at least have
   * the event represented by log.tail. Similarly, because log.head >= olog.tail,
   * we know that the event represented by olog.tail must be common to both logs.
   * Furthermore, the event represented by a log tail was necessarily trimmed,
   * thus neither olog.tail nor log.tail can be divergent. It's
   * possible that olog/log contain no actual events between olog.head and
   * MAX(log.tail, olog.tail), however, since they might have been split out.
   * Thus, if we cannot find an event e such that
   * log.tail <= e.version <= log.head, the last_update must actually be
   * MAX(log.tail, olog.tail).
   */
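  // Illustrative example (hypothetical versions): with log.tail = 4'10,
  // olog.tail = 4'12 and olog.head = 4'20, limit is 4'12.  If the newest
  // entry in our log with version <= olog.head is 4'15, lu becomes 4'15;
  // if there is no such entry, or it is older than the limit, lu falls
  // back to 4'12.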
  eversion_t limit = MAX(olog.tail, log.tail);
  eversion_t lu =
    (first_non_divergent == log.log.rend() ||
     first_non_divergent->version < limit) ?
    limit :
    first_non_divergent->version;

  IndexedLog folog(olog);
  auto divergent = folog.rewind_from_head(lu);
  _merge_divergent_entries(
    folog,
    divergent,
    oinfo,
    olog.get_can_rollback_to(),
    omissing,
    0,
    this);

  if (lu < oinfo.last_update) {
    dout(10) << " peer osd." << from << " last_update now " << lu << dendl;
    oinfo.last_update = lu;
  }

  if (omissing.have_missing()) {
    eversion_t first_missing =
      omissing.get_items().at(omissing.get_rmissing().begin()->second).need;
    oinfo.last_complete = eversion_t();
    list<pg_log_entry_t>::const_iterator i = olog.log.begin();
    for (;
         i != olog.log.end();
         ++i) {
      if (i->version < first_missing)
        oinfo.last_complete = i->version;
      else
        break;
    }
  } else {
    oinfo.last_complete = oinfo.last_update;
  }
}

/**
 * rewind divergent entries at the head of the log
 *
 * This rewinds entries off the head of our log that are divergent.
 * This is used by replicas during activation.
 *
 * @param newhead new head to rewind to
 */
void PGLog::rewind_divergent_log(eversion_t newhead,
                                 pg_info_t &info, LogEntryHandler *rollbacker,
                                 bool &dirty_info, bool &dirty_big_info)
{
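  // Illustrative example (hypothetical versions): if our head is 20'45 and
  // newhead is 20'40, the entries in (20'40, 20'45] are pulled off the head
  // as divergent, handed to _merge_divergent_entries() (which may use
  // rollbacker to undo or remove the affected objects), and last_update is
  // reset to 20'40.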
  dout(10) << "rewind_divergent_log truncate divergent future " << newhead << dendl;


  if (info.last_complete > newhead)
    info.last_complete = newhead;

  auto divergent = log.rewind_from_head(newhead);
  if (!divergent.empty()) {
    mark_dirty_from(divergent.front().version);
  }
  for (auto &&entry: divergent) {
    dout(10) << "rewind_divergent_log future divergent " << entry << dendl;
  }
  info.last_update = newhead;

  _merge_divergent_entries(
    log,
    divergent,
    info,
    log.get_can_rollback_to(),
    missing,
    rollbacker,
    this);

  dirty_info = true;
  dirty_big_info = true;
}

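/**
 * merge another osd's log into our own
 *
 * Roughly: extend our tail backwards with any older entries from olog,
 * rewind our head if olog's head is older (discarding divergent entries),
 * and extend our head forwards with olog's newer entries, merging any of
 * our own entries that turn out to be divergent and updating the missing
 * set as we go.  dirty_info/dirty_big_info are set if anything changed.
 */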
void PGLog::merge_log(pg_info_t &oinfo, pg_log_t &olog, pg_shard_t fromosd,
                      pg_info_t &info, LogEntryHandler *rollbacker,
                      bool &dirty_info, bool &dirty_big_info)
{
  dout(10) << "merge_log " << olog << " from osd." << fromosd
           << " into " << log << dendl;

  // Check preconditions

  // If our log is empty, the incoming log needs to have not been trimmed.
  assert(!log.null() || olog.tail == eversion_t());
  // The logs must overlap.
  assert(log.head >= olog.tail && olog.head >= log.tail);

  for (map<hobject_t, pg_missing_item>::const_iterator i = missing.get_items().begin();
       i != missing.get_items().end();
       ++i) {
    dout(20) << "pg_missing_t sobject: " << i->first << dendl;
  }

  bool changed = false;

  // extend on tail?
  // this is just filling in history. it does not affect our
  // missing set, as that should already be consistent with our
  // current log.
  eversion_t orig_tail = log.tail;
  if (olog.tail < log.tail) {
    dout(10) << "merge_log extending tail to " << olog.tail << dendl;
    list<pg_log_entry_t>::iterator from = olog.log.begin();
    list<pg_log_entry_t>::iterator to;
    eversion_t last;
    for (to = from;
         to != olog.log.end();
         ++to) {
      if (to->version > log.tail)
        break;
      log.index(*to);
      dout(15) << *to << dendl;
      last = to->version;
    }
    mark_dirty_to(last);

    // splice into our log.
    log.log.splice(log.log.begin(),
                   olog.log, from, to);

    info.log_tail = log.tail = olog.tail;
    changed = true;
  }

  if (oinfo.stats.reported_seq < info.stats.reported_seq || // make sure reported always increases
      oinfo.stats.reported_epoch < info.stats.reported_epoch) {
    oinfo.stats.reported_seq = info.stats.reported_seq;
    oinfo.stats.reported_epoch = info.stats.reported_epoch;
  }
  if (info.last_backfill.is_max())
    info.stats = oinfo.stats;
  info.hit_set = oinfo.hit_set;

  // do we have divergent entries to throw out?
  if (olog.head < log.head) {
    rewind_divergent_log(olog.head, info, rollbacker, dirty_info, dirty_big_info);
    changed = true;
  }

  // extend on head?
  if (olog.head > log.head) {
    dout(10) << "merge_log extending head to " << olog.head << dendl;

    // find start point in olog
    list<pg_log_entry_t>::iterator to = olog.log.end();
    list<pg_log_entry_t>::iterator from = olog.log.end();
    eversion_t lower_bound = MAX(olog.tail, orig_tail);
    while (1) {
      if (from == olog.log.begin())
        break;
      --from;
      dout(20) << " ? " << *from << dendl;
      if (from->version <= log.head) {
        lower_bound = MAX(lower_bound, from->version);
        ++from;
        break;
      }
    }
    dout(20) << "merge_log cut point (usually last shared) is "
             << lower_bound << dendl;
    mark_dirty_from(lower_bound);

    auto divergent = log.rewind_from_head(lower_bound);
    // move aside divergent items
    for (auto &&oe: divergent) {
      dout(10) << "merge_log divergent " << oe << dendl;
    }
    log.roll_forward_to(log.head, rollbacker);

    mempool::osd::list<pg_log_entry_t> new_entries;
    new_entries.splice(new_entries.end(), olog.log, from, to);
    append_log_entries_update_missing(
      info.last_backfill,
      info.last_backfill_bitwise,
      new_entries,
      false,
      &log,
      missing,
      rollbacker,
      this);

    _merge_divergent_entries(
      log,
      divergent,
      info,
      log.get_can_rollback_to(),
      missing,
      rollbacker,
      this);

    info.last_update = log.head = olog.head;

    // We cannot rollback into the new log entries
    log.skip_can_rollback_to_to_head();

    info.last_user_version = oinfo.last_user_version;
    info.purged_snaps = oinfo.purged_snaps;

    changed = true;
  }

  dout(10) << "merge_log result " << log << " " << missing << " changed=" << changed << dendl;

  if (changed) {
    dirty_info = true;
    dirty_big_info = true;
  }
}

void PGLog::check() {
  if (!pg_log_debug)
    return;
  if (log.log.size() != log_keys_debug.size()) {
    derr << "log.log.size() != log_keys_debug.size()" << dendl;
    derr << "actual log:" << dendl;
    for (list<pg_log_entry_t>::iterator i = log.log.begin();
         i != log.log.end();
         ++i) {
      derr << " " << *i << dendl;
    }
    derr << "log_keys_debug:" << dendl;
    for (set<string>::const_iterator i = log_keys_debug.begin();
         i != log_keys_debug.end();
         ++i) {
      derr << " " << *i << dendl;
    }
  }
  assert(log.log.size() == log_keys_debug.size());
  for (list<pg_log_entry_t>::iterator i = log.log.begin();
       i != log.log.end();
       ++i) {
    assert(log_keys_debug.count(i->get_key_name()));
  }
}

void PGLog::write_log_and_missing(
  ObjectStore::Transaction& t,
  map<string,bufferlist> *km,
  const coll_t& coll, const ghobject_t &log_oid,
  bool require_rollback)
{
  if (is_dirty()) {
    dout(5) << "write_log_and_missing with: "
            << "dirty_to: " << dirty_to
            << ", dirty_from: " << dirty_from
            << ", writeout_from: " << writeout_from
            << ", trimmed: " << trimmed
            << ", clear_divergent_priors: " << clear_divergent_priors
            << dendl;
    _write_log_and_missing(
      t, km, log, coll, log_oid,
      dirty_to,
      dirty_from,
      writeout_from,
      trimmed,
      missing,
      !touched_log,
      require_rollback,
      clear_divergent_priors,
      (pg_log_debug ? &log_keys_debug : 0));
    undirty();
  } else {
    dout(10) << "log is not dirty" << dendl;
  }
}

void PGLog::write_log_and_missing_wo_missing(
  ObjectStore::Transaction& t,
  map<string,bufferlist> *km,
  pg_log_t &log,
  const coll_t& coll, const ghobject_t &log_oid,
  map<eversion_t, hobject_t> &divergent_priors,
  bool require_rollback)
{
  _write_log_and_missing_wo_missing(
    t, km, log, coll, log_oid,
    divergent_priors, eversion_t::max(), eversion_t(), eversion_t(),
    set<eversion_t>(),
    true, true, require_rollback, 0);
}

void PGLog::write_log_and_missing(
  ObjectStore::Transaction& t,
  map<string,bufferlist> *km,
  pg_log_t &log,
  const coll_t& coll,
  const ghobject_t &log_oid,
  const pg_missing_tracker_t &missing,
  bool require_rollback)
{
  _write_log_and_missing(
    t, km, log, coll, log_oid,
    eversion_t::max(),
    eversion_t(),
    eversion_t(),
    set<eversion_t>(),
    missing,
    true, require_rollback, false, 0);
}

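/*
 * Build the omap updates that persist the log (without a missing set).
 * Roughly: keys for versions in 'trimmed' are queued for removal; if
 * dirty_to is set, keys up to dirty_to are cleared and those entries
 * re-encoded; entries from min(dirty_from, writeout_from) up to the head
 * are re-encoded as well; divergent_priors and the rollback bounds are
 * (re)written when requested.  New key/value pairs land in *km, while
 * removals go directly into the transaction t.
 */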
void PGLog::_write_log_and_missing_wo_missing(
  ObjectStore::Transaction& t,
  map<string,bufferlist> *km,
  pg_log_t &log,
  const coll_t& coll, const ghobject_t &log_oid,
  map<eversion_t, hobject_t> &divergent_priors,
  eversion_t dirty_to,
  eversion_t dirty_from,
  eversion_t writeout_from,
  const set<eversion_t> &trimmed,
  bool dirty_divergent_priors,
  bool touch_log,
  bool require_rollback,
  set<string> *log_keys_debug
  )
{
  set<string> to_remove;
  for (set<eversion_t>::const_iterator i = trimmed.begin();
       i != trimmed.end();
       ++i) {
    to_remove.insert(i->get_key_name());
    if (log_keys_debug) {
      assert(log_keys_debug->count(i->get_key_name()));
      log_keys_debug->erase(i->get_key_name());
    }
  }

  //dout(10) << "write_log_and_missing, clearing up to " << dirty_to << dendl;
  if (touch_log)
    t.touch(coll, log_oid);
  if (dirty_to != eversion_t()) {
    t.omap_rmkeyrange(
      coll, log_oid,
      eversion_t().get_key_name(), dirty_to.get_key_name());
    clear_up_to(log_keys_debug, dirty_to.get_key_name());
  }
  if (dirty_to != eversion_t::max() && dirty_from != eversion_t::max()) {
    // dout(10) << "write_log_and_missing, clearing from " << dirty_from << dendl;
    t.omap_rmkeyrange(
      coll, log_oid,
      dirty_from.get_key_name(), eversion_t::max().get_key_name());
    clear_after(log_keys_debug, dirty_from.get_key_name());
  }

  for (list<pg_log_entry_t>::iterator p = log.log.begin();
       p != log.log.end() && p->version <= dirty_to;
       ++p) {
    bufferlist bl(sizeof(*p) * 2);
    p->encode_with_checksum(bl);
    (*km)[p->get_key_name()].claim(bl);
  }

  for (list<pg_log_entry_t>::reverse_iterator p = log.log.rbegin();
       p != log.log.rend() &&
         (p->version >= dirty_from || p->version >= writeout_from) &&
         p->version >= dirty_to;
       ++p) {
    bufferlist bl(sizeof(*p) * 2);
    p->encode_with_checksum(bl);
    (*km)[p->get_key_name()].claim(bl);
  }

  if (log_keys_debug) {
    for (map<string, bufferlist>::iterator i = (*km).begin();
         i != (*km).end();
         ++i) {
      if (i->first[0] == '_')
        continue;
      assert(!log_keys_debug->count(i->first));
      log_keys_debug->insert(i->first);
    }
  }

  if (dirty_divergent_priors) {
    //dout(10) << "write_log_and_missing: writing divergent_priors" << dendl;
    ::encode(divergent_priors, (*km)["divergent_priors"]);
  }
  if (require_rollback) {
    ::encode(
      log.get_can_rollback_to(),
      (*km)["can_rollback_to"]);
    ::encode(
      log.get_rollback_info_trimmed_to(),
      (*km)["rollback_info_trimmed_to"]);
  }

  if (!to_remove.empty())
    t.omap_rmkeys(coll, log_oid, to_remove);
}

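/*
 * Same key-rewriting scheme as _write_log_and_missing_wo_missing() above,
 * but this variant also persists the changed entries of the missing set
 * under "missing/<oid>" keys (removing a key once its object is no longer
 * missing) and, when clear_divergent_priors is set, removes the
 * "divergent_priors" key instead of writing it.
 */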
void PGLog::_write_log_and_missing(
  ObjectStore::Transaction& t,
  map<string,bufferlist>* km,
  pg_log_t &log,
  const coll_t& coll, const ghobject_t &log_oid,
  eversion_t dirty_to,
  eversion_t dirty_from,
  eversion_t writeout_from,
  const set<eversion_t> &trimmed,
  const pg_missing_tracker_t &missing,
  bool touch_log,
  bool require_rollback,
  bool clear_divergent_priors,
  set<string> *log_keys_debug
  ) {
  set<string> to_remove;
  for (set<eversion_t>::const_iterator i = trimmed.begin();
       i != trimmed.end();
       ++i) {
    to_remove.insert(i->get_key_name());
    if (log_keys_debug) {
      assert(log_keys_debug->count(i->get_key_name()));
      log_keys_debug->erase(i->get_key_name());
    }
  }

  if (touch_log)
    t.touch(coll, log_oid);
  if (dirty_to != eversion_t()) {
    t.omap_rmkeyrange(
      coll, log_oid,
      eversion_t().get_key_name(), dirty_to.get_key_name());
    clear_up_to(log_keys_debug, dirty_to.get_key_name());
  }
  if (dirty_to != eversion_t::max() && dirty_from != eversion_t::max()) {
    // dout(10) << "write_log_and_missing, clearing from " << dirty_from << dendl;
    t.omap_rmkeyrange(
      coll, log_oid,
      dirty_from.get_key_name(), eversion_t::max().get_key_name());
    clear_after(log_keys_debug, dirty_from.get_key_name());
  }

  for (list<pg_log_entry_t>::iterator p = log.log.begin();
       p != log.log.end() && p->version <= dirty_to;
       ++p) {
    bufferlist bl(sizeof(*p) * 2);
    p->encode_with_checksum(bl);
    (*km)[p->get_key_name()].claim(bl);
  }

  for (list<pg_log_entry_t>::reverse_iterator p = log.log.rbegin();
       p != log.log.rend() &&
         (p->version >= dirty_from || p->version >= writeout_from) &&
         p->version >= dirty_to;
       ++p) {
    bufferlist bl(sizeof(*p) * 2);
    p->encode_with_checksum(bl);
    (*km)[p->get_key_name()].claim(bl);
  }

  if (log_keys_debug) {
    for (map<string, bufferlist>::iterator i = (*km).begin();
         i != (*km).end();
         ++i) {
      if (i->first[0] == '_')
        continue;
      assert(!log_keys_debug->count(i->first));
      log_keys_debug->insert(i->first);
    }
  }

  if (clear_divergent_priors) {
    //dout(10) << "write_log_and_missing: writing divergent_priors" << dendl;
    to_remove.insert("divergent_priors");
  }
  missing.get_changed(
    [&](const hobject_t &obj) {
      string key = string("missing/") + obj.to_str();
      pg_missing_item item;
      if (!missing.is_missing(obj, &item)) {
        to_remove.insert(key);
      } else {
        ::encode(make_pair(obj, item), (*km)[key]);
      }
    });
  if (require_rollback) {
    ::encode(
      log.get_can_rollback_to(),
      (*km)["can_rollback_to"]);
    ::encode(
      log.get_rollback_info_trimmed_to(),
      (*km)["rollback_info_trimmed_to"]);
  }

  if (!to_remove.empty())
    t.omap_rmkeys(coll, log_oid, to_remove);
}