-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
* Ceph - scalable distributed file system
*
* This is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
+ * License version 2.1, as published by the Free Software
* Foundation. See file COPYING.
- *
+ *
*/
#include "PGLog.h"
PGLog::IndexedLog *target)
{
unindex();
- *target = pg_log_t::split_out_child(child_pgid, split_bits);
+ *target = IndexedLog(pg_log_t::split_out_child(child_pgid, split_bits));
index();
target->index();
reset_rollback_info_trimmed_to_riter();
void PGLog::IndexedLog::trim(
CephContext* cct,
eversion_t s,
- set<eversion_t> *trimmed)
+ set<eversion_t> *trimmed,
+ set<string>* trimmed_dups,
+ bool* dirty_dups)
{
if (complete_to != log.end() &&
complete_to->version <= s) {
assert(s <= can_rollback_to);
+ auto earliest_dup_version =
+ log.rbegin()->version.version < cct->_conf->osd_pg_log_dups_tracked
+ ? 0u
+ : log.rbegin()->version.version - cct->_conf->osd_pg_log_dups_tracked;
+
while (!log.empty()) {
- pg_log_entry_t &e = *log.begin();
+ const pg_log_entry_t &e = *log.begin();
if (e.version > s)
break;
generic_dout(20) << "trim " << e << dendl;
unindex(e); // remove from index,
+ // add to dup list
+ if (e.version.version >= earliest_dup_version) {
+ if (dirty_dups) *dirty_dups = true;
+ dups.push_back(pg_log_dup_t(e));
+ index(dups.back());
+ for (const auto& extra : e.extra_reqids) {
+ // note: extras have the same version as outer op
+ dups.push_back(pg_log_dup_t(e.version, extra.second,
+ extra.first, e.return_code));
+ index(dups.back());
+ }
+ }
+
if (rollback_info_trimmed_to_riter == log.rend() ||
e.version == rollback_info_trimmed_to_riter->version) {
log.pop_front();
}
}
+ while (!dups.empty()) {
+ const auto& e = *dups.begin();
+ if (e.version.version >= earliest_dup_version)
+ break;
+ generic_dout(20) << "trim dup " << e << dendl;
+ if (trimmed_dups)
+ trimmed_dups->insert(e.get_key_name());
+ if (indexed_data & PGLOG_INDEXED_DUPS) {
+ dup_index.erase(e.reqid);
+ }
+ dups.pop_front();
+ }
+
// raise tail?
if (tail < s)
tail = s;
for (list<pg_log_entry_t>::const_iterator p = log.begin();
p != log.end();
++p) {
- out << *p << " " << (logged_object(p->soid) ? "indexed":"NOT INDEXED") << std::endl;
+ out << *p << " " <<
+ (logged_object(p->soid) ? "indexed" : "NOT INDEXED") <<
+ std::endl;
assert(!p->reqid_is_indexed() || logged_req(p->reqid));
}
+
+ for (list<pg_log_dup_t>::const_iterator p = dups.begin();
+ p != dups.end();
+ ++p) {
+ out << *p << std::endl;
+ }
+
return out;
}
assert(trim_to <= info.last_complete);
dout(10) << "trim " << log << " to " << trim_to << dendl;
- log.trim(cct, trim_to, &trimmed);
+ log.trim(cct, trim_to, &trimmed, &trimmed_dups, &dirty_dups);
info.log_tail = log.tail;
}
}
} else {
oinfo.last_complete = oinfo.last_update;
}
-}
+} // proc_replica_log
/**
* rewind divergent entries at the head of the log
pg_info_t &info, LogEntryHandler *rollbacker,
bool &dirty_info, bool &dirty_big_info)
{
- dout(10) << "rewind_divergent_log truncate divergent future " << newhead << dendl;
+ dout(10) << "rewind_divergent_log truncate divergent future " <<
+ newhead << dendl;
if (info.last_complete > newhead)
// splice into our log.
log.log.splice(log.log.begin(),
olog.log, from, to);
-
+
info.log_tail = log.tail = olog.tail;
changed = true;
}
// extend on head?
if (olog.head > log.head) {
dout(10) << "merge_log extending head to " << olog.head << dendl;
-
+
// find start point in olog
list<pg_log_entry_t>::iterator to = olog.log.end();
list<pg_log_entry_t>::iterator from = olog.log.end();
changed = true;
}
-
- dout(10) << "merge_log result " << log << " " << missing << " changed=" << changed << dendl;
+
+ // now handle dups
+ if (merge_log_dups(olog)) {
+ dirty_dups = true;
+ changed = true;
+ }
+
+ dout(10) << "merge_log result " << log << " " << missing <<
+ " changed=" << changed << dendl;
if (changed) {
dirty_info = true;
}
}
+
+// Merge duplicate-op records ("dups") from an authoritative peer's log
+// (olog) into our own log.dups, keeping the list version-ordered and
+// the reqid index in sync via log.index()/log.unindex().  Three phases:
+//   1. if our dup list is empty, copy the peer's wholesale;
+//   2. otherwise splice the peer's strictly-newer dups onto our tail
+//      and its strictly-older dups onto our head;
+//   3. drop any dups whose version overlaps the live log
+//      (version >= log.tail), since full log entries still cover them.
+// returns true if any changes were made to log.dups
+bool PGLog::merge_log_dups(const pg_log_t& olog) {
+  bool changed = false;
+
+  if (!olog.dups.empty()) {
+    if (log.dups.empty()) {
+      dout(10) << "merge_log copying olog dups to log " <<
+        olog.dups.front().version << " to " <<
+        olog.dups.back().version << dendl;
+      changed = true;
+      // since our log.dups is empty just copy them
+      for (const auto& i : olog.dups) {
+        log.dups.push_back(i);
+        log.index(log.dups.back());
+      }
+    } else {
+      // since our log.dups is not empty try to extend on each end
+
+      if (olog.dups.back().version > log.dups.back().version) {
+        // extend the dups's tail (i.e., newer dups)
+        dout(10) << "merge_log extending dups tail to " <<
+          olog.dups.back().version << dendl;
+        changed = true;
+
+        auto log_tail_version = log.dups.back().version;
+
+        // walk the peer's dups newest-first; each list::insert before the
+        // cursor, followed by --insert_cursor, keeps the appended run in
+        // ascending version order
+        auto insert_cursor = log.dups.end();
+        for (auto i = olog.dups.crbegin(); i != olog.dups.crend(); ++i) {
+          if (i->version <= log_tail_version) break;
+          log.dups.insert(insert_cursor, *i);
+
+          auto prev = insert_cursor;
+          --prev;
+          // be sure to pass reference of copy in log.dups
+          log.index(*prev);
+
+          --insert_cursor; // make sure we insert in reverse order
+        }
+      }
+
+      if (olog.dups.front().version < log.dups.front().version) {
+        // extend the dups's head (i.e., older dups)
+        dout(10) << "merge_log extending dups head to " <<
+          olog.dups.front().version << dendl;
+        changed = true;
+
+        // insert_cursor stays on our original front element; peer dups
+        // strictly older than it are inserted before it, in order
+        auto insert_cursor = log.dups.begin();
+        for (auto i = olog.dups.cbegin(); i != olog.dups.cend(); ++i) {
+          if (i->version >= insert_cursor->version) break;
+          log.dups.insert(insert_cursor, *i);
+          auto prev = insert_cursor;
+          --prev;
+          // be sure to pass address of copy in log.dups
+          log.index(*prev);
+        }
+      }
+    }
+  }
+
+  // remove any dup entries that overlap with pglog
+  if (!log.dups.empty() && log.dups.back().version >= log.tail) {
+    dout(10) << "merge_log removed dups overlapping log entries [" <<
+      log.tail << "," << log.dups.back().version << "]" << dendl;
+    changed = true;
+
+    while (!log.dups.empty() && log.dups.back().version >= log.tail) {
+      log.unindex(log.dups.back());
+      log.dups.pop_back();
+    }
+  }
+
+  return changed;
+}
+
void PGLog::check() {
if (!pg_log_debug)
return;
}
}
+// non-static
void PGLog::write_log_and_missing(
ObjectStore::Transaction& t,
map<string,bufferlist> *km,
- const coll_t& coll, const ghobject_t &log_oid,
+ const coll_t& coll,
+ const ghobject_t &log_oid,
bool require_rollback)
{
if (is_dirty()) {
<< ", dirty_from: " << dirty_from
<< ", writeout_from: " << writeout_from
<< ", trimmed: " << trimmed
+ << ", trimmed_dups: " << trimmed_dups
<< ", clear_divergent_priors: " << clear_divergent_priors
<< dendl;
_write_log_and_missing(
dirty_from,
writeout_from,
trimmed,
+ trimmed_dups,
missing,
!touched_log,
require_rollback,
clear_divergent_priors,
- (pg_log_debug ? &log_keys_debug : 0));
+ dirty_dups,
+ &rebuilt_missing_with_deletes,
+ (pg_log_debug ? &log_keys_debug : nullptr));
undirty();
} else {
dout(10) << "log is not dirty" << dendl;
}
}
+// static
+// Stateless variant used when divergent_priors (rather than a missing
+// set) are persisted.  Forwards to _write_log_and_missing_wo_missing
+// with eversion_t::max() and empty trimmed/trimmed_dups sets, which
+// appears to mean "rewrite the whole log, nothing to delete" — confirm
+// against that helper's full parameter list (partly outside this hunk).
void PGLog::write_log_and_missing_wo_missing(
  ObjectStore::Transaction& t,
  map<string,bufferlist> *km,
  pg_log_t &log,
  const coll_t& coll, const ghobject_t &log_oid,
  map<eversion_t, hobject_t> &divergent_priors,
-  bool require_rollback)
+  bool require_rollback,
+  bool dirty_dups)  // when true, dup entries are (re)written as well
{
  _write_log_and_missing_wo_missing(
    t, km, log, coll, log_oid,
    divergent_priors, eversion_t::max(), eversion_t(), eversion_t(),
    set<eversion_t>(),
-    true, true, require_rollback, 0);
+    set<string>(),  // no trimmed dup keys to remove on this path
+    true, true, require_rollback, dirty_dups, nullptr);
}
+// static
+// Stateless variant taking an externally-maintained missing tracker.
+// NOTE(review): the body passes `log`, but no pg_log_t parameter is
+// visible in this hunk — a `pg_log_t &log,` line appears to be elided
+// diff context; verify against the full file before relying on the
+// signature shown here.
void PGLog::write_log_and_missing(
  ObjectStore::Transaction& t,
  map<string,bufferlist> *km,
  const coll_t& coll,
  const ghobject_t &log_oid,
  const pg_missing_tracker_t &missing,
-  bool require_rollback)
+  bool require_rollback,
+  bool dirty_dups,
+  bool *rebuilt_missing_with_deletes)  // in/out: cleared once persisted
{
  _write_log_and_missing(
    t, km, log, coll, log_oid,
    eversion_t(),
    eversion_t(),
    set<eversion_t>(),
+    set<string>(),  // no trimmed dup keys to remove on this path
    missing,
-    true, require_rollback, false, 0);
+    true, require_rollback, false, dirty_dups, rebuilt_missing_with_deletes, nullptr);
}
+// static
void PGLog::_write_log_and_missing_wo_missing(
ObjectStore::Transaction& t,
map<string,bufferlist> *km,
eversion_t dirty_from,
eversion_t writeout_from,
const set<eversion_t> &trimmed,
+ const set<string> &trimmed_dups,
bool dirty_divergent_priors,
bool touch_log,
bool require_rollback,
+ bool dirty_dups,
set<string> *log_keys_debug
)
{
- set<string> to_remove;
+ set<string> to_remove(trimmed_dups);
for (set<eversion_t>::const_iterator i = trimmed.begin();
i != trimmed.end();
++i) {
}
}
-//dout(10) << "write_log_and_missing, clearing up to " << dirty_to << dendl;
+ // dout(10) << "write_log_and_missing, clearing up to " << dirty_to << dendl;
if (touch_log)
t.touch(coll, log_oid);
if (dirty_to != eversion_t()) {
clear_up_to(log_keys_debug, dirty_to.get_key_name());
}
if (dirty_to != eversion_t::max() && dirty_from != eversion_t::max()) {
- // dout(10) << "write_log_and_missing, clearing from " << dirty_from << dendl;
+ // dout(10) << "write_log_and_missing, clearing from " << dirty_from << dendl;
t.omap_rmkeyrange(
coll, log_oid,
dirty_from.get_key_name(), eversion_t::max().get_key_name());
}
}
+ // process dirty_dups after log_keys_debug is filled, so dups do not
+ // end up in that set
+ if (dirty_dups) {
+ pg_log_dup_t min;
+ t.omap_rmkeyrange(
+ coll, log_oid,
+ min.get_key_name(), log.dups.begin()->get_key_name());
+ for (const auto& entry : log.dups) {
+ bufferlist bl;
+ ::encode(entry, bl);
+ (*km)[entry.get_key_name()].claim(bl);
+ }
+ }
+
if (dirty_divergent_priors) {
//dout(10) << "write_log_and_missing: writing divergent_priors" << dendl;
::encode(divergent_priors, (*km)["divergent_priors"]);
t.omap_rmkeys(coll, log_oid, to_remove);
}
+// static
void PGLog::_write_log_and_missing(
ObjectStore::Transaction& t,
map<string,bufferlist>* km,
eversion_t dirty_from,
eversion_t writeout_from,
const set<eversion_t> &trimmed,
+ const set<string> &trimmed_dups,
const pg_missing_tracker_t &missing,
bool touch_log,
bool require_rollback,
bool clear_divergent_priors,
+ bool dirty_dups,
+ bool *rebuilt_missing_with_deletes, // in/out param
set<string> *log_keys_debug
) {
- set<string> to_remove;
+ set<string> to_remove(trimmed_dups);
for (set<eversion_t>::const_iterator i = trimmed.begin();
i != trimmed.end();
++i) {
}
}
+ // process dirty_dups after log_keys_debug is filled, so dups do not
+ // end up in that set
+ if (dirty_dups) {
+ pg_log_dup_t min;
+ t.omap_rmkeyrange(
+ coll, log_oid,
+ min.get_key_name(), log.dups.begin()->get_key_name());
+ for (const auto& entry : log.dups) {
+ bufferlist bl;
+ ::encode(entry, bl);
+ (*km)[entry.get_key_name()].claim(bl);
+ }
+ }
+
if (clear_divergent_priors) {
//dout(10) << "write_log_and_missing: writing divergent_priors" << dendl;
to_remove.insert("divergent_priors");
}
+ // since we encode individual missing items instead of a whole
+ // missing set, we need another key to store this bit of state
+ if (*rebuilt_missing_with_deletes) {
+ (*km)["may_include_deletes_in_missing"] = bufferlist();
+ *rebuilt_missing_with_deletes = false;
+ }
missing.get_changed(
[&](const hobject_t &obj) {
string key = string("missing/") + obj.to_str();
if (!missing.is_missing(obj, &item)) {
to_remove.insert(key);
} else {
- ::encode(make_pair(obj, item), (*km)[key]);
+ uint64_t features = missing.may_include_deletes ? CEPH_FEATURE_OSD_RECOVERY_DELETES : 0;
+ ::encode(make_pair(obj, item), (*km)[key], features);
}
});
if (require_rollback) {
if (!to_remove.empty())
t.omap_rmkeys(coll, log_oid, to_remove);
}
+
+// Rebuild the missing set into the newer representation that can record
+// delete operations (missing.may_include_deletes = true).  Entries not
+// derivable from the current log are preserved, then the log is
+// replayed newest-first against on-disk object_info versions — the same
+// procedure used when the log + metadata are loaded from disk.  Sets
+// rebuilt_missing_with_deletes so the "may_include_deletes_in_missing"
+// state key is persisted on the next log write.
+void PGLog::rebuild_missing_set_with_deletes(ObjectStore *store,
+					     coll_t pg_coll,
+					     const pg_info_t &info)
+{
+  // save entries not generated from the current log (e.g. added due
+  // to repair, EIO handling, or divergent_priors).
+  map<hobject_t, pg_missing_item> extra_missing;
+  for (const auto& p : missing.get_items()) {
+    if (!log.logged_object(p.first)) {
+      dout(20) << __func__ << " extra missing entry: " << p.first
+	       << " " << p.second << dendl;
+      extra_missing[p.first] = p.second;
+    }
+  }
+  missing.clear();
+  missing.may_include_deletes = true;
+
+  // go through the log and add items that are not present or older
+  // versions on disk, just as if we were reading the log + metadata
+  // off disk originally
+  set<hobject_t> did;	// only the newest log entry per object is considered
+  for (list<pg_log_entry_t>::reverse_iterator i = log.log.rbegin();
+       i != log.log.rend();
+       ++i) {
+    if (i->version <= info.last_complete)
+      break;
+    if (i->soid > info.last_backfill ||
+	i->is_error() ||
+	did.find(i->soid) != did.end())
+      continue;
+    did.insert(i->soid);
+
+    bufferlist bv;
+    int r = store->getattr(
+      pg_coll,
+      ghobject_t(i->soid, ghobject_t::NO_GEN, info.pgid.shard),
+      OI_ATTR,
+      bv);
+    dout(20) << __func__ << " check for log entry: " << *i << " = " << r << dendl;
+
+    if (r >= 0) {
+      // object exists on disk; it is missing only if its on-disk
+      // version is older than the log entry's
+      object_info_t oi(bv);
+      dout(20) << __func__ << " store version = " << oi.version << dendl;
+      if (oi.version < i->version) {
+	missing.add(i->soid, i->version, oi.version, i->is_delete());
+      }
+    } else {
+      // no object (or getattr failed): have-version is unknown/none
+      missing.add(i->soid, i->version, eversion_t(), i->is_delete());
+    }
+  }
+
+  for (const auto& p : extra_missing) {
+    missing.add(p.first, p.second.need, p.second.have, p.second.is_delete());
+  }
+  rebuilt_missing_with_deletes = true;
+}