X-Git-Url: https://git.proxmox.com/?a=blobdiff_plain;f=ceph%2Fsrc%2Fosd%2FPGLog.cc;h=7b086eb30a08999f2aae2ed47e0058c414abdafe;hb=c07f9fc5a4f48397831383549fb0482b93480643;hp=e85c6cc3333a327ecf9b86ea8b7c71bb84c99bea;hpb=9439ae556f035e65c9c107ae13ddd09457dbbecd;p=ceph.git diff --git a/ceph/src/osd/PGLog.cc b/ceph/src/osd/PGLog.cc index e85c6cc33..7b086eb30 100644 --- a/ceph/src/osd/PGLog.cc +++ b/ceph/src/osd/PGLog.cc @@ -1,4 +1,4 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab /* * Ceph - scalable distributed file system @@ -10,9 +10,9 @@ * * This is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software + * License version 2.1, as published by the Free Software * Foundation. See file COPYING. - * + * */ #include "PGLog.h" @@ -37,7 +37,7 @@ void PGLog::IndexedLog::split_out_child( PGLog::IndexedLog *target) { unindex(); - *target = pg_log_t::split_out_child(child_pgid, split_bits); + *target = IndexedLog(pg_log_t::split_out_child(child_pgid, split_bits)); index(); target->index(); reset_rollback_info_trimmed_to_riter(); @@ -46,7 +46,9 @@ void PGLog::IndexedLog::split_out_child( void PGLog::IndexedLog::trim( CephContext* cct, eversion_t s, - set *trimmed) + set *trimmed, + set* trimmed_dups, + bool* dirty_dups) { if (complete_to != log.end() && complete_to->version <= s) { @@ -57,8 +59,13 @@ void PGLog::IndexedLog::trim( assert(s <= can_rollback_to); + auto earliest_dup_version = + log.rbegin()->version.version < cct->_conf->osd_pg_log_dups_tracked + ? 0u + : log.rbegin()->version.version - cct->_conf->osd_pg_log_dups_tracked; + while (!log.empty()) { - pg_log_entry_t &e = *log.begin(); + const pg_log_entry_t &e = *log.begin(); if (e.version > s) break; generic_dout(20) << "trim " << e << dendl; @@ -67,6 +74,19 @@ void PGLog::IndexedLog::trim( unindex(e); // remove from index, + // add to dup list + if (e.version.version >= earliest_dup_version) { + if (dirty_dups) *dirty_dups = true; + dups.push_back(pg_log_dup_t(e)); + index(dups.back()); + for (const auto& extra : e.extra_reqids) { + // note: extras have the same version as outer op + dups.push_back(pg_log_dup_t(e.version, extra.second, + extra.first, e.return_code)); + index(dups.back()); + } + } + if (rollback_info_trimmed_to_riter == log.rend() || e.version == rollback_info_trimmed_to_riter->version) { log.pop_front(); @@ -76,6 +96,19 @@ void PGLog::IndexedLog::trim( } } + while (!dups.empty()) { + const auto& e = *dups.begin(); + if (e.version.version >= earliest_dup_version) + break; + generic_dout(20) << "trim dup " << e << dendl; + if (trimmed_dups) + trimmed_dups->insert(e.get_key_name()); + if (indexed_data & PGLOG_INDEXED_DUPS) { + dup_index.erase(e.reqid); + } + dups.pop_front(); + } + // raise tail? if (tail < s) tail = s; @@ -87,9 +120,18 @@ ostream& PGLog::IndexedLog::print(ostream& out) const for (list::const_iterator p = log.begin(); p != log.end(); ++p) { - out << *p << " " << (logged_object(p->soid) ? "indexed":"NOT INDEXED") << std::endl; + out << *p << " " << + (logged_object(p->soid) ? "indexed" : "NOT INDEXED") << + std::endl; assert(!p->reqid_is_indexed() || logged_req(p->reqid)); } + + for (list::const_iterator p = dups.begin(); + p != dups.end(); + ++p) { + out << *p << std::endl; + } + return out; } @@ -124,7 +166,7 @@ void PGLog::trim( assert(trim_to <= info.last_complete); dout(10) << "trim " << log << " to " << trim_to << dendl; - log.trim(cct, trim_to, &trimmed); + log.trim(cct, trim_to, &trimmed, &trimmed_dups, &dirty_dups); info.log_tail = log.tail; } } @@ -230,7 +272,7 @@ void PGLog::proc_replica_log( } else { oinfo.last_complete = oinfo.last_update; } -} +} // proc_replica_log /** * rewind divergent entries at the head of the log @@ -244,7 +286,8 @@ void PGLog::rewind_divergent_log(eversion_t newhead, pg_info_t &info, LogEntryHandler *rollbacker, bool &dirty_info, bool &dirty_big_info) { - dout(10) << "rewind_divergent_log truncate divergent future " << newhead << dendl; + dout(10) << "rewind_divergent_log truncate divergent future " << + newhead << dendl; if (info.last_complete > newhead) @@ -318,7 +361,7 @@ void PGLog::merge_log(pg_info_t &oinfo, pg_log_t &olog, pg_shard_t fromosd, // splice into our log. log.log.splice(log.log.begin(), olog.log, from, to); - + info.log_tail = log.tail = olog.tail; changed = true; } @@ -341,7 +384,7 @@ void PGLog::merge_log(pg_info_t &oinfo, pg_log_t &olog, pg_shard_t fromosd, // extend on head? if (olog.head > log.head) { dout(10) << "merge_log extending head to " << olog.head << dendl; - + // find start point in olog list::iterator to = olog.log.end(); list::iterator from = olog.log.end(); @@ -399,8 +442,15 @@ void PGLog::merge_log(pg_info_t &oinfo, pg_log_t &olog, pg_shard_t fromosd, changed = true; } - - dout(10) << "merge_log result " << log << " " << missing << " changed=" << changed << dendl; + + // now handle dups + if (merge_log_dups(olog)) { + dirty_dups = true; + changed = true; + } + + dout(10) << "merge_log result " << log << " " << missing << + " changed=" << changed << dendl; if (changed) { dirty_info = true; @@ -408,6 +458,81 @@ void PGLog::merge_log(pg_info_t &oinfo, pg_log_t &olog, pg_shard_t fromosd, } } + +// returns true if any changes were made to log.dups +bool PGLog::merge_log_dups(const pg_log_t& olog) { + bool changed = false; + + if (!olog.dups.empty()) { + if (log.dups.empty()) { + dout(10) << "merge_log copying olog dups to log " << + olog.dups.front().version << " to " << + olog.dups.back().version << dendl; + changed = true; + // since our log.dups is empty just copy them + for (const auto& i : olog.dups) { + log.dups.push_back(i); + log.index(log.dups.back()); + } + } else { + // since our log.dups is not empty try to extend on each end + + if (olog.dups.back().version > log.dups.back().version) { + // extend the dups's tail (i.e., newer dups) + dout(10) << "merge_log extending dups tail to " << + olog.dups.back().version << dendl; + changed = true; + + auto log_tail_version = log.dups.back().version; + + auto insert_cursor = log.dups.end(); + for (auto i = olog.dups.crbegin(); i != olog.dups.crend(); ++i) { + if (i->version <= log_tail_version) break; + log.dups.insert(insert_cursor, *i); + + auto prev = insert_cursor; + --prev; + // be sure to pass reference of copy in log.dups + log.index(*prev); + + --insert_cursor; // make sure we insert in reverse order + } + } + + if (olog.dups.front().version < log.dups.front().version) { + // extend the dups's head (i.e., older dups) + dout(10) << "merge_log extending dups head to " << + olog.dups.front().version << dendl; + changed = true; + + auto insert_cursor = log.dups.begin(); + for (auto i = olog.dups.cbegin(); i != olog.dups.cend(); ++i) { + if (i->version >= insert_cursor->version) break; + log.dups.insert(insert_cursor, *i); + auto prev = insert_cursor; + --prev; + // be sure to pass address of copy in log.dups + log.index(*prev); + } + } + } + } + + // remove any dup entries that overlap with pglog + if (!log.dups.empty() && log.dups.back().version >= log.tail) { + dout(10) << "merge_log removed dups overlapping log entries [" << + log.tail << "," << log.dups.back().version << "]" << dendl; + changed = true; + + while (!log.dups.empty() && log.dups.back().version >= log.tail) { + log.unindex(log.dups.back()); + log.dups.pop_back(); + } + } + + return changed; +} + void PGLog::check() { if (!pg_log_debug) return; @@ -434,10 +559,12 @@ void PGLog::check() { } } +// non-static void PGLog::write_log_and_missing( ObjectStore::Transaction& t, map *km, - const coll_t& coll, const ghobject_t &log_oid, + const coll_t& coll, + const ghobject_t &log_oid, bool require_rollback) { if (is_dirty()) { @@ -446,6 +573,7 @@ void PGLog::write_log_and_missing( << ", dirty_from: " << dirty_from << ", writeout_from: " << writeout_from << ", trimmed: " << trimmed + << ", trimmed_dups: " << trimmed_dups << ", clear_divergent_priors: " << clear_divergent_priors << dendl; _write_log_and_missing( @@ -454,32 +582,39 @@ void PGLog::write_log_and_missing( dirty_from, writeout_from, trimmed, + trimmed_dups, missing, !touched_log, require_rollback, clear_divergent_priors, - (pg_log_debug ? &log_keys_debug : 0)); + dirty_dups, + &rebuilt_missing_with_deletes, + (pg_log_debug ? &log_keys_debug : nullptr)); undirty(); } else { dout(10) << "log is not dirty" << dendl; } } +// static void PGLog::write_log_and_missing_wo_missing( ObjectStore::Transaction& t, map *km, pg_log_t &log, const coll_t& coll, const ghobject_t &log_oid, map &divergent_priors, - bool require_rollback) + bool require_rollback, + bool dirty_dups) { _write_log_and_missing_wo_missing( t, km, log, coll, log_oid, divergent_priors, eversion_t::max(), eversion_t(), eversion_t(), set(), - true, true, require_rollback, 0); + set(), + true, true, require_rollback, dirty_dups, nullptr); } +// static void PGLog::write_log_and_missing( ObjectStore::Transaction& t, map *km, @@ -487,7 +622,9 @@ void PGLog::write_log_and_missing( const coll_t& coll, const ghobject_t &log_oid, const pg_missing_tracker_t &missing, - bool require_rollback) + bool require_rollback, + bool dirty_dups, + bool *rebuilt_missing_with_deletes) { _write_log_and_missing( t, km, log, coll, log_oid, @@ -495,10 +632,12 @@ void PGLog::write_log_and_missing( eversion_t(), eversion_t(), set(), + set(), missing, - true, require_rollback, false, 0); + true, require_rollback, false, dirty_dups, rebuilt_missing_with_deletes, nullptr); } +// static void PGLog::_write_log_and_missing_wo_missing( ObjectStore::Transaction& t, map *km, @@ -509,13 +648,15 @@ void PGLog::_write_log_and_missing_wo_missing( eversion_t dirty_from, eversion_t writeout_from, const set &trimmed, + const set &trimmed_dups, bool dirty_divergent_priors, bool touch_log, bool require_rollback, + bool dirty_dups, set *log_keys_debug ) { - set to_remove; + set to_remove(trimmed_dups); for (set::const_iterator i = trimmed.begin(); i != trimmed.end(); ++i) { @@ -526,7 +667,7 @@ void PGLog::_write_log_and_missing_wo_missing( } } -//dout(10) << "write_log_and_missing, clearing up to " << dirty_to << dendl; + // dout(10) << "write_log_and_missing, clearing up to " << dirty_to << dendl; if (touch_log) t.touch(coll, log_oid); if (dirty_to != eversion_t()) { @@ -536,7 +677,7 @@ void PGLog::_write_log_and_missing_wo_missing( clear_up_to(log_keys_debug, dirty_to.get_key_name()); } if (dirty_to != eversion_t::max() && dirty_from != eversion_t::max()) { - // dout(10) << "write_log_and_missing, clearing from " << dirty_from << dendl; + // dout(10) << "write_log_and_missing, clearing from " << dirty_from << dendl; t.omap_rmkeyrange( coll, log_oid, dirty_from.get_key_name(), eversion_t::max().get_key_name()); @@ -572,6 +713,20 @@ void PGLog::_write_log_and_missing_wo_missing( } } + // process dirty_dups after log_keys_debug is filled, so dups do not + // end up in that set + if (dirty_dups) { + pg_log_dup_t min; + t.omap_rmkeyrange( + coll, log_oid, + min.get_key_name(), log.dups.begin()->get_key_name()); + for (const auto& entry : log.dups) { + bufferlist bl; + ::encode(entry, bl); + (*km)[entry.get_key_name()].claim(bl); + } + } + if (dirty_divergent_priors) { //dout(10) << "write_log_and_missing: writing divergent_priors" << dendl; ::encode(divergent_priors, (*km)["divergent_priors"]); @@ -589,6 +744,7 @@ void PGLog::_write_log_and_missing_wo_missing( t.omap_rmkeys(coll, log_oid, to_remove); } +// static void PGLog::_write_log_and_missing( ObjectStore::Transaction& t, map* km, @@ -598,13 +754,16 @@ void PGLog::_write_log_and_missing( eversion_t dirty_from, eversion_t writeout_from, const set &trimmed, + const set &trimmed_dups, const pg_missing_tracker_t &missing, bool touch_log, bool require_rollback, bool clear_divergent_priors, + bool dirty_dups, + bool *rebuilt_missing_with_deletes, // in/out param set *log_keys_debug ) { - set to_remove; + set to_remove(trimmed_dups); for (set::const_iterator i = trimmed.begin(); i != trimmed.end(); ++i) { @@ -660,10 +819,30 @@ void PGLog::_write_log_and_missing( } } + // process dirty_dups after log_keys_debug is filled, so dups do not + // end up in that set + if (dirty_dups) { + pg_log_dup_t min; + t.omap_rmkeyrange( + coll, log_oid, + min.get_key_name(), log.dups.begin()->get_key_name()); + for (const auto& entry : log.dups) { + bufferlist bl; + ::encode(entry, bl); + (*km)[entry.get_key_name()].claim(bl); + } + } + if (clear_divergent_priors) { //dout(10) << "write_log_and_missing: writing divergent_priors" << dendl; to_remove.insert("divergent_priors"); } + // since we encode individual missing items instead of a whole + // missing set, we need another key to store this bit of state + if (*rebuilt_missing_with_deletes) { + (*km)["may_include_deletes_in_missing"] = bufferlist(); + *rebuilt_missing_with_deletes = false; + } missing.get_changed( [&](const hobject_t &obj) { string key = string("missing/") + obj.to_str(); @@ -671,7 +850,8 @@ void PGLog::_write_log_and_missing( if (!missing.is_missing(obj, &item)) { to_remove.insert(key); } else { - ::encode(make_pair(obj, item), (*km)[key]); + uint64_t features = missing.may_include_deletes ? CEPH_FEATURE_OSD_RECOVERY_DELETES : 0; + ::encode(make_pair(obj, item), (*km)[key], features); } }); if (require_rollback) { @@ -686,3 +866,60 @@ void PGLog::_write_log_and_missing( if (!to_remove.empty()) t.omap_rmkeys(coll, log_oid, to_remove); } + +void PGLog::rebuild_missing_set_with_deletes(ObjectStore *store, + coll_t pg_coll, + const pg_info_t &info) +{ + // save entries not generated from the current log (e.g. added due + // to repair, EIO handling, or divergent_priors). + map extra_missing; + for (const auto& p : missing.get_items()) { + if (!log.logged_object(p.first)) { + dout(20) << __func__ << " extra missing entry: " << p.first + << " " << p.second << dendl; + extra_missing[p.first] = p.second; + } + } + missing.clear(); + missing.may_include_deletes = true; + + // go through the log and add items that are not present or older + // versions on disk, just as if we were reading the log + metadata + // off disk originally + set did; + for (list::reverse_iterator i = log.log.rbegin(); + i != log.log.rend(); + ++i) { + if (i->version <= info.last_complete) + break; + if (i->soid > info.last_backfill || + i->is_error() || + did.find(i->soid) != did.end()) + continue; + did.insert(i->soid); + + bufferlist bv; + int r = store->getattr( + pg_coll, + ghobject_t(i->soid, ghobject_t::NO_GEN, info.pgid.shard), + OI_ATTR, + bv); + dout(20) << __func__ << " check for log entry: " << *i << " = " << r << dendl; + + if (r >= 0) { + object_info_t oi(bv); + dout(20) << __func__ << " store version = " << oi.version << dendl; + if (oi.version < i->version) { + missing.add(i->soid, i->version, oi.version, i->is_delete()); + } + } else { + missing.add(i->soid, i->version, eversion_t(), i->is_delete()); + } + } + + for (const auto& p : extra_missing) { + missing.add(p.first, p.second.need, p.second.have, p.second.is_delete()); + } + rebuilt_missing_with_deletes = true; +}