#define CEPH_OSD_FEATURE_INCOMPAT_PGMETA CompatSet::Feature(13, "pg meta object")
#define CEPH_OSD_FEATURE_INCOMPAT_MISSING CompatSet::Feature(14, "explicit missing set")
#define CEPH_OSD_FEATURE_INCOMPAT_FASTINFO CompatSet::Feature(15, "fastinfo pg attr")
+#define CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES CompatSet::Feature(16, "deletes in missing set")
/// min recovery priority for MBackfillReserve
/// base backfill priority for MBackfillReserve (inactive PG)
#define OSD_BACKFILL_INACTIVE_PRIORITY_BASE 220
-/// max recovery priority for MBackfillReserve
-#define OSD_RECOVERY_PRIORITY_MAX 255
+/// max manually/automatically set recovery priority for MBackfillReserve
+#define OSD_RECOVERY_PRIORITY_MAX 254
+
+/// max recovery priority for MBackfillReserve, only when forced manually
+#define OSD_RECOVERY_PRIORITY_FORCED 255
typedef hobject_t collection_list_handle_t;
*/
struct osd_reqid_t {
entity_name_t name; // who
- ceph_tid_t tid;
+ ceph_tid_t tid;
int32_t inc; // incarnation
osd_reqid_t()
- : tid(0), inc(0) {}
+ : tid(0), inc(0)
+ {}
+ osd_reqid_t(const osd_reqid_t& other)
+ : name(other.name), tid(other.tid), inc(other.inc)
+ {}
osd_reqid_t(const entity_name_t& a, int i, ceph_tid_t t)
- : name(a), tid(t), inc(i) {}
+ : name(a), tid(t), inc(i)
+ {}
DENC(osd_reqid_t, v, p) {
DENC_START(2, 2, p);
struct pg_shard_t {
+ static const int32_t NO_OSD = 0x7fffffff;
int32_t osd;
shard_id_t shard;
pg_shard_t() : osd(-1), shard(shard_id_t::NO_SHARD) {}
bool is_undefined() const {
return osd == -1;
}
+ string get_osd() const { return (osd == NO_OSD ? "NONE" : to_string(osd)); }
void encode(bufferlist &bl) const;
void decode(bufferlist::iterator &bl);
void dump(Formatter *f) const {
eversion_t(epoch_t e, version_t v) : version(v), epoch(e), __pad(0) {}
// cppcheck-suppress noExplicitConstructor
- eversion_t(const ceph_eversion& ce) :
+ eversion_t(const ceph_eversion& ce) :
version(ce.version),
epoch(ce.epoch),
__pad(0) { }
objectstore_perf_stat_t os_perf_stat;
+ epoch_t up_from = 0;
+ uint64_t seq = 0;
+
+ uint32_t num_pgs = 0;
+
osd_stat_t() : kb(0), kb_used(0), kb_avail(0),
snap_trim_queue_len(0), num_snap_trimming(0) {}
num_snap_trimming += o.num_snap_trimming;
op_queue_age_hist.add(o.op_queue_age_hist);
os_perf_stat.add(o.os_perf_stat);
+ num_pgs += o.num_pgs;
}
void sub(const osd_stat_t& o) {
kb -= o.kb;
num_snap_trimming -= o.num_snap_trimming;
op_queue_age_hist.sub(o.op_queue_age_hist);
os_perf_stat.sub(o.os_perf_stat);
+ num_pgs -= o.num_pgs;
}
void dump(Formatter *f) const;
l.num_snap_trimming == r.num_snap_trimming &&
l.hb_peers == r.hb_peers &&
l.op_queue_age_hist == r.op_queue_age_hist &&
- l.os_perf_stat == r.os_perf_stat;
+ l.os_perf_stat == r.os_perf_stat &&
+ l.num_pgs == r.num_pgs;
}
inline bool operator!=(const osd_stat_t& l, const osd_stat_t& r) {
return !(l == r);
#define PG_STATE_ACTIVE (1<<1) // i am active. (primary: replicas too)
#define PG_STATE_CLEAN (1<<2) // peers are complete, clean of stray replicas.
#define PG_STATE_DOWN (1<<4) // a needed replica is down, PG offline
-//#define PG_STATE_REPLAY (1<<5) // crashed, waiting for replay
-//#define PG_STATE_STRAY (1<<6) // i must notify the primary i exist.
+#define PG_STATE_RECOVERY_UNFOUND (1<<5) // recovery stopped due to unfound
+#define PG_STATE_BACKFILL_UNFOUND (1<<6) // backfill stopped due to unfound
//#define PG_STATE_SPLITTING (1<<7) // i am splitting
#define PG_STATE_SCRUBBING (1<<8) // scrubbing
//#define PG_STATE_SCRUBQ (1<<9) // queued for scrub
#define PG_STATE_STALE (1<<17) // our state for this pg is stale, unknown.
#define PG_STATE_REMAPPED (1<<18) // pg is explicitly remapped to different OSDs than CRUSH
#define PG_STATE_DEEP_SCRUB (1<<19) // deep scrub: check CRC32 on files
-#define PG_STATE_BACKFILL (1<<20) // [active] backfilling pg content
+#define PG_STATE_BACKFILLING (1<<20) // [active] backfilling pg content
#define PG_STATE_BACKFILL_TOOFULL (1<<21) // backfill can't proceed: too full
#define PG_STATE_RECOVERY_WAIT (1<<22) // waiting for recovery reservations
#define PG_STATE_UNDERSIZED (1<<23) // pg acting < pool size
#define PG_STATE_SNAPTRIM (1<<26) // trimming snaps
#define PG_STATE_SNAPTRIM_WAIT (1<<27) // queued to trim snaps
#define PG_STATE_RECOVERY_TOOFULL (1<<28) // recovery can't proceed: too full
+#define PG_STATE_SNAPTRIM_ERROR (1<<29) // error stopped trimming snaps
+#define PG_STATE_FORCED_RECOVERY (1<<30) // force recovery of this pg before any other
+#define PG_STATE_FORCED_BACKFILL (1<<31) // force backfill of this pg before any other
std::string pg_state_string(int state);
std::string pg_vector_string(const vector<int32_t> &a);
-int pg_string_state(const std::string& state);
+boost::optional<uint64_t> pg_string_state(const std::string& state);
/*
* pg_pool
*/
struct pg_pool_t {
+ static const char *APPLICATION_NAME_CEPHFS;
+ static const char *APPLICATION_NAME_RBD;
+ static const char *APPLICATION_NAME_RGW;
+
enum {
TYPE_REPLICATED = 1, // replication
//TYPE_RAID4 = 2, // raid4 (never implemented)
const char *get_type_name() const {
return get_type_name(type);
}
- static const char* get_default_type() {
- return "replicated";
- }
enum {
FLAG_HASHPSPOOL = 1<<0, // hash pg seed and pool together (instead of adding)
FLAG_WRITE_FADVISE_DONTNEED = 1<<7, // write mode with LIBRADOS_OP_FLAG_FADVISE_DONTNEED
FLAG_NOSCRUB = 1<<8, // block periodic scrub
FLAG_NODEEP_SCRUB = 1<<9, // block periodic deep-scrub
+ FLAG_FULL_NO_QUOTA = 1<<10, // pool is currently running out of quota, will set FLAG_FULL too
+ FLAG_NEARFULL = 1<<11, // pool is nearfull
+ FLAG_BACKFILLFULL = 1<<12, // pool is backfillfull
};
static const char *get_flag_name(int f) {
case FLAG_WRITE_FADVISE_DONTNEED: return "write_fadvise_dontneed";
case FLAG_NOSCRUB: return "noscrub";
case FLAG_NODEEP_SCRUB: return "nodeep-scrub";
+ case FLAG_FULL_NO_QUOTA: return "full_no_quota";
+ case FLAG_NEARFULL: return "nearfull";
+ case FLAG_BACKFILLFULL: return "backfillfull";
default: return "???";
}
}
return FLAG_NOSCRUB;
if (name == "nodeep-scrub")
return FLAG_NODEEP_SCRUB;
+ if (name == "full_no_quota")
+ return FLAG_FULL_NO_QUOTA;
+ if (name == "nearfull")
+ return FLAG_NEARFULL;
+ if (name == "backfillfull")
+ return FLAG_BACKFILLFULL;
return 0;
}
uint64_t flags; ///< FLAG_*
__u8 type; ///< TYPE_*
__u8 size, min_size; ///< number of osds in each pg
- __u8 crush_ruleset; ///< crush placement rule set
+ __u8 crush_rule; ///< crush placement rule
__u8 object_hash; ///< hash mapping object name to ps
private:
__u32 pg_num, pgp_num; ///< number of pgs
pool_opts_t opts; ///< options
+ /// application -> key/value metadata
+ map<string, std::map<string, string>> application_metadata;
+
private:
vector<uint32_t> grade_table;
pg_pool_t()
: flags(0), type(0), size(0), min_size(0),
- crush_ruleset(0), object_hash(0),
+ crush_rule(0), object_hash(0),
pg_num(0), pgp_num(0),
last_change(0),
last_force_op_resend(0),
unsigned get_type() const { return type; }
unsigned get_size() const { return size; }
unsigned get_min_size() const { return min_size; }
- int get_crush_ruleset() const { return crush_ruleset; }
+ int get_crush_rule() const { return crush_rule; }
int get_object_hash() const { return object_hash; }
const char *get_object_hash_name() const {
return ceph_str_hash_name(get_object_hash());
FLOOR(num_rd_kb);
FLOOR(num_wr);
FLOOR(num_wr_kb);
- FLOOR(num_scrub_errors);
FLOOR(num_shallow_scrub_errors);
FLOOR(num_deep_scrub_errors);
+ num_scrub_errors = num_shallow_scrub_errors + num_deep_scrub_errors;
FLOOR(num_objects_recovered);
FLOOR(num_bytes_recovered);
FLOOR(num_keys_recovered);
SPLIT(num_rd_kb);
SPLIT(num_wr);
SPLIT(num_wr_kb);
- SPLIT(num_scrub_errors);
- SPLIT(num_shallow_scrub_errors);
- SPLIT(num_deep_scrub_errors);
+ SPLIT_PRESERVE_NONZERO(num_shallow_scrub_errors);
+ SPLIT_PRESERVE_NONZERO(num_deep_scrub_errors);
+ for (unsigned i = 0; i < out.size(); ++i) {
+ out[i].num_scrub_errors = out[i].num_shallow_scrub_errors +
+ out[i].num_deep_scrub_errors;
+ }
SPLIT(num_objects_recovered);
SPLIT(num_bytes_recovered);
SPLIT(num_keys_recovered);
int32_t up_primary;
int32_t acting_primary;
+ // snaptrimq.size() is 64bit, but let's be serious - anything over 50k is
+ // absurd already, so cap it to 2^32 and save 4 bytes at the same time
+ uint32_t snaptrimq_len;
+
bool stats_invalid:1;
/// true if num_objects_dirty is not accurate (because it was not
/// maintained starting from pool creation)
mapping_epoch(0),
up_primary(-1),
acting_primary(-1),
+ snaptrimq_len(0),
stats_invalid(false),
dirty_stats_invalid(false),
omap_stats_invalid(false),
log_size = f;
if (ondisk_log_size < f)
ondisk_log_size = f;
+ if (snaptrimq_len < f)
+ snaptrimq_len = f;
}
void add(const pg_stat_t& o) {
stats.add(o.stats);
log_size += o.log_size;
ondisk_log_size += o.ondisk_log_size;
+ // Saturating add: snaptrimq_len is deliberately only 32 bits wide, so
+ // clamp the aggregate at 2^31 instead of letting it wrap.
+ // Use unsigned/64-bit literals for the limit: (1 << 31) shifts into the
+ // sign bit of a 32-bit int, which is undefined behaviour.
+ if (((uint64_t)snaptrimq_len + (uint64_t)o.snaptrimq_len) > (1ull << 31)) {
+ snaptrimq_len = 1u << 31;
+ } else {
+ snaptrimq_len += o.snaptrimq_len;
+ }
}
void sub(const pg_stat_t& o) {
stats.sub(o.stats);
log_size -= o.log_size;
ondisk_log_size -= o.ondisk_log_size;
+ // Saturating subtract: never let the unsigned queue length wrap below
+ // zero (possible if the aggregate was previously clamped in add()).
+ if (o.snaptrimq_len < snaptrimq_len) {
+ snaptrimq_len -= o.snaptrimq_len;
+ } else {
+ snaptrimq_len = 0;
+ }
}
bool is_acting_osd(int32_t osd, bool primary) const;
* history they need to worry about.
*/
struct pg_history_t {
- epoch_t epoch_created; // epoch in which PG was created
+ epoch_t epoch_created; // epoch in which *pg* was created (pool or pg)
+ epoch_t epoch_pool_created; // epoch in which *pool* was created
+ // (note: may be pg creation epoch for
+ // pre-luminous clusters)
epoch_t last_epoch_started; // lower bound on last epoch started (anywhere, not necessarily locally)
epoch_t last_interval_started; // first epoch of last_epoch_started interval
epoch_t last_epoch_clean; // lower bound on last epoch the PG was completely clean.
epoch_t last_interval_clean; // first epoch of last_epoch_clean interval
- epoch_t last_epoch_split; // as parent
+ epoch_t last_epoch_split; // as parent or child
epoch_t last_epoch_marked_full; // pool or cluster
/**
friend bool operator==(const pg_history_t& l, const pg_history_t& r) {
return
l.epoch_created == r.epoch_created &&
+ l.epoch_pool_created == r.epoch_pool_created &&
l.last_epoch_started == r.last_epoch_started &&
l.last_interval_started == r.last_interval_started &&
l.last_epoch_clean == r.last_epoch_clean &&
pg_history_t()
: epoch_created(0),
+ epoch_pool_created(0),
last_epoch_started(0),
last_interval_started(0),
last_epoch_clean(0),
epoch_created = other.epoch_created;
modified = true;
}
+ if (epoch_pool_created < other.epoch_pool_created) {
+ // FIXME: for jewel compat only; this should either be 0 or always the
+ // same value across all pg instances.
+ epoch_pool_created = other.epoch_pool_created;
+ modified = true;
+ }
if (last_epoch_started < other.last_epoch_started) {
last_epoch_started = other.last_epoch_started;
modified = true;
WRITE_CLASS_ENCODER(pg_history_t)
inline ostream& operator<<(ostream& out, const pg_history_t& h) {
- return out << "ec=" << h.epoch_created
+ return out << "ec=" << h.epoch_created << "/" << h.epoch_pool_created
<< " lis/c " << h.last_interval_started
<< "/" << h.last_interval_clean
<< " les/c/f " << h.last_epoch_started << "/" << h.last_epoch_clean
unsigned new_pg_num,
bool old_sort_bitwise,
bool new_sort_bitwise,
+ bool old_recovery_deletes,
+ bool new_recovery_deletes,
pg_t pgid
);
PastIntervals *past_intervals, ///< [out] intervals
ostream *out = 0 ///< [out] debug ostream
);
+
friend ostream& operator<<(ostream& out, const PastIntervals &i);
template <typename F>
}
void swap(PastIntervals &other) {
- ::swap(other.past_intervals, past_intervals);
+ // Unqualified call with a using-declaration: lets ADL pick a
+ // type-specific swap for the interval representation, with std::swap
+ // as the fallback (the old ::swap bypassed ADL).
+ using std::swap;
+ swap(other.past_intervals, past_intervals);
}
/**
TRY_DELETE = 6,
ROLLBACK_EXTENTS = 7
};
- ObjectModDesc() : can_local_rollback(true), rollback_info_completed(false) {}
+ ObjectModDesc() : can_local_rollback(true), rollback_info_completed(false) {
+ bl.reassign_to_mempool(mempool::mempool_osd_pglog);
+ }
void claim(ObjectModDesc &other) {
bl.clear();
bl.claim(other.bl);
void swap(ObjectModDesc &other) {
bl.swap(other.bl);
- ::swap(other.can_local_rollback, can_local_rollback);
- ::swap(other.rollback_info_completed, rollback_info_completed);
- ::swap(other.max_required_version, max_required_version);
+ using std::swap;
+ swap(other.can_local_rollback, can_local_rollback);
+ swap(other.rollback_info_completed, rollback_info_completed);
+ swap(other.max_required_version, max_required_version);
}
void append_id(ModID id) {
uint8_t _id(id);
* in the case that bl contains ptrs which point into a much larger
* message buffer
*/
- void trim_bl() {
+ void trim_bl() const {
if (bl.length() > 0)
bl.rebuild();
}
bufferlist snaps; // only for clone entries
hobject_t soid;
osd_reqid_t reqid; // caller+tid to uniquely identify request
- vector<pair<osd_reqid_t, version_t> > extra_reqids;
+ mempool::osd_pglog::vector<pair<osd_reqid_t, version_t> > extra_reqids;
eversion_t version, prior_version, reverting_to;
version_t user_version; // the user version for this entry
utime_t mtime; // this is the _user_ mtime, mind you
pg_log_entry_t()
: user_version(0), return_code(0), op(0),
- invalid_hash(false), invalid_pool(false) {}
+ invalid_hash(false), invalid_pool(false) {
+ snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
+ }
pg_log_entry_t(int _op, const hobject_t& _soid,
const eversion_t& v, const eversion_t& pv,
version_t uv,
int return_code)
: soid(_soid), reqid(rid), version(v), prior_version(pv), user_version(uv),
mtime(mt), return_code(return_code), op(_op),
- invalid_hash(false), invalid_pool(false)
- {}
+ invalid_hash(false), invalid_pool(false) {
+ snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
+ }
bool is_clone() const { return op == CLONE; }
bool is_modify() const { return op == MODIFY; }
ostream& operator<<(ostream& out, const pg_log_entry_t& e);
+// Compact summary of a completed request, kept (in pg_log_t::dups) after
+// the corresponding full pg_log_entry_t has been trimmed from the log, so
+// resent/replayed client ops can still be recognized as duplicates via
+// reqid -> (version, user_version, return_code).
+struct pg_log_dup_t {
+ osd_reqid_t reqid; // caller+tid to uniquely identify request
+ eversion_t version;
+ version_t user_version; // the user version for this entry
+ int32_t return_code; // only stored for ERRORs for dup detection
+
+ pg_log_dup_t()
+ : user_version(0), return_code(0)
+ {}
+ // Demote a full log entry to its dup-detection summary.
+ explicit pg_log_dup_t(const pg_log_entry_t& entry)
+ : reqid(entry.reqid), version(entry.version),
+ user_version(entry.user_version), return_code(entry.return_code)
+ {}
+ pg_log_dup_t(const eversion_t& v, version_t uv,
+ const osd_reqid_t& rid, int return_code)
+ : reqid(rid), version(v), user_version(uv),
+ return_code(return_code)
+ {}
+
+ // key under which this entry is persisted (defined out of line)
+ string get_key_name() const;
+ void encode(bufferlist &bl) const;
+ void decode(bufferlist::iterator &bl);
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<pg_log_dup_t*>& o);
+ bool operator==(const pg_log_dup_t &rhs) const {
+ return reqid == rhs.reqid &&
+ version == rhs.version &&
+ user_version == rhs.user_version &&
+ return_code == rhs.return_code;
+ }
+ bool operator!=(const pg_log_dup_t &rhs) const {
+ return !(*this == rhs);
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e);
+};
+WRITE_CLASS_ENCODER(pg_log_dup_t)
+
+std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e);
/**
* pg_log_t - incremental log of recent pg changes.
eversion_t rollback_info_trimmed_to;
public:
- mempool::osd::list<pg_log_entry_t> log; // the actual log.
-
+ // the actual log
+ mempool::osd_pglog::list<pg_log_entry_t> log;
+
+ // entries just for dup op detection ordered oldest to newest
+ mempool::osd_pglog::list<pg_log_dup_t> dups;
+
pg_log_t() = default;
pg_log_t(const eversion_t &last_update,
const eversion_t &log_tail,
const eversion_t &can_rollback_to,
const eversion_t &rollback_info_trimmed_to,
- mempool::osd::list<pg_log_entry_t> &&entries)
+ mempool::osd_pglog::list<pg_log_entry_t> &&entries,
+ mempool::osd_pglog::list<pg_log_dup_t> &&dup_entries)
: head(last_update), tail(log_tail), can_rollback_to(can_rollback_to),
rollback_info_trimmed_to(rollback_info_trimmed_to),
- log(std::move(entries)) {}
+ log(std::move(entries)), dups(std::move(dup_entries)) {}
pg_log_t(const eversion_t &last_update,
const eversion_t &log_tail,
const eversion_t &can_rollback_to,
const eversion_t &rollback_info_trimmed_to,
- const std::list<pg_log_entry_t> &entries)
+ const std::list<pg_log_entry_t> &entries,
+ const std::list<pg_log_dup_t> &dup_entries)
: head(last_update), tail(log_tail), can_rollback_to(can_rollback_to),
rollback_info_trimmed_to(rollback_info_trimmed_to) {
for (auto &&entry: entries) {
log.push_back(entry);
}
+ for (auto &&entry: dup_entries) {
+ dups.push_back(entry);
+ }
}
void clear() {
eversion_t z;
rollback_info_trimmed_to = can_rollback_to = head = tail = z;
log.clear();
+ dups.clear();
}
eversion_t get_rollback_info_trimmed_to() const {
pg_log_t split_out_child(pg_t child_pgid, unsigned split_bits) {
- mempool::osd::list<pg_log_entry_t> oldlog, childlog;
+ mempool::osd_pglog::list<pg_log_entry_t> oldlog, childlog;
oldlog.swap(log);
eversion_t old_tail;
oldlog.erase(i++);
}
+ // osd_reqid is unique, so it doesn't matter if there are extra
+ // dup entries in each pg. To avoid storing oid with the dup
+ // entries, just copy the whole list.
+ auto childdups(dups);
+
return pg_log_t(
head,
tail,
can_rollback_to,
rollback_info_trimmed_to,
- std::move(childlog));
- }
+ std::move(childlog),
+ std::move(childdups));
+ }
- mempool::osd::list<pg_log_entry_t> rewind_from_head(eversion_t newhead) {
+ mempool::osd_pglog::list<pg_log_entry_t> rewind_from_head(eversion_t newhead) {
assert(newhead >= tail);
- mempool::osd::list<pg_log_entry_t>::iterator p = log.end();
- mempool::osd::list<pg_log_entry_t> divergent;
+ mempool::osd_pglog::list<pg_log_entry_t>::iterator p = log.end();
+ mempool::osd_pglog::list<pg_log_entry_t> divergent;
while (true) {
if (p == log.begin()) {
// yikes, the whole thing is divergent!
- ::swap(divergent, log);
+ using std::swap;
+ swap(divergent, log);
break;
}
--p;
};
WRITE_CLASS_ENCODER(pg_log_t)
-inline ostream& operator<<(ostream& out, const pg_log_t& log)
+inline ostream& operator<<(ostream& out, const pg_log_t& log)
{
out << "log((" << log.tail << "," << log.head << "], crt="
<< log.get_can_rollback_to() << ")";
*/
struct pg_missing_item {
eversion_t need, have;
- pg_missing_item() {}
- explicit pg_missing_item(eversion_t n) : need(n) {} // have no old version
- pg_missing_item(eversion_t n, eversion_t h) : need(n), have(h) {}
+ // FLAG_DELETE marks entries whose "recovery" is performed by deleting
+ // the object rather than copying data (OSD_RECOVERY_DELETES feature).
+ enum missing_flags_t {
+ FLAG_NONE = 0,
+ FLAG_DELETE = 1,
+ } flags;
+ pg_missing_item() : flags(FLAG_NONE) {}
+ explicit pg_missing_item(eversion_t n) : need(n), flags(FLAG_NONE) {} // have no old version
+ pg_missing_item(eversion_t n, eversion_t h, bool is_delete=false) : need(n), have(h) {
+ set_delete(is_delete);
+ }
- void encode(bufferlist& bl) const {
- ::encode(need, bl);
- ::encode(have, bl);
+ void encode(bufferlist& bl, uint64_t features) const {
+ if (HAVE_FEATURE(features, OSD_RECOVERY_DELETES)) {
+ // encoding a zeroed eversion_t to differentiate between this and
+ // legacy unversioned encoding - a need value of 0'0 is not
+ // possible. This can be replaced with the legacy encoding
+ // macros post-luminous.
+ eversion_t e;
+ ::encode(e, bl);
+ ::encode(need, bl);
+ ::encode(have, bl);
+ ::encode(static_cast<uint8_t>(flags), bl);
+ } else {
+ // legacy unversioned encoding
+ ::encode(need, bl);
+ ::encode(have, bl);
+ }
}
void decode(bufferlist::iterator& bl) {
- ::decode(need, bl);
- ::decode(have, bl);
+ // Peek at the first eversion_t: the new encoding starts with a zeroed
+ // sentinel, the legacy encoding starts directly with the nonzero need.
+ eversion_t e;
+ ::decode(e, bl);
+ if (e != eversion_t()) {
+ // legacy encoding, this is the need value
+ need = e;
+ ::decode(have, bl);
+ // NOTE(review): flags is left untouched on this path; relies on
+ // decoding into a freshly-constructed item (ctor sets FLAG_NONE).
+ } else {
+ ::decode(need, bl);
+ ::decode(have, bl);
+ uint8_t f;
+ ::decode(f, bl);
+ flags = static_cast<missing_flags_t>(f);
+ }
+ }
+
+ void set_delete(bool is_delete) {
+ flags = is_delete ? FLAG_DELETE : FLAG_NONE;
+ }
+
+ bool is_delete() const {
+ return (flags & FLAG_DELETE) == FLAG_DELETE;
+ }
+
+ // Human-readable flag name, used by dump().
+ string flag_str() const {
+ if (flags == FLAG_NONE) {
+ return "none";
+ } else {
+ return "delete";
+ }
}
+
void dump(Formatter *f) const {
f->dump_stream("need") << need;
f->dump_stream("have") << have;
+ f->dump_stream("flags") << flag_str();
}
static void generate_test_instances(list<pg_missing_item*>& o) {
o.push_back(new pg_missing_item);
o.push_back(new pg_missing_item);
o.back()->need = eversion_t(1, 2);
o.back()->have = eversion_t(1, 1);
+ o.push_back(new pg_missing_item);
+ o.back()->need = eversion_t(3, 5);
+ o.back()->have = eversion_t(3, 4);
+ o.back()->flags = FLAG_DELETE;
}
bool operator==(const pg_missing_item &rhs) const {
- return need == rhs.need && have == rhs.have;
+ return need == rhs.need && have == rhs.have && flags == rhs.flags;
}
bool operator!=(const pg_missing_item &rhs) const {
return !(*this == rhs);
}
};
-WRITE_CLASS_ENCODER(pg_missing_item)
+WRITE_CLASS_ENCODER_FEATURES(pg_missing_item)
ostream& operator<<(ostream& out, const pg_missing_item &item);
class pg_missing_const_i {
virtual const map<hobject_t, pg_missing_item> &
get_items() const = 0;
virtual const map<version_t, hobject_t> &get_rmissing() const = 0;
+ virtual bool get_may_include_deletes() const = 0;
virtual unsigned int num_missing() const = 0;
virtual bool have_missing() const = 0;
virtual bool is_missing(const hobject_t& oid, pg_missing_item *out = nullptr) const = 0;
template <typename missing_type>
pg_missing_set(const missing_type &m) {
- for (auto &&i: missing)
- tracker.changed(i.first);
missing = m.get_items();
rmissing = m.get_rmissing();
+ may_include_deletes = m.get_may_include_deletes();
for (auto &&i: missing)
tracker.changed(i.first);
}
+ bool may_include_deletes = false;
+
const map<hobject_t, item> &get_items() const override {
return missing;
}
const map<version_t, hobject_t> &get_rmissing() const override {
return rmissing;
}
+ bool get_may_include_deletes() const override {
+ return may_include_deletes;
+ }
unsigned int num_missing() const override {
return missing.size();
}
* assumes missing is accurate up through the previous log entry.
*/
void add_next_event(const pg_log_entry_t& e) {
- if (e.is_update()) {
- map<hobject_t, item>::iterator missing_it;
- missing_it = missing.find(e.soid);
- bool is_missing_divergent_item = missing_it != missing.end();
- if (e.prior_version == eversion_t() || e.is_clone()) {
- // new object.
- if (is_missing_divergent_item) { // use iterator
- rmissing.erase((missing_it->second).need.version);
- missing_it->second = item(e.version, eversion_t()); // .have = nil
- } else // create new element in missing map
- missing[e.soid] = item(e.version, eversion_t()); // .have = nil
- } else if (is_missing_divergent_item) {
- // already missing (prior).
+ map<hobject_t, item>::iterator missing_it;
+ missing_it = missing.find(e.soid);
+ bool is_missing_divergent_item = missing_it != missing.end();
+ if (e.prior_version == eversion_t() || e.is_clone()) {
+ // new object.
+ if (is_missing_divergent_item) { // use iterator
rmissing.erase((missing_it->second).need.version);
- (missing_it->second).need = e.version; // leave .have unchanged.
- } else if (e.is_backlog()) {
- // May not have prior version
- assert(0 == "these don't exist anymore");
- } else {
- // not missing, we must have prior_version (if any)
- assert(!is_missing_divergent_item);
- missing[e.soid] = item(e.version, e.prior_version);
- }
- rmissing[e.version.version] = e.soid;
- } else if (e.is_delete()) {
- rm(e.soid, e.version);
+ missing_it->second = item(e.version, eversion_t(), e.is_delete()); // .have = nil
+ } else // create new element in missing map
+ missing[e.soid] = item(e.version, eversion_t(), e.is_delete()); // .have = nil
+ } else if (is_missing_divergent_item) {
+ // already missing (prior).
+ rmissing.erase((missing_it->second).need.version);
+ (missing_it->second).need = e.version; // leave .have unchanged.
+ missing_it->second.set_delete(e.is_delete());
+ } else if (e.is_backlog()) {
+ // May not have prior version
+ assert(0 == "these don't exist anymore");
+ } else {
+ // not missing, we must have prior_version (if any)
+ assert(!is_missing_divergent_item);
+ missing[e.soid] = item(e.version, e.prior_version, e.is_delete());
}
-
+ rmissing[e.version.version] = e.soid;
tracker.changed(e.soid);
}
- void revise_need(hobject_t oid, eversion_t need) {
+ void revise_need(hobject_t oid, eversion_t need, bool is_delete) {
if (missing.count(oid)) {
rmissing.erase(missing[oid].need.version);
missing[oid].need = need; // no not adjust .have
+ missing[oid].set_delete(is_delete);
} else {
- missing[oid] = item(need, eversion_t());
+ missing[oid] = item(need, eversion_t(), is_delete);
}
rmissing[need.version] = oid;
}
}
- void add(const hobject_t& oid, eversion_t need, eversion_t have) {
- missing[oid] = item(need, have);
+ void add(const hobject_t& oid, eversion_t need, eversion_t have,
+ bool is_delete) {
+ missing[oid] = item(need, have, is_delete);
rmissing[need.version] = oid;
tracker.changed(oid);
}
void got(const hobject_t& oid, eversion_t v) {
std::map<hobject_t, item>::iterator p = missing.find(oid);
assert(p != missing.end());
- assert(p->second.need <= v);
+ assert(p->second.need <= v || p->second.is_delete());
got(p);
}
pg_t child_pgid,
unsigned split_bits,
pg_missing_set *omissing) {
+ omissing->may_include_deletes = may_include_deletes;
unsigned mask = ~((~0)<<split_bits);
for (map<hobject_t, item>::iterator i = missing.begin();
i != missing.end();
) {
if ((i->first.get_hash() & mask) == child_pgid.m_seed) {
- omissing->add(i->first, i->second.need, i->second.have);
+ omissing->add(i->first, i->second.need, i->second.have,
+ i->second.is_delete());
rm(i++);
} else {
++i;
}
void encode(bufferlist &bl) const {
- ENCODE_START(3, 2, bl);
- ::encode(missing, bl);
+ ENCODE_START(4, 2, bl);
+ ::encode(missing, bl, may_include_deletes ? CEPH_FEATURE_OSD_RECOVERY_DELETES : 0);
+ ::encode(may_include_deletes, bl);
ENCODE_FINISH(bl);
}
void decode(bufferlist::iterator &bl, int64_t pool = -1) {
for (auto const &i: missing)
tracker.changed(i.first);
- DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
+ DECODE_START_LEGACY_COMPAT_LEN(4, 2, 2, bl);
::decode(missing, bl);
+ if (struct_v >= 4) {
+ ::decode(may_include_deletes, bl);
+ }
DECODE_FINISH(bl);
if (struct_v < 3) {
f->close_section();
}
f->close_section();
+ f->dump_bool("may_include_deletes", may_include_deletes);
}
template <typename F>
void filter_objects(F &&f) {
o.push_back(new pg_missing_set);
o.back()->add(
hobject_t(object_t("foo"), "foo", 123, 456, 0, ""),
- eversion_t(5, 6), eversion_t(5, 1));
+ eversion_t(5, 6), eversion_t(5, 1), false);
+ o.push_back(new pg_missing_set);
+ o.back()->add(
+ hobject_t(object_t("foo"), "foo", 123, 456, 0, ""),
+ eversion_t(5, 6), eversion_t(5, 1), true);
+ o.back()->may_include_deletes = true;
}
template <typename F>
void get_changed(F &&f) const {
template <bool TrackChanges>
ostream& operator<<(ostream& out, const pg_missing_set<TrackChanges> &missing)
{
- out << "missing(" << missing.num_missing();
+ out << "missing(" << missing.num_missing()
+ << " may_include_deletes = " << missing.may_include_deletes;
//if (missing.num_lost()) out << ", " << missing.num_lost() << " lost";
out << ")";
return out;
snapid_t snap_seq;
///< recent reqids on this object
- vector<pair<osd_reqid_t, version_t> > reqids;
+ mempool::osd_pglog::vector<pair<osd_reqid_t, version_t> > reqids;
uint64_t truncate_seq;
uint64_t truncate_size;
}
-
-
-
-
// ---------------------------------------
class OSDSuperblock {
<< " " << n.timeout << "s)";
}
+struct object_info_t;
+// Describes where an object's data actually lives when it is not stored
+// directly: not managed at all (NONE), redirected wholesale to another
+// object (REDIRECT), or split into chunks (CHUNKED, not yet implemented).
+struct object_manifest_t {
+ enum {
+ TYPE_NONE = 0,
+ TYPE_REDIRECT = 1, // start with this
+ TYPE_CHUNKED = 2, // do this later
+ };
+ uint8_t type; // redirect, chunked, ...
+ hobject_t redirect_target; // valid only for TYPE_REDIRECT
+
+ object_manifest_t() : type(TYPE_NONE) { }
+ object_manifest_t(uint8_t type, const hobject_t& redirect_target)
+ : type(type), redirect_target(redirect_target) { }
+
+ bool is_empty() const {
+ return type == TYPE_NONE;
+ }
+ bool is_redirect() const {
+ return type == TYPE_REDIRECT;
+ }
+ bool is_chunked() const {
+ return type == TYPE_CHUNKED;
+ }
+ static const char *get_type_name(uint8_t m) {
+ switch (m) {
+ case TYPE_NONE: return "none";
+ case TYPE_REDIRECT: return "redirect";
+ case TYPE_CHUNKED: return "chunked";
+ default: return "unknown";
+ }
+ }
+ const char *get_type_name() const {
+ return get_type_name(type);
+ }
+ static void generate_test_instances(list<object_manifest_t*>& o);
+ void encode(bufferlist &bl) const;
+ void decode(bufferlist::iterator &bl);
+ void dump(Formatter *f) const;
+ // Befriend this type's own stream operator (declared below); the
+ // original declaration named object_info_t's operator by mistake.
+ friend ostream& operator<<(ostream& out, const object_manifest_t& obj);
+};
+WRITE_CLASS_ENCODER(object_manifest_t)
+ostream& operator<<(ostream& out, const object_manifest_t& oi);
struct object_info_t {
hobject_t soid;
FLAG_DATA_DIGEST = 1 << 4, // has data crc
FLAG_OMAP_DIGEST = 1 << 5, // has omap crc
FLAG_CACHE_PIN = 1 << 6, // pin the object in cache tier
+ FLAG_MANIFEST = 1 << 7, // has manifest
// ...
FLAG_USES_TMAP = 1<<8, // deprecated; no longer used.
} flag_t;
static string get_flag_string(flag_t flags) {
string s;
+ // Join the individual flag names with '|' (no leading separator).
+ // Iterate by const reference and append piecewise: the original copied
+ // each string and built a "|"+name temporary per iteration.
+ for (const auto& ss : get_flag_vector(flags)) {
+ s += '|';
+ s += ss;
+ }
+ if (s.length())
+ return s.substr(1);
+ return s;
+ }
+ // Decompose the flag bitmask into a list of the set flags' names, in a
+ // fixed order; shared by get_flag_string() and structured dump output.
+ static vector<string> get_flag_vector(flag_t flags) {
+ vector<string> sv;
if (flags & FLAG_LOST)
- s += "|lost";
+ sv.insert(sv.end(), "lost");
if (flags & FLAG_WHITEOUT)
- s += "|whiteout";
+ sv.insert(sv.end(), "whiteout");
if (flags & FLAG_DIRTY)
- s += "|dirty";
+ sv.insert(sv.end(), "dirty");
if (flags & FLAG_USES_TMAP)
- s += "|uses_tmap";
+ sv.insert(sv.end(), "uses_tmap");
if (flags & FLAG_OMAP)
- s += "|omap";
+ sv.insert(sv.end(), "omap");
if (flags & FLAG_DATA_DIGEST)
- s += "|data_digest";
+ sv.insert(sv.end(), "data_digest");
if (flags & FLAG_OMAP_DIGEST)
- s += "|omap_digest";
+ sv.insert(sv.end(), "omap_digest");
if (flags & FLAG_CACHE_PIN)
- s += "|cache_pin";
- if (s.length())
- return s.substr(1);
- return s;
+ sv.insert(sv.end(), "cache_pin");
+ if (flags & FLAG_MANIFEST)
+ sv.insert(sv.end(), "manifest");
+ return sv;
}
string get_flag_string() const {
return get_flag_string(flags);
uint64_t expected_object_size, expected_write_size;
uint32_t alloc_hint_flags;
+ struct object_manifest_t manifest;
+
void copy_user_bits(const object_info_t& other);
static ps_t legacy_object_locator_to_ps(const object_t &oid,
bool is_cache_pinned() const {
return test_flag(FLAG_CACHE_PIN);
}
+ bool has_manifest() const {
+ return test_flag(FLAG_MANIFEST);
+ }
void set_data_digest(__u32 d) {
set_flag(FLAG_DATA_DIGEST);
bool first;
bool data_complete;
bool omap_complete;
+ bool error = false;
ObjectRecoveryProgress()
: data_recovered_to(0),
objects.insert(r.objects.begin(), r.objects.end());
}
void swap(ScrubMap &r) {
- ::swap(objects, r.objects);
- ::swap(valid_through, r.valid_through);
- ::swap(incr_since, r.incr_since);
+ using std::swap;
+ swap(objects, r.objects);
+ swap(valid_through, r.valid_through);
+ swap(incr_since, r.incr_since);
}
void encode(bufferlist& bl) const;
WRITE_CLASS_ENCODER(ScrubMap::object)
WRITE_CLASS_ENCODER(ScrubMap)
-
struct OSDOp {
ceph_osd_op op;
sobject_t soid;
bufferlist indata, outdata;
- int32_t rval;
+ errorcode32_t rval;
OSDOp() : rval(0) {
memset(&op, 0, sizeof(ceph_osd_op));
* @param out [out] combined data buffer
*/
static void merge_osd_op_vector_out_data(vector<OSDOp>& ops, bufferlist& out);
+
+ /**
+ * Clear data as much as possible, leave minimal data for historical op dump
+ *
+ * @param ops [in] vector of OSDOps
+ */
+ static void clear_data(vector<OSDOp>& ops);
};
ostream& operator<<(ostream& out, const OSDOp& op);