]> git.proxmox.com Git - ceph.git/blame - ceph/src/osd/osd_types.h
import 15.2.0 Octopus source
[ceph.git] / ceph / src / osd / osd_types.h
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
18#ifndef CEPH_OSD_TYPES_H
19#define CEPH_OSD_TYPES_H
20
9f95a23c 21#include <atomic>
7c673cae 22#include <sstream>
9f95a23c 23#include <cstdio>
7c673cae 24#include <memory>
11fdf7f2 25#include <string_view>
9f95a23c 26
7c673cae
FG
27#include <boost/scoped_ptr.hpp>
28#include <boost/optional/optional_io.hpp>
29#include <boost/variant.hpp>
9f95a23c 30#include <boost/smart_ptr/local_shared_ptr.hpp>
7c673cae
FG
31
32#include "include/rados/rados_types.hpp"
33#include "include/mempool.h"
34
35#include "msg/msg_types.h"
36#include "include/types.h"
37#include "include/utime.h"
38#include "include/CompatSet.h"
9f95a23c 39#include "common/ceph_context.h"
7c673cae
FG
40#include "common/histogram.h"
41#include "include/interval_set.h"
42#include "include/inline_memory.h"
43#include "common/Formatter.h"
44#include "common/bloom_filter.hpp"
45#include "common/hobject.h"
46#include "common/snap_types.h"
47#include "HitSet.h"
48#include "Watch.h"
49#include "include/cmp.h"
50#include "librados/ListObjectImpl.h"
51#include "compressor/Compressor.h"
9f95a23c 52#include "osd_perf_counters.h"
7c673cae
FG
53
/// magic string identifying the on-disk OSD volume format
#define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v026"

// Incompatible on-disk feature bits.  Each entry pairs a numeric feature
// id with a human readable description.
#define CEPH_OSD_FEATURE_INCOMPAT_BASE \
  CompatSet::Feature(1, "initial feature set(~v.18)")
#define CEPH_OSD_FEATURE_INCOMPAT_PGINFO \
  CompatSet::Feature(2, "pginfo object")
#define CEPH_OSD_FEATURE_INCOMPAT_OLOC \
  CompatSet::Feature(3, "object locator")
#define CEPH_OSD_FEATURE_INCOMPAT_LEC \
  CompatSet::Feature(4, "last_epoch_clean")
#define CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES \
  CompatSet::Feature(5, "categories")
#define CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL \
  CompatSet::Feature(6, "hobjectpool")
#define CEPH_OSD_FEATURE_INCOMPAT_BIGINFO \
  CompatSet::Feature(7, "biginfo")
#define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO \
  CompatSet::Feature(8, "leveldbinfo")
#define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG \
  CompatSet::Feature(9, "leveldblog")
#define CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER \
  CompatSet::Feature(10, "snapmapper")
#define CEPH_OSD_FEATURE_INCOMPAT_SHARDS \
  CompatSet::Feature(11, "sharded objects")
#define CEPH_OSD_FEATURE_INCOMPAT_HINTS \
  CompatSet::Feature(12, "transaction hints")
#define CEPH_OSD_FEATURE_INCOMPAT_PGMETA \
  CompatSet::Feature(13, "pg meta object")
#define CEPH_OSD_FEATURE_INCOMPAT_MISSING \
  CompatSet::Feature(14, "explicit missing set")
#define CEPH_OSD_FEATURE_INCOMPAT_FASTINFO \
  CompatSet::Feature(15, "fastinfo pg attr")
#define CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES \
  CompatSet::Feature(16, "deletes in missing set")
#define CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2 \
  CompatSet::Feature(17, "new snapmapper key structure")
7c673cae
FG
73
74
81eedcae
TL
/// pool priority range set by user
#define OSD_POOL_PRIORITY_MAX 10
// Parenthesized so the negated value expands as a single operand
// wherever the macro is used.
#define OSD_POOL_PRIORITY_MIN (-OSD_POOL_PRIORITY_MAX)

/// min recovery priority for MBackfillReserve
#define OSD_RECOVERY_PRIORITY_MIN 0

/// base backfill priority for MBackfillReserve
#define OSD_BACKFILL_PRIORITY_BASE 100

/// base backfill priority for MBackfillReserve (degraded PG)
#define OSD_BACKFILL_DEGRADED_PRIORITY_BASE 140

/// base recovery priority for MBackfillReserve
#define OSD_RECOVERY_PRIORITY_BASE 180

/// base backfill priority for MBackfillReserve (inactive PG)
#define OSD_BACKFILL_INACTIVE_PRIORITY_BASE 220

/// base recovery priority for MRecoveryReserve (inactive PG)
#define OSD_RECOVERY_INACTIVE_PRIORITY_BASE 220

/// max manually/automatically set recovery priority for MBackfillReserve
#define OSD_RECOVERY_PRIORITY_MAX 253

/// backfill priority for MBackfillReserve, when forced manually
#define OSD_BACKFILL_PRIORITY_FORCED 254

/// recovery priority for MRecoveryReserve, when forced manually
#define OSD_RECOVERY_PRIORITY_FORCED 255

/// priority for pg deletion when osd is not fullish
#define OSD_DELETE_PRIORITY_NORMAL 179

/// priority for pg deletion when osd is approaching full
#define OSD_DELETE_PRIORITY_FULLISH 219

/// priority when more full
#define OSD_DELETE_PRIORITY_FULL 255
114
81eedcae
TL
115static std::map<int, int> max_prio_map = {
116 {OSD_BACKFILL_PRIORITY_BASE, OSD_BACKFILL_DEGRADED_PRIORITY_BASE - 1},
117 {OSD_BACKFILL_DEGRADED_PRIORITY_BASE, OSD_RECOVERY_PRIORITY_BASE - 1},
118 {OSD_RECOVERY_PRIORITY_BASE, OSD_BACKFILL_INACTIVE_PRIORITY_BASE - 1},
119 {OSD_RECOVERY_INACTIVE_PRIORITY_BASE, OSD_RECOVERY_PRIORITY_MAX},
120 {OSD_BACKFILL_INACTIVE_PRIORITY_BASE, OSD_RECOVERY_PRIORITY_MAX}
121};
7c673cae
FG
122
123typedef hobject_t collection_list_handle_t;
124
9f95a23c 125/// convert a single CPEH_OSD_FLAG_* to a std::string
7c673cae 126const char *ceph_osd_flag_name(unsigned flag);
9f95a23c 127/// convert a single CEPH_OSD_OF_FLAG_* to a std::string
7c673cae
FG
128const char *ceph_osd_op_flag_name(unsigned flag);
129
9f95a23c
TL
130/// convert CEPH_OSD_FLAG_* op flags to a std::string
131std::string ceph_osd_flag_string(unsigned flags);
132/// conver CEPH_OSD_OP_FLAG_* op flags to a std::string
133std::string ceph_osd_op_flag_string(unsigned flags);
134/// conver CEPH_OSD_ALLOC_HINT_FLAG_* op flags to a std::string
135std::string ceph_osd_alloc_hint_flag_string(unsigned flags);
7c673cae 136
9f95a23c 137typedef std::map<std::string,std::string> osd_alert_list_t;
11fdf7f2 138/// map osd id -> alert_list_t
9f95a23c
TL
139typedef std::map<int, osd_alert_list_t> osd_alerts_t;
140void dump(ceph::Formatter* f, const osd_alerts_t& alerts);
141
142
143typedef interval_set<
144 snapid_t,
145 mempool::osdmap::flat_map<snapid_t,snapid_t>> snap_interval_set_t;
146
7c673cae
FG
147
148/**
149 * osd request identifier
150 *
151 * caller name + incarnation# + tid to unique identify this request.
152 */
153struct osd_reqid_t {
154 entity_name_t name; // who
c07f9fc5 155 ceph_tid_t tid;
7c673cae
FG
156 int32_t inc; // incarnation
157
158 osd_reqid_t()
c07f9fc5
FG
159 : tid(0), inc(0)
160 {}
7c673cae 161 osd_reqid_t(const entity_name_t& a, int i, ceph_tid_t t)
c07f9fc5
FG
162 : name(a), tid(t), inc(i)
163 {}
7c673cae
FG
164
165 DENC(osd_reqid_t, v, p) {
166 DENC_START(2, 2, p);
167 denc(v.name, p);
168 denc(v.tid, p);
169 denc(v.inc, p);
170 DENC_FINISH(p);
171 }
9f95a23c
TL
172 void dump(ceph::Formatter *f) const;
173 static void generate_test_instances(std::list<osd_reqid_t*>& o);
7c673cae
FG
174};
175WRITE_CLASS_DENC(osd_reqid_t)
176
177
178
179struct pg_shard_t {
b32b8144 180 static const int32_t NO_OSD = 0x7fffffff;
7c673cae
FG
181 int32_t osd;
182 shard_id_t shard;
183 pg_shard_t() : osd(-1), shard(shard_id_t::NO_SHARD) {}
184 explicit pg_shard_t(int osd) : osd(osd), shard(shard_id_t::NO_SHARD) {}
185 pg_shard_t(int osd, shard_id_t shard) : osd(osd), shard(shard) {}
186 bool is_undefined() const {
187 return osd == -1;
188 }
9f95a23c
TL
189 std::string get_osd() const { return (osd == NO_OSD ? "NONE" : std::to_string(osd)); }
190 void encode(ceph::buffer::list &bl) const;
191 void decode(ceph::buffer::list::const_iterator &bl);
192 void dump(ceph::Formatter *f) const {
7c673cae
FG
193 f->dump_unsigned("osd", osd);
194 if (shard != shard_id_t::NO_SHARD) {
195 f->dump_unsigned("shard", shard);
196 }
197 }
198};
199WRITE_CLASS_ENCODER(pg_shard_t)
200WRITE_EQ_OPERATORS_2(pg_shard_t, osd, shard)
201WRITE_CMP_OPERATORS_2(pg_shard_t, osd, shard)
9f95a23c 202std::ostream& operator<<(std::ostream &lhs, const pg_shard_t &rhs);
7c673cae
FG
203
204class IsPGRecoverablePredicate {
205public:
206 /**
207 * have encodes the shards available
208 */
9f95a23c 209 virtual bool operator()(const std::set<pg_shard_t> &have) const = 0;
7c673cae
FG
210 virtual ~IsPGRecoverablePredicate() {}
211};
212
213class IsPGReadablePredicate {
214public:
215 /**
216 * have encodes the shards available
217 */
9f95a23c 218 virtual bool operator()(const std::set<pg_shard_t> &have) const = 0;
7c673cae
FG
219 virtual ~IsPGReadablePredicate() {}
220};
221
9f95a23c 222inline std::ostream& operator<<(std::ostream& out, const osd_reqid_t& r) {
7c673cae
FG
223 return out << r.name << "." << r.inc << ":" << r.tid;
224}
225
226inline bool operator==(const osd_reqid_t& l, const osd_reqid_t& r) {
227 return (l.name == r.name) && (l.inc == r.inc) && (l.tid == r.tid);
228}
229inline bool operator!=(const osd_reqid_t& l, const osd_reqid_t& r) {
230 return (l.name != r.name) || (l.inc != r.inc) || (l.tid != r.tid);
231}
232inline bool operator<(const osd_reqid_t& l, const osd_reqid_t& r) {
233 return (l.name < r.name) || (l.inc < r.inc) ||
234 (l.name == r.name && l.inc == r.inc && l.tid < r.tid);
235}
236inline bool operator<=(const osd_reqid_t& l, const osd_reqid_t& r) {
237 return (l.name < r.name) || (l.inc < r.inc) ||
238 (l.name == r.name && l.inc == r.inc && l.tid <= r.tid);
239}
240inline bool operator>(const osd_reqid_t& l, const osd_reqid_t& r) { return !(l <= r); }
241inline bool operator>=(const osd_reqid_t& l, const osd_reqid_t& r) { return !(l < r); }
242
243namespace std {
244 template<> struct hash<osd_reqid_t> {
245 size_t operator()(const osd_reqid_t &r) const {
246 static hash<uint64_t> H;
247 return H(r.name.num() ^ r.tid ^ r.inc);
248 }
249 };
250} // namespace std
251
252
253// -----
254
255// a locator constrains the placement of an object. mainly, which pool
256// does it go in.
257struct object_locator_t {
258 // You specify either the hash or the key -- not both
259 int64_t pool; ///< pool id
9f95a23c
TL
260 std::string key; ///< key std::string (if non-empty)
261 std::string nspace; ///< namespace
7c673cae
FG
262 int64_t hash; ///< hash position (if >= 0)
263
264 explicit object_locator_t()
265 : pool(-1), hash(-1) {}
266 explicit object_locator_t(int64_t po)
267 : pool(po), hash(-1) {}
268 explicit object_locator_t(int64_t po, int64_t ps)
269 : pool(po), hash(ps) {}
9f95a23c 270 explicit object_locator_t(int64_t po, std::string ns)
7c673cae 271 : pool(po), nspace(ns), hash(-1) {}
9f95a23c 272 explicit object_locator_t(int64_t po, std::string ns, int64_t ps)
7c673cae 273 : pool(po), nspace(ns), hash(ps) {}
9f95a23c 274 explicit object_locator_t(int64_t po, std::string ns, std::string s)
7c673cae
FG
275 : pool(po), key(s), nspace(ns), hash(-1) {}
276 explicit object_locator_t(const hobject_t& soid)
277 : pool(soid.pool), key(soid.get_key()), nspace(soid.nspace), hash(-1) {}
278
279 int64_t get_pool() const {
280 return pool;
281 }
282
283 void clear() {
284 pool = -1;
285 key = "";
286 nspace = "";
287 hash = -1;
288 }
289
290 bool empty() const {
291 return pool == -1;
292 }
293
9f95a23c
TL
294 void encode(ceph::buffer::list& bl) const;
295 void decode(ceph::buffer::list::const_iterator& p);
296 void dump(ceph::Formatter *f) const;
297 static void generate_test_instances(std::list<object_locator_t*>& o);
7c673cae
FG
298};
299WRITE_CLASS_ENCODER(object_locator_t)
300
301inline bool operator==(const object_locator_t& l, const object_locator_t& r) {
302 return l.pool == r.pool && l.key == r.key && l.nspace == r.nspace && l.hash == r.hash;
303}
304inline bool operator!=(const object_locator_t& l, const object_locator_t& r) {
305 return !(l == r);
306}
307
9f95a23c 308inline std::ostream& operator<<(std::ostream& out, const object_locator_t& loc)
7c673cae
FG
309{
310 out << "@" << loc.pool;
311 if (loc.nspace.length())
312 out << ";" << loc.nspace;
313 if (loc.key.length())
314 out << ":" << loc.key;
315 return out;
316}
317
318struct request_redirect_t {
319private:
320 object_locator_t redirect_locator; ///< this is authoritative
9f95a23c 321 std::string redirect_object; ///< If non-empty, the request goes to this object name
7c673cae 322
9f95a23c 323 friend std::ostream& operator<<(std::ostream& out, const request_redirect_t& redir);
7c673cae
FG
324public:
325
326 request_redirect_t() {}
327 explicit request_redirect_t(const object_locator_t& orig, int64_t rpool) :
328 redirect_locator(orig) { redirect_locator.pool = rpool; }
329 explicit request_redirect_t(const object_locator_t& rloc) :
330 redirect_locator(rloc) {}
331 explicit request_redirect_t(const object_locator_t& orig,
9f95a23c 332 const std::string& robj) :
7c673cae
FG
333 redirect_locator(orig), redirect_object(robj) {}
334
7c673cae
FG
335 bool empty() const { return redirect_locator.empty() &&
336 redirect_object.empty(); }
337
9f95a23c 338 void combine_with_locator(object_locator_t& orig, std::string& obj) const {
7c673cae
FG
339 orig = redirect_locator;
340 if (!redirect_object.empty())
341 obj = redirect_object;
342 }
343
9f95a23c
TL
344 void encode(ceph::buffer::list& bl) const;
345 void decode(ceph::buffer::list::const_iterator& bl);
346 void dump(ceph::Formatter *f) const;
347 static void generate_test_instances(std::list<request_redirect_t*>& o);
7c673cae
FG
348};
349WRITE_CLASS_ENCODER(request_redirect_t)
350
9f95a23c 351inline std::ostream& operator<<(std::ostream& out, const request_redirect_t& redir) {
7c673cae
FG
352 out << "object " << redir.redirect_object << ", locator{" << redir.redirect_locator << "}";
353 return out;
354}
355
// Internal OSD op flags - set by the OSD based on the op types.
// One bit per flag; bit 0 is not assigned.
enum {
  CEPH_OSD_RMW_FLAG_READ              = 1 << 1,
  CEPH_OSD_RMW_FLAG_WRITE             = 1 << 2,
  CEPH_OSD_RMW_FLAG_CLASS_READ        = 1 << 3,
  CEPH_OSD_RMW_FLAG_CLASS_WRITE       = 1 << 4,
  CEPH_OSD_RMW_FLAG_PGOP              = 1 << 5,
  CEPH_OSD_RMW_FLAG_CACHE             = 1 << 6,
  CEPH_OSD_RMW_FLAG_FORCE_PROMOTE     = 1 << 7,
  CEPH_OSD_RMW_FLAG_SKIP_HANDLE_CACHE = 1 << 8,
  CEPH_OSD_RMW_FLAG_SKIP_PROMOTE      = 1 << 9,
  CEPH_OSD_RMW_FLAG_RWORDERED         = 1 << 10,
  CEPH_OSD_RMW_FLAG_RETURNVEC         = 1 << 11,
};
370
371
// pg stuff

/// ghobject_t naming the "osd_superblock" object
#define OSD_SUPERBLOCK_GOBJECT \
  ghobject_t(hobject_t(sobject_t(object_t("osd_superblock"), 0)))

// placement seed (a hash value)
using ps_t = uint32_t;
378
379// old (v1) pg_t encoding (wrap old struct ceph_pg)
380struct old_pg_t {
381 ceph_pg v;
9f95a23c
TL
382 void encode(ceph::buffer::list& bl) const {
383 ceph::encode_raw(v, bl);
7c673cae 384 }
9f95a23c
TL
385 void decode(ceph::buffer::list::const_iterator& bl) {
386 ceph::decode_raw(v, bl);
7c673cae
FG
387 }
388};
389WRITE_CLASS_ENCODER(old_pg_t)
390
391// placement group id
392struct pg_t {
393 uint64_t m_pool;
394 uint32_t m_seed;
7c673cae 395
11fdf7f2
TL
396 pg_t() : m_pool(0), m_seed(0) {}
397 pg_t(ps_t seed, uint64_t pool) :
398 m_pool(pool), m_seed(seed) {}
7c673cae
FG
399 // cppcheck-suppress noExplicitConstructor
400 pg_t(const ceph_pg& cpg) :
11fdf7f2 401 m_pool(cpg.pool), m_seed(cpg.ps) {}
7c673cae
FG
402
403 // cppcheck-suppress noExplicitConstructor
404 pg_t(const old_pg_t& opg) {
405 *this = opg.v;
406 }
407
408 old_pg_t get_old_pg() const {
409 old_pg_t o;
11fdf7f2 410 ceph_assert(m_pool < 0xffffffffull);
7c673cae
FG
411 o.v.pool = m_pool;
412 o.v.ps = m_seed;
11fdf7f2 413 o.v.preferred = (__s16)-1;
7c673cae
FG
414 return o;
415 }
416
417 ps_t ps() const {
418 return m_seed;
419 }
11fdf7f2 420 int64_t pool() const {
7c673cae
FG
421 return m_pool;
422 }
7c673cae
FG
423
424 static const uint8_t calc_name_buf_size = 36; // max length for max values len("18446744073709551615.ffffffff") + future suffix len("_head") + '\0'
425 char *calc_name(char *buf, const char *suffix_backwords) const;
426
427 void set_ps(ps_t p) {
428 m_seed = p;
429 }
430 void set_pool(uint64_t p) {
431 m_pool = p;
432 }
7c673cae
FG
433
434 pg_t get_parent() const;
435 pg_t get_ancestor(unsigned old_pg_num) const;
436
437 int print(char *o, int maxlen) const;
438 bool parse(const char *s);
439
9f95a23c 440 bool is_split(unsigned old_pg_num, unsigned new_pg_num, std::set<pg_t> *pchildren) const;
7c673cae 441
11fdf7f2
TL
442 bool is_merge_source(unsigned old_pg_num, unsigned new_pg_num, pg_t *parent) const;
443 bool is_merge_target(unsigned old_pg_num, unsigned new_pg_num) const {
444 return ps() < new_pg_num && is_split(new_pg_num, old_pg_num, nullptr);
445 }
446
7c673cae
FG
447 /**
448 * Returns b such that for all object o:
449 * ~((~0)<<b) & o.hash) == 0 iff o is in the pg for *this
450 */
451 unsigned get_split_bits(unsigned pg_num) const;
452
9f95a23c 453 bool contains(int bits, const ghobject_t& oid) const {
f64942e4
AA
454 return
455 (int64_t)m_pool == oid.hobj.get_logical_pool() &&
456 oid.match(bits, ps());
7c673cae 457 }
9f95a23c 458 bool contains(int bits, const hobject_t& oid) const {
f64942e4
AA
459 return
460 (int64_t)m_pool == oid.get_logical_pool() &&
461 oid.match(bits, ps());
7c673cae
FG
462 }
463
464 hobject_t get_hobj_start() const;
465 hobject_t get_hobj_end(unsigned pg_num) const;
466
9f95a23c
TL
467 // strong ordering is supported
468 inline int compare(const pg_t& p) const noexcept {
469 if (auto delta = pool() - p.pool(); delta != 0) {
470 return delta;
471 } else if (ps() < p.ps()) {
472 return -1;
473 } else if (ps() > p.ps()) {
474 return 1;
475 } else {
476 return 0;
477 }
478 }
479
480 void encode(ceph::buffer::list& bl) const {
11fdf7f2 481 using ceph::encode;
7c673cae 482 __u8 v = 1;
11fdf7f2
TL
483 encode(v, bl);
484 encode(m_pool, bl);
485 encode(m_seed, bl);
486 encode((int32_t)-1, bl); // was preferred
7c673cae 487 }
9f95a23c 488 void decode(ceph::buffer::list::const_iterator& bl) {
11fdf7f2 489 using ceph::decode;
7c673cae 490 __u8 v;
11fdf7f2
TL
491 decode(v, bl);
492 decode(m_pool, bl);
493 decode(m_seed, bl);
9f95a23c 494 bl += sizeof(int32_t); // was preferred
7c673cae 495 }
9f95a23c 496 void decode_old(ceph::buffer::list::const_iterator& bl) {
11fdf7f2 497 using ceph::decode;
7c673cae 498 old_pg_t opg;
11fdf7f2 499 decode(opg, bl);
7c673cae
FG
500 *this = opg;
501 }
9f95a23c
TL
502 void dump(ceph::Formatter *f) const;
503 static void generate_test_instances(std::list<pg_t*>& o);
7c673cae
FG
504};
505WRITE_CLASS_ENCODER(pg_t)
506
507inline bool operator<(const pg_t& l, const pg_t& r) {
9f95a23c 508 return l.compare(r) < 0;
7c673cae
FG
509}
510inline bool operator<=(const pg_t& l, const pg_t& r) {
9f95a23c 511 return l.compare(r) <= 0;
7c673cae
FG
512}
513inline bool operator==(const pg_t& l, const pg_t& r) {
9f95a23c 514 return l.compare(r) == 0;
7c673cae
FG
515}
516inline bool operator!=(const pg_t& l, const pg_t& r) {
9f95a23c 517 return l.compare(r) != 0;
7c673cae
FG
518}
519inline bool operator>(const pg_t& l, const pg_t& r) {
9f95a23c 520 return l.compare(r) > 0;
7c673cae
FG
521}
522inline bool operator>=(const pg_t& l, const pg_t& r) {
9f95a23c 523 return l.compare(r) >= 0;
7c673cae
FG
524}
525
9f95a23c 526std::ostream& operator<<(std::ostream& out, const pg_t &pg);
7c673cae
FG
527
528namespace std {
529 template<> struct hash< pg_t >
530 {
531 size_t operator()( const pg_t& x ) const
532 {
533 static hash<uint32_t> H;
11fdf7f2
TL
534 // xor (s32)-1 in there to preserve original m_preferred result (paranoia!)
535 return H((x.pool() & 0xffffffff) ^ (x.pool() >> 32) ^ x.ps() ^ (int32_t)(-1));
7c673cae
FG
536 }
537 };
538} // namespace std
539
540struct spg_t {
541 pg_t pgid;
542 shard_id_t shard;
543 spg_t() : shard(shard_id_t::NO_SHARD) {}
544 spg_t(pg_t pgid, shard_id_t shard) : pgid(pgid), shard(shard) {}
545 explicit spg_t(pg_t pgid) : pgid(pgid), shard(shard_id_t::NO_SHARD) {}
546 unsigned get_split_bits(unsigned pg_num) const {
547 return pgid.get_split_bits(pg_num);
548 }
549 spg_t get_parent() const {
550 return spg_t(pgid.get_parent(), shard);
551 }
552 ps_t ps() const {
553 return pgid.ps();
554 }
555 uint64_t pool() const {
556 return pgid.pool();
557 }
9f95a23c
TL
558 void reset_shard(shard_id_t s) {
559 shard = s;
560 }
7c673cae
FG
561
562 static const uint8_t calc_name_buf_size = pg_t::calc_name_buf_size + 4; // 36 + len('s') + len("255");
563 char *calc_name(char *buf, const char *suffix_backwords) const;
564
565 bool parse(const char *s);
566 bool parse(const std::string& s) {
567 return parse(s.c_str());
568 }
11fdf7f2
TL
569
570 spg_t get_ancestor(unsigned old_pg_num) const {
571 return spg_t(pgid.get_ancestor(old_pg_num), shard);
572 }
573
7c673cae 574 bool is_split(unsigned old_pg_num, unsigned new_pg_num,
9f95a23c
TL
575 std::set<spg_t> *pchildren) const {
576 std::set<pg_t> _children;
577 std::set<pg_t> *children = pchildren ? &_children : NULL;
7c673cae
FG
578 bool is_split = pgid.is_split(old_pg_num, new_pg_num, children);
579 if (pchildren && is_split) {
9f95a23c 580 for (std::set<pg_t>::iterator i = _children.begin();
7c673cae
FG
581 i != _children.end();
582 ++i) {
583 pchildren->insert(spg_t(*i, shard));
584 }
585 }
586 return is_split;
587 }
11fdf7f2
TL
588 bool is_merge_target(unsigned old_pg_num, unsigned new_pg_num) const {
589 return pgid.is_merge_target(old_pg_num, new_pg_num);
590 }
591 bool is_merge_source(unsigned old_pg_num, unsigned new_pg_num,
592 spg_t *parent) const {
593 spg_t out = *this;
594 bool r = pgid.is_merge_source(old_pg_num, new_pg_num, &out.pgid);
595 if (r && parent) {
596 *parent = out;
597 }
598 return r;
599 }
600
7c673cae
FG
601 bool is_no_shard() const {
602 return shard == shard_id_t::NO_SHARD;
603 }
604
605 ghobject_t make_pgmeta_oid() const {
606 return ghobject_t::make_pgmeta(pgid.pool(), pgid.ps(), shard);
607 }
608
9f95a23c 609 void encode(ceph::buffer::list &bl) const {
7c673cae 610 ENCODE_START(1, 1, bl);
11fdf7f2
TL
611 encode(pgid, bl);
612 encode(shard, bl);
7c673cae
FG
613 ENCODE_FINISH(bl);
614 }
9f95a23c 615 void decode(ceph::buffer::list::const_iterator& bl) {
7c673cae 616 DECODE_START(1, bl);
11fdf7f2
TL
617 decode(pgid, bl);
618 decode(shard, bl);
7c673cae
FG
619 DECODE_FINISH(bl);
620 }
621
9f95a23c 622 ghobject_t make_temp_ghobject(const std::string& name) const {
7c673cae
FG
623 return ghobject_t(
624 hobject_t(object_t(name), "", CEPH_NOSNAP,
625 pgid.ps(),
f64942e4
AA
626 hobject_t::get_temp_pool(pgid.pool()),
627 ""),
7c673cae
FG
628 ghobject_t::NO_GEN,
629 shard);
630 }
631
632 unsigned hash_to_shard(unsigned num_shards) const {
633 return ps() % num_shards;
634 }
635};
636WRITE_CLASS_ENCODER(spg_t)
637WRITE_EQ_OPERATORS_2(spg_t, pgid, shard)
638WRITE_CMP_OPERATORS_2(spg_t, pgid, shard)
639
640namespace std {
641 template<> struct hash< spg_t >
642 {
643 size_t operator()( const spg_t& x ) const
644 {
645 static hash<uint32_t> H;
646 return H(hash<pg_t>()(x.pgid) ^ x.shard);
647 }
648 };
649} // namespace std
650
9f95a23c 651std::ostream& operator<<(std::ostream& out, const spg_t &pg);
7c673cae
FG
652
653// ----------------------
654
655class coll_t {
656 enum type_t {
657 TYPE_META = 0,
658 TYPE_LEGACY_TEMP = 1, /* no longer used */
659 TYPE_PG = 2,
660 TYPE_PG_TEMP = 3,
661 };
662 type_t type;
663 spg_t pgid;
664 uint64_t removal_seq; // note: deprecated, not encoded
665
666 char _str_buff[spg_t::calc_name_buf_size];
667 char *_str;
668
669 void calc_str();
670
671 coll_t(type_t t, spg_t p, uint64_t r)
672 : type(t), pgid(p), removal_seq(r) {
673 calc_str();
674 }
675
676public:
677 coll_t() : type(TYPE_META), removal_seq(0)
678 {
679 calc_str();
680 }
681
682 coll_t(const coll_t& other)
683 : type(other.type), pgid(other.pgid), removal_seq(other.removal_seq) {
684 calc_str();
685 }
686
687 explicit coll_t(spg_t pgid)
688 : type(TYPE_PG), pgid(pgid), removal_seq(0)
689 {
690 calc_str();
691 }
692
693 coll_t& operator=(const coll_t& rhs)
694 {
695 this->type = rhs.type;
696 this->pgid = rhs.pgid;
697 this->removal_seq = rhs.removal_seq;
698 this->calc_str();
699 return *this;
700 }
701
702 // named constructors
703 static coll_t meta() {
704 return coll_t();
705 }
706 static coll_t pg(spg_t p) {
707 return coll_t(p);
708 }
709
710 const std::string to_str() const {
9f95a23c 711 return std::string(_str);
7c673cae
FG
712 }
713 const char *c_str() const {
714 return _str;
715 }
716
717 bool parse(const std::string& s);
718
719 int operator<(const coll_t &rhs) const {
720 return type < rhs.type ||
721 (type == rhs.type && pgid < rhs.pgid);
722 }
723
724 bool is_meta() const {
725 return type == TYPE_META;
726 }
727 bool is_pg_prefix(spg_t *pgid_) const {
728 if (type == TYPE_PG || type == TYPE_PG_TEMP) {
729 *pgid_ = pgid;
730 return true;
731 }
732 return false;
733 }
734 bool is_pg() const {
735 return type == TYPE_PG;
736 }
737 bool is_pg(spg_t *pgid_) const {
738 if (type == TYPE_PG) {
739 *pgid_ = pgid;
740 return true;
741 }
742 return false;
743 }
744 bool is_temp() const {
745 return type == TYPE_PG_TEMP;
746 }
747 bool is_temp(spg_t *pgid_) const {
748 if (type == TYPE_PG_TEMP) {
749 *pgid_ = pgid;
750 return true;
751 }
752 return false;
753 }
9f95a23c
TL
754 int64_t pool() const {
755 return pgid.pool();
756 }
7c673cae 757
9f95a23c
TL
758 void encode(ceph::buffer::list& bl) const;
759 void decode(ceph::buffer::list::const_iterator& bl);
7c673cae
FG
760 size_t encoded_size() const;
761
762 inline bool operator==(const coll_t& rhs) const {
763 // only compare type if meta
764 if (type != rhs.type)
765 return false;
766 if (type == TYPE_META)
767 return true;
768 return type == rhs.type && pgid == rhs.pgid;
769 }
770 inline bool operator!=(const coll_t& rhs) const {
771 return !(*this == rhs);
772 }
773
774 // get a TEMP collection that corresponds to the current collection,
775 // which we presume is a pg collection.
776 coll_t get_temp() const {
11fdf7f2 777 ceph_assert(type == TYPE_PG);
7c673cae
FG
778 return coll_t(TYPE_PG_TEMP, pgid, 0);
779 }
780
781 ghobject_t get_min_hobj() const {
782 ghobject_t o;
783 switch (type) {
784 case TYPE_PG:
785 o.hobj.pool = pgid.pool();
786 o.set_shard(pgid.shard);
787 break;
788 case TYPE_META:
789 o.hobj.pool = -1;
790 break;
791 default:
792 break;
793 }
794 return o;
795 }
796
797 unsigned hash_to_shard(unsigned num_shards) const {
798 if (type == TYPE_PG)
799 return pgid.hash_to_shard(num_shards);
800 return 0; // whatever.
801 }
802
9f95a23c
TL
803 void dump(ceph::Formatter *f) const;
804 static void generate_test_instances(std::list<coll_t*>& o);
7c673cae
FG
805};
806
807WRITE_CLASS_ENCODER(coll_t)
808
9f95a23c 809inline std::ostream& operator<<(std::ostream& out, const coll_t& c) {
7c673cae
FG
810 out << c.to_str();
811 return out;
812}
813
814namespace std {
815 template<> struct hash<coll_t> {
816 size_t operator()(const coll_t &c) const {
817 size_t h = 0;
9f95a23c 818 std::string str(c.to_str());
7c673cae
FG
819 std::string::const_iterator end(str.end());
820 for (std::string::const_iterator s = str.begin(); s != end; ++s) {
821 h += *s;
822 h += (h << 10);
823 h ^= (h >> 6);
824 }
825 h += (h << 3);
826 h ^= (h >> 11);
827 h += (h << 15);
828 return h;
829 }
830 };
831} // namespace std
832
9f95a23c 833inline std::ostream& operator<<(std::ostream& out, const ceph_object_layout &ol)
7c673cae
FG
834{
835 out << pg_t(ol.ol_pgid);
836 int su = ol.ol_stripe_unit;
837 if (su)
838 out << ".su=" << su;
839 return out;
840}
841
842
843
844// compound rados version type
845/* WARNING: If add member in eversion_t, please make sure the encode/decode function
846 * work well. For little-endian machine, we should make sure there is no padding
847 * in 32-bit machine and 64-bit machine.
848 */
849class eversion_t {
850public:
851 version_t version;
852 epoch_t epoch;
853 __u32 __pad;
854 eversion_t() : version(0), epoch(0), __pad(0) {}
855 eversion_t(epoch_t e, version_t v) : version(v), epoch(e), __pad(0) {}
856
857 // cppcheck-suppress noExplicitConstructor
c07f9fc5 858 eversion_t(const ceph_eversion& ce) :
7c673cae
FG
859 version(ce.version),
860 epoch(ce.epoch),
861 __pad(0) { }
862
9f95a23c 863 explicit eversion_t(ceph::buffer::list& bl) : __pad(0) { decode(bl); }
7c673cae 864
11fdf7f2
TL
865 static const eversion_t& max() {
866 static const eversion_t max(-1,-1);
7c673cae
FG
867 return max;
868 }
869
870 operator ceph_eversion() {
871 ceph_eversion c;
872 c.epoch = epoch;
873 c.version = version;
874 return c;
875 }
876
9f95a23c 877 std::string get_key_name() const;
7c673cae 878
11fdf7f2
TL
879 // key must point to the beginning of a block of 32 chars
880 inline void get_key_name(char* key) const {
881 // Below is equivalent of sprintf("%010u.%020llu");
882 key[31] = 0;
883 ritoa<uint64_t, 10, 20>(version, key + 31);
884 key[10] = '.';
885 ritoa<uint32_t, 10, 10>(epoch, key + 10);
886 }
887
9f95a23c 888 void encode(ceph::buffer::list &bl) const {
7c673cae
FG
889#if defined(CEPH_LITTLE_ENDIAN)
890 bl.append((char *)this, sizeof(version_t) + sizeof(epoch_t));
891#else
11fdf7f2
TL
892 using ceph::encode;
893 encode(version, bl);
894 encode(epoch, bl);
7c673cae
FG
895#endif
896 }
9f95a23c 897 void decode(ceph::buffer::list::const_iterator &bl) {
7c673cae
FG
898#if defined(CEPH_LITTLE_ENDIAN)
899 bl.copy(sizeof(version_t) + sizeof(epoch_t), (char *)this);
900#else
11fdf7f2
TL
901 using ceph::decode;
902 decode(version, bl);
903 decode(epoch, bl);
7c673cae
FG
904#endif
905 }
9f95a23c 906 void decode(ceph::buffer::list& bl) {
11fdf7f2 907 auto p = std::cbegin(bl);
7c673cae
FG
908 decode(p);
909 }
910};
911WRITE_CLASS_ENCODER(eversion_t)
912
913inline bool operator==(const eversion_t& l, const eversion_t& r) {
914 return (l.epoch == r.epoch) && (l.version == r.version);
915}
916inline bool operator!=(const eversion_t& l, const eversion_t& r) {
917 return (l.epoch != r.epoch) || (l.version != r.version);
918}
919inline bool operator<(const eversion_t& l, const eversion_t& r) {
920 return (l.epoch == r.epoch) ? (l.version < r.version):(l.epoch < r.epoch);
921}
922inline bool operator<=(const eversion_t& l, const eversion_t& r) {
923 return (l.epoch == r.epoch) ? (l.version <= r.version):(l.epoch <= r.epoch);
924}
925inline bool operator>(const eversion_t& l, const eversion_t& r) {
926 return (l.epoch == r.epoch) ? (l.version > r.version):(l.epoch > r.epoch);
927}
928inline bool operator>=(const eversion_t& l, const eversion_t& r) {
929 return (l.epoch == r.epoch) ? (l.version >= r.version):(l.epoch >= r.epoch);
930}
9f95a23c 931inline std::ostream& operator<<(std::ostream& out, const eversion_t& e) {
7c673cae
FG
932 return out << e.epoch << "'" << e.version;
933}
934
935/**
936 * objectstore_perf_stat_t
937 *
938 * current perf information about the osd
939 */
940struct objectstore_perf_stat_t {
11fdf7f2
TL
941 // cur_op_latency is in ns since double add/sub are not associative
942 uint64_t os_commit_latency_ns;
943 uint64_t os_apply_latency_ns;
7c673cae
FG
944
945 objectstore_perf_stat_t() :
11fdf7f2 946 os_commit_latency_ns(0), os_apply_latency_ns(0) {}
7c673cae
FG
947
948 bool operator==(const objectstore_perf_stat_t &r) const {
11fdf7f2
TL
949 return os_commit_latency_ns == r.os_commit_latency_ns &&
950 os_apply_latency_ns == r.os_apply_latency_ns;
7c673cae
FG
951 }
952
953 void add(const objectstore_perf_stat_t &o) {
11fdf7f2
TL
954 os_commit_latency_ns += o.os_commit_latency_ns;
955 os_apply_latency_ns += o.os_apply_latency_ns;
7c673cae
FG
956 }
957 void sub(const objectstore_perf_stat_t &o) {
11fdf7f2
TL
958 os_commit_latency_ns -= o.os_commit_latency_ns;
959 os_apply_latency_ns -= o.os_apply_latency_ns;
7c673cae 960 }
9f95a23c
TL
961 void dump(ceph::Formatter *f) const;
962 void encode(ceph::buffer::list &bl, uint64_t features) const;
963 void decode(ceph::buffer::list::const_iterator &bl);
7c673cae
FG
964 static void generate_test_instances(std::list<objectstore_perf_stat_t*>& o);
965};
11fdf7f2 966WRITE_CLASS_ENCODER_FEATURES(objectstore_perf_stat_t)
7c673cae
FG
967
968/*
969 * pg states
970 */
11fdf7f2
TL
971#define PG_STATE_CREATING (1ULL << 0) // creating
972#define PG_STATE_ACTIVE (1ULL << 1) // i am active. (primary: replicas too)
973#define PG_STATE_CLEAN (1ULL << 2) // peers are complete, clean of stray replicas.
974#define PG_STATE_DOWN (1ULL << 4) // a needed replica is down, PG offline
975#define PG_STATE_RECOVERY_UNFOUND (1ULL << 5) // recovery stopped due to unfound
976#define PG_STATE_BACKFILL_UNFOUND (1ULL << 6) // backfill stopped due to unfound
977#define PG_STATE_PREMERGE (1ULL << 7) // i am preparing to merge
978#define PG_STATE_SCRUBBING (1ULL << 8) // scrubbing
979//#define PG_STATE_SCRUBQ (1ULL << 9) // queued for scrub
980#define PG_STATE_DEGRADED (1ULL << 10) // pg contains objects with reduced redundancy
981#define PG_STATE_INCONSISTENT (1ULL << 11) // pg replicas are inconsistent (but shouldn't be)
982#define PG_STATE_PEERING (1ULL << 12) // pg is (re)peering
983#define PG_STATE_REPAIR (1ULL << 13) // pg should repair on next scrub
984#define PG_STATE_RECOVERING (1ULL << 14) // pg is recovering/migrating objects
985#define PG_STATE_BACKFILL_WAIT (1ULL << 15) // [active] reserving backfill
986#define PG_STATE_INCOMPLETE (1ULL << 16) // incomplete content, peering failed.
987#define PG_STATE_STALE (1ULL << 17) // our state for this pg is stale, unknown.
988#define PG_STATE_REMAPPED (1ULL << 18) // pg is explicitly remapped to different OSDs than CRUSH
989#define PG_STATE_DEEP_SCRUB (1ULL << 19) // deep scrub: check CRC32 on files
990#define PG_STATE_BACKFILLING (1ULL << 20) // [active] backfilling pg content
991#define PG_STATE_BACKFILL_TOOFULL (1ULL << 21) // backfill can't proceed: too full
992#define PG_STATE_RECOVERY_WAIT (1ULL << 22) // waiting for recovery reservations
993#define PG_STATE_UNDERSIZED (1ULL << 23) // pg acting < pool size
994#define PG_STATE_ACTIVATING (1ULL << 24) // pg is peered but not yet active
995#define PG_STATE_PEERED (1ULL << 25) // peered, cannot go active, can recover
996#define PG_STATE_SNAPTRIM (1ULL << 26) // trimming snaps
997#define PG_STATE_SNAPTRIM_WAIT (1ULL << 27) // queued to trim snaps
998#define PG_STATE_RECOVERY_TOOFULL (1ULL << 28) // recovery can't proceed: too full
999#define PG_STATE_SNAPTRIM_ERROR (1ULL << 29) // error stopped trimming snaps
1000#define PG_STATE_FORCED_RECOVERY (1ULL << 30) // force recovery of this pg before any other
1001#define PG_STATE_FORCED_BACKFILL (1ULL << 31) // force backfill of this pg before any other
1002#define PG_STATE_FAILED_REPAIR (1ULL << 32) // A repair failed to fix all errors
9f95a23c
TL
1003#define PG_STATE_LAGGY (1ULL << 33) // PG is laggy/unreadable due to slow/delayed pings
1004#define PG_STATE_WAIT (1ULL << 34) // PG is waiting for prior intervals' readable period to expire
11fdf7f2
TL
1005
1006std::string pg_state_string(uint64_t state);
9f95a23c
TL
1007std::string pg_vector_string(const std::vector<int32_t> &a);
1008std::optional<uint64_t> pg_string_state(const std::string& state);
7c673cae
FG
1009
1010
1011/*
1012 * pool_snap_info_t
1013 *
1014 * attributes for a single pool snapshot.
1015 */
// Plain record describing one pool snapshot. The stream operator declared for
// this type prints it as snapid(name stamp).
1016struct pool_snap_info_t {
// snapshot id
1017 snapid_t snapid;
// time associated with the snapshot (presumably its creation time -- TODO confirm)
1018 utime_t stamp;
// snapshot name
9f95a23c 1019 std::string name;
7c673cae 1020
9f95a23c
TL
// Formatter dump, feature-dependent wire encode/decode, and test-instance
// generation are declared here and defined out of line.
1021 void dump(ceph::Formatter *f) const;
1022 void encode(ceph::buffer::list& bl, uint64_t features) const;
1023 void decode(ceph::buffer::list::const_iterator& bl);
1024 static void generate_test_instances(std::list<pool_snap_info_t*>& o);
7c673cae
FG
1025};
1026WRITE_CLASS_ENCODER_FEATURES(pool_snap_info_t)
1027
9f95a23c 1028inline std::ostream& operator<<(std::ostream& out, const pool_snap_info_t& si) {
7c673cae
FG
1029 return out << si.snapid << '(' << si.name << ' ' << si.stamp << ')';
1030}
1031
1032
1033/*
1034 * pool_opts_t
1035 *
1036 * pool options.
1037 */
1038
1039class pool_opts_t {
1040public:
1041 enum key_t {
1042 SCRUB_MIN_INTERVAL,
1043 SCRUB_MAX_INTERVAL,
1044 DEEP_SCRUB_INTERVAL,
1045 RECOVERY_PRIORITY,
1046 RECOVERY_OP_PRIORITY,
1047 SCRUB_PRIORITY,
1048 COMPRESSION_MODE,
1049 COMPRESSION_ALGORITHM,
1050 COMPRESSION_REQUIRED_RATIO,
1051 COMPRESSION_MAX_BLOB_SIZE,
1052 COMPRESSION_MIN_BLOB_SIZE,
1053 CSUM_TYPE,
1054 CSUM_MAX_BLOCK,
1055 CSUM_MIN_BLOCK,
11fdf7f2
TL
1056 FINGERPRINT_ALGORITHM,
1057 PG_NUM_MIN, // min pg_num
1058 TARGET_SIZE_BYTES, // total bytes in pool
1059 TARGET_SIZE_RATIO, // fraction of total cluster
1060 PG_AUTOSCALE_BIAS,
9f95a23c 1061 READ_LEASE_INTERVAL,
7c673cae
FG
1062 };
1063
1064 enum type_t {
1065 STR,
1066 INT,
1067 DOUBLE,
1068 };
1069
1070 struct opt_desc_t {
1071 key_t key;
1072 type_t type;
1073
1074 opt_desc_t(key_t k, type_t t) : key(k), type(t) {}
1075
1076 bool operator==(const opt_desc_t& rhs) const {
1077 return key == rhs.key && type == rhs.type;
1078 }
1079 };
1080
11fdf7f2 1081 typedef boost::variant<std::string,int64_t,double> value_t;
7c673cae
FG
1082
1083 static bool is_opt_name(const std::string& name);
1084 static opt_desc_t get_opt_desc(const std::string& name);
1085
1086 pool_opts_t() : opts() {}
1087
1088 bool is_set(key_t key) const;
1089
1090 template<typename T>
1091 void set(key_t key, const T &val) {
1092 value_t value = val;
1093 opts[key] = value;
1094 }
1095
1096 template<typename T>
1097 bool get(key_t key, T *val) const {
1098 opts_t::const_iterator i = opts.find(key);
1099 if (i == opts.end()) {
1100 return false;
1101 }
1102 *val = boost::get<T>(i->second);
1103 return true;
1104 }
1105
1106 const value_t& get(key_t key) const;
1107
1108 bool unset(key_t key);
1109
9f95a23c 1110 void dump(const std::string& name, ceph::Formatter *f) const;
7c673cae 1111
9f95a23c
TL
1112 void dump(ceph::Formatter *f) const;
1113 void encode(ceph::buffer::list &bl, uint64_t features) const;
1114 void decode(ceph::buffer::list::const_iterator &bl);
7c673cae
FG
1115
1116private:
1117 typedef std::map<key_t, value_t> opts_t;
1118 opts_t opts;
1119
9f95a23c 1120 friend std::ostream& operator<<(std::ostream& out, const pool_opts_t& opts);
7c673cae 1121};
11fdf7f2
TL
1122WRITE_CLASS_ENCODER_FEATURES(pool_opts_t)
1123
// Record of a PG merge. pg_pool_t stores one of these as last_pg_merge_meta
// and fills every field from the arguments of pg_pool_t::dec_pg_num().
1124struct pg_merge_meta_t {
1125 pg_t source_pgid;
1126 epoch_t ready_epoch = 0;
1127 epoch_t last_epoch_started = 0;
1128 epoch_t last_epoch_clean = 0;
1129 eversion_t source_version;
1130 eversion_t target_version;
1131
// Writes all six fields to bl between ENCODE_START(1, 1, ...) and
// ENCODE_FINISH. The field order must stay in step with decode() below.
9f95a23c 1132 void encode(ceph::buffer::list& bl) const {
11fdf7f2
TL
1133 ENCODE_START(1, 1, bl);
1134 encode(source_pgid, bl);
1135 encode(ready_epoch, bl);
1136 encode(last_epoch_started, bl);
1137 encode(last_epoch_clean, bl);
1138 encode(source_version, bl);
1139 encode(target_version, bl);
1140 ENCODE_FINISH(bl);
1141 }
// Reads the six fields back from p, in the same order encode() wrote them.
9f95a23c 1142 void decode(ceph::buffer::list::const_iterator& p) {
11fdf7f2
TL
1143 DECODE_START(1, p);
1144 decode(source_pgid, p);
1145 decode(ready_epoch, p);
1146 decode(last_epoch_started, p);
1147 decode(last_epoch_clean, p);
1148 decode(source_version, p);
1149 decode(target_version, p);
1150 DECODE_FINISH(p);
1151 }
// Emits every field to the formatter: the pgid and the two versions through
// their stream operators, the three epochs as unsigned integers.
9f95a23c 1152 void dump(ceph::Formatter *f) const {
11fdf7f2
TL
1153 f->dump_stream("source_pgid") << source_pgid;
1154 f->dump_unsigned("ready_epoch", ready_epoch);
1155 f->dump_unsigned("last_epoch_started", last_epoch_started);
1156 f->dump_unsigned("last_epoch_clean", last_epoch_clean);
1157 f->dump_stream("source_version") << source_version;
1158 f->dump_stream("target_version") << target_version;
1159 }
1160};
1161WRITE_CLASS_ENCODER(pg_merge_meta_t)
7c673cae
FG
1162
1163/*
1164 * pg_pool
1165 */
1166struct pg_pool_t {
c07f9fc5
FG
1167 static const char *APPLICATION_NAME_CEPHFS;
1168 static const char *APPLICATION_NAME_RBD;
1169 static const char *APPLICATION_NAME_RGW;
1170
7c673cae
FG
1171 enum {
1172 TYPE_REPLICATED = 1, // replication
1173 //TYPE_RAID4 = 2, // raid4 (never implemented)
1174 TYPE_ERASURE = 3, // erasure-coded
1175 };
11fdf7f2 1176 static std::string_view get_type_name(int t) {
7c673cae
FG
1177 switch (t) {
1178 case TYPE_REPLICATED: return "replicated";
1179 //case TYPE_RAID4: return "raid4";
1180 case TYPE_ERASURE: return "erasure";
1181 default: return "???";
1182 }
1183 }
11fdf7f2 1184 std::string_view get_type_name() const {
7c673cae
FG
1185 return get_type_name(type);
1186 }
7c673cae
FG
1187
1188 enum {
1189 FLAG_HASHPSPOOL = 1<<0, // hash pg seed and pool together (instead of adding)
1190 FLAG_FULL = 1<<1, // pool is full
1191 FLAG_EC_OVERWRITES = 1<<2, // enables overwrites, once enabled, cannot be disabled
1192 FLAG_INCOMPLETE_CLONES = 1<<3, // may have incomplete clones (bc we are/were an overlay)
1193 FLAG_NODELETE = 1<<4, // pool can't be deleted
1194 FLAG_NOPGCHANGE = 1<<5, // pool's pg and pgp num can't be changed
1195 FLAG_NOSIZECHANGE = 1<<6, // pool's size and min size can't be changed
1196 FLAG_WRITE_FADVISE_DONTNEED = 1<<7, // write mode with LIBRADOS_OP_FLAG_FADVISE_DONTNEED
1197 FLAG_NOSCRUB = 1<<8, // block periodic scrub
1198 FLAG_NODEEP_SCRUB = 1<<9, // block periodic deep-scrub
11fdf7f2 1199 FLAG_FULL_QUOTA = 1<<10, // pool is currently running out of quota, will set FLAG_FULL too
3efd9988
FG
1200 FLAG_NEARFULL = 1<<11, // pool is nearfull
1201 FLAG_BACKFILLFULL = 1<<12, // pool is backfillfull
11fdf7f2
TL
1202 FLAG_SELFMANAGED_SNAPS = 1<<13, // pool uses selfmanaged snaps
1203 FLAG_POOL_SNAPS = 1<<14, // pool has pool snaps
1204 FLAG_CREATING = 1<<15, // initial pool PGs are being created
7c673cae
FG
1205 };
1206
1207 static const char *get_flag_name(int f) {
1208 switch (f) {
1209 case FLAG_HASHPSPOOL: return "hashpspool";
1210 case FLAG_FULL: return "full";
1211 case FLAG_EC_OVERWRITES: return "ec_overwrites";
1212 case FLAG_INCOMPLETE_CLONES: return "incomplete_clones";
1213 case FLAG_NODELETE: return "nodelete";
1214 case FLAG_NOPGCHANGE: return "nopgchange";
1215 case FLAG_NOSIZECHANGE: return "nosizechange";
1216 case FLAG_WRITE_FADVISE_DONTNEED: return "write_fadvise_dontneed";
1217 case FLAG_NOSCRUB: return "noscrub";
1218 case FLAG_NODEEP_SCRUB: return "nodeep-scrub";
11fdf7f2 1219 case FLAG_FULL_QUOTA: return "full_quota";
3efd9988
FG
1220 case FLAG_NEARFULL: return "nearfull";
1221 case FLAG_BACKFILLFULL: return "backfillfull";
11fdf7f2
TL
1222 case FLAG_SELFMANAGED_SNAPS: return "selfmanaged_snaps";
1223 case FLAG_POOL_SNAPS: return "pool_snaps";
1224 case FLAG_CREATING: return "creating";
7c673cae
FG
1225 default: return "???";
1226 }
1227 }
9f95a23c
TL
1228 static std::string get_flags_string(uint64_t f) {
1229 std::string s;
7c673cae
FG
1230 for (unsigned n=0; f && n<64; ++n) {
1231 if (f & (1ull << n)) {
1232 if (s.length())
1233 s += ",";
1234 s += get_flag_name(1ull << n);
1235 }
1236 }
1237 return s;
1238 }
9f95a23c 1239 std::string get_flags_string() const {
7c673cae
FG
1240 return get_flags_string(flags);
1241 }
9f95a23c 1242 static uint64_t get_flag_by_name(const std::string& name) {
7c673cae
FG
1243 if (name == "hashpspool")
1244 return FLAG_HASHPSPOOL;
1245 if (name == "full")
1246 return FLAG_FULL;
1247 if (name == "ec_overwrites")
1248 return FLAG_EC_OVERWRITES;
1249 if (name == "incomplete_clones")
1250 return FLAG_INCOMPLETE_CLONES;
1251 if (name == "nodelete")
1252 return FLAG_NODELETE;
1253 if (name == "nopgchange")
1254 return FLAG_NOPGCHANGE;
1255 if (name == "nosizechange")
1256 return FLAG_NOSIZECHANGE;
1257 if (name == "write_fadvise_dontneed")
1258 return FLAG_WRITE_FADVISE_DONTNEED;
1259 if (name == "noscrub")
1260 return FLAG_NOSCRUB;
1261 if (name == "nodeep-scrub")
1262 return FLAG_NODEEP_SCRUB;
11fdf7f2
TL
1263 if (name == "full_quota")
1264 return FLAG_FULL_QUOTA;
3efd9988
FG
1265 if (name == "nearfull")
1266 return FLAG_NEARFULL;
1267 if (name == "backfillfull")
1268 return FLAG_BACKFILLFULL;
11fdf7f2
TL
1269 if (name == "selfmanaged_snaps")
1270 return FLAG_SELFMANAGED_SNAPS;
1271 if (name == "pool_snaps")
1272 return FLAG_POOL_SNAPS;
1273 if (name == "creating")
1274 return FLAG_CREATING;
7c673cae
FG
1275 return 0;
1276 }
1277
1278 /// converts the acting/up vector to a set of pg shards
9f95a23c 1279 void convert_to_pg_shards(const std::vector<int> &from, std::set<pg_shard_t>* to) const;
7c673cae
FG
1280
1281 typedef enum {
1282 CACHEMODE_NONE = 0, ///< no caching
1283 CACHEMODE_WRITEBACK = 1, ///< write to cache, flush later
1284 CACHEMODE_FORWARD = 2, ///< forward if not in cache
1285 CACHEMODE_READONLY = 3, ///< handle reads, forward writes [not strongly consistent]
1286 CACHEMODE_READFORWARD = 4, ///< forward reads, write to cache flush later
1287 CACHEMODE_READPROXY = 5, ///< proxy reads, write to cache flush later
1288 CACHEMODE_PROXY = 6, ///< proxy if not in cache
1289 } cache_mode_t;
1290 static const char *get_cache_mode_name(cache_mode_t m) {
1291 switch (m) {
1292 case CACHEMODE_NONE: return "none";
1293 case CACHEMODE_WRITEBACK: return "writeback";
1294 case CACHEMODE_FORWARD: return "forward";
1295 case CACHEMODE_READONLY: return "readonly";
1296 case CACHEMODE_READFORWARD: return "readforward";
1297 case CACHEMODE_READPROXY: return "readproxy";
1298 case CACHEMODE_PROXY: return "proxy";
1299 default: return "unknown";
1300 }
1301 }
9f95a23c 1302 static cache_mode_t get_cache_mode_from_str(const std::string& s) {
7c673cae
FG
1303 if (s == "none")
1304 return CACHEMODE_NONE;
1305 if (s == "writeback")
1306 return CACHEMODE_WRITEBACK;
1307 if (s == "forward")
1308 return CACHEMODE_FORWARD;
1309 if (s == "readonly")
1310 return CACHEMODE_READONLY;
1311 if (s == "readforward")
1312 return CACHEMODE_READFORWARD;
1313 if (s == "readproxy")
1314 return CACHEMODE_READPROXY;
1315 if (s == "proxy")
1316 return CACHEMODE_PROXY;
1317 return (cache_mode_t)-1;
1318 }
1319 const char *get_cache_mode_name() const {
1320 return get_cache_mode_name(cache_mode);
1321 }
1322 bool cache_mode_requires_hit_set() const {
1323 switch (cache_mode) {
1324 case CACHEMODE_NONE:
1325 case CACHEMODE_FORWARD:
1326 case CACHEMODE_READONLY:
1327 case CACHEMODE_PROXY:
1328 return false;
1329 case CACHEMODE_WRITEBACK:
1330 case CACHEMODE_READFORWARD:
1331 case CACHEMODE_READPROXY:
1332 return true;
1333 default:
11fdf7f2
TL
1334 ceph_abort_msg("implement me");
1335 }
1336 }
1337
9f95a23c
TL
1338 enum class pg_autoscale_mode_t : uint8_t {
1339 OFF = 0,
1340 WARN = 1,
1341 ON = 2,
1342 UNKNOWN = UINT8_MAX,
11fdf7f2 1343 };
9f95a23c 1344 static const char *get_pg_autoscale_mode_name(pg_autoscale_mode_t m) {
11fdf7f2 1345 switch (m) {
9f95a23c
TL
1346 case pg_autoscale_mode_t::OFF: return "off";
1347 case pg_autoscale_mode_t::ON: return "on";
1348 case pg_autoscale_mode_t::WARN: return "warn";
11fdf7f2
TL
1349 default: return "???";
1350 }
1351 }
9f95a23c 1352 static pg_autoscale_mode_t get_pg_autoscale_mode_by_name(const std::string& m) {
11fdf7f2 1353 if (m == "off") {
9f95a23c 1354 return pg_autoscale_mode_t::OFF;
11fdf7f2
TL
1355 }
1356 if (m == "warn") {
9f95a23c 1357 return pg_autoscale_mode_t::WARN;
11fdf7f2
TL
1358 }
1359 if (m == "on") {
9f95a23c 1360 return pg_autoscale_mode_t::ON;
7c673cae 1361 }
9f95a23c 1362 return pg_autoscale_mode_t::UNKNOWN;
7c673cae
FG
1363 }
1364
11fdf7f2 1365 utime_t create_time;
9f95a23c
TL
1366 uint64_t flags = 0; ///< FLAG_*
1367 __u8 type = 0; ///< TYPE_*
1368 __u8 size = 0, min_size = 0; ///< number of osds in each pg
1369 __u8 crush_rule = 0; ///< crush placement rule
1370 __u8 object_hash = 0; ///< hash mapping object name to ps
1371 pg_autoscale_mode_t pg_autoscale_mode = pg_autoscale_mode_t::UNKNOWN;
1372
7c673cae 1373private:
11fdf7f2
TL
1374 __u32 pg_num = 0, pgp_num = 0; ///< number of pgs
1375 __u32 pg_num_pending = 0; ///< pg_num we are about to merge down to
1376 __u32 pg_num_target = 0; ///< pg_num we should converge toward
1377 __u32 pgp_num_target = 0; ///< pgp_num we should converge toward
7c673cae
FG
1378
1379public:
9f95a23c
TL
1380 std::map<std::string, std::string> properties; ///< OBSOLETE
1381 std::string erasure_code_profile; ///< name of the erasure code profile in OSDMap
1382 epoch_t last_change = 0; ///< most recent epoch changed, excluding snapshot changes
11fdf7f2
TL
1383
1384 /// last epoch that forced clients to resend
1385 epoch_t last_force_op_resend = 0;
1386 /// last epoch that forced clients to resend (pre-nautilus clients only)
1387 epoch_t last_force_op_resend_prenautilus = 0;
7c673cae 1388 /// last epoch that forced clients to resend (pre-luminous clients only)
11fdf7f2
TL
1389 epoch_t last_force_op_resend_preluminous = 0;
1390
1391 /// metadata for the most recent PG merge
1392 pg_merge_meta_t last_pg_merge_meta;
1393
9f95a23c
TL
1394 snapid_t snap_seq = 0; ///< seq for per-pool snapshot
1395 epoch_t snap_epoch = 0; ///< osdmap epoch of last snap
1396 uint64_t auid = 0; ///< who owns the pg
7c673cae 1397
9f95a23c
TL
1398 uint64_t quota_max_bytes = 0; ///< maximum number of bytes for this pool
1399 uint64_t quota_max_objects = 0; ///< maximum number of objects for this pool
7c673cae
FG
1400
1401 /*
1402 * Pool snaps (global to this pool). These define a SnapContext for
1403 * the pool, unless the client manually specifies an alternate
1404 * context.
1405 */
9f95a23c 1406 std::map<snapid_t, pool_snap_info_t> snaps;
7c673cae
FG
1407 /*
1408 * Alternatively, if we are defining non-pool snaps (e.g. via the
1409 * Ceph MDS), we must track @removed_snaps (since @snaps is not
1410 * used). Snaps and removed_snaps are to be used exclusive of each
1411 * other!
1412 */
1413 interval_set<snapid_t> removed_snaps;
1414
9f95a23c 1415 unsigned pg_num_mask = 0, pgp_num_mask = 0;
7c673cae 1416
9f95a23c
TL
1417 std::set<uint64_t> tiers; ///< pools that are tiers of us
1418 int64_t tier_of = -1; ///< pool for which we are a tier
7c673cae 1419 // Note that write wins for read+write ops
9f95a23c
TL
1420 int64_t read_tier = -1; ///< pool/tier for objecter to direct reads to
1421 int64_t write_tier = -1; ///< pool/tier for objecter to direct writes to
1422 cache_mode_t cache_mode = CACHEMODE_NONE; ///< cache pool mode
7c673cae
FG
1423
1424 bool is_tier() const { return tier_of >= 0; }
1425 bool has_tiers() const { return !tiers.empty(); }
1426 void clear_tier() {
1427 tier_of = -1;
1428 clear_read_tier();
1429 clear_write_tier();
1430 clear_tier_tunables();
1431 }
1432 bool has_read_tier() const { return read_tier >= 0; }
1433 void clear_read_tier() { read_tier = -1; }
1434 bool has_write_tier() const { return write_tier >= 0; }
1435 void clear_write_tier() { write_tier = -1; }
1436 void clear_tier_tunables() {
1437 if (cache_mode != CACHEMODE_NONE)
1438 flags |= FLAG_INCOMPLETE_CLONES;
1439 cache_mode = CACHEMODE_NONE;
1440
1441 target_max_bytes = 0;
1442 target_max_objects = 0;
1443 cache_target_dirty_ratio_micro = 0;
1444 cache_target_dirty_high_ratio_micro = 0;
1445 cache_target_full_ratio_micro = 0;
1446 hit_set_params = HitSet::Params();
1447 hit_set_period = 0;
1448 hit_set_count = 0;
1449 hit_set_grade_decay_rate = 0;
1450 hit_set_search_last_n = 0;
1451 grade_table.resize(0);
1452 }
1453
9f95a23c
TL
1454 uint64_t target_max_bytes = 0; ///< tiering: target max pool size
1455 uint64_t target_max_objects = 0; ///< tiering: target max pool size
7c673cae 1456
9f95a23c
TL
1457 uint32_t cache_target_dirty_ratio_micro = 0; ///< cache: fraction of target to leave dirty
1458 uint32_t cache_target_dirty_high_ratio_micro = 0; ///< cache: fraction of target to flush with high speed
1459 uint32_t cache_target_full_ratio_micro = 0; ///< cache: fraction of target to fill before we evict in earnest
7c673cae 1460
9f95a23c
TL
1461 uint32_t cache_min_flush_age = 0; ///< minimum age (seconds) before we can flush
1462 uint32_t cache_min_evict_age = 0; ///< minimum age (seconds) before we can evict
7c673cae
FG
1463
1464 HitSet::Params hit_set_params; ///< The HitSet params to use on this pool
9f95a23c
TL
1465 uint32_t hit_set_period = 0; ///< periodicity of HitSet segments (seconds)
1466 uint32_t hit_set_count = 0; ///< number of periods to retain
1467 bool use_gmt_hitset = true; ///< use gmt to name the hitset archive object
1468 uint32_t min_read_recency_for_promote = 0; ///< minimum number of HitSet to check before promote on read
1469 uint32_t min_write_recency_for_promote = 0; ///< minimum number of HitSet to check before promote on write
1470 uint32_t hit_set_grade_decay_rate = 0; ///< the current hit_set has the highest priority in an object's
1471 ///< temperature count; each following hit_set's priority decays
1472 ///< by this parameter relative to the previous hit_set
1473 uint32_t hit_set_search_last_n = 0; ///< accumulate at most N hit_sets for temperature
1474
1475 uint32_t stripe_width = 0; ///< erasure coded stripe size in bytes
1476
1477 uint64_t expected_num_objects = 0; ///< expected number of objects on this pool, a value of 0 indicates
1478 ///< user does not specify any expected value
1479 bool fast_read = false; ///< whether turn on fast read on the pool or not
7c673cae
FG
1480
1481 pool_opts_t opts; ///< options
1482
11fdf7f2
TL
1483 typedef enum {
1484 TYPE_FINGERPRINT_NONE = 0,
1485 TYPE_FINGERPRINT_SHA1 = 1,
9f95a23c
TL
1486 TYPE_FINGERPRINT_SHA256 = 2,
1487 TYPE_FINGERPRINT_SHA512 = 3,
11fdf7f2 1488 } fingerprint_t;
9f95a23c 1489 static fingerprint_t get_fingerprint_from_str(const std::string& s) {
11fdf7f2
TL
1490 if (s == "none")
1491 return TYPE_FINGERPRINT_NONE;
1492 if (s == "sha1")
1493 return TYPE_FINGERPRINT_SHA1;
9f95a23c
TL
1494 if (s == "sha256")
1495 return TYPE_FINGERPRINT_SHA256;
1496 if (s == "sha512")
1497 return TYPE_FINGERPRINT_SHA512;
11fdf7f2
TL
1498 return (fingerprint_t)-1;
1499 }
1500 const fingerprint_t get_fingerprint_type() const {
9f95a23c 1501 std::string fp_str;
11fdf7f2
TL
1502 opts.get(pool_opts_t::FINGERPRINT_ALGORITHM, &fp_str);
1503 return get_fingerprint_from_str(fp_str);
1504 }
1505 const char *get_fingerprint_name() const {
9f95a23c 1506 std::string fp_str;
11fdf7f2
TL
1507 fingerprint_t fp_t;
1508 opts.get(pool_opts_t::FINGERPRINT_ALGORITHM, &fp_str);
1509 fp_t = get_fingerprint_from_str(fp_str);
1510 return get_fingerprint_name(fp_t);
1511 }
1512 static const char *get_fingerprint_name(fingerprint_t m) {
1513 switch (m) {
1514 case TYPE_FINGERPRINT_NONE: return "none";
1515 case TYPE_FINGERPRINT_SHA1: return "sha1";
9f95a23c
TL
1516 case TYPE_FINGERPRINT_SHA256: return "sha256";
1517 case TYPE_FINGERPRINT_SHA512: return "sha512";
11fdf7f2
TL
1518 default: return "unknown";
1519 }
1520 }
1521
c07f9fc5 1522 /// application -> key/value metadata
9f95a23c 1523 std::map<std::string, std::map<std::string, std::string>> application_metadata;
c07f9fc5 1524
7c673cae 1525private:
9f95a23c 1526 std::vector<uint32_t> grade_table;
7c673cae
FG
1527
1528public:
1529 uint32_t get_grade(unsigned i) const {
1530 if (grade_table.size() <= i)
1531 return 0;
1532 return grade_table[i];
1533 }
1534 void calc_grade_table() {
1535 unsigned v = 1000000;
1536 grade_table.resize(hit_set_count);
1537 for (unsigned i = 0; i < hit_set_count; i++) {
1538 v = v * (1 - (hit_set_grade_decay_rate / 100.0));
1539 grade_table[i] = v;
1540 }
1541 }
1542
9f95a23c 1543 pg_pool_t() = default;
7c673cae 1544
9f95a23c 1545 void dump(ceph::Formatter *f) const;
7c673cae 1546
11fdf7f2 1547 const utime_t &get_create_time() const { return create_time; }
7c673cae
FG
1548 uint64_t get_flags() const { return flags; }
1549 bool has_flag(uint64_t f) const { return flags & f; }
1550 void set_flag(uint64_t f) { flags |= f; }
1551 void unset_flag(uint64_t f) { flags &= ~f; }
1552
7c673cae 1553 bool require_rollback() const {
11fdf7f2 1554 return is_erasure();
7c673cae
FG
1555 }
1556
1557 /// true if incomplete clones may be present
1558 bool allow_incomplete_clones() const {
1559 return cache_mode != CACHEMODE_NONE || has_flag(FLAG_INCOMPLETE_CLONES);
1560 }
1561
1562 unsigned get_type() const { return type; }
1563 unsigned get_size() const { return size; }
1564 unsigned get_min_size() const { return min_size; }
31f18b77 1565 int get_crush_rule() const { return crush_rule; }
7c673cae
FG
1566 int get_object_hash() const { return object_hash; }
1567 const char *get_object_hash_name() const {
1568 return ceph_str_hash_name(get_object_hash());
1569 }
1570 epoch_t get_last_change() const { return last_change; }
1571 epoch_t get_last_force_op_resend() const { return last_force_op_resend; }
11fdf7f2
TL
1572 epoch_t get_last_force_op_resend_prenautilus() const {
1573 return last_force_op_resend_prenautilus;
1574 }
7c673cae
FG
1575 epoch_t get_last_force_op_resend_preluminous() const {
1576 return last_force_op_resend_preluminous;
1577 }
1578 epoch_t get_snap_epoch() const { return snap_epoch; }
1579 snapid_t get_snap_seq() const { return snap_seq; }
1580 uint64_t get_auid() const { return auid; }
7c673cae
FG
1581
1582 void set_snap_seq(snapid_t s) { snap_seq = s; }
1583 void set_snap_epoch(epoch_t e) { snap_epoch = e; }
1584
1585 void set_stripe_width(uint32_t s) { stripe_width = s; }
1586 uint32_t get_stripe_width() const { return stripe_width; }
1587
1588 bool is_replicated() const { return get_type() == TYPE_REPLICATED; }
1589 bool is_erasure() const { return get_type() == TYPE_ERASURE; }
1590
1591 bool supports_omap() const {
1592 return !(get_type() == TYPE_ERASURE);
1593 }
1594
1595 bool requires_aligned_append() const {
1596 return is_erasure() && !has_flag(FLAG_EC_OVERWRITES);
1597 }
1598 uint64_t required_alignment() const { return stripe_width; }
1599
1600 bool allows_ecoverwrites() const {
1601 return has_flag(FLAG_EC_OVERWRITES);
1602 }
1603
1604 bool can_shift_osds() const {
1605 switch (get_type()) {
1606 case TYPE_REPLICATED:
1607 return true;
1608 case TYPE_ERASURE:
1609 return false;
1610 default:
11fdf7f2 1611 ceph_abort_msg("unhandled pool type");
7c673cae
FG
1612 }
1613 }
1614
1615 unsigned get_pg_num() const { return pg_num; }
1616 unsigned get_pgp_num() const { return pgp_num; }
11fdf7f2
TL
1617 unsigned get_pg_num_target() const { return pg_num_target; }
1618 unsigned get_pgp_num_target() const { return pgp_num_target; }
1619 unsigned get_pg_num_pending() const { return pg_num_pending; }
7c673cae
FG
1620
1621 unsigned get_pg_num_mask() const { return pg_num_mask; }
1622 unsigned get_pgp_num_mask() const { return pgp_num_mask; }
1623
1624 // if pg_num is not a power of two, pgs are not equally sized.
1625 // return, for a given pg, the fraction (denominator) of the total
1626 // pool size that it represents.
1627 unsigned get_pg_num_divisor(pg_t pgid) const;
1628
11fdf7f2
TL
1629 bool is_pending_merge(pg_t pgid, bool *target) const;
1630
7c673cae
FG
1631 void set_pg_num(int p) {
1632 pg_num = p;
11fdf7f2 1633 pg_num_pending = p;
7c673cae
FG
1634 calc_pg_masks();
1635 }
1636 void set_pgp_num(int p) {
1637 pgp_num = p;
1638 calc_pg_masks();
1639 }
11fdf7f2
TL
1640 void set_pg_num_pending(int p) {
1641 pg_num_pending = p;
1642 calc_pg_masks();
1643 }
1644 void set_pg_num_target(int p) {
1645 pg_num_target = p;
1646 }
1647 void set_pgp_num_target(int p) {
1648 pgp_num_target = p;
1649 }
1650 void dec_pg_num(pg_t source_pgid,
1651 epoch_t ready_epoch,
1652 eversion_t source_version,
1653 eversion_t target_version,
1654 epoch_t last_epoch_started,
1655 epoch_t last_epoch_clean) {
1656 --pg_num;
1657 last_pg_merge_meta.source_pgid = source_pgid;
1658 last_pg_merge_meta.ready_epoch = ready_epoch;
1659 last_pg_merge_meta.source_version = source_version;
1660 last_pg_merge_meta.target_version = target_version;
1661 last_pg_merge_meta.last_epoch_started = last_epoch_started;
1662 last_pg_merge_meta.last_epoch_clean = last_epoch_clean;
1663 calc_pg_masks();
1664 }
7c673cae
FG
1665
1666 void set_quota_max_bytes(uint64_t m) {
1667 quota_max_bytes = m;
1668 }
1669 uint64_t get_quota_max_bytes() {
1670 return quota_max_bytes;
1671 }
1672
1673 void set_quota_max_objects(uint64_t m) {
1674 quota_max_objects = m;
1675 }
1676 uint64_t get_quota_max_objects() {
1677 return quota_max_objects;
1678 }
1679
1680 void set_last_force_op_resend(uint64_t t) {
1681 last_force_op_resend = t;
11fdf7f2 1682 last_force_op_resend_prenautilus = t;
7c673cae
FG
1683 last_force_op_resend_preluminous = t;
1684 }
1685
1686 void calc_pg_masks();
1687
1688 /*
1689 * we have two snap modes:
1690 * - pool global snaps
1691 * - snap existence/non-existence defined by snaps[] and snap_seq
1692 * - user managed snaps
1693 * - removal governed by removed_snaps
1694 *
1695 * we know which mode we're using based on whether removed_snaps is empty.
1696 * If nothing has been created, both functions report false.
1697 */
1698 bool is_pool_snaps_mode() const;
1699 bool is_unmanaged_snaps_mode() const;
1700 bool is_removed_snap(snapid_t s) const;
1701
7c673cae
FG
1702 snapid_t snap_exists(const char *s) const;
1703 void add_snap(const char *n, utime_t stamp);
9f95a23c 1704 uint64_t add_unmanaged_snap(bool preoctopus_compat);
7c673cae 1705 void remove_snap(snapid_t s);
9f95a23c 1706 void remove_unmanaged_snap(snapid_t s, bool preoctopus_compat);
7c673cae
FG
1707
1708 SnapContext get_snap_context() const;
1709
1710 /// hash a object name+namespace key to a hash position
9f95a23c 1711 uint32_t hash_key(const std::string& key, const std::string& ns) const;
7c673cae
FG
1712
1713 /// round a hash position down to a pg num
1714 uint32_t raw_hash_to_pg(uint32_t v) const;
1715
1716 /*
1717 * map a raw pg (with full precision ps) into an actual pg, for storage
1718 */
1719 pg_t raw_pg_to_pg(pg_t pg) const;
1720
1721 /*
1722 * map raw pg (full precision ps) into a placement seed. include
1723 * pool id in that value so that different pools don't use the same
1724 * seeds.
1725 */
1726 ps_t raw_pg_to_pps(pg_t pg) const;
1727
1728 /// choose a random hash position within a pg
1729 uint32_t get_random_pg_position(pg_t pgid, uint32_t seed) const;
1730
9f95a23c
TL
1731 void encode(ceph::buffer::list& bl, uint64_t features) const;
1732 void decode(ceph::buffer::list::const_iterator& bl);
7c673cae 1733
9f95a23c 1734 static void generate_test_instances(std::list<pg_pool_t*>& o);
7c673cae
FG
1735};
1736WRITE_CLASS_ENCODER_FEATURES(pg_pool_t)
1737
9f95a23c 1738std::ostream& operator<<(std::ostream& out, const pg_pool_t& p);
7c673cae
FG
1739
1740
1741/**
1742 * a summation of object stats
1743 *
1744 * This is just a container for object stats; we don't know what for.
1745 *
1746 * If you add members to object_stat_sum_t, you should make sure there is
1747 * no padding between these members.
1748 * You should also modify the padding_check function.
1749
1750 */
1751struct object_stat_sum_t {
1752 /**************************************************************************
1753 * WARNING: be sure to update operator==, floor, and split when
1754 * adding/removing fields!
1755 **************************************************************************/
1756 int64_t num_bytes; // in bytes
1757 int64_t num_objects;
1758 int64_t num_object_clones;
1759 int64_t num_object_copies; // num_objects * num_replicas
1760 int64_t num_objects_missing_on_primary;
1761 int64_t num_objects_degraded;
1762 int64_t num_objects_unfound;
1763 int64_t num_rd;
1764 int64_t num_rd_kb;
1765 int64_t num_wr;
1766 int64_t num_wr_kb;
1767 int64_t num_scrub_errors; // total deep and shallow scrub errors
1768 int64_t num_objects_recovered;
1769 int64_t num_bytes_recovered;
1770 int64_t num_keys_recovered;
1771 int64_t num_shallow_scrub_errors;
1772 int64_t num_deep_scrub_errors;
1773 int64_t num_objects_dirty;
1774 int64_t num_whiteouts;
1775 int64_t num_objects_omap;
1776 int64_t num_objects_hit_set_archive;
1777 int64_t num_objects_misplaced;
1778 int64_t num_bytes_hit_set_archive;
1779 int64_t num_flush;
1780 int64_t num_flush_kb;
1781 int64_t num_evict;
1782 int64_t num_evict_kb;
1783 int64_t num_promote;
1784 int32_t num_flush_mode_high; // 1 when in high flush mode, otherwise 0
1785 int32_t num_flush_mode_low; // 1 when in low flush mode, otherwise 0
1786 int32_t num_evict_mode_some; // 1 when in evict some mode, otherwise 0
1787 int32_t num_evict_mode_full; // 1 when in evict full mode, otherwise 0
1788 int64_t num_objects_pinned;
1789 int64_t num_objects_missing;
1790 int64_t num_legacy_snapsets; ///< upper bound on pre-luminous-style SnapSets
28e407b8 1791 int64_t num_large_omap_objects = 0;
11fdf7f2
TL
1792 int64_t num_objects_manifest = 0;
1793 int64_t num_omap_bytes = 0;
1794 int64_t num_omap_keys = 0;
1795 int64_t num_objects_repaired = 0;
7c673cae
FG
1796
1797 object_stat_sum_t()
1798 : num_bytes(0),
1799 num_objects(0), num_object_clones(0), num_object_copies(0),
1800 num_objects_missing_on_primary(0), num_objects_degraded(0),
1801 num_objects_unfound(0),
1802 num_rd(0), num_rd_kb(0), num_wr(0), num_wr_kb(0),
1803 num_scrub_errors(0),
1804 num_objects_recovered(0),
1805 num_bytes_recovered(0),
1806 num_keys_recovered(0),
1807 num_shallow_scrub_errors(0),
1808 num_deep_scrub_errors(0),
1809 num_objects_dirty(0),
1810 num_whiteouts(0),
1811 num_objects_omap(0),
1812 num_objects_hit_set_archive(0),
1813 num_objects_misplaced(0),
1814 num_bytes_hit_set_archive(0),
1815 num_flush(0),
1816 num_flush_kb(0),
1817 num_evict(0),
1818 num_evict_kb(0),
1819 num_promote(0),
1820 num_flush_mode_high(0), num_flush_mode_low(0),
1821 num_evict_mode_some(0), num_evict_mode_full(0),
1822 num_objects_pinned(0),
1823 num_objects_missing(0),
1824 num_legacy_snapsets(0)
1825 {}
1826
1827 void floor(int64_t f) {
1828#define FLOOR(x) if (x < f) x = f
1829 FLOOR(num_bytes);
1830 FLOOR(num_objects);
1831 FLOOR(num_object_clones);
1832 FLOOR(num_object_copies);
1833 FLOOR(num_objects_missing_on_primary);
1834 FLOOR(num_objects_missing);
1835 FLOOR(num_objects_degraded);
1836 FLOOR(num_objects_misplaced);
1837 FLOOR(num_objects_unfound);
1838 FLOOR(num_rd);
1839 FLOOR(num_rd_kb);
1840 FLOOR(num_wr);
1841 FLOOR(num_wr_kb);
28e407b8 1842 FLOOR(num_large_omap_objects);
11fdf7f2
TL
1843 FLOOR(num_objects_manifest);
1844 FLOOR(num_omap_bytes);
1845 FLOOR(num_omap_keys);
7c673cae
FG
1846 FLOOR(num_shallow_scrub_errors);
1847 FLOOR(num_deep_scrub_errors);
94b18763 1848 num_scrub_errors = num_shallow_scrub_errors + num_deep_scrub_errors;
7c673cae
FG
1849 FLOOR(num_objects_recovered);
1850 FLOOR(num_bytes_recovered);
1851 FLOOR(num_keys_recovered);
1852 FLOOR(num_objects_dirty);
1853 FLOOR(num_whiteouts);
1854 FLOOR(num_objects_omap);
1855 FLOOR(num_objects_hit_set_archive);
1856 FLOOR(num_bytes_hit_set_archive);
1857 FLOOR(num_flush);
1858 FLOOR(num_flush_kb);
1859 FLOOR(num_evict);
1860 FLOOR(num_evict_kb);
1861 FLOOR(num_promote);
1862 FLOOR(num_flush_mode_high);
1863 FLOOR(num_flush_mode_low);
1864 FLOOR(num_evict_mode_some);
1865 FLOOR(num_evict_mode_full);
1866 FLOOR(num_objects_pinned);
1867 FLOOR(num_legacy_snapsets);
11fdf7f2 1868 FLOOR(num_objects_repaired);
7c673cae
FG
1869#undef FLOOR
1870 }
1871
9f95a23c 1872 void split(std::vector<object_stat_sum_t> &out) const {
7c673cae
FG
1873#define SPLIT(PARAM) \
1874 for (unsigned i = 0; i < out.size(); ++i) { \
1875 out[i].PARAM = PARAM / out.size(); \
1876 if (i < (PARAM % out.size())) { \
1877 out[i].PARAM++; \
1878 } \
1879 }
1880#define SPLIT_PRESERVE_NONZERO(PARAM) \
1881 for (unsigned i = 0; i < out.size(); ++i) { \
1882 if (PARAM) \
1883 out[i].PARAM = 1 + PARAM / out.size(); \
1884 else \
1885 out[i].PARAM = 0; \
1886 }
1887
1888 SPLIT(num_bytes);
1889 SPLIT(num_objects);
1890 SPLIT(num_object_clones);
1891 SPLIT(num_object_copies);
1892 SPLIT(num_objects_missing_on_primary);
1893 SPLIT(num_objects_missing);
1894 SPLIT(num_objects_degraded);
1895 SPLIT(num_objects_misplaced);
1896 SPLIT(num_objects_unfound);
1897 SPLIT(num_rd);
1898 SPLIT(num_rd_kb);
1899 SPLIT(num_wr);
1900 SPLIT(num_wr_kb);
11fdf7f2
TL
1901 SPLIT(num_large_omap_objects);
1902 SPLIT(num_objects_manifest);
1903 SPLIT(num_omap_bytes);
1904 SPLIT(num_omap_keys);
1905 SPLIT(num_objects_repaired);
94b18763
FG
1906 SPLIT_PRESERVE_NONZERO(num_shallow_scrub_errors);
1907 SPLIT_PRESERVE_NONZERO(num_deep_scrub_errors);
1908 for (unsigned i = 0; i < out.size(); ++i) {
1909 out[i].num_scrub_errors = out[i].num_shallow_scrub_errors +
1910 out[i].num_deep_scrub_errors;
1911 }
7c673cae
FG
1912 SPLIT(num_objects_recovered);
1913 SPLIT(num_bytes_recovered);
1914 SPLIT(num_keys_recovered);
1915 SPLIT(num_objects_dirty);
1916 SPLIT(num_whiteouts);
1917 SPLIT(num_objects_omap);
1918 SPLIT(num_objects_hit_set_archive);
1919 SPLIT(num_bytes_hit_set_archive);
1920 SPLIT(num_flush);
1921 SPLIT(num_flush_kb);
1922 SPLIT(num_evict);
1923 SPLIT(num_evict_kb);
1924 SPLIT(num_promote);
1925 SPLIT(num_flush_mode_high);
1926 SPLIT(num_flush_mode_low);
1927 SPLIT(num_evict_mode_some);
1928 SPLIT(num_evict_mode_full);
1929 SPLIT(num_objects_pinned);
1930 SPLIT_PRESERVE_NONZERO(num_legacy_snapsets);
1931#undef SPLIT
1932#undef SPLIT_PRESERVE_NONZERO
1933 }
1934
1935 void clear() {
92f5a8d4 1936 // FIPS zeroization audit 20191117: this memset is not security related.
7c673cae
FG
1937 memset(this, 0, sizeof(*this));
1938 }
1939
1940 void calc_copies(int nrep) {
1941 num_object_copies = nrep * num_objects;
1942 }
1943
1944 bool is_zero() const {
1945 return mem_is_zero((char*)this, sizeof(*this));
1946 }
1947
1948 void add(const object_stat_sum_t& o);
1949 void sub(const object_stat_sum_t& o);
1950
9f95a23c 1951 void dump(ceph::Formatter *f) const;
7c673cae
FG
1952 void padding_check() {
1953 static_assert(
1954 sizeof(object_stat_sum_t) ==
1955 sizeof(num_bytes) +
1956 sizeof(num_objects) +
1957 sizeof(num_object_clones) +
1958 sizeof(num_object_copies) +
1959 sizeof(num_objects_missing_on_primary) +
1960 sizeof(num_objects_degraded) +
1961 sizeof(num_objects_unfound) +
1962 sizeof(num_rd) +
1963 sizeof(num_rd_kb) +
1964 sizeof(num_wr) +
1965 sizeof(num_wr_kb) +
1966 sizeof(num_scrub_errors) +
28e407b8 1967 sizeof(num_large_omap_objects) +
11fdf7f2
TL
1968 sizeof(num_objects_manifest) +
1969 sizeof(num_omap_bytes) +
1970 sizeof(num_omap_keys) +
1971 sizeof(num_objects_repaired) +
7c673cae
FG
1972 sizeof(num_objects_recovered) +
1973 sizeof(num_bytes_recovered) +
1974 sizeof(num_keys_recovered) +
1975 sizeof(num_shallow_scrub_errors) +
1976 sizeof(num_deep_scrub_errors) +
1977 sizeof(num_objects_dirty) +
1978 sizeof(num_whiteouts) +
1979 sizeof(num_objects_omap) +
1980 sizeof(num_objects_hit_set_archive) +
1981 sizeof(num_objects_misplaced) +
1982 sizeof(num_bytes_hit_set_archive) +
1983 sizeof(num_flush) +
1984 sizeof(num_flush_kb) +
1985 sizeof(num_evict) +
1986 sizeof(num_evict_kb) +
1987 sizeof(num_promote) +
1988 sizeof(num_flush_mode_high) +
1989 sizeof(num_flush_mode_low) +
1990 sizeof(num_evict_mode_some) +
1991 sizeof(num_evict_mode_full) +
1992 sizeof(num_objects_pinned) +
1993 sizeof(num_objects_missing) +
1994 sizeof(num_legacy_snapsets)
1995 ,
1996 "object_stat_sum_t have padding");
1997 }
9f95a23c
TL
1998 void encode(ceph::buffer::list& bl) const;
1999 void decode(ceph::buffer::list::const_iterator& bl);
2000 static void generate_test_instances(std::list<object_stat_sum_t*>& o);
7c673cae
FG
2001};
2002WRITE_CLASS_ENCODER(object_stat_sum_t)
2003
2004bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r);
2005
2006/**
2007 * a collection of object stat sums
2008 *
2009 * This is a collection of stat sums over different categories.
2010 */
2011struct object_stat_collection_t {
2012 /**************************************************************************
2013 * WARNING: be sure to update the operator== when adding/removing fields! *
2014 **************************************************************************/
2015 object_stat_sum_t sum;
2016
2017 void calc_copies(int nrep) {
2018 sum.calc_copies(nrep);
2019 }
2020
9f95a23c
TL
2021 void dump(ceph::Formatter *f) const;
2022 void encode(ceph::buffer::list& bl) const;
2023 void decode(ceph::buffer::list::const_iterator& bl);
2024 static void generate_test_instances(std::list<object_stat_collection_t*>& o);
7c673cae
FG
2025
2026 bool is_zero() const {
2027 return sum.is_zero();
2028 }
2029
2030 void clear() {
2031 sum.clear();
2032 }
2033
2034 void floor(int64_t f) {
2035 sum.floor(f);
2036 }
2037
2038 void add(const object_stat_sum_t& o) {
2039 sum.add(o);
2040 }
2041
2042 void add(const object_stat_collection_t& o) {
2043 sum.add(o.sum);
2044 }
2045 void sub(const object_stat_collection_t& o) {
2046 sum.sub(o.sum);
2047 }
2048};
2049WRITE_CLASS_ENCODER(object_stat_collection_t)
2050
2051inline bool operator==(const object_stat_collection_t& l,
2052 const object_stat_collection_t& r) {
2053 return l.sum == r.sum;
2054}
2055
2056
2057/** pg_stat
2058 * aggregate stats for a single PG.
2059 */
2060struct pg_stat_t {
2061 /**************************************************************************
2062 * WARNING: be sure to update the operator== when adding/removing fields! *
2063 **************************************************************************/
2064 eversion_t version;
2065 version_t reported_seq; // sequence number
2066 epoch_t reported_epoch; // epoch of this report
11fdf7f2 2067 uint64_t state;
7c673cae
FG
2068 utime_t last_fresh; // last reported
2069 utime_t last_change; // new state != previous state
2070 utime_t last_active; // state & PG_STATE_ACTIVE
2071 utime_t last_peered; // state & PG_STATE_ACTIVE || state & PG_STATE_PEERED
2072 utime_t last_clean; // state & PG_STATE_CLEAN
2073 utime_t last_unstale; // (state & PG_STATE_STALE) == 0
2074 utime_t last_undegraded; // (state & PG_STATE_DEGRADED) == 0
2075 utime_t last_fullsized; // (state & PG_STATE_UNDERSIZED) == 0
2076
2077 eversion_t log_start; // (log_start,version]
2078 eversion_t ondisk_log_start; // there may be more on disk
2079
2080 epoch_t created;
2081 epoch_t last_epoch_clean;
2082 pg_t parent;
2083 __u32 parent_split_bits;
2084
2085 eversion_t last_scrub;
2086 eversion_t last_deep_scrub;
2087 utime_t last_scrub_stamp;
2088 utime_t last_deep_scrub_stamp;
2089 utime_t last_clean_scrub_stamp;
2090
2091 object_stat_collection_t stats;
2092
2093 int64_t log_size;
2094 int64_t ondisk_log_size; // >= active_log_size
2095
9f95a23c
TL
2096 std::vector<int32_t> up, acting;
2097 std::vector<pg_shard_t> avail_no_missing;
2098 std::map< std::set<pg_shard_t>, int32_t > object_location_counts;
7c673cae
FG
2099 epoch_t mapping_epoch;
2100
9f95a23c 2101 std::vector<int32_t> blocked_by; ///< osds on which the pg is blocked
7c673cae 2102
11fdf7f2
TL
2103 interval_set<snapid_t> purged_snaps; ///< recently removed snaps that we've purged
2104
7c673cae
FG
2105 utime_t last_became_active;
2106 utime_t last_became_peered;
2107
2108 /// up, acting primaries
2109 int32_t up_primary;
2110 int32_t acting_primary;
2111
b32b8144
FG
2112 // snaptrimq.size() is 64bit, but let's be serious - anything over 50k is
2113 // absurd already, so cap it to 2^32 and save 4 bytes at the same time
2114 uint32_t snaptrimq_len;
2115
7c673cae
FG
2116 bool stats_invalid:1;
2117 /// true if num_objects_dirty is not accurate (because it was not
2118 /// maintained starting from pool creation)
2119 bool dirty_stats_invalid:1;
2120 bool omap_stats_invalid:1;
2121 bool hitset_stats_invalid:1;
2122 bool hitset_bytes_stats_invalid:1;
2123 bool pin_stats_invalid:1;
11fdf7f2 2124 bool manifest_stats_invalid:1;
7c673cae
FG
2125
2126 pg_stat_t()
2127 : reported_seq(0),
2128 reported_epoch(0),
2129 state(0),
2130 created(0), last_epoch_clean(0),
2131 parent_split_bits(0),
2132 log_size(0), ondisk_log_size(0),
2133 mapping_epoch(0),
2134 up_primary(-1),
2135 acting_primary(-1),
b32b8144 2136 snaptrimq_len(0),
7c673cae
FG
2137 stats_invalid(false),
2138 dirty_stats_invalid(false),
2139 omap_stats_invalid(false),
2140 hitset_stats_invalid(false),
2141 hitset_bytes_stats_invalid(false),
11fdf7f2
TL
2142 pin_stats_invalid(false),
2143 manifest_stats_invalid(false)
7c673cae
FG
2144 { }
2145
2146 epoch_t get_effective_last_epoch_clean() const {
2147 if (state & PG_STATE_CLEAN) {
2148 // we are clean as of this report, and should thus take the
2149 // reported epoch
2150 return reported_epoch;
2151 } else {
2152 return last_epoch_clean;
2153 }
2154 }
2155
9f95a23c
TL
2156 std::pair<epoch_t, version_t> get_version_pair() const {
2157 return { reported_epoch, reported_seq };
7c673cae
FG
2158 }
2159
2160 void floor(int64_t f) {
2161 stats.floor(f);
2162 if (log_size < f)
2163 log_size = f;
2164 if (ondisk_log_size < f)
2165 ondisk_log_size = f;
b32b8144
FG
2166 if (snaptrimq_len < f)
2167 snaptrimq_len = f;
7c673cae
FG
2168 }
2169
11fdf7f2
TL
2170 void add_sub_invalid_flags(const pg_stat_t& o) {
2171 // adding (or subtracting!) invalid stats render our stats invalid too
2172 stats_invalid |= o.stats_invalid;
2173 dirty_stats_invalid |= o.dirty_stats_invalid;
eafe8130 2174 omap_stats_invalid |= o.omap_stats_invalid;
11fdf7f2 2175 hitset_stats_invalid |= o.hitset_stats_invalid;
eafe8130 2176 hitset_bytes_stats_invalid |= o.hitset_bytes_stats_invalid;
11fdf7f2
TL
2177 pin_stats_invalid |= o.pin_stats_invalid;
2178 manifest_stats_invalid |= o.manifest_stats_invalid;
2179 }
7c673cae
FG
2180 void add(const pg_stat_t& o) {
2181 stats.add(o.stats);
2182 log_size += o.log_size;
2183 ondisk_log_size += o.ondisk_log_size;
11fdf7f2
TL
2184 snaptrimq_len = std::min((uint64_t)snaptrimq_len + o.snaptrimq_len,
2185 (uint64_t)(1ull << 31));
2186 add_sub_invalid_flags(o);
7c673cae
FG
2187 }
2188 void sub(const pg_stat_t& o) {
2189 stats.sub(o.stats);
2190 log_size -= o.log_size;
2191 ondisk_log_size -= o.ondisk_log_size;
b32b8144
FG
2192 if (o.snaptrimq_len < snaptrimq_len) {
2193 snaptrimq_len -= o.snaptrimq_len;
2194 } else {
2195 snaptrimq_len = 0;
2196 }
11fdf7f2 2197 add_sub_invalid_flags(o);
7c673cae
FG
2198 }
2199
2200 bool is_acting_osd(int32_t osd, bool primary) const;
9f95a23c
TL
2201 void dump(ceph::Formatter *f) const;
2202 void dump_brief(ceph::Formatter *f) const;
2203 void encode(ceph::buffer::list &bl) const;
2204 void decode(ceph::buffer::list::const_iterator &bl);
2205 static void generate_test_instances(std::list<pg_stat_t*>& o);
7c673cae
FG
2206};
2207WRITE_CLASS_ENCODER(pg_stat_t)
2208
2209bool operator==(const pg_stat_t& l, const pg_stat_t& r);
2210
11fdf7f2
TL
2211/** store_statfs_t
2212 * ObjectStore full statfs information
2213 */
2214struct store_statfs_t
2215{
2216 uint64_t total = 0; ///< Total bytes
2217 uint64_t available = 0; ///< Free bytes available
2218 uint64_t internally_reserved = 0; ///< Bytes reserved for internal purposes
2219
2220 int64_t allocated = 0; ///< Bytes allocated by the store
2221
2222 int64_t data_stored = 0; ///< Bytes actually stored by the user
2223 int64_t data_compressed = 0; ///< Bytes stored after compression
2224 int64_t data_compressed_allocated = 0; ///< Bytes allocated for compressed data
2225 int64_t data_compressed_original = 0; ///< Bytes that were compressed
2226
2227 int64_t omap_allocated = 0; ///< approx usage of omap data
2228 int64_t internal_metadata = 0; ///< approx usage of internal metadata
2229
2230 void reset() {
2231 *this = store_statfs_t();
2232 }
2233 void floor(int64_t f) {
2234#define FLOOR(x) if (int64_t(x) < f) x = f
2235 FLOOR(total);
2236 FLOOR(available);
2237 FLOOR(internally_reserved);
2238 FLOOR(allocated);
2239 FLOOR(data_stored);
2240 FLOOR(data_compressed);
2241 FLOOR(data_compressed_allocated);
2242 FLOOR(data_compressed_original);
2243
2244 FLOOR(omap_allocated);
2245 FLOOR(internal_metadata);
2246#undef FLOOR
2247 }
2248
2249 bool operator ==(const store_statfs_t& other) const;
2250 bool is_zero() const {
2251 return *this == store_statfs_t();
2252 }
2253
2254 uint64_t get_used() const {
2255 return total - available - internally_reserved;
2256 }
2257
2258 // this accumulates both actually used and statfs's internally_reserved
2259 uint64_t get_used_raw() const {
2260 return total - available;
2261 }
2262
2263 float get_used_raw_ratio() const {
2264 if (total) {
2265 return (float)get_used_raw() / (float)total;
2266 } else {
2267 return 0.0;
2268 }
2269 }
2270
2271 // helpers to ease legacy code porting
2272 uint64_t kb_avail() const {
2273 return available >> 10;
2274 }
2275 uint64_t kb() const {
2276 return total >> 10;
2277 }
2278 uint64_t kb_used() const {
2279 return (total - available - internally_reserved) >> 10;
2280 }
2281 uint64_t kb_used_raw() const {
2282 return get_used_raw() >> 10;
2283 }
2284
2285 uint64_t kb_used_data() const {
2286 return allocated >> 10;
2287 }
2288 uint64_t kb_used_omap() const {
2289 return omap_allocated >> 10;
2290 }
2291
2292 uint64_t kb_used_internal_metadata() const {
2293 return internal_metadata >> 10;
2294 }
2295
2296 void add(const store_statfs_t& o) {
2297 total += o.total;
2298 available += o.available;
2299 internally_reserved += o.internally_reserved;
2300 allocated += o.allocated;
2301 data_stored += o.data_stored;
2302 data_compressed += o.data_compressed;
2303 data_compressed_allocated += o.data_compressed_allocated;
2304 data_compressed_original += o.data_compressed_original;
2305 omap_allocated += o.omap_allocated;
2306 internal_metadata += o.internal_metadata;
2307 }
2308 void sub(const store_statfs_t& o) {
2309 total -= o.total;
2310 available -= o.available;
2311 internally_reserved -= o.internally_reserved;
2312 allocated -= o.allocated;
2313 data_stored -= o.data_stored;
2314 data_compressed -= o.data_compressed;
2315 data_compressed_allocated -= o.data_compressed_allocated;
2316 data_compressed_original -= o.data_compressed_original;
2317 omap_allocated -= o.omap_allocated;
2318 internal_metadata -= o.internal_metadata;
2319 }
9f95a23c 2320 void dump(ceph::Formatter *f) const;
11fdf7f2
TL
2321 DENC(store_statfs_t, v, p) {
2322 DENC_START(1, 1, p);
2323 denc(v.total, p);
2324 denc(v.available, p);
2325 denc(v.internally_reserved, p);
2326 denc(v.allocated, p);
2327 denc(v.data_stored, p);
2328 denc(v.data_compressed, p);
2329 denc(v.data_compressed_allocated, p);
2330 denc(v.data_compressed_original, p);
2331 denc(v.omap_allocated, p);
2332 denc(v.internal_metadata, p);
2333 DENC_FINISH(p);
2334 }
9f95a23c 2335 static void generate_test_instances(std::list<store_statfs_t*>& o);
11fdf7f2
TL
2336};
2337WRITE_CLASS_DENC(store_statfs_t)
2338
9f95a23c 2339std::ostream &operator<<(std::ostream &lhs, const store_statfs_t &rhs);
11fdf7f2
TL
2340
2341/** osd_stat
2342 * aggregate stats for an osd
2343 */
2344struct osd_stat_t {
2345 store_statfs_t statfs;
9f95a23c 2346 std::vector<int> hb_peers;
11fdf7f2
TL
2347 int32_t snap_trim_queue_len, num_snap_trimming;
2348 uint64_t num_shards_repaired;
2349
2350 pow2_hist_t op_queue_age_hist;
2351
2352 objectstore_perf_stat_t os_perf_stat;
2353 osd_alerts_t os_alerts;
2354
2355 epoch_t up_from = 0;
2356 uint64_t seq = 0;
2357
2358 uint32_t num_pgs = 0;
2359
81eedcae
TL
2360 uint32_t num_osds = 0;
2361 uint32_t num_per_pool_osds = 0;
9f95a23c 2362 uint32_t num_per_pool_omap_osds = 0;
81eedcae 2363
eafe8130
TL
2364 struct Interfaces {
2365 uint32_t last_update; // in seconds
2366 uint32_t back_pingtime[3];
2367 uint32_t back_min[3];
2368 uint32_t back_max[3];
2369 uint32_t back_last;
2370 uint32_t front_pingtime[3];
2371 uint32_t front_min[3];
2372 uint32_t front_max[3];
2373 uint32_t front_last;
2374 };
2375 map<int, Interfaces> hb_pingtime; ///< map of osd id to Interfaces
2376
11fdf7f2
TL
2377 osd_stat_t() : snap_trim_queue_len(0), num_snap_trimming(0),
2378 num_shards_repaired(0) {}
2379
2380 void add(const osd_stat_t& o) {
2381 statfs.add(o.statfs);
2382 snap_trim_queue_len += o.snap_trim_queue_len;
2383 num_snap_trimming += o.num_snap_trimming;
2384 num_shards_repaired += o.num_shards_repaired;
2385 op_queue_age_hist.add(o.op_queue_age_hist);
2386 os_perf_stat.add(o.os_perf_stat);
2387 num_pgs += o.num_pgs;
81eedcae
TL
2388 num_osds += o.num_osds;
2389 num_per_pool_osds += o.num_per_pool_osds;
9f95a23c 2390 num_per_pool_omap_osds += o.num_per_pool_omap_osds;
11fdf7f2
TL
2391 for (const auto& a : o.os_alerts) {
2392 auto& target = os_alerts[a.first];
2393 for (auto& i : a.second) {
2394 target.emplace(i.first, i.second);
2395 }
2396 }
2397 }
2398 void sub(const osd_stat_t& o) {
2399 statfs.sub(o.statfs);
2400 snap_trim_queue_len -= o.snap_trim_queue_len;
2401 num_snap_trimming -= o.num_snap_trimming;
2402 num_shards_repaired -= o.num_shards_repaired;
2403 op_queue_age_hist.sub(o.op_queue_age_hist);
2404 os_perf_stat.sub(o.os_perf_stat);
2405 num_pgs -= o.num_pgs;
81eedcae
TL
2406 num_osds -= o.num_osds;
2407 num_per_pool_osds -= o.num_per_pool_osds;
9f95a23c 2408 num_per_pool_omap_osds -= o.num_per_pool_omap_osds;
11fdf7f2
TL
2409 for (const auto& a : o.os_alerts) {
2410 auto& target = os_alerts[a.first];
2411 for (auto& i : a.second) {
2412 target.erase(i.first);
2413 }
2414 if (target.empty()) {
2415 os_alerts.erase(a.first);
2416 }
2417 }
2418 }
9f95a23c
TL
2419 void dump(ceph::Formatter *f, bool with_net = true) const;
2420 void dump_ping_time(ceph::Formatter *f) const;
2421 void encode(ceph::buffer::list &bl, uint64_t features) const;
2422 void decode(ceph::buffer::list::const_iterator &bl);
11fdf7f2
TL
2423 static void generate_test_instances(std::list<osd_stat_t*>& o);
2424};
2425WRITE_CLASS_ENCODER_FEATURES(osd_stat_t)
2426
2427inline bool operator==(const osd_stat_t& l, const osd_stat_t& r) {
2428 return l.statfs == r.statfs &&
2429 l.snap_trim_queue_len == r.snap_trim_queue_len &&
2430 l.num_snap_trimming == r.num_snap_trimming &&
2431 l.num_shards_repaired == r.num_shards_repaired &&
2432 l.hb_peers == r.hb_peers &&
2433 l.op_queue_age_hist == r.op_queue_age_hist &&
2434 l.os_perf_stat == r.os_perf_stat &&
81eedcae
TL
2435 l.num_pgs == r.num_pgs &&
2436 l.num_osds == r.num_osds &&
9f95a23c
TL
2437 l.num_per_pool_osds == r.num_per_pool_osds &&
2438 l.num_per_pool_omap_osds == r.num_per_pool_omap_osds;
11fdf7f2
TL
2439}
2440inline bool operator!=(const osd_stat_t& l, const osd_stat_t& r) {
2441 return !(l == r);
2442}
2443
9f95a23c 2444inline std::ostream& operator<<(std::ostream& out, const osd_stat_t& s) {
11fdf7f2
TL
2445 return out << "osd_stat(" << s.statfs << ", "
2446 << "peers " << s.hb_peers
2447 << " op hist " << s.op_queue_age_hist.h
2448 << ")";
2449}
2450
7c673cae
FG
2451/*
2452 * summation over an entire pool
2453 */
2454struct pool_stat_t {
2455 object_stat_collection_t stats;
11fdf7f2 2456 store_statfs_t store_stats;
7c673cae
FG
2457 int64_t log_size;
2458 int64_t ondisk_log_size; // >= active_log_size
2459 int32_t up; ///< number of up replicas or shards
2460 int32_t acting; ///< number of acting replicas or shards
11fdf7f2 2461 int32_t num_store_stats; ///< amount of store_stats accumulated
7c673cae 2462
11fdf7f2
TL
2463 pool_stat_t() : log_size(0), ondisk_log_size(0), up(0), acting(0),
2464 num_store_stats(0)
7c673cae
FG
2465 { }
2466
2467 void floor(int64_t f) {
2468 stats.floor(f);
11fdf7f2 2469 store_stats.floor(f);
7c673cae
FG
2470 if (log_size < f)
2471 log_size = f;
2472 if (ondisk_log_size < f)
2473 ondisk_log_size = f;
2474 if (up < f)
2475 up = f;
2476 if (acting < f)
2477 acting = f;
11fdf7f2
TL
2478 if (num_store_stats < f)
2479 num_store_stats = f;
2480 }
2481
2482 void add(const store_statfs_t& o) {
2483 store_stats.add(o);
2484 ++num_store_stats;
2485 }
2486 void sub(const store_statfs_t& o) {
2487 store_stats.sub(o);
2488 --num_store_stats;
7c673cae
FG
2489 }
2490
2491 void add(const pg_stat_t& o) {
2492 stats.add(o.stats);
2493 log_size += o.log_size;
2494 ondisk_log_size += o.ondisk_log_size;
2495 up += o.up.size();
2496 acting += o.acting.size();
2497 }
2498 void sub(const pg_stat_t& o) {
2499 stats.sub(o.stats);
2500 log_size -= o.log_size;
2501 ondisk_log_size -= o.ondisk_log_size;
2502 up -= o.up.size();
2503 acting -= o.acting.size();
2504 }
2505
2506 bool is_zero() const {
2507 return (stats.is_zero() &&
11fdf7f2 2508 store_stats.is_zero() &&
7c673cae
FG
2509 log_size == 0 &&
2510 ondisk_log_size == 0 &&
2511 up == 0 &&
11fdf7f2
TL
2512 acting == 0 &&
2513 num_store_stats == 0);
2514 }
2515
2516 // helper accessors to retrieve used/netto bytes depending on the
2517 // collection method: new per-pool objectstore report or legacy PG
2518 // summation at OSD.
2519 // In legacy mode used and netto values are the same. But for new per-pool
2520 // collection 'used' provides amount of space ALLOCATED at all related OSDs
2521 // and 'netto' is amount of stored user data.
9f95a23c 2522 uint64_t get_allocated_data_bytes(bool per_pool) const {
81eedcae 2523 if (per_pool) {
9f95a23c 2524 return store_stats.allocated;
11fdf7f2
TL
2525 } else {
2526 // legacy mode, use numbers from 'stats'
9f95a23c
TL
2527 return stats.sum.num_bytes + stats.sum.num_bytes_hit_set_archive;
2528 }
2529 }
2530 uint64_t get_allocated_omap_bytes(bool per_pool_omap) const {
2531 if (per_pool_omap) {
2532 return store_stats.omap_allocated;
2533 } else {
2534 // omap is not broken out by pool by nautilus bluestore; report the
2535 // scrub value. this will be imprecise in that it won't account for
2536 // any storage overhead/efficiency.
2537 return stats.sum.num_omap_bytes;
11fdf7f2 2538 }
11fdf7f2 2539 }
9f95a23c
TL
2540 uint64_t get_user_data_bytes(float raw_used_rate, ///< space amp factor
2541 bool per_pool) const {
2542 // NOTE: we need the space amp factor so that we can work backwards from
2543 // the raw utilization to the amount of data that the user actually stored.
81eedcae 2544 if (per_pool) {
9f95a23c 2545 return raw_used_rate ? store_stats.data_stored / raw_used_rate : 0;
11fdf7f2 2546 } else {
9f95a23c
TL
2547 // legacy mode, use numbers from 'stats'. note that we do NOT use the
2548 // raw_used_rate factor here because we are working from the PG stats
2549 // directly.
2550 return stats.sum.num_bytes + stats.sum.num_bytes_hit_set_archive;
2551 }
2552 }
2553 uint64_t get_user_omap_bytes(float raw_used_rate, ///< space amp factor
2554 bool per_pool_omap) const {
2555 if (per_pool_omap) {
2556 return raw_used_rate ? store_stats.omap_allocated / raw_used_rate : 0;
2557 } else {
2558 // omap usage is lazily reported during scrub; this value may lag.
2559 return stats.sum.num_omap_bytes;
11fdf7f2 2560 }
7c673cae
FG
2561 }
2562
9f95a23c
TL
2563 void dump(ceph::Formatter *f) const;
2564 void encode(ceph::buffer::list &bl, uint64_t features) const;
2565 void decode(ceph::buffer::list::const_iterator &bl);
2566 static void generate_test_instances(std::list<pool_stat_t*>& o);
7c673cae
FG
2567};
2568WRITE_CLASS_ENCODER_FEATURES(pool_stat_t)
2569
2570
2571// -----------------------------------------
2572
2573/**
2574 * pg_hit_set_info_t - information about a single recorded HitSet
2575 *
11fdf7f2 2576 * Track basic metadata about a HitSet, like the number of insertions
7c673cae
FG
2577 * and the time range it covers.
2578 */
2579struct pg_hit_set_info_t {
2580 utime_t begin, end; ///< time interval
2581 eversion_t version; ///< version this HitSet object was written
2582 bool using_gmt; ///< use gmt for creating the hit_set archive object name
2583
2584 friend bool operator==(const pg_hit_set_info_t& l,
2585 const pg_hit_set_info_t& r) {
2586 return
2587 l.begin == r.begin &&
2588 l.end == r.end &&
2589 l.version == r.version &&
2590 l.using_gmt == r.using_gmt;
2591 }
2592
2593 explicit pg_hit_set_info_t(bool using_gmt = true)
2594 : using_gmt(using_gmt) {}
2595
9f95a23c
TL
2596 void encode(ceph::buffer::list &bl) const;
2597 void decode(ceph::buffer::list::const_iterator &bl);
2598 void dump(ceph::Formatter *f) const;
2599 static void generate_test_instances(std::list<pg_hit_set_info_t*>& o);
7c673cae
FG
2600};
2601WRITE_CLASS_ENCODER(pg_hit_set_info_t)
2602
2603/**
2604 * pg_hit_set_history_t - information about a history of hitsets
2605 *
2606 * Include information about the currently accumulating hit set as well
2607 * as archived/historical ones.
2608 */
2609struct pg_hit_set_history_t {
2610 eversion_t current_last_update; ///< last version inserted into current set
9f95a23c 2611 std::list<pg_hit_set_info_t> history; ///< archived sets, sorted oldest -> newest
7c673cae
FG
2612
2613 friend bool operator==(const pg_hit_set_history_t& l,
2614 const pg_hit_set_history_t& r) {
2615 return
2616 l.current_last_update == r.current_last_update &&
2617 l.history == r.history;
2618 }
2619
9f95a23c
TL
2620 void encode(ceph::buffer::list &bl) const;
2621 void decode(ceph::buffer::list::const_iterator &bl);
2622 void dump(ceph::Formatter *f) const;
2623 static void generate_test_instances(std::list<pg_hit_set_history_t*>& o);
7c673cae
FG
2624};
2625WRITE_CLASS_ENCODER(pg_hit_set_history_t)
2626
2627
2628// -----------------------------------------
2629
2630/**
2631 * pg_history_t - information about recent pg peering/mapping history
2632 *
2633 * This is aggressively shared between OSDs to bound the amount of past
2634 * history they need to worry about.
2635 */
2636struct pg_history_t {
9f95a23c
TL
2637 epoch_t epoch_created = 0; // epoch in which *pg* was created (pool or pg)
2638 epoch_t epoch_pool_created = 0; // epoch in which *pool* was created
31f18b77
FG
2639 // (note: may be pg creation epoch for
2640 // pre-luminous clusters)
9f95a23c
TL
2641 epoch_t last_epoch_started = 0;; // lower bound on last epoch started (anywhere, not necessarily locally)
2642 epoch_t last_interval_started = 0;; // first epoch of last_epoch_started interval
2643 epoch_t last_epoch_clean = 0;; // lower bound on last epoch the PG was completely clean.
2644 epoch_t last_interval_clean = 0;; // first epoch of last_epoch_clean interval
2645 epoch_t last_epoch_split = 0;; // as parent or child
2646 epoch_t last_epoch_marked_full = 0;; // pool or cluster
2647
7c673cae
FG
2648 /**
2649 * In the event of a map discontinuity, same_*_since may reflect the first
2650 * map the osd has seen in the new map sequence rather than the actual start
2651 * of the interval. This is ok since a discontinuity at epoch e means there
2652 * must have been a clean interval between e and now and that we cannot be
2653 * in the active set during the interval containing e.
2654 */
9f95a23c
TL
2655 epoch_t same_up_since = 0;; // same acting set since
2656 epoch_t same_interval_since = 0;; // same acting AND up set since
2657 epoch_t same_primary_since = 0;; // same primary at least back through this epoch.
7c673cae
FG
2658
2659 eversion_t last_scrub;
2660 eversion_t last_deep_scrub;
2661 utime_t last_scrub_stamp;
2662 utime_t last_deep_scrub_stamp;
2663 utime_t last_clean_scrub_stamp;
2664
9f95a23c
TL
2665 /// upper bound on how long prior interval readable (relative to encode time)
2666 ceph::timespan prior_readable_until_ub = ceph::timespan::zero();
2667
7c673cae
FG
2668 friend bool operator==(const pg_history_t& l, const pg_history_t& r) {
2669 return
2670 l.epoch_created == r.epoch_created &&
31f18b77 2671 l.epoch_pool_created == r.epoch_pool_created &&
7c673cae
FG
2672 l.last_epoch_started == r.last_epoch_started &&
2673 l.last_interval_started == r.last_interval_started &&
2674 l.last_epoch_clean == r.last_epoch_clean &&
2675 l.last_interval_clean == r.last_interval_clean &&
2676 l.last_epoch_split == r.last_epoch_split &&
2677 l.last_epoch_marked_full == r.last_epoch_marked_full &&
2678 l.same_up_since == r.same_up_since &&
2679 l.same_interval_since == r.same_interval_since &&
2680 l.same_primary_since == r.same_primary_since &&
2681 l.last_scrub == r.last_scrub &&
2682 l.last_deep_scrub == r.last_deep_scrub &&
2683 l.last_scrub_stamp == r.last_scrub_stamp &&
2684 l.last_deep_scrub_stamp == r.last_deep_scrub_stamp &&
9f95a23c
TL
2685 l.last_clean_scrub_stamp == r.last_clean_scrub_stamp &&
2686 l.prior_readable_until_ub == r.prior_readable_until_ub;
2687 }
2688
2689 pg_history_t() {}
2690 pg_history_t(epoch_t created, utime_t stamp)
2691 : epoch_created(created),
2692 epoch_pool_created(created),
2693 same_up_since(created),
2694 same_interval_since(created),
2695 same_primary_since(created),
2696 last_scrub_stamp(stamp),
2697 last_deep_scrub_stamp(stamp),
2698 last_clean_scrub_stamp(stamp) {}
7c673cae
FG
2699
2700 bool merge(const pg_history_t &other) {
2701 // Here, we only update the fields which cannot be calculated from the OSDmap.
2702 bool modified = false;
2703 if (epoch_created < other.epoch_created) {
2704 epoch_created = other.epoch_created;
2705 modified = true;
2706 }
31f18b77
FG
2707 if (epoch_pool_created < other.epoch_pool_created) {
2708 // FIXME: for jewel compat only; this should either be 0 or always the
2709 // same value across all pg instances.
2710 epoch_pool_created = other.epoch_pool_created;
2711 modified = true;
2712 }
7c673cae
FG
2713 if (last_epoch_started < other.last_epoch_started) {
2714 last_epoch_started = other.last_epoch_started;
2715 modified = true;
2716 }
2717 if (last_interval_started < other.last_interval_started) {
2718 last_interval_started = other.last_interval_started;
9f95a23c
TL
2719 // if we are learning about a newer *started* interval, our
2720 // readable_until_ub is obsolete
2721 prior_readable_until_ub = other.prior_readable_until_ub;
2722 modified = true;
2723 } else if (other.last_interval_started == last_interval_started &&
2724 other.prior_readable_until_ub < prior_readable_until_ub) {
2725 // if other is the *same* interval, than pull our upper bound in
2726 // if they have a tighter bound.
2727 prior_readable_until_ub = other.prior_readable_until_ub;
7c673cae
FG
2728 modified = true;
2729 }
2730 if (last_epoch_clean < other.last_epoch_clean) {
2731 last_epoch_clean = other.last_epoch_clean;
2732 modified = true;
2733 }
2734 if (last_interval_clean < other.last_interval_clean) {
2735 last_interval_clean = other.last_interval_clean;
2736 modified = true;
2737 }
2738 if (last_epoch_split < other.last_epoch_split) {
2739 last_epoch_split = other.last_epoch_split;
2740 modified = true;
2741 }
2742 if (last_epoch_marked_full < other.last_epoch_marked_full) {
2743 last_epoch_marked_full = other.last_epoch_marked_full;
2744 modified = true;
2745 }
2746 if (other.last_scrub > last_scrub) {
2747 last_scrub = other.last_scrub;
2748 modified = true;
2749 }
2750 if (other.last_scrub_stamp > last_scrub_stamp) {
2751 last_scrub_stamp = other.last_scrub_stamp;
2752 modified = true;
2753 }
2754 if (other.last_deep_scrub > last_deep_scrub) {
2755 last_deep_scrub = other.last_deep_scrub;
2756 modified = true;
2757 }
2758 if (other.last_deep_scrub_stamp > last_deep_scrub_stamp) {
2759 last_deep_scrub_stamp = other.last_deep_scrub_stamp;
2760 modified = true;
2761 }
2762 if (other.last_clean_scrub_stamp > last_clean_scrub_stamp) {
2763 last_clean_scrub_stamp = other.last_clean_scrub_stamp;
2764 modified = true;
2765 }
2766 return modified;
2767 }
2768
9f95a23c
TL
2769 void encode(ceph::buffer::list& bl) const;
2770 void decode(ceph::buffer::list::const_iterator& p);
2771 void dump(ceph::Formatter *f) const;
2772 static void generate_test_instances(std::list<pg_history_t*>& o);
2773
2774 ceph::signedspan refresh_prior_readable_until_ub(
2775 ceph::signedspan now, ///< now, relative to osd startup_time
2776 ceph::signedspan ub) { ///< ub, relative to osd startup_time
2777 if (now >= ub) {
2778 // prior interval(s) are unreadable; we can zero the upper bound
2779 prior_readable_until_ub = ceph::signedspan::zero();
2780 return ceph::signedspan::zero();
2781 } else {
2782 prior_readable_until_ub = ub - now;
2783 return ub;
2784 }
2785 }
2786 ceph::signedspan get_prior_readable_until_ub(ceph::signedspan now) {
2787 if (prior_readable_until_ub == ceph::signedspan::zero()) {
2788 return ceph::signedspan::zero();
2789 }
2790 return now + prior_readable_until_ub;
2791 }
7c673cae
FG
2792};
2793WRITE_CLASS_ENCODER(pg_history_t)
2794
9f95a23c
TL
2795inline std::ostream& operator<<(std::ostream& out, const pg_history_t& h) {
2796 out << "ec=" << h.epoch_created << "/" << h.epoch_pool_created
2797 << " lis/c=" << h.last_interval_started
2798 << "/" << h.last_interval_clean
2799 << " les/c/f=" << h.last_epoch_started << "/" << h.last_epoch_clean
2800 << "/" << h.last_epoch_marked_full
2801 << " sis=" << h.same_interval_since;
2802 if (h.prior_readable_until_ub != ceph::timespan::zero()) {
2803 out << " pruub=" << h.prior_readable_until_ub;
2804 }
2805 return out;
7c673cae
FG
2806}
2807
2808
2809/**
2810 * pg_info_t - summary of PG statistics.
2811 *
2812 * some notes:
2813 * - last_complete implies we have all objects that existed as of that
2814 * stamp, OR a newer object, OR have already applied a later delete.
2815 * - if last_complete >= log.bottom, then we know pg contents thru log.head.
2816 * otherwise, we have no idea what the pg is supposed to contain.
2817 */
2818struct pg_info_t {
2819 spg_t pgid;
2820 eversion_t last_update; ///< last object version applied to store.
2821 eversion_t last_complete; ///< last version pg was complete through.
2822 epoch_t last_epoch_started; ///< last epoch at which this pg started on this osd
2823 epoch_t last_interval_started; ///< first epoch of last_epoch_started interval
2824
2825 version_t last_user_version; ///< last user object version applied to store
2826
2827 eversion_t log_tail; ///< oldest log entry.
2828
2829 hobject_t last_backfill; ///< objects >= this and < last_complete may be missing
7c673cae
FG
2830
2831 interval_set<snapid_t> purged_snaps;
2832
2833 pg_stat_t stats;
2834
2835 pg_history_t history;
2836 pg_hit_set_history_t hit_set;
2837
2838 friend bool operator==(const pg_info_t& l, const pg_info_t& r) {
2839 return
2840 l.pgid == r.pgid &&
2841 l.last_update == r.last_update &&
2842 l.last_complete == r.last_complete &&
2843 l.last_epoch_started == r.last_epoch_started &&
2844 l.last_interval_started == r.last_interval_started &&
2845 l.last_user_version == r.last_user_version &&
2846 l.log_tail == r.log_tail &&
2847 l.last_backfill == r.last_backfill &&
7c673cae
FG
2848 l.purged_snaps == r.purged_snaps &&
2849 l.stats == r.stats &&
2850 l.history == r.history &&
2851 l.hit_set == r.hit_set;
2852 }
2853
2854 pg_info_t()
2855 : last_epoch_started(0),
2856 last_interval_started(0),
2857 last_user_version(0),
9f95a23c 2858 last_backfill(hobject_t::get_max())
7c673cae
FG
2859 { }
2860 // cppcheck-suppress noExplicitConstructor
2861 pg_info_t(spg_t p)
2862 : pgid(p),
2863 last_epoch_started(0),
2864 last_interval_started(0),
2865 last_user_version(0),
9f95a23c 2866 last_backfill(hobject_t::get_max())
7c673cae
FG
2867 { }
2868
2869 void set_last_backfill(hobject_t pos) {
2870 last_backfill = pos;
7c673cae
FG
2871 }
2872
2873 bool is_empty() const { return last_update.version == 0; }
2874 bool dne() const { return history.epoch_created == 0; }
2875
11fdf7f2 2876 bool has_missing() const { return last_complete != last_update; }
7c673cae
FG
2877 bool is_incomplete() const { return !last_backfill.is_max(); }
2878
9f95a23c
TL
2879 void encode(ceph::buffer::list& bl) const;
2880 void decode(ceph::buffer::list::const_iterator& p);
2881 void dump(ceph::Formatter *f) const;
2882 static void generate_test_instances(std::list<pg_info_t*>& o);
7c673cae
FG
2883};
2884WRITE_CLASS_ENCODER(pg_info_t)
2885
9f95a23c 2886inline std::ostream& operator<<(std::ostream& out, const pg_info_t& pgi)
7c673cae
FG
2887{
2888 out << pgi.pgid << "(";
2889 if (pgi.dne())
2890 out << " DNE";
2891 if (pgi.is_empty())
2892 out << " empty";
2893 else {
2894 out << " v " << pgi.last_update;
2895 if (pgi.last_complete != pgi.last_update)
2896 out << " lc " << pgi.last_complete;
2897 out << " (" << pgi.log_tail << "," << pgi.last_update << "]";
2898 }
2899 if (pgi.is_incomplete())
9f95a23c 2900 out << " lb " << pgi.last_backfill;
7c673cae
FG
2901 //out << " c " << pgi.epoch_created;
2902 out << " local-lis/les=" << pgi.last_interval_started
2903 << "/" << pgi.last_epoch_started;
2904 out << " n=" << pgi.stats.stats.sum.num_objects;
2905 out << " " << pgi.history
2906 << ")";
2907 return out;
2908}
2909
2910/**
2911 * pg_fast_info_t - common pg_info_t fields
2912 *
2913 * These are the fields of pg_info_t (and children) that are updated for
2914 * most IO operations.
2915 *
2916 * ** WARNING **
2917 * Because we rely on these fields to be applied to the normal
2918 * info struct, adding a new field here that is not also new in info
2919 * means that we must set an incompat OSD feature bit!
2920 */
2921struct pg_fast_info_t {
2922 eversion_t last_update;
2923 eversion_t last_complete;
2924 version_t last_user_version;
2925 struct { // pg_stat_t stats
2926 eversion_t version;
2927 version_t reported_seq;
2928 utime_t last_fresh;
2929 utime_t last_active;
2930 utime_t last_peered;
2931 utime_t last_clean;
2932 utime_t last_unstale;
2933 utime_t last_undegraded;
2934 utime_t last_fullsized;
2935 int64_t log_size; // (also ondisk_log_size, which has the same value)
2936 struct { // object_stat_collection_t stats;
2937 struct { // objct_stat_sum_t sum
2938 int64_t num_bytes; // in bytes
2939 int64_t num_objects;
2940 int64_t num_object_copies;
2941 int64_t num_rd;
2942 int64_t num_rd_kb;
2943 int64_t num_wr;
2944 int64_t num_wr_kb;
2945 int64_t num_objects_dirty;
2946 } sum;
2947 } stats;
2948 } stats;
2949
2950 void populate_from(const pg_info_t& info) {
2951 last_update = info.last_update;
2952 last_complete = info.last_complete;
2953 last_user_version = info.last_user_version;
2954 stats.version = info.stats.version;
2955 stats.reported_seq = info.stats.reported_seq;
2956 stats.last_fresh = info.stats.last_fresh;
2957 stats.last_active = info.stats.last_active;
2958 stats.last_peered = info.stats.last_peered;
2959 stats.last_clean = info.stats.last_clean;
2960 stats.last_unstale = info.stats.last_unstale;
2961 stats.last_undegraded = info.stats.last_undegraded;
2962 stats.last_fullsized = info.stats.last_fullsized;
2963 stats.log_size = info.stats.log_size;
2964 stats.stats.sum.num_bytes = info.stats.stats.sum.num_bytes;
2965 stats.stats.sum.num_objects = info.stats.stats.sum.num_objects;
2966 stats.stats.sum.num_object_copies = info.stats.stats.sum.num_object_copies;
2967 stats.stats.sum.num_rd = info.stats.stats.sum.num_rd;
2968 stats.stats.sum.num_rd_kb = info.stats.stats.sum.num_rd_kb;
2969 stats.stats.sum.num_wr = info.stats.stats.sum.num_wr;
2970 stats.stats.sum.num_wr_kb = info.stats.stats.sum.num_wr_kb;
2971 stats.stats.sum.num_objects_dirty = info.stats.stats.sum.num_objects_dirty;
2972 }
2973
2974 bool try_apply_to(pg_info_t* info) {
2975 if (last_update <= info->last_update)
2976 return false;
2977 info->last_update = last_update;
2978 info->last_complete = last_complete;
2979 info->last_user_version = last_user_version;
2980 info->stats.version = stats.version;
2981 info->stats.reported_seq = stats.reported_seq;
2982 info->stats.last_fresh = stats.last_fresh;
2983 info->stats.last_active = stats.last_active;
2984 info->stats.last_peered = stats.last_peered;
2985 info->stats.last_clean = stats.last_clean;
2986 info->stats.last_unstale = stats.last_unstale;
2987 info->stats.last_undegraded = stats.last_undegraded;
2988 info->stats.last_fullsized = stats.last_fullsized;
2989 info->stats.log_size = stats.log_size;
2990 info->stats.ondisk_log_size = stats.log_size;
2991 info->stats.stats.sum.num_bytes = stats.stats.sum.num_bytes;
2992 info->stats.stats.sum.num_objects = stats.stats.sum.num_objects;
2993 info->stats.stats.sum.num_object_copies = stats.stats.sum.num_object_copies;
2994 info->stats.stats.sum.num_rd = stats.stats.sum.num_rd;
2995 info->stats.stats.sum.num_rd_kb = stats.stats.sum.num_rd_kb;
2996 info->stats.stats.sum.num_wr = stats.stats.sum.num_wr;
2997 info->stats.stats.sum.num_wr_kb = stats.stats.sum.num_wr_kb;
2998 info->stats.stats.sum.num_objects_dirty = stats.stats.sum.num_objects_dirty;
2999 return true;
3000 }
3001
9f95a23c 3002 void encode(ceph::buffer::list& bl) const {
7c673cae 3003 ENCODE_START(1, 1, bl);
11fdf7f2
TL
3004 encode(last_update, bl);
3005 encode(last_complete, bl);
3006 encode(last_user_version, bl);
3007 encode(stats.version, bl);
3008 encode(stats.reported_seq, bl);
3009 encode(stats.last_fresh, bl);
3010 encode(stats.last_active, bl);
3011 encode(stats.last_peered, bl);
3012 encode(stats.last_clean, bl);
3013 encode(stats.last_unstale, bl);
3014 encode(stats.last_undegraded, bl);
3015 encode(stats.last_fullsized, bl);
3016 encode(stats.log_size, bl);
3017 encode(stats.stats.sum.num_bytes, bl);
3018 encode(stats.stats.sum.num_objects, bl);
3019 encode(stats.stats.sum.num_object_copies, bl);
3020 encode(stats.stats.sum.num_rd, bl);
3021 encode(stats.stats.sum.num_rd_kb, bl);
3022 encode(stats.stats.sum.num_wr, bl);
3023 encode(stats.stats.sum.num_wr_kb, bl);
3024 encode(stats.stats.sum.num_objects_dirty, bl);
7c673cae
FG
3025 ENCODE_FINISH(bl);
3026 }
9f95a23c 3027 void decode(ceph::buffer::list::const_iterator& p) {
7c673cae 3028 DECODE_START(1, p);
11fdf7f2
TL
3029 decode(last_update, p);
3030 decode(last_complete, p);
3031 decode(last_user_version, p);
3032 decode(stats.version, p);
3033 decode(stats.reported_seq, p);
3034 decode(stats.last_fresh, p);
3035 decode(stats.last_active, p);
3036 decode(stats.last_peered, p);
3037 decode(stats.last_clean, p);
3038 decode(stats.last_unstale, p);
3039 decode(stats.last_undegraded, p);
3040 decode(stats.last_fullsized, p);
3041 decode(stats.log_size, p);
3042 decode(stats.stats.sum.num_bytes, p);
3043 decode(stats.stats.sum.num_objects, p);
3044 decode(stats.stats.sum.num_object_copies, p);
3045 decode(stats.stats.sum.num_rd, p);
3046 decode(stats.stats.sum.num_rd_kb, p);
3047 decode(stats.stats.sum.num_wr, p);
3048 decode(stats.stats.sum.num_wr_kb, p);
3049 decode(stats.stats.sum.num_objects_dirty, p);
7c673cae
FG
3050 DECODE_FINISH(p);
3051 }
3052};
3053WRITE_CLASS_ENCODER(pg_fast_info_t)
3054
3055
7c673cae
FG
3056class OSDMap;
3057/**
3058 * PastIntervals -- information needed to determine the PriorSet and
3059 * the might_have_unfound set
3060 */
3061class PastIntervals {
9f95a23c
TL
3062#ifdef WITH_SEASTAR
3063 using OSDMapRef = boost::local_shared_ptr<const OSDMap>;
3064#else
3065 using OSDMapRef = std::shared_ptr<const OSDMap>;
3066#endif
7c673cae
FG
3067public:
3068 struct pg_interval_t {
9f95a23c 3069 std::vector<int32_t> up, acting;
7c673cae
FG
3070 epoch_t first, last;
3071 bool maybe_went_rw;
3072 int32_t primary;
3073 int32_t up_primary;
3074
3075 pg_interval_t()
3076 : first(0), last(0),
3077 maybe_went_rw(false),
3078 primary(-1),
3079 up_primary(-1)
3080 {}
3081
3082 pg_interval_t(
9f95a23c
TL
3083 std::vector<int32_t> &&up,
3084 std::vector<int32_t> &&acting,
7c673cae
FG
3085 epoch_t first,
3086 epoch_t last,
3087 bool maybe_went_rw,
3088 int32_t primary,
3089 int32_t up_primary)
3090 : up(up), acting(acting), first(first), last(last),
3091 maybe_went_rw(maybe_went_rw), primary(primary), up_primary(up_primary)
3092 {}
3093
9f95a23c
TL
3094 void encode(ceph::buffer::list& bl) const;
3095 void decode(ceph::buffer::list::const_iterator& bl);
3096 void dump(ceph::Formatter *f) const;
3097 static void generate_test_instances(std::list<pg_interval_t*>& o);
7c673cae
FG
3098 };
3099
11fdf7f2 3100 PastIntervals();
7c673cae
FG
3101 PastIntervals(PastIntervals &&rhs) = default;
3102 PastIntervals &operator=(PastIntervals &&rhs) = default;
3103
3104 PastIntervals(const PastIntervals &rhs);
3105 PastIntervals &operator=(const PastIntervals &rhs);
3106
3107 class interval_rep {
3108 public:
3109 virtual size_t size() const = 0;
3110 virtual bool empty() const = 0;
3111 virtual void clear() = 0;
9f95a23c
TL
3112 virtual std::pair<epoch_t, epoch_t> get_bounds() const = 0;
3113 virtual std::set<pg_shard_t> get_all_participants(
7c673cae
FG
3114 bool ec_pool) const = 0;
3115 virtual void add_interval(bool ec_pool, const pg_interval_t &interval) = 0;
9f95a23c
TL
3116 virtual std::unique_ptr<interval_rep> clone() const = 0;
3117 virtual std::ostream &print(std::ostream &out) const = 0;
3118 virtual void encode(ceph::buffer::list &bl) const = 0;
3119 virtual void decode(ceph::buffer::list::const_iterator &bl) = 0;
3120 virtual void dump(ceph::Formatter *f) const = 0;
7c673cae 3121 virtual void iterate_mayberw_back_to(
7c673cae 3122 epoch_t les,
9f95a23c 3123 std::function<void(epoch_t, const std::set<pg_shard_t> &)> &&f) const = 0;
7c673cae
FG
3124
3125 virtual bool has_full_intervals() const { return false; }
3126 virtual void iterate_all_intervals(
3127 std::function<void(const pg_interval_t &)> &&f) const {
11fdf7f2
TL
3128 ceph_assert(!has_full_intervals());
3129 ceph_abort_msg("not valid for this implementation");
7c673cae 3130 }
11fdf7f2 3131 virtual void adjust_start_backwards(epoch_t last_epoch_clean) = 0;
7c673cae
FG
3132
3133 virtual ~interval_rep() {}
3134 };
7c673cae
FG
3135 friend class pi_compact_rep;
3136private:
3137
9f95a23c 3138 std::unique_ptr<interval_rep> past_intervals;
7c673cae 3139
11fdf7f2 3140 explicit PastIntervals(interval_rep *rep) : past_intervals(rep) {}
7c673cae
FG
3141
3142public:
3143 void add_interval(bool ec_pool, const pg_interval_t &interval) {
11fdf7f2 3144 ceph_assert(past_intervals);
7c673cae
FG
3145 return past_intervals->add_interval(ec_pool, interval);
3146 }
3147
9f95a23c 3148 void encode(ceph::buffer::list &bl) const {
7c673cae
FG
3149 ENCODE_START(1, 1, bl);
3150 if (past_intervals) {
11fdf7f2
TL
3151 __u8 type = 2;
3152 encode(type, bl);
7c673cae
FG
3153 past_intervals->encode(bl);
3154 } else {
11fdf7f2 3155 encode((__u8)0, bl);
7c673cae
FG
3156 }
3157 ENCODE_FINISH(bl);
3158 }
7c673cae 3159
9f95a23c 3160 void decode(ceph::buffer::list::const_iterator &bl);
7c673cae 3161
9f95a23c 3162 void dump(ceph::Formatter *f) const {
11fdf7f2 3163 ceph_assert(past_intervals);
7c673cae
FG
3164 past_intervals->dump(f);
3165 }
9f95a23c 3166 static void generate_test_instances(std::list<PastIntervals *> & o);
7c673cae
FG
3167
3168 /**
3169 * Determines whether there is an interval change
3170 */
3171 static bool is_new_interval(
3172 int old_acting_primary,
3173 int new_acting_primary,
9f95a23c
TL
3174 const std::vector<int> &old_acting,
3175 const std::vector<int> &new_acting,
7c673cae
FG
3176 int old_up_primary,
3177 int new_up_primary,
9f95a23c
TL
3178 const std::vector<int> &old_up,
3179 const std::vector<int> &new_up,
7c673cae
FG
3180 int old_size,
3181 int new_size,
3182 int old_min_size,
3183 int new_min_size,
3184 unsigned old_pg_num,
3185 unsigned new_pg_num,
11fdf7f2
TL
3186 unsigned old_pg_num_pending,
3187 unsigned new_pg_num_pending,
7c673cae
FG
3188 bool old_sort_bitwise,
3189 bool new_sort_bitwise,
c07f9fc5
FG
3190 bool old_recovery_deletes,
3191 bool new_recovery_deletes,
7c673cae
FG
3192 pg_t pgid
3193 );
3194
3195 /**
3196 * Determines whether there is an interval change
3197 */
3198 static bool is_new_interval(
3199 int old_acting_primary, ///< [in] primary as of lastmap
3200 int new_acting_primary, ///< [in] primary as of lastmap
9f95a23c
TL
3201 const std::vector<int> &old_acting, ///< [in] acting as of lastmap
3202 const std::vector<int> &new_acting, ///< [in] acting as of osdmap
7c673cae
FG
3203 int old_up_primary, ///< [in] up primary of lastmap
3204 int new_up_primary, ///< [in] up primary of osdmap
9f95a23c
TL
3205 const std::vector<int> &old_up, ///< [in] up as of lastmap
3206 const std::vector<int> &new_up, ///< [in] up as of osdmap
3207 const OSDMap *osdmap, ///< [in] current map
3208 const OSDMap *lastmap, ///< [in] last map
7c673cae
FG
3209 pg_t pgid ///< [in] pgid for pg
3210 );
3211
3212 /**
3213 * Integrates a new map into *past_intervals, returns true
3214 * if an interval was closed out.
3215 */
3216 static bool check_new_interval(
3217 int old_acting_primary, ///< [in] primary as of lastmap
3218 int new_acting_primary, ///< [in] primary as of osdmap
9f95a23c
TL
3219 const std::vector<int> &old_acting, ///< [in] acting as of lastmap
3220 const std::vector<int> &new_acting, ///< [in] acting as of osdmap
7c673cae
FG
3221 int old_up_primary, ///< [in] up primary of lastmap
3222 int new_up_primary, ///< [in] up primary of osdmap
9f95a23c
TL
3223 const std::vector<int> &old_up, ///< [in] up as of lastmap
3224 const std::vector<int> &new_up, ///< [in] up as of osdmap
7c673cae
FG
3225 epoch_t same_interval_since, ///< [in] as of osdmap
3226 epoch_t last_epoch_clean, ///< [in] current
9f95a23c
TL
3227 const OSDMap *osdmap, ///< [in] current map
3228 const OSDMap *lastmap, ///< [in] last map
7c673cae 3229 pg_t pgid, ///< [in] pgid for pg
9f95a23c 3230 const IsPGRecoverablePredicate &could_have_gone_active, ///< [in] predicate whether the pg can be active
7c673cae 3231 PastIntervals *past_intervals, ///< [out] intervals
9f95a23c 3232 std::ostream *out = 0 ///< [out] debug ostream
7c673cae 3233 );
9f95a23c
TL
3234 static bool check_new_interval(
3235 int old_acting_primary, ///< [in] primary as of lastmap
3236 int new_acting_primary, ///< [in] primary as of osdmap
3237 const std::vector<int> &old_acting, ///< [in] acting as of lastmap
3238 const std::vector<int> &new_acting, ///< [in] acting as of osdmap
3239 int old_up_primary, ///< [in] up primary of lastmap
3240 int new_up_primary, ///< [in] up primary of osdmap
3241 const std::vector<int> &old_up, ///< [in] up as of lastmap
3242 const std::vector<int> &new_up, ///< [in] up as of osdmap
3243 epoch_t same_interval_since, ///< [in] as of osdmap
3244 epoch_t last_epoch_clean, ///< [in] current
3245 OSDMapRef osdmap, ///< [in] current map
3246 OSDMapRef lastmap, ///< [in] last map
3247 pg_t pgid, ///< [in] pgid for pg
3248 const IsPGRecoverablePredicate &could_have_gone_active, ///< [in] predicate whether the pg can be active
3249 PastIntervals *past_intervals, ///< [out] intervals
3250 std::ostream *out = 0 ///< [out] debug ostream
3251 ) {
3252 return check_new_interval(
3253 old_acting_primary, new_acting_primary,
3254 old_acting, new_acting,
3255 old_up_primary, new_up_primary,
3256 old_up, new_up,
3257 same_interval_since, last_epoch_clean,
3258 osdmap.get(), lastmap.get(),
3259 pgid,
3260 could_have_gone_active,
3261 past_intervals,
3262 out);
3263 }
3264
3265 friend std::ostream& operator<<(std::ostream& out, const PastIntervals &i);
7c673cae
FG
3266
3267 template <typename F>
3268 void iterate_mayberw_back_to(
7c673cae
FG
3269 epoch_t les,
3270 F &&f) const {
11fdf7f2
TL
3271 ceph_assert(past_intervals);
3272 past_intervals->iterate_mayberw_back_to(les, std::forward<F>(f));
7c673cae
FG
3273 }
3274 void clear() {
11fdf7f2 3275 ceph_assert(past_intervals);
7c673cae
FG
3276 past_intervals->clear();
3277 }
3278
3279 /**
3280 * Should return a value which gives an indication of the amount
3281 * of state contained
3282 */
3283 size_t size() const {
11fdf7f2 3284 ceph_assert(past_intervals);
7c673cae
FG
3285 return past_intervals->size();
3286 }
3287
3288 bool empty() const {
11fdf7f2 3289 ceph_assert(past_intervals);
7c673cae
FG
3290 return past_intervals->empty();
3291 }
3292
3293 void swap(PastIntervals &other) {
31f18b77
FG
3294 using std::swap;
3295 swap(other.past_intervals, past_intervals);
7c673cae
FG
3296 }
3297
3298 /**
3299 * Return all shards which have been in the acting set back to the
3300 * latest epoch to which we have trimmed except for pg_whoami
3301 */
9f95a23c 3302 std::set<pg_shard_t> get_might_have_unfound(
7c673cae
FG
3303 pg_shard_t pg_whoami,
3304 bool ec_pool) const {
11fdf7f2 3305 ceph_assert(past_intervals);
7c673cae
FG
3306 auto ret = past_intervals->get_all_participants(ec_pool);
3307 ret.erase(pg_whoami);
3308 return ret;
3309 }
3310
3311 /**
3312 * Return all shards which we might want to talk to for peering
3313 */
9f95a23c 3314 std::set<pg_shard_t> get_all_probe(
7c673cae 3315 bool ec_pool) const {
11fdf7f2 3316 ceph_assert(past_intervals);
7c673cae
FG
3317 return past_intervals->get_all_participants(ec_pool);
3318 }
3319
3320 /* Return the set of epochs [start, end) represented by the
3321 * past_interval set.
3322 */
9f95a23c 3323 std::pair<epoch_t, epoch_t> get_bounds() const {
11fdf7f2 3324 ceph_assert(past_intervals);
7c673cae
FG
3325 return past_intervals->get_bounds();
3326 }
3327
11fdf7f2
TL
3328 void adjust_start_backwards(epoch_t last_epoch_clean) {
3329 ceph_assert(past_intervals);
3330 past_intervals->adjust_start_backwards(last_epoch_clean);
3331 }
3332
7c673cae
FG
3333 enum osd_state_t {
3334 UP,
3335 DOWN,
3336 DNE,
3337 LOST
3338 };
3339 struct PriorSet {
3340 bool ec_pool = false;
9f95a23c
TL
3341 std::set<pg_shard_t> probe; ///< current+prior OSDs we need to probe.
3342 std::set<int> down; ///< down osds that would normally be in @a probe and might be interesting.
3343 std::map<int, epoch_t> blocked_by; ///< current lost_at values for any OSDs in cur set for which (re)marking them lost would affect cur set
7c673cae 3344
11fdf7f2 3345 bool pg_down = false; ///< some down osds are included in @a cur; the DOWN pg state bit should be set.
9f95a23c 3346 const IsPGRecoverablePredicate* pcontdec = nullptr;
7c673cae
FG
3347
3348 PriorSet() = default;
3349 PriorSet(PriorSet &&) = default;
3350 PriorSet &operator=(PriorSet &&) = default;
3351
3352 PriorSet &operator=(const PriorSet &) = delete;
3353 PriorSet(const PriorSet &) = delete;
3354
3355 bool operator==(const PriorSet &rhs) const {
3356 return (ec_pool == rhs.ec_pool) &&
3357 (probe == rhs.probe) &&
3358 (down == rhs.down) &&
3359 (blocked_by == rhs.blocked_by) &&
3360 (pg_down == rhs.pg_down);
3361 }
3362
3363 bool affected_by_map(
3364 const OSDMap &osdmap,
3365 const DoutPrefixProvider *dpp) const;
3366
3367 // For verifying tests
3368 PriorSet(
3369 bool ec_pool,
9f95a23c
TL
3370 std::set<pg_shard_t> probe,
3371 std::set<int> down,
3372 std::map<int, epoch_t> blocked_by,
7c673cae 3373 bool pg_down,
9f95a23c 3374 const IsPGRecoverablePredicate *pcontdec)
7c673cae
FG
3375 : ec_pool(ec_pool), probe(probe), down(down), blocked_by(blocked_by),
3376 pg_down(pg_down), pcontdec(pcontdec) {}
3377
3378 private:
3379 template <typename F>
3380 PriorSet(
3381 const PastIntervals &past_intervals,
3382 bool ec_pool,
3383 epoch_t last_epoch_started,
9f95a23c 3384 const IsPGRecoverablePredicate *c,
7c673cae 3385 F f,
9f95a23c
TL
3386 const std::vector<int> &up,
3387 const std::vector<int> &acting,
7c673cae
FG
3388 const DoutPrefixProvider *dpp);
3389
3390 friend class PastIntervals;
3391 };
3392
7c673cae
FG
3393 template <typename... Args>
3394 PriorSet get_prior_set(Args&&... args) const {
3395 return PriorSet(*this, std::forward<Args>(args)...);
3396 }
3397};
3398WRITE_CLASS_ENCODER(PastIntervals)
3399
9f95a23c
TL
3400std::ostream& operator<<(std::ostream& out, const PastIntervals::pg_interval_t& i);
3401std::ostream& operator<<(std::ostream& out, const PastIntervals &i);
3402std::ostream& operator<<(std::ostream& out, const PastIntervals::PriorSet &i);
7c673cae
FG
3403
3404template <typename F>
3405PastIntervals::PriorSet::PriorSet(
3406 const PastIntervals &past_intervals,
3407 bool ec_pool,
3408 epoch_t last_epoch_started,
9f95a23c 3409 const IsPGRecoverablePredicate *c,
7c673cae 3410 F f,
9f95a23c
TL
3411 const std::vector<int> &up,
3412 const std::vector<int> &acting,
7c673cae
FG
3413 const DoutPrefixProvider *dpp)
3414 : ec_pool(ec_pool), pg_down(false), pcontdec(c)
3415{
3416 /*
3417 * We have to be careful to gracefully deal with situations like
3418 * so. Say we have a power outage or something that takes out both
3419 * OSDs, but the monitor doesn't mark them down in the same epoch.
3420 * The history may look like
3421 *
3422 * 1: A B
3423 * 2: B
3424 * 3: let's say B dies for good, too (say, from the power spike)
3425 * 4: A
3426 *
3427 * which makes it look like B may have applied updates to the PG
3428 * that we need in order to proceed. This sucks...
3429 *
3430 * To minimize the risk of this happening, we CANNOT go active if
3431 * _any_ OSDs in the prior set are down until we send an MOSDAlive
3432 * to the monitor such that the OSDMap sets osd_up_thru to an epoch.
3433 * Then, we have something like
3434 *
3435 * 1: A B
3436 * 2: B up_thru[B]=0
3437 * 3:
3438 * 4: A
3439 *
3440 * -> we can ignore B, bc it couldn't have gone active (alive_thru
3441 * still 0).
3442 *
3443 * or,
3444 *
3445 * 1: A B
3446 * 2: B up_thru[B]=0
3447 * 3: B up_thru[B]=2
3448 * 4:
3449 * 5: A
3450 *
3451 * -> we must wait for B, bc it was alive through 2, and could have
3452 * written to the pg.
3453 *
3454 * If B is really dead, then an administrator will need to manually
3455 * intervene by marking the OSD as "lost."
3456 */
3457
3458 // Include current acting and up nodes... not because they may
3459 // contain old data (this interval hasn't gone active, obviously),
3460 // but because we want their pg_info to inform choose_acting(), and
3461 // so that we know what they do/do not have explicitly before
3462 // sending them any new info/logs/whatever.
3463 for (unsigned i = 0; i < acting.size(); i++) {
3464 if (acting[i] != 0x7fffffff /* CRUSH_ITEM_NONE, can't import crush.h here */)
3465 probe.insert(pg_shard_t(acting[i], ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
3466 }
3467 // It may be possible to exclude the up nodes, but let's keep them in
3468 // there for now.
3469 for (unsigned i = 0; i < up.size(); i++) {
3470 if (up[i] != 0x7fffffff /* CRUSH_ITEM_NONE, can't import crush.h here */)
3471 probe.insert(pg_shard_t(up[i], ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
3472 }
3473
9f95a23c 3474 std::set<pg_shard_t> all_probe = past_intervals.get_all_probe(ec_pool);
7c673cae
FG
3475 ldpp_dout(dpp, 10) << "build_prior all_probe " << all_probe << dendl;
3476 for (auto &&i: all_probe) {
3477 switch (f(0, i.osd, nullptr)) {
3478 case UP: {
3479 probe.insert(i);
3480 break;
3481 }
3482 case DNE:
3483 case LOST:
3484 case DOWN: {
3485 down.insert(i.osd);
3486 break;
3487 }
3488 }
3489 }
3490
3491 past_intervals.iterate_mayberw_back_to(
7c673cae 3492 last_epoch_started,
9f95a23c 3493 [&](epoch_t start, const std::set<pg_shard_t> &acting) {
7c673cae
FG
3494 ldpp_dout(dpp, 10) << "build_prior maybe_rw interval:" << start
3495 << ", acting: " << acting << dendl;
3496
3497 // look at candidate osds during this interval. each falls into
3498 // one of three categories: up, down (but potentially
3499 // interesting), or lost (down, but we won't wait for it).
9f95a23c
TL
3500 std::set<pg_shard_t> up_now;
3501 std::map<int, epoch_t> candidate_blocked_by;
7c673cae
FG
3502 // any candidates down now (that might have useful data)
3503 bool any_down_now = false;
3504
3505 // consider ACTING osds
3506 for (auto &&so: acting) {
3507 epoch_t lost_at = 0;
3508 switch (f(start, so.osd, &lost_at)) {
3509 case UP: {
3510 // include past acting osds if they are up.
3511 up_now.insert(so);
3512 break;
3513 }
3514 case DNE: {
3515 ldpp_dout(dpp, 10) << "build_prior prior osd." << so.osd
3516 << " no longer exists" << dendl;
3517 break;
3518 }
3519 case LOST: {
3520 ldpp_dout(dpp, 10) << "build_prior prior osd." << so.osd
3521 << " is down, but lost_at " << lost_at << dendl;
3522 up_now.insert(so);
3523 break;
3524 }
3525 case DOWN: {
3526 ldpp_dout(dpp, 10) << "build_prior prior osd." << so.osd
3527 << " is down" << dendl;
3528 candidate_blocked_by[so.osd] = lost_at;
3529 any_down_now = true;
3530 break;
3531 }
3532 }
3533 }
3534
3535 // if not enough osds survived this interval, and we may have gone rw,
3536 // then we need to wait for one of those osds to recover to
3537 // ensure that we haven't lost any information.
3538 if (!(*pcontdec)(up_now) && any_down_now) {
3539 // fixme: how do we identify a "clean" shutdown anyway?
3540 ldpp_dout(dpp, 10) << "build_prior possibly went active+rw,"
3541 << " insufficient up; including down osds" << dendl;
11fdf7f2 3542 ceph_assert(!candidate_blocked_by.empty());
7c673cae
FG
3543 pg_down = true;
3544 blocked_by.insert(
3545 candidate_blocked_by.begin(),
3546 candidate_blocked_by.end());
3547 }
3548 });
3549
3550 ldpp_dout(dpp, 10) << "build_prior final: probe " << probe
3551 << " down " << down
3552 << " blocked_by " << blocked_by
3553 << (pg_down ? " pg_down":"")
3554 << dendl;
3555}
3556
9f95a23c
TL
3557struct pg_notify_t {
3558 epoch_t query_epoch;
3559 epoch_t epoch_sent;
3560 pg_info_t info;
3561 shard_id_t to;
3562 shard_id_t from;
3563 PastIntervals past_intervals;
3564 pg_notify_t() :
3565 query_epoch(0), epoch_sent(0), to(shard_id_t::NO_SHARD),
3566 from(shard_id_t::NO_SHARD) {}
3567 pg_notify_t(
3568 shard_id_t to,
3569 shard_id_t from,
3570 epoch_t query_epoch,
3571 epoch_t epoch_sent,
3572 const pg_info_t &info,
3573 const PastIntervals& pi)
3574 : query_epoch(query_epoch),
3575 epoch_sent(epoch_sent),
3576 info(info), to(to), from(from),
3577 past_intervals(pi) {
3578 ceph_assert(from == info.pgid.shard);
3579 }
3580 void encode(ceph::buffer::list &bl) const;
3581 void decode(ceph::buffer::list::const_iterator &p);
3582 void dump(ceph::Formatter *f) const;
3583 static void generate_test_instances(std::list<pg_notify_t*> &o);
3584};
3585WRITE_CLASS_ENCODER(pg_notify_t)
3586std::ostream &operator<<(std::ostream &lhs, const pg_notify_t &notify);
3587
3588
7c673cae
FG
3589/**
3590 * pg_query_t - used to ask a peer for information about a pg.
3591 *
3592 * note: if version=0, type=LOG, then we just provide our full log.
3593 */
3594struct pg_query_t {
3595 enum {
3596 INFO = 0,
3597 LOG = 1,
3598 MISSING = 4,
3599 FULLLOG = 5,
3600 };
11fdf7f2 3601 std::string_view get_type_name() const {
7c673cae
FG
3602 switch (type) {
3603 case INFO: return "info";
3604 case LOG: return "log";
3605 case MISSING: return "missing";
3606 case FULLLOG: return "fulllog";
3607 default: return "???";
3608 }
3609 }
3610
3611 __s32 type;
3612 eversion_t since;
3613 pg_history_t history;
3614 epoch_t epoch_sent;
3615 shard_id_t to;
3616 shard_id_t from;
3617
3618 pg_query_t() : type(-1), epoch_sent(0), to(shard_id_t::NO_SHARD),
3619 from(shard_id_t::NO_SHARD) {}
3620 pg_query_t(
3621 int t,
3622 shard_id_t to,
3623 shard_id_t from,
3624 const pg_history_t& h,
3625 epoch_t epoch_sent)
3626 : type(t),
3627 history(h),
3628 epoch_sent(epoch_sent),
3629 to(to), from(from) {
11fdf7f2 3630 ceph_assert(t != LOG);
7c673cae
FG
3631 }
3632 pg_query_t(
3633 int t,
3634 shard_id_t to,
3635 shard_id_t from,
3636 eversion_t s,
3637 const pg_history_t& h,
3638 epoch_t epoch_sent)
3639 : type(t), since(s), history(h),
3640 epoch_sent(epoch_sent), to(to), from(from) {
11fdf7f2 3641 ceph_assert(t == LOG);
7c673cae
FG
3642 }
3643
9f95a23c
TL
3644 void encode(ceph::buffer::list &bl, uint64_t features) const;
3645 void decode(ceph::buffer::list::const_iterator &bl);
7c673cae 3646
9f95a23c
TL
3647 void dump(ceph::Formatter *f) const;
3648 static void generate_test_instances(std::list<pg_query_t*>& o);
7c673cae
FG
3649};
3650WRITE_CLASS_ENCODER_FEATURES(pg_query_t)
3651
9f95a23c 3652inline std::ostream& operator<<(std::ostream& out, const pg_query_t& q) {
7c673cae
FG
3653 out << "query(" << q.get_type_name() << " " << q.since;
3654 if (q.type == pg_query_t::LOG)
3655 out << " " << q.history;
11fdf7f2 3656 out << " epoch_sent " << q.epoch_sent;
7c673cae
FG
3657 out << ")";
3658 return out;
3659}
3660
9f95a23c
TL
3661/**
3662 * pg_lease_t - readable lease metadata, from primary -> non-primary
3663 *
3664 * This metadata serves to increase either or both of the lease expiration
3665 * and upper bound on the non-primary.
3666 */
3667struct pg_lease_t {
3668 /// pg readable_until value; replicas must not be readable beyond this
3669 ceph::signedspan readable_until = ceph::signedspan::zero();
3670
3671 /// upper bound on any acting osd's readable_until
3672 ceph::signedspan readable_until_ub = ceph::signedspan::zero();
3673
3674 /// duration of the lease (in case clock deltas aren't available)
3675 ceph::signedspan interval = ceph::signedspan::zero();
3676
3677 pg_lease_t() {}
3678 pg_lease_t(ceph::signedspan ru, ceph::signedspan ruub,
3679 ceph::signedspan i)
3680 : readable_until(ru),
3681 readable_until_ub(ruub),
3682 interval(i) {}
3683
3684 void encode(ceph::buffer::list &bl) const;
3685 void decode(ceph::buffer::list::const_iterator &bl);
3686 void dump(ceph::Formatter *f) const;
3687 static void generate_test_instances(std::list<pg_lease_t*>& o);
3688
3689 friend ostream& operator<<(ostream& out, const pg_lease_t& l) {
3690 return out << "pg_lease(ru " << l.readable_until
3691 << " ub " << l.readable_until_ub
3692 << " int " << l.interval << ")";
3693 }
3694};
3695WRITE_CLASS_ENCODER(pg_lease_t)
3696
3697/**
3698 * pg_lease_ack_t - lease ack, from non-primary -> primary
3699 *
3700 * This metadata acknowledges to the primary what a non-primary's noted
3701 * upper bound is.
3702 */
3703struct pg_lease_ack_t {
3704 /// highest upper bound non-primary has recorded (primary's clock)
3705 ceph::signedspan readable_until_ub = ceph::signedspan::zero();
3706
3707 pg_lease_ack_t() {}
3708 pg_lease_ack_t(ceph::signedspan ub)
3709 : readable_until_ub(ub) {}
3710
3711 void encode(ceph::buffer::list &bl) const;
3712 void decode(ceph::buffer::list::const_iterator &bl);
3713 void dump(ceph::Formatter *f) const;
3714 static void generate_test_instances(std::list<pg_lease_ack_t*>& o);
3715
3716 friend ostream& operator<<(ostream& out, const pg_lease_ack_t& l) {
3717 return out << "pg_lease_ack(ruub " << l.readable_until_ub << ")";
3718 }
3719};
3720WRITE_CLASS_ENCODER(pg_lease_ack_t)
3721
3722
3723
7c673cae
FG
3724class PGBackend;
3725class ObjectModDesc {
3726 bool can_local_rollback;
3727 bool rollback_info_completed;
3728
3729 // version required to decode, reflected in encode/decode version
3730 __u8 max_required_version = 1;
3731public:
3732 class Visitor {
3733 public:
3734 virtual void append(uint64_t old_offset) {}
9f95a23c 3735 virtual void setattrs(std::map<std::string, std::optional<ceph::buffer::list>> &attrs) {}
7c673cae
FG
3736 virtual void rmobject(version_t old_version) {}
3737 /**
3738 * Used to support the unfound_lost_delete log event: if the stashed
3739 * version exists, we unstash it, otherwise, we do nothing. This way
3740 * each replica rolls back to whatever state it had prior to the attempt
3741 * at mark unfound lost delete
3742 */
3743 virtual void try_rmobject(version_t old_version) {
3744 rmobject(old_version);
3745 }
3746 virtual void create() {}
9f95a23c 3747 virtual void update_snaps(const std::set<snapid_t> &old_snaps) {}
7c673cae
FG
3748 virtual void rollback_extents(
3749 version_t gen,
9f95a23c 3750 const std::vector<std::pair<uint64_t, uint64_t> > &extents) {}
7c673cae
FG
3751 virtual ~Visitor() {}
3752 };
3753 void visit(Visitor *visitor) const;
9f95a23c 3754 mutable ceph::buffer::list bl;
7c673cae
FG
3755 enum ModID {
3756 APPEND = 1,
3757 SETATTRS = 2,
3758 DELETE = 3,
3759 CREATE = 4,
3760 UPDATE_SNAPS = 5,
3761 TRY_DELETE = 6,
3762 ROLLBACK_EXTENTS = 7
3763 };
31f18b77
FG
3764 ObjectModDesc() : can_local_rollback(true), rollback_info_completed(false) {
3765 bl.reassign_to_mempool(mempool::mempool_osd_pglog);
3766 }
7c673cae
FG
3767 void claim(ObjectModDesc &other) {
3768 bl.clear();
3769 bl.claim(other.bl);
3770 can_local_rollback = other.can_local_rollback;
3771 rollback_info_completed = other.rollback_info_completed;
3772 }
3773 void claim_append(ObjectModDesc &other) {
3774 if (!can_local_rollback || rollback_info_completed)
3775 return;
3776 if (!other.can_local_rollback) {
3777 mark_unrollbackable();
3778 return;
3779 }
3780 bl.claim_append(other.bl);
3781 rollback_info_completed = other.rollback_info_completed;
3782 }
3783 void swap(ObjectModDesc &other) {
3784 bl.swap(other.bl);
3785
31f18b77
FG
3786 using std::swap;
3787 swap(other.can_local_rollback, can_local_rollback);
3788 swap(other.rollback_info_completed, rollback_info_completed);
3789 swap(other.max_required_version, max_required_version);
7c673cae
FG
3790 }
3791 void append_id(ModID id) {
11fdf7f2 3792 using ceph::encode;
7c673cae 3793 uint8_t _id(id);
11fdf7f2 3794 encode(_id, bl);
7c673cae
FG
3795 }
3796 void append(uint64_t old_size) {
3797 if (!can_local_rollback || rollback_info_completed)
3798 return;
3799 ENCODE_START(1, 1, bl);
3800 append_id(APPEND);
11fdf7f2 3801 encode(old_size, bl);
7c673cae
FG
3802 ENCODE_FINISH(bl);
3803 }
9f95a23c 3804 void setattrs(std::map<std::string, std::optional<ceph::buffer::list>> &old_attrs) {
7c673cae
FG
3805 if (!can_local_rollback || rollback_info_completed)
3806 return;
3807 ENCODE_START(1, 1, bl);
3808 append_id(SETATTRS);
11fdf7f2 3809 encode(old_attrs, bl);
7c673cae
FG
3810 ENCODE_FINISH(bl);
3811 }
3812 bool rmobject(version_t deletion_version) {
3813 if (!can_local_rollback || rollback_info_completed)
3814 return false;
3815 ENCODE_START(1, 1, bl);
3816 append_id(DELETE);
11fdf7f2 3817 encode(deletion_version, bl);
7c673cae
FG
3818 ENCODE_FINISH(bl);
3819 rollback_info_completed = true;
3820 return true;
3821 }
3822 bool try_rmobject(version_t deletion_version) {
3823 if (!can_local_rollback || rollback_info_completed)
3824 return false;
3825 ENCODE_START(1, 1, bl);
3826 append_id(TRY_DELETE);
11fdf7f2 3827 encode(deletion_version, bl);
7c673cae
FG
3828 ENCODE_FINISH(bl);
3829 rollback_info_completed = true;
3830 return true;
3831 }
3832 void create() {
3833 if (!can_local_rollback || rollback_info_completed)
3834 return;
3835 rollback_info_completed = true;
3836 ENCODE_START(1, 1, bl);
3837 append_id(CREATE);
3838 ENCODE_FINISH(bl);
3839 }
9f95a23c 3840 void update_snaps(const std::set<snapid_t> &old_snaps) {
7c673cae
FG
3841 if (!can_local_rollback || rollback_info_completed)
3842 return;
3843 ENCODE_START(1, 1, bl);
3844 append_id(UPDATE_SNAPS);
11fdf7f2 3845 encode(old_snaps, bl);
7c673cae
FG
3846 ENCODE_FINISH(bl);
3847 }
3848 void rollback_extents(
9f95a23c 3849 version_t gen, const std::vector<std::pair<uint64_t, uint64_t> > &extents) {
11fdf7f2
TL
3850 ceph_assert(can_local_rollback);
3851 ceph_assert(!rollback_info_completed);
7c673cae
FG
3852 if (max_required_version < 2)
3853 max_required_version = 2;
3854 ENCODE_START(2, 2, bl);
3855 append_id(ROLLBACK_EXTENTS);
11fdf7f2
TL
3856 encode(gen, bl);
3857 encode(extents, bl);
7c673cae
FG
3858 ENCODE_FINISH(bl);
3859 }
3860
3861 // cannot be rolled back
3862 void mark_unrollbackable() {
3863 can_local_rollback = false;
3864 bl.clear();
3865 }
3866 bool can_rollback() const {
3867 return can_local_rollback;
3868 }
3869 bool empty() const {
3870 return can_local_rollback && (bl.length() == 0);
3871 }
3872
3873 bool requires_kraken() const {
3874 return max_required_version >= 2;
3875 }
3876
3877 /**
3878 * Create fresh copy of bl bytes to avoid keeping large buffers around
3879 * in the case that bl contains ptrs which point into a much larger
3880 * message buffer
3881 */
31f18b77 3882 void trim_bl() const {
7c673cae
FG
3883 if (bl.length() > 0)
3884 bl.rebuild();
3885 }
9f95a23c
TL
3886 void encode(ceph::buffer::list &bl) const;
3887 void decode(ceph::buffer::list::const_iterator &bl);
3888 void dump(ceph::Formatter *f) const;
3889 static void generate_test_instances(std::list<ObjectModDesc*>& o);
3890};
3891WRITE_CLASS_ENCODER(ObjectModDesc)
3892
3893class ObjectCleanRegions {
3894private:
3895 bool new_object;
3896 bool clean_omap;
3897 interval_set<uint64_t> clean_offsets;
3898 static std::atomic<uint32_t> max_num_intervals;
3899
3900 /**
3901 * trim the number of intervals if clean_offsets.num_intervals()
3902 * exceeds the given upbound max_num_intervals
3903 * etc. max_num_intervals=2, clean_offsets:{[5~10], [20~5]}
3904 * then new interval [30~10] will evict out the shortest one [20~5]
3905 * finally, clean_offsets becomes {[5~10], [30~10]}
3906 */
3907 void trim();
3908 friend ostream& operator<<(ostream& out, const ObjectCleanRegions& ocr);
3909public:
3910 ObjectCleanRegions() : new_object(false), clean_omap(true) {
3911 clean_offsets.insert(0, (uint64_t)-1);
3912 }
3913 ObjectCleanRegions(uint64_t offset, uint64_t len, bool co)
3914 : new_object(false), clean_omap(co) {
3915 clean_offsets.insert(offset, len);
3916 }
3917 bool operator==(const ObjectCleanRegions &orc) const {
3918 return new_object == orc.new_object && clean_omap == orc.clean_omap && clean_offsets == orc.clean_offsets;
3919 }
3920 static void set_max_num_intervals(uint32_t num);
3921 void merge(const ObjectCleanRegions &other);
3922 void mark_data_region_dirty(uint64_t offset, uint64_t len);
3923 void mark_omap_dirty();
3924 void mark_object_new();
3925 void mark_fully_dirty();
3926 interval_set<uint64_t> get_dirty_regions() const;
3927 bool omap_is_dirty() const;
3928 bool object_is_exist() const;
3929
7c673cae 3930 void encode(bufferlist &bl) const;
11fdf7f2 3931 void decode(bufferlist::const_iterator &bl);
7c673cae 3932 void dump(Formatter *f) const;
9f95a23c 3933 static void generate_test_instances(list<ObjectCleanRegions*>& o);
7c673cae 3934};
9f95a23c
TL
3935WRITE_CLASS_ENCODER(ObjectCleanRegions)
3936ostream& operator<<(ostream& out, const ObjectCleanRegions& ocr);
3937
3938
3939struct OSDOp {
3940 ceph_osd_op op;
3941 sobject_t soid;
3942
3943 ceph::buffer::list indata, outdata;
3944 errorcode32_t rval = 0;
3945
3946 OSDOp() {
3947 // FIPS zeroization audit 20191115: this memset clean for security
3948 memset(&op, 0, sizeof(ceph_osd_op));
3949 }
3950
3951 OSDOp(const int op_code) {
3952 // FIPS zeroization audit 20191115: this memset clean for security
3953 memset(&op, 0, sizeof(ceph_osd_op));
3954 op.op = op_code;
3955 }
3956
3957 /**
3958 * split a ceph::buffer::list into constituent indata members of a vector of OSDOps
3959 *
3960 * @param ops [out] vector of OSDOps
3961 * @param in [in] combined data buffer
3962 */
3963 static void split_osd_op_vector_in_data(std::vector<OSDOp>& ops, ceph::buffer::list& in);
3964
3965 /**
3966 * merge indata members of a vector of OSDOp into a single ceph::buffer::list
3967 *
3968 * Notably this also encodes certain other OSDOp data into the data
3969 * buffer, including the sobject_t soid.
3970 *
3971 * @param ops [in] vector of OSDOps
3972 * @param out [out] combined data buffer
3973 */
3974 static void merge_osd_op_vector_in_data(std::vector<OSDOp>& ops, ceph::buffer::list& out);
3975
3976 /**
3977 * split a ceph::buffer::list into constituent outdata members of a vector of OSDOps
3978 *
3979 * @param ops [out] vector of OSDOps
3980 * @param in [in] combined data buffer
3981 */
3982 static void split_osd_op_vector_out_data(std::vector<OSDOp>& ops, ceph::buffer::list& in);
3983
3984 /**
3985 * merge outdata members of a vector of OSDOps into a single ceph::buffer::list
3986 *
3987 * @param ops [in] vector of OSDOps
3988 * @param out [out] combined data buffer
3989 */
3990 static void merge_osd_op_vector_out_data(std::vector<OSDOp>& ops, ceph::buffer::list& out);
3991
3992 /**
3993 * Clear data as much as possible, leave minimal data for historical op dump
3994 *
3995 * @param ops [in] vector of OSDOps
3996 */
3997 static void clear_data(std::vector<OSDOp>& ops);
3998};
3999std::ostream& operator<<(std::ostream& out, const OSDOp& op);
7c673cae
FG
4000
4001
9f95a23c
TL
4002struct pg_log_op_return_item_t {
4003 int32_t rval;
4004 bufferlist bl;
4005 void encode(bufferlist& p) const {
4006 using ceph::encode;
4007 encode(rval, p);
4008 encode(bl, p);
4009 }
4010 void decode(bufferlist::const_iterator& p) {
4011 using ceph::decode;
4012 decode(rval, p);
4013 decode(bl, p);
4014 }
4015 void dump(Formatter *f) const {
4016 f->dump_int("rval", rval);
4017 f->dump_unsigned("bl_length", bl.length());
4018 }
4019 friend bool operator==(const pg_log_op_return_item_t& lhs,
4020 const pg_log_op_return_item_t& rhs) {
4021 return lhs.rval == rhs.rval &&
4022 lhs.bl.contents_equal(rhs.bl);
4023 }
4024 friend bool operator!=(const pg_log_op_return_item_t& lhs,
4025 const pg_log_op_return_item_t& rhs) {
4026 return !(lhs == rhs);
4027 }
4028 friend ostream& operator<<(ostream& out, const pg_log_op_return_item_t& i) {
4029 return out << "r=" << i.rval << "+" << i.bl.length() << "b";
4030 }
4031};
4032WRITE_CLASS_ENCODER(pg_log_op_return_item_t)
4033
7c673cae
FG
4034/**
4035 * pg_log_entry_t - single entry/event in pg log
4036 *
4037 */
4038struct pg_log_entry_t {
4039 enum {
4040 MODIFY = 1, // some unspecified modification (but not *all* modifications)
4041 CLONE = 2, // cloned object from head
4042 DELETE = 3, // deleted object
11fdf7f2 4043 //BACKLOG = 4, // event invented by generate_backlog [obsolete]
7c673cae
FG
4044 LOST_REVERT = 5, // lost new version, revert to an older version.
4045 LOST_DELETE = 6, // lost new version, revert to no object (deleted).
4046 LOST_MARK = 7, // lost new version, now EIO
4047 PROMOTE = 8, // promoted object from another tier
4048 CLEAN = 9, // mark an object clean
4049 ERROR = 10, // write that returned an error
4050 };
4051 static const char *get_op_name(int op) {
4052 switch (op) {
4053 case MODIFY:
4054 return "modify";
4055 case PROMOTE:
4056 return "promote";
4057 case CLONE:
4058 return "clone";
4059 case DELETE:
4060 return "delete";
7c673cae
FG
4061 case LOST_REVERT:
4062 return "l_revert";
4063 case LOST_DELETE:
4064 return "l_delete";
4065 case LOST_MARK:
4066 return "l_mark";
4067 case CLEAN:
4068 return "clean";
4069 case ERROR:
4070 return "error";
4071 default:
4072 return "unknown";
4073 }
4074 }
4075 const char *get_op_name() const {
4076 return get_op_name(op);
4077 }
4078
4079 // describes state for a locally-rollbackable entry
4080 ObjectModDesc mod_desc;
9f95a23c 4081 ceph::buffer::list snaps; // only for clone entries
7c673cae
FG
4082 hobject_t soid;
4083 osd_reqid_t reqid; // caller+tid to uniquely identify request
9f95a23c 4084 mempool::osd_pglog::vector<std::pair<osd_reqid_t, version_t> > extra_reqids;
11fdf7f2
TL
4085
4086 /// map extra_reqids by index to error return code (if any)
4087 mempool::osd_pglog::map<uint32_t, int> extra_reqid_return_codes;
4088
7c673cae
FG
4089 eversion_t version, prior_version, reverting_to;
4090 version_t user_version; // the user version for this entry
4091 utime_t mtime; // this is the _user_ mtime, mind you
4092 int32_t return_code; // only stored for ERRORs for dup detection
4093
9f95a23c
TL
4094 vector<pg_log_op_return_item_t> op_returns;
4095
7c673cae
FG
4096 __s32 op;
4097 bool invalid_hash; // only when decoding sobject_t based entries
4098 bool invalid_pool; // only when decoding pool-less hobject based entries
9f95a23c 4099 ObjectCleanRegions clean_regions;
7c673cae
FG
4100
4101 pg_log_entry_t()
4102 : user_version(0), return_code(0), op(0),
31f18b77
FG
4103 invalid_hash(false), invalid_pool(false) {
4104 snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
4105 }
7c673cae
FG
4106 pg_log_entry_t(int _op, const hobject_t& _soid,
4107 const eversion_t& v, const eversion_t& pv,
4108 version_t uv,
4109 const osd_reqid_t& rid, const utime_t& mt,
4110 int return_code)
4111 : soid(_soid), reqid(rid), version(v), prior_version(pv), user_version(uv),
4112 mtime(mt), return_code(return_code), op(_op),
31f18b77
FG
4113 invalid_hash(false), invalid_pool(false) {
4114 snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
4115 }
7c673cae
FG
4116
4117 bool is_clone() const { return op == CLONE; }
4118 bool is_modify() const { return op == MODIFY; }
4119 bool is_promote() const { return op == PROMOTE; }
4120 bool is_clean() const { return op == CLEAN; }
7c673cae
FG
4121 bool is_lost_revert() const { return op == LOST_REVERT; }
4122 bool is_lost_delete() const { return op == LOST_DELETE; }
4123 bool is_lost_mark() const { return op == LOST_MARK; }
4124 bool is_error() const { return op == ERROR; }
4125
4126 bool is_update() const {
4127 return
4128 is_clone() || is_modify() || is_promote() || is_clean() ||
11fdf7f2 4129 is_lost_revert() || is_lost_mark();
7c673cae
FG
4130 }
4131 bool is_delete() const {
4132 return op == DELETE || op == LOST_DELETE;
4133 }
4134
4135 bool can_rollback() const {
4136 return mod_desc.can_rollback();
4137 }
4138
4139 void mark_unrollbackable() {
4140 mod_desc.mark_unrollbackable();
4141 }
4142
4143 bool requires_kraken() const {
4144 return mod_desc.requires_kraken();
4145 }
4146
4147 // Errors are only used for dup detection, whereas
4148 // the index by objects is used by recovery, copy_get,
4149 // and other facilities that don't expect or need to
4150 // be aware of error entries.
4151 bool object_is_indexed() const {
4152 return !is_error();
4153 }
4154
4155 bool reqid_is_indexed() const {
4156 return reqid != osd_reqid_t() &&
4157 (op == MODIFY || op == DELETE || op == ERROR);
4158 }
4159
9f95a23c
TL
4160 void set_op_returns(std::vector<OSDOp>& ops) {
4161 op_returns.resize(ops.size());
4162 for (unsigned i = 0; i < ops.size(); ++i) {
4163 op_returns[i].rval = ops[i].rval;
4164 op_returns[i].bl = ops[i].outdata;
4165 }
4166 }
7c673cae 4167
9f95a23c
TL
4168 std::string get_key_name() const;
4169 void encode_with_checksum(ceph::buffer::list& bl) const;
4170 void decode_with_checksum(ceph::buffer::list::const_iterator& p);
4171
4172 void encode(ceph::buffer::list &bl) const;
4173 void decode(ceph::buffer::list::const_iterator &bl);
4174 void dump(ceph::Formatter *f) const;
4175 static void generate_test_instances(std::list<pg_log_entry_t*>& o);
7c673cae
FG
4176
4177};
4178WRITE_CLASS_ENCODER(pg_log_entry_t)
4179
9f95a23c 4180std::ostream& operator<<(std::ostream& out, const pg_log_entry_t& e);
7c673cae 4181
c07f9fc5
FG
4182struct pg_log_dup_t {
4183 osd_reqid_t reqid; // caller+tid to uniquely identify request
4184 eversion_t version;
4185 version_t user_version; // the user version for this entry
4186 int32_t return_code; // only stored for ERRORs for dup detection
7c673cae 4187
9f95a23c
TL
4188 vector<pg_log_op_return_item_t> op_returns;
4189
c07f9fc5
FG
4190 pg_log_dup_t()
4191 : user_version(0), return_code(0)
4192 {}
4193 explicit pg_log_dup_t(const pg_log_entry_t& entry)
4194 : reqid(entry.reqid), version(entry.version),
9f95a23c
TL
4195 user_version(entry.user_version),
4196 return_code(entry.return_code),
4197 op_returns(entry.op_returns)
c07f9fc5
FG
4198 {}
4199 pg_log_dup_t(const eversion_t& v, version_t uv,
4200 const osd_reqid_t& rid, int return_code)
4201 : reqid(rid), version(v), user_version(uv),
4202 return_code(return_code)
4203 {}
4204
9f95a23c
TL
4205 std::string get_key_name() const;
4206 void encode(ceph::buffer::list &bl) const;
4207 void decode(ceph::buffer::list::const_iterator &bl);
4208 void dump(ceph::Formatter *f) const;
4209 static void generate_test_instances(std::list<pg_log_dup_t*>& o);
c07f9fc5 4210
181888fb
FG
4211 bool operator==(const pg_log_dup_t &rhs) const {
4212 return reqid == rhs.reqid &&
4213 version == rhs.version &&
4214 user_version == rhs.user_version &&
9f95a23c
TL
4215 return_code == rhs.return_code &&
4216 op_returns == rhs.op_returns;
181888fb
FG
4217 }
4218 bool operator!=(const pg_log_dup_t &rhs) const {
4219 return !(*this == rhs);
4220 }
4221
c07f9fc5
FG
4222 friend std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e);
4223};
4224WRITE_CLASS_ENCODER(pg_log_dup_t)
4225
4226std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e);
7c673cae
FG
4227
4228/**
4229 * pg_log_t - incremental log of recent pg changes.
4230 *
4231 * serves as a recovery queue for recent changes.
4232 */
4233struct pg_log_t {
4234 /*
4235 * head - newest entry (update|delete)
4236 * tail - entry previous to oldest (update|delete) for which we have
4237 * complete negative information.
4238 * i.e. we can infer pg contents for any store whose last_update >= tail.
4239 */
4240 eversion_t head; // newest entry
4241 eversion_t tail; // version prior to oldest
4242
4243protected:
4244 // We can rollback rollback-able entries > can_rollback_to
4245 eversion_t can_rollback_to;
4246
4247 // always <= can_rollback_to, indicates how far stashed rollback
4248 // data can be found
4249 eversion_t rollback_info_trimmed_to;
4250
4251public:
c07f9fc5
FG
4252 // the actual log
4253 mempool::osd_pglog::list<pg_log_entry_t> log;
4254
4255 // entries just for dup op detection ordered oldest to newest
4256 mempool::osd_pglog::list<pg_log_dup_t> dups;
4257
7c673cae
FG
4258 pg_log_t() = default;
4259 pg_log_t(const eversion_t &last_update,
4260 const eversion_t &log_tail,
4261 const eversion_t &can_rollback_to,
4262 const eversion_t &rollback_info_trimmed_to,
c07f9fc5
FG
4263 mempool::osd_pglog::list<pg_log_entry_t> &&entries,
4264 mempool::osd_pglog::list<pg_log_dup_t> &&dup_entries)
7c673cae
FG
4265 : head(last_update), tail(log_tail), can_rollback_to(can_rollback_to),
4266 rollback_info_trimmed_to(rollback_info_trimmed_to),
c07f9fc5 4267 log(std::move(entries)), dups(std::move(dup_entries)) {}
7c673cae
FG
4268 pg_log_t(const eversion_t &last_update,
4269 const eversion_t &log_tail,
4270 const eversion_t &can_rollback_to,
4271 const eversion_t &rollback_info_trimmed_to,
c07f9fc5
FG
4272 const std::list<pg_log_entry_t> &entries,
4273 const std::list<pg_log_dup_t> &dup_entries)
7c673cae
FG
4274 : head(last_update), tail(log_tail), can_rollback_to(can_rollback_to),
4275 rollback_info_trimmed_to(rollback_info_trimmed_to) {
4276 for (auto &&entry: entries) {
4277 log.push_back(entry);
4278 }
c07f9fc5
FG
4279 for (auto &&entry: dup_entries) {
4280 dups.push_back(entry);
4281 }
7c673cae
FG
4282 }
4283
4284 void clear() {
4285 eversion_t z;
4286 rollback_info_trimmed_to = can_rollback_to = head = tail = z;
4287 log.clear();
c07f9fc5 4288 dups.clear();
7c673cae
FG
4289 }
4290
4291 eversion_t get_rollback_info_trimmed_to() const {
4292 return rollback_info_trimmed_to;
4293 }
4294 eversion_t get_can_rollback_to() const {
4295 return can_rollback_to;
4296 }
4297
4298
4299 pg_log_t split_out_child(pg_t child_pgid, unsigned split_bits) {
31f18b77 4300 mempool::osd_pglog::list<pg_log_entry_t> oldlog, childlog;
7c673cae
FG
4301 oldlog.swap(log);
4302
4303 eversion_t old_tail;
4304 unsigned mask = ~((~0)<<split_bits);
4305 for (auto i = oldlog.begin();
4306 i != oldlog.end();
4307 ) {
4308 if ((i->soid.get_hash() & mask) == child_pgid.m_seed) {
4309 childlog.push_back(*i);
4310 } else {
4311 log.push_back(*i);
4312 }
4313 oldlog.erase(i++);
4314 }
4315
c07f9fc5
FG
4316 // osd_reqid is unique, so it doesn't matter if there are extra
4317 // dup entries in each pg. To avoid storing oid with the dup
4318 // entries, just copy the whole list.
4319 auto childdups(dups);
4320
7c673cae
FG
4321 return pg_log_t(
4322 head,
4323 tail,
4324 can_rollback_to,
4325 rollback_info_trimmed_to,
c07f9fc5
FG
4326 std::move(childlog),
4327 std::move(childdups));
4328 }
7c673cae 4329
31f18b77 4330 mempool::osd_pglog::list<pg_log_entry_t> rewind_from_head(eversion_t newhead) {
11fdf7f2 4331 ceph_assert(newhead >= tail);
7c673cae 4332
31f18b77
FG
4333 mempool::osd_pglog::list<pg_log_entry_t>::iterator p = log.end();
4334 mempool::osd_pglog::list<pg_log_entry_t> divergent;
7c673cae
FG
4335 while (true) {
4336 if (p == log.begin()) {
4337 // yikes, the whole thing is divergent!
31f18b77
FG
4338 using std::swap;
4339 swap(divergent, log);
7c673cae
FG
4340 break;
4341 }
4342 --p;
4343 if (p->version.version <= newhead.version) {
4344 /*
4345 * look at eversion.version here. we want to avoid a situation like:
4346 * our log: 100'10 (0'0) m 10000004d3a.00000000/head by client4225.1:18529
4347 * new log: 122'10 (0'0) m 10000004d3a.00000000/head by client4225.1:18529
4348 * lower_bound = 100'9
4349 * i.e, same request, different version. If the eversion.version is > the
4350 * lower_bound, we it is divergent.
4351 */
4352 ++p;
4353 divergent.splice(divergent.begin(), log, p, log.end());
4354 break;
4355 }
11fdf7f2 4356 ceph_assert(p->version > newhead);
7c673cae
FG
4357 }
4358 head = newhead;
4359
4360 if (can_rollback_to > newhead)
4361 can_rollback_to = newhead;
4362
4363 if (rollback_info_trimmed_to > newhead)
4364 rollback_info_trimmed_to = newhead;
4365
4366 return divergent;
4367 }
4368
9f95a23c 4369 void merge_from(const std::vector<pg_log_t*>& slogs, eversion_t last_update) {
11fdf7f2
TL
4370 log.clear();
4371
4372 // sort and merge dups
9f95a23c 4373 std::multimap<eversion_t,pg_log_dup_t> sorted;
11fdf7f2
TL
4374 for (auto& d : dups) {
4375 sorted.emplace(d.version, d);
4376 }
4377 for (auto l : slogs) {
4378 for (auto& d : l->dups) {
4379 sorted.emplace(d.version, d);
4380 }
4381 }
4382 dups.clear();
4383 for (auto& i : sorted) {
4384 dups.push_back(i.second);
4385 }
4386
4387 head = last_update;
4388 tail = last_update;
4389 can_rollback_to = last_update;
4390 rollback_info_trimmed_to = last_update;
4391 }
4392
7c673cae
FG
4393 bool empty() const {
4394 return log.empty();
4395 }
4396
4397 bool null() const {
4398 return head.version == 0 && head.epoch == 0;
4399 }
4400
9f95a23c 4401 uint64_t approx_size() const {
7c673cae
FG
4402 return head.version - tail.version;
4403 }
4404
4405 static void filter_log(spg_t import_pgid, const OSDMap &curmap,
9f95a23c 4406 const std::string &hit_set_namespace, const pg_log_t &in,
7c673cae
FG
4407 pg_log_t &out, pg_log_t &reject);
4408
4409 /**
4410 * copy entries from the tail of another pg_log_t
4411 *
4412 * @param other pg_log_t to copy from
4413 * @param from copy entries after this version
4414 */
81eedcae 4415 void copy_after(CephContext* cct, const pg_log_t &other, eversion_t from);
7c673cae
FG
4416
4417 /**
4418 * copy up to N entries
4419 *
4420 * @param other source log
4421 * @param max max number of entries to copy
4422 */
81eedcae 4423 void copy_up_to(CephContext* cct, const pg_log_t &other, int max);
7c673cae 4424
9f95a23c 4425 std::ostream& print(std::ostream& out) const;
7c673cae 4426
9f95a23c
TL
4427 void encode(ceph::buffer::list &bl) const;
4428 void decode(ceph::buffer::list::const_iterator &bl, int64_t pool = -1);
4429 void dump(ceph::Formatter *f) const;
4430 static void generate_test_instances(std::list<pg_log_t*>& o);
7c673cae
FG
4431};
4432WRITE_CLASS_ENCODER(pg_log_t)
4433
9f95a23c 4434inline std::ostream& operator<<(std::ostream& out, const pg_log_t& log)
7c673cae
FG
4435{
4436 out << "log((" << log.tail << "," << log.head << "], crt="
4437 << log.get_can_rollback_to() << ")";
4438 return out;
4439}
4440
4441
4442/**
4443 * pg_missing_t - summary of missing objects.
4444 *
4445 * kept in memory, as a supplement to pg_log_t
4446 * also used to pass missing info in messages.
4447 */
4448struct pg_missing_item {
4449 eversion_t need, have;
9f95a23c 4450 ObjectCleanRegions clean_regions;
c07f9fc5
FG
4451 enum missing_flags_t {
4452 FLAG_NONE = 0,
4453 FLAG_DELETE = 1,
4454 } flags;
4455 pg_missing_item() : flags(FLAG_NONE) {}
4456 explicit pg_missing_item(eversion_t n) : need(n), flags(FLAG_NONE) {} // have no old version
9f95a23c
TL
4457 pg_missing_item(eversion_t n, eversion_t h, bool is_delete=false, bool old_style = false) :
4458 need(n), have(h) {
c07f9fc5 4459 set_delete(is_delete);
9f95a23c
TL
4460 if (old_style)
4461 clean_regions.mark_fully_dirty();
c07f9fc5
FG
4462 }
4463
9f95a23c 4464 void encode(ceph::buffer::list& bl, uint64_t features) const {
11fdf7f2 4465 using ceph::encode;
9f95a23c
TL
4466 if (HAVE_FEATURE(features, SERVER_OCTOPUS)) {
4467 // encoding a zeroed eversion_t to differentiate between OSD_RECOVERY_DELETES、
4468 // SERVER_OCTOPUS and legacy unversioned encoding - a need value of 0'0 is not
c07f9fc5 4469 // possible. This can be replaced with the legacy encoding
9f95a23c
TL
4470 encode(eversion_t(), bl);
4471 encode(eversion_t(-1, -1), bl);
11fdf7f2 4472 encode(need, bl);
9f95a23c 4473 encode(have, bl);
11fdf7f2 4474 encode(static_cast<uint8_t>(flags), bl);
9f95a23c 4475 encode(clean_regions, bl);
c07f9fc5 4476 } else {
9f95a23c 4477 encode(eversion_t(), bl);
11fdf7f2
TL
4478 encode(need, bl);
4479 encode(have, bl);
9f95a23c 4480 encode(static_cast<uint8_t>(flags), bl);
c07f9fc5 4481 }
7c673cae 4482 }
9f95a23c 4483 void decode(ceph::buffer::list::const_iterator& bl) {
11fdf7f2 4484 using ceph::decode;
9f95a23c 4485 eversion_t e, l;
11fdf7f2 4486 decode(e, bl);
9f95a23c
TL
4487 decode(l, bl);
4488 if(l == eversion_t(-1, -1)) {
4489 // support all
11fdf7f2
TL
4490 decode(need, bl);
4491 decode(have, bl);
c07f9fc5 4492 uint8_t f;
11fdf7f2 4493 decode(f, bl);
c07f9fc5 4494 flags = static_cast<missing_flags_t>(f);
9f95a23c
TL
4495 decode(clean_regions, bl);
4496 } else {
4497 // support OSD_RECOVERY_DELETES
4498 need = l;
4499 decode(have, bl);
4500 uint8_t f;
4501 decode(f, bl);
4502 flags = static_cast<missing_flags_t>(f);
4503 clean_regions.mark_fully_dirty();
c07f9fc5
FG
4504 }
4505 }
4506
4507 void set_delete(bool is_delete) {
4508 flags = is_delete ? FLAG_DELETE : FLAG_NONE;
4509 }
4510
4511 bool is_delete() const {
4512 return (flags & FLAG_DELETE) == FLAG_DELETE;
4513 }
4514
9f95a23c 4515 std::string flag_str() const {
c07f9fc5
FG
4516 if (flags == FLAG_NONE) {
4517 return "none";
4518 } else {
4519 return "delete";
4520 }
7c673cae 4521 }
c07f9fc5 4522
9f95a23c 4523 void dump(ceph::Formatter *f) const {
7c673cae
FG
4524 f->dump_stream("need") << need;
4525 f->dump_stream("have") << have;
c07f9fc5 4526 f->dump_stream("flags") << flag_str();
9f95a23c 4527 f->dump_stream("clean_regions") << clean_regions;
7c673cae 4528 }
9f95a23c 4529 static void generate_test_instances(std::list<pg_missing_item*>& o) {
7c673cae
FG
4530 o.push_back(new pg_missing_item);
4531 o.push_back(new pg_missing_item);
4532 o.back()->need = eversion_t(1, 2);
4533 o.back()->have = eversion_t(1, 1);
c07f9fc5
FG
4534 o.push_back(new pg_missing_item);
4535 o.back()->need = eversion_t(3, 5);
4536 o.back()->have = eversion_t(3, 4);
9f95a23c
TL
4537 o.back()->clean_regions.mark_data_region_dirty(4096, 8192);
4538 o.back()->clean_regions.mark_omap_dirty();
c07f9fc5 4539 o.back()->flags = FLAG_DELETE;
7c673cae
FG
4540 }
4541 bool operator==(const pg_missing_item &rhs) const {
c07f9fc5 4542 return need == rhs.need && have == rhs.have && flags == rhs.flags;
7c673cae
FG
4543 }
4544 bool operator!=(const pg_missing_item &rhs) const {
4545 return !(*this == rhs);
4546 }
4547};
c07f9fc5 4548WRITE_CLASS_ENCODER_FEATURES(pg_missing_item)
9f95a23c 4549std::ostream& operator<<(std::ostream& out, const pg_missing_item &item);
7c673cae
FG
4550
4551class pg_missing_const_i {
4552public:
9f95a23c 4553 virtual const std::map<hobject_t, pg_missing_item> &
7c673cae 4554 get_items() const = 0;
9f95a23c 4555 virtual const std::map<version_t, hobject_t> &get_rmissing() const = 0;
c07f9fc5 4556 virtual bool get_may_include_deletes() const = 0;
7c673cae
FG
4557 virtual unsigned int num_missing() const = 0;
4558 virtual bool have_missing() const = 0;
4559 virtual bool is_missing(const hobject_t& oid, pg_missing_item *out = nullptr) const = 0;
4560 virtual bool is_missing(const hobject_t& oid, eversion_t v) const = 0;
7c673cae
FG
4561 virtual ~pg_missing_const_i() {}
4562};
4563
4564
4565template <bool Track>
4566class ChangeTracker {
4567public:
4568 void changed(const hobject_t &obj) {}
4569 template <typename F>
4570 void get_changed(F &&f) const {}
4571 void flush() {}
4572 bool is_clean() const {
4573 return true;
4574 }
4575};
4576template <>
4577class ChangeTracker<true> {
9f95a23c 4578 std::set<hobject_t> _changed;
7c673cae
FG
4579public:
4580 void changed(const hobject_t &obj) {
4581 _changed.insert(obj);
4582 }
4583 template <typename F>
4584 void get_changed(F &&f) const {
4585 for (auto const &i: _changed) {
4586 f(i);
4587 }
4588 }
4589 void flush() {
4590 _changed.clear();
4591 }
4592 bool is_clean() const {
4593 return _changed.empty();
4594 }
4595};
4596
4597template <bool TrackChanges>
4598class pg_missing_set : public pg_missing_const_i {
4599 using item = pg_missing_item;
9f95a23c
TL
4600 std::map<hobject_t, item> missing; // oid -> (need v, have v)
4601 std::map<version_t, hobject_t> rmissing; // v -> oid
7c673cae
FG
4602 ChangeTracker<TrackChanges> tracker;
4603
4604public:
4605 pg_missing_set() = default;
4606
4607 template <typename missing_type>
4608 pg_missing_set(const missing_type &m) {
7c673cae
FG
4609 missing = m.get_items();
4610 rmissing = m.get_rmissing();
c07f9fc5 4611 may_include_deletes = m.get_may_include_deletes();
7c673cae
FG
4612 for (auto &&i: missing)
4613 tracker.changed(i.first);
4614 }
4615
c07f9fc5
FG
4616 bool may_include_deletes = false;
4617
9f95a23c 4618 const std::map<hobject_t, item> &get_items() const override {
7c673cae
FG
4619 return missing;
4620 }
9f95a23c 4621 const std::map<version_t, hobject_t> &get_rmissing() const override {
7c673cae
FG
4622 return rmissing;
4623 }
c07f9fc5
FG
4624 bool get_may_include_deletes() const override {
4625 return may_include_deletes;
4626 }
7c673cae
FG
4627 unsigned int num_missing() const override {
4628 return missing.size();
4629 }
4630 bool have_missing() const override {
4631 return !missing.empty();
4632 }
9f95a23c
TL
4633 void merge(const pg_log_entry_t& e) {
4634 auto miter = missing.find(e.soid);
4635 if (miter != missing.end() && miter->second.have != eversion_t() && e.version > miter->second.have)
4636 miter->second.clean_regions.merge(e.clean_regions);
4637 }
7c673cae
FG
4638 bool is_missing(const hobject_t& oid, pg_missing_item *out = nullptr) const override {
4639 auto iter = missing.find(oid);
4640 if (iter == missing.end())
4641 return false;
4642 if (out)
4643 *out = iter->second;
4644 return true;
4645 }
4646 bool is_missing(const hobject_t& oid, eversion_t v) const override {
9f95a23c 4647 std::map<hobject_t, item>::const_iterator m =
7c673cae
FG
4648 missing.find(oid);
4649 if (m == missing.end())
4650 return false;
4651 const item &item(m->second);
4652 if (item.need > v)
4653 return false;
4654 return true;
4655 }
11fdf7f2
TL
4656 eversion_t get_oldest_need() const {
4657 if (missing.empty()) {
7c673cae 4658 return eversion_t();
11fdf7f2
TL
4659 }
4660 auto it = missing.find(rmissing.begin()->second);
4661 ceph_assert(it != missing.end());
4662 return it->second.need;
7c673cae
FG
4663 }
4664
4665 void claim(pg_missing_set& o) {
4666 static_assert(!TrackChanges, "Can't use claim with TrackChanges");
4667 missing.swap(o.missing);
4668 rmissing.swap(o.rmissing);
4669 }
4670
4671 /*
4672 * this needs to be called in log order as we extend the log. it
4673 * assumes missing is accurate up through the previous log entry.
4674 */
4675 void add_next_event(const pg_log_entry_t& e) {
9f95a23c 4676 std::map<hobject_t, item>::iterator missing_it;
c07f9fc5
FG
4677 missing_it = missing.find(e.soid);
4678 bool is_missing_divergent_item = missing_it != missing.end();
4679 if (e.prior_version == eversion_t() || e.is_clone()) {
4680 // new object.
4681 if (is_missing_divergent_item) { // use iterator
9f95a23c
TL
4682 rmissing.erase(missing_it->second.need.version);
4683 // .have = nil
4684 missing_it->second = item(e.version, eversion_t(), e.is_delete());
4685 missing_it->second.clean_regions.mark_fully_dirty();
4686 } else {
4687 // create new element in missing map
4688 // .have = nil
4689 missing[e.soid] = item(e.version, eversion_t(), e.is_delete());
4690 missing[e.soid].clean_regions.mark_fully_dirty();
4691 }
c07f9fc5
FG
4692 } else if (is_missing_divergent_item) {
4693 // already missing (prior).
4694 rmissing.erase((missing_it->second).need.version);
9f95a23c 4695 missing_it->second.need = e.version; // leave .have unchanged.
c07f9fc5 4696 missing_it->second.set_delete(e.is_delete());
9f95a23c
TL
4697 if (e.is_lost_revert())
4698 missing_it->second.clean_regions.mark_fully_dirty();
4699 else
4700 missing_it->second.clean_regions.merge(e.clean_regions);
c07f9fc5
FG
4701 } else {
4702 // not missing, we must have prior_version (if any)
11fdf7f2 4703 ceph_assert(!is_missing_divergent_item);
c07f9fc5 4704 missing[e.soid] = item(e.version, e.prior_version, e.is_delete());
9f95a23c
TL
4705 if (e.is_lost_revert())
4706 missing[e.soid].clean_regions.mark_fully_dirty();
4707 else
4708 missing[e.soid].clean_regions = e.clean_regions;
7c673cae 4709 }
c07f9fc5 4710 rmissing[e.version.version] = e.soid;
7c673cae
FG
4711 tracker.changed(e.soid);
4712 }
4713
c07f9fc5 4714 void revise_need(hobject_t oid, eversion_t need, bool is_delete) {
9f95a23c
TL
4715 auto p = missing.find(oid);
4716 if (p != missing.end()) {
4717 rmissing.erase((p->second).need.version);
4718 p->second.need = need; // do not adjust .have
4719 p->second.set_delete(is_delete);
4720 p->second.clean_regions.mark_fully_dirty();
7c673cae 4721 } else {
c07f9fc5 4722 missing[oid] = item(need, eversion_t(), is_delete);
9f95a23c 4723 missing[oid].clean_regions.mark_fully_dirty();
7c673cae
FG
4724 }
4725 rmissing[need.version] = oid;
4726
4727 tracker.changed(oid);
4728 }
4729
4730 void revise_have(hobject_t oid, eversion_t have) {
9f95a23c
TL
4731 auto p = missing.find(oid);
4732 if (p != missing.end()) {
7c673cae 4733 tracker.changed(oid);
9f95a23c
TL
4734 (p->second).have = have;
4735 }
4736 }
4737
4738 void mark_fully_dirty(const hobject_t& oid) {
4739 auto p = missing.find(oid);
4740 if (p != missing.end()) {
4741 tracker.changed(oid);
4742 (p->second).clean_regions.mark_fully_dirty();
7c673cae
FG
4743 }
4744 }
4745
c07f9fc5
FG
4746 void add(const hobject_t& oid, eversion_t need, eversion_t have,
4747 bool is_delete) {
9f95a23c 4748 missing[oid] = item(need, have, is_delete, true);
7c673cae
FG
4749 rmissing[need.version] = oid;
4750 tracker.changed(oid);
4751 }
4752
9f95a23c
TL
4753 void add(const hobject_t& oid, pg_missing_item&& item) {
4754 rmissing[item.need.version] = oid;
4755 missing.insert({oid, std::move(item)});
4756 tracker.changed(oid);
4757 }
4758
7c673cae
FG
4759 void rm(const hobject_t& oid, eversion_t v) {
4760 std::map<hobject_t, item>::iterator p = missing.find(oid);
4761 if (p != missing.end() && p->second.need <= v)
4762 rm(p);
4763 }
4764
4765 void rm(std::map<hobject_t, item>::const_iterator m) {
4766 tracker.changed(m->first);
4767 rmissing.erase(m->second.need.version);
4768 missing.erase(m);
4769 }
4770
4771 void got(const hobject_t& oid, eversion_t v) {
4772 std::map<hobject_t, item>::iterator p = missing.find(oid);
11fdf7f2
TL
4773 ceph_assert(p != missing.end());
4774 ceph_assert(p->second.need <= v || p->second.is_delete());
7c673cae
FG
4775 got(p);
4776 }
4777
4778 void got(std::map<hobject_t, item>::const_iterator m) {
4779 tracker.changed(m->first);
4780 rmissing.erase(m->second.need.version);
4781 missing.erase(m);
4782 }
4783
4784 void split_into(
4785 pg_t child_pgid,
4786 unsigned split_bits,
4787 pg_missing_set *omissing) {
c07f9fc5 4788 omissing->may_include_deletes = may_include_deletes;
7c673cae 4789 unsigned mask = ~((~0)<<split_bits);
9f95a23c 4790 for (std::map<hobject_t, item>::iterator i = missing.begin();
7c673cae
FG
4791 i != missing.end();
4792 ) {
4793 if ((i->first.get_hash() & mask) == child_pgid.m_seed) {
c07f9fc5
FG
4794 omissing->add(i->first, i->second.need, i->second.have,
4795 i->second.is_delete());
7c673cae
FG
4796 rm(i++);
4797 } else {
4798 ++i;
4799 }
4800 }
4801 }
4802
4803 void clear() {
4804 for (auto const &i: missing)
4805 tracker.changed(i.first);
4806 missing.clear();
4807 rmissing.clear();
4808 }
4809
9f95a23c
TL
4810 void encode(ceph::buffer::list &bl, uint64_t features) const {
4811 ENCODE_START(5, 2, bl)
4812 encode(missing, bl, features);
11fdf7f2 4813 encode(may_include_deletes, bl);
7c673cae
FG
4814 ENCODE_FINISH(bl);
4815 }
9f95a23c 4816 void decode(ceph::buffer::list::const_iterator &bl, int64_t pool = -1) {
7c673cae
FG
4817 for (auto const &i: missing)
4818 tracker.changed(i.first);
9f95a23c 4819 DECODE_START_LEGACY_COMPAT_LEN(5, 2, 2, bl);
11fdf7f2 4820 decode(missing, bl);
c07f9fc5 4821 if (struct_v >= 4) {
11fdf7f2 4822 decode(may_include_deletes, bl);
c07f9fc5 4823 }
7c673cae
FG
4824 DECODE_FINISH(bl);
4825
4826 if (struct_v < 3) {
4827 // Handle hobject_t upgrade
9f95a23c
TL
4828 std::map<hobject_t, item> tmp;
4829 for (std::map<hobject_t, item>::iterator i =
7c673cae
FG
4830 missing.begin();
4831 i != missing.end();
4832 ) {
4833 if (!i->first.is_max() && i->first.pool == -1) {
4834 hobject_t to_insert(i->first);
4835 to_insert.pool = pool;
4836 tmp[to_insert] = i->second;
4837 missing.erase(i++);
4838 } else {
4839 ++i;
4840 }
4841 }
4842 missing.insert(tmp.begin(), tmp.end());
4843 }
4844
9f95a23c 4845 for (std::map<hobject_t,item>::iterator it =
7c673cae
FG
4846 missing.begin();
4847 it != missing.end();
4848 ++it)
4849 rmissing[it->second.need.version] = it->first;
4850 for (auto const &i: missing)
4851 tracker.changed(i.first);
4852 }
9f95a23c 4853 void dump(ceph::Formatter *f) const {
7c673cae 4854 f->open_array_section("missing");
9f95a23c 4855 for (std::map<hobject_t,item>::const_iterator p =
7c673cae
FG
4856 missing.begin(); p != missing.end(); ++p) {
4857 f->open_object_section("item");
4858 f->dump_stream("object") << p->first;
4859 p->second.dump(f);
4860 f->close_section();
4861 }
4862 f->close_section();
c07f9fc5 4863 f->dump_bool("may_include_deletes", may_include_deletes);
7c673cae
FG
4864 }
4865 template <typename F>
4866 void filter_objects(F &&f) {
4867 for (auto i = missing.begin(); i != missing.end();) {
4868 if (f(i->first)) {
4869 rm(i++);
4870 } else {
4871 ++i;
4872 }
4873 }
4874 }
9f95a23c 4875 static void generate_test_instances(std::list<pg_missing_set*>& o) {
7c673cae 4876 o.push_back(new pg_missing_set);
9f95a23c 4877 o.back()->may_include_deletes = true;
7c673cae
FG
4878 o.push_back(new pg_missing_set);
4879 o.back()->add(
4880 hobject_t(object_t("foo"), "foo", 123, 456, 0, ""),
c07f9fc5 4881 eversion_t(5, 6), eversion_t(5, 1), false);
9f95a23c 4882 o.back()->may_include_deletes = true;
c07f9fc5
FG
4883 o.push_back(new pg_missing_set);
4884 o.back()->add(
4885 hobject_t(object_t("foo"), "foo", 123, 456, 0, ""),
4886 eversion_t(5, 6), eversion_t(5, 1), true);
4887 o.back()->may_include_deletes = true;
7c673cae
FG
4888 }
4889 template <typename F>
4890 void get_changed(F &&f) const {
4891 tracker.get_changed(f);
4892 }
4893 void flush() {
4894 tracker.flush();
4895 }
4896 bool is_clean() const {
4897 return tracker.is_clean();
4898 }
4899 template <typename missing_t>
4900 bool debug_verify_from_init(
4901 const missing_t &init_missing,
9f95a23c 4902 std::ostream *oss) const {
7c673cae
FG
4903 if (!TrackChanges)
4904 return true;
4905 auto check_missing(init_missing.get_items());
4906 tracker.get_changed([&](const hobject_t &hoid) {
4907 check_missing.erase(hoid);
4908 if (missing.count(hoid)) {
4909 check_missing.insert(*(missing.find(hoid)));
4910 }
4911 });
4912 bool ok = true;
4913 if (check_missing.size() != missing.size()) {
4914 if (oss) {
4915 *oss << "Size mismatch, check: " << check_missing.size()
4916 << ", actual: " << missing.size() << "\n";
4917 }
4918 ok = false;
4919 }
4920 for (auto &i: missing) {
4921 if (!check_missing.count(i.first)) {
4922 if (oss)
4923 *oss << "check_missing missing " << i.first << "\n";
4924 ok = false;
4925 } else if (check_missing[i.first] != i.second) {
4926 if (oss)
4927 *oss << "check_missing missing item mismatch on " << i.first
4928 << ", check: " << check_missing[i.first]
4929 << ", actual: " << i.second << "\n";
4930 ok = false;
4931 }
4932 }
4933 if (oss && !ok) {
4934 *oss << "check_missing: " << check_missing << "\n";
9f95a23c 4935 std::set<hobject_t> changed;
7c673cae
FG
4936 tracker.get_changed([&](const hobject_t &hoid) { changed.insert(hoid); });
4937 *oss << "changed: " << changed << "\n";
4938 }
4939 return ok;
4940 }
4941};
4942template <bool TrackChanges>
4943void encode(
9f95a23c 4944 const pg_missing_set<TrackChanges> &c, ceph::buffer::list &bl, uint64_t features=0) {
7c673cae 4945 ENCODE_DUMP_PRE();
9f95a23c 4946 c.encode(bl, features);
7c673cae
FG
4947 ENCODE_DUMP_POST(cl);
4948}
4949template <bool TrackChanges>
9f95a23c 4950void decode(pg_missing_set<TrackChanges> &c, ceph::buffer::list::const_iterator &p) {
7c673cae
FG
4951 c.decode(p);
4952}
4953template <bool TrackChanges>
9f95a23c 4954std::ostream& operator<<(std::ostream& out, const pg_missing_set<TrackChanges> &missing)
7c673cae 4955{
c07f9fc5
FG
4956 out << "missing(" << missing.num_missing()
4957 << " may_include_deletes = " << missing.may_include_deletes;
7c673cae
FG
4958 //if (missing.num_lost()) out << ", " << missing.num_lost() << " lost";
4959 out << ")";
4960 return out;
4961}
4962
4963using pg_missing_t = pg_missing_set<false>;
4964using pg_missing_tracker_t = pg_missing_set<true>;
4965
4966
4967/**
4968 * pg list objects response format
4969 *
4970 */
4971struct pg_nls_response_t {
4972 collection_list_handle_t handle;
9f95a23c 4973 std::list<librados::ListObjectImpl> entries;
7c673cae 4974
9f95a23c 4975 void encode(ceph::buffer::list& bl) const {
7c673cae 4976 ENCODE_START(1, 1, bl);
11fdf7f2 4977 encode(handle, bl);
7c673cae 4978 __u32 n = (__u32)entries.size();
11fdf7f2 4979 encode(n, bl);
9f95a23c 4980 for (std::list<librados::ListObjectImpl>::const_iterator i = entries.begin(); i != entries.end(); ++i) {
11fdf7f2
TL
4981 encode(i->nspace, bl);
4982 encode(i->oid, bl);
4983 encode(i->locator, bl);
7c673cae
FG
4984 }
4985 ENCODE_FINISH(bl);
4986 }
9f95a23c 4987 void decode(ceph::buffer::list::const_iterator& bl) {
7c673cae 4988 DECODE_START(1, bl);
11fdf7f2 4989 decode(handle, bl);
7c673cae 4990 __u32 n;
11fdf7f2 4991 decode(n, bl);
7c673cae
FG
4992 entries.clear();
4993 while (n--) {
4994 librados::ListObjectImpl i;
11fdf7f2
TL
4995 decode(i.nspace, bl);
4996 decode(i.oid, bl);
4997 decode(i.locator, bl);
7c673cae
FG
4998 entries.push_back(i);
4999 }
5000 DECODE_FINISH(bl);
5001 }
9f95a23c 5002 void dump(ceph::Formatter *f) const {
7c673cae
FG
5003 f->dump_stream("handle") << handle;
5004 f->open_array_section("entries");
9f95a23c 5005 for (std::list<librados::ListObjectImpl>::const_iterator p = entries.begin(); p != entries.end(); ++p) {
7c673cae
FG
5006 f->open_object_section("object");
5007 f->dump_string("namespace", p->nspace);
5008 f->dump_string("object", p->oid);
5009 f->dump_string("key", p->locator);
5010 f->close_section();
5011 }
5012 f->close_section();
5013 }
9f95a23c 5014 static void generate_test_instances(std::list<pg_nls_response_t*>& o) {
7c673cae
FG
5015 o.push_back(new pg_nls_response_t);
5016 o.push_back(new pg_nls_response_t);
5017 o.back()->handle = hobject_t(object_t("hi"), "key", 1, 2, -1, "");
5018 o.back()->entries.push_back(librados::ListObjectImpl("", "one", ""));
5019 o.back()->entries.push_back(librados::ListObjectImpl("", "two", "twokey"));
5020 o.back()->entries.push_back(librados::ListObjectImpl("", "three", ""));
5021 o.push_back(new pg_nls_response_t);
5022 o.back()->handle = hobject_t(object_t("hi"), "key", 3, 4, -1, "");
5023 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1one", ""));
5024 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1two", "n1twokey"));
5025 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1three", ""));
5026 o.push_back(new pg_nls_response_t);
5027 o.back()->handle = hobject_t(object_t("hi"), "key", 5, 6, -1, "");
5028 o.back()->entries.push_back(librados::ListObjectImpl("", "one", ""));
5029 o.back()->entries.push_back(librados::ListObjectImpl("", "two", "twokey"));
5030 o.back()->entries.push_back(librados::ListObjectImpl("", "three", ""));
5031 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1one", ""));
5032 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1two", "n1twokey"));
5033 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1three", ""));
5034 }
5035};
5036
5037WRITE_CLASS_ENCODER(pg_nls_response_t)
5038
5039// For backwards compatibility with older OSD requests
5040struct pg_ls_response_t {
5041 collection_list_handle_t handle;
9f95a23c 5042 std::list<std::pair<object_t, std::string> > entries;
7c673cae 5043
9f95a23c 5044 void encode(ceph::buffer::list& bl) const {
11fdf7f2 5045 using ceph::encode;
7c673cae 5046 __u8 v = 1;
11fdf7f2
TL
5047 encode(v, bl);
5048 encode(handle, bl);
5049 encode(entries, bl);
7c673cae 5050 }
9f95a23c 5051 void decode(ceph::buffer::list::const_iterator& bl) {
11fdf7f2 5052 using ceph::decode;
7c673cae 5053 __u8 v;
11fdf7f2
TL
5054 decode(v, bl);
5055 ceph_assert(v == 1);
5056 decode(handle, bl);
5057 decode(entries, bl);
7c673cae 5058 }
9f95a23c 5059 void dump(ceph::Formatter *f) const {
7c673cae
FG
5060 f->dump_stream("handle") << handle;
5061 f->open_array_section("entries");
9f95a23c 5062 for (std::list<std::pair<object_t, std::string> >::const_iterator p = entries.begin(); p != entries.end(); ++p) {
7c673cae
FG
5063 f->open_object_section("object");
5064 f->dump_stream("object") << p->first;
5065 f->dump_string("key", p->second);
5066 f->close_section();
5067 }
5068 f->close_section();
5069 }
9f95a23c 5070 static void generate_test_instances(std::list<pg_ls_response_t*>& o) {
7c673cae
FG
5071 o.push_back(new pg_ls_response_t);
5072 o.push_back(new pg_ls_response_t);
5073 o.back()->handle = hobject_t(object_t("hi"), "key", 1, 2, -1, "");
9f95a23c
TL
5074 o.back()->entries.push_back(std::make_pair(object_t("one"), std::string()));
5075 o.back()->entries.push_back(std::make_pair(object_t("two"), std::string("twokey")));
7c673cae
FG
5076 }
5077};
5078
5079WRITE_CLASS_ENCODER(pg_ls_response_t)
5080
5081/**
5082 * object_copy_cursor_t
5083 */
5084struct object_copy_cursor_t {
5085 uint64_t data_offset;
9f95a23c 5086 std::string omap_offset;
7c673cae
FG
5087 bool attr_complete;
5088 bool data_complete;
5089 bool omap_complete;
5090
5091 object_copy_cursor_t()
5092 : data_offset(0),
5093 attr_complete(false),
5094 data_complete(false),
5095 omap_complete(false)
5096 {}
5097
5098 bool is_initial() const {
5099 return !attr_complete && data_offset == 0 && omap_offset.empty();
5100 }
5101 bool is_complete() const {
5102 return attr_complete && data_complete && omap_complete;
5103 }
5104
9f95a23c
TL
5105 static void generate_test_instances(std::list<object_copy_cursor_t*>& o);
5106 void encode(ceph::buffer::list& bl) const;
5107 void decode(ceph::buffer::list::const_iterator &bl);
5108 void dump(ceph::Formatter *f) const;
7c673cae
FG
5109};
5110WRITE_CLASS_ENCODER(object_copy_cursor_t)
5111
5112/**
5113 * object_copy_data_t
5114 *
5115 * Return data from a copy request. The semantics are a little strange
5116 * as a result of the encoding's heritage.
5117 *
5118 * In particular, the sender unconditionally fills in the cursor (from what
5119 * it receives and sends), the size, and the mtime, but is responsible for
5120 * figuring out whether it should put any data in the attrs, data, or
5121 * omap members (corresponding to xattrs, object data, and the omap entries)
5122 * based on external data (the client includes a max amount to return with
5123 * the copy request). The client then looks into the attrs, data, and/or omap
5124 * based on the contents of the cursor.
5125 */
5126struct object_copy_data_t {
5127 enum {
5128 FLAG_DATA_DIGEST = 1<<0,
5129 FLAG_OMAP_DIGEST = 1<<1,
5130 };
5131 object_copy_cursor_t cursor;
5132 uint64_t size;
5133 utime_t mtime;
5134 uint32_t data_digest, omap_digest;
5135 uint32_t flags;
9f95a23c
TL
5136 std::map<std::string, ceph::buffer::list> attrs;
5137 ceph::buffer::list data;
5138 ceph::buffer::list omap_header;
5139 ceph::buffer::list omap_data;
7c673cae
FG
5140
5141 /// which snaps we are defined for (if a snap and not the head)
9f95a23c 5142 std::vector<snapid_t> snaps;
11fdf7f2 5143 /// latest snap seq for the object (if head)
7c673cae
FG
5144 snapid_t snap_seq;
5145
11fdf7f2 5146 /// recent reqids on this object
9f95a23c 5147 mempool::osd_pglog::vector<std::pair<osd_reqid_t, version_t> > reqids;
7c673cae 5148
11fdf7f2
TL
5149 /// map reqids by index to error return code (if any)
5150 mempool::osd_pglog::map<uint32_t, int> reqid_return_codes;
5151
7c673cae
FG
5152 uint64_t truncate_seq;
5153 uint64_t truncate_size;
5154
5155public:
5156 object_copy_data_t() :
5157 size((uint64_t)-1), data_digest(-1),
5158 omap_digest(-1), flags(0),
5159 truncate_seq(0),
5160 truncate_size(0) {}
5161
9f95a23c
TL
5162 static void generate_test_instances(std::list<object_copy_data_t*>& o);
5163 void encode(ceph::buffer::list& bl, uint64_t features) const;
5164 void decode(ceph::buffer::list::const_iterator& bl);
5165 void dump(ceph::Formatter *f) const;
7c673cae
FG
5166};
5167WRITE_CLASS_ENCODER_FEATURES(object_copy_data_t)
5168
5169/**
5170 * pg creation info
5171 */
5172struct pg_create_t {
5173 epoch_t created; // epoch pg created
5174 pg_t parent; // split from parent (if != pg_t())
5175 __s32 split_bits;
5176
5177 pg_create_t()
5178 : created(0), split_bits(0) {}
5179 pg_create_t(unsigned c, pg_t p, int s)
5180 : created(c), parent(p), split_bits(s) {}
5181
9f95a23c
TL
5182 void encode(ceph::buffer::list &bl) const;
5183 void decode(ceph::buffer::list::const_iterator &bl);
5184 void dump(ceph::Formatter *f) const;
5185 static void generate_test_instances(std::list<pg_create_t*>& o);
7c673cae
FG
5186};
5187WRITE_CLASS_ENCODER(pg_create_t)
5188
7c673cae
FG
5189// -----------------------------------------
5190
5191class ObjectExtent {
5192 /**
5193 * ObjectExtents are used for specifying IO behavior against RADOS
5194 * objects when one is using the ObjectCacher.
5195 *
5196 * To use this in a real system, *every member* must be filled
5197 * out correctly. In particular, make sure to initialize the
5198 * oloc correctly, as its default values are deliberate poison
5199 * and will cause internal ObjectCacher asserts.
5200 *
5201 * Similarly, your buffer_extents vector *must* specify a total
5202 * size equal to your length. If the buffer_extents inadvertently
5203 * contain less space than the length member specifies, you
5204 * will get unintelligible asserts deep in the ObjectCacher.
5205 *
5206 * If you are trying to do testing and don't care about actual
5207 * RADOS function, the simplest thing to do is to initialize
5208 * the ObjectExtent (truncate_size can be 0), create a single entry
5209 * in buffer_extents matching the length, and set oloc.pool to 0.
5210 */
5211 public:
5212 object_t oid; // object id
5213 uint64_t objectno;
5214 uint64_t offset; // in object
5215 uint64_t length; // in object
5216 uint64_t truncate_size; // in object
5217
5218 object_locator_t oloc; // object locator (pool etc)
5219
9f95a23c 5220 std::vector<std::pair<uint64_t,uint64_t> > buffer_extents; // off -> len. extents in buffer being mapped (may be fragmented bc of striping!)
7c673cae
FG
5221
5222 ObjectExtent() : objectno(0), offset(0), length(0), truncate_size(0) {}
5223 ObjectExtent(object_t o, uint64_t ono, uint64_t off, uint64_t l, uint64_t ts) :
5224 oid(o), objectno(ono), offset(off), length(l), truncate_size(ts) { }
5225};
5226
9f95a23c 5227inline std::ostream& operator<<(std::ostream& out, const ObjectExtent &ex)
7c673cae
FG
5228{
5229 return out << "extent("
5230 << ex.oid << " (" << ex.objectno << ") in " << ex.oloc
5231 << " " << ex.offset << "~" << ex.length
5232 << " -> " << ex.buffer_extents
5233 << ")";
5234}
5235
5236
7c673cae
FG
5237// ---------------------------------------
5238
5239class OSDSuperblock {
5240public:
5241 uuid_d cluster_fsid, osd_fsid;
9f95a23c
TL
5242 int32_t whoami = -1; // my role in this fs.
5243 epoch_t current_epoch = 0; // most recent epoch
5244 epoch_t oldest_map = 0, newest_map = 0; // oldest/newest maps we have.
5245 double weight = 0.0;
7c673cae
FG
5246
5247 CompatSet compat_features;
5248
5249 // last interval over which i mounted and was then active
9f95a23c
TL
5250 epoch_t mounted = 0; // last epoch i mounted
5251 epoch_t clean_thru = 0; // epoch i was active and clean thru
7c673cae 5252
9f95a23c
TL
5253 epoch_t purged_snaps_last = 0;
5254 utime_t last_purged_snaps_scrub;
7c673cae 5255
9f95a23c
TL
5256 void encode(ceph::buffer::list &bl) const;
5257 void decode(ceph::buffer::list::const_iterator &bl);
5258 void dump(ceph::Formatter *f) const;
5259 static void generate_test_instances(std::list<OSDSuperblock*>& o);
7c673cae
FG
5260};
5261WRITE_CLASS_ENCODER(OSDSuperblock)
5262
9f95a23c 5263inline std::ostream& operator<<(std::ostream& out, const OSDSuperblock& sb)
7c673cae
FG
5264{
5265 return out << "sb(" << sb.cluster_fsid
5266 << " osd." << sb.whoami
5267 << " " << sb.osd_fsid
5268 << " e" << sb.current_epoch
5269 << " [" << sb.oldest_map << "," << sb.newest_map << "]"
5270 << " lci=[" << sb.mounted << "," << sb.clean_thru << "]"
5271 << ")";
5272}
5273
5274
5275// -------
5276
5277
5278
5279
5280
5281
5282/*
5283 * attached to object head. describes most recent snap context, and
5284 * set of existing clones.
5285 */
5286struct SnapSet {
5287 snapid_t seq;
9f95a23c
TL
5288 // NOTE: this is for pre-octopus compatibility only! remove in Q release
5289 std::vector<snapid_t> snaps; // descending
5290 std::vector<snapid_t> clones; // ascending
5291 std::map<snapid_t, interval_set<uint64_t> > clone_overlap; // overlap w/ next newest
5292 std::map<snapid_t, uint64_t> clone_size;
5293 std::map<snapid_t, std::vector<snapid_t>> clone_snaps; // descending
7c673cae 5294
11fdf7f2 5295 SnapSet() : seq(0) {}
9f95a23c 5296 explicit SnapSet(ceph::buffer::list& bl) {
11fdf7f2 5297 auto p = std::cbegin(bl);
7c673cae
FG
5298 decode(p);
5299 }
5300
7c673cae
FG
5301 /// populate SnapSet from a librados::snap_set_t
5302 void from_snap_set(const librados::snap_set_t& ss, bool legacy);
5303
5304 /// get space accounted to clone
5305 uint64_t get_clone_bytes(snapid_t clone) const;
5306
9f95a23c
TL
5307 void encode(ceph::buffer::list& bl) const;
5308 void decode(ceph::buffer::list::const_iterator& bl);
5309 void dump(ceph::Formatter *f) const;
5310 static void generate_test_instances(std::list<SnapSet*>& o);
7c673cae
FG
5311
5312 SnapContext get_ssc_as_of(snapid_t as_of) const {
5313 SnapContext out;
5314 out.seq = as_of;
9f95a23c
TL
5315 for (auto p = clone_snaps.rbegin();
5316 p != clone_snaps.rend();
5317 ++p) {
5318 for (auto snap : p->second) {
5319 if (snap <= as_of) {
5320 out.snaps.push_back(snap);
5321 }
5322 }
7c673cae
FG
5323 }
5324 return out;
5325 }
5326
7c673cae
FG
5327
5328 SnapSet get_filtered(const pg_pool_t &pinfo) const;
5329 void filter(const pg_pool_t &pinfo);
5330};
5331WRITE_CLASS_ENCODER(SnapSet)
5332
9f95a23c 5333std::ostream& operator<<(std::ostream& out, const SnapSet& cs);
7c673cae
FG
5334
5335
5336
5337#define OI_ATTR "_"
5338#define SS_ATTR "snapset"
5339
5340struct watch_info_t {
5341 uint64_t cookie;
5342 uint32_t timeout_seconds;
5343 entity_addr_t addr;
5344
5345 watch_info_t() : cookie(0), timeout_seconds(0) { }
5346 watch_info_t(uint64_t c, uint32_t t, const entity_addr_t& a) : cookie(c), timeout_seconds(t), addr(a) {}
5347
9f95a23c
TL
5348 void encode(ceph::buffer::list& bl, uint64_t features) const;
5349 void decode(ceph::buffer::list::const_iterator& bl);
5350 void dump(ceph::Formatter *f) const;
5351 static void generate_test_instances(std::list<watch_info_t*>& o);
7c673cae
FG
5352};
5353WRITE_CLASS_ENCODER_FEATURES(watch_info_t)
5354
5355static inline bool operator==(const watch_info_t& l, const watch_info_t& r) {
5356 return l.cookie == r.cookie && l.timeout_seconds == r.timeout_seconds
5357 && l.addr == r.addr;
5358}
5359
9f95a23c 5360static inline std::ostream& operator<<(std::ostream& out, const watch_info_t& w) {
7c673cae
FG
5361 return out << "watch(cookie " << w.cookie << " " << w.timeout_seconds << "s"
5362 << " " << w.addr << ")";
5363}
5364
5365struct notify_info_t {
5366 uint64_t cookie;
5367 uint64_t notify_id;
5368 uint32_t timeout;
9f95a23c 5369 ceph::buffer::list bl;
7c673cae
FG
5370};
5371
9f95a23c 5372static inline std::ostream& operator<<(std::ostream& out, const notify_info_t& n) {
7c673cae
FG
5373 return out << "notify(cookie " << n.cookie
5374 << " notify" << n.notify_id
5375 << " " << n.timeout << "s)";
5376}
5377
11fdf7f2
TL
5378struct chunk_info_t {
5379 typedef enum {
5380 FLAG_DIRTY = 1,
5381 FLAG_MISSING = 2,
5382 FLAG_HAS_REFERENCE = 4,
5383 FLAG_HAS_FINGERPRINT = 8,
5384 } cflag_t;
5385 uint32_t offset;
5386 uint32_t length;
5387 hobject_t oid;
5388 cflag_t flags; // FLAG_*
5389
5390 chunk_info_t() : offset(0), length(0), flags((cflag_t)0) { }
5391
9f95a23c
TL
5392 static std::string get_flag_string(uint64_t flags) {
5393 std::string r;
11fdf7f2
TL
5394 if (flags & FLAG_DIRTY) {
5395 r += "|dirty";
5396 }
5397 if (flags & FLAG_MISSING) {
5398 r += "|missing";
5399 }
5400 if (flags & FLAG_HAS_REFERENCE) {
5401 r += "|has_reference";
5402 }
5403 if (flags & FLAG_HAS_FINGERPRINT) {
5404 r += "|has_fingerprint";
5405 }
5406 if (r.length())
5407 return r.substr(1);
5408 return r;
5409 }
5410 bool test_flag(cflag_t f) const {
5411 return (flags & f) == f;
5412 }
5413 void set_flag(cflag_t f) {
5414 flags = (cflag_t)(flags | f);
5415 }
5416 void set_flags(cflag_t f) {
5417 flags = f;
5418 }
5419 void clear_flag(cflag_t f) {
5420 flags = (cflag_t)(flags & ~f);
5421 }
5422 void clear_flags() {
5423 flags = (cflag_t)0;
5424 }
5425 bool is_dirty() const {
5426 return test_flag(FLAG_DIRTY);
5427 }
5428 bool is_missing() const {
5429 return test_flag(FLAG_MISSING);
5430 }
5431 bool has_reference() const {
5432 return test_flag(FLAG_HAS_REFERENCE);
5433 }
5434 bool has_fingerprint() const {
5435 return test_flag(FLAG_HAS_FINGERPRINT);
5436 }
9f95a23c
TL
5437 void encode(ceph::buffer::list &bl) const;
5438 void decode(ceph::buffer::list::const_iterator &bl);
5439 void dump(ceph::Formatter *f) const;
5440 friend std::ostream& operator<<(std::ostream& out, const chunk_info_t& ci);
11fdf7f2
TL
5441};
5442WRITE_CLASS_ENCODER(chunk_info_t)
9f95a23c 5443std::ostream& operator<<(std::ostream& out, const chunk_info_t& ci);
11fdf7f2 5444
31f18b77
FG
5445struct object_info_t;
5446struct object_manifest_t {
5447 enum {
5448 TYPE_NONE = 0,
11fdf7f2
TL
5449 TYPE_REDIRECT = 1,
5450 TYPE_CHUNKED = 2,
31f18b77
FG
5451 };
5452 uint8_t type; // redirect, chunked, ...
5453 hobject_t redirect_target;
9f95a23c 5454 std::map<uint64_t, chunk_info_t> chunk_map;
31f18b77
FG
5455
5456 object_manifest_t() : type(0) { }
5457 object_manifest_t(uint8_t type, const hobject_t& redirect_target)
5458 : type(type), redirect_target(redirect_target) { }
5459
5460 bool is_empty() const {
5461 return type == TYPE_NONE;
5462 }
5463 bool is_redirect() const {
5464 return type == TYPE_REDIRECT;
5465 }
5466 bool is_chunked() const {
5467 return type == TYPE_CHUNKED;
5468 }
11fdf7f2 5469 static std::string_view get_type_name(uint8_t m) {
31f18b77
FG
5470 switch (m) {
5471 case TYPE_NONE: return "none";
5472 case TYPE_REDIRECT: return "redirect";
5473 case TYPE_CHUNKED: return "chunked";
5474 default: return "unknown";
5475 }
5476 }
11fdf7f2 5477 std::string_view get_type_name() const {
31f18b77
FG
5478 return get_type_name(type);
5479 }
11fdf7f2
TL
5480 void clear() {
5481 type = 0;
5482 redirect_target = hobject_t();
5483 chunk_map.clear();
5484 }
9f95a23c
TL
5485 static void generate_test_instances(std::list<object_manifest_t*>& o);
5486 void encode(ceph::buffer::list &bl) const;
5487 void decode(ceph::buffer::list::const_iterator &bl);
5488 void dump(ceph::Formatter *f) const;
5489 friend std::ostream& operator<<(std::ostream& out, const object_info_t& oi);
31f18b77
FG
5490};
5491WRITE_CLASS_ENCODER(object_manifest_t)
9f95a23c 5492std::ostream& operator<<(std::ostream& out, const object_manifest_t& oi);
7c673cae
FG
5493
5494struct object_info_t {
5495 hobject_t soid;
5496 eversion_t version, prior_version;
5497 version_t user_version;
5498 osd_reqid_t last_reqid;
5499
5500 uint64_t size;
5501 utime_t mtime;
5502 utime_t local_mtime; // local mtime
5503
5504 // note: these are currently encoded into a total 16 bits; see
5505 // encode()/decode() for the weirdness.
5506 typedef enum {
11fdf7f2
TL
5507 FLAG_LOST = 1<<0,
5508 FLAG_WHITEOUT = 1<<1, // object logically does not exist
5509 FLAG_DIRTY = 1<<2, // object has been modified since last flushed or undirtied
5510 FLAG_OMAP = 1<<3, // has (or may have) some/any omap data
5511 FLAG_DATA_DIGEST = 1<<4, // has data crc
5512 FLAG_OMAP_DIGEST = 1<<5, // has omap crc
5513 FLAG_CACHE_PIN = 1<<6, // pin the object in cache tier
5514 FLAG_MANIFEST = 1<<7, // has manifest
5515 FLAG_USES_TMAP = 1<<8, // deprecated; no longer used
5516 FLAG_REDIRECT_HAS_REFERENCE = 1<<9, // has reference
7c673cae
FG
5517 } flag_t;
5518
5519 flag_t flags;
5520
9f95a23c
TL
5521 static std::string get_flag_string(flag_t flags) {
5522 std::string s;
5523 std::vector<std::string> sv = get_flag_vector(flags);
94b18763 5524 for (auto ss : sv) {
9f95a23c 5525 s += std::string("|") + ss;
94b18763
FG
5526 }
5527 if (s.length())
5528 return s.substr(1);
5529 return s;
5530 }
9f95a23c
TL
5531 static std::vector<std::string> get_flag_vector(flag_t flags) {
5532 std::vector<std::string> sv;
7c673cae 5533 if (flags & FLAG_LOST)
94b18763 5534 sv.insert(sv.end(), "lost");
7c673cae 5535 if (flags & FLAG_WHITEOUT)
94b18763 5536 sv.insert(sv.end(), "whiteout");
7c673cae 5537 if (flags & FLAG_DIRTY)
94b18763 5538 sv.insert(sv.end(), "dirty");
7c673cae 5539 if (flags & FLAG_USES_TMAP)
94b18763 5540 sv.insert(sv.end(), "uses_tmap");
7c673cae 5541 if (flags & FLAG_OMAP)
94b18763 5542 sv.insert(sv.end(), "omap");
7c673cae 5543 if (flags & FLAG_DATA_DIGEST)
94b18763 5544 sv.insert(sv.end(), "data_digest");
7c673cae 5545 if (flags & FLAG_OMAP_DIGEST)
94b18763 5546 sv.insert(sv.end(), "omap_digest");
7c673cae 5547 if (flags & FLAG_CACHE_PIN)
94b18763 5548 sv.insert(sv.end(), "cache_pin");
31f18b77 5549 if (flags & FLAG_MANIFEST)
94b18763 5550 sv.insert(sv.end(), "manifest");
11fdf7f2
TL
5551 if (flags & FLAG_REDIRECT_HAS_REFERENCE)
5552 sv.insert(sv.end(), "redirect_has_reference");
94b18763 5553 return sv;
7c673cae 5554 }
9f95a23c 5555 std::string get_flag_string() const {
7c673cae
FG
5556 return get_flag_string(flags);
5557 }
5558
7c673cae
FG
5559 uint64_t truncate_seq, truncate_size;
5560
9f95a23c 5561 std::map<std::pair<uint64_t, entity_name_t>, watch_info_t> watchers;
7c673cae
FG
5562
5563 // opportunistic checksums; may or may not be present
5564 __u32 data_digest; ///< data crc32c
5565 __u32 omap_digest; ///< omap crc32c
5566
5567 // alloc hint attribute
5568 uint64_t expected_object_size, expected_write_size;
5569 uint32_t alloc_hint_flags;
5570
31f18b77
FG
5571 struct object_manifest_t manifest;
5572
7c673cae
FG
5573 void copy_user_bits(const object_info_t& other);
5574
7c673cae
FG
5575 bool test_flag(flag_t f) const {
5576 return (flags & f) == f;
5577 }
5578 void set_flag(flag_t f) {
5579 flags = (flag_t)(flags | f);
5580 }
5581 void clear_flag(flag_t f) {
5582 flags = (flag_t)(flags & ~f);
5583 }
5584 bool is_lost() const {
5585 return test_flag(FLAG_LOST);
5586 }
5587 bool is_whiteout() const {
5588 return test_flag(FLAG_WHITEOUT);
5589 }
5590 bool is_dirty() const {
5591 return test_flag(FLAG_DIRTY);
5592 }
5593 bool is_omap() const {
5594 return test_flag(FLAG_OMAP);
5595 }
5596 bool is_data_digest() const {
5597 return test_flag(FLAG_DATA_DIGEST);
5598 }
5599 bool is_omap_digest() const {
5600 return test_flag(FLAG_OMAP_DIGEST);
5601 }
5602 bool is_cache_pinned() const {
5603 return test_flag(FLAG_CACHE_PIN);
5604 }
31f18b77
FG
5605 bool has_manifest() const {
5606 return test_flag(FLAG_MANIFEST);
5607 }
7c673cae
FG
5608 void set_data_digest(__u32 d) {
5609 set_flag(FLAG_DATA_DIGEST);
5610 data_digest = d;
5611 }
5612 void set_omap_digest(__u32 d) {
5613 set_flag(FLAG_OMAP_DIGEST);
5614 omap_digest = d;
5615 }
5616 void clear_data_digest() {
5617 clear_flag(FLAG_DATA_DIGEST);
5618 data_digest = -1;
5619 }
5620 void clear_omap_digest() {
5621 clear_flag(FLAG_OMAP_DIGEST);
5622 omap_digest = -1;
5623 }
5624 void new_object() {
28e407b8
AA
5625 clear_data_digest();
5626 clear_omap_digest();
7c673cae
FG
5627 }
5628
9f95a23c
TL
5629 void encode(ceph::buffer::list& bl, uint64_t features) const;
5630 void decode(ceph::buffer::list::const_iterator& bl);
5631 void decode(ceph::buffer::list& bl) {
11fdf7f2 5632 auto p = std::cbegin(bl);
7c673cae
FG
5633 decode(p);
5634 }
9f95a23c
TL
5635 void dump(ceph::Formatter *f) const;
5636 static void generate_test_instances(std::list<object_info_t*>& o);
7c673cae
FG
5637
5638 explicit object_info_t()
5639 : user_version(0), size(0), flags((flag_t)0),
5640 truncate_seq(0), truncate_size(0),
5641 data_digest(-1), omap_digest(-1),
5642 expected_object_size(0), expected_write_size(0),
5643 alloc_hint_flags(0)
5644 {}
5645
5646 explicit object_info_t(const hobject_t& s)
5647 : soid(s),
5648 user_version(0), size(0), flags((flag_t)0),
5649 truncate_seq(0), truncate_size(0),
5650 data_digest(-1), omap_digest(-1),
5651 expected_object_size(0), expected_write_size(0),
5652 alloc_hint_flags(0)
5653 {}
5654
9f95a23c 5655 explicit object_info_t(ceph::buffer::list& bl) {
7c673cae
FG
5656 decode(bl);
5657 }
5658};
5659WRITE_CLASS_ENCODER_FEATURES(object_info_t)
5660
9f95a23c 5661std::ostream& operator<<(std::ostream& out, const object_info_t& oi);
7c673cae
FG
5662
5663
5664
5665// Object recovery
5666struct ObjectRecoveryInfo {
5667 hobject_t soid;
5668 eversion_t version;
5669 uint64_t size;
5670 object_info_t oi;
5671 SnapSet ss; // only populated if soid is_snap()
5672 interval_set<uint64_t> copy_subset;
9f95a23c
TL
5673 std::map<hobject_t, interval_set<uint64_t>> clone_subset;
5674 bool object_exist;
7c673cae 5675
9f95a23c 5676 ObjectRecoveryInfo() : size(0), object_exist(true) { }
7c673cae 5677
9f95a23c
TL
5678 static void generate_test_instances(std::list<ObjectRecoveryInfo*>& o);
5679 void encode(ceph::buffer::list &bl, uint64_t features) const;
5680 void decode(ceph::buffer::list::const_iterator &bl, int64_t pool = -1);
5681 std::ostream &print(std::ostream &out) const;
5682 void dump(ceph::Formatter *f) const;
7c673cae
FG
5683};
5684WRITE_CLASS_ENCODER_FEATURES(ObjectRecoveryInfo)
9f95a23c 5685std::ostream& operator<<(std::ostream& out, const ObjectRecoveryInfo &inf);
7c673cae
FG
5686
5687struct ObjectRecoveryProgress {
5688 uint64_t data_recovered_to;
9f95a23c 5689 std::string omap_recovered_to;
7c673cae
FG
5690 bool first;
5691 bool data_complete;
5692 bool omap_complete;
224ce89b 5693 bool error = false;
7c673cae
FG
5694
5695 ObjectRecoveryProgress()
5696 : data_recovered_to(0),
5697 first(true),
5698 data_complete(false), omap_complete(false) { }
5699
5700 bool is_complete(const ObjectRecoveryInfo& info) const {
5701 return (data_recovered_to >= (
5702 info.copy_subset.empty() ?
5703 0 : info.copy_subset.range_end())) &&
5704 omap_complete;
5705 }
5706
9f95a23c
TL
5707 static void generate_test_instances(std::list<ObjectRecoveryProgress*>& o);
5708 void encode(ceph::buffer::list &bl) const;
5709 void decode(ceph::buffer::list::const_iterator &bl);
5710 std::ostream &print(std::ostream &out) const;
5711 void dump(ceph::Formatter *f) const;
7c673cae
FG
5712};
5713WRITE_CLASS_ENCODER(ObjectRecoveryProgress)
9f95a23c 5714std::ostream& operator<<(std::ostream& out, const ObjectRecoveryProgress &prog);
7c673cae
FG
5715
5716struct PushReplyOp {
5717 hobject_t soid;
5718
9f95a23c
TL
5719 static void generate_test_instances(std::list<PushReplyOp*>& o);
5720 void encode(ceph::buffer::list &bl) const;
5721 void decode(ceph::buffer::list::const_iterator &bl);
5722 std::ostream &print(std::ostream &out) const;
5723 void dump(ceph::Formatter *f) const;
7c673cae
FG
5724
5725 uint64_t cost(CephContext *cct) const;
5726};
5727WRITE_CLASS_ENCODER(PushReplyOp)
9f95a23c 5728std::ostream& operator<<(std::ostream& out, const PushReplyOp &op);
7c673cae
FG
5729
5730struct PullOp {
5731 hobject_t soid;
5732
5733 ObjectRecoveryInfo recovery_info;
5734 ObjectRecoveryProgress recovery_progress;
5735
9f95a23c
TL
5736 static void generate_test_instances(std::list<PullOp*>& o);
5737 void encode(ceph::buffer::list &bl, uint64_t features) const;
5738 void decode(ceph::buffer::list::const_iterator &bl);
5739 std::ostream &print(std::ostream &out) const;
5740 void dump(ceph::Formatter *f) const;
7c673cae
FG
5741
5742 uint64_t cost(CephContext *cct) const;
5743};
5744WRITE_CLASS_ENCODER_FEATURES(PullOp)
9f95a23c 5745std::ostream& operator<<(std::ostream& out, const PullOp &op);
7c673cae
FG
5746
5747struct PushOp {
5748 hobject_t soid;
5749 eversion_t version;
9f95a23c 5750 ceph::buffer::list data;
7c673cae 5751 interval_set<uint64_t> data_included;
9f95a23c
TL
5752 ceph::buffer::list omap_header;
5753 std::map<std::string, ceph::buffer::list> omap_entries;
5754 std::map<std::string, ceph::buffer::list> attrset;
7c673cae
FG
5755
5756 ObjectRecoveryInfo recovery_info;
5757 ObjectRecoveryProgress before_progress;
5758 ObjectRecoveryProgress after_progress;
5759
9f95a23c
TL
5760 static void generate_test_instances(std::list<PushOp*>& o);
5761 void encode(ceph::buffer::list &bl, uint64_t features) const;
5762 void decode(ceph::buffer::list::const_iterator &bl);
5763 std::ostream &print(std::ostream &out) const;
5764 void dump(ceph::Formatter *f) const;
7c673cae
FG
5765
5766 uint64_t cost(CephContext *cct) const;
5767};
5768WRITE_CLASS_ENCODER_FEATURES(PushOp)
9f95a23c 5769std::ostream& operator<<(std::ostream& out, const PushOp &op);
7c673cae
FG
5770
5771
5772/*
5773 * summarize pg contents for purposes of a scrub
5774 */
5775struct ScrubMap {
5776 struct object {
9f95a23c 5777 std::map<std::string, ceph::buffer::ptr> attrs;
7c673cae
FG
5778 uint64_t size;
5779 __u32 omap_digest; ///< omap crc32c
5780 __u32 digest; ///< data crc32c
5781 bool negative:1;
5782 bool digest_present:1;
5783 bool omap_digest_present:1;
5784 bool read_error:1;
5785 bool stat_error:1;
5786 bool ec_hash_mismatch:1;
5787 bool ec_size_mismatch:1;
28e407b8
AA
5788 bool large_omap_object_found:1;
5789 uint64_t large_omap_object_key_count = 0;
5790 uint64_t large_omap_object_value_size = 0;
11fdf7f2
TL
5791 uint64_t object_omap_bytes = 0;
5792 uint64_t object_omap_keys = 0;
7c673cae
FG
5793
5794 object() :
5795 // Init invalid size so it won't match if we get a stat EIO error
5796 size(-1), omap_digest(0), digest(0),
28e407b8
AA
5797 negative(false), digest_present(false), omap_digest_present(false),
5798 read_error(false), stat_error(false), ec_hash_mismatch(false),
5799 ec_size_mismatch(false), large_omap_object_found(false) {}
7c673cae 5800
9f95a23c
TL
5801 void encode(ceph::buffer::list& bl) const;
5802 void decode(ceph::buffer::list::const_iterator& bl);
5803 void dump(ceph::Formatter *f) const;
5804 static void generate_test_instances(std::list<object*>& o);
7c673cae
FG
5805 };
5806 WRITE_CLASS_ENCODER(object)
5807
9f95a23c 5808 std::map<hobject_t,object> objects;
7c673cae
FG
5809 eversion_t valid_through;
5810 eversion_t incr_since;
28e407b8 5811 bool has_large_omap_object_errors:1;
11fdf7f2 5812 bool has_omap_keys:1;
7c673cae
FG
5813
5814 void merge_incr(const ScrubMap &l);
28e407b8
AA
5815 void clear_from(const hobject_t& start) {
5816 objects.erase(objects.lower_bound(start), objects.end());
5817 }
7c673cae
FG
5818 void insert(const ScrubMap &r) {
5819 objects.insert(r.objects.begin(), r.objects.end());
5820 }
5821 void swap(ScrubMap &r) {
31f18b77
FG
5822 using std::swap;
5823 swap(objects, r.objects);
5824 swap(valid_through, r.valid_through);
5825 swap(incr_since, r.incr_since);
7c673cae
FG
5826 }
5827
9f95a23c
TL
5828 void encode(ceph::buffer::list& bl) const;
5829 void decode(ceph::buffer::list::const_iterator& bl, int64_t pool=-1);
5830 void dump(ceph::Formatter *f) const;
5831 static void generate_test_instances(std::list<ScrubMap*>& o);
7c673cae
FG
5832};
5833WRITE_CLASS_ENCODER(ScrubMap::object)
5834WRITE_CLASS_ENCODER(ScrubMap)
5835
28e407b8
AA
5836struct ScrubMapBuilder {
5837 bool deep = false;
9f95a23c 5838 std::vector<hobject_t> ls;
28e407b8
AA
5839 size_t pos = 0;
5840 int64_t data_pos = 0;
9f95a23c 5841 std::string omap_pos;
28e407b8 5842 int ret = 0;
9f95a23c 5843 ceph::buffer::hash data_hash, omap_hash; ///< accumulatinng hash value
28e407b8
AA
5844 uint64_t omap_keys = 0;
5845 uint64_t omap_bytes = 0;
5846
5847 bool empty() {
5848 return ls.empty();
5849 }
5850 bool done() {
5851 return pos >= ls.size();
5852 }
5853 void reset() {
5854 *this = ScrubMapBuilder();
5855 }
5856
5857 bool data_done() {
5858 return data_pos < 0;
5859 }
5860
5861 void next_object() {
5862 ++pos;
5863 data_pos = 0;
5864 omap_pos.clear();
5865 omap_keys = 0;
5866 omap_bytes = 0;
5867 }
5868
9f95a23c 5869 friend std::ostream& operator<<(std::ostream& out, const ScrubMapBuilder& pos) {
28e407b8
AA
5870 out << "(" << pos.pos << "/" << pos.ls.size();
5871 if (pos.pos < pos.ls.size()) {
5872 out << " " << pos.ls[pos.pos];
5873 }
5874 if (pos.data_pos < 0) {
5875 out << " byte " << pos.data_pos;
5876 }
5877 if (!pos.omap_pos.empty()) {
5878 out << " key " << pos.omap_pos;
5879 }
5880 if (pos.deep) {
5881 out << " deep";
5882 }
5883 if (pos.ret) {
5884 out << " ret " << pos.ret;
5885 }
5886 return out << ")";
5887 }
5888};
5889
7c673cae
FG
5890struct watch_item_t {
5891 entity_name_t name;
5892 uint64_t cookie;
5893 uint32_t timeout_seconds;
5894 entity_addr_t addr;
5895
5896 watch_item_t() : cookie(0), timeout_seconds(0) { }
5897 watch_item_t(entity_name_t name, uint64_t cookie, uint32_t timeout,
5898 const entity_addr_t& addr)
5899 : name(name), cookie(cookie), timeout_seconds(timeout),
5900 addr(addr) { }
5901
9f95a23c 5902 void encode(ceph::buffer::list &bl, uint64_t features) const {
7c673cae 5903 ENCODE_START(2, 1, bl);
11fdf7f2
TL
5904 encode(name, bl);
5905 encode(cookie, bl);
5906 encode(timeout_seconds, bl);
5907 encode(addr, bl, features);
7c673cae
FG
5908 ENCODE_FINISH(bl);
5909 }
9f95a23c 5910 void decode(ceph::buffer::list::const_iterator &bl) {
7c673cae 5911 DECODE_START(2, bl);
11fdf7f2
TL
5912 decode(name, bl);
5913 decode(cookie, bl);
5914 decode(timeout_seconds, bl);
7c673cae 5915 if (struct_v >= 2) {
11fdf7f2 5916 decode(addr, bl);
7c673cae
FG
5917 }
5918 DECODE_FINISH(bl);
5919 }
9f95a23c
TL
5920 void dump(ceph::Formatter *f) const {
5921 f->dump_stream("watcher") << name;
5922 f->dump_int("cookie", cookie);
5923 f->dump_int("timeout", timeout_seconds);
5924 f->open_object_section("addr");
5925 addr.dump(f);
5926 f->close_section();
5927 }
5928 static void generate_test_instances(std::list<watch_item_t*>& o) {
5929 entity_addr_t ea;
5930 ea.set_type(entity_addr_t::TYPE_LEGACY);
5931 ea.set_nonce(1000);
5932 ea.set_family(AF_INET);
5933 ea.set_in4_quad(0, 127);
5934 ea.set_in4_quad(1, 0);
5935 ea.set_in4_quad(2, 0);
5936 ea.set_in4_quad(3, 1);
5937 ea.set_port(1024);
5938 o.push_back(new watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT, 1), 10, 30, ea));
5939 ea.set_nonce(1001);
5940 ea.set_in4_quad(3, 2);
5941 ea.set_port(1025);
5942 o.push_back(new watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT, 2), 20, 60, ea));
5943 }
7c673cae
FG
5944};
5945WRITE_CLASS_ENCODER_FEATURES(watch_item_t)
5946
5947struct obj_watch_item_t {
5948 hobject_t obj;
5949 watch_item_t wi;
5950};
5951
5952/**
5953 * obj list watch response format
5954 *
5955 */
5956struct obj_list_watch_response_t {
9f95a23c 5957 std::list<watch_item_t> entries;
7c673cae 5958
9f95a23c 5959 void encode(ceph::buffer::list& bl, uint64_t features) const {
7c673cae 5960 ENCODE_START(1, 1, bl);
11fdf7f2 5961 encode(entries, bl, features);
7c673cae
FG
5962 ENCODE_FINISH(bl);
5963 }
9f95a23c 5964 void decode(ceph::buffer::list::const_iterator& bl) {
7c673cae 5965 DECODE_START(1, bl);
11fdf7f2 5966 decode(entries, bl);
7c673cae
FG
5967 DECODE_FINISH(bl);
5968 }
9f95a23c 5969 void dump(ceph::Formatter *f) const {
7c673cae 5970 f->open_array_section("entries");
9f95a23c 5971 for (std::list<watch_item_t>::const_iterator p = entries.begin(); p != entries.end(); ++p) {
7c673cae 5972 f->open_object_section("watch");
9f95a23c 5973 p->dump(f);
7c673cae
FG
5974 f->close_section();
5975 }
5976 f->close_section();
5977 }
9f95a23c 5978 static void generate_test_instances(std::list<obj_list_watch_response_t*>& o) {
7c673cae
FG
5979 entity_addr_t ea;
5980 o.push_back(new obj_list_watch_response_t);
5981 o.push_back(new obj_list_watch_response_t);
9f95a23c
TL
5982 std::list<watch_item_t*> test_watchers;
5983 watch_item_t::generate_test_instances(test_watchers);
5984 for (auto &e : test_watchers) {
5985 o.back()->entries.push_back(*e);
5986 delete e;
5987 }
7c673cae
FG
5988 }
5989};
5990WRITE_CLASS_ENCODER_FEATURES(obj_list_watch_response_t)
5991
5992struct clone_info {
5993 snapid_t cloneid;
9f95a23c
TL
5994 std::vector<snapid_t> snaps; // ascending
5995 std::vector< std::pair<uint64_t,uint64_t> > overlap;
7c673cae
FG
5996 uint64_t size;
5997
5998 clone_info() : cloneid(CEPH_NOSNAP), size(0) {}
5999
9f95a23c 6000 void encode(ceph::buffer::list& bl) const {
7c673cae 6001 ENCODE_START(1, 1, bl);
11fdf7f2
TL
6002 encode(cloneid, bl);
6003 encode(snaps, bl);
6004 encode(overlap, bl);
6005 encode(size, bl);
7c673cae
FG
6006 ENCODE_FINISH(bl);
6007 }
9f95a23c 6008 void decode(ceph::buffer::list::const_iterator& bl) {
7c673cae 6009 DECODE_START(1, bl);
11fdf7f2
TL
6010 decode(cloneid, bl);
6011 decode(snaps, bl);
6012 decode(overlap, bl);
6013 decode(size, bl);
7c673cae
FG
6014 DECODE_FINISH(bl);
6015 }
9f95a23c 6016 void dump(ceph::Formatter *f) const {
7c673cae
FG
6017 if (cloneid == CEPH_NOSNAP)
6018 f->dump_string("cloneid", "HEAD");
6019 else
6020 f->dump_unsigned("cloneid", cloneid.val);
6021 f->open_array_section("snapshots");
9f95a23c 6022 for (std::vector<snapid_t>::const_iterator p = snaps.begin(); p != snaps.end(); ++p) {
7c673cae
FG
6023 f->open_object_section("snap");
6024 f->dump_unsigned("id", p->val);
6025 f->close_section();
6026 }
6027 f->close_section();
6028 f->open_array_section("overlaps");
9f95a23c 6029 for (std::vector< std::pair<uint64_t,uint64_t> >::const_iterator q = overlap.begin();
7c673cae
FG
6030 q != overlap.end(); ++q) {
6031 f->open_object_section("overlap");
6032 f->dump_unsigned("offset", q->first);
6033 f->dump_unsigned("length", q->second);
6034 f->close_section();
6035 }
6036 f->close_section();
6037 f->dump_unsigned("size", size);
6038 }
9f95a23c 6039 static void generate_test_instances(std::list<clone_info*>& o) {
7c673cae
FG
6040 o.push_back(new clone_info);
6041 o.push_back(new clone_info);
6042 o.back()->cloneid = 1;
6043 o.back()->snaps.push_back(1);
9f95a23c
TL
6044 o.back()->overlap.push_back(std::pair<uint64_t,uint64_t>(0,4096));
6045 o.back()->overlap.push_back(std::pair<uint64_t,uint64_t>(8192,4096));
7c673cae
FG
6046 o.back()->size = 16384;
6047 o.push_back(new clone_info);
6048 o.back()->cloneid = CEPH_NOSNAP;
6049 o.back()->size = 32768;
6050 }
6051};
6052WRITE_CLASS_ENCODER(clone_info)
6053
6054/**
6055 * obj list snaps response format
6056 *
6057 */
6058struct obj_list_snap_response_t {
9f95a23c 6059 std::vector<clone_info> clones; // ascending
7c673cae
FG
6060 snapid_t seq;
6061
9f95a23c 6062 void encode(ceph::buffer::list& bl) const {
7c673cae 6063 ENCODE_START(2, 1, bl);
11fdf7f2
TL
6064 encode(clones, bl);
6065 encode(seq, bl);
7c673cae
FG
6066 ENCODE_FINISH(bl);
6067 }
9f95a23c 6068 void decode(ceph::buffer::list::const_iterator& bl) {
7c673cae 6069 DECODE_START(2, bl);
11fdf7f2 6070 decode(clones, bl);
7c673cae 6071 if (struct_v >= 2)
11fdf7f2 6072 decode(seq, bl);
7c673cae
FG
6073 else
6074 seq = CEPH_NOSNAP;
6075 DECODE_FINISH(bl);
6076 }
9f95a23c 6077 void dump(ceph::Formatter *f) const {
7c673cae 6078 f->open_array_section("clones");
9f95a23c 6079 for (std::vector<clone_info>::const_iterator p = clones.begin(); p != clones.end(); ++p) {
7c673cae
FG
6080 f->open_object_section("clone");
6081 p->dump(f);
6082 f->close_section();
6083 }
6084 f->dump_unsigned("seq", seq);
6085 f->close_section();
6086 }
9f95a23c 6087 static void generate_test_instances(std::list<obj_list_snap_response_t*>& o) {
7c673cae
FG
6088 o.push_back(new obj_list_snap_response_t);
6089 o.push_back(new obj_list_snap_response_t);
6090 clone_info cl;
6091 cl.cloneid = 1;
6092 cl.snaps.push_back(1);
9f95a23c
TL
6093 cl.overlap.push_back(std::pair<uint64_t,uint64_t>(0,4096));
6094 cl.overlap.push_back(std::pair<uint64_t,uint64_t>(8192,4096));
7c673cae
FG
6095 cl.size = 16384;
6096 o.back()->clones.push_back(cl);
6097 cl.cloneid = CEPH_NOSNAP;
6098 cl.snaps.clear();
6099 cl.overlap.clear();
6100 cl.size = 32768;
6101 o.back()->clones.push_back(cl);
6102 o.back()->seq = 123;
6103 }
6104};
6105
6106WRITE_CLASS_ENCODER(obj_list_snap_response_t)
6107
// PromoteCounter
//
// Three atomic counters (attempts, objects, bytes) that can be sampled and
// halved in one call.

struct PromoteCounter {
  std::atomic<unsigned long long> attempts{0};
  std::atomic<unsigned long long> objects{0};
  std::atomic<unsigned long long> bytes{0};

  // Records one attempt.
  void attempt() {
    attempts.fetch_add(1);
  }

  // Records one finished object of `size` bytes.
  void finish(uint64_t size) {
    objects.fetch_add(1);
    bytes.fetch_add(size);
  }

  // Writes the current counter values to *a, *o and *b, then stores half of
  // each sampled value back.  The load and the store are separate
  // operations, so increments landing in between are overwritten.
  void sample_and_attenuate(uint64_t *a, uint64_t *o, uint64_t *b) {
    *a = attempts.load();
    *o = objects.load();
    *b = bytes.load();
    attempts.store(*a / 2);
    objects.store(*o / 2);
    bytes.store(*b / 2);
  }
};
6133
11fdf7f2
TL
6134struct pool_pg_num_history_t {
6135 /// last epoch updated
6136 epoch_t epoch = 0;
6137 /// poolid -> epoch -> pg_num
9f95a23c 6138 std::map<int64_t, std::map<epoch_t,uint32_t>> pg_nums;
11fdf7f2 6139 /// pair(epoch, poolid)
9f95a23c 6140 std::set<std::pair<epoch_t,int64_t>> deleted_pools;
7c673cae 6141
11fdf7f2
TL
6142 void log_pg_num_change(epoch_t epoch, int64_t pool, uint32_t pg_num) {
6143 pg_nums[pool][epoch] = pg_num;
6144 }
6145 void log_pool_delete(epoch_t epoch, int64_t pool) {
9f95a23c 6146 deleted_pools.insert(std::make_pair(epoch, pool));
11fdf7f2 6147 }
7c673cae 6148
11fdf7f2
TL
6149 /// prune history based on oldest osdmap epoch in the cluster
6150 void prune(epoch_t oldest_epoch) {
6151 auto i = deleted_pools.begin();
6152 while (i != deleted_pools.end()) {
6153 if (i->first >= oldest_epoch) {
6154 break;
6155 }
6156 pg_nums.erase(i->second);
6157 i = deleted_pools.erase(i);
6158 }
6159 for (auto& j : pg_nums) {
6160 auto k = j.second.lower_bound(oldest_epoch);
6161 // keep this and the entry before it (just to be paranoid)
6162 if (k != j.second.begin()) {
6163 --k;
6164 j.second.erase(j.second.begin(), k);
6165 }
6166 }
6167 }
6168
9f95a23c 6169 void encode(ceph::buffer::list& bl) const {
11fdf7f2
TL
6170 ENCODE_START(1, 1, bl);
6171 encode(epoch, bl);
6172 encode(pg_nums, bl);
6173 encode(deleted_pools, bl);
6174 ENCODE_FINISH(bl);
6175 }
9f95a23c 6176 void decode(ceph::buffer::list::const_iterator& p) {
11fdf7f2
TL
6177 DECODE_START(1, p);
6178 decode(epoch, p);
6179 decode(pg_nums, p);
6180 decode(deleted_pools, p);
6181 DECODE_FINISH(p);
6182 }
9f95a23c 6183 void dump(ceph::Formatter *f) const {
11fdf7f2
TL
6184 f->dump_unsigned("epoch", epoch);
6185 f->open_object_section("pools");
6186 for (auto& i : pg_nums) {
6187 f->open_object_section("pool");
6188 f->dump_unsigned("pool_id", i.first);
6189 f->open_array_section("changes");
6190 for (auto& j : i.second) {
6191 f->open_object_section("change");
6192 f->dump_unsigned("epoch", j.first);
6193 f->dump_unsigned("pg_num", j.second);
6194 f->close_section();
6195 }
6196 f->close_section();
6197 f->close_section();
6198 }
6199 f->close_section();
6200 f->open_array_section("deleted_pools");
6201 for (auto& i : deleted_pools) {
6202 f->open_object_section("deletion");
6203 f->dump_unsigned("pool_id", i.second);
6204 f->dump_unsigned("epoch", i.first);
6205 f->close_section();
6206 }
6207 f->close_section();
6208 }
9f95a23c 6209 static void generate_test_instances(std::list<pool_pg_num_history_t*>& ls) {
11fdf7f2
TL
6210 ls.push_back(new pool_pg_num_history_t);
6211 }
9f95a23c 6212 friend std::ostream& operator<<(std::ostream& out, const pool_pg_num_history_t& h) {
11fdf7f2
TL
6213 return out << "pg_num_history(e" << h.epoch
6214 << " pg_nums " << h.pg_nums
6215 << " deleted_pools " << h.deleted_pools
6216 << ")";
7c673cae 6217 }
7c673cae 6218};
11fdf7f2
TL
6219WRITE_CLASS_ENCODER(pool_pg_num_history_t)
6220
9f95a23c
TL
// prefix pgmeta_oid keys with _ so that PGLog::read_log_and_missing() can
// easily skip them
//
// The type is spelled std::string_view and built from plain literals so
// that this header does not rely on a using-directive for std or for the
// "sv" literal operator being in effect where it is included.
static constexpr std::string_view infover_key{"_infover"};
static constexpr std::string_view info_key{"_info"};
static constexpr std::string_view biginfo_key{"_biginfo"};
static constexpr std::string_view epoch_key{"_epoch"};
static constexpr std::string_view fastinfo_key{"_fastinfo"};
6228
6229static const __u8 pg_latest_struct_v = 10;
6230// v10 is the new past_intervals encoding
6231// v9 was fastinfo_key addition
6232// v8 was the move to a per-pg pgmeta object
6233// v7 was SnapMapper addition in 86658392516d5175b2756659ef7ffaaf95b0f8ad
6234// (first appeared in cuttlefish).
6235static const __u8 pg_compat_struct_v = 10;
6236
6237int prepare_info_keymap(
6238 CephContext* cct,
6239 map<string,bufferlist> *km,
6240 string *key_to_remove,
6241 epoch_t epoch,
6242 pg_info_t &info,
6243 pg_info_t &last_written_info,
6244 PastIntervals &past_intervals,
6245 bool dirty_big_info,
6246 bool dirty_epoch,
6247 bool try_fast_info,
6248 PerfCounters *logger = nullptr,
6249 DoutPrefixProvider *dpp = nullptr);
6250
6251namespace ceph::os {
6252 class Transaction;
6253};
6254
6255void create_pg_collection(
6256 ceph::os::Transaction& t, spg_t pgid, int bits);
6257
6258void init_pg_ondisk(
6259 ceph::os::Transaction& t, spg_t pgid, const pg_pool_t *pool);
6260
11fdf7f2
TL
// omap specific stats
// Every counter starts at zero so a default-constructed instance never
// holds indeterminate values; brace initialization with explicit values
// keeps working.
struct omap_stat_t {
  int large_omap_objects = 0;
  int64_t omap_bytes = 0;
  int64_t omap_keys = 0;
};
7c673cae 6267
9f95a23c
TL
6268// filter for pg listings
6269class PGLSFilter {
6270 CephContext* cct;
6271protected:
6272 std::string xattr;
6273public:
6274 PGLSFilter();
6275 virtual ~PGLSFilter();
6276 virtual bool filter(const hobject_t &obj,
6277 const ceph::buffer::list& xattr_data) const = 0;
6278
6279 /**
6280 * Arguments passed from the RADOS client. Implementations must
6281 * handle any encoding errors, and return an appropriate error code,
6282 * or 0 on valid input.
6283 */
6284 virtual int init(ceph::buffer::list::const_iterator &params) = 0;
6285
6286 /**
6287 * xattr key, or empty string. If non-empty, this xattr will be fetched
6288 * and the value passed into ::filter
6289 */
6290 virtual const std::string& get_xattr() const { return xattr; }
6291
6292 /**
6293 * If true, objects without the named xattr (if xattr name is not empty)
6294 * will be rejected without calling ::filter
6295 */
6296 virtual bool reject_empty_xattr() const { return true; }
6297};
6298
6299class PGLSPlainFilter : public PGLSFilter {
6300 std::string val;
6301public:
6302 int init(ceph::bufferlist::const_iterator &params) override;
6303 ~PGLSPlainFilter() override {}
6304 bool filter(const hobject_t& obj,
6305 const ceph::bufferlist& xattr_data) const override;
6306};
6307
6308
7c673cae 6309#endif