1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
9 * Author: Loic Dachary <loic@dachary.org>
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
18 #ifndef CEPH_OSD_TYPES_H
19 #define CEPH_OSD_TYPES_H
25 #include <string_view>
27 #include <boost/scoped_ptr.hpp>
28 #include <boost/optional/optional_io.hpp>
29 #include <boost/variant.hpp>
30 #include <boost/smart_ptr/local_shared_ptr.hpp>
32 #include "include/rados/rados_types.hpp"
33 #include "include/mempool.h"
35 #include "msg/msg_types.h"
36 #include "include/compat.h"
37 #include "include/types.h"
38 #include "include/utime.h"
39 #include "include/CompatSet.h"
40 #include "common/ceph_context.h"
41 #include "common/histogram.h"
42 #include "include/interval_set.h"
43 #include "include/inline_memory.h"
44 #include "common/Formatter.h"
45 #include "common/bloom_filter.hpp"
46 #include "common/hobject.h"
47 #include "common/snap_types.h"
50 #include "librados/ListObjectImpl.h"
51 #include "compressor/Compressor.h"
52 #include "osd_perf_counters.h"
54 #define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v026"
56 #define CEPH_OSD_FEATURE_INCOMPAT_BASE CompatSet::Feature(1, "initial feature set(~v.18)")
57 #define CEPH_OSD_FEATURE_INCOMPAT_PGINFO CompatSet::Feature(2, "pginfo object")
58 #define CEPH_OSD_FEATURE_INCOMPAT_OLOC CompatSet::Feature(3, "object locator")
59 #define CEPH_OSD_FEATURE_INCOMPAT_LEC CompatSet::Feature(4, "last_epoch_clean")
60 #define CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES CompatSet::Feature(5, "categories")
61 #define CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL CompatSet::Feature(6, "hobjectpool")
62 #define CEPH_OSD_FEATURE_INCOMPAT_BIGINFO CompatSet::Feature(7, "biginfo")
63 #define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO CompatSet::Feature(8, "leveldbinfo")
64 #define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG CompatSet::Feature(9, "leveldblog")
65 #define CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER CompatSet::Feature(10, "snapmapper")
66 #define CEPH_OSD_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(11, "sharded objects")
67 #define CEPH_OSD_FEATURE_INCOMPAT_HINTS CompatSet::Feature(12, "transaction hints")
68 #define CEPH_OSD_FEATURE_INCOMPAT_PGMETA CompatSet::Feature(13, "pg meta object")
69 #define CEPH_OSD_FEATURE_INCOMPAT_MISSING CompatSet::Feature(14, "explicit missing set")
70 #define CEPH_OSD_FEATURE_INCOMPAT_FASTINFO CompatSet::Feature(15, "fastinfo pg attr")
71 #define CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES CompatSet::Feature(16, "deletes in missing set")
72 #define CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2 CompatSet::Feature(17, "new snapmapper key structure")
75 /// pool priority range set by user
76 #define OSD_POOL_PRIORITY_MAX 10
77 #define OSD_POOL_PRIORITY_MIN -OSD_POOL_PRIORITY_MAX
79 /// min recovery priority for MBackfillReserve
80 #define OSD_RECOVERY_PRIORITY_MIN 0
82 /// base backfill priority for MBackfillReserve
83 #define OSD_BACKFILL_PRIORITY_BASE 100
85 /// base backfill priority for MBackfillReserve (degraded PG)
86 #define OSD_BACKFILL_DEGRADED_PRIORITY_BASE 140
88 /// base recovery priority for MBackfillReserve
89 #define OSD_RECOVERY_PRIORITY_BASE 180
91 /// base backfill priority for MBackfillReserve (inactive PG)
92 #define OSD_BACKFILL_INACTIVE_PRIORITY_BASE 220
94 /// base recovery priority for MRecoveryReserve (inactive PG)
95 #define OSD_RECOVERY_INACTIVE_PRIORITY_BASE 220
97 /// max manually/automatically set recovery priority for MBackfillReserve
98 #define OSD_RECOVERY_PRIORITY_MAX 253
100 /// backfill priority for MBackfillReserve, when forced manually
101 #define OSD_BACKFILL_PRIORITY_FORCED 254
103 /// recovery priority for MRecoveryReserve, when forced manually
104 #define OSD_RECOVERY_PRIORITY_FORCED 255
106 /// priority for pg deletion when osd is not fullish
107 #define OSD_DELETE_PRIORITY_NORMAL 179
109 /// priority for pg deletion when osd is approaching full
110 #define OSD_DELETE_PRIORITY_FULLISH 219
112 /// priority when more full
113 #define OSD_DELETE_PRIORITY_FULL 255
115 static std::map
<int, int> max_prio_map
= {
116 {OSD_BACKFILL_PRIORITY_BASE
, OSD_BACKFILL_DEGRADED_PRIORITY_BASE
- 1},
117 {OSD_BACKFILL_DEGRADED_PRIORITY_BASE
, OSD_RECOVERY_PRIORITY_BASE
- 1},
118 {OSD_RECOVERY_PRIORITY_BASE
, OSD_BACKFILL_INACTIVE_PRIORITY_BASE
- 1},
119 {OSD_RECOVERY_INACTIVE_PRIORITY_BASE
, OSD_RECOVERY_PRIORITY_MAX
},
120 {OSD_BACKFILL_INACTIVE_PRIORITY_BASE
, OSD_RECOVERY_PRIORITY_MAX
}
123 typedef hobject_t collection_list_handle_t
;
125 /// convert a single CEPH_OSD_FLAG_* to a std::string
126 const char *ceph_osd_flag_name(unsigned flag
);
127 /// convert a single CEPH_OSD_OP_FLAG_* to a std::string
128 const char *ceph_osd_op_flag_name(unsigned flag
);
130 /// convert CEPH_OSD_FLAG_* op flags to a std::string
131 std::string
ceph_osd_flag_string(unsigned flags
);
132 /// convert CEPH_OSD_OP_FLAG_* op flags to a std::string
133 std::string
ceph_osd_op_flag_string(unsigned flags
);
134 /// convert CEPH_OSD_ALLOC_HINT_FLAG_* op flags to a std::string
135 std::string
ceph_osd_alloc_hint_flag_string(unsigned flags
);
137 typedef std::map
<std::string
,std::string
> osd_alert_list_t
;
138 /// map osd id -> alert_list_t
139 typedef std::map
<int, osd_alert_list_t
> osd_alerts_t
;
140 void dump(ceph::Formatter
* f
, const osd_alerts_t
& alerts
);
143 typedef interval_set
<
145 mempool::osdmap::flat_map
> snap_interval_set_t
;
149 * osd request identifier
151 * caller name + incarnation# + tid to unique identify this request.
154 entity_name_t name
; // who
156 int32_t inc
; // incarnation
161 osd_reqid_t(const entity_name_t
& a
, int i
, ceph_tid_t t
)
162 : name(a
), tid(t
), inc(i
)
165 DENC(osd_reqid_t
, v
, p
) {
172 void dump(ceph::Formatter
*f
) const;
173 static void generate_test_instances(std::list
<osd_reqid_t
*>& o
);
175 WRITE_CLASS_DENC(osd_reqid_t
)
180 static const int32_t NO_OSD
= 0x7fffffff;
183 pg_shard_t() : osd(-1), shard(shard_id_t::NO_SHARD
) {}
184 explicit pg_shard_t(int osd
) : osd(osd
), shard(shard_id_t::NO_SHARD
) {}
185 pg_shard_t(int osd
, shard_id_t shard
) : osd(osd
), shard(shard
) {}
186 bool is_undefined() const {
189 std::string
get_osd() const { return (osd
== NO_OSD
? "NONE" : std::to_string(osd
)); }
190 void encode(ceph::buffer::list
&bl
) const;
191 void decode(ceph::buffer::list::const_iterator
&bl
);
192 void dump(ceph::Formatter
*f
) const {
193 f
->dump_unsigned("osd", osd
);
194 if (shard
!= shard_id_t::NO_SHARD
) {
195 f
->dump_unsigned("shard", shard
);
198 auto operator<=>(const pg_shard_t
&) const = default;
200 WRITE_CLASS_ENCODER(pg_shard_t
)
201 std::ostream
& operator<<(std::ostream
&lhs
, const pg_shard_t
&rhs
);
203 using HobjToShardSetMapping
= std::map
<hobject_t
, std::set
<pg_shard_t
>>;
205 class IsPGRecoverablePredicate
{
208 * have encodes the shards available
210 virtual bool operator()(const std::set
<pg_shard_t
> &have
) const = 0;
211 virtual ~IsPGRecoverablePredicate() {}
214 class IsPGReadablePredicate
{
217 * have encodes the shards available
219 virtual bool operator()(const std::set
<pg_shard_t
> &have
) const = 0;
220 virtual ~IsPGReadablePredicate() {}
223 inline std::ostream
& operator<<(std::ostream
& out
, const osd_reqid_t
& r
) {
224 return out
<< r
.name
<< "." << r
.inc
<< ":" << r
.tid
;
227 inline bool operator==(const osd_reqid_t
& l
, const osd_reqid_t
& r
) {
228 return (l
.name
== r
.name
) && (l
.inc
== r
.inc
) && (l
.tid
== r
.tid
);
230 inline bool operator!=(const osd_reqid_t
& l
, const osd_reqid_t
& r
) {
231 return (l
.name
!= r
.name
) || (l
.inc
!= r
.inc
) || (l
.tid
!= r
.tid
);
233 inline bool operator<(const osd_reqid_t
& l
, const osd_reqid_t
& r
) {
234 return (l
.name
< r
.name
) || (l
.inc
< r
.inc
) ||
235 (l
.name
== r
.name
&& l
.inc
== r
.inc
&& l
.tid
< r
.tid
);
237 inline bool operator<=(const osd_reqid_t
& l
, const osd_reqid_t
& r
) {
238 return (l
.name
< r
.name
) || (l
.inc
< r
.inc
) ||
239 (l
.name
== r
.name
&& l
.inc
== r
.inc
&& l
.tid
<= r
.tid
);
241 inline bool operator>(const osd_reqid_t
& l
, const osd_reqid_t
& r
) { return !(l
<= r
); }
242 inline bool operator>=(const osd_reqid_t
& l
, const osd_reqid_t
& r
) { return !(l
< r
); }
245 template<> struct hash
<osd_reqid_t
> {
246 size_t operator()(const osd_reqid_t
&r
) const {
247 static hash
<uint64_t> H
;
248 return H(r
.name
.num() ^ r
.tid
^ r
.inc
);
256 // a locator constrains the placement of an object. mainly, which pool
258 struct object_locator_t
{
259 // You specify either the hash or the key -- not both
260 std::int64_t pool
; ///< pool id
261 std::string key
; ///< key string (if non-empty)
262 std::string nspace
; ///< namespace
263 std::int64_t hash
; ///< hash position (if >= 0)
265 explicit object_locator_t()
266 : pool(-1), hash(-1) {}
267 explicit object_locator_t(int64_t po
)
268 : pool(po
), hash(-1) {}
269 explicit object_locator_t(int64_t po
, int64_t ps
)
270 : pool(po
), hash(ps
) {}
271 explicit object_locator_t(int64_t po
, std::string_view ns
)
272 : pool(po
), nspace(ns
), hash(-1) {}
273 explicit object_locator_t(int64_t po
, std::string_view ns
, int64_t ps
)
274 : pool(po
), nspace(ns
), hash(ps
) {}
275 explicit object_locator_t(int64_t po
, std::string_view ns
, std::string_view s
)
276 : pool(po
), key(s
), nspace(ns
), hash(-1) {}
277 explicit object_locator_t(const hobject_t
& soid
)
278 : pool(soid
.pool
), key(soid
.get_key()), nspace(soid
.nspace
), hash(-1) {}
280 int64_t get_pool() const {
295 void encode(ceph::buffer::list
& bl
) const;
296 void decode(ceph::buffer::list::const_iterator
& p
);
297 void dump(ceph::Formatter
*f
) const;
298 static void generate_test_instances(std::list
<object_locator_t
*>& o
);
300 WRITE_CLASS_ENCODER(object_locator_t
)
302 inline bool operator==(const object_locator_t
& l
, const object_locator_t
& r
) {
303 return l
.pool
== r
.pool
&& l
.key
== r
.key
&& l
.nspace
== r
.nspace
&& l
.hash
== r
.hash
;
305 inline bool operator!=(const object_locator_t
& l
, const object_locator_t
& r
) {
309 inline std::ostream
& operator<<(std::ostream
& out
, const object_locator_t
& loc
)
311 out
<< "@" << loc
.pool
;
312 if (loc
.nspace
.length())
313 out
<< ";" << loc
.nspace
;
314 if (loc
.key
.length())
315 out
<< ":" << loc
.key
;
319 struct request_redirect_t
{
321 object_locator_t redirect_locator
; ///< this is authoritative
322 std::string redirect_object
; ///< If non-empty, the request goes to this object name
324 friend std::ostream
& operator<<(std::ostream
& out
, const request_redirect_t
& redir
);
327 request_redirect_t() {}
328 explicit request_redirect_t(const object_locator_t
& orig
, int64_t rpool
) :
329 redirect_locator(orig
) { redirect_locator
.pool
= rpool
; }
330 explicit request_redirect_t(const object_locator_t
& rloc
) :
331 redirect_locator(rloc
) {}
332 explicit request_redirect_t(const object_locator_t
& orig
,
333 const std::string
& robj
) :
334 redirect_locator(orig
), redirect_object(robj
) {}
336 bool empty() const { return redirect_locator
.empty() &&
337 redirect_object
.empty(); }
339 void combine_with_locator(object_locator_t
& orig
, std::string
& obj
) const {
340 orig
= redirect_locator
;
341 if (!redirect_object
.empty())
342 obj
= redirect_object
;
345 void encode(ceph::buffer::list
& bl
) const;
346 void decode(ceph::buffer::list::const_iterator
& bl
);
347 void dump(ceph::Formatter
*f
) const;
348 static void generate_test_instances(std::list
<request_redirect_t
*>& o
);
350 WRITE_CLASS_ENCODER(request_redirect_t
)
352 inline std::ostream
& operator<<(std::ostream
& out
, const request_redirect_t
& redir
) {
353 out
<< "object " << redir
.redirect_object
<< ", locator{" << redir
.redirect_locator
<< "}";
357 // Internal OSD op flags - set by the OSD based on the op types
359 CEPH_OSD_RMW_FLAG_READ
= (1 << 1),
360 CEPH_OSD_RMW_FLAG_WRITE
= (1 << 2),
361 CEPH_OSD_RMW_FLAG_CLASS_READ
= (1 << 3),
362 CEPH_OSD_RMW_FLAG_CLASS_WRITE
= (1 << 4),
363 CEPH_OSD_RMW_FLAG_PGOP
= (1 << 5),
364 CEPH_OSD_RMW_FLAG_CACHE
= (1 << 6),
365 CEPH_OSD_RMW_FLAG_FORCE_PROMOTE
= (1 << 7),
366 CEPH_OSD_RMW_FLAG_SKIP_HANDLE_CACHE
= (1 << 8),
367 CEPH_OSD_RMW_FLAG_SKIP_PROMOTE
= (1 << 9),
368 CEPH_OSD_RMW_FLAG_RWORDERED
= (1 << 10),
369 CEPH_OSD_RMW_FLAG_RETURNVEC
= (1 << 11),
375 #define OSD_SUPERBLOCK_GOBJECT ghobject_t(hobject_t(sobject_t(object_t("osd_superblock"), 0)))
377 // placement seed (a hash value)
378 typedef uint32_t ps_t
;
380 // old (v1) pg_t encoding (wrap old struct ceph_pg)
383 void encode(ceph::buffer::list
& bl
) const {
384 ceph::encode_raw(v
, bl
);
386 void decode(ceph::buffer::list::const_iterator
& bl
) {
387 ceph::decode_raw(v
, bl
);
390 WRITE_CLASS_ENCODER(old_pg_t
)
392 // placement group id
397 pg_t() : m_pool(0), m_seed(0) {}
398 pg_t(ps_t seed
, uint64_t pool
) :
399 m_pool(pool
), m_seed(seed
) {}
400 // cppcheck-suppress noExplicitConstructor
401 pg_t(const ceph_pg
& cpg
) :
402 m_pool(cpg
.pool
), m_seed(cpg
.ps
) {}
404 // cppcheck-suppress noExplicitConstructor
405 pg_t(const old_pg_t
& opg
) {
409 old_pg_t
get_old_pg() const {
411 ceph_assert(m_pool
< 0xffffffffull
);
414 o
.v
.preferred
= (__s16
)-1;
421 int64_t pool() const {
425 static const uint8_t calc_name_buf_size
= 36; // max length for max values len("18446744073709551615.ffffffff") + future suffix len("_head") + '\0'
426 char *calc_name(char *buf
, const char *suffix_backwords
) const;
428 void set_ps(ps_t p
) {
431 void set_pool(uint64_t p
) {
435 pg_t
get_parent() const;
436 pg_t
get_ancestor(unsigned old_pg_num
) const;
438 int print(char *o
, int maxlen
) const;
439 bool parse(const char *s
);
441 bool is_split(unsigned old_pg_num
, unsigned new_pg_num
, std::set
<pg_t
> *pchildren
) const;
443 bool is_merge_source(unsigned old_pg_num
, unsigned new_pg_num
, pg_t
*parent
) const;
444 bool is_merge_target(unsigned old_pg_num
, unsigned new_pg_num
) const {
445 return ps() < new_pg_num
&& is_split(new_pg_num
, old_pg_num
, nullptr);
449 * Returns b such that for all object o:
450 * ~((~0)<<b) & o.hash) == 0 iff o is in the pg for *this
452 unsigned get_split_bits(unsigned pg_num
) const;
454 bool contains(int bits
, const ghobject_t
& oid
) const {
456 (int64_t)m_pool
== oid
.hobj
.get_logical_pool() &&
457 oid
.match(bits
, ps());
459 bool contains(int bits
, const hobject_t
& oid
) const {
461 (int64_t)m_pool
== oid
.get_logical_pool() &&
462 oid
.match(bits
, ps());
465 hobject_t
get_hobj_start() const;
466 hobject_t
get_hobj_end(unsigned pg_num
) const;
468 // strong ordering is supported
469 auto operator<=>(const pg_t
&) const noexcept
= default;
471 void encode(ceph::buffer::list
& bl
) const {
477 encode((int32_t)-1, bl
); // was preferred
479 void decode(ceph::buffer::list::const_iterator
& bl
) {
485 bl
+= sizeof(int32_t); // was preferred
487 void decode_old(ceph::buffer::list::const_iterator
& bl
) {
493 void dump(ceph::Formatter
*f
) const;
494 static void generate_test_instances(std::list
<pg_t
*>& o
);
496 WRITE_CLASS_ENCODER(pg_t
)
498 std::ostream
& operator<<(std::ostream
& out
, const pg_t
&pg
);
501 template<> struct hash
< pg_t
>
503 size_t operator()( const pg_t
& x
) const
505 static hash
<uint32_t> H
;
506 // xor (s32)-1 in there to preserve original m_preferred result (paranoia!)
507 return H((x
.pool() & 0xffffffff) ^ (x
.pool() >> 32) ^ x
.ps() ^ (int32_t)(-1));
515 spg_t() : shard(shard_id_t::NO_SHARD
) {}
516 spg_t(pg_t pgid
, shard_id_t shard
) : pgid(pgid
), shard(shard
) {}
517 explicit spg_t(pg_t pgid
) : pgid(pgid
), shard(shard_id_t::NO_SHARD
) {}
518 auto operator<=>(const spg_t
&) const = default;
519 unsigned get_split_bits(unsigned pg_num
) const {
520 return pgid
.get_split_bits(pg_num
);
522 spg_t
get_parent() const {
523 return spg_t(pgid
.get_parent(), shard
);
528 uint64_t pool() const {
531 void reset_shard(shard_id_t s
) {
535 static const uint8_t calc_name_buf_size
= pg_t::calc_name_buf_size
+ 4; // 36 + len('s') + len("255");
536 char *calc_name(char *buf
, const char *suffix_backwords
) const;
537 // and a (limited) version that uses an internal buffer:
538 std::string
calc_name_sring() const;
540 bool parse(const char *s
);
541 bool parse(const std::string
& s
) {
542 return parse(s
.c_str());
545 spg_t
get_ancestor(unsigned old_pg_num
) const {
546 return spg_t(pgid
.get_ancestor(old_pg_num
), shard
);
549 bool is_split(unsigned old_pg_num
, unsigned new_pg_num
,
550 std::set
<spg_t
> *pchildren
) const {
551 std::set
<pg_t
> _children
;
552 std::set
<pg_t
> *children
= pchildren
? &_children
: NULL
;
553 bool is_split
= pgid
.is_split(old_pg_num
, new_pg_num
, children
);
554 if (pchildren
&& is_split
) {
555 for (std::set
<pg_t
>::iterator i
= _children
.begin();
556 i
!= _children
.end();
558 pchildren
->insert(spg_t(*i
, shard
));
563 bool is_merge_target(unsigned old_pg_num
, unsigned new_pg_num
) const {
564 return pgid
.is_merge_target(old_pg_num
, new_pg_num
);
566 bool is_merge_source(unsigned old_pg_num
, unsigned new_pg_num
,
567 spg_t
*parent
) const {
569 bool r
= pgid
.is_merge_source(old_pg_num
, new_pg_num
, &out
.pgid
);
576 bool is_no_shard() const {
577 return shard
== shard_id_t::NO_SHARD
;
580 ghobject_t
make_pgmeta_oid() const {
581 return ghobject_t::make_pgmeta(pgid
.pool(), pgid
.ps(), shard
);
584 void encode(ceph::buffer::list
&bl
) const {
585 ENCODE_START(1, 1, bl
);
590 void decode(ceph::buffer::list::const_iterator
& bl
) {
597 ghobject_t
make_temp_ghobject(const std::string
& name
) const {
599 hobject_t(object_t(name
), "", CEPH_NOSNAP
,
601 hobject_t::get_temp_pool(pgid
.pool()),
607 unsigned hash_to_shard(unsigned num_shards
) const {
608 return ps() % num_shards
;
611 WRITE_CLASS_ENCODER(spg_t
)
614 template<> struct hash
< spg_t
>
616 size_t operator()( const spg_t
& x
) const
618 static hash
<uint32_t> H
;
619 return H(hash
<pg_t
>()(x
.pgid
) ^ x
.shard
);
624 std::ostream
& operator<<(std::ostream
& out
, const spg_t
&pg
);
626 // ----------------------
629 enum type_t
: uint8_t {
631 TYPE_LEGACY_TEMP
= 1, /* no longer used */
637 uint64_t removal_seq
; // note: deprecated, not encoded
639 char _str_buff
[spg_t::calc_name_buf_size
];
644 coll_t(type_t t
, spg_t p
, uint64_t r
)
645 : type(t
), pgid(p
), removal_seq(r
) {
649 friend class denc_coll_t
;
651 coll_t() : type(TYPE_META
), removal_seq(0)
656 coll_t(const coll_t
& other
)
657 : type(other
.type
), pgid(other
.pgid
), removal_seq(other
.removal_seq
) {
661 explicit coll_t(spg_t pgid
)
662 : type(TYPE_PG
), pgid(pgid
), removal_seq(0)
667 coll_t
& operator=(const coll_t
& rhs
)
669 this->type
= rhs
.type
;
670 this->pgid
= rhs
.pgid
;
671 this->removal_seq
= rhs
.removal_seq
;
676 // named constructors
677 static coll_t
meta() {
680 static coll_t
pg(spg_t p
) {
684 const std::string
to_str() const {
685 return std::string(_str
);
687 const char *c_str() const {
691 bool parse(const std::string
& s
);
693 int operator<(const coll_t
&rhs
) const {
694 return type
< rhs
.type
||
695 (type
== rhs
.type
&& pgid
< rhs
.pgid
);
698 bool is_meta() const {
699 return type
== TYPE_META
;
701 bool is_pg_prefix(spg_t
*pgid_
) const {
702 if (type
== TYPE_PG
|| type
== TYPE_PG_TEMP
) {
709 return type
== TYPE_PG
;
711 bool is_pg(spg_t
*pgid_
) const {
712 if (type
== TYPE_PG
) {
718 bool is_temp() const {
719 return type
== TYPE_PG_TEMP
;
721 bool is_temp(spg_t
*pgid_
) const {
722 if (type
== TYPE_PG_TEMP
) {
728 int64_t pool() const {
732 void encode(ceph::buffer::list
& bl
) const;
733 void decode(ceph::buffer::list::const_iterator
& bl
);
734 size_t encoded_size() const;
736 inline bool operator==(const coll_t
& rhs
) const {
737 // only compare type if meta
738 if (type
!= rhs
.type
)
740 if (type
== TYPE_META
)
742 return type
== rhs
.type
&& pgid
== rhs
.pgid
;
744 inline bool operator!=(const coll_t
& rhs
) const {
745 return !(*this == rhs
);
748 // get a TEMP collection that corresponds to the current collection,
749 // which we presume is a pg collection.
750 coll_t
get_temp() const {
751 ceph_assert(type
== TYPE_PG
);
752 return coll_t(TYPE_PG_TEMP
, pgid
, 0);
755 ghobject_t
get_min_hobj() const {
759 o
.hobj
.pool
= pgid
.pool();
760 o
.set_shard(pgid
.shard
);
771 unsigned hash_to_shard(unsigned num_shards
) const {
773 return pgid
.hash_to_shard(num_shards
);
774 return 0; // whatever.
777 void dump(ceph::Formatter
*f
) const;
778 static void generate_test_instances(std::list
<coll_t
*>& o
);
781 WRITE_CLASS_ENCODER(coll_t
)
783 inline std::ostream
& operator<<(std::ostream
& out
, const coll_t
& c
) {
788 #if FMT_VERSION >= 90000
789 template <> struct fmt::formatter
<coll_t
> : fmt::ostream_formatter
{};
793 template<> struct hash
<coll_t
> {
794 size_t operator()(const coll_t
&c
) const {
796 std::string
str(c
.to_str());
797 std::string::const_iterator
end(str
.end());
798 for (std::string::const_iterator s
= str
.begin(); s
!= end
; ++s
) {
811 inline std::ostream
& operator<<(std::ostream
& out
, const ceph_object_layout
&ol
)
813 out
<< pg_t(ol
.ol_pgid
);
814 int su
= ol
.ol_stripe_unit
;
823 auto &get_type() const { return coll
.type
; }
824 auto &get_type() { return coll
.type
; }
825 auto &get_pgid() const { return coll
.pgid
; }
826 auto &get_pgid() { return coll
.pgid
; }
828 denc_coll_t() = default;
829 denc_coll_t(const denc_coll_t
&) = default;
830 denc_coll_t(denc_coll_t
&&) = default;
832 denc_coll_t
&operator=(const denc_coll_t
&) = default;
833 denc_coll_t
&operator=(denc_coll_t
&&) = default;
835 explicit denc_coll_t(const coll_t
&coll
) : coll(coll
) {}
836 operator coll_t() const {
840 bool operator<(const denc_coll_t
&rhs
) const {
841 return coll
< rhs
.coll
;
844 DENC(denc_coll_t
, v
, p
) {
846 denc(v
.get_type(), p
);
847 denc(v
.get_pgid().pgid
.m_pool
, p
);
848 denc(v
.get_pgid().pgid
.m_seed
, p
);
849 denc(v
.get_pgid().shard
.id
, p
);
853 WRITE_CLASS_DENC(denc_coll_t
)
856 // compound rados version type
857 /* WARNING: If add member in eversion_t, please make sure the encode/decode function
858 * work well. For little-endian machine, we should make sure there is no padding
859 * in 32-bit machine and 64-bit machine.
866 eversion_t() : version(0), epoch(0), __pad(0) {}
867 eversion_t(epoch_t e
, version_t v
) : version(v
), epoch(e
), __pad(0) {}
869 // cppcheck-suppress noExplicitConstructor
870 eversion_t(const ceph_eversion
& ce
) :
875 explicit eversion_t(ceph::buffer::list
& bl
) : __pad(0) { decode(bl
); }
877 static const eversion_t
& max() {
878 static const eversion_t
max(-1,-1);
882 operator ceph_eversion() {
889 std::string
get_key_name() const;
891 // key must point to the beginning of a block of 32 chars
892 inline void get_key_name(char* key
) const {
893 // Below is equivalent of sprintf("%010u.%020llu");
895 ritoa
<uint64_t, 10, 20>(version
, key
+ 31);
897 ritoa
<uint32_t, 10, 10>(epoch
, key
+ 10);
900 void encode(ceph::buffer::list
&bl
) const {
901 #if defined(CEPH_LITTLE_ENDIAN)
902 bl
.append((char *)this, sizeof(version_t
) + sizeof(epoch_t
));
909 void decode(ceph::buffer::list::const_iterator
&bl
) {
910 #if defined(CEPH_LITTLE_ENDIAN)
911 bl
.copy(sizeof(version_t
) + sizeof(epoch_t
), (char *)this);
918 void decode(ceph::buffer::list
& bl
) {
919 auto p
= std::cbegin(bl
);
923 WRITE_CLASS_ENCODER(eversion_t
)
925 inline bool operator==(const eversion_t
& l
, const eversion_t
& r
) {
926 return (l
.epoch
== r
.epoch
) && (l
.version
== r
.version
);
928 inline bool operator!=(const eversion_t
& l
, const eversion_t
& r
) {
929 return (l
.epoch
!= r
.epoch
) || (l
.version
!= r
.version
);
931 inline bool operator<(const eversion_t
& l
, const eversion_t
& r
) {
932 return (l
.epoch
== r
.epoch
) ? (l
.version
< r
.version
):(l
.epoch
< r
.epoch
);
934 inline bool operator<=(const eversion_t
& l
, const eversion_t
& r
) {
935 return (l
.epoch
== r
.epoch
) ? (l
.version
<= r
.version
):(l
.epoch
<= r
.epoch
);
937 inline bool operator>(const eversion_t
& l
, const eversion_t
& r
) {
938 return (l
.epoch
== r
.epoch
) ? (l
.version
> r
.version
):(l
.epoch
> r
.epoch
);
940 inline bool operator>=(const eversion_t
& l
, const eversion_t
& r
) {
941 return (l
.epoch
== r
.epoch
) ? (l
.version
>= r
.version
):(l
.epoch
>= r
.epoch
);
943 inline std::ostream
& operator<<(std::ostream
& out
, const eversion_t
& e
) {
944 return out
<< e
.epoch
<< "'" << e
.version
;
948 * objectstore_perf_stat_t
950 * current perf information about the osd
952 struct objectstore_perf_stat_t
{
953 // cur_op_latency is in ns since double add/sub are not associative
954 uint64_t os_commit_latency_ns
;
955 uint64_t os_apply_latency_ns
;
957 objectstore_perf_stat_t() :
958 os_commit_latency_ns(0), os_apply_latency_ns(0) {}
960 bool operator==(const objectstore_perf_stat_t
&r
) const {
961 return os_commit_latency_ns
== r
.os_commit_latency_ns
&&
962 os_apply_latency_ns
== r
.os_apply_latency_ns
;
965 void add(const objectstore_perf_stat_t
&o
) {
966 os_commit_latency_ns
+= o
.os_commit_latency_ns
;
967 os_apply_latency_ns
+= o
.os_apply_latency_ns
;
969 void sub(const objectstore_perf_stat_t
&o
) {
970 os_commit_latency_ns
-= o
.os_commit_latency_ns
;
971 os_apply_latency_ns
-= o
.os_apply_latency_ns
;
973 void dump(ceph::Formatter
*f
) const;
974 void encode(ceph::buffer::list
&bl
, uint64_t features
) const;
975 void decode(ceph::buffer::list::const_iterator
&bl
);
976 static void generate_test_instances(std::list
<objectstore_perf_stat_t
*>& o
);
978 WRITE_CLASS_ENCODER_FEATURES(objectstore_perf_stat_t
)
983 #define PG_STATE_CREATING (1ULL << 0) // creating
984 #define PG_STATE_ACTIVE (1ULL << 1) // i am active. (primary: replicas too)
985 #define PG_STATE_CLEAN (1ULL << 2) // peers are complete, clean of stray replicas.
986 #define PG_STATE_DOWN (1ULL << 4) // a needed replica is down, PG offline
987 #define PG_STATE_RECOVERY_UNFOUND (1ULL << 5) // recovery stopped due to unfound
988 #define PG_STATE_BACKFILL_UNFOUND (1ULL << 6) // backfill stopped due to unfound
989 #define PG_STATE_PREMERGE (1ULL << 7) // i am prepare to merging
990 #define PG_STATE_SCRUBBING (1ULL << 8) // scrubbing
991 //#define PG_STATE_SCRUBQ (1ULL << 9) // queued for scrub
992 #define PG_STATE_DEGRADED (1ULL << 10) // pg contains objects with reduced redundancy
993 #define PG_STATE_INCONSISTENT (1ULL << 11) // pg replicas are inconsistent (but shouldn't be)
994 #define PG_STATE_PEERING (1ULL << 12) // pg is (re)peering
995 #define PG_STATE_REPAIR (1ULL << 13) // pg should repair on next scrub
996 #define PG_STATE_RECOVERING (1ULL << 14) // pg is recovering/migrating objects
997 #define PG_STATE_BACKFILL_WAIT (1ULL << 15) // [active] reserving backfill
998 #define PG_STATE_INCOMPLETE (1ULL << 16) // incomplete content, peering failed.
999 #define PG_STATE_STALE (1ULL << 17) // our state for this pg is stale, unknown.
1000 #define PG_STATE_REMAPPED (1ULL << 18) // pg is explicitly remapped to different OSDs than CRUSH
1001 #define PG_STATE_DEEP_SCRUB (1ULL << 19) // deep scrub: check CRC32 on files
1002 #define PG_STATE_BACKFILLING (1ULL << 20) // [active] backfilling pg content
1003 #define PG_STATE_BACKFILL_TOOFULL (1ULL << 21) // backfill can't proceed: too full
1004 #define PG_STATE_RECOVERY_WAIT (1ULL << 22) // waiting for recovery reservations
1005 #define PG_STATE_UNDERSIZED (1ULL << 23) // pg acting < pool size
1006 #define PG_STATE_ACTIVATING (1ULL << 24) // pg is peered but not yet active
1007 #define PG_STATE_PEERED (1ULL << 25) // peered, cannot go active, can recover
1008 #define PG_STATE_SNAPTRIM (1ULL << 26) // trimming snaps
1009 #define PG_STATE_SNAPTRIM_WAIT (1ULL << 27) // queued to trim snaps
1010 #define PG_STATE_RECOVERY_TOOFULL (1ULL << 28) // recovery can't proceed: too full
1011 #define PG_STATE_SNAPTRIM_ERROR (1ULL << 29) // error stopped trimming snaps
1012 #define PG_STATE_FORCED_RECOVERY (1ULL << 30) // force recovery of this pg before any other
1013 #define PG_STATE_FORCED_BACKFILL (1ULL << 31) // force backfill of this pg before any other
1014 #define PG_STATE_FAILED_REPAIR (1ULL << 32) // A repair failed to fix all errors
1015 #define PG_STATE_LAGGY (1ULL << 33) // PG is laggy/unreabable due to slow/delayed pings
1016 #define PG_STATE_WAIT (1ULL << 34) // PG is waiting for prior intervals' readable period to expire
1018 std::string
pg_state_string(uint64_t state
);
1019 std::string
pg_vector_string(const std::vector
<int32_t> &a
);
1020 std::optional
<uint64_t> pg_string_state(const std::string
& state
);
1026 * attributes for a single pool snapshot.
1028 struct pool_snap_info_t
{
1033 void dump(ceph::Formatter
*f
) const;
1034 void encode(ceph::buffer::list
& bl
, uint64_t features
) const;
1035 void decode(ceph::buffer::list::const_iterator
& bl
);
1036 static void generate_test_instances(std::list
<pool_snap_info_t
*>& o
);
1038 WRITE_CLASS_ENCODER_FEATURES(pool_snap_info_t
)
1040 inline std::ostream
& operator<<(std::ostream
& out
, const pool_snap_info_t
& si
) {
1041 return out
<< si
.snapid
<< '(' << si
.name
<< ' ' << si
.stamp
<< ')';
1051 // The order of items in the list is important, therefore,
1052 // you should always add to the end of the list when adding new options.
1059 DEEP_SCRUB_INTERVAL
,
1061 RECOVERY_OP_PRIORITY
,
1064 COMPRESSION_ALGORITHM
,
1065 COMPRESSION_REQUIRED_RATIO
,
1066 COMPRESSION_MAX_BLOB_SIZE
,
1067 COMPRESSION_MIN_BLOB_SIZE
,
1071 FINGERPRINT_ALGORITHM
,
1072 PG_NUM_MIN
, // min pg_num
1073 TARGET_SIZE_BYTES
, // total bytes in pool
1074 TARGET_SIZE_RATIO
, // fraction of total cluster
1076 READ_LEASE_INTERVAL
,
1078 DEDUP_CHUNK_ALGORITHM
,
1079 DEDUP_CDC_CHUNK_SIZE
,
1080 PG_NUM_MAX
, // max pg_num
1093 opt_desc_t(key_t k
, type_t t
) : key(k
), type(t
) {}
1095 bool operator==(const opt_desc_t
& rhs
) const {
1096 return key
== rhs
.key
&& type
== rhs
.type
;
1100 typedef boost::variant
<std::string
,int64_t,double> value_t
;
1102 static bool is_opt_name(const std::string
& name
);
1103 static opt_desc_t
get_opt_desc(const std::string
& name
);
1105 pool_opts_t() : opts() {}
1107 bool is_set(key_t key
) const;
1109 template<typename T
>
1110 void set(key_t key
, const T
&val
) {
1111 value_t value
= val
;
1115 template<typename T
>
1116 bool get(key_t key
, T
*val
) const {
1117 opts_t::const_iterator i
= opts
.find(key
);
1118 if (i
== opts
.end()) {
1121 *val
= boost::get
<T
>(i
->second
);
1125 template<typename T
>
1126 T
value_or(key_t key
, T
&& default_value
) const {
1127 auto i
= opts
.find(key
);
1128 if (i
== opts
.end()) {
1129 return std::forward
<T
>(default_value
);
1131 return boost::get
<T
>(i
->second
);
1134 const value_t
& get(key_t key
) const;
1136 bool unset(key_t key
);
1138 void dump(const std::string
& name
, ceph::Formatter
*f
) const;
1140 void dump(ceph::Formatter
*f
) const;
1141 void encode(ceph::buffer::list
&bl
, uint64_t features
) const;
1142 void decode(ceph::buffer::list::const_iterator
&bl
);
1145 typedef std::map
<key_t
, value_t
> opts_t
;
1148 friend std::ostream
& operator<<(std::ostream
& out
, const pool_opts_t
& opts
);
1150 WRITE_CLASS_ENCODER_FEATURES(pool_opts_t
)
1152 struct pg_merge_meta_t
{
1154 epoch_t ready_epoch
= 0;
1155 epoch_t last_epoch_started
= 0;
1156 epoch_t last_epoch_clean
= 0;
1157 eversion_t source_version
;
1158 eversion_t target_version
;
1160 void encode(ceph::buffer::list
& bl
) const {
1161 ENCODE_START(1, 1, bl
);
1162 encode(source_pgid
, bl
);
1163 encode(ready_epoch
, bl
);
1164 encode(last_epoch_started
, bl
);
1165 encode(last_epoch_clean
, bl
);
1166 encode(source_version
, bl
);
1167 encode(target_version
, bl
);
1170 void decode(ceph::buffer::list::const_iterator
& p
) {
1172 decode(source_pgid
, p
);
1173 decode(ready_epoch
, p
);
1174 decode(last_epoch_started
, p
);
1175 decode(last_epoch_clean
, p
);
1176 decode(source_version
, p
);
1177 decode(target_version
, p
);
1180 void dump(ceph::Formatter
*f
) const {
1181 f
->dump_stream("source_pgid") << source_pgid
;
1182 f
->dump_unsigned("ready_epoch", ready_epoch
);
1183 f
->dump_unsigned("last_epoch_started", last_epoch_started
);
1184 f
->dump_unsigned("last_epoch_clean", last_epoch_clean
);
1185 f
->dump_stream("source_version") << source_version
;
1186 f
->dump_stream("target_version") << target_version
;
1189 WRITE_CLASS_ENCODER(pg_merge_meta_t
)
// Well-known pool application names (static class data; defined out of line).
static const char *APPLICATION_NAME_CEPHFS;
static const char *APPLICATION_NAME_RBD;
static const char *APPLICATION_NAME_RGW;
1202 TYPE_REPLICATED
= 1, // replication
1203 //TYPE_RAID4 = 2, // raid4 (never implemented)
1204 TYPE_ERASURE
= 3, // erasure-coded
1206 static constexpr uint32_t pg_CRUSH_ITEM_NONE
= 0x7fffffff; /* can't import crush.h here */
1207 static std::string_view
get_type_name(int t
) {
1209 case TYPE_REPLICATED
: return "replicated";
1210 //case TYPE_RAID4: return "raid4";
1211 case TYPE_ERASURE
: return "erasure";
1212 default: return "???";
1215 std::string_view
get_type_name() const {
1216 return get_type_name(type
);
1220 FLAG_HASHPSPOOL
= 1<<0, // hash pg seed and pool together (instead of adding)
1221 FLAG_FULL
= 1<<1, // pool is full
1222 FLAG_EC_OVERWRITES
= 1<<2, // enables overwrites, once enabled, cannot be disabled
1223 FLAG_INCOMPLETE_CLONES
= 1<<3, // may have incomplete clones (bc we are/were an overlay)
1224 FLAG_NODELETE
= 1<<4, // pool can't be deleted
1225 FLAG_NOPGCHANGE
= 1<<5, // pool's pg and pgp num can't be changed
1226 FLAG_NOSIZECHANGE
= 1<<6, // pool's size and min size can't be changed
1227 FLAG_WRITE_FADVISE_DONTNEED
= 1<<7, // write mode with LIBRADOS_OP_FLAG_FADVISE_DONTNEED
1228 FLAG_NOSCRUB
= 1<<8, // block periodic scrub
1229 FLAG_NODEEP_SCRUB
= 1<<9, // block periodic deep-scrub
1230 FLAG_FULL_QUOTA
= 1<<10, // pool is currently running out of quota, will set FLAG_FULL too
1231 FLAG_NEARFULL
= 1<<11, // pool is nearfull
1232 FLAG_BACKFILLFULL
= 1<<12, // pool is backfillfull
1233 FLAG_SELFMANAGED_SNAPS
= 1<<13, // pool uses selfmanaged snaps
1234 FLAG_POOL_SNAPS
= 1<<14, // pool has pool snaps
1235 FLAG_CREATING
= 1<<15, // initial pool PGs are being created
1236 FLAG_EIO
= 1<<16, // return EIO for all client ops
1237 FLAG_BULK
= 1<<17, //pool is large
1238 // PGs from this pool are allowed to be created on crimson osds.
1239 // Pool features are restricted to those supported by crimson-osd.
1240 // Note, does not prohibit being created on classic osd.
1241 FLAG_CRIMSON
= 1<<18,
1244 static const char *get_flag_name(uint64_t f
) {
1246 case FLAG_HASHPSPOOL
: return "hashpspool";
1247 case FLAG_FULL
: return "full";
1248 case FLAG_EC_OVERWRITES
: return "ec_overwrites";
1249 case FLAG_INCOMPLETE_CLONES
: return "incomplete_clones";
1250 case FLAG_NODELETE
: return "nodelete";
1251 case FLAG_NOPGCHANGE
: return "nopgchange";
1252 case FLAG_NOSIZECHANGE
: return "nosizechange";
1253 case FLAG_WRITE_FADVISE_DONTNEED
: return "write_fadvise_dontneed";
1254 case FLAG_NOSCRUB
: return "noscrub";
1255 case FLAG_NODEEP_SCRUB
: return "nodeep-scrub";
1256 case FLAG_FULL_QUOTA
: return "full_quota";
1257 case FLAG_NEARFULL
: return "nearfull";
1258 case FLAG_BACKFILLFULL
: return "backfillfull";
1259 case FLAG_SELFMANAGED_SNAPS
: return "selfmanaged_snaps";
1260 case FLAG_POOL_SNAPS
: return "pool_snaps";
1261 case FLAG_CREATING
: return "creating";
1262 case FLAG_EIO
: return "eio";
1263 case FLAG_BULK
: return "bulk";
1264 case FLAG_CRIMSON
: return "crimson";
1265 default: return "???";
1268 static std::string
get_flags_string(uint64_t f
) {
1270 for (unsigned n
=0; f
&& n
<64; ++n
) {
1271 if (f
& (1ull << n
)) {
1274 s
+= get_flag_name(1ull << n
);
1279 std::string
get_flags_string() const {
1280 return get_flags_string(flags
);
1282 static uint64_t get_flag_by_name(const std::string
& name
) {
1283 if (name
== "hashpspool")
1284 return FLAG_HASHPSPOOL
;
1287 if (name
== "ec_overwrites")
1288 return FLAG_EC_OVERWRITES
;
1289 if (name
== "incomplete_clones")
1290 return FLAG_INCOMPLETE_CLONES
;
1291 if (name
== "nodelete")
1292 return FLAG_NODELETE
;
1293 if (name
== "nopgchange")
1294 return FLAG_NOPGCHANGE
;
1295 if (name
== "nosizechange")
1296 return FLAG_NOSIZECHANGE
;
1297 if (name
== "write_fadvise_dontneed")
1298 return FLAG_WRITE_FADVISE_DONTNEED
;
1299 if (name
== "noscrub")
1300 return FLAG_NOSCRUB
;
1301 if (name
== "nodeep-scrub")
1302 return FLAG_NODEEP_SCRUB
;
1303 if (name
== "full_quota")
1304 return FLAG_FULL_QUOTA
;
1305 if (name
== "nearfull")
1306 return FLAG_NEARFULL
;
1307 if (name
== "backfillfull")
1308 return FLAG_BACKFILLFULL
;
1309 if (name
== "selfmanaged_snaps")
1310 return FLAG_SELFMANAGED_SNAPS
;
1311 if (name
== "pool_snaps")
1312 return FLAG_POOL_SNAPS
;
1313 if (name
== "creating")
1314 return FLAG_CREATING
;
1319 if (name
== "crimson")
1320 return FLAG_CRIMSON
;
1324 /// converts the acting/up vector to a set of pg shards
1325 void convert_to_pg_shards(const std::vector
<int> &from
, std::set
<pg_shard_t
>* to
) const;
1328 CACHEMODE_NONE
= 0, ///< no caching
1329 CACHEMODE_WRITEBACK
= 1, ///< write to cache, flush later
1330 CACHEMODE_FORWARD
= 2, ///< forward if not in cache
1331 CACHEMODE_READONLY
= 3, ///< handle reads, forward writes [not strongly consistent]
1332 CACHEMODE_READFORWARD
= 4, ///< forward reads, write to cache flush later
1333 CACHEMODE_READPROXY
= 5, ///< proxy reads, write to cache flush later
1334 CACHEMODE_PROXY
= 6, ///< proxy if not in cache
1336 static const char *get_cache_mode_name(cache_mode_t m
) {
1338 case CACHEMODE_NONE
: return "none";
1339 case CACHEMODE_WRITEBACK
: return "writeback";
1340 case CACHEMODE_FORWARD
: return "forward";
1341 case CACHEMODE_READONLY
: return "readonly";
1342 case CACHEMODE_READFORWARD
: return "readforward";
1343 case CACHEMODE_READPROXY
: return "readproxy";
1344 case CACHEMODE_PROXY
: return "proxy";
1345 default: return "unknown";
1348 static cache_mode_t
get_cache_mode_from_str(const std::string
& s
) {
1350 return CACHEMODE_NONE
;
1351 if (s
== "writeback")
1352 return CACHEMODE_WRITEBACK
;
1354 return CACHEMODE_FORWARD
;
1355 if (s
== "readonly")
1356 return CACHEMODE_READONLY
;
1357 if (s
== "readforward")
1358 return CACHEMODE_READFORWARD
;
1359 if (s
== "readproxy")
1360 return CACHEMODE_READPROXY
;
1362 return CACHEMODE_PROXY
;
1363 return (cache_mode_t
)-1;
1365 const char *get_cache_mode_name() const {
1366 return get_cache_mode_name(cache_mode
);
1368 bool cache_mode_requires_hit_set() const {
1369 switch (cache_mode
) {
1370 case CACHEMODE_NONE
:
1371 case CACHEMODE_FORWARD
:
1372 case CACHEMODE_READONLY
:
1373 case CACHEMODE_PROXY
:
1375 case CACHEMODE_WRITEBACK
:
1376 case CACHEMODE_READFORWARD
:
1377 case CACHEMODE_READPROXY
:
1380 ceph_abort_msg("implement me");
1384 enum class pg_autoscale_mode_t
: uint8_t {
1388 UNKNOWN
= UINT8_MAX
,
1390 static const char *get_pg_autoscale_mode_name(pg_autoscale_mode_t m
) {
1392 case pg_autoscale_mode_t::OFF
: return "off";
1393 case pg_autoscale_mode_t::ON
: return "on";
1394 case pg_autoscale_mode_t::WARN
: return "warn";
1395 default: return "???";
1398 static pg_autoscale_mode_t
get_pg_autoscale_mode_by_name(const std::string
& m
) {
1400 return pg_autoscale_mode_t::OFF
;
1403 return pg_autoscale_mode_t::WARN
;
1406 return pg_autoscale_mode_t::ON
;
1408 return pg_autoscale_mode_t::UNKNOWN
;
1411 utime_t create_time
;
1412 uint64_t flags
= 0; ///< FLAG_*
1413 __u8 type
= 0; ///< TYPE_*
1414 __u8 size
= 0, min_size
= 0; ///< number of osds in each pg
1415 __u8 crush_rule
= 0; ///< crush placement rule
1416 __u8 object_hash
= 0; ///< hash mapping object name to ps
1417 pg_autoscale_mode_t pg_autoscale_mode
= pg_autoscale_mode_t::UNKNOWN
;
1420 __u32 pg_num
= 0, pgp_num
= 0; ///< number of pgs
1421 __u32 pg_num_pending
= 0; ///< pg_num we are about to merge down to
1422 __u32 pg_num_target
= 0; ///< pg_num we should converge toward
1423 __u32 pgp_num_target
= 0; ///< pgp_num we should converge toward
1426 std::map
<std::string
, std::string
> properties
; ///< OBSOLETE
1427 std::string erasure_code_profile
; ///< name of the erasure code profile in OSDMap
1428 epoch_t last_change
= 0; ///< most recent epoch changed, exclusing snapshot changes
1429 // If non-zero, require OSDs in at least this many different instances...
1430 uint32_t peering_crush_bucket_count
= 0;
1431 // of this bucket type...
1432 uint32_t peering_crush_bucket_barrier
= 0;
1433 // including this one
1434 int32_t peering_crush_mandatory_member
= pg_CRUSH_ITEM_NONE
;
1435 // The per-bucket replica count is calculated with this "target"
1436 // instead of the above crush_bucket_count. This means we can maintain a
1437 // target size of 4 without attempting to place them all in 1 DC
1438 uint32_t peering_crush_bucket_target
= 0;
1439 /// last epoch that forced clients to resend
1440 epoch_t last_force_op_resend
= 0;
1441 /// last epoch that forced clients to resend (pre-nautilus clients only)
1442 epoch_t last_force_op_resend_prenautilus
= 0;
1443 /// last epoch that forced clients to resend (pre-luminous clients only)
1444 epoch_t last_force_op_resend_preluminous
= 0;
1446 /// metadata for the most recent PG merge
1447 pg_merge_meta_t last_pg_merge_meta
;
1449 snapid_t snap_seq
= 0; ///< seq for per-pool snapshot
1450 epoch_t snap_epoch
= 0; ///< osdmap epoch of last snap
1451 uint64_t auid
= 0; ///< who owns the pg
1453 uint64_t quota_max_bytes
= 0; ///< maximum number of bytes for this pool
1454 uint64_t quota_max_objects
= 0; ///< maximum number of objects for this pool
1457 * Pool snaps (global to this pool). These define a SnapContext for
1458 * the pool, unless the client manually specifies an alternate
1461 std::map
<snapid_t
, pool_snap_info_t
> snaps
;
1463 * Alternatively, if we are defining non-pool snaps (e.g. via the
1464 * Ceph MDS), we must track @removed_snaps (since @snaps is not
1465 * used). Snaps and removed_snaps are to be used exclusive of each
1468 interval_set
<snapid_t
> removed_snaps
;
1470 unsigned pg_num_mask
= 0, pgp_num_mask
= 0;
1472 std::set
<uint64_t> tiers
; ///< pools that are tiers of us
1473 int64_t tier_of
= -1; ///< pool for which we are a tier
1474 // Note that write wins for read+write ops
1475 int64_t read_tier
= -1; ///< pool/tier for objecter to direct reads to
1476 int64_t write_tier
= -1; ///< pool/tier for objecter to direct writes to
1477 cache_mode_t cache_mode
= CACHEMODE_NONE
; ///< cache pool mode
1479 bool is_tier() const { return tier_of
>= 0; }
1480 bool has_tiers() const { return !tiers
.empty(); }
1485 clear_tier_tunables();
1487 bool has_read_tier() const { return read_tier
>= 0; }
1488 void clear_read_tier() { read_tier
= -1; }
1489 bool has_write_tier() const { return write_tier
>= 0; }
1490 void clear_write_tier() { write_tier
= -1; }
1491 void clear_tier_tunables() {
1492 if (cache_mode
!= CACHEMODE_NONE
)
1493 flags
|= FLAG_INCOMPLETE_CLONES
;
1494 cache_mode
= CACHEMODE_NONE
;
1496 target_max_bytes
= 0;
1497 target_max_objects
= 0;
1498 cache_target_dirty_ratio_micro
= 0;
1499 cache_target_dirty_high_ratio_micro
= 0;
1500 cache_target_full_ratio_micro
= 0;
1501 hit_set_params
= HitSet::Params();
1504 hit_set_grade_decay_rate
= 0;
1505 hit_set_search_last_n
= 0;
1506 grade_table
.resize(0);
1509 bool has_snaps() const {
1510 return snaps
.size() > 0;
1513 bool is_stretch_pool() const {
1514 return peering_crush_bucket_count
!= 0;
1517 bool stretch_set_can_peer(const std::set
<int>& want
, const OSDMap
& osdmap
,
1518 std::ostream
*out
) const;
1519 bool stretch_set_can_peer(const std::vector
<int>& want
, const OSDMap
& osdmap
,
1520 std::ostream
*out
) const {
1521 if (!is_stretch_pool()) return true;
1522 std::set
<int> swant
;
1523 for (auto i
: want
) swant
.insert(i
);
1524 return stretch_set_can_peer(swant
, osdmap
, out
);
1527 uint64_t target_max_bytes
= 0; ///< tiering: target max pool size
1528 uint64_t target_max_objects
= 0; ///< tiering: target max pool size
1530 uint32_t cache_target_dirty_ratio_micro
= 0; ///< cache: fraction of target to leave dirty
1531 uint32_t cache_target_dirty_high_ratio_micro
= 0; ///< cache: fraction of target to flush with high speed
1532 uint32_t cache_target_full_ratio_micro
= 0; ///< cache: fraction of target to fill before we evict in earnest
1534 uint32_t cache_min_flush_age
= 0; ///< minimum age (seconds) before we can flush
1535 uint32_t cache_min_evict_age
= 0; ///< minimum age (seconds) before we can evict
1537 HitSet::Params hit_set_params
; ///< The HitSet params to use on this pool
1538 uint32_t hit_set_period
= 0; ///< periodicity of HitSet segments (seconds)
1539 uint32_t hit_set_count
= 0; ///< number of periods to retain
1540 bool use_gmt_hitset
= true; ///< use gmt to name the hitset archive object
1541 uint32_t min_read_recency_for_promote
= 0; ///< minimum number of HitSet to check before promote on read
1542 uint32_t min_write_recency_for_promote
= 0; ///< minimum number of HitSet to check before promote on write
1543 uint32_t hit_set_grade_decay_rate
= 0; ///< current hit_set has highest priority on objects
1544 ///< temperature count,the follow hit_set's priority decay
1545 ///< by this params than pre hit_set
1546 uint32_t hit_set_search_last_n
= 0; ///< accumulate atmost N hit_sets for temperature
1548 uint32_t stripe_width
= 0; ///< erasure coded stripe size in bytes
1550 uint64_t expected_num_objects
= 0; ///< expected number of objects on this pool, a value of 0 indicates
1551 ///< user does not specify any expected value
1552 bool fast_read
= false; ///< whether turn on fast read on the pool or not
1554 pool_opts_t opts
; ///< options
1557 TYPE_FINGERPRINT_NONE
= 0,
1558 TYPE_FINGERPRINT_SHA1
= 1,
1559 TYPE_FINGERPRINT_SHA256
= 2,
1560 TYPE_FINGERPRINT_SHA512
= 3,
1562 static fingerprint_t
get_fingerprint_from_str(const std::string
& s
) {
1564 return TYPE_FINGERPRINT_NONE
;
1566 return TYPE_FINGERPRINT_SHA1
;
1568 return TYPE_FINGERPRINT_SHA256
;
1570 return TYPE_FINGERPRINT_SHA512
;
1571 return (fingerprint_t
)-1;
1573 const fingerprint_t
get_fingerprint_type() const {
1575 opts
.get(pool_opts_t::FINGERPRINT_ALGORITHM
, &fp_str
);
1576 return get_fingerprint_from_str(fp_str
);
1578 const char *get_fingerprint_name() const {
1581 opts
.get(pool_opts_t::FINGERPRINT_ALGORITHM
, &fp_str
);
1582 fp_t
= get_fingerprint_from_str(fp_str
);
1583 return get_fingerprint_name(fp_t
);
1585 static const char *get_fingerprint_name(fingerprint_t m
) {
1587 case TYPE_FINGERPRINT_NONE
: return "none";
1588 case TYPE_FINGERPRINT_SHA1
: return "sha1";
1589 case TYPE_FINGERPRINT_SHA256
: return "sha256";
1590 case TYPE_FINGERPRINT_SHA512
: return "sha512";
1591 default: return "unknown";
1596 TYPE_DEDUP_CHUNK_NONE
= 0,
1597 TYPE_DEDUP_CHUNK_FASTCDC
= 1,
1598 TYPE_DEDUP_CHUNK_FIXEDCDC
= 2,
1599 } dedup_chunk_algo_t
;
1600 static dedup_chunk_algo_t
get_dedup_chunk_algorithm_from_str(const std::string
& s
) {
1602 return TYPE_DEDUP_CHUNK_NONE
;
1604 return TYPE_DEDUP_CHUNK_FASTCDC
;
1606 return TYPE_DEDUP_CHUNK_FIXEDCDC
;
1607 return (dedup_chunk_algo_t
)-1;
1609 const dedup_chunk_algo_t
get_dedup_chunk_algorithm_type() const {
1610 std::string algo_str
;
1611 opts
.get(pool_opts_t::DEDUP_CHUNK_ALGORITHM
, &algo_str
);
1612 return get_dedup_chunk_algorithm_from_str(algo_str
);
1614 const char *get_dedup_chunk_algorithm_name() const {
1615 std::string dedup_chunk_algo_str
;
1616 dedup_chunk_algo_t dedup_chunk_algo_t
;
1617 opts
.get(pool_opts_t::DEDUP_CHUNK_ALGORITHM
, &dedup_chunk_algo_str
);
1618 dedup_chunk_algo_t
= get_dedup_chunk_algorithm_from_str(dedup_chunk_algo_str
);
1619 return get_dedup_chunk_algorithm_name(dedup_chunk_algo_t
);
1621 static const char *get_dedup_chunk_algorithm_name(dedup_chunk_algo_t m
) {
1623 case TYPE_DEDUP_CHUNK_NONE
: return "none";
1624 case TYPE_DEDUP_CHUNK_FASTCDC
: return "fastcdc";
1625 case TYPE_DEDUP_CHUNK_FIXEDCDC
: return "fixed";
1626 default: return "unknown";
1630 int64_t get_dedup_tier() const {
1631 int64_t tier_id
= 0;
1632 opts
.get(pool_opts_t::DEDUP_TIER
, &tier_id
);
1635 int64_t get_dedup_cdc_chunk_size() const {
1636 int64_t chunk_size
= 0;
1637 opts
.get(pool_opts_t::DEDUP_CDC_CHUNK_SIZE
, &chunk_size
);
1641 /// application -> key/value metadata
1642 std::map
<std::string
, std::map
<std::string
, std::string
>> application_metadata
;
1645 std::vector
<uint32_t> grade_table
;
1648 uint32_t get_grade(unsigned i
) const {
1649 if (grade_table
.size() <= i
)
1651 return grade_table
[i
];
1653 void calc_grade_table() {
1654 unsigned v
= 1000000;
1655 grade_table
.resize(hit_set_count
);
1656 for (unsigned i
= 0; i
< hit_set_count
; i
++) {
1657 v
= v
* (1 - (hit_set_grade_decay_rate
/ 100.0));
1662 pg_pool_t() = default;
1664 void dump(ceph::Formatter
*f
) const;
1666 const utime_t
&get_create_time() const { return create_time
; }
1667 uint64_t get_flags() const { return flags
; }
1668 bool has_flag(uint64_t f
) const { return flags
& f
; }
1669 void set_flag(uint64_t f
) { flags
|= f
; }
1670 void unset_flag(uint64_t f
) { flags
&= ~f
; }
1672 bool require_rollback() const {
1673 return is_erasure();
1676 /// true if incomplete clones may be present
1677 bool allow_incomplete_clones() const {
1678 return cache_mode
!= CACHEMODE_NONE
|| has_flag(FLAG_INCOMPLETE_CLONES
);
1681 unsigned get_type() const { return type
; }
1682 unsigned get_size() const { return size
; }
1683 unsigned get_min_size() const { return min_size
; }
1684 int get_crush_rule() const { return crush_rule
; }
1685 int get_object_hash() const { return object_hash
; }
1686 const char *get_object_hash_name() const {
1687 return ceph_str_hash_name(get_object_hash());
1689 epoch_t
get_last_change() const { return last_change
; }
1690 epoch_t
get_last_force_op_resend() const { return last_force_op_resend
; }
1691 epoch_t
get_last_force_op_resend_prenautilus() const {
1692 return last_force_op_resend_prenautilus
;
1694 epoch_t
get_last_force_op_resend_preluminous() const {
1695 return last_force_op_resend_preluminous
;
1697 epoch_t
get_snap_epoch() const { return snap_epoch
; }
1698 snapid_t
get_snap_seq() const { return snap_seq
; }
1699 uint64_t get_auid() const { return auid
; }
1701 void set_snap_seq(snapid_t s
) { snap_seq
= s
; }
1702 void set_snap_epoch(epoch_t e
) { snap_epoch
= e
; }
1704 void set_stripe_width(uint32_t s
) { stripe_width
= s
; }
1705 uint32_t get_stripe_width() const { return stripe_width
; }
1707 bool is_replicated() const { return get_type() == TYPE_REPLICATED
; }
1708 bool is_erasure() const { return get_type() == TYPE_ERASURE
; }
1710 bool supports_omap() const {
1711 return !(get_type() == TYPE_ERASURE
);
1714 bool requires_aligned_append() const {
1715 return is_erasure() && !has_flag(FLAG_EC_OVERWRITES
);
1717 uint64_t required_alignment() const { return stripe_width
; }
1719 bool allows_ecoverwrites() const {
1720 return has_flag(FLAG_EC_OVERWRITES
);
1723 bool is_crimson() const {
1724 return has_flag(FLAG_CRIMSON
);
1727 bool can_shift_osds() const {
1728 switch (get_type()) {
1729 case TYPE_REPLICATED
:
1734 ceph_abort_msg("unhandled pool type");
1738 unsigned get_pg_num() const { return pg_num
; }
1739 unsigned get_pgp_num() const { return pgp_num
; }
1740 unsigned get_pg_num_target() const { return pg_num_target
; }
1741 unsigned get_pgp_num_target() const { return pgp_num_target
; }
1742 unsigned get_pg_num_pending() const { return pg_num_pending
; }
1744 unsigned get_pg_num_mask() const { return pg_num_mask
; }
1745 unsigned get_pgp_num_mask() const { return pgp_num_mask
; }
1747 // if pg_num is not a multiple of two, pgs are not equally sized.
1748 // return, for a given pg, the fraction (denominator) of the total
1749 // pool size that it represents.
1750 unsigned get_pg_num_divisor(pg_t pgid
) const;
1752 bool is_pending_merge(pg_t pgid
, bool *target
) const;
1754 void set_pg_num(int p
) {
1759 void set_pgp_num(int p
) {
1763 void set_pg_num_pending(int p
) {
1767 void set_pg_num_target(int p
) {
1770 void set_pgp_num_target(int p
) {
1773 void dec_pg_num(pg_t source_pgid
,
1774 epoch_t ready_epoch
,
1775 eversion_t source_version
,
1776 eversion_t target_version
,
1777 epoch_t last_epoch_started
,
1778 epoch_t last_epoch_clean
) {
1780 last_pg_merge_meta
.source_pgid
= source_pgid
;
1781 last_pg_merge_meta
.ready_epoch
= ready_epoch
;
1782 last_pg_merge_meta
.source_version
= source_version
;
1783 last_pg_merge_meta
.target_version
= target_version
;
1784 last_pg_merge_meta
.last_epoch_started
= last_epoch_started
;
1785 last_pg_merge_meta
.last_epoch_clean
= last_epoch_clean
;
1789 void set_quota_max_bytes(uint64_t m
) {
1790 quota_max_bytes
= m
;
1792 uint64_t get_quota_max_bytes() {
1793 return quota_max_bytes
;
1796 void set_quota_max_objects(uint64_t m
) {
1797 quota_max_objects
= m
;
1799 uint64_t get_quota_max_objects() {
1800 return quota_max_objects
;
1803 void set_last_force_op_resend(uint64_t t
) {
1804 last_force_op_resend
= t
;
1805 last_force_op_resend_prenautilus
= t
;
1806 last_force_op_resend_preluminous
= t
;
1809 void calc_pg_masks();
1812 * we have two snap modes:
1813 * - pool global snaps
1814 * - snap existence/non-existence defined by snaps[] and snap_seq
1815 * - user managed snaps
1816 * - removal governed by removed_snaps
1818 * we know which mode we're using based on whether removed_snaps is empty.
1819 * If nothing has been created, both functions report false.
1821 bool is_pool_snaps_mode() const;
1822 bool is_unmanaged_snaps_mode() const;
1823 bool is_removed_snap(snapid_t s
) const;
1825 snapid_t
snap_exists(std::string_view s
) const;
1826 void add_snap(const char *n
, utime_t stamp
);
1827 uint64_t add_unmanaged_snap(bool preoctopus_compat
);
1828 void remove_snap(snapid_t s
);
1829 void remove_unmanaged_snap(snapid_t s
, bool preoctopus_compat
);
1831 SnapContext
get_snap_context() const;
1833 /// hash a object name+namespace key to a hash position
1834 uint32_t hash_key(const std::string
& key
, const std::string
& ns
) const;
1836 /// round a hash position down to a pg num
1837 uint32_t raw_hash_to_pg(uint32_t v
) const;
1840 * map a raw pg (with full precision ps) into an actual pg, for storage
1842 pg_t
raw_pg_to_pg(pg_t pg
) const;
1845 * map raw pg (full precision ps) into a placement seed. include
1846 * pool id in that value so that different pools don't use the same
1849 ps_t
raw_pg_to_pps(pg_t pg
) const;
1851 /// choose a random hash position within a pg
1852 uint32_t get_random_pg_position(pg_t pgid
, uint32_t seed
) const;
1854 void encode(ceph::buffer::list
& bl
, uint64_t features
) const;
1855 void decode(ceph::buffer::list::const_iterator
& bl
);
1857 static void generate_test_instances(std::list
<pg_pool_t
*>& o
);
1859 WRITE_CLASS_ENCODER_FEATURES(pg_pool_t
)
1861 std::ostream
& operator<<(std::ostream
& out
, const pg_pool_t
& p
);
1865 * a summation of object stats
1867 * This is just a container for object stats; we don't know what for.
1869 * If you add members in object_stat_sum_t, you should make sure there are
1870 * not padding among these members.
1871 * You should also modify the padding_check function.
1874 struct object_stat_sum_t
{
1875 /**************************************************************************
1876 * WARNING: be sure to update operator==, floor, and split when
1877 * adding/removing fields!
1878 **************************************************************************/
1879 int64_t num_bytes
{0}; // in bytes
1880 int64_t num_objects
{0};
1881 int64_t num_object_clones
{0};
1882 int64_t num_object_copies
{0}; // num_objects * num_replicas
1883 int64_t num_objects_missing_on_primary
{0};
1884 int64_t num_objects_degraded
{0};
1885 int64_t num_objects_unfound
{0};
1887 int64_t num_rd_kb
{0};
1889 int64_t num_wr_kb
{0};
1890 int64_t num_scrub_errors
{0}; // total deep and shallow scrub errors
1891 int64_t num_objects_recovered
{0};
1892 int64_t num_bytes_recovered
{0};
1893 int64_t num_keys_recovered
{0};
1894 int64_t num_shallow_scrub_errors
{0};
1895 int64_t num_deep_scrub_errors
{0};
1896 int64_t num_objects_dirty
{0};
1897 int64_t num_whiteouts
{0};
1898 int64_t num_objects_omap
{0};
1899 int64_t num_objects_hit_set_archive
{0};
1900 int64_t num_objects_misplaced
{0};
1901 int64_t num_bytes_hit_set_archive
{0};
1902 int64_t num_flush
{0};
1903 int64_t num_flush_kb
{0};
1904 int64_t num_evict
{0};
1905 int64_t num_evict_kb
{0};
1906 int64_t num_promote
{0};
1907 int32_t num_flush_mode_high
{0}; // 1 when in high flush mode, otherwise 0
1908 int32_t num_flush_mode_low
{0}; // 1 when in low flush mode, otherwise 0
1909 int32_t num_evict_mode_some
{0}; // 1 when in evict some mode, otherwise 0
1910 int32_t num_evict_mode_full
{0}; // 1 when in evict full mode, otherwise 0
1911 int64_t num_objects_pinned
{0};
1912 int64_t num_objects_missing
{0};
1913 int64_t num_legacy_snapsets
{0}; ///< upper bound on pre-luminous-style SnapSets
1914 int64_t num_large_omap_objects
{0};
1915 int64_t num_objects_manifest
{0};
1916 int64_t num_omap_bytes
{0};
1917 int64_t num_omap_keys
{0};
1918 int64_t num_objects_repaired
{0};
1920 object_stat_sum_t() = default;
1922 void floor(int64_t f
) {
1923 #define FLOOR(x) if (x < f) x = f
1926 FLOOR(num_object_clones
);
1927 FLOOR(num_object_copies
);
1928 FLOOR(num_objects_missing_on_primary
);
1929 FLOOR(num_objects_missing
);
1930 FLOOR(num_objects_degraded
);
1931 FLOOR(num_objects_misplaced
);
1932 FLOOR(num_objects_unfound
);
1937 FLOOR(num_large_omap_objects
);
1938 FLOOR(num_objects_manifest
);
1939 FLOOR(num_omap_bytes
);
1940 FLOOR(num_omap_keys
);
1941 FLOOR(num_shallow_scrub_errors
);
1942 FLOOR(num_deep_scrub_errors
);
1943 num_scrub_errors
= num_shallow_scrub_errors
+ num_deep_scrub_errors
;
1944 FLOOR(num_objects_recovered
);
1945 FLOOR(num_bytes_recovered
);
1946 FLOOR(num_keys_recovered
);
1947 FLOOR(num_objects_dirty
);
1948 FLOOR(num_whiteouts
);
1949 FLOOR(num_objects_omap
);
1950 FLOOR(num_objects_hit_set_archive
);
1951 FLOOR(num_bytes_hit_set_archive
);
1953 FLOOR(num_flush_kb
);
1955 FLOOR(num_evict_kb
);
1957 FLOOR(num_flush_mode_high
);
1958 FLOOR(num_flush_mode_low
);
1959 FLOOR(num_evict_mode_some
);
1960 FLOOR(num_evict_mode_full
);
1961 FLOOR(num_objects_pinned
);
1962 FLOOR(num_legacy_snapsets
);
1963 FLOOR(num_objects_repaired
);
1967 void split(std::vector
<object_stat_sum_t
> &out
) const {
1968 #define SPLIT(PARAM) \
1969 for (unsigned i = 0; i < out.size(); ++i) { \
1970 out[i].PARAM = PARAM / out.size(); \
1971 if (i < (PARAM % out.size())) { \
1975 #define SPLIT_PRESERVE_NONZERO(PARAM) \
1976 for (unsigned i = 0; i < out.size(); ++i) { \
1978 out[i].PARAM = 1 + PARAM / out.size(); \
1985 SPLIT(num_object_clones
);
1986 SPLIT(num_object_copies
);
1987 SPLIT(num_objects_missing_on_primary
);
1988 SPLIT(num_objects_missing
);
1989 SPLIT(num_objects_degraded
);
1990 SPLIT(num_objects_misplaced
);
1991 SPLIT(num_objects_unfound
);
1996 SPLIT(num_large_omap_objects
);
1997 SPLIT(num_objects_manifest
);
1998 SPLIT(num_omap_bytes
);
1999 SPLIT(num_omap_keys
);
2000 SPLIT(num_objects_repaired
);
2001 SPLIT_PRESERVE_NONZERO(num_shallow_scrub_errors
);
2002 SPLIT_PRESERVE_NONZERO(num_deep_scrub_errors
);
2003 for (unsigned i
= 0; i
< out
.size(); ++i
) {
2004 out
[i
].num_scrub_errors
= out
[i
].num_shallow_scrub_errors
+
2005 out
[i
].num_deep_scrub_errors
;
2007 SPLIT(num_objects_recovered
);
2008 SPLIT(num_bytes_recovered
);
2009 SPLIT(num_keys_recovered
);
2010 SPLIT(num_objects_dirty
);
2011 SPLIT(num_whiteouts
);
2012 SPLIT(num_objects_omap
);
2013 SPLIT(num_objects_hit_set_archive
);
2014 SPLIT(num_bytes_hit_set_archive
);
2016 SPLIT(num_flush_kb
);
2018 SPLIT(num_evict_kb
);
2020 SPLIT(num_flush_mode_high
);
2021 SPLIT(num_flush_mode_low
);
2022 SPLIT(num_evict_mode_some
);
2023 SPLIT(num_evict_mode_full
);
2024 SPLIT(num_objects_pinned
);
2025 SPLIT_PRESERVE_NONZERO(num_legacy_snapsets
);
2027 #undef SPLIT_PRESERVE_NONZERO
2031 // FIPS zeroization audit 20191117: this memset is not security related.
2032 memset(this, 0, sizeof(*this));
2035 void calc_copies(int nrep
) {
2036 num_object_copies
= nrep
* num_objects
;
2039 bool is_zero() const {
2040 return mem_is_zero((char*)this, sizeof(*this));
2043 void add(const object_stat_sum_t
& o
);
2044 void sub(const object_stat_sum_t
& o
);
2046 void dump(ceph::Formatter
*f
) const;
2047 void padding_check() {
2049 sizeof(object_stat_sum_t
) ==
2051 sizeof(num_objects
) +
2052 sizeof(num_object_clones
) +
2053 sizeof(num_object_copies
) +
2054 sizeof(num_objects_missing_on_primary
) +
2055 sizeof(num_objects_degraded
) +
2056 sizeof(num_objects_unfound
) +
2061 sizeof(num_scrub_errors
) +
2062 sizeof(num_large_omap_objects
) +
2063 sizeof(num_objects_manifest
) +
2064 sizeof(num_omap_bytes
) +
2065 sizeof(num_omap_keys
) +
2066 sizeof(num_objects_repaired
) +
2067 sizeof(num_objects_recovered
) +
2068 sizeof(num_bytes_recovered
) +
2069 sizeof(num_keys_recovered
) +
2070 sizeof(num_shallow_scrub_errors
) +
2071 sizeof(num_deep_scrub_errors
) +
2072 sizeof(num_objects_dirty
) +
2073 sizeof(num_whiteouts
) +
2074 sizeof(num_objects_omap
) +
2075 sizeof(num_objects_hit_set_archive
) +
2076 sizeof(num_objects_misplaced
) +
2077 sizeof(num_bytes_hit_set_archive
) +
2079 sizeof(num_flush_kb
) +
2081 sizeof(num_evict_kb
) +
2082 sizeof(num_promote
) +
2083 sizeof(num_flush_mode_high
) +
2084 sizeof(num_flush_mode_low
) +
2085 sizeof(num_evict_mode_some
) +
2086 sizeof(num_evict_mode_full
) +
2087 sizeof(num_objects_pinned
) +
2088 sizeof(num_objects_missing
) +
2089 sizeof(num_legacy_snapsets
)
2091 "object_stat_sum_t have padding");
2093 void encode(ceph::buffer::list
& bl
) const;
2094 void decode(ceph::buffer::list::const_iterator
& bl
);
2095 static void generate_test_instances(std::list
<object_stat_sum_t
*>& o
);
2097 WRITE_CLASS_ENCODER(object_stat_sum_t
)
2099 bool operator==(const object_stat_sum_t
& l
, const object_stat_sum_t
& r
);
2102 * a collection of object stat sums
2104 * This is a collection of stat sums over different categories.
2106 struct object_stat_collection_t
{
2107 /**************************************************************************
2108 * WARNING: be sure to update the operator== when adding/removing fields! *
2109 **************************************************************************/
2110 object_stat_sum_t sum
;
2112 void calc_copies(int nrep
) {
2113 sum
.calc_copies(nrep
);
2116 void dump(ceph::Formatter
*f
) const;
2117 void encode(ceph::buffer::list
& bl
) const;
2118 void decode(ceph::buffer::list::const_iterator
& bl
);
2119 static void generate_test_instances(std::list
<object_stat_collection_t
*>& o
);
2121 bool is_zero() const {
2122 return sum
.is_zero();
2129 void floor(int64_t f
) {
2133 void add(const object_stat_sum_t
& o
) {
2137 void add(const object_stat_collection_t
& o
) {
2140 void sub(const object_stat_collection_t
& o
) {
2144 WRITE_CLASS_ENCODER(object_stat_collection_t
)
2146 inline bool operator==(const object_stat_collection_t
& l
,
2147 const object_stat_collection_t
& r
) {
2148 return l
.sum
== r
.sum
;
2151 enum class scrub_level_t
: bool { shallow
= false, deep
= true };
2152 enum class scrub_type_t
: bool { not_repair
= false, do_repair
= true };
/// is there a scrub in our future?
enum class pg_scrub_sched_status_t : uint16_t {
  unknown,	///< status not reported yet
  not_queued,	///< not in the OSD's scrub queue. Probably not active.
  active,	///< scrubbing
  scheduled,	///< scheduled for a scrub at an already determined time
  queued,	///< queued to be scrubbed
  blocked	///< blocked waiting for objects to be unlocked
};
2164 struct pg_scrubbing_status_t
{
2165 utime_t m_scheduled_at
{};
2166 int32_t m_duration_seconds
{0}; // relevant when scrubbing
2167 pg_scrub_sched_status_t m_sched_status
{pg_scrub_sched_status_t::unknown
};
2168 bool m_is_active
{false};
2169 scrub_level_t m_is_deep
{scrub_level_t::shallow
};
2170 bool m_is_periodic
{true};
2173 bool operator==(const pg_scrubbing_status_t
& l
, const pg_scrubbing_status_t
& r
);
2176 * aggregate stats for a single PG.
2179 /**************************************************************************
2180 * WARNING: be sure to update the operator== when adding/removing fields! *
2181 **************************************************************************/
2183 version_t reported_seq
; // sequence number
2184 epoch_t reported_epoch
; // epoch of this report
2186 utime_t last_fresh
; // last reported
2187 utime_t last_change
; // new state != previous state
2188 utime_t last_active
; // state & PG_STATE_ACTIVE
2189 utime_t last_peered
; // state & PG_STATE_ACTIVE || state & PG_STATE_PEERED
2190 utime_t last_clean
; // state & PG_STATE_CLEAN
2191 utime_t last_unstale
; // (state & PG_STATE_STALE) == 0
2192 utime_t last_undegraded
; // (state & PG_STATE_DEGRADED) == 0
2193 utime_t last_fullsized
; // (state & PG_STATE_UNDERSIZED) == 0
2195 eversion_t log_start
; // (log_start,version]
2196 eversion_t ondisk_log_start
; // there may be more on disk
2199 epoch_t last_epoch_clean
;
2201 __u32 parent_split_bits
;
2203 eversion_t last_scrub
;
2204 eversion_t last_deep_scrub
;
2205 utime_t last_scrub_stamp
;
2206 utime_t last_deep_scrub_stamp
;
2207 utime_t last_clean_scrub_stamp
;
2208 int32_t last_scrub_duration
{0};
2210 object_stat_collection_t stats
;
2213 int64_t log_dups_size
;
2214 int64_t ondisk_log_size
; // >= active_log_size
2215 int64_t objects_scrubbed
;
2216 double scrub_duration
;
2218 std::vector
<int32_t> up
, acting
;
2219 std::vector
<pg_shard_t
> avail_no_missing
;
2220 std::map
< std::set
<pg_shard_t
>, int32_t > object_location_counts
;
2221 epoch_t mapping_epoch
;
2223 std::vector
<int32_t> blocked_by
; ///< osds on which the pg is blocked
2225 interval_set
<snapid_t
> purged_snaps
; ///< recently removed snaps that we've purged
2227 utime_t last_became_active
;
2228 utime_t last_became_peered
;
2230 /// up, acting primaries
2232 int32_t acting_primary
;
2234 // snaptrimq.size() is 64bit, but let's be serious - anything over 50k is
2235 // absurd already, so cap it to 2^32 and save 4 bytes at the same time
2236 uint32_t snaptrimq_len
;
2237 int64_t objects_trimmed
;
2238 double snaptrim_duration
;
2240 pg_scrubbing_status_t scrub_sched_status
;
2242 bool stats_invalid
:1;
2243 /// true if num_objects_dirty is not accurate (because it was not
2244 /// maintained starting from pool creation)
2245 bool dirty_stats_invalid
:1;
2246 bool omap_stats_invalid
:1;
2247 bool hitset_stats_invalid
:1;
2248 bool hitset_bytes_stats_invalid
:1;
2249 bool pin_stats_invalid
:1;
2250 bool manifest_stats_invalid
:1;
2256 created(0), last_epoch_clean(0),
2257 parent_split_bits(0),
2258 log_size(0), log_dups_size(0),
2260 objects_scrubbed(0),
2267 snaptrim_duration(0.0),
2268 stats_invalid(false),
2269 dirty_stats_invalid(false),
2270 omap_stats_invalid(false),
2271 hitset_stats_invalid(false),
2272 hitset_bytes_stats_invalid(false),
2273 pin_stats_invalid(false),
2274 manifest_stats_invalid(false)
2277 epoch_t
get_effective_last_epoch_clean() const {
2278 if (state
& PG_STATE_CLEAN
) {
2279 // we are clean as of this report, and should thus take the
2281 return reported_epoch
;
2283 return last_epoch_clean
;
2287 std::pair
<epoch_t
, version_t
> get_version_pair() const {
2288 return { reported_epoch
, reported_seq
};
2291 void floor(int64_t f
) {
2295 if (ondisk_log_size
< f
)
2296 ondisk_log_size
= f
;
2297 if (snaptrimq_len
< f
)
2301 void add_sub_invalid_flags(const pg_stat_t
& o
) {
2302 // adding (or subtracting!) invalid stats render our stats invalid too
2303 stats_invalid
|= o
.stats_invalid
;
2304 dirty_stats_invalid
|= o
.dirty_stats_invalid
;
2305 omap_stats_invalid
|= o
.omap_stats_invalid
;
2306 hitset_stats_invalid
|= o
.hitset_stats_invalid
;
2307 hitset_bytes_stats_invalid
|= o
.hitset_bytes_stats_invalid
;
2308 pin_stats_invalid
|= o
.pin_stats_invalid
;
2309 manifest_stats_invalid
|= o
.manifest_stats_invalid
;
2311 void add(const pg_stat_t
& o
) {
2313 log_size
+= o
.log_size
;
2314 log_dups_size
+= o
.log_dups_size
;
2315 ondisk_log_size
+= o
.ondisk_log_size
;
2316 snaptrimq_len
= std::min((uint64_t)snaptrimq_len
+ o
.snaptrimq_len
,
2317 (uint64_t)(1ull << 31));
2318 add_sub_invalid_flags(o
);
2320 void sub(const pg_stat_t
& o
) {
2322 log_size
-= o
.log_size
;
2323 log_dups_size
-= o
.log_dups_size
;
2324 ondisk_log_size
-= o
.ondisk_log_size
;
2325 if (o
.snaptrimq_len
< snaptrimq_len
) {
2326 snaptrimq_len
-= o
.snaptrimq_len
;
2330 add_sub_invalid_flags(o
);
2333 bool is_acting_osd(int32_t osd
, bool primary
) const;
2334 void dump(ceph::Formatter
*f
) const;
2335 void dump_brief(ceph::Formatter
*f
) const;
2336 std::string
dump_scrub_schedule() const;
2337 void encode(ceph::buffer::list
&bl
) const;
2338 void decode(ceph::buffer::list::const_iterator
&bl
);
2339 static void generate_test_instances(std::list
<pg_stat_t
*>& o
);
2341 WRITE_CLASS_ENCODER(pg_stat_t
)
2343 bool operator==(const pg_stat_t
& l
, const pg_stat_t
& r
);
2346 * ObjectStore full statfs information
2348 struct store_statfs_t
2350 uint64_t total
= 0; ///< Total bytes
2351 uint64_t available
= 0; ///< Free bytes available
2352 uint64_t internally_reserved
= 0; ///< Bytes reserved for internal purposes
2354 int64_t allocated
= 0; ///< Bytes allocated by the store
2356 int64_t data_stored
= 0; ///< Bytes actually stored by the user
2357 int64_t data_compressed
= 0; ///< Bytes stored after compression
2358 int64_t data_compressed_allocated
= 0; ///< Bytes allocated for compressed data
2359 int64_t data_compressed_original
= 0; ///< Bytes that were compressed
2361 int64_t omap_allocated
= 0; ///< approx usage of omap data
2362 int64_t internal_metadata
= 0; ///< approx usage of internal metadata
2365 *this = store_statfs_t();
2367 void floor(int64_t f
) {
2368 #define FLOOR(x) if (int64_t(x) < f) x = f
2371 FLOOR(internally_reserved
);
2374 FLOOR(data_compressed
);
2375 FLOOR(data_compressed_allocated
);
2376 FLOOR(data_compressed_original
);
2378 FLOOR(omap_allocated
);
2379 FLOOR(internal_metadata
);
2383 bool operator ==(const store_statfs_t
& other
) const;
2384 bool is_zero() const {
2385 return *this == store_statfs_t();
2388 uint64_t get_used() const {
2389 return total
- available
- internally_reserved
;
2392 // this accumulates both actually used and statfs's internally_reserved
2393 uint64_t get_used_raw() const {
2394 return total
- available
;
2397 float get_used_raw_ratio() const {
2399 return (float)get_used_raw() / (float)total
;
2405 // helpers to ease legacy code porting
2406 uint64_t kb_avail() const {
2407 return available
>> 10;
2409 uint64_t kb() const {
2412 uint64_t kb_used() const {
2413 return (total
- available
- internally_reserved
) >> 10;
2415 uint64_t kb_used_raw() const {
2416 return get_used_raw() >> 10;
2419 uint64_t kb_used_data() const {
2420 return allocated
>> 10;
2422 uint64_t kb_used_omap() const {
2423 return omap_allocated
>> 10;
2426 uint64_t kb_used_internal_metadata() const {
2427 return internal_metadata
>> 10;
2430 void add(const store_statfs_t
& o
) {
2432 available
+= o
.available
;
2433 internally_reserved
+= o
.internally_reserved
;
2434 allocated
+= o
.allocated
;
2435 data_stored
+= o
.data_stored
;
2436 data_compressed
+= o
.data_compressed
;
2437 data_compressed_allocated
+= o
.data_compressed_allocated
;
2438 data_compressed_original
+= o
.data_compressed_original
;
2439 omap_allocated
+= o
.omap_allocated
;
2440 internal_metadata
+= o
.internal_metadata
;
2442 void sub(const store_statfs_t
& o
) {
2444 available
-= o
.available
;
2445 internally_reserved
-= o
.internally_reserved
;
2446 allocated
-= o
.allocated
;
2447 data_stored
-= o
.data_stored
;
2448 data_compressed
-= o
.data_compressed
;
2449 data_compressed_allocated
-= o
.data_compressed_allocated
;
2450 data_compressed_original
-= o
.data_compressed_original
;
2451 omap_allocated
-= o
.omap_allocated
;
2452 internal_metadata
-= o
.internal_metadata
;
2454 void dump(ceph::Formatter
*f
) const;
2455 DENC(store_statfs_t
, v
, p
) {
2456 DENC_START(1, 1, p
);
2458 denc(v
.available
, p
);
2459 denc(v
.internally_reserved
, p
);
2460 denc(v
.allocated
, p
);
2461 denc(v
.data_stored
, p
);
2462 denc(v
.data_compressed
, p
);
2463 denc(v
.data_compressed_allocated
, p
);
2464 denc(v
.data_compressed_original
, p
);
2465 denc(v
.omap_allocated
, p
);
2466 denc(v
.internal_metadata
, p
);
2469 static void generate_test_instances(std::list
<store_statfs_t
*>& o
);
2471 WRITE_CLASS_DENC(store_statfs_t
)
2473 std::ostream
&operator<<(std::ostream
&lhs
, const store_statfs_t
&rhs
);
2476 * aggregate stats for an osd
2479 store_statfs_t statfs
;
2480 std::vector
<int> hb_peers
;
2481 int32_t snap_trim_queue_len
, num_snap_trimming
;
2482 uint64_t num_shards_repaired
;
2484 pow2_hist_t op_queue_age_hist
;
2486 objectstore_perf_stat_t os_perf_stat
;
2487 osd_alerts_t os_alerts
;
2489 epoch_t up_from
= 0;
2492 uint32_t num_pgs
= 0;
2494 uint32_t num_osds
= 0;
2495 uint32_t num_per_pool_osds
= 0;
2496 uint32_t num_per_pool_omap_osds
= 0;
2499 uint32_t last_update
; // in seconds
2500 uint32_t back_pingtime
[3];
2501 uint32_t back_min
[3];
2502 uint32_t back_max
[3];
2504 uint32_t front_pingtime
[3];
2505 uint32_t front_min
[3];
2506 uint32_t front_max
[3];
2507 uint32_t front_last
;
2509 std::map
<int, Interfaces
> hb_pingtime
; ///< map of osd id to Interfaces
2511 osd_stat_t() : snap_trim_queue_len(0), num_snap_trimming(0),
2512 num_shards_repaired(0) {}
2514 void add(const osd_stat_t
& o
) {
2515 statfs
.add(o
.statfs
);
2516 snap_trim_queue_len
+= o
.snap_trim_queue_len
;
2517 num_snap_trimming
+= o
.num_snap_trimming
;
2518 num_shards_repaired
+= o
.num_shards_repaired
;
2519 op_queue_age_hist
.add(o
.op_queue_age_hist
);
2520 os_perf_stat
.add(o
.os_perf_stat
);
2521 num_pgs
+= o
.num_pgs
;
2522 num_osds
+= o
.num_osds
;
2523 num_per_pool_osds
+= o
.num_per_pool_osds
;
2524 num_per_pool_omap_osds
+= o
.num_per_pool_omap_osds
;
2525 for (const auto& a
: o
.os_alerts
) {
2526 auto& target
= os_alerts
[a
.first
];
2527 for (auto& i
: a
.second
) {
2528 target
.emplace(i
.first
, i
.second
);
2532 void sub(const osd_stat_t
& o
) {
2533 statfs
.sub(o
.statfs
);
2534 snap_trim_queue_len
-= o
.snap_trim_queue_len
;
2535 num_snap_trimming
-= o
.num_snap_trimming
;
2536 num_shards_repaired
-= o
.num_shards_repaired
;
2537 op_queue_age_hist
.sub(o
.op_queue_age_hist
);
2538 os_perf_stat
.sub(o
.os_perf_stat
);
2539 num_pgs
-= o
.num_pgs
;
2540 num_osds
-= o
.num_osds
;
2541 num_per_pool_osds
-= o
.num_per_pool_osds
;
2542 num_per_pool_omap_osds
-= o
.num_per_pool_omap_osds
;
2543 for (const auto& a
: o
.os_alerts
) {
2544 auto& target
= os_alerts
[a
.first
];
2545 for (auto& i
: a
.second
) {
2546 target
.erase(i
.first
);
2548 if (target
.empty()) {
2549 os_alerts
.erase(a
.first
);
2553 void dump(ceph::Formatter
*f
, bool with_net
= true) const;
2554 void dump_ping_time(ceph::Formatter
*f
) const;
2555 void encode(ceph::buffer::list
&bl
, uint64_t features
) const;
2556 void decode(ceph::buffer::list::const_iterator
&bl
);
2557 static void generate_test_instances(std::list
<osd_stat_t
*>& o
);
2559 WRITE_CLASS_ENCODER_FEATURES(osd_stat_t
)
2561 inline bool operator==(const osd_stat_t
& l
, const osd_stat_t
& r
) {
2562 return l
.statfs
== r
.statfs
&&
2563 l
.snap_trim_queue_len
== r
.snap_trim_queue_len
&&
2564 l
.num_snap_trimming
== r
.num_snap_trimming
&&
2565 l
.num_shards_repaired
== r
.num_shards_repaired
&&
2566 l
.hb_peers
== r
.hb_peers
&&
2567 l
.op_queue_age_hist
== r
.op_queue_age_hist
&&
2568 l
.os_perf_stat
== r
.os_perf_stat
&&
2569 l
.num_pgs
== r
.num_pgs
&&
2570 l
.num_osds
== r
.num_osds
&&
2571 l
.num_per_pool_osds
== r
.num_per_pool_osds
&&
2572 l
.num_per_pool_omap_osds
== r
.num_per_pool_omap_osds
;
2574 inline bool operator!=(const osd_stat_t
& l
, const osd_stat_t
& r
) {
2578 inline std::ostream
& operator<<(std::ostream
& out
, const osd_stat_t
& s
) {
2579 return out
<< "osd_stat(" << s
.statfs
<< ", "
2580 << "peers " << s
.hb_peers
2581 << " op hist " << s
.op_queue_age_hist
.h
2586 * summation over an entire pool
2588 struct pool_stat_t
{
2589 object_stat_collection_t stats
;
2590 store_statfs_t store_stats
;
2592 int64_t ondisk_log_size
; // >= active_log_size
2593 int32_t up
; ///< number of up replicas or shards
2594 int32_t acting
; ///< number of acting replicas or shards
2595 int32_t num_store_stats
; ///< amount of store_stats accumulated
2597 pool_stat_t() : log_size(0), ondisk_log_size(0), up(0), acting(0),
2601 void floor(int64_t f
) {
2603 store_stats
.floor(f
);
2606 if (ondisk_log_size
< f
)
2607 ondisk_log_size
= f
;
2612 if (num_store_stats
< f
)
2613 num_store_stats
= f
;
2616 void add(const store_statfs_t
& o
) {
2620 void sub(const store_statfs_t
& o
) {
2625 void add(const pg_stat_t
& o
) {
2627 log_size
+= o
.log_size
;
2628 ondisk_log_size
+= o
.ondisk_log_size
;
2630 acting
+= o
.acting
.size();
2632 void sub(const pg_stat_t
& o
) {
2634 log_size
-= o
.log_size
;
2635 ondisk_log_size
-= o
.ondisk_log_size
;
2637 acting
-= o
.acting
.size();
2640 bool is_zero() const {
2641 return (stats
.is_zero() &&
2642 store_stats
.is_zero() &&
2644 ondisk_log_size
== 0 &&
2647 num_store_stats
== 0);
2650 // helper accessors to retrieve used/netto bytes depending on the
2651 // collection method: new per-pool objectstore report or legacy PG
2652 // summation at OSD.
2653 // In legacy mode used and netto values are the same. But for new per-pool
2654 // collection 'used' provides amount of space ALLOCATED at all related OSDs
2655 // and 'netto' is amount of stored user data.
2656 uint64_t get_allocated_data_bytes(bool per_pool
) const {
2658 return store_stats
.allocated
;
2660 // legacy mode, use numbers from 'stats'
2661 return stats
.sum
.num_bytes
+ stats
.sum
.num_bytes_hit_set_archive
;
2664 uint64_t get_allocated_omap_bytes(bool per_pool_omap
) const {
2665 if (per_pool_omap
) {
2666 return store_stats
.omap_allocated
;
2668 // omap is not broken out by pool by nautilus bluestore; report the
2669 // scrub value. this will be imprecise in that it won't account for
2670 // any storage overhead/efficiency.
2671 return stats
.sum
.num_omap_bytes
;
2674 uint64_t get_user_data_bytes(float raw_used_rate
, ///< space amp factor
2675 bool per_pool
) const {
2676 // NOTE: we need the space amp factor so that we can work backwards from
2677 // the raw utilization to the amount of data that the user actually stored.
2679 return raw_used_rate
? store_stats
.data_stored
/ raw_used_rate
: 0;
2681 // legacy mode, use numbers from 'stats'. note that we do NOT use the
2682 // raw_used_rate factor here because we are working from the PG stats
2684 return stats
.sum
.num_bytes
+ stats
.sum
.num_bytes_hit_set_archive
;
2687 uint64_t get_user_omap_bytes(float raw_used_rate
, ///< space amp factor
2688 bool per_pool_omap
) const {
2689 if (per_pool_omap
) {
2690 return raw_used_rate
? store_stats
.omap_allocated
/ raw_used_rate
: 0;
2692 // omap usage is lazily reported during scrub; this value may lag.
2693 return stats
.sum
.num_omap_bytes
;
2697 void dump(ceph::Formatter
*f
) const;
2698 void encode(ceph::buffer::list
&bl
, uint64_t features
) const;
2699 void decode(ceph::buffer::list::const_iterator
&bl
);
2700 static void generate_test_instances(std::list
<pool_stat_t
*>& o
);
2702 WRITE_CLASS_ENCODER_FEATURES(pool_stat_t
)
2705 // -----------------------------------------
2708 * pg_hit_set_info_t - information about a single recorded HitSet
2710 * Track basic metadata about a HitSet, like the number of insertions
2711 * and the time range it covers.
2713 struct pg_hit_set_info_t
{
2714 utime_t begin
, end
; ///< time interval
2715 eversion_t version
; ///< version this HitSet object was written
2716 bool using_gmt
; ///< use gmt for creating the hit_set archive object name
2718 friend bool operator==(const pg_hit_set_info_t
& l
,
2719 const pg_hit_set_info_t
& r
) {
2721 l
.begin
== r
.begin
&&
2723 l
.version
== r
.version
&&
2724 l
.using_gmt
== r
.using_gmt
;
2727 explicit pg_hit_set_info_t(bool using_gmt
= true)
2728 : using_gmt(using_gmt
) {}
2730 void encode(ceph::buffer::list
&bl
) const;
2731 void decode(ceph::buffer::list::const_iterator
&bl
);
2732 void dump(ceph::Formatter
*f
) const;
2733 static void generate_test_instances(std::list
<pg_hit_set_info_t
*>& o
);
2735 WRITE_CLASS_ENCODER(pg_hit_set_info_t
)
2738 * pg_hit_set_history_t - information about a history of hitsets
2740 * Include information about the currently accumulating hit set as well
2741 * as archived/historical ones.
2743 struct pg_hit_set_history_t
{
2744 eversion_t current_last_update
; ///< last version inserted into current set
2745 std::list
<pg_hit_set_info_t
> history
; ///< archived sets, sorted oldest -> newest
2747 friend bool operator==(const pg_hit_set_history_t
& l
,
2748 const pg_hit_set_history_t
& r
) {
2750 l
.current_last_update
== r
.current_last_update
&&
2751 l
.history
== r
.history
;
2754 void encode(ceph::buffer::list
&bl
) const;
2755 void decode(ceph::buffer::list::const_iterator
&bl
);
2756 void dump(ceph::Formatter
*f
) const;
2757 static void generate_test_instances(std::list
<pg_hit_set_history_t
*>& o
);
2759 WRITE_CLASS_ENCODER(pg_hit_set_history_t
)
2762 // -----------------------------------------
2765 * pg_history_t - information about recent pg peering/mapping history
2767 * This is aggressively shared between OSDs to bound the amount of past
2768 * history they need to worry about.
2770 struct pg_history_t
{
2771 epoch_t epoch_created
= 0; // epoch in which *pg* was created (pool or pg)
2772 epoch_t epoch_pool_created
= 0; // epoch in which *pool* was created
2773 // (note: may be pg creation epoch for
2774 // pre-luminous clusters)
2775 epoch_t last_epoch_started
= 0;; // lower bound on last epoch started (anywhere, not necessarily locally)
2776 // https://docs.ceph.com/docs/master/dev/osd_internals/last_epoch_started/
2777 epoch_t last_interval_started
= 0;; // first epoch of last_epoch_started interval
2778 epoch_t last_epoch_clean
= 0;; // lower bound on last epoch the PG was completely clean.
2779 epoch_t last_interval_clean
= 0;; // first epoch of last_epoch_clean interval
2780 epoch_t last_epoch_split
= 0;; // as parent or child
2781 epoch_t last_epoch_marked_full
= 0;; // pool or cluster
2784 * In the event of a map discontinuity, same_*_since may reflect the first
2785 * map the osd has seen in the new map sequence rather than the actual start
2786 * of the interval. This is ok since a discontinuity at epoch e means there
2787 * must have been a clean interval between e and now and that we cannot be
2788 * in the active set during the interval containing e.
2790 epoch_t same_up_since
= 0;; // same acting set since
2791 epoch_t same_interval_since
= 0;; // same acting AND up set since
2792 epoch_t same_primary_since
= 0;; // same primary at least back through this epoch.
2794 eversion_t last_scrub
;
2795 eversion_t last_deep_scrub
;
2796 utime_t last_scrub_stamp
;
2797 utime_t last_deep_scrub_stamp
;
2798 utime_t last_clean_scrub_stamp
;
2800 /// upper bound on how long prior interval readable (relative to encode time)
2801 ceph::timespan prior_readable_until_ub
= ceph::timespan::zero();
2803 friend bool operator==(const pg_history_t
& l
, const pg_history_t
& r
) {
2805 l
.epoch_created
== r
.epoch_created
&&
2806 l
.epoch_pool_created
== r
.epoch_pool_created
&&
2807 l
.last_epoch_started
== r
.last_epoch_started
&&
2808 l
.last_interval_started
== r
.last_interval_started
&&
2809 l
.last_epoch_clean
== r
.last_epoch_clean
&&
2810 l
.last_interval_clean
== r
.last_interval_clean
&&
2811 l
.last_epoch_split
== r
.last_epoch_split
&&
2812 l
.last_epoch_marked_full
== r
.last_epoch_marked_full
&&
2813 l
.same_up_since
== r
.same_up_since
&&
2814 l
.same_interval_since
== r
.same_interval_since
&&
2815 l
.same_primary_since
== r
.same_primary_since
&&
2816 l
.last_scrub
== r
.last_scrub
&&
2817 l
.last_deep_scrub
== r
.last_deep_scrub
&&
2818 l
.last_scrub_stamp
== r
.last_scrub_stamp
&&
2819 l
.last_deep_scrub_stamp
== r
.last_deep_scrub_stamp
&&
2820 l
.last_clean_scrub_stamp
== r
.last_clean_scrub_stamp
&&
2821 l
.prior_readable_until_ub
== r
.prior_readable_until_ub
;
2825 pg_history_t(epoch_t created
, utime_t stamp
)
2826 : epoch_created(created
),
2827 epoch_pool_created(created
),
2828 same_up_since(created
),
2829 same_interval_since(created
),
2830 same_primary_since(created
),
2831 last_scrub_stamp(stamp
),
2832 last_deep_scrub_stamp(stamp
),
2833 last_clean_scrub_stamp(stamp
) {}
2835 bool merge(const pg_history_t
&other
) {
2836 // Here, we only update the fields which cannot be calculated from the OSDmap.
2837 bool modified
= false;
2838 if (epoch_created
< other
.epoch_created
) {
2839 epoch_created
= other
.epoch_created
;
2842 if (epoch_pool_created
< other
.epoch_pool_created
) {
2843 // FIXME: for jewel compat only; this should either be 0 or always the
2844 // same value across all pg instances.
2845 epoch_pool_created
= other
.epoch_pool_created
;
2848 if (last_epoch_started
< other
.last_epoch_started
) {
2849 last_epoch_started
= other
.last_epoch_started
;
2852 if (last_interval_started
< other
.last_interval_started
) {
2853 last_interval_started
= other
.last_interval_started
;
2854 // if we are learning about a newer *started* interval, our
2855 // readable_until_ub is obsolete
2856 prior_readable_until_ub
= other
.prior_readable_until_ub
;
2858 } else if (other
.last_interval_started
== last_interval_started
&&
2859 other
.prior_readable_until_ub
< prior_readable_until_ub
) {
2860 // if other is the *same* interval, than pull our upper bound in
2861 // if they have a tighter bound.
2862 prior_readable_until_ub
= other
.prior_readable_until_ub
;
2865 if (last_epoch_clean
< other
.last_epoch_clean
) {
2866 last_epoch_clean
= other
.last_epoch_clean
;
2869 if (last_interval_clean
< other
.last_interval_clean
) {
2870 last_interval_clean
= other
.last_interval_clean
;
2873 if (last_epoch_split
< other
.last_epoch_split
) {
2874 last_epoch_split
= other
.last_epoch_split
;
2877 if (last_epoch_marked_full
< other
.last_epoch_marked_full
) {
2878 last_epoch_marked_full
= other
.last_epoch_marked_full
;
2881 if (other
.last_scrub
> last_scrub
) {
2882 last_scrub
= other
.last_scrub
;
2885 if (other
.last_scrub_stamp
> last_scrub_stamp
) {
2886 last_scrub_stamp
= other
.last_scrub_stamp
;
2889 if (other
.last_deep_scrub
> last_deep_scrub
) {
2890 last_deep_scrub
= other
.last_deep_scrub
;
2893 if (other
.last_deep_scrub_stamp
> last_deep_scrub_stamp
) {
2894 last_deep_scrub_stamp
= other
.last_deep_scrub_stamp
;
2897 if (other
.last_clean_scrub_stamp
> last_clean_scrub_stamp
) {
2898 last_clean_scrub_stamp
= other
.last_clean_scrub_stamp
;
2904 void encode(ceph::buffer::list
& bl
) const;
2905 void decode(ceph::buffer::list::const_iterator
& p
);
2906 void dump(ceph::Formatter
*f
) const;
2907 static void generate_test_instances(std::list
<pg_history_t
*>& o
);
2909 ceph::signedspan
refresh_prior_readable_until_ub(
2910 ceph::signedspan now
, ///< now, relative to osd startup_time
2911 ceph::signedspan ub
) { ///< ub, relative to osd startup_time
2913 // prior interval(s) are unreadable; we can zero the upper bound
2914 prior_readable_until_ub
= ceph::signedspan::zero();
2915 return ceph::signedspan::zero();
2917 prior_readable_until_ub
= ub
- now
;
2921 ceph::signedspan
get_prior_readable_until_ub(ceph::signedspan now
) {
2922 if (prior_readable_until_ub
== ceph::signedspan::zero()) {
2923 return ceph::signedspan::zero();
2925 return now
+ prior_readable_until_ub
;
2928 WRITE_CLASS_ENCODER(pg_history_t
)
2930 inline std::ostream
& operator<<(std::ostream
& out
, const pg_history_t
& h
) {
2931 out
<< "ec=" << h
.epoch_created
<< "/" << h
.epoch_pool_created
2932 << " lis/c=" << h
.last_interval_started
2933 << "/" << h
.last_interval_clean
2934 << " les/c/f=" << h
.last_epoch_started
<< "/" << h
.last_epoch_clean
2935 << "/" << h
.last_epoch_marked_full
2936 << " sis=" << h
.same_interval_since
;
2937 if (h
.prior_readable_until_ub
!= ceph::timespan::zero()) {
2938 out
<< " pruub=" << h
.prior_readable_until_ub
;
2945 * pg_info_t - summary of PG statistics.
2948 * - last_complete implies we have all objects that existed as of that
2949 * stamp, OR a newer object, OR have already applied a later delete.
2950 * - if last_complete >= log.tail, then we know pg contents thru log.head.
2951 * otherwise, we have no idea what the pg is supposed to contain.
2955 eversion_t last_update
; ///< last object version applied to store.
2956 eversion_t last_complete
; ///< last version pg was complete through.
2957 epoch_t last_epoch_started
; ///< last epoch at which this pg started on this osd
2958 epoch_t last_interval_started
; ///< first epoch of last_epoch_started interval
2960 version_t last_user_version
; ///< last user object version applied to store
2962 eversion_t log_tail
; ///< oldest log entry.
2964 hobject_t last_backfill
; ///< objects >= this and < last_complete may be missing
2966 interval_set
<snapid_t
> purged_snaps
;
2970 pg_history_t history
;
2971 pg_hit_set_history_t hit_set
;
2973 friend bool operator==(const pg_info_t
& l
, const pg_info_t
& r
) {
2976 l
.last_update
== r
.last_update
&&
2977 l
.last_complete
== r
.last_complete
&&
2978 l
.last_epoch_started
== r
.last_epoch_started
&&
2979 l
.last_interval_started
== r
.last_interval_started
&&
2980 l
.last_user_version
== r
.last_user_version
&&
2981 l
.log_tail
== r
.log_tail
&&
2982 l
.last_backfill
== r
.last_backfill
&&
2983 l
.purged_snaps
== r
.purged_snaps
&&
2984 l
.stats
== r
.stats
&&
2985 l
.history
== r
.history
&&
2986 l
.hit_set
== r
.hit_set
;
2990 : last_epoch_started(0),
2991 last_interval_started(0),
2992 last_user_version(0),
2993 last_backfill(hobject_t::get_max())
2995 // cppcheck-suppress noExplicitConstructor
2998 last_epoch_started(0),
2999 last_interval_started(0),
3000 last_user_version(0),
3001 last_backfill(hobject_t::get_max())
3004 void set_last_backfill(hobject_t pos
) {
3005 last_backfill
= pos
;
3008 bool is_empty() const { return last_update
.version
== 0; }
3009 bool dne() const { return history
.epoch_created
== 0; }
3011 bool has_missing() const { return last_complete
!= last_update
; }
3012 bool is_incomplete() const { return !last_backfill
.is_max(); }
3014 void encode(ceph::buffer::list
& bl
) const;
3015 void decode(ceph::buffer::list::const_iterator
& p
);
3016 void dump(ceph::Formatter
*f
) const;
3017 static void generate_test_instances(std::list
<pg_info_t
*>& o
);
3019 WRITE_CLASS_ENCODER(pg_info_t
)
3021 inline std::ostream
& operator<<(std::ostream
& out
, const pg_info_t
& pgi
)
3023 out
<< pgi
.pgid
<< "(";
3029 out
<< " v " << pgi
.last_update
;
3030 if (pgi
.last_complete
!= pgi
.last_update
)
3031 out
<< " lc " << pgi
.last_complete
;
3032 out
<< " (" << pgi
.log_tail
<< "," << pgi
.last_update
<< "]";
3034 if (pgi
.is_incomplete())
3035 out
<< " lb " << pgi
.last_backfill
;
3036 //out << " c " << pgi.epoch_created;
3037 out
<< " local-lis/les=" << pgi
.last_interval_started
3038 << "/" << pgi
.last_epoch_started
;
3039 out
<< " n=" << pgi
.stats
.stats
.sum
.num_objects
;
3040 out
<< " " << pgi
.history
3046 * pg_fast_info_t - common pg_info_t fields
3048 * These are the fields of pg_info_t (and children) that are updated for
3049 * most IO operations.
3052 * Because we rely on these fields to be applied to the normal
3053 * info struct, adding a new field here that is not also new in info
3054 * means that we must set an incompat OSD feature bit!
3056 struct pg_fast_info_t
{
3057 eversion_t last_update
;
3058 eversion_t last_complete
;
3059 version_t last_user_version
;
3060 struct { // pg_stat_t stats
3062 version_t reported_seq
;
3064 utime_t last_active
;
3065 utime_t last_peered
;
3067 utime_t last_unstale
;
3068 utime_t last_undegraded
;
3069 utime_t last_fullsized
;
3070 int64_t log_size
; // (also ondisk_log_size, which has the same value)
3071 struct { // object_stat_collection_t stats;
3072 struct { // objct_stat_sum_t sum
3073 int64_t num_bytes
; // in bytes
3074 int64_t num_objects
;
3075 int64_t num_object_copies
;
3080 int64_t num_objects_dirty
;
3085 void populate_from(const pg_info_t
& info
) {
3086 last_update
= info
.last_update
;
3087 last_complete
= info
.last_complete
;
3088 last_user_version
= info
.last_user_version
;
3089 stats
.version
= info
.stats
.version
;
3090 stats
.reported_seq
= info
.stats
.reported_seq
;
3091 stats
.last_fresh
= info
.stats
.last_fresh
;
3092 stats
.last_active
= info
.stats
.last_active
;
3093 stats
.last_peered
= info
.stats
.last_peered
;
3094 stats
.last_clean
= info
.stats
.last_clean
;
3095 stats
.last_unstale
= info
.stats
.last_unstale
;
3096 stats
.last_undegraded
= info
.stats
.last_undegraded
;
3097 stats
.last_fullsized
= info
.stats
.last_fullsized
;
3098 stats
.log_size
= info
.stats
.log_size
;
3099 stats
.stats
.sum
.num_bytes
= info
.stats
.stats
.sum
.num_bytes
;
3100 stats
.stats
.sum
.num_objects
= info
.stats
.stats
.sum
.num_objects
;
3101 stats
.stats
.sum
.num_object_copies
= info
.stats
.stats
.sum
.num_object_copies
;
3102 stats
.stats
.sum
.num_rd
= info
.stats
.stats
.sum
.num_rd
;
3103 stats
.stats
.sum
.num_rd_kb
= info
.stats
.stats
.sum
.num_rd_kb
;
3104 stats
.stats
.sum
.num_wr
= info
.stats
.stats
.sum
.num_wr
;
3105 stats
.stats
.sum
.num_wr_kb
= info
.stats
.stats
.sum
.num_wr_kb
;
3106 stats
.stats
.sum
.num_objects_dirty
= info
.stats
.stats
.sum
.num_objects_dirty
;
3109 bool try_apply_to(pg_info_t
* info
) {
3110 if (last_update
<= info
->last_update
)
3112 info
->last_update
= last_update
;
3113 info
->last_complete
= last_complete
;
3114 info
->last_user_version
= last_user_version
;
3115 info
->stats
.version
= stats
.version
;
3116 info
->stats
.reported_seq
= stats
.reported_seq
;
3117 info
->stats
.last_fresh
= stats
.last_fresh
;
3118 info
->stats
.last_active
= stats
.last_active
;
3119 info
->stats
.last_peered
= stats
.last_peered
;
3120 info
->stats
.last_clean
= stats
.last_clean
;
3121 info
->stats
.last_unstale
= stats
.last_unstale
;
3122 info
->stats
.last_undegraded
= stats
.last_undegraded
;
3123 info
->stats
.last_fullsized
= stats
.last_fullsized
;
3124 info
->stats
.log_size
= stats
.log_size
;
3125 info
->stats
.ondisk_log_size
= stats
.log_size
;
3126 info
->stats
.stats
.sum
.num_bytes
= stats
.stats
.sum
.num_bytes
;
3127 info
->stats
.stats
.sum
.num_objects
= stats
.stats
.sum
.num_objects
;
3128 info
->stats
.stats
.sum
.num_object_copies
= stats
.stats
.sum
.num_object_copies
;
3129 info
->stats
.stats
.sum
.num_rd
= stats
.stats
.sum
.num_rd
;
3130 info
->stats
.stats
.sum
.num_rd_kb
= stats
.stats
.sum
.num_rd_kb
;
3131 info
->stats
.stats
.sum
.num_wr
= stats
.stats
.sum
.num_wr
;
3132 info
->stats
.stats
.sum
.num_wr_kb
= stats
.stats
.sum
.num_wr_kb
;
3133 info
->stats
.stats
.sum
.num_objects_dirty
= stats
.stats
.sum
.num_objects_dirty
;
3137 void encode(ceph::buffer::list
& bl
) const {
3138 ENCODE_START(1, 1, bl
);
3139 encode(last_update
, bl
);
3140 encode(last_complete
, bl
);
3141 encode(last_user_version
, bl
);
3142 encode(stats
.version
, bl
);
3143 encode(stats
.reported_seq
, bl
);
3144 encode(stats
.last_fresh
, bl
);
3145 encode(stats
.last_active
, bl
);
3146 encode(stats
.last_peered
, bl
);
3147 encode(stats
.last_clean
, bl
);
3148 encode(stats
.last_unstale
, bl
);
3149 encode(stats
.last_undegraded
, bl
);
3150 encode(stats
.last_fullsized
, bl
);
3151 encode(stats
.log_size
, bl
);
3152 encode(stats
.stats
.sum
.num_bytes
, bl
);
3153 encode(stats
.stats
.sum
.num_objects
, bl
);
3154 encode(stats
.stats
.sum
.num_object_copies
, bl
);
3155 encode(stats
.stats
.sum
.num_rd
, bl
);
3156 encode(stats
.stats
.sum
.num_rd_kb
, bl
);
3157 encode(stats
.stats
.sum
.num_wr
, bl
);
3158 encode(stats
.stats
.sum
.num_wr_kb
, bl
);
3159 encode(stats
.stats
.sum
.num_objects_dirty
, bl
);
3162 void decode(ceph::buffer::list::const_iterator
& p
) {
3164 decode(last_update
, p
);
3165 decode(last_complete
, p
);
3166 decode(last_user_version
, p
);
3167 decode(stats
.version
, p
);
3168 decode(stats
.reported_seq
, p
);
3169 decode(stats
.last_fresh
, p
);
3170 decode(stats
.last_active
, p
);
3171 decode(stats
.last_peered
, p
);
3172 decode(stats
.last_clean
, p
);
3173 decode(stats
.last_unstale
, p
);
3174 decode(stats
.last_undegraded
, p
);
3175 decode(stats
.last_fullsized
, p
);
3176 decode(stats
.log_size
, p
);
3177 decode(stats
.stats
.sum
.num_bytes
, p
);
3178 decode(stats
.stats
.sum
.num_objects
, p
);
3179 decode(stats
.stats
.sum
.num_object_copies
, p
);
3180 decode(stats
.stats
.sum
.num_rd
, p
);
3181 decode(stats
.stats
.sum
.num_rd_kb
, p
);
3182 decode(stats
.stats
.sum
.num_wr
, p
);
3183 decode(stats
.stats
.sum
.num_wr_kb
, p
);
3184 decode(stats
.stats
.sum
.num_objects_dirty
, p
);
3188 WRITE_CLASS_ENCODER(pg_fast_info_t
)
3192 * PastIntervals -- information needed to determine the PriorSet and
3193 * the might_have_unfound set
3195 class PastIntervals
{
3197 using OSDMapRef
= boost::local_shared_ptr
<const OSDMap
>;
3199 using OSDMapRef
= std::shared_ptr
<const OSDMap
>;
3202 struct pg_interval_t
{
3203 std::vector
<int32_t> up
, acting
;
3204 epoch_t first
, last
;
3210 : first(0), last(0),
3211 maybe_went_rw(false),
3217 std::vector
<int32_t> &&up
,
3218 std::vector
<int32_t> &&acting
,
3224 : up(up
), acting(acting
), first(first
), last(last
),
3225 maybe_went_rw(maybe_went_rw
), primary(primary
), up_primary(up_primary
)
3228 void encode(ceph::buffer::list
& bl
) const;
3229 void decode(ceph::buffer::list::const_iterator
& bl
);
3230 void dump(ceph::Formatter
*f
) const;
3231 static void generate_test_instances(std::list
<pg_interval_t
*>& o
);
3235 PastIntervals(PastIntervals
&&rhs
) = default;
3236 PastIntervals
&operator=(PastIntervals
&&rhs
) = default;
3238 PastIntervals(const PastIntervals
&rhs
);
3239 PastIntervals
&operator=(const PastIntervals
&rhs
);
3241 class interval_rep
{
3243 virtual size_t size() const = 0;
3244 virtual bool empty() const = 0;
3245 virtual void clear() = 0;
3246 virtual std::pair
<epoch_t
, epoch_t
> get_bounds() const = 0;
3247 virtual std::set
<pg_shard_t
> get_all_participants(
3248 bool ec_pool
) const = 0;
3249 virtual void add_interval(bool ec_pool
, const pg_interval_t
&interval
) = 0;
3250 virtual std::unique_ptr
<interval_rep
> clone() const = 0;
3251 virtual std::ostream
&print(std::ostream
&out
) const = 0;
3252 virtual void encode(ceph::buffer::list
&bl
) const = 0;
3253 virtual void decode(ceph::buffer::list::const_iterator
&bl
) = 0;
3254 virtual void dump(ceph::Formatter
*f
) const = 0;
3255 virtual void iterate_mayberw_back_to(
3257 std::function
<void(epoch_t
, const std::set
<pg_shard_t
> &)> &&f
) const = 0;
3259 virtual bool has_full_intervals() const { return false; }
3260 virtual void iterate_all_intervals(
3261 std::function
<void(const pg_interval_t
&)> &&f
) const {
3262 ceph_assert(!has_full_intervals());
3263 ceph_abort_msg("not valid for this implementation");
3265 virtual void adjust_start_backwards(epoch_t last_epoch_clean
) = 0;
3267 virtual ~interval_rep() {}
3269 friend class pi_compact_rep
;
3272 std::unique_ptr
<interval_rep
> past_intervals
;
3274 explicit PastIntervals(interval_rep
*rep
) : past_intervals(rep
) {}
3277 void add_interval(bool ec_pool
, const pg_interval_t
&interval
) {
3278 ceph_assert(past_intervals
);
3279 return past_intervals
->add_interval(ec_pool
, interval
);
3282 void encode(ceph::buffer::list
&bl
) const {
3283 ENCODE_START(1, 1, bl
);
3284 if (past_intervals
) {
3287 past_intervals
->encode(bl
);
3289 encode((__u8
)0, bl
);
3294 void decode(ceph::buffer::list::const_iterator
&bl
);
3296 void dump(ceph::Formatter
*f
) const {
3297 ceph_assert(past_intervals
);
3298 past_intervals
->dump(f
);
3300 static void generate_test_instances(std::list
<PastIntervals
*> & o
);
3303 * Determines whether there is an interval change
3305 static bool is_new_interval(
3306 int old_acting_primary
,
3307 int new_acting_primary
,
3308 const std::vector
<int> &old_acting
,
3309 const std::vector
<int> &new_acting
,
3312 const std::vector
<int> &old_up
,
3313 const std::vector
<int> &new_up
,
3318 unsigned old_pg_num
,
3319 unsigned new_pg_num
,
3320 unsigned old_pg_num_pending
,
3321 unsigned new_pg_num_pending
,
3322 bool old_sort_bitwise
,
3323 bool new_sort_bitwise
,
3324 bool old_recovery_deletes
,
3325 bool new_recovery_deletes
,
3326 uint32_t old_crush_count
,
3327 uint32_t new_crush_count
,
3328 uint32_t old_crush_target
,
3329 uint32_t new_crush_target
,
3330 uint32_t old_crush_barrier
,
3331 uint32_t new_crush_barrier
,
3332 int32_t old_crush_member
,
3333 int32_t new_crush_member
,
3338 * Determines whether there is an interval change
3340 static bool is_new_interval(
3341 int old_acting_primary
, ///< [in] primary as of lastmap
3342 int new_acting_primary
, ///< [in] primary as of lastmap
3343 const std::vector
<int> &old_acting
, ///< [in] acting as of lastmap
3344 const std::vector
<int> &new_acting
, ///< [in] acting as of osdmap
3345 int old_up_primary
, ///< [in] up primary of lastmap
3346 int new_up_primary
, ///< [in] up primary of osdmap
3347 const std::vector
<int> &old_up
, ///< [in] up as of lastmap
3348 const std::vector
<int> &new_up
, ///< [in] up as of osdmap
3349 const OSDMap
*osdmap
, ///< [in] current map
3350 const OSDMap
*lastmap
, ///< [in] last map
3351 pg_t pgid
///< [in] pgid for pg
3355 * Integrates a new map into *past_intervals, returns true
3356 * if an interval was closed out.
3358 static bool check_new_interval(
3359 int old_acting_primary
, ///< [in] primary as of lastmap
3360 int new_acting_primary
, ///< [in] primary as of osdmap
3361 const std::vector
<int> &old_acting
, ///< [in] acting as of lastmap
3362 const std::vector
<int> &new_acting
, ///< [in] acting as of osdmap
3363 int old_up_primary
, ///< [in] up primary of lastmap
3364 int new_up_primary
, ///< [in] up primary of osdmap
3365 const std::vector
<int> &old_up
, ///< [in] up as of lastmap
3366 const std::vector
<int> &new_up
, ///< [in] up as of osdmap
3367 epoch_t same_interval_since
, ///< [in] as of osdmap
3368 epoch_t last_epoch_clean
, ///< [in] current
3369 const OSDMap
*osdmap
, ///< [in] current map
3370 const OSDMap
*lastmap
, ///< [in] last map
3371 pg_t pgid
, ///< [in] pgid for pg
3372 const IsPGRecoverablePredicate
&could_have_gone_active
, ///< [in] predicate whether the pg can be active
3373 PastIntervals
*past_intervals
, ///< [out] intervals
3374 std::ostream
*out
= 0 ///< [out] debug ostream
3376 static bool check_new_interval(
3377 int old_acting_primary
, ///< [in] primary as of lastmap
3378 int new_acting_primary
, ///< [in] primary as of osdmap
3379 const std::vector
<int> &old_acting
, ///< [in] acting as of lastmap
3380 const std::vector
<int> &new_acting
, ///< [in] acting as of osdmap
3381 int old_up_primary
, ///< [in] up primary of lastmap
3382 int new_up_primary
, ///< [in] up primary of osdmap
3383 const std::vector
<int> &old_up
, ///< [in] up as of lastmap
3384 const std::vector
<int> &new_up
, ///< [in] up as of osdmap
3385 epoch_t same_interval_since
, ///< [in] as of osdmap
3386 epoch_t last_epoch_clean
, ///< [in] current
3387 OSDMapRef osdmap
, ///< [in] current map
3388 OSDMapRef lastmap
, ///< [in] last map
3389 pg_t pgid
, ///< [in] pgid for pg
3390 const IsPGRecoverablePredicate
&could_have_gone_active
, ///< [in] predicate whether the pg can be active
3391 PastIntervals
*past_intervals
, ///< [out] intervals
3392 std::ostream
*out
= 0 ///< [out] debug ostream
3394 return check_new_interval(
3395 old_acting_primary
, new_acting_primary
,
3396 old_acting
, new_acting
,
3397 old_up_primary
, new_up_primary
,
3399 same_interval_since
, last_epoch_clean
,
3400 osdmap
.get(), lastmap
.get(),
3402 could_have_gone_active
,
3407 friend std::ostream
& operator<<(std::ostream
& out
, const PastIntervals
&i
);
3409 template <typename F
>
3410 void iterate_mayberw_back_to(
3413 ceph_assert(past_intervals
);
3414 past_intervals
->iterate_mayberw_back_to(les
, std::forward
<F
>(f
));
3417 ceph_assert(past_intervals
);
3418 past_intervals
->clear();
3422 * Should return a value which gives an indication of the amount
3423 * of state contained
3425 size_t size() const {
3426 ceph_assert(past_intervals
);
3427 return past_intervals
->size();
3430 bool empty() const {
3431 ceph_assert(past_intervals
);
3432 return past_intervals
->empty();
3435 void swap(PastIntervals
&other
) {
3437 swap(other
.past_intervals
, past_intervals
);
3441 * Return all shards which have been in the acting set back to the
3442 * latest epoch to which we have trimmed except for pg_whoami
3444 std::set
<pg_shard_t
> get_might_have_unfound(
3445 pg_shard_t pg_whoami
,
3446 bool ec_pool
) const {
3447 ceph_assert(past_intervals
);
3448 auto ret
= past_intervals
->get_all_participants(ec_pool
);
3449 ret
.erase(pg_whoami
);
3454 * Return all shards which we might want to talk to for peering
3456 std::set
<pg_shard_t
> get_all_probe(
3457 bool ec_pool
) const {
3458 ceph_assert(past_intervals
);
3459 return past_intervals
->get_all_participants(ec_pool
);
3462 /* Return the set of epochs [start, end) represented by the
3463 * past_interval set.
3465 std::pair
<epoch_t
, epoch_t
> get_bounds() const {
3466 ceph_assert(past_intervals
);
3467 return past_intervals
->get_bounds();
3470 void adjust_start_backwards(epoch_t last_epoch_clean
) {
3471 ceph_assert(past_intervals
);
3472 past_intervals
->adjust_start_backwards(last_epoch_clean
);
3482 bool ec_pool
= false;
3483 std::set
<pg_shard_t
> probe
; ///< current+prior OSDs we need to probe.
3484 std::set
<int> down
; ///< down osds that would normally be in @a probe and might be interesting.
3485 std::map
<int, epoch_t
> blocked_by
; ///< current lost_at values for any OSDs in cur set for which (re)marking them lost would affect cur set
3487 bool pg_down
= false; ///< some down osds are included in @a cur; the DOWN pg state bit should be set.
3488 const IsPGRecoverablePredicate
* pcontdec
= nullptr;
3490 PriorSet() = default;
3491 PriorSet(PriorSet
&&) = default;
3492 PriorSet
&operator=(PriorSet
&&) = default;
3494 PriorSet
&operator=(const PriorSet
&) = delete;
3495 PriorSet(const PriorSet
&) = delete;
3497 bool operator==(const PriorSet
&rhs
) const {
3498 return (ec_pool
== rhs
.ec_pool
) &&
3499 (probe
== rhs
.probe
) &&
3500 (down
== rhs
.down
) &&
3501 (blocked_by
== rhs
.blocked_by
) &&
3502 (pg_down
== rhs
.pg_down
);
3505 bool affected_by_map(
3506 const OSDMap
&osdmap
,
3507 const DoutPrefixProvider
*dpp
) const;
3509 // For verifying tests
3512 std::set
<pg_shard_t
> probe
,
3514 std::map
<int, epoch_t
> blocked_by
,
3516 const IsPGRecoverablePredicate
*pcontdec
)
3517 : ec_pool(ec_pool
), probe(probe
), down(down
), blocked_by(blocked_by
),
3518 pg_down(pg_down
), pcontdec(pcontdec
) {}
3521 template <typename F
>
3523 const PastIntervals
&past_intervals
,
3525 epoch_t last_epoch_started
,
3526 const IsPGRecoverablePredicate
*c
,
3528 const std::vector
<int> &up
,
3529 const std::vector
<int> &acting
,
3530 const DoutPrefixProvider
*dpp
);
3532 friend class PastIntervals
;
3535 template <typename
... Args
>
3536 PriorSet
get_prior_set(Args
&&... args
) const {
3537 return PriorSet(*this, std::forward
<Args
>(args
)...);
3540 WRITE_CLASS_ENCODER(PastIntervals
)
3542 std::ostream
& operator<<(std::ostream
& out
, const PastIntervals::pg_interval_t
& i
);
3543 std::ostream
& operator<<(std::ostream
& out
, const PastIntervals
&i
);
3544 std::ostream
& operator<<(std::ostream
& out
, const PastIntervals::PriorSet
&i
);
3546 template <typename F
>
3547 PastIntervals::PriorSet::PriorSet(
3548 const PastIntervals
&past_intervals
,
3550 epoch_t last_epoch_started
,
3551 const IsPGRecoverablePredicate
*c
,
3553 const std::vector
<int> &up
,
3554 const std::vector
<int> &acting
,
3555 const DoutPrefixProvider
*dpp
)
3556 : ec_pool(ec_pool
), pg_down(false), pcontdec(c
)
3559 * We have to be careful to gracefully deal with situations like
3560 * so. Say we have a power outage or something that takes out both
3561 * OSDs, but the monitor doesn't mark them down in the same epoch.
3562 * The history may look like
3566 * 3: let's say B dies for good, too (say, from the power spike)
3569 * which makes it look like B may have applied updates to the PG
3570 * that we need in order to proceed. This sucks...
3572 * To minimize the risk of this happening, we CANNOT go active if
3573 * _any_ OSDs in the prior set are down until we send an MOSDAlive
3574 * to the monitor such that the OSDMap sets osd_up_thru to an epoch.
3575 * Then, we have something like
3582 * -> we can ignore B, bc it couldn't have gone active (alive_thru
3593 * -> we must wait for B, bc it was alive through 2, and could have
3594 * written to the pg.
3596 * If B is really dead, then an administrator will need to manually
3597 * intervene by marking the OSD as "lost."
3600 // Include current acting and up nodes... not because they may
3601 // contain old data (this interval hasn't gone active, obviously),
3602 // but because we want their pg_info to inform choose_acting(), and
3603 // so that we know what they do/do not have explicitly before
3604 // sending them any new info/logs/whatever.
3605 for (unsigned i
= 0; i
< acting
.size(); i
++) {
3606 if (acting
[i
] != pg_pool_t::pg_CRUSH_ITEM_NONE
)
3607 probe
.insert(pg_shard_t(acting
[i
], ec_pool
? shard_id_t(i
) : shard_id_t::NO_SHARD
));
3609 // It may be possible to exclude the up nodes, but let's keep them in
3611 for (unsigned i
= 0; i
< up
.size(); i
++) {
3612 if (up
[i
] != pg_pool_t::pg_CRUSH_ITEM_NONE
)
3613 probe
.insert(pg_shard_t(up
[i
], ec_pool
? shard_id_t(i
) : shard_id_t::NO_SHARD
));
3616 std::set
<pg_shard_t
> all_probe
= past_intervals
.get_all_probe(ec_pool
);
3617 ldpp_dout(dpp
, 10) << "build_prior all_probe " << all_probe
<< dendl
;
3618 for (auto &&i
: all_probe
) {
3619 switch (f(0, i
.osd
, nullptr)) {
3633 past_intervals
.iterate_mayberw_back_to(
3635 [&](epoch_t start
, const std::set
<pg_shard_t
> &acting
) {
3636 ldpp_dout(dpp
, 10) << "build_prior maybe_rw interval:" << start
3637 << ", acting: " << acting
<< dendl
;
3639 // look at candidate osds during this interval. each falls into
3640 // one of three categories: up, down (but potentially
3641 // interesting), or lost (down, but we won't wait for it).
3642 std::set
<pg_shard_t
> up_now
;
3643 std::map
<int, epoch_t
> candidate_blocked_by
;
3644 // any candidates down now (that might have useful data)
3645 bool any_down_now
= false;
3647 // consider ACTING osds
3648 for (auto &&so
: acting
) {
3649 epoch_t lost_at
= 0;
3650 switch (f(start
, so
.osd
, &lost_at
)) {
3652 // include past acting osds if they are up.
3657 ldpp_dout(dpp
, 10) << "build_prior prior osd." << so
.osd
3658 << " no longer exists" << dendl
;
3662 ldpp_dout(dpp
, 10) << "build_prior prior osd." << so
.osd
3663 << " is down, but lost_at " << lost_at
<< dendl
;
3668 ldpp_dout(dpp
, 10) << "build_prior prior osd." << so
.osd
3669 << " is down" << dendl
;
3670 candidate_blocked_by
[so
.osd
] = lost_at
;
3671 any_down_now
= true;
3677 // if not enough osds survived this interval, and we may have gone rw,
3678 // then we need to wait for one of those osds to recover to
3679 // ensure that we haven't lost any information.
3680 if (!(*pcontdec
)(up_now
) && any_down_now
) {
3681 // fixme: how do we identify a "clean" shutdown anyway?
3682 ldpp_dout(dpp
, 10) << "build_prior possibly went active+rw,"
3683 << " insufficient up; including down osds" << dendl
;
3684 ceph_assert(!candidate_blocked_by
.empty());
3687 candidate_blocked_by
.begin(),
3688 candidate_blocked_by
.end());
3692 ldpp_dout(dpp
, 10) << "build_prior final: probe " << probe
3694 << " blocked_by " << blocked_by
3695 << (pg_down
? " pg_down":"")
3699 struct pg_notify_t
{
3700 epoch_t query_epoch
;
3705 PastIntervals past_intervals
;
3707 query_epoch(0), epoch_sent(0), to(shard_id_t::NO_SHARD
),
3708 from(shard_id_t::NO_SHARD
) {}
3712 epoch_t query_epoch
,
3714 const pg_info_t
&info
,
3715 const PastIntervals
& pi
)
3716 : query_epoch(query_epoch
),
3717 epoch_sent(epoch_sent
),
3718 info(info
), to(to
), from(from
),
3719 past_intervals(pi
) {
3720 ceph_assert(from
== info
.pgid
.shard
);
3722 void encode(ceph::buffer::list
&bl
) const;
3723 void decode(ceph::buffer::list::const_iterator
&p
);
3724 void dump(ceph::Formatter
*f
) const;
3725 static void generate_test_instances(std::list
<pg_notify_t
*> &o
);
3727 WRITE_CLASS_ENCODER(pg_notify_t
)
3728 std::ostream
&operator<<(std::ostream
&lhs
, const pg_notify_t
¬ify
);
3732 * pg_query_t - used to ask a peer for information about a pg.
3734 * note: if version=0, type=LOG, then we just provide our full log.
3743 std::string_view
get_type_name() const {
3745 case INFO
: return "info";
3746 case LOG
: return "log";
3747 case MISSING
: return "missing";
3748 case FULLLOG
: return "fulllog";
3749 default: return "???";
3755 pg_history_t history
;
3760 pg_query_t() : type(-1), epoch_sent(0), to(shard_id_t::NO_SHARD
),
3761 from(shard_id_t::NO_SHARD
) {}
3766 const pg_history_t
& h
,
3770 epoch_sent(epoch_sent
),
3771 to(to
), from(from
) {
3772 ceph_assert(t
!= LOG
);
3779 const pg_history_t
& h
,
3781 : type(t
), since(s
), history(h
),
3782 epoch_sent(epoch_sent
), to(to
), from(from
) {
3783 ceph_assert(t
== LOG
);
3786 void encode(ceph::buffer::list
&bl
, uint64_t features
) const;
3787 void decode(ceph::buffer::list::const_iterator
&bl
);
3789 void dump(ceph::Formatter
*f
) const;
3790 static void generate_test_instances(std::list
<pg_query_t
*>& o
);
3792 WRITE_CLASS_ENCODER_FEATURES(pg_query_t
)
3794 inline std::ostream
& operator<<(std::ostream
& out
, const pg_query_t
& q
) {
3795 out
<< "query(" << q
.get_type_name() << " " << q
.since
;
3796 if (q
.type
== pg_query_t::LOG
)
3797 out
<< " " << q
.history
;
3798 out
<< " epoch_sent " << q
.epoch_sent
;
3804 * pg_lease_t - readable lease metadata, from primary -> non-primary
3806 * This metadata serves to increase either or both of the lease expiration
3807 * and upper bound on the non-primary.
3810 /// pg readable_until value; replicas must not be readable beyond this
3811 ceph::signedspan readable_until
= ceph::signedspan::zero();
3813 /// upper bound on any acting osd's readable_until
3814 ceph::signedspan readable_until_ub
= ceph::signedspan::zero();
3816 /// duration of the lease (in case clock deltas aren't available)
3817 ceph::signedspan interval
= ceph::signedspan::zero();
3820 pg_lease_t(ceph::signedspan ru
, ceph::signedspan ruub
,
3822 : readable_until(ru
),
3823 readable_until_ub(ruub
),
3826 void encode(ceph::buffer::list
&bl
) const;
3827 void decode(ceph::buffer::list::const_iterator
&bl
);
3828 void dump(ceph::Formatter
*f
) const;
3829 static void generate_test_instances(std::list
<pg_lease_t
*>& o
);
3831 friend std::ostream
& operator<<(std::ostream
& out
, const pg_lease_t
& l
) {
3832 return out
<< "pg_lease(ru " << l
.readable_until
3833 << " ub " << l
.readable_until_ub
3834 << " int " << l
.interval
<< ")";
3837 WRITE_CLASS_ENCODER(pg_lease_t
)
3840 * pg_lease_ack_t - lease ack, from non-primary -> primary
3842 * This metadata acknowledges to the primary what a non-primary's noted
3845 struct pg_lease_ack_t
{
3846 /// highest upper bound non-primary has recorded (primary's clock)
3847 ceph::signedspan readable_until_ub
= ceph::signedspan::zero();
3850 pg_lease_ack_t(ceph::signedspan ub
)
3851 : readable_until_ub(ub
) {}
3853 void encode(ceph::buffer::list
&bl
) const;
3854 void decode(ceph::buffer::list::const_iterator
&bl
);
3855 void dump(ceph::Formatter
*f
) const;
3856 static void generate_test_instances(std::list
<pg_lease_ack_t
*>& o
);
3858 friend std::ostream
& operator<<(std::ostream
& out
, const pg_lease_ack_t
& l
) {
3859 return out
<< "pg_lease_ack(ruub " << l
.readable_until_ub
<< ")";
3862 WRITE_CLASS_ENCODER(pg_lease_ack_t
)
3867 class ObjectModDesc
{
3868 bool can_local_rollback
;
3869 bool rollback_info_completed
;
3871 // version required to decode, reflected in encode/decode version
3872 __u8 max_required_version
= 1;
3876 virtual void append(uint64_t old_offset
) {}
3877 virtual void setattrs(std::map
<std::string
, std::optional
<ceph::buffer::list
>> &attrs
) {}
3878 virtual void rmobject(version_t old_version
) {}
3880 * Used to support the unfound_lost_delete log event: if the stashed
3881 * version exists, we unstash it, otherwise, we do nothing. This way
3882 * each replica rolls back to whatever state it had prior to the attempt
3883 * at mark unfound lost delete
3885 virtual void try_rmobject(version_t old_version
) {
3886 rmobject(old_version
);
3888 virtual void create() {}
3889 virtual void update_snaps(const std::set
<snapid_t
> &old_snaps
) {}
3890 virtual void rollback_extents(
3892 const std::vector
<std::pair
<uint64_t, uint64_t> > &extents
) {}
3893 virtual ~Visitor() {}
3895 void visit(Visitor
*visitor
) const;
3896 mutable ceph::buffer::list bl
;
3904 ROLLBACK_EXTENTS
= 7
3906 ObjectModDesc() : can_local_rollback(true), rollback_info_completed(false) {
3907 bl
.reassign_to_mempool(mempool::mempool_osd_pglog
);
3909 void claim(ObjectModDesc
&other
) {
3910 bl
= std::move(other
.bl
);
3911 can_local_rollback
= other
.can_local_rollback
;
3912 rollback_info_completed
= other
.rollback_info_completed
;
3914 void claim_append(ObjectModDesc
&other
) {
3915 if (!can_local_rollback
|| rollback_info_completed
)
3917 if (!other
.can_local_rollback
) {
3918 mark_unrollbackable();
3921 bl
.claim_append(other
.bl
);
3922 rollback_info_completed
= other
.rollback_info_completed
;
3924 void swap(ObjectModDesc
&other
) {
3928 swap(other
.can_local_rollback
, can_local_rollback
);
3929 swap(other
.rollback_info_completed
, rollback_info_completed
);
3930 swap(other
.max_required_version
, max_required_version
);
3932 void append_id(ModID id
) {
3937 void append(uint64_t old_size
) {
3938 if (!can_local_rollback
|| rollback_info_completed
)
3940 ENCODE_START(1, 1, bl
);
3942 encode(old_size
, bl
);
3945 void setattrs(std::map
<std::string
, std::optional
<ceph::buffer::list
>> &old_attrs
) {
3946 if (!can_local_rollback
|| rollback_info_completed
)
3948 ENCODE_START(1, 1, bl
);
3949 append_id(SETATTRS
);
3950 encode(old_attrs
, bl
);
3953 bool rmobject(version_t deletion_version
) {
3954 if (!can_local_rollback
|| rollback_info_completed
)
3956 ENCODE_START(1, 1, bl
);
3958 encode(deletion_version
, bl
);
3960 rollback_info_completed
= true;
3963 bool try_rmobject(version_t deletion_version
) {
3964 if (!can_local_rollback
|| rollback_info_completed
)
3966 ENCODE_START(1, 1, bl
);
3967 append_id(TRY_DELETE
);
3968 encode(deletion_version
, bl
);
3970 rollback_info_completed
= true;
3974 if (!can_local_rollback
|| rollback_info_completed
)
3976 rollback_info_completed
= true;
3977 ENCODE_START(1, 1, bl
);
3981 void update_snaps(const std::set
<snapid_t
> &old_snaps
) {
3982 if (!can_local_rollback
|| rollback_info_completed
)
3984 ENCODE_START(1, 1, bl
);
3985 append_id(UPDATE_SNAPS
);
3986 encode(old_snaps
, bl
);
3989 void rollback_extents(
3990 version_t gen
, const std::vector
<std::pair
<uint64_t, uint64_t> > &extents
) {
3991 ceph_assert(can_local_rollback
);
3992 ceph_assert(!rollback_info_completed
);
3993 if (max_required_version
< 2)
3994 max_required_version
= 2;
3995 ENCODE_START(2, 2, bl
);
3996 append_id(ROLLBACK_EXTENTS
);
3998 encode(extents
, bl
);
4002 // cannot be rolled back
4003 void mark_unrollbackable() {
4004 can_local_rollback
= false;
4007 bool can_rollback() const {
4008 return can_local_rollback
;
4010 bool empty() const {
4011 return can_local_rollback
&& (bl
.length() == 0);
4014 bool requires_kraken() const {
4015 return max_required_version
>= 2;
4019 * Create fresh copy of bl bytes to avoid keeping large buffers around
4020 * in the case that bl contains ptrs which point into a much larger
4023 void trim_bl() const {
4024 if (bl
.length() > 0)
4027 void encode(ceph::buffer::list
&bl
) const;
4028 void decode(ceph::buffer::list::const_iterator
&bl
);
4029 void dump(ceph::Formatter
*f
) const;
4030 static void generate_test_instances(std::list
<ObjectModDesc
*>& o
);
4032 WRITE_CLASS_ENCODER(ObjectModDesc
)
4034 class ObjectCleanRegions
{
4038 interval_set
<uint64_t> clean_offsets
;
4039 static std::atomic
<uint32_t> max_num_intervals
;
4042 * trim the number of intervals if clean_offsets.num_intervals()
4043 * exceeds the given upbound max_num_intervals
4044 * etc. max_num_intervals=2, clean_offsets:{[5~10], [20~5]}
4045 * then new interval [30~10] will evict out the shortest one [20~5]
4046 * finally, clean_offsets becomes {[5~10], [30~10]}
4049 friend std::ostream
& operator<<(std::ostream
& out
, const ObjectCleanRegions
& ocr
);
4051 ObjectCleanRegions() : new_object(false), clean_omap(true) {
4052 clean_offsets
.insert(0, (uint64_t)-1);
4054 ObjectCleanRegions(uint64_t offset
, uint64_t len
, bool co
)
4055 : new_object(false), clean_omap(co
) {
4056 clean_offsets
.insert(offset
, len
);
4058 bool operator==(const ObjectCleanRegions
&orc
) const {
4059 return new_object
== orc
.new_object
&& clean_omap
== orc
.clean_omap
&& clean_offsets
== orc
.clean_offsets
;
4061 static void set_max_num_intervals(uint32_t num
);
4062 void merge(const ObjectCleanRegions
&other
);
4063 void mark_data_region_dirty(uint64_t offset
, uint64_t len
);
4064 void mark_omap_dirty();
4065 void mark_object_new();
4066 void mark_fully_dirty();
4067 interval_set
<uint64_t> get_dirty_regions() const;
4068 bool omap_is_dirty() const;
4069 bool object_is_exist() const;
4070 bool is_clean_region(uint64_t offset
, uint64_t len
) const;
4072 void encode(ceph::buffer::list
&bl
) const;
4073 void decode(ceph::buffer::list::const_iterator
&bl
);
4074 void dump(ceph::Formatter
*f
) const;
4075 static void generate_test_instances(std::list
<ObjectCleanRegions
*>& o
);
4077 WRITE_CLASS_ENCODER(ObjectCleanRegions
)
4078 std::ostream
& operator<<(std::ostream
& out
, const ObjectCleanRegions
& ocr
);
4085 ceph::buffer::list indata
, outdata
;
4089 // FIPS zeroization audit 20191115: this memset clean for security
4090 memset(&op
, 0, sizeof(ceph_osd_op
));
4093 OSDOp(const int op_code
) {
4094 // FIPS zeroization audit 20191115: this memset clean for security
4095 memset(&op
, 0, sizeof(ceph_osd_op
));
4100 * split a ceph::buffer::list into constituent indata members of a vector of OSDOps
4102 * @param ops [out] vector of OSDOps
4103 * @param in [in] combined data buffer
4105 template<typename V
>
4106 static void split_osd_op_vector_in_data(V
& ops
,
4107 ceph::buffer::list
& in
) {
4108 ceph::buffer::list::iterator datap
= in
.begin();
4109 for (unsigned i
= 0; i
< ops
.size(); i
++) {
4110 if (ops
[i
].op
.payload_len
) {
4111 datap
.copy(ops
[i
].op
.payload_len
, ops
[i
].indata
);
4117 * merge indata members of a vector of OSDOp into a single ceph::buffer::list
4119 * Notably this also encodes certain other OSDOp data into the data
4120 * buffer, including the sobject_t soid.
4122 * @param ops [in] vector of OSDOps
4123 * @param out [out] combined data buffer
4125 template<typename V
>
4126 static void merge_osd_op_vector_in_data(V
& ops
, ceph::buffer::list
& out
) {
4127 for (unsigned i
= 0; i
< ops
.size(); i
++) {
4128 if (ops
[i
].indata
.length()) {
4129 ops
[i
].op
.payload_len
= ops
[i
].indata
.length();
4130 out
.append(ops
[i
].indata
);
4136 * split a ceph::buffer::list into constituent outdata members of a vector of OSDOps
4138 * @param ops [out] vector of OSDOps
4139 * @param in [in] combined data buffer
4141 static void split_osd_op_vector_out_data(std::vector
<OSDOp
>& ops
, ceph::buffer::list
& in
);
4144 * merge outdata members of a vector of OSDOps into a single ceph::buffer::list
4146 * @param ops [in] vector of OSDOps
4147 * @param out [out] combined data buffer
4149 static void merge_osd_op_vector_out_data(std::vector
<OSDOp
>& ops
, ceph::buffer::list
& out
);
4152 * Clear data as much as possible, leave minimal data for historical op dump
4154 * @param ops [in] vector of OSDOps
4156 template<typename V
>
4157 static void clear_data(V
& ops
) {
4158 for (unsigned i
= 0; i
< ops
.size(); i
++) {
4161 if (ceph_osd_op_type_attr(op
.op
.op
) &&
4162 op
.op
.xattr
.name_len
&&
4163 op
.indata
.length() >= op
.op
.xattr
.name_len
) {
4164 ceph::buffer::list bl
;
4165 bl
.push_back(ceph::buffer::ptr_node::create(op
.op
.xattr
.name_len
));
4166 bl
.begin().copy_in(op
.op
.xattr
.name_len
, op
.indata
);
4167 op
.indata
= std::move(bl
);
4168 } else if (ceph_osd_op_type_exec(op
.op
.op
) &&
4169 op
.op
.cls
.class_len
&&
4170 op
.indata
.length() >
4171 (op
.op
.cls
.class_len
+ op
.op
.cls
.method_len
)) {
4172 __u8 len
= op
.op
.cls
.class_len
+ op
.op
.cls
.method_len
;
4173 ceph::buffer::list bl
;
4174 bl
.push_back(ceph::buffer::ptr_node::create(len
));
4175 bl
.begin().copy_in(len
, op
.indata
);
4176 op
.indata
= std::move(bl
);
4183 std::ostream
& operator<<(std::ostream
& out
, const OSDOp
& op
);
4185 struct pg_log_op_return_item_t
{
4187 ceph::buffer::list bl
;
4188 void encode(ceph::buffer::list
& p
) const {
4193 void decode(ceph::buffer::list::const_iterator
& p
) {
4198 void dump(ceph::Formatter
*f
) const {
4199 f
->dump_int("rval", rval
);
4200 f
->dump_unsigned("bl_length", bl
.length());
4202 friend bool operator==(const pg_log_op_return_item_t
& lhs
,
4203 const pg_log_op_return_item_t
& rhs
) {
4204 return lhs
.rval
== rhs
.rval
&&
4205 lhs
.bl
.contents_equal(rhs
.bl
);
4207 friend bool operator!=(const pg_log_op_return_item_t
& lhs
,
4208 const pg_log_op_return_item_t
& rhs
) {
4209 return !(lhs
== rhs
);
4211 friend std::ostream
& operator<<(std::ostream
& out
, const pg_log_op_return_item_t
& i
) {
4212 return out
<< "r=" << i
.rval
<< "+" << i
.bl
.length() << "b";
4215 WRITE_CLASS_ENCODER(pg_log_op_return_item_t
)
4218 * pg_log_entry_t - single entry/event in pg log
4221 struct pg_log_entry_t
{
4223 MODIFY
= 1, // some unspecified modification (but not *all* modifications)
4224 CLONE
= 2, // cloned object from head
4225 DELETE
= 3, // deleted object
4226 //BACKLOG = 4, // event invented by generate_backlog [obsolete]
4227 LOST_REVERT
= 5, // lost new version, revert to an older version.
4228 LOST_DELETE
= 6, // lost new version, revert to no object (deleted).
4229 LOST_MARK
= 7, // lost new version, now EIO
4230 PROMOTE
= 8, // promoted object from another tier
4231 CLEAN
= 9, // mark an object clean
4232 ERROR
= 10, // write that returned an error
4234 static const char *get_op_name(int op
) {
4258 const char *get_op_name() const {
4259 return get_op_name(op
);
4262 // describes state for a locally-rollbackable entry
4263 ObjectModDesc mod_desc
;
4264 ceph::buffer::list snaps
; // only for clone entries
4266 osd_reqid_t reqid
; // caller+tid to uniquely identify request
4267 mempool::osd_pglog::vector
<std::pair
<osd_reqid_t
, version_t
> > extra_reqids
;
4269 /// map extra_reqids by index to error return code (if any)
4270 mempool::osd_pglog::map
<uint32_t, int> extra_reqid_return_codes
;
4272 eversion_t version
, prior_version
, reverting_to
;
4273 version_t user_version
; // the user version for this entry
4274 utime_t mtime
; // this is the _user_ mtime, mind you
4275 int32_t return_code
; // only stored for ERRORs for dup detection
4277 std::vector
<pg_log_op_return_item_t
> op_returns
;
4280 bool invalid_hash
; // only when decoding sobject_t based entries
4281 bool invalid_pool
; // only when decoding pool-less hobject based entries
4282 ObjectCleanRegions clean_regions
;
4285 : user_version(0), return_code(0), op(0),
4286 invalid_hash(false), invalid_pool(false) {
4287 snaps
.reassign_to_mempool(mempool::mempool_osd_pglog
);
4289 pg_log_entry_t(int _op
, const hobject_t
& _soid
,
4290 const eversion_t
& v
, const eversion_t
& pv
,
4292 const osd_reqid_t
& rid
, const utime_t
& mt
,
4294 : soid(_soid
), reqid(rid
), version(v
), prior_version(pv
), user_version(uv
),
4295 mtime(mt
), return_code(return_code
), op(_op
),
4296 invalid_hash(false), invalid_pool(false) {
4297 snaps
.reassign_to_mempool(mempool::mempool_osd_pglog
);
4300 bool is_clone() const { return op
== CLONE
; }
4301 bool is_modify() const { return op
== MODIFY
; }
4302 bool is_promote() const { return op
== PROMOTE
; }
4303 bool is_clean() const { return op
== CLEAN
; }
4304 bool is_lost_revert() const { return op
== LOST_REVERT
; }
4305 bool is_lost_delete() const { return op
== LOST_DELETE
; }
4306 bool is_lost_mark() const { return op
== LOST_MARK
; }
4307 bool is_error() const { return op
== ERROR
; }
4309 bool is_update() const {
4311 is_clone() || is_modify() || is_promote() || is_clean() ||
4312 is_lost_revert() || is_lost_mark();
4314 bool is_delete() const {
4315 return op
== DELETE
|| op
== LOST_DELETE
;
4318 bool can_rollback() const {
4319 return mod_desc
.can_rollback();
4322 void mark_unrollbackable() {
4323 mod_desc
.mark_unrollbackable();
4326 bool requires_kraken() const {
4327 return mod_desc
.requires_kraken();
4330 // Errors are only used for dup detection, whereas
4331 // the index by objects is used by recovery, copy_get,
4332 // and other facilities that don't expect or need to
4333 // be aware of error entries.
4334 bool object_is_indexed() const {
4338 bool reqid_is_indexed() const {
4339 return reqid
!= osd_reqid_t() &&
4340 (op
== MODIFY
|| op
== DELETE
|| op
== ERROR
);
4343 void set_op_returns(const std::vector
<OSDOp
>& ops
) {
4344 op_returns
.resize(ops
.size());
4345 for (unsigned i
= 0; i
< ops
.size(); ++i
) {
4346 op_returns
[i
].rval
= ops
[i
].rval
;
4347 op_returns
[i
].bl
= ops
[i
].outdata
;
4351 std::string
get_key_name() const;
4352 void encode_with_checksum(ceph::buffer::list
& bl
) const;
4353 void decode_with_checksum(ceph::buffer::list::const_iterator
& p
);
4355 void encode(ceph::buffer::list
&bl
) const;
4356 void decode(ceph::buffer::list::const_iterator
&bl
);
4357 void dump(ceph::Formatter
*f
) const;
4358 static void generate_test_instances(std::list
<pg_log_entry_t
*>& o
);
4361 WRITE_CLASS_ENCODER(pg_log_entry_t
)
4363 std::ostream
& operator<<(std::ostream
& out
, const pg_log_entry_t
& e
);
4365 struct pg_log_dup_t
{
4366 osd_reqid_t reqid
; // caller+tid to uniquely identify request
4368 version_t user_version
; // the user version for this entry
4369 int32_t return_code
; // only stored for ERRORs for dup detection
4371 std::vector
<pg_log_op_return_item_t
> op_returns
;
4374 : user_version(0), return_code(0)
4376 explicit pg_log_dup_t(const pg_log_entry_t
& entry
)
4377 : reqid(entry
.reqid
), version(entry
.version
),
4378 user_version(entry
.user_version
),
4379 return_code(entry
.return_code
),
4380 op_returns(entry
.op_returns
)
4382 pg_log_dup_t(const eversion_t
& v
, version_t uv
,
4383 const osd_reqid_t
& rid
, int return_code
)
4384 : reqid(rid
), version(v
), user_version(uv
),
4385 return_code(return_code
)
4388 std::string
get_key_name() const;
4389 void encode(ceph::buffer::list
&bl
) const;
4390 void decode(ceph::buffer::list::const_iterator
&bl
);
4391 void dump(ceph::Formatter
*f
) const;
4392 static void generate_test_instances(std::list
<pg_log_dup_t
*>& o
);
4394 bool operator==(const pg_log_dup_t
&rhs
) const {
4395 return reqid
== rhs
.reqid
&&
4396 version
== rhs
.version
&&
4397 user_version
== rhs
.user_version
&&
4398 return_code
== rhs
.return_code
&&
4399 op_returns
== rhs
.op_returns
;
4401 bool operator!=(const pg_log_dup_t
&rhs
) const {
4402 return !(*this == rhs
);
4405 friend std::ostream
& operator<<(std::ostream
& out
, const pg_log_dup_t
& e
);
4407 WRITE_CLASS_ENCODER(pg_log_dup_t
)
4409 std::ostream
& operator<<(std::ostream
& out
, const pg_log_dup_t
& e
);
4412 * pg_log_t - incremental log of recent pg changes.
4414 * serves as a recovery queue for recent changes.
4418 * head - newest entry (update|delete)
4419 * tail - entry previous to oldest (update|delete) for which we have
4420 * complete negative information.
4421 * i.e. we can infer pg contents for any store whose last_update >= tail.
4423 eversion_t head
; // newest entry
4424 eversion_t tail
; // version prior to oldest
4427 // We can rollback rollback-able entries > can_rollback_to
4428 eversion_t can_rollback_to
;
4430 // always <= can_rollback_to, indicates how far stashed rollback
4431 // data can be found
4432 eversion_t rollback_info_trimmed_to
;
4436 mempool::osd_pglog::list
<pg_log_entry_t
> log
;
4438 // entries just for dup op detection ordered oldest to newest
4439 mempool::osd_pglog::list
<pg_log_dup_t
> dups
;
4441 pg_log_t() = default;
4442 pg_log_t(const eversion_t
&last_update
,
4443 const eversion_t
&log_tail
,
4444 const eversion_t
&can_rollback_to
,
4445 const eversion_t
&rollback_info_trimmed_to
,
4446 mempool::osd_pglog::list
<pg_log_entry_t
> &&entries
,
4447 mempool::osd_pglog::list
<pg_log_dup_t
> &&dup_entries
)
4448 : head(last_update
), tail(log_tail
), can_rollback_to(can_rollback_to
),
4449 rollback_info_trimmed_to(rollback_info_trimmed_to
),
4450 log(std::move(entries
)), dups(std::move(dup_entries
)) {}
4451 pg_log_t(const eversion_t
&last_update
,
4452 const eversion_t
&log_tail
,
4453 const eversion_t
&can_rollback_to
,
4454 const eversion_t
&rollback_info_trimmed_to
,
4455 const std::list
<pg_log_entry_t
> &entries
,
4456 const std::list
<pg_log_dup_t
> &dup_entries
)
4457 : head(last_update
), tail(log_tail
), can_rollback_to(can_rollback_to
),
4458 rollback_info_trimmed_to(rollback_info_trimmed_to
) {
4459 for (auto &&entry
: entries
) {
4460 log
.push_back(entry
);
4462 for (auto &&entry
: dup_entries
) {
4463 dups
.push_back(entry
);
4469 rollback_info_trimmed_to
= can_rollback_to
= head
= tail
= z
;
4474 eversion_t
get_rollback_info_trimmed_to() const {
4475 return rollback_info_trimmed_to
;
4477 eversion_t
get_can_rollback_to() const {
4478 return can_rollback_to
;
4482 pg_log_t
split_out_child(pg_t child_pgid
, unsigned split_bits
) {
4483 mempool::osd_pglog::list
<pg_log_entry_t
> oldlog
, childlog
;
4486 eversion_t old_tail
;
4487 unsigned mask
= ~((~0)<<split_bits
);
4488 for (auto i
= oldlog
.begin();
4491 if ((i
->soid
.get_hash() & mask
) == child_pgid
.m_seed
) {
4492 childlog
.push_back(*i
);
4499 // osd_reqid is unique, so it doesn't matter if there are extra
4500 // dup entries in each pg. To avoid storing oid with the dup
4501 // entries, just copy the whole list.
4502 auto childdups(dups
);
4508 rollback_info_trimmed_to
,
4509 std::move(childlog
),
4510 std::move(childdups
));
4513 mempool::osd_pglog::list
<pg_log_entry_t
> rewind_from_head(eversion_t newhead
) {
4514 ceph_assert(newhead
>= tail
);
4516 mempool::osd_pglog::list
<pg_log_entry_t
>::iterator p
= log
.end();
4517 mempool::osd_pglog::list
<pg_log_entry_t
> divergent
;
4519 if (p
== log
.begin()) {
4520 // yikes, the whole thing is divergent!
4522 swap(divergent
, log
);
4526 if (p
->version
.version
<= newhead
.version
) {
4528 * look at eversion.version here. we want to avoid a situation like:
4529 * our log: 100'10 (0'0) m 10000004d3a.00000000/head by client4225.1:18529
4530 * new log: 122'10 (0'0) m 10000004d3a.00000000/head by client4225.1:18529
4531 * lower_bound = 100'9
4532 * i.e, same request, different version. If the eversion.version is > the
4533 * lower_bound, we it is divergent.
4536 divergent
.splice(divergent
.begin(), log
, p
, log
.end());
4539 ceph_assert(p
->version
> newhead
);
4543 if (can_rollback_to
> newhead
)
4544 can_rollback_to
= newhead
;
4546 if (rollback_info_trimmed_to
> newhead
)
4547 rollback_info_trimmed_to
= newhead
;
4552 void merge_from(const std::vector
<pg_log_t
*>& slogs
, eversion_t last_update
) {
4555 // sort and merge dups
4556 std::multimap
<eversion_t
,pg_log_dup_t
> sorted
;
4557 for (auto& d
: dups
) {
4558 sorted
.emplace(d
.version
, d
);
4560 for (auto l
: slogs
) {
4561 for (auto& d
: l
->dups
) {
4562 sorted
.emplace(d
.version
, d
);
4566 for (auto& i
: sorted
) {
4567 dups
.push_back(i
.second
);
4572 can_rollback_to
= last_update
;
4573 rollback_info_trimmed_to
= last_update
;
4576 bool empty() const {
4581 return head
.version
== 0 && head
.epoch
== 0;
4584 uint64_t approx_size() const {
4585 return head
.version
- tail
.version
;
4588 static void filter_log(spg_t import_pgid
, const OSDMap
&curmap
,
4589 const std::string
&hit_set_namespace
, const pg_log_t
&in
,
4590 pg_log_t
&out
, pg_log_t
&reject
);
4593 * copy entries from the tail of another pg_log_t
4595 * @param other pg_log_t to copy from
4596 * @param from copy entries after this version
4598 void copy_after(CephContext
* cct
, const pg_log_t
&other
, eversion_t from
);
4601 * copy up to N entries
4603 * @param other source log
4604 * @param max max number of entries to copy
4606 void copy_up_to(CephContext
* cct
, const pg_log_t
&other
, int max
);
4608 std::ostream
& print(std::ostream
& out
) const;
4610 void encode(ceph::buffer::list
&bl
) const;
4611 void decode(ceph::buffer::list::const_iterator
&bl
, int64_t pool
= -1);
4612 void dump(ceph::Formatter
*f
) const;
4613 static void generate_test_instances(std::list
<pg_log_t
*>& o
);
4615 WRITE_CLASS_ENCODER(pg_log_t
)
4617 inline std::ostream
& operator<<(std::ostream
& out
, const pg_log_t
& log
)
4619 out
<< "log((" << log
.tail
<< "," << log
.head
<< "], crt="
4620 << log
.get_can_rollback_to() << ")";
4626 * pg_missing_t - summary of missing objects.
4628 * kept in memory, as a supplement to pg_log_t
4629 * also used to pass missing info in messages.
4631 struct pg_missing_item
{
4632 eversion_t need
, have
;
4633 ObjectCleanRegions clean_regions
;
4634 enum missing_flags_t
{
4638 pg_missing_item() : flags(FLAG_NONE
) {}
4639 explicit pg_missing_item(eversion_t n
) : need(n
), flags(FLAG_NONE
) {} // have no old version
4640 pg_missing_item(eversion_t n
, eversion_t h
, bool is_delete
=false, bool old_style
= false) :
4642 set_delete(is_delete
);
4644 clean_regions
.mark_fully_dirty();
4647 void encode(ceph::buffer::list
& bl
, uint64_t features
) const {
4649 if (HAVE_FEATURE(features
, SERVER_OCTOPUS
)) {
4650 // encoding a zeroed eversion_t to differentiate between OSD_RECOVERY_DELETES、
4651 // SERVER_OCTOPUS and legacy unversioned encoding - a need value of 0'0 is not
4652 // possible. This can be replaced with the legacy encoding
4653 encode(eversion_t(), bl
);
4654 encode(eversion_t(-1, -1), bl
);
4657 encode(static_cast<uint8_t>(flags
), bl
);
4658 encode(clean_regions
, bl
);
4660 encode(eversion_t(), bl
);
4663 encode(static_cast<uint8_t>(flags
), bl
);
4666 void decode(ceph::buffer::list::const_iterator
& bl
) {
4671 if(l
== eversion_t(-1, -1)) {
4677 flags
= static_cast<missing_flags_t
>(f
);
4678 decode(clean_regions
, bl
);
4680 // support OSD_RECOVERY_DELETES
4685 flags
= static_cast<missing_flags_t
>(f
);
4686 clean_regions
.mark_fully_dirty();
4690 void set_delete(bool is_delete
) {
4691 flags
= is_delete
? FLAG_DELETE
: FLAG_NONE
;
4694 bool is_delete() const {
4695 return (flags
& FLAG_DELETE
) == FLAG_DELETE
;
4698 std::string
flag_str() const {
4699 if (flags
== FLAG_NONE
) {
4706 void dump(ceph::Formatter
*f
) const {
4707 f
->dump_stream("need") << need
;
4708 f
->dump_stream("have") << have
;
4709 f
->dump_stream("flags") << flag_str();
4710 f
->dump_stream("clean_regions") << clean_regions
;
4712 static void generate_test_instances(std::list
<pg_missing_item
*>& o
) {
4713 o
.push_back(new pg_missing_item
);
4714 o
.push_back(new pg_missing_item
);
4715 o
.back()->need
= eversion_t(1, 2);
4716 o
.back()->have
= eversion_t(1, 1);
4717 o
.push_back(new pg_missing_item
);
4718 o
.back()->need
= eversion_t(3, 5);
4719 o
.back()->have
= eversion_t(3, 4);
4720 o
.back()->clean_regions
.mark_data_region_dirty(4096, 8192);
4721 o
.back()->clean_regions
.mark_omap_dirty();
4722 o
.back()->flags
= FLAG_DELETE
;
4724 bool operator==(const pg_missing_item
&rhs
) const {
4725 return need
== rhs
.need
&& have
== rhs
.have
&& flags
== rhs
.flags
;
4727 bool operator!=(const pg_missing_item
&rhs
) const {
4728 return !(*this == rhs
);
4731 WRITE_CLASS_ENCODER_FEATURES(pg_missing_item
)
4732 std::ostream
& operator<<(std::ostream
& out
, const pg_missing_item
&item
);
4733 #if FMT_VERSION >= 90000
4734 template <> struct fmt::formatter
<pg_missing_item
> : fmt::ostream_formatter
{};
4737 class pg_missing_const_i
{
4739 virtual const std::map
<hobject_t
, pg_missing_item
> &
4740 get_items() const = 0;
4741 virtual const std::map
<version_t
, hobject_t
> &get_rmissing() const = 0;
4742 virtual bool get_may_include_deletes() const = 0;
4743 virtual unsigned int num_missing() const = 0;
4744 virtual bool have_missing() const = 0;
4745 virtual bool is_missing(const hobject_t
& oid
, pg_missing_item
*out
= nullptr) const = 0;
4746 virtual bool is_missing(const hobject_t
& oid
, eversion_t v
) const = 0;
4747 virtual ~pg_missing_const_i() {}
4751 template <bool Track
>
4752 class ChangeTracker
{
4754 void changed(const hobject_t
&obj
) {}
4755 template <typename F
>
4756 void get_changed(F
&&f
) const {}
4758 bool is_clean() const {
4763 class ChangeTracker
<true> {
4764 std::set
<hobject_t
> _changed
;
4766 void changed(const hobject_t
&obj
) {
4767 _changed
.insert(obj
);
4769 template <typename F
>
4770 void get_changed(F
&&f
) const {
4771 for (auto const &i
: _changed
) {
4778 bool is_clean() const {
4779 return _changed
.empty();
4783 template <bool TrackChanges
>
4784 class pg_missing_set
: public pg_missing_const_i
{
4785 using item
= pg_missing_item
;
4786 std::map
<hobject_t
, item
> missing
; // oid -> (need v, have v)
4787 std::map
<version_t
, hobject_t
> rmissing
; // v -> oid
4788 ChangeTracker
<TrackChanges
> tracker
;
4791 pg_missing_set() = default;
4793 template <typename missing_type
>
4794 pg_missing_set(const missing_type
&m
) {
4795 missing
= m
.get_items();
4796 rmissing
= m
.get_rmissing();
4797 may_include_deletes
= m
.get_may_include_deletes();
4798 for (auto &&i
: missing
)
4799 tracker
.changed(i
.first
);
4802 bool may_include_deletes
= false;
4804 const std::map
<hobject_t
, item
> &get_items() const override
{
4807 const std::map
<version_t
, hobject_t
> &get_rmissing() const override
{
4810 bool get_may_include_deletes() const override
{
4811 return may_include_deletes
;
4813 unsigned int num_missing() const override
{
4814 return missing
.size();
4816 bool have_missing() const override
{
4817 return !missing
.empty();
4819 void merge(const pg_log_entry_t
& e
) {
4820 auto miter
= missing
.find(e
.soid
);
4821 if (miter
!= missing
.end() && miter
->second
.have
!= eversion_t() && e
.version
> miter
->second
.have
)
4822 miter
->second
.clean_regions
.merge(e
.clean_regions
);
4824 bool is_missing(const hobject_t
& oid
, pg_missing_item
*out
= nullptr) const override
{
4825 auto iter
= missing
.find(oid
);
4826 if (iter
== missing
.end())
4829 *out
= iter
->second
;
4832 bool is_missing(const hobject_t
& oid
, eversion_t v
) const override
{
4833 std::map
<hobject_t
, item
>::const_iterator m
=
4835 if (m
== missing
.end())
4837 const item
&item(m
->second
);
4842 eversion_t
get_oldest_need() const {
4843 if (missing
.empty()) {
4844 return eversion_t();
4846 auto it
= missing
.find(rmissing
.begin()->second
);
4847 ceph_assert(it
!= missing
.end());
4848 return it
->second
.need
;
4851 void claim(pg_missing_set
&& o
) {
4852 static_assert(!TrackChanges
, "Can't use claim with TrackChanges");
4853 missing
= std::move(o
.missing
);
4854 rmissing
= std::move(o
.rmissing
);
4858 * this needs to be called in log order as we extend the log. it
4859 * assumes missing is accurate up through the previous log entry.
4861 void add_next_event(const pg_log_entry_t
& e
) {
4862 std::map
<hobject_t
, item
>::iterator missing_it
;
4863 missing_it
= missing
.find(e
.soid
);
4864 bool is_missing_divergent_item
= missing_it
!= missing
.end();
4865 if (e
.prior_version
== eversion_t() || e
.is_clone()) {
4867 if (is_missing_divergent_item
) { // use iterator
4868 rmissing
.erase(missing_it
->second
.need
.version
);
4870 missing_it
->second
= item(e
.version
, eversion_t(), e
.is_delete());
4871 missing_it
->second
.clean_regions
.mark_fully_dirty();
4873 // create new element in missing map
4875 missing
[e
.soid
] = item(e
.version
, eversion_t(), e
.is_delete());
4876 missing
[e
.soid
].clean_regions
.mark_fully_dirty();
4878 } else if (is_missing_divergent_item
) {
4879 // already missing (prior).
4880 rmissing
.erase((missing_it
->second
).need
.version
);
4881 missing_it
->second
.need
= e
.version
; // leave .have unchanged.
4882 missing_it
->second
.set_delete(e
.is_delete());
4883 if (e
.is_lost_revert())
4884 missing_it
->second
.clean_regions
.mark_fully_dirty();
4886 missing_it
->second
.clean_regions
.merge(e
.clean_regions
);
4888 // not missing, we must have prior_version (if any)
4889 ceph_assert(!is_missing_divergent_item
);
4890 missing
[e
.soid
] = item(e
.version
, e
.prior_version
, e
.is_delete());
4891 if (e
.is_lost_revert())
4892 missing
[e
.soid
].clean_regions
.mark_fully_dirty();
4894 missing
[e
.soid
].clean_regions
= e
.clean_regions
;
4896 rmissing
[e
.version
.version
] = e
.soid
;
4897 tracker
.changed(e
.soid
);
4900 void revise_need(hobject_t oid
, eversion_t need
, bool is_delete
) {
4901 auto p
= missing
.find(oid
);
4902 if (p
!= missing
.end()) {
4903 rmissing
.erase((p
->second
).need
.version
);
4904 p
->second
.need
= need
; // do not adjust .have
4905 p
->second
.set_delete(is_delete
);
4906 p
->second
.clean_regions
.mark_fully_dirty();
4908 missing
[oid
] = item(need
, eversion_t(), is_delete
);
4909 missing
[oid
].clean_regions
.mark_fully_dirty();
4911 rmissing
[need
.version
] = oid
;
4913 tracker
.changed(oid
);
4916 void revise_have(hobject_t oid
, eversion_t have
) {
4917 auto p
= missing
.find(oid
);
4918 if (p
!= missing
.end()) {
4919 tracker
.changed(oid
);
4920 (p
->second
).have
= have
;
4924 void mark_fully_dirty(const hobject_t
& oid
) {
4925 auto p
= missing
.find(oid
);
4926 if (p
!= missing
.end()) {
4927 tracker
.changed(oid
);
4928 (p
->second
).clean_regions
.mark_fully_dirty();
4932 void add(const hobject_t
& oid
, eversion_t need
, eversion_t have
,
4934 missing
[oid
] = item(need
, have
, is_delete
, true);
4935 rmissing
[need
.version
] = oid
;
4936 tracker
.changed(oid
);
4939 void add(const hobject_t
& oid
, pg_missing_item
&& item
) {
4940 rmissing
[item
.need
.version
] = oid
;
4941 missing
.insert({oid
, std::move(item
)});
4942 tracker
.changed(oid
);
4945 void rm(const hobject_t
& oid
, eversion_t v
) {
4946 std::map
<hobject_t
, item
>::iterator p
= missing
.find(oid
);
4947 if (p
!= missing
.end() && p
->second
.need
<= v
)
4951 void rm(std::map
<hobject_t
, item
>::const_iterator m
) {
4952 tracker
.changed(m
->first
);
4953 rmissing
.erase(m
->second
.need
.version
);
4957 void got(const hobject_t
& oid
, eversion_t v
) {
4958 std::map
<hobject_t
, item
>::iterator p
= missing
.find(oid
);
4959 ceph_assert(p
!= missing
.end());
4960 ceph_assert(p
->second
.need
<= v
|| p
->second
.is_delete());
4964 void got(std::map
<hobject_t
, item
>::const_iterator m
) {
4965 tracker
.changed(m
->first
);
4966 rmissing
.erase(m
->second
.need
.version
);
4972 unsigned split_bits
,
4973 pg_missing_set
*omissing
) {
4974 omissing
->may_include_deletes
= may_include_deletes
;
4975 unsigned mask
= ~((~0)<<split_bits
);
4976 for (std::map
<hobject_t
, item
>::iterator i
= missing
.begin();
4979 if ((i
->first
.get_hash() & mask
) == child_pgid
.m_seed
) {
4980 omissing
->add(i
->first
, i
->second
.need
, i
->second
.have
,
4981 i
->second
.is_delete());
4990 for (auto const &i
: missing
)
4991 tracker
.changed(i
.first
);
4996 void encode(ceph::buffer::list
&bl
, uint64_t features
) const {
4997 ENCODE_START(5, 2, bl
)
4998 encode(missing
, bl
, features
);
4999 encode(may_include_deletes
, bl
);
5002 void decode(ceph::buffer::list::const_iterator
&bl
, int64_t pool
= -1) {
5003 for (auto const &i
: missing
)
5004 tracker
.changed(i
.first
);
5005 DECODE_START_LEGACY_COMPAT_LEN(5, 2, 2, bl
);
5006 decode(missing
, bl
);
5007 if (struct_v
>= 4) {
5008 decode(may_include_deletes
, bl
);
5013 // Handle hobject_t upgrade
5014 std::map
<hobject_t
, item
> tmp
;
5015 for (std::map
<hobject_t
, item
>::iterator i
=
5019 if (!i
->first
.is_max() && i
->first
.pool
== -1) {
5020 hobject_t
to_insert(i
->first
);
5021 to_insert
.pool
= pool
;
5022 tmp
[to_insert
] = i
->second
;
5028 missing
.insert(tmp
.begin(), tmp
.end());
5031 for (std::map
<hobject_t
,item
>::iterator it
=
5033 it
!= missing
.end();
5035 rmissing
[it
->second
.need
.version
] = it
->first
;
5036 for (auto const &i
: missing
)
5037 tracker
.changed(i
.first
);
5039 void dump(ceph::Formatter
*f
) const {
5040 f
->open_array_section("missing");
5041 for (std::map
<hobject_t
,item
>::const_iterator p
=
5042 missing
.begin(); p
!= missing
.end(); ++p
) {
5043 f
->open_object_section("item");
5044 f
->dump_stream("object") << p
->first
;
5049 f
->dump_bool("may_include_deletes", may_include_deletes
);
5051 template <typename F
>
5052 void filter_objects(F
&&f
) {
5053 for (auto i
= missing
.begin(); i
!= missing
.end();) {
5061 static void generate_test_instances(std::list
<pg_missing_set
*>& o
) {
5062 o
.push_back(new pg_missing_set
);
5063 o
.back()->may_include_deletes
= true;
5064 o
.push_back(new pg_missing_set
);
5066 hobject_t(object_t("foo"), "foo", 123, 456, 0, ""),
5067 eversion_t(5, 6), eversion_t(5, 1), false);
5068 o
.back()->may_include_deletes
= true;
5069 o
.push_back(new pg_missing_set
);
5071 hobject_t(object_t("foo"), "foo", 123, 456, 0, ""),
5072 eversion_t(5, 6), eversion_t(5, 1), true);
5073 o
.back()->may_include_deletes
= true;
5075 template <typename F
>
5076 void get_changed(F
&&f
) const {
5077 tracker
.get_changed(f
);
5082 bool is_clean() const {
5083 return tracker
.is_clean();
5085 template <typename missing_t
>
5086 bool debug_verify_from_init(
5087 const missing_t
&init_missing
,
5088 std::ostream
*oss
) const {
5091 auto check_missing(init_missing
.get_items());
5092 tracker
.get_changed([&](const hobject_t
&hoid
) {
5093 check_missing
.erase(hoid
);
5094 if (missing
.count(hoid
)) {
5095 check_missing
.insert(*(missing
.find(hoid
)));
5099 if (check_missing
.size() != missing
.size()) {
5101 *oss
<< "Size mismatch, check: " << check_missing
.size()
5102 << ", actual: " << missing
.size() << "\n";
5106 for (auto &i
: missing
) {
5107 if (!check_missing
.count(i
.first
)) {
5109 *oss
<< "check_missing missing " << i
.first
<< "\n";
5111 } else if (check_missing
[i
.first
] != i
.second
) {
5113 *oss
<< "check_missing missing item mismatch on " << i
.first
5114 << ", check: " << check_missing
[i
.first
]
5115 << ", actual: " << i
.second
<< "\n";
5120 *oss
<< "check_missing: " << check_missing
<< "\n";
5121 std::set
<hobject_t
> changed
;
5122 tracker
.get_changed([&](const hobject_t
&hoid
) { changed
.insert(hoid
); });
5123 *oss
<< "changed: " << changed
<< "\n";
5128 template <bool TrackChanges
>
5130 const pg_missing_set
<TrackChanges
> &c
, ceph::buffer::list
&bl
, uint64_t features
=0) {
5132 c
.encode(bl
, features
);
5133 ENCODE_DUMP_POST(cl
);
5135 template <bool TrackChanges
>
5136 void decode(pg_missing_set
<TrackChanges
> &c
, ceph::buffer::list::const_iterator
&p
) {
5139 template <bool TrackChanges
>
5140 std::ostream
& operator<<(std::ostream
& out
, const pg_missing_set
<TrackChanges
> &missing
)
5142 out
<< "missing(" << missing
.num_missing()
5143 << " may_include_deletes = " << missing
.may_include_deletes
;
5144 //if (missing.num_lost()) out << ", " << missing.num_lost() << " lost";
5149 using pg_missing_t
= pg_missing_set
<false>;
5150 using pg_missing_tracker_t
= pg_missing_set
<true>;
5156 * pg list objects response format
5160 template<typename T
>
5161 struct pg_nls_response_template
{
5162 collection_list_handle_t handle
;
5163 std::vector
<T
> entries
;
5165 void encode(ceph::buffer::list
& bl
) const {
5166 ENCODE_START(1, 1, bl
);
5168 __u32 n
= (__u32
)entries
.size();
5170 for (auto i
= entries
.begin(); i
!= entries
.end(); ++i
) {
5171 encode(i
->nspace
, bl
);
5173 encode(i
->locator
, bl
);
5177 void decode(ceph::buffer::list::const_iterator
& bl
) {
5178 DECODE_START(1, bl
);
5185 decode(i
.nspace
, bl
);
5187 decode(i
.locator
, bl
);
5188 entries
.push_back(i
);
5192 void dump(ceph::Formatter
*f
) const {
5193 f
->dump_stream("handle") << handle
;
5194 f
->open_array_section("entries");
5195 for (auto p
= entries
.begin(); p
!= entries
.end(); ++p
) {
5196 f
->open_object_section("object");
5197 f
->dump_string("namespace", p
->nspace
);
5198 f
->dump_string("object", p
->oid
);
5199 f
->dump_string("key", p
->locator
);
5204 static void generate_test_instances(std::list
<pg_nls_response_template
<T
>*>& o
) {
5205 o
.push_back(new pg_nls_response_template
<T
>);
5206 o
.push_back(new pg_nls_response_template
<T
>);
5207 o
.back()->handle
= hobject_t(object_t("hi"), "key", 1, 2, -1, "");
5208 o
.back()->entries
.push_back(librados::ListObjectImpl("", "one", ""));
5209 o
.back()->entries
.push_back(librados::ListObjectImpl("", "two", "twokey"));
5210 o
.back()->entries
.push_back(librados::ListObjectImpl("", "three", ""));
5211 o
.push_back(new pg_nls_response_template
<T
>);
5212 o
.back()->handle
= hobject_t(object_t("hi"), "key", 3, 4, -1, "");
5213 o
.back()->entries
.push_back(librados::ListObjectImpl("n1", "n1one", ""));
5214 o
.back()->entries
.push_back(librados::ListObjectImpl("n1", "n1two", "n1twokey"));
5215 o
.back()->entries
.push_back(librados::ListObjectImpl("n1", "n1three", ""));
5216 o
.push_back(new pg_nls_response_template
<T
>);
5217 o
.back()->handle
= hobject_t(object_t("hi"), "key", 5, 6, -1, "");
5218 o
.back()->entries
.push_back(librados::ListObjectImpl("", "one", ""));
5219 o
.back()->entries
.push_back(librados::ListObjectImpl("", "two", "twokey"));
5220 o
.back()->entries
.push_back(librados::ListObjectImpl("", "three", ""));
5221 o
.back()->entries
.push_back(librados::ListObjectImpl("n1", "n1one", ""));
5222 o
.back()->entries
.push_back(librados::ListObjectImpl("n1", "n1two", "n1twokey"));
5223 o
.back()->entries
.push_back(librados::ListObjectImpl("n1", "n1three", ""));
5227 using pg_nls_response_t
= pg_nls_response_template
<librados::ListObjectImpl
>;
5229 WRITE_CLASS_ENCODER(pg_nls_response_t
)
5231 // For backwards compatibility with older OSD requests
5232 struct pg_ls_response_t
{
5233 collection_list_handle_t handle
;
5234 std::list
<std::pair
<object_t
, std::string
> > entries
;
5236 void encode(ceph::buffer::list
& bl
) const {
5241 encode(entries
, bl
);
5243 void decode(ceph::buffer::list::const_iterator
& bl
) {
5247 ceph_assert(v
== 1);
5249 decode(entries
, bl
);
5251 void dump(ceph::Formatter
*f
) const {
5252 f
->dump_stream("handle") << handle
;
5253 f
->open_array_section("entries");
5254 for (std::list
<std::pair
<object_t
, std::string
> >::const_iterator p
= entries
.begin(); p
!= entries
.end(); ++p
) {
5255 f
->open_object_section("object");
5256 f
->dump_stream("object") << p
->first
;
5257 f
->dump_string("key", p
->second
);
5262 static void generate_test_instances(std::list
<pg_ls_response_t
*>& o
) {
5263 o
.push_back(new pg_ls_response_t
);
5264 o
.push_back(new pg_ls_response_t
);
5265 o
.back()->handle
= hobject_t(object_t("hi"), "key", 1, 2, -1, "");
5266 o
.back()->entries
.push_back(std::make_pair(object_t("one"), std::string()));
5267 o
.back()->entries
.push_back(std::make_pair(object_t("two"), std::string("twokey")));
5271 WRITE_CLASS_ENCODER(pg_ls_response_t
)
5274 * object_copy_cursor_t
5276 struct object_copy_cursor_t
{
5277 uint64_t data_offset
;
5278 std::string omap_offset
;
5283 object_copy_cursor_t()
5285 attr_complete(false),
5286 data_complete(false),
5287 omap_complete(false)
5290 bool is_initial() const {
5291 return !attr_complete
&& data_offset
== 0 && omap_offset
.empty();
5293 bool is_complete() const {
5294 return attr_complete
&& data_complete
&& omap_complete
;
5297 static void generate_test_instances(std::list
<object_copy_cursor_t
*>& o
);
5298 void encode(ceph::buffer::list
& bl
) const;
5299 void decode(ceph::buffer::list::const_iterator
&bl
);
5300 void dump(ceph::Formatter
*f
) const;
5302 WRITE_CLASS_ENCODER(object_copy_cursor_t
)
5305 * object_copy_data_t
5307 * Return data from a copy request. The semantics are a little strange
5308 * as a result of the encoding's heritage.
5310 * In particular, the sender unconditionally fills in the cursor (from what
5311 * it receives and sends), the size, and the mtime, but is responsible for
5312 * figuring out whether it should put any data in the attrs, data, or
5313 * omap members (corresponding to xattrs, object data, and the omap entries)
5314 * based on external data (the client includes a max amount to return with
5315 * the copy request). The client then looks into the attrs, data, and/or omap
5316 * based on the contents of the cursor.
5318 struct object_copy_data_t
{
5320 FLAG_DATA_DIGEST
= 1<<0,
5321 FLAG_OMAP_DIGEST
= 1<<1,
5323 object_copy_cursor_t cursor
;
5326 uint32_t data_digest
, omap_digest
;
5328 std::map
<std::string
, ceph::buffer::list
, std::less
<>> attrs
;
5329 ceph::buffer::list data
;
5330 ceph::buffer::list omap_header
;
5331 ceph::buffer::list omap_data
;
5333 /// which snaps we are defined for (if a snap and not the head)
5334 std::vector
<snapid_t
> snaps
;
5335 /// latest snap seq for the object (if head)
5338 /// recent reqids on this object
5339 mempool::osd_pglog::vector
<std::pair
<osd_reqid_t
, version_t
> > reqids
;
5341 /// map reqids by index to error return code (if any)
5342 mempool::osd_pglog::map
<uint32_t, int> reqid_return_codes
;
5344 uint64_t truncate_seq
;
5345 uint64_t truncate_size
;
5348 object_copy_data_t() :
5349 size((uint64_t)-1), data_digest(-1),
5350 omap_digest(-1), flags(0),
5354 static void generate_test_instances(std::list
<object_copy_data_t
*>& o
);
5355 void encode(ceph::buffer::list
& bl
, uint64_t features
) const;
5356 void decode(ceph::buffer::list::const_iterator
& bl
);
5357 void dump(ceph::Formatter
*f
) const;
5359 WRITE_CLASS_ENCODER_FEATURES(object_copy_data_t
)
5364 struct pg_create_t
{
5365 epoch_t created
; // epoch pg created
5366 pg_t parent
; // split from parent (if != pg_t())
5370 : created(0), split_bits(0) {}
5371 pg_create_t(unsigned c
, pg_t p
, int s
)
5372 : created(c
), parent(p
), split_bits(s
) {}
5374 void encode(ceph::buffer::list
&bl
) const;
5375 void decode(ceph::buffer::list::const_iterator
&bl
);
5376 void dump(ceph::Formatter
*f
) const;
5377 static void generate_test_instances(std::list
<pg_create_t
*>& o
);
5379 WRITE_CLASS_ENCODER(pg_create_t
)
5381 // -----------------------------------------
5383 class ObjectExtent
{
5385 * ObjectExtents are used for specifying IO behavior against RADOS
5386 * objects when one is using the ObjectCacher.
5388 * To use this in a real system, *every member* must be filled
5389 * out correctly. In particular, make sure to initialize the
5390 * oloc correctly, as its default values are deliberate poison
5391 * and will cause internal ObjectCacher asserts.
5393 * Similarly, your buffer_extents vector *must* specify a total
5394 * size equal to your length. If the buffer_extents inadvertently
5395 * contain less space than the length member specifies, you
5396 * will get unintelligible asserts deep in the ObjectCacher.
5398 * If you are trying to do testing and don't care about actual
5399 * RADOS function, the simplest thing to do is to initialize
5400 * the ObjectExtent (truncate_size can be 0), create a single entry
5401 * in buffer_extents matching the length, and set oloc.pool to 0.
5404 object_t oid
; // object id
5406 uint64_t offset
; // in object
5407 uint64_t length
; // in object
5408 uint64_t truncate_size
; // in object
5410 object_locator_t oloc
; // object locator (pool etc)
5412 std::vector
<std::pair
<uint64_t,uint64_t> > buffer_extents
; // off -> len. extents in buffer being mapped (may be fragmented bc of striping!)
5414 ObjectExtent() : objectno(0), offset(0), length(0), truncate_size(0) {}
5415 ObjectExtent(object_t o
, uint64_t ono
, uint64_t off
, uint64_t l
, uint64_t ts
) :
5416 oid(o
), objectno(ono
), offset(off
), length(l
), truncate_size(ts
) { }
5419 inline std::ostream
& operator<<(std::ostream
& out
, const ObjectExtent
&ex
)
5421 return out
<< "extent("
5422 << ex
.oid
<< " (" << ex
.objectno
<< ") in " << ex
.oloc
5423 << " " << ex
.offset
<< "~" << ex
.length
5424 << " -> " << ex
.buffer_extents
5429 // ---------------------------------------
5431 class OSDSuperblock
{
5433 uuid_d cluster_fsid
, osd_fsid
;
5434 int32_t whoami
= -1; // my role in this fs.
5435 epoch_t current_epoch
= 0; // most recent epoch
5436 epoch_t oldest_map
= 0, newest_map
= 0; // oldest/newest maps we have.
5437 double weight
= 0.0;
5439 CompatSet compat_features
;
5441 // last interval over which i mounted and was then active
5442 epoch_t mounted
= 0; // last epoch i mounted
5443 epoch_t clean_thru
= 0; // epoch i was active and clean thru
5445 epoch_t purged_snaps_last
= 0;
5446 utime_t last_purged_snaps_scrub
;
5448 epoch_t cluster_osdmap_trim_lower_bound
= 0;
5450 void encode(ceph::buffer::list
&bl
) const;
5451 void decode(ceph::buffer::list::const_iterator
&bl
);
5452 void dump(ceph::Formatter
*f
) const;
5453 static void generate_test_instances(std::list
<OSDSuperblock
*>& o
);
5455 WRITE_CLASS_ENCODER(OSDSuperblock
)
5457 inline std::ostream
& operator<<(std::ostream
& out
, const OSDSuperblock
& sb
)
5459 return out
<< "sb(" << sb
.cluster_fsid
5460 << " osd." << sb
.whoami
5461 << " " << sb
.osd_fsid
5462 << " e" << sb
.current_epoch
5463 << " [" << sb
.oldest_map
<< "," << sb
.newest_map
<< "]"
5464 << " lci=[" << sb
.mounted
<< "," << sb
.clean_thru
<< "]"
5465 << " tlb=" << sb
.cluster_osdmap_trim_lower_bound
5478 * attached to object head. describes most recent snap context, and
5479 * set of existing clones.
5483 // NOTE: this is for pre-octopus compatibility only! remove in Q release
5484 std::vector
<snapid_t
> snaps
; // descending
5485 std::vector
<snapid_t
> clones
; // ascending
5486 std::map
<snapid_t
, interval_set
<uint64_t> > clone_overlap
; // overlap w/ next newest
5487 std::map
<snapid_t
, uint64_t> clone_size
;
5488 std::map
<snapid_t
, std::vector
<snapid_t
>> clone_snaps
; // descending
5490 SnapSet() : seq(0) {}
5491 explicit SnapSet(ceph::buffer::list
& bl
) {
5492 auto p
= std::cbegin(bl
);
5496 /// populate SnapSet from a librados::snap_set_t
5497 void from_snap_set(const librados::snap_set_t
& ss
, bool legacy
);
5499 /// get space accounted to clone
5500 uint64_t get_clone_bytes(snapid_t clone
) const;
5502 void encode(ceph::buffer::list
& bl
) const;
5503 void decode(ceph::buffer::list::const_iterator
& bl
);
5504 void dump(ceph::Formatter
*f
) const;
5505 static void generate_test_instances(std::list
<SnapSet
*>& o
);
5507 SnapContext
get_ssc_as_of(snapid_t as_of
) const {
5510 for (auto p
= clone_snaps
.rbegin();
5511 p
!= clone_snaps
.rend();
5513 for (auto snap
: p
->second
) {
5514 if (snap
<= as_of
) {
5515 out
.snaps
.push_back(snap
);
5523 SnapSet
get_filtered(const pg_pool_t
&pinfo
) const;
5524 void filter(const pg_pool_t
&pinfo
);
5526 WRITE_CLASS_ENCODER(SnapSet
)
5528 std::ostream
& operator<<(std::ostream
& out
, const SnapSet
& cs
);
5533 #define SS_ATTR "snapset"
5535 struct watch_info_t
{
5537 uint32_t timeout_seconds
;
5540 watch_info_t() : cookie(0), timeout_seconds(0) { }
5541 watch_info_t(uint64_t c
, uint32_t t
, const entity_addr_t
& a
) : cookie(c
), timeout_seconds(t
), addr(a
) {}
5543 void encode(ceph::buffer::list
& bl
, uint64_t features
) const;
5544 void decode(ceph::buffer::list::const_iterator
& bl
);
5545 void dump(ceph::Formatter
*f
) const;
5546 static void generate_test_instances(std::list
<watch_info_t
*>& o
);
5548 WRITE_CLASS_ENCODER_FEATURES(watch_info_t
)
5550 static inline bool operator==(const watch_info_t
& l
, const watch_info_t
& r
) {
5551 return l
.cookie
== r
.cookie
&& l
.timeout_seconds
== r
.timeout_seconds
5552 && l
.addr
== r
.addr
;
5555 static inline std::ostream
& operator<<(std::ostream
& out
, const watch_info_t
& w
) {
5556 return out
<< "watch(cookie " << w
.cookie
<< " " << w
.timeout_seconds
<< "s"
5557 << " " << w
.addr
<< ")";
5560 struct notify_info_t
{
5564 ceph::buffer::list bl
;
5567 static inline std::ostream
& operator<<(std::ostream
& out
, const notify_info_t
& n
) {
5568 return out
<< "notify(cookie " << n
.cookie
5569 << " notify" << n
.notify_id
5570 << " " << n
.timeout
<< "s)";
5573 class object_ref_delta_t
{
5574 std::map
<hobject_t
, int> ref_delta
;
5577 object_ref_delta_t() = default;
5578 object_ref_delta_t(const object_ref_delta_t
&) = default;
5579 object_ref_delta_t(object_ref_delta_t
&&) = default;
5581 object_ref_delta_t(decltype(ref_delta
) &&ref_delta
)
5582 : ref_delta(std::move(ref_delta
)) {}
5583 object_ref_delta_t(const decltype(ref_delta
) &ref_delta
)
5584 : ref_delta(ref_delta
) {}
5586 object_ref_delta_t
&operator=(const object_ref_delta_t
&) = default;
5587 object_ref_delta_t
&operator=(object_ref_delta_t
&&) = default;
5589 void dec_ref(const hobject_t
&hoid
, unsigned num
=1) {
5590 mut_ref(hoid
, -num
);
5592 void inc_ref(const hobject_t
&hoid
, unsigned num
=1) {
5595 void mut_ref(const hobject_t
&hoid
, int num
) {
5596 [[maybe_unused
]] auto [iter
, _
] = ref_delta
.try_emplace(hoid
, 0);
5597 iter
->second
+= num
;
5598 if (iter
->second
== 0)
5599 ref_delta
.erase(iter
);
5602 auto begin() const { return ref_delta
.begin(); }
5603 auto end() const { return ref_delta
.end(); }
5604 auto find(hobject_t
&key
) const { return ref_delta
.find(key
); }
5606 bool operator==(const object_ref_delta_t
&rhs
) const {
5607 return ref_delta
== rhs
.ref_delta
;
5609 bool operator!=(const object_ref_delta_t
&rhs
) const {
5610 return !(*this == rhs
);
5613 return ref_delta
.empty();
5616 return ref_delta
.size();
5618 friend std::ostream
& operator<<(std::ostream
& out
, const object_ref_delta_t
& ci
);
5621 struct chunk_info_t
{
5625 FLAG_HAS_REFERENCE
= 4,
5626 FLAG_HAS_FINGERPRINT
= 8,
5631 cflag_t flags
; // FLAG_*
5633 chunk_info_t() : offset(0), length(0), flags((cflag_t
)0) { }
5634 chunk_info_t(uint32_t offset
, uint32_t length
, hobject_t oid
) :
5635 offset(offset
), length(length
), oid(oid
), flags((cflag_t
)0) { }
5637 static std::string
get_flag_string(uint64_t flags
) {
5639 if (flags
& FLAG_DIRTY
) {
5642 if (flags
& FLAG_MISSING
) {
5645 if (flags
& FLAG_HAS_REFERENCE
) {
5646 r
+= "|has_reference";
5648 if (flags
& FLAG_HAS_FINGERPRINT
) {
5649 r
+= "|has_fingerprint";
5655 bool test_flag(cflag_t f
) const {
5656 return (flags
& f
) == f
;
5658 void set_flag(cflag_t f
) {
5659 flags
= (cflag_t
)(flags
| f
);
5661 void set_flags(cflag_t f
) {
5664 void clear_flag(cflag_t f
) {
5665 flags
= (cflag_t
)(flags
& ~f
);
5667 void clear_flags() {
5670 bool is_dirty() const {
5671 return test_flag(FLAG_DIRTY
);
5673 bool is_missing() const {
5674 return test_flag(FLAG_MISSING
);
5676 bool has_reference() const {
5677 return test_flag(FLAG_HAS_REFERENCE
);
5679 bool has_fingerprint() const {
5680 return test_flag(FLAG_HAS_FINGERPRINT
);
5682 void encode(ceph::buffer::list
&bl
) const;
5683 void decode(ceph::buffer::list::const_iterator
&bl
);
5684 void dump(ceph::Formatter
*f
) const;
5685 friend std::ostream
& operator<<(std::ostream
& out
, const chunk_info_t
& ci
);
5686 bool operator==(const chunk_info_t
& cit
) const;
5687 bool operator!=(const chunk_info_t
& cit
) const {
5688 return !(cit
== *this);
5691 WRITE_CLASS_ENCODER(chunk_info_t
)
5692 std::ostream
& operator<<(std::ostream
& out
, const chunk_info_t
& ci
);
5694 struct object_info_t
;
5695 struct object_manifest_t
{
5701 uint8_t type
; // redirect, chunked, ...
5702 hobject_t redirect_target
;
5703 std::map
<uint64_t, chunk_info_t
> chunk_map
;
5705 object_manifest_t() : type(0) { }
5706 object_manifest_t(uint8_t type
, const hobject_t
& redirect_target
)
5707 : type(type
), redirect_target(redirect_target
) { }
5709 bool is_empty() const {
5710 return type
== TYPE_NONE
;
5712 bool is_redirect() const {
5713 return type
== TYPE_REDIRECT
;
5715 bool is_chunked() const {
5716 return type
== TYPE_CHUNKED
;
5718 static std::string_view
get_type_name(uint8_t m
) {
5720 case TYPE_NONE
: return "none";
5721 case TYPE_REDIRECT
: return "redirect";
5722 case TYPE_CHUNKED
: return "chunked";
5723 default: return "unknown";
5726 std::string_view
get_type_name() const {
5727 return get_type_name(type
);
5731 redirect_target
= hobject_t();
5736 * calc_refs_to_inc_on_set
5738 * Takes a manifest and returns the set of refs to
5739 * increment upon set-chunk
5741 * l should be nullptr if there are no clones, or
5742 * l and g may each be null if the corresponding clone does not exist.
5743 * *this contains the set of new references to set
5746 void calc_refs_to_inc_on_set(
5747 const object_manifest_t
* g
, ///< [in] manifest for clone > *this
5748 const object_manifest_t
* l
, ///< [in] manifest for clone < *this
5749 object_ref_delta_t
&delta
///< [out] set of refs to drop
5753 * calc_refs_to_drop_on_modify
5755 * Takes a manifest and returns the set of refs to
5756 * drop upon modification
5758 * l should be nullptr if there are no clones, or
5759 * l may be null if the corresponding clone does not exist.
5762 void calc_refs_to_drop_on_modify(
5763 const object_manifest_t
* l
, ///< [in] manifest for previous clone
5764 const ObjectCleanRegions
& clean_regions
, ///< [in] clean regions
5765 object_ref_delta_t
&delta
///< [out] set of refs to drop
5769 * calc_refs_to_drop_on_removal
5771 * Takes the two adjacent manifests and returns the set of refs to
5772 * drop upon removal of the clone containing *this.
5774 * g should be nullptr if *this is on HEAD, l should be nullptr if
5775 * *this is on the oldest clone (or head if there are no clones).
5777 void calc_refs_to_drop_on_removal(
5778 const object_manifest_t
* g
, ///< [in] manifest for clone > *this
5779 const object_manifest_t
* l
, ///< [in] manifest for clone < *this
5780 object_ref_delta_t
&delta
///< [out] set of refs to drop
5783 static void generate_test_instances(std::list
<object_manifest_t
*>& o
);
5784 void encode(ceph::buffer::list
&bl
) const;
5785 void decode(ceph::buffer::list::const_iterator
&bl
);
5786 void dump(ceph::Formatter
*f
) const;
5787 friend std::ostream
& operator<<(std::ostream
& out
, const object_info_t
& oi
);
5789 WRITE_CLASS_ENCODER(object_manifest_t
)
5790 std::ostream
& operator<<(std::ostream
& out
, const object_manifest_t
& oi
);
5792 struct object_info_t
{
5794 eversion_t version
, prior_version
;
5795 version_t user_version
;
5796 osd_reqid_t last_reqid
;
5800 utime_t local_mtime
; // local mtime
5802 // note: these are currently encoded into a total 16 bits; see
5803 // encode()/decode() for the weirdness.
5806 FLAG_WHITEOUT
= 1<<1, // object logically does not exist
5807 FLAG_DIRTY
= 1<<2, // object has been modified since last flushed or undirtied
5808 FLAG_OMAP
= 1<<3, // has (or may have) some/any omap data
5809 FLAG_DATA_DIGEST
= 1<<4, // has data crc
5810 FLAG_OMAP_DIGEST
= 1<<5, // has omap crc
5811 FLAG_CACHE_PIN
= 1<<6, // pin the object in cache tier
5812 FLAG_MANIFEST
= 1<<7, // has manifest
5813 FLAG_USES_TMAP
= 1<<8, // deprecated; no longer used
5814 FLAG_REDIRECT_HAS_REFERENCE
= 1<<9, // has reference
5819 static std::string
get_flag_string(flag_t flags
) {
5821 std::vector
<std::string
> sv
= get_flag_vector(flags
);
5822 for (auto ss
: sv
) {
5823 s
+= std::string("|") + ss
;
5829 static std::vector
<std::string
> get_flag_vector(flag_t flags
) {
5830 std::vector
<std::string
> sv
;
5831 if (flags
& FLAG_LOST
)
5832 sv
.insert(sv
.end(), "lost");
5833 if (flags
& FLAG_WHITEOUT
)
5834 sv
.insert(sv
.end(), "whiteout");
5835 if (flags
& FLAG_DIRTY
)
5836 sv
.insert(sv
.end(), "dirty");
5837 if (flags
& FLAG_USES_TMAP
)
5838 sv
.insert(sv
.end(), "uses_tmap");
5839 if (flags
& FLAG_OMAP
)
5840 sv
.insert(sv
.end(), "omap");
5841 if (flags
& FLAG_DATA_DIGEST
)
5842 sv
.insert(sv
.end(), "data_digest");
5843 if (flags
& FLAG_OMAP_DIGEST
)
5844 sv
.insert(sv
.end(), "omap_digest");
5845 if (flags
& FLAG_CACHE_PIN
)
5846 sv
.insert(sv
.end(), "cache_pin");
5847 if (flags
& FLAG_MANIFEST
)
5848 sv
.insert(sv
.end(), "manifest");
5849 if (flags
& FLAG_REDIRECT_HAS_REFERENCE
)
5850 sv
.insert(sv
.end(), "redirect_has_reference");
5853 std::string
get_flag_string() const {
5854 return get_flag_string(flags
);
5857 uint64_t truncate_seq
, truncate_size
;
5859 std::map
<std::pair
<uint64_t, entity_name_t
>, watch_info_t
> watchers
;
5861 // opportunistic checksums; may or may not be present
5862 __u32 data_digest
; ///< data crc32c
5863 __u32 omap_digest
; ///< omap crc32c
5865 // alloc hint attribute
5866 uint64_t expected_object_size
, expected_write_size
;
5867 uint32_t alloc_hint_flags
;
5869 struct object_manifest_t manifest
;
5871 void copy_user_bits(const object_info_t
& other
);
5873 bool test_flag(flag_t f
) const {
5874 return (flags
& f
) == f
;
5876 void set_flag(flag_t f
) {
5877 flags
= (flag_t
)(flags
| f
);
5879 void clear_flag(flag_t f
) {
5880 flags
= (flag_t
)(flags
& ~f
);
5882 bool is_lost() const {
5883 return test_flag(FLAG_LOST
);
5885 bool is_whiteout() const {
5886 return test_flag(FLAG_WHITEOUT
);
5888 bool is_dirty() const {
5889 return test_flag(FLAG_DIRTY
);
5891 bool is_omap() const {
5892 return test_flag(FLAG_OMAP
);
5894 bool is_data_digest() const {
5895 return test_flag(FLAG_DATA_DIGEST
);
5897 bool is_omap_digest() const {
5898 return test_flag(FLAG_OMAP_DIGEST
);
5900 bool is_cache_pinned() const {
5901 return test_flag(FLAG_CACHE_PIN
);
5903 bool has_manifest() const {
5904 return test_flag(FLAG_MANIFEST
);
5906 void set_data_digest(__u32 d
) {
5907 set_flag(FLAG_DATA_DIGEST
);
5910 void set_omap_digest(__u32 d
) {
5911 set_flag(FLAG_OMAP_DIGEST
);
5914 void clear_data_digest() {
5915 clear_flag(FLAG_DATA_DIGEST
);
5918 void clear_omap_digest() {
5919 clear_flag(FLAG_OMAP_DIGEST
);
5923 clear_data_digest();
5924 clear_omap_digest();
5927 void encode(ceph::buffer::list
& bl
, uint64_t features
) const;
5928 void decode(ceph::buffer::list::const_iterator
& bl
);
5929 void decode(const ceph::buffer::list
& bl
) {
5930 auto p
= std::cbegin(bl
);
5934 void encode_no_oid(ceph::buffer::list
& bl
, uint64_t features
) {
5935 // TODO: drop soid field and remove the denc no_oid methods
5936 auto tmp_oid
= hobject_t(hobject_t::get_max());
5938 encode(bl
, features
);
5941 void decode_no_oid(ceph::buffer::list::const_iterator
& bl
) {
5943 ceph_assert(soid
.is_max());
5945 void decode_no_oid(const ceph::buffer::list
& bl
) {
5946 auto p
= std::cbegin(bl
);
5949 void decode_no_oid(const ceph::buffer::list
& bl
, const hobject_t
& _soid
) {
5950 auto p
= std::cbegin(bl
);
5955 void dump(ceph::Formatter
*f
) const;
5956 static void generate_test_instances(std::list
<object_info_t
*>& o
);
5958 explicit object_info_t()
5959 : user_version(0), size(0), flags((flag_t
)0),
5960 truncate_seq(0), truncate_size(0),
5961 data_digest(-1), omap_digest(-1),
5962 expected_object_size(0), expected_write_size(0),
5966 explicit object_info_t(const hobject_t
& s
)
5968 user_version(0), size(0), flags((flag_t
)0),
5969 truncate_seq(0), truncate_size(0),
5970 data_digest(-1), omap_digest(-1),
5971 expected_object_size(0), expected_write_size(0),
5975 explicit object_info_t(const ceph::buffer::list
& bl
) {
5979 explicit object_info_t(const ceph::buffer::list
& bl
, const hobject_t
& _soid
) {
5984 WRITE_CLASS_ENCODER_FEATURES(object_info_t
)
5986 std::ostream
& operator<<(std::ostream
& out
, const object_info_t
& oi
);
5991 struct ObjectRecoveryInfo
{
5996 SnapSet ss
; // only populated if soid is_snap()
5997 interval_set
<uint64_t> copy_subset
;
5998 std::map
<hobject_t
, interval_set
<uint64_t>> clone_subset
;
6001 ObjectRecoveryInfo() : size(0), object_exist(true) { }
6003 static void generate_test_instances(std::list
<ObjectRecoveryInfo
*>& o
);
6004 void encode(ceph::buffer::list
&bl
, uint64_t features
) const;
6005 void decode(ceph::buffer::list::const_iterator
&bl
, int64_t pool
= -1);
6006 std::ostream
&print(std::ostream
&out
) const;
6007 void dump(ceph::Formatter
*f
) const;
6009 WRITE_CLASS_ENCODER_FEATURES(ObjectRecoveryInfo
)
6010 std::ostream
& operator<<(std::ostream
& out
, const ObjectRecoveryInfo
&inf
);
6012 struct ObjectRecoveryProgress
{
6013 uint64_t data_recovered_to
;
6014 std::string omap_recovered_to
;
6020 ObjectRecoveryProgress()
6021 : data_recovered_to(0),
6023 data_complete(false), omap_complete(false) { }
6025 bool is_complete(const ObjectRecoveryInfo
& info
) const {
6026 return (data_recovered_to
>= (
6027 info
.copy_subset
.empty() ?
6028 0 : info
.copy_subset
.range_end())) &&
6032 uint64_t estimate_remaining_data_to_recover(const ObjectRecoveryInfo
& info
) const {
6033 // Overestimates in case of clones, but avoids traversing copy_subset
6034 return info
.size
- data_recovered_to
;
6037 static void generate_test_instances(std::list
<ObjectRecoveryProgress
*>& o
);
6038 void encode(ceph::buffer::list
&bl
) const;
6039 void decode(ceph::buffer::list::const_iterator
&bl
);
6040 std::ostream
&print(std::ostream
&out
) const;
6041 void dump(ceph::Formatter
*f
) const;
6043 WRITE_CLASS_ENCODER(ObjectRecoveryProgress
)
6044 std::ostream
& operator<<(std::ostream
& out
, const ObjectRecoveryProgress
&prog
);
6046 struct PushReplyOp
{
6049 static void generate_test_instances(std::list
<PushReplyOp
*>& o
);
6050 void encode(ceph::buffer::list
&bl
) const;
6051 void decode(ceph::buffer::list::const_iterator
&bl
);
6052 std::ostream
&print(std::ostream
&out
) const;
6053 void dump(ceph::Formatter
*f
) const;
6055 uint64_t cost(CephContext
*cct
) const;
6057 WRITE_CLASS_ENCODER(PushReplyOp
)
6058 std::ostream
& operator<<(std::ostream
& out
, const PushReplyOp
&op
);
6063 ObjectRecoveryInfo recovery_info
;
6064 ObjectRecoveryProgress recovery_progress
;
6066 static void generate_test_instances(std::list
<PullOp
*>& o
);
6067 void encode(ceph::buffer::list
&bl
, uint64_t features
) const;
6068 void decode(ceph::buffer::list::const_iterator
&bl
);
6069 std::ostream
&print(std::ostream
&out
) const;
6070 void dump(ceph::Formatter
*f
) const;
6072 uint64_t cost(CephContext
*cct
) const;
6074 WRITE_CLASS_ENCODER_FEATURES(PullOp
)
6075 std::ostream
& operator<<(std::ostream
& out
, const PullOp
&op
);
6080 ceph::buffer::list data
;
6081 interval_set
<uint64_t> data_included
;
6082 ceph::buffer::list omap_header
;
6083 std::map
<std::string
, ceph::buffer::list
> omap_entries
;
6084 std::map
<std::string
, ceph::buffer::list
, std::less
<>> attrset
;
6086 ObjectRecoveryInfo recovery_info
;
6087 ObjectRecoveryProgress before_progress
;
6088 ObjectRecoveryProgress after_progress
;
6090 static void generate_test_instances(std::list
<PushOp
*>& o
);
6091 void encode(ceph::buffer::list
&bl
, uint64_t features
) const;
6092 void decode(ceph::buffer::list::const_iterator
&bl
);
6093 std::ostream
&print(std::ostream
&out
) const;
6094 void dump(ceph::Formatter
*f
) const;
6096 uint64_t cost(CephContext
*cct
) const;
6098 WRITE_CLASS_ENCODER_FEATURES(PushOp
)
6099 std::ostream
& operator<<(std::ostream
& out
, const PushOp
&op
);
6102 * summarize pg contents for purposes of a scrub
6104 * If members are added to ScrubMap, make sure to modify swap().
6108 std::map
<std::string
, ceph::buffer::ptr
, std::less
<>> attrs
;
6110 __u32 omap_digest
; ///< omap crc32c
6111 __u32 digest
; ///< data crc32c
6113 bool digest_present
:1;
6114 bool omap_digest_present
:1;
6117 bool ec_hash_mismatch
:1;
6118 bool ec_size_mismatch
:1;
6119 bool large_omap_object_found
:1;
6120 uint64_t large_omap_object_key_count
= 0;
6121 uint64_t large_omap_object_value_size
= 0;
6122 uint64_t object_omap_bytes
= 0;
6123 uint64_t object_omap_keys
= 0;
6126 // Init invalid size so it won't match if we get a stat EIO error
6127 size(-1), omap_digest(0), digest(0),
6128 negative(false), digest_present(false), omap_digest_present(false),
6129 read_error(false), stat_error(false), ec_hash_mismatch(false),
6130 ec_size_mismatch(false), large_omap_object_found(false) {}
6132 void encode(ceph::buffer::list
& bl
) const;
6133 void decode(ceph::buffer::list::const_iterator
& bl
);
6134 void dump(ceph::Formatter
*f
) const;
6135 static void generate_test_instances(std::list
<object
*>& o
);
6137 WRITE_CLASS_ENCODER(object
)
6139 std::map
<hobject_t
,object
> objects
;
6140 eversion_t valid_through
;
6141 eversion_t incr_since
;
6142 bool has_large_omap_object_errors
{false};
6143 bool has_omap_keys
{false};
6145 void merge_incr(const ScrubMap
&l
);
6146 void clear_from(const hobject_t
& start
) {
6147 objects
.erase(objects
.lower_bound(start
), objects
.end());
6149 void insert(const ScrubMap
&r
) {
6150 objects
.insert(r
.objects
.begin(), r
.objects
.end());
6152 void swap(ScrubMap
&r
) {
6154 swap(objects
, r
.objects
);
6155 swap(valid_through
, r
.valid_through
);
6156 swap(incr_since
, r
.incr_since
);
6157 swap(has_large_omap_object_errors
, r
.has_large_omap_object_errors
);
6158 swap(has_omap_keys
, r
.has_omap_keys
);
6161 void encode(ceph::buffer::list
& bl
) const;
6162 void decode(ceph::buffer::list::const_iterator
& bl
, int64_t pool
=-1);
6163 void dump(ceph::Formatter
*f
) const;
6164 static void generate_test_instances(std::list
<ScrubMap
*>& o
);
6166 WRITE_CLASS_ENCODER(ScrubMap::object
)
6167 WRITE_CLASS_ENCODER(ScrubMap
)
6169 struct ScrubMapBuilder
{
6171 std::vector
<hobject_t
> ls
;
6173 int64_t data_pos
= 0;
6174 std::string omap_pos
;
6176 ceph::buffer::hash data_hash
, omap_hash
; ///< accumulatinng hash value
6177 uint64_t omap_keys
= 0;
6178 uint64_t omap_bytes
= 0;
6184 return pos
>= ls
.size();
6187 *this = ScrubMapBuilder();
6191 return data_pos
< 0;
6194 void next_object() {
6202 friend std::ostream
& operator<<(std::ostream
& out
, const ScrubMapBuilder
& pos
) {
6203 out
<< "(" << pos
.pos
<< "/" << pos
.ls
.size();
6204 if (pos
.pos
< pos
.ls
.size()) {
6205 out
<< " " << pos
.ls
[pos
.pos
];
6207 if (pos
.data_pos
< 0) {
6208 out
<< " byte " << pos
.data_pos
;
6210 if (!pos
.omap_pos
.empty()) {
6211 out
<< " key " << pos
.omap_pos
;
6217 out
<< " ret " << pos
.ret
;
6223 struct watch_item_t
{
6226 uint32_t timeout_seconds
;
6229 watch_item_t() : cookie(0), timeout_seconds(0) { }
6230 watch_item_t(entity_name_t name
, uint64_t cookie
, uint32_t timeout
,
6231 const entity_addr_t
& addr
)
6232 : name(name
), cookie(cookie
), timeout_seconds(timeout
),
6235 void encode(ceph::buffer::list
&bl
, uint64_t features
) const {
6236 ENCODE_START(2, 1, bl
);
6239 encode(timeout_seconds
, bl
);
6240 encode(addr
, bl
, features
);
6243 void decode(ceph::buffer::list::const_iterator
&bl
) {
6244 DECODE_START(2, bl
);
6247 decode(timeout_seconds
, bl
);
6248 if (struct_v
>= 2) {
6253 void dump(ceph::Formatter
*f
) const {
6254 f
->dump_stream("watcher") << name
;
6255 f
->dump_int("cookie", cookie
);
6256 f
->dump_int("timeout", timeout_seconds
);
6257 f
->open_object_section("addr");
6261 static void generate_test_instances(std::list
<watch_item_t
*>& o
) {
6263 ea
.set_type(entity_addr_t::TYPE_LEGACY
);
6265 ea
.set_family(AF_INET
);
6266 ea
.set_in4_quad(0, 127);
6267 ea
.set_in4_quad(1, 0);
6268 ea
.set_in4_quad(2, 0);
6269 ea
.set_in4_quad(3, 1);
6271 o
.push_back(new watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT
, 1), 10, 30, ea
));
6273 ea
.set_in4_quad(3, 2);
6275 o
.push_back(new watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT
, 2), 20, 60, ea
));
6278 WRITE_CLASS_ENCODER_FEATURES(watch_item_t
)
6280 struct obj_watch_item_t
{
6286 * obj list watch response format
6289 struct obj_list_watch_response_t
{
6290 std::list
<watch_item_t
> entries
;
6292 void encode(ceph::buffer::list
& bl
, uint64_t features
) const {
6293 ENCODE_START(1, 1, bl
);
6294 encode(entries
, bl
, features
);
6297 void decode(ceph::buffer::list::const_iterator
& bl
) {
6298 DECODE_START(1, bl
);
6299 decode(entries
, bl
);
6302 void dump(ceph::Formatter
*f
) const {
6303 f
->open_array_section("entries");
6304 for (std::list
<watch_item_t
>::const_iterator p
= entries
.begin(); p
!= entries
.end(); ++p
) {
6305 f
->open_object_section("watch");
6311 static void generate_test_instances(std::list
<obj_list_watch_response_t
*>& o
) {
6313 o
.push_back(new obj_list_watch_response_t
);
6314 o
.push_back(new obj_list_watch_response_t
);
6315 std::list
<watch_item_t
*> test_watchers
;
6316 watch_item_t::generate_test_instances(test_watchers
);
6317 for (auto &e
: test_watchers
) {
6318 o
.back()->entries
.push_back(*e
);
6323 WRITE_CLASS_ENCODER_FEATURES(obj_list_watch_response_t
)
6327 std::vector
<snapid_t
> snaps
; // ascending
6328 std::vector
< std::pair
<uint64_t,uint64_t> > overlap
;
6331 clone_info() : cloneid(CEPH_NOSNAP
), size(0) {}
6333 void encode(ceph::buffer::list
& bl
) const {
6334 ENCODE_START(1, 1, bl
);
6335 encode(cloneid
, bl
);
6337 encode(overlap
, bl
);
6341 void decode(ceph::buffer::list::const_iterator
& bl
) {
6342 DECODE_START(1, bl
);
6343 decode(cloneid
, bl
);
6345 decode(overlap
, bl
);
6349 void dump(ceph::Formatter
*f
) const {
6350 if (cloneid
== CEPH_NOSNAP
)
6351 f
->dump_string("cloneid", "HEAD");
6353 f
->dump_unsigned("cloneid", cloneid
.val
);
6354 f
->open_array_section("snapshots");
6355 for (std::vector
<snapid_t
>::const_iterator p
= snaps
.begin(); p
!= snaps
.end(); ++p
) {
6356 f
->open_object_section("snap");
6357 f
->dump_unsigned("id", p
->val
);
6361 f
->open_array_section("overlaps");
6362 for (std::vector
< std::pair
<uint64_t,uint64_t> >::const_iterator q
= overlap
.begin();
6363 q
!= overlap
.end(); ++q
) {
6364 f
->open_object_section("overlap");
6365 f
->dump_unsigned("offset", q
->first
);
6366 f
->dump_unsigned("length", q
->second
);
6370 f
->dump_unsigned("size", size
);
6372 static void generate_test_instances(std::list
<clone_info
*>& o
) {
6373 o
.push_back(new clone_info
);
6374 o
.push_back(new clone_info
);
6375 o
.back()->cloneid
= 1;
6376 o
.back()->snaps
.push_back(1);
6377 o
.back()->overlap
.push_back(std::pair
<uint64_t,uint64_t>(0,4096));
6378 o
.back()->overlap
.push_back(std::pair
<uint64_t,uint64_t>(8192,4096));
6379 o
.back()->size
= 16384;
6380 o
.push_back(new clone_info
);
6381 o
.back()->cloneid
= CEPH_NOSNAP
;
6382 o
.back()->size
= 32768;
6385 WRITE_CLASS_ENCODER(clone_info
)
6388 * obj list snaps response format
6391 struct obj_list_snap_response_t
{
6392 std::vector
<clone_info
> clones
; // ascending
6395 void encode(ceph::buffer::list
& bl
) const {
6396 ENCODE_START(2, 1, bl
);
6401 void decode(ceph::buffer::list::const_iterator
& bl
) {
6402 DECODE_START(2, bl
);
6410 void dump(ceph::Formatter
*f
) const {
6411 f
->open_array_section("clones");
6412 for (std::vector
<clone_info
>::const_iterator p
= clones
.begin(); p
!= clones
.end(); ++p
) {
6413 f
->open_object_section("clone");
6417 f
->dump_unsigned("seq", seq
);
6420 static void generate_test_instances(std::list
<obj_list_snap_response_t
*>& o
) {
6421 o
.push_back(new obj_list_snap_response_t
);
6422 o
.push_back(new obj_list_snap_response_t
);
6425 cl
.snaps
.push_back(1);
6426 cl
.overlap
.push_back(std::pair
<uint64_t,uint64_t>(0,4096));
6427 cl
.overlap
.push_back(std::pair
<uint64_t,uint64_t>(8192,4096));
6429 o
.back()->clones
.push_back(cl
);
6430 cl
.cloneid
= CEPH_NOSNAP
;
6434 o
.back()->clones
.push_back(cl
);
6435 o
.back()->seq
= 123;
6439 WRITE_CLASS_ENCODER(obj_list_snap_response_t
)
6443 struct PromoteCounter
{
6444 std::atomic
<unsigned long long> attempts
{0};
6445 std::atomic
<unsigned long long> objects
{0};
6446 std::atomic
<unsigned long long> bytes
{0};
6452 void finish(uint64_t size
) {
6457 void sample_and_attenuate(uint64_t *a
, uint64_t *o
, uint64_t *b
) {
6467 struct pool_pg_num_history_t
{
6468 /// last epoch updated
6470 /// poolid -> epoch -> pg_num
6471 std::map
<int64_t, std::map
<epoch_t
,uint32_t>> pg_nums
;
6472 /// pair(epoch, poolid)
6473 std::set
<std::pair
<epoch_t
,int64_t>> deleted_pools
;
6475 void log_pg_num_change(epoch_t epoch
, int64_t pool
, uint32_t pg_num
) {
6476 pg_nums
[pool
][epoch
] = pg_num
;
6478 void log_pool_delete(epoch_t epoch
, int64_t pool
) {
6479 deleted_pools
.insert(std::make_pair(epoch
, pool
));
6482 /// prune history based on oldest osdmap epoch in the cluster
6483 void prune(epoch_t oldest_epoch
) {
6484 auto i
= deleted_pools
.begin();
6485 while (i
!= deleted_pools
.end()) {
6486 if (i
->first
>= oldest_epoch
) {
6489 pg_nums
.erase(i
->second
);
6490 i
= deleted_pools
.erase(i
);
6492 for (auto& j
: pg_nums
) {
6493 auto k
= j
.second
.lower_bound(oldest_epoch
);
6494 // keep this and the entry before it (just to be paranoid)
6495 if (k
!= j
.second
.begin()) {
6497 j
.second
.erase(j
.second
.begin(), k
);
6502 void encode(ceph::buffer::list
& bl
) const {
6503 ENCODE_START(1, 1, bl
);
6505 encode(pg_nums
, bl
);
6506 encode(deleted_pools
, bl
);
6509 void decode(ceph::buffer::list::const_iterator
& p
) {
6513 decode(deleted_pools
, p
);
6516 void dump(ceph::Formatter
*f
) const {
6517 f
->dump_unsigned("epoch", epoch
);
6518 f
->open_object_section("pools");
6519 for (auto& i
: pg_nums
) {
6520 f
->open_object_section("pool");
6521 f
->dump_unsigned("pool_id", i
.first
);
6522 f
->open_array_section("changes");
6523 for (auto& j
: i
.second
) {
6524 f
->open_object_section("change");
6525 f
->dump_unsigned("epoch", j
.first
);
6526 f
->dump_unsigned("pg_num", j
.second
);
6533 f
->open_array_section("deleted_pools");
6534 for (auto& i
: deleted_pools
) {
6535 f
->open_object_section("deletion");
6536 f
->dump_unsigned("pool_id", i
.second
);
6537 f
->dump_unsigned("epoch", i
.first
);
6542 static void generate_test_instances(std::list
<pool_pg_num_history_t
*>& ls
) {
6543 ls
.push_back(new pool_pg_num_history_t
);
6545 friend std::ostream
& operator<<(std::ostream
& out
, const pool_pg_num_history_t
& h
) {
6546 return out
<< "pg_num_history(e" << h
.epoch
6547 << " pg_nums " << h
.pg_nums
6548 << " deleted_pools " << h
.deleted_pools
6552 WRITE_CLASS_ENCODER(pool_pg_num_history_t
)
6554 // prefix pgmeta_oid keys with _ so that PGLog::read_log_and_missing() can
6556 static const std::string_view infover_key
= "_infover";
6557 static const std::string_view info_key
= "_info";
6558 static const std::string_view biginfo_key
= "_biginfo";
6559 static const std::string_view epoch_key
= "_epoch";
6560 static const std::string_view fastinfo_key
= "_fastinfo";
6562 static const __u8 pg_latest_struct_v
= 10;
6563 // v10 is the new past_intervals encoding
6564 // v9 was fastinfo_key addition
6565 // v8 was the move to a per-pg pgmeta object
6566 // v7 was SnapMapper addition in 86658392516d5175b2756659ef7ffaaf95b0f8ad
6567 // (first appeared in cuttlefish).
6568 static const __u8 pg_compat_struct_v
= 10;
6570 int prepare_info_keymap(
6572 std::map
<std::string
,ceph::buffer::list
> *km
,
6573 std::string
*key_to_remove
,
6576 pg_info_t
&last_written_info
,
6577 PastIntervals
&past_intervals
,
6578 bool dirty_big_info
,
6581 PerfCounters
*logger
= nullptr,
6582 DoutPrefixProvider
*dpp
= nullptr);
6584 namespace ceph::os
{
6588 void create_pg_collection(
6589 ceph::os::Transaction
& t
, spg_t pgid
, int bits
);
6591 void init_pg_ondisk(
6592 ceph::os::Transaction
& t
, spg_t pgid
, const pg_pool_t
*pool
);
6594 // filter for pg listings
6601 virtual ~PGLSFilter();
6602 virtual bool filter(const hobject_t
&obj
,
6603 const ceph::buffer::list
& xattr_data
) const = 0;
6606 * Arguments passed from the RADOS client. Implementations must
6607 * handle any encoding errors, and return an appropriate error code,
6608 * or 0 on valid input.
6610 virtual int init(ceph::buffer::list::const_iterator
¶ms
) = 0;
6613 * xattr key, or empty string. If non-empty, this xattr will be fetched
6614 * and the value passed into ::filter
6616 virtual const std::string
& get_xattr() const { return xattr
; }
6619 * If true, objects without the named xattr (if xattr name is not empty)
6620 * will be rejected without calling ::filter
6622 virtual bool reject_empty_xattr() const { return true; }
6625 class PGLSPlainFilter
: public PGLSFilter
{
6628 int init(ceph::buffer::list::const_iterator
¶ms
) override
;
6629 ~PGLSPlainFilter() override
{}
6630 bool filter(const hobject_t
& obj
,
6631 const ceph::buffer::list
& xattr_data
) const override
;
6634 // alias name for this structure:
6635 using missing_map_t
= std::map
<hobject_t
,
6636 std::pair
<std::optional
<uint32_t>,
6637 std::optional
<uint32_t>>>;