1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
9 * Author: Loic Dachary <loic@dachary.org>
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
18 #ifndef CEPH_OSD_TYPES_H
19 #define CEPH_OSD_TYPES_H
25 #include <string_view>
27 #include <boost/scoped_ptr.hpp>
28 #include <boost/optional/optional_io.hpp>
29 #include <boost/variant.hpp>
30 #include <boost/smart_ptr/local_shared_ptr.hpp>
32 #include "include/rados/rados_types.hpp"
33 #include "include/mempool.h"
35 #include "msg/msg_types.h"
36 #include "include/compat.h"
37 #include "include/types.h"
38 #include "include/utime.h"
39 #include "include/CompatSet.h"
40 #include "common/ceph_context.h"
41 #include "common/histogram.h"
42 #include "include/interval_set.h"
43 #include "include/inline_memory.h"
44 #include "common/Formatter.h"
45 #include "common/bloom_filter.hpp"
46 #include "common/hobject.h"
47 #include "common/snap_types.h"
50 #include "include/cmp.h"
51 #include "librados/ListObjectImpl.h"
52 #include "compressor/Compressor.h"
53 #include "osd_perf_counters.h"
55 #define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v026"
57 #define CEPH_OSD_FEATURE_INCOMPAT_BASE CompatSet::Feature(1, "initial feature set(~v.18)")
58 #define CEPH_OSD_FEATURE_INCOMPAT_PGINFO CompatSet::Feature(2, "pginfo object")
59 #define CEPH_OSD_FEATURE_INCOMPAT_OLOC CompatSet::Feature(3, "object locator")
60 #define CEPH_OSD_FEATURE_INCOMPAT_LEC CompatSet::Feature(4, "last_epoch_clean")
61 #define CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES CompatSet::Feature(5, "categories")
62 #define CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL CompatSet::Feature(6, "hobjectpool")
63 #define CEPH_OSD_FEATURE_INCOMPAT_BIGINFO CompatSet::Feature(7, "biginfo")
64 #define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO CompatSet::Feature(8, "leveldbinfo")
65 #define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG CompatSet::Feature(9, "leveldblog")
66 #define CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER CompatSet::Feature(10, "snapmapper")
67 #define CEPH_OSD_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(11, "sharded objects")
68 #define CEPH_OSD_FEATURE_INCOMPAT_HINTS CompatSet::Feature(12, "transaction hints")
69 #define CEPH_OSD_FEATURE_INCOMPAT_PGMETA CompatSet::Feature(13, "pg meta object")
70 #define CEPH_OSD_FEATURE_INCOMPAT_MISSING CompatSet::Feature(14, "explicit missing set")
71 #define CEPH_OSD_FEATURE_INCOMPAT_FASTINFO CompatSet::Feature(15, "fastinfo pg attr")
72 #define CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES CompatSet::Feature(16, "deletes in missing set")
73 #define CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2 CompatSet::Feature(17, "new snapmapper key structure")
76 /// pool priority range set by user
77 #define OSD_POOL_PRIORITY_MAX 10
78 #define OSD_POOL_PRIORITY_MIN -OSD_POOL_PRIORITY_MAX
80 /// min recovery priority for MBackfillReserve
81 #define OSD_RECOVERY_PRIORITY_MIN 0
83 /// base backfill priority for MBackfillReserve
84 #define OSD_BACKFILL_PRIORITY_BASE 100
86 /// base backfill priority for MBackfillReserve (degraded PG)
87 #define OSD_BACKFILL_DEGRADED_PRIORITY_BASE 140
89 /// base recovery priority for MBackfillReserve
90 #define OSD_RECOVERY_PRIORITY_BASE 180
92 /// base backfill priority for MBackfillReserve (inactive PG)
93 #define OSD_BACKFILL_INACTIVE_PRIORITY_BASE 220
95 /// base recovery priority for MRecoveryReserve (inactive PG)
96 #define OSD_RECOVERY_INACTIVE_PRIORITY_BASE 220
98 /// max manually/automatically set recovery priority for MBackfillReserve
99 #define OSD_RECOVERY_PRIORITY_MAX 253
101 /// backfill priority for MBackfillReserve, when forced manually
102 #define OSD_BACKFILL_PRIORITY_FORCED 254
104 /// recovery priority for MRecoveryReserve, when forced manually
105 #define OSD_RECOVERY_PRIORITY_FORCED 255
107 /// priority for pg deletion when osd is not fullish
108 #define OSD_DELETE_PRIORITY_NORMAL 179
110 /// priority for pg deletion when osd is approaching full
111 #define OSD_DELETE_PRIORITY_FULLISH 219
113 /// priority when more full
114 #define OSD_DELETE_PRIORITY_FULL 255
116 static std::map
<int, int> max_prio_map
= {
117 {OSD_BACKFILL_PRIORITY_BASE
, OSD_BACKFILL_DEGRADED_PRIORITY_BASE
- 1},
118 {OSD_BACKFILL_DEGRADED_PRIORITY_BASE
, OSD_RECOVERY_PRIORITY_BASE
- 1},
119 {OSD_RECOVERY_PRIORITY_BASE
, OSD_BACKFILL_INACTIVE_PRIORITY_BASE
- 1},
120 {OSD_RECOVERY_INACTIVE_PRIORITY_BASE
, OSD_RECOVERY_PRIORITY_MAX
},
121 {OSD_BACKFILL_INACTIVE_PRIORITY_BASE
, OSD_RECOVERY_PRIORITY_MAX
}
124 typedef hobject_t collection_list_handle_t
;
126 /// convert a single CEPH_OSD_FLAG_* to a std::string
127 const char *ceph_osd_flag_name(unsigned flag
);
128 /// convert a single CEPH_OSD_OP_FLAG_* to a std::string
129 const char *ceph_osd_op_flag_name(unsigned flag
);
131 /// convert CEPH_OSD_FLAG_* op flags to a std::string
132 std::string
ceph_osd_flag_string(unsigned flags
);
133 /// convert CEPH_OSD_OP_FLAG_* op flags to a std::string
134 std::string
ceph_osd_op_flag_string(unsigned flags
);
135 /// convert CEPH_OSD_ALLOC_HINT_FLAG_* op flags to a std::string
136 std::string
ceph_osd_alloc_hint_flag_string(unsigned flags
);
138 typedef std::map
<std::string
,std::string
> osd_alert_list_t
;
139 /// map osd id -> alert_list_t
140 typedef std::map
<int, osd_alert_list_t
> osd_alerts_t
;
141 void dump(ceph::Formatter
* f
, const osd_alerts_t
& alerts
);
144 typedef interval_set
<
146 mempool::osdmap::flat_map
> snap_interval_set_t
;
150 * osd request identifier
152 * caller name + incarnation# + tid to unique identify this request.
155 entity_name_t name
; // who
157 int32_t inc
; // incarnation
162 osd_reqid_t(const entity_name_t
& a
, int i
, ceph_tid_t t
)
163 : name(a
), tid(t
), inc(i
)
166 DENC(osd_reqid_t
, v
, p
) {
173 void dump(ceph::Formatter
*f
) const;
174 static void generate_test_instances(std::list
<osd_reqid_t
*>& o
);
176 WRITE_CLASS_DENC(osd_reqid_t
)
181 static const int32_t NO_OSD
= 0x7fffffff;
184 pg_shard_t() : osd(-1), shard(shard_id_t::NO_SHARD
) {}
185 explicit pg_shard_t(int osd
) : osd(osd
), shard(shard_id_t::NO_SHARD
) {}
186 pg_shard_t(int osd
, shard_id_t shard
) : osd(osd
), shard(shard
) {}
187 bool is_undefined() const {
190 std::string
get_osd() const { return (osd
== NO_OSD
? "NONE" : std::to_string(osd
)); }
191 void encode(ceph::buffer::list
&bl
) const;
192 void decode(ceph::buffer::list::const_iterator
&bl
);
193 void dump(ceph::Formatter
*f
) const {
194 f
->dump_unsigned("osd", osd
);
195 if (shard
!= shard_id_t::NO_SHARD
) {
196 f
->dump_unsigned("shard", shard
);
200 WRITE_CLASS_ENCODER(pg_shard_t
)
201 WRITE_EQ_OPERATORS_2(pg_shard_t
, osd
, shard
)
202 WRITE_CMP_OPERATORS_2(pg_shard_t
, osd
, shard
)
203 std::ostream
& operator<<(std::ostream
&lhs
, const pg_shard_t
&rhs
);
205 using HobjToShardSetMapping
= std::map
<hobject_t
, std::set
<pg_shard_t
>>;
207 class IsPGRecoverablePredicate
{
210 * have encodes the shards available
212 virtual bool operator()(const std::set
<pg_shard_t
> &have
) const = 0;
213 virtual ~IsPGRecoverablePredicate() {}
216 class IsPGReadablePredicate
{
219 * have encodes the shards available
221 virtual bool operator()(const std::set
<pg_shard_t
> &have
) const = 0;
222 virtual ~IsPGReadablePredicate() {}
225 inline std::ostream
& operator<<(std::ostream
& out
, const osd_reqid_t
& r
) {
226 return out
<< r
.name
<< "." << r
.inc
<< ":" << r
.tid
;
229 inline bool operator==(const osd_reqid_t
& l
, const osd_reqid_t
& r
) {
230 return (l
.name
== r
.name
) && (l
.inc
== r
.inc
) && (l
.tid
== r
.tid
);
232 inline bool operator!=(const osd_reqid_t
& l
, const osd_reqid_t
& r
) {
233 return (l
.name
!= r
.name
) || (l
.inc
!= r
.inc
) || (l
.tid
!= r
.tid
);
235 inline bool operator<(const osd_reqid_t
& l
, const osd_reqid_t
& r
) {
236 return (l
.name
< r
.name
) || (l
.inc
< r
.inc
) ||
237 (l
.name
== r
.name
&& l
.inc
== r
.inc
&& l
.tid
< r
.tid
);
239 inline bool operator<=(const osd_reqid_t
& l
, const osd_reqid_t
& r
) {
240 return (l
.name
< r
.name
) || (l
.inc
< r
.inc
) ||
241 (l
.name
== r
.name
&& l
.inc
== r
.inc
&& l
.tid
<= r
.tid
);
243 inline bool operator>(const osd_reqid_t
& l
, const osd_reqid_t
& r
) { return !(l
<= r
); }
244 inline bool operator>=(const osd_reqid_t
& l
, const osd_reqid_t
& r
) { return !(l
< r
); }
247 template<> struct hash
<osd_reqid_t
> {
248 size_t operator()(const osd_reqid_t
&r
) const {
249 static hash
<uint64_t> H
;
250 return H(r
.name
.num() ^ r
.tid
^ r
.inc
);
258 // a locator constrains the placement of an object. mainly, which pool
260 struct object_locator_t
{
261 // You specify either the hash or the key -- not both
262 std::int64_t pool
; ///< pool id
263 std::string key
; ///< key string (if non-empty)
264 std::string nspace
; ///< namespace
265 std::int64_t hash
; ///< hash position (if >= 0)
267 explicit object_locator_t()
268 : pool(-1), hash(-1) {}
269 explicit object_locator_t(int64_t po
)
270 : pool(po
), hash(-1) {}
271 explicit object_locator_t(int64_t po
, int64_t ps
)
272 : pool(po
), hash(ps
) {}
273 explicit object_locator_t(int64_t po
, std::string_view ns
)
274 : pool(po
), nspace(ns
), hash(-1) {}
275 explicit object_locator_t(int64_t po
, std::string_view ns
, int64_t ps
)
276 : pool(po
), nspace(ns
), hash(ps
) {}
277 explicit object_locator_t(int64_t po
, std::string_view ns
, std::string_view s
)
278 : pool(po
), key(s
), nspace(ns
), hash(-1) {}
279 explicit object_locator_t(const hobject_t
& soid
)
280 : pool(soid
.pool
), key(soid
.get_key()), nspace(soid
.nspace
), hash(-1) {}
282 int64_t get_pool() const {
297 void encode(ceph::buffer::list
& bl
) const;
298 void decode(ceph::buffer::list::const_iterator
& p
);
299 void dump(ceph::Formatter
*f
) const;
300 static void generate_test_instances(std::list
<object_locator_t
*>& o
);
302 WRITE_CLASS_ENCODER(object_locator_t
)
304 inline bool operator==(const object_locator_t
& l
, const object_locator_t
& r
) {
305 return l
.pool
== r
.pool
&& l
.key
== r
.key
&& l
.nspace
== r
.nspace
&& l
.hash
== r
.hash
;
307 inline bool operator!=(const object_locator_t
& l
, const object_locator_t
& r
) {
311 inline std::ostream
& operator<<(std::ostream
& out
, const object_locator_t
& loc
)
313 out
<< "@" << loc
.pool
;
314 if (loc
.nspace
.length())
315 out
<< ";" << loc
.nspace
;
316 if (loc
.key
.length())
317 out
<< ":" << loc
.key
;
321 struct request_redirect_t
{
323 object_locator_t redirect_locator
; ///< this is authoritative
324 std::string redirect_object
; ///< If non-empty, the request goes to this object name
326 friend std::ostream
& operator<<(std::ostream
& out
, const request_redirect_t
& redir
);
329 request_redirect_t() {}
330 explicit request_redirect_t(const object_locator_t
& orig
, int64_t rpool
) :
331 redirect_locator(orig
) { redirect_locator
.pool
= rpool
; }
332 explicit request_redirect_t(const object_locator_t
& rloc
) :
333 redirect_locator(rloc
) {}
334 explicit request_redirect_t(const object_locator_t
& orig
,
335 const std::string
& robj
) :
336 redirect_locator(orig
), redirect_object(robj
) {}
338 bool empty() const { return redirect_locator
.empty() &&
339 redirect_object
.empty(); }
341 void combine_with_locator(object_locator_t
& orig
, std::string
& obj
) const {
342 orig
= redirect_locator
;
343 if (!redirect_object
.empty())
344 obj
= redirect_object
;
347 void encode(ceph::buffer::list
& bl
) const;
348 void decode(ceph::buffer::list::const_iterator
& bl
);
349 void dump(ceph::Formatter
*f
) const;
350 static void generate_test_instances(std::list
<request_redirect_t
*>& o
);
352 WRITE_CLASS_ENCODER(request_redirect_t
)
354 inline std::ostream
& operator<<(std::ostream
& out
, const request_redirect_t
& redir
) {
355 out
<< "object " << redir
.redirect_object
<< ", locator{" << redir
.redirect_locator
<< "}";
359 // Internal OSD op flags - set by the OSD based on the op types
361 CEPH_OSD_RMW_FLAG_READ
= (1 << 1),
362 CEPH_OSD_RMW_FLAG_WRITE
= (1 << 2),
363 CEPH_OSD_RMW_FLAG_CLASS_READ
= (1 << 3),
364 CEPH_OSD_RMW_FLAG_CLASS_WRITE
= (1 << 4),
365 CEPH_OSD_RMW_FLAG_PGOP
= (1 << 5),
366 CEPH_OSD_RMW_FLAG_CACHE
= (1 << 6),
367 CEPH_OSD_RMW_FLAG_FORCE_PROMOTE
= (1 << 7),
368 CEPH_OSD_RMW_FLAG_SKIP_HANDLE_CACHE
= (1 << 8),
369 CEPH_OSD_RMW_FLAG_SKIP_PROMOTE
= (1 << 9),
370 CEPH_OSD_RMW_FLAG_RWORDERED
= (1 << 10),
371 CEPH_OSD_RMW_FLAG_RETURNVEC
= (1 << 11),
377 #define OSD_SUPERBLOCK_GOBJECT ghobject_t(hobject_t(sobject_t(object_t("osd_superblock"), 0)))
379 // placement seed (a hash value)
380 typedef uint32_t ps_t
;
382 // old (v1) pg_t encoding (wrap old struct ceph_pg)
385 void encode(ceph::buffer::list
& bl
) const {
386 ceph::encode_raw(v
, bl
);
388 void decode(ceph::buffer::list::const_iterator
& bl
) {
389 ceph::decode_raw(v
, bl
);
392 WRITE_CLASS_ENCODER(old_pg_t
)
394 // placement group id
399 pg_t() : m_pool(0), m_seed(0) {}
400 pg_t(ps_t seed
, uint64_t pool
) :
401 m_pool(pool
), m_seed(seed
) {}
402 // cppcheck-suppress noExplicitConstructor
403 pg_t(const ceph_pg
& cpg
) :
404 m_pool(cpg
.pool
), m_seed(cpg
.ps
) {}
406 // cppcheck-suppress noExplicitConstructor
407 pg_t(const old_pg_t
& opg
) {
411 old_pg_t
get_old_pg() const {
413 ceph_assert(m_pool
< 0xffffffffull
);
416 o
.v
.preferred
= (__s16
)-1;
423 int64_t pool() const {
427 static const uint8_t calc_name_buf_size
= 36; // max length for max values len("18446744073709551615.ffffffff") + future suffix len("_head") + '\0'
428 char *calc_name(char *buf
, const char *suffix_backwords
) const;
430 void set_ps(ps_t p
) {
433 void set_pool(uint64_t p
) {
437 pg_t
get_parent() const;
438 pg_t
get_ancestor(unsigned old_pg_num
) const;
440 int print(char *o
, int maxlen
) const;
441 bool parse(const char *s
);
443 bool is_split(unsigned old_pg_num
, unsigned new_pg_num
, std::set
<pg_t
> *pchildren
) const;
445 bool is_merge_source(unsigned old_pg_num
, unsigned new_pg_num
, pg_t
*parent
) const;
446 bool is_merge_target(unsigned old_pg_num
, unsigned new_pg_num
) const {
447 return ps() < new_pg_num
&& is_split(new_pg_num
, old_pg_num
, nullptr);
451 * Returns b such that for all object o:
452 * (~((~0)<<b) & o.hash) == 0 iff o is in the pg for *this
454 unsigned get_split_bits(unsigned pg_num
) const;
456 bool contains(int bits
, const ghobject_t
& oid
) const {
458 (int64_t)m_pool
== oid
.hobj
.get_logical_pool() &&
459 oid
.match(bits
, ps());
461 bool contains(int bits
, const hobject_t
& oid
) const {
463 (int64_t)m_pool
== oid
.get_logical_pool() &&
464 oid
.match(bits
, ps());
467 hobject_t
get_hobj_start() const;
468 hobject_t
get_hobj_end(unsigned pg_num
) const;
470 // strong ordering is supported
471 inline int compare(const pg_t
& p
) const noexcept
{
472 if (auto delta
= pool() - p
.pool(); delta
!= 0) {
474 } else if (ps() < p
.ps()) {
476 } else if (ps() > p
.ps()) {
483 void encode(ceph::buffer::list
& bl
) const {
489 encode((int32_t)-1, bl
); // was preferred
491 void decode(ceph::buffer::list::const_iterator
& bl
) {
497 bl
+= sizeof(int32_t); // was preferred
499 void decode_old(ceph::buffer::list::const_iterator
& bl
) {
505 void dump(ceph::Formatter
*f
) const;
506 static void generate_test_instances(std::list
<pg_t
*>& o
);
508 WRITE_CLASS_ENCODER(pg_t
)
510 inline bool operator<(const pg_t
& l
, const pg_t
& r
) {
511 return l
.compare(r
) < 0;
513 inline bool operator<=(const pg_t
& l
, const pg_t
& r
) {
514 return l
.compare(r
) <= 0;
516 inline bool operator==(const pg_t
& l
, const pg_t
& r
) {
517 return l
.compare(r
) == 0;
519 inline bool operator!=(const pg_t
& l
, const pg_t
& r
) {
520 return l
.compare(r
) != 0;
522 inline bool operator>(const pg_t
& l
, const pg_t
& r
) {
523 return l
.compare(r
) > 0;
525 inline bool operator>=(const pg_t
& l
, const pg_t
& r
) {
526 return l
.compare(r
) >= 0;
529 std::ostream
& operator<<(std::ostream
& out
, const pg_t
&pg
);
532 template<> struct hash
< pg_t
>
534 size_t operator()( const pg_t
& x
) const
536 static hash
<uint32_t> H
;
537 // xor (s32)-1 in there to preserve original m_preferred result (paranoia!)
538 return H((x
.pool() & 0xffffffff) ^ (x
.pool() >> 32) ^ x
.ps() ^ (int32_t)(-1));
546 spg_t() : shard(shard_id_t::NO_SHARD
) {}
547 spg_t(pg_t pgid
, shard_id_t shard
) : pgid(pgid
), shard(shard
) {}
548 explicit spg_t(pg_t pgid
) : pgid(pgid
), shard(shard_id_t::NO_SHARD
) {}
549 unsigned get_split_bits(unsigned pg_num
) const {
550 return pgid
.get_split_bits(pg_num
);
552 spg_t
get_parent() const {
553 return spg_t(pgid
.get_parent(), shard
);
558 uint64_t pool() const {
561 void reset_shard(shard_id_t s
) {
565 static const uint8_t calc_name_buf_size
= pg_t::calc_name_buf_size
+ 4; // 36 + len('s') + len("255");
566 char *calc_name(char *buf
, const char *suffix_backwords
) const;
568 bool parse(const char *s
);
569 bool parse(const std::string
& s
) {
570 return parse(s
.c_str());
573 spg_t
get_ancestor(unsigned old_pg_num
) const {
574 return spg_t(pgid
.get_ancestor(old_pg_num
), shard
);
577 bool is_split(unsigned old_pg_num
, unsigned new_pg_num
,
578 std::set
<spg_t
> *pchildren
) const {
579 std::set
<pg_t
> _children
;
580 std::set
<pg_t
> *children
= pchildren
? &_children
: NULL
;
581 bool is_split
= pgid
.is_split(old_pg_num
, new_pg_num
, children
);
582 if (pchildren
&& is_split
) {
583 for (std::set
<pg_t
>::iterator i
= _children
.begin();
584 i
!= _children
.end();
586 pchildren
->insert(spg_t(*i
, shard
));
591 bool is_merge_target(unsigned old_pg_num
, unsigned new_pg_num
) const {
592 return pgid
.is_merge_target(old_pg_num
, new_pg_num
);
594 bool is_merge_source(unsigned old_pg_num
, unsigned new_pg_num
,
595 spg_t
*parent
) const {
597 bool r
= pgid
.is_merge_source(old_pg_num
, new_pg_num
, &out
.pgid
);
604 bool is_no_shard() const {
605 return shard
== shard_id_t::NO_SHARD
;
608 ghobject_t
make_pgmeta_oid() const {
609 return ghobject_t::make_pgmeta(pgid
.pool(), pgid
.ps(), shard
);
612 void encode(ceph::buffer::list
&bl
) const {
613 ENCODE_START(1, 1, bl
);
618 void decode(ceph::buffer::list::const_iterator
& bl
) {
625 ghobject_t
make_temp_ghobject(const std::string
& name
) const {
627 hobject_t(object_t(name
), "", CEPH_NOSNAP
,
629 hobject_t::get_temp_pool(pgid
.pool()),
635 unsigned hash_to_shard(unsigned num_shards
) const {
636 return ps() % num_shards
;
639 WRITE_CLASS_ENCODER(spg_t
)
640 WRITE_EQ_OPERATORS_2(spg_t
, pgid
, shard
)
641 WRITE_CMP_OPERATORS_2(spg_t
, pgid
, shard
)
644 template<> struct hash
< spg_t
>
646 size_t operator()( const spg_t
& x
) const
648 static hash
<uint32_t> H
;
649 return H(hash
<pg_t
>()(x
.pgid
) ^ x
.shard
);
654 std::ostream
& operator<<(std::ostream
& out
, const spg_t
&pg
);
656 // ----------------------
659 enum type_t
: uint8_t {
661 TYPE_LEGACY_TEMP
= 1, /* no longer used */
667 uint64_t removal_seq
; // note: deprecated, not encoded
669 char _str_buff
[spg_t::calc_name_buf_size
];
674 coll_t(type_t t
, spg_t p
, uint64_t r
)
675 : type(t
), pgid(p
), removal_seq(r
) {
679 friend class denc_coll_t
;
681 coll_t() : type(TYPE_META
), removal_seq(0)
686 coll_t(const coll_t
& other
)
687 : type(other
.type
), pgid(other
.pgid
), removal_seq(other
.removal_seq
) {
691 explicit coll_t(spg_t pgid
)
692 : type(TYPE_PG
), pgid(pgid
), removal_seq(0)
697 coll_t
& operator=(const coll_t
& rhs
)
699 this->type
= rhs
.type
;
700 this->pgid
= rhs
.pgid
;
701 this->removal_seq
= rhs
.removal_seq
;
706 // named constructors
707 static coll_t
meta() {
710 static coll_t
pg(spg_t p
) {
714 const std::string
to_str() const {
715 return std::string(_str
);
717 const char *c_str() const {
721 bool parse(const std::string
& s
);
723 int operator<(const coll_t
&rhs
) const {
724 return type
< rhs
.type
||
725 (type
== rhs
.type
&& pgid
< rhs
.pgid
);
728 bool is_meta() const {
729 return type
== TYPE_META
;
731 bool is_pg_prefix(spg_t
*pgid_
) const {
732 if (type
== TYPE_PG
|| type
== TYPE_PG_TEMP
) {
739 return type
== TYPE_PG
;
741 bool is_pg(spg_t
*pgid_
) const {
742 if (type
== TYPE_PG
) {
748 bool is_temp() const {
749 return type
== TYPE_PG_TEMP
;
751 bool is_temp(spg_t
*pgid_
) const {
752 if (type
== TYPE_PG_TEMP
) {
758 int64_t pool() const {
762 void encode(ceph::buffer::list
& bl
) const;
763 void decode(ceph::buffer::list::const_iterator
& bl
);
764 size_t encoded_size() const;
766 inline bool operator==(const coll_t
& rhs
) const {
767 // only compare type if meta
768 if (type
!= rhs
.type
)
770 if (type
== TYPE_META
)
772 return type
== rhs
.type
&& pgid
== rhs
.pgid
;
774 inline bool operator!=(const coll_t
& rhs
) const {
775 return !(*this == rhs
);
778 // get a TEMP collection that corresponds to the current collection,
779 // which we presume is a pg collection.
780 coll_t
get_temp() const {
781 ceph_assert(type
== TYPE_PG
);
782 return coll_t(TYPE_PG_TEMP
, pgid
, 0);
785 ghobject_t
get_min_hobj() const {
789 o
.hobj
.pool
= pgid
.pool();
790 o
.set_shard(pgid
.shard
);
801 unsigned hash_to_shard(unsigned num_shards
) const {
803 return pgid
.hash_to_shard(num_shards
);
804 return 0; // whatever.
807 void dump(ceph::Formatter
*f
) const;
808 static void generate_test_instances(std::list
<coll_t
*>& o
);
811 WRITE_CLASS_ENCODER(coll_t
)
813 inline std::ostream
& operator<<(std::ostream
& out
, const coll_t
& c
) {
819 template<> struct hash
<coll_t
> {
820 size_t operator()(const coll_t
&c
) const {
822 std::string
str(c
.to_str());
823 std::string::const_iterator
end(str
.end());
824 for (std::string::const_iterator s
= str
.begin(); s
!= end
; ++s
) {
837 inline std::ostream
& operator<<(std::ostream
& out
, const ceph_object_layout
&ol
)
839 out
<< pg_t(ol
.ol_pgid
);
840 int su
= ol
.ol_stripe_unit
;
849 auto &get_type() const { return coll
.type
; }
850 auto &get_type() { return coll
.type
; }
851 auto &get_pgid() const { return coll
.pgid
; }
852 auto &get_pgid() { return coll
.pgid
; }
854 denc_coll_t() = default;
855 denc_coll_t(const denc_coll_t
&) = default;
856 denc_coll_t(denc_coll_t
&&) = default;
858 denc_coll_t
&operator=(const denc_coll_t
&) = default;
859 denc_coll_t
&operator=(denc_coll_t
&&) = default;
861 explicit denc_coll_t(const coll_t
&coll
) : coll(coll
) {}
862 operator coll_t() const {
866 bool operator<(const denc_coll_t
&rhs
) const {
867 return coll
< rhs
.coll
;
870 DENC(denc_coll_t
, v
, p
) {
872 denc(v
.get_type(), p
);
873 denc(v
.get_pgid().pgid
.m_pool
, p
);
874 denc(v
.get_pgid().pgid
.m_seed
, p
);
875 denc(v
.get_pgid().shard
.id
, p
);
879 WRITE_CLASS_DENC(denc_coll_t
)
882 // compound rados version type
883 /* WARNING: If add member in eversion_t, please make sure the encode/decode function
884 * work well. For little-endian machine, we should make sure there is no padding
885 * in 32-bit machine and 64-bit machine.
892 eversion_t() : version(0), epoch(0), __pad(0) {}
893 eversion_t(epoch_t e
, version_t v
) : version(v
), epoch(e
), __pad(0) {}
895 // cppcheck-suppress noExplicitConstructor
896 eversion_t(const ceph_eversion
& ce
) :
901 explicit eversion_t(ceph::buffer::list
& bl
) : __pad(0) { decode(bl
); }
903 static const eversion_t
& max() {
904 static const eversion_t
max(-1,-1);
908 operator ceph_eversion() {
915 std::string
get_key_name() const;
917 // key must point to the beginning of a block of 32 chars
918 inline void get_key_name(char* key
) const {
919 // Below is equivalent of sprintf("%010u.%020llu");
921 ritoa
<uint64_t, 10, 20>(version
, key
+ 31);
923 ritoa
<uint32_t, 10, 10>(epoch
, key
+ 10);
926 void encode(ceph::buffer::list
&bl
) const {
927 #if defined(CEPH_LITTLE_ENDIAN)
928 bl
.append((char *)this, sizeof(version_t
) + sizeof(epoch_t
));
935 void decode(ceph::buffer::list::const_iterator
&bl
) {
936 #if defined(CEPH_LITTLE_ENDIAN)
937 bl
.copy(sizeof(version_t
) + sizeof(epoch_t
), (char *)this);
944 void decode(ceph::buffer::list
& bl
) {
945 auto p
= std::cbegin(bl
);
949 WRITE_CLASS_ENCODER(eversion_t
)
951 inline bool operator==(const eversion_t
& l
, const eversion_t
& r
) {
952 return (l
.epoch
== r
.epoch
) && (l
.version
== r
.version
);
954 inline bool operator!=(const eversion_t
& l
, const eversion_t
& r
) {
955 return (l
.epoch
!= r
.epoch
) || (l
.version
!= r
.version
);
957 inline bool operator<(const eversion_t
& l
, const eversion_t
& r
) {
958 return (l
.epoch
== r
.epoch
) ? (l
.version
< r
.version
):(l
.epoch
< r
.epoch
);
960 inline bool operator<=(const eversion_t
& l
, const eversion_t
& r
) {
961 return (l
.epoch
== r
.epoch
) ? (l
.version
<= r
.version
):(l
.epoch
<= r
.epoch
);
963 inline bool operator>(const eversion_t
& l
, const eversion_t
& r
) {
964 return (l
.epoch
== r
.epoch
) ? (l
.version
> r
.version
):(l
.epoch
> r
.epoch
);
966 inline bool operator>=(const eversion_t
& l
, const eversion_t
& r
) {
967 return (l
.epoch
== r
.epoch
) ? (l
.version
>= r
.version
):(l
.epoch
>= r
.epoch
);
969 inline std::ostream
& operator<<(std::ostream
& out
, const eversion_t
& e
) {
970 return out
<< e
.epoch
<< "'" << e
.version
;
974 * objectstore_perf_stat_t
976 * current perf information about the osd
978 struct objectstore_perf_stat_t
{
979 // cur_op_latency is in ns since double add/sub are not associative
980 uint64_t os_commit_latency_ns
;
981 uint64_t os_apply_latency_ns
;
983 objectstore_perf_stat_t() :
984 os_commit_latency_ns(0), os_apply_latency_ns(0) {}
986 bool operator==(const objectstore_perf_stat_t
&r
) const {
987 return os_commit_latency_ns
== r
.os_commit_latency_ns
&&
988 os_apply_latency_ns
== r
.os_apply_latency_ns
;
991 void add(const objectstore_perf_stat_t
&o
) {
992 os_commit_latency_ns
+= o
.os_commit_latency_ns
;
993 os_apply_latency_ns
+= o
.os_apply_latency_ns
;
995 void sub(const objectstore_perf_stat_t
&o
) {
996 os_commit_latency_ns
-= o
.os_commit_latency_ns
;
997 os_apply_latency_ns
-= o
.os_apply_latency_ns
;
999 void dump(ceph::Formatter
*f
) const;
1000 void encode(ceph::buffer::list
&bl
, uint64_t features
) const;
1001 void decode(ceph::buffer::list::const_iterator
&bl
);
1002 static void generate_test_instances(std::list
<objectstore_perf_stat_t
*>& o
);
1004 WRITE_CLASS_ENCODER_FEATURES(objectstore_perf_stat_t
)
1009 #define PG_STATE_CREATING (1ULL << 0) // creating
1010 #define PG_STATE_ACTIVE (1ULL << 1) // i am active. (primary: replicas too)
1011 #define PG_STATE_CLEAN (1ULL << 2) // peers are complete, clean of stray replicas.
1012 #define PG_STATE_DOWN (1ULL << 4) // a needed replica is down, PG offline
1013 #define PG_STATE_RECOVERY_UNFOUND (1ULL << 5) // recovery stopped due to unfound
1014 #define PG_STATE_BACKFILL_UNFOUND (1ULL << 6) // backfill stopped due to unfound
1015 #define PG_STATE_PREMERGE (1ULL << 7)  // i am preparing to merge
1016 #define PG_STATE_SCRUBBING (1ULL << 8) // scrubbing
1017 //#define PG_STATE_SCRUBQ (1ULL << 9) // queued for scrub
1018 #define PG_STATE_DEGRADED (1ULL << 10) // pg contains objects with reduced redundancy
1019 #define PG_STATE_INCONSISTENT (1ULL << 11) // pg replicas are inconsistent (but shouldn't be)
1020 #define PG_STATE_PEERING (1ULL << 12) // pg is (re)peering
1021 #define PG_STATE_REPAIR (1ULL << 13) // pg should repair on next scrub
1022 #define PG_STATE_RECOVERING (1ULL << 14) // pg is recovering/migrating objects
1023 #define PG_STATE_BACKFILL_WAIT (1ULL << 15) // [active] reserving backfill
1024 #define PG_STATE_INCOMPLETE (1ULL << 16) // incomplete content, peering failed.
1025 #define PG_STATE_STALE (1ULL << 17) // our state for this pg is stale, unknown.
1026 #define PG_STATE_REMAPPED (1ULL << 18) // pg is explicitly remapped to different OSDs than CRUSH
1027 #define PG_STATE_DEEP_SCRUB (1ULL << 19) // deep scrub: check CRC32 on files
1028 #define PG_STATE_BACKFILLING (1ULL << 20) // [active] backfilling pg content
1029 #define PG_STATE_BACKFILL_TOOFULL (1ULL << 21) // backfill can't proceed: too full
1030 #define PG_STATE_RECOVERY_WAIT (1ULL << 22) // waiting for recovery reservations
1031 #define PG_STATE_UNDERSIZED (1ULL << 23) // pg acting < pool size
1032 #define PG_STATE_ACTIVATING (1ULL << 24) // pg is peered but not yet active
1033 #define PG_STATE_PEERED (1ULL << 25) // peered, cannot go active, can recover
1034 #define PG_STATE_SNAPTRIM (1ULL << 26) // trimming snaps
1035 #define PG_STATE_SNAPTRIM_WAIT (1ULL << 27) // queued to trim snaps
1036 #define PG_STATE_RECOVERY_TOOFULL (1ULL << 28) // recovery can't proceed: too full
1037 #define PG_STATE_SNAPTRIM_ERROR (1ULL << 29) // error stopped trimming snaps
1038 #define PG_STATE_FORCED_RECOVERY (1ULL << 30) // force recovery of this pg before any other
1039 #define PG_STATE_FORCED_BACKFILL (1ULL << 31) // force backfill of this pg before any other
1040 #define PG_STATE_FAILED_REPAIR (1ULL << 32) // A repair failed to fix all errors
1041 #define PG_STATE_LAGGY (1ULL << 33) // PG is laggy/unreachable due to slow/delayed pings
1042 #define PG_STATE_WAIT (1ULL << 34) // PG is waiting for prior intervals' readable period to expire
1044 std::string
pg_state_string(uint64_t state
);
1045 std::string
pg_vector_string(const std::vector
<int32_t> &a
);
1046 std::optional
<uint64_t> pg_string_state(const std::string
& state
);
1052 * attributes for a single pool snapshot.
1054 struct pool_snap_info_t
{
1059 void dump(ceph::Formatter
*f
) const;
1060 void encode(ceph::buffer::list
& bl
, uint64_t features
) const;
1061 void decode(ceph::buffer::list::const_iterator
& bl
);
1062 static void generate_test_instances(std::list
<pool_snap_info_t
*>& o
);
1064 WRITE_CLASS_ENCODER_FEATURES(pool_snap_info_t
)
1066 inline std::ostream
& operator<<(std::ostream
& out
, const pool_snap_info_t
& si
) {
1067 return out
<< si
.snapid
<< '(' << si
.name
<< ' ' << si
.stamp
<< ')';
1082 DEEP_SCRUB_INTERVAL
,
1084 RECOVERY_OP_PRIORITY
,
1087 COMPRESSION_ALGORITHM
,
1088 COMPRESSION_REQUIRED_RATIO
,
1089 COMPRESSION_MAX_BLOB_SIZE
,
1090 COMPRESSION_MIN_BLOB_SIZE
,
1094 FINGERPRINT_ALGORITHM
,
1095 PG_NUM_MIN
, // min pg_num
1096 PG_NUM_MAX
, // max pg_num
1097 TARGET_SIZE_BYTES
, // total bytes in pool
1098 TARGET_SIZE_RATIO
, // fraction of total cluster
1100 READ_LEASE_INTERVAL
,
1102 DEDUP_CHUNK_ALGORITHM
,
1103 DEDUP_CDC_CHUNK_SIZE
,
1116 opt_desc_t(key_t k
, type_t t
) : key(k
), type(t
) {}
1118 bool operator==(const opt_desc_t
& rhs
) const {
1119 return key
== rhs
.key
&& type
== rhs
.type
;
1123 typedef boost::variant
<std::string
,int64_t,double> value_t
;
1125 static bool is_opt_name(const std::string
& name
);
1126 static opt_desc_t
get_opt_desc(const std::string
& name
);
1128 pool_opts_t() : opts() {}
1130 bool is_set(key_t key
) const;
1132 template<typename T
>
1133 void set(key_t key
, const T
&val
) {
1134 value_t value
= val
;
1138 template<typename T
>
1139 bool get(key_t key
, T
*val
) const {
1140 opts_t::const_iterator i
= opts
.find(key
);
1141 if (i
== opts
.end()) {
1144 *val
= boost::get
<T
>(i
->second
);
1148 template<typename T
>
1149 T
value_or(key_t key
, T
&& default_value
) const {
1150 auto i
= opts
.find(key
);
1151 if (i
== opts
.end()) {
1152 return std::forward
<T
>(default_value
);
1154 return boost::get
<T
>(i
->second
);
1157 const value_t
& get(key_t key
) const;
1159 bool unset(key_t key
);
1161 void dump(const std::string
& name
, ceph::Formatter
*f
) const;
1163 void dump(ceph::Formatter
*f
) const;
1164 void encode(ceph::buffer::list
&bl
, uint64_t features
) const;
1165 void decode(ceph::buffer::list::const_iterator
&bl
);
1168 typedef std::map
<key_t
, value_t
> opts_t
;
1171 friend std::ostream
& operator<<(std::ostream
& out
, const pool_opts_t
& opts
);
1173 WRITE_CLASS_ENCODER_FEATURES(pool_opts_t
)
/**
 * Metadata recorded for the most recent PG merge on a pool
 * (populated by pg_pool_t::dec_pg_num()).
 *
 * NOTE(review): encode()/decode()/dump() below all reference
 * `source_pgid`, but its declaration (presumably `pg_t source_pgid;`)
 * is not visible in this excerpt.  The DECODE_START / ENCODE_FINISH /
 * DECODE_FINISH framing and the struct's closing brace also appear to
 * have been dropped.  Confirm all of these against the upstream
 * header.
 */
struct pg_merge_meta_t {
  epoch_t ready_epoch = 0;        ///< epoch at which the merge was ready
  epoch_t last_epoch_started = 0;
  epoch_t last_epoch_clean = 0;
  eversion_t source_version;      ///< version of the merge source
  eversion_t target_version;      ///< version of the merge target

  void encode(ceph::buffer::list& bl) const {
    ENCODE_START(1, 1, bl);
    encode(source_pgid, bl);
    encode(ready_epoch, bl);
    encode(last_epoch_started, bl);
    encode(last_epoch_clean, bl);
    encode(source_version, bl);
    encode(target_version, bl);

  void decode(ceph::buffer::list::const_iterator& p) {
    decode(source_pgid, p);
    decode(ready_epoch, p);
    decode(last_epoch_started, p);
    decode(last_epoch_clean, p);
    decode(source_version, p);
    decode(target_version, p);

  void dump(ceph::Formatter *f) const {
    f->dump_stream("source_pgid") << source_pgid;
    f->dump_unsigned("ready_epoch", ready_epoch);
    f->dump_unsigned("last_epoch_started", last_epoch_started);
    f->dump_unsigned("last_epoch_clean", last_epoch_clean);
    f->dump_stream("source_version") << source_version;
    f->dump_stream("target_version") << target_version;
WRITE_CLASS_ENCODER(pg_merge_meta_t)
// Well-known pool application names (presumably paired with the
// application_metadata member below — confirm against usage).
static const char *APPLICATION_NAME_CEPHFS;
static const char *APPLICATION_NAME_RBD;
static const char *APPLICATION_NAME_RGW;

// Pool data-placement type.
// NOTE(review): the enum header line is not visible in this excerpt.
TYPE_REPLICATED = 1,     // replication
//TYPE_RAID4 = 2,   // raid4 (never implemented)
TYPE_ERASURE = 3,      // erasure-coded

static constexpr uint32_t pg_CRUSH_ITEM_NONE = 0x7fffffff; /* can't import crush.h here */
1230 static std::string_view
get_type_name(int t
) {
1232 case TYPE_REPLICATED
: return "replicated";
1233 //case TYPE_RAID4: return "raid4";
1234 case TYPE_ERASURE
: return "erasure";
1235 default: return "???";
1238 std::string_view
get_type_name() const {
1239 return get_type_name(type
);
// Pool flag bits (see has_flag()/get_flag_name()).
// NOTE(review): the enum header line is not visible in this excerpt.
FLAG_HASHPSPOOL = 1<<0, // hash pg seed and pool together (instead of adding)
FLAG_FULL = 1<<1, // pool is full
FLAG_EC_OVERWRITES = 1<<2, // enables overwrites, once enabled, cannot be disabled
FLAG_INCOMPLETE_CLONES = 1<<3, // may have incomplete clones (bc we are/were an overlay)
FLAG_NODELETE = 1<<4, // pool can't be deleted
FLAG_NOPGCHANGE = 1<<5, // pool's pg and pgp num can't be changed
FLAG_NOSIZECHANGE = 1<<6, // pool's size and min size can't be changed
FLAG_WRITE_FADVISE_DONTNEED = 1<<7, // write mode with LIBRADOS_OP_FLAG_FADVISE_DONTNEED
FLAG_NOSCRUB = 1<<8, // block periodic scrub
FLAG_NODEEP_SCRUB = 1<<9, // block periodic deep-scrub
FLAG_FULL_QUOTA = 1<<10, // pool is currently running out of quota, will set FLAG_FULL too
FLAG_NEARFULL = 1<<11, // pool is nearfull
FLAG_BACKFILLFULL = 1<<12, // pool is backfillfull
FLAG_SELFMANAGED_SNAPS = 1<<13, // pool uses selfmanaged snaps
FLAG_POOL_SNAPS = 1<<14, // pool has pool snaps
FLAG_CREATING = 1<<15, // initial pool PGs are being created
FLAG_EIO = 1<<16, // return EIO for all client ops
FLAG_BULK = 1<<17, // pool is large
1263 static const char *get_flag_name(uint64_t f
) {
1265 case FLAG_HASHPSPOOL
: return "hashpspool";
1266 case FLAG_FULL
: return "full";
1267 case FLAG_EC_OVERWRITES
: return "ec_overwrites";
1268 case FLAG_INCOMPLETE_CLONES
: return "incomplete_clones";
1269 case FLAG_NODELETE
: return "nodelete";
1270 case FLAG_NOPGCHANGE
: return "nopgchange";
1271 case FLAG_NOSIZECHANGE
: return "nosizechange";
1272 case FLAG_WRITE_FADVISE_DONTNEED
: return "write_fadvise_dontneed";
1273 case FLAG_NOSCRUB
: return "noscrub";
1274 case FLAG_NODEEP_SCRUB
: return "nodeep-scrub";
1275 case FLAG_FULL_QUOTA
: return "full_quota";
1276 case FLAG_NEARFULL
: return "nearfull";
1277 case FLAG_BACKFILLFULL
: return "backfillfull";
1278 case FLAG_SELFMANAGED_SNAPS
: return "selfmanaged_snaps";
1279 case FLAG_POOL_SNAPS
: return "pool_snaps";
1280 case FLAG_CREATING
: return "creating";
1281 case FLAG_EIO
: return "eio";
1282 case FLAG_BULK
: return "bulk";
1283 default: return "???";
// Render a flag bitmask as a comma-separated list of flag names.
// NOTE(review): truncated excerpt — the declaration of the result
// string `s`, the comma separator insertion, the clearing of each
// handled bit, and the final return are not visible; confirm against
// the upstream header.
static std::string get_flags_string(uint64_t f) {
  for (unsigned n=0; f && n<64; ++n) {
    if (f & (1ull << n)) {
      s += get_flag_name(1ull << n);
1297 std::string
get_flags_string() const {
1298 return get_flags_string(flags
);
// Parse a single flag name back to its FLAG_* bit.
// NOTE(review): truncated excerpt — the "full" branch (between
// "hashpspool" and "ec_overwrites"), the trailing "eio"/"bulk"
// branches, and the final fallback return are not visible; confirm
// against the upstream header.
static uint64_t get_flag_by_name(const std::string& name) {
  if (name == "hashpspool")
    return FLAG_HASHPSPOOL;
  if (name == "ec_overwrites")
    return FLAG_EC_OVERWRITES;
  if (name == "incomplete_clones")
    return FLAG_INCOMPLETE_CLONES;
  if (name == "nodelete")
    return FLAG_NODELETE;
  if (name == "nopgchange")
    return FLAG_NOPGCHANGE;
  if (name == "nosizechange")
    return FLAG_NOSIZECHANGE;
  if (name == "write_fadvise_dontneed")
    return FLAG_WRITE_FADVISE_DONTNEED;
  if (name == "noscrub")
    return FLAG_NOSCRUB;
  if (name == "nodeep-scrub")
    return FLAG_NODEEP_SCRUB;
  if (name == "full_quota")
    return FLAG_FULL_QUOTA;
  if (name == "nearfull")
    return FLAG_NEARFULL;
  if (name == "backfillfull")
    return FLAG_BACKFILLFULL;
  if (name == "selfmanaged_snaps")
    return FLAG_SELFMANAGED_SNAPS;
  if (name == "pool_snaps")
    return FLAG_POOL_SNAPS;
  if (name == "creating")
    return FLAG_CREATING;
/// converts the acting/up vector to a set of pg shards
void convert_to_pg_shards(const std::vector<int> &from, std::set<pg_shard_t>* to) const;
// Cache tier behavior for this pool.
// NOTE(review): the enum definition's opening line (presumably a
// `typedef enum { ... } cache_mode_t;` wrapper) is not visible in this
// excerpt — only the enumerators survive.
CACHEMODE_NONE = 0,         ///< no caching
CACHEMODE_WRITEBACK = 1,    ///< write to cache, flush later
CACHEMODE_FORWARD = 2,      ///< forward if not in cache
CACHEMODE_READONLY = 3,     ///< handle reads, forward writes [not strongly consistent]
CACHEMODE_READFORWARD = 4,  ///< forward reads, write to cache flush later
CACHEMODE_READPROXY = 5,    ///< proxy reads, write to cache flush later
CACHEMODE_PROXY = 6,        ///< proxy if not in cache
1352 static const char *get_cache_mode_name(cache_mode_t m
) {
1354 case CACHEMODE_NONE
: return "none";
1355 case CACHEMODE_WRITEBACK
: return "writeback";
1356 case CACHEMODE_FORWARD
: return "forward";
1357 case CACHEMODE_READONLY
: return "readonly";
1358 case CACHEMODE_READFORWARD
: return "readforward";
1359 case CACHEMODE_READPROXY
: return "readproxy";
1360 case CACHEMODE_PROXY
: return "proxy";
1361 default: return "unknown";
// Parse a cache mode name; returns (cache_mode_t)-1 for unknown input.
// NOTE(review): truncated excerpt — the string comparisons guarding
// the "none", "forward" and "proxy" branches are not visible; confirm
// against the upstream header.
static cache_mode_t get_cache_mode_from_str(const std::string& s) {
    return CACHEMODE_NONE;
  if (s == "writeback")
    return CACHEMODE_WRITEBACK;
    return CACHEMODE_FORWARD;
  if (s == "readonly")
    return CACHEMODE_READONLY;
  if (s == "readforward")
    return CACHEMODE_READFORWARD;
  if (s == "readproxy")
    return CACHEMODE_READPROXY;
    return CACHEMODE_PROXY;
  return (cache_mode_t)-1;
1381 const char *get_cache_mode_name() const {
1382 return get_cache_mode_name(cache_mode
);
// Whether the configured cache mode needs a HitSet to operate.
// NOTE(review): truncated excerpt — the return statements after each
// case group (presumably `return false;` after the first, `return
// true;` after the second), the `default:` label, and the closing
// braces are not visible; confirm against the upstream header.
bool cache_mode_requires_hit_set() const {
  switch (cache_mode) {
  case CACHEMODE_NONE:
  case CACHEMODE_FORWARD:
  case CACHEMODE_READONLY:
  case CACHEMODE_PROXY:
  case CACHEMODE_WRITEBACK:
  case CACHEMODE_READFORWARD:
  case CACHEMODE_READPROXY:
    ceph_abort_msg("implement me");
// pg_num autoscaling policy for a pool.
// NOTE(review): truncated excerpt — the OFF/ON/WARN enumerators
// referenced below are not visible here, nor is the `switch` header of
// get_pg_autoscale_mode_name; confirm against the upstream header.
enum class pg_autoscale_mode_t : uint8_t {
  UNKNOWN = UINT8_MAX,

/// Human-readable name for an autoscale mode.
static const char *get_pg_autoscale_mode_name(pg_autoscale_mode_t m) {
  case pg_autoscale_mode_t::OFF: return "off";
  case pg_autoscale_mode_t::ON: return "on";
  case pg_autoscale_mode_t::WARN: return "warn";
  default: return "???";
// Parse an autoscale mode name; UNKNOWN for unrecognized input.
// NOTE(review): truncated excerpt — the string comparisons guarding
// the OFF/WARN/ON returns are not visible; confirm against the
// upstream header.
static pg_autoscale_mode_t get_pg_autoscale_mode_by_name(const std::string& m) {
    return pg_autoscale_mode_t::OFF;
    return pg_autoscale_mode_t::WARN;
    return pg_autoscale_mode_t::ON;
  return pg_autoscale_mode_t::UNKNOWN;
utime_t create_time;
uint64_t flags = 0;           ///< FLAG_*
__u8 type = 0;                ///< TYPE_*
__u8 size = 0, min_size = 0;  ///< number of osds in each pg
__u8 crush_rule = 0;          ///< crush placement rule
__u8 object_hash = 0;         ///< hash mapping object name to ps
pg_autoscale_mode_t pg_autoscale_mode = pg_autoscale_mode_t::UNKNOWN;

__u32 pg_num = 0, pgp_num = 0;  ///< number of pgs
__u32 pg_num_pending = 0;       ///< pg_num we are about to merge down to
__u32 pg_num_target = 0;        ///< pg_num we should converge toward
__u32 pgp_num_target = 0;       ///< pgp_num we should converge toward

std::map<std::string, std::string> properties;  ///< OBSOLETE
std::string erasure_code_profile;  ///< name of the erasure code profile in OSDMap
epoch_t last_change = 0;  ///< most recent epoch changed, excluding snapshot changes

// If non-zero, require OSDs in at least this many different instances...
uint32_t peering_crush_bucket_count = 0;
// of this bucket type...
uint32_t peering_crush_bucket_barrier = 0;
// including this one
int32_t peering_crush_mandatory_member = pg_CRUSH_ITEM_NONE;
// The per-bucket replica count is calculated with this "target"
// instead of the above crush_bucket_count. This means we can maintain a
// target size of 4 without attempting to place them all in 1 DC
uint32_t peering_crush_bucket_target = 0;
/// last epoch that forced clients to resend
epoch_t last_force_op_resend = 0;
/// last epoch that forced clients to resend (pre-nautilus clients only)
epoch_t last_force_op_resend_prenautilus = 0;
/// last epoch that forced clients to resend (pre-luminous clients only)
epoch_t last_force_op_resend_preluminous = 0;

/// metadata for the most recent PG merge
pg_merge_meta_t last_pg_merge_meta;

snapid_t snap_seq = 0;   ///< seq for per-pool snapshot
epoch_t snap_epoch = 0;  ///< osdmap epoch of last snap
uint64_t auid = 0;       ///< who owns the pg

uint64_t quota_max_bytes = 0;    ///< maximum number of bytes for this pool
uint64_t quota_max_objects = 0;  ///< maximum number of objects for this pool

/*
 * Pool snaps (global to this pool). These define a SnapContext for
 * the pool, unless the client manually specifies an alternate
 * context.
 */
std::map<snapid_t, pool_snap_info_t> snaps;
/*
 * Alternatively, if we are defining non-pool snaps (e.g. via the
 * Ceph MDS), we must track @removed_snaps (since @snaps is not
 * used). Snaps and removed_snaps are to be used exclusive of each
 * other!
 */
interval_set<snapid_t> removed_snaps;

unsigned pg_num_mask = 0, pgp_num_mask = 0;

std::set<uint64_t> tiers;  ///< pools that are tiers of us
int64_t tier_of = -1;      ///< pool for which we are a tier
// Note that write wins for read+write ops
int64_t read_tier = -1;   ///< pool/tier for objecter to direct reads to
int64_t write_tier = -1;  ///< pool/tier for objecter to direct writes to
cache_mode_t cache_mode = CACHEMODE_NONE;  ///< cache pool mode
/// True if this pool is a cache tier of another pool.
bool is_tier() const { return tier_of >= 0; }
/// True if other pools are tiers of this pool.
bool has_tiers() const { return !tiers.empty(); }
// NOTE(review): the function enclosing this call (presumably
// clear_tier()) is not visible in this excerpt; confirm upstream.
  clear_tier_tunables();
bool has_read_tier() const { return read_tier >= 0; }
void clear_read_tier() { read_tier = -1; }
bool has_write_tier() const { return write_tier >= 0; }
void clear_write_tier() { write_tier = -1; }
// Reset every cache-tier tunable to its inactive default.  If a cache
// mode was active, also mark the pool as possibly holding incomplete
// clones.
// NOTE(review): truncated excerpt — reset statements between
// hit_set_params and hit_set_grade_decay_rate (presumably
// hit_set_period / hit_set_count) and the closing brace are not
// visible; confirm against the upstream header.
void clear_tier_tunables() {
  if (cache_mode != CACHEMODE_NONE)
    flags |= FLAG_INCOMPLETE_CLONES;
  cache_mode = CACHEMODE_NONE;

  target_max_bytes = 0;
  target_max_objects = 0;
  cache_target_dirty_ratio_micro = 0;
  cache_target_dirty_high_ratio_micro = 0;
  cache_target_full_ratio_micro = 0;
  hit_set_params = HitSet::Params();
  hit_set_grade_decay_rate = 0;
  hit_set_search_last_n = 0;
  grade_table.resize(0);
1525 bool is_stretch_pool() const {
1526 return peering_crush_bucket_count
!= 0;
/// Can the given set of OSDs peer under this pool's stretch
/// constraints?  (Declaration only; defined out of line.)
bool stretch_set_can_peer(const std::set<int>& want, const OSDMap& osdmap,
			  std::ostream *out) const;
1531 bool stretch_set_can_peer(const std::vector
<int>& want
, const OSDMap
& osdmap
,
1532 std::ostream
*out
) const {
1533 if (!is_stretch_pool()) return true;
1534 std::set
<int> swant
;
1535 for (auto i
: want
) swant
.insert(i
);
1536 return stretch_set_can_peer(swant
, osdmap
, out
);
uint64_t target_max_bytes = 0;    ///< tiering: target max pool size
uint64_t target_max_objects = 0;  ///< tiering: target max pool size

uint32_t cache_target_dirty_ratio_micro = 0;  ///< cache: fraction of target to leave dirty
uint32_t cache_target_dirty_high_ratio_micro = 0;  ///< cache: fraction of target to flush with high speed
uint32_t cache_target_full_ratio_micro = 0;  ///< cache: fraction of target to fill before we evict in earnest

uint32_t cache_min_flush_age = 0;  ///< minimum age (seconds) before we can flush
uint32_t cache_min_evict_age = 0;  ///< minimum age (seconds) before we can evict

HitSet::Params hit_set_params;  ///< The HitSet params to use on this pool
uint32_t hit_set_period = 0;    ///< periodicity of HitSet segments (seconds)
uint32_t hit_set_count = 0;     ///< number of periods to retain
bool use_gmt_hitset = true;     ///< use gmt to name the hitset archive object
uint32_t min_read_recency_for_promote = 0;   ///< minimum number of HitSet to check before promote on read
uint32_t min_write_recency_for_promote = 0;  ///< minimum number of HitSet to check before promote on write
uint32_t hit_set_grade_decay_rate = 0;  ///< current hit_set has highest priority on objects
					///< temperature count, the following hit_sets' priority decays
					///< by this param relative to the previous hit_set
uint32_t hit_set_search_last_n = 0;  ///< accumulate at most N hit_sets for temperature

uint32_t stripe_width = 0;  ///< erasure coded stripe size in bytes

uint64_t expected_num_objects = 0;  ///< expected number of objects on this pool, a value of 0 indicates
				    ///< user does not specify any expected value
bool fast_read = false;  ///< whether turn on fast read on the pool or not

pool_opts_t opts;  ///< options
// Fingerprint algorithm selector.
// NOTE(review): the enum definition's opening line is not visible in
// this excerpt.
TYPE_FINGERPRINT_NONE = 0,
TYPE_FINGERPRINT_SHA1 = 1,
TYPE_FINGERPRINT_SHA256 = 2,
TYPE_FINGERPRINT_SHA512 = 3,

// Parse a fingerprint algorithm name; (fingerprint_t)-1 for unknown.
// NOTE(review): truncated excerpt — the string comparisons guarding
// the first four returns are not visible; confirm against the
// upstream header.
static fingerprint_t get_fingerprint_from_str(const std::string& s) {
    return TYPE_FINGERPRINT_NONE;
    return TYPE_FINGERPRINT_SHA1;
    return TYPE_FINGERPRINT_SHA256;
    return TYPE_FINGERPRINT_SHA512;
  return (fingerprint_t)-1;
// Fingerprint algorithm currently configured via pool opts.
// NOTE(review): truncated excerpt — the declaration of the local
// `fp_str` is not visible; confirm upstream.  (The top-level `const`
// on the by-value return type has no effect.)
const fingerprint_t get_fingerprint_type() const {
  opts.get(pool_opts_t::FINGERPRINT_ALGORITHM, &fp_str);
  return get_fingerprint_from_str(fp_str);
// Human-readable name of the configured fingerprint algorithm.
// NOTE(review): truncated excerpt — the declarations of the locals
// `fp_str` and `fp_t` are not visible; confirm upstream.
const char *get_fingerprint_name() const {
  opts.get(pool_opts_t::FINGERPRINT_ALGORITHM, &fp_str);
  fp_t = get_fingerprint_from_str(fp_str);
  return get_fingerprint_name(fp_t);
1597 static const char *get_fingerprint_name(fingerprint_t m
) {
1599 case TYPE_FINGERPRINT_NONE
: return "none";
1600 case TYPE_FINGERPRINT_SHA1
: return "sha1";
1601 case TYPE_FINGERPRINT_SHA256
: return "sha256";
1602 case TYPE_FINGERPRINT_SHA512
: return "sha512";
1603 default: return "unknown";
// Dedup chunking algorithm selector.
// NOTE(review): the `typedef enum {` opener is not visible in this
// excerpt; only the enumerators and the typedef name survive.
TYPE_DEDUP_CHUNK_NONE = 0,
TYPE_DEDUP_CHUNK_FASTCDC = 1,
TYPE_DEDUP_CHUNK_FIXEDCDC = 2,
} dedup_chunk_algo_t;

// Parse a dedup chunk algorithm name; (dedup_chunk_algo_t)-1 for
// unknown input.
// NOTE(review): truncated excerpt — the string comparisons guarding
// the first three returns are not visible; confirm upstream.
static dedup_chunk_algo_t get_dedup_chunk_algorithm_from_str(const std::string& s) {
    return TYPE_DEDUP_CHUNK_NONE;
    return TYPE_DEDUP_CHUNK_FASTCDC;
    return TYPE_DEDUP_CHUNK_FIXEDCDC;
  return (dedup_chunk_algo_t)-1;
1621 const dedup_chunk_algo_t
get_dedup_chunk_algorithm_type() const {
1622 std::string algo_str
;
1623 opts
.get(pool_opts_t::DEDUP_CHUNK_ALGORITHM
, &algo_str
);
1624 return get_dedup_chunk_algorithm_from_str(algo_str
);
1626 const char *get_dedup_chunk_algorithm_name() const {
1627 std::string dedup_chunk_algo_str
;
1628 dedup_chunk_algo_t dedup_chunk_algo_t
;
1629 opts
.get(pool_opts_t::DEDUP_CHUNK_ALGORITHM
, &dedup_chunk_algo_str
);
1630 dedup_chunk_algo_t
= get_dedup_chunk_algorithm_from_str(dedup_chunk_algo_str
);
1631 return get_dedup_chunk_algorithm_name(dedup_chunk_algo_t
);
1633 static const char *get_dedup_chunk_algorithm_name(dedup_chunk_algo_t m
) {
1635 case TYPE_DEDUP_CHUNK_NONE
: return "none";
1636 case TYPE_DEDUP_CHUNK_FASTCDC
: return "fastcdc";
1637 case TYPE_DEDUP_CHUNK_FIXEDCDC
: return "fixed";
1638 default: return "unknown";
// Dedup tier pool id from pool opts (stays 0 when the option is
// unset, since opts.get leaves *val untouched on a miss).
// NOTE(review): truncated excerpt — the `return tier_id;` statement
// is not visible; confirm upstream.
int64_t get_dedup_tier() const {
  int64_t tier_id = 0;
  opts.get(pool_opts_t::DEDUP_TIER, &tier_id);

// Dedup CDC chunk size from pool opts (0 when unset).
// NOTE(review): truncated excerpt — the `return chunk_size;`
// statement is not visible; confirm upstream.
int64_t get_dedup_cdc_chunk_size() const {
  int64_t chunk_size = 0;
  opts.get(pool_opts_t::DEDUP_CDC_CHUNK_SIZE, &chunk_size);
/// application -> key/value metadata
std::map<std::string, std::map<std::string, std::string>> application_metadata;

/// hit-set grade lookup table (built by calc_grade_table(), read by
/// get_grade())
std::vector<uint32_t> grade_table;
// Grade for the i-th hit set.
// NOTE(review): truncated excerpt — the body of the out-of-range
// branch (presumably `return 0;`) is not visible; confirm upstream.
uint32_t get_grade(unsigned i) const {
  if (grade_table.size() <= i)
  return grade_table[i];
// Rebuild grade_table with one entry per retained hit set, each decayed
// by hit_set_grade_decay_rate percent relative to the previous entry.
// NOTE(review): truncated excerpt — the statement storing each
// computed value into grade_table[i] is not visible; confirm the
// store/decay ordering against the upstream header.
void calc_grade_table() {
  unsigned v = 1000000;
  grade_table.resize(hit_set_count);
  for (unsigned i = 0; i < hit_set_count; i++) {
    v = v * (1 - (hit_set_grade_decay_rate / 100.0));
/// Trivially constructible; members carry their in-class defaults.
pg_pool_t() = default;

void dump(ceph::Formatter *f) const;

const utime_t &get_create_time() const { return create_time; }
uint64_t get_flags() const { return flags; }
/// Test/set/clear FLAG_* bits on this pool.
bool has_flag(uint64_t f) const { return flags & f; }
void set_flag(uint64_t f) { flags |= f; }
void unset_flag(uint64_t f) { flags &= ~f; }
// NOTE(review): throughout this run of accessors, several closing
// braces and `return`/`case` lines were dropped from the excerpt
// (e.g. the tail of can_shift_osds()); confirm against the upstream
// header.

/// Rollback support is required exactly for erasure-coded pools.
bool require_rollback() const {
  return is_erasure();

/// true if incomplete clones may be present
bool allow_incomplete_clones() const {
  return cache_mode != CACHEMODE_NONE || has_flag(FLAG_INCOMPLETE_CLONES);

unsigned get_type() const { return type; }
unsigned get_size() const { return size; }
unsigned get_min_size() const { return min_size; }
int get_crush_rule() const { return crush_rule; }
int get_object_hash() const { return object_hash; }
const char *get_object_hash_name() const {
  return ceph_str_hash_name(get_object_hash());
epoch_t get_last_change() const { return last_change; }
epoch_t get_last_force_op_resend() const { return last_force_op_resend; }
epoch_t get_last_force_op_resend_prenautilus() const {
  return last_force_op_resend_prenautilus;
epoch_t get_last_force_op_resend_preluminous() const {
  return last_force_op_resend_preluminous;
epoch_t get_snap_epoch() const { return snap_epoch; }
snapid_t get_snap_seq() const { return snap_seq; }
uint64_t get_auid() const { return auid; }

void set_snap_seq(snapid_t s) { snap_seq = s; }
void set_snap_epoch(epoch_t e) { snap_epoch = e; }

void set_stripe_width(uint32_t s) { stripe_width = s; }
uint32_t get_stripe_width() const { return stripe_width; }

bool is_replicated() const { return get_type() == TYPE_REPLICATED; }
bool is_erasure() const { return get_type() == TYPE_ERASURE; }

/// Only non-erasure pools support omap data.
bool supports_omap() const {
  return !(get_type() == TYPE_ERASURE);

/// Erasure pools without EC overwrites require aligned appends.
bool requires_aligned_append() const {
  return is_erasure() && !has_flag(FLAG_EC_OVERWRITES);
uint64_t required_alignment() const { return stripe_width; }

bool allows_ecoverwrites() const {
  return has_flag(FLAG_EC_OVERWRITES);

bool can_shift_osds() const {
  switch (get_type()) {
  case TYPE_REPLICATED:
  ceph_abort_msg("unhandled pool type");
unsigned get_pg_num() const { return pg_num; }
unsigned get_pgp_num() const { return pgp_num; }
unsigned get_pg_num_target() const { return pg_num_target; }
unsigned get_pgp_num_target() const { return pgp_num_target; }
unsigned get_pg_num_pending() const { return pg_num_pending; }

unsigned get_pg_num_mask() const { return pg_num_mask; }
unsigned get_pgp_num_mask() const { return pgp_num_mask; }

// if pg_num is not a multiple of two, pgs are not equally sized.
// return, for a given pg, the fraction (denominator) of the total
// pool size that it represents.
unsigned get_pg_num_divisor(pg_t pgid) const;

bool is_pending_merge(pg_t pgid, bool *target) const;

// NOTE(review): the bodies of the five setters below are not visible
// in this excerpt (presumably they assign the field and recalculate
// dependent state); confirm against the upstream header.
void set_pg_num(int p) {
void set_pgp_num(int p) {
void set_pg_num_pending(int p) {
void set_pg_num_target(int p) {
void set_pgp_num_target(int p) {
// Record a PG merge: reduce pg_num and remember the merge metadata in
// last_pg_merge_meta (consumed by pg_merge_meta_t consumers).
// NOTE(review): truncated excerpt — the statement that actually
// decrements pg_num and the closing brace are not visible; confirm
// against the upstream header.
void dec_pg_num(pg_t source_pgid,
		epoch_t ready_epoch,
		eversion_t source_version,
		eversion_t target_version,
		epoch_t last_epoch_started,
		epoch_t last_epoch_clean) {
  last_pg_merge_meta.source_pgid = source_pgid;
  last_pg_merge_meta.ready_epoch = ready_epoch;
  last_pg_merge_meta.source_version = source_version;
  last_pg_merge_meta.target_version = target_version;
  last_pg_merge_meta.last_epoch_started = last_epoch_started;
  last_pg_merge_meta.last_epoch_clean = last_epoch_clean;
// NOTE(review): closing braces in this run of accessors were dropped
// from the excerpt; the getters are also declared non-const upstream.
void set_quota_max_bytes(uint64_t m) {
  quota_max_bytes = m;
uint64_t get_quota_max_bytes() {
  return quota_max_bytes;

void set_quota_max_objects(uint64_t m) {
  quota_max_objects = m;
uint64_t get_quota_max_objects() {
  return quota_max_objects;

/// Bump all three "force op resend" epochs at once.
void set_last_force_op_resend(uint64_t t) {
  last_force_op_resend = t;
  last_force_op_resend_prenautilus = t;
  last_force_op_resend_preluminous = t;
void calc_pg_masks();

/*
 * we have two snap modes:
 *  - pool global snaps
 *    - snap existence/non-existence defined by snaps[] and snap_seq
 *  - user managed snaps
 *    - removal governed by removed_snaps
 *
 * we know which mode we're using based on whether removed_snaps is empty.
 * If nothing has been created, both functions report false.
 */
bool is_pool_snaps_mode() const;
bool is_unmanaged_snaps_mode() const;
bool is_removed_snap(snapid_t s) const;

snapid_t snap_exists(std::string_view s) const;
void add_snap(const char *n, utime_t stamp);
uint64_t add_unmanaged_snap(bool preoctopus_compat);
void remove_snap(snapid_t s);
void remove_unmanaged_snap(snapid_t s, bool preoctopus_compat);

SnapContext get_snap_context() const;

/// hash a object name+namespace key to a hash position
uint32_t hash_key(const std::string& key, const std::string& ns) const;

/// round a hash position down to a pg num
uint32_t raw_hash_to_pg(uint32_t v) const;

/*
 * map a raw pg (with full precision ps) into an actual pg, for storage
 */
pg_t raw_pg_to_pg(pg_t pg) const;

/*
 * map raw pg (full precision ps) into a placement seed. include
 * pool id in that value so that different pools don't use the same
 * seed.
 */
ps_t raw_pg_to_pps(pg_t pg) const;

/// choose a random hash position within a pg
uint32_t get_random_pg_position(pg_t pgid, uint32_t seed) const;

void encode(ceph::buffer::list& bl, uint64_t features) const;
void decode(ceph::buffer::list::const_iterator& bl);

static void generate_test_instances(std::list<pg_pool_t*>& o);
// NOTE(review): the closing `};` of pg_pool_t is not visible in this
// excerpt; it belongs immediately before the macro below.
WRITE_CLASS_ENCODER_FEATURES(pg_pool_t)

std::ostream& operator<<(std::ostream& out, const pg_pool_t& p);
/**
 * a summation of object stats
 *
 * This is just a container for object stats; we don't know what for.
 *
 * If you add members to object_stat_sum_t, you should make sure there is
 * no padding among these members.
 * You should also modify the padding_check function.
 */
// NOTE(review): this excerpt of object_stat_sum_t is truncated in many
// places — several members referenced below (the num_rd/num_rd_kb/
// num_wr/num_wr_kb group and the num_flush/num_evict counters), the
// opening line of the default constructor, parts of the SPLIT macro
// bodies, clear()'s signature, the `static_assert(` opener inside
// padding_check(), several FLOOR/SPLIT invocations, and various
// closing braces are not visible.  Confirm all of these against the
// upstream header before editing.
struct object_stat_sum_t {
  /**************************************************************************
   * WARNING: be sure to update operator==, floor, and split when
   * adding/removing fields!
   **************************************************************************/
  int64_t num_bytes;    // in bytes
  int64_t num_objects;
  int64_t num_object_clones;
  int64_t num_object_copies;  // num_objects * num_replicas
  int64_t num_objects_missing_on_primary;
  int64_t num_objects_degraded;
  int64_t num_objects_unfound;
  int64_t num_scrub_errors;  // total deep and shallow scrub errors
  int64_t num_objects_recovered;
  int64_t num_bytes_recovered;
  int64_t num_keys_recovered;
  int64_t num_shallow_scrub_errors;
  int64_t num_deep_scrub_errors;
  int64_t num_objects_dirty;
  int64_t num_whiteouts;
  int64_t num_objects_omap;
  int64_t num_objects_hit_set_archive;
  int64_t num_objects_misplaced;
  int64_t num_bytes_hit_set_archive;
  int64_t num_flush_kb;
  int64_t num_evict_kb;
  int64_t num_promote;
  int32_t num_flush_mode_high;  // 1 when in high flush mode, otherwise 0
  int32_t num_flush_mode_low;   // 1 when in low flush mode, otherwise 0
  int32_t num_evict_mode_some;  // 1 when in evict some mode, otherwise 0
  int32_t num_evict_mode_full;  // 1 when in evict full mode, otherwise 0
  int64_t num_objects_pinned;
  int64_t num_objects_missing;
  int64_t num_legacy_snapsets;  ///< upper bound on pre-luminous-style SnapSets
  int64_t num_large_omap_objects = 0;
  int64_t num_objects_manifest = 0;
  int64_t num_omap_bytes = 0;
  int64_t num_omap_keys = 0;
  int64_t num_objects_repaired = 0;

  // Default constructor: zero every counter that lacks an in-class
  // initializer.  NOTE(review): the constructor's opening line is not
  // visible in this excerpt.
    num_objects(0), num_object_clones(0), num_object_copies(0),
    num_objects_missing_on_primary(0), num_objects_degraded(0),
    num_objects_unfound(0),
    num_rd(0), num_rd_kb(0), num_wr(0), num_wr_kb(0),
    num_scrub_errors(0),
    num_objects_recovered(0),
    num_bytes_recovered(0),
    num_keys_recovered(0),
    num_shallow_scrub_errors(0),
    num_deep_scrub_errors(0),
    num_objects_dirty(0),
    num_objects_omap(0),
    num_objects_hit_set_archive(0),
    num_objects_misplaced(0),
    num_bytes_hit_set_archive(0),
    num_flush_mode_high(0), num_flush_mode_low(0),
    num_evict_mode_some(0), num_evict_mode_full(0),
    num_objects_pinned(0),
    num_objects_missing(0),
    num_legacy_snapsets(0)

  // Clamp every counter to at least f (used to erase negative drift).
  void floor(int64_t f) {
#define FLOOR(x) if (x < f) x = f
    FLOOR(num_object_clones);
    FLOOR(num_object_copies);
    FLOOR(num_objects_missing_on_primary);
    FLOOR(num_objects_missing);
    FLOOR(num_objects_degraded);
    FLOOR(num_objects_misplaced);
    FLOOR(num_objects_unfound);
    FLOOR(num_large_omap_objects);
    FLOOR(num_objects_manifest);
    FLOOR(num_omap_bytes);
    FLOOR(num_omap_keys);
    FLOOR(num_shallow_scrub_errors);
    FLOOR(num_deep_scrub_errors);
    // keep the aggregate consistent with its two components
    num_scrub_errors = num_shallow_scrub_errors + num_deep_scrub_errors;
    FLOOR(num_objects_recovered);
    FLOOR(num_bytes_recovered);
    FLOOR(num_keys_recovered);
    FLOOR(num_objects_dirty);
    FLOOR(num_whiteouts);
    FLOOR(num_objects_omap);
    FLOOR(num_objects_hit_set_archive);
    FLOOR(num_bytes_hit_set_archive);
    FLOOR(num_flush_kb);
    FLOOR(num_evict_kb);
    FLOOR(num_flush_mode_high);
    FLOOR(num_flush_mode_low);
    FLOOR(num_evict_mode_some);
    FLOOR(num_evict_mode_full);
    FLOOR(num_objects_pinned);
    FLOOR(num_legacy_snapsets);
    FLOOR(num_objects_repaired);

  // Distribute this sum across out.size() buckets, spreading any
  // remainder one unit at a time over the leading buckets.
  void split(std::vector<object_stat_sum_t> &out) const {
#define SPLIT(PARAM) \
  for (unsigned i = 0; i < out.size(); ++i) { \
    out[i].PARAM = PARAM / out.size(); \
    if (i < (PARAM % out.size())) { \
#define SPLIT_PRESERVE_NONZERO(PARAM) \
  for (unsigned i = 0; i < out.size(); ++i) { \
      out[i].PARAM = 1 + PARAM / out.size(); \
    SPLIT(num_object_clones);
    SPLIT(num_object_copies);
    SPLIT(num_objects_missing_on_primary);
    SPLIT(num_objects_missing);
    SPLIT(num_objects_degraded);
    SPLIT(num_objects_misplaced);
    SPLIT(num_objects_unfound);
    SPLIT(num_large_omap_objects);
    SPLIT(num_objects_manifest);
    SPLIT(num_omap_bytes);
    SPLIT(num_omap_keys);
    SPLIT(num_objects_repaired);
    SPLIT_PRESERVE_NONZERO(num_shallow_scrub_errors);
    SPLIT_PRESERVE_NONZERO(num_deep_scrub_errors);
    // recompute the aggregate in each bucket from its split components
    for (unsigned i = 0; i < out.size(); ++i) {
      out[i].num_scrub_errors = out[i].num_shallow_scrub_errors +
	out[i].num_deep_scrub_errors;
    SPLIT(num_objects_recovered);
    SPLIT(num_bytes_recovered);
    SPLIT(num_keys_recovered);
    SPLIT(num_objects_dirty);
    SPLIT(num_whiteouts);
    SPLIT(num_objects_omap);
    SPLIT(num_objects_hit_set_archive);
    SPLIT(num_bytes_hit_set_archive);
    SPLIT(num_flush_kb);
    SPLIT(num_evict_kb);
    SPLIT(num_flush_mode_high);
    SPLIT(num_flush_mode_low);
    SPLIT(num_evict_mode_some);
    SPLIT(num_evict_mode_full);
    SPLIT(num_objects_pinned);
    SPLIT_PRESERVE_NONZERO(num_legacy_snapsets);
#undef SPLIT_PRESERVE_NONZERO

    // FIPS zeroization audit 20191117: this memset is not security related.
    memset(this, 0, sizeof(*this));

  // Recompute num_object_copies for the given replica count.
  void calc_copies(int nrep) {
    num_object_copies = nrep * num_objects;

  // Flat-memory zero test; relies on the struct having no padding
  // (checked by padding_check below).
  bool is_zero() const {
    return mem_is_zero((char*)this, sizeof(*this));

  void add(const object_stat_sum_t& o);
  void sub(const object_stat_sum_t& o);

  void dump(ceph::Formatter *f) const;

  // Compile-time check that the struct is exactly the sum of its
  // members, i.e. contains no padding (clear()/is_zero() treat the
  // object as flat memory).
  void padding_check() {
      sizeof(object_stat_sum_t) ==
	sizeof(num_objects) +
	sizeof(num_object_clones) +
	sizeof(num_object_copies) +
	sizeof(num_objects_missing_on_primary) +
	sizeof(num_objects_degraded) +
	sizeof(num_objects_unfound) +
	sizeof(num_scrub_errors) +
	sizeof(num_large_omap_objects) +
	sizeof(num_objects_manifest) +
	sizeof(num_omap_bytes) +
	sizeof(num_omap_keys) +
	sizeof(num_objects_repaired) +
	sizeof(num_objects_recovered) +
	sizeof(num_bytes_recovered) +
	sizeof(num_keys_recovered) +
	sizeof(num_shallow_scrub_errors) +
	sizeof(num_deep_scrub_errors) +
	sizeof(num_objects_dirty) +
	sizeof(num_whiteouts) +
	sizeof(num_objects_omap) +
	sizeof(num_objects_hit_set_archive) +
	sizeof(num_objects_misplaced) +
	sizeof(num_bytes_hit_set_archive) +
	sizeof(num_flush_kb) +
	sizeof(num_evict_kb) +
	sizeof(num_promote) +
	sizeof(num_flush_mode_high) +
	sizeof(num_flush_mode_low) +
	sizeof(num_evict_mode_some) +
	sizeof(num_evict_mode_full) +
	sizeof(num_objects_pinned) +
	sizeof(num_objects_missing) +
	sizeof(num_legacy_snapsets)
      "object_stat_sum_t have padding");

  void encode(ceph::buffer::list& bl) const;
  void decode(ceph::buffer::list::const_iterator& bl);
  static void generate_test_instances(std::list<object_stat_sum_t*>& o);
WRITE_CLASS_ENCODER(object_stat_sum_t)
// Equality for object_stat_sum_t (declaration only; defined out of
// line).
bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r);
/**
 * a collection of object stat sums
 *
 * This is a collection of stat sums over different categories.
 */
struct object_stat_collection_t {
  /**************************************************************************
   * WARNING: be sure to update the operator== when adding/removing fields! *
   **************************************************************************/
  object_stat_sum_t sum;  ///< the single aggregated sum

  // Recompute copy counts in the aggregated sum.
  void calc_copies(int nrep) {
    sum.calc_copies(nrep);

  void dump(ceph::Formatter *f) const;
  void encode(ceph::buffer::list& bl) const;
  void decode(ceph::buffer::list::const_iterator& bl);
  static void generate_test_instances(std::list<object_stat_collection_t*>& o);

  bool is_zero() const {
    return sum.is_zero();

  // NOTE(review): the bodies of floor() and the add()/sub() overloads
  // below (presumably forwarding to the corresponding
  // object_stat_sum_t operations), plus the struct's closing brace,
  // are not visible in this excerpt; confirm upstream.
  void floor(int64_t f) {
  void add(const object_stat_sum_t& o) {
  void add(const object_stat_collection_t& o) {
  void sub(const object_stat_collection_t& o) {
WRITE_CLASS_ENCODER(object_stat_collection_t)
2182 inline bool operator==(const object_stat_collection_t
& l
,
2183 const object_stat_collection_t
& r
) {
2184 return l
.sum
== r
.sum
;
/// Scrub depth selector: shallow vs deep scrub.
enum class scrub_level_t : bool { shallow = false, deep = true };
/// Whether the scrub should also perform repair.
enum class scrub_type_t : bool { not_repair = false, do_repair = true };
/// is there a scrub in our future?
// NOTE(review): the enum's closing brace is not visible in this
// excerpt.
enum class pg_scrub_sched_status_t : uint16_t {
  unknown,     ///< status not reported yet
  not_queued,  ///< not in the OSD's scrub queue. Probably not active.
  active,      ///< scrubbing
  scheduled,   ///< scheduled for a scrub at an already determined time
  queued       ///< queued to be scrubbed
2199 struct pg_scrubbing_status_t
{
2200 utime_t m_scheduled_at
{};
2201 int32_t m_duration_seconds
{0}; // relevant when scrubbing
2202 pg_scrub_sched_status_t m_sched_status
{pg_scrub_sched_status_t::unknown
};
2203 bool m_is_active
{false};
2204 scrub_level_t m_is_deep
{scrub_level_t::shallow
};
2205 bool m_is_periodic
{true};
2208 bool operator==(const pg_scrubbing_status_t
& l
, const pg_scrubbing_status_t
& r
);
2211 * aggregate stats for a single PG.
2214 /**************************************************************************
2215 * WARNING: be sure to update the operator== when adding/removing fields! *
2216 **************************************************************************/
2218 version_t reported_seq
; // sequence number
2219 epoch_t reported_epoch
; // epoch of this report
2221 utime_t last_fresh
; // last reported
2222 utime_t last_change
; // new state != previous state
2223 utime_t last_active
; // state & PG_STATE_ACTIVE
2224 utime_t last_peered
; // state & PG_STATE_ACTIVE || state & PG_STATE_PEERED
2225 utime_t last_clean
; // state & PG_STATE_CLEAN
2226 utime_t last_unstale
; // (state & PG_STATE_STALE) == 0
2227 utime_t last_undegraded
; // (state & PG_STATE_DEGRADED) == 0
2228 utime_t last_fullsized
; // (state & PG_STATE_UNDERSIZED) == 0
2230 eversion_t log_start
; // (log_start,version]
2231 eversion_t ondisk_log_start
; // there may be more on disk
2234 epoch_t last_epoch_clean
;
2236 __u32 parent_split_bits
;
2238 eversion_t last_scrub
;
2239 eversion_t last_deep_scrub
;
2240 utime_t last_scrub_stamp
;
2241 utime_t last_deep_scrub_stamp
;
2242 utime_t last_clean_scrub_stamp
;
2243 int32_t last_scrub_duration
{0};
2245 object_stat_collection_t stats
;
2248 int64_t ondisk_log_size
; // >= active_log_size
2249 int64_t objects_scrubbed
;
2251 std::vector
<int32_t> up
, acting
;
2252 std::vector
<pg_shard_t
> avail_no_missing
;
2253 std::map
< std::set
<pg_shard_t
>, int32_t > object_location_counts
;
2254 epoch_t mapping_epoch
;
2256 std::vector
<int32_t> blocked_by
; ///< osds on which the pg is blocked
2258 interval_set
<snapid_t
> purged_snaps
; ///< recently removed snaps that we've purged
2260 utime_t last_became_active
;
2261 utime_t last_became_peered
;
2263 /// up, acting primaries
2265 int32_t acting_primary
;
2267 // snaptrimq.size() is 64bit, but let's be serious - anything over 50k is
2268 // absurd already, so cap it to 2^32 and save 4 bytes at the same time
2269 uint32_t snaptrimq_len
;
2271 pg_scrubbing_status_t scrub_sched_status
;
2273 bool stats_invalid
:1;
2274 /// true if num_objects_dirty is not accurate (because it was not
2275 /// maintained starting from pool creation)
2276 bool dirty_stats_invalid
:1;
2277 bool omap_stats_invalid
:1;
2278 bool hitset_stats_invalid
:1;
2279 bool hitset_bytes_stats_invalid
:1;
2280 bool pin_stats_invalid
:1;
2281 bool manifest_stats_invalid
:1;
2287 created(0), last_epoch_clean(0),
2288 parent_split_bits(0),
2289 log_size(0), ondisk_log_size(0),
2290 objects_scrubbed(0),
2295 stats_invalid(false),
2296 dirty_stats_invalid(false),
2297 omap_stats_invalid(false),
2298 hitset_stats_invalid(false),
2299 hitset_bytes_stats_invalid(false),
2300 pin_stats_invalid(false),
2301 manifest_stats_invalid(false)
2304 epoch_t
get_effective_last_epoch_clean() const {
2305 if (state
& PG_STATE_CLEAN
) {
2306 // we are clean as of this report, and should thus take the
2308 return reported_epoch
;
2310 return last_epoch_clean
;
2314 std::pair
<epoch_t
, version_t
> get_version_pair() const {
2315 return { reported_epoch
, reported_seq
};
2318 void floor(int64_t f
) {
2322 if (ondisk_log_size
< f
)
2323 ondisk_log_size
= f
;
2324 if (snaptrimq_len
< f
)
2328 void add_sub_invalid_flags(const pg_stat_t
& o
) {
2329 // adding (or subtracting!) invalid stats render our stats invalid too
2330 stats_invalid
|= o
.stats_invalid
;
2331 dirty_stats_invalid
|= o
.dirty_stats_invalid
;
2332 omap_stats_invalid
|= o
.omap_stats_invalid
;
2333 hitset_stats_invalid
|= o
.hitset_stats_invalid
;
2334 hitset_bytes_stats_invalid
|= o
.hitset_bytes_stats_invalid
;
2335 pin_stats_invalid
|= o
.pin_stats_invalid
;
2336 manifest_stats_invalid
|= o
.manifest_stats_invalid
;
2338 void add(const pg_stat_t
& o
) {
2340 log_size
+= o
.log_size
;
2341 ondisk_log_size
+= o
.ondisk_log_size
;
2342 snaptrimq_len
= std::min((uint64_t)snaptrimq_len
+ o
.snaptrimq_len
,
2343 (uint64_t)(1ull << 31));
2344 add_sub_invalid_flags(o
);
2346 void sub(const pg_stat_t
& o
) {
2348 log_size
-= o
.log_size
;
2349 ondisk_log_size
-= o
.ondisk_log_size
;
2350 if (o
.snaptrimq_len
< snaptrimq_len
) {
2351 snaptrimq_len
-= o
.snaptrimq_len
;
2355 add_sub_invalid_flags(o
);
2358 bool is_acting_osd(int32_t osd
, bool primary
) const;
2359 void dump(ceph::Formatter
*f
) const;
2360 void dump_brief(ceph::Formatter
*f
) const;
2361 std::string
dump_scrub_schedule() const;
2362 void encode(ceph::buffer::list
&bl
) const;
2363 void decode(ceph::buffer::list::const_iterator
&bl
);
2364 static void generate_test_instances(std::list
<pg_stat_t
*>& o
);
2366 WRITE_CLASS_ENCODER(pg_stat_t
)
2368 bool operator==(const pg_stat_t
& l
, const pg_stat_t
& r
);
2371 * ObjectStore full statfs information
2373 struct store_statfs_t
2375 uint64_t total
= 0; ///< Total bytes
2376 uint64_t available
= 0; ///< Free bytes available
2377 uint64_t internally_reserved
= 0; ///< Bytes reserved for internal purposes
2379 int64_t allocated
= 0; ///< Bytes allocated by the store
2381 int64_t data_stored
= 0; ///< Bytes actually stored by the user
2382 int64_t data_compressed
= 0; ///< Bytes stored after compression
2383 int64_t data_compressed_allocated
= 0; ///< Bytes allocated for compressed data
2384 int64_t data_compressed_original
= 0; ///< Bytes that were compressed
2386 int64_t omap_allocated
= 0; ///< approx usage of omap data
2387 int64_t internal_metadata
= 0; ///< approx usage of internal metadata
2390 *this = store_statfs_t();
2392 void floor(int64_t f
) {
2393 #define FLOOR(x) if (int64_t(x) < f) x = f
2396 FLOOR(internally_reserved
);
2399 FLOOR(data_compressed
);
2400 FLOOR(data_compressed_allocated
);
2401 FLOOR(data_compressed_original
);
2403 FLOOR(omap_allocated
);
2404 FLOOR(internal_metadata
);
2408 bool operator ==(const store_statfs_t
& other
) const;
2409 bool is_zero() const {
2410 return *this == store_statfs_t();
2413 uint64_t get_used() const {
2414 return total
- available
- internally_reserved
;
2417 // this accumulates both actually used and statfs's internally_reserved
2418 uint64_t get_used_raw() const {
2419 return total
- available
;
2422 float get_used_raw_ratio() const {
2424 return (float)get_used_raw() / (float)total
;
2430 // helpers to ease legacy code porting
2431 uint64_t kb_avail() const {
2432 return available
>> 10;
2434 uint64_t kb() const {
2437 uint64_t kb_used() const {
2438 return (total
- available
- internally_reserved
) >> 10;
2440 uint64_t kb_used_raw() const {
2441 return get_used_raw() >> 10;
2444 uint64_t kb_used_data() const {
2445 return allocated
>> 10;
2447 uint64_t kb_used_omap() const {
2448 return omap_allocated
>> 10;
2451 uint64_t kb_used_internal_metadata() const {
2452 return internal_metadata
>> 10;
2455 void add(const store_statfs_t
& o
) {
2457 available
+= o
.available
;
2458 internally_reserved
+= o
.internally_reserved
;
2459 allocated
+= o
.allocated
;
2460 data_stored
+= o
.data_stored
;
2461 data_compressed
+= o
.data_compressed
;
2462 data_compressed_allocated
+= o
.data_compressed_allocated
;
2463 data_compressed_original
+= o
.data_compressed_original
;
2464 omap_allocated
+= o
.omap_allocated
;
2465 internal_metadata
+= o
.internal_metadata
;
2467 void sub(const store_statfs_t
& o
) {
2469 available
-= o
.available
;
2470 internally_reserved
-= o
.internally_reserved
;
2471 allocated
-= o
.allocated
;
2472 data_stored
-= o
.data_stored
;
2473 data_compressed
-= o
.data_compressed
;
2474 data_compressed_allocated
-= o
.data_compressed_allocated
;
2475 data_compressed_original
-= o
.data_compressed_original
;
2476 omap_allocated
-= o
.omap_allocated
;
2477 internal_metadata
-= o
.internal_metadata
;
2479 void dump(ceph::Formatter
*f
) const;
2480 DENC(store_statfs_t
, v
, p
) {
2481 DENC_START(1, 1, p
);
2483 denc(v
.available
, p
);
2484 denc(v
.internally_reserved
, p
);
2485 denc(v
.allocated
, p
);
2486 denc(v
.data_stored
, p
);
2487 denc(v
.data_compressed
, p
);
2488 denc(v
.data_compressed_allocated
, p
);
2489 denc(v
.data_compressed_original
, p
);
2490 denc(v
.omap_allocated
, p
);
2491 denc(v
.internal_metadata
, p
);
2494 static void generate_test_instances(std::list
<store_statfs_t
*>& o
);
2496 WRITE_CLASS_DENC(store_statfs_t
)
2498 std::ostream
&operator<<(std::ostream
&lhs
, const store_statfs_t
&rhs
);
2501 * aggregate stats for an osd
2504 store_statfs_t statfs
;
2505 std::vector
<int> hb_peers
;
2506 int32_t snap_trim_queue_len
, num_snap_trimming
;
2507 uint64_t num_shards_repaired
;
2509 pow2_hist_t op_queue_age_hist
;
2511 objectstore_perf_stat_t os_perf_stat
;
2512 osd_alerts_t os_alerts
;
2514 epoch_t up_from
= 0;
2517 uint32_t num_pgs
= 0;
2519 uint32_t num_osds
= 0;
2520 uint32_t num_per_pool_osds
= 0;
2521 uint32_t num_per_pool_omap_osds
= 0;
2524 uint32_t last_update
; // in seconds
2525 uint32_t back_pingtime
[3];
2526 uint32_t back_min
[3];
2527 uint32_t back_max
[3];
2529 uint32_t front_pingtime
[3];
2530 uint32_t front_min
[3];
2531 uint32_t front_max
[3];
2532 uint32_t front_last
;
2534 std::map
<int, Interfaces
> hb_pingtime
; ///< map of osd id to Interfaces
2536 osd_stat_t() : snap_trim_queue_len(0), num_snap_trimming(0),
2537 num_shards_repaired(0) {}
2539 void add(const osd_stat_t
& o
) {
2540 statfs
.add(o
.statfs
);
2541 snap_trim_queue_len
+= o
.snap_trim_queue_len
;
2542 num_snap_trimming
+= o
.num_snap_trimming
;
2543 num_shards_repaired
+= o
.num_shards_repaired
;
2544 op_queue_age_hist
.add(o
.op_queue_age_hist
);
2545 os_perf_stat
.add(o
.os_perf_stat
);
2546 num_pgs
+= o
.num_pgs
;
2547 num_osds
+= o
.num_osds
;
2548 num_per_pool_osds
+= o
.num_per_pool_osds
;
2549 num_per_pool_omap_osds
+= o
.num_per_pool_omap_osds
;
2550 for (const auto& a
: o
.os_alerts
) {
2551 auto& target
= os_alerts
[a
.first
];
2552 for (auto& i
: a
.second
) {
2553 target
.emplace(i
.first
, i
.second
);
2557 void sub(const osd_stat_t
& o
) {
2558 statfs
.sub(o
.statfs
);
2559 snap_trim_queue_len
-= o
.snap_trim_queue_len
;
2560 num_snap_trimming
-= o
.num_snap_trimming
;
2561 num_shards_repaired
-= o
.num_shards_repaired
;
2562 op_queue_age_hist
.sub(o
.op_queue_age_hist
);
2563 os_perf_stat
.sub(o
.os_perf_stat
);
2564 num_pgs
-= o
.num_pgs
;
2565 num_osds
-= o
.num_osds
;
2566 num_per_pool_osds
-= o
.num_per_pool_osds
;
2567 num_per_pool_omap_osds
-= o
.num_per_pool_omap_osds
;
2568 for (const auto& a
: o
.os_alerts
) {
2569 auto& target
= os_alerts
[a
.first
];
2570 for (auto& i
: a
.second
) {
2571 target
.erase(i
.first
);
2573 if (target
.empty()) {
2574 os_alerts
.erase(a
.first
);
2578 void dump(ceph::Formatter
*f
, bool with_net
= true) const;
2579 void dump_ping_time(ceph::Formatter
*f
) const;
2580 void encode(ceph::buffer::list
&bl
, uint64_t features
) const;
2581 void decode(ceph::buffer::list::const_iterator
&bl
);
2582 static void generate_test_instances(std::list
<osd_stat_t
*>& o
);
2584 WRITE_CLASS_ENCODER_FEATURES(osd_stat_t
)
2586 inline bool operator==(const osd_stat_t
& l
, const osd_stat_t
& r
) {
2587 return l
.statfs
== r
.statfs
&&
2588 l
.snap_trim_queue_len
== r
.snap_trim_queue_len
&&
2589 l
.num_snap_trimming
== r
.num_snap_trimming
&&
2590 l
.num_shards_repaired
== r
.num_shards_repaired
&&
2591 l
.hb_peers
== r
.hb_peers
&&
2592 l
.op_queue_age_hist
== r
.op_queue_age_hist
&&
2593 l
.os_perf_stat
== r
.os_perf_stat
&&
2594 l
.num_pgs
== r
.num_pgs
&&
2595 l
.num_osds
== r
.num_osds
&&
2596 l
.num_per_pool_osds
== r
.num_per_pool_osds
&&
2597 l
.num_per_pool_omap_osds
== r
.num_per_pool_omap_osds
;
2599 inline bool operator!=(const osd_stat_t
& l
, const osd_stat_t
& r
) {
2603 inline std::ostream
& operator<<(std::ostream
& out
, const osd_stat_t
& s
) {
2604 return out
<< "osd_stat(" << s
.statfs
<< ", "
2605 << "peers " << s
.hb_peers
2606 << " op hist " << s
.op_queue_age_hist
.h
2611 * summation over an entire pool
2613 struct pool_stat_t
{
2614 object_stat_collection_t stats
;
2615 store_statfs_t store_stats
;
2617 int64_t ondisk_log_size
; // >= active_log_size
2618 int32_t up
; ///< number of up replicas or shards
2619 int32_t acting
; ///< number of acting replicas or shards
2620 int32_t num_store_stats
; ///< amount of store_stats accumulated
2622 pool_stat_t() : log_size(0), ondisk_log_size(0), up(0), acting(0),
2626 void floor(int64_t f
) {
2628 store_stats
.floor(f
);
2631 if (ondisk_log_size
< f
)
2632 ondisk_log_size
= f
;
2637 if (num_store_stats
< f
)
2638 num_store_stats
= f
;
2641 void add(const store_statfs_t
& o
) {
2645 void sub(const store_statfs_t
& o
) {
2650 void add(const pg_stat_t
& o
) {
2652 log_size
+= o
.log_size
;
2653 ondisk_log_size
+= o
.ondisk_log_size
;
2655 acting
+= o
.acting
.size();
2657 void sub(const pg_stat_t
& o
) {
2659 log_size
-= o
.log_size
;
2660 ondisk_log_size
-= o
.ondisk_log_size
;
2662 acting
-= o
.acting
.size();
2665 bool is_zero() const {
2666 return (stats
.is_zero() &&
2667 store_stats
.is_zero() &&
2669 ondisk_log_size
== 0 &&
2672 num_store_stats
== 0);
2675 // helper accessors to retrieve used/netto bytes depending on the
2676 // collection method: new per-pool objectstore report or legacy PG
2677 // summation at OSD.
2678 // In legacy mode used and netto values are the same. But for new per-pool
2679 // collection 'used' provides amount of space ALLOCATED at all related OSDs
2680 // and 'netto' is amount of stored user data.
2681 uint64_t get_allocated_data_bytes(bool per_pool
) const {
2683 return store_stats
.allocated
;
2685 // legacy mode, use numbers from 'stats'
2686 return stats
.sum
.num_bytes
+ stats
.sum
.num_bytes_hit_set_archive
;
2689 uint64_t get_allocated_omap_bytes(bool per_pool_omap
) const {
2690 if (per_pool_omap
) {
2691 return store_stats
.omap_allocated
;
2693 // omap is not broken out by pool by nautilus bluestore; report the
2694 // scrub value. this will be imprecise in that it won't account for
2695 // any storage overhead/efficiency.
2696 return stats
.sum
.num_omap_bytes
;
2699 uint64_t get_user_data_bytes(float raw_used_rate
, ///< space amp factor
2700 bool per_pool
) const {
2701 // NOTE: we need the space amp factor so that we can work backwards from
2702 // the raw utilization to the amount of data that the user actually stored.
2704 return raw_used_rate
? store_stats
.data_stored
/ raw_used_rate
: 0;
2706 // legacy mode, use numbers from 'stats'. note that we do NOT use the
2707 // raw_used_rate factor here because we are working from the PG stats
2709 return stats
.sum
.num_bytes
+ stats
.sum
.num_bytes_hit_set_archive
;
2712 uint64_t get_user_omap_bytes(float raw_used_rate
, ///< space amp factor
2713 bool per_pool_omap
) const {
2714 if (per_pool_omap
) {
2715 return raw_used_rate
? store_stats
.omap_allocated
/ raw_used_rate
: 0;
2717 // omap usage is lazily reported during scrub; this value may lag.
2718 return stats
.sum
.num_omap_bytes
;
2722 void dump(ceph::Formatter
*f
) const;
2723 void encode(ceph::buffer::list
&bl
, uint64_t features
) const;
2724 void decode(ceph::buffer::list::const_iterator
&bl
);
2725 static void generate_test_instances(std::list
<pool_stat_t
*>& o
);
2727 WRITE_CLASS_ENCODER_FEATURES(pool_stat_t
)
2730 // -----------------------------------------
2733 * pg_hit_set_info_t - information about a single recorded HitSet
2735 * Track basic metadata about a HitSet, like the number of insertions
2736 * and the time range it covers.
2738 struct pg_hit_set_info_t
{
2739 utime_t begin
, end
; ///< time interval
2740 eversion_t version
; ///< version this HitSet object was written
2741 bool using_gmt
; ///< use gmt for creating the hit_set archive object name
2743 friend bool operator==(const pg_hit_set_info_t
& l
,
2744 const pg_hit_set_info_t
& r
) {
2746 l
.begin
== r
.begin
&&
2748 l
.version
== r
.version
&&
2749 l
.using_gmt
== r
.using_gmt
;
2752 explicit pg_hit_set_info_t(bool using_gmt
= true)
2753 : using_gmt(using_gmt
) {}
2755 void encode(ceph::buffer::list
&bl
) const;
2756 void decode(ceph::buffer::list::const_iterator
&bl
);
2757 void dump(ceph::Formatter
*f
) const;
2758 static void generate_test_instances(std::list
<pg_hit_set_info_t
*>& o
);
2760 WRITE_CLASS_ENCODER(pg_hit_set_info_t
)
2763 * pg_hit_set_history_t - information about a history of hitsets
2765 * Include information about the currently accumulating hit set as well
2766 * as archived/historical ones.
2768 struct pg_hit_set_history_t
{
2769 eversion_t current_last_update
; ///< last version inserted into current set
2770 std::list
<pg_hit_set_info_t
> history
; ///< archived sets, sorted oldest -> newest
2772 friend bool operator==(const pg_hit_set_history_t
& l
,
2773 const pg_hit_set_history_t
& r
) {
2775 l
.current_last_update
== r
.current_last_update
&&
2776 l
.history
== r
.history
;
2779 void encode(ceph::buffer::list
&bl
) const;
2780 void decode(ceph::buffer::list::const_iterator
&bl
);
2781 void dump(ceph::Formatter
*f
) const;
2782 static void generate_test_instances(std::list
<pg_hit_set_history_t
*>& o
);
2784 WRITE_CLASS_ENCODER(pg_hit_set_history_t
)
2787 // -----------------------------------------
2790 * pg_history_t - information about recent pg peering/mapping history
2792 * This is aggressively shared between OSDs to bound the amount of past
2793 * history they need to worry about.
2795 struct pg_history_t
{
2796 epoch_t epoch_created
= 0; // epoch in which *pg* was created (pool or pg)
2797 epoch_t epoch_pool_created
= 0; // epoch in which *pool* was created
2798 // (note: may be pg creation epoch for
2799 // pre-luminous clusters)
2800 epoch_t last_epoch_started
= 0;; // lower bound on last epoch started (anywhere, not necessarily locally)
2801 // https://docs.ceph.com/docs/master/dev/osd_internals/last_epoch_started/
2802 epoch_t last_interval_started
= 0;; // first epoch of last_epoch_started interval
2803 epoch_t last_epoch_clean
= 0;; // lower bound on last epoch the PG was completely clean.
2804 epoch_t last_interval_clean
= 0;; // first epoch of last_epoch_clean interval
2805 epoch_t last_epoch_split
= 0;; // as parent or child
2806 epoch_t last_epoch_marked_full
= 0;; // pool or cluster
2809 * In the event of a map discontinuity, same_*_since may reflect the first
2810 * map the osd has seen in the new map sequence rather than the actual start
2811 * of the interval. This is ok since a discontinuity at epoch e means there
2812 * must have been a clean interval between e and now and that we cannot be
2813 * in the active set during the interval containing e.
2815 epoch_t same_up_since
= 0;; // same acting set since
2816 epoch_t same_interval_since
= 0;; // same acting AND up set since
2817 epoch_t same_primary_since
= 0;; // same primary at least back through this epoch.
2819 eversion_t last_scrub
;
2820 eversion_t last_deep_scrub
;
2821 utime_t last_scrub_stamp
;
2822 utime_t last_deep_scrub_stamp
;
2823 utime_t last_clean_scrub_stamp
;
2825 /// upper bound on how long prior interval readable (relative to encode time)
2826 ceph::timespan prior_readable_until_ub
= ceph::timespan::zero();
2828 friend bool operator==(const pg_history_t
& l
, const pg_history_t
& r
) {
2830 l
.epoch_created
== r
.epoch_created
&&
2831 l
.epoch_pool_created
== r
.epoch_pool_created
&&
2832 l
.last_epoch_started
== r
.last_epoch_started
&&
2833 l
.last_interval_started
== r
.last_interval_started
&&
2834 l
.last_epoch_clean
== r
.last_epoch_clean
&&
2835 l
.last_interval_clean
== r
.last_interval_clean
&&
2836 l
.last_epoch_split
== r
.last_epoch_split
&&
2837 l
.last_epoch_marked_full
== r
.last_epoch_marked_full
&&
2838 l
.same_up_since
== r
.same_up_since
&&
2839 l
.same_interval_since
== r
.same_interval_since
&&
2840 l
.same_primary_since
== r
.same_primary_since
&&
2841 l
.last_scrub
== r
.last_scrub
&&
2842 l
.last_deep_scrub
== r
.last_deep_scrub
&&
2843 l
.last_scrub_stamp
== r
.last_scrub_stamp
&&
2844 l
.last_deep_scrub_stamp
== r
.last_deep_scrub_stamp
&&
2845 l
.last_clean_scrub_stamp
== r
.last_clean_scrub_stamp
&&
2846 l
.prior_readable_until_ub
== r
.prior_readable_until_ub
;
2850 pg_history_t(epoch_t created
, utime_t stamp
)
2851 : epoch_created(created
),
2852 epoch_pool_created(created
),
2853 same_up_since(created
),
2854 same_interval_since(created
),
2855 same_primary_since(created
),
2856 last_scrub_stamp(stamp
),
2857 last_deep_scrub_stamp(stamp
),
2858 last_clean_scrub_stamp(stamp
) {}
2860 bool merge(const pg_history_t
&other
) {
2861 // Here, we only update the fields which cannot be calculated from the OSDmap.
2862 bool modified
= false;
2863 if (epoch_created
< other
.epoch_created
) {
2864 epoch_created
= other
.epoch_created
;
2867 if (epoch_pool_created
< other
.epoch_pool_created
) {
2868 // FIXME: for jewel compat only; this should either be 0 or always the
2869 // same value across all pg instances.
2870 epoch_pool_created
= other
.epoch_pool_created
;
2873 if (last_epoch_started
< other
.last_epoch_started
) {
2874 last_epoch_started
= other
.last_epoch_started
;
2877 if (last_interval_started
< other
.last_interval_started
) {
2878 last_interval_started
= other
.last_interval_started
;
2879 // if we are learning about a newer *started* interval, our
2880 // readable_until_ub is obsolete
2881 prior_readable_until_ub
= other
.prior_readable_until_ub
;
2883 } else if (other
.last_interval_started
== last_interval_started
&&
2884 other
.prior_readable_until_ub
< prior_readable_until_ub
) {
2885 // if other is the *same* interval, than pull our upper bound in
2886 // if they have a tighter bound.
2887 prior_readable_until_ub
= other
.prior_readable_until_ub
;
2890 if (last_epoch_clean
< other
.last_epoch_clean
) {
2891 last_epoch_clean
= other
.last_epoch_clean
;
2894 if (last_interval_clean
< other
.last_interval_clean
) {
2895 last_interval_clean
= other
.last_interval_clean
;
2898 if (last_epoch_split
< other
.last_epoch_split
) {
2899 last_epoch_split
= other
.last_epoch_split
;
2902 if (last_epoch_marked_full
< other
.last_epoch_marked_full
) {
2903 last_epoch_marked_full
= other
.last_epoch_marked_full
;
2906 if (other
.last_scrub
> last_scrub
) {
2907 last_scrub
= other
.last_scrub
;
2910 if (other
.last_scrub_stamp
> last_scrub_stamp
) {
2911 last_scrub_stamp
= other
.last_scrub_stamp
;
2914 if (other
.last_deep_scrub
> last_deep_scrub
) {
2915 last_deep_scrub
= other
.last_deep_scrub
;
2918 if (other
.last_deep_scrub_stamp
> last_deep_scrub_stamp
) {
2919 last_deep_scrub_stamp
= other
.last_deep_scrub_stamp
;
2922 if (other
.last_clean_scrub_stamp
> last_clean_scrub_stamp
) {
2923 last_clean_scrub_stamp
= other
.last_clean_scrub_stamp
;
2929 void encode(ceph::buffer::list
& bl
) const;
2930 void decode(ceph::buffer::list::const_iterator
& p
);
2931 void dump(ceph::Formatter
*f
) const;
2932 static void generate_test_instances(std::list
<pg_history_t
*>& o
);
2934 ceph::signedspan
refresh_prior_readable_until_ub(
2935 ceph::signedspan now
, ///< now, relative to osd startup_time
2936 ceph::signedspan ub
) { ///< ub, relative to osd startup_time
2938 // prior interval(s) are unreadable; we can zero the upper bound
2939 prior_readable_until_ub
= ceph::signedspan::zero();
2940 return ceph::signedspan::zero();
2942 prior_readable_until_ub
= ub
- now
;
2946 ceph::signedspan
get_prior_readable_until_ub(ceph::signedspan now
) {
2947 if (prior_readable_until_ub
== ceph::signedspan::zero()) {
2948 return ceph::signedspan::zero();
2950 return now
+ prior_readable_until_ub
;
2953 WRITE_CLASS_ENCODER(pg_history_t
)
2955 inline std::ostream
& operator<<(std::ostream
& out
, const pg_history_t
& h
) {
2956 out
<< "ec=" << h
.epoch_created
<< "/" << h
.epoch_pool_created
2957 << " lis/c=" << h
.last_interval_started
2958 << "/" << h
.last_interval_clean
2959 << " les/c/f=" << h
.last_epoch_started
<< "/" << h
.last_epoch_clean
2960 << "/" << h
.last_epoch_marked_full
2961 << " sis=" << h
.same_interval_since
;
2962 if (h
.prior_readable_until_ub
!= ceph::timespan::zero()) {
2963 out
<< " pruub=" << h
.prior_readable_until_ub
;
2970 * pg_info_t - summary of PG statistics.
2973 * - last_complete implies we have all objects that existed as of that
2974 * stamp, OR a newer object, OR have already applied a later delete.
2975 * - if last_complete >= log.tail, then we know pg contents thru log.head.
2976 * otherwise, we have no idea what the pg is supposed to contain.
2980 eversion_t last_update
; ///< last object version applied to store.
2981 eversion_t last_complete
; ///< last version pg was complete through.
2982 epoch_t last_epoch_started
; ///< last epoch at which this pg started on this osd
2983 epoch_t last_interval_started
; ///< first epoch of last_epoch_started interval
2985 version_t last_user_version
; ///< last user object version applied to store
2987 eversion_t log_tail
; ///< oldest log entry.
2989 hobject_t last_backfill
; ///< objects >= this and < last_complete may be missing
2991 interval_set
<snapid_t
> purged_snaps
;
2995 pg_history_t history
;
2996 pg_hit_set_history_t hit_set
;
2998 friend bool operator==(const pg_info_t
& l
, const pg_info_t
& r
) {
3001 l
.last_update
== r
.last_update
&&
3002 l
.last_complete
== r
.last_complete
&&
3003 l
.last_epoch_started
== r
.last_epoch_started
&&
3004 l
.last_interval_started
== r
.last_interval_started
&&
3005 l
.last_user_version
== r
.last_user_version
&&
3006 l
.log_tail
== r
.log_tail
&&
3007 l
.last_backfill
== r
.last_backfill
&&
3008 l
.purged_snaps
== r
.purged_snaps
&&
3009 l
.stats
== r
.stats
&&
3010 l
.history
== r
.history
&&
3011 l
.hit_set
== r
.hit_set
;
3015 : last_epoch_started(0),
3016 last_interval_started(0),
3017 last_user_version(0),
3018 last_backfill(hobject_t::get_max())
3020 // cppcheck-suppress noExplicitConstructor
3023 last_epoch_started(0),
3024 last_interval_started(0),
3025 last_user_version(0),
3026 last_backfill(hobject_t::get_max())
3029 void set_last_backfill(hobject_t pos
) {
3030 last_backfill
= pos
;
3033 bool is_empty() const { return last_update
.version
== 0; }
3034 bool dne() const { return history
.epoch_created
== 0; }
3036 bool has_missing() const { return last_complete
!= last_update
; }
3037 bool is_incomplete() const { return !last_backfill
.is_max(); }
3039 void encode(ceph::buffer::list
& bl
) const;
3040 void decode(ceph::buffer::list::const_iterator
& p
);
3041 void dump(ceph::Formatter
*f
) const;
3042 static void generate_test_instances(std::list
<pg_info_t
*>& o
);
3044 WRITE_CLASS_ENCODER(pg_info_t
)
3046 inline std::ostream
& operator<<(std::ostream
& out
, const pg_info_t
& pgi
)
3048 out
<< pgi
.pgid
<< "(";
3054 out
<< " v " << pgi
.last_update
;
3055 if (pgi
.last_complete
!= pgi
.last_update
)
3056 out
<< " lc " << pgi
.last_complete
;
3057 out
<< " (" << pgi
.log_tail
<< "," << pgi
.last_update
<< "]";
3059 if (pgi
.is_incomplete())
3060 out
<< " lb " << pgi
.last_backfill
;
3061 //out << " c " << pgi.epoch_created;
3062 out
<< " local-lis/les=" << pgi
.last_interval_started
3063 << "/" << pgi
.last_epoch_started
;
3064 out
<< " n=" << pgi
.stats
.stats
.sum
.num_objects
;
3065 out
<< " " << pgi
.history
3071 * pg_fast_info_t - common pg_info_t fields
3073 * These are the fields of pg_info_t (and children) that are updated for
3074 * most IO operations.
3077 * Because we rely on these fields to be applied to the normal
3078 * info struct, adding a new field here that is not also new in info
3079 * means that we must set an incompat OSD feature bit!
3081 struct pg_fast_info_t
{
3082 eversion_t last_update
;
3083 eversion_t last_complete
;
3084 version_t last_user_version
;
3085 struct { // pg_stat_t stats
3087 version_t reported_seq
;
3089 utime_t last_active
;
3090 utime_t last_peered
;
3092 utime_t last_unstale
;
3093 utime_t last_undegraded
;
3094 utime_t last_fullsized
;
3095 int64_t log_size
; // (also ondisk_log_size, which has the same value)
3096 struct { // object_stat_collection_t stats;
3097 struct { // objct_stat_sum_t sum
3098 int64_t num_bytes
; // in bytes
3099 int64_t num_objects
;
3100 int64_t num_object_copies
;
3105 int64_t num_objects_dirty
;
3110 void populate_from(const pg_info_t
& info
) {
3111 last_update
= info
.last_update
;
3112 last_complete
= info
.last_complete
;
3113 last_user_version
= info
.last_user_version
;
3114 stats
.version
= info
.stats
.version
;
3115 stats
.reported_seq
= info
.stats
.reported_seq
;
3116 stats
.last_fresh
= info
.stats
.last_fresh
;
3117 stats
.last_active
= info
.stats
.last_active
;
3118 stats
.last_peered
= info
.stats
.last_peered
;
3119 stats
.last_clean
= info
.stats
.last_clean
;
3120 stats
.last_unstale
= info
.stats
.last_unstale
;
3121 stats
.last_undegraded
= info
.stats
.last_undegraded
;
3122 stats
.last_fullsized
= info
.stats
.last_fullsized
;
3123 stats
.log_size
= info
.stats
.log_size
;
3124 stats
.stats
.sum
.num_bytes
= info
.stats
.stats
.sum
.num_bytes
;
3125 stats
.stats
.sum
.num_objects
= info
.stats
.stats
.sum
.num_objects
;
3126 stats
.stats
.sum
.num_object_copies
= info
.stats
.stats
.sum
.num_object_copies
;
3127 stats
.stats
.sum
.num_rd
= info
.stats
.stats
.sum
.num_rd
;
3128 stats
.stats
.sum
.num_rd_kb
= info
.stats
.stats
.sum
.num_rd_kb
;
3129 stats
.stats
.sum
.num_wr
= info
.stats
.stats
.sum
.num_wr
;
3130 stats
.stats
.sum
.num_wr_kb
= info
.stats
.stats
.sum
.num_wr_kb
;
3131 stats
.stats
.sum
.num_objects_dirty
= info
.stats
.stats
.sum
.num_objects_dirty
;
3134 bool try_apply_to(pg_info_t
* info
) {
3135 if (last_update
<= info
->last_update
)
3137 info
->last_update
= last_update
;
3138 info
->last_complete
= last_complete
;
3139 info
->last_user_version
= last_user_version
;
3140 info
->stats
.version
= stats
.version
;
3141 info
->stats
.reported_seq
= stats
.reported_seq
;
3142 info
->stats
.last_fresh
= stats
.last_fresh
;
3143 info
->stats
.last_active
= stats
.last_active
;
3144 info
->stats
.last_peered
= stats
.last_peered
;
3145 info
->stats
.last_clean
= stats
.last_clean
;
3146 info
->stats
.last_unstale
= stats
.last_unstale
;
3147 info
->stats
.last_undegraded
= stats
.last_undegraded
;
3148 info
->stats
.last_fullsized
= stats
.last_fullsized
;
3149 info
->stats
.log_size
= stats
.log_size
;
3150 info
->stats
.ondisk_log_size
= stats
.log_size
;
3151 info
->stats
.stats
.sum
.num_bytes
= stats
.stats
.sum
.num_bytes
;
3152 info
->stats
.stats
.sum
.num_objects
= stats
.stats
.sum
.num_objects
;
3153 info
->stats
.stats
.sum
.num_object_copies
= stats
.stats
.sum
.num_object_copies
;
3154 info
->stats
.stats
.sum
.num_rd
= stats
.stats
.sum
.num_rd
;
3155 info
->stats
.stats
.sum
.num_rd_kb
= stats
.stats
.sum
.num_rd_kb
;
3156 info
->stats
.stats
.sum
.num_wr
= stats
.stats
.sum
.num_wr
;
3157 info
->stats
.stats
.sum
.num_wr_kb
= stats
.stats
.sum
.num_wr_kb
;
3158 info
->stats
.stats
.sum
.num_objects_dirty
= stats
.stats
.sum
.num_objects_dirty
;
3162 void encode(ceph::buffer::list
& bl
) const {
3163 ENCODE_START(1, 1, bl
);
3164 encode(last_update
, bl
);
3165 encode(last_complete
, bl
);
3166 encode(last_user_version
, bl
);
3167 encode(stats
.version
, bl
);
3168 encode(stats
.reported_seq
, bl
);
3169 encode(stats
.last_fresh
, bl
);
3170 encode(stats
.last_active
, bl
);
3171 encode(stats
.last_peered
, bl
);
3172 encode(stats
.last_clean
, bl
);
3173 encode(stats
.last_unstale
, bl
);
3174 encode(stats
.last_undegraded
, bl
);
3175 encode(stats
.last_fullsized
, bl
);
3176 encode(stats
.log_size
, bl
);
3177 encode(stats
.stats
.sum
.num_bytes
, bl
);
3178 encode(stats
.stats
.sum
.num_objects
, bl
);
3179 encode(stats
.stats
.sum
.num_object_copies
, bl
);
3180 encode(stats
.stats
.sum
.num_rd
, bl
);
3181 encode(stats
.stats
.sum
.num_rd_kb
, bl
);
3182 encode(stats
.stats
.sum
.num_wr
, bl
);
3183 encode(stats
.stats
.sum
.num_wr_kb
, bl
);
3184 encode(stats
.stats
.sum
.num_objects_dirty
, bl
);
3187 void decode(ceph::buffer::list::const_iterator
& p
) {
3189 decode(last_update
, p
);
3190 decode(last_complete
, p
);
3191 decode(last_user_version
, p
);
3192 decode(stats
.version
, p
);
3193 decode(stats
.reported_seq
, p
);
3194 decode(stats
.last_fresh
, p
);
3195 decode(stats
.last_active
, p
);
3196 decode(stats
.last_peered
, p
);
3197 decode(stats
.last_clean
, p
);
3198 decode(stats
.last_unstale
, p
);
3199 decode(stats
.last_undegraded
, p
);
3200 decode(stats
.last_fullsized
, p
);
3201 decode(stats
.log_size
, p
);
3202 decode(stats
.stats
.sum
.num_bytes
, p
);
3203 decode(stats
.stats
.sum
.num_objects
, p
);
3204 decode(stats
.stats
.sum
.num_object_copies
, p
);
3205 decode(stats
.stats
.sum
.num_rd
, p
);
3206 decode(stats
.stats
.sum
.num_rd_kb
, p
);
3207 decode(stats
.stats
.sum
.num_wr
, p
);
3208 decode(stats
.stats
.sum
.num_wr_kb
, p
);
3209 decode(stats
.stats
.sum
.num_objects_dirty
, p
);
3213 WRITE_CLASS_ENCODER(pg_fast_info_t
)
3217 * PastIntervals -- information needed to determine the PriorSet and
3218 * the might_have_unfound set
3220 class PastIntervals
{
3222 using OSDMapRef
= boost::local_shared_ptr
<const OSDMap
>;
3224 using OSDMapRef
= std::shared_ptr
<const OSDMap
>;
3227 struct pg_interval_t
{
3228 std::vector
<int32_t> up
, acting
;
3229 epoch_t first
, last
;
3235 : first(0), last(0),
3236 maybe_went_rw(false),
3242 std::vector
<int32_t> &&up
,
3243 std::vector
<int32_t> &&acting
,
3249 : up(up
), acting(acting
), first(first
), last(last
),
3250 maybe_went_rw(maybe_went_rw
), primary(primary
), up_primary(up_primary
)
3253 void encode(ceph::buffer::list
& bl
) const;
3254 void decode(ceph::buffer::list::const_iterator
& bl
);
3255 void dump(ceph::Formatter
*f
) const;
3256 static void generate_test_instances(std::list
<pg_interval_t
*>& o
);
3260 PastIntervals(PastIntervals
&&rhs
) = default;
3261 PastIntervals
&operator=(PastIntervals
&&rhs
) = default;
3263 PastIntervals(const PastIntervals
&rhs
);
3264 PastIntervals
&operator=(const PastIntervals
&rhs
);
3266 class interval_rep
{
3268 virtual size_t size() const = 0;
3269 virtual bool empty() const = 0;
3270 virtual void clear() = 0;
3271 virtual std::pair
<epoch_t
, epoch_t
> get_bounds() const = 0;
3272 virtual std::set
<pg_shard_t
> get_all_participants(
3273 bool ec_pool
) const = 0;
3274 virtual void add_interval(bool ec_pool
, const pg_interval_t
&interval
) = 0;
3275 virtual std::unique_ptr
<interval_rep
> clone() const = 0;
3276 virtual std::ostream
&print(std::ostream
&out
) const = 0;
3277 virtual void encode(ceph::buffer::list
&bl
) const = 0;
3278 virtual void decode(ceph::buffer::list::const_iterator
&bl
) = 0;
3279 virtual void dump(ceph::Formatter
*f
) const = 0;
3280 virtual void iterate_mayberw_back_to(
3282 std::function
<void(epoch_t
, const std::set
<pg_shard_t
> &)> &&f
) const = 0;
3284 virtual bool has_full_intervals() const { return false; }
3285 virtual void iterate_all_intervals(
3286 std::function
<void(const pg_interval_t
&)> &&f
) const {
3287 ceph_assert(!has_full_intervals());
3288 ceph_abort_msg("not valid for this implementation");
3290 virtual void adjust_start_backwards(epoch_t last_epoch_clean
) = 0;
3292 virtual ~interval_rep() {}
3294 friend class pi_compact_rep
;
3297 std::unique_ptr
<interval_rep
> past_intervals
;
3299 explicit PastIntervals(interval_rep
*rep
) : past_intervals(rep
) {}
3302 void add_interval(bool ec_pool
, const pg_interval_t
&interval
) {
3303 ceph_assert(past_intervals
);
3304 return past_intervals
->add_interval(ec_pool
, interval
);
3307 void encode(ceph::buffer::list
&bl
) const {
3308 ENCODE_START(1, 1, bl
);
3309 if (past_intervals
) {
3312 past_intervals
->encode(bl
);
3314 encode((__u8
)0, bl
);
3319 void decode(ceph::buffer::list::const_iterator
&bl
);
3321 void dump(ceph::Formatter
*f
) const {
3322 ceph_assert(past_intervals
);
3323 past_intervals
->dump(f
);
3325 static void generate_test_instances(std::list
<PastIntervals
*> & o
);
3328 * Determines whether there is an interval change
3330 static bool is_new_interval(
3331 int old_acting_primary
,
3332 int new_acting_primary
,
3333 const std::vector
<int> &old_acting
,
3334 const std::vector
<int> &new_acting
,
3337 const std::vector
<int> &old_up
,
3338 const std::vector
<int> &new_up
,
3343 unsigned old_pg_num
,
3344 unsigned new_pg_num
,
3345 unsigned old_pg_num_pending
,
3346 unsigned new_pg_num_pending
,
3347 bool old_sort_bitwise
,
3348 bool new_sort_bitwise
,
3349 bool old_recovery_deletes
,
3350 bool new_recovery_deletes
,
3351 uint32_t old_crush_count
,
3352 uint32_t new_crush_count
,
3353 uint32_t old_crush_target
,
3354 uint32_t new_crush_target
,
3355 uint32_t old_crush_barrier
,
3356 uint32_t new_crush_barrier
,
3357 int32_t old_crush_member
,
3358 int32_t new_crush_member
,
3363 * Determines whether there is an interval change
3365 static bool is_new_interval(
3366 int old_acting_primary
, ///< [in] primary as of lastmap
3367 int new_acting_primary
, ///< [in] primary as of lastmap
3368 const std::vector
<int> &old_acting
, ///< [in] acting as of lastmap
3369 const std::vector
<int> &new_acting
, ///< [in] acting as of osdmap
3370 int old_up_primary
, ///< [in] up primary of lastmap
3371 int new_up_primary
, ///< [in] up primary of osdmap
3372 const std::vector
<int> &old_up
, ///< [in] up as of lastmap
3373 const std::vector
<int> &new_up
, ///< [in] up as of osdmap
3374 const OSDMap
*osdmap
, ///< [in] current map
3375 const OSDMap
*lastmap
, ///< [in] last map
3376 pg_t pgid
///< [in] pgid for pg
3380 * Integrates a new map into *past_intervals, returns true
3381 * if an interval was closed out.
3383 static bool check_new_interval(
3384 int old_acting_primary
, ///< [in] primary as of lastmap
3385 int new_acting_primary
, ///< [in] primary as of osdmap
3386 const std::vector
<int> &old_acting
, ///< [in] acting as of lastmap
3387 const std::vector
<int> &new_acting
, ///< [in] acting as of osdmap
3388 int old_up_primary
, ///< [in] up primary of lastmap
3389 int new_up_primary
, ///< [in] up primary of osdmap
3390 const std::vector
<int> &old_up
, ///< [in] up as of lastmap
3391 const std::vector
<int> &new_up
, ///< [in] up as of osdmap
3392 epoch_t same_interval_since
, ///< [in] as of osdmap
3393 epoch_t last_epoch_clean
, ///< [in] current
3394 const OSDMap
*osdmap
, ///< [in] current map
3395 const OSDMap
*lastmap
, ///< [in] last map
3396 pg_t pgid
, ///< [in] pgid for pg
3397 const IsPGRecoverablePredicate
&could_have_gone_active
, ///< [in] predicate whether the pg can be active
3398 PastIntervals
*past_intervals
, ///< [out] intervals
3399 std::ostream
*out
= 0 ///< [out] debug ostream
3401 static bool check_new_interval(
3402 int old_acting_primary
, ///< [in] primary as of lastmap
3403 int new_acting_primary
, ///< [in] primary as of osdmap
3404 const std::vector
<int> &old_acting
, ///< [in] acting as of lastmap
3405 const std::vector
<int> &new_acting
, ///< [in] acting as of osdmap
3406 int old_up_primary
, ///< [in] up primary of lastmap
3407 int new_up_primary
, ///< [in] up primary of osdmap
3408 const std::vector
<int> &old_up
, ///< [in] up as of lastmap
3409 const std::vector
<int> &new_up
, ///< [in] up as of osdmap
3410 epoch_t same_interval_since
, ///< [in] as of osdmap
3411 epoch_t last_epoch_clean
, ///< [in] current
3412 OSDMapRef osdmap
, ///< [in] current map
3413 OSDMapRef lastmap
, ///< [in] last map
3414 pg_t pgid
, ///< [in] pgid for pg
3415 const IsPGRecoverablePredicate
&could_have_gone_active
, ///< [in] predicate whether the pg can be active
3416 PastIntervals
*past_intervals
, ///< [out] intervals
3417 std::ostream
*out
= 0 ///< [out] debug ostream
3419 return check_new_interval(
3420 old_acting_primary
, new_acting_primary
,
3421 old_acting
, new_acting
,
3422 old_up_primary
, new_up_primary
,
3424 same_interval_since
, last_epoch_clean
,
3425 osdmap
.get(), lastmap
.get(),
3427 could_have_gone_active
,
3432 friend std::ostream
& operator<<(std::ostream
& out
, const PastIntervals
&i
);
3434 template <typename F
>
3435 void iterate_mayberw_back_to(
3438 ceph_assert(past_intervals
);
3439 past_intervals
->iterate_mayberw_back_to(les
, std::forward
<F
>(f
));
3442 ceph_assert(past_intervals
);
3443 past_intervals
->clear();
3447 * Should return a value which gives an indication of the amount
3448 * of state contained
3450 size_t size() const {
3451 ceph_assert(past_intervals
);
3452 return past_intervals
->size();
3455 bool empty() const {
3456 ceph_assert(past_intervals
);
3457 return past_intervals
->empty();
3460 void swap(PastIntervals
&other
) {
3462 swap(other
.past_intervals
, past_intervals
);
3466 * Return all shards which have been in the acting set back to the
3467 * latest epoch to which we have trimmed except for pg_whoami
3469 std::set
<pg_shard_t
> get_might_have_unfound(
3470 pg_shard_t pg_whoami
,
3471 bool ec_pool
) const {
3472 ceph_assert(past_intervals
);
3473 auto ret
= past_intervals
->get_all_participants(ec_pool
);
3474 ret
.erase(pg_whoami
);
3479 * Return all shards which we might want to talk to for peering
3481 std::set
<pg_shard_t
> get_all_probe(
3482 bool ec_pool
) const {
3483 ceph_assert(past_intervals
);
3484 return past_intervals
->get_all_participants(ec_pool
);
3487 /* Return the set of epochs [start, end) represented by the
3488 * past_interval set.
3490 std::pair
<epoch_t
, epoch_t
> get_bounds() const {
3491 ceph_assert(past_intervals
);
3492 return past_intervals
->get_bounds();
3495 void adjust_start_backwards(epoch_t last_epoch_clean
) {
3496 ceph_assert(past_intervals
);
3497 past_intervals
->adjust_start_backwards(last_epoch_clean
);
3507 bool ec_pool
= false;
3508 std::set
<pg_shard_t
> probe
; ///< current+prior OSDs we need to probe.
3509 std::set
<int> down
; ///< down osds that would normally be in @a probe and might be interesting.
3510 std::map
<int, epoch_t
> blocked_by
; ///< current lost_at values for any OSDs in cur set for which (re)marking them lost would affect cur set
3512 bool pg_down
= false; ///< some down osds are included in @a cur; the DOWN pg state bit should be set.
3513 const IsPGRecoverablePredicate
* pcontdec
= nullptr;
3515 PriorSet() = default;
3516 PriorSet(PriorSet
&&) = default;
3517 PriorSet
&operator=(PriorSet
&&) = default;
3519 PriorSet
&operator=(const PriorSet
&) = delete;
3520 PriorSet(const PriorSet
&) = delete;
3522 bool operator==(const PriorSet
&rhs
) const {
3523 return (ec_pool
== rhs
.ec_pool
) &&
3524 (probe
== rhs
.probe
) &&
3525 (down
== rhs
.down
) &&
3526 (blocked_by
== rhs
.blocked_by
) &&
3527 (pg_down
== rhs
.pg_down
);
3530 bool affected_by_map(
3531 const OSDMap
&osdmap
,
3532 const DoutPrefixProvider
*dpp
) const;
3534 // For verifying tests
3537 std::set
<pg_shard_t
> probe
,
3539 std::map
<int, epoch_t
> blocked_by
,
3541 const IsPGRecoverablePredicate
*pcontdec
)
3542 : ec_pool(ec_pool
), probe(probe
), down(down
), blocked_by(blocked_by
),
3543 pg_down(pg_down
), pcontdec(pcontdec
) {}
3546 template <typename F
>
3548 const PastIntervals
&past_intervals
,
3550 epoch_t last_epoch_started
,
3551 const IsPGRecoverablePredicate
*c
,
3553 const std::vector
<int> &up
,
3554 const std::vector
<int> &acting
,
3555 const DoutPrefixProvider
*dpp
);
3557 friend class PastIntervals
;
3560 template <typename
... Args
>
3561 PriorSet
get_prior_set(Args
&&... args
) const {
3562 return PriorSet(*this, std::forward
<Args
>(args
)...);
3565 WRITE_CLASS_ENCODER(PastIntervals
)
3567 std::ostream
& operator<<(std::ostream
& out
, const PastIntervals::pg_interval_t
& i
);
3568 std::ostream
& operator<<(std::ostream
& out
, const PastIntervals
&i
);
3569 std::ostream
& operator<<(std::ostream
& out
, const PastIntervals::PriorSet
&i
);
3571 template <typename F
>
3572 PastIntervals::PriorSet::PriorSet(
3573 const PastIntervals
&past_intervals
,
3575 epoch_t last_epoch_started
,
3576 const IsPGRecoverablePredicate
*c
,
3578 const std::vector
<int> &up
,
3579 const std::vector
<int> &acting
,
3580 const DoutPrefixProvider
*dpp
)
3581 : ec_pool(ec_pool
), pg_down(false), pcontdec(c
)
3584 * We have to be careful to gracefully deal with situations like
3585 * so. Say we have a power outage or something that takes out both
3586 * OSDs, but the monitor doesn't mark them down in the same epoch.
3587 * The history may look like
3591 * 3: let's say B dies for good, too (say, from the power spike)
3594 * which makes it look like B may have applied updates to the PG
3595 * that we need in order to proceed. This sucks...
3597 * To minimize the risk of this happening, we CANNOT go active if
3598 * _any_ OSDs in the prior set are down until we send an MOSDAlive
3599 * to the monitor such that the OSDMap sets osd_up_thru to an epoch.
3600 * Then, we have something like
3607 * -> we can ignore B, bc it couldn't have gone active (alive_thru
3618 * -> we must wait for B, bc it was alive through 2, and could have
3619 * written to the pg.
3621 * If B is really dead, then an administrator will need to manually
3622 * intervene by marking the OSD as "lost."
3625 // Include current acting and up nodes... not because they may
3626 // contain old data (this interval hasn't gone active, obviously),
3627 // but because we want their pg_info to inform choose_acting(), and
3628 // so that we know what they do/do not have explicitly before
3629 // sending them any new info/logs/whatever.
3630 for (unsigned i
= 0; i
< acting
.size(); i
++) {
3631 if (acting
[i
] != pg_pool_t::pg_CRUSH_ITEM_NONE
)
3632 probe
.insert(pg_shard_t(acting
[i
], ec_pool
? shard_id_t(i
) : shard_id_t::NO_SHARD
));
3634 // It may be possible to exclude the up nodes, but let's keep them in
3636 for (unsigned i
= 0; i
< up
.size(); i
++) {
3637 if (up
[i
] != pg_pool_t::pg_CRUSH_ITEM_NONE
)
3638 probe
.insert(pg_shard_t(up
[i
], ec_pool
? shard_id_t(i
) : shard_id_t::NO_SHARD
));
3641 std::set
<pg_shard_t
> all_probe
= past_intervals
.get_all_probe(ec_pool
);
3642 ldpp_dout(dpp
, 10) << "build_prior all_probe " << all_probe
<< dendl
;
3643 for (auto &&i
: all_probe
) {
3644 switch (f(0, i
.osd
, nullptr)) {
3658 past_intervals
.iterate_mayberw_back_to(
3660 [&](epoch_t start
, const std::set
<pg_shard_t
> &acting
) {
3661 ldpp_dout(dpp
, 10) << "build_prior maybe_rw interval:" << start
3662 << ", acting: " << acting
<< dendl
;
3664 // look at candidate osds during this interval. each falls into
3665 // one of three categories: up, down (but potentially
3666 // interesting), or lost (down, but we won't wait for it).
3667 std::set
<pg_shard_t
> up_now
;
3668 std::map
<int, epoch_t
> candidate_blocked_by
;
3669 // any candidates down now (that might have useful data)
3670 bool any_down_now
= false;
3672 // consider ACTING osds
3673 for (auto &&so
: acting
) {
3674 epoch_t lost_at
= 0;
3675 switch (f(start
, so
.osd
, &lost_at
)) {
3677 // include past acting osds if they are up.
3682 ldpp_dout(dpp
, 10) << "build_prior prior osd." << so
.osd
3683 << " no longer exists" << dendl
;
3687 ldpp_dout(dpp
, 10) << "build_prior prior osd." << so
.osd
3688 << " is down, but lost_at " << lost_at
<< dendl
;
3693 ldpp_dout(dpp
, 10) << "build_prior prior osd." << so
.osd
3694 << " is down" << dendl
;
3695 candidate_blocked_by
[so
.osd
] = lost_at
;
3696 any_down_now
= true;
3702 // if not enough osds survived this interval, and we may have gone rw,
3703 // then we need to wait for one of those osds to recover to
3704 // ensure that we haven't lost any information.
3705 if (!(*pcontdec
)(up_now
) && any_down_now
) {
3706 // fixme: how do we identify a "clean" shutdown anyway?
3707 ldpp_dout(dpp
, 10) << "build_prior possibly went active+rw,"
3708 << " insufficient up; including down osds" << dendl
;
3709 ceph_assert(!candidate_blocked_by
.empty());
3712 candidate_blocked_by
.begin(),
3713 candidate_blocked_by
.end());
3717 ldpp_dout(dpp
, 10) << "build_prior final: probe " << probe
3719 << " blocked_by " << blocked_by
3720 << (pg_down
? " pg_down":"")
3724 struct pg_notify_t
{
3725 epoch_t query_epoch
;
3730 PastIntervals past_intervals
;
3732 query_epoch(0), epoch_sent(0), to(shard_id_t::NO_SHARD
),
3733 from(shard_id_t::NO_SHARD
) {}
3737 epoch_t query_epoch
,
3739 const pg_info_t
&info
,
3740 const PastIntervals
& pi
)
3741 : query_epoch(query_epoch
),
3742 epoch_sent(epoch_sent
),
3743 info(info
), to(to
), from(from
),
3744 past_intervals(pi
) {
3745 ceph_assert(from
== info
.pgid
.shard
);
3747 void encode(ceph::buffer::list
&bl
) const;
3748 void decode(ceph::buffer::list::const_iterator
&p
);
3749 void dump(ceph::Formatter
*f
) const;
3750 static void generate_test_instances(std::list
<pg_notify_t
*> &o
);
3752 WRITE_CLASS_ENCODER(pg_notify_t
)
3753 std::ostream
&operator<<(std::ostream
&lhs
, const pg_notify_t
¬ify
);
3757 * pg_query_t - used to ask a peer for information about a pg.
3759 * note: if version=0, type=LOG, then we just provide our full log.
3768 std::string_view
get_type_name() const {
3770 case INFO
: return "info";
3771 case LOG
: return "log";
3772 case MISSING
: return "missing";
3773 case FULLLOG
: return "fulllog";
3774 default: return "???";
3780 pg_history_t history
;
3785 pg_query_t() : type(-1), epoch_sent(0), to(shard_id_t::NO_SHARD
),
3786 from(shard_id_t::NO_SHARD
) {}
3791 const pg_history_t
& h
,
3795 epoch_sent(epoch_sent
),
3796 to(to
), from(from
) {
3797 ceph_assert(t
!= LOG
);
3804 const pg_history_t
& h
,
3806 : type(t
), since(s
), history(h
),
3807 epoch_sent(epoch_sent
), to(to
), from(from
) {
3808 ceph_assert(t
== LOG
);
3811 void encode(ceph::buffer::list
&bl
, uint64_t features
) const;
3812 void decode(ceph::buffer::list::const_iterator
&bl
);
3814 void dump(ceph::Formatter
*f
) const;
3815 static void generate_test_instances(std::list
<pg_query_t
*>& o
);
3817 WRITE_CLASS_ENCODER_FEATURES(pg_query_t
)
3819 inline std::ostream
& operator<<(std::ostream
& out
, const pg_query_t
& q
) {
3820 out
<< "query(" << q
.get_type_name() << " " << q
.since
;
3821 if (q
.type
== pg_query_t::LOG
)
3822 out
<< " " << q
.history
;
3823 out
<< " epoch_sent " << q
.epoch_sent
;
3829 * pg_lease_t - readable lease metadata, from primary -> non-primary
3831 * This metadata serves to increase either or both of the lease expiration
3832 * and upper bound on the non-primary.
3835 /// pg readable_until value; replicas must not be readable beyond this
3836 ceph::signedspan readable_until
= ceph::signedspan::zero();
3838 /// upper bound on any acting osd's readable_until
3839 ceph::signedspan readable_until_ub
= ceph::signedspan::zero();
3841 /// duration of the lease (in case clock deltas aren't available)
3842 ceph::signedspan interval
= ceph::signedspan::zero();
3845 pg_lease_t(ceph::signedspan ru
, ceph::signedspan ruub
,
3847 : readable_until(ru
),
3848 readable_until_ub(ruub
),
3851 void encode(ceph::buffer::list
&bl
) const;
3852 void decode(ceph::buffer::list::const_iterator
&bl
);
3853 void dump(ceph::Formatter
*f
) const;
3854 static void generate_test_instances(std::list
<pg_lease_t
*>& o
);
3856 friend std::ostream
& operator<<(std::ostream
& out
, const pg_lease_t
& l
) {
3857 return out
<< "pg_lease(ru " << l
.readable_until
3858 << " ub " << l
.readable_until_ub
3859 << " int " << l
.interval
<< ")";
3862 WRITE_CLASS_ENCODER(pg_lease_t
)
3865 * pg_lease_ack_t - lease ack, from non-primary -> primary
3867 * This metadata acknowledges to the primary what a non-primary's noted
3870 struct pg_lease_ack_t
{
3871 /// highest upper bound non-primary has recorded (primary's clock)
3872 ceph::signedspan readable_until_ub
= ceph::signedspan::zero();
3875 pg_lease_ack_t(ceph::signedspan ub
)
3876 : readable_until_ub(ub
) {}
3878 void encode(ceph::buffer::list
&bl
) const;
3879 void decode(ceph::buffer::list::const_iterator
&bl
);
3880 void dump(ceph::Formatter
*f
) const;
3881 static void generate_test_instances(std::list
<pg_lease_ack_t
*>& o
);
3883 friend std::ostream
& operator<<(std::ostream
& out
, const pg_lease_ack_t
& l
) {
3884 return out
<< "pg_lease_ack(ruub " << l
.readable_until_ub
<< ")";
3887 WRITE_CLASS_ENCODER(pg_lease_ack_t
)
3892 class ObjectModDesc
{
3893 bool can_local_rollback
;
3894 bool rollback_info_completed
;
3896 // version required to decode, reflected in encode/decode version
3897 __u8 max_required_version
= 1;
3901 virtual void append(uint64_t old_offset
) {}
3902 virtual void setattrs(std::map
<std::string
, std::optional
<ceph::buffer::list
>> &attrs
) {}
3903 virtual void rmobject(version_t old_version
) {}
3905 * Used to support the unfound_lost_delete log event: if the stashed
3906 * version exists, we unstash it, otherwise, we do nothing. This way
3907 * each replica rolls back to whatever state it had prior to the attempt
3908 * at mark unfound lost delete
3910 virtual void try_rmobject(version_t old_version
) {
3911 rmobject(old_version
);
3913 virtual void create() {}
3914 virtual void update_snaps(const std::set
<snapid_t
> &old_snaps
) {}
3915 virtual void rollback_extents(
3917 const std::vector
<std::pair
<uint64_t, uint64_t> > &extents
) {}
3918 virtual ~Visitor() {}
3920 void visit(Visitor
*visitor
) const;
3921 mutable ceph::buffer::list bl
;
3929 ROLLBACK_EXTENTS
= 7
3931 ObjectModDesc() : can_local_rollback(true), rollback_info_completed(false) {
3932 bl
.reassign_to_mempool(mempool::mempool_osd_pglog
);
3934 void claim(ObjectModDesc
&other
) {
3935 bl
= std::move(other
.bl
);
3936 can_local_rollback
= other
.can_local_rollback
;
3937 rollback_info_completed
= other
.rollback_info_completed
;
3939 void claim_append(ObjectModDesc
&other
) {
3940 if (!can_local_rollback
|| rollback_info_completed
)
3942 if (!other
.can_local_rollback
) {
3943 mark_unrollbackable();
3946 bl
.claim_append(other
.bl
);
3947 rollback_info_completed
= other
.rollback_info_completed
;
3949 void swap(ObjectModDesc
&other
) {
3953 swap(other
.can_local_rollback
, can_local_rollback
);
3954 swap(other
.rollback_info_completed
, rollback_info_completed
);
3955 swap(other
.max_required_version
, max_required_version
);
3957 void append_id(ModID id
) {
3962 void append(uint64_t old_size
) {
3963 if (!can_local_rollback
|| rollback_info_completed
)
3965 ENCODE_START(1, 1, bl
);
3967 encode(old_size
, bl
);
3970 void setattrs(std::map
<std::string
, std::optional
<ceph::buffer::list
>> &old_attrs
) {
3971 if (!can_local_rollback
|| rollback_info_completed
)
3973 ENCODE_START(1, 1, bl
);
3974 append_id(SETATTRS
);
3975 encode(old_attrs
, bl
);
3978 bool rmobject(version_t deletion_version
) {
3979 if (!can_local_rollback
|| rollback_info_completed
)
3981 ENCODE_START(1, 1, bl
);
3983 encode(deletion_version
, bl
);
3985 rollback_info_completed
= true;
3988 bool try_rmobject(version_t deletion_version
) {
3989 if (!can_local_rollback
|| rollback_info_completed
)
3991 ENCODE_START(1, 1, bl
);
3992 append_id(TRY_DELETE
);
3993 encode(deletion_version
, bl
);
3995 rollback_info_completed
= true;
3999 if (!can_local_rollback
|| rollback_info_completed
)
4001 rollback_info_completed
= true;
4002 ENCODE_START(1, 1, bl
);
4006 void update_snaps(const std::set
<snapid_t
> &old_snaps
) {
4007 if (!can_local_rollback
|| rollback_info_completed
)
4009 ENCODE_START(1, 1, bl
);
4010 append_id(UPDATE_SNAPS
);
4011 encode(old_snaps
, bl
);
4014 void rollback_extents(
4015 version_t gen
, const std::vector
<std::pair
<uint64_t, uint64_t> > &extents
) {
4016 ceph_assert(can_local_rollback
);
4017 ceph_assert(!rollback_info_completed
);
4018 if (max_required_version
< 2)
4019 max_required_version
= 2;
4020 ENCODE_START(2, 2, bl
);
4021 append_id(ROLLBACK_EXTENTS
);
4023 encode(extents
, bl
);
4027 // cannot be rolled back
4028 void mark_unrollbackable() {
4029 can_local_rollback
= false;
4032 bool can_rollback() const {
4033 return can_local_rollback
;
4035 bool empty() const {
4036 return can_local_rollback
&& (bl
.length() == 0);
4039 bool requires_kraken() const {
4040 return max_required_version
>= 2;
4044 * Create fresh copy of bl bytes to avoid keeping large buffers around
4045 * in the case that bl contains ptrs which point into a much larger
4048 void trim_bl() const {
4049 if (bl
.length() > 0)
4052 void encode(ceph::buffer::list
&bl
) const;
4053 void decode(ceph::buffer::list::const_iterator
&bl
);
4054 void dump(ceph::Formatter
*f
) const;
4055 static void generate_test_instances(std::list
<ObjectModDesc
*>& o
);
4057 WRITE_CLASS_ENCODER(ObjectModDesc
)
4059 class ObjectCleanRegions
{
4063 interval_set
<uint64_t> clean_offsets
;
4064 static std::atomic
<uint32_t> max_num_intervals
;
4067 * trim the number of intervals if clean_offsets.num_intervals()
4068 * exceeds the given upbound max_num_intervals
4069 * etc. max_num_intervals=2, clean_offsets:{[5~10], [20~5]}
4070 * then new interval [30~10] will evict out the shortest one [20~5]
4071 * finally, clean_offsets becomes {[5~10], [30~10]}
4074 friend std::ostream
& operator<<(std::ostream
& out
, const ObjectCleanRegions
& ocr
);
4076 ObjectCleanRegions() : new_object(false), clean_omap(true) {
4077 clean_offsets
.insert(0, (uint64_t)-1);
4079 ObjectCleanRegions(uint64_t offset
, uint64_t len
, bool co
)
4080 : new_object(false), clean_omap(co
) {
4081 clean_offsets
.insert(offset
, len
);
4083 bool operator==(const ObjectCleanRegions
&orc
) const {
4084 return new_object
== orc
.new_object
&& clean_omap
== orc
.clean_omap
&& clean_offsets
== orc
.clean_offsets
;
4086 static void set_max_num_intervals(uint32_t num
);
4087 void merge(const ObjectCleanRegions
&other
);
4088 void mark_data_region_dirty(uint64_t offset
, uint64_t len
);
4089 void mark_omap_dirty();
4090 void mark_object_new();
4091 void mark_fully_dirty();
4092 interval_set
<uint64_t> get_dirty_regions() const;
4093 bool omap_is_dirty() const;
4094 bool object_is_exist() const;
4095 bool is_clean_region(uint64_t offset
, uint64_t len
) const;
4097 void encode(ceph::buffer::list
&bl
) const;
4098 void decode(ceph::buffer::list::const_iterator
&bl
);
4099 void dump(ceph::Formatter
*f
) const;
4100 static void generate_test_instances(std::list
<ObjectCleanRegions
*>& o
);
4102 WRITE_CLASS_ENCODER(ObjectCleanRegions
)
4103 std::ostream
& operator<<(std::ostream
& out
, const ObjectCleanRegions
& ocr
);
4110 ceph::buffer::list indata
, outdata
;
4111 errorcode32_t rval
= 0;
4114 // FIPS zeroization audit 20191115: this memset clean for security
4115 memset(&op
, 0, sizeof(ceph_osd_op
));
4118 OSDOp(const int op_code
) {
4119 // FIPS zeroization audit 20191115: this memset clean for security
4120 memset(&op
, 0, sizeof(ceph_osd_op
));
4125 * split a ceph::buffer::list into constituent indata members of a vector of OSDOps
4127 * @param ops [out] vector of OSDOps
4128 * @param in [in] combined data buffer
4130 template<typename V
>
4131 static void split_osd_op_vector_in_data(V
& ops
,
4132 ceph::buffer::list
& in
) {
4133 ceph::buffer::list::iterator datap
= in
.begin();
4134 for (unsigned i
= 0; i
< ops
.size(); i
++) {
4135 if (ops
[i
].op
.payload_len
) {
4136 datap
.copy(ops
[i
].op
.payload_len
, ops
[i
].indata
);
4142 * merge indata members of a vector of OSDOp into a single ceph::buffer::list
4144 * Notably this also encodes certain other OSDOp data into the data
4145 * buffer, including the sobject_t soid.
4147 * @param ops [in] vector of OSDOps
4148 * @param out [out] combined data buffer
4150 template<typename V
>
4151 static void merge_osd_op_vector_in_data(V
& ops
, ceph::buffer::list
& out
) {
4152 for (unsigned i
= 0; i
< ops
.size(); i
++) {
4153 if (ops
[i
].indata
.length()) {
4154 ops
[i
].op
.payload_len
= ops
[i
].indata
.length();
4155 out
.append(ops
[i
].indata
);
4161 * split a ceph::buffer::list into constituent outdata members of a vector of OSDOps
4163 * @param ops [out] vector of OSDOps
4164 * @param in [in] combined data buffer
4166 static void split_osd_op_vector_out_data(std::vector
<OSDOp
>& ops
, ceph::buffer::list
& in
);
4169 * merge outdata members of a vector of OSDOps into a single ceph::buffer::list
4171 * @param ops [in] vector of OSDOps
4172 * @param out [out] combined data buffer
4174 static void merge_osd_op_vector_out_data(std::vector
<OSDOp
>& ops
, ceph::buffer::list
& out
);
4177 * Clear data as much as possible, leave minimal data for historical op dump
4179 * @param ops [in] vector of OSDOps
4181 template<typename V
>
4182 static void clear_data(V
& ops
) {
4183 for (unsigned i
= 0; i
< ops
.size(); i
++) {
4186 if (ceph_osd_op_type_attr(op
.op
.op
) &&
4187 op
.op
.xattr
.name_len
&&
4188 op
.indata
.length() >= op
.op
.xattr
.name_len
) {
4189 ceph::buffer::list bl
;
4190 bl
.push_back(ceph::buffer::ptr_node::create(op
.op
.xattr
.name_len
));
4191 bl
.begin().copy_in(op
.op
.xattr
.name_len
, op
.indata
);
4192 op
.indata
= std::move(bl
);
4193 } else if (ceph_osd_op_type_exec(op
.op
.op
) &&
4194 op
.op
.cls
.class_len
&&
4195 op
.indata
.length() >
4196 (op
.op
.cls
.class_len
+ op
.op
.cls
.method_len
)) {
4197 __u8 len
= op
.op
.cls
.class_len
+ op
.op
.cls
.method_len
;
4198 ceph::buffer::list bl
;
4199 bl
.push_back(ceph::buffer::ptr_node::create(len
));
4200 bl
.begin().copy_in(len
, op
.indata
);
4201 op
.indata
= std::move(bl
);
4208 std::ostream
& operator<<(std::ostream
& out
, const OSDOp
& op
);
4210 struct pg_log_op_return_item_t
{
4212 ceph::buffer::list bl
;
4213 void encode(ceph::buffer::list
& p
) const {
4218 void decode(ceph::buffer::list::const_iterator
& p
) {
4223 void dump(ceph::Formatter
*f
) const {
4224 f
->dump_int("rval", rval
);
4225 f
->dump_unsigned("bl_length", bl
.length());
4227 friend bool operator==(const pg_log_op_return_item_t
& lhs
,
4228 const pg_log_op_return_item_t
& rhs
) {
4229 return lhs
.rval
== rhs
.rval
&&
4230 lhs
.bl
.contents_equal(rhs
.bl
);
4232 friend bool operator!=(const pg_log_op_return_item_t
& lhs
,
4233 const pg_log_op_return_item_t
& rhs
) {
4234 return !(lhs
== rhs
);
4236 friend std::ostream
& operator<<(std::ostream
& out
, const pg_log_op_return_item_t
& i
) {
4237 return out
<< "r=" << i
.rval
<< "+" << i
.bl
.length() << "b";
4240 WRITE_CLASS_ENCODER(pg_log_op_return_item_t
)
4243 * pg_log_entry_t - single entry/event in pg log
4246 struct pg_log_entry_t
{
4248 MODIFY
= 1, // some unspecified modification (but not *all* modifications)
4249 CLONE
= 2, // cloned object from head
4250 DELETE
= 3, // deleted object
4251 //BACKLOG = 4, // event invented by generate_backlog [obsolete]
4252 LOST_REVERT
= 5, // lost new version, revert to an older version.
4253 LOST_DELETE
= 6, // lost new version, revert to no object (deleted).
4254 LOST_MARK
= 7, // lost new version, now EIO
4255 PROMOTE
= 8, // promoted object from another tier
4256 CLEAN
= 9, // mark an object clean
4257 ERROR
= 10, // write that returned an error
4259 static const char *get_op_name(int op
) {
4283 const char *get_op_name() const {
4284 return get_op_name(op
);
4287 // describes state for a locally-rollbackable entry
4288 ObjectModDesc mod_desc
;
4289 ceph::buffer::list snaps
; // only for clone entries
4291 osd_reqid_t reqid
; // caller+tid to uniquely identify request
4292 mempool::osd_pglog::vector
<std::pair
<osd_reqid_t
, version_t
> > extra_reqids
;
4294 /// map extra_reqids by index to error return code (if any)
4295 mempool::osd_pglog::map
<uint32_t, int> extra_reqid_return_codes
;
4297 eversion_t version
, prior_version
, reverting_to
;
4298 version_t user_version
; // the user version for this entry
4299 utime_t mtime
; // this is the _user_ mtime, mind you
4300 int32_t return_code
; // only stored for ERRORs for dup detection
4302 std::vector
<pg_log_op_return_item_t
> op_returns
;
4305 bool invalid_hash
; // only when decoding sobject_t based entries
4306 bool invalid_pool
; // only when decoding pool-less hobject based entries
4307 ObjectCleanRegions clean_regions
;
4310 : user_version(0), return_code(0), op(0),
4311 invalid_hash(false), invalid_pool(false) {
4312 snaps
.reassign_to_mempool(mempool::mempool_osd_pglog
);
4314 pg_log_entry_t(int _op
, const hobject_t
& _soid
,
4315 const eversion_t
& v
, const eversion_t
& pv
,
4317 const osd_reqid_t
& rid
, const utime_t
& mt
,
4319 : soid(_soid
), reqid(rid
), version(v
), prior_version(pv
), user_version(uv
),
4320 mtime(mt
), return_code(return_code
), op(_op
),
4321 invalid_hash(false), invalid_pool(false) {
4322 snaps
.reassign_to_mempool(mempool::mempool_osd_pglog
);
4325 bool is_clone() const { return op
== CLONE
; }
4326 bool is_modify() const { return op
== MODIFY
; }
4327 bool is_promote() const { return op
== PROMOTE
; }
4328 bool is_clean() const { return op
== CLEAN
; }
4329 bool is_lost_revert() const { return op
== LOST_REVERT
; }
4330 bool is_lost_delete() const { return op
== LOST_DELETE
; }
4331 bool is_lost_mark() const { return op
== LOST_MARK
; }
4332 bool is_error() const { return op
== ERROR
; }
4334 bool is_update() const {
4336 is_clone() || is_modify() || is_promote() || is_clean() ||
4337 is_lost_revert() || is_lost_mark();
4339 bool is_delete() const {
4340 return op
== DELETE
|| op
== LOST_DELETE
;
4343 bool can_rollback() const {
4344 return mod_desc
.can_rollback();
4347 void mark_unrollbackable() {
4348 mod_desc
.mark_unrollbackable();
4351 bool requires_kraken() const {
4352 return mod_desc
.requires_kraken();
4355 // Errors are only used for dup detection, whereas
4356 // the index by objects is used by recovery, copy_get,
4357 // and other facilities that don't expect or need to
4358 // be aware of error entries.
4359 bool object_is_indexed() const {
4363 bool reqid_is_indexed() const {
4364 return reqid
!= osd_reqid_t() &&
4365 (op
== MODIFY
|| op
== DELETE
|| op
== ERROR
);
4368 void set_op_returns(const std::vector
<OSDOp
>& ops
) {
4369 op_returns
.resize(ops
.size());
4370 for (unsigned i
= 0; i
< ops
.size(); ++i
) {
4371 op_returns
[i
].rval
= ops
[i
].rval
;
4372 op_returns
[i
].bl
= ops
[i
].outdata
;
4376 std::string
get_key_name() const;
4377 void encode_with_checksum(ceph::buffer::list
& bl
) const;
4378 void decode_with_checksum(ceph::buffer::list::const_iterator
& p
);
4380 void encode(ceph::buffer::list
&bl
) const;
4381 void decode(ceph::buffer::list::const_iterator
&bl
);
4382 void dump(ceph::Formatter
*f
) const;
4383 static void generate_test_instances(std::list
<pg_log_entry_t
*>& o
);
4386 WRITE_CLASS_ENCODER(pg_log_entry_t
)
4388 std::ostream
& operator<<(std::ostream
& out
, const pg_log_entry_t
& e
);
// pg_log_dup_t: compact record derived from a pg_log_entry_t, retained
// purely for duplicate-request (dup op) detection after the full log entry
// has been trimmed.  Keyed by reqid; compared member-wise below.
// NOTE(review): this chunk is a lossy extraction -- e.g. the eversion_t
// "version" member (original line 4392) referenced by the constructors and
// operator== is missing here.  Restore from upstream before compiling.
4390 struct pg_log_dup_t
 {
4391 osd_reqid_t reqid
; // caller+tid to uniquely identify request
4393 version_t user_version
; // the user version for this entry
4394 int32_t return_code
; // only stored for ERRORs for dup detection
// per-op return values (rval + data) carried so replayed dup ops can be
// answered with the original results
4396 std::vector
<pg_log_op_return_item_t
> op_returns
;
4399 : user_version(0), return_code(0)
// construct a dup record by projecting the relevant fields out of a full
// log entry (used when trimming the log into the dup list)
4401 explicit pg_log_dup_t(const pg_log_entry_t
& entry
)
4402 : reqid(entry
.reqid
), version(entry
.version
),
4403 user_version(entry
.user_version
),
4404 return_code(entry
.return_code
),
4405 op_returns(entry
.op_returns
)
// direct-field constructor (no op_returns)
4407 pg_log_dup_t(const eversion_t
& v
, version_t uv
,
4408 const osd_reqid_t
& rid
, int return_code
)
4409 : reqid(rid
), version(v
), user_version(uv
),
4410 return_code(return_code
)
// key under which this dup is stored in the pg log omap
4413 std::string
get_key_name() const;
4414 void encode(ceph::buffer::list
&bl
) const;
4415 void decode(ceph::buffer::list::const_iterator
&bl
);
4416 void dump(ceph::Formatter
*f
) const;
4417 static void generate_test_instances(std::list
<pg_log_dup_t
*>& o
);
// member-wise equality over all stored fields
4419 bool operator==(const pg_log_dup_t
&rhs
) const {
4420 return reqid
== rhs
.reqid
&&
4421 version
== rhs
.version
&&
4422 user_version
== rhs
.user_version
&&
4423 return_code
== rhs
.return_code
&&
4424 op_returns
== rhs
.op_returns
;
4426 bool operator!=(const pg_log_dup_t
&rhs
) const {
4427 return !(*this == rhs
);
4430 friend std::ostream
& operator<<(std::ostream
& out
, const pg_log_dup_t
& e
);
4432 WRITE_CLASS_ENCODER(pg_log_dup_t
)
4434 std::ostream
& operator<<(std::ostream
& out
, const pg_log_dup_t
& e
);
4437 * pg_log_t - incremental log of recent pg changes.
4439 * serves as a recovery queue for recent changes.
4443 * head - newest entry (update|delete)
4444 * tail - entry previous to oldest (update|delete) for which we have
4445 * complete negative information.
4446 * i.e. we can infer pg contents for any store whose last_update >= tail.
4448 eversion_t head
; // newest entry
4449 eversion_t tail
; // version prior to oldest
4452 // We can rollback rollback-able entries > can_rollback_to
4453 eversion_t can_rollback_to
;
4455 // always <= can_rollback_to, indicates how far stashed rollback
4456 // data can be found
4457 eversion_t rollback_info_trimmed_to
;
4461 mempool::osd_pglog::list
<pg_log_entry_t
> log
;
4463 // entries just for dup op detection ordered oldest to newest
4464 mempool::osd_pglog::list
<pg_log_dup_t
> dups
;
4466 pg_log_t() = default;
4467 pg_log_t(const eversion_t
&last_update
,
4468 const eversion_t
&log_tail
,
4469 const eversion_t
&can_rollback_to
,
4470 const eversion_t
&rollback_info_trimmed_to
,
4471 mempool::osd_pglog::list
<pg_log_entry_t
> &&entries
,
4472 mempool::osd_pglog::list
<pg_log_dup_t
> &&dup_entries
)
4473 : head(last_update
), tail(log_tail
), can_rollback_to(can_rollback_to
),
4474 rollback_info_trimmed_to(rollback_info_trimmed_to
),
4475 log(std::move(entries
)), dups(std::move(dup_entries
)) {}
4476 pg_log_t(const eversion_t
&last_update
,
4477 const eversion_t
&log_tail
,
4478 const eversion_t
&can_rollback_to
,
4479 const eversion_t
&rollback_info_trimmed_to
,
4480 const std::list
<pg_log_entry_t
> &entries
,
4481 const std::list
<pg_log_dup_t
> &dup_entries
)
4482 : head(last_update
), tail(log_tail
), can_rollback_to(can_rollback_to
),
4483 rollback_info_trimmed_to(rollback_info_trimmed_to
) {
4484 for (auto &&entry
: entries
) {
4485 log
.push_back(entry
);
4487 for (auto &&entry
: dup_entries
) {
4488 dups
.push_back(entry
);
4494 rollback_info_trimmed_to
= can_rollback_to
= head
= tail
= z
;
4499 eversion_t
get_rollback_info_trimmed_to() const {
4500 return rollback_info_trimmed_to
;
4502 eversion_t
get_can_rollback_to() const {
4503 return can_rollback_to
;
4507 pg_log_t
split_out_child(pg_t child_pgid
, unsigned split_bits
) {
4508 mempool::osd_pglog::list
<pg_log_entry_t
> oldlog
, childlog
;
4511 eversion_t old_tail
;
4512 unsigned mask
= ~((~0)<<split_bits
);
4513 for (auto i
= oldlog
.begin();
4516 if ((i
->soid
.get_hash() & mask
) == child_pgid
.m_seed
) {
4517 childlog
.push_back(*i
);
4524 // osd_reqid is unique, so it doesn't matter if there are extra
4525 // dup entries in each pg. To avoid storing oid with the dup
4526 // entries, just copy the whole list.
4527 auto childdups(dups
);
4533 rollback_info_trimmed_to
,
4534 std::move(childlog
),
4535 std::move(childdups
));
4538 mempool::osd_pglog::list
<pg_log_entry_t
> rewind_from_head(eversion_t newhead
) {
4539 ceph_assert(newhead
>= tail
);
4541 mempool::osd_pglog::list
<pg_log_entry_t
>::iterator p
= log
.end();
4542 mempool::osd_pglog::list
<pg_log_entry_t
> divergent
;
4544 if (p
== log
.begin()) {
4545 // yikes, the whole thing is divergent!
4547 swap(divergent
, log
);
4551 if (p
->version
.version
<= newhead
.version
) {
4553 * look at eversion.version here. we want to avoid a situation like:
4554 * our log: 100'10 (0'0) m 10000004d3a.00000000/head by client4225.1:18529
4555 * new log: 122'10 (0'0) m 10000004d3a.00000000/head by client4225.1:18529
4556 * lower_bound = 100'9
4557 * i.e., same request, different version. If the eversion.version is > the
4558 * lower_bound, it is divergent.
4561 divergent
.splice(divergent
.begin(), log
, p
, log
.end());
4564 ceph_assert(p
->version
> newhead
);
4568 if (can_rollback_to
> newhead
)
4569 can_rollback_to
= newhead
;
4571 if (rollback_info_trimmed_to
> newhead
)
4572 rollback_info_trimmed_to
= newhead
;
4577 void merge_from(const std::vector
<pg_log_t
*>& slogs
, eversion_t last_update
) {
4580 // sort and merge dups
4581 std::multimap
<eversion_t
,pg_log_dup_t
> sorted
;
4582 for (auto& d
: dups
) {
4583 sorted
.emplace(d
.version
, d
);
4585 for (auto l
: slogs
) {
4586 for (auto& d
: l
->dups
) {
4587 sorted
.emplace(d
.version
, d
);
4591 for (auto& i
: sorted
) {
4592 dups
.push_back(i
.second
);
4597 can_rollback_to
= last_update
;
4598 rollback_info_trimmed_to
= last_update
;
4601 bool empty() const {
4606 return head
.version
== 0 && head
.epoch
== 0;
4609 uint64_t approx_size() const {
4610 return head
.version
- tail
.version
;
4613 static void filter_log(spg_t import_pgid
, const OSDMap
&curmap
,
4614 const std::string
&hit_set_namespace
, const pg_log_t
&in
,
4615 pg_log_t
&out
, pg_log_t
&reject
);
4618 * copy entries from the tail of another pg_log_t
4620 * @param other pg_log_t to copy from
4621 * @param from copy entries after this version
4623 void copy_after(CephContext
* cct
, const pg_log_t
&other
, eversion_t from
);
4626 * copy up to N entries
4628 * @param other source log
4629 * @param max max number of entries to copy
4631 void copy_up_to(CephContext
* cct
, const pg_log_t
&other
, int max
);
4633 std::ostream
& print(std::ostream
& out
) const;
4635 void encode(ceph::buffer::list
&bl
) const;
4636 void decode(ceph::buffer::list::const_iterator
&bl
, int64_t pool
= -1);
4637 void dump(ceph::Formatter
*f
) const;
4638 static void generate_test_instances(std::list
<pg_log_t
*>& o
);
4640 WRITE_CLASS_ENCODER(pg_log_t
)
// Print a pg_log_t as "log((tail,head], crt=<can_rollback_to>)".
// NOTE(review): lossy extraction -- the opening brace, "return out;" and
// closing brace of this inline function were dropped here.
4642 inline std::ostream
& operator<<(std::ostream
& out
, const pg_log_t
& log
)
4644 out
<< "log((" << log
.tail
<< "," << log
.head
<< "], crt="
4645 << log
.get_can_rollback_to() << ")";
4651 * pg_missing_t - summary of missing objects.
4653 * kept in memory, as a supplement to pg_log_t
4654 * also used to pass missing info in messages.
// pg_missing_item: per-object record in the missing set -- the version we
// need vs. the version we have, plus which regions are already clean
// (clean_regions) and a delete flag.
// NOTE(review): lossy extraction -- the missing_flags_t enumerator lines
// (FLAG_NONE / FLAG_DELETE, original ~4660-4662), the "flags" member, and
// parts of decode()'s body are missing from this chunk.
4656 struct pg_missing_item
{
4657 eversion_t need
, have
;
4658 ObjectCleanRegions clean_regions
;
4659 enum missing_flags_t
{
4663 pg_missing_item() : flags(FLAG_NONE
) {}
4664 explicit pg_missing_item(eversion_t n
) : need(n
), flags(FLAG_NONE
) {} // have no old version
// old_style=true is used by legacy callers; the body below marks the whole
// object dirty in that case so recovery copies everything
4665 pg_missing_item(eversion_t n
, eversion_t h
, bool is_delete
=false, bool old_style
= false) :
4667 set_delete(is_delete
);
4669 clean_regions
.mark_fully_dirty();
// Feature-dependent encoding: octopus+ peers get a versioned form with
// clean_regions; older peers get the legacy form (flags only).
4672 void encode(ceph::buffer::list
& bl
, uint64_t features
) const {
4674 if (HAVE_FEATURE(features
, SERVER_OCTOPUS
)) {
4675 // encoding a zeroed eversion_t to differentiate between OSD_RECOVERY_DELETES,
4676 // SERVER_OCTOPUS and legacy unversioned encoding - a need value of 0'0 is not
4677 // possible. This can be replaced with the legacy encoding
4678 encode(eversion_t(), bl
);
4679 encode(eversion_t(-1, -1), bl
);
4682 encode(static_cast<uint8_t>(flags
), bl
);
4683 encode(clean_regions
, bl
);
4685 encode(eversion_t(), bl
);
4688 encode(static_cast<uint8_t>(flags
), bl
);
// Decode counterpart: the eversion_t(-1,-1) sentinel selects the octopus
// form (with clean_regions); otherwise fall back to the older forms and
// conservatively mark everything dirty.
4691 void decode(ceph::buffer::list::const_iterator
& bl
) {
4696 if(l
== eversion_t(-1, -1)) {
4702 flags
= static_cast<missing_flags_t
>(f
);
4703 decode(clean_regions
, bl
);
4705 // support OSD_RECOVERY_DELETES
4710 flags
= static_cast<missing_flags_t
>(f
);
4711 clean_regions
.mark_fully_dirty();
4715 void set_delete(bool is_delete
) {
4716 flags
= is_delete
? FLAG_DELETE
: FLAG_NONE
;
4719 bool is_delete() const {
4720 return (flags
& FLAG_DELETE
) == FLAG_DELETE
;
4723 std::string
flag_str() const {
4724 if (flags
== FLAG_NONE
) {
4731 void dump(ceph::Formatter
*f
) const {
4732 f
->dump_stream("need") << need
;
4733 f
->dump_stream("have") << have
;
4734 f
->dump_stream("flags") << flag_str();
4735 f
->dump_stream("clean_regions") << clean_regions
;
4737 static void generate_test_instances(std::list
<pg_missing_item
*>& o
) {
4738 o
.push_back(new pg_missing_item
);
4739 o
.push_back(new pg_missing_item
);
4740 o
.back()->need
= eversion_t(1, 2);
4741 o
.back()->have
= eversion_t(1, 1);
4742 o
.push_back(new pg_missing_item
);
4743 o
.back()->need
= eversion_t(3, 5);
4744 o
.back()->have
= eversion_t(3, 4);
4745 o
.back()->clean_regions
.mark_data_region_dirty(4096, 8192);
4746 o
.back()->clean_regions
.mark_omap_dirty();
4747 o
.back()->flags
= FLAG_DELETE
;
// equality deliberately ignores clean_regions (only need/have/flags)
4749 bool operator==(const pg_missing_item
&rhs
) const {
4750 return need
== rhs
.need
&& have
== rhs
.have
&& flags
== rhs
.flags
;
4752 bool operator!=(const pg_missing_item
&rhs
) const {
4753 return !(*this == rhs
);
4756 WRITE_CLASS_ENCODER_FEATURES(pg_missing_item
)
4757 std::ostream
& operator<<(std::ostream
& out
, const pg_missing_item
&item
);
// pg_missing_const_i: read-only interface over a missing set, implemented
// by pg_missing_set<TrackChanges> below.  Exposes the forward map
// (object -> missing item), the reverse map (version -> object), and
// membership queries.
// NOTE(review): lossy extraction -- "public:" and the closing "};" of this
// class (original ~4760, 4770) are missing from this chunk.
4759 class pg_missing_const_i
{
4761 virtual const std::map
<hobject_t
, pg_missing_item
> &
4762 get_items() const = 0;
4763 virtual const std::map
<version_t
, hobject_t
> &get_rmissing() const = 0;
4764 virtual bool get_may_include_deletes() const = 0;
4765 virtual unsigned int num_missing() const = 0;
4766 virtual bool have_missing() const = 0;
// optionally copies the found item into *out
4767 virtual bool is_missing(const hobject_t
& oid
, pg_missing_item
*out
= nullptr) const = 0;
4768 virtual bool is_missing(const hobject_t
& oid
, eversion_t v
) const = 0;
4769 virtual ~pg_missing_const_i() {}
// ChangeTracker<Track>: primary template used when Track == false -- all
// operations are no-ops so change tracking costs nothing when disabled.
// NOTE(review): lossy extraction -- is_clean()'s "return true;" body and the
// closing braces (original ~4781-4783) are missing from this chunk.
4773 template <bool Track
>
4774 class ChangeTracker
{
4776 void changed(const hobject_t
&obj
) {}
4777 template <typename F
>
4778 void get_changed(F
&&f
) const {}
4780 bool is_clean() const {
// ChangeTracker<true>: real tracking specialization -- records every changed
// hobject_t in a set, replays them to a callback via get_changed(), and
// reports clean iff nothing has been recorded.
// NOTE(review): lossy extraction -- the "template <>" line, "public:", the
// loop body invoking f(i), and closing braces are missing from this chunk.
4785 class ChangeTracker
<true> {
4786 std::set
<hobject_t
> _changed
;
4788 void changed(const hobject_t
&obj
) {
4789 _changed
.insert(obj
);
4791 template <typename F
>
4792 void get_changed(F
&&f
) const {
4793 for (auto const &i
: _changed
) {
4800 bool is_clean() const {
4801 return _changed
.empty();
4805 template <bool TrackChanges
>
4806 class pg_missing_set
: public pg_missing_const_i
{
4807 using item
= pg_missing_item
;
4808 std::map
<hobject_t
, item
> missing
; // oid -> (need v, have v)
4809 std::map
<version_t
, hobject_t
> rmissing
; // v -> oid
4810 ChangeTracker
<TrackChanges
> tracker
;
4813 pg_missing_set() = default;
4815 template <typename missing_type
>
4816 pg_missing_set(const missing_type
&m
) {
4817 missing
= m
.get_items();
4818 rmissing
= m
.get_rmissing();
4819 may_include_deletes
= m
.get_may_include_deletes();
4820 for (auto &&i
: missing
)
4821 tracker
.changed(i
.first
);
4824 bool may_include_deletes
= false;
4826 const std::map
<hobject_t
, item
> &get_items() const override
{
4829 const std::map
<version_t
, hobject_t
> &get_rmissing() const override
{
4832 bool get_may_include_deletes() const override
{
4833 return may_include_deletes
;
4835 unsigned int num_missing() const override
{
4836 return missing
.size();
4838 bool have_missing() const override
{
4839 return !missing
.empty();
4841 void merge(const pg_log_entry_t
& e
) {
4842 auto miter
= missing
.find(e
.soid
);
4843 if (miter
!= missing
.end() && miter
->second
.have
!= eversion_t() && e
.version
> miter
->second
.have
)
4844 miter
->second
.clean_regions
.merge(e
.clean_regions
);
4846 bool is_missing(const hobject_t
& oid
, pg_missing_item
*out
= nullptr) const override
{
4847 auto iter
= missing
.find(oid
);
4848 if (iter
== missing
.end())
4851 *out
= iter
->second
;
4854 bool is_missing(const hobject_t
& oid
, eversion_t v
) const override
{
4855 std::map
<hobject_t
, item
>::const_iterator m
=
4857 if (m
== missing
.end())
4859 const item
&item(m
->second
);
4864 eversion_t
get_oldest_need() const {
4865 if (missing
.empty()) {
4866 return eversion_t();
4868 auto it
= missing
.find(rmissing
.begin()->second
);
4869 ceph_assert(it
!= missing
.end());
4870 return it
->second
.need
;
4873 void claim(pg_missing_set
&& o
) {
4874 static_assert(!TrackChanges
, "Can't use claim with TrackChanges");
4875 missing
= std::move(o
.missing
);
4876 rmissing
= std::move(o
.rmissing
);
4880 * this needs to be called in log order as we extend the log. it
4881 * assumes missing is accurate up through the previous log entry.
4883 void add_next_event(const pg_log_entry_t
& e
) {
4884 std::map
<hobject_t
, item
>::iterator missing_it
;
4885 missing_it
= missing
.find(e
.soid
);
4886 bool is_missing_divergent_item
= missing_it
!= missing
.end();
4887 if (e
.prior_version
== eversion_t() || e
.is_clone()) {
4889 if (is_missing_divergent_item
) { // use iterator
4890 rmissing
.erase(missing_it
->second
.need
.version
);
4892 missing_it
->second
= item(e
.version
, eversion_t(), e
.is_delete());
4893 missing_it
->second
.clean_regions
.mark_fully_dirty();
4895 // create new element in missing map
4897 missing
[e
.soid
] = item(e
.version
, eversion_t(), e
.is_delete());
4898 missing
[e
.soid
].clean_regions
.mark_fully_dirty();
4900 } else if (is_missing_divergent_item
) {
4901 // already missing (prior).
4902 rmissing
.erase((missing_it
->second
).need
.version
);
4903 missing_it
->second
.need
= e
.version
; // leave .have unchanged.
4904 missing_it
->second
.set_delete(e
.is_delete());
4905 if (e
.is_lost_revert())
4906 missing_it
->second
.clean_regions
.mark_fully_dirty();
4908 missing_it
->second
.clean_regions
.merge(e
.clean_regions
);
4910 // not missing, we must have prior_version (if any)
4911 ceph_assert(!is_missing_divergent_item
);
4912 missing
[e
.soid
] = item(e
.version
, e
.prior_version
, e
.is_delete());
4913 if (e
.is_lost_revert())
4914 missing
[e
.soid
].clean_regions
.mark_fully_dirty();
4916 missing
[e
.soid
].clean_regions
= e
.clean_regions
;
4918 rmissing
[e
.version
.version
] = e
.soid
;
4919 tracker
.changed(e
.soid
);
4922 void revise_need(hobject_t oid
, eversion_t need
, bool is_delete
) {
4923 auto p
= missing
.find(oid
);
4924 if (p
!= missing
.end()) {
4925 rmissing
.erase((p
->second
).need
.version
);
4926 p
->second
.need
= need
; // do not adjust .have
4927 p
->second
.set_delete(is_delete
);
4928 p
->second
.clean_regions
.mark_fully_dirty();
4930 missing
[oid
] = item(need
, eversion_t(), is_delete
);
4931 missing
[oid
].clean_regions
.mark_fully_dirty();
4933 rmissing
[need
.version
] = oid
;
4935 tracker
.changed(oid
);
4938 void revise_have(hobject_t oid
, eversion_t have
) {
4939 auto p
= missing
.find(oid
);
4940 if (p
!= missing
.end()) {
4941 tracker
.changed(oid
);
4942 (p
->second
).have
= have
;
4946 void mark_fully_dirty(const hobject_t
& oid
) {
4947 auto p
= missing
.find(oid
);
4948 if (p
!= missing
.end()) {
4949 tracker
.changed(oid
);
4950 (p
->second
).clean_regions
.mark_fully_dirty();
4954 void add(const hobject_t
& oid
, eversion_t need
, eversion_t have
,
4956 missing
[oid
] = item(need
, have
, is_delete
, true);
4957 rmissing
[need
.version
] = oid
;
4958 tracker
.changed(oid
);
4961 void add(const hobject_t
& oid
, pg_missing_item
&& item
) {
4962 rmissing
[item
.need
.version
] = oid
;
4963 missing
.insert({oid
, std::move(item
)});
4964 tracker
.changed(oid
);
4967 void rm(const hobject_t
& oid
, eversion_t v
) {
4968 std::map
<hobject_t
, item
>::iterator p
= missing
.find(oid
);
4969 if (p
!= missing
.end() && p
->second
.need
<= v
)
4973 void rm(std::map
<hobject_t
, item
>::const_iterator m
) {
4974 tracker
.changed(m
->first
);
4975 rmissing
.erase(m
->second
.need
.version
);
4979 void got(const hobject_t
& oid
, eversion_t v
) {
4980 std::map
<hobject_t
, item
>::iterator p
= missing
.find(oid
);
4981 ceph_assert(p
!= missing
.end());
4982 ceph_assert(p
->second
.need
<= v
|| p
->second
.is_delete());
4986 void got(std::map
<hobject_t
, item
>::const_iterator m
) {
4987 tracker
.changed(m
->first
);
4988 rmissing
.erase(m
->second
.need
.version
);
4994 unsigned split_bits
,
4995 pg_missing_set
*omissing
) {
4996 omissing
->may_include_deletes
= may_include_deletes
;
4997 unsigned mask
= ~((~0)<<split_bits
);
4998 for (std::map
<hobject_t
, item
>::iterator i
= missing
.begin();
5001 if ((i
->first
.get_hash() & mask
) == child_pgid
.m_seed
) {
5002 omissing
->add(i
->first
, i
->second
.need
, i
->second
.have
,
5003 i
->second
.is_delete());
5012 for (auto const &i
: missing
)
5013 tracker
.changed(i
.first
);
5018 void encode(ceph::buffer::list
&bl
, uint64_t features
) const {
5019 ENCODE_START(5, 2, bl
)
5020 encode(missing
, bl
, features
);
5021 encode(may_include_deletes
, bl
);
5024 void decode(ceph::buffer::list::const_iterator
&bl
, int64_t pool
= -1) {
5025 for (auto const &i
: missing
)
5026 tracker
.changed(i
.first
);
5027 DECODE_START_LEGACY_COMPAT_LEN(5, 2, 2, bl
);
5028 decode(missing
, bl
);
5029 if (struct_v
>= 4) {
5030 decode(may_include_deletes
, bl
);
5035 // Handle hobject_t upgrade
5036 std::map
<hobject_t
, item
> tmp
;
5037 for (std::map
<hobject_t
, item
>::iterator i
=
5041 if (!i
->first
.is_max() && i
->first
.pool
== -1) {
5042 hobject_t
to_insert(i
->first
);
5043 to_insert
.pool
= pool
;
5044 tmp
[to_insert
] = i
->second
;
5050 missing
.insert(tmp
.begin(), tmp
.end());
5053 for (std::map
<hobject_t
,item
>::iterator it
=
5055 it
!= missing
.end();
5057 rmissing
[it
->second
.need
.version
] = it
->first
;
5058 for (auto const &i
: missing
)
5059 tracker
.changed(i
.first
);
5061 void dump(ceph::Formatter
*f
) const {
5062 f
->open_array_section("missing");
5063 for (std::map
<hobject_t
,item
>::const_iterator p
=
5064 missing
.begin(); p
!= missing
.end(); ++p
) {
5065 f
->open_object_section("item");
5066 f
->dump_stream("object") << p
->first
;
5071 f
->dump_bool("may_include_deletes", may_include_deletes
);
5073 template <typename F
>
5074 void filter_objects(F
&&f
) {
5075 for (auto i
= missing
.begin(); i
!= missing
.end();) {
5083 static void generate_test_instances(std::list
<pg_missing_set
*>& o
) {
5084 o
.push_back(new pg_missing_set
);
5085 o
.back()->may_include_deletes
= true;
5086 o
.push_back(new pg_missing_set
);
5088 hobject_t(object_t("foo"), "foo", 123, 456, 0, ""),
5089 eversion_t(5, 6), eversion_t(5, 1), false);
5090 o
.back()->may_include_deletes
= true;
5091 o
.push_back(new pg_missing_set
);
5093 hobject_t(object_t("foo"), "foo", 123, 456, 0, ""),
5094 eversion_t(5, 6), eversion_t(5, 1), true);
5095 o
.back()->may_include_deletes
= true;
5097 template <typename F
>
5098 void get_changed(F
&&f
) const {
5099 tracker
.get_changed(f
);
5104 bool is_clean() const {
5105 return tracker
.is_clean();
5107 template <typename missing_t
>
5108 bool debug_verify_from_init(
5109 const missing_t
&init_missing
,
5110 std::ostream
*oss
) const {
5113 auto check_missing(init_missing
.get_items());
5114 tracker
.get_changed([&](const hobject_t
&hoid
) {
5115 check_missing
.erase(hoid
);
5116 if (missing
.count(hoid
)) {
5117 check_missing
.insert(*(missing
.find(hoid
)));
5121 if (check_missing
.size() != missing
.size()) {
5123 *oss
<< "Size mismatch, check: " << check_missing
.size()
5124 << ", actual: " << missing
.size() << "\n";
5128 for (auto &i
: missing
) {
5129 if (!check_missing
.count(i
.first
)) {
5131 *oss
<< "check_missing missing " << i
.first
<< "\n";
5133 } else if (check_missing
[i
.first
] != i
.second
) {
5135 *oss
<< "check_missing missing item mismatch on " << i
.first
5136 << ", check: " << check_missing
[i
.first
]
5137 << ", actual: " << i
.second
<< "\n";
5142 *oss
<< "check_missing: " << check_missing
<< "\n";
5143 std::set
<hobject_t
> changed
;
5144 tracker
.get_changed([&](const hobject_t
&hoid
) { changed
.insert(hoid
); });
5145 *oss
<< "changed: " << changed
<< "\n";
// Free-function encode() for pg_missing_set, forwarding to the member
// encode with the feature bits.
// NOTE(review): lossy extraction -- the function header line naming the
// function (original ~5151) and the closing brace are missing; the
// ENCODE_DUMP_POST(cl) argument presumably refers to a local declared on a
// dropped line -- confirm against upstream.
5150 template <bool TrackChanges
>
5152 const pg_missing_set
<TrackChanges
> &c
, ceph::buffer::list
&bl
, uint64_t features
=0) {
5154 c
.encode(bl
, features
);
5155 ENCODE_DUMP_POST(cl
);
// Free-function decode() for pg_missing_set.
// NOTE(review): lossy extraction -- the body (presumably c.decode(p)) and
// closing brace are missing from this chunk.
5157 template <bool TrackChanges
>
5158 void decode(pg_missing_set
<TrackChanges
> &c
, ceph::buffer::list::const_iterator
&p
) {
// Print a pg_missing_set summary: count of missing objects and the
// may_include_deletes flag.
// NOTE(review): lossy extraction -- the opening brace, the trailing ")"
// output, "return out;" and closing brace are missing from this chunk.
5161 template <bool TrackChanges
>
5162 std::ostream
& operator<<(std::ostream
& out
, const pg_missing_set
<TrackChanges
> &missing
)
5164 out
<< "missing(" << missing
.num_missing()
5165 << " may_include_deletes = " << missing
.may_include_deletes
;
5166 //if (missing.num_lost()) out << ", " << missing.num_lost() << " lost";
5171 using pg_missing_t
= pg_missing_set
<false>;
5172 using pg_missing_tracker_t
= pg_missing_set
<true>;
5178 * pg list objects response format
5182 template<typename T
>
5183 struct pg_nls_response_template
{
5184 collection_list_handle_t handle
;
5185 std::vector
<T
> entries
;
5187 void encode(ceph::buffer::list
& bl
) const {
5188 ENCODE_START(1, 1, bl
);
5190 __u32 n
= (__u32
)entries
.size();
5192 for (auto i
= entries
.begin(); i
!= entries
.end(); ++i
) {
5193 encode(i
->nspace
, bl
);
5195 encode(i
->locator
, bl
);
5199 void decode(ceph::buffer::list::const_iterator
& bl
) {
5200 DECODE_START(1, bl
);
5207 decode(i
.nspace
, bl
);
5209 decode(i
.locator
, bl
);
5210 entries
.push_back(i
);
5214 void dump(ceph::Formatter
*f
) const {
5215 f
->dump_stream("handle") << handle
;
5216 f
->open_array_section("entries");
5217 for (auto p
= entries
.begin(); p
!= entries
.end(); ++p
) {
5218 f
->open_object_section("object");
5219 f
->dump_string("namespace", p
->nspace
);
5220 f
->dump_string("object", p
->oid
);
5221 f
->dump_string("key", p
->locator
);
5226 static void generate_test_instances(std::list
<pg_nls_response_template
<T
>*>& o
) {
5227 o
.push_back(new pg_nls_response_template
<T
>);
5228 o
.push_back(new pg_nls_response_template
<T
>);
5229 o
.back()->handle
= hobject_t(object_t("hi"), "key", 1, 2, -1, "");
5230 o
.back()->entries
.push_back(librados::ListObjectImpl("", "one", ""));
5231 o
.back()->entries
.push_back(librados::ListObjectImpl("", "two", "twokey"));
5232 o
.back()->entries
.push_back(librados::ListObjectImpl("", "three", ""));
5233 o
.push_back(new pg_nls_response_template
<T
>);
5234 o
.back()->handle
= hobject_t(object_t("hi"), "key", 3, 4, -1, "");
5235 o
.back()->entries
.push_back(librados::ListObjectImpl("n1", "n1one", ""));
5236 o
.back()->entries
.push_back(librados::ListObjectImpl("n1", "n1two", "n1twokey"));
5237 o
.back()->entries
.push_back(librados::ListObjectImpl("n1", "n1three", ""));
5238 o
.push_back(new pg_nls_response_template
<T
>);
5239 o
.back()->handle
= hobject_t(object_t("hi"), "key", 5, 6, -1, "");
5240 o
.back()->entries
.push_back(librados::ListObjectImpl("", "one", ""));
5241 o
.back()->entries
.push_back(librados::ListObjectImpl("", "two", "twokey"));
5242 o
.back()->entries
.push_back(librados::ListObjectImpl("", "three", ""));
5243 o
.back()->entries
.push_back(librados::ListObjectImpl("n1", "n1one", ""));
5244 o
.back()->entries
.push_back(librados::ListObjectImpl("n1", "n1two", "n1twokey"));
5245 o
.back()->entries
.push_back(librados::ListObjectImpl("n1", "n1three", ""));
5249 using pg_nls_response_t
= pg_nls_response_template
<librados::ListObjectImpl
>;
5251 WRITE_CLASS_ENCODER(pg_nls_response_t
)
5253 // For backwards compatibility with older OSD requests
// pg_ls_response_t: legacy (pre-NLS) object-listing response -- a resume
// handle plus (object, locator-key) pairs.  Kept for compatibility with
// older OSD requests; new code uses pg_nls_response_template above.
// NOTE(review): lossy extraction -- the version encode lines, several
// closing braces, and section-close calls are missing from this chunk.
5254 struct pg_ls_response_t
{
5255 collection_list_handle_t handle
;
5256 std::list
<std::pair
<object_t
, std::string
> > entries
;
5258 void encode(ceph::buffer::list
& bl
) const {
5263 encode(entries
, bl
);
// only version 1 of this legacy encoding ever existed
5265 void decode(ceph::buffer::list::const_iterator
& bl
) {
5269 ceph_assert(v
== 1);
5271 decode(entries
, bl
);
5273 void dump(ceph::Formatter
*f
) const {
5274 f
->dump_stream("handle") << handle
;
5275 f
->open_array_section("entries");
5276 for (std::list
<std::pair
<object_t
, std::string
> >::const_iterator p
= entries
.begin(); p
!= entries
.end(); ++p
) {
5277 f
->open_object_section("object");
5278 f
->dump_stream("object") << p
->first
;
5279 f
->dump_string("key", p
->second
);
5284 static void generate_test_instances(std::list
<pg_ls_response_t
*>& o
) {
5285 o
.push_back(new pg_ls_response_t
);
5286 o
.push_back(new pg_ls_response_t
);
5287 o
.back()->handle
= hobject_t(object_t("hi"), "key", 1, 2, -1, "");
5288 o
.back()->entries
.push_back(std::make_pair(object_t("one"), std::string()));
5289 o
.back()->entries
.push_back(std::make_pair(object_t("two"), std::string("twokey")));
5293 WRITE_CLASS_ENCODER(pg_ls_response_t
)
5296 * object_copy_cursor_t
// object_copy_cursor_t: resumable position within a multi-round object
// copy -- tracks progress through attrs (complete flag), data
// (byte offset), and omap (last key returned).
// NOTE(review): lossy extraction -- the bool attr/data/omap_complete member
// declarations (original ~5301-5303) and constructor punctuation are
// missing from this chunk.
5298 struct object_copy_cursor_t
{
5299 uint64_t data_offset
;
5300 std::string omap_offset
;
5305 object_copy_cursor_t()
5307 attr_complete(false),
5308 data_complete(false),
5309 omap_complete(false)
// true iff no progress has been made yet (fresh cursor)
5312 bool is_initial() const {
5313 return !attr_complete
&& data_offset
== 0 && omap_offset
.empty();
// true iff every phase (attrs, data, omap) has finished
5315 bool is_complete() const {
5316 return attr_complete
&& data_complete
&& omap_complete
;
5319 static void generate_test_instances(std::list
<object_copy_cursor_t
*>& o
);
5320 void encode(ceph::buffer::list
& bl
) const;
5321 void decode(ceph::buffer::list::const_iterator
&bl
);
5322 void dump(ceph::Formatter
*f
) const;
5324 WRITE_CLASS_ENCODER(object_copy_cursor_t
)
5327 * object_copy_data_t
5329 * Return data from a copy request. The semantics are a little strange
5330 * as a result of the encoding's heritage.
5332 * In particular, the sender unconditionally fills in the cursor (from what
5333 * it receives and sends), the size, and the mtime, but is responsible for
5334 * figuring out whether it should put any data in the attrs, data, or
5335 * omap members (corresponding to xattrs, object data, and the omap entries)
5336 * based on external data (the client includes a max amount to return with
5337 * the copy request). The client then looks into the attrs, data, and/or omap
5338 * based on the contents of the cursor.
// object_copy_data_t: payload returned for a copy-get request.  Sender
// always fills cursor/size/mtime; attrs/data/omap members are populated
// only as far as the request's size budget allows (see the long comment
// above this struct in the original file).
// NOTE(review): lossy extraction -- the enum wrapper around the FLAG_*
// values, the size/mtime/snap_seq/flags members, and part of the default
// constructor's init list are missing from this chunk.
5340 struct object_copy_data_t
{
5342 FLAG_DATA_DIGEST
= 1<<0,
5343 FLAG_OMAP_DIGEST
= 1<<1,
5345 object_copy_cursor_t cursor
;
5348 uint32_t data_digest
, omap_digest
;
5350 std::map
<std::string
, ceph::buffer::list
, std::less
<>> attrs
;
5351 ceph::buffer::list data
;
5352 ceph::buffer::list omap_header
;
5353 ceph::buffer::list omap_data
;
5355 /// which snaps we are defined for (if a snap and not the head)
5356 std::vector
<snapid_t
> snaps
;
5357 /// latest snap seq for the object (if head)
5360 /// recent reqids on this object
5361 mempool::osd_pglog::vector
<std::pair
<osd_reqid_t
, version_t
> > reqids
;
5363 /// map reqids by index to error return code (if any)
5364 mempool::osd_pglog::map
<uint32_t, int> reqid_return_codes
;
5366 uint64_t truncate_seq
;
5367 uint64_t truncate_size
;
5370 object_copy_data_t() :
5371 size((uint64_t)-1), data_digest(-1),
5372 omap_digest(-1), flags(0),
5376 static void generate_test_instances(std::list
<object_copy_data_t
*>& o
);
5377 void encode(ceph::buffer::list
& bl
, uint64_t features
) const;
5378 void decode(ceph::buffer::list::const_iterator
& bl
);
5379 void dump(ceph::Formatter
*f
) const;
5381 WRITE_CLASS_ENCODER_FEATURES(object_copy_data_t
)
// pg_create_t: instruction from the monitor to create a PG -- the creation
// epoch, the parent PG if this came from a split, and the split bit count.
// NOTE(review): lossy extraction -- the split_bits member declaration
// (original ~5389) and the default-constructor header line are missing
// from this chunk.
5386 struct pg_create_t
{
5387 epoch_t created
; // epoch pg created
5388 pg_t parent
; // split from parent (if != pg_t())
5392 : created(0), split_bits(0) {}
5393 pg_create_t(unsigned c
, pg_t p
, int s
)
5394 : created(c
), parent(p
), split_bits(s
) {}
5396 void encode(ceph::buffer::list
&bl
) const;
5397 void decode(ceph::buffer::list::const_iterator
&bl
);
5398 void dump(ceph::Formatter
*f
) const;
5399 static void generate_test_instances(std::list
<pg_create_t
*>& o
);
5401 WRITE_CLASS_ENCODER(pg_create_t
)
5403 // -----------------------------------------
// ObjectExtent: one (object, offset, length) IO span plus the buffer
// ranges it maps to; consumed by the ObjectCacher (see usage notes below).
// NOTE(review): lossy extraction -- the comment-block delimiters, "public:",
// the "uint64_t objectno;" member (original ~5427, referenced by both
// constructors), and the closing "};" are missing from this chunk.
5405 class ObjectExtent
{
5407 * ObjectExtents are used for specifying IO behavior against RADOS
5408 * objects when one is using the ObjectCacher.
5410 * To use this in a real system, *every member* must be filled
5411 * out correctly. In particular, make sure to initialize the
5412 * oloc correctly, as its default values are deliberate poison
5413 * and will cause internal ObjectCacher asserts.
5415 * Similarly, your buffer_extents vector *must* specify a total
5416 * size equal to your length. If the buffer_extents inadvertently
5417 * contain less space than the length member specifies, you
5418 * will get unintelligible asserts deep in the ObjectCacher.
5420 * If you are trying to do testing and don't care about actual
5421 * RADOS function, the simplest thing to do is to initialize
5422 * the ObjectExtent (truncate_size can be 0), create a single entry
5423 * in buffer_extents matching the length, and set oloc.pool to 0.
5426 object_t oid
; // object id
5428 uint64_t offset
; // in object
5429 uint64_t length
; // in object
5430 uint64_t truncate_size
; // in object
5432 object_locator_t oloc
; // object locator (pool etc)
5434 std::vector
<std::pair
<uint64_t,uint64_t> > buffer_extents
; // off -> len. extents in buffer being mapped (may be fragmented bc of striping!)
5436 ObjectExtent() : objectno(0), offset(0), length(0), truncate_size(0) {}
5437 ObjectExtent(object_t o
, uint64_t ono
, uint64_t off
, uint64_t l
, uint64_t ts
) :
5438 oid(o
), objectno(ono
), offset(off
), length(l
), truncate_size(ts
) { }
5441 inline std::ostream
& operator<<(std::ostream
& out
, const ObjectExtent
&ex
)
5443 return out
<< "extent("
5444 << ex
.oid
<< " (" << ex
.objectno
<< ") in " << ex
.oloc
5445 << " " << ex
.offset
<< "~" << ex
.length
5446 << " -> " << ex
.buffer_extents
5451 // ---------------------------------------
// NOTE(review): extraction dropped the access-specifier lines and closing
// brace of this class. Code left byte-identical; only comments added.
//
// OSDSuperblock: per-OSD on-disk identity and map-range bookkeeping; persisted
// via the encode/decode pair declared below.
5453 class OSDSuperblock
{
5455 uuid_d cluster_fsid
, osd_fsid
;
5456 int32_t whoami
= -1; // my role in this fs.
5457 epoch_t current_epoch
= 0; // most recent epoch
5458 epoch_t oldest_map
= 0, newest_map
= 0; // oldest/newest maps we have.
5459 double weight
= 0.0;
5461 CompatSet compat_features
;
5463 // last interval over which i mounted and was then active
5464 epoch_t mounted
= 0; // last epoch i mounted
5465 epoch_t clean_thru
= 0; // epoch i was active and clean thru
5467 epoch_t purged_snaps_last
= 0;
5468 utime_t last_purged_snaps_scrub
;
// standard Ceph serialization/introspection quartet
5470 void encode(ceph::buffer::list
&bl
) const;
5471 void decode(ceph::buffer::list::const_iterator
&bl
);
5472 void dump(ceph::Formatter
*f
) const;
5473 static void generate_test_instances(std::list
<OSDSuperblock
*>& o
);
5475 WRITE_CLASS_ENCODER(OSDSuperblock
)
5477 inline std::ostream
& operator<<(std::ostream
& out
, const OSDSuperblock
& sb
)
5479 return out
<< "sb(" << sb
.cluster_fsid
5480 << " osd." << sb
.whoami
5481 << " " << sb
.osd_fsid
5482 << " e" << sb
.current_epoch
5483 << " [" << sb
.oldest_map
<< "," << sb
.newest_map
<< "]"
5484 << " lci=[" << sb
.mounted
<< "," << sb
.clean_thru
<< "]"
5497 * attached to object head. describes most recent snap context, and
5498 * set of existing clones.
5502 // NOTE: this is for pre-octopus compatibility only! remove in Q release
5503 std::vector
<snapid_t
> snaps
; // descending
5504 std::vector
<snapid_t
> clones
; // ascending
5505 std::map
<snapid_t
, interval_set
<uint64_t> > clone_overlap
; // overlap w/ next newest
5506 std::map
<snapid_t
, uint64_t> clone_size
;
5507 std::map
<snapid_t
, std::vector
<snapid_t
>> clone_snaps
; // descending
5509 SnapSet() : seq(0) {}
5510 explicit SnapSet(ceph::buffer::list
& bl
) {
5511 auto p
= std::cbegin(bl
);
5515 /// populate SnapSet from a librados::snap_set_t
5516 void from_snap_set(const librados::snap_set_t
& ss
, bool legacy
);
5518 /// get space accounted to clone
5519 uint64_t get_clone_bytes(snapid_t clone
) const;
5521 void encode(ceph::buffer::list
& bl
) const;
5522 void decode(ceph::buffer::list::const_iterator
& bl
);
5523 void dump(ceph::Formatter
*f
) const;
5524 static void generate_test_instances(std::list
<SnapSet
*>& o
);
5526 SnapContext
get_ssc_as_of(snapid_t as_of
) const {
5529 for (auto p
= clone_snaps
.rbegin();
5530 p
!= clone_snaps
.rend();
5532 for (auto snap
: p
->second
) {
5533 if (snap
<= as_of
) {
5534 out
.snaps
.push_back(snap
);
5542 SnapSet
get_filtered(const pg_pool_t
&pinfo
) const;
5543 void filter(const pg_pool_t
&pinfo
);
5545 WRITE_CLASS_ENCODER(SnapSet
)
5547 std::ostream
& operator<<(std::ostream
& out
, const SnapSet
& cs
);
5552 #define SS_ATTR "snapset"
// NOTE(review): extraction dropped the `cookie` and `addr` member declaration
// lines (both are initialized by the constructors below) and the struct
// closing brace. Code left byte-identical; only comments added.
//
// watch_info_t: state for one object watch — client-chosen cookie, timeout in
// seconds, and the watcher's address.
5554 struct watch_info_t
{
5556 uint32_t timeout_seconds
;
5559 watch_info_t() : cookie(0), timeout_seconds(0) { }
5560 watch_info_t(uint64_t c
, uint32_t t
, const entity_addr_t
& a
) : cookie(c
), timeout_seconds(t
), addr(a
) {}
// feature-aware encode (addr encoding depends on peer features)
5562 void encode(ceph::buffer::list
& bl
, uint64_t features
) const;
5563 void decode(ceph::buffer::list::const_iterator
& bl
);
5564 void dump(ceph::Formatter
*f
) const;
5565 static void generate_test_instances(std::list
<watch_info_t
*>& o
);
5567 WRITE_CLASS_ENCODER_FEATURES(watch_info_t
)
5569 static inline bool operator==(const watch_info_t
& l
, const watch_info_t
& r
) {
5570 return l
.cookie
== r
.cookie
&& l
.timeout_seconds
== r
.timeout_seconds
5571 && l
.addr
== r
.addr
;
5574 static inline std::ostream
& operator<<(std::ostream
& out
, const watch_info_t
& w
) {
5575 return out
<< "watch(cookie " << w
.cookie
<< " " << w
.timeout_seconds
<< "s"
5576 << " " << w
.addr
<< ")";
5579 struct notify_info_t
{
5583 ceph::buffer::list bl
;
5586 static inline std::ostream
& operator<<(std::ostream
& out
, const notify_info_t
& n
) {
5587 return out
<< "notify(cookie " << n
.cookie
5588 << " notify" << n
.notify_id
5589 << " " << n
.timeout
<< "s)";
// NOTE(review): extraction dropped several physical lines here (the body of
// inc_ref, the signatures of is_empty()/size(), closing braces). Code left
// byte-identical; only comments added.
//
// object_ref_delta_t: accumulates per-object reference-count deltas
// (hobject -> signed delta) for manifest/dedup ref accounting. Entries whose
// delta returns to zero are removed so the map only holds net changes.
5592 class object_ref_delta_t
{
5593 std::map
<hobject_t
, int> ref_delta
;
5596 object_ref_delta_t() = default;
5597 object_ref_delta_t(const object_ref_delta_t
&) = default;
5598 object_ref_delta_t(object_ref_delta_t
&&) = default;
// construct directly from a prebuilt delta map (move and copy forms)
5600 object_ref_delta_t(decltype(ref_delta
) &&ref_delta
)
5601 : ref_delta(std::move(ref_delta
)) {}
5602 object_ref_delta_t(const decltype(ref_delta
) &ref_delta
)
5603 : ref_delta(ref_delta
) {}
5605 object_ref_delta_t
&operator=(const object_ref_delta_t
&) = default;
5606 object_ref_delta_t
&operator=(object_ref_delta_t
&&) = default;
// subtract num references from hoid (delegates to mut_ref with a negated
// count; note num is unsigned and negated into an int)
5608 void dec_ref(const hobject_t
&hoid
, unsigned num
=1) {
5609 mut_ref(hoid
, -num
);
// add num references to hoid (body line dropped by extraction; presumably
// mut_ref(hoid, num) — confirm against upstream)
5611 void inc_ref(const hobject_t
&hoid
, unsigned num
=1) {
// apply a signed delta; erases the entry if the running delta hits zero so
// the map never stores no-op entries
5614 void mut_ref(const hobject_t
&hoid
, int num
) {
5615 [[maybe_unused
]] auto [iter
, _
] = ref_delta
.try_emplace(hoid
, 0);
5616 iter
->second
+= num
;
5617 if (iter
->second
== 0)
5618 ref_delta
.erase(iter
);
// read-only iteration/lookup over the accumulated deltas
5621 auto begin() const { return ref_delta
.begin(); }
5622 auto end() const { return ref_delta
.end(); }
5623 auto find(hobject_t
&key
) const { return ref_delta
.find(key
); }
5625 bool operator==(const object_ref_delta_t
&rhs
) const {
5626 return ref_delta
== rhs
.ref_delta
;
5628 bool operator!=(const object_ref_delta_t
&rhs
) const {
5629 return !(*this == rhs
);
// is_empty()/size() signatures were dropped by extraction; these are their
// bodies
5632 return ref_delta
.empty();
5635 return ref_delta
.size();
5637 friend std::ostream
& operator<<(std::ostream
& out
, const object_ref_delta_t
& ci
);
5640 struct chunk_info_t
{
5644 FLAG_HAS_REFERENCE
= 4,
5645 FLAG_HAS_FINGERPRINT
= 8,
5650 cflag_t flags
; // FLAG_*
5652 chunk_info_t() : offset(0), length(0), flags((cflag_t
)0) { }
5653 chunk_info_t(uint32_t offset
, uint32_t length
, hobject_t oid
) :
5654 offset(offset
), length(length
), oid(oid
), flags((cflag_t
)0) { }
5656 static std::string
get_flag_string(uint64_t flags
) {
5658 if (flags
& FLAG_DIRTY
) {
5661 if (flags
& FLAG_MISSING
) {
5664 if (flags
& FLAG_HAS_REFERENCE
) {
5665 r
+= "|has_reference";
5667 if (flags
& FLAG_HAS_FINGERPRINT
) {
5668 r
+= "|has_fingerprint";
5674 bool test_flag(cflag_t f
) const {
5675 return (flags
& f
) == f
;
5677 void set_flag(cflag_t f
) {
5678 flags
= (cflag_t
)(flags
| f
);
5680 void set_flags(cflag_t f
) {
5683 void clear_flag(cflag_t f
) {
5684 flags
= (cflag_t
)(flags
& ~f
);
5686 void clear_flags() {
5689 bool is_dirty() const {
5690 return test_flag(FLAG_DIRTY
);
5692 bool is_missing() const {
5693 return test_flag(FLAG_MISSING
);
5695 bool has_reference() const {
5696 return test_flag(FLAG_HAS_REFERENCE
);
5698 bool has_fingerprint() const {
5699 return test_flag(FLAG_HAS_FINGERPRINT
);
5701 void encode(ceph::buffer::list
&bl
) const;
5702 void decode(ceph::buffer::list::const_iterator
&bl
);
5703 void dump(ceph::Formatter
*f
) const;
5704 friend std::ostream
& operator<<(std::ostream
& out
, const chunk_info_t
& ci
);
5705 bool operator==(const chunk_info_t
& cit
) const;
5706 bool operator!=(const chunk_info_t
& cit
) const {
5707 return !(cit
== *this);
5710 WRITE_CLASS_ENCODER(chunk_info_t
)
5711 std::ostream
& operator<<(std::ostream
& out
, const chunk_info_t
& ci
);
5713 struct object_info_t
;
5714 struct object_manifest_t
{
5720 uint8_t type
; // redirect, chunked, ...
5721 hobject_t redirect_target
;
5722 std::map
<uint64_t, chunk_info_t
> chunk_map
;
5724 object_manifest_t() : type(0) { }
5725 object_manifest_t(uint8_t type
, const hobject_t
& redirect_target
)
5726 : type(type
), redirect_target(redirect_target
) { }
5728 bool is_empty() const {
5729 return type
== TYPE_NONE
;
5731 bool is_redirect() const {
5732 return type
== TYPE_REDIRECT
;
5734 bool is_chunked() const {
5735 return type
== TYPE_CHUNKED
;
5737 static std::string_view
get_type_name(uint8_t m
) {
5739 case TYPE_NONE
: return "none";
5740 case TYPE_REDIRECT
: return "redirect";
5741 case TYPE_CHUNKED
: return "chunked";
5742 default: return "unknown";
5745 std::string_view
get_type_name() const {
5746 return get_type_name(type
);
5750 redirect_target
= hobject_t();
5755 * calc_refs_to_inc_on_set
5757 * Takes a manifest and returns the set of refs to
5758 * increment upon set-chunk
5760 * l should be nullptr if there are no clones, or
5761 * l and g may each be null if the corresponding clone does not exist.
5762 * *this contains the set of new references to set
5765 void calc_refs_to_inc_on_set(
5766 const object_manifest_t
* g
, ///< [in] manifest for clone > *this
5767 const object_manifest_t
* l
, ///< [in] manifest for clone < *this
5768 object_ref_delta_t
&delta
///< [out] set of refs to drop
5772 * calc_refs_to_drop_on_modify
5774 * Takes a manifest and returns the set of refs to
5775 * drop upon modification
5777 * l should be nullptr if there are no clones, or
5778 * l may be null if the corresponding clone does not exist.
5781 void calc_refs_to_drop_on_modify(
5782 const object_manifest_t
* l
, ///< [in] manifest for previous clone
5783 const ObjectCleanRegions
& clean_regions
, ///< [in] clean regions
5784 object_ref_delta_t
&delta
///< [out] set of refs to drop
5788 * calc_refs_to_drop_on_removal
5790 * Takes the two adjacent manifests and returns the set of refs to
5791 * drop upon removal of the clone containing *this.
5793 * g should be nullptr if *this is on HEAD, l should be nullptr if
5794 * *this is on the oldest clone (or head if there are no clones).
5796 void calc_refs_to_drop_on_removal(
5797 const object_manifest_t
* g
, ///< [in] manifest for clone > *this
5798 const object_manifest_t
* l
, ///< [in] manifest for clone < *this
5799 object_ref_delta_t
&delta
///< [out] set of refs to drop
5802 static void generate_test_instances(std::list
<object_manifest_t
*>& o
);
5803 void encode(ceph::buffer::list
&bl
) const;
5804 void decode(ceph::buffer::list::const_iterator
&bl
);
5805 void dump(ceph::Formatter
*f
) const;
5806 friend std::ostream
& operator<<(std::ostream
& out
, const object_info_t
& oi
);
5808 WRITE_CLASS_ENCODER(object_manifest_t
)
5809 std::ostream
& operator<<(std::ostream
& out
, const object_manifest_t
& oi
);
5811 struct object_info_t
{
5813 eversion_t version
, prior_version
;
5814 version_t user_version
;
5815 osd_reqid_t last_reqid
;
5819 utime_t local_mtime
; // local mtime
5821 // note: these are currently encoded into a total 16 bits; see
5822 // encode()/decode() for the weirdness.
5825 FLAG_WHITEOUT
= 1<<1, // object logically does not exist
5826 FLAG_DIRTY
= 1<<2, // object has been modified since last flushed or undirtied
5827 FLAG_OMAP
= 1<<3, // has (or may have) some/any omap data
5828 FLAG_DATA_DIGEST
= 1<<4, // has data crc
5829 FLAG_OMAP_DIGEST
= 1<<5, // has omap crc
5830 FLAG_CACHE_PIN
= 1<<6, // pin the object in cache tier
5831 FLAG_MANIFEST
= 1<<7, // has manifest
5832 FLAG_USES_TMAP
= 1<<8, // deprecated; no longer used
5833 FLAG_REDIRECT_HAS_REFERENCE
= 1<<9, // has reference
5838 static std::string
get_flag_string(flag_t flags
) {
5840 std::vector
<std::string
> sv
= get_flag_vector(flags
);
5841 for (auto ss
: sv
) {
5842 s
+= std::string("|") + ss
;
5848 static std::vector
<std::string
> get_flag_vector(flag_t flags
) {
5849 std::vector
<std::string
> sv
;
5850 if (flags
& FLAG_LOST
)
5851 sv
.insert(sv
.end(), "lost");
5852 if (flags
& FLAG_WHITEOUT
)
5853 sv
.insert(sv
.end(), "whiteout");
5854 if (flags
& FLAG_DIRTY
)
5855 sv
.insert(sv
.end(), "dirty");
5856 if (flags
& FLAG_USES_TMAP
)
5857 sv
.insert(sv
.end(), "uses_tmap");
5858 if (flags
& FLAG_OMAP
)
5859 sv
.insert(sv
.end(), "omap");
5860 if (flags
& FLAG_DATA_DIGEST
)
5861 sv
.insert(sv
.end(), "data_digest");
5862 if (flags
& FLAG_OMAP_DIGEST
)
5863 sv
.insert(sv
.end(), "omap_digest");
5864 if (flags
& FLAG_CACHE_PIN
)
5865 sv
.insert(sv
.end(), "cache_pin");
5866 if (flags
& FLAG_MANIFEST
)
5867 sv
.insert(sv
.end(), "manifest");
5868 if (flags
& FLAG_REDIRECT_HAS_REFERENCE
)
5869 sv
.insert(sv
.end(), "redirect_has_reference");
5872 std::string
get_flag_string() const {
5873 return get_flag_string(flags
);
5876 uint64_t truncate_seq
, truncate_size
;
5878 std::map
<std::pair
<uint64_t, entity_name_t
>, watch_info_t
> watchers
;
5880 // opportunistic checksums; may or may not be present
5881 __u32 data_digest
; ///< data crc32c
5882 __u32 omap_digest
; ///< omap crc32c
5884 // alloc hint attribute
5885 uint64_t expected_object_size
, expected_write_size
;
5886 uint32_t alloc_hint_flags
;
5888 struct object_manifest_t manifest
;
5890 void copy_user_bits(const object_info_t
& other
);
5892 bool test_flag(flag_t f
) const {
5893 return (flags
& f
) == f
;
5895 void set_flag(flag_t f
) {
5896 flags
= (flag_t
)(flags
| f
);
5898 void clear_flag(flag_t f
) {
5899 flags
= (flag_t
)(flags
& ~f
);
5901 bool is_lost() const {
5902 return test_flag(FLAG_LOST
);
5904 bool is_whiteout() const {
5905 return test_flag(FLAG_WHITEOUT
);
5907 bool is_dirty() const {
5908 return test_flag(FLAG_DIRTY
);
5910 bool is_omap() const {
5911 return test_flag(FLAG_OMAP
);
5913 bool is_data_digest() const {
5914 return test_flag(FLAG_DATA_DIGEST
);
5916 bool is_omap_digest() const {
5917 return test_flag(FLAG_OMAP_DIGEST
);
5919 bool is_cache_pinned() const {
5920 return test_flag(FLAG_CACHE_PIN
);
5922 bool has_manifest() const {
5923 return test_flag(FLAG_MANIFEST
);
5925 void set_data_digest(__u32 d
) {
5926 set_flag(FLAG_DATA_DIGEST
);
5929 void set_omap_digest(__u32 d
) {
5930 set_flag(FLAG_OMAP_DIGEST
);
5933 void clear_data_digest() {
5934 clear_flag(FLAG_DATA_DIGEST
);
5937 void clear_omap_digest() {
5938 clear_flag(FLAG_OMAP_DIGEST
);
5942 clear_data_digest();
5943 clear_omap_digest();
5946 void encode(ceph::buffer::list
& bl
, uint64_t features
) const;
5947 void decode(ceph::buffer::list::const_iterator
& bl
);
5948 void decode(const ceph::buffer::list
& bl
) {
5949 auto p
= std::cbegin(bl
);
5953 void encode_no_oid(ceph::buffer::list
& bl
, uint64_t features
) {
5954 // TODO: drop soid field and remove the denc no_oid methods
5955 auto tmp_oid
= hobject_t(hobject_t::get_max());
5957 encode(bl
, features
);
5960 void decode_no_oid(ceph::buffer::list::const_iterator
& bl
) {
5962 ceph_assert(soid
.is_max());
5964 void decode_no_oid(const ceph::buffer::list
& bl
) {
5965 auto p
= std::cbegin(bl
);
5968 void decode_no_oid(const ceph::buffer::list
& bl
, const hobject_t
& _soid
) {
5969 auto p
= std::cbegin(bl
);
5974 void dump(ceph::Formatter
*f
) const;
5975 static void generate_test_instances(std::list
<object_info_t
*>& o
);
5977 explicit object_info_t()
5978 : user_version(0), size(0), flags((flag_t
)0),
5979 truncate_seq(0), truncate_size(0),
5980 data_digest(-1), omap_digest(-1),
5981 expected_object_size(0), expected_write_size(0),
5985 explicit object_info_t(const hobject_t
& s
)
5987 user_version(0), size(0), flags((flag_t
)0),
5988 truncate_seq(0), truncate_size(0),
5989 data_digest(-1), omap_digest(-1),
5990 expected_object_size(0), expected_write_size(0),
5994 explicit object_info_t(const ceph::buffer::list
& bl
) {
5998 explicit object_info_t(const ceph::buffer::list
& bl
, const hobject_t
& _soid
) {
6003 WRITE_CLASS_ENCODER_FEATURES(object_info_t
)
6005 std::ostream
& operator<<(std::ostream
& out
, const object_info_t
& oi
);
6010 struct ObjectRecoveryInfo
{
6015 SnapSet ss
; // only populated if soid is_snap()
6016 interval_set
<uint64_t> copy_subset
;
6017 std::map
<hobject_t
, interval_set
<uint64_t>> clone_subset
;
6020 ObjectRecoveryInfo() : size(0), object_exist(true) { }
6022 static void generate_test_instances(std::list
<ObjectRecoveryInfo
*>& o
);
6023 void encode(ceph::buffer::list
&bl
, uint64_t features
) const;
6024 void decode(ceph::buffer::list::const_iterator
&bl
, int64_t pool
= -1);
6025 std::ostream
&print(std::ostream
&out
) const;
6026 void dump(ceph::Formatter
*f
) const;
6028 WRITE_CLASS_ENCODER_FEATURES(ObjectRecoveryInfo
)
6029 std::ostream
& operator<<(std::ostream
& out
, const ObjectRecoveryInfo
&inf
);
6031 struct ObjectRecoveryProgress
{
6032 uint64_t data_recovered_to
;
6033 std::string omap_recovered_to
;
6039 ObjectRecoveryProgress()
6040 : data_recovered_to(0),
6042 data_complete(false), omap_complete(false) { }
6044 bool is_complete(const ObjectRecoveryInfo
& info
) const {
6045 return (data_recovered_to
>= (
6046 info
.copy_subset
.empty() ?
6047 0 : info
.copy_subset
.range_end())) &&
6051 static void generate_test_instances(std::list
<ObjectRecoveryProgress
*>& o
);
6052 void encode(ceph::buffer::list
&bl
) const;
6053 void decode(ceph::buffer::list::const_iterator
&bl
);
6054 std::ostream
&print(std::ostream
&out
) const;
6055 void dump(ceph::Formatter
*f
) const;
6057 WRITE_CLASS_ENCODER(ObjectRecoveryProgress
)
6058 std::ostream
& operator<<(std::ostream
& out
, const ObjectRecoveryProgress
&prog
);
6060 struct PushReplyOp
{
6063 static void generate_test_instances(std::list
<PushReplyOp
*>& o
);
6064 void encode(ceph::buffer::list
&bl
) const;
6065 void decode(ceph::buffer::list::const_iterator
&bl
);
6066 std::ostream
&print(std::ostream
&out
) const;
6067 void dump(ceph::Formatter
*f
) const;
6069 uint64_t cost(CephContext
*cct
) const;
6071 WRITE_CLASS_ENCODER(PushReplyOp
)
6072 std::ostream
& operator<<(std::ostream
& out
, const PushReplyOp
&op
);
6077 ObjectRecoveryInfo recovery_info
;
6078 ObjectRecoveryProgress recovery_progress
;
6080 static void generate_test_instances(std::list
<PullOp
*>& o
);
6081 void encode(ceph::buffer::list
&bl
, uint64_t features
) const;
6082 void decode(ceph::buffer::list::const_iterator
&bl
);
6083 std::ostream
&print(std::ostream
&out
) const;
6084 void dump(ceph::Formatter
*f
) const;
6086 uint64_t cost(CephContext
*cct
) const;
6088 WRITE_CLASS_ENCODER_FEATURES(PullOp
)
6089 std::ostream
& operator<<(std::ostream
& out
, const PullOp
&op
);
6094 ceph::buffer::list data
;
6095 interval_set
<uint64_t> data_included
;
6096 ceph::buffer::list omap_header
;
6097 std::map
<std::string
, ceph::buffer::list
> omap_entries
;
6098 std::map
<std::string
, ceph::buffer::list
, std::less
<>> attrset
;
6100 ObjectRecoveryInfo recovery_info
;
6101 ObjectRecoveryProgress before_progress
;
6102 ObjectRecoveryProgress after_progress
;
6104 static void generate_test_instances(std::list
<PushOp
*>& o
);
6105 void encode(ceph::buffer::list
&bl
, uint64_t features
) const;
6106 void decode(ceph::buffer::list::const_iterator
&bl
);
6107 std::ostream
&print(std::ostream
&out
) const;
6108 void dump(ceph::Formatter
*f
) const;
6110 uint64_t cost(CephContext
*cct
) const;
6112 WRITE_CLASS_ENCODER_FEATURES(PushOp
)
6113 std::ostream
& operator<<(std::ostream
& out
, const PushOp
&op
);
6116 * summarize pg contents for purposes of a scrub
6120 std::map
<std::string
, ceph::buffer::ptr
, std::less
<>> attrs
;
6122 __u32 omap_digest
; ///< omap crc32c
6123 __u32 digest
; ///< data crc32c
6125 bool digest_present
:1;
6126 bool omap_digest_present
:1;
6129 bool ec_hash_mismatch
:1;
6130 bool ec_size_mismatch
:1;
6131 bool large_omap_object_found
:1;
6132 uint64_t large_omap_object_key_count
= 0;
6133 uint64_t large_omap_object_value_size
= 0;
6134 uint64_t object_omap_bytes
= 0;
6135 uint64_t object_omap_keys
= 0;
6138 // Init invalid size so it won't match if we get a stat EIO error
6139 size(-1), omap_digest(0), digest(0),
6140 negative(false), digest_present(false), omap_digest_present(false),
6141 read_error(false), stat_error(false), ec_hash_mismatch(false),
6142 ec_size_mismatch(false), large_omap_object_found(false) {}
6144 void encode(ceph::buffer::list
& bl
) const;
6145 void decode(ceph::buffer::list::const_iterator
& bl
);
6146 void dump(ceph::Formatter
*f
) const;
6147 static void generate_test_instances(std::list
<object
*>& o
);
6149 WRITE_CLASS_ENCODER(object
)
6151 std::map
<hobject_t
,object
> objects
;
6152 eversion_t valid_through
;
6153 eversion_t incr_since
;
6154 bool has_large_omap_object_errors
:1;
6155 bool has_omap_keys
:1;
6157 void merge_incr(const ScrubMap
&l
);
6158 void clear_from(const hobject_t
& start
) {
6159 objects
.erase(objects
.lower_bound(start
), objects
.end());
6161 void insert(const ScrubMap
&r
) {
6162 objects
.insert(r
.objects
.begin(), r
.objects
.end());
6164 void swap(ScrubMap
&r
) {
6166 swap(objects
, r
.objects
);
6167 swap(valid_through
, r
.valid_through
);
6168 swap(incr_since
, r
.incr_since
);
6171 void encode(ceph::buffer::list
& bl
) const;
6172 void decode(ceph::buffer::list::const_iterator
& bl
, int64_t pool
=-1);
6173 void dump(ceph::Formatter
*f
) const;
6174 static void generate_test_instances(std::list
<ScrubMap
*>& o
);
6176 WRITE_CLASS_ENCODER(ScrubMap::object
)
6177 WRITE_CLASS_ENCODER(ScrubMap
)
6179 struct ScrubMapBuilder
{
6181 std::vector
<hobject_t
> ls
;
6183 int64_t data_pos
= 0;
6184 std::string omap_pos
;
6186 ceph::buffer::hash data_hash
, omap_hash
; ///< accumulatinng hash value
6187 uint64_t omap_keys
= 0;
6188 uint64_t omap_bytes
= 0;
6194 return pos
>= ls
.size();
6197 *this = ScrubMapBuilder();
6201 return data_pos
< 0;
6204 void next_object() {
6212 friend std::ostream
& operator<<(std::ostream
& out
, const ScrubMapBuilder
& pos
) {
6213 out
<< "(" << pos
.pos
<< "/" << pos
.ls
.size();
6214 if (pos
.pos
< pos
.ls
.size()) {
6215 out
<< " " << pos
.ls
[pos
.pos
];
6217 if (pos
.data_pos
< 0) {
6218 out
<< " byte " << pos
.data_pos
;
6220 if (!pos
.omap_pos
.empty()) {
6221 out
<< " key " << pos
.omap_pos
;
6227 out
<< " ret " << pos
.ret
;
6233 struct watch_item_t
{
6236 uint32_t timeout_seconds
;
6239 watch_item_t() : cookie(0), timeout_seconds(0) { }
6240 watch_item_t(entity_name_t name
, uint64_t cookie
, uint32_t timeout
,
6241 const entity_addr_t
& addr
)
6242 : name(name
), cookie(cookie
), timeout_seconds(timeout
),
6245 void encode(ceph::buffer::list
&bl
, uint64_t features
) const {
6246 ENCODE_START(2, 1, bl
);
6249 encode(timeout_seconds
, bl
);
6250 encode(addr
, bl
, features
);
6253 void decode(ceph::buffer::list::const_iterator
&bl
) {
6254 DECODE_START(2, bl
);
6257 decode(timeout_seconds
, bl
);
6258 if (struct_v
>= 2) {
6263 void dump(ceph::Formatter
*f
) const {
6264 f
->dump_stream("watcher") << name
;
6265 f
->dump_int("cookie", cookie
);
6266 f
->dump_int("timeout", timeout_seconds
);
6267 f
->open_object_section("addr");
6271 static void generate_test_instances(std::list
<watch_item_t
*>& o
) {
6273 ea
.set_type(entity_addr_t::TYPE_LEGACY
);
6275 ea
.set_family(AF_INET
);
6276 ea
.set_in4_quad(0, 127);
6277 ea
.set_in4_quad(1, 0);
6278 ea
.set_in4_quad(2, 0);
6279 ea
.set_in4_quad(3, 1);
6281 o
.push_back(new watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT
, 1), 10, 30, ea
));
6283 ea
.set_in4_quad(3, 2);
6285 o
.push_back(new watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT
, 2), 20, 60, ea
));
6288 WRITE_CLASS_ENCODER_FEATURES(watch_item_t
)
6290 struct obj_watch_item_t
{
6296 * obj list watch response format
6299 struct obj_list_watch_response_t
{
6300 std::list
<watch_item_t
> entries
;
6302 void encode(ceph::buffer::list
& bl
, uint64_t features
) const {
6303 ENCODE_START(1, 1, bl
);
6304 encode(entries
, bl
, features
);
6307 void decode(ceph::buffer::list::const_iterator
& bl
) {
6308 DECODE_START(1, bl
);
6309 decode(entries
, bl
);
6312 void dump(ceph::Formatter
*f
) const {
6313 f
->open_array_section("entries");
6314 for (std::list
<watch_item_t
>::const_iterator p
= entries
.begin(); p
!= entries
.end(); ++p
) {
6315 f
->open_object_section("watch");
6321 static void generate_test_instances(std::list
<obj_list_watch_response_t
*>& o
) {
6323 o
.push_back(new obj_list_watch_response_t
);
6324 o
.push_back(new obj_list_watch_response_t
);
6325 std::list
<watch_item_t
*> test_watchers
;
6326 watch_item_t::generate_test_instances(test_watchers
);
6327 for (auto &e
: test_watchers
) {
6328 o
.back()->entries
.push_back(*e
);
6333 WRITE_CLASS_ENCODER_FEATURES(obj_list_watch_response_t
)
6337 std::vector
<snapid_t
> snaps
; // ascending
6338 std::vector
< std::pair
<uint64_t,uint64_t> > overlap
;
6341 clone_info() : cloneid(CEPH_NOSNAP
), size(0) {}
6343 void encode(ceph::buffer::list
& bl
) const {
6344 ENCODE_START(1, 1, bl
);
6345 encode(cloneid
, bl
);
6347 encode(overlap
, bl
);
6351 void decode(ceph::buffer::list::const_iterator
& bl
) {
6352 DECODE_START(1, bl
);
6353 decode(cloneid
, bl
);
6355 decode(overlap
, bl
);
6359 void dump(ceph::Formatter
*f
) const {
6360 if (cloneid
== CEPH_NOSNAP
)
6361 f
->dump_string("cloneid", "HEAD");
6363 f
->dump_unsigned("cloneid", cloneid
.val
);
6364 f
->open_array_section("snapshots");
6365 for (std::vector
<snapid_t
>::const_iterator p
= snaps
.begin(); p
!= snaps
.end(); ++p
) {
6366 f
->open_object_section("snap");
6367 f
->dump_unsigned("id", p
->val
);
6371 f
->open_array_section("overlaps");
6372 for (std::vector
< std::pair
<uint64_t,uint64_t> >::const_iterator q
= overlap
.begin();
6373 q
!= overlap
.end(); ++q
) {
6374 f
->open_object_section("overlap");
6375 f
->dump_unsigned("offset", q
->first
);
6376 f
->dump_unsigned("length", q
->second
);
6380 f
->dump_unsigned("size", size
);
6382 static void generate_test_instances(std::list
<clone_info
*>& o
) {
6383 o
.push_back(new clone_info
);
6384 o
.push_back(new clone_info
);
6385 o
.back()->cloneid
= 1;
6386 o
.back()->snaps
.push_back(1);
6387 o
.back()->overlap
.push_back(std::pair
<uint64_t,uint64_t>(0,4096));
6388 o
.back()->overlap
.push_back(std::pair
<uint64_t,uint64_t>(8192,4096));
6389 o
.back()->size
= 16384;
6390 o
.push_back(new clone_info
);
6391 o
.back()->cloneid
= CEPH_NOSNAP
;
6392 o
.back()->size
= 32768;
6395 WRITE_CLASS_ENCODER(clone_info
)
6398 * obj list snaps response format
6401 struct obj_list_snap_response_t
{
6402 std::vector
<clone_info
> clones
; // ascending
6405 void encode(ceph::buffer::list
& bl
) const {
6406 ENCODE_START(2, 1, bl
);
6411 void decode(ceph::buffer::list::const_iterator
& bl
) {
6412 DECODE_START(2, bl
);
6420 void dump(ceph::Formatter
*f
) const {
6421 f
->open_array_section("clones");
6422 for (std::vector
<clone_info
>::const_iterator p
= clones
.begin(); p
!= clones
.end(); ++p
) {
6423 f
->open_object_section("clone");
6427 f
->dump_unsigned("seq", seq
);
6430 static void generate_test_instances(std::list
<obj_list_snap_response_t
*>& o
) {
6431 o
.push_back(new obj_list_snap_response_t
);
6432 o
.push_back(new obj_list_snap_response_t
);
6435 cl
.snaps
.push_back(1);
6436 cl
.overlap
.push_back(std::pair
<uint64_t,uint64_t>(0,4096));
6437 cl
.overlap
.push_back(std::pair
<uint64_t,uint64_t>(8192,4096));
6439 o
.back()->clones
.push_back(cl
);
6440 cl
.cloneid
= CEPH_NOSNAP
;
6444 o
.back()->clones
.push_back(cl
);
6445 o
.back()->seq
= 123;
6449 WRITE_CLASS_ENCODER(obj_list_snap_response_t
)
// NOTE(review): extraction dropped the bodies of finish() and
// sample_and_attenuate() along with the struct closing brace. Code left
// byte-identical; only comments added.
//
// PromoteCounter: lock-free counters tracking cache-tier promotion attempts,
// promoted objects, and promoted bytes (atomics — safe for concurrent update).
6453 struct PromoteCounter
{
6454 std::atomic
<unsigned long long> attempts
{0};
6455 std::atomic
<unsigned long long> objects
{0};
6456 std::atomic
<unsigned long long> bytes
{0};
// record a completed promotion of `size` bytes (body dropped by extraction)
6462 void finish(uint64_t size
) {
// read out current counters and decay them (body dropped by extraction)
6467 void sample_and_attenuate(uint64_t *a
, uint64_t *o
, uint64_t *b
) {
6477 struct pool_pg_num_history_t
{
6478 /// last epoch updated
6480 /// poolid -> epoch -> pg_num
6481 std::map
<int64_t, std::map
<epoch_t
,uint32_t>> pg_nums
;
6482 /// pair(epoch, poolid)
6483 std::set
<std::pair
<epoch_t
,int64_t>> deleted_pools
;
6485 void log_pg_num_change(epoch_t epoch
, int64_t pool
, uint32_t pg_num
) {
6486 pg_nums
[pool
][epoch
] = pg_num
;
6488 void log_pool_delete(epoch_t epoch
, int64_t pool
) {
6489 deleted_pools
.insert(std::make_pair(epoch
, pool
));
6492 /// prune history based on oldest osdmap epoch in the cluster
6493 void prune(epoch_t oldest_epoch
) {
6494 auto i
= deleted_pools
.begin();
6495 while (i
!= deleted_pools
.end()) {
6496 if (i
->first
>= oldest_epoch
) {
6499 pg_nums
.erase(i
->second
);
6500 i
= deleted_pools
.erase(i
);
6502 for (auto& j
: pg_nums
) {
6503 auto k
= j
.second
.lower_bound(oldest_epoch
);
6504 // keep this and the entry before it (just to be paranoid)
6505 if (k
!= j
.second
.begin()) {
6507 j
.second
.erase(j
.second
.begin(), k
);
6512 void encode(ceph::buffer::list
& bl
) const {
6513 ENCODE_START(1, 1, bl
);
6515 encode(pg_nums
, bl
);
6516 encode(deleted_pools
, bl
);
6519 void decode(ceph::buffer::list::const_iterator
& p
) {
6523 decode(deleted_pools
, p
);
6526 void dump(ceph::Formatter
*f
) const {
6527 f
->dump_unsigned("epoch", epoch
);
6528 f
->open_object_section("pools");
6529 for (auto& i
: pg_nums
) {
6530 f
->open_object_section("pool");
6531 f
->dump_unsigned("pool_id", i
.first
);
6532 f
->open_array_section("changes");
6533 for (auto& j
: i
.second
) {
6534 f
->open_object_section("change");
6535 f
->dump_unsigned("epoch", j
.first
);
6536 f
->dump_unsigned("pg_num", j
.second
);
6543 f
->open_array_section("deleted_pools");
6544 for (auto& i
: deleted_pools
) {
6545 f
->open_object_section("deletion");
6546 f
->dump_unsigned("pool_id", i
.second
);
6547 f
->dump_unsigned("epoch", i
.first
);
6552 static void generate_test_instances(std::list
<pool_pg_num_history_t
*>& ls
) {
6553 ls
.push_back(new pool_pg_num_history_t
);
6555 friend std::ostream
& operator<<(std::ostream
& out
, const pool_pg_num_history_t
& h
) {
6556 return out
<< "pg_num_history(e" << h
.epoch
6557 << " pg_nums " << h
.pg_nums
6558 << " deleted_pools " << h
.deleted_pools
6562 WRITE_CLASS_ENCODER(pool_pg_num_history_t
)
6564 // prefix pgmeta_oid keys with _ so that PGLog::read_log_and_missing() can
6566 static const std::string_view infover_key
= "_infover";
6567 static const std::string_view info_key
= "_info";
6568 static const std::string_view biginfo_key
= "_biginfo";
6569 static const std::string_view epoch_key
= "_epoch";
6570 static const std::string_view fastinfo_key
= "_fastinfo";
6572 static const __u8 pg_latest_struct_v
= 10;
6573 // v10 is the new past_intervals encoding
6574 // v9 was fastinfo_key addition
6575 // v8 was the move to a per-pg pgmeta object
6576 // v7 was SnapMapper addition in 86658392516d5175b2756659ef7ffaaf95b0f8ad
6577 // (first appeared in cuttlefish).
6578 static const __u8 pg_compat_struct_v
= 10;
6580 int prepare_info_keymap(
6582 std::map
<std::string
,ceph::buffer::list
> *km
,
6583 std::string
*key_to_remove
,
6586 pg_info_t
&last_written_info
,
6587 PastIntervals
&past_intervals
,
6588 bool dirty_big_info
,
6591 PerfCounters
*logger
= nullptr,
6592 DoutPrefixProvider
*dpp
= nullptr);
6594 namespace ceph::os
{
6598 void create_pg_collection(
6599 ceph::os::Transaction
& t
, spg_t pgid
, int bits
);
6601 void init_pg_ondisk(
6602 ceph::os::Transaction
& t
, spg_t pgid
, const pg_pool_t
*pool
);
6604 // omap specific stats
6605 struct omap_stat_t
{
6606 int large_omap_objects
;
6611 // filter for pg listings
6618 virtual ~PGLSFilter();
6619 virtual bool filter(const hobject_t
&obj
,
6620 const ceph::buffer::list
& xattr_data
) const = 0;
6623 * Arguments passed from the RADOS client. Implementations must
6624 * handle any encoding errors, and return an appropriate error code,
6625 * or 0 on valid input.
6627 virtual int init(ceph::buffer::list::const_iterator
¶ms
) = 0;
6630 * xattr key, or empty string. If non-empty, this xattr will be fetched
6631 * and the value passed into ::filter
6633 virtual const std::string
& get_xattr() const { return xattr
; }
6636 * If true, objects without the named xattr (if xattr name is not empty)
6637 * will be rejected without calling ::filter
6639 virtual bool reject_empty_xattr() const { return true; }
6642 class PGLSPlainFilter
: public PGLSFilter
{
6645 int init(ceph::buffer::list::const_iterator
¶ms
) override
;
6646 ~PGLSPlainFilter() override
{}
6647 bool filter(const hobject_t
& obj
,
6648 const ceph::buffer::list
& xattr_data
) const override
;
6651 // alias name for this structure:
6652 using missing_map_t
= std::map
<hobject_t
,
6653 std::pair
<std::optional
<uint32_t>,
6654 std::optional
<uint32_t>>>;