1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
9 * Author: Loic Dachary <loic@dachary.org>
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
18 #ifndef CEPH_OSD_TYPES_H
19 #define CEPH_OSD_TYPES_H
25 #include <string_view>
27 #include <boost/scoped_ptr.hpp>
28 #include <boost/optional/optional_io.hpp>
29 #include <boost/variant.hpp>
30 #include <boost/smart_ptr/local_shared_ptr.hpp>
32 #include "include/rados/rados_types.hpp"
33 #include "include/mempool.h"
35 #include "msg/msg_types.h"
36 #include "include/compat.h"
37 #include "include/types.h"
38 #include "include/utime.h"
39 #include "include/CompatSet.h"
40 #include "common/ceph_context.h"
41 #include "common/histogram.h"
42 #include "include/interval_set.h"
43 #include "include/inline_memory.h"
44 #include "common/Formatter.h"
45 #include "common/bloom_filter.hpp"
46 #include "common/hobject.h"
47 #include "common/snap_types.h"
50 #include "include/cmp.h"
51 #include "librados/ListObjectImpl.h"
52 #include "compressor/Compressor.h"
53 #include "osd_perf_counters.h"
55 #define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v026"
57 #define CEPH_OSD_FEATURE_INCOMPAT_BASE CompatSet::Feature(1, "initial feature set(~v.18)")
58 #define CEPH_OSD_FEATURE_INCOMPAT_PGINFO CompatSet::Feature(2, "pginfo object")
59 #define CEPH_OSD_FEATURE_INCOMPAT_OLOC CompatSet::Feature(3, "object locator")
60 #define CEPH_OSD_FEATURE_INCOMPAT_LEC CompatSet::Feature(4, "last_epoch_clean")
61 #define CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES CompatSet::Feature(5, "categories")
62 #define CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL CompatSet::Feature(6, "hobjectpool")
63 #define CEPH_OSD_FEATURE_INCOMPAT_BIGINFO CompatSet::Feature(7, "biginfo")
64 #define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO CompatSet::Feature(8, "leveldbinfo")
65 #define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG CompatSet::Feature(9, "leveldblog")
66 #define CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER CompatSet::Feature(10, "snapmapper")
67 #define CEPH_OSD_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(11, "sharded objects")
68 #define CEPH_OSD_FEATURE_INCOMPAT_HINTS CompatSet::Feature(12, "transaction hints")
69 #define CEPH_OSD_FEATURE_INCOMPAT_PGMETA CompatSet::Feature(13, "pg meta object")
70 #define CEPH_OSD_FEATURE_INCOMPAT_MISSING CompatSet::Feature(14, "explicit missing set")
71 #define CEPH_OSD_FEATURE_INCOMPAT_FASTINFO CompatSet::Feature(15, "fastinfo pg attr")
72 #define CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES CompatSet::Feature(16, "deletes in missing set")
73 #define CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2 CompatSet::Feature(17, "new snapmapper key structure")
76 /// pool priority range set by user
77 #define OSD_POOL_PRIORITY_MAX 10
78 #define OSD_POOL_PRIORITY_MIN -OSD_POOL_PRIORITY_MAX
80 /// min recovery priority for MBackfillReserve
81 #define OSD_RECOVERY_PRIORITY_MIN 0
83 /// base backfill priority for MBackfillReserve
84 #define OSD_BACKFILL_PRIORITY_BASE 100
86 /// base backfill priority for MBackfillReserve (degraded PG)
87 #define OSD_BACKFILL_DEGRADED_PRIORITY_BASE 140
89 /// base recovery priority for MRecoveryReserve
90 #define OSD_RECOVERY_PRIORITY_BASE 180
92 /// base backfill priority for MBackfillReserve (inactive PG)
93 #define OSD_BACKFILL_INACTIVE_PRIORITY_BASE 220
95 /// base recovery priority for MRecoveryReserve (inactive PG)
96 #define OSD_RECOVERY_INACTIVE_PRIORITY_BASE 220
98 /// max manually/automatically set recovery priority for MBackfillReserve
99 #define OSD_RECOVERY_PRIORITY_MAX 253
101 /// backfill priority for MBackfillReserve, when forced manually
102 #define OSD_BACKFILL_PRIORITY_FORCED 254
104 /// recovery priority for MRecoveryReserve, when forced manually
105 #define OSD_RECOVERY_PRIORITY_FORCED 255
107 /// priority for pg deletion when osd is not fullish
108 #define OSD_DELETE_PRIORITY_NORMAL 179
110 /// priority for pg deletion when osd is approaching full
111 #define OSD_DELETE_PRIORITY_FULLISH 219
113 /// priority when more full
114 #define OSD_DELETE_PRIORITY_FULL 255
116 static std::map
<int, int> max_prio_map
= {
117 {OSD_BACKFILL_PRIORITY_BASE
, OSD_BACKFILL_DEGRADED_PRIORITY_BASE
- 1},
118 {OSD_BACKFILL_DEGRADED_PRIORITY_BASE
, OSD_RECOVERY_PRIORITY_BASE
- 1},
119 {OSD_RECOVERY_PRIORITY_BASE
, OSD_BACKFILL_INACTIVE_PRIORITY_BASE
- 1},
120 {OSD_RECOVERY_INACTIVE_PRIORITY_BASE
, OSD_RECOVERY_PRIORITY_MAX
},
121 {OSD_BACKFILL_INACTIVE_PRIORITY_BASE
, OSD_RECOVERY_PRIORITY_MAX
}
// Alias of hobject_t; the name suggests it is used as a position/cursor
// handle when listing the contents of a collection — confirm with callers.
124 typedef hobject_t collection_list_handle_t
;
126 /// convert a single CEPH_OSD_FLAG_* to a std::string
127 const char *ceph_osd_flag_name(unsigned flag
);
128 /// convert a single CEPH_OSD_OP_FLAG_* to a std::string
129 const char *ceph_osd_op_flag_name(unsigned flag
);
131 /// convert CEPH_OSD_FLAG_* op flags to a std::string
132 std::string
ceph_osd_flag_string(unsigned flags
);
133 /// convert CEPH_OSD_OP_FLAG_* op flags to a std::string
134 std::string
ceph_osd_op_flag_string(unsigned flags
);
135 /// convert CEPH_OSD_ALLOC_HINT_FLAG_* op flags to a std::string
136 std::string
ceph_osd_alloc_hint_flag_string(unsigned flags
);
// Alerts for a single OSD: presumably alert name -> human-readable detail
// string — verify key/value semantics against the producers of these maps.
138 typedef std::map
<std::string
,std::string
> osd_alert_list_t
;
139 /// map osd id -> alert_list_t
140 typedef std::map
<int, osd_alert_list_t
> osd_alerts_t
;
141 void dump(ceph::Formatter
* f
, const osd_alerts_t
& alerts
);
144 typedef interval_set
<
146 mempool::osdmap::flat_map
> snap_interval_set_t
;
150 * osd request identifier
152 * caller name + incarnation# + tid to uniquely identify this request.
155 entity_name_t name
; // who
157 int32_t inc
; // incarnation
162 osd_reqid_t(const entity_name_t
& a
, int i
, ceph_tid_t t
)
163 : name(a
), tid(t
), inc(i
)
166 DENC(osd_reqid_t
, v
, p
) {
173 void dump(ceph::Formatter
*f
) const;
174 static void generate_test_instances(std::list
<osd_reqid_t
*>& o
);
176 WRITE_CLASS_DENC(osd_reqid_t
)
181 static const int32_t NO_OSD
= 0x7fffffff;
184 pg_shard_t() : osd(-1), shard(shard_id_t::NO_SHARD
) {}
185 explicit pg_shard_t(int osd
) : osd(osd
), shard(shard_id_t::NO_SHARD
) {}
186 pg_shard_t(int osd
, shard_id_t shard
) : osd(osd
), shard(shard
) {}
187 bool is_undefined() const {
190 std::string
get_osd() const { return (osd
== NO_OSD
? "NONE" : std::to_string(osd
)); }
191 void encode(ceph::buffer::list
&bl
) const;
192 void decode(ceph::buffer::list::const_iterator
&bl
);
193 void dump(ceph::Formatter
*f
) const {
194 f
->dump_unsigned("osd", osd
);
195 if (shard
!= shard_id_t::NO_SHARD
) {
196 f
->dump_unsigned("shard", shard
);
200 WRITE_CLASS_ENCODER(pg_shard_t
)
201 WRITE_EQ_OPERATORS_2(pg_shard_t
, osd
, shard
)
202 WRITE_CMP_OPERATORS_2(pg_shard_t
, osd
, shard
)
203 std::ostream
& operator<<(std::ostream
&lhs
, const pg_shard_t
&rhs
);
205 using HobjToShardSetMapping
= std::map
<hobject_t
, std::set
<pg_shard_t
>>;
207 class IsPGRecoverablePredicate
{
210 * have encodes the shards available
212 virtual bool operator()(const std::set
<pg_shard_t
> &have
) const = 0;
213 virtual ~IsPGRecoverablePredicate() {}
216 class IsPGReadablePredicate
{
219 * have encodes the shards available
221 virtual bool operator()(const std::set
<pg_shard_t
> &have
) const = 0;
222 virtual ~IsPGReadablePredicate() {}
225 inline std::ostream
& operator<<(std::ostream
& out
, const osd_reqid_t
& r
) {
226 return out
<< r
.name
<< "." << r
.inc
<< ":" << r
.tid
;
229 inline bool operator==(const osd_reqid_t
& l
, const osd_reqid_t
& r
) {
230 return (l
.name
== r
.name
) && (l
.inc
== r
.inc
) && (l
.tid
== r
.tid
);
232 inline bool operator!=(const osd_reqid_t
& l
, const osd_reqid_t
& r
) {
233 return (l
.name
!= r
.name
) || (l
.inc
!= r
.inc
) || (l
.tid
!= r
.tid
);
235 inline bool operator<(const osd_reqid_t
& l
, const osd_reqid_t
& r
) {
236 return (l
.name
< r
.name
) || (l
.inc
< r
.inc
) ||
237 (l
.name
== r
.name
&& l
.inc
== r
.inc
&& l
.tid
< r
.tid
);
239 inline bool operator<=(const osd_reqid_t
& l
, const osd_reqid_t
& r
) {
240 return (l
.name
< r
.name
) || (l
.inc
< r
.inc
) ||
241 (l
.name
== r
.name
&& l
.inc
== r
.inc
&& l
.tid
<= r
.tid
);
// Greater-than for osd_reqid_t, defined as the exact negation of the
// operator<= above (which compares name, inc, then tid) so the two stay
// consistent by construction.
243 inline bool operator>(const osd_reqid_t
& l
, const osd_reqid_t
& r
) { return !(l
<= r
); }
// Greater-or-equal for osd_reqid_t, defined as the exact negation of the
// operator< above so ordering semantics cannot drift between the two.
244 inline bool operator>=(const osd_reqid_t
& l
, const osd_reqid_t
& r
) { return !(l
< r
); }
247 template<> struct hash
<osd_reqid_t
> {
248 size_t operator()(const osd_reqid_t
&r
) const {
249 static hash
<uint64_t> H
;
250 return H(r
.name
.num() ^ r
.tid
^ r
.inc
);
258 // a locator constrains the placement of an object. mainly, which pool
260 struct object_locator_t
{
261 // You specify either the hash or the key -- not both
262 std::int64_t pool
; ///< pool id
263 std::string key
; ///< key string (if non-empty)
264 std::string nspace
; ///< namespace
265 std::int64_t hash
; ///< hash position (if >= 0)
267 explicit object_locator_t()
268 : pool(-1), hash(-1) {}
269 explicit object_locator_t(int64_t po
)
270 : pool(po
), hash(-1) {}
271 explicit object_locator_t(int64_t po
, int64_t ps
)
272 : pool(po
), hash(ps
) {}
273 explicit object_locator_t(int64_t po
, std::string_view ns
)
274 : pool(po
), nspace(ns
), hash(-1) {}
275 explicit object_locator_t(int64_t po
, std::string_view ns
, int64_t ps
)
276 : pool(po
), nspace(ns
), hash(ps
) {}
277 explicit object_locator_t(int64_t po
, std::string_view ns
, std::string_view s
)
278 : pool(po
), key(s
), nspace(ns
), hash(-1) {}
279 explicit object_locator_t(const hobject_t
& soid
)
280 : pool(soid
.pool
), key(soid
.get_key()), nspace(soid
.nspace
), hash(-1) {}
282 int64_t get_pool() const {
297 void encode(ceph::buffer::list
& bl
) const;
298 void decode(ceph::buffer::list::const_iterator
& p
);
299 void dump(ceph::Formatter
*f
) const;
300 static void generate_test_instances(std::list
<object_locator_t
*>& o
);
302 WRITE_CLASS_ENCODER(object_locator_t
)
304 inline bool operator==(const object_locator_t
& l
, const object_locator_t
& r
) {
305 return l
.pool
== r
.pool
&& l
.key
== r
.key
&& l
.nspace
== r
.nspace
&& l
.hash
== r
.hash
;
307 inline bool operator!=(const object_locator_t
& l
, const object_locator_t
& r
) {
311 inline std::ostream
& operator<<(std::ostream
& out
, const object_locator_t
& loc
)
313 out
<< "@" << loc
.pool
;
314 if (loc
.nspace
.length())
315 out
<< ";" << loc
.nspace
;
316 if (loc
.key
.length())
317 out
<< ":" << loc
.key
;
321 struct request_redirect_t
{
323 object_locator_t redirect_locator
; ///< this is authoritative
324 std::string redirect_object
; ///< If non-empty, the request goes to this object name
326 friend std::ostream
& operator<<(std::ostream
& out
, const request_redirect_t
& redir
);
329 request_redirect_t() {}
330 explicit request_redirect_t(const object_locator_t
& orig
, int64_t rpool
) :
331 redirect_locator(orig
) { redirect_locator
.pool
= rpool
; }
332 explicit request_redirect_t(const object_locator_t
& rloc
) :
333 redirect_locator(rloc
) {}
334 explicit request_redirect_t(const object_locator_t
& orig
,
335 const std::string
& robj
) :
336 redirect_locator(orig
), redirect_object(robj
) {}
338 bool empty() const { return redirect_locator
.empty() &&
339 redirect_object
.empty(); }
341 void combine_with_locator(object_locator_t
& orig
, std::string
& obj
) const {
342 orig
= redirect_locator
;
343 if (!redirect_object
.empty())
344 obj
= redirect_object
;
347 void encode(ceph::buffer::list
& bl
) const;
348 void decode(ceph::buffer::list::const_iterator
& bl
);
349 void dump(ceph::Formatter
*f
) const;
350 static void generate_test_instances(std::list
<request_redirect_t
*>& o
);
352 WRITE_CLASS_ENCODER(request_redirect_t
)
354 inline std::ostream
& operator<<(std::ostream
& out
, const request_redirect_t
& redir
) {
355 out
<< "object " << redir
.redirect_object
<< ", locator{" << redir
.redirect_locator
<< "}";
359 // Internal OSD op flags - set by the OSD based on the op types
361 CEPH_OSD_RMW_FLAG_READ
= (1 << 1),
362 CEPH_OSD_RMW_FLAG_WRITE
= (1 << 2),
363 CEPH_OSD_RMW_FLAG_CLASS_READ
= (1 << 3),
364 CEPH_OSD_RMW_FLAG_CLASS_WRITE
= (1 << 4),
365 CEPH_OSD_RMW_FLAG_PGOP
= (1 << 5),
366 CEPH_OSD_RMW_FLAG_CACHE
= (1 << 6),
367 CEPH_OSD_RMW_FLAG_FORCE_PROMOTE
= (1 << 7),
368 CEPH_OSD_RMW_FLAG_SKIP_HANDLE_CACHE
= (1 << 8),
369 CEPH_OSD_RMW_FLAG_SKIP_PROMOTE
= (1 << 9),
370 CEPH_OSD_RMW_FLAG_RWORDERED
= (1 << 10),
371 CEPH_OSD_RMW_FLAG_RETURNVEC
= (1 << 11),
377 #define OSD_SUPERBLOCK_GOBJECT ghobject_t(hobject_t(sobject_t(object_t("osd_superblock"), 0)))
379 // placement seed (a hash value)
380 typedef uint32_t ps_t
;
382 // old (v1) pg_t encoding (wrap old struct ceph_pg)
385 void encode(ceph::buffer::list
& bl
) const {
386 ceph::encode_raw(v
, bl
);
388 void decode(ceph::buffer::list::const_iterator
& bl
) {
389 ceph::decode_raw(v
, bl
);
392 WRITE_CLASS_ENCODER(old_pg_t
)
394 // placement group id
399 pg_t() : m_pool(0), m_seed(0) {}
400 pg_t(ps_t seed
, uint64_t pool
) :
401 m_pool(pool
), m_seed(seed
) {}
402 // cppcheck-suppress noExplicitConstructor
403 pg_t(const ceph_pg
& cpg
) :
404 m_pool(cpg
.pool
), m_seed(cpg
.ps
) {}
406 // cppcheck-suppress noExplicitConstructor
407 pg_t(const old_pg_t
& opg
) {
411 old_pg_t
get_old_pg() const {
413 ceph_assert(m_pool
< 0xffffffffull
);
416 o
.v
.preferred
= (__s16
)-1;
423 int64_t pool() const {
427 static const uint8_t calc_name_buf_size
= 36; // max length for max values len("18446744073709551615.ffffffff") + future suffix len("_head") + '\0'
428 char *calc_name(char *buf
, const char *suffix_backwords
) const;
430 void set_ps(ps_t p
) {
433 void set_pool(uint64_t p
) {
437 pg_t
get_parent() const;
438 pg_t
get_ancestor(unsigned old_pg_num
) const;
440 int print(char *o
, int maxlen
) const;
441 bool parse(const char *s
);
443 bool is_split(unsigned old_pg_num
, unsigned new_pg_num
, std::set
<pg_t
> *pchildren
) const;
445 bool is_merge_source(unsigned old_pg_num
, unsigned new_pg_num
, pg_t
*parent
) const;
446 bool is_merge_target(unsigned old_pg_num
, unsigned new_pg_num
) const {
447 return ps() < new_pg_num
&& is_split(new_pg_num
, old_pg_num
, nullptr);
451 * Returns b such that for all object o:
452 * (~((~0)<<b) & o.hash) == 0 iff o is in the pg for *this
454 unsigned get_split_bits(unsigned pg_num
) const;
456 bool contains(int bits
, const ghobject_t
& oid
) const {
458 (int64_t)m_pool
== oid
.hobj
.get_logical_pool() &&
459 oid
.match(bits
, ps());
461 bool contains(int bits
, const hobject_t
& oid
) const {
463 (int64_t)m_pool
== oid
.get_logical_pool() &&
464 oid
.match(bits
, ps());
467 hobject_t
get_hobj_start() const;
468 hobject_t
get_hobj_end(unsigned pg_num
) const;
470 // strong ordering is supported
471 inline int compare(const pg_t
& p
) const noexcept
{
472 if (auto delta
= pool() - p
.pool(); delta
!= 0) {
474 } else if (ps() < p
.ps()) {
476 } else if (ps() > p
.ps()) {
483 void encode(ceph::buffer::list
& bl
) const {
489 encode((int32_t)-1, bl
); // was preferred
491 void decode(ceph::buffer::list::const_iterator
& bl
) {
497 bl
+= sizeof(int32_t); // was preferred
499 void decode_old(ceph::buffer::list::const_iterator
& bl
) {
505 void dump(ceph::Formatter
*f
) const;
506 static void generate_test_instances(std::list
<pg_t
*>& o
);
508 WRITE_CLASS_ENCODER(pg_t
)
510 inline bool operator<(const pg_t
& l
, const pg_t
& r
) {
511 return l
.compare(r
) < 0;
513 inline bool operator<=(const pg_t
& l
, const pg_t
& r
) {
514 return l
.compare(r
) <= 0;
516 inline bool operator==(const pg_t
& l
, const pg_t
& r
) {
517 return l
.compare(r
) == 0;
519 inline bool operator!=(const pg_t
& l
, const pg_t
& r
) {
520 return l
.compare(r
) != 0;
522 inline bool operator>(const pg_t
& l
, const pg_t
& r
) {
523 return l
.compare(r
) > 0;
525 inline bool operator>=(const pg_t
& l
, const pg_t
& r
) {
526 return l
.compare(r
) >= 0;
529 std::ostream
& operator<<(std::ostream
& out
, const pg_t
&pg
);
532 template<> struct hash
< pg_t
>
534 size_t operator()( const pg_t
& x
) const
536 static hash
<uint32_t> H
;
537 // xor (s32)-1 in there to preserve original m_preferred result (paranoia!)
538 return H((x
.pool() & 0xffffffff) ^ (x
.pool() >> 32) ^ x
.ps() ^ (int32_t)(-1));
546 spg_t() : shard(shard_id_t::NO_SHARD
) {}
547 spg_t(pg_t pgid
, shard_id_t shard
) : pgid(pgid
), shard(shard
) {}
548 explicit spg_t(pg_t pgid
) : pgid(pgid
), shard(shard_id_t::NO_SHARD
) {}
549 unsigned get_split_bits(unsigned pg_num
) const {
550 return pgid
.get_split_bits(pg_num
);
552 spg_t
get_parent() const {
553 return spg_t(pgid
.get_parent(), shard
);
558 uint64_t pool() const {
561 void reset_shard(shard_id_t s
) {
565 static const uint8_t calc_name_buf_size
= pg_t::calc_name_buf_size
+ 4; // 36 + len('s') + len("255");
566 char *calc_name(char *buf
, const char *suffix_backwords
) const;
568 bool parse(const char *s
);
569 bool parse(const std::string
& s
) {
570 return parse(s
.c_str());
573 spg_t
get_ancestor(unsigned old_pg_num
) const {
574 return spg_t(pgid
.get_ancestor(old_pg_num
), shard
);
577 bool is_split(unsigned old_pg_num
, unsigned new_pg_num
,
578 std::set
<spg_t
> *pchildren
) const {
579 std::set
<pg_t
> _children
;
580 std::set
<pg_t
> *children
= pchildren
? &_children
: NULL
;
581 bool is_split
= pgid
.is_split(old_pg_num
, new_pg_num
, children
);
582 if (pchildren
&& is_split
) {
583 for (std::set
<pg_t
>::iterator i
= _children
.begin();
584 i
!= _children
.end();
586 pchildren
->insert(spg_t(*i
, shard
));
591 bool is_merge_target(unsigned old_pg_num
, unsigned new_pg_num
) const {
592 return pgid
.is_merge_target(old_pg_num
, new_pg_num
);
594 bool is_merge_source(unsigned old_pg_num
, unsigned new_pg_num
,
595 spg_t
*parent
) const {
597 bool r
= pgid
.is_merge_source(old_pg_num
, new_pg_num
, &out
.pgid
);
604 bool is_no_shard() const {
605 return shard
== shard_id_t::NO_SHARD
;
608 ghobject_t
make_pgmeta_oid() const {
609 return ghobject_t::make_pgmeta(pgid
.pool(), pgid
.ps(), shard
);
612 void encode(ceph::buffer::list
&bl
) const {
613 ENCODE_START(1, 1, bl
);
618 void decode(ceph::buffer::list::const_iterator
& bl
) {
625 ghobject_t
make_temp_ghobject(const std::string
& name
) const {
627 hobject_t(object_t(name
), "", CEPH_NOSNAP
,
629 hobject_t::get_temp_pool(pgid
.pool()),
635 unsigned hash_to_shard(unsigned num_shards
) const {
636 return ps() % num_shards
;
639 WRITE_CLASS_ENCODER(spg_t
)
640 WRITE_EQ_OPERATORS_2(spg_t
, pgid
, shard
)
641 WRITE_CMP_OPERATORS_2(spg_t
, pgid
, shard
)
644 template<> struct hash
< spg_t
>
646 size_t operator()( const spg_t
& x
) const
648 static hash
<uint32_t> H
;
649 return H(hash
<pg_t
>()(x
.pgid
) ^ x
.shard
);
654 std::ostream
& operator<<(std::ostream
& out
, const spg_t
&pg
);
656 // ----------------------
659 enum type_t
: uint8_t {
661 TYPE_LEGACY_TEMP
= 1, /* no longer used */
667 uint64_t removal_seq
; // note: deprecated, not encoded
669 char _str_buff
[spg_t::calc_name_buf_size
];
674 coll_t(type_t t
, spg_t p
, uint64_t r
)
675 : type(t
), pgid(p
), removal_seq(r
) {
679 friend class denc_coll_t
;
681 coll_t() : type(TYPE_META
), removal_seq(0)
686 coll_t(const coll_t
& other
)
687 : type(other
.type
), pgid(other
.pgid
), removal_seq(other
.removal_seq
) {
691 explicit coll_t(spg_t pgid
)
692 : type(TYPE_PG
), pgid(pgid
), removal_seq(0)
697 coll_t
& operator=(const coll_t
& rhs
)
699 this->type
= rhs
.type
;
700 this->pgid
= rhs
.pgid
;
701 this->removal_seq
= rhs
.removal_seq
;
706 // named constructors
707 static coll_t
meta() {
710 static coll_t
pg(spg_t p
) {
714 const std::string
to_str() const {
715 return std::string(_str
);
717 const char *c_str() const {
721 bool parse(const std::string
& s
);
723 int operator<(const coll_t
&rhs
) const {
724 return type
< rhs
.type
||
725 (type
== rhs
.type
&& pgid
< rhs
.pgid
);
728 bool is_meta() const {
729 return type
== TYPE_META
;
731 bool is_pg_prefix(spg_t
*pgid_
) const {
732 if (type
== TYPE_PG
|| type
== TYPE_PG_TEMP
) {
739 return type
== TYPE_PG
;
741 bool is_pg(spg_t
*pgid_
) const {
742 if (type
== TYPE_PG
) {
748 bool is_temp() const {
749 return type
== TYPE_PG_TEMP
;
751 bool is_temp(spg_t
*pgid_
) const {
752 if (type
== TYPE_PG_TEMP
) {
758 int64_t pool() const {
762 void encode(ceph::buffer::list
& bl
) const;
763 void decode(ceph::buffer::list::const_iterator
& bl
);
764 size_t encoded_size() const;
766 inline bool operator==(const coll_t
& rhs
) const {
767 // only compare type if meta
768 if (type
!= rhs
.type
)
770 if (type
== TYPE_META
)
772 return type
== rhs
.type
&& pgid
== rhs
.pgid
;
774 inline bool operator!=(const coll_t
& rhs
) const {
775 return !(*this == rhs
);
778 // get a TEMP collection that corresponds to the current collection,
779 // which we presume is a pg collection.
780 coll_t
get_temp() const {
781 ceph_assert(type
== TYPE_PG
);
782 return coll_t(TYPE_PG_TEMP
, pgid
, 0);
785 ghobject_t
get_min_hobj() const {
789 o
.hobj
.pool
= pgid
.pool();
790 o
.set_shard(pgid
.shard
);
801 unsigned hash_to_shard(unsigned num_shards
) const {
803 return pgid
.hash_to_shard(num_shards
);
804 return 0; // whatever.
807 void dump(ceph::Formatter
*f
) const;
808 static void generate_test_instances(std::list
<coll_t
*>& o
);
811 WRITE_CLASS_ENCODER(coll_t
)
813 inline std::ostream
& operator<<(std::ostream
& out
, const coll_t
& c
) {
819 template<> struct hash
<coll_t
> {
820 size_t operator()(const coll_t
&c
) const {
822 std::string
str(c
.to_str());
823 std::string::const_iterator
end(str
.end());
824 for (std::string::const_iterator s
= str
.begin(); s
!= end
; ++s
) {
837 inline std::ostream
& operator<<(std::ostream
& out
, const ceph_object_layout
&ol
)
839 out
<< pg_t(ol
.ol_pgid
);
840 int su
= ol
.ol_stripe_unit
;
849 auto &get_type() const { return coll
.type
; }
850 auto &get_type() { return coll
.type
; }
851 auto &get_pgid() const { return coll
.pgid
; }
852 auto &get_pgid() { return coll
.pgid
; }
854 denc_coll_t() = default;
855 denc_coll_t(const denc_coll_t
&) = default;
856 denc_coll_t(denc_coll_t
&&) = default;
858 denc_coll_t
&operator=(const denc_coll_t
&) = default;
859 denc_coll_t
&operator=(denc_coll_t
&&) = default;
861 explicit denc_coll_t(const coll_t
&coll
) : coll(coll
) {}
862 operator coll_t() const {
866 bool operator<(const denc_coll_t
&rhs
) const {
867 return coll
< rhs
.coll
;
870 DENC(denc_coll_t
, v
, p
) {
872 denc(v
.get_type(), p
);
873 denc(v
.get_pgid().pgid
.m_pool
, p
);
874 denc(v
.get_pgid().pgid
.m_seed
, p
);
875 denc(v
.get_pgid().shard
.id
, p
);
879 WRITE_CLASS_DENC(denc_coll_t
)
882 // compound rados version type
883 /* WARNING: If add member in eversion_t, please make sure the encode/decode function
884 * work well. For little-endian machine, we should make sure there is no padding
885 * in 32-bit machine and 64-bit machine.
892 eversion_t() : version(0), epoch(0), __pad(0) {}
893 eversion_t(epoch_t e
, version_t v
) : version(v
), epoch(e
), __pad(0) {}
895 // cppcheck-suppress noExplicitConstructor
896 eversion_t(const ceph_eversion
& ce
) :
901 explicit eversion_t(ceph::buffer::list
& bl
) : __pad(0) { decode(bl
); }
903 static const eversion_t
& max() {
904 static const eversion_t
max(-1,-1);
908 operator ceph_eversion() {
915 std::string
get_key_name() const;
917 // key must point to the beginning of a block of 32 chars
918 inline void get_key_name(char* key
) const {
919 // Below is equivalent of sprintf("%010u.%020llu");
921 ritoa
<uint64_t, 10, 20>(version
, key
+ 31);
923 ritoa
<uint32_t, 10, 10>(epoch
, key
+ 10);
926 void encode(ceph::buffer::list
&bl
) const {
927 #if defined(CEPH_LITTLE_ENDIAN)
928 bl
.append((char *)this, sizeof(version_t
) + sizeof(epoch_t
));
935 void decode(ceph::buffer::list::const_iterator
&bl
) {
936 #if defined(CEPH_LITTLE_ENDIAN)
937 bl
.copy(sizeof(version_t
) + sizeof(epoch_t
), (char *)this);
944 void decode(ceph::buffer::list
& bl
) {
945 auto p
= std::cbegin(bl
);
949 WRITE_CLASS_ENCODER(eversion_t
)
951 inline bool operator==(const eversion_t
& l
, const eversion_t
& r
) {
952 return (l
.epoch
== r
.epoch
) && (l
.version
== r
.version
);
954 inline bool operator!=(const eversion_t
& l
, const eversion_t
& r
) {
955 return (l
.epoch
!= r
.epoch
) || (l
.version
!= r
.version
);
957 inline bool operator<(const eversion_t
& l
, const eversion_t
& r
) {
958 return (l
.epoch
== r
.epoch
) ? (l
.version
< r
.version
):(l
.epoch
< r
.epoch
);
960 inline bool operator<=(const eversion_t
& l
, const eversion_t
& r
) {
961 return (l
.epoch
== r
.epoch
) ? (l
.version
<= r
.version
):(l
.epoch
<= r
.epoch
);
963 inline bool operator>(const eversion_t
& l
, const eversion_t
& r
) {
964 return (l
.epoch
== r
.epoch
) ? (l
.version
> r
.version
):(l
.epoch
> r
.epoch
);
966 inline bool operator>=(const eversion_t
& l
, const eversion_t
& r
) {
967 return (l
.epoch
== r
.epoch
) ? (l
.version
>= r
.version
):(l
.epoch
>= r
.epoch
);
969 inline std::ostream
& operator<<(std::ostream
& out
, const eversion_t
& e
) {
970 return out
<< e
.epoch
<< "'" << e
.version
;
974 * objectstore_perf_stat_t
976 * current perf information about the osd
978 struct objectstore_perf_stat_t
{
979 // cur_op_latency is in ns since double add/sub are not associative
980 uint64_t os_commit_latency_ns
;
981 uint64_t os_apply_latency_ns
;
983 objectstore_perf_stat_t() :
984 os_commit_latency_ns(0), os_apply_latency_ns(0) {}
986 bool operator==(const objectstore_perf_stat_t
&r
) const {
987 return os_commit_latency_ns
== r
.os_commit_latency_ns
&&
988 os_apply_latency_ns
== r
.os_apply_latency_ns
;
991 void add(const objectstore_perf_stat_t
&o
) {
992 os_commit_latency_ns
+= o
.os_commit_latency_ns
;
993 os_apply_latency_ns
+= o
.os_apply_latency_ns
;
995 void sub(const objectstore_perf_stat_t
&o
) {
996 os_commit_latency_ns
-= o
.os_commit_latency_ns
;
997 os_apply_latency_ns
-= o
.os_apply_latency_ns
;
999 void dump(ceph::Formatter
*f
) const;
1000 void encode(ceph::buffer::list
&bl
, uint64_t features
) const;
1001 void decode(ceph::buffer::list::const_iterator
&bl
);
1002 static void generate_test_instances(std::list
<objectstore_perf_stat_t
*>& o
);
1004 WRITE_CLASS_ENCODER_FEATURES(objectstore_perf_stat_t
)
1009 #define PG_STATE_CREATING (1ULL << 0) // creating
1010 #define PG_STATE_ACTIVE (1ULL << 1) // i am active. (primary: replicas too)
1011 #define PG_STATE_CLEAN (1ULL << 2) // peers are complete, clean of stray replicas.
1012 #define PG_STATE_DOWN (1ULL << 4) // a needed replica is down, PG offline
1013 #define PG_STATE_RECOVERY_UNFOUND (1ULL << 5) // recovery stopped due to unfound
1014 #define PG_STATE_BACKFILL_UNFOUND (1ULL << 6) // backfill stopped due to unfound
1015 #define PG_STATE_PREMERGE (1ULL << 7) // i am preparing to merge
1016 #define PG_STATE_SCRUBBING (1ULL << 8) // scrubbing
1017 //#define PG_STATE_SCRUBQ (1ULL << 9) // queued for scrub
1018 #define PG_STATE_DEGRADED (1ULL << 10) // pg contains objects with reduced redundancy
1019 #define PG_STATE_INCONSISTENT (1ULL << 11) // pg replicas are inconsistent (but shouldn't be)
1020 #define PG_STATE_PEERING (1ULL << 12) // pg is (re)peering
1021 #define PG_STATE_REPAIR (1ULL << 13) // pg should repair on next scrub
1022 #define PG_STATE_RECOVERING (1ULL << 14) // pg is recovering/migrating objects
1023 #define PG_STATE_BACKFILL_WAIT (1ULL << 15) // [active] reserving backfill
1024 #define PG_STATE_INCOMPLETE (1ULL << 16) // incomplete content, peering failed.
1025 #define PG_STATE_STALE (1ULL << 17) // our state for this pg is stale, unknown.
1026 #define PG_STATE_REMAPPED (1ULL << 18) // pg is explicitly remapped to different OSDs than CRUSH
1027 #define PG_STATE_DEEP_SCRUB (1ULL << 19) // deep scrub: check CRC32 on files
1028 #define PG_STATE_BACKFILLING (1ULL << 20) // [active] backfilling pg content
1029 #define PG_STATE_BACKFILL_TOOFULL (1ULL << 21) // backfill can't proceed: too full
1030 #define PG_STATE_RECOVERY_WAIT (1ULL << 22) // waiting for recovery reservations
1031 #define PG_STATE_UNDERSIZED (1ULL << 23) // pg acting < pool size
1032 #define PG_STATE_ACTIVATING (1ULL << 24) // pg is peered but not yet active
1033 #define PG_STATE_PEERED (1ULL << 25) // peered, cannot go active, can recover
1034 #define PG_STATE_SNAPTRIM (1ULL << 26) // trimming snaps
1035 #define PG_STATE_SNAPTRIM_WAIT (1ULL << 27) // queued to trim snaps
1036 #define PG_STATE_RECOVERY_TOOFULL (1ULL << 28) // recovery can't proceed: too full
1037 #define PG_STATE_SNAPTRIM_ERROR (1ULL << 29) // error stopped trimming snaps
1038 #define PG_STATE_FORCED_RECOVERY (1ULL << 30) // force recovery of this pg before any other
1039 #define PG_STATE_FORCED_BACKFILL (1ULL << 31) // force backfill of this pg before any other
1040 #define PG_STATE_FAILED_REPAIR (1ULL << 32) // A repair failed to fix all errors
1041 #define PG_STATE_LAGGY (1ULL << 33) // PG is laggy/unreachable due to slow/delayed pings
1042 #define PG_STATE_WAIT (1ULL << 34) // PG is waiting for prior intervals' readable period to expire
1044 std::string
pg_state_string(uint64_t state
);
1045 std::string
pg_vector_string(const std::vector
<int32_t> &a
);
1046 std::optional
<uint64_t> pg_string_state(const std::string
& state
);
1052 * attributes for a single pool snapshot.
1054 struct pool_snap_info_t
{
1059 void dump(ceph::Formatter
*f
) const;
1060 void encode(ceph::buffer::list
& bl
, uint64_t features
) const;
1061 void decode(ceph::buffer::list::const_iterator
& bl
);
1062 static void generate_test_instances(std::list
<pool_snap_info_t
*>& o
);
1064 WRITE_CLASS_ENCODER_FEATURES(pool_snap_info_t
)
1066 inline std::ostream
& operator<<(std::ostream
& out
, const pool_snap_info_t
& si
) {
1067 return out
<< si
.snapid
<< '(' << si
.name
<< ' ' << si
.stamp
<< ')';
1077 // The order of items in the list is important, therefore,
1078 // you should always add to the end of the list when adding new options.
1085 DEEP_SCRUB_INTERVAL
,
1087 RECOVERY_OP_PRIORITY
,
1090 COMPRESSION_ALGORITHM
,
1091 COMPRESSION_REQUIRED_RATIO
,
1092 COMPRESSION_MAX_BLOB_SIZE
,
1093 COMPRESSION_MIN_BLOB_SIZE
,
1097 FINGERPRINT_ALGORITHM
,
1098 PG_NUM_MIN
, // min pg_num
1099 TARGET_SIZE_BYTES
, // total bytes in pool
1100 TARGET_SIZE_RATIO
, // fraction of total cluster
1102 READ_LEASE_INTERVAL
,
1104 DEDUP_CHUNK_ALGORITHM
,
1105 DEDUP_CDC_CHUNK_SIZE
,
1106 PG_NUM_MAX
, // max pg_num
1119 opt_desc_t(key_t k
, type_t t
) : key(k
), type(t
) {}
1121 bool operator==(const opt_desc_t
& rhs
) const {
1122 return key
== rhs
.key
&& type
== rhs
.type
;
1126 typedef boost::variant
<std::string
,int64_t,double> value_t
;
1128 static bool is_opt_name(const std::string
& name
);
1129 static opt_desc_t
get_opt_desc(const std::string
& name
);
1131 pool_opts_t() : opts() {}
1133 bool is_set(key_t key
) const;
1135 template<typename T
>
1136 void set(key_t key
, const T
&val
) {
1137 value_t value
= val
;
1141 template<typename T
>
1142 bool get(key_t key
, T
*val
) const {
1143 opts_t::const_iterator i
= opts
.find(key
);
1144 if (i
== opts
.end()) {
1147 *val
= boost::get
<T
>(i
->second
);
1151 template<typename T
>
1152 T
value_or(key_t key
, T
&& default_value
) const {
1153 auto i
= opts
.find(key
);
1154 if (i
== opts
.end()) {
1155 return std::forward
<T
>(default_value
);
1157 return boost::get
<T
>(i
->second
);
1160 const value_t
& get(key_t key
) const;
1162 bool unset(key_t key
);
1164 void dump(const std::string
& name
, ceph::Formatter
*f
) const;
1166 void dump(ceph::Formatter
*f
) const;
1167 void encode(ceph::buffer::list
&bl
, uint64_t features
) const;
1168 void decode(ceph::buffer::list::const_iterator
&bl
);
1171 typedef std::map
<key_t
, value_t
> opts_t
;
1174 friend std::ostream
& operator<<(std::ostream
& out
, const pool_opts_t
& opts
);
1176 WRITE_CLASS_ENCODER_FEATURES(pool_opts_t
)
1178 struct pg_merge_meta_t
{
1180 epoch_t ready_epoch
= 0;
1181 epoch_t last_epoch_started
= 0;
1182 epoch_t last_epoch_clean
= 0;
1183 eversion_t source_version
;
1184 eversion_t target_version
;
1186 void encode(ceph::buffer::list
& bl
) const {
1187 ENCODE_START(1, 1, bl
);
1188 encode(source_pgid
, bl
);
1189 encode(ready_epoch
, bl
);
1190 encode(last_epoch_started
, bl
);
1191 encode(last_epoch_clean
, bl
);
1192 encode(source_version
, bl
);
1193 encode(target_version
, bl
);
1196 void decode(ceph::buffer::list::const_iterator
& p
) {
1198 decode(source_pgid
, p
);
1199 decode(ready_epoch
, p
);
1200 decode(last_epoch_started
, p
);
1201 decode(last_epoch_clean
, p
);
1202 decode(source_version
, p
);
1203 decode(target_version
, p
);
1206 void dump(ceph::Formatter
*f
) const {
1207 f
->dump_stream("source_pgid") << source_pgid
;
1208 f
->dump_unsigned("ready_epoch", ready_epoch
);
1209 f
->dump_unsigned("last_epoch_started", last_epoch_started
);
1210 f
->dump_unsigned("last_epoch_clean", last_epoch_clean
);
1211 f
->dump_stream("source_version") << source_version
;
1212 f
->dump_stream("target_version") << target_version
;
1215 WRITE_CLASS_ENCODER(pg_merge_meta_t
)
1223 static const char *APPLICATION_NAME_CEPHFS
;
1224 static const char *APPLICATION_NAME_RBD
;
1225 static const char *APPLICATION_NAME_RGW
;
1228 TYPE_REPLICATED
= 1, // replication
1229 //TYPE_RAID4 = 2, // raid4 (never implemented)
1230 TYPE_ERASURE
= 3, // erasure-coded
1232 static constexpr uint32_t pg_CRUSH_ITEM_NONE
= 0x7fffffff; /* can't import crush.h here */
1233 static std::string_view
get_type_name(int t
) {
1235 case TYPE_REPLICATED
: return "replicated";
1236 //case TYPE_RAID4: return "raid4";
1237 case TYPE_ERASURE
: return "erasure";
1238 default: return "???";
1241 std::string_view
get_type_name() const {
1242 return get_type_name(type
);
1246 FLAG_HASHPSPOOL
= 1<<0, // hash pg seed and pool together (instead of adding)
1247 FLAG_FULL
= 1<<1, // pool is full
1248 FLAG_EC_OVERWRITES
= 1<<2, // enables overwrites, once enabled, cannot be disabled
1249 FLAG_INCOMPLETE_CLONES
= 1<<3, // may have incomplete clones (bc we are/were an overlay)
1250 FLAG_NODELETE
= 1<<4, // pool can't be deleted
1251 FLAG_NOPGCHANGE
= 1<<5, // pool's pg and pgp num can't be changed
1252 FLAG_NOSIZECHANGE
= 1<<6, // pool's size and min size can't be changed
1253 FLAG_WRITE_FADVISE_DONTNEED
= 1<<7, // write mode with LIBRADOS_OP_FLAG_FADVISE_DONTNEED
1254 FLAG_NOSCRUB
= 1<<8, // block periodic scrub
1255 FLAG_NODEEP_SCRUB
= 1<<9, // block periodic deep-scrub
1256 FLAG_FULL_QUOTA
= 1<<10, // pool is currently running out of quota, will set FLAG_FULL too
1257 FLAG_NEARFULL
= 1<<11, // pool is nearfull
1258 FLAG_BACKFILLFULL
= 1<<12, // pool is backfillfull
1259 FLAG_SELFMANAGED_SNAPS
= 1<<13, // pool uses selfmanaged snaps
1260 FLAG_POOL_SNAPS
= 1<<14, // pool has pool snaps
1261 FLAG_CREATING
= 1<<15, // initial pool PGs are being created
1262 FLAG_EIO
= 1<<16, // return EIO for all client ops
1263 FLAG_BULK
= 1<<17, //pool is large
1266 static const char *get_flag_name(uint64_t f
) {
1268 case FLAG_HASHPSPOOL
: return "hashpspool";
1269 case FLAG_FULL
: return "full";
1270 case FLAG_EC_OVERWRITES
: return "ec_overwrites";
1271 case FLAG_INCOMPLETE_CLONES
: return "incomplete_clones";
1272 case FLAG_NODELETE
: return "nodelete";
1273 case FLAG_NOPGCHANGE
: return "nopgchange";
1274 case FLAG_NOSIZECHANGE
: return "nosizechange";
1275 case FLAG_WRITE_FADVISE_DONTNEED
: return "write_fadvise_dontneed";
1276 case FLAG_NOSCRUB
: return "noscrub";
1277 case FLAG_NODEEP_SCRUB
: return "nodeep-scrub";
1278 case FLAG_FULL_QUOTA
: return "full_quota";
1279 case FLAG_NEARFULL
: return "nearfull";
1280 case FLAG_BACKFILLFULL
: return "backfillfull";
1281 case FLAG_SELFMANAGED_SNAPS
: return "selfmanaged_snaps";
1282 case FLAG_POOL_SNAPS
: return "pool_snaps";
1283 case FLAG_CREATING
: return "creating";
1284 case FLAG_EIO
: return "eio";
1285 case FLAG_BULK
: return "bulk";
1286 default: return "???";
1289 static std::string
get_flags_string(uint64_t f
) {
1291 for (unsigned n
=0; f
&& n
<64; ++n
) {
1292 if (f
& (1ull << n
)) {
1295 s
+= get_flag_name(1ull << n
);
1300 std::string
get_flags_string() const {
1301 return get_flags_string(flags
);
1303 static uint64_t get_flag_by_name(const std::string
& name
) {
1304 if (name
== "hashpspool")
1305 return FLAG_HASHPSPOOL
;
1308 if (name
== "ec_overwrites")
1309 return FLAG_EC_OVERWRITES
;
1310 if (name
== "incomplete_clones")
1311 return FLAG_INCOMPLETE_CLONES
;
1312 if (name
== "nodelete")
1313 return FLAG_NODELETE
;
1314 if (name
== "nopgchange")
1315 return FLAG_NOPGCHANGE
;
1316 if (name
== "nosizechange")
1317 return FLAG_NOSIZECHANGE
;
1318 if (name
== "write_fadvise_dontneed")
1319 return FLAG_WRITE_FADVISE_DONTNEED
;
1320 if (name
== "noscrub")
1321 return FLAG_NOSCRUB
;
1322 if (name
== "nodeep-scrub")
1323 return FLAG_NODEEP_SCRUB
;
1324 if (name
== "full_quota")
1325 return FLAG_FULL_QUOTA
;
1326 if (name
== "nearfull")
1327 return FLAG_NEARFULL
;
1328 if (name
== "backfillfull")
1329 return FLAG_BACKFILLFULL
;
1330 if (name
== "selfmanaged_snaps")
1331 return FLAG_SELFMANAGED_SNAPS
;
1332 if (name
== "pool_snaps")
1333 return FLAG_POOL_SNAPS
;
1334 if (name
== "creating")
1335 return FLAG_CREATING
;
1343 /// converts the acting/up vector to a set of pg shards
1344 void convert_to_pg_shards(const std::vector
<int> &from
, std::set
<pg_shard_t
>* to
) const;
1347 CACHEMODE_NONE
= 0, ///< no caching
1348 CACHEMODE_WRITEBACK
= 1, ///< write to cache, flush later
1349 CACHEMODE_FORWARD
= 2, ///< forward if not in cache
1350 CACHEMODE_READONLY
= 3, ///< handle reads, forward writes [not strongly consistent]
1351 CACHEMODE_READFORWARD
= 4, ///< forward reads, write to cache flush later
1352 CACHEMODE_READPROXY
= 5, ///< proxy reads, write to cache flush later
1353 CACHEMODE_PROXY
= 6, ///< proxy if not in cache
1355 static const char *get_cache_mode_name(cache_mode_t m
) {
1357 case CACHEMODE_NONE
: return "none";
1358 case CACHEMODE_WRITEBACK
: return "writeback";
1359 case CACHEMODE_FORWARD
: return "forward";
1360 case CACHEMODE_READONLY
: return "readonly";
1361 case CACHEMODE_READFORWARD
: return "readforward";
1362 case CACHEMODE_READPROXY
: return "readproxy";
1363 case CACHEMODE_PROXY
: return "proxy";
1364 default: return "unknown";
1367 static cache_mode_t
get_cache_mode_from_str(const std::string
& s
) {
1369 return CACHEMODE_NONE
;
1370 if (s
== "writeback")
1371 return CACHEMODE_WRITEBACK
;
1373 return CACHEMODE_FORWARD
;
1374 if (s
== "readonly")
1375 return CACHEMODE_READONLY
;
1376 if (s
== "readforward")
1377 return CACHEMODE_READFORWARD
;
1378 if (s
== "readproxy")
1379 return CACHEMODE_READPROXY
;
1381 return CACHEMODE_PROXY
;
1382 return (cache_mode_t
)-1;
1384 const char *get_cache_mode_name() const {
1385 return get_cache_mode_name(cache_mode
);
1387 bool cache_mode_requires_hit_set() const {
1388 switch (cache_mode
) {
1389 case CACHEMODE_NONE
:
1390 case CACHEMODE_FORWARD
:
1391 case CACHEMODE_READONLY
:
1392 case CACHEMODE_PROXY
:
1394 case CACHEMODE_WRITEBACK
:
1395 case CACHEMODE_READFORWARD
:
1396 case CACHEMODE_READPROXY
:
1399 ceph_abort_msg("implement me");
1403 enum class pg_autoscale_mode_t
: uint8_t {
1407 UNKNOWN
= UINT8_MAX
,
1409 static const char *get_pg_autoscale_mode_name(pg_autoscale_mode_t m
) {
1411 case pg_autoscale_mode_t::OFF
: return "off";
1412 case pg_autoscale_mode_t::ON
: return "on";
1413 case pg_autoscale_mode_t::WARN
: return "warn";
1414 default: return "???";
1417 static pg_autoscale_mode_t
get_pg_autoscale_mode_by_name(const std::string
& m
) {
1419 return pg_autoscale_mode_t::OFF
;
1422 return pg_autoscale_mode_t::WARN
;
1425 return pg_autoscale_mode_t::ON
;
1427 return pg_autoscale_mode_t::UNKNOWN
;
1430 utime_t create_time
;
1431 uint64_t flags
= 0; ///< FLAG_*
1432 __u8 type
= 0; ///< TYPE_*
1433 __u8 size
= 0, min_size
= 0; ///< number of osds in each pg
1434 __u8 crush_rule
= 0; ///< crush placement rule
1435 __u8 object_hash
= 0; ///< hash mapping object name to ps
1436 pg_autoscale_mode_t pg_autoscale_mode
= pg_autoscale_mode_t::UNKNOWN
;
1439 __u32 pg_num
= 0, pgp_num
= 0; ///< number of pgs
1440 __u32 pg_num_pending
= 0; ///< pg_num we are about to merge down to
1441 __u32 pg_num_target
= 0; ///< pg_num we should converge toward
1442 __u32 pgp_num_target
= 0; ///< pgp_num we should converge toward
1445 std::map
<std::string
, std::string
> properties
; ///< OBSOLETE
1446 std::string erasure_code_profile
; ///< name of the erasure code profile in OSDMap
1447 epoch_t last_change
= 0; ///< most recent epoch changed, exclusing snapshot changes
1448 // If non-zero, require OSDs in at least this many different instances...
1449 uint32_t peering_crush_bucket_count
= 0;
1450 // of this bucket type...
1451 uint32_t peering_crush_bucket_barrier
= 0;
1452 // including this one
1453 int32_t peering_crush_mandatory_member
= pg_CRUSH_ITEM_NONE
;
1454 // The per-bucket replica count is calculated with this "target"
1455 // instead of the above crush_bucket_count. This means we can maintain a
1456 // target size of 4 without attempting to place them all in 1 DC
1457 uint32_t peering_crush_bucket_target
= 0;
1458 /// last epoch that forced clients to resend
1459 epoch_t last_force_op_resend
= 0;
1460 /// last epoch that forced clients to resend (pre-nautilus clients only)
1461 epoch_t last_force_op_resend_prenautilus
= 0;
1462 /// last epoch that forced clients to resend (pre-luminous clients only)
1463 epoch_t last_force_op_resend_preluminous
= 0;
1465 /// metadata for the most recent PG merge
1466 pg_merge_meta_t last_pg_merge_meta
;
1468 snapid_t snap_seq
= 0; ///< seq for per-pool snapshot
1469 epoch_t snap_epoch
= 0; ///< osdmap epoch of last snap
1470 uint64_t auid
= 0; ///< who owns the pg
1472 uint64_t quota_max_bytes
= 0; ///< maximum number of bytes for this pool
1473 uint64_t quota_max_objects
= 0; ///< maximum number of objects for this pool
1476 * Pool snaps (global to this pool). These define a SnapContext for
1477 * the pool, unless the client manually specifies an alternate
1480 std::map
<snapid_t
, pool_snap_info_t
> snaps
;
1482 * Alternatively, if we are defining non-pool snaps (e.g. via the
1483 * Ceph MDS), we must track @removed_snaps (since @snaps is not
1484 * used). Snaps and removed_snaps are to be used exclusive of each
1487 interval_set
<snapid_t
> removed_snaps
;
1489 unsigned pg_num_mask
= 0, pgp_num_mask
= 0;
1491 std::set
<uint64_t> tiers
; ///< pools that are tiers of us
1492 int64_t tier_of
= -1; ///< pool for which we are a tier
1493 // Note that write wins for read+write ops
1494 int64_t read_tier
= -1; ///< pool/tier for objecter to direct reads to
1495 int64_t write_tier
= -1; ///< pool/tier for objecter to direct writes to
1496 cache_mode_t cache_mode
= CACHEMODE_NONE
; ///< cache pool mode
1498 bool is_tier() const { return tier_of
>= 0; }
1499 bool has_tiers() const { return !tiers
.empty(); }
1504 clear_tier_tunables();
1506 bool has_read_tier() const { return read_tier
>= 0; }
1507 void clear_read_tier() { read_tier
= -1; }
1508 bool has_write_tier() const { return write_tier
>= 0; }
1509 void clear_write_tier() { write_tier
= -1; }
1510 void clear_tier_tunables() {
1511 if (cache_mode
!= CACHEMODE_NONE
)
1512 flags
|= FLAG_INCOMPLETE_CLONES
;
1513 cache_mode
= CACHEMODE_NONE
;
1515 target_max_bytes
= 0;
1516 target_max_objects
= 0;
1517 cache_target_dirty_ratio_micro
= 0;
1518 cache_target_dirty_high_ratio_micro
= 0;
1519 cache_target_full_ratio_micro
= 0;
1520 hit_set_params
= HitSet::Params();
1523 hit_set_grade_decay_rate
= 0;
1524 hit_set_search_last_n
= 0;
1525 grade_table
.resize(0);
1528 bool is_stretch_pool() const {
1529 return peering_crush_bucket_count
!= 0;
1532 bool stretch_set_can_peer(const std::set
<int>& want
, const OSDMap
& osdmap
,
1533 std::ostream
*out
) const;
1534 bool stretch_set_can_peer(const std::vector
<int>& want
, const OSDMap
& osdmap
,
1535 std::ostream
*out
) const {
1536 if (!is_stretch_pool()) return true;
1537 std::set
<int> swant
;
1538 for (auto i
: want
) swant
.insert(i
);
1539 return stretch_set_can_peer(swant
, osdmap
, out
);
1542 uint64_t target_max_bytes
= 0; ///< tiering: target max pool size
1543 uint64_t target_max_objects
= 0; ///< tiering: target max pool size
1545 uint32_t cache_target_dirty_ratio_micro
= 0; ///< cache: fraction of target to leave dirty
1546 uint32_t cache_target_dirty_high_ratio_micro
= 0; ///< cache: fraction of target to flush with high speed
1547 uint32_t cache_target_full_ratio_micro
= 0; ///< cache: fraction of target to fill before we evict in earnest
1549 uint32_t cache_min_flush_age
= 0; ///< minimum age (seconds) before we can flush
1550 uint32_t cache_min_evict_age
= 0; ///< minimum age (seconds) before we can evict
1552 HitSet::Params hit_set_params
; ///< The HitSet params to use on this pool
1553 uint32_t hit_set_period
= 0; ///< periodicity of HitSet segments (seconds)
1554 uint32_t hit_set_count
= 0; ///< number of periods to retain
1555 bool use_gmt_hitset
= true; ///< use gmt to name the hitset archive object
1556 uint32_t min_read_recency_for_promote
= 0; ///< minimum number of HitSet to check before promote on read
1557 uint32_t min_write_recency_for_promote
= 0; ///< minimum number of HitSet to check before promote on write
1558 uint32_t hit_set_grade_decay_rate
= 0; ///< current hit_set has highest priority on objects
1559 ///< temperature count,the follow hit_set's priority decay
1560 ///< by this params than pre hit_set
1561 uint32_t hit_set_search_last_n
= 0; ///< accumulate atmost N hit_sets for temperature
1563 uint32_t stripe_width
= 0; ///< erasure coded stripe size in bytes
1565 uint64_t expected_num_objects
= 0; ///< expected number of objects on this pool, a value of 0 indicates
1566 ///< user does not specify any expected value
1567 bool fast_read
= false; ///< whether turn on fast read on the pool or not
1569 pool_opts_t opts
; ///< options
1572 TYPE_FINGERPRINT_NONE
= 0,
1573 TYPE_FINGERPRINT_SHA1
= 1,
1574 TYPE_FINGERPRINT_SHA256
= 2,
1575 TYPE_FINGERPRINT_SHA512
= 3,
1577 static fingerprint_t
get_fingerprint_from_str(const std::string
& s
) {
1579 return TYPE_FINGERPRINT_NONE
;
1581 return TYPE_FINGERPRINT_SHA1
;
1583 return TYPE_FINGERPRINT_SHA256
;
1585 return TYPE_FINGERPRINT_SHA512
;
1586 return (fingerprint_t
)-1;
1588 const fingerprint_t
get_fingerprint_type() const {
1590 opts
.get(pool_opts_t::FINGERPRINT_ALGORITHM
, &fp_str
);
1591 return get_fingerprint_from_str(fp_str
);
1593 const char *get_fingerprint_name() const {
1596 opts
.get(pool_opts_t::FINGERPRINT_ALGORITHM
, &fp_str
);
1597 fp_t
= get_fingerprint_from_str(fp_str
);
1598 return get_fingerprint_name(fp_t
);
1600 static const char *get_fingerprint_name(fingerprint_t m
) {
1602 case TYPE_FINGERPRINT_NONE
: return "none";
1603 case TYPE_FINGERPRINT_SHA1
: return "sha1";
1604 case TYPE_FINGERPRINT_SHA256
: return "sha256";
1605 case TYPE_FINGERPRINT_SHA512
: return "sha512";
1606 default: return "unknown";
1611 TYPE_DEDUP_CHUNK_NONE
= 0,
1612 TYPE_DEDUP_CHUNK_FASTCDC
= 1,
1613 TYPE_DEDUP_CHUNK_FIXEDCDC
= 2,
1614 } dedup_chunk_algo_t
;
1615 static dedup_chunk_algo_t
get_dedup_chunk_algorithm_from_str(const std::string
& s
) {
1617 return TYPE_DEDUP_CHUNK_NONE
;
1619 return TYPE_DEDUP_CHUNK_FASTCDC
;
1621 return TYPE_DEDUP_CHUNK_FIXEDCDC
;
1622 return (dedup_chunk_algo_t
)-1;
1624 const dedup_chunk_algo_t
get_dedup_chunk_algorithm_type() const {
1625 std::string algo_str
;
1626 opts
.get(pool_opts_t::DEDUP_CHUNK_ALGORITHM
, &algo_str
);
1627 return get_dedup_chunk_algorithm_from_str(algo_str
);
1629 const char *get_dedup_chunk_algorithm_name() const {
1630 std::string dedup_chunk_algo_str
;
1631 dedup_chunk_algo_t dedup_chunk_algo_t
;
1632 opts
.get(pool_opts_t::DEDUP_CHUNK_ALGORITHM
, &dedup_chunk_algo_str
);
1633 dedup_chunk_algo_t
= get_dedup_chunk_algorithm_from_str(dedup_chunk_algo_str
);
1634 return get_dedup_chunk_algorithm_name(dedup_chunk_algo_t
);
1636 static const char *get_dedup_chunk_algorithm_name(dedup_chunk_algo_t m
) {
1638 case TYPE_DEDUP_CHUNK_NONE
: return "none";
1639 case TYPE_DEDUP_CHUNK_FASTCDC
: return "fastcdc";
1640 case TYPE_DEDUP_CHUNK_FIXEDCDC
: return "fixed";
1641 default: return "unknown";
1645 int64_t get_dedup_tier() const {
1646 int64_t tier_id
= 0;
1647 opts
.get(pool_opts_t::DEDUP_TIER
, &tier_id
);
1650 int64_t get_dedup_cdc_chunk_size() const {
1651 int64_t chunk_size
= 0;
1652 opts
.get(pool_opts_t::DEDUP_CDC_CHUNK_SIZE
, &chunk_size
);
1656 /// application -> key/value metadata
1657 std::map
<std::string
, std::map
<std::string
, std::string
>> application_metadata
;
1660 std::vector
<uint32_t> grade_table
;
1663 uint32_t get_grade(unsigned i
) const {
1664 if (grade_table
.size() <= i
)
1666 return grade_table
[i
];
1668 void calc_grade_table() {
1669 unsigned v
= 1000000;
1670 grade_table
.resize(hit_set_count
);
1671 for (unsigned i
= 0; i
< hit_set_count
; i
++) {
1672 v
= v
* (1 - (hit_set_grade_decay_rate
/ 100.0));
1677 pg_pool_t() = default;
1679 void dump(ceph::Formatter
*f
) const;
1681 const utime_t
&get_create_time() const { return create_time
; }
1682 uint64_t get_flags() const { return flags
; }
1683 bool has_flag(uint64_t f
) const { return flags
& f
; }
1684 void set_flag(uint64_t f
) { flags
|= f
; }
1685 void unset_flag(uint64_t f
) { flags
&= ~f
; }
1687 bool require_rollback() const {
1688 return is_erasure();
1691 /// true if incomplete clones may be present
1692 bool allow_incomplete_clones() const {
1693 return cache_mode
!= CACHEMODE_NONE
|| has_flag(FLAG_INCOMPLETE_CLONES
);
1696 unsigned get_type() const { return type
; }
1697 unsigned get_size() const { return size
; }
1698 unsigned get_min_size() const { return min_size
; }
1699 int get_crush_rule() const { return crush_rule
; }
1700 int get_object_hash() const { return object_hash
; }
1701 const char *get_object_hash_name() const {
1702 return ceph_str_hash_name(get_object_hash());
1704 epoch_t
get_last_change() const { return last_change
; }
1705 epoch_t
get_last_force_op_resend() const { return last_force_op_resend
; }
1706 epoch_t
get_last_force_op_resend_prenautilus() const {
1707 return last_force_op_resend_prenautilus
;
1709 epoch_t
get_last_force_op_resend_preluminous() const {
1710 return last_force_op_resend_preluminous
;
1712 epoch_t
get_snap_epoch() const { return snap_epoch
; }
1713 snapid_t
get_snap_seq() const { return snap_seq
; }
1714 uint64_t get_auid() const { return auid
; }
1716 void set_snap_seq(snapid_t s
) { snap_seq
= s
; }
1717 void set_snap_epoch(epoch_t e
) { snap_epoch
= e
; }
1719 void set_stripe_width(uint32_t s
) { stripe_width
= s
; }
1720 uint32_t get_stripe_width() const { return stripe_width
; }
1722 bool is_replicated() const { return get_type() == TYPE_REPLICATED
; }
1723 bool is_erasure() const { return get_type() == TYPE_ERASURE
; }
1725 bool supports_omap() const {
1726 return !(get_type() == TYPE_ERASURE
);
1729 bool requires_aligned_append() const {
1730 return is_erasure() && !has_flag(FLAG_EC_OVERWRITES
);
1732 uint64_t required_alignment() const { return stripe_width
; }
1734 bool allows_ecoverwrites() const {
1735 return has_flag(FLAG_EC_OVERWRITES
);
1738 bool can_shift_osds() const {
1739 switch (get_type()) {
1740 case TYPE_REPLICATED
:
1745 ceph_abort_msg("unhandled pool type");
1749 unsigned get_pg_num() const { return pg_num
; }
1750 unsigned get_pgp_num() const { return pgp_num
; }
1751 unsigned get_pg_num_target() const { return pg_num_target
; }
1752 unsigned get_pgp_num_target() const { return pgp_num_target
; }
1753 unsigned get_pg_num_pending() const { return pg_num_pending
; }
1755 unsigned get_pg_num_mask() const { return pg_num_mask
; }
1756 unsigned get_pgp_num_mask() const { return pgp_num_mask
; }
1758 // if pg_num is not a multiple of two, pgs are not equally sized.
1759 // return, for a given pg, the fraction (denominator) of the total
1760 // pool size that it represents.
1761 unsigned get_pg_num_divisor(pg_t pgid
) const;
1763 bool is_pending_merge(pg_t pgid
, bool *target
) const;
1765 void set_pg_num(int p
) {
1770 void set_pgp_num(int p
) {
1774 void set_pg_num_pending(int p
) {
1778 void set_pg_num_target(int p
) {
1781 void set_pgp_num_target(int p
) {
1784 void dec_pg_num(pg_t source_pgid
,
1785 epoch_t ready_epoch
,
1786 eversion_t source_version
,
1787 eversion_t target_version
,
1788 epoch_t last_epoch_started
,
1789 epoch_t last_epoch_clean
) {
1791 last_pg_merge_meta
.source_pgid
= source_pgid
;
1792 last_pg_merge_meta
.ready_epoch
= ready_epoch
;
1793 last_pg_merge_meta
.source_version
= source_version
;
1794 last_pg_merge_meta
.target_version
= target_version
;
1795 last_pg_merge_meta
.last_epoch_started
= last_epoch_started
;
1796 last_pg_merge_meta
.last_epoch_clean
= last_epoch_clean
;
1800 void set_quota_max_bytes(uint64_t m
) {
1801 quota_max_bytes
= m
;
1803 uint64_t get_quota_max_bytes() {
1804 return quota_max_bytes
;
1807 void set_quota_max_objects(uint64_t m
) {
1808 quota_max_objects
= m
;
1810 uint64_t get_quota_max_objects() {
1811 return quota_max_objects
;
1814 void set_last_force_op_resend(uint64_t t
) {
1815 last_force_op_resend
= t
;
1816 last_force_op_resend_prenautilus
= t
;
1817 last_force_op_resend_preluminous
= t
;
1820 void calc_pg_masks();
1823 * we have two snap modes:
1824 * - pool global snaps
1825 * - snap existence/non-existence defined by snaps[] and snap_seq
1826 * - user managed snaps
1827 * - removal governed by removed_snaps
1829 * we know which mode we're using based on whether removed_snaps is empty.
1830 * If nothing has been created, both functions report false.
1832 bool is_pool_snaps_mode() const;
1833 bool is_unmanaged_snaps_mode() const;
1834 bool is_removed_snap(snapid_t s
) const;
1836 snapid_t
snap_exists(std::string_view s
) const;
1837 void add_snap(const char *n
, utime_t stamp
);
1838 uint64_t add_unmanaged_snap(bool preoctopus_compat
);
1839 void remove_snap(snapid_t s
);
1840 void remove_unmanaged_snap(snapid_t s
, bool preoctopus_compat
);
1842 SnapContext
get_snap_context() const;
1844 /// hash a object name+namespace key to a hash position
1845 uint32_t hash_key(const std::string
& key
, const std::string
& ns
) const;
1847 /// round a hash position down to a pg num
1848 uint32_t raw_hash_to_pg(uint32_t v
) const;
1851 * map a raw pg (with full precision ps) into an actual pg, for storage
1853 pg_t
raw_pg_to_pg(pg_t pg
) const;
1856 * map raw pg (full precision ps) into a placement seed. include
1857 * pool id in that value so that different pools don't use the same
1860 ps_t
raw_pg_to_pps(pg_t pg
) const;
1862 /// choose a random hash position within a pg
1863 uint32_t get_random_pg_position(pg_t pgid
, uint32_t seed
) const;
1865 void encode(ceph::buffer::list
& bl
, uint64_t features
) const;
1866 void decode(ceph::buffer::list::const_iterator
& bl
);
1868 static void generate_test_instances(std::list
<pg_pool_t
*>& o
);
1870 WRITE_CLASS_ENCODER_FEATURES(pg_pool_t
)
1872 std::ostream
& operator<<(std::ostream
& out
, const pg_pool_t
& p
);
1876 * a summation of object stats
1878 * This is just a container for object stats; we don't know what for.
1880 * If you add members in object_stat_sum_t, you should make sure there are
1881 * not padding among these members.
1882 * You should also modify the padding_check function.
1885 struct object_stat_sum_t
{
1886 /**************************************************************************
1887 * WARNING: be sure to update operator==, floor, and split when
1888 * adding/removing fields!
1889 **************************************************************************/
1890 int64_t num_bytes
; // in bytes
1891 int64_t num_objects
;
1892 int64_t num_object_clones
;
1893 int64_t num_object_copies
; // num_objects * num_replicas
1894 int64_t num_objects_missing_on_primary
;
1895 int64_t num_objects_degraded
;
1896 int64_t num_objects_unfound
;
1901 int64_t num_scrub_errors
; // total deep and shallow scrub errors
1902 int64_t num_objects_recovered
;
1903 int64_t num_bytes_recovered
;
1904 int64_t num_keys_recovered
;
1905 int64_t num_shallow_scrub_errors
;
1906 int64_t num_deep_scrub_errors
;
1907 int64_t num_objects_dirty
;
1908 int64_t num_whiteouts
;
1909 int64_t num_objects_omap
;
1910 int64_t num_objects_hit_set_archive
;
1911 int64_t num_objects_misplaced
;
1912 int64_t num_bytes_hit_set_archive
;
1914 int64_t num_flush_kb
;
1916 int64_t num_evict_kb
;
1917 int64_t num_promote
;
1918 int32_t num_flush_mode_high
; // 1 when in high flush mode, otherwise 0
1919 int32_t num_flush_mode_low
; // 1 when in low flush mode, otherwise 0
1920 int32_t num_evict_mode_some
; // 1 when in evict some mode, otherwise 0
1921 int32_t num_evict_mode_full
; // 1 when in evict full mode, otherwise 0
1922 int64_t num_objects_pinned
;
1923 int64_t num_objects_missing
;
1924 int64_t num_legacy_snapsets
; ///< upper bound on pre-luminous-style SnapSets
1925 int64_t num_large_omap_objects
= 0;
1926 int64_t num_objects_manifest
= 0;
1927 int64_t num_omap_bytes
= 0;
1928 int64_t num_omap_keys
= 0;
1929 int64_t num_objects_repaired
= 0;
1933 num_objects(0), num_object_clones(0), num_object_copies(0),
1934 num_objects_missing_on_primary(0), num_objects_degraded(0),
1935 num_objects_unfound(0),
1936 num_rd(0), num_rd_kb(0), num_wr(0), num_wr_kb(0),
1937 num_scrub_errors(0),
1938 num_objects_recovered(0),
1939 num_bytes_recovered(0),
1940 num_keys_recovered(0),
1941 num_shallow_scrub_errors(0),
1942 num_deep_scrub_errors(0),
1943 num_objects_dirty(0),
1945 num_objects_omap(0),
1946 num_objects_hit_set_archive(0),
1947 num_objects_misplaced(0),
1948 num_bytes_hit_set_archive(0),
1954 num_flush_mode_high(0), num_flush_mode_low(0),
1955 num_evict_mode_some(0), num_evict_mode_full(0),
1956 num_objects_pinned(0),
1957 num_objects_missing(0),
1958 num_legacy_snapsets(0)
1961 void floor(int64_t f
) {
1962 #define FLOOR(x) if (x < f) x = f
1965 FLOOR(num_object_clones
);
1966 FLOOR(num_object_copies
);
1967 FLOOR(num_objects_missing_on_primary
);
1968 FLOOR(num_objects_missing
);
1969 FLOOR(num_objects_degraded
);
1970 FLOOR(num_objects_misplaced
);
1971 FLOOR(num_objects_unfound
);
1976 FLOOR(num_large_omap_objects
);
1977 FLOOR(num_objects_manifest
);
1978 FLOOR(num_omap_bytes
);
1979 FLOOR(num_omap_keys
);
1980 FLOOR(num_shallow_scrub_errors
);
1981 FLOOR(num_deep_scrub_errors
);
1982 num_scrub_errors
= num_shallow_scrub_errors
+ num_deep_scrub_errors
;
1983 FLOOR(num_objects_recovered
);
1984 FLOOR(num_bytes_recovered
);
1985 FLOOR(num_keys_recovered
);
1986 FLOOR(num_objects_dirty
);
1987 FLOOR(num_whiteouts
);
1988 FLOOR(num_objects_omap
);
1989 FLOOR(num_objects_hit_set_archive
);
1990 FLOOR(num_bytes_hit_set_archive
);
1992 FLOOR(num_flush_kb
);
1994 FLOOR(num_evict_kb
);
1996 FLOOR(num_flush_mode_high
);
1997 FLOOR(num_flush_mode_low
);
1998 FLOOR(num_evict_mode_some
);
1999 FLOOR(num_evict_mode_full
);
2000 FLOOR(num_objects_pinned
);
2001 FLOOR(num_legacy_snapsets
);
2002 FLOOR(num_objects_repaired
);
2006 void split(std::vector
<object_stat_sum_t
> &out
) const {
2007 #define SPLIT(PARAM) \
2008 for (unsigned i = 0; i < out.size(); ++i) { \
2009 out[i].PARAM = PARAM / out.size(); \
2010 if (i < (PARAM % out.size())) { \
2014 #define SPLIT_PRESERVE_NONZERO(PARAM) \
2015 for (unsigned i = 0; i < out.size(); ++i) { \
2017 out[i].PARAM = 1 + PARAM / out.size(); \
2024 SPLIT(num_object_clones
);
2025 SPLIT(num_object_copies
);
2026 SPLIT(num_objects_missing_on_primary
);
2027 SPLIT(num_objects_missing
);
2028 SPLIT(num_objects_degraded
);
2029 SPLIT(num_objects_misplaced
);
2030 SPLIT(num_objects_unfound
);
2035 SPLIT(num_large_omap_objects
);
2036 SPLIT(num_objects_manifest
);
2037 SPLIT(num_omap_bytes
);
2038 SPLIT(num_omap_keys
);
2039 SPLIT(num_objects_repaired
);
2040 SPLIT_PRESERVE_NONZERO(num_shallow_scrub_errors
);
2041 SPLIT_PRESERVE_NONZERO(num_deep_scrub_errors
);
2042 for (unsigned i
= 0; i
< out
.size(); ++i
) {
2043 out
[i
].num_scrub_errors
= out
[i
].num_shallow_scrub_errors
+
2044 out
[i
].num_deep_scrub_errors
;
2046 SPLIT(num_objects_recovered
);
2047 SPLIT(num_bytes_recovered
);
2048 SPLIT(num_keys_recovered
);
2049 SPLIT(num_objects_dirty
);
2050 SPLIT(num_whiteouts
);
2051 SPLIT(num_objects_omap
);
2052 SPLIT(num_objects_hit_set_archive
);
2053 SPLIT(num_bytes_hit_set_archive
);
2055 SPLIT(num_flush_kb
);
2057 SPLIT(num_evict_kb
);
2059 SPLIT(num_flush_mode_high
);
2060 SPLIT(num_flush_mode_low
);
2061 SPLIT(num_evict_mode_some
);
2062 SPLIT(num_evict_mode_full
);
2063 SPLIT(num_objects_pinned
);
2064 SPLIT_PRESERVE_NONZERO(num_legacy_snapsets
);
2066 #undef SPLIT_PRESERVE_NONZERO
2070 // FIPS zeroization audit 20191117: this memset is not security related.
2071 memset(this, 0, sizeof(*this));
2074 void calc_copies(int nrep
) {
2075 num_object_copies
= nrep
* num_objects
;
2078 bool is_zero() const {
2079 return mem_is_zero((char*)this, sizeof(*this));
2082 void add(const object_stat_sum_t
& o
);
2083 void sub(const object_stat_sum_t
& o
);
2085 void dump(ceph::Formatter
*f
) const;
2086 void padding_check() {
2088 sizeof(object_stat_sum_t
) ==
2090 sizeof(num_objects
) +
2091 sizeof(num_object_clones
) +
2092 sizeof(num_object_copies
) +
2093 sizeof(num_objects_missing_on_primary
) +
2094 sizeof(num_objects_degraded
) +
2095 sizeof(num_objects_unfound
) +
2100 sizeof(num_scrub_errors
) +
2101 sizeof(num_large_omap_objects
) +
2102 sizeof(num_objects_manifest
) +
2103 sizeof(num_omap_bytes
) +
2104 sizeof(num_omap_keys
) +
2105 sizeof(num_objects_repaired
) +
2106 sizeof(num_objects_recovered
) +
2107 sizeof(num_bytes_recovered
) +
2108 sizeof(num_keys_recovered
) +
2109 sizeof(num_shallow_scrub_errors
) +
2110 sizeof(num_deep_scrub_errors
) +
2111 sizeof(num_objects_dirty
) +
2112 sizeof(num_whiteouts
) +
2113 sizeof(num_objects_omap
) +
2114 sizeof(num_objects_hit_set_archive
) +
2115 sizeof(num_objects_misplaced
) +
2116 sizeof(num_bytes_hit_set_archive
) +
2118 sizeof(num_flush_kb
) +
2120 sizeof(num_evict_kb
) +
2121 sizeof(num_promote
) +
2122 sizeof(num_flush_mode_high
) +
2123 sizeof(num_flush_mode_low
) +
2124 sizeof(num_evict_mode_some
) +
2125 sizeof(num_evict_mode_full
) +
2126 sizeof(num_objects_pinned
) +
2127 sizeof(num_objects_missing
) +
2128 sizeof(num_legacy_snapsets
)
2130 "object_stat_sum_t have padding");
2132 void encode(ceph::buffer::list
& bl
) const;
2133 void decode(ceph::buffer::list::const_iterator
& bl
);
2134 static void generate_test_instances(std::list
<object_stat_sum_t
*>& o
);
2136 WRITE_CLASS_ENCODER(object_stat_sum_t
)
2138 bool operator==(const object_stat_sum_t
& l
, const object_stat_sum_t
& r
);
2141 * a collection of object stat sums
2143 * This is a collection of stat sums over different categories.
2145 struct object_stat_collection_t
{
2146 /**************************************************************************
2147 * WARNING: be sure to update the operator== when adding/removing fields! *
2148 **************************************************************************/
2149 object_stat_sum_t sum
;
2151 void calc_copies(int nrep
) {
2152 sum
.calc_copies(nrep
);
2155 void dump(ceph::Formatter
*f
) const;
2156 void encode(ceph::buffer::list
& bl
) const;
2157 void decode(ceph::buffer::list::const_iterator
& bl
);
2158 static void generate_test_instances(std::list
<object_stat_collection_t
*>& o
);
2160 bool is_zero() const {
2161 return sum
.is_zero();
2168 void floor(int64_t f
) {
2172 void add(const object_stat_sum_t
& o
) {
2176 void add(const object_stat_collection_t
& o
) {
2179 void sub(const object_stat_collection_t
& o
) {
2183 WRITE_CLASS_ENCODER(object_stat_collection_t
)
2185 inline bool operator==(const object_stat_collection_t
& l
,
2186 const object_stat_collection_t
& r
) {
2187 return l
.sum
== r
.sum
;
2190 enum class scrub_level_t
: bool { shallow
= false, deep
= true };
2191 enum class scrub_type_t
: bool { not_repair
= false, do_repair
= true };
2193 /// is there a scrub in our future?
enum class pg_scrub_sched_status_t : uint16_t {
  unknown,     ///< status not reported yet
  not_queued,  ///< not in the OSD's scrub queue. Probably not active.
  active,      ///< scrubbing
  scheduled,   ///< scheduled for a scrub at an already determined time
  queued       ///< queued to be scrubbed
};
2202 struct pg_scrubbing_status_t
{
2203 utime_t m_scheduled_at
{};
2204 int32_t m_duration_seconds
{0}; // relevant when scrubbing
2205 pg_scrub_sched_status_t m_sched_status
{pg_scrub_sched_status_t::unknown
};
2206 bool m_is_active
{false};
2207 scrub_level_t m_is_deep
{scrub_level_t::shallow
};
2208 bool m_is_periodic
{true};
2211 bool operator==(const pg_scrubbing_status_t
& l
, const pg_scrubbing_status_t
& r
);
2214 * aggregate stats for a single PG.
2217 /**************************************************************************
2218 * WARNING: be sure to update the operator== when adding/removing fields! *
2219 **************************************************************************/
2221 version_t reported_seq
; // sequence number
2222 epoch_t reported_epoch
; // epoch of this report
2224 utime_t last_fresh
; // last reported
2225 utime_t last_change
; // new state != previous state
2226 utime_t last_active
; // state & PG_STATE_ACTIVE
2227 utime_t last_peered
; // state & PG_STATE_ACTIVE || state & PG_STATE_PEERED
2228 utime_t last_clean
; // state & PG_STATE_CLEAN
2229 utime_t last_unstale
; // (state & PG_STATE_STALE) == 0
2230 utime_t last_undegraded
; // (state & PG_STATE_DEGRADED) == 0
2231 utime_t last_fullsized
; // (state & PG_STATE_UNDERSIZED) == 0
2233 eversion_t log_start
; // (log_start,version]
2234 eversion_t ondisk_log_start
; // there may be more on disk
2237 epoch_t last_epoch_clean
;
2239 __u32 parent_split_bits
;
2241 eversion_t last_scrub
;
2242 eversion_t last_deep_scrub
;
2243 utime_t last_scrub_stamp
;
2244 utime_t last_deep_scrub_stamp
;
2245 utime_t last_clean_scrub_stamp
;
2246 int32_t last_scrub_duration
{0};
2248 object_stat_collection_t stats
;
2251 int64_t ondisk_log_size
; // >= active_log_size
2252 int64_t objects_scrubbed
;
2253 double scrub_duration
;
2255 std::vector
<int32_t> up
, acting
;
2256 std::vector
<pg_shard_t
> avail_no_missing
;
2257 std::map
< std::set
<pg_shard_t
>, int32_t > object_location_counts
;
2258 epoch_t mapping_epoch
;
2260 std::vector
<int32_t> blocked_by
; ///< osds on which the pg is blocked
2262 interval_set
<snapid_t
> purged_snaps
; ///< recently removed snaps that we've purged
2264 utime_t last_became_active
;
2265 utime_t last_became_peered
;
2267 /// up, acting primaries
2269 int32_t acting_primary
;
2271 // snaptrimq.size() is 64bit, but let's be serious - anything over 50k is
2272 // absurd already, so cap it to 2^32 and save 4 bytes at the same time
2273 uint32_t snaptrimq_len
;
2274 int64_t objects_trimmed
;
2275 double snaptrim_duration
;
2277 pg_scrubbing_status_t scrub_sched_status
;
2279 bool stats_invalid
:1;
2280 /// true if num_objects_dirty is not accurate (because it was not
2281 /// maintained starting from pool creation)
2282 bool dirty_stats_invalid
:1;
2283 bool omap_stats_invalid
:1;
2284 bool hitset_stats_invalid
:1;
2285 bool hitset_bytes_stats_invalid
:1;
2286 bool pin_stats_invalid
:1;
2287 bool manifest_stats_invalid
:1;
2293 created(0), last_epoch_clean(0),
2294 parent_split_bits(0),
2295 log_size(0), ondisk_log_size(0),
2296 objects_scrubbed(0),
2303 snaptrim_duration(0.0),
2304 stats_invalid(false),
2305 dirty_stats_invalid(false),
2306 omap_stats_invalid(false),
2307 hitset_stats_invalid(false),
2308 hitset_bytes_stats_invalid(false),
2309 pin_stats_invalid(false),
2310 manifest_stats_invalid(false)
2313 epoch_t
get_effective_last_epoch_clean() const {
2314 if (state
& PG_STATE_CLEAN
) {
2315 // we are clean as of this report, and should thus take the
2317 return reported_epoch
;
2319 return last_epoch_clean
;
2323 std::pair
<epoch_t
, version_t
> get_version_pair() const {
2324 return { reported_epoch
, reported_seq
};
2327 void floor(int64_t f
) {
2331 if (ondisk_log_size
< f
)
2332 ondisk_log_size
= f
;
2333 if (snaptrimq_len
< f
)
2337 void add_sub_invalid_flags(const pg_stat_t
& o
) {
2338 // adding (or subtracting!) invalid stats render our stats invalid too
2339 stats_invalid
|= o
.stats_invalid
;
2340 dirty_stats_invalid
|= o
.dirty_stats_invalid
;
2341 omap_stats_invalid
|= o
.omap_stats_invalid
;
2342 hitset_stats_invalid
|= o
.hitset_stats_invalid
;
2343 hitset_bytes_stats_invalid
|= o
.hitset_bytes_stats_invalid
;
2344 pin_stats_invalid
|= o
.pin_stats_invalid
;
2345 manifest_stats_invalid
|= o
.manifest_stats_invalid
;
2347 void add(const pg_stat_t
& o
) {
2349 log_size
+= o
.log_size
;
2350 ondisk_log_size
+= o
.ondisk_log_size
;
2351 snaptrimq_len
= std::min((uint64_t)snaptrimq_len
+ o
.snaptrimq_len
,
2352 (uint64_t)(1ull << 31));
2353 add_sub_invalid_flags(o
);
2355 void sub(const pg_stat_t
& o
) {
2357 log_size
-= o
.log_size
;
2358 ondisk_log_size
-= o
.ondisk_log_size
;
2359 if (o
.snaptrimq_len
< snaptrimq_len
) {
2360 snaptrimq_len
-= o
.snaptrimq_len
;
2364 add_sub_invalid_flags(o
);
2367 bool is_acting_osd(int32_t osd
, bool primary
) const;
2368 void dump(ceph::Formatter
*f
) const;
2369 void dump_brief(ceph::Formatter
*f
) const;
2370 std::string
dump_scrub_schedule() const;
2371 void encode(ceph::buffer::list
&bl
) const;
2372 void decode(ceph::buffer::list::const_iterator
&bl
);
2373 static void generate_test_instances(std::list
<pg_stat_t
*>& o
);
2375 WRITE_CLASS_ENCODER(pg_stat_t
)
2377 bool operator==(const pg_stat_t
& l
, const pg_stat_t
& r
);
2380 * ObjectStore full statfs information
2382 struct store_statfs_t
2384 uint64_t total
= 0; ///< Total bytes
2385 uint64_t available
= 0; ///< Free bytes available
2386 uint64_t internally_reserved
= 0; ///< Bytes reserved for internal purposes
2388 int64_t allocated
= 0; ///< Bytes allocated by the store
2390 int64_t data_stored
= 0; ///< Bytes actually stored by the user
2391 int64_t data_compressed
= 0; ///< Bytes stored after compression
2392 int64_t data_compressed_allocated
= 0; ///< Bytes allocated for compressed data
2393 int64_t data_compressed_original
= 0; ///< Bytes that were compressed
2395 int64_t omap_allocated
= 0; ///< approx usage of omap data
2396 int64_t internal_metadata
= 0; ///< approx usage of internal metadata
2399 *this = store_statfs_t();
2401 void floor(int64_t f
) {
2402 #define FLOOR(x) if (int64_t(x) < f) x = f
2405 FLOOR(internally_reserved
);
2408 FLOOR(data_compressed
);
2409 FLOOR(data_compressed_allocated
);
2410 FLOOR(data_compressed_original
);
2412 FLOOR(omap_allocated
);
2413 FLOOR(internal_metadata
);
2417 bool operator ==(const store_statfs_t
& other
) const;
2418 bool is_zero() const {
2419 return *this == store_statfs_t();
2422 uint64_t get_used() const {
2423 return total
- available
- internally_reserved
;
2426 // this accumulates both actually used and statfs's internally_reserved
2427 uint64_t get_used_raw() const {
2428 return total
- available
;
2431 float get_used_raw_ratio() const {
2433 return (float)get_used_raw() / (float)total
;
2439 // helpers to ease legacy code porting
2440 uint64_t kb_avail() const {
2441 return available
>> 10;
2443 uint64_t kb() const {
2446 uint64_t kb_used() const {
2447 return (total
- available
- internally_reserved
) >> 10;
2449 uint64_t kb_used_raw() const {
2450 return get_used_raw() >> 10;
2453 uint64_t kb_used_data() const {
2454 return allocated
>> 10;
2456 uint64_t kb_used_omap() const {
2457 return omap_allocated
>> 10;
2460 uint64_t kb_used_internal_metadata() const {
2461 return internal_metadata
>> 10;
2464 void add(const store_statfs_t
& o
) {
2466 available
+= o
.available
;
2467 internally_reserved
+= o
.internally_reserved
;
2468 allocated
+= o
.allocated
;
2469 data_stored
+= o
.data_stored
;
2470 data_compressed
+= o
.data_compressed
;
2471 data_compressed_allocated
+= o
.data_compressed_allocated
;
2472 data_compressed_original
+= o
.data_compressed_original
;
2473 omap_allocated
+= o
.omap_allocated
;
2474 internal_metadata
+= o
.internal_metadata
;
2476 void sub(const store_statfs_t
& o
) {
2478 available
-= o
.available
;
2479 internally_reserved
-= o
.internally_reserved
;
2480 allocated
-= o
.allocated
;
2481 data_stored
-= o
.data_stored
;
2482 data_compressed
-= o
.data_compressed
;
2483 data_compressed_allocated
-= o
.data_compressed_allocated
;
2484 data_compressed_original
-= o
.data_compressed_original
;
2485 omap_allocated
-= o
.omap_allocated
;
2486 internal_metadata
-= o
.internal_metadata
;
2488 void dump(ceph::Formatter
*f
) const;
2489 DENC(store_statfs_t
, v
, p
) {
2490 DENC_START(1, 1, p
);
2492 denc(v
.available
, p
);
2493 denc(v
.internally_reserved
, p
);
2494 denc(v
.allocated
, p
);
2495 denc(v
.data_stored
, p
);
2496 denc(v
.data_compressed
, p
);
2497 denc(v
.data_compressed_allocated
, p
);
2498 denc(v
.data_compressed_original
, p
);
2499 denc(v
.omap_allocated
, p
);
2500 denc(v
.internal_metadata
, p
);
2503 static void generate_test_instances(std::list
<store_statfs_t
*>& o
);
2505 WRITE_CLASS_DENC(store_statfs_t
)
2507 std::ostream
&operator<<(std::ostream
&lhs
, const store_statfs_t
&rhs
);
2510 * aggregate stats for an osd
2513 store_statfs_t statfs
;
2514 std::vector
<int> hb_peers
;
2515 int32_t snap_trim_queue_len
, num_snap_trimming
;
2516 uint64_t num_shards_repaired
;
2518 pow2_hist_t op_queue_age_hist
;
2520 objectstore_perf_stat_t os_perf_stat
;
2521 osd_alerts_t os_alerts
;
2523 epoch_t up_from
= 0;
2526 uint32_t num_pgs
= 0;
2528 uint32_t num_osds
= 0;
2529 uint32_t num_per_pool_osds
= 0;
2530 uint32_t num_per_pool_omap_osds
= 0;
2533 uint32_t last_update
; // in seconds
2534 uint32_t back_pingtime
[3];
2535 uint32_t back_min
[3];
2536 uint32_t back_max
[3];
2538 uint32_t front_pingtime
[3];
2539 uint32_t front_min
[3];
2540 uint32_t front_max
[3];
2541 uint32_t front_last
;
2543 std::map
<int, Interfaces
> hb_pingtime
; ///< map of osd id to Interfaces
2545 osd_stat_t() : snap_trim_queue_len(0), num_snap_trimming(0),
2546 num_shards_repaired(0) {}
2548 void add(const osd_stat_t
& o
) {
2549 statfs
.add(o
.statfs
);
2550 snap_trim_queue_len
+= o
.snap_trim_queue_len
;
2551 num_snap_trimming
+= o
.num_snap_trimming
;
2552 num_shards_repaired
+= o
.num_shards_repaired
;
2553 op_queue_age_hist
.add(o
.op_queue_age_hist
);
2554 os_perf_stat
.add(o
.os_perf_stat
);
2555 num_pgs
+= o
.num_pgs
;
2556 num_osds
+= o
.num_osds
;
2557 num_per_pool_osds
+= o
.num_per_pool_osds
;
2558 num_per_pool_omap_osds
+= o
.num_per_pool_omap_osds
;
2559 for (const auto& a
: o
.os_alerts
) {
2560 auto& target
= os_alerts
[a
.first
];
2561 for (auto& i
: a
.second
) {
2562 target
.emplace(i
.first
, i
.second
);
2566 void sub(const osd_stat_t
& o
) {
2567 statfs
.sub(o
.statfs
);
2568 snap_trim_queue_len
-= o
.snap_trim_queue_len
;
2569 num_snap_trimming
-= o
.num_snap_trimming
;
2570 num_shards_repaired
-= o
.num_shards_repaired
;
2571 op_queue_age_hist
.sub(o
.op_queue_age_hist
);
2572 os_perf_stat
.sub(o
.os_perf_stat
);
2573 num_pgs
-= o
.num_pgs
;
2574 num_osds
-= o
.num_osds
;
2575 num_per_pool_osds
-= o
.num_per_pool_osds
;
2576 num_per_pool_omap_osds
-= o
.num_per_pool_omap_osds
;
2577 for (const auto& a
: o
.os_alerts
) {
2578 auto& target
= os_alerts
[a
.first
];
2579 for (auto& i
: a
.second
) {
2580 target
.erase(i
.first
);
2582 if (target
.empty()) {
2583 os_alerts
.erase(a
.first
);
2587 void dump(ceph::Formatter
*f
, bool with_net
= true) const;
2588 void dump_ping_time(ceph::Formatter
*f
) const;
2589 void encode(ceph::buffer::list
&bl
, uint64_t features
) const;
2590 void decode(ceph::buffer::list::const_iterator
&bl
);
2591 static void generate_test_instances(std::list
<osd_stat_t
*>& o
);
2593 WRITE_CLASS_ENCODER_FEATURES(osd_stat_t
)
2595 inline bool operator==(const osd_stat_t
& l
, const osd_stat_t
& r
) {
2596 return l
.statfs
== r
.statfs
&&
2597 l
.snap_trim_queue_len
== r
.snap_trim_queue_len
&&
2598 l
.num_snap_trimming
== r
.num_snap_trimming
&&
2599 l
.num_shards_repaired
== r
.num_shards_repaired
&&
2600 l
.hb_peers
== r
.hb_peers
&&
2601 l
.op_queue_age_hist
== r
.op_queue_age_hist
&&
2602 l
.os_perf_stat
== r
.os_perf_stat
&&
2603 l
.num_pgs
== r
.num_pgs
&&
2604 l
.num_osds
== r
.num_osds
&&
2605 l
.num_per_pool_osds
== r
.num_per_pool_osds
&&
2606 l
.num_per_pool_omap_osds
== r
.num_per_pool_omap_osds
;
2608 inline bool operator!=(const osd_stat_t
& l
, const osd_stat_t
& r
) {
2612 inline std::ostream
& operator<<(std::ostream
& out
, const osd_stat_t
& s
) {
2613 return out
<< "osd_stat(" << s
.statfs
<< ", "
2614 << "peers " << s
.hb_peers
2615 << " op hist " << s
.op_queue_age_hist
.h
2620 * summation over an entire pool
2622 struct pool_stat_t
{
2623 object_stat_collection_t stats
;
2624 store_statfs_t store_stats
;
2626 int64_t ondisk_log_size
; // >= active_log_size
2627 int32_t up
; ///< number of up replicas or shards
2628 int32_t acting
; ///< number of acting replicas or shards
2629 int32_t num_store_stats
; ///< amount of store_stats accumulated
2631 pool_stat_t() : log_size(0), ondisk_log_size(0), up(0), acting(0),
2635 void floor(int64_t f
) {
2637 store_stats
.floor(f
);
2640 if (ondisk_log_size
< f
)
2641 ondisk_log_size
= f
;
2646 if (num_store_stats
< f
)
2647 num_store_stats
= f
;
2650 void add(const store_statfs_t
& o
) {
2654 void sub(const store_statfs_t
& o
) {
2659 void add(const pg_stat_t
& o
) {
2661 log_size
+= o
.log_size
;
2662 ondisk_log_size
+= o
.ondisk_log_size
;
2664 acting
+= o
.acting
.size();
2666 void sub(const pg_stat_t
& o
) {
2668 log_size
-= o
.log_size
;
2669 ondisk_log_size
-= o
.ondisk_log_size
;
2671 acting
-= o
.acting
.size();
2674 bool is_zero() const {
2675 return (stats
.is_zero() &&
2676 store_stats
.is_zero() &&
2678 ondisk_log_size
== 0 &&
2681 num_store_stats
== 0);
2684 // helper accessors to retrieve used/netto bytes depending on the
2685 // collection method: new per-pool objectstore report or legacy PG
2686 // summation at OSD.
2687 // In legacy mode used and netto values are the same. But for new per-pool
2688 // collection 'used' provides amount of space ALLOCATED at all related OSDs
2689 // and 'netto' is amount of stored user data.
2690 uint64_t get_allocated_data_bytes(bool per_pool
) const {
2692 return store_stats
.allocated
;
2694 // legacy mode, use numbers from 'stats'
2695 return stats
.sum
.num_bytes
+ stats
.sum
.num_bytes_hit_set_archive
;
2698 uint64_t get_allocated_omap_bytes(bool per_pool_omap
) const {
2699 if (per_pool_omap
) {
2700 return store_stats
.omap_allocated
;
2702 // omap is not broken out by pool by nautilus bluestore; report the
2703 // scrub value. this will be imprecise in that it won't account for
2704 // any storage overhead/efficiency.
2705 return stats
.sum
.num_omap_bytes
;
2708 uint64_t get_user_data_bytes(float raw_used_rate
, ///< space amp factor
2709 bool per_pool
) const {
2710 // NOTE: we need the space amp factor so that we can work backwards from
2711 // the raw utilization to the amount of data that the user actually stored.
2713 return raw_used_rate
? store_stats
.data_stored
/ raw_used_rate
: 0;
2715 // legacy mode, use numbers from 'stats'. note that we do NOT use the
2716 // raw_used_rate factor here because we are working from the PG stats
2718 return stats
.sum
.num_bytes
+ stats
.sum
.num_bytes_hit_set_archive
;
2721 uint64_t get_user_omap_bytes(float raw_used_rate
, ///< space amp factor
2722 bool per_pool_omap
) const {
2723 if (per_pool_omap
) {
2724 return raw_used_rate
? store_stats
.omap_allocated
/ raw_used_rate
: 0;
2726 // omap usage is lazily reported during scrub; this value may lag.
2727 return stats
.sum
.num_omap_bytes
;
2731 void dump(ceph::Formatter
*f
) const;
2732 void encode(ceph::buffer::list
&bl
, uint64_t features
) const;
2733 void decode(ceph::buffer::list::const_iterator
&bl
);
2734 static void generate_test_instances(std::list
<pool_stat_t
*>& o
);
2736 WRITE_CLASS_ENCODER_FEATURES(pool_stat_t
)
2739 // -----------------------------------------
2742 * pg_hit_set_info_t - information about a single recorded HitSet
2744 * Track basic metadata about a HitSet, like the number of insertions
2745 * and the time range it covers.
2747 struct pg_hit_set_info_t
{
2748 utime_t begin
, end
; ///< time interval
2749 eversion_t version
; ///< version this HitSet object was written
2750 bool using_gmt
; ///< use gmt for creating the hit_set archive object name
2752 friend bool operator==(const pg_hit_set_info_t
& l
,
2753 const pg_hit_set_info_t
& r
) {
2755 l
.begin
== r
.begin
&&
2757 l
.version
== r
.version
&&
2758 l
.using_gmt
== r
.using_gmt
;
2761 explicit pg_hit_set_info_t(bool using_gmt
= true)
2762 : using_gmt(using_gmt
) {}
2764 void encode(ceph::buffer::list
&bl
) const;
2765 void decode(ceph::buffer::list::const_iterator
&bl
);
2766 void dump(ceph::Formatter
*f
) const;
2767 static void generate_test_instances(std::list
<pg_hit_set_info_t
*>& o
);
2769 WRITE_CLASS_ENCODER(pg_hit_set_info_t
)
2772 * pg_hit_set_history_t - information about a history of hitsets
2774 * Include information about the currently accumulating hit set as well
2775 * as archived/historical ones.
2777 struct pg_hit_set_history_t
{
2778 eversion_t current_last_update
; ///< last version inserted into current set
2779 std::list
<pg_hit_set_info_t
> history
; ///< archived sets, sorted oldest -> newest
2781 friend bool operator==(const pg_hit_set_history_t
& l
,
2782 const pg_hit_set_history_t
& r
) {
2784 l
.current_last_update
== r
.current_last_update
&&
2785 l
.history
== r
.history
;
2788 void encode(ceph::buffer::list
&bl
) const;
2789 void decode(ceph::buffer::list::const_iterator
&bl
);
2790 void dump(ceph::Formatter
*f
) const;
2791 static void generate_test_instances(std::list
<pg_hit_set_history_t
*>& o
);
2793 WRITE_CLASS_ENCODER(pg_hit_set_history_t
)
2796 // -----------------------------------------
2799 * pg_history_t - information about recent pg peering/mapping history
2801 * This is aggressively shared between OSDs to bound the amount of past
2802 * history they need to worry about.
2804 struct pg_history_t
{
2805 epoch_t epoch_created
= 0; // epoch in which *pg* was created (pool or pg)
2806 epoch_t epoch_pool_created
= 0; // epoch in which *pool* was created
2807 // (note: may be pg creation epoch for
2808 // pre-luminous clusters)
2809 epoch_t last_epoch_started
= 0;; // lower bound on last epoch started (anywhere, not necessarily locally)
2810 // https://docs.ceph.com/docs/master/dev/osd_internals/last_epoch_started/
2811 epoch_t last_interval_started
= 0;; // first epoch of last_epoch_started interval
2812 epoch_t last_epoch_clean
= 0;; // lower bound on last epoch the PG was completely clean.
2813 epoch_t last_interval_clean
= 0;; // first epoch of last_epoch_clean interval
2814 epoch_t last_epoch_split
= 0;; // as parent or child
2815 epoch_t last_epoch_marked_full
= 0;; // pool or cluster
2818 * In the event of a map discontinuity, same_*_since may reflect the first
2819 * map the osd has seen in the new map sequence rather than the actual start
2820 * of the interval. This is ok since a discontinuity at epoch e means there
2821 * must have been a clean interval between e and now and that we cannot be
2822 * in the active set during the interval containing e.
2824 epoch_t same_up_since
= 0;; // same acting set since
2825 epoch_t same_interval_since
= 0;; // same acting AND up set since
2826 epoch_t same_primary_since
= 0;; // same primary at least back through this epoch.
2828 eversion_t last_scrub
;
2829 eversion_t last_deep_scrub
;
2830 utime_t last_scrub_stamp
;
2831 utime_t last_deep_scrub_stamp
;
2832 utime_t last_clean_scrub_stamp
;
2834 /// upper bound on how long prior interval readable (relative to encode time)
2835 ceph::timespan prior_readable_until_ub
= ceph::timespan::zero();
2837 friend bool operator==(const pg_history_t
& l
, const pg_history_t
& r
) {
2839 l
.epoch_created
== r
.epoch_created
&&
2840 l
.epoch_pool_created
== r
.epoch_pool_created
&&
2841 l
.last_epoch_started
== r
.last_epoch_started
&&
2842 l
.last_interval_started
== r
.last_interval_started
&&
2843 l
.last_epoch_clean
== r
.last_epoch_clean
&&
2844 l
.last_interval_clean
== r
.last_interval_clean
&&
2845 l
.last_epoch_split
== r
.last_epoch_split
&&
2846 l
.last_epoch_marked_full
== r
.last_epoch_marked_full
&&
2847 l
.same_up_since
== r
.same_up_since
&&
2848 l
.same_interval_since
== r
.same_interval_since
&&
2849 l
.same_primary_since
== r
.same_primary_since
&&
2850 l
.last_scrub
== r
.last_scrub
&&
2851 l
.last_deep_scrub
== r
.last_deep_scrub
&&
2852 l
.last_scrub_stamp
== r
.last_scrub_stamp
&&
2853 l
.last_deep_scrub_stamp
== r
.last_deep_scrub_stamp
&&
2854 l
.last_clean_scrub_stamp
== r
.last_clean_scrub_stamp
&&
2855 l
.prior_readable_until_ub
== r
.prior_readable_until_ub
;
2859 pg_history_t(epoch_t created
, utime_t stamp
)
2860 : epoch_created(created
),
2861 epoch_pool_created(created
),
2862 same_up_since(created
),
2863 same_interval_since(created
),
2864 same_primary_since(created
),
2865 last_scrub_stamp(stamp
),
2866 last_deep_scrub_stamp(stamp
),
2867 last_clean_scrub_stamp(stamp
) {}
2869 bool merge(const pg_history_t
&other
) {
2870 // Here, we only update the fields which cannot be calculated from the OSDmap.
2871 bool modified
= false;
2872 if (epoch_created
< other
.epoch_created
) {
2873 epoch_created
= other
.epoch_created
;
2876 if (epoch_pool_created
< other
.epoch_pool_created
) {
2877 // FIXME: for jewel compat only; this should either be 0 or always the
2878 // same value across all pg instances.
2879 epoch_pool_created
= other
.epoch_pool_created
;
2882 if (last_epoch_started
< other
.last_epoch_started
) {
2883 last_epoch_started
= other
.last_epoch_started
;
2886 if (last_interval_started
< other
.last_interval_started
) {
2887 last_interval_started
= other
.last_interval_started
;
2888 // if we are learning about a newer *started* interval, our
2889 // readable_until_ub is obsolete
2890 prior_readable_until_ub
= other
.prior_readable_until_ub
;
2892 } else if (other
.last_interval_started
== last_interval_started
&&
2893 other
.prior_readable_until_ub
< prior_readable_until_ub
) {
2894 // if other is the *same* interval, than pull our upper bound in
2895 // if they have a tighter bound.
2896 prior_readable_until_ub
= other
.prior_readable_until_ub
;
2899 if (last_epoch_clean
< other
.last_epoch_clean
) {
2900 last_epoch_clean
= other
.last_epoch_clean
;
2903 if (last_interval_clean
< other
.last_interval_clean
) {
2904 last_interval_clean
= other
.last_interval_clean
;
2907 if (last_epoch_split
< other
.last_epoch_split
) {
2908 last_epoch_split
= other
.last_epoch_split
;
2911 if (last_epoch_marked_full
< other
.last_epoch_marked_full
) {
2912 last_epoch_marked_full
= other
.last_epoch_marked_full
;
2915 if (other
.last_scrub
> last_scrub
) {
2916 last_scrub
= other
.last_scrub
;
2919 if (other
.last_scrub_stamp
> last_scrub_stamp
) {
2920 last_scrub_stamp
= other
.last_scrub_stamp
;
2923 if (other
.last_deep_scrub
> last_deep_scrub
) {
2924 last_deep_scrub
= other
.last_deep_scrub
;
2927 if (other
.last_deep_scrub_stamp
> last_deep_scrub_stamp
) {
2928 last_deep_scrub_stamp
= other
.last_deep_scrub_stamp
;
2931 if (other
.last_clean_scrub_stamp
> last_clean_scrub_stamp
) {
2932 last_clean_scrub_stamp
= other
.last_clean_scrub_stamp
;
2938 void encode(ceph::buffer::list
& bl
) const;
2939 void decode(ceph::buffer::list::const_iterator
& p
);
2940 void dump(ceph::Formatter
*f
) const;
2941 static void generate_test_instances(std::list
<pg_history_t
*>& o
);
2943 ceph::signedspan
refresh_prior_readable_until_ub(
2944 ceph::signedspan now
, ///< now, relative to osd startup_time
2945 ceph::signedspan ub
) { ///< ub, relative to osd startup_time
2947 // prior interval(s) are unreadable; we can zero the upper bound
2948 prior_readable_until_ub
= ceph::signedspan::zero();
2949 return ceph::signedspan::zero();
2951 prior_readable_until_ub
= ub
- now
;
2955 ceph::signedspan
get_prior_readable_until_ub(ceph::signedspan now
) {
2956 if (prior_readable_until_ub
== ceph::signedspan::zero()) {
2957 return ceph::signedspan::zero();
2959 return now
+ prior_readable_until_ub
;
2962 WRITE_CLASS_ENCODER(pg_history_t
)
2964 inline std::ostream
& operator<<(std::ostream
& out
, const pg_history_t
& h
) {
2965 out
<< "ec=" << h
.epoch_created
<< "/" << h
.epoch_pool_created
2966 << " lis/c=" << h
.last_interval_started
2967 << "/" << h
.last_interval_clean
2968 << " les/c/f=" << h
.last_epoch_started
<< "/" << h
.last_epoch_clean
2969 << "/" << h
.last_epoch_marked_full
2970 << " sis=" << h
.same_interval_since
;
2971 if (h
.prior_readable_until_ub
!= ceph::timespan::zero()) {
2972 out
<< " pruub=" << h
.prior_readable_until_ub
;
2979 * pg_info_t - summary of PG statistics.
2982 * - last_complete implies we have all objects that existed as of that
2983 * stamp, OR a newer object, OR have already applied a later delete.
2984 * - if last_complete >= log.tail, then we know pg contents thru log.head.
2985 * otherwise, we have no idea what the pg is supposed to contain.
2989 eversion_t last_update
; ///< last object version applied to store.
2990 eversion_t last_complete
; ///< last version pg was complete through.
2991 epoch_t last_epoch_started
; ///< last epoch at which this pg started on this osd
2992 epoch_t last_interval_started
; ///< first epoch of last_epoch_started interval
2994 version_t last_user_version
; ///< last user object version applied to store
2996 eversion_t log_tail
; ///< oldest log entry.
2998 hobject_t last_backfill
; ///< objects >= this and < last_complete may be missing
3000 interval_set
<snapid_t
> purged_snaps
;
3004 pg_history_t history
;
3005 pg_hit_set_history_t hit_set
;
3007 friend bool operator==(const pg_info_t
& l
, const pg_info_t
& r
) {
3010 l
.last_update
== r
.last_update
&&
3011 l
.last_complete
== r
.last_complete
&&
3012 l
.last_epoch_started
== r
.last_epoch_started
&&
3013 l
.last_interval_started
== r
.last_interval_started
&&
3014 l
.last_user_version
== r
.last_user_version
&&
3015 l
.log_tail
== r
.log_tail
&&
3016 l
.last_backfill
== r
.last_backfill
&&
3017 l
.purged_snaps
== r
.purged_snaps
&&
3018 l
.stats
== r
.stats
&&
3019 l
.history
== r
.history
&&
3020 l
.hit_set
== r
.hit_set
;
3024 : last_epoch_started(0),
3025 last_interval_started(0),
3026 last_user_version(0),
3027 last_backfill(hobject_t::get_max())
3029 // cppcheck-suppress noExplicitConstructor
3032 last_epoch_started(0),
3033 last_interval_started(0),
3034 last_user_version(0),
3035 last_backfill(hobject_t::get_max())
3038 void set_last_backfill(hobject_t pos
) {
3039 last_backfill
= pos
;
3042 bool is_empty() const { return last_update
.version
== 0; }
3043 bool dne() const { return history
.epoch_created
== 0; }
3045 bool has_missing() const { return last_complete
!= last_update
; }
3046 bool is_incomplete() const { return !last_backfill
.is_max(); }
3048 void encode(ceph::buffer::list
& bl
) const;
3049 void decode(ceph::buffer::list::const_iterator
& p
);
3050 void dump(ceph::Formatter
*f
) const;
3051 static void generate_test_instances(std::list
<pg_info_t
*>& o
);
3053 WRITE_CLASS_ENCODER(pg_info_t
)
3055 inline std::ostream
& operator<<(std::ostream
& out
, const pg_info_t
& pgi
)
3057 out
<< pgi
.pgid
<< "(";
3063 out
<< " v " << pgi
.last_update
;
3064 if (pgi
.last_complete
!= pgi
.last_update
)
3065 out
<< " lc " << pgi
.last_complete
;
3066 out
<< " (" << pgi
.log_tail
<< "," << pgi
.last_update
<< "]";
3068 if (pgi
.is_incomplete())
3069 out
<< " lb " << pgi
.last_backfill
;
3070 //out << " c " << pgi.epoch_created;
3071 out
<< " local-lis/les=" << pgi
.last_interval_started
3072 << "/" << pgi
.last_epoch_started
;
3073 out
<< " n=" << pgi
.stats
.stats
.sum
.num_objects
;
3074 out
<< " " << pgi
.history
3080 * pg_fast_info_t - common pg_info_t fields
3082 * These are the fields of pg_info_t (and children) that are updated for
3083 * most IO operations.
3086 * Because we rely on these fields to be applied to the normal
3087 * info struct, adding a new field here that is not also new in info
3088 * means that we must set an incompat OSD feature bit!
3090 struct pg_fast_info_t
{
3091 eversion_t last_update
;
3092 eversion_t last_complete
;
3093 version_t last_user_version
;
3094 struct { // pg_stat_t stats
3096 version_t reported_seq
;
3098 utime_t last_active
;
3099 utime_t last_peered
;
3101 utime_t last_unstale
;
3102 utime_t last_undegraded
;
3103 utime_t last_fullsized
;
3104 int64_t log_size
; // (also ondisk_log_size, which has the same value)
3105 struct { // object_stat_collection_t stats;
3106 struct { // objct_stat_sum_t sum
3107 int64_t num_bytes
; // in bytes
3108 int64_t num_objects
;
3109 int64_t num_object_copies
;
3114 int64_t num_objects_dirty
;
3119 void populate_from(const pg_info_t
& info
) {
3120 last_update
= info
.last_update
;
3121 last_complete
= info
.last_complete
;
3122 last_user_version
= info
.last_user_version
;
3123 stats
.version
= info
.stats
.version
;
3124 stats
.reported_seq
= info
.stats
.reported_seq
;
3125 stats
.last_fresh
= info
.stats
.last_fresh
;
3126 stats
.last_active
= info
.stats
.last_active
;
3127 stats
.last_peered
= info
.stats
.last_peered
;
3128 stats
.last_clean
= info
.stats
.last_clean
;
3129 stats
.last_unstale
= info
.stats
.last_unstale
;
3130 stats
.last_undegraded
= info
.stats
.last_undegraded
;
3131 stats
.last_fullsized
= info
.stats
.last_fullsized
;
3132 stats
.log_size
= info
.stats
.log_size
;
3133 stats
.stats
.sum
.num_bytes
= info
.stats
.stats
.sum
.num_bytes
;
3134 stats
.stats
.sum
.num_objects
= info
.stats
.stats
.sum
.num_objects
;
3135 stats
.stats
.sum
.num_object_copies
= info
.stats
.stats
.sum
.num_object_copies
;
3136 stats
.stats
.sum
.num_rd
= info
.stats
.stats
.sum
.num_rd
;
3137 stats
.stats
.sum
.num_rd_kb
= info
.stats
.stats
.sum
.num_rd_kb
;
3138 stats
.stats
.sum
.num_wr
= info
.stats
.stats
.sum
.num_wr
;
3139 stats
.stats
.sum
.num_wr_kb
= info
.stats
.stats
.sum
.num_wr_kb
;
3140 stats
.stats
.sum
.num_objects_dirty
= info
.stats
.stats
.sum
.num_objects_dirty
;
3143 bool try_apply_to(pg_info_t
* info
) {
3144 if (last_update
<= info
->last_update
)
3146 info
->last_update
= last_update
;
3147 info
->last_complete
= last_complete
;
3148 info
->last_user_version
= last_user_version
;
3149 info
->stats
.version
= stats
.version
;
3150 info
->stats
.reported_seq
= stats
.reported_seq
;
3151 info
->stats
.last_fresh
= stats
.last_fresh
;
3152 info
->stats
.last_active
= stats
.last_active
;
3153 info
->stats
.last_peered
= stats
.last_peered
;
3154 info
->stats
.last_clean
= stats
.last_clean
;
3155 info
->stats
.last_unstale
= stats
.last_unstale
;
3156 info
->stats
.last_undegraded
= stats
.last_undegraded
;
3157 info
->stats
.last_fullsized
= stats
.last_fullsized
;
3158 info
->stats
.log_size
= stats
.log_size
;
3159 info
->stats
.ondisk_log_size
= stats
.log_size
;
3160 info
->stats
.stats
.sum
.num_bytes
= stats
.stats
.sum
.num_bytes
;
3161 info
->stats
.stats
.sum
.num_objects
= stats
.stats
.sum
.num_objects
;
3162 info
->stats
.stats
.sum
.num_object_copies
= stats
.stats
.sum
.num_object_copies
;
3163 info
->stats
.stats
.sum
.num_rd
= stats
.stats
.sum
.num_rd
;
3164 info
->stats
.stats
.sum
.num_rd_kb
= stats
.stats
.sum
.num_rd_kb
;
3165 info
->stats
.stats
.sum
.num_wr
= stats
.stats
.sum
.num_wr
;
3166 info
->stats
.stats
.sum
.num_wr_kb
= stats
.stats
.sum
.num_wr_kb
;
3167 info
->stats
.stats
.sum
.num_objects_dirty
= stats
.stats
.sum
.num_objects_dirty
;
3171 void encode(ceph::buffer::list
& bl
) const {
3172 ENCODE_START(1, 1, bl
);
3173 encode(last_update
, bl
);
3174 encode(last_complete
, bl
);
3175 encode(last_user_version
, bl
);
3176 encode(stats
.version
, bl
);
3177 encode(stats
.reported_seq
, bl
);
3178 encode(stats
.last_fresh
, bl
);
3179 encode(stats
.last_active
, bl
);
3180 encode(stats
.last_peered
, bl
);
3181 encode(stats
.last_clean
, bl
);
3182 encode(stats
.last_unstale
, bl
);
3183 encode(stats
.last_undegraded
, bl
);
3184 encode(stats
.last_fullsized
, bl
);
3185 encode(stats
.log_size
, bl
);
3186 encode(stats
.stats
.sum
.num_bytes
, bl
);
3187 encode(stats
.stats
.sum
.num_objects
, bl
);
3188 encode(stats
.stats
.sum
.num_object_copies
, bl
);
3189 encode(stats
.stats
.sum
.num_rd
, bl
);
3190 encode(stats
.stats
.sum
.num_rd_kb
, bl
);
3191 encode(stats
.stats
.sum
.num_wr
, bl
);
3192 encode(stats
.stats
.sum
.num_wr_kb
, bl
);
3193 encode(stats
.stats
.sum
.num_objects_dirty
, bl
);
3196 void decode(ceph::buffer::list::const_iterator
& p
) {
3198 decode(last_update
, p
);
3199 decode(last_complete
, p
);
3200 decode(last_user_version
, p
);
3201 decode(stats
.version
, p
);
3202 decode(stats
.reported_seq
, p
);
3203 decode(stats
.last_fresh
, p
);
3204 decode(stats
.last_active
, p
);
3205 decode(stats
.last_peered
, p
);
3206 decode(stats
.last_clean
, p
);
3207 decode(stats
.last_unstale
, p
);
3208 decode(stats
.last_undegraded
, p
);
3209 decode(stats
.last_fullsized
, p
);
3210 decode(stats
.log_size
, p
);
3211 decode(stats
.stats
.sum
.num_bytes
, p
);
3212 decode(stats
.stats
.sum
.num_objects
, p
);
3213 decode(stats
.stats
.sum
.num_object_copies
, p
);
3214 decode(stats
.stats
.sum
.num_rd
, p
);
3215 decode(stats
.stats
.sum
.num_rd_kb
, p
);
3216 decode(stats
.stats
.sum
.num_wr
, p
);
3217 decode(stats
.stats
.sum
.num_wr_kb
, p
);
3218 decode(stats
.stats
.sum
.num_objects_dirty
, p
);
3222 WRITE_CLASS_ENCODER(pg_fast_info_t
)
3226 * PastIntervals -- information needed to determine the PriorSet and
3227 * the might_have_unfound set
3229 class PastIntervals
{
3231 using OSDMapRef
= boost::local_shared_ptr
<const OSDMap
>;
3233 using OSDMapRef
= std::shared_ptr
<const OSDMap
>;
3236 struct pg_interval_t
{
3237 std::vector
<int32_t> up
, acting
;
3238 epoch_t first
, last
;
3244 : first(0), last(0),
3245 maybe_went_rw(false),
3251 std::vector
<int32_t> &&up
,
3252 std::vector
<int32_t> &&acting
,
3258 : up(up
), acting(acting
), first(first
), last(last
),
3259 maybe_went_rw(maybe_went_rw
), primary(primary
), up_primary(up_primary
)
3262 void encode(ceph::buffer::list
& bl
) const;
3263 void decode(ceph::buffer::list::const_iterator
& bl
);
3264 void dump(ceph::Formatter
*f
) const;
3265 static void generate_test_instances(std::list
<pg_interval_t
*>& o
);
3269 PastIntervals(PastIntervals
&&rhs
) = default;
3270 PastIntervals
&operator=(PastIntervals
&&rhs
) = default;
3272 PastIntervals(const PastIntervals
&rhs
);
3273 PastIntervals
&operator=(const PastIntervals
&rhs
);
3275 class interval_rep
{
3277 virtual size_t size() const = 0;
3278 virtual bool empty() const = 0;
3279 virtual void clear() = 0;
3280 virtual std::pair
<epoch_t
, epoch_t
> get_bounds() const = 0;
3281 virtual std::set
<pg_shard_t
> get_all_participants(
3282 bool ec_pool
) const = 0;
3283 virtual void add_interval(bool ec_pool
, const pg_interval_t
&interval
) = 0;
3284 virtual std::unique_ptr
<interval_rep
> clone() const = 0;
3285 virtual std::ostream
&print(std::ostream
&out
) const = 0;
3286 virtual void encode(ceph::buffer::list
&bl
) const = 0;
3287 virtual void decode(ceph::buffer::list::const_iterator
&bl
) = 0;
3288 virtual void dump(ceph::Formatter
*f
) const = 0;
3289 virtual void iterate_mayberw_back_to(
3291 std::function
<void(epoch_t
, const std::set
<pg_shard_t
> &)> &&f
) const = 0;
3293 virtual bool has_full_intervals() const { return false; }
3294 virtual void iterate_all_intervals(
3295 std::function
<void(const pg_interval_t
&)> &&f
) const {
3296 ceph_assert(!has_full_intervals());
3297 ceph_abort_msg("not valid for this implementation");
3299 virtual void adjust_start_backwards(epoch_t last_epoch_clean
) = 0;
3301 virtual ~interval_rep() {}
3303 friend class pi_compact_rep
;
3306 std::unique_ptr
<interval_rep
> past_intervals
;
3308 explicit PastIntervals(interval_rep
*rep
) : past_intervals(rep
) {}
3311 void add_interval(bool ec_pool
, const pg_interval_t
&interval
) {
3312 ceph_assert(past_intervals
);
3313 return past_intervals
->add_interval(ec_pool
, interval
);
3316 void encode(ceph::buffer::list
&bl
) const {
3317 ENCODE_START(1, 1, bl
);
3318 if (past_intervals
) {
3321 past_intervals
->encode(bl
);
3323 encode((__u8
)0, bl
);
3328 void decode(ceph::buffer::list::const_iterator
&bl
);
3330 void dump(ceph::Formatter
*f
) const {
3331 ceph_assert(past_intervals
);
3332 past_intervals
->dump(f
);
3334 static void generate_test_instances(std::list
<PastIntervals
*> & o
);
3337 * Determines whether there is an interval change
3339 static bool is_new_interval(
3340 int old_acting_primary
,
3341 int new_acting_primary
,
3342 const std::vector
<int> &old_acting
,
3343 const std::vector
<int> &new_acting
,
3346 const std::vector
<int> &old_up
,
3347 const std::vector
<int> &new_up
,
3352 unsigned old_pg_num
,
3353 unsigned new_pg_num
,
3354 unsigned old_pg_num_pending
,
3355 unsigned new_pg_num_pending
,
3356 bool old_sort_bitwise
,
3357 bool new_sort_bitwise
,
3358 bool old_recovery_deletes
,
3359 bool new_recovery_deletes
,
3360 uint32_t old_crush_count
,
3361 uint32_t new_crush_count
,
3362 uint32_t old_crush_target
,
3363 uint32_t new_crush_target
,
3364 uint32_t old_crush_barrier
,
3365 uint32_t new_crush_barrier
,
3366 int32_t old_crush_member
,
3367 int32_t new_crush_member
,
3372 * Determines whether there is an interval change
3374 static bool is_new_interval(
3375 int old_acting_primary
, ///< [in] primary as of lastmap
3376 int new_acting_primary
, ///< [in] primary as of lastmap
3377 const std::vector
<int> &old_acting
, ///< [in] acting as of lastmap
3378 const std::vector
<int> &new_acting
, ///< [in] acting as of osdmap
3379 int old_up_primary
, ///< [in] up primary of lastmap
3380 int new_up_primary
, ///< [in] up primary of osdmap
3381 const std::vector
<int> &old_up
, ///< [in] up as of lastmap
3382 const std::vector
<int> &new_up
, ///< [in] up as of osdmap
3383 const OSDMap
*osdmap
, ///< [in] current map
3384 const OSDMap
*lastmap
, ///< [in] last map
3385 pg_t pgid
///< [in] pgid for pg
3389 * Integrates a new map into *past_intervals, returns true
3390 * if an interval was closed out.
3392 static bool check_new_interval(
3393 int old_acting_primary
, ///< [in] primary as of lastmap
3394 int new_acting_primary
, ///< [in] primary as of osdmap
3395 const std::vector
<int> &old_acting
, ///< [in] acting as of lastmap
3396 const std::vector
<int> &new_acting
, ///< [in] acting as of osdmap
3397 int old_up_primary
, ///< [in] up primary of lastmap
3398 int new_up_primary
, ///< [in] up primary of osdmap
3399 const std::vector
<int> &old_up
, ///< [in] up as of lastmap
3400 const std::vector
<int> &new_up
, ///< [in] up as of osdmap
3401 epoch_t same_interval_since
, ///< [in] as of osdmap
3402 epoch_t last_epoch_clean
, ///< [in] current
3403 const OSDMap
*osdmap
, ///< [in] current map
3404 const OSDMap
*lastmap
, ///< [in] last map
3405 pg_t pgid
, ///< [in] pgid for pg
3406 const IsPGRecoverablePredicate
&could_have_gone_active
, ///< [in] predicate whether the pg can be active
3407 PastIntervals
*past_intervals
, ///< [out] intervals
3408 std::ostream
*out
= 0 ///< [out] debug ostream
3410 static bool check_new_interval(
3411 int old_acting_primary
, ///< [in] primary as of lastmap
3412 int new_acting_primary
, ///< [in] primary as of osdmap
3413 const std::vector
<int> &old_acting
, ///< [in] acting as of lastmap
3414 const std::vector
<int> &new_acting
, ///< [in] acting as of osdmap
3415 int old_up_primary
, ///< [in] up primary of lastmap
3416 int new_up_primary
, ///< [in] up primary of osdmap
3417 const std::vector
<int> &old_up
, ///< [in] up as of lastmap
3418 const std::vector
<int> &new_up
, ///< [in] up as of osdmap
3419 epoch_t same_interval_since
, ///< [in] as of osdmap
3420 epoch_t last_epoch_clean
, ///< [in] current
3421 OSDMapRef osdmap
, ///< [in] current map
3422 OSDMapRef lastmap
, ///< [in] last map
3423 pg_t pgid
, ///< [in] pgid for pg
3424 const IsPGRecoverablePredicate
&could_have_gone_active
, ///< [in] predicate whether the pg can be active
3425 PastIntervals
*past_intervals
, ///< [out] intervals
3426 std::ostream
*out
= 0 ///< [out] debug ostream
3428 return check_new_interval(
3429 old_acting_primary
, new_acting_primary
,
3430 old_acting
, new_acting
,
3431 old_up_primary
, new_up_primary
,
3433 same_interval_since
, last_epoch_clean
,
3434 osdmap
.get(), lastmap
.get(),
3436 could_have_gone_active
,
3441 friend std::ostream
& operator<<(std::ostream
& out
, const PastIntervals
&i
);
3443 template <typename F
>
3444 void iterate_mayberw_back_to(
3447 ceph_assert(past_intervals
);
3448 past_intervals
->iterate_mayberw_back_to(les
, std::forward
<F
>(f
));
3451 ceph_assert(past_intervals
);
3452 past_intervals
->clear();
3456 * Should return a value which gives an indication of the amount
3457 * of state contained
3459 size_t size() const {
3460 ceph_assert(past_intervals
);
3461 return past_intervals
->size();
3464 bool empty() const {
3465 ceph_assert(past_intervals
);
3466 return past_intervals
->empty();
3469 void swap(PastIntervals
&other
) {
3471 swap(other
.past_intervals
, past_intervals
);
3475 * Return all shards which have been in the acting set back to the
3476 * latest epoch to which we have trimmed except for pg_whoami
3478 std::set
<pg_shard_t
> get_might_have_unfound(
3479 pg_shard_t pg_whoami
,
3480 bool ec_pool
) const {
3481 ceph_assert(past_intervals
);
3482 auto ret
= past_intervals
->get_all_participants(ec_pool
);
3483 ret
.erase(pg_whoami
);
3488 * Return all shards which we might want to talk to for peering
3490 std::set
<pg_shard_t
> get_all_probe(
3491 bool ec_pool
) const {
3492 ceph_assert(past_intervals
);
3493 return past_intervals
->get_all_participants(ec_pool
);
3496 /* Return the set of epochs [start, end) represented by the
3497 * past_interval set.
3499 std::pair
<epoch_t
, epoch_t
> get_bounds() const {
3500 ceph_assert(past_intervals
);
3501 return past_intervals
->get_bounds();
3504 void adjust_start_backwards(epoch_t last_epoch_clean
) {
3505 ceph_assert(past_intervals
);
3506 past_intervals
->adjust_start_backwards(last_epoch_clean
);
3516 bool ec_pool
= false;
3517 std::set
<pg_shard_t
> probe
; ///< current+prior OSDs we need to probe.
3518 std::set
<int> down
; ///< down osds that would normally be in @a probe and might be interesting.
3519 std::map
<int, epoch_t
> blocked_by
; ///< current lost_at values for any OSDs in cur set for which (re)marking them lost would affect cur set
3521 bool pg_down
= false; ///< some down osds are included in @a cur; the DOWN pg state bit should be set.
3522 const IsPGRecoverablePredicate
* pcontdec
= nullptr;
3524 PriorSet() = default;
3525 PriorSet(PriorSet
&&) = default;
3526 PriorSet
&operator=(PriorSet
&&) = default;
3528 PriorSet
&operator=(const PriorSet
&) = delete;
3529 PriorSet(const PriorSet
&) = delete;
3531 bool operator==(const PriorSet
&rhs
) const {
3532 return (ec_pool
== rhs
.ec_pool
) &&
3533 (probe
== rhs
.probe
) &&
3534 (down
== rhs
.down
) &&
3535 (blocked_by
== rhs
.blocked_by
) &&
3536 (pg_down
== rhs
.pg_down
);
3539 bool affected_by_map(
3540 const OSDMap
&osdmap
,
3541 const DoutPrefixProvider
*dpp
) const;
3543 // For verifying tests
3546 std::set
<pg_shard_t
> probe
,
3548 std::map
<int, epoch_t
> blocked_by
,
3550 const IsPGRecoverablePredicate
*pcontdec
)
3551 : ec_pool(ec_pool
), probe(probe
), down(down
), blocked_by(blocked_by
),
3552 pg_down(pg_down
), pcontdec(pcontdec
) {}
3555 template <typename F
>
3557 const PastIntervals
&past_intervals
,
3559 epoch_t last_epoch_started
,
3560 const IsPGRecoverablePredicate
*c
,
3562 const std::vector
<int> &up
,
3563 const std::vector
<int> &acting
,
3564 const DoutPrefixProvider
*dpp
);
3566 friend class PastIntervals
;
3569 template <typename
... Args
>
3570 PriorSet
get_prior_set(Args
&&... args
) const {
3571 return PriorSet(*this, std::forward
<Args
>(args
)...);
3574 WRITE_CLASS_ENCODER(PastIntervals
)
3576 std::ostream
& operator<<(std::ostream
& out
, const PastIntervals::pg_interval_t
& i
);
3577 std::ostream
& operator<<(std::ostream
& out
, const PastIntervals
&i
);
3578 std::ostream
& operator<<(std::ostream
& out
, const PastIntervals::PriorSet
&i
);
3580 template <typename F
>
3581 PastIntervals::PriorSet::PriorSet(
3582 const PastIntervals
&past_intervals
,
3584 epoch_t last_epoch_started
,
3585 const IsPGRecoverablePredicate
*c
,
3587 const std::vector
<int> &up
,
3588 const std::vector
<int> &acting
,
3589 const DoutPrefixProvider
*dpp
)
3590 : ec_pool(ec_pool
), pg_down(false), pcontdec(c
)
3593 * We have to be careful to gracefully deal with situations like
3594 * so. Say we have a power outage or something that takes out both
3595 * OSDs, but the monitor doesn't mark them down in the same epoch.
3596 * The history may look like
3600 * 3: let's say B dies for good, too (say, from the power spike)
3603 * which makes it look like B may have applied updates to the PG
3604 * that we need in order to proceed. This sucks...
3606 * To minimize the risk of this happening, we CANNOT go active if
3607 * _any_ OSDs in the prior set are down until we send an MOSDAlive
3608 * to the monitor such that the OSDMap sets osd_up_thru to an epoch.
3609 * Then, we have something like
3616 * -> we can ignore B, bc it couldn't have gone active (alive_thru
3627 * -> we must wait for B, bc it was alive through 2, and could have
3628 * written to the pg.
3630 * If B is really dead, then an administrator will need to manually
3631 * intervene by marking the OSD as "lost."
3634 // Include current acting and up nodes... not because they may
3635 // contain old data (this interval hasn't gone active, obviously),
3636 // but because we want their pg_info to inform choose_acting(), and
3637 // so that we know what they do/do not have explicitly before
3638 // sending them any new info/logs/whatever.
3639 for (unsigned i
= 0; i
< acting
.size(); i
++) {
3640 if (acting
[i
] != pg_pool_t::pg_CRUSH_ITEM_NONE
)
3641 probe
.insert(pg_shard_t(acting
[i
], ec_pool
? shard_id_t(i
) : shard_id_t::NO_SHARD
));
3643 // It may be possible to exclude the up nodes, but let's keep them in
3645 for (unsigned i
= 0; i
< up
.size(); i
++) {
3646 if (up
[i
] != pg_pool_t::pg_CRUSH_ITEM_NONE
)
3647 probe
.insert(pg_shard_t(up
[i
], ec_pool
? shard_id_t(i
) : shard_id_t::NO_SHARD
));
3650 std::set
<pg_shard_t
> all_probe
= past_intervals
.get_all_probe(ec_pool
);
3651 ldpp_dout(dpp
, 10) << "build_prior all_probe " << all_probe
<< dendl
;
3652 for (auto &&i
: all_probe
) {
3653 switch (f(0, i
.osd
, nullptr)) {
3667 past_intervals
.iterate_mayberw_back_to(
3669 [&](epoch_t start
, const std::set
<pg_shard_t
> &acting
) {
3670 ldpp_dout(dpp
, 10) << "build_prior maybe_rw interval:" << start
3671 << ", acting: " << acting
<< dendl
;
3673 // look at candidate osds during this interval. each falls into
3674 // one of three categories: up, down (but potentially
3675 // interesting), or lost (down, but we won't wait for it).
3676 std::set
<pg_shard_t
> up_now
;
3677 std::map
<int, epoch_t
> candidate_blocked_by
;
3678 // any candidates down now (that might have useful data)
3679 bool any_down_now
= false;
3681 // consider ACTING osds
3682 for (auto &&so
: acting
) {
3683 epoch_t lost_at
= 0;
3684 switch (f(start
, so
.osd
, &lost_at
)) {
3686 // include past acting osds if they are up.
3691 ldpp_dout(dpp
, 10) << "build_prior prior osd." << so
.osd
3692 << " no longer exists" << dendl
;
3696 ldpp_dout(dpp
, 10) << "build_prior prior osd." << so
.osd
3697 << " is down, but lost_at " << lost_at
<< dendl
;
3702 ldpp_dout(dpp
, 10) << "build_prior prior osd." << so
.osd
3703 << " is down" << dendl
;
3704 candidate_blocked_by
[so
.osd
] = lost_at
;
3705 any_down_now
= true;
3711 // if not enough osds survived this interval, and we may have gone rw,
3712 // then we need to wait for one of those osds to recover to
3713 // ensure that we haven't lost any information.
3714 if (!(*pcontdec
)(up_now
) && any_down_now
) {
3715 // fixme: how do we identify a "clean" shutdown anyway?
3716 ldpp_dout(dpp
, 10) << "build_prior possibly went active+rw,"
3717 << " insufficient up; including down osds" << dendl
;
3718 ceph_assert(!candidate_blocked_by
.empty());
3721 candidate_blocked_by
.begin(),
3722 candidate_blocked_by
.end());
3726 ldpp_dout(dpp
, 10) << "build_prior final: probe " << probe
3728 << " blocked_by " << blocked_by
3729 << (pg_down
? " pg_down":"")
3733 struct pg_notify_t
{
3734 epoch_t query_epoch
;
3739 PastIntervals past_intervals
;
3741 query_epoch(0), epoch_sent(0), to(shard_id_t::NO_SHARD
),
3742 from(shard_id_t::NO_SHARD
) {}
3746 epoch_t query_epoch
,
3748 const pg_info_t
&info
,
3749 const PastIntervals
& pi
)
3750 : query_epoch(query_epoch
),
3751 epoch_sent(epoch_sent
),
3752 info(info
), to(to
), from(from
),
3753 past_intervals(pi
) {
3754 ceph_assert(from
== info
.pgid
.shard
);
3756 void encode(ceph::buffer::list
&bl
) const;
3757 void decode(ceph::buffer::list::const_iterator
&p
);
3758 void dump(ceph::Formatter
*f
) const;
3759 static void generate_test_instances(std::list
<pg_notify_t
*> &o
);
3761 WRITE_CLASS_ENCODER(pg_notify_t
)
3762 std::ostream
&operator<<(std::ostream
&lhs
, const pg_notify_t
¬ify
);
3766 * pg_query_t - used to ask a peer for information about a pg.
3768 * note: if version=0, type=LOG, then we just provide our full log.
3777 std::string_view
get_type_name() const {
3779 case INFO
: return "info";
3780 case LOG
: return "log";
3781 case MISSING
: return "missing";
3782 case FULLLOG
: return "fulllog";
3783 default: return "???";
3789 pg_history_t history
;
3794 pg_query_t() : type(-1), epoch_sent(0), to(shard_id_t::NO_SHARD
),
3795 from(shard_id_t::NO_SHARD
) {}
3800 const pg_history_t
& h
,
3804 epoch_sent(epoch_sent
),
3805 to(to
), from(from
) {
3806 ceph_assert(t
!= LOG
);
3813 const pg_history_t
& h
,
3815 : type(t
), since(s
), history(h
),
3816 epoch_sent(epoch_sent
), to(to
), from(from
) {
3817 ceph_assert(t
== LOG
);
3820 void encode(ceph::buffer::list
&bl
, uint64_t features
) const;
3821 void decode(ceph::buffer::list::const_iterator
&bl
);
3823 void dump(ceph::Formatter
*f
) const;
3824 static void generate_test_instances(std::list
<pg_query_t
*>& o
);
3826 WRITE_CLASS_ENCODER_FEATURES(pg_query_t
)
3828 inline std::ostream
& operator<<(std::ostream
& out
, const pg_query_t
& q
) {
3829 out
<< "query(" << q
.get_type_name() << " " << q
.since
;
3830 if (q
.type
== pg_query_t::LOG
)
3831 out
<< " " << q
.history
;
3832 out
<< " epoch_sent " << q
.epoch_sent
;
3838 * pg_lease_t - readable lease metadata, from primary -> non-primary
3840 * This metadata serves to increase either or both of the lease expiration
3841 * and upper bound on the non-primary.
3844 /// pg readable_until value; replicas must not be readable beyond this
3845 ceph::signedspan readable_until
= ceph::signedspan::zero();
3847 /// upper bound on any acting osd's readable_until
3848 ceph::signedspan readable_until_ub
= ceph::signedspan::zero();
3850 /// duration of the lease (in case clock deltas aren't available)
3851 ceph::signedspan interval
= ceph::signedspan::zero();
3854 pg_lease_t(ceph::signedspan ru
, ceph::signedspan ruub
,
3856 : readable_until(ru
),
3857 readable_until_ub(ruub
),
3860 void encode(ceph::buffer::list
&bl
) const;
3861 void decode(ceph::buffer::list::const_iterator
&bl
);
3862 void dump(ceph::Formatter
*f
) const;
3863 static void generate_test_instances(std::list
<pg_lease_t
*>& o
);
3865 friend std::ostream
& operator<<(std::ostream
& out
, const pg_lease_t
& l
) {
3866 return out
<< "pg_lease(ru " << l
.readable_until
3867 << " ub " << l
.readable_until_ub
3868 << " int " << l
.interval
<< ")";
3871 WRITE_CLASS_ENCODER(pg_lease_t
)
3874 * pg_lease_ack_t - lease ack, from non-primary -> primary
3876 * This metadata acknowledges to the primary what a non-primary's noted
3879 struct pg_lease_ack_t
{
3880 /// highest upper bound non-primary has recorded (primary's clock)
3881 ceph::signedspan readable_until_ub
= ceph::signedspan::zero();
3884 pg_lease_ack_t(ceph::signedspan ub
)
3885 : readable_until_ub(ub
) {}
3887 void encode(ceph::buffer::list
&bl
) const;
3888 void decode(ceph::buffer::list::const_iterator
&bl
);
3889 void dump(ceph::Formatter
*f
) const;
3890 static void generate_test_instances(std::list
<pg_lease_ack_t
*>& o
);
3892 friend std::ostream
& operator<<(std::ostream
& out
, const pg_lease_ack_t
& l
) {
3893 return out
<< "pg_lease_ack(ruub " << l
.readable_until_ub
<< ")";
3896 WRITE_CLASS_ENCODER(pg_lease_ack_t
)
3901 class ObjectModDesc
{
3902 bool can_local_rollback
;
3903 bool rollback_info_completed
;
3905 // version required to decode, reflected in encode/decode version
3906 __u8 max_required_version
= 1;
3910 virtual void append(uint64_t old_offset
) {}
3911 virtual void setattrs(std::map
<std::string
, std::optional
<ceph::buffer::list
>> &attrs
) {}
3912 virtual void rmobject(version_t old_version
) {}
3914 * Used to support the unfound_lost_delete log event: if the stashed
3915 * version exists, we unstash it, otherwise, we do nothing. This way
3916 * each replica rolls back to whatever state it had prior to the attempt
3917 * at mark unfound lost delete
3919 virtual void try_rmobject(version_t old_version
) {
3920 rmobject(old_version
);
3922 virtual void create() {}
3923 virtual void update_snaps(const std::set
<snapid_t
> &old_snaps
) {}
3924 virtual void rollback_extents(
3926 const std::vector
<std::pair
<uint64_t, uint64_t> > &extents
) {}
3927 virtual ~Visitor() {}
3929 void visit(Visitor
*visitor
) const;
3930 mutable ceph::buffer::list bl
;
3938 ROLLBACK_EXTENTS
= 7
3940 ObjectModDesc() : can_local_rollback(true), rollback_info_completed(false) {
3941 bl
.reassign_to_mempool(mempool::mempool_osd_pglog
);
3943 void claim(ObjectModDesc
&other
) {
3944 bl
= std::move(other
.bl
);
3945 can_local_rollback
= other
.can_local_rollback
;
3946 rollback_info_completed
= other
.rollback_info_completed
;
3948 void claim_append(ObjectModDesc
&other
) {
3949 if (!can_local_rollback
|| rollback_info_completed
)
3951 if (!other
.can_local_rollback
) {
3952 mark_unrollbackable();
3955 bl
.claim_append(other
.bl
);
3956 rollback_info_completed
= other
.rollback_info_completed
;
3958 void swap(ObjectModDesc
&other
) {
3962 swap(other
.can_local_rollback
, can_local_rollback
);
3963 swap(other
.rollback_info_completed
, rollback_info_completed
);
3964 swap(other
.max_required_version
, max_required_version
);
3966 void append_id(ModID id
) {
3971 void append(uint64_t old_size
) {
3972 if (!can_local_rollback
|| rollback_info_completed
)
3974 ENCODE_START(1, 1, bl
);
3976 encode(old_size
, bl
);
3979 void setattrs(std::map
<std::string
, std::optional
<ceph::buffer::list
>> &old_attrs
) {
3980 if (!can_local_rollback
|| rollback_info_completed
)
3982 ENCODE_START(1, 1, bl
);
3983 append_id(SETATTRS
);
3984 encode(old_attrs
, bl
);
3987 bool rmobject(version_t deletion_version
) {
3988 if (!can_local_rollback
|| rollback_info_completed
)
3990 ENCODE_START(1, 1, bl
);
3992 encode(deletion_version
, bl
);
3994 rollback_info_completed
= true;
3997 bool try_rmobject(version_t deletion_version
) {
3998 if (!can_local_rollback
|| rollback_info_completed
)
4000 ENCODE_START(1, 1, bl
);
4001 append_id(TRY_DELETE
);
4002 encode(deletion_version
, bl
);
4004 rollback_info_completed
= true;
4008 if (!can_local_rollback
|| rollback_info_completed
)
4010 rollback_info_completed
= true;
4011 ENCODE_START(1, 1, bl
);
4015 void update_snaps(const std::set
<snapid_t
> &old_snaps
) {
4016 if (!can_local_rollback
|| rollback_info_completed
)
4018 ENCODE_START(1, 1, bl
);
4019 append_id(UPDATE_SNAPS
);
4020 encode(old_snaps
, bl
);
4023 void rollback_extents(
4024 version_t gen
, const std::vector
<std::pair
<uint64_t, uint64_t> > &extents
) {
4025 ceph_assert(can_local_rollback
);
4026 ceph_assert(!rollback_info_completed
);
4027 if (max_required_version
< 2)
4028 max_required_version
= 2;
4029 ENCODE_START(2, 2, bl
);
4030 append_id(ROLLBACK_EXTENTS
);
4032 encode(extents
, bl
);
4036 // cannot be rolled back
4037 void mark_unrollbackable() {
4038 can_local_rollback
= false;
4041 bool can_rollback() const {
4042 return can_local_rollback
;
4044 bool empty() const {
4045 return can_local_rollback
&& (bl
.length() == 0);
4048 bool requires_kraken() const {
4049 return max_required_version
>= 2;
4053 * Create fresh copy of bl bytes to avoid keeping large buffers around
4054 * in the case that bl contains ptrs which point into a much larger
4057 void trim_bl() const {
4058 if (bl
.length() > 0)
4061 void encode(ceph::buffer::list
&bl
) const;
4062 void decode(ceph::buffer::list::const_iterator
&bl
);
4063 void dump(ceph::Formatter
*f
) const;
4064 static void generate_test_instances(std::list
<ObjectModDesc
*>& o
);
4066 WRITE_CLASS_ENCODER(ObjectModDesc
)
4068 class ObjectCleanRegions
{
4072 interval_set
<uint64_t> clean_offsets
;
4073 static std::atomic
<uint32_t> max_num_intervals
;
4076 * trim the number of intervals if clean_offsets.num_intervals()
4077 * exceeds the given upbound max_num_intervals
4078 * etc. max_num_intervals=2, clean_offsets:{[5~10], [20~5]}
4079 * then new interval [30~10] will evict out the shortest one [20~5]
4080 * finally, clean_offsets becomes {[5~10], [30~10]}
4083 friend std::ostream
& operator<<(std::ostream
& out
, const ObjectCleanRegions
& ocr
);
4085 ObjectCleanRegions() : new_object(false), clean_omap(true) {
4086 clean_offsets
.insert(0, (uint64_t)-1);
4088 ObjectCleanRegions(uint64_t offset
, uint64_t len
, bool co
)
4089 : new_object(false), clean_omap(co
) {
4090 clean_offsets
.insert(offset
, len
);
4092 bool operator==(const ObjectCleanRegions
&orc
) const {
4093 return new_object
== orc
.new_object
&& clean_omap
== orc
.clean_omap
&& clean_offsets
== orc
.clean_offsets
;
4095 static void set_max_num_intervals(uint32_t num
);
4096 void merge(const ObjectCleanRegions
&other
);
4097 void mark_data_region_dirty(uint64_t offset
, uint64_t len
);
4098 void mark_omap_dirty();
4099 void mark_object_new();
4100 void mark_fully_dirty();
4101 interval_set
<uint64_t> get_dirty_regions() const;
4102 bool omap_is_dirty() const;
4103 bool object_is_exist() const;
4104 bool is_clean_region(uint64_t offset
, uint64_t len
) const;
4106 void encode(ceph::buffer::list
&bl
) const;
4107 void decode(ceph::buffer::list::const_iterator
&bl
);
4108 void dump(ceph::Formatter
*f
) const;
4109 static void generate_test_instances(std::list
<ObjectCleanRegions
*>& o
);
4111 WRITE_CLASS_ENCODER(ObjectCleanRegions
)
4112 std::ostream
& operator<<(std::ostream
& out
, const ObjectCleanRegions
& ocr
);
4119 ceph::buffer::list indata
, outdata
;
4120 errorcode32_t rval
= 0;
4123 // FIPS zeroization audit 20191115: this memset clean for security
4124 memset(&op
, 0, sizeof(ceph_osd_op
));
4127 OSDOp(const int op_code
) {
4128 // FIPS zeroization audit 20191115: this memset clean for security
4129 memset(&op
, 0, sizeof(ceph_osd_op
));
4134 * split a ceph::buffer::list into constituent indata members of a vector of OSDOps
4136 * @param ops [out] vector of OSDOps
4137 * @param in [in] combined data buffer
4139 template<typename V
>
4140 static void split_osd_op_vector_in_data(V
& ops
,
4141 ceph::buffer::list
& in
) {
4142 ceph::buffer::list::iterator datap
= in
.begin();
4143 for (unsigned i
= 0; i
< ops
.size(); i
++) {
4144 if (ops
[i
].op
.payload_len
) {
4145 datap
.copy(ops
[i
].op
.payload_len
, ops
[i
].indata
);
4151 * merge indata members of a vector of OSDOp into a single ceph::buffer::list
4153 * Notably this also encodes certain other OSDOp data into the data
4154 * buffer, including the sobject_t soid.
4156 * @param ops [in] vector of OSDOps
4157 * @param out [out] combined data buffer
4159 template<typename V
>
4160 static void merge_osd_op_vector_in_data(V
& ops
, ceph::buffer::list
& out
) {
4161 for (unsigned i
= 0; i
< ops
.size(); i
++) {
4162 if (ops
[i
].indata
.length()) {
4163 ops
[i
].op
.payload_len
= ops
[i
].indata
.length();
4164 out
.append(ops
[i
].indata
);
4170 * split a ceph::buffer::list into constituent outdata members of a vector of OSDOps
4172 * @param ops [out] vector of OSDOps
4173 * @param in [in] combined data buffer
4175 static void split_osd_op_vector_out_data(std::vector
<OSDOp
>& ops
, ceph::buffer::list
& in
);
4178 * merge outdata members of a vector of OSDOps into a single ceph::buffer::list
4180 * @param ops [in] vector of OSDOps
4181 * @param out [out] combined data buffer
4183 static void merge_osd_op_vector_out_data(std::vector
<OSDOp
>& ops
, ceph::buffer::list
& out
);
4186 * Clear data as much as possible, leave minimal data for historical op dump
4188 * @param ops [in] vector of OSDOps
4190 template<typename V
>
4191 static void clear_data(V
& ops
) {
4192 for (unsigned i
= 0; i
< ops
.size(); i
++) {
4195 if (ceph_osd_op_type_attr(op
.op
.op
) &&
4196 op
.op
.xattr
.name_len
&&
4197 op
.indata
.length() >= op
.op
.xattr
.name_len
) {
4198 ceph::buffer::list bl
;
4199 bl
.push_back(ceph::buffer::ptr_node::create(op
.op
.xattr
.name_len
));
4200 bl
.begin().copy_in(op
.op
.xattr
.name_len
, op
.indata
);
4201 op
.indata
= std::move(bl
);
4202 } else if (ceph_osd_op_type_exec(op
.op
.op
) &&
4203 op
.op
.cls
.class_len
&&
4204 op
.indata
.length() >
4205 (op
.op
.cls
.class_len
+ op
.op
.cls
.method_len
)) {
4206 __u8 len
= op
.op
.cls
.class_len
+ op
.op
.cls
.method_len
;
4207 ceph::buffer::list bl
;
4208 bl
.push_back(ceph::buffer::ptr_node::create(len
));
4209 bl
.begin().copy_in(len
, op
.indata
);
4210 op
.indata
= std::move(bl
);
4217 std::ostream
& operator<<(std::ostream
& out
, const OSDOp
& op
);
4219 struct pg_log_op_return_item_t
{
4221 ceph::buffer::list bl
;
4222 void encode(ceph::buffer::list
& p
) const {
4227 void decode(ceph::buffer::list::const_iterator
& p
) {
4232 void dump(ceph::Formatter
*f
) const {
4233 f
->dump_int("rval", rval
);
4234 f
->dump_unsigned("bl_length", bl
.length());
4236 friend bool operator==(const pg_log_op_return_item_t
& lhs
,
4237 const pg_log_op_return_item_t
& rhs
) {
4238 return lhs
.rval
== rhs
.rval
&&
4239 lhs
.bl
.contents_equal(rhs
.bl
);
4241 friend bool operator!=(const pg_log_op_return_item_t
& lhs
,
4242 const pg_log_op_return_item_t
& rhs
) {
4243 return !(lhs
== rhs
);
4245 friend std::ostream
& operator<<(std::ostream
& out
, const pg_log_op_return_item_t
& i
) {
4246 return out
<< "r=" << i
.rval
<< "+" << i
.bl
.length() << "b";
4249 WRITE_CLASS_ENCODER(pg_log_op_return_item_t
)
4252 * pg_log_entry_t - single entry/event in pg log
4255 struct pg_log_entry_t
{
4257 MODIFY
= 1, // some unspecified modification (but not *all* modifications)
4258 CLONE
= 2, // cloned object from head
4259 DELETE
= 3, // deleted object
4260 //BACKLOG = 4, // event invented by generate_backlog [obsolete]
4261 LOST_REVERT
= 5, // lost new version, revert to an older version.
4262 LOST_DELETE
= 6, // lost new version, revert to no object (deleted).
4263 LOST_MARK
= 7, // lost new version, now EIO
4264 PROMOTE
= 8, // promoted object from another tier
4265 CLEAN
= 9, // mark an object clean
4266 ERROR
= 10, // write that returned an error
4268 static const char *get_op_name(int op
) {
4292 const char *get_op_name() const {
4293 return get_op_name(op
);
4296 // describes state for a locally-rollbackable entry
4297 ObjectModDesc mod_desc
;
4298 ceph::buffer::list snaps
; // only for clone entries
4300 osd_reqid_t reqid
; // caller+tid to uniquely identify request
4301 mempool::osd_pglog::vector
<std::pair
<osd_reqid_t
, version_t
> > extra_reqids
;
4303 /// map extra_reqids by index to error return code (if any)
4304 mempool::osd_pglog::map
<uint32_t, int> extra_reqid_return_codes
;
4306 eversion_t version
, prior_version
, reverting_to
;
4307 version_t user_version
; // the user version for this entry
4308 utime_t mtime
; // this is the _user_ mtime, mind you
4309 int32_t return_code
; // only stored for ERRORs for dup detection
4311 std::vector
<pg_log_op_return_item_t
> op_returns
;
4314 bool invalid_hash
; // only when decoding sobject_t based entries
4315 bool invalid_pool
; // only when decoding pool-less hobject based entries
4316 ObjectCleanRegions clean_regions
;
4319 : user_version(0), return_code(0), op(0),
4320 invalid_hash(false), invalid_pool(false) {
4321 snaps
.reassign_to_mempool(mempool::mempool_osd_pglog
);
4323 pg_log_entry_t(int _op
, const hobject_t
& _soid
,
4324 const eversion_t
& v
, const eversion_t
& pv
,
4326 const osd_reqid_t
& rid
, const utime_t
& mt
,
4328 : soid(_soid
), reqid(rid
), version(v
), prior_version(pv
), user_version(uv
),
4329 mtime(mt
), return_code(return_code
), op(_op
),
4330 invalid_hash(false), invalid_pool(false) {
4331 snaps
.reassign_to_mempool(mempool::mempool_osd_pglog
);
4334 bool is_clone() const { return op
== CLONE
; }
4335 bool is_modify() const { return op
== MODIFY
; }
4336 bool is_promote() const { return op
== PROMOTE
; }
4337 bool is_clean() const { return op
== CLEAN
; }
4338 bool is_lost_revert() const { return op
== LOST_REVERT
; }
4339 bool is_lost_delete() const { return op
== LOST_DELETE
; }
4340 bool is_lost_mark() const { return op
== LOST_MARK
; }
4341 bool is_error() const { return op
== ERROR
; }
4343 bool is_update() const {
4345 is_clone() || is_modify() || is_promote() || is_clean() ||
4346 is_lost_revert() || is_lost_mark();
4348 bool is_delete() const {
4349 return op
== DELETE
|| op
== LOST_DELETE
;
4352 bool can_rollback() const {
4353 return mod_desc
.can_rollback();
4356 void mark_unrollbackable() {
4357 mod_desc
.mark_unrollbackable();
4360 bool requires_kraken() const {
4361 return mod_desc
.requires_kraken();
4364 // Errors are only used for dup detection, whereas
4365 // the index by objects is used by recovery, copy_get,
4366 // and other facilities that don't expect or need to
4367 // be aware of error entries.
4368 bool object_is_indexed() const {
4372 bool reqid_is_indexed() const {
4373 return reqid
!= osd_reqid_t() &&
4374 (op
== MODIFY
|| op
== DELETE
|| op
== ERROR
);
4377 void set_op_returns(const std::vector
<OSDOp
>& ops
) {
4378 op_returns
.resize(ops
.size());
4379 for (unsigned i
= 0; i
< ops
.size(); ++i
) {
4380 op_returns
[i
].rval
= ops
[i
].rval
;
4381 op_returns
[i
].bl
= ops
[i
].outdata
;
4385 std::string
get_key_name() const;
4386 void encode_with_checksum(ceph::buffer::list
& bl
) const;
4387 void decode_with_checksum(ceph::buffer::list::const_iterator
& p
);
4389 void encode(ceph::buffer::list
&bl
) const;
4390 void decode(ceph::buffer::list::const_iterator
&bl
);
4391 void dump(ceph::Formatter
*f
) const;
4392 static void generate_test_instances(std::list
<pg_log_entry_t
*>& o
);
4395 WRITE_CLASS_ENCODER(pg_log_entry_t
)
4397 std::ostream
& operator<<(std::ostream
& out
, const pg_log_entry_t
& e
);
4399 struct pg_log_dup_t
{
4400 osd_reqid_t reqid
; // caller+tid to uniquely identify request
4402 version_t user_version
; // the user version for this entry
4403 int32_t return_code
; // only stored for ERRORs for dup detection
4405 std::vector
<pg_log_op_return_item_t
> op_returns
;
4408 : user_version(0), return_code(0)
4410 explicit pg_log_dup_t(const pg_log_entry_t
& entry
)
4411 : reqid(entry
.reqid
), version(entry
.version
),
4412 user_version(entry
.user_version
),
4413 return_code(entry
.return_code
),
4414 op_returns(entry
.op_returns
)
4416 pg_log_dup_t(const eversion_t
& v
, version_t uv
,
4417 const osd_reqid_t
& rid
, int return_code
)
4418 : reqid(rid
), version(v
), user_version(uv
),
4419 return_code(return_code
)
4422 std::string
get_key_name() const;
4423 void encode(ceph::buffer::list
&bl
) const;
4424 void decode(ceph::buffer::list::const_iterator
&bl
);
4425 void dump(ceph::Formatter
*f
) const;
4426 static void generate_test_instances(std::list
<pg_log_dup_t
*>& o
);
4428 bool operator==(const pg_log_dup_t
&rhs
) const {
4429 return reqid
== rhs
.reqid
&&
4430 version
== rhs
.version
&&
4431 user_version
== rhs
.user_version
&&
4432 return_code
== rhs
.return_code
&&
4433 op_returns
== rhs
.op_returns
;
4435 bool operator!=(const pg_log_dup_t
&rhs
) const {
4436 return !(*this == rhs
);
4439 friend std::ostream
& operator<<(std::ostream
& out
, const pg_log_dup_t
& e
);
4441 WRITE_CLASS_ENCODER(pg_log_dup_t
)
4443 std::ostream
& operator<<(std::ostream
& out
, const pg_log_dup_t
& e
);
4446 * pg_log_t - incremental log of recent pg changes.
4448 * serves as a recovery queue for recent changes.
4452 * head - newest entry (update|delete)
4453 * tail - entry previous to oldest (update|delete) for which we have
4454 * complete negative information.
4455 * i.e. we can infer pg contents for any store whose last_update >= tail.
4457 eversion_t head
; // newest entry
4458 eversion_t tail
; // version prior to oldest
4461 // We can rollback rollback-able entries > can_rollback_to
4462 eversion_t can_rollback_to
;
4464 // always <= can_rollback_to, indicates how far stashed rollback
4465 // data can be found
4466 eversion_t rollback_info_trimmed_to
;
4470 mempool::osd_pglog::list
<pg_log_entry_t
> log
;
4472 // entries just for dup op detection ordered oldest to newest
4473 mempool::osd_pglog::list
<pg_log_dup_t
> dups
;
4475 pg_log_t() = default;
4476 pg_log_t(const eversion_t
&last_update
,
4477 const eversion_t
&log_tail
,
4478 const eversion_t
&can_rollback_to
,
4479 const eversion_t
&rollback_info_trimmed_to
,
4480 mempool::osd_pglog::list
<pg_log_entry_t
> &&entries
,
4481 mempool::osd_pglog::list
<pg_log_dup_t
> &&dup_entries
)
4482 : head(last_update
), tail(log_tail
), can_rollback_to(can_rollback_to
),
4483 rollback_info_trimmed_to(rollback_info_trimmed_to
),
4484 log(std::move(entries
)), dups(std::move(dup_entries
)) {}
4485 pg_log_t(const eversion_t
&last_update
,
4486 const eversion_t
&log_tail
,
4487 const eversion_t
&can_rollback_to
,
4488 const eversion_t
&rollback_info_trimmed_to
,
4489 const std::list
<pg_log_entry_t
> &entries
,
4490 const std::list
<pg_log_dup_t
> &dup_entries
)
4491 : head(last_update
), tail(log_tail
), can_rollback_to(can_rollback_to
),
4492 rollback_info_trimmed_to(rollback_info_trimmed_to
) {
4493 for (auto &&entry
: entries
) {
4494 log
.push_back(entry
);
4496 for (auto &&entry
: dup_entries
) {
4497 dups
.push_back(entry
);
4503 rollback_info_trimmed_to
= can_rollback_to
= head
= tail
= z
;
4508 eversion_t
get_rollback_info_trimmed_to() const {
4509 return rollback_info_trimmed_to
;
4511 eversion_t
get_can_rollback_to() const {
4512 return can_rollback_to
;
4516 pg_log_t
split_out_child(pg_t child_pgid
, unsigned split_bits
) {
4517 mempool::osd_pglog::list
<pg_log_entry_t
> oldlog
, childlog
;
4520 eversion_t old_tail
;
4521 unsigned mask
= ~((~0)<<split_bits
);
4522 for (auto i
= oldlog
.begin();
4525 if ((i
->soid
.get_hash() & mask
) == child_pgid
.m_seed
) {
4526 childlog
.push_back(*i
);
4533 // osd_reqid is unique, so it doesn't matter if there are extra
4534 // dup entries in each pg. To avoid storing oid with the dup
4535 // entries, just copy the whole list.
4536 auto childdups(dups
);
4542 rollback_info_trimmed_to
,
4543 std::move(childlog
),
4544 std::move(childdups
));
4547 mempool::osd_pglog::list
<pg_log_entry_t
> rewind_from_head(eversion_t newhead
) {
4548 ceph_assert(newhead
>= tail
);
4550 mempool::osd_pglog::list
<pg_log_entry_t
>::iterator p
= log
.end();
4551 mempool::osd_pglog::list
<pg_log_entry_t
> divergent
;
4553 if (p
== log
.begin()) {
4554 // yikes, the whole thing is divergent!
4556 swap(divergent
, log
);
4560 if (p
->version
.version
<= newhead
.version
) {
4562 * look at eversion.version here. we want to avoid a situation like:
4563 * our log: 100'10 (0'0) m 10000004d3a.00000000/head by client4225.1:18529
4564 * new log: 122'10 (0'0) m 10000004d3a.00000000/head by client4225.1:18529
4565 * lower_bound = 100'9
4566 * i.e, same request, different version. If the eversion.version is > the
4567 * lower_bound, we it is divergent.
4570 divergent
.splice(divergent
.begin(), log
, p
, log
.end());
4573 ceph_assert(p
->version
> newhead
);
4577 if (can_rollback_to
> newhead
)
4578 can_rollback_to
= newhead
;
4580 if (rollback_info_trimmed_to
> newhead
)
4581 rollback_info_trimmed_to
= newhead
;
4586 void merge_from(const std::vector
<pg_log_t
*>& slogs
, eversion_t last_update
) {
4589 // sort and merge dups
4590 std::multimap
<eversion_t
,pg_log_dup_t
> sorted
;
4591 for (auto& d
: dups
) {
4592 sorted
.emplace(d
.version
, d
);
4594 for (auto l
: slogs
) {
4595 for (auto& d
: l
->dups
) {
4596 sorted
.emplace(d
.version
, d
);
4600 for (auto& i
: sorted
) {
4601 dups
.push_back(i
.second
);
4606 can_rollback_to
= last_update
;
4607 rollback_info_trimmed_to
= last_update
;
4610 bool empty() const {
4615 return head
.version
== 0 && head
.epoch
== 0;
4618 uint64_t approx_size() const {
4619 return head
.version
- tail
.version
;
4622 static void filter_log(spg_t import_pgid
, const OSDMap
&curmap
,
4623 const std::string
&hit_set_namespace
, const pg_log_t
&in
,
4624 pg_log_t
&out
, pg_log_t
&reject
);
4627 * copy entries from the tail of another pg_log_t
4629 * @param other pg_log_t to copy from
4630 * @param from copy entries after this version
4632 void copy_after(CephContext
* cct
, const pg_log_t
&other
, eversion_t from
);
4635 * copy up to N entries
4637 * @param other source log
4638 * @param max max number of entries to copy
4640 void copy_up_to(CephContext
* cct
, const pg_log_t
&other
, int max
);
4642 std::ostream
& print(std::ostream
& out
) const;
4644 void encode(ceph::buffer::list
&bl
) const;
4645 void decode(ceph::buffer::list::const_iterator
&bl
, int64_t pool
= -1);
4646 void dump(ceph::Formatter
*f
) const;
4647 static void generate_test_instances(std::list
<pg_log_t
*>& o
);
4649 WRITE_CLASS_ENCODER(pg_log_t
)
4651 inline std::ostream
& operator<<(std::ostream
& out
, const pg_log_t
& log
)
4653 out
<< "log((" << log
.tail
<< "," << log
.head
<< "], crt="
4654 << log
.get_can_rollback_to() << ")";
4660 * pg_missing_t - summary of missing objects.
4662 * kept in memory, as a supplement to pg_log_t
4663 * also used to pass missing info in messages.
4665 struct pg_missing_item
{
4666 eversion_t need
, have
;
4667 ObjectCleanRegions clean_regions
;
4668 enum missing_flags_t
{
4672 pg_missing_item() : flags(FLAG_NONE
) {}
4673 explicit pg_missing_item(eversion_t n
) : need(n
), flags(FLAG_NONE
) {} // have no old version
4674 pg_missing_item(eversion_t n
, eversion_t h
, bool is_delete
=false, bool old_style
= false) :
4676 set_delete(is_delete
);
4678 clean_regions
.mark_fully_dirty();
4681 void encode(ceph::buffer::list
& bl
, uint64_t features
) const {
4683 if (HAVE_FEATURE(features
, SERVER_OCTOPUS
)) {
4684 // encoding a zeroed eversion_t to differentiate between OSD_RECOVERY_DELETES、
4685 // SERVER_OCTOPUS and legacy unversioned encoding - a need value of 0'0 is not
4686 // possible. This can be replaced with the legacy encoding
4687 encode(eversion_t(), bl
);
4688 encode(eversion_t(-1, -1), bl
);
4691 encode(static_cast<uint8_t>(flags
), bl
);
4692 encode(clean_regions
, bl
);
4694 encode(eversion_t(), bl
);
4697 encode(static_cast<uint8_t>(flags
), bl
);
4700 void decode(ceph::buffer::list::const_iterator
& bl
) {
4705 if(l
== eversion_t(-1, -1)) {
4711 flags
= static_cast<missing_flags_t
>(f
);
4712 decode(clean_regions
, bl
);
4714 // support OSD_RECOVERY_DELETES
4719 flags
= static_cast<missing_flags_t
>(f
);
4720 clean_regions
.mark_fully_dirty();
4724 void set_delete(bool is_delete
) {
4725 flags
= is_delete
? FLAG_DELETE
: FLAG_NONE
;
4728 bool is_delete() const {
4729 return (flags
& FLAG_DELETE
) == FLAG_DELETE
;
4732 std::string
flag_str() const {
4733 if (flags
== FLAG_NONE
) {
4740 void dump(ceph::Formatter
*f
) const {
4741 f
->dump_stream("need") << need
;
4742 f
->dump_stream("have") << have
;
4743 f
->dump_stream("flags") << flag_str();
4744 f
->dump_stream("clean_regions") << clean_regions
;
4746 static void generate_test_instances(std::list
<pg_missing_item
*>& o
) {
4747 o
.push_back(new pg_missing_item
);
4748 o
.push_back(new pg_missing_item
);
4749 o
.back()->need
= eversion_t(1, 2);
4750 o
.back()->have
= eversion_t(1, 1);
4751 o
.push_back(new pg_missing_item
);
4752 o
.back()->need
= eversion_t(3, 5);
4753 o
.back()->have
= eversion_t(3, 4);
4754 o
.back()->clean_regions
.mark_data_region_dirty(4096, 8192);
4755 o
.back()->clean_regions
.mark_omap_dirty();
4756 o
.back()->flags
= FLAG_DELETE
;
4758 bool operator==(const pg_missing_item
&rhs
) const {
4759 return need
== rhs
.need
&& have
== rhs
.have
&& flags
== rhs
.flags
;
4761 bool operator!=(const pg_missing_item
&rhs
) const {
4762 return !(*this == rhs
);
4765 WRITE_CLASS_ENCODER_FEATURES(pg_missing_item
)
4766 std::ostream
& operator<<(std::ostream
& out
, const pg_missing_item
&item
);
4768 class pg_missing_const_i
{
4770 virtual const std::map
<hobject_t
, pg_missing_item
> &
4771 get_items() const = 0;
4772 virtual const std::map
<version_t
, hobject_t
> &get_rmissing() const = 0;
4773 virtual bool get_may_include_deletes() const = 0;
4774 virtual unsigned int num_missing() const = 0;
4775 virtual bool have_missing() const = 0;
4776 virtual bool is_missing(const hobject_t
& oid
, pg_missing_item
*out
= nullptr) const = 0;
4777 virtual bool is_missing(const hobject_t
& oid
, eversion_t v
) const = 0;
4778 virtual ~pg_missing_const_i() {}
4782 template <bool Track
>
4783 class ChangeTracker
{
4785 void changed(const hobject_t
&obj
) {}
4786 template <typename F
>
4787 void get_changed(F
&&f
) const {}
4789 bool is_clean() const {
4794 class ChangeTracker
<true> {
4795 std::set
<hobject_t
> _changed
;
4797 void changed(const hobject_t
&obj
) {
4798 _changed
.insert(obj
);
4800 template <typename F
>
4801 void get_changed(F
&&f
) const {
4802 for (auto const &i
: _changed
) {
4809 bool is_clean() const {
4810 return _changed
.empty();
4814 template <bool TrackChanges
>
4815 class pg_missing_set
: public pg_missing_const_i
{
4816 using item
= pg_missing_item
;
4817 std::map
<hobject_t
, item
> missing
; // oid -> (need v, have v)
4818 std::map
<version_t
, hobject_t
> rmissing
; // v -> oid
4819 ChangeTracker
<TrackChanges
> tracker
;
4822 pg_missing_set() = default;
4824 template <typename missing_type
>
4825 pg_missing_set(const missing_type
&m
) {
4826 missing
= m
.get_items();
4827 rmissing
= m
.get_rmissing();
4828 may_include_deletes
= m
.get_may_include_deletes();
4829 for (auto &&i
: missing
)
4830 tracker
.changed(i
.first
);
4833 bool may_include_deletes
= false;
4835 const std::map
<hobject_t
, item
> &get_items() const override
{
4838 const std::map
<version_t
, hobject_t
> &get_rmissing() const override
{
4841 bool get_may_include_deletes() const override
{
4842 return may_include_deletes
;
4844 unsigned int num_missing() const override
{
4845 return missing
.size();
4847 bool have_missing() const override
{
4848 return !missing
.empty();
4850 void merge(const pg_log_entry_t
& e
) {
4851 auto miter
= missing
.find(e
.soid
);
4852 if (miter
!= missing
.end() && miter
->second
.have
!= eversion_t() && e
.version
> miter
->second
.have
)
4853 miter
->second
.clean_regions
.merge(e
.clean_regions
);
4855 bool is_missing(const hobject_t
& oid
, pg_missing_item
*out
= nullptr) const override
{
4856 auto iter
= missing
.find(oid
);
4857 if (iter
== missing
.end())
4860 *out
= iter
->second
;
4863 bool is_missing(const hobject_t
& oid
, eversion_t v
) const override
{
4864 std::map
<hobject_t
, item
>::const_iterator m
=
4866 if (m
== missing
.end())
4868 const item
&item(m
->second
);
4873 eversion_t
get_oldest_need() const {
4874 if (missing
.empty()) {
4875 return eversion_t();
4877 auto it
= missing
.find(rmissing
.begin()->second
);
4878 ceph_assert(it
!= missing
.end());
4879 return it
->second
.need
;
4882 void claim(pg_missing_set
&& o
) {
4883 static_assert(!TrackChanges
, "Can't use claim with TrackChanges");
4884 missing
= std::move(o
.missing
);
4885 rmissing
= std::move(o
.rmissing
);
4889 * this needs to be called in log order as we extend the log. it
4890 * assumes missing is accurate up through the previous log entry.
4892 void add_next_event(const pg_log_entry_t
& e
) {
4893 std::map
<hobject_t
, item
>::iterator missing_it
;
4894 missing_it
= missing
.find(e
.soid
);
4895 bool is_missing_divergent_item
= missing_it
!= missing
.end();
4896 if (e
.prior_version
== eversion_t() || e
.is_clone()) {
4898 if (is_missing_divergent_item
) { // use iterator
4899 rmissing
.erase(missing_it
->second
.need
.version
);
4901 missing_it
->second
= item(e
.version
, eversion_t(), e
.is_delete());
4902 missing_it
->second
.clean_regions
.mark_fully_dirty();
4904 // create new element in missing map
4906 missing
[e
.soid
] = item(e
.version
, eversion_t(), e
.is_delete());
4907 missing
[e
.soid
].clean_regions
.mark_fully_dirty();
4909 } else if (is_missing_divergent_item
) {
4910 // already missing (prior).
4911 rmissing
.erase((missing_it
->second
).need
.version
);
4912 missing_it
->second
.need
= e
.version
; // leave .have unchanged.
4913 missing_it
->second
.set_delete(e
.is_delete());
4914 if (e
.is_lost_revert())
4915 missing_it
->second
.clean_regions
.mark_fully_dirty();
4917 missing_it
->second
.clean_regions
.merge(e
.clean_regions
);
4919 // not missing, we must have prior_version (if any)
4920 ceph_assert(!is_missing_divergent_item
);
4921 missing
[e
.soid
] = item(e
.version
, e
.prior_version
, e
.is_delete());
4922 if (e
.is_lost_revert())
4923 missing
[e
.soid
].clean_regions
.mark_fully_dirty();
4925 missing
[e
.soid
].clean_regions
= e
.clean_regions
;
4927 rmissing
[e
.version
.version
] = e
.soid
;
4928 tracker
.changed(e
.soid
);
4931 void revise_need(hobject_t oid
, eversion_t need
, bool is_delete
) {
4932 auto p
= missing
.find(oid
);
4933 if (p
!= missing
.end()) {
4934 rmissing
.erase((p
->second
).need
.version
);
4935 p
->second
.need
= need
; // do not adjust .have
4936 p
->second
.set_delete(is_delete
);
4937 p
->second
.clean_regions
.mark_fully_dirty();
4939 missing
[oid
] = item(need
, eversion_t(), is_delete
);
4940 missing
[oid
].clean_regions
.mark_fully_dirty();
4942 rmissing
[need
.version
] = oid
;
4944 tracker
.changed(oid
);
4947 void revise_have(hobject_t oid
, eversion_t have
) {
4948 auto p
= missing
.find(oid
);
4949 if (p
!= missing
.end()) {
4950 tracker
.changed(oid
);
4951 (p
->second
).have
= have
;
4955 void mark_fully_dirty(const hobject_t
& oid
) {
4956 auto p
= missing
.find(oid
);
4957 if (p
!= missing
.end()) {
4958 tracker
.changed(oid
);
4959 (p
->second
).clean_regions
.mark_fully_dirty();
4963 void add(const hobject_t
& oid
, eversion_t need
, eversion_t have
,
4965 missing
[oid
] = item(need
, have
, is_delete
, true);
4966 rmissing
[need
.version
] = oid
;
4967 tracker
.changed(oid
);
4970 void add(const hobject_t
& oid
, pg_missing_item
&& item
) {
4971 rmissing
[item
.need
.version
] = oid
;
4972 missing
.insert({oid
, std::move(item
)});
4973 tracker
.changed(oid
);
4976 void rm(const hobject_t
& oid
, eversion_t v
) {
4977 std::map
<hobject_t
, item
>::iterator p
= missing
.find(oid
);
4978 if (p
!= missing
.end() && p
->second
.need
<= v
)
4982 void rm(std::map
<hobject_t
, item
>::const_iterator m
) {
4983 tracker
.changed(m
->first
);
4984 rmissing
.erase(m
->second
.need
.version
);
4988 void got(const hobject_t
& oid
, eversion_t v
) {
4989 std::map
<hobject_t
, item
>::iterator p
= missing
.find(oid
);
4990 ceph_assert(p
!= missing
.end());
4991 ceph_assert(p
->second
.need
<= v
|| p
->second
.is_delete());
4995 void got(std::map
<hobject_t
, item
>::const_iterator m
) {
4996 tracker
.changed(m
->first
);
4997 rmissing
.erase(m
->second
.need
.version
);
5003 unsigned split_bits
,
5004 pg_missing_set
*omissing
) {
5005 omissing
->may_include_deletes
= may_include_deletes
;
5006 unsigned mask
= ~((~0)<<split_bits
);
5007 for (std::map
<hobject_t
, item
>::iterator i
= missing
.begin();
5010 if ((i
->first
.get_hash() & mask
) == child_pgid
.m_seed
) {
5011 omissing
->add(i
->first
, i
->second
.need
, i
->second
.have
,
5012 i
->second
.is_delete());
5021 for (auto const &i
: missing
)
5022 tracker
.changed(i
.first
);
5027 void encode(ceph::buffer::list
&bl
, uint64_t features
) const {
5028 ENCODE_START(5, 2, bl
)
5029 encode(missing
, bl
, features
);
5030 encode(may_include_deletes
, bl
);
5033 void decode(ceph::buffer::list::const_iterator
&bl
, int64_t pool
= -1) {
5034 for (auto const &i
: missing
)
5035 tracker
.changed(i
.first
);
5036 DECODE_START_LEGACY_COMPAT_LEN(5, 2, 2, bl
);
5037 decode(missing
, bl
);
5038 if (struct_v
>= 4) {
5039 decode(may_include_deletes
, bl
);
5044 // Handle hobject_t upgrade
5045 std::map
<hobject_t
, item
> tmp
;
5046 for (std::map
<hobject_t
, item
>::iterator i
=
5050 if (!i
->first
.is_max() && i
->first
.pool
== -1) {
5051 hobject_t
to_insert(i
->first
);
5052 to_insert
.pool
= pool
;
5053 tmp
[to_insert
] = i
->second
;
5059 missing
.insert(tmp
.begin(), tmp
.end());
5062 for (std::map
<hobject_t
,item
>::iterator it
=
5064 it
!= missing
.end();
5066 rmissing
[it
->second
.need
.version
] = it
->first
;
5067 for (auto const &i
: missing
)
5068 tracker
.changed(i
.first
);
5070 void dump(ceph::Formatter
*f
) const {
5071 f
->open_array_section("missing");
5072 for (std::map
<hobject_t
,item
>::const_iterator p
=
5073 missing
.begin(); p
!= missing
.end(); ++p
) {
5074 f
->open_object_section("item");
5075 f
->dump_stream("object") << p
->first
;
5080 f
->dump_bool("may_include_deletes", may_include_deletes
);
5082 template <typename F
>
5083 void filter_objects(F
&&f
) {
5084 for (auto i
= missing
.begin(); i
!= missing
.end();) {
5092 static void generate_test_instances(std::list
<pg_missing_set
*>& o
) {
5093 o
.push_back(new pg_missing_set
);
5094 o
.back()->may_include_deletes
= true;
5095 o
.push_back(new pg_missing_set
);
5097 hobject_t(object_t("foo"), "foo", 123, 456, 0, ""),
5098 eversion_t(5, 6), eversion_t(5, 1), false);
5099 o
.back()->may_include_deletes
= true;
5100 o
.push_back(new pg_missing_set
);
5102 hobject_t(object_t("foo"), "foo", 123, 456, 0, ""),
5103 eversion_t(5, 6), eversion_t(5, 1), true);
5104 o
.back()->may_include_deletes
= true;
5106 template <typename F
>
5107 void get_changed(F
&&f
) const {
5108 tracker
.get_changed(f
);
5113 bool is_clean() const {
5114 return tracker
.is_clean();
5116 template <typename missing_t
>
5117 bool debug_verify_from_init(
5118 const missing_t
&init_missing
,
5119 std::ostream
*oss
) const {
5122 auto check_missing(init_missing
.get_items());
5123 tracker
.get_changed([&](const hobject_t
&hoid
) {
5124 check_missing
.erase(hoid
);
5125 if (missing
.count(hoid
)) {
5126 check_missing
.insert(*(missing
.find(hoid
)));
5130 if (check_missing
.size() != missing
.size()) {
5132 *oss
<< "Size mismatch, check: " << check_missing
.size()
5133 << ", actual: " << missing
.size() << "\n";
5137 for (auto &i
: missing
) {
5138 if (!check_missing
.count(i
.first
)) {
5140 *oss
<< "check_missing missing " << i
.first
<< "\n";
5142 } else if (check_missing
[i
.first
] != i
.second
) {
5144 *oss
<< "check_missing missing item mismatch on " << i
.first
5145 << ", check: " << check_missing
[i
.first
]
5146 << ", actual: " << i
.second
<< "\n";
5151 *oss
<< "check_missing: " << check_missing
<< "\n";
5152 std::set
<hobject_t
> changed
;
5153 tracker
.get_changed([&](const hobject_t
&hoid
) { changed
.insert(hoid
); });
5154 *oss
<< "changed: " << changed
<< "\n";
5159 template <bool TrackChanges
>
5161 const pg_missing_set
<TrackChanges
> &c
, ceph::buffer::list
&bl
, uint64_t features
=0) {
5163 c
.encode(bl
, features
);
5164 ENCODE_DUMP_POST(cl
);
5166 template <bool TrackChanges
>
5167 void decode(pg_missing_set
<TrackChanges
> &c
, ceph::buffer::list::const_iterator
&p
) {
5170 template <bool TrackChanges
>
5171 std::ostream
& operator<<(std::ostream
& out
, const pg_missing_set
<TrackChanges
> &missing
)
5173 out
<< "missing(" << missing
.num_missing()
5174 << " may_include_deletes = " << missing
.may_include_deletes
;
5175 //if (missing.num_lost()) out << ", " << missing.num_lost() << " lost";
5180 using pg_missing_t
= pg_missing_set
<false>;
5181 using pg_missing_tracker_t
= pg_missing_set
<true>;
5187 * pg list objects response format
5191 template<typename T
>
5192 struct pg_nls_response_template
{
5193 collection_list_handle_t handle
;
5194 std::vector
<T
> entries
;
5196 void encode(ceph::buffer::list
& bl
) const {
5197 ENCODE_START(1, 1, bl
);
5199 __u32 n
= (__u32
)entries
.size();
5201 for (auto i
= entries
.begin(); i
!= entries
.end(); ++i
) {
5202 encode(i
->nspace
, bl
);
5204 encode(i
->locator
, bl
);
5208 void decode(ceph::buffer::list::const_iterator
& bl
) {
5209 DECODE_START(1, bl
);
5216 decode(i
.nspace
, bl
);
5218 decode(i
.locator
, bl
);
5219 entries
.push_back(i
);
5223 void dump(ceph::Formatter
*f
) const {
5224 f
->dump_stream("handle") << handle
;
5225 f
->open_array_section("entries");
5226 for (auto p
= entries
.begin(); p
!= entries
.end(); ++p
) {
5227 f
->open_object_section("object");
5228 f
->dump_string("namespace", p
->nspace
);
5229 f
->dump_string("object", p
->oid
);
5230 f
->dump_string("key", p
->locator
);
5235 static void generate_test_instances(std::list
<pg_nls_response_template
<T
>*>& o
) {
5236 o
.push_back(new pg_nls_response_template
<T
>);
5237 o
.push_back(new pg_nls_response_template
<T
>);
5238 o
.back()->handle
= hobject_t(object_t("hi"), "key", 1, 2, -1, "");
5239 o
.back()->entries
.push_back(librados::ListObjectImpl("", "one", ""));
5240 o
.back()->entries
.push_back(librados::ListObjectImpl("", "two", "twokey"));
5241 o
.back()->entries
.push_back(librados::ListObjectImpl("", "three", ""));
5242 o
.push_back(new pg_nls_response_template
<T
>);
5243 o
.back()->handle
= hobject_t(object_t("hi"), "key", 3, 4, -1, "");
5244 o
.back()->entries
.push_back(librados::ListObjectImpl("n1", "n1one", ""));
5245 o
.back()->entries
.push_back(librados::ListObjectImpl("n1", "n1two", "n1twokey"));
5246 o
.back()->entries
.push_back(librados::ListObjectImpl("n1", "n1three", ""));
5247 o
.push_back(new pg_nls_response_template
<T
>);
5248 o
.back()->handle
= hobject_t(object_t("hi"), "key", 5, 6, -1, "");
5249 o
.back()->entries
.push_back(librados::ListObjectImpl("", "one", ""));
5250 o
.back()->entries
.push_back(librados::ListObjectImpl("", "two", "twokey"));
5251 o
.back()->entries
.push_back(librados::ListObjectImpl("", "three", ""));
5252 o
.back()->entries
.push_back(librados::ListObjectImpl("n1", "n1one", ""));
5253 o
.back()->entries
.push_back(librados::ListObjectImpl("n1", "n1two", "n1twokey"));
5254 o
.back()->entries
.push_back(librados::ListObjectImpl("n1", "n1three", ""));
5258 using pg_nls_response_t
= pg_nls_response_template
<librados::ListObjectImpl
>;
5260 WRITE_CLASS_ENCODER(pg_nls_response_t
)
5262 // For backwards compatibility with older OSD requests
5263 struct pg_ls_response_t
{
5264 collection_list_handle_t handle
;
5265 std::list
<std::pair
<object_t
, std::string
> > entries
;
5267 void encode(ceph::buffer::list
& bl
) const {
5272 encode(entries
, bl
);
5274 void decode(ceph::buffer::list::const_iterator
& bl
) {
5278 ceph_assert(v
== 1);
5280 decode(entries
, bl
);
5282 void dump(ceph::Formatter
*f
) const {
5283 f
->dump_stream("handle") << handle
;
5284 f
->open_array_section("entries");
5285 for (std::list
<std::pair
<object_t
, std::string
> >::const_iterator p
= entries
.begin(); p
!= entries
.end(); ++p
) {
5286 f
->open_object_section("object");
5287 f
->dump_stream("object") << p
->first
;
5288 f
->dump_string("key", p
->second
);
5293 static void generate_test_instances(std::list
<pg_ls_response_t
*>& o
) {
5294 o
.push_back(new pg_ls_response_t
);
5295 o
.push_back(new pg_ls_response_t
);
5296 o
.back()->handle
= hobject_t(object_t("hi"), "key", 1, 2, -1, "");
5297 o
.back()->entries
.push_back(std::make_pair(object_t("one"), std::string()));
5298 o
.back()->entries
.push_back(std::make_pair(object_t("two"), std::string("twokey")));
5302 WRITE_CLASS_ENCODER(pg_ls_response_t
)
5305 * object_copy_cursor_t
5307 struct object_copy_cursor_t
{
5308 uint64_t data_offset
;
5309 std::string omap_offset
;
5314 object_copy_cursor_t()
5316 attr_complete(false),
5317 data_complete(false),
5318 omap_complete(false)
5321 bool is_initial() const {
5322 return !attr_complete
&& data_offset
== 0 && omap_offset
.empty();
5324 bool is_complete() const {
5325 return attr_complete
&& data_complete
&& omap_complete
;
5328 static void generate_test_instances(std::list
<object_copy_cursor_t
*>& o
);
5329 void encode(ceph::buffer::list
& bl
) const;
5330 void decode(ceph::buffer::list::const_iterator
&bl
);
5331 void dump(ceph::Formatter
*f
) const;
5333 WRITE_CLASS_ENCODER(object_copy_cursor_t
)
5336 * object_copy_data_t
5338 * Return data from a copy request. The semantics are a little strange
5339 * as a result of the encoding's heritage.
5341 * In particular, the sender unconditionally fills in the cursor (from what
5342 * it receives and sends), the size, and the mtime, but is responsible for
5343 * figuring out whether it should put any data in the attrs, data, or
5344 * omap members (corresponding to xattrs, object data, and the omap entries)
5345 * based on external data (the client includes a max amount to return with
5346 * the copy request). The client then looks into the attrs, data, and/or omap
5347 * based on the contents of the cursor.
5349 struct object_copy_data_t
{
5351 FLAG_DATA_DIGEST
= 1<<0,
5352 FLAG_OMAP_DIGEST
= 1<<1,
5354 object_copy_cursor_t cursor
;
5357 uint32_t data_digest
, omap_digest
;
5359 std::map
<std::string
, ceph::buffer::list
, std::less
<>> attrs
;
5360 ceph::buffer::list data
;
5361 ceph::buffer::list omap_header
;
5362 ceph::buffer::list omap_data
;
5364 /// which snaps we are defined for (if a snap and not the head)
5365 std::vector
<snapid_t
> snaps
;
5366 /// latest snap seq for the object (if head)
5369 /// recent reqids on this object
5370 mempool::osd_pglog::vector
<std::pair
<osd_reqid_t
, version_t
> > reqids
;
5372 /// map reqids by index to error return code (if any)
5373 mempool::osd_pglog::map
<uint32_t, int> reqid_return_codes
;
5375 uint64_t truncate_seq
;
5376 uint64_t truncate_size
;
5379 object_copy_data_t() :
5380 size((uint64_t)-1), data_digest(-1),
5381 omap_digest(-1), flags(0),
5385 static void generate_test_instances(std::list
<object_copy_data_t
*>& o
);
5386 void encode(ceph::buffer::list
& bl
, uint64_t features
) const;
5387 void decode(ceph::buffer::list::const_iterator
& bl
);
5388 void dump(ceph::Formatter
*f
) const;
5390 WRITE_CLASS_ENCODER_FEATURES(object_copy_data_t
)
5395 struct pg_create_t
{
5396 epoch_t created
; // epoch pg created
5397 pg_t parent
; // split from parent (if != pg_t())
5401 : created(0), split_bits(0) {}
5402 pg_create_t(unsigned c
, pg_t p
, int s
)
5403 : created(c
), parent(p
), split_bits(s
) {}
5405 void encode(ceph::buffer::list
&bl
) const;
5406 void decode(ceph::buffer::list::const_iterator
&bl
);
5407 void dump(ceph::Formatter
*f
) const;
5408 static void generate_test_instances(std::list
<pg_create_t
*>& o
);
5410 WRITE_CLASS_ENCODER(pg_create_t
)
5412 // -----------------------------------------
5414 class ObjectExtent
{
5416 * ObjectExtents are used for specifying IO behavior against RADOS
5417 * objects when one is using the ObjectCacher.
5419 * To use this in a real system, *every member* must be filled
5420 * out correctly. In particular, make sure to initialize the
5421 * oloc correctly, as its default values are deliberate poison
5422 * and will cause internal ObjectCacher asserts.
5424 * Similarly, your buffer_extents vector *must* specify a total
5425 * size equal to your length. If the buffer_extents inadvertently
5426 * contain less space than the length member specifies, you
5427 * will get unintelligible asserts deep in the ObjectCacher.
5429 * If you are trying to do testing and don't care about actual
5430 * RADOS function, the simplest thing to do is to initialize
5431 * the ObjectExtent (truncate_size can be 0), create a single entry
5432 * in buffer_extents matching the length, and set oloc.pool to 0.
5435 object_t oid
; // object id
5437 uint64_t offset
; // in object
5438 uint64_t length
; // in object
5439 uint64_t truncate_size
; // in object
5441 object_locator_t oloc
; // object locator (pool etc)
5443 std::vector
<std::pair
<uint64_t,uint64_t> > buffer_extents
; // off -> len. extents in buffer being mapped (may be fragmented bc of striping!)
5445 ObjectExtent() : objectno(0), offset(0), length(0), truncate_size(0) {}
5446 ObjectExtent(object_t o
, uint64_t ono
, uint64_t off
, uint64_t l
, uint64_t ts
) :
5447 oid(o
), objectno(ono
), offset(off
), length(l
), truncate_size(ts
) { }
5450 inline std::ostream
& operator<<(std::ostream
& out
, const ObjectExtent
&ex
)
5452 return out
<< "extent("
5453 << ex
.oid
<< " (" << ex
.objectno
<< ") in " << ex
.oloc
5454 << " " << ex
.offset
<< "~" << ex
.length
5455 << " -> " << ex
.buffer_extents
5460 // ---------------------------------------
5462 class OSDSuperblock
{
5464 uuid_d cluster_fsid
, osd_fsid
;
5465 int32_t whoami
= -1; // my role in this fs.
5466 epoch_t current_epoch
= 0; // most recent epoch
5467 epoch_t oldest_map
= 0, newest_map
= 0; // oldest/newest maps we have.
5468 double weight
= 0.0;
5470 CompatSet compat_features
;
5472 // last interval over which i mounted and was then active
5473 epoch_t mounted
= 0; // last epoch i mounted
5474 epoch_t clean_thru
= 0; // epoch i was active and clean thru
5476 epoch_t purged_snaps_last
= 0;
5477 utime_t last_purged_snaps_scrub
;
5479 void encode(ceph::buffer::list
&bl
) const;
5480 void decode(ceph::buffer::list::const_iterator
&bl
);
5481 void dump(ceph::Formatter
*f
) const;
5482 static void generate_test_instances(std::list
<OSDSuperblock
*>& o
);
5484 WRITE_CLASS_ENCODER(OSDSuperblock
)
5486 inline std::ostream
& operator<<(std::ostream
& out
, const OSDSuperblock
& sb
)
5488 return out
<< "sb(" << sb
.cluster_fsid
5489 << " osd." << sb
.whoami
5490 << " " << sb
.osd_fsid
5491 << " e" << sb
.current_epoch
5492 << " [" << sb
.oldest_map
<< "," << sb
.newest_map
<< "]"
5493 << " lci=[" << sb
.mounted
<< "," << sb
.clean_thru
<< "]"
5506 * attached to object head. describes most recent snap context, and
5507 * set of existing clones.
5511 // NOTE: this is for pre-octopus compatibility only! remove in Q release
5512 std::vector
<snapid_t
> snaps
; // descending
5513 std::vector
<snapid_t
> clones
; // ascending
5514 std::map
<snapid_t
, interval_set
<uint64_t> > clone_overlap
; // overlap w/ next newest
5515 std::map
<snapid_t
, uint64_t> clone_size
;
5516 std::map
<snapid_t
, std::vector
<snapid_t
>> clone_snaps
; // descending
5518 SnapSet() : seq(0) {}
5519 explicit SnapSet(ceph::buffer::list
& bl
) {
5520 auto p
= std::cbegin(bl
);
5524 /// populate SnapSet from a librados::snap_set_t
5525 void from_snap_set(const librados::snap_set_t
& ss
, bool legacy
);
5527 /// get space accounted to clone
5528 uint64_t get_clone_bytes(snapid_t clone
) const;
5530 void encode(ceph::buffer::list
& bl
) const;
5531 void decode(ceph::buffer::list::const_iterator
& bl
);
5532 void dump(ceph::Formatter
*f
) const;
5533 static void generate_test_instances(std::list
<SnapSet
*>& o
);
5535 SnapContext
get_ssc_as_of(snapid_t as_of
) const {
5538 for (auto p
= clone_snaps
.rbegin();
5539 p
!= clone_snaps
.rend();
5541 for (auto snap
: p
->second
) {
5542 if (snap
<= as_of
) {
5543 out
.snaps
.push_back(snap
);
5551 SnapSet
get_filtered(const pg_pool_t
&pinfo
) const;
5552 void filter(const pg_pool_t
&pinfo
);
5554 WRITE_CLASS_ENCODER(SnapSet
)
5556 std::ostream
& operator<<(std::ostream
& out
, const SnapSet
& cs
);
5561 #define SS_ATTR "snapset"
5563 struct watch_info_t
{
5565 uint32_t timeout_seconds
;
5568 watch_info_t() : cookie(0), timeout_seconds(0) { }
5569 watch_info_t(uint64_t c
, uint32_t t
, const entity_addr_t
& a
) : cookie(c
), timeout_seconds(t
), addr(a
) {}
5571 void encode(ceph::buffer::list
& bl
, uint64_t features
) const;
5572 void decode(ceph::buffer::list::const_iterator
& bl
);
5573 void dump(ceph::Formatter
*f
) const;
5574 static void generate_test_instances(std::list
<watch_info_t
*>& o
);
5576 WRITE_CLASS_ENCODER_FEATURES(watch_info_t
)
5578 static inline bool operator==(const watch_info_t
& l
, const watch_info_t
& r
) {
5579 return l
.cookie
== r
.cookie
&& l
.timeout_seconds
== r
.timeout_seconds
5580 && l
.addr
== r
.addr
;
5583 static inline std::ostream
& operator<<(std::ostream
& out
, const watch_info_t
& w
) {
5584 return out
<< "watch(cookie " << w
.cookie
<< " " << w
.timeout_seconds
<< "s"
5585 << " " << w
.addr
<< ")";
5588 struct notify_info_t
{
5592 ceph::buffer::list bl
;
5595 static inline std::ostream
& operator<<(std::ostream
& out
, const notify_info_t
& n
) {
5596 return out
<< "notify(cookie " << n
.cookie
5597 << " notify" << n
.notify_id
5598 << " " << n
.timeout
<< "s)";
5601 class object_ref_delta_t
{
5602 std::map
<hobject_t
, int> ref_delta
;
5605 object_ref_delta_t() = default;
5606 object_ref_delta_t(const object_ref_delta_t
&) = default;
5607 object_ref_delta_t(object_ref_delta_t
&&) = default;
5609 object_ref_delta_t(decltype(ref_delta
) &&ref_delta
)
5610 : ref_delta(std::move(ref_delta
)) {}
5611 object_ref_delta_t(const decltype(ref_delta
) &ref_delta
)
5612 : ref_delta(ref_delta
) {}
5614 object_ref_delta_t
&operator=(const object_ref_delta_t
&) = default;
5615 object_ref_delta_t
&operator=(object_ref_delta_t
&&) = default;
5617 void dec_ref(const hobject_t
&hoid
, unsigned num
=1) {
5618 mut_ref(hoid
, -num
);
5620 void inc_ref(const hobject_t
&hoid
, unsigned num
=1) {
5623 void mut_ref(const hobject_t
&hoid
, int num
) {
5624 [[maybe_unused
]] auto [iter
, _
] = ref_delta
.try_emplace(hoid
, 0);
5625 iter
->second
+= num
;
5626 if (iter
->second
== 0)
5627 ref_delta
.erase(iter
);
5630 auto begin() const { return ref_delta
.begin(); }
5631 auto end() const { return ref_delta
.end(); }
5632 auto find(hobject_t
&key
) const { return ref_delta
.find(key
); }
5634 bool operator==(const object_ref_delta_t
&rhs
) const {
5635 return ref_delta
== rhs
.ref_delta
;
5637 bool operator!=(const object_ref_delta_t
&rhs
) const {
5638 return !(*this == rhs
);
5641 return ref_delta
.empty();
5644 return ref_delta
.size();
5646 friend std::ostream
& operator<<(std::ostream
& out
, const object_ref_delta_t
& ci
);
5649 struct chunk_info_t
{
5653 FLAG_HAS_REFERENCE
= 4,
5654 FLAG_HAS_FINGERPRINT
= 8,
5659 cflag_t flags
; // FLAG_*
5661 chunk_info_t() : offset(0), length(0), flags((cflag_t
)0) { }
5662 chunk_info_t(uint32_t offset
, uint32_t length
, hobject_t oid
) :
5663 offset(offset
), length(length
), oid(oid
), flags((cflag_t
)0) { }
5665 static std::string
get_flag_string(uint64_t flags
) {
5667 if (flags
& FLAG_DIRTY
) {
5670 if (flags
& FLAG_MISSING
) {
5673 if (flags
& FLAG_HAS_REFERENCE
) {
5674 r
+= "|has_reference";
5676 if (flags
& FLAG_HAS_FINGERPRINT
) {
5677 r
+= "|has_fingerprint";
5683 bool test_flag(cflag_t f
) const {
5684 return (flags
& f
) == f
;
5686 void set_flag(cflag_t f
) {
5687 flags
= (cflag_t
)(flags
| f
);
5689 void set_flags(cflag_t f
) {
5692 void clear_flag(cflag_t f
) {
5693 flags
= (cflag_t
)(flags
& ~f
);
5695 void clear_flags() {
5698 bool is_dirty() const {
5699 return test_flag(FLAG_DIRTY
);
5701 bool is_missing() const {
5702 return test_flag(FLAG_MISSING
);
5704 bool has_reference() const {
5705 return test_flag(FLAG_HAS_REFERENCE
);
5707 bool has_fingerprint() const {
5708 return test_flag(FLAG_HAS_FINGERPRINT
);
5710 void encode(ceph::buffer::list
&bl
) const;
5711 void decode(ceph::buffer::list::const_iterator
&bl
);
5712 void dump(ceph::Formatter
*f
) const;
5713 friend std::ostream
& operator<<(std::ostream
& out
, const chunk_info_t
& ci
);
5714 bool operator==(const chunk_info_t
& cit
) const;
5715 bool operator!=(const chunk_info_t
& cit
) const {
5716 return !(cit
== *this);
5719 WRITE_CLASS_ENCODER(chunk_info_t
)
5720 std::ostream
& operator<<(std::ostream
& out
, const chunk_info_t
& ci
);
5722 struct object_info_t
;
5723 struct object_manifest_t
{
5729 uint8_t type
; // redirect, chunked, ...
5730 hobject_t redirect_target
;
5731 std::map
<uint64_t, chunk_info_t
> chunk_map
;
5733 object_manifest_t() : type(0) { }
5734 object_manifest_t(uint8_t type
, const hobject_t
& redirect_target
)
5735 : type(type
), redirect_target(redirect_target
) { }
5737 bool is_empty() const {
5738 return type
== TYPE_NONE
;
5740 bool is_redirect() const {
5741 return type
== TYPE_REDIRECT
;
5743 bool is_chunked() const {
5744 return type
== TYPE_CHUNKED
;
5746 static std::string_view
get_type_name(uint8_t m
) {
5748 case TYPE_NONE
: return "none";
5749 case TYPE_REDIRECT
: return "redirect";
5750 case TYPE_CHUNKED
: return "chunked";
5751 default: return "unknown";
5754 std::string_view
get_type_name() const {
5755 return get_type_name(type
);
5759 redirect_target
= hobject_t();
5764 * calc_refs_to_inc_on_set
5766 * Takes a manifest and returns the set of refs to
5767 * increment upon set-chunk
5769 * l should be nullptr if there are no clones, or
5770 * l and g may each be null if the corresponding clone does not exist.
5771 * *this contains the set of new references to set
5774 void calc_refs_to_inc_on_set(
5775 const object_manifest_t
* g
, ///< [in] manifest for clone > *this
5776 const object_manifest_t
* l
, ///< [in] manifest for clone < *this
5777 object_ref_delta_t
&delta
///< [out] set of refs to drop
5781 * calc_refs_to_drop_on_modify
5783 * Takes a manifest and returns the set of refs to
5784 * drop upon modification
5786 * l should be nullptr if there are no clones, or
5787 * l may be null if the corresponding clone does not exist.
5790 void calc_refs_to_drop_on_modify(
5791 const object_manifest_t
* l
, ///< [in] manifest for previous clone
5792 const ObjectCleanRegions
& clean_regions
, ///< [in] clean regions
5793 object_ref_delta_t
&delta
///< [out] set of refs to drop
5797 * calc_refs_to_drop_on_removal
5799 * Takes the two adjacent manifests and returns the set of refs to
5800 * drop upon removal of the clone containing *this.
5802 * g should be nullptr if *this is on HEAD, l should be nullptr if
5803 * *this is on the oldest clone (or head if there are no clones).
5805 void calc_refs_to_drop_on_removal(
5806 const object_manifest_t
* g
, ///< [in] manifest for clone > *this
5807 const object_manifest_t
* l
, ///< [in] manifest for clone < *this
5808 object_ref_delta_t
&delta
///< [out] set of refs to drop
5811 static void generate_test_instances(std::list
<object_manifest_t
*>& o
);
5812 void encode(ceph::buffer::list
&bl
) const;
5813 void decode(ceph::buffer::list::const_iterator
&bl
);
5814 void dump(ceph::Formatter
*f
) const;
5815 friend std::ostream
& operator<<(std::ostream
& out
, const object_info_t
& oi
);
5817 WRITE_CLASS_ENCODER(object_manifest_t
)
5818 std::ostream
& operator<<(std::ostream
& out
, const object_manifest_t
& oi
);
5820 struct object_info_t
{
5822 eversion_t version
, prior_version
;
5823 version_t user_version
;
5824 osd_reqid_t last_reqid
;
5828 utime_t local_mtime
; // local mtime
5830 // note: these are currently encoded into a total 16 bits; see
5831 // encode()/decode() for the weirdness.
5834 FLAG_WHITEOUT
= 1<<1, // object logically does not exist
5835 FLAG_DIRTY
= 1<<2, // object has been modified since last flushed or undirtied
5836 FLAG_OMAP
= 1<<3, // has (or may have) some/any omap data
5837 FLAG_DATA_DIGEST
= 1<<4, // has data crc
5838 FLAG_OMAP_DIGEST
= 1<<5, // has omap crc
5839 FLAG_CACHE_PIN
= 1<<6, // pin the object in cache tier
5840 FLAG_MANIFEST
= 1<<7, // has manifest
5841 FLAG_USES_TMAP
= 1<<8, // deprecated; no longer used
5842 FLAG_REDIRECT_HAS_REFERENCE
= 1<<9, // has reference
5847 static std::string
get_flag_string(flag_t flags
) {
5849 std::vector
<std::string
> sv
= get_flag_vector(flags
);
5850 for (auto ss
: sv
) {
5851 s
+= std::string("|") + ss
;
5857 static std::vector
<std::string
> get_flag_vector(flag_t flags
) {
5858 std::vector
<std::string
> sv
;
5859 if (flags
& FLAG_LOST
)
5860 sv
.insert(sv
.end(), "lost");
5861 if (flags
& FLAG_WHITEOUT
)
5862 sv
.insert(sv
.end(), "whiteout");
5863 if (flags
& FLAG_DIRTY
)
5864 sv
.insert(sv
.end(), "dirty");
5865 if (flags
& FLAG_USES_TMAP
)
5866 sv
.insert(sv
.end(), "uses_tmap");
5867 if (flags
& FLAG_OMAP
)
5868 sv
.insert(sv
.end(), "omap");
5869 if (flags
& FLAG_DATA_DIGEST
)
5870 sv
.insert(sv
.end(), "data_digest");
5871 if (flags
& FLAG_OMAP_DIGEST
)
5872 sv
.insert(sv
.end(), "omap_digest");
5873 if (flags
& FLAG_CACHE_PIN
)
5874 sv
.insert(sv
.end(), "cache_pin");
5875 if (flags
& FLAG_MANIFEST
)
5876 sv
.insert(sv
.end(), "manifest");
5877 if (flags
& FLAG_REDIRECT_HAS_REFERENCE
)
5878 sv
.insert(sv
.end(), "redirect_has_reference");
5881 std::string
get_flag_string() const {
5882 return get_flag_string(flags
);
5885 uint64_t truncate_seq
, truncate_size
;
5887 std::map
<std::pair
<uint64_t, entity_name_t
>, watch_info_t
> watchers
;
5889 // opportunistic checksums; may or may not be present
5890 __u32 data_digest
; ///< data crc32c
5891 __u32 omap_digest
; ///< omap crc32c
5893 // alloc hint attribute
5894 uint64_t expected_object_size
, expected_write_size
;
5895 uint32_t alloc_hint_flags
;
5897 struct object_manifest_t manifest
;
5899 void copy_user_bits(const object_info_t
& other
);
5901 bool test_flag(flag_t f
) const {
5902 return (flags
& f
) == f
;
5904 void set_flag(flag_t f
) {
5905 flags
= (flag_t
)(flags
| f
);
5907 void clear_flag(flag_t f
) {
5908 flags
= (flag_t
)(flags
& ~f
);
5910 bool is_lost() const {
5911 return test_flag(FLAG_LOST
);
5913 bool is_whiteout() const {
5914 return test_flag(FLAG_WHITEOUT
);
5916 bool is_dirty() const {
5917 return test_flag(FLAG_DIRTY
);
5919 bool is_omap() const {
5920 return test_flag(FLAG_OMAP
);
5922 bool is_data_digest() const {
5923 return test_flag(FLAG_DATA_DIGEST
);
5925 bool is_omap_digest() const {
5926 return test_flag(FLAG_OMAP_DIGEST
);
5928 bool is_cache_pinned() const {
5929 return test_flag(FLAG_CACHE_PIN
);
5931 bool has_manifest() const {
5932 return test_flag(FLAG_MANIFEST
);
5934 void set_data_digest(__u32 d
) {
5935 set_flag(FLAG_DATA_DIGEST
);
5938 void set_omap_digest(__u32 d
) {
5939 set_flag(FLAG_OMAP_DIGEST
);
5942 void clear_data_digest() {
5943 clear_flag(FLAG_DATA_DIGEST
);
5946 void clear_omap_digest() {
5947 clear_flag(FLAG_OMAP_DIGEST
);
5951 clear_data_digest();
5952 clear_omap_digest();
5955 void encode(ceph::buffer::list
& bl
, uint64_t features
) const;
5956 void decode(ceph::buffer::list::const_iterator
& bl
);
5957 void decode(const ceph::buffer::list
& bl
) {
5958 auto p
= std::cbegin(bl
);
5962 void encode_no_oid(ceph::buffer::list
& bl
, uint64_t features
) {
5963 // TODO: drop soid field and remove the denc no_oid methods
5964 auto tmp_oid
= hobject_t(hobject_t::get_max());
5966 encode(bl
, features
);
5969 void decode_no_oid(ceph::buffer::list::const_iterator
& bl
) {
5971 ceph_assert(soid
.is_max());
5973 void decode_no_oid(const ceph::buffer::list
& bl
) {
5974 auto p
= std::cbegin(bl
);
5977 void decode_no_oid(const ceph::buffer::list
& bl
, const hobject_t
& _soid
) {
5978 auto p
= std::cbegin(bl
);
5983 void dump(ceph::Formatter
*f
) const;
5984 static void generate_test_instances(std::list
<object_info_t
*>& o
);
5986 explicit object_info_t()
5987 : user_version(0), size(0), flags((flag_t
)0),
5988 truncate_seq(0), truncate_size(0),
5989 data_digest(-1), omap_digest(-1),
5990 expected_object_size(0), expected_write_size(0),
5994 explicit object_info_t(const hobject_t
& s
)
5996 user_version(0), size(0), flags((flag_t
)0),
5997 truncate_seq(0), truncate_size(0),
5998 data_digest(-1), omap_digest(-1),
5999 expected_object_size(0), expected_write_size(0),
6003 explicit object_info_t(const ceph::buffer::list
& bl
) {
6007 explicit object_info_t(const ceph::buffer::list
& bl
, const hobject_t
& _soid
) {
6012 WRITE_CLASS_ENCODER_FEATURES(object_info_t
)
6014 std::ostream
& operator<<(std::ostream
& out
, const object_info_t
& oi
);
6019 struct ObjectRecoveryInfo
{
6024 SnapSet ss
; // only populated if soid is_snap()
6025 interval_set
<uint64_t> copy_subset
;
6026 std::map
<hobject_t
, interval_set
<uint64_t>> clone_subset
;
6029 ObjectRecoveryInfo() : size(0), object_exist(true) { }
6031 static void generate_test_instances(std::list
<ObjectRecoveryInfo
*>& o
);
6032 void encode(ceph::buffer::list
&bl
, uint64_t features
) const;
6033 void decode(ceph::buffer::list::const_iterator
&bl
, int64_t pool
= -1);
6034 std::ostream
&print(std::ostream
&out
) const;
6035 void dump(ceph::Formatter
*f
) const;
6037 WRITE_CLASS_ENCODER_FEATURES(ObjectRecoveryInfo
)
6038 std::ostream
& operator<<(std::ostream
& out
, const ObjectRecoveryInfo
&inf
);
6040 struct ObjectRecoveryProgress
{
6041 uint64_t data_recovered_to
;
6042 std::string omap_recovered_to
;
6048 ObjectRecoveryProgress()
6049 : data_recovered_to(0),
6051 data_complete(false), omap_complete(false) { }
6053 bool is_complete(const ObjectRecoveryInfo
& info
) const {
6054 return (data_recovered_to
>= (
6055 info
.copy_subset
.empty() ?
6056 0 : info
.copy_subset
.range_end())) &&
6060 static void generate_test_instances(std::list
<ObjectRecoveryProgress
*>& o
);
6061 void encode(ceph::buffer::list
&bl
) const;
6062 void decode(ceph::buffer::list::const_iterator
&bl
);
6063 std::ostream
&print(std::ostream
&out
) const;
6064 void dump(ceph::Formatter
*f
) const;
6066 WRITE_CLASS_ENCODER(ObjectRecoveryProgress
)
6067 std::ostream
& operator<<(std::ostream
& out
, const ObjectRecoveryProgress
&prog
);
6069 struct PushReplyOp
{
6072 static void generate_test_instances(std::list
<PushReplyOp
*>& o
);
6073 void encode(ceph::buffer::list
&bl
) const;
6074 void decode(ceph::buffer::list::const_iterator
&bl
);
6075 std::ostream
&print(std::ostream
&out
) const;
6076 void dump(ceph::Formatter
*f
) const;
6078 uint64_t cost(CephContext
*cct
) const;
6080 WRITE_CLASS_ENCODER(PushReplyOp
)
6081 std::ostream
& operator<<(std::ostream
& out
, const PushReplyOp
&op
);
6086 ObjectRecoveryInfo recovery_info
;
6087 ObjectRecoveryProgress recovery_progress
;
6089 static void generate_test_instances(std::list
<PullOp
*>& o
);
6090 void encode(ceph::buffer::list
&bl
, uint64_t features
) const;
6091 void decode(ceph::buffer::list::const_iterator
&bl
);
6092 std::ostream
&print(std::ostream
&out
) const;
6093 void dump(ceph::Formatter
*f
) const;
6095 uint64_t cost(CephContext
*cct
) const;
6097 WRITE_CLASS_ENCODER_FEATURES(PullOp
)
6098 std::ostream
& operator<<(std::ostream
& out
, const PullOp
&op
);
6103 ceph::buffer::list data
;
6104 interval_set
<uint64_t> data_included
;
6105 ceph::buffer::list omap_header
;
6106 std::map
<std::string
, ceph::buffer::list
> omap_entries
;
6107 std::map
<std::string
, ceph::buffer::list
, std::less
<>> attrset
;
6109 ObjectRecoveryInfo recovery_info
;
6110 ObjectRecoveryProgress before_progress
;
6111 ObjectRecoveryProgress after_progress
;
6113 static void generate_test_instances(std::list
<PushOp
*>& o
);
6114 void encode(ceph::buffer::list
&bl
, uint64_t features
) const;
6115 void decode(ceph::buffer::list::const_iterator
&bl
);
6116 std::ostream
&print(std::ostream
&out
) const;
6117 void dump(ceph::Formatter
*f
) const;
6119 uint64_t cost(CephContext
*cct
) const;
6121 WRITE_CLASS_ENCODER_FEATURES(PushOp
)
6122 std::ostream
& operator<<(std::ostream
& out
, const PushOp
&op
);
6125 * summarize pg contents for purposes of a scrub
6129 std::map
<std::string
, ceph::buffer::ptr
, std::less
<>> attrs
;
6131 __u32 omap_digest
; ///< omap crc32c
6132 __u32 digest
; ///< data crc32c
6134 bool digest_present
:1;
6135 bool omap_digest_present
:1;
6138 bool ec_hash_mismatch
:1;
6139 bool ec_size_mismatch
:1;
6140 bool large_omap_object_found
:1;
6141 uint64_t large_omap_object_key_count
= 0;
6142 uint64_t large_omap_object_value_size
= 0;
6143 uint64_t object_omap_bytes
= 0;
6144 uint64_t object_omap_keys
= 0;
6147 // Init invalid size so it won't match if we get a stat EIO error
6148 size(-1), omap_digest(0), digest(0),
6149 negative(false), digest_present(false), omap_digest_present(false),
6150 read_error(false), stat_error(false), ec_hash_mismatch(false),
6151 ec_size_mismatch(false), large_omap_object_found(false) {}
6153 void encode(ceph::buffer::list
& bl
) const;
6154 void decode(ceph::buffer::list::const_iterator
& bl
);
6155 void dump(ceph::Formatter
*f
) const;
6156 static void generate_test_instances(std::list
<object
*>& o
);
6158 WRITE_CLASS_ENCODER(object
)
6160 std::map
<hobject_t
,object
> objects
;
6161 eversion_t valid_through
;
6162 eversion_t incr_since
;
6163 bool has_large_omap_object_errors
:1;
6164 bool has_omap_keys
:1;
6166 void merge_incr(const ScrubMap
&l
);
6167 void clear_from(const hobject_t
& start
) {
6168 objects
.erase(objects
.lower_bound(start
), objects
.end());
6170 void insert(const ScrubMap
&r
) {
6171 objects
.insert(r
.objects
.begin(), r
.objects
.end());
6173 void swap(ScrubMap
&r
) {
6175 swap(objects
, r
.objects
);
6176 swap(valid_through
, r
.valid_through
);
6177 swap(incr_since
, r
.incr_since
);
6180 void encode(ceph::buffer::list
& bl
) const;
6181 void decode(ceph::buffer::list::const_iterator
& bl
, int64_t pool
=-1);
6182 void dump(ceph::Formatter
*f
) const;
6183 static void generate_test_instances(std::list
<ScrubMap
*>& o
);
6185 WRITE_CLASS_ENCODER(ScrubMap::object
)
6186 WRITE_CLASS_ENCODER(ScrubMap
)
6188 struct ScrubMapBuilder
{
6190 std::vector
<hobject_t
> ls
;
6192 int64_t data_pos
= 0;
6193 std::string omap_pos
;
6195 ceph::buffer::hash data_hash
, omap_hash
; ///< accumulatinng hash value
6196 uint64_t omap_keys
= 0;
6197 uint64_t omap_bytes
= 0;
6203 return pos
>= ls
.size();
6206 *this = ScrubMapBuilder();
6210 return data_pos
< 0;
6213 void next_object() {
6221 friend std::ostream
& operator<<(std::ostream
& out
, const ScrubMapBuilder
& pos
) {
6222 out
<< "(" << pos
.pos
<< "/" << pos
.ls
.size();
6223 if (pos
.pos
< pos
.ls
.size()) {
6224 out
<< " " << pos
.ls
[pos
.pos
];
6226 if (pos
.data_pos
< 0) {
6227 out
<< " byte " << pos
.data_pos
;
6229 if (!pos
.omap_pos
.empty()) {
6230 out
<< " key " << pos
.omap_pos
;
6236 out
<< " ret " << pos
.ret
;
6242 struct watch_item_t
{
6245 uint32_t timeout_seconds
;
6248 watch_item_t() : cookie(0), timeout_seconds(0) { }
6249 watch_item_t(entity_name_t name
, uint64_t cookie
, uint32_t timeout
,
6250 const entity_addr_t
& addr
)
6251 : name(name
), cookie(cookie
), timeout_seconds(timeout
),
6254 void encode(ceph::buffer::list
&bl
, uint64_t features
) const {
6255 ENCODE_START(2, 1, bl
);
6258 encode(timeout_seconds
, bl
);
6259 encode(addr
, bl
, features
);
6262 void decode(ceph::buffer::list::const_iterator
&bl
) {
6263 DECODE_START(2, bl
);
6266 decode(timeout_seconds
, bl
);
6267 if (struct_v
>= 2) {
6272 void dump(ceph::Formatter
*f
) const {
6273 f
->dump_stream("watcher") << name
;
6274 f
->dump_int("cookie", cookie
);
6275 f
->dump_int("timeout", timeout_seconds
);
6276 f
->open_object_section("addr");
6280 static void generate_test_instances(std::list
<watch_item_t
*>& o
) {
6282 ea
.set_type(entity_addr_t::TYPE_LEGACY
);
6284 ea
.set_family(AF_INET
);
6285 ea
.set_in4_quad(0, 127);
6286 ea
.set_in4_quad(1, 0);
6287 ea
.set_in4_quad(2, 0);
6288 ea
.set_in4_quad(3, 1);
6290 o
.push_back(new watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT
, 1), 10, 30, ea
));
6292 ea
.set_in4_quad(3, 2);
6294 o
.push_back(new watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT
, 2), 20, 60, ea
));
6297 WRITE_CLASS_ENCODER_FEATURES(watch_item_t
)
6299 struct obj_watch_item_t
{
6305 * obj list watch response format
6308 struct obj_list_watch_response_t
{
6309 std::list
<watch_item_t
> entries
;
6311 void encode(ceph::buffer::list
& bl
, uint64_t features
) const {
6312 ENCODE_START(1, 1, bl
);
6313 encode(entries
, bl
, features
);
6316 void decode(ceph::buffer::list::const_iterator
& bl
) {
6317 DECODE_START(1, bl
);
6318 decode(entries
, bl
);
6321 void dump(ceph::Formatter
*f
) const {
6322 f
->open_array_section("entries");
6323 for (std::list
<watch_item_t
>::const_iterator p
= entries
.begin(); p
!= entries
.end(); ++p
) {
6324 f
->open_object_section("watch");
6330 static void generate_test_instances(std::list
<obj_list_watch_response_t
*>& o
) {
6332 o
.push_back(new obj_list_watch_response_t
);
6333 o
.push_back(new obj_list_watch_response_t
);
6334 std::list
<watch_item_t
*> test_watchers
;
6335 watch_item_t::generate_test_instances(test_watchers
);
6336 for (auto &e
: test_watchers
) {
6337 o
.back()->entries
.push_back(*e
);
6342 WRITE_CLASS_ENCODER_FEATURES(obj_list_watch_response_t
)
6346 std::vector
<snapid_t
> snaps
; // ascending
6347 std::vector
< std::pair
<uint64_t,uint64_t> > overlap
;
6350 clone_info() : cloneid(CEPH_NOSNAP
), size(0) {}
6352 void encode(ceph::buffer::list
& bl
) const {
6353 ENCODE_START(1, 1, bl
);
6354 encode(cloneid
, bl
);
6356 encode(overlap
, bl
);
6360 void decode(ceph::buffer::list::const_iterator
& bl
) {
6361 DECODE_START(1, bl
);
6362 decode(cloneid
, bl
);
6364 decode(overlap
, bl
);
6368 void dump(ceph::Formatter
*f
) const {
6369 if (cloneid
== CEPH_NOSNAP
)
6370 f
->dump_string("cloneid", "HEAD");
6372 f
->dump_unsigned("cloneid", cloneid
.val
);
6373 f
->open_array_section("snapshots");
6374 for (std::vector
<snapid_t
>::const_iterator p
= snaps
.begin(); p
!= snaps
.end(); ++p
) {
6375 f
->open_object_section("snap");
6376 f
->dump_unsigned("id", p
->val
);
6380 f
->open_array_section("overlaps");
6381 for (std::vector
< std::pair
<uint64_t,uint64_t> >::const_iterator q
= overlap
.begin();
6382 q
!= overlap
.end(); ++q
) {
6383 f
->open_object_section("overlap");
6384 f
->dump_unsigned("offset", q
->first
);
6385 f
->dump_unsigned("length", q
->second
);
6389 f
->dump_unsigned("size", size
);
6391 static void generate_test_instances(std::list
<clone_info
*>& o
) {
6392 o
.push_back(new clone_info
);
6393 o
.push_back(new clone_info
);
6394 o
.back()->cloneid
= 1;
6395 o
.back()->snaps
.push_back(1);
6396 o
.back()->overlap
.push_back(std::pair
<uint64_t,uint64_t>(0,4096));
6397 o
.back()->overlap
.push_back(std::pair
<uint64_t,uint64_t>(8192,4096));
6398 o
.back()->size
= 16384;
6399 o
.push_back(new clone_info
);
6400 o
.back()->cloneid
= CEPH_NOSNAP
;
6401 o
.back()->size
= 32768;
6404 WRITE_CLASS_ENCODER(clone_info
)
6407 * obj list snaps response format
6410 struct obj_list_snap_response_t
{
6411 std::vector
<clone_info
> clones
; // ascending
6414 void encode(ceph::buffer::list
& bl
) const {
6415 ENCODE_START(2, 1, bl
);
6420 void decode(ceph::buffer::list::const_iterator
& bl
) {
6421 DECODE_START(2, bl
);
6429 void dump(ceph::Formatter
*f
) const {
6430 f
->open_array_section("clones");
6431 for (std::vector
<clone_info
>::const_iterator p
= clones
.begin(); p
!= clones
.end(); ++p
) {
6432 f
->open_object_section("clone");
6436 f
->dump_unsigned("seq", seq
);
6439 static void generate_test_instances(std::list
<obj_list_snap_response_t
*>& o
) {
6440 o
.push_back(new obj_list_snap_response_t
);
6441 o
.push_back(new obj_list_snap_response_t
);
6444 cl
.snaps
.push_back(1);
6445 cl
.overlap
.push_back(std::pair
<uint64_t,uint64_t>(0,4096));
6446 cl
.overlap
.push_back(std::pair
<uint64_t,uint64_t>(8192,4096));
6448 o
.back()->clones
.push_back(cl
);
6449 cl
.cloneid
= CEPH_NOSNAP
;
6453 o
.back()->clones
.push_back(cl
);
6454 o
.back()->seq
= 123;
6458 WRITE_CLASS_ENCODER(obj_list_snap_response_t
)
6462 struct PromoteCounter
{
6463 std::atomic
<unsigned long long> attempts
{0};
6464 std::atomic
<unsigned long long> objects
{0};
6465 std::atomic
<unsigned long long> bytes
{0};
6471 void finish(uint64_t size
) {
6476 void sample_and_attenuate(uint64_t *a
, uint64_t *o
, uint64_t *b
) {
6486 struct pool_pg_num_history_t
{
6487 /// last epoch updated
6489 /// poolid -> epoch -> pg_num
6490 std::map
<int64_t, std::map
<epoch_t
,uint32_t>> pg_nums
;
6491 /// pair(epoch, poolid)
6492 std::set
<std::pair
<epoch_t
,int64_t>> deleted_pools
;
6494 void log_pg_num_change(epoch_t epoch
, int64_t pool
, uint32_t pg_num
) {
6495 pg_nums
[pool
][epoch
] = pg_num
;
6497 void log_pool_delete(epoch_t epoch
, int64_t pool
) {
6498 deleted_pools
.insert(std::make_pair(epoch
, pool
));
6501 /// prune history based on oldest osdmap epoch in the cluster
6502 void prune(epoch_t oldest_epoch
) {
6503 auto i
= deleted_pools
.begin();
6504 while (i
!= deleted_pools
.end()) {
6505 if (i
->first
>= oldest_epoch
) {
6508 pg_nums
.erase(i
->second
);
6509 i
= deleted_pools
.erase(i
);
6511 for (auto& j
: pg_nums
) {
6512 auto k
= j
.second
.lower_bound(oldest_epoch
);
6513 // keep this and the entry before it (just to be paranoid)
6514 if (k
!= j
.second
.begin()) {
6516 j
.second
.erase(j
.second
.begin(), k
);
6521 void encode(ceph::buffer::list
& bl
) const {
6522 ENCODE_START(1, 1, bl
);
6524 encode(pg_nums
, bl
);
6525 encode(deleted_pools
, bl
);
6528 void decode(ceph::buffer::list::const_iterator
& p
) {
6532 decode(deleted_pools
, p
);
6535 void dump(ceph::Formatter
*f
) const {
6536 f
->dump_unsigned("epoch", epoch
);
6537 f
->open_object_section("pools");
6538 for (auto& i
: pg_nums
) {
6539 f
->open_object_section("pool");
6540 f
->dump_unsigned("pool_id", i
.first
);
6541 f
->open_array_section("changes");
6542 for (auto& j
: i
.second
) {
6543 f
->open_object_section("change");
6544 f
->dump_unsigned("epoch", j
.first
);
6545 f
->dump_unsigned("pg_num", j
.second
);
6552 f
->open_array_section("deleted_pools");
6553 for (auto& i
: deleted_pools
) {
6554 f
->open_object_section("deletion");
6555 f
->dump_unsigned("pool_id", i
.second
);
6556 f
->dump_unsigned("epoch", i
.first
);
6561 static void generate_test_instances(std::list
<pool_pg_num_history_t
*>& ls
) {
6562 ls
.push_back(new pool_pg_num_history_t
);
6564 friend std::ostream
& operator<<(std::ostream
& out
, const pool_pg_num_history_t
& h
) {
6565 return out
<< "pg_num_history(e" << h
.epoch
6566 << " pg_nums " << h
.pg_nums
6567 << " deleted_pools " << h
.deleted_pools
6571 WRITE_CLASS_ENCODER(pool_pg_num_history_t
)
6573 // prefix pgmeta_oid keys with _ so that PGLog::read_log_and_missing() can
6575 static const std::string_view infover_key
= "_infover";
6576 static const std::string_view info_key
= "_info";
6577 static const std::string_view biginfo_key
= "_biginfo";
6578 static const std::string_view epoch_key
= "_epoch";
6579 static const std::string_view fastinfo_key
= "_fastinfo";
6581 static const __u8 pg_latest_struct_v
= 10;
6582 // v10 is the new past_intervals encoding
6583 // v9 was fastinfo_key addition
6584 // v8 was the move to a per-pg pgmeta object
6585 // v7 was SnapMapper addition in 86658392516d5175b2756659ef7ffaaf95b0f8ad
6586 // (first appeared in cuttlefish).
6587 static const __u8 pg_compat_struct_v
= 10;
6589 int prepare_info_keymap(
6591 std::map
<std::string
,ceph::buffer::list
> *km
,
6592 std::string
*key_to_remove
,
6595 pg_info_t
&last_written_info
,
6596 PastIntervals
&past_intervals
,
6597 bool dirty_big_info
,
6600 PerfCounters
*logger
= nullptr,
6601 DoutPrefixProvider
*dpp
= nullptr);
6603 namespace ceph::os
{
6607 void create_pg_collection(
6608 ceph::os::Transaction
& t
, spg_t pgid
, int bits
);
6610 void init_pg_ondisk(
6611 ceph::os::Transaction
& t
, spg_t pgid
, const pg_pool_t
*pool
);
6613 // omap specific stats
6614 struct omap_stat_t
{
6615 int large_omap_objects
;
6620 // filter for pg listings
6627 virtual ~PGLSFilter();
6628 virtual bool filter(const hobject_t
&obj
,
6629 const ceph::buffer::list
& xattr_data
) const = 0;
6632 * Arguments passed from the RADOS client. Implementations must
6633 * handle any encoding errors, and return an appropriate error code,
6634 * or 0 on valid input.
6636 virtual int init(ceph::buffer::list::const_iterator
¶ms
) = 0;
6639 * xattr key, or empty string. If non-empty, this xattr will be fetched
6640 * and the value passed into ::filter
6642 virtual const std::string
& get_xattr() const { return xattr
; }
6645 * If true, objects without the named xattr (if xattr name is not empty)
6646 * will be rejected without calling ::filter
6648 virtual bool reject_empty_xattr() const { return true; }
6651 class PGLSPlainFilter
: public PGLSFilter
{
6654 int init(ceph::buffer::list::const_iterator
¶ms
) override
;
6655 ~PGLSPlainFilter() override
{}
6656 bool filter(const hobject_t
& obj
,
6657 const ceph::buffer::list
& xattr_data
) const override
;
6660 // alias name for this structure:
6661 using missing_map_t
= std::map
<hobject_t
,
6662 std::pair
<std::optional
<uint32_t>,
6663 std::optional
<uint32_t>>>;