1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
9 * Author: Loic Dachary <loic@dachary.org>
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
18 #ifndef CEPH_OSD_TYPES_H
19 #define CEPH_OSD_TYPES_H
24 #include <string_view>
25 #include <boost/scoped_ptr.hpp>
26 #include <boost/optional/optional_io.hpp>
27 #include <boost/variant.hpp>
29 #include "include/rados/rados_types.hpp"
30 #include "include/mempool.h"
32 #include "msg/msg_types.h"
33 #include "include/types.h"
34 #include "include/utime.h"
35 #include "include/CompatSet.h"
36 #include "common/histogram.h"
37 #include "include/interval_set.h"
38 #include "include/inline_memory.h"
39 #include "common/Formatter.h"
40 #include "common/bloom_filter.hpp"
41 #include "common/hobject.h"
42 #include "common/snap_types.h"
45 #include "include/cmp.h"
46 #include "librados/ListObjectImpl.h"
47 #include "compressor/Compressor.h"
50 #define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v026"
52 #define CEPH_OSD_FEATURE_INCOMPAT_BASE CompatSet::Feature(1, "initial feature set(~v.18)")
53 #define CEPH_OSD_FEATURE_INCOMPAT_PGINFO CompatSet::Feature(2, "pginfo object")
54 #define CEPH_OSD_FEATURE_INCOMPAT_OLOC CompatSet::Feature(3, "object locator")
55 #define CEPH_OSD_FEATURE_INCOMPAT_LEC CompatSet::Feature(4, "last_epoch_clean")
56 #define CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES CompatSet::Feature(5, "categories")
57 #define CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL CompatSet::Feature(6, "hobjectpool")
58 #define CEPH_OSD_FEATURE_INCOMPAT_BIGINFO CompatSet::Feature(7, "biginfo")
59 #define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO CompatSet::Feature(8, "leveldbinfo")
60 #define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG CompatSet::Feature(9, "leveldblog")
61 #define CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER CompatSet::Feature(10, "snapmapper")
62 #define CEPH_OSD_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(11, "sharded objects")
63 #define CEPH_OSD_FEATURE_INCOMPAT_HINTS CompatSet::Feature(12, "transaction hints")
64 #define CEPH_OSD_FEATURE_INCOMPAT_PGMETA CompatSet::Feature(13, "pg meta object")
65 #define CEPH_OSD_FEATURE_INCOMPAT_MISSING CompatSet::Feature(14, "explicit missing set")
66 #define CEPH_OSD_FEATURE_INCOMPAT_FASTINFO CompatSet::Feature(15, "fastinfo pg attr")
67 #define CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES CompatSet::Feature(16, "deletes in missing set")
70 /// pool priority range set by user
71 #define OSD_POOL_PRIORITY_MAX 10
72 #define OSD_POOL_PRIORITY_MIN -OSD_POOL_PRIORITY_MAX
74 /// min recovery priority for MBackfillReserve
75 #define OSD_RECOVERY_PRIORITY_MIN 0
77 /// base backfill priority for MBackfillReserve
78 #define OSD_BACKFILL_PRIORITY_BASE 100
80 /// base backfill priority for MBackfillReserve (degraded PG)
81 #define OSD_BACKFILL_DEGRADED_PRIORITY_BASE 140
83 /// base recovery priority for MBackfillReserve
84 #define OSD_RECOVERY_PRIORITY_BASE 180
86 /// base backfill priority for MBackfillReserve (inactive PG)
87 #define OSD_BACKFILL_INACTIVE_PRIORITY_BASE 220
89 /// base recovery priority for MRecoveryReserve (inactive PG)
90 #define OSD_RECOVERY_INACTIVE_PRIORITY_BASE 220
92 /// max manually/automatically set recovery priority for MBackfillReserve
93 #define OSD_RECOVERY_PRIORITY_MAX 253
95 /// backfill priority for MBackfillReserve, when forced manually
96 #define OSD_BACKFILL_PRIORITY_FORCED 254
98 /// recovery priority for MRecoveryReserve, when forced manually
99 #define OSD_RECOVERY_PRIORITY_FORCED 255
101 /// priority for pg deletion when osd is not fullish
102 #define OSD_DELETE_PRIORITY_NORMAL 179
104 /// priority for pg deletion when osd is approaching full
105 #define OSD_DELETE_PRIORITY_FULLISH 219
107 /// priority when more full
108 #define OSD_DELETE_PRIORITY_FULL 255
110 static std::map
<int, int> max_prio_map
= {
111 {OSD_BACKFILL_PRIORITY_BASE
, OSD_BACKFILL_DEGRADED_PRIORITY_BASE
- 1},
112 {OSD_BACKFILL_DEGRADED_PRIORITY_BASE
, OSD_RECOVERY_PRIORITY_BASE
- 1},
113 {OSD_RECOVERY_PRIORITY_BASE
, OSD_BACKFILL_INACTIVE_PRIORITY_BASE
- 1},
114 {OSD_RECOVERY_INACTIVE_PRIORITY_BASE
, OSD_RECOVERY_PRIORITY_MAX
},
115 {OSD_BACKFILL_INACTIVE_PRIORITY_BASE
, OSD_RECOVERY_PRIORITY_MAX
}
118 typedef hobject_t collection_list_handle_t
;
120 /// convert a single CEPH_OSD_FLAG_* to a string
121 const char *ceph_osd_flag_name(unsigned flag
);
122 /// convert a single CEPH_OSD_OP_FLAG_* to a string
123 const char *ceph_osd_op_flag_name(unsigned flag
);
125 /// convert CEPH_OSD_FLAG_* op flags to a string
126 string
ceph_osd_flag_string(unsigned flags
);
127 /// convert CEPH_OSD_OP_FLAG_* op flags to a string
128 string
ceph_osd_op_flag_string(unsigned flags
);
129 /// convert CEPH_OSD_ALLOC_HINT_FLAG_* op flags to a string
130 string
ceph_osd_alloc_hint_flag_string(unsigned flags
);
132 typedef map
<string
,string
> osd_alert_list_t
;
133 /// map osd id -> alert_list_t
134 typedef map
<int, osd_alert_list_t
> osd_alerts_t
;
135 void dump(Formatter
* f
, const osd_alerts_t
& alerts
);
138 * osd request identifier
140 * caller name + incarnation# + tid to unique identify this request.
143 entity_name_t name
; // who
145 int32_t inc
; // incarnation
150 osd_reqid_t(const osd_reqid_t
& other
)
151 : name(other
.name
), tid(other
.tid
), inc(other
.inc
)
153 osd_reqid_t(const entity_name_t
& a
, int i
, ceph_tid_t t
)
154 : name(a
), tid(t
), inc(i
)
157 DENC(osd_reqid_t
, v
, p
) {
164 void dump(Formatter
*f
) const;
165 static void generate_test_instances(list
<osd_reqid_t
*>& o
);
167 WRITE_CLASS_DENC(osd_reqid_t
)
172 static const int32_t NO_OSD
= 0x7fffffff;
175 pg_shard_t() : osd(-1), shard(shard_id_t::NO_SHARD
) {}
176 explicit pg_shard_t(int osd
) : osd(osd
), shard(shard_id_t::NO_SHARD
) {}
177 pg_shard_t(int osd
, shard_id_t shard
) : osd(osd
), shard(shard
) {}
178 bool is_undefined() const {
181 string
get_osd() const { return (osd
== NO_OSD
? "NONE" : to_string(osd
)); }
182 void encode(bufferlist
&bl
) const;
183 void decode(bufferlist::const_iterator
&bl
);
184 void dump(Formatter
*f
) const {
185 f
->dump_unsigned("osd", osd
);
186 if (shard
!= shard_id_t::NO_SHARD
) {
187 f
->dump_unsigned("shard", shard
);
191 WRITE_CLASS_ENCODER(pg_shard_t
)
192 WRITE_EQ_OPERATORS_2(pg_shard_t
, osd
, shard
)
193 WRITE_CMP_OPERATORS_2(pg_shard_t
, osd
, shard
)
194 ostream
&operator<<(ostream
&lhs
, const pg_shard_t
&rhs
);
196 class IsPGRecoverablePredicate
{
199 * have encodes the shards available
201 virtual bool operator()(const set
<pg_shard_t
> &have
) const = 0;
202 virtual ~IsPGRecoverablePredicate() {}
205 class IsPGReadablePredicate
{
208 * have encodes the shards available
210 virtual bool operator()(const set
<pg_shard_t
> &have
) const = 0;
211 virtual ~IsPGReadablePredicate() {}
214 inline ostream
& operator<<(ostream
& out
, const osd_reqid_t
& r
) {
215 return out
<< r
.name
<< "." << r
.inc
<< ":" << r
.tid
;
218 inline bool operator==(const osd_reqid_t
& l
, const osd_reqid_t
& r
) {
219 return (l
.name
== r
.name
) && (l
.inc
== r
.inc
) && (l
.tid
== r
.tid
);
221 inline bool operator!=(const osd_reqid_t
& l
, const osd_reqid_t
& r
) {
222 return (l
.name
!= r
.name
) || (l
.inc
!= r
.inc
) || (l
.tid
!= r
.tid
);
224 inline bool operator<(const osd_reqid_t
& l
, const osd_reqid_t
& r
) {
225 return (l
.name
< r
.name
) || (l
.inc
< r
.inc
) ||
226 (l
.name
== r
.name
&& l
.inc
== r
.inc
&& l
.tid
< r
.tid
);
228 inline bool operator<=(const osd_reqid_t
& l
, const osd_reqid_t
& r
) {
229 return (l
.name
< r
.name
) || (l
.inc
< r
.inc
) ||
230 (l
.name
== r
.name
&& l
.inc
== r
.inc
&& l
.tid
<= r
.tid
);
232 inline bool operator>(const osd_reqid_t
& l
, const osd_reqid_t
& r
) { return !(l
<= r
); }
233 inline bool operator>=(const osd_reqid_t
& l
, const osd_reqid_t
& r
) { return !(l
< r
); }
236 template<> struct hash
<osd_reqid_t
> {
237 size_t operator()(const osd_reqid_t
&r
) const {
238 static hash
<uint64_t> H
;
239 return H(r
.name
.num() ^ r
.tid
^ r
.inc
);
247 // a locator constrains the placement of an object. mainly, which pool
249 struct object_locator_t
{
250 // You specify either the hash or the key -- not both
251 int64_t pool
; ///< pool id
252 string key
; ///< key string (if non-empty)
253 string nspace
; ///< namespace
254 int64_t hash
; ///< hash position (if >= 0)
256 explicit object_locator_t()
257 : pool(-1), hash(-1) {}
258 explicit object_locator_t(int64_t po
)
259 : pool(po
), hash(-1) {}
260 explicit object_locator_t(int64_t po
, int64_t ps
)
261 : pool(po
), hash(ps
) {}
262 explicit object_locator_t(int64_t po
, string ns
)
263 : pool(po
), nspace(ns
), hash(-1) {}
264 explicit object_locator_t(int64_t po
, string ns
, int64_t ps
)
265 : pool(po
), nspace(ns
), hash(ps
) {}
266 explicit object_locator_t(int64_t po
, string ns
, string s
)
267 : pool(po
), key(s
), nspace(ns
), hash(-1) {}
268 explicit object_locator_t(const hobject_t
& soid
)
269 : pool(soid
.pool
), key(soid
.get_key()), nspace(soid
.nspace
), hash(-1) {}
271 int64_t get_pool() const {
286 void encode(bufferlist
& bl
) const;
287 void decode(bufferlist::const_iterator
& p
);
288 void dump(Formatter
*f
) const;
289 static void generate_test_instances(list
<object_locator_t
*>& o
);
291 WRITE_CLASS_ENCODER(object_locator_t
)
293 inline bool operator==(const object_locator_t
& l
, const object_locator_t
& r
) {
294 return l
.pool
== r
.pool
&& l
.key
== r
.key
&& l
.nspace
== r
.nspace
&& l
.hash
== r
.hash
;
296 inline bool operator!=(const object_locator_t
& l
, const object_locator_t
& r
) {
300 inline ostream
& operator<<(ostream
& out
, const object_locator_t
& loc
)
302 out
<< "@" << loc
.pool
;
303 if (loc
.nspace
.length())
304 out
<< ";" << loc
.nspace
;
305 if (loc
.key
.length())
306 out
<< ":" << loc
.key
;
310 struct request_redirect_t
{
312 object_locator_t redirect_locator
; ///< this is authoritative
313 string redirect_object
; ///< If non-empty, the request goes to this object name
315 friend ostream
& operator<<(ostream
& out
, const request_redirect_t
& redir
);
318 request_redirect_t() {}
319 explicit request_redirect_t(const object_locator_t
& orig
, int64_t rpool
) :
320 redirect_locator(orig
) { redirect_locator
.pool
= rpool
; }
321 explicit request_redirect_t(const object_locator_t
& rloc
) :
322 redirect_locator(rloc
) {}
323 explicit request_redirect_t(const object_locator_t
& orig
,
324 const string
& robj
) :
325 redirect_locator(orig
), redirect_object(robj
) {}
327 bool empty() const { return redirect_locator
.empty() &&
328 redirect_object
.empty(); }
330 void combine_with_locator(object_locator_t
& orig
, string
& obj
) const {
331 orig
= redirect_locator
;
332 if (!redirect_object
.empty())
333 obj
= redirect_object
;
336 void encode(bufferlist
& bl
) const;
337 void decode(bufferlist::const_iterator
& bl
);
338 void dump(Formatter
*f
) const;
339 static void generate_test_instances(list
<request_redirect_t
*>& o
);
341 WRITE_CLASS_ENCODER(request_redirect_t
)
343 inline ostream
& operator<<(ostream
& out
, const request_redirect_t
& redir
) {
344 out
<< "object " << redir
.redirect_object
<< ", locator{" << redir
.redirect_locator
<< "}";
348 // Internal OSD op flags - set by the OSD based on the op types
350 CEPH_OSD_RMW_FLAG_READ
= (1 << 1),
351 CEPH_OSD_RMW_FLAG_WRITE
= (1 << 2),
352 CEPH_OSD_RMW_FLAG_CLASS_READ
= (1 << 3),
353 CEPH_OSD_RMW_FLAG_CLASS_WRITE
= (1 << 4),
354 CEPH_OSD_RMW_FLAG_PGOP
= (1 << 5),
355 CEPH_OSD_RMW_FLAG_CACHE
= (1 << 6),
356 CEPH_OSD_RMW_FLAG_FORCE_PROMOTE
= (1 << 7),
357 CEPH_OSD_RMW_FLAG_SKIP_HANDLE_CACHE
= (1 << 8),
358 CEPH_OSD_RMW_FLAG_SKIP_PROMOTE
= (1 << 9),
359 CEPH_OSD_RMW_FLAG_RWORDERED
= (1 << 10),
365 #define OSD_SUPERBLOCK_GOBJECT ghobject_t(hobject_t(sobject_t(object_t("osd_superblock"), 0)))
367 // placement seed (a hash value)
368 typedef uint32_t ps_t
;
370 // old (v1) pg_t encoding (wrap old struct ceph_pg)
373 void encode(bufferlist
& bl
) const {
376 void decode(bufferlist::const_iterator
& bl
) {
380 WRITE_CLASS_ENCODER(old_pg_t
)
382 // placement group id
387 pg_t() : m_pool(0), m_seed(0) {}
388 pg_t(ps_t seed
, uint64_t pool
) :
389 m_pool(pool
), m_seed(seed
) {}
390 // cppcheck-suppress noExplicitConstructor
391 pg_t(const ceph_pg
& cpg
) :
392 m_pool(cpg
.pool
), m_seed(cpg
.ps
) {}
394 // cppcheck-suppress noExplicitConstructor
395 pg_t(const old_pg_t
& opg
) {
399 old_pg_t
get_old_pg() const {
401 ceph_assert(m_pool
< 0xffffffffull
);
404 o
.v
.preferred
= (__s16
)-1;
411 int64_t pool() const {
415 static const uint8_t calc_name_buf_size
= 36; // max length for max values len("18446744073709551615.ffffffff") + future suffix len("_head") + '\0'
416 char *calc_name(char *buf
, const char *suffix_backwords
) const;
418 void set_ps(ps_t p
) {
421 void set_pool(uint64_t p
) {
425 pg_t
get_parent() const;
426 pg_t
get_ancestor(unsigned old_pg_num
) const;
428 int print(char *o
, int maxlen
) const;
429 bool parse(const char *s
);
431 bool is_split(unsigned old_pg_num
, unsigned new_pg_num
, set
<pg_t
> *pchildren
) const;
433 bool is_merge_source(unsigned old_pg_num
, unsigned new_pg_num
, pg_t
*parent
) const;
434 bool is_merge_target(unsigned old_pg_num
, unsigned new_pg_num
) const {
435 return ps() < new_pg_num
&& is_split(new_pg_num
, old_pg_num
, nullptr);
439 * Returns b such that for all object o:
440 * (~((~0)<<b) & o.hash) == 0 iff o is in the pg for *this
442 unsigned get_split_bits(unsigned pg_num
) const;
444 bool contains(int bits
, const ghobject_t
& oid
) {
446 (int64_t)m_pool
== oid
.hobj
.get_logical_pool() &&
447 oid
.match(bits
, ps());
449 bool contains(int bits
, const hobject_t
& oid
) {
451 (int64_t)m_pool
== oid
.get_logical_pool() &&
452 oid
.match(bits
, ps());
455 hobject_t
get_hobj_start() const;
456 hobject_t
get_hobj_end(unsigned pg_num
) const;
458 void encode(bufferlist
& bl
) const {
464 encode((int32_t)-1, bl
); // was preferred
466 void decode(bufferlist::const_iterator
& bl
) {
472 bl
.advance(sizeof(int32_t)); // was preferred
474 void decode_old(bufferlist::const_iterator
& bl
) {
480 void dump(Formatter
*f
) const;
481 static void generate_test_instances(list
<pg_t
*>& o
);
483 WRITE_CLASS_ENCODER(pg_t
)
485 inline bool operator<(const pg_t
& l
, const pg_t
& r
) {
486 return l
.pool() < r
.pool() ||
487 (l
.pool() == r
.pool() && (l
.ps() < r
.ps()));
489 inline bool operator<=(const pg_t
& l
, const pg_t
& r
) {
490 return l
.pool() < r
.pool() ||
491 (l
.pool() == r
.pool() && (l
.ps() <= r
.ps()));
493 inline bool operator==(const pg_t
& l
, const pg_t
& r
) {
494 return l
.pool() == r
.pool() &&
497 inline bool operator!=(const pg_t
& l
, const pg_t
& r
) {
498 return l
.pool() != r
.pool() ||
501 inline bool operator>(const pg_t
& l
, const pg_t
& r
) {
502 return l
.pool() > r
.pool() ||
503 (l
.pool() == r
.pool() && (l
.ps() > r
.ps()));
505 inline bool operator>=(const pg_t
& l
, const pg_t
& r
) {
506 return l
.pool() > r
.pool() ||
507 (l
.pool() == r
.pool() && (l
.ps() >= r
.ps()));
510 ostream
& operator<<(ostream
& out
, const pg_t
&pg
);
513 template<> struct hash
< pg_t
>
515 size_t operator()( const pg_t
& x
) const
517 static hash
<uint32_t> H
;
518 // xor (s32)-1 in there to preserve original m_preferred result (paranoia!)
519 return H((x
.pool() & 0xffffffff) ^ (x
.pool() >> 32) ^ x
.ps() ^ (int32_t)(-1));
527 spg_t() : shard(shard_id_t::NO_SHARD
) {}
528 spg_t(pg_t pgid
, shard_id_t shard
) : pgid(pgid
), shard(shard
) {}
529 explicit spg_t(pg_t pgid
) : pgid(pgid
), shard(shard_id_t::NO_SHARD
) {}
530 unsigned get_split_bits(unsigned pg_num
) const {
531 return pgid
.get_split_bits(pg_num
);
533 spg_t
get_parent() const {
534 return spg_t(pgid
.get_parent(), shard
);
539 uint64_t pool() const {
543 static const uint8_t calc_name_buf_size
= pg_t::calc_name_buf_size
+ 4; // 36 + len('s') + len("255");
544 char *calc_name(char *buf
, const char *suffix_backwords
) const;
546 bool parse(const char *s
);
/// parse from a std::string by delegating to the C-string overload
bool parse(const std::string& s) {
  return parse(s.c_str());
}
551 spg_t
get_ancestor(unsigned old_pg_num
) const {
552 return spg_t(pgid
.get_ancestor(old_pg_num
), shard
);
555 bool is_split(unsigned old_pg_num
, unsigned new_pg_num
,
556 set
<spg_t
> *pchildren
) const {
558 set
<pg_t
> *children
= pchildren
? &_children
: NULL
;
559 bool is_split
= pgid
.is_split(old_pg_num
, new_pg_num
, children
);
560 if (pchildren
&& is_split
) {
561 for (set
<pg_t
>::iterator i
= _children
.begin();
562 i
!= _children
.end();
564 pchildren
->insert(spg_t(*i
, shard
));
569 bool is_merge_target(unsigned old_pg_num
, unsigned new_pg_num
) const {
570 return pgid
.is_merge_target(old_pg_num
, new_pg_num
);
572 bool is_merge_source(unsigned old_pg_num
, unsigned new_pg_num
,
573 spg_t
*parent
) const {
575 bool r
= pgid
.is_merge_source(old_pg_num
, new_pg_num
, &out
.pgid
);
582 bool is_no_shard() const {
583 return shard
== shard_id_t::NO_SHARD
;
586 ghobject_t
make_pgmeta_oid() const {
587 return ghobject_t::make_pgmeta(pgid
.pool(), pgid
.ps(), shard
);
590 void encode(bufferlist
&bl
) const {
591 ENCODE_START(1, 1, bl
);
596 void decode(bufferlist::const_iterator
& bl
) {
603 ghobject_t
make_temp_ghobject(const string
& name
) const {
605 hobject_t(object_t(name
), "", CEPH_NOSNAP
,
607 hobject_t::get_temp_pool(pgid
.pool()),
613 unsigned hash_to_shard(unsigned num_shards
) const {
614 return ps() % num_shards
;
617 WRITE_CLASS_ENCODER(spg_t
)
618 WRITE_EQ_OPERATORS_2(spg_t
, pgid
, shard
)
619 WRITE_CMP_OPERATORS_2(spg_t
, pgid
, shard
)
622 template<> struct hash
< spg_t
>
624 size_t operator()( const spg_t
& x
) const
626 static hash
<uint32_t> H
;
627 return H(hash
<pg_t
>()(x
.pgid
) ^ x
.shard
);
632 ostream
& operator<<(ostream
& out
, const spg_t
&pg
);
634 // ----------------------
639 TYPE_LEGACY_TEMP
= 1, /* no longer used */
645 uint64_t removal_seq
; // note: deprecated, not encoded
647 char _str_buff
[spg_t::calc_name_buf_size
];
652 coll_t(type_t t
, spg_t p
, uint64_t r
)
653 : type(t
), pgid(p
), removal_seq(r
) {
658 coll_t() : type(TYPE_META
), removal_seq(0)
663 coll_t(const coll_t
& other
)
664 : type(other
.type
), pgid(other
.pgid
), removal_seq(other
.removal_seq
) {
668 explicit coll_t(spg_t pgid
)
669 : type(TYPE_PG
), pgid(pgid
), removal_seq(0)
674 coll_t
& operator=(const coll_t
& rhs
)
676 this->type
= rhs
.type
;
677 this->pgid
= rhs
.pgid
;
678 this->removal_seq
= rhs
.removal_seq
;
683 // named constructors
684 static coll_t
meta() {
687 static coll_t
pg(spg_t p
) {
691 const std::string
to_str() const {
694 const char *c_str() const {
698 bool parse(const std::string
& s
);
700 int operator<(const coll_t
&rhs
) const {
701 return type
< rhs
.type
||
702 (type
== rhs
.type
&& pgid
< rhs
.pgid
);
705 bool is_meta() const {
706 return type
== TYPE_META
;
708 bool is_pg_prefix(spg_t
*pgid_
) const {
709 if (type
== TYPE_PG
|| type
== TYPE_PG_TEMP
) {
716 return type
== TYPE_PG
;
718 bool is_pg(spg_t
*pgid_
) const {
719 if (type
== TYPE_PG
) {
725 bool is_temp() const {
726 return type
== TYPE_PG_TEMP
;
728 bool is_temp(spg_t
*pgid_
) const {
729 if (type
== TYPE_PG_TEMP
) {
736 void encode(bufferlist
& bl
) const;
737 void decode(bufferlist::const_iterator
& bl
);
738 size_t encoded_size() const;
740 inline bool operator==(const coll_t
& rhs
) const {
741 // only compare type if meta
742 if (type
!= rhs
.type
)
744 if (type
== TYPE_META
)
746 return type
== rhs
.type
&& pgid
== rhs
.pgid
;
748 inline bool operator!=(const coll_t
& rhs
) const {
749 return !(*this == rhs
);
752 // get a TEMP collection that corresponds to the current collection,
753 // which we presume is a pg collection.
754 coll_t
get_temp() const {
755 ceph_assert(type
== TYPE_PG
);
756 return coll_t(TYPE_PG_TEMP
, pgid
, 0);
759 ghobject_t
get_min_hobj() const {
763 o
.hobj
.pool
= pgid
.pool();
764 o
.set_shard(pgid
.shard
);
775 unsigned hash_to_shard(unsigned num_shards
) const {
777 return pgid
.hash_to_shard(num_shards
);
778 return 0; // whatever.
781 void dump(Formatter
*f
) const;
782 static void generate_test_instances(list
<coll_t
*>& o
);
785 WRITE_CLASS_ENCODER(coll_t
)
787 inline ostream
& operator<<(ostream
& out
, const coll_t
& c
) {
793 template<> struct hash
<coll_t
> {
794 size_t operator()(const coll_t
&c
) const {
796 string
str(c
.to_str());
797 std::string::const_iterator
end(str
.end());
798 for (std::string::const_iterator s
= str
.begin(); s
!= end
; ++s
) {
811 inline ostream
& operator<<(ostream
& out
, const ceph_object_layout
&ol
)
813 out
<< pg_t(ol
.ol_pgid
);
814 int su
= ol
.ol_stripe_unit
;
822 // compound rados version type
823 /* WARNING: If add member in eversion_t, please make sure the encode/decode function
824 * work well. For little-endian machine, we should make sure there is no padding
825 * in 32-bit machine and 64-bit machine.
832 eversion_t() : version(0), epoch(0), __pad(0) {}
833 eversion_t(epoch_t e
, version_t v
) : version(v
), epoch(e
), __pad(0) {}
835 // cppcheck-suppress noExplicitConstructor
836 eversion_t(const ceph_eversion
& ce
) :
841 explicit eversion_t(bufferlist
& bl
) : __pad(0) { decode(bl
); }
843 static const eversion_t
& max() {
844 static const eversion_t
max(-1,-1);
848 operator ceph_eversion() {
855 string
get_key_name() const;
857 // key must point to the beginning of a block of 32 chars
858 inline void get_key_name(char* key
) const {
859 // Below is equivalent of sprintf("%010u.%020llu");
861 ritoa
<uint64_t, 10, 20>(version
, key
+ 31);
863 ritoa
<uint32_t, 10, 10>(epoch
, key
+ 10);
866 void encode(bufferlist
&bl
) const {
867 #if defined(CEPH_LITTLE_ENDIAN)
868 bl
.append((char *)this, sizeof(version_t
) + sizeof(epoch_t
));
875 void decode(bufferlist::const_iterator
&bl
) {
876 #if defined(CEPH_LITTLE_ENDIAN)
877 bl
.copy(sizeof(version_t
) + sizeof(epoch_t
), (char *)this);
884 void decode(bufferlist
& bl
) {
885 auto p
= std::cbegin(bl
);
889 WRITE_CLASS_ENCODER(eversion_t
)
891 inline bool operator==(const eversion_t
& l
, const eversion_t
& r
) {
892 return (l
.epoch
== r
.epoch
) && (l
.version
== r
.version
);
894 inline bool operator!=(const eversion_t
& l
, const eversion_t
& r
) {
895 return (l
.epoch
!= r
.epoch
) || (l
.version
!= r
.version
);
897 inline bool operator<(const eversion_t
& l
, const eversion_t
& r
) {
898 return (l
.epoch
== r
.epoch
) ? (l
.version
< r
.version
):(l
.epoch
< r
.epoch
);
900 inline bool operator<=(const eversion_t
& l
, const eversion_t
& r
) {
901 return (l
.epoch
== r
.epoch
) ? (l
.version
<= r
.version
):(l
.epoch
<= r
.epoch
);
903 inline bool operator>(const eversion_t
& l
, const eversion_t
& r
) {
904 return (l
.epoch
== r
.epoch
) ? (l
.version
> r
.version
):(l
.epoch
> r
.epoch
);
906 inline bool operator>=(const eversion_t
& l
, const eversion_t
& r
) {
907 return (l
.epoch
== r
.epoch
) ? (l
.version
>= r
.version
):(l
.epoch
>= r
.epoch
);
909 inline ostream
& operator<<(ostream
& out
, const eversion_t
& e
) {
910 return out
<< e
.epoch
<< "'" << e
.version
;
914 * objectstore_perf_stat_t
916 * current perf information about the osd
918 struct objectstore_perf_stat_t
{
919 // cur_op_latency is in ns since double add/sub are not associative
920 uint64_t os_commit_latency_ns
;
921 uint64_t os_apply_latency_ns
;
923 objectstore_perf_stat_t() :
924 os_commit_latency_ns(0), os_apply_latency_ns(0) {}
926 bool operator==(const objectstore_perf_stat_t
&r
) const {
927 return os_commit_latency_ns
== r
.os_commit_latency_ns
&&
928 os_apply_latency_ns
== r
.os_apply_latency_ns
;
931 void add(const objectstore_perf_stat_t
&o
) {
932 os_commit_latency_ns
+= o
.os_commit_latency_ns
;
933 os_apply_latency_ns
+= o
.os_apply_latency_ns
;
935 void sub(const objectstore_perf_stat_t
&o
) {
936 os_commit_latency_ns
-= o
.os_commit_latency_ns
;
937 os_apply_latency_ns
-= o
.os_apply_latency_ns
;
939 void dump(Formatter
*f
) const;
940 void encode(bufferlist
&bl
, uint64_t features
) const;
941 void decode(bufferlist::const_iterator
&bl
);
942 static void generate_test_instances(std::list
<objectstore_perf_stat_t
*>& o
);
944 WRITE_CLASS_ENCODER_FEATURES(objectstore_perf_stat_t
)
949 #define PG_STATE_CREATING (1ULL << 0) // creating
950 #define PG_STATE_ACTIVE (1ULL << 1) // i am active. (primary: replicas too)
951 #define PG_STATE_CLEAN (1ULL << 2) // peers are complete, clean of stray replicas.
952 #define PG_STATE_DOWN (1ULL << 4) // a needed replica is down, PG offline
953 #define PG_STATE_RECOVERY_UNFOUND (1ULL << 5) // recovery stopped due to unfound
954 #define PG_STATE_BACKFILL_UNFOUND (1ULL << 6) // backfill stopped due to unfound
955 #define PG_STATE_PREMERGE (1ULL << 7) // i am preparing to merge
956 #define PG_STATE_SCRUBBING (1ULL << 8) // scrubbing
957 //#define PG_STATE_SCRUBQ (1ULL << 9) // queued for scrub
958 #define PG_STATE_DEGRADED (1ULL << 10) // pg contains objects with reduced redundancy
959 #define PG_STATE_INCONSISTENT (1ULL << 11) // pg replicas are inconsistent (but shouldn't be)
960 #define PG_STATE_PEERING (1ULL << 12) // pg is (re)peering
961 #define PG_STATE_REPAIR (1ULL << 13) // pg should repair on next scrub
962 #define PG_STATE_RECOVERING (1ULL << 14) // pg is recovering/migrating objects
963 #define PG_STATE_BACKFILL_WAIT (1ULL << 15) // [active] reserving backfill
964 #define PG_STATE_INCOMPLETE (1ULL << 16) // incomplete content, peering failed.
965 #define PG_STATE_STALE (1ULL << 17) // our state for this pg is stale, unknown.
966 #define PG_STATE_REMAPPED (1ULL << 18) // pg is explicitly remapped to different OSDs than CRUSH
967 #define PG_STATE_DEEP_SCRUB (1ULL << 19) // deep scrub: check CRC32 on files
968 #define PG_STATE_BACKFILLING (1ULL << 20) // [active] backfilling pg content
969 #define PG_STATE_BACKFILL_TOOFULL (1ULL << 21) // backfill can't proceed: too full
970 #define PG_STATE_RECOVERY_WAIT (1ULL << 22) // waiting for recovery reservations
971 #define PG_STATE_UNDERSIZED (1ULL << 23) // pg acting < pool size
972 #define PG_STATE_ACTIVATING (1ULL << 24) // pg is peered but not yet active
973 #define PG_STATE_PEERED (1ULL << 25) // peered, cannot go active, can recover
974 #define PG_STATE_SNAPTRIM (1ULL << 26) // trimming snaps
975 #define PG_STATE_SNAPTRIM_WAIT (1ULL << 27) // queued to trim snaps
976 #define PG_STATE_RECOVERY_TOOFULL (1ULL << 28) // recovery can't proceed: too full
977 #define PG_STATE_SNAPTRIM_ERROR (1ULL << 29) // error stopped trimming snaps
978 #define PG_STATE_FORCED_RECOVERY (1ULL << 30) // force recovery of this pg before any other
979 #define PG_STATE_FORCED_BACKFILL (1ULL << 31) // force backfill of this pg before any other
980 #define PG_STATE_FAILED_REPAIR (1ULL << 32) // A repair failed to fix all errors
982 std::string
pg_state_string(uint64_t state
);
983 std::string
pg_vector_string(const vector
<int32_t> &a
);
984 boost::optional
<uint64_t> pg_string_state(const std::string
& state
);
990 * attributes for a single pool snapshot.
992 struct pool_snap_info_t
{
997 void dump(Formatter
*f
) const;
998 void encode(bufferlist
& bl
, uint64_t features
) const;
999 void decode(bufferlist::const_iterator
& bl
);
1000 static void generate_test_instances(list
<pool_snap_info_t
*>& o
);
1002 WRITE_CLASS_ENCODER_FEATURES(pool_snap_info_t
)
1004 inline ostream
& operator<<(ostream
& out
, const pool_snap_info_t
& si
) {
1005 return out
<< si
.snapid
<< '(' << si
.name
<< ' ' << si
.stamp
<< ')';
1020 DEEP_SCRUB_INTERVAL
,
1022 RECOVERY_OP_PRIORITY
,
1025 COMPRESSION_ALGORITHM
,
1026 COMPRESSION_REQUIRED_RATIO
,
1027 COMPRESSION_MAX_BLOB_SIZE
,
1028 COMPRESSION_MIN_BLOB_SIZE
,
1032 FINGERPRINT_ALGORITHM
,
1033 PG_NUM_MIN
, // min pg_num
1034 TARGET_SIZE_BYTES
, // total bytes in pool
1035 TARGET_SIZE_RATIO
, // fraction of total cluster
1049 opt_desc_t(key_t k
, type_t t
) : key(k
), type(t
) {}
1051 bool operator==(const opt_desc_t
& rhs
) const {
1052 return key
== rhs
.key
&& type
== rhs
.type
;
1056 typedef boost::variant
<std::string
,int64_t,double> value_t
;
1058 static bool is_opt_name(const std::string
& name
);
1059 static opt_desc_t
get_opt_desc(const std::string
& name
);
1061 pool_opts_t() : opts() {}
1063 bool is_set(key_t key
) const;
1065 template<typename T
>
1066 void set(key_t key
, const T
&val
) {
1067 value_t value
= val
;
1071 template<typename T
>
1072 bool get(key_t key
, T
*val
) const {
1073 opts_t::const_iterator i
= opts
.find(key
);
1074 if (i
== opts
.end()) {
1077 *val
= boost::get
<T
>(i
->second
);
1081 const value_t
& get(key_t key
) const;
1083 bool unset(key_t key
);
1085 void dump(const std::string
& name
, Formatter
*f
) const;
1087 void dump(Formatter
*f
) const;
1088 void encode(bufferlist
&bl
, uint64_t features
) const;
1089 void decode(bufferlist::const_iterator
&bl
);
1092 typedef std::map
<key_t
, value_t
> opts_t
;
1095 friend ostream
& operator<<(ostream
& out
, const pool_opts_t
& opts
);
1097 WRITE_CLASS_ENCODER_FEATURES(pool_opts_t
)
1099 struct pg_merge_meta_t
{
1101 epoch_t ready_epoch
= 0;
1102 epoch_t last_epoch_started
= 0;
1103 epoch_t last_epoch_clean
= 0;
1104 eversion_t source_version
;
1105 eversion_t target_version
;
1107 void encode(bufferlist
& bl
) const {
1108 ENCODE_START(1, 1, bl
);
1109 encode(source_pgid
, bl
);
1110 encode(ready_epoch
, bl
);
1111 encode(last_epoch_started
, bl
);
1112 encode(last_epoch_clean
, bl
);
1113 encode(source_version
, bl
);
1114 encode(target_version
, bl
);
1117 void decode(bufferlist::const_iterator
& p
) {
1119 decode(source_pgid
, p
);
1120 decode(ready_epoch
, p
);
1121 decode(last_epoch_started
, p
);
1122 decode(last_epoch_clean
, p
);
1123 decode(source_version
, p
);
1124 decode(target_version
, p
);
1127 void dump(Formatter
*f
) const {
1128 f
->dump_stream("source_pgid") << source_pgid
;
1129 f
->dump_unsigned("ready_epoch", ready_epoch
);
1130 f
->dump_unsigned("last_epoch_started", last_epoch_started
);
1131 f
->dump_unsigned("last_epoch_clean", last_epoch_clean
);
1132 f
->dump_stream("source_version") << source_version
;
1133 f
->dump_stream("target_version") << target_version
;
1136 WRITE_CLASS_ENCODER(pg_merge_meta_t
)
1142 static const char *APPLICATION_NAME_CEPHFS
;
1143 static const char *APPLICATION_NAME_RBD
;
1144 static const char *APPLICATION_NAME_RGW
;
1147 TYPE_REPLICATED
= 1, // replication
1148 //TYPE_RAID4 = 2, // raid4 (never implemented)
1149 TYPE_ERASURE
= 3, // erasure-coded
1151 static std::string_view
get_type_name(int t
) {
1153 case TYPE_REPLICATED
: return "replicated";
1154 //case TYPE_RAID4: return "raid4";
1155 case TYPE_ERASURE
: return "erasure";
1156 default: return "???";
1159 std::string_view
get_type_name() const {
1160 return get_type_name(type
);
// Pool behaviour flags; a bitmask of these values is stored in `flags`.
enum {
  FLAG_HASHPSPOOL = 1<<0, // hash pg seed and pool together (instead of adding)
  FLAG_FULL = 1<<1, // pool is full
  FLAG_EC_OVERWRITES = 1<<2, // enables overwrites, once enabled, cannot be disabled
  FLAG_INCOMPLETE_CLONES = 1<<3, // may have incomplete clones (bc we are/were an overlay)
  FLAG_NODELETE = 1<<4, // pool can't be deleted
  FLAG_NOPGCHANGE = 1<<5, // pool's pg and pgp num can't be changed
  FLAG_NOSIZECHANGE = 1<<6, // pool's size and min size can't be changed
  FLAG_WRITE_FADVISE_DONTNEED = 1<<7, // write mode with LIBRADOS_OP_FLAG_FADVISE_DONTNEED
  FLAG_NOSCRUB = 1<<8, // block periodic scrub
  FLAG_NODEEP_SCRUB = 1<<9, // block periodic deep-scrub
  FLAG_FULL_QUOTA = 1<<10, // pool is currently running out of quota, will set FLAG_FULL too
  FLAG_NEARFULL = 1<<11, // pool is nearfull
  FLAG_BACKFILLFULL = 1<<12, // pool is backfillfull
  FLAG_SELFMANAGED_SNAPS = 1<<13, // pool uses selfmanaged snaps
  FLAG_POOL_SNAPS = 1<<14, // pool has pool snaps
  FLAG_CREATING = 1<<15, // initial pool PGs are being created
};

/// Short name for a single flag bit; "???" for an unrecognized bit.
static const char *get_flag_name(int f) {
  switch (f) {
  case FLAG_HASHPSPOOL: return "hashpspool";
  case FLAG_FULL: return "full";
  case FLAG_EC_OVERWRITES: return "ec_overwrites";
  case FLAG_INCOMPLETE_CLONES: return "incomplete_clones";
  case FLAG_NODELETE: return "nodelete";
  case FLAG_NOPGCHANGE: return "nopgchange";
  case FLAG_NOSIZECHANGE: return "nosizechange";
  case FLAG_WRITE_FADVISE_DONTNEED: return "write_fadvise_dontneed";
  case FLAG_NOSCRUB: return "noscrub";
  case FLAG_NODEEP_SCRUB: return "nodeep-scrub";
  case FLAG_FULL_QUOTA: return "full_quota";
  case FLAG_NEARFULL: return "nearfull";
  case FLAG_BACKFILLFULL: return "backfillfull";
  case FLAG_SELFMANAGED_SNAPS: return "selfmanaged_snaps";
  case FLAG_POOL_SNAPS: return "pool_snaps";
  case FLAG_CREATING: return "creating";
  default: return "???";
  }
}

/// Comma-separated names of every flag set in @p f, in ascending bit order.
static std::string get_flags_string(uint64_t f) {
  std::string s;
  for (unsigned n = 0; f && n < 64; ++n) {
    if (f & (1ull << n)) {
      if (s.length())
	s += ",";
      s += get_flag_name(1ull << n);
      f &= ~(1ull << n);  // clear so the `f &&` loop guard can stop early
    }
  }
  return s;
}
1214 string
get_flags_string() const {
1215 return get_flags_string(flags
);
1217 static uint64_t get_flag_by_name(const string
& name
) {
1218 if (name
== "hashpspool")
1219 return FLAG_HASHPSPOOL
;
1222 if (name
== "ec_overwrites")
1223 return FLAG_EC_OVERWRITES
;
1224 if (name
== "incomplete_clones")
1225 return FLAG_INCOMPLETE_CLONES
;
1226 if (name
== "nodelete")
1227 return FLAG_NODELETE
;
1228 if (name
== "nopgchange")
1229 return FLAG_NOPGCHANGE
;
1230 if (name
== "nosizechange")
1231 return FLAG_NOSIZECHANGE
;
1232 if (name
== "write_fadvise_dontneed")
1233 return FLAG_WRITE_FADVISE_DONTNEED
;
1234 if (name
== "noscrub")
1235 return FLAG_NOSCRUB
;
1236 if (name
== "nodeep-scrub")
1237 return FLAG_NODEEP_SCRUB
;
1238 if (name
== "full_quota")
1239 return FLAG_FULL_QUOTA
;
1240 if (name
== "nearfull")
1241 return FLAG_NEARFULL
;
1242 if (name
== "backfillfull")
1243 return FLAG_BACKFILLFULL
;
1244 if (name
== "selfmanaged_snaps")
1245 return FLAG_SELFMANAGED_SNAPS
;
1246 if (name
== "pool_snaps")
1247 return FLAG_POOL_SNAPS
;
1248 if (name
== "creating")
1249 return FLAG_CREATING
;
1253 /// converts the acting/up vector to a set of pg shards
1254 void convert_to_pg_shards(const vector
<int> &from
, set
<pg_shard_t
>* to
) const;
1257 CACHEMODE_NONE
= 0, ///< no caching
1258 CACHEMODE_WRITEBACK
= 1, ///< write to cache, flush later
1259 CACHEMODE_FORWARD
= 2, ///< forward if not in cache
1260 CACHEMODE_READONLY
= 3, ///< handle reads, forward writes [not strongly consistent]
1261 CACHEMODE_READFORWARD
= 4, ///< forward reads, write to cache flush later
1262 CACHEMODE_READPROXY
= 5, ///< proxy reads, write to cache flush later
1263 CACHEMODE_PROXY
= 6, ///< proxy if not in cache
1265 static const char *get_cache_mode_name(cache_mode_t m
) {
1267 case CACHEMODE_NONE
: return "none";
1268 case CACHEMODE_WRITEBACK
: return "writeback";
1269 case CACHEMODE_FORWARD
: return "forward";
1270 case CACHEMODE_READONLY
: return "readonly";
1271 case CACHEMODE_READFORWARD
: return "readforward";
1272 case CACHEMODE_READPROXY
: return "readproxy";
1273 case CACHEMODE_PROXY
: return "proxy";
1274 default: return "unknown";
1277 static cache_mode_t
get_cache_mode_from_str(const string
& s
) {
1279 return CACHEMODE_NONE
;
1280 if (s
== "writeback")
1281 return CACHEMODE_WRITEBACK
;
1283 return CACHEMODE_FORWARD
;
1284 if (s
== "readonly")
1285 return CACHEMODE_READONLY
;
1286 if (s
== "readforward")
1287 return CACHEMODE_READFORWARD
;
1288 if (s
== "readproxy")
1289 return CACHEMODE_READPROXY
;
1291 return CACHEMODE_PROXY
;
1292 return (cache_mode_t
)-1;
1294 const char *get_cache_mode_name() const {
1295 return get_cache_mode_name(cache_mode
);
1297 bool cache_mode_requires_hit_set() const {
1298 switch (cache_mode
) {
1299 case CACHEMODE_NONE
:
1300 case CACHEMODE_FORWARD
:
1301 case CACHEMODE_READONLY
:
1302 case CACHEMODE_PROXY
:
1304 case CACHEMODE_WRITEBACK
:
1305 case CACHEMODE_READFORWARD
:
1306 case CACHEMODE_READPROXY
:
1309 ceph_abort_msg("implement me");
1314 PG_AUTOSCALE_MODE_OFF
= 0,
1315 PG_AUTOSCALE_MODE_WARN
= 1,
1316 PG_AUTOSCALE_MODE_ON
= 2,
1318 static const char *get_pg_autoscale_mode_name(int m
) {
1320 case PG_AUTOSCALE_MODE_OFF
: return "off";
1321 case PG_AUTOSCALE_MODE_ON
: return "on";
1322 case PG_AUTOSCALE_MODE_WARN
: return "warn";
1323 default: return "???";
1326 static int get_pg_autoscale_mode_by_name(const string
& m
) {
1328 return PG_AUTOSCALE_MODE_OFF
;
1331 return PG_AUTOSCALE_MODE_WARN
;
1334 return PG_AUTOSCALE_MODE_ON
;
1339 utime_t create_time
;
1340 uint64_t flags
; ///< FLAG_*
1341 __u8 type
; ///< TYPE_*
1342 __u8 size
, min_size
; ///< number of osds in each pg
1343 __u8 crush_rule
; ///< crush placement rule
1344 __u8 object_hash
; ///< hash mapping object name to ps
1345 __u8 pg_autoscale_mode
; ///< PG_AUTOSCALE_MODE_
1347 __u32 pg_num
= 0, pgp_num
= 0; ///< number of pgs
1348 __u32 pg_num_pending
= 0; ///< pg_num we are about to merge down to
1349 __u32 pg_num_target
= 0; ///< pg_num we should converge toward
1350 __u32 pgp_num_target
= 0; ///< pgp_num we should converge toward
1353 map
<string
,string
> properties
; ///< OBSOLETE
1354 string erasure_code_profile
; ///< name of the erasure code profile in OSDMap
1355 epoch_t last_change
; ///< most recent epoch changed, exclusing snapshot changes
1357 /// last epoch that forced clients to resend
1358 epoch_t last_force_op_resend
= 0;
1359 /// last epoch that forced clients to resend (pre-nautilus clients only)
1360 epoch_t last_force_op_resend_prenautilus
= 0;
1361 /// last epoch that forced clients to resend (pre-luminous clients only)
1362 epoch_t last_force_op_resend_preluminous
= 0;
1364 /// metadata for the most recent PG merge
1365 pg_merge_meta_t last_pg_merge_meta
;
1367 snapid_t snap_seq
; ///< seq for per-pool snapshot
1368 epoch_t snap_epoch
; ///< osdmap epoch of last snap
1369 uint64_t auid
; ///< who owns the pg
1371 uint64_t quota_max_bytes
; ///< maximum number of bytes for this pool
1372 uint64_t quota_max_objects
; ///< maximum number of objects for this pool
1375 * Pool snaps (global to this pool). These define a SnapContext for
1376 * the pool, unless the client manually specifies an alternate
1379 map
<snapid_t
, pool_snap_info_t
> snaps
;
1381 * Alternatively, if we are defining non-pool snaps (e.g. via the
1382 * Ceph MDS), we must track @removed_snaps (since @snaps is not
1383 * used). Snaps and removed_snaps are to be used exclusive of each
1386 interval_set
<snapid_t
> removed_snaps
;
1388 unsigned pg_num_mask
, pgp_num_mask
;
1390 set
<uint64_t> tiers
; ///< pools that are tiers of us
1391 int64_t tier_of
; ///< pool for which we are a tier
1392 // Note that write wins for read+write ops
1393 int64_t read_tier
; ///< pool/tier for objecter to direct reads to
1394 int64_t write_tier
; ///< pool/tier for objecter to direct writes to
1395 cache_mode_t cache_mode
; ///< cache pool mode
1397 bool is_tier() const { return tier_of
>= 0; }
1398 bool has_tiers() const { return !tiers
.empty(); }
1403 clear_tier_tunables();
1405 bool has_read_tier() const { return read_tier
>= 0; }
1406 void clear_read_tier() { read_tier
= -1; }
1407 bool has_write_tier() const { return write_tier
>= 0; }
1408 void clear_write_tier() { write_tier
= -1; }
1409 void clear_tier_tunables() {
1410 if (cache_mode
!= CACHEMODE_NONE
)
1411 flags
|= FLAG_INCOMPLETE_CLONES
;
1412 cache_mode
= CACHEMODE_NONE
;
1414 target_max_bytes
= 0;
1415 target_max_objects
= 0;
1416 cache_target_dirty_ratio_micro
= 0;
1417 cache_target_dirty_high_ratio_micro
= 0;
1418 cache_target_full_ratio_micro
= 0;
1419 hit_set_params
= HitSet::Params();
1422 hit_set_grade_decay_rate
= 0;
1423 hit_set_search_last_n
= 0;
1424 grade_table
.resize(0);
1427 uint64_t target_max_bytes
; ///< tiering: target max pool size
1428 uint64_t target_max_objects
; ///< tiering: target max pool size
1430 uint32_t cache_target_dirty_ratio_micro
; ///< cache: fraction of target to leave dirty
1431 uint32_t cache_target_dirty_high_ratio_micro
; ///< cache: fraction of target to flush with high speed
1432 uint32_t cache_target_full_ratio_micro
; ///< cache: fraction of target to fill before we evict in earnest
1434 uint32_t cache_min_flush_age
; ///< minimum age (seconds) before we can flush
1435 uint32_t cache_min_evict_age
; ///< minimum age (seconds) before we can evict
1437 HitSet::Params hit_set_params
; ///< The HitSet params to use on this pool
1438 uint32_t hit_set_period
; ///< periodicity of HitSet segments (seconds)
1439 uint32_t hit_set_count
; ///< number of periods to retain
1440 bool use_gmt_hitset
; ///< use gmt to name the hitset archive object
1441 uint32_t min_read_recency_for_promote
; ///< minimum number of HitSet to check before promote on read
1442 uint32_t min_write_recency_for_promote
; ///< minimum number of HitSet to check before promote on write
1443 uint32_t hit_set_grade_decay_rate
; ///< current hit_set has highest priority on objects
1444 ///< temperature count,the follow hit_set's priority decay
1445 ///< by this params than pre hit_set
1446 uint32_t hit_set_search_last_n
; ///< accumulate atmost N hit_sets for temperature
1448 uint32_t stripe_width
; ///< erasure coded stripe size in bytes
1450 uint64_t expected_num_objects
; ///< expected number of objects on this pool, a value of 0 indicates
1451 ///< user does not specify any expected value
1452 bool fast_read
; ///< whether turn on fast read on the pool or not
1454 pool_opts_t opts
; ///< options
1457 TYPE_FINGERPRINT_NONE
= 0,
1458 TYPE_FINGERPRINT_SHA1
= 1,
1460 static fingerprint_t
get_fingerprint_from_str(const string
& s
) {
1462 return TYPE_FINGERPRINT_NONE
;
1464 return TYPE_FINGERPRINT_SHA1
;
1465 return (fingerprint_t
)-1;
1467 const fingerprint_t
get_fingerprint_type() const {
1469 opts
.get(pool_opts_t::FINGERPRINT_ALGORITHM
, &fp_str
);
1470 return get_fingerprint_from_str(fp_str
);
1472 const char *get_fingerprint_name() const {
1475 opts
.get(pool_opts_t::FINGERPRINT_ALGORITHM
, &fp_str
);
1476 fp_t
= get_fingerprint_from_str(fp_str
);
1477 return get_fingerprint_name(fp_t
);
1479 static const char *get_fingerprint_name(fingerprint_t m
) {
1481 case TYPE_FINGERPRINT_NONE
: return "none";
1482 case TYPE_FINGERPRINT_SHA1
: return "sha1";
1483 default: return "unknown";
1487 /// application -> key/value metadata
1488 map
<string
, std::map
<string
, string
>> application_metadata
;
1491 vector
<uint32_t> grade_table
;
1494 uint32_t get_grade(unsigned i
) const {
1495 if (grade_table
.size() <= i
)
1497 return grade_table
[i
];
1499 void calc_grade_table() {
1500 unsigned v
= 1000000;
1501 grade_table
.resize(hit_set_count
);
1502 for (unsigned i
= 0; i
< hit_set_count
; i
++) {
1503 v
= v
* (1 - (hit_set_grade_decay_rate
/ 100.0));
1509 : flags(0), type(0), size(0), min_size(0),
1510 crush_rule(0), object_hash(0),
1512 snap_seq(0), snap_epoch(0),
1514 quota_max_bytes(0), quota_max_objects(0),
1515 pg_num_mask(0), pgp_num_mask(0),
1516 tier_of(-1), read_tier(-1), write_tier(-1),
1517 cache_mode(CACHEMODE_NONE
),
1518 target_max_bytes(0), target_max_objects(0),
1519 cache_target_dirty_ratio_micro(0),
1520 cache_target_dirty_high_ratio_micro(0),
1521 cache_target_full_ratio_micro(0),
1522 cache_min_flush_age(0),
1523 cache_min_evict_age(0),
1527 use_gmt_hitset(true),
1528 min_read_recency_for_promote(0),
1529 min_write_recency_for_promote(0),
1530 hit_set_grade_decay_rate(0),
1531 hit_set_search_last_n(0),
1533 expected_num_objects(0),
1538 void dump(Formatter
*f
) const;
1540 const utime_t
&get_create_time() const { return create_time
; }
1541 uint64_t get_flags() const { return flags
; }
1542 bool has_flag(uint64_t f
) const { return flags
& f
; }
1543 void set_flag(uint64_t f
) { flags
|= f
; }
1544 void unset_flag(uint64_t f
) { flags
&= ~f
; }
1546 bool require_rollback() const {
1547 return is_erasure();
1550 /// true if incomplete clones may be present
1551 bool allow_incomplete_clones() const {
1552 return cache_mode
!= CACHEMODE_NONE
|| has_flag(FLAG_INCOMPLETE_CLONES
);
1555 unsigned get_type() const { return type
; }
1556 unsigned get_size() const { return size
; }
1557 unsigned get_min_size() const { return min_size
; }
1558 int get_crush_rule() const { return crush_rule
; }
1559 int get_object_hash() const { return object_hash
; }
1560 const char *get_object_hash_name() const {
1561 return ceph_str_hash_name(get_object_hash());
1563 epoch_t
get_last_change() const { return last_change
; }
1564 epoch_t
get_last_force_op_resend() const { return last_force_op_resend
; }
1565 epoch_t
get_last_force_op_resend_prenautilus() const {
1566 return last_force_op_resend_prenautilus
;
1568 epoch_t
get_last_force_op_resend_preluminous() const {
1569 return last_force_op_resend_preluminous
;
1571 epoch_t
get_snap_epoch() const { return snap_epoch
; }
1572 snapid_t
get_snap_seq() const { return snap_seq
; }
1573 uint64_t get_auid() const { return auid
; }
1575 void set_snap_seq(snapid_t s
) { snap_seq
= s
; }
1576 void set_snap_epoch(epoch_t e
) { snap_epoch
= e
; }
1578 void set_stripe_width(uint32_t s
) { stripe_width
= s
; }
1579 uint32_t get_stripe_width() const { return stripe_width
; }
1581 bool is_replicated() const { return get_type() == TYPE_REPLICATED
; }
1582 bool is_erasure() const { return get_type() == TYPE_ERASURE
; }
1584 bool supports_omap() const {
1585 return !(get_type() == TYPE_ERASURE
);
1588 bool requires_aligned_append() const {
1589 return is_erasure() && !has_flag(FLAG_EC_OVERWRITES
);
1591 uint64_t required_alignment() const { return stripe_width
; }
1593 bool allows_ecoverwrites() const {
1594 return has_flag(FLAG_EC_OVERWRITES
);
1597 bool can_shift_osds() const {
1598 switch (get_type()) {
1599 case TYPE_REPLICATED
:
1604 ceph_abort_msg("unhandled pool type");
1608 unsigned get_pg_num() const { return pg_num
; }
1609 unsigned get_pgp_num() const { return pgp_num
; }
1610 unsigned get_pg_num_target() const { return pg_num_target
; }
1611 unsigned get_pgp_num_target() const { return pgp_num_target
; }
1612 unsigned get_pg_num_pending() const { return pg_num_pending
; }
1614 unsigned get_pg_num_mask() const { return pg_num_mask
; }
1615 unsigned get_pgp_num_mask() const { return pgp_num_mask
; }
1617 // if pg_num is not a multiple of two, pgs are not equally sized.
1618 // return, for a given pg, the fraction (denominator) of the total
1619 // pool size that it represents.
1620 unsigned get_pg_num_divisor(pg_t pgid
) const;
1622 bool is_pending_merge(pg_t pgid
, bool *target
) const;
1624 void set_pg_num(int p
) {
1629 void set_pgp_num(int p
) {
1633 void set_pg_num_pending(int p
) {
1637 void set_pg_num_target(int p
) {
1640 void set_pgp_num_target(int p
) {
1643 void dec_pg_num(pg_t source_pgid
,
1644 epoch_t ready_epoch
,
1645 eversion_t source_version
,
1646 eversion_t target_version
,
1647 epoch_t last_epoch_started
,
1648 epoch_t last_epoch_clean
) {
1650 last_pg_merge_meta
.source_pgid
= source_pgid
;
1651 last_pg_merge_meta
.ready_epoch
= ready_epoch
;
1652 last_pg_merge_meta
.source_version
= source_version
;
1653 last_pg_merge_meta
.target_version
= target_version
;
1654 last_pg_merge_meta
.last_epoch_started
= last_epoch_started
;
1655 last_pg_merge_meta
.last_epoch_clean
= last_epoch_clean
;
1659 void set_quota_max_bytes(uint64_t m
) {
1660 quota_max_bytes
= m
;
1662 uint64_t get_quota_max_bytes() {
1663 return quota_max_bytes
;
1666 void set_quota_max_objects(uint64_t m
) {
1667 quota_max_objects
= m
;
1669 uint64_t get_quota_max_objects() {
1670 return quota_max_objects
;
1673 void set_last_force_op_resend(uint64_t t
) {
1674 last_force_op_resend
= t
;
1675 last_force_op_resend_prenautilus
= t
;
1676 last_force_op_resend_preluminous
= t
;
1679 void calc_pg_masks();
1682 * we have two snap modes:
1683 * - pool global snaps
1684 * - snap existence/non-existence defined by snaps[] and snap_seq
1685 * - user managed snaps
1686 * - removal governed by removed_snaps
1688 * we know which mode we're using based on whether removed_snaps is empty.
1689 * If nothing has been created, both functions report false.
1691 bool is_pool_snaps_mode() const;
1692 bool is_unmanaged_snaps_mode() const;
1693 bool is_removed_snap(snapid_t s
) const;
1696 * build set of known-removed sets from either pool snaps or
1697 * explicit removed_snaps set.
1699 void build_removed_snaps(interval_set
<snapid_t
>& rs
) const;
1700 bool maybe_updated_removed_snaps(const interval_set
<snapid_t
>& cached
) const;
1701 snapid_t
snap_exists(const char *s
) const;
1702 void add_snap(const char *n
, utime_t stamp
);
1703 void add_unmanaged_snap(uint64_t& snapid
);
1704 void remove_snap(snapid_t s
);
1705 void remove_unmanaged_snap(snapid_t s
);
1707 SnapContext
get_snap_context() const;
1709 /// hash a object name+namespace key to a hash position
1710 uint32_t hash_key(const string
& key
, const string
& ns
) const;
1712 /// round a hash position down to a pg num
1713 uint32_t raw_hash_to_pg(uint32_t v
) const;
1716 * map a raw pg (with full precision ps) into an actual pg, for storage
1718 pg_t
raw_pg_to_pg(pg_t pg
) const;
1721 * map raw pg (full precision ps) into a placement seed. include
1722 * pool id in that value so that different pools don't use the same
1725 ps_t
raw_pg_to_pps(pg_t pg
) const;
1727 /// choose a random hash position within a pg
1728 uint32_t get_random_pg_position(pg_t pgid
, uint32_t seed
) const;
1730 void encode(bufferlist
& bl
, uint64_t features
) const;
1731 void decode(bufferlist::const_iterator
& bl
);
1733 static void generate_test_instances(list
<pg_pool_t
*>& o
);
1735 WRITE_CLASS_ENCODER_FEATURES(pg_pool_t
)
1737 ostream
& operator<<(ostream
& out
, const pg_pool_t
& p
);
1741 * a summation of object stats
1743 * This is just a container for object stats; we don't know what for.
1745 * If you add members in object_stat_sum_t, you should make sure there are
1746 * not padding among these members.
1747 * You should also modify the padding_check function.
1750 struct object_stat_sum_t
{
1751 /**************************************************************************
1752 * WARNING: be sure to update operator==, floor, and split when
1753 * adding/removing fields!
1754 **************************************************************************/
1755 int64_t num_bytes
; // in bytes
1756 int64_t num_objects
;
1757 int64_t num_object_clones
;
1758 int64_t num_object_copies
; // num_objects * num_replicas
1759 int64_t num_objects_missing_on_primary
;
1760 int64_t num_objects_degraded
;
1761 int64_t num_objects_unfound
;
1766 int64_t num_scrub_errors
; // total deep and shallow scrub errors
1767 int64_t num_objects_recovered
;
1768 int64_t num_bytes_recovered
;
1769 int64_t num_keys_recovered
;
1770 int64_t num_shallow_scrub_errors
;
1771 int64_t num_deep_scrub_errors
;
1772 int64_t num_objects_dirty
;
1773 int64_t num_whiteouts
;
1774 int64_t num_objects_omap
;
1775 int64_t num_objects_hit_set_archive
;
1776 int64_t num_objects_misplaced
;
1777 int64_t num_bytes_hit_set_archive
;
1779 int64_t num_flush_kb
;
1781 int64_t num_evict_kb
;
1782 int64_t num_promote
;
1783 int32_t num_flush_mode_high
; // 1 when in high flush mode, otherwise 0
1784 int32_t num_flush_mode_low
; // 1 when in low flush mode, otherwise 0
1785 int32_t num_evict_mode_some
; // 1 when in evict some mode, otherwise 0
1786 int32_t num_evict_mode_full
; // 1 when in evict full mode, otherwise 0
1787 int64_t num_objects_pinned
;
1788 int64_t num_objects_missing
;
1789 int64_t num_legacy_snapsets
; ///< upper bound on pre-luminous-style SnapSets
1790 int64_t num_large_omap_objects
= 0;
1791 int64_t num_objects_manifest
= 0;
1792 int64_t num_omap_bytes
= 0;
1793 int64_t num_omap_keys
= 0;
1794 int64_t num_objects_repaired
= 0;
1798 num_objects(0), num_object_clones(0), num_object_copies(0),
1799 num_objects_missing_on_primary(0), num_objects_degraded(0),
1800 num_objects_unfound(0),
1801 num_rd(0), num_rd_kb(0), num_wr(0), num_wr_kb(0),
1802 num_scrub_errors(0),
1803 num_objects_recovered(0),
1804 num_bytes_recovered(0),
1805 num_keys_recovered(0),
1806 num_shallow_scrub_errors(0),
1807 num_deep_scrub_errors(0),
1808 num_objects_dirty(0),
1810 num_objects_omap(0),
1811 num_objects_hit_set_archive(0),
1812 num_objects_misplaced(0),
1813 num_bytes_hit_set_archive(0),
1819 num_flush_mode_high(0), num_flush_mode_low(0),
1820 num_evict_mode_some(0), num_evict_mode_full(0),
1821 num_objects_pinned(0),
1822 num_objects_missing(0),
1823 num_legacy_snapsets(0)
1826 void floor(int64_t f
) {
1827 #define FLOOR(x) if (x < f) x = f
1830 FLOOR(num_object_clones
);
1831 FLOOR(num_object_copies
);
1832 FLOOR(num_objects_missing_on_primary
);
1833 FLOOR(num_objects_missing
);
1834 FLOOR(num_objects_degraded
);
1835 FLOOR(num_objects_misplaced
);
1836 FLOOR(num_objects_unfound
);
1841 FLOOR(num_large_omap_objects
);
1842 FLOOR(num_objects_manifest
);
1843 FLOOR(num_omap_bytes
);
1844 FLOOR(num_omap_keys
);
1845 FLOOR(num_shallow_scrub_errors
);
1846 FLOOR(num_deep_scrub_errors
);
1847 num_scrub_errors
= num_shallow_scrub_errors
+ num_deep_scrub_errors
;
1848 FLOOR(num_objects_recovered
);
1849 FLOOR(num_bytes_recovered
);
1850 FLOOR(num_keys_recovered
);
1851 FLOOR(num_objects_dirty
);
1852 FLOOR(num_whiteouts
);
1853 FLOOR(num_objects_omap
);
1854 FLOOR(num_objects_hit_set_archive
);
1855 FLOOR(num_bytes_hit_set_archive
);
1857 FLOOR(num_flush_kb
);
1859 FLOOR(num_evict_kb
);
1861 FLOOR(num_flush_mode_high
);
1862 FLOOR(num_flush_mode_low
);
1863 FLOOR(num_evict_mode_some
);
1864 FLOOR(num_evict_mode_full
);
1865 FLOOR(num_objects_pinned
);
1866 FLOOR(num_legacy_snapsets
);
1867 FLOOR(num_objects_repaired
);
1871 void split(vector
<object_stat_sum_t
> &out
) const {
1872 #define SPLIT(PARAM) \
1873 for (unsigned i = 0; i < out.size(); ++i) { \
1874 out[i].PARAM = PARAM / out.size(); \
1875 if (i < (PARAM % out.size())) { \
1879 #define SPLIT_PRESERVE_NONZERO(PARAM) \
1880 for (unsigned i = 0; i < out.size(); ++i) { \
1882 out[i].PARAM = 1 + PARAM / out.size(); \
1889 SPLIT(num_object_clones
);
1890 SPLIT(num_object_copies
);
1891 SPLIT(num_objects_missing_on_primary
);
1892 SPLIT(num_objects_missing
);
1893 SPLIT(num_objects_degraded
);
1894 SPLIT(num_objects_misplaced
);
1895 SPLIT(num_objects_unfound
);
1900 SPLIT(num_large_omap_objects
);
1901 SPLIT(num_objects_manifest
);
1902 SPLIT(num_omap_bytes
);
1903 SPLIT(num_omap_keys
);
1904 SPLIT(num_objects_repaired
);
1905 SPLIT_PRESERVE_NONZERO(num_shallow_scrub_errors
);
1906 SPLIT_PRESERVE_NONZERO(num_deep_scrub_errors
);
1907 for (unsigned i
= 0; i
< out
.size(); ++i
) {
1908 out
[i
].num_scrub_errors
= out
[i
].num_shallow_scrub_errors
+
1909 out
[i
].num_deep_scrub_errors
;
1911 SPLIT(num_objects_recovered
);
1912 SPLIT(num_bytes_recovered
);
1913 SPLIT(num_keys_recovered
);
1914 SPLIT(num_objects_dirty
);
1915 SPLIT(num_whiteouts
);
1916 SPLIT(num_objects_omap
);
1917 SPLIT(num_objects_hit_set_archive
);
1918 SPLIT(num_bytes_hit_set_archive
);
1920 SPLIT(num_flush_kb
);
1922 SPLIT(num_evict_kb
);
1924 SPLIT(num_flush_mode_high
);
1925 SPLIT(num_flush_mode_low
);
1926 SPLIT(num_evict_mode_some
);
1927 SPLIT(num_evict_mode_full
);
1928 SPLIT(num_objects_pinned
);
1929 SPLIT_PRESERVE_NONZERO(num_legacy_snapsets
);
1931 #undef SPLIT_PRESERVE_NONZERO
1935 memset(this, 0, sizeof(*this));
1938 void calc_copies(int nrep
) {
1939 num_object_copies
= nrep
* num_objects
;
1942 bool is_zero() const {
1943 return mem_is_zero((char*)this, sizeof(*this));
1946 void add(const object_stat_sum_t
& o
);
1947 void sub(const object_stat_sum_t
& o
);
1949 void dump(Formatter
*f
) const;
1950 void padding_check() {
1952 sizeof(object_stat_sum_t
) ==
1954 sizeof(num_objects
) +
1955 sizeof(num_object_clones
) +
1956 sizeof(num_object_copies
) +
1957 sizeof(num_objects_missing_on_primary
) +
1958 sizeof(num_objects_degraded
) +
1959 sizeof(num_objects_unfound
) +
1964 sizeof(num_scrub_errors
) +
1965 sizeof(num_large_omap_objects
) +
1966 sizeof(num_objects_manifest
) +
1967 sizeof(num_omap_bytes
) +
1968 sizeof(num_omap_keys
) +
1969 sizeof(num_objects_repaired
) +
1970 sizeof(num_objects_recovered
) +
1971 sizeof(num_bytes_recovered
) +
1972 sizeof(num_keys_recovered
) +
1973 sizeof(num_shallow_scrub_errors
) +
1974 sizeof(num_deep_scrub_errors
) +
1975 sizeof(num_objects_dirty
) +
1976 sizeof(num_whiteouts
) +
1977 sizeof(num_objects_omap
) +
1978 sizeof(num_objects_hit_set_archive
) +
1979 sizeof(num_objects_misplaced
) +
1980 sizeof(num_bytes_hit_set_archive
) +
1982 sizeof(num_flush_kb
) +
1984 sizeof(num_evict_kb
) +
1985 sizeof(num_promote
) +
1986 sizeof(num_flush_mode_high
) +
1987 sizeof(num_flush_mode_low
) +
1988 sizeof(num_evict_mode_some
) +
1989 sizeof(num_evict_mode_full
) +
1990 sizeof(num_objects_pinned
) +
1991 sizeof(num_objects_missing
) +
1992 sizeof(num_legacy_snapsets
)
1994 "object_stat_sum_t have padding");
1996 void encode(bufferlist
& bl
) const;
1997 void decode(bufferlist::const_iterator
& bl
);
1998 static void generate_test_instances(list
<object_stat_sum_t
*>& o
);
2000 WRITE_CLASS_ENCODER(object_stat_sum_t
)
2002 bool operator==(const object_stat_sum_t
& l
, const object_stat_sum_t
& r
);
2005 * a collection of object stat sums
2007 * This is a collection of stat sums over different categories.
2009 struct object_stat_collection_t
{
2010 /**************************************************************************
2011 * WARNING: be sure to update the operator== when adding/removing fields! *
2012 **************************************************************************/
2013 object_stat_sum_t sum
;
2015 void calc_copies(int nrep
) {
2016 sum
.calc_copies(nrep
);
2019 void dump(Formatter
*f
) const;
2020 void encode(bufferlist
& bl
) const;
2021 void decode(bufferlist::const_iterator
& bl
);
2022 static void generate_test_instances(list
<object_stat_collection_t
*>& o
);
2024 bool is_zero() const {
2025 return sum
.is_zero();
2032 void floor(int64_t f
) {
2036 void add(const object_stat_sum_t
& o
) {
2040 void add(const object_stat_collection_t
& o
) {
2043 void sub(const object_stat_collection_t
& o
) {
2047 WRITE_CLASS_ENCODER(object_stat_collection_t
)
2049 inline bool operator==(const object_stat_collection_t
& l
,
2050 const object_stat_collection_t
& r
) {
2051 return l
.sum
== r
.sum
;
2056 * aggregate stats for a single PG.
2059 /**************************************************************************
2060 * WARNING: be sure to update the operator== when adding/removing fields! *
2061 **************************************************************************/
2063 version_t reported_seq
; // sequence number
2064 epoch_t reported_epoch
; // epoch of this report
2066 utime_t last_fresh
; // last reported
2067 utime_t last_change
; // new state != previous state
2068 utime_t last_active
; // state & PG_STATE_ACTIVE
2069 utime_t last_peered
; // state & PG_STATE_ACTIVE || state & PG_STATE_PEERED
2070 utime_t last_clean
; // state & PG_STATE_CLEAN
2071 utime_t last_unstale
; // (state & PG_STATE_STALE) == 0
2072 utime_t last_undegraded
; // (state & PG_STATE_DEGRADED) == 0
2073 utime_t last_fullsized
; // (state & PG_STATE_UNDERSIZED) == 0
2075 eversion_t log_start
; // (log_start,version]
2076 eversion_t ondisk_log_start
; // there may be more on disk
2079 epoch_t last_epoch_clean
;
2081 __u32 parent_split_bits
;
2083 eversion_t last_scrub
;
2084 eversion_t last_deep_scrub
;
2085 utime_t last_scrub_stamp
;
2086 utime_t last_deep_scrub_stamp
;
2087 utime_t last_clean_scrub_stamp
;
2089 object_stat_collection_t stats
;
2092 int64_t ondisk_log_size
; // >= active_log_size
2094 vector
<int32_t> up
, acting
;
2095 vector
<pg_shard_t
> avail_no_missing
;
2096 map
< std::set
<pg_shard_t
>, int32_t > object_location_counts
;
2097 epoch_t mapping_epoch
;
2099 vector
<int32_t> blocked_by
; ///< osds on which the pg is blocked
2101 interval_set
<snapid_t
> purged_snaps
; ///< recently removed snaps that we've purged
2103 utime_t last_became_active
;
2104 utime_t last_became_peered
;
2106 /// up, acting primaries
2108 int32_t acting_primary
;
2110 // snaptrimq.size() is 64bit, but let's be serious - anything over 50k is
2111 // absurd already, so cap it to 2^32 and save 4 bytes at the same time
2112 uint32_t snaptrimq_len
;
2114 bool stats_invalid
:1;
2115 /// true if num_objects_dirty is not accurate (because it was not
2116 /// maintained starting from pool creation)
2117 bool dirty_stats_invalid
:1;
2118 bool omap_stats_invalid
:1;
2119 bool hitset_stats_invalid
:1;
2120 bool hitset_bytes_stats_invalid
:1;
2121 bool pin_stats_invalid
:1;
2122 bool manifest_stats_invalid
:1;
2128 created(0), last_epoch_clean(0),
2129 parent_split_bits(0),
2130 log_size(0), ondisk_log_size(0),
2135 stats_invalid(false),
2136 dirty_stats_invalid(false),
2137 omap_stats_invalid(false),
2138 hitset_stats_invalid(false),
2139 hitset_bytes_stats_invalid(false),
2140 pin_stats_invalid(false),
2141 manifest_stats_invalid(false)
2144 epoch_t
get_effective_last_epoch_clean() const {
2145 if (state
& PG_STATE_CLEAN
) {
2146 // we are clean as of this report, and should thus take the
2148 return reported_epoch
;
2150 return last_epoch_clean
;
2154 pair
<epoch_t
, version_t
> get_version_pair() const {
2155 return make_pair(reported_epoch
, reported_seq
);
2158 void floor(int64_t f
) {
2162 if (ondisk_log_size
< f
)
2163 ondisk_log_size
= f
;
2164 if (snaptrimq_len
< f
)
2168 void add_sub_invalid_flags(const pg_stat_t
& o
) {
2169 // adding (or subtracting!) invalid stats render our stats invalid too
2170 stats_invalid
|= o
.stats_invalid
;
2171 dirty_stats_invalid
|= o
.dirty_stats_invalid
;
2172 hitset_stats_invalid
|= o
.hitset_stats_invalid
;
2173 pin_stats_invalid
|= o
.pin_stats_invalid
;
2174 manifest_stats_invalid
|= o
.manifest_stats_invalid
;
2176 void add(const pg_stat_t
& o
) {
2178 log_size
+= o
.log_size
;
2179 ondisk_log_size
+= o
.ondisk_log_size
;
2180 snaptrimq_len
= std::min((uint64_t)snaptrimq_len
+ o
.snaptrimq_len
,
2181 (uint64_t)(1ull << 31));
2182 add_sub_invalid_flags(o
);
2184 void sub(const pg_stat_t
& o
) {
2186 log_size
-= o
.log_size
;
2187 ondisk_log_size
-= o
.ondisk_log_size
;
2188 if (o
.snaptrimq_len
< snaptrimq_len
) {
2189 snaptrimq_len
-= o
.snaptrimq_len
;
2193 add_sub_invalid_flags(o
);
2196 bool is_acting_osd(int32_t osd
, bool primary
) const;
2197 void dump(Formatter
*f
) const;
2198 void dump_brief(Formatter
*f
) const;
2199 void encode(bufferlist
&bl
) const;
2200 void decode(bufferlist::const_iterator
&bl
);
2201 static void generate_test_instances(list
<pg_stat_t
*>& o
);
2203 WRITE_CLASS_ENCODER(pg_stat_t
)
2205 bool operator==(const pg_stat_t
& l
, const pg_stat_t
& r
);
2208 * ObjectStore full statfs information
2210 struct store_statfs_t
2212 uint64_t total
= 0; ///< Total bytes
2213 uint64_t available
= 0; ///< Free bytes available
2214 uint64_t internally_reserved
= 0; ///< Bytes reserved for internal purposes
2216 int64_t allocated
= 0; ///< Bytes allocated by the store
2218 int64_t data_stored
= 0; ///< Bytes actually stored by the user
2219 int64_t data_compressed
= 0; ///< Bytes stored after compression
2220 int64_t data_compressed_allocated
= 0; ///< Bytes allocated for compressed data
2221 int64_t data_compressed_original
= 0; ///< Bytes that were compressed
2223 int64_t omap_allocated
= 0; ///< approx usage of omap data
2224 int64_t internal_metadata
= 0; ///< approx usage of internal metadata
2227 *this = store_statfs_t();
2229 void floor(int64_t f
) {
2230 #define FLOOR(x) if (int64_t(x) < f) x = f
2233 FLOOR(internally_reserved
);
2236 FLOOR(data_compressed
);
2237 FLOOR(data_compressed_allocated
);
2238 FLOOR(data_compressed_original
);
2240 FLOOR(omap_allocated
);
2241 FLOOR(internal_metadata
);
2245 bool operator ==(const store_statfs_t
& other
) const;
2246 bool is_zero() const {
2247 return *this == store_statfs_t();
2250 uint64_t get_used() const {
2251 return total
- available
- internally_reserved
;
2254 // this accumulates both actually used and statfs's internally_reserved
2255 uint64_t get_used_raw() const {
2256 return total
- available
;
2259 float get_used_raw_ratio() const {
2261 return (float)get_used_raw() / (float)total
;
2267 // helpers to ease legacy code porting
2268 uint64_t kb_avail() const {
2269 return available
>> 10;
2271 uint64_t kb() const {
2274 uint64_t kb_used() const {
2275 return (total
- available
- internally_reserved
) >> 10;
2277 uint64_t kb_used_raw() const {
2278 return get_used_raw() >> 10;
2281 uint64_t kb_used_data() const {
2282 return allocated
>> 10;
2284 uint64_t kb_used_omap() const {
2285 return omap_allocated
>> 10;
2288 uint64_t kb_used_internal_metadata() const {
2289 return internal_metadata
>> 10;
2292 void add(const store_statfs_t
& o
) {
2294 available
+= o
.available
;
2295 internally_reserved
+= o
.internally_reserved
;
2296 allocated
+= o
.allocated
;
2297 data_stored
+= o
.data_stored
;
2298 data_compressed
+= o
.data_compressed
;
2299 data_compressed_allocated
+= o
.data_compressed_allocated
;
2300 data_compressed_original
+= o
.data_compressed_original
;
2301 omap_allocated
+= o
.omap_allocated
;
2302 internal_metadata
+= o
.internal_metadata
;
2304 void sub(const store_statfs_t
& o
) {
2306 available
-= o
.available
;
2307 internally_reserved
-= o
.internally_reserved
;
2308 allocated
-= o
.allocated
;
2309 data_stored
-= o
.data_stored
;
2310 data_compressed
-= o
.data_compressed
;
2311 data_compressed_allocated
-= o
.data_compressed_allocated
;
2312 data_compressed_original
-= o
.data_compressed_original
;
2313 omap_allocated
-= o
.omap_allocated
;
2314 internal_metadata
-= o
.internal_metadata
;
2316 void dump(Formatter
*f
) const;
2317 DENC(store_statfs_t
, v
, p
) {
2318 DENC_START(1, 1, p
);
2320 denc(v
.available
, p
);
2321 denc(v
.internally_reserved
, p
);
2322 denc(v
.allocated
, p
);
2323 denc(v
.data_stored
, p
);
2324 denc(v
.data_compressed
, p
);
2325 denc(v
.data_compressed_allocated
, p
);
2326 denc(v
.data_compressed_original
, p
);
2327 denc(v
.omap_allocated
, p
);
2328 denc(v
.internal_metadata
, p
);
2331 static void generate_test_instances(list
<store_statfs_t
*>& o
);
2333 WRITE_CLASS_DENC(store_statfs_t
)
2335 ostream
&operator<<(ostream
&lhs
, const store_statfs_t
&rhs
);
2338 * aggregate stats for an osd
2341 store_statfs_t statfs
;
2342 vector
<int> hb_peers
;
2343 int32_t snap_trim_queue_len
, num_snap_trimming
;
2344 uint64_t num_shards_repaired
;
2346 pow2_hist_t op_queue_age_hist
;
2348 objectstore_perf_stat_t os_perf_stat
;
2349 osd_alerts_t os_alerts
;
2351 epoch_t up_from
= 0;
2354 uint32_t num_pgs
= 0;
2356 uint32_t num_osds
= 0;
2357 uint32_t num_per_pool_osds
= 0;
2359 osd_stat_t() : snap_trim_queue_len(0), num_snap_trimming(0),
2360 num_shards_repaired(0) {}
2362 void add(const osd_stat_t
& o
) {
2363 statfs
.add(o
.statfs
);
2364 snap_trim_queue_len
+= o
.snap_trim_queue_len
;
2365 num_snap_trimming
+= o
.num_snap_trimming
;
2366 num_shards_repaired
+= o
.num_shards_repaired
;
2367 op_queue_age_hist
.add(o
.op_queue_age_hist
);
2368 os_perf_stat
.add(o
.os_perf_stat
);
2369 num_pgs
+= o
.num_pgs
;
2370 num_osds
+= o
.num_osds
;
2371 num_per_pool_osds
+= o
.num_per_pool_osds
;
2372 for (const auto& a
: o
.os_alerts
) {
2373 auto& target
= os_alerts
[a
.first
];
2374 for (auto& i
: a
.second
) {
2375 target
.emplace(i
.first
, i
.second
);
2379 void sub(const osd_stat_t
& o
) {
2380 statfs
.sub(o
.statfs
);
2381 snap_trim_queue_len
-= o
.snap_trim_queue_len
;
2382 num_snap_trimming
-= o
.num_snap_trimming
;
2383 num_shards_repaired
-= o
.num_shards_repaired
;
2384 op_queue_age_hist
.sub(o
.op_queue_age_hist
);
2385 os_perf_stat
.sub(o
.os_perf_stat
);
2386 num_pgs
-= o
.num_pgs
;
2387 num_osds
-= o
.num_osds
;
2388 num_per_pool_osds
-= o
.num_per_pool_osds
;
2389 for (const auto& a
: o
.os_alerts
) {
2390 auto& target
= os_alerts
[a
.first
];
2391 for (auto& i
: a
.second
) {
2392 target
.erase(i
.first
);
2394 if (target
.empty()) {
2395 os_alerts
.erase(a
.first
);
2399 void dump(Formatter
*f
) const;
2400 void encode(bufferlist
&bl
, uint64_t features
) const;
2401 void decode(bufferlist::const_iterator
&bl
);
2402 static void generate_test_instances(std::list
<osd_stat_t
*>& o
);
2404 WRITE_CLASS_ENCODER_FEATURES(osd_stat_t
)
2406 inline bool operator==(const osd_stat_t
& l
, const osd_stat_t
& r
) {
2407 return l
.statfs
== r
.statfs
&&
2408 l
.snap_trim_queue_len
== r
.snap_trim_queue_len
&&
2409 l
.num_snap_trimming
== r
.num_snap_trimming
&&
2410 l
.num_shards_repaired
== r
.num_shards_repaired
&&
2411 l
.hb_peers
== r
.hb_peers
&&
2412 l
.op_queue_age_hist
== r
.op_queue_age_hist
&&
2413 l
.os_perf_stat
== r
.os_perf_stat
&&
2414 l
.num_pgs
== r
.num_pgs
&&
2415 l
.num_osds
== r
.num_osds
&&
2416 l
.num_per_pool_osds
== r
.num_per_pool_osds
;
2418 inline bool operator!=(const osd_stat_t
& l
, const osd_stat_t
& r
) {
2422 inline ostream
& operator<<(ostream
& out
, const osd_stat_t
& s
) {
2423 return out
<< "osd_stat(" << s
.statfs
<< ", "
2424 << "peers " << s
.hb_peers
2425 << " op hist " << s
.op_queue_age_hist
.h
2430 * summation over an entire pool
2432 struct pool_stat_t
{
2433 object_stat_collection_t stats
;
2434 store_statfs_t store_stats
;
2436 int64_t ondisk_log_size
; // >= active_log_size
2437 int32_t up
; ///< number of up replicas or shards
2438 int32_t acting
; ///< number of acting replicas or shards
2439 int32_t num_store_stats
; ///< amount of store_stats accumulated
2441 pool_stat_t() : log_size(0), ondisk_log_size(0), up(0), acting(0),
2445 void floor(int64_t f
) {
2447 store_stats
.floor(f
);
2450 if (ondisk_log_size
< f
)
2451 ondisk_log_size
= f
;
2456 if (num_store_stats
< f
)
2457 num_store_stats
= f
;
2460 void add(const store_statfs_t
& o
) {
2464 void sub(const store_statfs_t
& o
) {
2469 void add(const pg_stat_t
& o
) {
2471 log_size
+= o
.log_size
;
2472 ondisk_log_size
+= o
.ondisk_log_size
;
2474 acting
+= o
.acting
.size();
2476 void sub(const pg_stat_t
& o
) {
2478 log_size
-= o
.log_size
;
2479 ondisk_log_size
-= o
.ondisk_log_size
;
2481 acting
-= o
.acting
.size();
2484 bool is_zero() const {
2485 return (stats
.is_zero() &&
2486 store_stats
.is_zero() &&
2488 ondisk_log_size
== 0 &&
2491 num_store_stats
== 0);
2494 // helper accessors to retrieve used/netto bytes depending on the
2495 // collection method: new per-pool objectstore report or legacy PG
2496 // summation at OSD.
2497 // In legacy mode used and netto values are the same. But for new per-pool
2498 // collection 'used' provides amount of space ALLOCATED at all related OSDs
2499 // and 'netto' is amount of stored user data.
2500 uint64_t get_allocated_bytes(bool per_pool
) const {
2501 uint64_t allocated_bytes
;
2503 allocated_bytes
= store_stats
.allocated
;
2505 // legacy mode, use numbers from 'stats'
2506 allocated_bytes
= stats
.sum
.num_bytes
+
2507 stats
.sum
.num_bytes_hit_set_archive
;
2509 // omap is not broken out by pool by nautilus bluestore
2510 allocated_bytes
+= stats
.sum
.num_omap_bytes
;
2511 return allocated_bytes
;
2513 uint64_t get_user_bytes(float raw_used_rate
, bool per_pool
) const {
2514 uint64_t user_bytes
;
2516 user_bytes
= raw_used_rate
? store_stats
.data_stored
/ raw_used_rate
: 0;
2518 // legacy mode, use numbers from 'stats'
2519 user_bytes
= stats
.sum
.num_bytes
+
2520 stats
.sum
.num_bytes_hit_set_archive
;
2522 // omap is not broken out by pool by nautilus bluestore
2523 user_bytes
+= stats
.sum
.num_omap_bytes
;
2527 void dump(Formatter
*f
) const;
2528 void encode(bufferlist
&bl
, uint64_t features
) const;
2529 void decode(bufferlist::const_iterator
&bl
);
2530 static void generate_test_instances(list
<pool_stat_t
*>& o
);
2532 WRITE_CLASS_ENCODER_FEATURES(pool_stat_t
)
2535 // -----------------------------------------
2538 * pg_hit_set_info_t - information about a single recorded HitSet
2540 * Track basic metadata about a HitSet, like the number of insertions
2541 * and the time range it covers.
2543 struct pg_hit_set_info_t
{
2544 utime_t begin
, end
; ///< time interval
2545 eversion_t version
; ///< version this HitSet object was written
2546 bool using_gmt
; ///< use gmt for creating the hit_set archive object name
2548 friend bool operator==(const pg_hit_set_info_t
& l
,
2549 const pg_hit_set_info_t
& r
) {
2551 l
.begin
== r
.begin
&&
2553 l
.version
== r
.version
&&
2554 l
.using_gmt
== r
.using_gmt
;
2557 explicit pg_hit_set_info_t(bool using_gmt
= true)
2558 : using_gmt(using_gmt
) {}
2560 void encode(bufferlist
&bl
) const;
2561 void decode(bufferlist::const_iterator
&bl
);
2562 void dump(Formatter
*f
) const;
2563 static void generate_test_instances(list
<pg_hit_set_info_t
*>& o
);
2565 WRITE_CLASS_ENCODER(pg_hit_set_info_t
)
2568 * pg_hit_set_history_t - information about a history of hitsets
2570 * Include information about the currently accumulating hit set as well
2571 * as archived/historical ones.
2573 struct pg_hit_set_history_t
{
2574 eversion_t current_last_update
; ///< last version inserted into current set
2575 list
<pg_hit_set_info_t
> history
; ///< archived sets, sorted oldest -> newest
2577 friend bool operator==(const pg_hit_set_history_t
& l
,
2578 const pg_hit_set_history_t
& r
) {
2580 l
.current_last_update
== r
.current_last_update
&&
2581 l
.history
== r
.history
;
2584 void encode(bufferlist
&bl
) const;
2585 void decode(bufferlist::const_iterator
&bl
);
2586 void dump(Formatter
*f
) const;
2587 static void generate_test_instances(list
<pg_hit_set_history_t
*>& o
);
2589 WRITE_CLASS_ENCODER(pg_hit_set_history_t
)
2592 // -----------------------------------------
2595 * pg_history_t - information about recent pg peering/mapping history
2597 * This is aggressively shared between OSDs to bound the amount of past
2598 * history they need to worry about.
2600 struct pg_history_t
{
2601 epoch_t epoch_created
; // epoch in which *pg* was created (pool or pg)
2602 epoch_t epoch_pool_created
; // epoch in which *pool* was created
2603 // (note: may be pg creation epoch for
2604 // pre-luminous clusters)
2605 epoch_t last_epoch_started
; // lower bound on last epoch started (anywhere, not necessarily locally)
2606 epoch_t last_interval_started
; // first epoch of last_epoch_started interval
2607 epoch_t last_epoch_clean
; // lower bound on last epoch the PG was completely clean.
2608 epoch_t last_interval_clean
; // first epoch of last_epoch_clean interval
2609 epoch_t last_epoch_split
; // as parent or child
2610 epoch_t last_epoch_marked_full
; // pool or cluster
2613 * In the event of a map discontinuity, same_*_since may reflect the first
2614 * map the osd has seen in the new map sequence rather than the actual start
2615 * of the interval. This is ok since a discontinuity at epoch e means there
2616 * must have been a clean interval between e and now and that we cannot be
2617 * in the active set during the interval containing e.
2619 epoch_t same_up_since
; // same acting set since
2620 epoch_t same_interval_since
; // same acting AND up set since
2621 epoch_t same_primary_since
; // same primary at least back through this epoch.
2623 eversion_t last_scrub
;
2624 eversion_t last_deep_scrub
;
2625 utime_t last_scrub_stamp
;
2626 utime_t last_deep_scrub_stamp
;
2627 utime_t last_clean_scrub_stamp
;
2629 friend bool operator==(const pg_history_t
& l
, const pg_history_t
& r
) {
2631 l
.epoch_created
== r
.epoch_created
&&
2632 l
.epoch_pool_created
== r
.epoch_pool_created
&&
2633 l
.last_epoch_started
== r
.last_epoch_started
&&
2634 l
.last_interval_started
== r
.last_interval_started
&&
2635 l
.last_epoch_clean
== r
.last_epoch_clean
&&
2636 l
.last_interval_clean
== r
.last_interval_clean
&&
2637 l
.last_epoch_split
== r
.last_epoch_split
&&
2638 l
.last_epoch_marked_full
== r
.last_epoch_marked_full
&&
2639 l
.same_up_since
== r
.same_up_since
&&
2640 l
.same_interval_since
== r
.same_interval_since
&&
2641 l
.same_primary_since
== r
.same_primary_since
&&
2642 l
.last_scrub
== r
.last_scrub
&&
2643 l
.last_deep_scrub
== r
.last_deep_scrub
&&
2644 l
.last_scrub_stamp
== r
.last_scrub_stamp
&&
2645 l
.last_deep_scrub_stamp
== r
.last_deep_scrub_stamp
&&
2646 l
.last_clean_scrub_stamp
== r
.last_clean_scrub_stamp
;
2651 epoch_pool_created(0),
2652 last_epoch_started(0),
2653 last_interval_started(0),
2654 last_epoch_clean(0),
2655 last_interval_clean(0),
2656 last_epoch_split(0),
2657 last_epoch_marked_full(0),
2658 same_up_since(0), same_interval_since(0), same_primary_since(0) {}
2660 bool merge(const pg_history_t
&other
) {
2661 // Here, we only update the fields which cannot be calculated from the OSDmap.
2662 bool modified
= false;
2663 if (epoch_created
< other
.epoch_created
) {
2664 epoch_created
= other
.epoch_created
;
2667 if (epoch_pool_created
< other
.epoch_pool_created
) {
2668 // FIXME: for jewel compat only; this should either be 0 or always the
2669 // same value across all pg instances.
2670 epoch_pool_created
= other
.epoch_pool_created
;
2673 if (last_epoch_started
< other
.last_epoch_started
) {
2674 last_epoch_started
= other
.last_epoch_started
;
2677 if (last_interval_started
< other
.last_interval_started
) {
2678 last_interval_started
= other
.last_interval_started
;
2681 if (last_epoch_clean
< other
.last_epoch_clean
) {
2682 last_epoch_clean
= other
.last_epoch_clean
;
2685 if (last_interval_clean
< other
.last_interval_clean
) {
2686 last_interval_clean
= other
.last_interval_clean
;
2689 if (last_epoch_split
< other
.last_epoch_split
) {
2690 last_epoch_split
= other
.last_epoch_split
;
2693 if (last_epoch_marked_full
< other
.last_epoch_marked_full
) {
2694 last_epoch_marked_full
= other
.last_epoch_marked_full
;
2697 if (other
.last_scrub
> last_scrub
) {
2698 last_scrub
= other
.last_scrub
;
2701 if (other
.last_scrub_stamp
> last_scrub_stamp
) {
2702 last_scrub_stamp
= other
.last_scrub_stamp
;
2705 if (other
.last_deep_scrub
> last_deep_scrub
) {
2706 last_deep_scrub
= other
.last_deep_scrub
;
2709 if (other
.last_deep_scrub_stamp
> last_deep_scrub_stamp
) {
2710 last_deep_scrub_stamp
= other
.last_deep_scrub_stamp
;
2713 if (other
.last_clean_scrub_stamp
> last_clean_scrub_stamp
) {
2714 last_clean_scrub_stamp
= other
.last_clean_scrub_stamp
;
2720 void encode(bufferlist
& bl
) const;
2721 void decode(bufferlist::const_iterator
& p
);
2722 void dump(Formatter
*f
) const;
2723 static void generate_test_instances(list
<pg_history_t
*>& o
);
2725 WRITE_CLASS_ENCODER(pg_history_t
)
2727 inline ostream
& operator<<(ostream
& out
, const pg_history_t
& h
) {
2728 return out
<< "ec=" << h
.epoch_created
<< "/" << h
.epoch_pool_created
2729 << " lis/c " << h
.last_interval_started
2730 << "/" << h
.last_interval_clean
2731 << " les/c/f " << h
.last_epoch_started
<< "/" << h
.last_epoch_clean
2732 << "/" << h
.last_epoch_marked_full
2733 << " " << h
.same_up_since
2734 << "/" << h
.same_interval_since
2735 << "/" << h
.same_primary_since
;
2740 * pg_info_t - summary of PG statistics.
2743 * - last_complete implies we have all objects that existed as of that
2744 * stamp, OR a newer object, OR have already applied a later delete.
2745 * - if last_complete >= log.bottom, then we know pg contents thru log.head.
2746 * otherwise, we have no idea what the pg is supposed to contain.
2750 eversion_t last_update
; ///< last object version applied to store.
2751 eversion_t last_complete
; ///< last version pg was complete through.
2752 epoch_t last_epoch_started
; ///< last epoch at which this pg started on this osd
2753 epoch_t last_interval_started
; ///< first epoch of last_epoch_started interval
2755 version_t last_user_version
; ///< last user object version applied to store
2757 eversion_t log_tail
; ///< oldest log entry.
2759 hobject_t last_backfill
; ///< objects >= this and < last_complete may be missing
2760 bool last_backfill_bitwise
; ///< true if last_backfill reflects a bitwise (vs nibblewise) sort
2762 interval_set
<snapid_t
> purged_snaps
;
2766 pg_history_t history
;
2767 pg_hit_set_history_t hit_set
;
2769 friend bool operator==(const pg_info_t
& l
, const pg_info_t
& r
) {
2772 l
.last_update
== r
.last_update
&&
2773 l
.last_complete
== r
.last_complete
&&
2774 l
.last_epoch_started
== r
.last_epoch_started
&&
2775 l
.last_interval_started
== r
.last_interval_started
&&
2776 l
.last_user_version
== r
.last_user_version
&&
2777 l
.log_tail
== r
.log_tail
&&
2778 l
.last_backfill
== r
.last_backfill
&&
2779 l
.last_backfill_bitwise
== r
.last_backfill_bitwise
&&
2780 l
.purged_snaps
== r
.purged_snaps
&&
2781 l
.stats
== r
.stats
&&
2782 l
.history
== r
.history
&&
2783 l
.hit_set
== r
.hit_set
;
2787 : last_epoch_started(0),
2788 last_interval_started(0),
2789 last_user_version(0),
2790 last_backfill(hobject_t::get_max()),
2791 last_backfill_bitwise(false)
2793 // cppcheck-suppress noExplicitConstructor
2796 last_epoch_started(0),
2797 last_interval_started(0),
2798 last_user_version(0),
2799 last_backfill(hobject_t::get_max()),
2800 last_backfill_bitwise(false)
2803 void set_last_backfill(hobject_t pos
) {
2804 last_backfill
= pos
;
2805 last_backfill_bitwise
= true;
2808 bool is_empty() const { return last_update
.version
== 0; }
2809 bool dne() const { return history
.epoch_created
== 0; }
2811 bool has_missing() const { return last_complete
!= last_update
; }
2812 bool is_incomplete() const { return !last_backfill
.is_max(); }
2814 void encode(bufferlist
& bl
) const;
2815 void decode(bufferlist::const_iterator
& p
);
2816 void dump(Formatter
*f
) const;
2817 static void generate_test_instances(list
<pg_info_t
*>& o
);
2819 WRITE_CLASS_ENCODER(pg_info_t
)
2821 inline ostream
& operator<<(ostream
& out
, const pg_info_t
& pgi
)
2823 out
<< pgi
.pgid
<< "(";
2829 out
<< " v " << pgi
.last_update
;
2830 if (pgi
.last_complete
!= pgi
.last_update
)
2831 out
<< " lc " << pgi
.last_complete
;
2832 out
<< " (" << pgi
.log_tail
<< "," << pgi
.last_update
<< "]";
2834 if (pgi
.is_incomplete())
2835 out
<< " lb " << pgi
.last_backfill
2836 << (pgi
.last_backfill_bitwise
? " (bitwise)" : " (NIBBLEWISE)");
2837 //out << " c " << pgi.epoch_created;
2838 out
<< " local-lis/les=" << pgi
.last_interval_started
2839 << "/" << pgi
.last_epoch_started
;
2840 out
<< " n=" << pgi
.stats
.stats
.sum
.num_objects
;
2841 out
<< " " << pgi
.history
2847 * pg_fast_info_t - common pg_info_t fields
2849 * These are the fields of pg_info_t (and children) that are updated for
2850 * most IO operations.
2853 * Because we rely on these fields to be applied to the normal
2854 * info struct, adding a new field here that is not also new in info
2855 * means that we must set an incompat OSD feature bit!
2857 struct pg_fast_info_t
{
2858 eversion_t last_update
;
2859 eversion_t last_complete
;
2860 version_t last_user_version
;
2861 struct { // pg_stat_t stats
2863 version_t reported_seq
;
2865 utime_t last_active
;
2866 utime_t last_peered
;
2868 utime_t last_unstale
;
2869 utime_t last_undegraded
;
2870 utime_t last_fullsized
;
2871 int64_t log_size
; // (also ondisk_log_size, which has the same value)
2872 struct { // object_stat_collection_t stats;
2873 struct { // objct_stat_sum_t sum
2874 int64_t num_bytes
; // in bytes
2875 int64_t num_objects
;
2876 int64_t num_object_copies
;
2881 int64_t num_objects_dirty
;
2886 void populate_from(const pg_info_t
& info
) {
2887 last_update
= info
.last_update
;
2888 last_complete
= info
.last_complete
;
2889 last_user_version
= info
.last_user_version
;
2890 stats
.version
= info
.stats
.version
;
2891 stats
.reported_seq
= info
.stats
.reported_seq
;
2892 stats
.last_fresh
= info
.stats
.last_fresh
;
2893 stats
.last_active
= info
.stats
.last_active
;
2894 stats
.last_peered
= info
.stats
.last_peered
;
2895 stats
.last_clean
= info
.stats
.last_clean
;
2896 stats
.last_unstale
= info
.stats
.last_unstale
;
2897 stats
.last_undegraded
= info
.stats
.last_undegraded
;
2898 stats
.last_fullsized
= info
.stats
.last_fullsized
;
2899 stats
.log_size
= info
.stats
.log_size
;
2900 stats
.stats
.sum
.num_bytes
= info
.stats
.stats
.sum
.num_bytes
;
2901 stats
.stats
.sum
.num_objects
= info
.stats
.stats
.sum
.num_objects
;
2902 stats
.stats
.sum
.num_object_copies
= info
.stats
.stats
.sum
.num_object_copies
;
2903 stats
.stats
.sum
.num_rd
= info
.stats
.stats
.sum
.num_rd
;
2904 stats
.stats
.sum
.num_rd_kb
= info
.stats
.stats
.sum
.num_rd_kb
;
2905 stats
.stats
.sum
.num_wr
= info
.stats
.stats
.sum
.num_wr
;
2906 stats
.stats
.sum
.num_wr_kb
= info
.stats
.stats
.sum
.num_wr_kb
;
2907 stats
.stats
.sum
.num_objects_dirty
= info
.stats
.stats
.sum
.num_objects_dirty
;
2910 bool try_apply_to(pg_info_t
* info
) {
2911 if (last_update
<= info
->last_update
)
2913 info
->last_update
= last_update
;
2914 info
->last_complete
= last_complete
;
2915 info
->last_user_version
= last_user_version
;
2916 info
->stats
.version
= stats
.version
;
2917 info
->stats
.reported_seq
= stats
.reported_seq
;
2918 info
->stats
.last_fresh
= stats
.last_fresh
;
2919 info
->stats
.last_active
= stats
.last_active
;
2920 info
->stats
.last_peered
= stats
.last_peered
;
2921 info
->stats
.last_clean
= stats
.last_clean
;
2922 info
->stats
.last_unstale
= stats
.last_unstale
;
2923 info
->stats
.last_undegraded
= stats
.last_undegraded
;
2924 info
->stats
.last_fullsized
= stats
.last_fullsized
;
2925 info
->stats
.log_size
= stats
.log_size
;
2926 info
->stats
.ondisk_log_size
= stats
.log_size
;
2927 info
->stats
.stats
.sum
.num_bytes
= stats
.stats
.sum
.num_bytes
;
2928 info
->stats
.stats
.sum
.num_objects
= stats
.stats
.sum
.num_objects
;
2929 info
->stats
.stats
.sum
.num_object_copies
= stats
.stats
.sum
.num_object_copies
;
2930 info
->stats
.stats
.sum
.num_rd
= stats
.stats
.sum
.num_rd
;
2931 info
->stats
.stats
.sum
.num_rd_kb
= stats
.stats
.sum
.num_rd_kb
;
2932 info
->stats
.stats
.sum
.num_wr
= stats
.stats
.sum
.num_wr
;
2933 info
->stats
.stats
.sum
.num_wr_kb
= stats
.stats
.sum
.num_wr_kb
;
2934 info
->stats
.stats
.sum
.num_objects_dirty
= stats
.stats
.sum
.num_objects_dirty
;
2938 void encode(bufferlist
& bl
) const {
2939 ENCODE_START(1, 1, bl
);
2940 encode(last_update
, bl
);
2941 encode(last_complete
, bl
);
2942 encode(last_user_version
, bl
);
2943 encode(stats
.version
, bl
);
2944 encode(stats
.reported_seq
, bl
);
2945 encode(stats
.last_fresh
, bl
);
2946 encode(stats
.last_active
, bl
);
2947 encode(stats
.last_peered
, bl
);
2948 encode(stats
.last_clean
, bl
);
2949 encode(stats
.last_unstale
, bl
);
2950 encode(stats
.last_undegraded
, bl
);
2951 encode(stats
.last_fullsized
, bl
);
2952 encode(stats
.log_size
, bl
);
2953 encode(stats
.stats
.sum
.num_bytes
, bl
);
2954 encode(stats
.stats
.sum
.num_objects
, bl
);
2955 encode(stats
.stats
.sum
.num_object_copies
, bl
);
2956 encode(stats
.stats
.sum
.num_rd
, bl
);
2957 encode(stats
.stats
.sum
.num_rd_kb
, bl
);
2958 encode(stats
.stats
.sum
.num_wr
, bl
);
2959 encode(stats
.stats
.sum
.num_wr_kb
, bl
);
2960 encode(stats
.stats
.sum
.num_objects_dirty
, bl
);
2963 void decode(bufferlist::const_iterator
& p
) {
2965 decode(last_update
, p
);
2966 decode(last_complete
, p
);
2967 decode(last_user_version
, p
);
2968 decode(stats
.version
, p
);
2969 decode(stats
.reported_seq
, p
);
2970 decode(stats
.last_fresh
, p
);
2971 decode(stats
.last_active
, p
);
2972 decode(stats
.last_peered
, p
);
2973 decode(stats
.last_clean
, p
);
2974 decode(stats
.last_unstale
, p
);
2975 decode(stats
.last_undegraded
, p
);
2976 decode(stats
.last_fullsized
, p
);
2977 decode(stats
.log_size
, p
);
2978 decode(stats
.stats
.sum
.num_bytes
, p
);
2979 decode(stats
.stats
.sum
.num_objects
, p
);
2980 decode(stats
.stats
.sum
.num_object_copies
, p
);
2981 decode(stats
.stats
.sum
.num_rd
, p
);
2982 decode(stats
.stats
.sum
.num_rd_kb
, p
);
2983 decode(stats
.stats
.sum
.num_wr
, p
);
2984 decode(stats
.stats
.sum
.num_wr_kb
, p
);
2985 decode(stats
.stats
.sum
.num_objects_dirty
, p
);
2989 WRITE_CLASS_ENCODER(pg_fast_info_t
)
2992 struct pg_notify_t
{
2993 epoch_t query_epoch
;
2999 query_epoch(0), epoch_sent(0), to(shard_id_t::NO_SHARD
),
3000 from(shard_id_t::NO_SHARD
) {}
3004 epoch_t query_epoch
,
3006 const pg_info_t
&info
)
3007 : query_epoch(query_epoch
),
3008 epoch_sent(epoch_sent
),
3009 info(info
), to(to
), from(from
) {
3010 ceph_assert(from
== info
.pgid
.shard
);
3012 void encode(bufferlist
&bl
) const;
3013 void decode(bufferlist::const_iterator
&p
);
3014 void dump(Formatter
*f
) const;
3015 static void generate_test_instances(list
<pg_notify_t
*> &o
);
3017 WRITE_CLASS_ENCODER(pg_notify_t
)
3018 ostream
&operator<<(ostream
&lhs
, const pg_notify_t
¬ify
);
3023 * PastIntervals -- information needed to determine the PriorSet and
3024 * the might_have_unfound set
3026 class PastIntervals
{
3028 struct pg_interval_t
{
3029 vector
<int32_t> up
, acting
;
3030 epoch_t first
, last
;
3036 : first(0), last(0),
3037 maybe_went_rw(false),
3043 vector
<int32_t> &&up
,
3044 vector
<int32_t> &&acting
,
3050 : up(up
), acting(acting
), first(first
), last(last
),
3051 maybe_went_rw(maybe_went_rw
), primary(primary
), up_primary(up_primary
)
3054 void encode(bufferlist
& bl
) const;
3055 void decode(bufferlist::const_iterator
& bl
);
3056 void dump(Formatter
*f
) const;
3057 static void generate_test_instances(list
<pg_interval_t
*>& o
);
3061 PastIntervals(PastIntervals
&&rhs
) = default;
3062 PastIntervals
&operator=(PastIntervals
&&rhs
) = default;
3064 PastIntervals(const PastIntervals
&rhs
);
3065 PastIntervals
&operator=(const PastIntervals
&rhs
);
3067 class interval_rep
{
3069 virtual size_t size() const = 0;
3070 virtual bool empty() const = 0;
3071 virtual void clear() = 0;
3072 virtual pair
<epoch_t
, epoch_t
> get_bounds() const = 0;
3073 virtual set
<pg_shard_t
> get_all_participants(
3074 bool ec_pool
) const = 0;
3075 virtual void add_interval(bool ec_pool
, const pg_interval_t
&interval
) = 0;
3076 virtual unique_ptr
<interval_rep
> clone() const = 0;
3077 virtual ostream
&print(ostream
&out
) const = 0;
3078 virtual void encode(bufferlist
&bl
) const = 0;
3079 virtual void decode(bufferlist::const_iterator
&bl
) = 0;
3080 virtual void dump(Formatter
*f
) const = 0;
3081 virtual void iterate_mayberw_back_to(
3083 std::function
<void(epoch_t
, const set
<pg_shard_t
> &)> &&f
) const = 0;
3085 virtual bool has_full_intervals() const { return false; }
3086 virtual void iterate_all_intervals(
3087 std::function
<void(const pg_interval_t
&)> &&f
) const {
3088 ceph_assert(!has_full_intervals());
3089 ceph_abort_msg("not valid for this implementation");
3091 virtual void adjust_start_backwards(epoch_t last_epoch_clean
) = 0;
3093 virtual ~interval_rep() {}
3095 friend class pi_compact_rep
;
3098 unique_ptr
<interval_rep
> past_intervals
;
3100 explicit PastIntervals(interval_rep
*rep
) : past_intervals(rep
) {}
3103 void add_interval(bool ec_pool
, const pg_interval_t
&interval
) {
3104 ceph_assert(past_intervals
);
3105 return past_intervals
->add_interval(ec_pool
, interval
);
3108 void encode(bufferlist
&bl
) const {
3109 ENCODE_START(1, 1, bl
);
3110 if (past_intervals
) {
3113 past_intervals
->encode(bl
);
3115 encode((__u8
)0, bl
);
3120 void decode(bufferlist::const_iterator
&bl
);
3122 void dump(Formatter
*f
) const {
3123 ceph_assert(past_intervals
);
3124 past_intervals
->dump(f
);
3126 static void generate_test_instances(list
<PastIntervals
*> & o
);
3129 * Determines whether there is an interval change
3131 static bool is_new_interval(
3132 int old_acting_primary
,
3133 int new_acting_primary
,
3134 const vector
<int> &old_acting
,
3135 const vector
<int> &new_acting
,
3138 const vector
<int> &old_up
,
3139 const vector
<int> &new_up
,
3144 unsigned old_pg_num
,
3145 unsigned new_pg_num
,
3146 unsigned old_pg_num_pending
,
3147 unsigned new_pg_num_pending
,
3148 bool old_sort_bitwise
,
3149 bool new_sort_bitwise
,
3150 bool old_recovery_deletes
,
3151 bool new_recovery_deletes
,
3156 * Determines whether there is an interval change
3158 static bool is_new_interval(
3159 int old_acting_primary
, ///< [in] primary as of lastmap
3160 int new_acting_primary
, ///< [in] primary as of lastmap
3161 const vector
<int> &old_acting
, ///< [in] acting as of lastmap
3162 const vector
<int> &new_acting
, ///< [in] acting as of osdmap
3163 int old_up_primary
, ///< [in] up primary of lastmap
3164 int new_up_primary
, ///< [in] up primary of osdmap
3165 const vector
<int> &old_up
, ///< [in] up as of lastmap
3166 const vector
<int> &new_up
, ///< [in] up as of osdmap
3167 std::shared_ptr
<const OSDMap
> osdmap
, ///< [in] current map
3168 std::shared_ptr
<const OSDMap
> lastmap
, ///< [in] last map
3169 pg_t pgid
///< [in] pgid for pg
3173 * Integrates a new map into *past_intervals, returns true
3174 * if an interval was closed out.
3176 static bool check_new_interval(
3177 int old_acting_primary
, ///< [in] primary as of lastmap
3178 int new_acting_primary
, ///< [in] primary as of osdmap
3179 const vector
<int> &old_acting
, ///< [in] acting as of lastmap
3180 const vector
<int> &new_acting
, ///< [in] acting as of osdmap
3181 int old_up_primary
, ///< [in] up primary of lastmap
3182 int new_up_primary
, ///< [in] up primary of osdmap
3183 const vector
<int> &old_up
, ///< [in] up as of lastmap
3184 const vector
<int> &new_up
, ///< [in] up as of osdmap
3185 epoch_t same_interval_since
, ///< [in] as of osdmap
3186 epoch_t last_epoch_clean
, ///< [in] current
3187 std::shared_ptr
<const OSDMap
> osdmap
, ///< [in] current map
3188 std::shared_ptr
<const OSDMap
> lastmap
, ///< [in] last map
3189 pg_t pgid
, ///< [in] pgid for pg
3190 IsPGRecoverablePredicate
*could_have_gone_active
, ///< [in] predicate whether the pg can be active
3191 PastIntervals
*past_intervals
, ///< [out] intervals
3192 ostream
*out
= 0 ///< [out] debug ostream
3195 friend ostream
& operator<<(ostream
& out
, const PastIntervals
&i
);
3197 template <typename F
>
3198 void iterate_mayberw_back_to(
3201 ceph_assert(past_intervals
);
3202 past_intervals
->iterate_mayberw_back_to(les
, std::forward
<F
>(f
));
3205 ceph_assert(past_intervals
);
3206 past_intervals
->clear();
3210 * Should return a value which gives an indication of the amount
3211 * of state contained
3213 size_t size() const {
3214 ceph_assert(past_intervals
);
3215 return past_intervals
->size();
3218 bool empty() const {
3219 ceph_assert(past_intervals
);
3220 return past_intervals
->empty();
3223 void swap(PastIntervals
&other
) {
3225 swap(other
.past_intervals
, past_intervals
);
3229 * Return all shards which have been in the acting set back to the
3230 * latest epoch to which we have trimmed except for pg_whoami
3232 set
<pg_shard_t
> get_might_have_unfound(
3233 pg_shard_t pg_whoami
,
3234 bool ec_pool
) const {
3235 ceph_assert(past_intervals
);
3236 auto ret
= past_intervals
->get_all_participants(ec_pool
);
3237 ret
.erase(pg_whoami
);
3242 * Return all shards which we might want to talk to for peering
3244 set
<pg_shard_t
> get_all_probe(
3245 bool ec_pool
) const {
3246 ceph_assert(past_intervals
);
3247 return past_intervals
->get_all_participants(ec_pool
);
3250 /* Return the set of epochs [start, end) represented by the
3251 * past_interval set.
3253 pair
<epoch_t
, epoch_t
> get_bounds() const {
3254 ceph_assert(past_intervals
);
3255 return past_intervals
->get_bounds();
3258 void adjust_start_backwards(epoch_t last_epoch_clean
) {
3259 ceph_assert(past_intervals
);
3260 past_intervals
->adjust_start_backwards(last_epoch_clean
);
3270 bool ec_pool
= false;
3271 set
<pg_shard_t
> probe
; ///< current+prior OSDs we need to probe.
3272 set
<int> down
; ///< down osds that would normally be in @a probe and might be interesting.
3273 map
<int, epoch_t
> blocked_by
; ///< current lost_at values for any OSDs in cur set for which (re)marking them lost would affect cur set
3275 bool pg_down
= false; ///< some down osds are included in @a cur; the DOWN pg state bit should be set.
3276 unique_ptr
<IsPGRecoverablePredicate
> pcontdec
;
3278 PriorSet() = default;
3279 PriorSet(PriorSet
&&) = default;
3280 PriorSet
&operator=(PriorSet
&&) = default;
3282 PriorSet
&operator=(const PriorSet
&) = delete;
3283 PriorSet(const PriorSet
&) = delete;
3285 bool operator==(const PriorSet
&rhs
) const {
3286 return (ec_pool
== rhs
.ec_pool
) &&
3287 (probe
== rhs
.probe
) &&
3288 (down
== rhs
.down
) &&
3289 (blocked_by
== rhs
.blocked_by
) &&
3290 (pg_down
== rhs
.pg_down
);
3293 bool affected_by_map(
3294 const OSDMap
&osdmap
,
3295 const DoutPrefixProvider
*dpp
) const;
3297 // For verifying tests
3300 set
<pg_shard_t
> probe
,
3302 map
<int, epoch_t
> blocked_by
,
3304 IsPGRecoverablePredicate
*pcontdec
)
3305 : ec_pool(ec_pool
), probe(probe
), down(down
), blocked_by(blocked_by
),
3306 pg_down(pg_down
), pcontdec(pcontdec
) {}
3309 template <typename F
>
3311 const PastIntervals
&past_intervals
,
3313 epoch_t last_epoch_started
,
3314 IsPGRecoverablePredicate
*c
,
3316 const vector
<int> &up
,
3317 const vector
<int> &acting
,
3318 const DoutPrefixProvider
*dpp
);
3320 friend class PastIntervals
;
3323 template <typename
... Args
>
3324 PriorSet
get_prior_set(Args
&&... args
) const {
3325 return PriorSet(*this, std::forward
<Args
>(args
)...);
3328 WRITE_CLASS_ENCODER(PastIntervals
)
3330 ostream
& operator<<(ostream
& out
, const PastIntervals::pg_interval_t
& i
);
3331 ostream
& operator<<(ostream
& out
, const PastIntervals
&i
);
3332 ostream
& operator<<(ostream
& out
, const PastIntervals::PriorSet
&i
);
3334 template <typename F
>
3335 PastIntervals::PriorSet::PriorSet(
3336 const PastIntervals
&past_intervals
,
3338 epoch_t last_epoch_started
,
3339 IsPGRecoverablePredicate
*c
,
3341 const vector
<int> &up
,
3342 const vector
<int> &acting
,
3343 const DoutPrefixProvider
*dpp
)
3344 : ec_pool(ec_pool
), pg_down(false), pcontdec(c
)
3347 * We have to be careful to gracefully deal with situations like
3348 * so. Say we have a power outage or something that takes out both
3349 * OSDs, but the monitor doesn't mark them down in the same epoch.
3350 * The history may look like
3354 * 3: let's say B dies for good, too (say, from the power spike)
3357 * which makes it look like B may have applied updates to the PG
3358 * that we need in order to proceed. This sucks...
3360 * To minimize the risk of this happening, we CANNOT go active if
3361 * _any_ OSDs in the prior set are down until we send an MOSDAlive
3362 * to the monitor such that the OSDMap sets osd_up_thru to an epoch.
3363 * Then, we have something like
3370 * -> we can ignore B, bc it couldn't have gone active (alive_thru
3381 * -> we must wait for B, bc it was alive through 2, and could have
3382 * written to the pg.
3384 * If B is really dead, then an administrator will need to manually
3385 * intervene by marking the OSD as "lost."
3388 // Include current acting and up nodes... not because they may
3389 // contain old data (this interval hasn't gone active, obviously),
3390 // but because we want their pg_info to inform choose_acting(), and
3391 // so that we know what they do/do not have explicitly before
3392 // sending them any new info/logs/whatever.
3393 for (unsigned i
= 0; i
< acting
.size(); i
++) {
3394 if (acting
[i
] != 0x7fffffff /* CRUSH_ITEM_NONE, can't import crush.h here */)
3395 probe
.insert(pg_shard_t(acting
[i
], ec_pool
? shard_id_t(i
) : shard_id_t::NO_SHARD
));
3397 // It may be possible to exclude the up nodes, but let's keep them in
3399 for (unsigned i
= 0; i
< up
.size(); i
++) {
3400 if (up
[i
] != 0x7fffffff /* CRUSH_ITEM_NONE, can't import crush.h here */)
3401 probe
.insert(pg_shard_t(up
[i
], ec_pool
? shard_id_t(i
) : shard_id_t::NO_SHARD
));
3404 set
<pg_shard_t
> all_probe
= past_intervals
.get_all_probe(ec_pool
);
3405 ldpp_dout(dpp
, 10) << "build_prior all_probe " << all_probe
<< dendl
;
3406 for (auto &&i
: all_probe
) {
3407 switch (f(0, i
.osd
, nullptr)) {
3421 past_intervals
.iterate_mayberw_back_to(
3423 [&](epoch_t start
, const set
<pg_shard_t
> &acting
) {
3424 ldpp_dout(dpp
, 10) << "build_prior maybe_rw interval:" << start
3425 << ", acting: " << acting
<< dendl
;
3427 // look at candidate osds during this interval. each falls into
3428 // one of three categories: up, down (but potentially
3429 // interesting), or lost (down, but we won't wait for it).
3430 set
<pg_shard_t
> up_now
;
3431 map
<int, epoch_t
> candidate_blocked_by
;
3432 // any candidates down now (that might have useful data)
3433 bool any_down_now
= false;
3435 // consider ACTING osds
3436 for (auto &&so
: acting
) {
3437 epoch_t lost_at
= 0;
3438 switch (f(start
, so
.osd
, &lost_at
)) {
3440 // include past acting osds if they are up.
3445 ldpp_dout(dpp
, 10) << "build_prior prior osd." << so
.osd
3446 << " no longer exists" << dendl
;
3450 ldpp_dout(dpp
, 10) << "build_prior prior osd." << so
.osd
3451 << " is down, but lost_at " << lost_at
<< dendl
;
3456 ldpp_dout(dpp
, 10) << "build_prior prior osd." << so
.osd
3457 << " is down" << dendl
;
3458 candidate_blocked_by
[so
.osd
] = lost_at
;
3459 any_down_now
= true;
3465 // if not enough osds survived this interval, and we may have gone rw,
3466 // then we need to wait for one of those osds to recover to
3467 // ensure that we haven't lost any information.
3468 if (!(*pcontdec
)(up_now
) && any_down_now
) {
3469 // fixme: how do we identify a "clean" shutdown anyway?
3470 ldpp_dout(dpp
, 10) << "build_prior possibly went active+rw,"
3471 << " insufficient up; including down osds" << dendl
;
3472 ceph_assert(!candidate_blocked_by
.empty());
3475 candidate_blocked_by
.begin(),
3476 candidate_blocked_by
.end());
3480 ldpp_dout(dpp
, 10) << "build_prior final: probe " << probe
3482 << " blocked_by " << blocked_by
3483 << (pg_down
? " pg_down":"")
3488 * pg_query_t - used to ask a peer for information about a pg.
3490 * note: if version=0, type=LOG, then we just provide our full log.
3499 std::string_view
get_type_name() const {
3501 case INFO
: return "info";
3502 case LOG
: return "log";
3503 case MISSING
: return "missing";
3504 case FULLLOG
: return "fulllog";
3505 default: return "???";
3511 pg_history_t history
;
3516 pg_query_t() : type(-1), epoch_sent(0), to(shard_id_t::NO_SHARD
),
3517 from(shard_id_t::NO_SHARD
) {}
3522 const pg_history_t
& h
,
3526 epoch_sent(epoch_sent
),
3527 to(to
), from(from
) {
3528 ceph_assert(t
!= LOG
);
3535 const pg_history_t
& h
,
3537 : type(t
), since(s
), history(h
),
3538 epoch_sent(epoch_sent
), to(to
), from(from
) {
3539 ceph_assert(t
== LOG
);
3542 void encode(bufferlist
&bl
, uint64_t features
) const;
3543 void decode(bufferlist::const_iterator
&bl
);
3545 void dump(Formatter
*f
) const;
3546 static void generate_test_instances(list
<pg_query_t
*>& o
);
3548 WRITE_CLASS_ENCODER_FEATURES(pg_query_t
)
3550 inline ostream
& operator<<(ostream
& out
, const pg_query_t
& q
) {
3551 out
<< "query(" << q
.get_type_name() << " " << q
.since
;
3552 if (q
.type
== pg_query_t::LOG
)
3553 out
<< " " << q
.history
;
3554 out
<< " epoch_sent " << q
.epoch_sent
;
3560 class ObjectModDesc
{
3561 bool can_local_rollback
;
3562 bool rollback_info_completed
;
3564 // version required to decode, reflected in encode/decode version
3565 __u8 max_required_version
= 1;
3569 virtual void append(uint64_t old_offset
) {}
3570 virtual void setattrs(map
<string
, boost::optional
<bufferlist
> > &attrs
) {}
3571 virtual void rmobject(version_t old_version
) {}
3573 * Used to support the unfound_lost_delete log event: if the stashed
3574 * version exists, we unstash it, otherwise, we do nothing. This way
3575 * each replica rolls back to whatever state it had prior to the attempt
3576 * at mark unfound lost delete
3578 virtual void try_rmobject(version_t old_version
) {
3579 rmobject(old_version
);
3581 virtual void create() {}
3582 virtual void update_snaps(const set
<snapid_t
> &old_snaps
) {}
3583 virtual void rollback_extents(
3585 const vector
<pair
<uint64_t, uint64_t> > &extents
) {}
3586 virtual ~Visitor() {}
3588 void visit(Visitor
*visitor
) const;
3589 mutable bufferlist bl
;
3597 ROLLBACK_EXTENTS
= 7
3599 ObjectModDesc() : can_local_rollback(true), rollback_info_completed(false) {
3600 bl
.reassign_to_mempool(mempool::mempool_osd_pglog
);
3602 void claim(ObjectModDesc
&other
) {
3605 can_local_rollback
= other
.can_local_rollback
;
3606 rollback_info_completed
= other
.rollback_info_completed
;
3608 void claim_append(ObjectModDesc
&other
) {
3609 if (!can_local_rollback
|| rollback_info_completed
)
3611 if (!other
.can_local_rollback
) {
3612 mark_unrollbackable();
3615 bl
.claim_append(other
.bl
);
3616 rollback_info_completed
= other
.rollback_info_completed
;
3618 void swap(ObjectModDesc
&other
) {
3622 swap(other
.can_local_rollback
, can_local_rollback
);
3623 swap(other
.rollback_info_completed
, rollback_info_completed
);
3624 swap(other
.max_required_version
, max_required_version
);
3626 void append_id(ModID id
) {
3631 void append(uint64_t old_size
) {
3632 if (!can_local_rollback
|| rollback_info_completed
)
3634 ENCODE_START(1, 1, bl
);
3636 encode(old_size
, bl
);
3639 void setattrs(map
<string
, boost::optional
<bufferlist
> > &old_attrs
) {
3640 if (!can_local_rollback
|| rollback_info_completed
)
3642 ENCODE_START(1, 1, bl
);
3643 append_id(SETATTRS
);
3644 encode(old_attrs
, bl
);
3647 bool rmobject(version_t deletion_version
) {
3648 if (!can_local_rollback
|| rollback_info_completed
)
3650 ENCODE_START(1, 1, bl
);
3652 encode(deletion_version
, bl
);
3654 rollback_info_completed
= true;
3657 bool try_rmobject(version_t deletion_version
) {
3658 if (!can_local_rollback
|| rollback_info_completed
)
3660 ENCODE_START(1, 1, bl
);
3661 append_id(TRY_DELETE
);
3662 encode(deletion_version
, bl
);
3664 rollback_info_completed
= true;
3668 if (!can_local_rollback
|| rollback_info_completed
)
3670 rollback_info_completed
= true;
3671 ENCODE_START(1, 1, bl
);
3675 void update_snaps(const set
<snapid_t
> &old_snaps
) {
3676 if (!can_local_rollback
|| rollback_info_completed
)
3678 ENCODE_START(1, 1, bl
);
3679 append_id(UPDATE_SNAPS
);
3680 encode(old_snaps
, bl
);
3683 void rollback_extents(
3684 version_t gen
, const vector
<pair
<uint64_t, uint64_t> > &extents
) {
3685 ceph_assert(can_local_rollback
);
3686 ceph_assert(!rollback_info_completed
);
3687 if (max_required_version
< 2)
3688 max_required_version
= 2;
3689 ENCODE_START(2, 2, bl
);
3690 append_id(ROLLBACK_EXTENTS
);
3692 encode(extents
, bl
);
3696 // cannot be rolled back
3697 void mark_unrollbackable() {
3698 can_local_rollback
= false;
3701 bool can_rollback() const {
3702 return can_local_rollback
;
3704 bool empty() const {
3705 return can_local_rollback
&& (bl
.length() == 0);
3708 bool requires_kraken() const {
3709 return max_required_version
>= 2;
3713 * Create fresh copy of bl bytes to avoid keeping large buffers around
3714 * in the case that bl contains ptrs which point into a much larger
3717 void trim_bl() const {
3718 if (bl
.length() > 0)
3721 void encode(bufferlist
&bl
) const;
3722 void decode(bufferlist::const_iterator
&bl
);
3723 void dump(Formatter
*f
) const;
3724 static void generate_test_instances(list
<ObjectModDesc
*>& o
);
3726 WRITE_CLASS_ENCODER(ObjectModDesc
)
3730 * pg_log_entry_t - single entry/event in pg log
3733 struct pg_log_entry_t
{
3735 MODIFY
= 1, // some unspecified modification (but not *all* modifications)
3736 CLONE
= 2, // cloned object from head
3737 DELETE
= 3, // deleted object
3738 //BACKLOG = 4, // event invented by generate_backlog [obsolete]
3739 LOST_REVERT
= 5, // lost new version, revert to an older version.
3740 LOST_DELETE
= 6, // lost new version, revert to no object (deleted).
3741 LOST_MARK
= 7, // lost new version, now EIO
3742 PROMOTE
= 8, // promoted object from another tier
3743 CLEAN
= 9, // mark an object clean
3744 ERROR
= 10, // write that returned an error
3746 static const char *get_op_name(int op
) {
3770 const char *get_op_name() const {
3771 return get_op_name(op
);
3774 // describes state for a locally-rollbackable entry
3775 ObjectModDesc mod_desc
;
3776 bufferlist snaps
; // only for clone entries
3778 osd_reqid_t reqid
; // caller+tid to uniquely identify request
3779 mempool::osd_pglog::vector
<pair
<osd_reqid_t
, version_t
> > extra_reqids
;
3781 /// map extra_reqids by index to error return code (if any)
3782 mempool::osd_pglog::map
<uint32_t, int> extra_reqid_return_codes
;
3784 eversion_t version
, prior_version
, reverting_to
;
3785 version_t user_version
; // the user version for this entry
3786 utime_t mtime
; // this is the _user_ mtime, mind you
3787 int32_t return_code
; // only stored for ERRORs for dup detection
3790 bool invalid_hash
; // only when decoding sobject_t based entries
3791 bool invalid_pool
; // only when decoding pool-less hobject based entries
3794 : user_version(0), return_code(0), op(0),
3795 invalid_hash(false), invalid_pool(false) {
3796 snaps
.reassign_to_mempool(mempool::mempool_osd_pglog
);
3798 pg_log_entry_t(int _op
, const hobject_t
& _soid
,
3799 const eversion_t
& v
, const eversion_t
& pv
,
3801 const osd_reqid_t
& rid
, const utime_t
& mt
,
3803 : soid(_soid
), reqid(rid
), version(v
), prior_version(pv
), user_version(uv
),
3804 mtime(mt
), return_code(return_code
), op(_op
),
3805 invalid_hash(false), invalid_pool(false) {
3806 snaps
.reassign_to_mempool(mempool::mempool_osd_pglog
);
3809 bool is_clone() const { return op
== CLONE
; }
3810 bool is_modify() const { return op
== MODIFY
; }
3811 bool is_promote() const { return op
== PROMOTE
; }
3812 bool is_clean() const { return op
== CLEAN
; }
3813 bool is_lost_revert() const { return op
== LOST_REVERT
; }
3814 bool is_lost_delete() const { return op
== LOST_DELETE
; }
3815 bool is_lost_mark() const { return op
== LOST_MARK
; }
3816 bool is_error() const { return op
== ERROR
; }
3818 bool is_update() const {
3820 is_clone() || is_modify() || is_promote() || is_clean() ||
3821 is_lost_revert() || is_lost_mark();
3823 bool is_delete() const {
3824 return op
== DELETE
|| op
== LOST_DELETE
;
3827 bool can_rollback() const {
3828 return mod_desc
.can_rollback();
3831 void mark_unrollbackable() {
3832 mod_desc
.mark_unrollbackable();
3835 bool requires_kraken() const {
3836 return mod_desc
.requires_kraken();
3839 // Errors are only used for dup detection, whereas
3840 // the index by objects is used by recovery, copy_get,
3841 // and other facilities that don't expect or need to
3842 // be aware of error entries.
3843 bool object_is_indexed() const {
3847 bool reqid_is_indexed() const {
3848 return reqid
!= osd_reqid_t() &&
3849 (op
== MODIFY
|| op
== DELETE
|| op
== ERROR
);
3852 string
get_key_name() const;
3853 void encode_with_checksum(bufferlist
& bl
) const;
3854 void decode_with_checksum(bufferlist::const_iterator
& p
);
3856 void encode(bufferlist
&bl
) const;
3857 void decode(bufferlist::const_iterator
&bl
);
3858 void dump(Formatter
*f
) const;
3859 static void generate_test_instances(list
<pg_log_entry_t
*>& o
);
3862 WRITE_CLASS_ENCODER(pg_log_entry_t
)
3864 ostream
& operator<<(ostream
& out
, const pg_log_entry_t
& e
);
3866 struct pg_log_dup_t
{
3867 osd_reqid_t reqid
; // caller+tid to uniquely identify request
3869 version_t user_version
; // the user version for this entry
3870 int32_t return_code
; // only stored for ERRORs for dup detection
3873 : user_version(0), return_code(0)
3875 explicit pg_log_dup_t(const pg_log_entry_t
& entry
)
3876 : reqid(entry
.reqid
), version(entry
.version
),
3877 user_version(entry
.user_version
), return_code(entry
.return_code
)
3879 pg_log_dup_t(const eversion_t
& v
, version_t uv
,
3880 const osd_reqid_t
& rid
, int return_code
)
3881 : reqid(rid
), version(v
), user_version(uv
),
3882 return_code(return_code
)
3885 string
get_key_name() const;
3886 void encode(bufferlist
&bl
) const;
3887 void decode(bufferlist::const_iterator
&bl
);
3888 void dump(Formatter
*f
) const;
3889 static void generate_test_instances(list
<pg_log_dup_t
*>& o
);
3891 bool operator==(const pg_log_dup_t
&rhs
) const {
3892 return reqid
== rhs
.reqid
&&
3893 version
== rhs
.version
&&
3894 user_version
== rhs
.user_version
&&
3895 return_code
== rhs
.return_code
;
3897 bool operator!=(const pg_log_dup_t
&rhs
) const {
3898 return !(*this == rhs
);
3901 friend std::ostream
& operator<<(std::ostream
& out
, const pg_log_dup_t
& e
);
3903 WRITE_CLASS_ENCODER(pg_log_dup_t
)
3905 std::ostream
& operator<<(std::ostream
& out
, const pg_log_dup_t
& e
);
3908 * pg_log_t - incremental log of recent pg changes.
3910 * serves as a recovery queue for recent changes.
3914 * head - newest entry (update|delete)
3915 * tail - entry previous to oldest (update|delete) for which we have
3916 * complete negative information.
3917 * i.e. we can infer pg contents for any store whose last_update >= tail.
3919 eversion_t head
; // newest entry
3920 eversion_t tail
; // version prior to oldest
3923 // We can rollback rollback-able entries > can_rollback_to
3924 eversion_t can_rollback_to
;
3926 // always <= can_rollback_to, indicates how far stashed rollback
3927 // data can be found
3928 eversion_t rollback_info_trimmed_to
;
3932 mempool::osd_pglog::list
<pg_log_entry_t
> log
;
3934 // entries just for dup op detection ordered oldest to newest
3935 mempool::osd_pglog::list
<pg_log_dup_t
> dups
;
3937 pg_log_t() = default;
3938 pg_log_t(const eversion_t
&last_update
,
3939 const eversion_t
&log_tail
,
3940 const eversion_t
&can_rollback_to
,
3941 const eversion_t
&rollback_info_trimmed_to
,
3942 mempool::osd_pglog::list
<pg_log_entry_t
> &&entries
,
3943 mempool::osd_pglog::list
<pg_log_dup_t
> &&dup_entries
)
3944 : head(last_update
), tail(log_tail
), can_rollback_to(can_rollback_to
),
3945 rollback_info_trimmed_to(rollback_info_trimmed_to
),
3946 log(std::move(entries
)), dups(std::move(dup_entries
)) {}
3947 pg_log_t(const eversion_t
&last_update
,
3948 const eversion_t
&log_tail
,
3949 const eversion_t
&can_rollback_to
,
3950 const eversion_t
&rollback_info_trimmed_to
,
3951 const std::list
<pg_log_entry_t
> &entries
,
3952 const std::list
<pg_log_dup_t
> &dup_entries
)
3953 : head(last_update
), tail(log_tail
), can_rollback_to(can_rollback_to
),
3954 rollback_info_trimmed_to(rollback_info_trimmed_to
) {
3955 for (auto &&entry
: entries
) {
3956 log
.push_back(entry
);
3958 for (auto &&entry
: dup_entries
) {
3959 dups
.push_back(entry
);
3965 rollback_info_trimmed_to
= can_rollback_to
= head
= tail
= z
;
3970 eversion_t
get_rollback_info_trimmed_to() const {
3971 return rollback_info_trimmed_to
;
3973 eversion_t
get_can_rollback_to() const {
3974 return can_rollback_to
;
3978 pg_log_t
split_out_child(pg_t child_pgid
, unsigned split_bits
) {
3979 mempool::osd_pglog::list
<pg_log_entry_t
> oldlog
, childlog
;
3982 eversion_t old_tail
;
3983 unsigned mask
= ~((~0)<<split_bits
);
3984 for (auto i
= oldlog
.begin();
3987 if ((i
->soid
.get_hash() & mask
) == child_pgid
.m_seed
) {
3988 childlog
.push_back(*i
);
3995 // osd_reqid is unique, so it doesn't matter if there are extra
3996 // dup entries in each pg. To avoid storing oid with the dup
3997 // entries, just copy the whole list.
3998 auto childdups(dups
);
4004 rollback_info_trimmed_to
,
4005 std::move(childlog
),
4006 std::move(childdups
));
4009 mempool::osd_pglog::list
<pg_log_entry_t
> rewind_from_head(eversion_t newhead
) {
4010 ceph_assert(newhead
>= tail
);
4012 mempool::osd_pglog::list
<pg_log_entry_t
>::iterator p
= log
.end();
4013 mempool::osd_pglog::list
<pg_log_entry_t
> divergent
;
4015 if (p
== log
.begin()) {
4016 // yikes, the whole thing is divergent!
4018 swap(divergent
, log
);
4022 if (p
->version
.version
<= newhead
.version
) {
4024 * look at eversion.version here. we want to avoid a situation like:
4025 * our log: 100'10 (0'0) m 10000004d3a.00000000/head by client4225.1:18529
4026 * new log: 122'10 (0'0) m 10000004d3a.00000000/head by client4225.1:18529
4027 * lower_bound = 100'9
4028 * i.e, same request, different version. If the eversion.version is > the
4029 * lower_bound, we it is divergent.
4032 divergent
.splice(divergent
.begin(), log
, p
, log
.end());
4035 ceph_assert(p
->version
> newhead
);
4039 if (can_rollback_to
> newhead
)
4040 can_rollback_to
= newhead
;
4042 if (rollback_info_trimmed_to
> newhead
)
4043 rollback_info_trimmed_to
= newhead
;
4048 void merge_from(const vector
<pg_log_t
*>& slogs
, eversion_t last_update
) {
4051 // sort and merge dups
4052 multimap
<eversion_t
,pg_log_dup_t
> sorted
;
4053 for (auto& d
: dups
) {
4054 sorted
.emplace(d
.version
, d
);
4056 for (auto l
: slogs
) {
4057 for (auto& d
: l
->dups
) {
4058 sorted
.emplace(d
.version
, d
);
4062 for (auto& i
: sorted
) {
4063 dups
.push_back(i
.second
);
4068 can_rollback_to
= last_update
;
4069 rollback_info_trimmed_to
= last_update
;
4072 bool empty() const {
4077 return head
.version
== 0 && head
.epoch
== 0;
4080 size_t approx_size() const {
4081 return head
.version
- tail
.version
;
4084 static void filter_log(spg_t import_pgid
, const OSDMap
&curmap
,
4085 const string
&hit_set_namespace
, const pg_log_t
&in
,
4086 pg_log_t
&out
, pg_log_t
&reject
);
4089 * copy entries from the tail of another pg_log_t
4091 * @param other pg_log_t to copy from
4092 * @param from copy entries after this version
4094 void copy_after(CephContext
* cct
, const pg_log_t
&other
, eversion_t from
);
4097 * copy up to N entries
4099 * @param other source log
4100 * @param max max number of entries to copy
4102 void copy_up_to(CephContext
* cct
, const pg_log_t
&other
, int max
);
4104 ostream
& print(ostream
& out
) const;
4106 void encode(bufferlist
&bl
) const;
4107 void decode(bufferlist::const_iterator
&bl
, int64_t pool
= -1);
4108 void dump(Formatter
*f
) const;
4109 static void generate_test_instances(list
<pg_log_t
*>& o
);
4111 WRITE_CLASS_ENCODER(pg_log_t
)
4113 inline ostream
& operator<<(ostream
& out
, const pg_log_t
& log
)
4115 out
<< "log((" << log
.tail
<< "," << log
.head
<< "], crt="
4116 << log
.get_can_rollback_to() << ")";
4122 * pg_missing_t - summary of missing objects.
4124 * kept in memory, as a supplement to pg_log_t
4125 * also used to pass missing info in messages.
4127 struct pg_missing_item
{
4128 eversion_t need
, have
;
4129 enum missing_flags_t
{
4133 pg_missing_item() : flags(FLAG_NONE
) {}
4134 explicit pg_missing_item(eversion_t n
) : need(n
), flags(FLAG_NONE
) {} // have no old version
4135 pg_missing_item(eversion_t n
, eversion_t h
, bool is_delete
=false) : need(n
), have(h
) {
4136 set_delete(is_delete
);
4139 void encode(bufferlist
& bl
, uint64_t features
) const {
4141 if (HAVE_FEATURE(features
, OSD_RECOVERY_DELETES
)) {
4142 // encoding a zeroed eversion_t to differentiate between this and
4143 // legacy unversioned encoding - a need value of 0'0 is not
4144 // possible. This can be replaced with the legacy encoding
4145 // macros post-luminous.
4150 encode(static_cast<uint8_t>(flags
), bl
);
4152 // legacy unversioned encoding
4157 void decode(bufferlist::const_iterator
& bl
) {
4161 if (e
!= eversion_t()) {
4162 // legacy encoding, this is the need value
4170 flags
= static_cast<missing_flags_t
>(f
);
4174 void set_delete(bool is_delete
) {
4175 flags
= is_delete
? FLAG_DELETE
: FLAG_NONE
;
4178 bool is_delete() const {
4179 return (flags
& FLAG_DELETE
) == FLAG_DELETE
;
4182 string
flag_str() const {
4183 if (flags
== FLAG_NONE
) {
4190 void dump(Formatter
*f
) const {
4191 f
->dump_stream("need") << need
;
4192 f
->dump_stream("have") << have
;
4193 f
->dump_stream("flags") << flag_str();
4195 static void generate_test_instances(list
<pg_missing_item
*>& o
) {
4196 o
.push_back(new pg_missing_item
);
4197 o
.push_back(new pg_missing_item
);
4198 o
.back()->need
= eversion_t(1, 2);
4199 o
.back()->have
= eversion_t(1, 1);
4200 o
.push_back(new pg_missing_item
);
4201 o
.back()->need
= eversion_t(3, 5);
4202 o
.back()->have
= eversion_t(3, 4);
4203 o
.back()->flags
= FLAG_DELETE
;
4205 bool operator==(const pg_missing_item
&rhs
) const {
4206 return need
== rhs
.need
&& have
== rhs
.have
&& flags
== rhs
.flags
;
4208 bool operator!=(const pg_missing_item
&rhs
) const {
4209 return !(*this == rhs
);
4212 WRITE_CLASS_ENCODER_FEATURES(pg_missing_item
)
4213 ostream
& operator<<(ostream
& out
, const pg_missing_item
&item
);
4215 class pg_missing_const_i
{
4217 virtual const map
<hobject_t
, pg_missing_item
> &
4218 get_items() const = 0;
4219 virtual const map
<version_t
, hobject_t
> &get_rmissing() const = 0;
4220 virtual bool get_may_include_deletes() const = 0;
4221 virtual unsigned int num_missing() const = 0;
4222 virtual bool have_missing() const = 0;
4223 virtual bool is_missing(const hobject_t
& oid
, pg_missing_item
*out
= nullptr) const = 0;
4224 virtual bool is_missing(const hobject_t
& oid
, eversion_t v
) const = 0;
4225 virtual ~pg_missing_const_i() {}
4229 template <bool Track
>
4230 class ChangeTracker
{
4232 void changed(const hobject_t
&obj
) {}
4233 template <typename F
>
4234 void get_changed(F
&&f
) const {}
4236 bool is_clean() const {
4241 class ChangeTracker
<true> {
4242 set
<hobject_t
> _changed
;
4244 void changed(const hobject_t
&obj
) {
4245 _changed
.insert(obj
);
4247 template <typename F
>
4248 void get_changed(F
&&f
) const {
4249 for (auto const &i
: _changed
) {
4256 bool is_clean() const {
4257 return _changed
.empty();
4261 template <bool TrackChanges
>
4262 class pg_missing_set
: public pg_missing_const_i
{
4263 using item
= pg_missing_item
;
4264 map
<hobject_t
, item
> missing
; // oid -> (need v, have v)
4265 map
<version_t
, hobject_t
> rmissing
; // v -> oid
4266 ChangeTracker
<TrackChanges
> tracker
;
4269 pg_missing_set() = default;
4271 template <typename missing_type
>
4272 pg_missing_set(const missing_type
&m
) {
4273 missing
= m
.get_items();
4274 rmissing
= m
.get_rmissing();
4275 may_include_deletes
= m
.get_may_include_deletes();
4276 for (auto &&i
: missing
)
4277 tracker
.changed(i
.first
);
4280 bool may_include_deletes
= false;
4282 const map
<hobject_t
, item
> &get_items() const override
{
4285 const map
<version_t
, hobject_t
> &get_rmissing() const override
{
4288 bool get_may_include_deletes() const override
{
4289 return may_include_deletes
;
4291 unsigned int num_missing() const override
{
4292 return missing
.size();
4294 bool have_missing() const override
{
4295 return !missing
.empty();
4297 bool is_missing(const hobject_t
& oid
, pg_missing_item
*out
= nullptr) const override
{
4298 auto iter
= missing
.find(oid
);
4299 if (iter
== missing
.end())
4302 *out
= iter
->second
;
4305 bool is_missing(const hobject_t
& oid
, eversion_t v
) const override
{
4306 map
<hobject_t
, item
>::const_iterator m
=
4308 if (m
== missing
.end())
4310 const item
&item(m
->second
);
4315 eversion_t
get_oldest_need() const {
4316 if (missing
.empty()) {
4317 return eversion_t();
4319 auto it
= missing
.find(rmissing
.begin()->second
);
4320 ceph_assert(it
!= missing
.end());
4321 return it
->second
.need
;
4324 void claim(pg_missing_set
& o
) {
4325 static_assert(!TrackChanges
, "Can't use claim with TrackChanges");
4326 missing
.swap(o
.missing
);
4327 rmissing
.swap(o
.rmissing
);
4331 * this needs to be called in log order as we extend the log. it
4332 * assumes missing is accurate up through the previous log entry.
4334 void add_next_event(const pg_log_entry_t
& e
) {
4335 map
<hobject_t
, item
>::iterator missing_it
;
4336 missing_it
= missing
.find(e
.soid
);
4337 bool is_missing_divergent_item
= missing_it
!= missing
.end();
4338 if (e
.prior_version
== eversion_t() || e
.is_clone()) {
4340 if (is_missing_divergent_item
) { // use iterator
4341 rmissing
.erase((missing_it
->second
).need
.version
);
4342 missing_it
->second
= item(e
.version
, eversion_t(), e
.is_delete()); // .have = nil
4343 } else // create new element in missing map
4344 missing
[e
.soid
] = item(e
.version
, eversion_t(), e
.is_delete()); // .have = nil
4345 } else if (is_missing_divergent_item
) {
4346 // already missing (prior).
4347 rmissing
.erase((missing_it
->second
).need
.version
);
4348 (missing_it
->second
).need
= e
.version
; // leave .have unchanged.
4349 missing_it
->second
.set_delete(e
.is_delete());
4351 // not missing, we must have prior_version (if any)
4352 ceph_assert(!is_missing_divergent_item
);
4353 missing
[e
.soid
] = item(e
.version
, e
.prior_version
, e
.is_delete());
4355 rmissing
[e
.version
.version
] = e
.soid
;
4356 tracker
.changed(e
.soid
);
4359 void revise_need(hobject_t oid
, eversion_t need
, bool is_delete
) {
4360 if (missing
.count(oid
)) {
4361 rmissing
.erase(missing
[oid
].need
.version
);
4362 missing
[oid
].need
= need
; // no not adjust .have
4363 missing
[oid
].set_delete(is_delete
);
4365 missing
[oid
] = item(need
, eversion_t(), is_delete
);
4367 rmissing
[need
.version
] = oid
;
4369 tracker
.changed(oid
);
4372 void revise_have(hobject_t oid
, eversion_t have
) {
4373 if (missing
.count(oid
)) {
4374 tracker
.changed(oid
);
4375 missing
[oid
].have
= have
;
4379 void add(const hobject_t
& oid
, eversion_t need
, eversion_t have
,
4381 missing
[oid
] = item(need
, have
, is_delete
);
4382 rmissing
[need
.version
] = oid
;
4383 tracker
.changed(oid
);
4386 void rm(const hobject_t
& oid
, eversion_t v
) {
4387 std::map
<hobject_t
, item
>::iterator p
= missing
.find(oid
);
4388 if (p
!= missing
.end() && p
->second
.need
<= v
)
4392 void rm(std::map
<hobject_t
, item
>::const_iterator m
) {
4393 tracker
.changed(m
->first
);
4394 rmissing
.erase(m
->second
.need
.version
);
4398 void got(const hobject_t
& oid
, eversion_t v
) {
4399 std::map
<hobject_t
, item
>::iterator p
= missing
.find(oid
);
4400 ceph_assert(p
!= missing
.end());
4401 ceph_assert(p
->second
.need
<= v
|| p
->second
.is_delete());
4405 void got(std::map
<hobject_t
, item
>::const_iterator m
) {
4406 tracker
.changed(m
->first
);
4407 rmissing
.erase(m
->second
.need
.version
);
4413 unsigned split_bits
,
4414 pg_missing_set
*omissing
) {
4415 omissing
->may_include_deletes
= may_include_deletes
;
4416 unsigned mask
= ~((~0)<<split_bits
);
4417 for (map
<hobject_t
, item
>::iterator i
= missing
.begin();
4420 if ((i
->first
.get_hash() & mask
) == child_pgid
.m_seed
) {
4421 omissing
->add(i
->first
, i
->second
.need
, i
->second
.have
,
4422 i
->second
.is_delete());
4431 for (auto const &i
: missing
)
4432 tracker
.changed(i
.first
);
4437 void encode(bufferlist
&bl
) const {
4438 ENCODE_START(4, 2, bl
);
4439 encode(missing
, bl
, may_include_deletes
? CEPH_FEATURE_OSD_RECOVERY_DELETES
: 0);
4440 encode(may_include_deletes
, bl
);
4443 void decode(bufferlist::const_iterator
&bl
, int64_t pool
= -1) {
4444 for (auto const &i
: missing
)
4445 tracker
.changed(i
.first
);
4446 DECODE_START_LEGACY_COMPAT_LEN(4, 2, 2, bl
);
4447 decode(missing
, bl
);
4448 if (struct_v
>= 4) {
4449 decode(may_include_deletes
, bl
);
4454 // Handle hobject_t upgrade
4455 map
<hobject_t
, item
> tmp
;
4456 for (map
<hobject_t
, item
>::iterator i
=
4460 if (!i
->first
.is_max() && i
->first
.pool
== -1) {
4461 hobject_t
to_insert(i
->first
);
4462 to_insert
.pool
= pool
;
4463 tmp
[to_insert
] = i
->second
;
4469 missing
.insert(tmp
.begin(), tmp
.end());
4472 for (map
<hobject_t
,item
>::iterator it
=
4474 it
!= missing
.end();
4476 rmissing
[it
->second
.need
.version
] = it
->first
;
4477 for (auto const &i
: missing
)
4478 tracker
.changed(i
.first
);
4480 void dump(Formatter
*f
) const {
4481 f
->open_array_section("missing");
4482 for (map
<hobject_t
,item
>::const_iterator p
=
4483 missing
.begin(); p
!= missing
.end(); ++p
) {
4484 f
->open_object_section("item");
4485 f
->dump_stream("object") << p
->first
;
4490 f
->dump_bool("may_include_deletes", may_include_deletes
);
4492 template <typename F
>
4493 void filter_objects(F
&&f
) {
4494 for (auto i
= missing
.begin(); i
!= missing
.end();) {
4502 static void generate_test_instances(list
<pg_missing_set
*>& o
) {
4503 o
.push_back(new pg_missing_set
);
4504 o
.push_back(new pg_missing_set
);
4506 hobject_t(object_t("foo"), "foo", 123, 456, 0, ""),
4507 eversion_t(5, 6), eversion_t(5, 1), false);
4508 o
.push_back(new pg_missing_set
);
4510 hobject_t(object_t("foo"), "foo", 123, 456, 0, ""),
4511 eversion_t(5, 6), eversion_t(5, 1), true);
4512 o
.back()->may_include_deletes
= true;
4514 template <typename F
>
4515 void get_changed(F
&&f
) const {
4516 tracker
.get_changed(f
);
4521 bool is_clean() const {
4522 return tracker
.is_clean();
4524 template <typename missing_t
>
4525 bool debug_verify_from_init(
4526 const missing_t
&init_missing
,
4527 ostream
*oss
) const {
4530 auto check_missing(init_missing
.get_items());
4531 tracker
.get_changed([&](const hobject_t
&hoid
) {
4532 check_missing
.erase(hoid
);
4533 if (missing
.count(hoid
)) {
4534 check_missing
.insert(*(missing
.find(hoid
)));
4538 if (check_missing
.size() != missing
.size()) {
4540 *oss
<< "Size mismatch, check: " << check_missing
.size()
4541 << ", actual: " << missing
.size() << "\n";
4545 for (auto &i
: missing
) {
4546 if (!check_missing
.count(i
.first
)) {
4548 *oss
<< "check_missing missing " << i
.first
<< "\n";
4550 } else if (check_missing
[i
.first
] != i
.second
) {
4552 *oss
<< "check_missing missing item mismatch on " << i
.first
4553 << ", check: " << check_missing
[i
.first
]
4554 << ", actual: " << i
.second
<< "\n";
4559 *oss
<< "check_missing: " << check_missing
<< "\n";
4560 set
<hobject_t
> changed
;
4561 tracker
.get_changed([&](const hobject_t
&hoid
) { changed
.insert(hoid
); });
4562 *oss
<< "changed: " << changed
<< "\n";
4567 template <bool TrackChanges
>
4569 const pg_missing_set
<TrackChanges
> &c
, bufferlist
&bl
, uint64_t features
=0) {
4572 ENCODE_DUMP_POST(cl
);
4574 template <bool TrackChanges
>
4575 void decode(pg_missing_set
<TrackChanges
> &c
, bufferlist::const_iterator
&p
) {
4578 template <bool TrackChanges
>
4579 ostream
& operator<<(ostream
& out
, const pg_missing_set
<TrackChanges
> &missing
)
4581 out
<< "missing(" << missing
.num_missing()
4582 << " may_include_deletes = " << missing
.may_include_deletes
;
4583 //if (missing.num_lost()) out << ", " << missing.num_lost() << " lost";
4588 using pg_missing_t
= pg_missing_set
<false>;
4589 using pg_missing_tracker_t
= pg_missing_set
<true>;
4593 * pg list objects response format
4596 struct pg_nls_response_t
{
4597 collection_list_handle_t handle
;
4598 list
<librados::ListObjectImpl
> entries
;
4600 void encode(bufferlist
& bl
) const {
4601 ENCODE_START(1, 1, bl
);
4603 __u32 n
= (__u32
)entries
.size();
4605 for (list
<librados::ListObjectImpl
>::const_iterator i
= entries
.begin(); i
!= entries
.end(); ++i
) {
4606 encode(i
->nspace
, bl
);
4608 encode(i
->locator
, bl
);
4612 void decode(bufferlist::const_iterator
& bl
) {
4613 DECODE_START(1, bl
);
4619 librados::ListObjectImpl i
;
4620 decode(i
.nspace
, bl
);
4622 decode(i
.locator
, bl
);
4623 entries
.push_back(i
);
4627 void dump(Formatter
*f
) const {
4628 f
->dump_stream("handle") << handle
;
4629 f
->open_array_section("entries");
4630 for (list
<librados::ListObjectImpl
>::const_iterator p
= entries
.begin(); p
!= entries
.end(); ++p
) {
4631 f
->open_object_section("object");
4632 f
->dump_string("namespace", p
->nspace
);
4633 f
->dump_string("object", p
->oid
);
4634 f
->dump_string("key", p
->locator
);
4639 static void generate_test_instances(list
<pg_nls_response_t
*>& o
) {
4640 o
.push_back(new pg_nls_response_t
);
4641 o
.push_back(new pg_nls_response_t
);
4642 o
.back()->handle
= hobject_t(object_t("hi"), "key", 1, 2, -1, "");
4643 o
.back()->entries
.push_back(librados::ListObjectImpl("", "one", ""));
4644 o
.back()->entries
.push_back(librados::ListObjectImpl("", "two", "twokey"));
4645 o
.back()->entries
.push_back(librados::ListObjectImpl("", "three", ""));
4646 o
.push_back(new pg_nls_response_t
);
4647 o
.back()->handle
= hobject_t(object_t("hi"), "key", 3, 4, -1, "");
4648 o
.back()->entries
.push_back(librados::ListObjectImpl("n1", "n1one", ""));
4649 o
.back()->entries
.push_back(librados::ListObjectImpl("n1", "n1two", "n1twokey"));
4650 o
.back()->entries
.push_back(librados::ListObjectImpl("n1", "n1three", ""));
4651 o
.push_back(new pg_nls_response_t
);
4652 o
.back()->handle
= hobject_t(object_t("hi"), "key", 5, 6, -1, "");
4653 o
.back()->entries
.push_back(librados::ListObjectImpl("", "one", ""));
4654 o
.back()->entries
.push_back(librados::ListObjectImpl("", "two", "twokey"));
4655 o
.back()->entries
.push_back(librados::ListObjectImpl("", "three", ""));
4656 o
.back()->entries
.push_back(librados::ListObjectImpl("n1", "n1one", ""));
4657 o
.back()->entries
.push_back(librados::ListObjectImpl("n1", "n1two", "n1twokey"));
4658 o
.back()->entries
.push_back(librados::ListObjectImpl("n1", "n1three", ""));
4662 WRITE_CLASS_ENCODER(pg_nls_response_t
)
4664 // For backwards compatibility with older OSD requests
4665 struct pg_ls_response_t
{
4666 collection_list_handle_t handle
;
4667 list
<pair
<object_t
, string
> > entries
;
4669 void encode(bufferlist
& bl
) const {
4674 encode(entries
, bl
);
4676 void decode(bufferlist::const_iterator
& bl
) {
4680 ceph_assert(v
== 1);
4682 decode(entries
, bl
);
4684 void dump(Formatter
*f
) const {
4685 f
->dump_stream("handle") << handle
;
4686 f
->open_array_section("entries");
4687 for (list
<pair
<object_t
, string
> >::const_iterator p
= entries
.begin(); p
!= entries
.end(); ++p
) {
4688 f
->open_object_section("object");
4689 f
->dump_stream("object") << p
->first
;
4690 f
->dump_string("key", p
->second
);
4695 static void generate_test_instances(list
<pg_ls_response_t
*>& o
) {
4696 o
.push_back(new pg_ls_response_t
);
4697 o
.push_back(new pg_ls_response_t
);
4698 o
.back()->handle
= hobject_t(object_t("hi"), "key", 1, 2, -1, "");
4699 o
.back()->entries
.push_back(make_pair(object_t("one"), string()));
4700 o
.back()->entries
.push_back(make_pair(object_t("two"), string("twokey")));
4704 WRITE_CLASS_ENCODER(pg_ls_response_t
)
4707 * object_copy_cursor_t
4709 struct object_copy_cursor_t
{
4710 uint64_t data_offset
;
4716 object_copy_cursor_t()
4718 attr_complete(false),
4719 data_complete(false),
4720 omap_complete(false)
4723 bool is_initial() const {
4724 return !attr_complete
&& data_offset
== 0 && omap_offset
.empty();
4726 bool is_complete() const {
4727 return attr_complete
&& data_complete
&& omap_complete
;
4730 static void generate_test_instances(list
<object_copy_cursor_t
*>& o
);
4731 void encode(bufferlist
& bl
) const;
4732 void decode(bufferlist::const_iterator
&bl
);
4733 void dump(Formatter
*f
) const;
4735 WRITE_CLASS_ENCODER(object_copy_cursor_t
)
4738 * object_copy_data_t
4740 * Return data from a copy request. The semantics are a little strange
4741 * as a result of the encoding's heritage.
4743 * In particular, the sender unconditionally fills in the cursor (from what
4744 * it receives and sends), the size, and the mtime, but is responsible for
4745 * figuring out whether it should put any data in the attrs, data, or
4746 * omap members (corresponding to xattrs, object data, and the omap entries)
4747 * based on external data (the client includes a max amount to return with
4748 * the copy request). The client then looks into the attrs, data, and/or omap
4749 * based on the contents of the cursor.
4751 struct object_copy_data_t
{
4753 FLAG_DATA_DIGEST
= 1<<0,
4754 FLAG_OMAP_DIGEST
= 1<<1,
4756 object_copy_cursor_t cursor
;
4759 uint32_t data_digest
, omap_digest
;
4761 map
<string
, bufferlist
> attrs
;
4763 bufferlist omap_header
;
4764 bufferlist omap_data
;
4766 /// which snaps we are defined for (if a snap and not the head)
4767 vector
<snapid_t
> snaps
;
4768 /// latest snap seq for the object (if head)
4771 /// recent reqids on this object
4772 mempool::osd_pglog::vector
<pair
<osd_reqid_t
, version_t
> > reqids
;
4774 /// map reqids by index to error return code (if any)
4775 mempool::osd_pglog::map
<uint32_t, int> reqid_return_codes
;
4777 uint64_t truncate_seq
;
4778 uint64_t truncate_size
;
4781 object_copy_data_t() :
4782 size((uint64_t)-1), data_digest(-1),
4783 omap_digest(-1), flags(0),
4787 static void generate_test_instances(list
<object_copy_data_t
*>& o
);
4788 void encode(bufferlist
& bl
, uint64_t features
) const;
4789 void decode(bufferlist::const_iterator
& bl
);
4790 void dump(Formatter
*f
) const;
4792 WRITE_CLASS_ENCODER_FEATURES(object_copy_data_t
)
4797 struct pg_create_t
{
4798 epoch_t created
; // epoch pg created
4799 pg_t parent
; // split from parent (if != pg_t())
4803 : created(0), split_bits(0) {}
4804 pg_create_t(unsigned c
, pg_t p
, int s
)
4805 : created(c
), parent(p
), split_bits(s
) {}
4807 void encode(bufferlist
&bl
) const;
4808 void decode(bufferlist::const_iterator
&bl
);
4809 void dump(Formatter
*f
) const;
4810 static void generate_test_instances(list
<pg_create_t
*>& o
);
4812 WRITE_CLASS_ENCODER(pg_create_t
)
4814 // -----------------------------------------
4816 class ObjectExtent
{
4818 * ObjectExtents are used for specifying IO behavior against RADOS
4819 * objects when one is using the ObjectCacher.
4821 * To use this in a real system, *every member* must be filled
4822 * out correctly. In particular, make sure to initialize the
4823 * oloc correctly, as its default values are deliberate poison
4824 * and will cause internal ObjectCacher asserts.
4826 * Similarly, your buffer_extents vector *must* specify a total
4827 * size equal to your length. If the buffer_extents inadvertently
4828 * contain less space than the length member specifies, you
4829 * will get unintelligible asserts deep in the ObjectCacher.
4831 * If you are trying to do testing and don't care about actual
4832 * RADOS function, the simplest thing to do is to initialize
4833 * the ObjectExtent (truncate_size can be 0), create a single entry
4834 * in buffer_extents matching the length, and set oloc.pool to 0.
4837 object_t oid
; // object id
4839 uint64_t offset
; // in object
4840 uint64_t length
; // in object
4841 uint64_t truncate_size
; // in object
4843 object_locator_t oloc
; // object locator (pool etc)
4845 vector
<pair
<uint64_t,uint64_t> > buffer_extents
; // off -> len. extents in buffer being mapped (may be fragmented bc of striping!)
4847 ObjectExtent() : objectno(0), offset(0), length(0), truncate_size(0) {}
4848 ObjectExtent(object_t o
, uint64_t ono
, uint64_t off
, uint64_t l
, uint64_t ts
) :
4849 oid(o
), objectno(ono
), offset(off
), length(l
), truncate_size(ts
) { }
4852 inline ostream
& operator<<(ostream
& out
, const ObjectExtent
&ex
)
4854 return out
<< "extent("
4855 << ex
.oid
<< " (" << ex
.objectno
<< ") in " << ex
.oloc
4856 << " " << ex
.offset
<< "~" << ex
.length
4857 << " -> " << ex
.buffer_extents
4862 // ---------------------------------------
4864 class OSDSuperblock
{
4866 uuid_d cluster_fsid
, osd_fsid
;
4867 int32_t whoami
; // my role in this fs.
4868 epoch_t current_epoch
; // most recent epoch
4869 epoch_t oldest_map
, newest_map
; // oldest/newest maps we have.
4872 CompatSet compat_features
;
4874 // last interval over which i mounted and was then active
4875 epoch_t mounted
; // last epoch i mounted
4876 epoch_t clean_thru
; // epoch i was active and clean thru
4880 current_epoch(0), oldest_map(0), newest_map(0), weight(0),
4881 mounted(0), clean_thru(0) {
4884 void encode(bufferlist
&bl
) const;
4885 void decode(bufferlist::const_iterator
&bl
);
4886 void dump(Formatter
*f
) const;
4887 static void generate_test_instances(list
<OSDSuperblock
*>& o
);
4889 WRITE_CLASS_ENCODER(OSDSuperblock
)
4891 inline ostream
& operator<<(ostream
& out
, const OSDSuperblock
& sb
)
4893 return out
<< "sb(" << sb
.cluster_fsid
4894 << " osd." << sb
.whoami
4895 << " " << sb
.osd_fsid
4896 << " e" << sb
.current_epoch
4897 << " [" << sb
.oldest_map
<< "," << sb
.newest_map
<< "]"
4898 << " lci=[" << sb
.mounted
<< "," << sb
.clean_thru
<< "]"
4911 * attached to object head. describes most recent snap context, and
4912 * set of existing clones.
4916 vector
<snapid_t
> snaps
; // descending
4917 vector
<snapid_t
> clones
; // ascending
4918 map
<snapid_t
, interval_set
<uint64_t> > clone_overlap
; // overlap w/ next newest
4919 map
<snapid_t
, uint64_t> clone_size
;
4920 map
<snapid_t
, vector
<snapid_t
>> clone_snaps
; // descending
4922 SnapSet() : seq(0) {}
4923 explicit SnapSet(bufferlist
& bl
) {
4924 auto p
= std::cbegin(bl
);
4928 /// populate SnapSet from a librados::snap_set_t
4929 void from_snap_set(const librados::snap_set_t
& ss
, bool legacy
);
4931 /// get space accounted to clone
4932 uint64_t get_clone_bytes(snapid_t clone
) const;
4934 void encode(bufferlist
& bl
) const;
4935 void decode(bufferlist::const_iterator
& bl
);
4936 void dump(Formatter
*f
) const;
4937 static void generate_test_instances(list
<SnapSet
*>& o
);
4939 SnapContext
get_ssc_as_of(snapid_t as_of
) const {
4942 for (vector
<snapid_t
>::const_iterator i
= snaps
.begin();
4946 out
.snaps
.push_back(*i
);
4952 SnapSet
get_filtered(const pg_pool_t
&pinfo
) const;
4953 void filter(const pg_pool_t
&pinfo
);
4955 WRITE_CLASS_ENCODER(SnapSet
)
4957 ostream
& operator<<(ostream
& out
, const SnapSet
& cs
);
4962 #define SS_ATTR "snapset"
4964 struct watch_info_t
{
4966 uint32_t timeout_seconds
;
4969 watch_info_t() : cookie(0), timeout_seconds(0) { }
4970 watch_info_t(uint64_t c
, uint32_t t
, const entity_addr_t
& a
) : cookie(c
), timeout_seconds(t
), addr(a
) {}
4972 void encode(bufferlist
& bl
, uint64_t features
) const;
4973 void decode(bufferlist::const_iterator
& bl
);
4974 void dump(Formatter
*f
) const;
4975 static void generate_test_instances(list
<watch_info_t
*>& o
);
4977 WRITE_CLASS_ENCODER_FEATURES(watch_info_t
)
4979 static inline bool operator==(const watch_info_t
& l
, const watch_info_t
& r
) {
4980 return l
.cookie
== r
.cookie
&& l
.timeout_seconds
== r
.timeout_seconds
4981 && l
.addr
== r
.addr
;
4984 static inline ostream
& operator<<(ostream
& out
, const watch_info_t
& w
) {
4985 return out
<< "watch(cookie " << w
.cookie
<< " " << w
.timeout_seconds
<< "s"
4986 << " " << w
.addr
<< ")";
4989 struct notify_info_t
{
4996 static inline ostream
& operator<<(ostream
& out
, const notify_info_t
& n
) {
4997 return out
<< "notify(cookie " << n
.cookie
4998 << " notify" << n
.notify_id
4999 << " " << n
.timeout
<< "s)";
5002 struct chunk_info_t
{
5006 FLAG_HAS_REFERENCE
= 4,
5007 FLAG_HAS_FINGERPRINT
= 8,
5012 cflag_t flags
; // FLAG_*
5014 chunk_info_t() : offset(0), length(0), flags((cflag_t
)0) { }
5016 static string
get_flag_string(uint64_t flags
) {
5018 if (flags
& FLAG_DIRTY
) {
5021 if (flags
& FLAG_MISSING
) {
5024 if (flags
& FLAG_HAS_REFERENCE
) {
5025 r
+= "|has_reference";
5027 if (flags
& FLAG_HAS_FINGERPRINT
) {
5028 r
+= "|has_fingerprint";
5034 bool test_flag(cflag_t f
) const {
5035 return (flags
& f
) == f
;
5037 void set_flag(cflag_t f
) {
5038 flags
= (cflag_t
)(flags
| f
);
5040 void set_flags(cflag_t f
) {
5043 void clear_flag(cflag_t f
) {
5044 flags
= (cflag_t
)(flags
& ~f
);
5046 void clear_flags() {
5049 bool is_dirty() const {
5050 return test_flag(FLAG_DIRTY
);
5052 bool is_missing() const {
5053 return test_flag(FLAG_MISSING
);
5055 bool has_reference() const {
5056 return test_flag(FLAG_HAS_REFERENCE
);
5058 bool has_fingerprint() const {
5059 return test_flag(FLAG_HAS_FINGERPRINT
);
5061 void encode(bufferlist
&bl
) const;
5062 void decode(bufferlist::const_iterator
&bl
);
5063 void dump(Formatter
*f
) const;
5064 friend ostream
& operator<<(ostream
& out
, const chunk_info_t
& ci
);
5066 WRITE_CLASS_ENCODER(chunk_info_t
)
5067 ostream
& operator<<(ostream
& out
, const chunk_info_t
& ci
);
5069 struct object_info_t
;
5070 struct object_manifest_t
{
5076 uint8_t type
; // redirect, chunked, ...
5077 hobject_t redirect_target
;
5078 map
<uint64_t, chunk_info_t
> chunk_map
;
5080 object_manifest_t() : type(0) { }
5081 object_manifest_t(uint8_t type
, const hobject_t
& redirect_target
)
5082 : type(type
), redirect_target(redirect_target
) { }
5084 bool is_empty() const {
5085 return type
== TYPE_NONE
;
5087 bool is_redirect() const {
5088 return type
== TYPE_REDIRECT
;
5090 bool is_chunked() const {
5091 return type
== TYPE_CHUNKED
;
5093 static std::string_view
get_type_name(uint8_t m
) {
5095 case TYPE_NONE
: return "none";
5096 case TYPE_REDIRECT
: return "redirect";
5097 case TYPE_CHUNKED
: return "chunked";
5098 default: return "unknown";
5101 std::string_view
get_type_name() const {
5102 return get_type_name(type
);
5106 redirect_target
= hobject_t();
5109 static void generate_test_instances(list
<object_manifest_t
*>& o
);
5110 void encode(bufferlist
&bl
) const;
5111 void decode(bufferlist::const_iterator
&bl
);
5112 void dump(Formatter
*f
) const;
5113 friend ostream
& operator<<(ostream
& out
, const object_info_t
& oi
);
5115 WRITE_CLASS_ENCODER(object_manifest_t
)
5116 ostream
& operator<<(ostream
& out
, const object_manifest_t
& oi
);
5118 struct object_info_t
{
5120 eversion_t version
, prior_version
;
5121 version_t user_version
;
5122 osd_reqid_t last_reqid
;
5126 utime_t local_mtime
; // local mtime
5128 // note: these are currently encoded into a total 16 bits; see
5129 // encode()/decode() for the weirdness.
5132 FLAG_WHITEOUT
= 1<<1, // object logically does not exist
5133 FLAG_DIRTY
= 1<<2, // object has been modified since last flushed or undirtied
5134 FLAG_OMAP
= 1<<3, // has (or may have) some/any omap data
5135 FLAG_DATA_DIGEST
= 1<<4, // has data crc
5136 FLAG_OMAP_DIGEST
= 1<<5, // has omap crc
5137 FLAG_CACHE_PIN
= 1<<6, // pin the object in cache tier
5138 FLAG_MANIFEST
= 1<<7, // has manifest
5139 FLAG_USES_TMAP
= 1<<8, // deprecated; no longer used
5140 FLAG_REDIRECT_HAS_REFERENCE
= 1<<9, // has reference
5145 static string
get_flag_string(flag_t flags
) {
5147 vector
<string
> sv
= get_flag_vector(flags
);
5148 for (auto ss
: sv
) {
5149 s
+= string("|") + ss
;
5155 static vector
<string
> get_flag_vector(flag_t flags
) {
5157 if (flags
& FLAG_LOST
)
5158 sv
.insert(sv
.end(), "lost");
5159 if (flags
& FLAG_WHITEOUT
)
5160 sv
.insert(sv
.end(), "whiteout");
5161 if (flags
& FLAG_DIRTY
)
5162 sv
.insert(sv
.end(), "dirty");
5163 if (flags
& FLAG_USES_TMAP
)
5164 sv
.insert(sv
.end(), "uses_tmap");
5165 if (flags
& FLAG_OMAP
)
5166 sv
.insert(sv
.end(), "omap");
5167 if (flags
& FLAG_DATA_DIGEST
)
5168 sv
.insert(sv
.end(), "data_digest");
5169 if (flags
& FLAG_OMAP_DIGEST
)
5170 sv
.insert(sv
.end(), "omap_digest");
5171 if (flags
& FLAG_CACHE_PIN
)
5172 sv
.insert(sv
.end(), "cache_pin");
5173 if (flags
& FLAG_MANIFEST
)
5174 sv
.insert(sv
.end(), "manifest");
5175 if (flags
& FLAG_REDIRECT_HAS_REFERENCE
)
5176 sv
.insert(sv
.end(), "redirect_has_reference");
5179 string
get_flag_string() const {
5180 return get_flag_string(flags
);
5183 uint64_t truncate_seq
, truncate_size
;
5185 map
<pair
<uint64_t, entity_name_t
>, watch_info_t
> watchers
;
5187 // opportunistic checksums; may or may not be present
5188 __u32 data_digest
; ///< data crc32c
5189 __u32 omap_digest
; ///< omap crc32c
5191 // alloc hint attribute
5192 uint64_t expected_object_size
, expected_write_size
;
5193 uint32_t alloc_hint_flags
;
5195 struct object_manifest_t manifest
;
5197 void copy_user_bits(const object_info_t
& other
);
5199 bool test_flag(flag_t f
) const {
5200 return (flags
& f
) == f
;
5202 void set_flag(flag_t f
) {
5203 flags
= (flag_t
)(flags
| f
);
5205 void clear_flag(flag_t f
) {
5206 flags
= (flag_t
)(flags
& ~f
);
5208 bool is_lost() const {
5209 return test_flag(FLAG_LOST
);
5211 bool is_whiteout() const {
5212 return test_flag(FLAG_WHITEOUT
);
5214 bool is_dirty() const {
5215 return test_flag(FLAG_DIRTY
);
5217 bool is_omap() const {
5218 return test_flag(FLAG_OMAP
);
5220 bool is_data_digest() const {
5221 return test_flag(FLAG_DATA_DIGEST
);
5223 bool is_omap_digest() const {
5224 return test_flag(FLAG_OMAP_DIGEST
);
5226 bool is_cache_pinned() const {
5227 return test_flag(FLAG_CACHE_PIN
);
5229 bool has_manifest() const {
5230 return test_flag(FLAG_MANIFEST
);
5232 void set_data_digest(__u32 d
) {
5233 set_flag(FLAG_DATA_DIGEST
);
5236 void set_omap_digest(__u32 d
) {
5237 set_flag(FLAG_OMAP_DIGEST
);
5240 void clear_data_digest() {
5241 clear_flag(FLAG_DATA_DIGEST
);
5244 void clear_omap_digest() {
5245 clear_flag(FLAG_OMAP_DIGEST
);
5249 clear_data_digest();
5250 clear_omap_digest();
5253 void encode(bufferlist
& bl
, uint64_t features
) const;
5254 void decode(bufferlist::const_iterator
& bl
);
5255 void decode(bufferlist
& bl
) {
5256 auto p
= std::cbegin(bl
);
5259 void dump(Formatter
*f
) const;
5260 static void generate_test_instances(list
<object_info_t
*>& o
);
5262 explicit object_info_t()
5263 : user_version(0), size(0), flags((flag_t
)0),
5264 truncate_seq(0), truncate_size(0),
5265 data_digest(-1), omap_digest(-1),
5266 expected_object_size(0), expected_write_size(0),
5270 explicit object_info_t(const hobject_t
& s
)
5272 user_version(0), size(0), flags((flag_t
)0),
5273 truncate_seq(0), truncate_size(0),
5274 data_digest(-1), omap_digest(-1),
5275 expected_object_size(0), expected_write_size(0),
5279 explicit object_info_t(bufferlist
& bl
) {
5283 WRITE_CLASS_ENCODER_FEATURES(object_info_t
)
5285 ostream
& operator<<(ostream
& out
, const object_info_t
& oi
);
5290 struct ObjectRecoveryInfo
{
5295 SnapSet ss
; // only populated if soid is_snap()
5296 interval_set
<uint64_t> copy_subset
;
5297 map
<hobject_t
, interval_set
<uint64_t>> clone_subset
;
5299 ObjectRecoveryInfo() : size(0) { }
5301 static void generate_test_instances(list
<ObjectRecoveryInfo
*>& o
);
5302 void encode(bufferlist
&bl
, uint64_t features
) const;
5303 void decode(bufferlist::const_iterator
&bl
, int64_t pool
= -1);
5304 ostream
&print(ostream
&out
) const;
5305 void dump(Formatter
*f
) const;
5307 WRITE_CLASS_ENCODER_FEATURES(ObjectRecoveryInfo
)
5308 ostream
& operator<<(ostream
& out
, const ObjectRecoveryInfo
&inf
);
5310 struct ObjectRecoveryProgress
{
5311 uint64_t data_recovered_to
;
5312 string omap_recovered_to
;
5318 ObjectRecoveryProgress()
5319 : data_recovered_to(0),
5321 data_complete(false), omap_complete(false) { }
5323 bool is_complete(const ObjectRecoveryInfo
& info
) const {
5324 return (data_recovered_to
>= (
5325 info
.copy_subset
.empty() ?
5326 0 : info
.copy_subset
.range_end())) &&
5330 static void generate_test_instances(list
<ObjectRecoveryProgress
*>& o
);
5331 void encode(bufferlist
&bl
) const;
5332 void decode(bufferlist::const_iterator
&bl
);
5333 ostream
&print(ostream
&out
) const;
5334 void dump(Formatter
*f
) const;
5336 WRITE_CLASS_ENCODER(ObjectRecoveryProgress
)
5337 ostream
& operator<<(ostream
& out
, const ObjectRecoveryProgress
&prog
);
5339 struct PushReplyOp
{
5342 static void generate_test_instances(list
<PushReplyOp
*>& o
);
5343 void encode(bufferlist
&bl
) const;
5344 void decode(bufferlist::const_iterator
&bl
);
5345 ostream
&print(ostream
&out
) const;
5346 void dump(Formatter
*f
) const;
5348 uint64_t cost(CephContext
*cct
) const;
5350 WRITE_CLASS_ENCODER(PushReplyOp
)
5351 ostream
& operator<<(ostream
& out
, const PushReplyOp
&op
);
5356 ObjectRecoveryInfo recovery_info
;
5357 ObjectRecoveryProgress recovery_progress
;
5359 static void generate_test_instances(list
<PullOp
*>& o
);
5360 void encode(bufferlist
&bl
, uint64_t features
) const;
5361 void decode(bufferlist::const_iterator
&bl
);
5362 ostream
&print(ostream
&out
) const;
5363 void dump(Formatter
*f
) const;
5365 uint64_t cost(CephContext
*cct
) const;
5367 WRITE_CLASS_ENCODER_FEATURES(PullOp
)
5368 ostream
& operator<<(ostream
& out
, const PullOp
&op
);
5374 interval_set
<uint64_t> data_included
;
5375 bufferlist omap_header
;
5376 map
<string
, bufferlist
> omap_entries
;
5377 map
<string
, bufferlist
> attrset
;
5379 ObjectRecoveryInfo recovery_info
;
5380 ObjectRecoveryProgress before_progress
;
5381 ObjectRecoveryProgress after_progress
;
5383 static void generate_test_instances(list
<PushOp
*>& o
);
5384 void encode(bufferlist
&bl
, uint64_t features
) const;
5385 void decode(bufferlist::const_iterator
&bl
);
5386 ostream
&print(ostream
&out
) const;
5387 void dump(Formatter
*f
) const;
5389 uint64_t cost(CephContext
*cct
) const;
5391 WRITE_CLASS_ENCODER_FEATURES(PushOp
)
5392 ostream
& operator<<(ostream
& out
, const PushOp
&op
);
5396 * summarize pg contents for purposes of a scrub
5400 map
<string
,bufferptr
> attrs
;
5402 __u32 omap_digest
; ///< omap crc32c
5403 __u32 digest
; ///< data crc32c
5405 bool digest_present
:1;
5406 bool omap_digest_present
:1;
5409 bool ec_hash_mismatch
:1;
5410 bool ec_size_mismatch
:1;
5411 bool large_omap_object_found
:1;
5412 uint64_t large_omap_object_key_count
= 0;
5413 uint64_t large_omap_object_value_size
= 0;
5414 uint64_t object_omap_bytes
= 0;
5415 uint64_t object_omap_keys
= 0;
5418 // Init invalid size so it won't match if we get a stat EIO error
5419 size(-1), omap_digest(0), digest(0),
5420 negative(false), digest_present(false), omap_digest_present(false),
5421 read_error(false), stat_error(false), ec_hash_mismatch(false),
5422 ec_size_mismatch(false), large_omap_object_found(false) {}
5424 void encode(bufferlist
& bl
) const;
5425 void decode(bufferlist::const_iterator
& bl
);
5426 void dump(Formatter
*f
) const;
5427 static void generate_test_instances(list
<object
*>& o
);
5429 WRITE_CLASS_ENCODER(object
)
5431 map
<hobject_t
,object
> objects
;
5432 eversion_t valid_through
;
5433 eversion_t incr_since
;
5434 bool has_large_omap_object_errors
:1;
5435 bool has_omap_keys
:1;
5437 void merge_incr(const ScrubMap
&l
);
5438 void clear_from(const hobject_t
& start
) {
5439 objects
.erase(objects
.lower_bound(start
), objects
.end());
5441 void insert(const ScrubMap
&r
) {
5442 objects
.insert(r
.objects
.begin(), r
.objects
.end());
5444 void swap(ScrubMap
&r
) {
5446 swap(objects
, r
.objects
);
5447 swap(valid_through
, r
.valid_through
);
5448 swap(incr_since
, r
.incr_since
);
5451 void encode(bufferlist
& bl
) const;
5452 void decode(bufferlist::const_iterator
& bl
, int64_t pool
=-1);
5453 void dump(Formatter
*f
) const;
5454 static void generate_test_instances(list
<ScrubMap
*>& o
);
5456 WRITE_CLASS_ENCODER(ScrubMap::object
)
5457 WRITE_CLASS_ENCODER(ScrubMap
)
5459 struct ScrubMapBuilder
{
5461 vector
<hobject_t
> ls
;
5463 int64_t data_pos
= 0;
5466 bufferhash data_hash
, omap_hash
; ///< accumulatinng hash value
5467 uint64_t omap_keys
= 0;
5468 uint64_t omap_bytes
= 0;
5474 return pos
>= ls
.size();
5477 *this = ScrubMapBuilder();
5481 return data_pos
< 0;
5484 void next_object() {
5492 friend ostream
& operator<<(ostream
& out
, const ScrubMapBuilder
& pos
) {
5493 out
<< "(" << pos
.pos
<< "/" << pos
.ls
.size();
5494 if (pos
.pos
< pos
.ls
.size()) {
5495 out
<< " " << pos
.ls
[pos
.pos
];
5497 if (pos
.data_pos
< 0) {
5498 out
<< " byte " << pos
.data_pos
;
5500 if (!pos
.omap_pos
.empty()) {
5501 out
<< " key " << pos
.omap_pos
;
5507 out
<< " ret " << pos
.ret
;
5517 bufferlist indata
, outdata
;
5521 memset(&op
, 0, sizeof(ceph_osd_op
));
5525 * split a bufferlist into constituent indata members of a vector of OSDOps
5527 * @param ops [out] vector of OSDOps
5528 * @param in [in] combined data buffer
5530 static void split_osd_op_vector_in_data(vector
<OSDOp
>& ops
, bufferlist
& in
);
5533 * merge indata members of a vector of OSDOp into a single bufferlist
5535 * Notably this also encodes certain other OSDOp data into the data
5536 * buffer, including the sobject_t soid.
5538 * @param ops [in] vector of OSDOps
5539 * @param out [out] combined data buffer
5541 static void merge_osd_op_vector_in_data(vector
<OSDOp
>& ops
, bufferlist
& out
);
5544 * split a bufferlist into constituent outdata members of a vector of OSDOps
5546 * @param ops [out] vector of OSDOps
5547 * @param in [in] combined data buffer
5549 static void split_osd_op_vector_out_data(vector
<OSDOp
>& ops
, bufferlist
& in
);
5552 * merge outdata members of a vector of OSDOps into a single bufferlist
5554 * @param ops [in] vector of OSDOps
5555 * @param out [out] combined data buffer
5557 static void merge_osd_op_vector_out_data(vector
<OSDOp
>& ops
, bufferlist
& out
);
5560 * Clear data as much as possible, leave minimal data for historical op dump
5562 * @param ops [in] vector of OSDOps
5564 static void clear_data(vector
<OSDOp
>& ops
);
5567 ostream
& operator<<(ostream
& out
, const OSDOp
& op
);
5569 struct watch_item_t
{
5572 uint32_t timeout_seconds
;
5575 watch_item_t() : cookie(0), timeout_seconds(0) { }
5576 watch_item_t(entity_name_t name
, uint64_t cookie
, uint32_t timeout
,
5577 const entity_addr_t
& addr
)
5578 : name(name
), cookie(cookie
), timeout_seconds(timeout
),
5581 void encode(bufferlist
&bl
, uint64_t features
) const {
5582 ENCODE_START(2, 1, bl
);
5585 encode(timeout_seconds
, bl
);
5586 encode(addr
, bl
, features
);
5589 void decode(bufferlist::const_iterator
&bl
) {
5590 DECODE_START(2, bl
);
5593 decode(timeout_seconds
, bl
);
5594 if (struct_v
>= 2) {
5600 WRITE_CLASS_ENCODER_FEATURES(watch_item_t
)
5602 struct obj_watch_item_t
{
5608 * obj list watch response format
5611 struct obj_list_watch_response_t
{
5612 list
<watch_item_t
> entries
;
5614 void encode(bufferlist
& bl
, uint64_t features
) const {
5615 ENCODE_START(1, 1, bl
);
5616 encode(entries
, bl
, features
);
5619 void decode(bufferlist::const_iterator
& bl
) {
5620 DECODE_START(1, bl
);
5621 decode(entries
, bl
);
5624 void dump(Formatter
*f
) const {
5625 f
->open_array_section("entries");
5626 for (list
<watch_item_t
>::const_iterator p
= entries
.begin(); p
!= entries
.end(); ++p
) {
5627 f
->open_object_section("watch");
5628 f
->dump_stream("watcher") << p
->name
;
5629 f
->dump_int("cookie", p
->cookie
);
5630 f
->dump_int("timeout", p
->timeout_seconds
);
5631 f
->open_object_section("addr");
5638 static void generate_test_instances(list
<obj_list_watch_response_t
*>& o
) {
5640 o
.push_back(new obj_list_watch_response_t
);
5641 o
.push_back(new obj_list_watch_response_t
);
5642 ea
.set_type(entity_addr_t::TYPE_LEGACY
);
5644 ea
.set_family(AF_INET
);
5645 ea
.set_in4_quad(0, 127);
5646 ea
.set_in4_quad(1, 0);
5647 ea
.set_in4_quad(2, 0);
5648 ea
.set_in4_quad(3, 1);
5650 o
.back()->entries
.push_back(watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT
, 1), 10, 30, ea
));
5652 ea
.set_in4_quad(3, 2);
5654 o
.back()->entries
.push_back(watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT
, 2), 20, 60, ea
));
5657 WRITE_CLASS_ENCODER_FEATURES(obj_list_watch_response_t
)
5661 vector
<snapid_t
> snaps
; // ascending
5662 vector
< pair
<uint64_t,uint64_t> > overlap
;
5665 clone_info() : cloneid(CEPH_NOSNAP
), size(0) {}
5667 void encode(bufferlist
& bl
) const {
5668 ENCODE_START(1, 1, bl
);
5669 encode(cloneid
, bl
);
5671 encode(overlap
, bl
);
5675 void decode(bufferlist::const_iterator
& bl
) {
5676 DECODE_START(1, bl
);
5677 decode(cloneid
, bl
);
5679 decode(overlap
, bl
);
5683 void dump(Formatter
*f
) const {
5684 if (cloneid
== CEPH_NOSNAP
)
5685 f
->dump_string("cloneid", "HEAD");
5687 f
->dump_unsigned("cloneid", cloneid
.val
);
5688 f
->open_array_section("snapshots");
5689 for (vector
<snapid_t
>::const_iterator p
= snaps
.begin(); p
!= snaps
.end(); ++p
) {
5690 f
->open_object_section("snap");
5691 f
->dump_unsigned("id", p
->val
);
5695 f
->open_array_section("overlaps");
5696 for (vector
< pair
<uint64_t,uint64_t> >::const_iterator q
= overlap
.begin();
5697 q
!= overlap
.end(); ++q
) {
5698 f
->open_object_section("overlap");
5699 f
->dump_unsigned("offset", q
->first
);
5700 f
->dump_unsigned("length", q
->second
);
5704 f
->dump_unsigned("size", size
);
5706 static void generate_test_instances(list
<clone_info
*>& o
) {
5707 o
.push_back(new clone_info
);
5708 o
.push_back(new clone_info
);
5709 o
.back()->cloneid
= 1;
5710 o
.back()->snaps
.push_back(1);
5711 o
.back()->overlap
.push_back(pair
<uint64_t,uint64_t>(0,4096));
5712 o
.back()->overlap
.push_back(pair
<uint64_t,uint64_t>(8192,4096));
5713 o
.back()->size
= 16384;
5714 o
.push_back(new clone_info
);
5715 o
.back()->cloneid
= CEPH_NOSNAP
;
5716 o
.back()->size
= 32768;
5719 WRITE_CLASS_ENCODER(clone_info
)
5722 * obj list snaps response format
5725 struct obj_list_snap_response_t
{
5726 vector
<clone_info
> clones
; // ascending
5729 void encode(bufferlist
& bl
) const {
5730 ENCODE_START(2, 1, bl
);
5735 void decode(bufferlist::const_iterator
& bl
) {
5736 DECODE_START(2, bl
);
5744 void dump(Formatter
*f
) const {
5745 f
->open_array_section("clones");
5746 for (vector
<clone_info
>::const_iterator p
= clones
.begin(); p
!= clones
.end(); ++p
) {
5747 f
->open_object_section("clone");
5751 f
->dump_unsigned("seq", seq
);
5754 static void generate_test_instances(list
<obj_list_snap_response_t
*>& o
) {
5755 o
.push_back(new obj_list_snap_response_t
);
5756 o
.push_back(new obj_list_snap_response_t
);
5759 cl
.snaps
.push_back(1);
5760 cl
.overlap
.push_back(pair
<uint64_t,uint64_t>(0,4096));
5761 cl
.overlap
.push_back(pair
<uint64_t,uint64_t>(8192,4096));
5763 o
.back()->clones
.push_back(cl
);
5764 cl
.cloneid
= CEPH_NOSNAP
;
5768 o
.back()->clones
.push_back(cl
);
5769 o
.back()->seq
= 123;
5773 WRITE_CLASS_ENCODER(obj_list_snap_response_t
)
// Lock-free counters for cache-tier promotion activity. Counters decay by
// half each time they are sampled, so readings approximate recent rates.
struct PromoteCounter {
  std::atomic<unsigned long long> attempts{0};
  std::atomic<unsigned long long> objects{0};
  std::atomic<unsigned long long> bytes{0};

  // Record a promotion attempt.
  void attempt() {
    attempts++;
  }

  // Record a completed promotion of @p size bytes.
  void finish(uint64_t size) {
    objects++;
    bytes += size;
  }

  // Report the current totals via *a/*o/*b, then halve the live counters
  // (exponential decay). Not atomic across the three counters: concurrent
  // updates between the reads and the stores can be partially lost.
  void sample_and_attenuate(uint64_t *a, uint64_t *o, uint64_t *b) {
    *a = attempts;
    *o = objects;
    *b = bytes;
    attempts = *a / 2;
    objects = *o / 2;
    bytes = *b / 2;
  }
};
5801 struct pool_pg_num_history_t
{
5802 /// last epoch updated
5804 /// poolid -> epoch -> pg_num
5805 map
<int64_t,map
<epoch_t
,uint32_t>> pg_nums
;
5806 /// pair(epoch, poolid)
5807 set
<pair
<epoch_t
,int64_t>> deleted_pools
;
5809 void log_pg_num_change(epoch_t epoch
, int64_t pool
, uint32_t pg_num
) {
5810 pg_nums
[pool
][epoch
] = pg_num
;
5812 void log_pool_delete(epoch_t epoch
, int64_t pool
) {
5813 deleted_pools
.insert(make_pair(epoch
, pool
));
5816 /// prune history based on oldest osdmap epoch in the cluster
5817 void prune(epoch_t oldest_epoch
) {
5818 auto i
= deleted_pools
.begin();
5819 while (i
!= deleted_pools
.end()) {
5820 if (i
->first
>= oldest_epoch
) {
5823 pg_nums
.erase(i
->second
);
5824 i
= deleted_pools
.erase(i
);
5826 for (auto& j
: pg_nums
) {
5827 auto k
= j
.second
.lower_bound(oldest_epoch
);
5828 // keep this and the entry before it (just to be paranoid)
5829 if (k
!= j
.second
.begin()) {
5831 j
.second
.erase(j
.second
.begin(), k
);
5836 void encode(bufferlist
& bl
) const {
5837 ENCODE_START(1, 1, bl
);
5839 encode(pg_nums
, bl
);
5840 encode(deleted_pools
, bl
);
5843 void decode(bufferlist::const_iterator
& p
) {
5847 decode(deleted_pools
, p
);
5850 void dump(Formatter
*f
) const {
5851 f
->dump_unsigned("epoch", epoch
);
5852 f
->open_object_section("pools");
5853 for (auto& i
: pg_nums
) {
5854 f
->open_object_section("pool");
5855 f
->dump_unsigned("pool_id", i
.first
);
5856 f
->open_array_section("changes");
5857 for (auto& j
: i
.second
) {
5858 f
->open_object_section("change");
5859 f
->dump_unsigned("epoch", j
.first
);
5860 f
->dump_unsigned("pg_num", j
.second
);
5867 f
->open_array_section("deleted_pools");
5868 for (auto& i
: deleted_pools
) {
5869 f
->open_object_section("deletion");
5870 f
->dump_unsigned("pool_id", i
.second
);
5871 f
->dump_unsigned("epoch", i
.first
);
5876 static void generate_test_instances(list
<pool_pg_num_history_t
*>& ls
) {
5877 ls
.push_back(new pool_pg_num_history_t
);
5879 friend ostream
& operator<<(ostream
& out
, const pool_pg_num_history_t
& h
) {
5880 return out
<< "pg_num_history(e" << h
.epoch
5881 << " pg_nums " << h
.pg_nums
5882 << " deleted_pools " << h
.deleted_pools
5886 WRITE_CLASS_ENCODER(pool_pg_num_history_t
)
5888 // omap specific stats
5889 struct omap_stat_t
{
5890 int large_omap_objects
;