1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
9 * Author: Loic Dachary <loic@dachary.org>
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
18 #ifndef CEPH_OSD_TYPES_H
19 #define CEPH_OSD_TYPES_H
24 #include <boost/scoped_ptr.hpp>
25 #include <boost/optional/optional_io.hpp>
26 #include <boost/variant.hpp>
28 #include "include/rados/rados_types.hpp"
29 #include "include/mempool.h"
31 #include "msg/msg_types.h"
32 #include "include/types.h"
33 #include "include/utime.h"
34 #include "include/CompatSet.h"
35 #include "common/histogram.h"
36 #include "include/interval_set.h"
37 #include "include/inline_memory.h"
38 #include "common/Formatter.h"
39 #include "common/bloom_filter.hpp"
40 #include "common/hobject.h"
41 #include "common/snap_types.h"
44 #include "include/cmp.h"
45 #include "librados/ListObjectImpl.h"
46 #include "compressor/Compressor.h"
49 #define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v026"
51 #define CEPH_OSD_FEATURE_INCOMPAT_BASE CompatSet::Feature(1, "initial feature set(~v.18)")
52 #define CEPH_OSD_FEATURE_INCOMPAT_PGINFO CompatSet::Feature(2, "pginfo object")
53 #define CEPH_OSD_FEATURE_INCOMPAT_OLOC CompatSet::Feature(3, "object locator")
54 #define CEPH_OSD_FEATURE_INCOMPAT_LEC CompatSet::Feature(4, "last_epoch_clean")
55 #define CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES CompatSet::Feature(5, "categories")
56 #define CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL CompatSet::Feature(6, "hobjectpool")
57 #define CEPH_OSD_FEATURE_INCOMPAT_BIGINFO CompatSet::Feature(7, "biginfo")
58 #define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO CompatSet::Feature(8, "leveldbinfo")
59 #define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG CompatSet::Feature(9, "leveldblog")
60 #define CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER CompatSet::Feature(10, "snapmapper")
61 #define CEPH_OSD_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(11, "sharded objects")
62 #define CEPH_OSD_FEATURE_INCOMPAT_HINTS CompatSet::Feature(12, "transaction hints")
63 #define CEPH_OSD_FEATURE_INCOMPAT_PGMETA CompatSet::Feature(13, "pg meta object")
64 #define CEPH_OSD_FEATURE_INCOMPAT_MISSING CompatSet::Feature(14, "explicit missing set")
65 #define CEPH_OSD_FEATURE_INCOMPAT_FASTINFO CompatSet::Feature(15, "fastinfo pg attr")
66 #define CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES CompatSet::Feature(16, "deletes in missing set")
69 /// min recovery priority for MBackfillReserve
70 #define OSD_RECOVERY_PRIORITY_MIN 0
72 /// base backfill priority for MBackfillReserve
73 #define OSD_BACKFILL_PRIORITY_BASE 100
75 /// base backfill priority for MBackfillReserve (degraded PG)
76 #define OSD_BACKFILL_DEGRADED_PRIORITY_BASE 140
78 /// base recovery priority for MBackfillReserve
79 #define OSD_RECOVERY_PRIORITY_BASE 180
81 /// base backfill priority for MBackfillReserve (inactive PG)
82 #define OSD_BACKFILL_INACTIVE_PRIORITY_BASE 220
84 /// max manually/automatically set recovery priority for MBackfillReserve
85 #define OSD_RECOVERY_PRIORITY_MAX 253
87 /// backfill priority for MBackfillReserve, when forced manually
88 #define OSD_BACKFILL_PRIORITY_FORCED 254
90 /// recovery priority for MRecoveryReserve, when forced manually
91 #define OSD_RECOVERY_PRIORITY_FORCED 255
94 typedef hobject_t collection_list_handle_t
;
96 /// convert a single CEPH_OSD_FLAG_* to a string
97 const char *ceph_osd_flag_name(unsigned flag
);
98 /// convert a single CEPH_OSD_OP_FLAG_* to a string
99 const char *ceph_osd_op_flag_name(unsigned flag
);
101 /// convert CEPH_OSD_FLAG_* op flags to a string
102 string
ceph_osd_flag_string(unsigned flags
);
103 /// convert CEPH_OSD_OP_FLAG_* op flags to a string
104 string
ceph_osd_op_flag_string(unsigned flags
);
105 /// convert CEPH_OSD_ALLOC_HINT_FLAG_* op flags to a string
106 string
ceph_osd_alloc_hint_flag_string(unsigned flags
);
110 * osd request identifier
112 * caller name + incarnation# + tid to uniquely identify this request.
115 entity_name_t name
; // who
117 int32_t inc
; // incarnation
122 osd_reqid_t(const osd_reqid_t
& other
)
123 : name(other
.name
), tid(other
.tid
), inc(other
.inc
)
125 osd_reqid_t(const entity_name_t
& a
, int i
, ceph_tid_t t
)
126 : name(a
), tid(t
), inc(i
)
129 DENC(osd_reqid_t
, v
, p
) {
136 void dump(Formatter
*f
) const;
137 static void generate_test_instances(list
<osd_reqid_t
*>& o
);
139 WRITE_CLASS_DENC(osd_reqid_t
)
144 static const int32_t NO_OSD
= 0x7fffffff;
147 pg_shard_t() : osd(-1), shard(shard_id_t::NO_SHARD
) {}
148 explicit pg_shard_t(int osd
) : osd(osd
), shard(shard_id_t::NO_SHARD
) {}
149 pg_shard_t(int osd
, shard_id_t shard
) : osd(osd
), shard(shard
) {}
150 bool is_undefined() const {
153 string
get_osd() const { return (osd
== NO_OSD
? "NONE" : to_string(osd
)); }
154 void encode(bufferlist
&bl
) const;
155 void decode(bufferlist::iterator
&bl
);
156 void dump(Formatter
*f
) const {
157 f
->dump_unsigned("osd", osd
);
158 if (shard
!= shard_id_t::NO_SHARD
) {
159 f
->dump_unsigned("shard", shard
);
163 WRITE_CLASS_ENCODER(pg_shard_t
)
164 WRITE_EQ_OPERATORS_2(pg_shard_t
, osd
, shard
)
165 WRITE_CMP_OPERATORS_2(pg_shard_t
, osd
, shard
)
166 ostream
&operator<<(ostream
&lhs
, const pg_shard_t
&rhs
);
168 class IsPGRecoverablePredicate
{
171 * have encodes the shards available
173 virtual bool operator()(const set
<pg_shard_t
> &have
) const = 0;
174 virtual ~IsPGRecoverablePredicate() {}
177 class IsPGReadablePredicate
{
180 * have encodes the shards available
182 virtual bool operator()(const set
<pg_shard_t
> &have
) const = 0;
183 virtual ~IsPGReadablePredicate() {}
186 inline ostream
& operator<<(ostream
& out
, const osd_reqid_t
& r
) {
187 return out
<< r
.name
<< "." << r
.inc
<< ":" << r
.tid
;
190 inline bool operator==(const osd_reqid_t
& l
, const osd_reqid_t
& r
) {
191 return (l
.name
== r
.name
) && (l
.inc
== r
.inc
) && (l
.tid
== r
.tid
);
193 inline bool operator!=(const osd_reqid_t
& l
, const osd_reqid_t
& r
) {
194 return (l
.name
!= r
.name
) || (l
.inc
!= r
.inc
) || (l
.tid
!= r
.tid
);
196 inline bool operator<(const osd_reqid_t
& l
, const osd_reqid_t
& r
) {
197 return (l
.name
< r
.name
) || (l
.inc
< r
.inc
) ||
198 (l
.name
== r
.name
&& l
.inc
== r
.inc
&& l
.tid
< r
.tid
);
200 inline bool operator<=(const osd_reqid_t
& l
, const osd_reqid_t
& r
) {
201 return (l
.name
< r
.name
) || (l
.inc
< r
.inc
) ||
202 (l
.name
== r
.name
&& l
.inc
== r
.inc
&& l
.tid
<= r
.tid
);
204 inline bool operator>(const osd_reqid_t
& l
, const osd_reqid_t
& r
) { return !(l
<= r
); }
205 inline bool operator>=(const osd_reqid_t
& l
, const osd_reqid_t
& r
) { return !(l
< r
); }
208 template<> struct hash
<osd_reqid_t
> {
209 size_t operator()(const osd_reqid_t
&r
) const {
210 static hash
<uint64_t> H
;
211 return H(r
.name
.num() ^ r
.tid
^ r
.inc
);
219 // a locator constrains the placement of an object. mainly, which pool
221 struct object_locator_t
{
222 // You specify either the hash or the key -- not both
223 int64_t pool
; ///< pool id
224 string key
; ///< key string (if non-empty)
225 string nspace
; ///< namespace
226 int64_t hash
; ///< hash position (if >= 0)
228 explicit object_locator_t()
229 : pool(-1), hash(-1) {}
230 explicit object_locator_t(int64_t po
)
231 : pool(po
), hash(-1) {}
232 explicit object_locator_t(int64_t po
, int64_t ps
)
233 : pool(po
), hash(ps
) {}
234 explicit object_locator_t(int64_t po
, string ns
)
235 : pool(po
), nspace(ns
), hash(-1) {}
236 explicit object_locator_t(int64_t po
, string ns
, int64_t ps
)
237 : pool(po
), nspace(ns
), hash(ps
) {}
238 explicit object_locator_t(int64_t po
, string ns
, string s
)
239 : pool(po
), key(s
), nspace(ns
), hash(-1) {}
240 explicit object_locator_t(const hobject_t
& soid
)
241 : pool(soid
.pool
), key(soid
.get_key()), nspace(soid
.nspace
), hash(-1) {}
243 int64_t get_pool() const {
258 void encode(bufferlist
& bl
) const;
259 void decode(bufferlist::iterator
& p
);
260 void dump(Formatter
*f
) const;
261 static void generate_test_instances(list
<object_locator_t
*>& o
);
263 WRITE_CLASS_ENCODER(object_locator_t
)
265 inline bool operator==(const object_locator_t
& l
, const object_locator_t
& r
) {
266 return l
.pool
== r
.pool
&& l
.key
== r
.key
&& l
.nspace
== r
.nspace
&& l
.hash
== r
.hash
;
268 inline bool operator!=(const object_locator_t
& l
, const object_locator_t
& r
) {
272 inline ostream
& operator<<(ostream
& out
, const object_locator_t
& loc
)
274 out
<< "@" << loc
.pool
;
275 if (loc
.nspace
.length())
276 out
<< ";" << loc
.nspace
;
277 if (loc
.key
.length())
278 out
<< ":" << loc
.key
;
282 struct request_redirect_t
{
284 object_locator_t redirect_locator
; ///< this is authoritative
285 string redirect_object
; ///< If non-empty, the request goes to this object name
286 bufferlist osd_instructions
; ///< a bufferlist for the OSDs, passed but not interpreted by clients
288 friend ostream
& operator<<(ostream
& out
, const request_redirect_t
& redir
);
291 request_redirect_t() {}
292 explicit request_redirect_t(const object_locator_t
& orig
, int64_t rpool
) :
293 redirect_locator(orig
) { redirect_locator
.pool
= rpool
; }
294 explicit request_redirect_t(const object_locator_t
& rloc
) :
295 redirect_locator(rloc
) {}
296 explicit request_redirect_t(const object_locator_t
& orig
,
297 const string
& robj
) :
298 redirect_locator(orig
), redirect_object(robj
) {}
300 void set_instructions(const bufferlist
& bl
) { osd_instructions
= bl
; }
301 const bufferlist
& get_instructions() { return osd_instructions
; }
303 bool empty() const { return redirect_locator
.empty() &&
304 redirect_object
.empty(); }
306 void combine_with_locator(object_locator_t
& orig
, string
& obj
) const {
307 orig
= redirect_locator
;
308 if (!redirect_object
.empty())
309 obj
= redirect_object
;
312 void encode(bufferlist
& bl
) const;
313 void decode(bufferlist::iterator
& bl
);
314 void dump(Formatter
*f
) const;
315 static void generate_test_instances(list
<request_redirect_t
*>& o
);
317 WRITE_CLASS_ENCODER(request_redirect_t
)
319 inline ostream
& operator<<(ostream
& out
, const request_redirect_t
& redir
) {
320 out
<< "object " << redir
.redirect_object
<< ", locator{" << redir
.redirect_locator
<< "}";
324 // Internal OSD op flags - set by the OSD based on the op types
326 CEPH_OSD_RMW_FLAG_READ
= (1 << 1),
327 CEPH_OSD_RMW_FLAG_WRITE
= (1 << 2),
328 CEPH_OSD_RMW_FLAG_CLASS_READ
= (1 << 3),
329 CEPH_OSD_RMW_FLAG_CLASS_WRITE
= (1 << 4),
330 CEPH_OSD_RMW_FLAG_PGOP
= (1 << 5),
331 CEPH_OSD_RMW_FLAG_CACHE
= (1 << 6),
332 CEPH_OSD_RMW_FLAG_FORCE_PROMOTE
= (1 << 7),
333 CEPH_OSD_RMW_FLAG_SKIP_HANDLE_CACHE
= (1 << 8),
334 CEPH_OSD_RMW_FLAG_SKIP_PROMOTE
= (1 << 9),
335 CEPH_OSD_RMW_FLAG_RWORDERED
= (1 << 10),
341 #define OSD_SUPERBLOCK_GOBJECT ghobject_t(hobject_t(sobject_t(object_t("osd_superblock"), 0)))
343 // placement seed (a hash value)
344 typedef uint32_t ps_t
;
346 // old (v1) pg_t encoding (wrap old struct ceph_pg)
349 void encode(bufferlist
& bl
) const {
352 void decode(bufferlist::iterator
& bl
) {
356 WRITE_CLASS_ENCODER(old_pg_t
)
358 // placement group id
364 pg_t() : m_pool(0), m_seed(0), m_preferred(-1) {}
365 pg_t(ps_t seed
, uint64_t pool
, int pref
=-1) :
366 m_pool(pool
), m_seed(seed
), m_preferred(pref
) {}
367 // cppcheck-suppress noExplicitConstructor
368 pg_t(const ceph_pg
& cpg
) :
369 m_pool(cpg
.pool
), m_seed(cpg
.ps
), m_preferred((__s16
)cpg
.preferred
) {}
371 // cppcheck-suppress noExplicitConstructor
372 pg_t(const old_pg_t
& opg
) {
376 old_pg_t
get_old_pg() const {
378 assert(m_pool
< 0xffffffffull
);
381 o
.v
.preferred
= (__s16
)m_preferred
;
388 uint64_t pool() const {
391 int32_t preferred() const {
395 static const uint8_t calc_name_buf_size
= 36; // max length for max values len("18446744073709551615.ffffffff") + future suffix len("_head") + '\0'
396 char *calc_name(char *buf
, const char *suffix_backwords
) const;
398 void set_ps(ps_t p
) {
401 void set_pool(uint64_t p
) {
404 void set_preferred(int32_t osd
) {
408 pg_t
get_parent() const;
409 pg_t
get_ancestor(unsigned old_pg_num
) const;
411 int print(char *o
, int maxlen
) const;
412 bool parse(const char *s
);
414 bool is_split(unsigned old_pg_num
, unsigned new_pg_num
, set
<pg_t
> *pchildren
) const;
417 * Returns b such that for all object o:
418 * (~((~0)<<b) & o.hash) == 0 iff o is in the pg for *this
420 unsigned get_split_bits(unsigned pg_num
) const;
422 bool contains(int bits
, const ghobject_t
& oid
) {
424 (int64_t)m_pool
== oid
.hobj
.get_logical_pool() &&
425 oid
.match(bits
, ps());
427 bool contains(int bits
, const hobject_t
& oid
) {
429 (int64_t)m_pool
== oid
.get_logical_pool() &&
430 oid
.match(bits
, ps());
433 hobject_t
get_hobj_start() const;
434 hobject_t
get_hobj_end(unsigned pg_num
) const;
436 void encode(bufferlist
& bl
) const {
439 ::encode(m_pool
, bl
);
440 ::encode(m_seed
, bl
);
441 ::encode(m_preferred
, bl
);
443 void decode(bufferlist::iterator
& bl
) {
446 ::decode(m_pool
, bl
);
447 ::decode(m_seed
, bl
);
448 ::decode(m_preferred
, bl
);
450 void decode_old(bufferlist::iterator
& bl
) {
455 void dump(Formatter
*f
) const;
456 static void generate_test_instances(list
<pg_t
*>& o
);
458 WRITE_CLASS_ENCODER(pg_t
)
460 inline bool operator<(const pg_t
& l
, const pg_t
& r
) {
461 return l
.pool() < r
.pool() ||
462 (l
.pool() == r
.pool() && (l
.preferred() < r
.preferred() ||
463 (l
.preferred() == r
.preferred() && (l
.ps() < r
.ps()))));
465 inline bool operator<=(const pg_t
& l
, const pg_t
& r
) {
466 return l
.pool() < r
.pool() ||
467 (l
.pool() == r
.pool() && (l
.preferred() < r
.preferred() ||
468 (l
.preferred() == r
.preferred() && (l
.ps() <= r
.ps()))));
470 inline bool operator==(const pg_t
& l
, const pg_t
& r
) {
471 return l
.pool() == r
.pool() &&
472 l
.preferred() == r
.preferred() &&
475 inline bool operator!=(const pg_t
& l
, const pg_t
& r
) {
476 return l
.pool() != r
.pool() ||
477 l
.preferred() != r
.preferred() ||
480 inline bool operator>(const pg_t
& l
, const pg_t
& r
) {
481 return l
.pool() > r
.pool() ||
482 (l
.pool() == r
.pool() && (l
.preferred() > r
.preferred() ||
483 (l
.preferred() == r
.preferred() && (l
.ps() > r
.ps()))));
485 inline bool operator>=(const pg_t
& l
, const pg_t
& r
) {
486 return l
.pool() > r
.pool() ||
487 (l
.pool() == r
.pool() && (l
.preferred() > r
.preferred() ||
488 (l
.preferred() == r
.preferred() && (l
.ps() >= r
.ps()))));
491 ostream
& operator<<(ostream
& out
, const pg_t
&pg
);
494 template<> struct hash
< pg_t
>
496 size_t operator()( const pg_t
& x
) const
498 static hash
<uint32_t> H
;
499 return H((x
.pool() & 0xffffffff) ^ (x
.pool() >> 32) ^ x
.ps() ^ x
.preferred());
507 spg_t() : shard(shard_id_t::NO_SHARD
) {}
508 spg_t(pg_t pgid
, shard_id_t shard
) : pgid(pgid
), shard(shard
) {}
509 explicit spg_t(pg_t pgid
) : pgid(pgid
), shard(shard_id_t::NO_SHARD
) {}
510 unsigned get_split_bits(unsigned pg_num
) const {
511 return pgid
.get_split_bits(pg_num
);
513 spg_t
get_parent() const {
514 return spg_t(pgid
.get_parent(), shard
);
519 uint64_t pool() const {
522 int32_t preferred() const {
523 return pgid
.preferred();
526 static const uint8_t calc_name_buf_size
= pg_t::calc_name_buf_size
+ 4; // 36 + len('s') + len("255");
527 char *calc_name(char *buf
, const char *suffix_backwords
) const;
529 bool parse(const char *s
);
530 bool parse(const std::string
& s
) {
531 return parse(s
.c_str());
533 bool is_split(unsigned old_pg_num
, unsigned new_pg_num
,
534 set
<spg_t
> *pchildren
) const {
536 set
<pg_t
> *children
= pchildren
? &_children
: NULL
;
537 bool is_split
= pgid
.is_split(old_pg_num
, new_pg_num
, children
);
538 if (pchildren
&& is_split
) {
539 for (set
<pg_t
>::iterator i
= _children
.begin();
540 i
!= _children
.end();
542 pchildren
->insert(spg_t(*i
, shard
));
547 bool is_no_shard() const {
548 return shard
== shard_id_t::NO_SHARD
;
551 ghobject_t
make_pgmeta_oid() const {
552 return ghobject_t::make_pgmeta(pgid
.pool(), pgid
.ps(), shard
);
555 void encode(bufferlist
&bl
) const {
556 ENCODE_START(1, 1, bl
);
561 void decode(bufferlist::iterator
&bl
) {
568 ghobject_t
make_temp_ghobject(const string
& name
) const {
570 hobject_t(object_t(name
), "", CEPH_NOSNAP
,
572 hobject_t::get_temp_pool(pgid
.pool()),
578 unsigned hash_to_shard(unsigned num_shards
) const {
579 return ps() % num_shards
;
582 WRITE_CLASS_ENCODER(spg_t
)
583 WRITE_EQ_OPERATORS_2(spg_t
, pgid
, shard
)
584 WRITE_CMP_OPERATORS_2(spg_t
, pgid
, shard
)
587 template<> struct hash
< spg_t
>
589 size_t operator()( const spg_t
& x
) const
591 static hash
<uint32_t> H
;
592 return H(hash
<pg_t
>()(x
.pgid
) ^ x
.shard
);
597 ostream
& operator<<(ostream
& out
, const spg_t
&pg
);
599 // ----------------------
604 TYPE_LEGACY_TEMP
= 1, /* no longer used */
610 uint64_t removal_seq
; // note: deprecated, not encoded
612 char _str_buff
[spg_t::calc_name_buf_size
];
617 coll_t(type_t t
, spg_t p
, uint64_t r
)
618 : type(t
), pgid(p
), removal_seq(r
) {
623 coll_t() : type(TYPE_META
), removal_seq(0)
628 coll_t(const coll_t
& other
)
629 : type(other
.type
), pgid(other
.pgid
), removal_seq(other
.removal_seq
) {
633 explicit coll_t(spg_t pgid
)
634 : type(TYPE_PG
), pgid(pgid
), removal_seq(0)
639 coll_t
& operator=(const coll_t
& rhs
)
641 this->type
= rhs
.type
;
642 this->pgid
= rhs
.pgid
;
643 this->removal_seq
= rhs
.removal_seq
;
648 // named constructors
649 static coll_t
meta() {
652 static coll_t
pg(spg_t p
) {
656 const std::string
to_str() const {
659 const char *c_str() const {
663 bool parse(const std::string
& s
);
665 int operator<(const coll_t
&rhs
) const {
666 return type
< rhs
.type
||
667 (type
== rhs
.type
&& pgid
< rhs
.pgid
);
670 bool is_meta() const {
671 return type
== TYPE_META
;
673 bool is_pg_prefix(spg_t
*pgid_
) const {
674 if (type
== TYPE_PG
|| type
== TYPE_PG_TEMP
) {
681 return type
== TYPE_PG
;
683 bool is_pg(spg_t
*pgid_
) const {
684 if (type
== TYPE_PG
) {
690 bool is_temp() const {
691 return type
== TYPE_PG_TEMP
;
693 bool is_temp(spg_t
*pgid_
) const {
694 if (type
== TYPE_PG_TEMP
) {
701 void encode(bufferlist
& bl
) const;
702 void decode(bufferlist::iterator
& bl
);
703 size_t encoded_size() const;
705 inline bool operator==(const coll_t
& rhs
) const {
706 // only compare type if meta
707 if (type
!= rhs
.type
)
709 if (type
== TYPE_META
)
711 return type
== rhs
.type
&& pgid
== rhs
.pgid
;
713 inline bool operator!=(const coll_t
& rhs
) const {
714 return !(*this == rhs
);
717 // get a TEMP collection that corresponds to the current collection,
718 // which we presume is a pg collection.
719 coll_t
get_temp() const {
720 assert(type
== TYPE_PG
);
721 return coll_t(TYPE_PG_TEMP
, pgid
, 0);
724 ghobject_t
get_min_hobj() const {
728 o
.hobj
.pool
= pgid
.pool();
729 o
.set_shard(pgid
.shard
);
740 unsigned hash_to_shard(unsigned num_shards
) const {
742 return pgid
.hash_to_shard(num_shards
);
743 return 0; // whatever.
746 void dump(Formatter
*f
) const;
747 static void generate_test_instances(list
<coll_t
*>& o
);
750 WRITE_CLASS_ENCODER(coll_t
)
752 inline ostream
& operator<<(ostream
& out
, const coll_t
& c
) {
758 template<> struct hash
<coll_t
> {
759 size_t operator()(const coll_t
&c
) const {
761 string
str(c
.to_str());
762 std::string::const_iterator
end(str
.end());
763 for (std::string::const_iterator s
= str
.begin(); s
!= end
; ++s
) {
776 inline ostream
& operator<<(ostream
& out
, const ceph_object_layout
&ol
)
778 out
<< pg_t(ol
.ol_pgid
);
779 int su
= ol
.ol_stripe_unit
;
787 // compound rados version type
788 /* WARNING: If add member in eversion_t, please make sure the encode/decode function
789 * work well. For little-endian machine, we should make sure there is no padding
790 * in 32-bit machine and 64-bit machine.
797 eversion_t() : version(0), epoch(0), __pad(0) {}
798 eversion_t(epoch_t e
, version_t v
) : version(v
), epoch(e
), __pad(0) {}
800 // cppcheck-suppress noExplicitConstructor
801 eversion_t(const ceph_eversion
& ce
) :
806 explicit eversion_t(bufferlist
& bl
) : __pad(0) { decode(bl
); }
808 static eversion_t
max() {
815 operator ceph_eversion() {
822 string
get_key_name() const;
824 void encode(bufferlist
&bl
) const {
825 #if defined(CEPH_LITTLE_ENDIAN)
826 bl
.append((char *)this, sizeof(version_t
) + sizeof(epoch_t
));
828 ::encode(version
, bl
);
832 void decode(bufferlist::iterator
&bl
) {
833 #if defined(CEPH_LITTLE_ENDIAN)
834 bl
.copy(sizeof(version_t
) + sizeof(epoch_t
), (char *)this);
836 ::decode(version
, bl
);
840 void decode(bufferlist
& bl
) {
841 bufferlist::iterator p
= bl
.begin();
845 WRITE_CLASS_ENCODER(eversion_t
)
847 inline bool operator==(const eversion_t
& l
, const eversion_t
& r
) {
848 return (l
.epoch
== r
.epoch
) && (l
.version
== r
.version
);
850 inline bool operator!=(const eversion_t
& l
, const eversion_t
& r
) {
851 return (l
.epoch
!= r
.epoch
) || (l
.version
!= r
.version
);
853 inline bool operator<(const eversion_t
& l
, const eversion_t
& r
) {
854 return (l
.epoch
== r
.epoch
) ? (l
.version
< r
.version
):(l
.epoch
< r
.epoch
);
856 inline bool operator<=(const eversion_t
& l
, const eversion_t
& r
) {
857 return (l
.epoch
== r
.epoch
) ? (l
.version
<= r
.version
):(l
.epoch
<= r
.epoch
);
859 inline bool operator>(const eversion_t
& l
, const eversion_t
& r
) {
860 return (l
.epoch
== r
.epoch
) ? (l
.version
> r
.version
):(l
.epoch
> r
.epoch
);
862 inline bool operator>=(const eversion_t
& l
, const eversion_t
& r
) {
863 return (l
.epoch
== r
.epoch
) ? (l
.version
>= r
.version
):(l
.epoch
>= r
.epoch
);
865 inline ostream
& operator<<(ostream
& out
, const eversion_t
& e
) {
866 return out
<< e
.epoch
<< "'" << e
.version
;
870 * objectstore_perf_stat_t
872 * current perf information about the osd
874 struct objectstore_perf_stat_t
{
875 // cur_op_latency is in ms since double add/sub are not associative
876 uint32_t os_commit_latency
;
877 uint32_t os_apply_latency
;
879 objectstore_perf_stat_t() :
880 os_commit_latency(0), os_apply_latency(0) {}
882 bool operator==(const objectstore_perf_stat_t
&r
) const {
883 return os_commit_latency
== r
.os_commit_latency
&&
884 os_apply_latency
== r
.os_apply_latency
;
887 void add(const objectstore_perf_stat_t
&o
) {
888 os_commit_latency
+= o
.os_commit_latency
;
889 os_apply_latency
+= o
.os_apply_latency
;
891 void sub(const objectstore_perf_stat_t
&o
) {
892 os_commit_latency
-= o
.os_commit_latency
;
893 os_apply_latency
-= o
.os_apply_latency
;
895 void dump(Formatter
*f
) const;
896 void encode(bufferlist
&bl
) const;
897 void decode(bufferlist::iterator
&bl
);
898 static void generate_test_instances(std::list
<objectstore_perf_stat_t
*>& o
);
900 WRITE_CLASS_ENCODER(objectstore_perf_stat_t
)
903 * aggregate stats for an osd
906 int64_t kb
, kb_used
, kb_avail
;
907 vector
<int> hb_peers
;
908 int32_t snap_trim_queue_len
, num_snap_trimming
;
910 pow2_hist_t op_queue_age_hist
;
912 objectstore_perf_stat_t os_perf_stat
;
917 uint32_t num_pgs
= 0;
919 osd_stat_t() : kb(0), kb_used(0), kb_avail(0),
920 snap_trim_queue_len(0), num_snap_trimming(0) {}
922 void add(const osd_stat_t
& o
) {
924 kb_used
+= o
.kb_used
;
925 kb_avail
+= o
.kb_avail
;
926 snap_trim_queue_len
+= o
.snap_trim_queue_len
;
927 num_snap_trimming
+= o
.num_snap_trimming
;
928 op_queue_age_hist
.add(o
.op_queue_age_hist
);
929 os_perf_stat
.add(o
.os_perf_stat
);
930 num_pgs
+= o
.num_pgs
;
932 void sub(const osd_stat_t
& o
) {
934 kb_used
-= o
.kb_used
;
935 kb_avail
-= o
.kb_avail
;
936 snap_trim_queue_len
-= o
.snap_trim_queue_len
;
937 num_snap_trimming
-= o
.num_snap_trimming
;
938 op_queue_age_hist
.sub(o
.op_queue_age_hist
);
939 os_perf_stat
.sub(o
.os_perf_stat
);
940 num_pgs
-= o
.num_pgs
;
943 void dump(Formatter
*f
) const;
944 void encode(bufferlist
&bl
) const;
945 void decode(bufferlist::iterator
&bl
);
946 static void generate_test_instances(std::list
<osd_stat_t
*>& o
);
948 WRITE_CLASS_ENCODER(osd_stat_t
)
950 inline bool operator==(const osd_stat_t
& l
, const osd_stat_t
& r
) {
951 return l
.kb
== r
.kb
&&
952 l
.kb_used
== r
.kb_used
&&
953 l
.kb_avail
== r
.kb_avail
&&
954 l
.snap_trim_queue_len
== r
.snap_trim_queue_len
&&
955 l
.num_snap_trimming
== r
.num_snap_trimming
&&
956 l
.hb_peers
== r
.hb_peers
&&
957 l
.op_queue_age_hist
== r
.op_queue_age_hist
&&
958 l
.os_perf_stat
== r
.os_perf_stat
&&
959 l
.num_pgs
== r
.num_pgs
;
961 inline bool operator!=(const osd_stat_t
& l
, const osd_stat_t
& r
) {
967 inline ostream
& operator<<(ostream
& out
, const osd_stat_t
& s
) {
968 return out
<< "osd_stat(" << byte_u_t(s
.kb_used
<< 10) << " used, "
969 << byte_u_t(s
.kb_avail
<< 10) << " avail, "
970 << byte_u_t(s
.kb
<< 10) << " total, "
971 << "peers " << s
.hb_peers
972 << " op hist " << s
.op_queue_age_hist
.h
980 #define PG_STATE_CREATING (1<<0) // creating
981 #define PG_STATE_ACTIVE (1<<1) // i am active. (primary: replicas too)
982 #define PG_STATE_CLEAN (1<<2) // peers are complete, clean of stray replicas.
983 #define PG_STATE_DOWN (1<<4) // a needed replica is down, PG offline
984 #define PG_STATE_RECOVERY_UNFOUND (1<<5) // recovery stopped due to unfound
985 #define PG_STATE_BACKFILL_UNFOUND (1<<6) // backfill stopped due to unfound
986 //#define PG_STATE_SPLITTING (1<<7) // i am splitting
987 #define PG_STATE_SCRUBBING (1<<8) // scrubbing
988 //#define PG_STATE_SCRUBQ (1<<9) // queued for scrub
989 #define PG_STATE_DEGRADED (1<<10) // pg contains objects with reduced redundancy
990 #define PG_STATE_INCONSISTENT (1<<11) // pg replicas are inconsistent (but shouldn't be)
991 #define PG_STATE_PEERING (1<<12) // pg is (re)peering
992 #define PG_STATE_REPAIR (1<<13) // pg should repair on next scrub
993 #define PG_STATE_RECOVERING (1<<14) // pg is recovering/migrating objects
994 #define PG_STATE_BACKFILL_WAIT (1<<15) // [active] reserving backfill
995 #define PG_STATE_INCOMPLETE (1<<16) // incomplete content, peering failed.
996 #define PG_STATE_STALE (1<<17) // our state for this pg is stale, unknown.
997 #define PG_STATE_REMAPPED (1<<18) // pg is explicitly remapped to different OSDs than CRUSH
998 #define PG_STATE_DEEP_SCRUB (1<<19) // deep scrub: check CRC32 on files
999 #define PG_STATE_BACKFILLING (1<<20) // [active] backfilling pg content
1000 #define PG_STATE_BACKFILL_TOOFULL (1<<21) // backfill can't proceed: too full
1001 #define PG_STATE_RECOVERY_WAIT (1<<22) // waiting for recovery reservations
1002 #define PG_STATE_UNDERSIZED (1<<23) // pg acting < pool size
1003 #define PG_STATE_ACTIVATING (1<<24) // pg is peered but not yet active
1004 #define PG_STATE_PEERED (1<<25) // peered, cannot go active, can recover
1005 #define PG_STATE_SNAPTRIM (1<<26) // trimming snaps
1006 #define PG_STATE_SNAPTRIM_WAIT (1<<27) // queued to trim snaps
1007 #define PG_STATE_RECOVERY_TOOFULL (1<<28) // recovery can't proceed: too full
1008 #define PG_STATE_SNAPTRIM_ERROR (1<<29) // error stopped trimming snaps
1009 #define PG_STATE_FORCED_RECOVERY (1<<30) // force recovery of this pg before any other
1010 #define PG_STATE_FORCED_BACKFILL (1<<31) // force backfill of this pg before any other
1012 std::string
pg_state_string(int state
);
1013 std::string
pg_vector_string(const vector
<int32_t> &a
);
1014 boost::optional
<uint64_t> pg_string_state(const std::string
& state
);
1020 * attributes for a single pool snapshot.
1022 struct pool_snap_info_t
{
1027 void dump(Formatter
*f
) const;
1028 void encode(bufferlist
& bl
, uint64_t features
) const;
1029 void decode(bufferlist::iterator
& bl
);
1030 static void generate_test_instances(list
<pool_snap_info_t
*>& o
);
1032 WRITE_CLASS_ENCODER_FEATURES(pool_snap_info_t
)
1034 inline ostream
& operator<<(ostream
& out
, const pool_snap_info_t
& si
) {
1035 return out
<< si
.snapid
<< '(' << si
.name
<< ' ' << si
.stamp
<< ')';
1050 DEEP_SCRUB_INTERVAL
,
1052 RECOVERY_OP_PRIORITY
,
1055 COMPRESSION_ALGORITHM
,
1056 COMPRESSION_REQUIRED_RATIO
,
1057 COMPRESSION_MAX_BLOB_SIZE
,
1058 COMPRESSION_MIN_BLOB_SIZE
,
1074 opt_desc_t(key_t k
, type_t t
) : key(k
), type(t
) {}
1076 bool operator==(const opt_desc_t
& rhs
) const {
1077 return key
== rhs
.key
&& type
== rhs
.type
;
1081 typedef boost::variant
<std::string
,int,double> value_t
;
1083 static bool is_opt_name(const std::string
& name
);
1084 static opt_desc_t
get_opt_desc(const std::string
& name
);
1086 pool_opts_t() : opts() {}
1088 bool is_set(key_t key
) const;
1090 template<typename T
>
1091 void set(key_t key
, const T
&val
) {
1092 value_t value
= val
;
1096 template<typename T
>
1097 bool get(key_t key
, T
*val
) const {
1098 opts_t::const_iterator i
= opts
.find(key
);
1099 if (i
== opts
.end()) {
1102 *val
= boost::get
<T
>(i
->second
);
1106 const value_t
& get(key_t key
) const;
1108 bool unset(key_t key
);
1110 void dump(const std::string
& name
, Formatter
*f
) const;
1112 void dump(Formatter
*f
) const;
1113 void encode(bufferlist
&bl
) const;
1114 void decode(bufferlist::iterator
&bl
);
1117 typedef std::map
<key_t
, value_t
> opts_t
;
1120 friend ostream
& operator<<(ostream
& out
, const pool_opts_t
& opts
);
1122 WRITE_CLASS_ENCODER(pool_opts_t
)
1128 static const char *APPLICATION_NAME_CEPHFS
;
1129 static const char *APPLICATION_NAME_RBD
;
1130 static const char *APPLICATION_NAME_RGW
;
1133 TYPE_REPLICATED
= 1, // replication
1134 //TYPE_RAID4 = 2, // raid4 (never implemented)
1135 TYPE_ERASURE
= 3, // erasure-coded
1137 static const char *get_type_name(int t
) {
1139 case TYPE_REPLICATED
: return "replicated";
1140 //case TYPE_RAID4: return "raid4";
1141 case TYPE_ERASURE
: return "erasure";
1142 default: return "???";
1145 const char *get_type_name() const {
1146 return get_type_name(type
);
1150 FLAG_HASHPSPOOL
= 1<<0, // hash pg seed and pool together (instead of adding)
1151 FLAG_FULL
= 1<<1, // pool is full
1152 FLAG_EC_OVERWRITES
= 1<<2, // enables overwrites, once enabled, cannot be disabled
1153 FLAG_INCOMPLETE_CLONES
= 1<<3, // may have incomplete clones (bc we are/were an overlay)
1154 FLAG_NODELETE
= 1<<4, // pool can't be deleted
1155 FLAG_NOPGCHANGE
= 1<<5, // pool's pg and pgp num can't be changed
1156 FLAG_NOSIZECHANGE
= 1<<6, // pool's size and min size can't be changed
1157 FLAG_WRITE_FADVISE_DONTNEED
= 1<<7, // write mode with LIBRADOS_OP_FLAG_FADVISE_DONTNEED
1158 FLAG_NOSCRUB
= 1<<8, // block periodic scrub
1159 FLAG_NODEEP_SCRUB
= 1<<9, // block periodic deep-scrub
1160 FLAG_FULL_NO_QUOTA
= 1<<10, // pool is currently running out of quota, will set FLAG_FULL too
1161 FLAG_NEARFULL
= 1<<11, // pool is nearfull
1162 FLAG_BACKFILLFULL
= 1<<12, // pool is backfillfull
1165 static const char *get_flag_name(int f
) {
1167 case FLAG_HASHPSPOOL
: return "hashpspool";
1168 case FLAG_FULL
: return "full";
1169 case FLAG_EC_OVERWRITES
: return "ec_overwrites";
1170 case FLAG_INCOMPLETE_CLONES
: return "incomplete_clones";
1171 case FLAG_NODELETE
: return "nodelete";
1172 case FLAG_NOPGCHANGE
: return "nopgchange";
1173 case FLAG_NOSIZECHANGE
: return "nosizechange";
1174 case FLAG_WRITE_FADVISE_DONTNEED
: return "write_fadvise_dontneed";
1175 case FLAG_NOSCRUB
: return "noscrub";
1176 case FLAG_NODEEP_SCRUB
: return "nodeep-scrub";
1177 case FLAG_FULL_NO_QUOTA
: return "full_no_quota";
1178 case FLAG_NEARFULL
: return "nearfull";
1179 case FLAG_BACKFILLFULL
: return "backfillfull";
1180 default: return "???";
1183 static string
get_flags_string(uint64_t f
) {
1185 for (unsigned n
=0; f
&& n
<64; ++n
) {
1186 if (f
& (1ull << n
)) {
1189 s
+= get_flag_name(1ull << n
);
1194 string
get_flags_string() const {
1195 return get_flags_string(flags
);
1197 static uint64_t get_flag_by_name(const string
& name
) {
1198 if (name
== "hashpspool")
1199 return FLAG_HASHPSPOOL
;
1202 if (name
== "ec_overwrites")
1203 return FLAG_EC_OVERWRITES
;
1204 if (name
== "incomplete_clones")
1205 return FLAG_INCOMPLETE_CLONES
;
1206 if (name
== "nodelete")
1207 return FLAG_NODELETE
;
1208 if (name
== "nopgchange")
1209 return FLAG_NOPGCHANGE
;
1210 if (name
== "nosizechange")
1211 return FLAG_NOSIZECHANGE
;
1212 if (name
== "write_fadvise_dontneed")
1213 return FLAG_WRITE_FADVISE_DONTNEED
;
1214 if (name
== "noscrub")
1215 return FLAG_NOSCRUB
;
1216 if (name
== "nodeep-scrub")
1217 return FLAG_NODEEP_SCRUB
;
1218 if (name
== "full_no_quota")
1219 return FLAG_FULL_NO_QUOTA
;
1220 if (name
== "nearfull")
1221 return FLAG_NEARFULL
;
1222 if (name
== "backfillfull")
1223 return FLAG_BACKFILLFULL
;
1227 /// converts the acting/up vector to a set of pg shards
1228 void convert_to_pg_shards(const vector
<int> &from
, set
<pg_shard_t
>* to
) const;
1231 CACHEMODE_NONE
= 0, ///< no caching
1232 CACHEMODE_WRITEBACK
= 1, ///< write to cache, flush later
1233 CACHEMODE_FORWARD
= 2, ///< forward if not in cache
1234 CACHEMODE_READONLY
= 3, ///< handle reads, forward writes [not strongly consistent]
1235 CACHEMODE_READFORWARD
= 4, ///< forward reads, write to cache flush later
1236 CACHEMODE_READPROXY
= 5, ///< proxy reads, write to cache flush later
1237 CACHEMODE_PROXY
= 6, ///< proxy if not in cache
1239 static const char *get_cache_mode_name(cache_mode_t m
) {
1241 case CACHEMODE_NONE
: return "none";
1242 case CACHEMODE_WRITEBACK
: return "writeback";
1243 case CACHEMODE_FORWARD
: return "forward";
1244 case CACHEMODE_READONLY
: return "readonly";
1245 case CACHEMODE_READFORWARD
: return "readforward";
1246 case CACHEMODE_READPROXY
: return "readproxy";
1247 case CACHEMODE_PROXY
: return "proxy";
1248 default: return "unknown";
1251 static cache_mode_t
get_cache_mode_from_str(const string
& s
) {
1253 return CACHEMODE_NONE
;
1254 if (s
== "writeback")
1255 return CACHEMODE_WRITEBACK
;
1257 return CACHEMODE_FORWARD
;
1258 if (s
== "readonly")
1259 return CACHEMODE_READONLY
;
1260 if (s
== "readforward")
1261 return CACHEMODE_READFORWARD
;
1262 if (s
== "readproxy")
1263 return CACHEMODE_READPROXY
;
1265 return CACHEMODE_PROXY
;
1266 return (cache_mode_t
)-1;
1268 const char *get_cache_mode_name() const {
1269 return get_cache_mode_name(cache_mode
);
1271 bool cache_mode_requires_hit_set() const {
1272 switch (cache_mode
) {
1273 case CACHEMODE_NONE
:
1274 case CACHEMODE_FORWARD
:
1275 case CACHEMODE_READONLY
:
1276 case CACHEMODE_PROXY
:
1278 case CACHEMODE_WRITEBACK
:
1279 case CACHEMODE_READFORWARD
:
1280 case CACHEMODE_READPROXY
:
1283 assert(0 == "implement me");
1287 uint64_t flags
; ///< FLAG_*
1288 __u8 type
; ///< TYPE_*
1289 __u8 size
, min_size
; ///< number of osds in each pg
1290 __u8 crush_rule
; ///< crush placement rule
1291 __u8 object_hash
; ///< hash mapping object name to ps
1293 __u32 pg_num
, pgp_num
; ///< number of pgs
1297 map
<string
,string
> properties
; ///< OBSOLETE
1298 string erasure_code_profile
; ///< name of the erasure code profile in OSDMap
1299 epoch_t last_change
; ///< most recent epoch changed, exclusing snapshot changes
1300 epoch_t last_force_op_resend
; ///< last epoch that forced clients to resend
1301 /// last epoch that forced clients to resend (pre-luminous clients only)
1302 epoch_t last_force_op_resend_preluminous
;
1303 snapid_t snap_seq
; ///< seq for per-pool snapshot
1304 epoch_t snap_epoch
; ///< osdmap epoch of last snap
1305 uint64_t auid
; ///< who owns the pg
1306 __u32 crash_replay_interval
; ///< seconds to allow clients to replay ACKed but unCOMMITted requests
1308 uint64_t quota_max_bytes
; ///< maximum number of bytes for this pool
1309 uint64_t quota_max_objects
; ///< maximum number of objects for this pool
1312 * Pool snaps (global to this pool). These define a SnapContext for
1313 * the pool, unless the client manually specifies an alternate
1316 map
<snapid_t
, pool_snap_info_t
> snaps
;
1318 * Alternatively, if we are defining non-pool snaps (e.g. via the
1319 * Ceph MDS), we must track @removed_snaps (since @snaps is not
1320 * used). Snaps and removed_snaps are to be used exclusive of each
1323 interval_set
<snapid_t
> removed_snaps
;
1325 unsigned pg_num_mask
, pgp_num_mask
;
1327 set
<uint64_t> tiers
; ///< pools that are tiers of us
1328 int64_t tier_of
; ///< pool for which we are a tier
1329 // Note that write wins for read+write ops
1330 int64_t read_tier
; ///< pool/tier for objecter to direct reads to
1331 int64_t write_tier
; ///< pool/tier for objecter to direct writes to
1332 cache_mode_t cache_mode
; ///< cache pool mode
1334 bool is_tier() const { return tier_of
>= 0; }
1335 bool has_tiers() const { return !tiers
.empty(); }
1340 clear_tier_tunables();
1342 bool has_read_tier() const { return read_tier
>= 0; }
1343 void clear_read_tier() { read_tier
= -1; }
1344 bool has_write_tier() const { return write_tier
>= 0; }
1345 void clear_write_tier() { write_tier
= -1; }
1346 void clear_tier_tunables() {
1347 if (cache_mode
!= CACHEMODE_NONE
)
1348 flags
|= FLAG_INCOMPLETE_CLONES
;
1349 cache_mode
= CACHEMODE_NONE
;
1351 target_max_bytes
= 0;
1352 target_max_objects
= 0;
1353 cache_target_dirty_ratio_micro
= 0;
1354 cache_target_dirty_high_ratio_micro
= 0;
1355 cache_target_full_ratio_micro
= 0;
1356 hit_set_params
= HitSet::Params();
1359 hit_set_grade_decay_rate
= 0;
1360 hit_set_search_last_n
= 0;
1361 grade_table
.resize(0);
1364 uint64_t target_max_bytes
; ///< tiering: target max pool size
1365 uint64_t target_max_objects
; ///< tiering: target max pool size
1367 uint32_t cache_target_dirty_ratio_micro
; ///< cache: fraction of target to leave dirty
1368 uint32_t cache_target_dirty_high_ratio_micro
; ///<cache: fraction of target to flush with high speed
1369 uint32_t cache_target_full_ratio_micro
; ///< cache: fraction of target to fill before we evict in earnest
1371 uint32_t cache_min_flush_age
; ///< minimum age (seconds) before we can flush
1372 uint32_t cache_min_evict_age
; ///< minimum age (seconds) before we can evict
1374 HitSet::Params hit_set_params
; ///< The HitSet params to use on this pool
1375 uint32_t hit_set_period
; ///< periodicity of HitSet segments (seconds)
1376 uint32_t hit_set_count
; ///< number of periods to retain
1377 bool use_gmt_hitset
; ///< use gmt to name the hitset archive object
1378 uint32_t min_read_recency_for_promote
; ///< minimum number of HitSet to check before promote on read
1379 uint32_t min_write_recency_for_promote
; ///< minimum number of HitSet to check before promote on write
1380 uint32_t hit_set_grade_decay_rate
; ///< current hit_set has highest priority on objects
1381 ///temperature count,the follow hit_set's priority decay
1382 ///by this params than pre hit_set
1383 uint32_t hit_set_search_last_n
; ///<accumulate atmost N hit_sets for temperature
1385 uint32_t stripe_width
; ///< erasure coded stripe size in bytes
1387 uint64_t expected_num_objects
; ///< expected number of objects on this pool, a value of 0 indicates
1388 ///< user does not specify any expected value
1389 bool fast_read
; ///< whether turn on fast read on the pool or not
1391 pool_opts_t opts
; ///< options
1393 /// application -> key/value metadata
1394 map
<string
, std::map
<string
, string
>> application_metadata
;
1397 vector
<uint32_t> grade_table
;
1400 uint32_t get_grade(unsigned i
) const {
1401 if (grade_table
.size() <= i
)
1403 return grade_table
[i
];
1405 void calc_grade_table() {
1406 unsigned v
= 1000000;
1407 grade_table
.resize(hit_set_count
);
1408 for (unsigned i
= 0; i
< hit_set_count
; i
++) {
1409 v
= v
* (1 - (hit_set_grade_decay_rate
/ 100.0));
1415 : flags(0), type(0), size(0), min_size(0),
1416 crush_rule(0), object_hash(0),
1417 pg_num(0), pgp_num(0),
1419 last_force_op_resend(0),
1420 last_force_op_resend_preluminous(0),
1421 snap_seq(0), snap_epoch(0),
1423 crash_replay_interval(0),
1424 quota_max_bytes(0), quota_max_objects(0),
1425 pg_num_mask(0), pgp_num_mask(0),
1426 tier_of(-1), read_tier(-1), write_tier(-1),
1427 cache_mode(CACHEMODE_NONE
),
1428 target_max_bytes(0), target_max_objects(0),
1429 cache_target_dirty_ratio_micro(0),
1430 cache_target_dirty_high_ratio_micro(0),
1431 cache_target_full_ratio_micro(0),
1432 cache_min_flush_age(0),
1433 cache_min_evict_age(0),
1437 use_gmt_hitset(true),
1438 min_read_recency_for_promote(0),
1439 min_write_recency_for_promote(0),
1440 hit_set_grade_decay_rate(0),
1441 hit_set_search_last_n(0),
1443 expected_num_objects(0),
1448 void dump(Formatter
*f
) const;
1450 uint64_t get_flags() const { return flags
; }
1451 bool has_flag(uint64_t f
) const { return flags
& f
; }
1452 void set_flag(uint64_t f
) { flags
|= f
; }
1453 void unset_flag(uint64_t f
) { flags
&= ~f
; }
1455 bool ec_pool() const {
1456 return type
== TYPE_ERASURE
;
1458 bool require_rollback() const {
1462 /// true if incomplete clones may be present
1463 bool allow_incomplete_clones() const {
1464 return cache_mode
!= CACHEMODE_NONE
|| has_flag(FLAG_INCOMPLETE_CLONES
);
1467 unsigned get_type() const { return type
; }
1468 unsigned get_size() const { return size
; }
1469 unsigned get_min_size() const { return min_size
; }
1470 int get_crush_rule() const { return crush_rule
; }
1471 int get_object_hash() const { return object_hash
; }
1472 const char *get_object_hash_name() const {
1473 return ceph_str_hash_name(get_object_hash());
1475 epoch_t
get_last_change() const { return last_change
; }
1476 epoch_t
get_last_force_op_resend() const { return last_force_op_resend
; }
1477 epoch_t
get_last_force_op_resend_preluminous() const {
1478 return last_force_op_resend_preluminous
;
1480 epoch_t
get_snap_epoch() const { return snap_epoch
; }
1481 snapid_t
get_snap_seq() const { return snap_seq
; }
1482 uint64_t get_auid() const { return auid
; }
1483 unsigned get_crash_replay_interval() const { return crash_replay_interval
; }
1485 void set_snap_seq(snapid_t s
) { snap_seq
= s
; }
1486 void set_snap_epoch(epoch_t e
) { snap_epoch
= e
; }
1488 void set_stripe_width(uint32_t s
) { stripe_width
= s
; }
1489 uint32_t get_stripe_width() const { return stripe_width
; }
1491 bool is_replicated() const { return get_type() == TYPE_REPLICATED
; }
1492 bool is_erasure() const { return get_type() == TYPE_ERASURE
; }
1494 bool supports_omap() const {
1495 return !(get_type() == TYPE_ERASURE
);
1498 bool requires_aligned_append() const {
1499 return is_erasure() && !has_flag(FLAG_EC_OVERWRITES
);
1501 uint64_t required_alignment() const { return stripe_width
; }
1503 bool allows_ecoverwrites() const {
1504 return has_flag(FLAG_EC_OVERWRITES
);
1507 bool can_shift_osds() const {
1508 switch (get_type()) {
1509 case TYPE_REPLICATED
:
1514 assert(0 == "unhandled pool type");
1518 unsigned get_pg_num() const { return pg_num
; }
1519 unsigned get_pgp_num() const { return pgp_num
; }
1521 unsigned get_pg_num_mask() const { return pg_num_mask
; }
1522 unsigned get_pgp_num_mask() const { return pgp_num_mask
; }
1524 // if pg_num is not a multiple of two, pgs are not equally sized.
1525 // return, for a given pg, the fraction (denominator) of the total
1526 // pool size that it represents.
1527 unsigned get_pg_num_divisor(pg_t pgid
) const;
1529 void set_pg_num(int p
) {
1533 void set_pgp_num(int p
) {
1538 void set_quota_max_bytes(uint64_t m
) {
1539 quota_max_bytes
= m
;
1541 uint64_t get_quota_max_bytes() {
1542 return quota_max_bytes
;
1545 void set_quota_max_objects(uint64_t m
) {
1546 quota_max_objects
= m
;
1548 uint64_t get_quota_max_objects() {
1549 return quota_max_objects
;
1552 void set_last_force_op_resend(uint64_t t
) {
1553 last_force_op_resend
= t
;
1554 last_force_op_resend_preluminous
= t
;
1557 void calc_pg_masks();
1560 * we have two snap modes:
1561 * - pool global snaps
1562 * - snap existence/non-existence defined by snaps[] and snap_seq
1563 * - user managed snaps
1564 * - removal governed by removed_snaps
1566 * we know which mode we're using based on whether removed_snaps is empty.
1567 * If nothing has been created, both functions report false.
1569 bool is_pool_snaps_mode() const;
1570 bool is_unmanaged_snaps_mode() const;
1571 bool is_removed_snap(snapid_t s
) const;
1574 * build set of known-removed sets from either pool snaps or
1575 * explicit removed_snaps set.
1577 void build_removed_snaps(interval_set
<snapid_t
>& rs
) const;
1578 bool maybe_updated_removed_snaps(const interval_set
<snapid_t
>& cached
) const;
1579 snapid_t
snap_exists(const char *s
) const;
1580 void add_snap(const char *n
, utime_t stamp
);
1581 void add_unmanaged_snap(uint64_t& snapid
);
1582 void remove_snap(snapid_t s
);
1583 void remove_unmanaged_snap(snapid_t s
);
1585 SnapContext
get_snap_context() const;
1587 /// hash a object name+namespace key to a hash position
1588 uint32_t hash_key(const string
& key
, const string
& ns
) const;
1590 /// round a hash position down to a pg num
1591 uint32_t raw_hash_to_pg(uint32_t v
) const;
1594 * map a raw pg (with full precision ps) into an actual pg, for storage
1596 pg_t
raw_pg_to_pg(pg_t pg
) const;
1599 * map raw pg (full precision ps) into a placement seed. include
1600 * pool id in that value so that different pools don't use the same
1603 ps_t
raw_pg_to_pps(pg_t pg
) const;
1605 /// choose a random hash position within a pg
1606 uint32_t get_random_pg_position(pg_t pgid
, uint32_t seed
) const;
1608 void encode(bufferlist
& bl
, uint64_t features
) const;
1609 void decode(bufferlist::iterator
& bl
);
1611 static void generate_test_instances(list
<pg_pool_t
*>& o
);
1613 WRITE_CLASS_ENCODER_FEATURES(pg_pool_t
)
1615 ostream
& operator<<(ostream
& out
, const pg_pool_t
& p
);
1619 * a summation of object stats
1621 * This is just a container for object stats; we don't know what for.
1623 * If you add members in object_stat_sum_t, you should make sure there are
1624 * not padding among these members.
1625 * You should also modify the padding_check function.
1628 struct object_stat_sum_t
{
1629 /**************************************************************************
1630 * WARNING: be sure to update operator==, floor, and split when
1631 * adding/removing fields!
1632 **************************************************************************/
1633 int64_t num_bytes
; // in bytes
1634 int64_t num_objects
;
1635 int64_t num_object_clones
;
1636 int64_t num_object_copies
; // num_objects * num_replicas
1637 int64_t num_objects_missing_on_primary
;
1638 int64_t num_objects_degraded
;
1639 int64_t num_objects_unfound
;
1644 int64_t num_scrub_errors
; // total deep and shallow scrub errors
1645 int64_t num_objects_recovered
;
1646 int64_t num_bytes_recovered
;
1647 int64_t num_keys_recovered
;
1648 int64_t num_shallow_scrub_errors
;
1649 int64_t num_deep_scrub_errors
;
1650 int64_t num_objects_dirty
;
1651 int64_t num_whiteouts
;
1652 int64_t num_objects_omap
;
1653 int64_t num_objects_hit_set_archive
;
1654 int64_t num_objects_misplaced
;
1655 int64_t num_bytes_hit_set_archive
;
1657 int64_t num_flush_kb
;
1659 int64_t num_evict_kb
;
1660 int64_t num_promote
;
1661 int32_t num_flush_mode_high
; // 1 when in high flush mode, otherwise 0
1662 int32_t num_flush_mode_low
; // 1 when in low flush mode, otherwise 0
1663 int32_t num_evict_mode_some
; // 1 when in evict some mode, otherwise 0
1664 int32_t num_evict_mode_full
; // 1 when in evict full mode, otherwise 0
1665 int64_t num_objects_pinned
;
1666 int64_t num_objects_missing
;
1667 int64_t num_legacy_snapsets
; ///< upper bound on pre-luminous-style SnapSets
1668 int64_t num_large_omap_objects
= 0;
1672 num_objects(0), num_object_clones(0), num_object_copies(0),
1673 num_objects_missing_on_primary(0), num_objects_degraded(0),
1674 num_objects_unfound(0),
1675 num_rd(0), num_rd_kb(0), num_wr(0), num_wr_kb(0),
1676 num_scrub_errors(0),
1677 num_objects_recovered(0),
1678 num_bytes_recovered(0),
1679 num_keys_recovered(0),
1680 num_shallow_scrub_errors(0),
1681 num_deep_scrub_errors(0),
1682 num_objects_dirty(0),
1684 num_objects_omap(0),
1685 num_objects_hit_set_archive(0),
1686 num_objects_misplaced(0),
1687 num_bytes_hit_set_archive(0),
1693 num_flush_mode_high(0), num_flush_mode_low(0),
1694 num_evict_mode_some(0), num_evict_mode_full(0),
1695 num_objects_pinned(0),
1696 num_objects_missing(0),
1697 num_legacy_snapsets(0)
1700 void floor(int64_t f
) {
1701 #define FLOOR(x) if (x < f) x = f
1704 FLOOR(num_object_clones
);
1705 FLOOR(num_object_copies
);
1706 FLOOR(num_objects_missing_on_primary
);
1707 FLOOR(num_objects_missing
);
1708 FLOOR(num_objects_degraded
);
1709 FLOOR(num_objects_misplaced
);
1710 FLOOR(num_objects_unfound
);
1715 FLOOR(num_large_omap_objects
);
1716 FLOOR(num_shallow_scrub_errors
);
1717 FLOOR(num_deep_scrub_errors
);
1718 num_scrub_errors
= num_shallow_scrub_errors
+ num_deep_scrub_errors
;
1719 FLOOR(num_objects_recovered
);
1720 FLOOR(num_bytes_recovered
);
1721 FLOOR(num_keys_recovered
);
1722 FLOOR(num_objects_dirty
);
1723 FLOOR(num_whiteouts
);
1724 FLOOR(num_objects_omap
);
1725 FLOOR(num_objects_hit_set_archive
);
1726 FLOOR(num_bytes_hit_set_archive
);
1728 FLOOR(num_flush_kb
);
1730 FLOOR(num_evict_kb
);
1732 FLOOR(num_flush_mode_high
);
1733 FLOOR(num_flush_mode_low
);
1734 FLOOR(num_evict_mode_some
);
1735 FLOOR(num_evict_mode_full
);
1736 FLOOR(num_objects_pinned
);
1737 FLOOR(num_legacy_snapsets
);
1741 void split(vector
<object_stat_sum_t
> &out
) const {
1742 #define SPLIT(PARAM) \
1743 for (unsigned i = 0; i < out.size(); ++i) { \
1744 out[i].PARAM = PARAM / out.size(); \
1745 if (i < (PARAM % out.size())) { \
1749 #define SPLIT_PRESERVE_NONZERO(PARAM) \
1750 for (unsigned i = 0; i < out.size(); ++i) { \
1752 out[i].PARAM = 1 + PARAM / out.size(); \
1759 SPLIT(num_object_clones
);
1760 SPLIT(num_object_copies
);
1761 SPLIT(num_objects_missing_on_primary
);
1762 SPLIT(num_objects_missing
);
1763 SPLIT(num_objects_degraded
);
1764 SPLIT(num_objects_misplaced
);
1765 SPLIT(num_objects_unfound
);
1770 SPLIT_PRESERVE_NONZERO(num_shallow_scrub_errors
);
1771 SPLIT_PRESERVE_NONZERO(num_deep_scrub_errors
);
1772 for (unsigned i
= 0; i
< out
.size(); ++i
) {
1773 out
[i
].num_scrub_errors
= out
[i
].num_shallow_scrub_errors
+
1774 out
[i
].num_deep_scrub_errors
;
1776 SPLIT(num_large_omap_objects
);
1777 SPLIT(num_objects_recovered
);
1778 SPLIT(num_bytes_recovered
);
1779 SPLIT(num_keys_recovered
);
1780 SPLIT(num_objects_dirty
);
1781 SPLIT(num_whiteouts
);
1782 SPLIT(num_objects_omap
);
1783 SPLIT(num_objects_hit_set_archive
);
1784 SPLIT(num_bytes_hit_set_archive
);
1786 SPLIT(num_flush_kb
);
1788 SPLIT(num_evict_kb
);
1790 SPLIT(num_flush_mode_high
);
1791 SPLIT(num_flush_mode_low
);
1792 SPLIT(num_evict_mode_some
);
1793 SPLIT(num_evict_mode_full
);
1794 SPLIT(num_objects_pinned
);
1795 SPLIT_PRESERVE_NONZERO(num_legacy_snapsets
);
1797 #undef SPLIT_PRESERVE_NONZERO
1801 memset(this, 0, sizeof(*this));
1804 void calc_copies(int nrep
) {
1805 num_object_copies
= nrep
* num_objects
;
1808 bool is_zero() const {
1809 return mem_is_zero((char*)this, sizeof(*this));
1812 void add(const object_stat_sum_t
& o
);
1813 void sub(const object_stat_sum_t
& o
);
1815 void dump(Formatter
*f
) const;
1816 void padding_check() {
1818 sizeof(object_stat_sum_t
) ==
1820 sizeof(num_objects
) +
1821 sizeof(num_object_clones
) +
1822 sizeof(num_object_copies
) +
1823 sizeof(num_objects_missing_on_primary
) +
1824 sizeof(num_objects_degraded
) +
1825 sizeof(num_objects_unfound
) +
1830 sizeof(num_scrub_errors
) +
1831 sizeof(num_large_omap_objects
) +
1832 sizeof(num_objects_recovered
) +
1833 sizeof(num_bytes_recovered
) +
1834 sizeof(num_keys_recovered
) +
1835 sizeof(num_shallow_scrub_errors
) +
1836 sizeof(num_deep_scrub_errors
) +
1837 sizeof(num_objects_dirty
) +
1838 sizeof(num_whiteouts
) +
1839 sizeof(num_objects_omap
) +
1840 sizeof(num_objects_hit_set_archive
) +
1841 sizeof(num_objects_misplaced
) +
1842 sizeof(num_bytes_hit_set_archive
) +
1844 sizeof(num_flush_kb
) +
1846 sizeof(num_evict_kb
) +
1847 sizeof(num_promote
) +
1848 sizeof(num_flush_mode_high
) +
1849 sizeof(num_flush_mode_low
) +
1850 sizeof(num_evict_mode_some
) +
1851 sizeof(num_evict_mode_full
) +
1852 sizeof(num_objects_pinned
) +
1853 sizeof(num_objects_missing
) +
1854 sizeof(num_legacy_snapsets
)
1856 "object_stat_sum_t have padding");
1858 void encode(bufferlist
& bl
) const;
1859 void decode(bufferlist::iterator
& bl
);
1860 static void generate_test_instances(list
<object_stat_sum_t
*>& o
);
1862 WRITE_CLASS_ENCODER(object_stat_sum_t
)
1864 bool operator==(const object_stat_sum_t
& l
, const object_stat_sum_t
& r
);
1867 * a collection of object stat sums
1869 * This is a collection of stat sums over different categories.
1871 struct object_stat_collection_t
{
1872 /**************************************************************************
1873 * WARNING: be sure to update the operator== when adding/removing fields! *
1874 **************************************************************************/
1875 object_stat_sum_t sum
;
1877 void calc_copies(int nrep
) {
1878 sum
.calc_copies(nrep
);
1881 void dump(Formatter
*f
) const;
1882 void encode(bufferlist
& bl
) const;
1883 void decode(bufferlist::iterator
& bl
);
1884 static void generate_test_instances(list
<object_stat_collection_t
*>& o
);
1886 bool is_zero() const {
1887 return sum
.is_zero();
1894 void floor(int64_t f
) {
1898 void add(const object_stat_sum_t
& o
) {
1902 void add(const object_stat_collection_t
& o
) {
1905 void sub(const object_stat_collection_t
& o
) {
1909 WRITE_CLASS_ENCODER(object_stat_collection_t
)
1911 inline bool operator==(const object_stat_collection_t
& l
,
1912 const object_stat_collection_t
& r
) {
1913 return l
.sum
== r
.sum
;
1918 * aggregate stats for a single PG.
1921 /**************************************************************************
1922 * WARNING: be sure to update the operator== when adding/removing fields! *
1923 **************************************************************************/
1925 version_t reported_seq
; // sequence number
1926 epoch_t reported_epoch
; // epoch of this report
1928 utime_t last_fresh
; // last reported
1929 utime_t last_change
; // new state != previous state
1930 utime_t last_active
; // state & PG_STATE_ACTIVE
1931 utime_t last_peered
; // state & PG_STATE_ACTIVE || state & PG_STATE_PEERED
1932 utime_t last_clean
; // state & PG_STATE_CLEAN
1933 utime_t last_unstale
; // (state & PG_STATE_STALE) == 0
1934 utime_t last_undegraded
; // (state & PG_STATE_DEGRADED) == 0
1935 utime_t last_fullsized
; // (state & PG_STATE_UNDERSIZED) == 0
1937 eversion_t log_start
; // (log_start,version]
1938 eversion_t ondisk_log_start
; // there may be more on disk
1941 epoch_t last_epoch_clean
;
1943 __u32 parent_split_bits
;
1945 eversion_t last_scrub
;
1946 eversion_t last_deep_scrub
;
1947 utime_t last_scrub_stamp
;
1948 utime_t last_deep_scrub_stamp
;
1949 utime_t last_clean_scrub_stamp
;
1951 object_stat_collection_t stats
;
1954 int64_t ondisk_log_size
; // >= active_log_size
1956 vector
<int32_t> up
, acting
;
1957 epoch_t mapping_epoch
;
1959 vector
<int32_t> blocked_by
; ///< osds on which the pg is blocked
1961 utime_t last_became_active
;
1962 utime_t last_became_peered
;
1964 /// up, acting primaries
1966 int32_t acting_primary
;
1968 // snaptrimq.size() is 64bit, but let's be serious - anything over 50k is
1969 // absurd already, so cap it to 2^32 and save 4 bytes at the same time
1970 uint32_t snaptrimq_len
;
1972 bool stats_invalid
:1;
1973 /// true if num_objects_dirty is not accurate (because it was not
1974 /// maintained starting from pool creation)
1975 bool dirty_stats_invalid
:1;
1976 bool omap_stats_invalid
:1;
1977 bool hitset_stats_invalid
:1;
1978 bool hitset_bytes_stats_invalid
:1;
1979 bool pin_stats_invalid
:1;
1985 created(0), last_epoch_clean(0),
1986 parent_split_bits(0),
1987 log_size(0), ondisk_log_size(0),
1992 stats_invalid(false),
1993 dirty_stats_invalid(false),
1994 omap_stats_invalid(false),
1995 hitset_stats_invalid(false),
1996 hitset_bytes_stats_invalid(false),
1997 pin_stats_invalid(false)
2000 epoch_t
get_effective_last_epoch_clean() const {
2001 if (state
& PG_STATE_CLEAN
) {
2002 // we are clean as of this report, and should thus take the
2004 return reported_epoch
;
2006 return last_epoch_clean
;
2010 pair
<epoch_t
, version_t
> get_version_pair() const {
2011 return make_pair(reported_epoch
, reported_seq
);
2014 void floor(int64_t f
) {
2018 if (ondisk_log_size
< f
)
2019 ondisk_log_size
= f
;
2020 if (snaptrimq_len
< f
)
2024 void add(const pg_stat_t
& o
) {
2026 log_size
+= o
.log_size
;
2027 ondisk_log_size
+= o
.ondisk_log_size
;
2028 if (((uint64_t)snaptrimq_len
+ (uint64_t)o
.snaptrimq_len
) > (uint64_t)(1 << 31)) {
2029 snaptrimq_len
= 1 << 31;
2031 snaptrimq_len
+= o
.snaptrimq_len
;
2034 void sub(const pg_stat_t
& o
) {
2036 log_size
-= o
.log_size
;
2037 ondisk_log_size
-= o
.ondisk_log_size
;
2038 if (o
.snaptrimq_len
< snaptrimq_len
) {
2039 snaptrimq_len
-= o
.snaptrimq_len
;
2045 bool is_acting_osd(int32_t osd
, bool primary
) const;
2046 void dump(Formatter
*f
) const;
2047 void dump_brief(Formatter
*f
) const;
2048 void encode(bufferlist
&bl
) const;
2049 void decode(bufferlist::iterator
&bl
);
2050 static void generate_test_instances(list
<pg_stat_t
*>& o
);
2052 WRITE_CLASS_ENCODER(pg_stat_t
)
2054 bool operator==(const pg_stat_t
& l
, const pg_stat_t
& r
);
2057 * summation over an entire pool
2059 struct pool_stat_t
{
2060 object_stat_collection_t stats
;
2062 int64_t ondisk_log_size
; // >= active_log_size
2063 int32_t up
; ///< number of up replicas or shards
2064 int32_t acting
; ///< number of acting replicas or shards
2066 pool_stat_t() : log_size(0), ondisk_log_size(0), up(0), acting(0)
2069 void floor(int64_t f
) {
2073 if (ondisk_log_size
< f
)
2074 ondisk_log_size
= f
;
2081 void add(const pg_stat_t
& o
) {
2083 log_size
+= o
.log_size
;
2084 ondisk_log_size
+= o
.ondisk_log_size
;
2086 acting
+= o
.acting
.size();
2088 void sub(const pg_stat_t
& o
) {
2090 log_size
-= o
.log_size
;
2091 ondisk_log_size
-= o
.ondisk_log_size
;
2093 acting
-= o
.acting
.size();
2096 bool is_zero() const {
2097 return (stats
.is_zero() &&
2099 ondisk_log_size
== 0 &&
2104 void dump(Formatter
*f
) const;
2105 void encode(bufferlist
&bl
, uint64_t features
) const;
2106 void decode(bufferlist::iterator
&bl
);
2107 static void generate_test_instances(list
<pool_stat_t
*>& o
);
2109 WRITE_CLASS_ENCODER_FEATURES(pool_stat_t
)
2112 // -----------------------------------------
2115 * pg_hit_set_info_t - information about a single recorded HitSet
2117 * Track basic metadata about a HitSet, like the nubmer of insertions
2118 * and the time range it covers.
2120 struct pg_hit_set_info_t
{
2121 utime_t begin
, end
; ///< time interval
2122 eversion_t version
; ///< version this HitSet object was written
2123 bool using_gmt
; ///< use gmt for creating the hit_set archive object name
2125 friend bool operator==(const pg_hit_set_info_t
& l
,
2126 const pg_hit_set_info_t
& r
) {
2128 l
.begin
== r
.begin
&&
2130 l
.version
== r
.version
&&
2131 l
.using_gmt
== r
.using_gmt
;
2134 explicit pg_hit_set_info_t(bool using_gmt
= true)
2135 : using_gmt(using_gmt
) {}
2137 void encode(bufferlist
&bl
) const;
2138 void decode(bufferlist::iterator
&bl
);
2139 void dump(Formatter
*f
) const;
2140 static void generate_test_instances(list
<pg_hit_set_info_t
*>& o
);
2142 WRITE_CLASS_ENCODER(pg_hit_set_info_t
)
2145 * pg_hit_set_history_t - information about a history of hitsets
2147 * Include information about the currently accumulating hit set as well
2148 * as archived/historical ones.
2150 struct pg_hit_set_history_t
{
2151 eversion_t current_last_update
; ///< last version inserted into current set
2152 list
<pg_hit_set_info_t
> history
; ///< archived sets, sorted oldest -> newest
2154 friend bool operator==(const pg_hit_set_history_t
& l
,
2155 const pg_hit_set_history_t
& r
) {
2157 l
.current_last_update
== r
.current_last_update
&&
2158 l
.history
== r
.history
;
2161 void encode(bufferlist
&bl
) const;
2162 void decode(bufferlist::iterator
&bl
);
2163 void dump(Formatter
*f
) const;
2164 static void generate_test_instances(list
<pg_hit_set_history_t
*>& o
);
2166 WRITE_CLASS_ENCODER(pg_hit_set_history_t
)
2169 // -----------------------------------------
2172 * pg_history_t - information about recent pg peering/mapping history
2174 * This is aggressively shared between OSDs to bound the amount of past
2175 * history they need to worry about.
2177 struct pg_history_t
{
2178 epoch_t epoch_created
; // epoch in which *pg* was created (pool or pg)
2179 epoch_t epoch_pool_created
; // epoch in which *pool* was created
2180 // (note: may be pg creation epoch for
2181 // pre-luminous clusters)
2182 epoch_t last_epoch_started
; // lower bound on last epoch started (anywhere, not necessarily locally)
2183 epoch_t last_interval_started
; // first epoch of last_epoch_started interval
2184 epoch_t last_epoch_clean
; // lower bound on last epoch the PG was completely clean.
2185 epoch_t last_interval_clean
; // first epoch of last_epoch_clean interval
2186 epoch_t last_epoch_split
; // as parent or child
2187 epoch_t last_epoch_marked_full
; // pool or cluster
2190 * In the event of a map discontinuity, same_*_since may reflect the first
2191 * map the osd has seen in the new map sequence rather than the actual start
2192 * of the interval. This is ok since a discontinuity at epoch e means there
2193 * must have been a clean interval between e and now and that we cannot be
2194 * in the active set during the interval containing e.
2196 epoch_t same_up_since
; // same acting set since
2197 epoch_t same_interval_since
; // same acting AND up set since
2198 epoch_t same_primary_since
; // same primary at least back through this epoch.
2200 eversion_t last_scrub
;
2201 eversion_t last_deep_scrub
;
2202 utime_t last_scrub_stamp
;
2203 utime_t last_deep_scrub_stamp
;
2204 utime_t last_clean_scrub_stamp
;
2206 friend bool operator==(const pg_history_t
& l
, const pg_history_t
& r
) {
2208 l
.epoch_created
== r
.epoch_created
&&
2209 l
.epoch_pool_created
== r
.epoch_pool_created
&&
2210 l
.last_epoch_started
== r
.last_epoch_started
&&
2211 l
.last_interval_started
== r
.last_interval_started
&&
2212 l
.last_epoch_clean
== r
.last_epoch_clean
&&
2213 l
.last_interval_clean
== r
.last_interval_clean
&&
2214 l
.last_epoch_split
== r
.last_epoch_split
&&
2215 l
.last_epoch_marked_full
== r
.last_epoch_marked_full
&&
2216 l
.same_up_since
== r
.same_up_since
&&
2217 l
.same_interval_since
== r
.same_interval_since
&&
2218 l
.same_primary_since
== r
.same_primary_since
&&
2219 l
.last_scrub
== r
.last_scrub
&&
2220 l
.last_deep_scrub
== r
.last_deep_scrub
&&
2221 l
.last_scrub_stamp
== r
.last_scrub_stamp
&&
2222 l
.last_deep_scrub_stamp
== r
.last_deep_scrub_stamp
&&
2223 l
.last_clean_scrub_stamp
== r
.last_clean_scrub_stamp
;
2228 epoch_pool_created(0),
2229 last_epoch_started(0),
2230 last_interval_started(0),
2231 last_epoch_clean(0),
2232 last_interval_clean(0),
2233 last_epoch_split(0),
2234 last_epoch_marked_full(0),
2235 same_up_since(0), same_interval_since(0), same_primary_since(0) {}
2237 bool merge(const pg_history_t
&other
) {
2238 // Here, we only update the fields which cannot be calculated from the OSDmap.
2239 bool modified
= false;
2240 if (epoch_created
< other
.epoch_created
) {
2241 epoch_created
= other
.epoch_created
;
2244 if (epoch_pool_created
< other
.epoch_pool_created
) {
2245 // FIXME: for jewel compat only; this should either be 0 or always the
2246 // same value across all pg instances.
2247 epoch_pool_created
= other
.epoch_pool_created
;
2250 if (last_epoch_started
< other
.last_epoch_started
) {
2251 last_epoch_started
= other
.last_epoch_started
;
2254 if (last_interval_started
< other
.last_interval_started
) {
2255 last_interval_started
= other
.last_interval_started
;
2258 if (last_epoch_clean
< other
.last_epoch_clean
) {
2259 last_epoch_clean
= other
.last_epoch_clean
;
2262 if (last_interval_clean
< other
.last_interval_clean
) {
2263 last_interval_clean
= other
.last_interval_clean
;
2266 if (last_epoch_split
< other
.last_epoch_split
) {
2267 last_epoch_split
= other
.last_epoch_split
;
2270 if (last_epoch_marked_full
< other
.last_epoch_marked_full
) {
2271 last_epoch_marked_full
= other
.last_epoch_marked_full
;
2274 if (other
.last_scrub
> last_scrub
) {
2275 last_scrub
= other
.last_scrub
;
2278 if (other
.last_scrub_stamp
> last_scrub_stamp
) {
2279 last_scrub_stamp
= other
.last_scrub_stamp
;
2282 if (other
.last_deep_scrub
> last_deep_scrub
) {
2283 last_deep_scrub
= other
.last_deep_scrub
;
2286 if (other
.last_deep_scrub_stamp
> last_deep_scrub_stamp
) {
2287 last_deep_scrub_stamp
= other
.last_deep_scrub_stamp
;
2290 if (other
.last_clean_scrub_stamp
> last_clean_scrub_stamp
) {
2291 last_clean_scrub_stamp
= other
.last_clean_scrub_stamp
;
2297 void encode(bufferlist
& bl
) const;
2298 void decode(bufferlist::iterator
& p
);
2299 void dump(Formatter
*f
) const;
2300 static void generate_test_instances(list
<pg_history_t
*>& o
);
2302 WRITE_CLASS_ENCODER(pg_history_t
)
2304 inline ostream
& operator<<(ostream
& out
, const pg_history_t
& h
) {
2305 return out
<< "ec=" << h
.epoch_created
<< "/" << h
.epoch_pool_created
2306 << " lis/c " << h
.last_interval_started
2307 << "/" << h
.last_interval_clean
2308 << " les/c/f " << h
.last_epoch_started
<< "/" << h
.last_epoch_clean
2309 << "/" << h
.last_epoch_marked_full
2310 << " " << h
.same_up_since
2311 << "/" << h
.same_interval_since
2312 << "/" << h
.same_primary_since
;
2317 * pg_info_t - summary of PG statistics.
2320 * - last_complete implies we have all objects that existed as of that
2321 * stamp, OR a newer object, OR have already applied a later delete.
2322 * - if last_complete >= log.bottom, then we know pg contents thru log.head.
2323 * otherwise, we have no idea what the pg is supposed to contain.
2327 eversion_t last_update
; ///< last object version applied to store.
2328 eversion_t last_complete
; ///< last version pg was complete through.
2329 epoch_t last_epoch_started
; ///< last epoch at which this pg started on this osd
2330 epoch_t last_interval_started
; ///< first epoch of last_epoch_started interval
2332 version_t last_user_version
; ///< last user object version applied to store
2334 eversion_t log_tail
; ///< oldest log entry.
2336 hobject_t last_backfill
; ///< objects >= this and < last_complete may be missing
2337 bool last_backfill_bitwise
; ///< true if last_backfill reflects a bitwise (vs nibblewise) sort
2339 interval_set
<snapid_t
> purged_snaps
;
2343 pg_history_t history
;
2344 pg_hit_set_history_t hit_set
;
2346 friend bool operator==(const pg_info_t
& l
, const pg_info_t
& r
) {
2349 l
.last_update
== r
.last_update
&&
2350 l
.last_complete
== r
.last_complete
&&
2351 l
.last_epoch_started
== r
.last_epoch_started
&&
2352 l
.last_interval_started
== r
.last_interval_started
&&
2353 l
.last_user_version
== r
.last_user_version
&&
2354 l
.log_tail
== r
.log_tail
&&
2355 l
.last_backfill
== r
.last_backfill
&&
2356 l
.last_backfill_bitwise
== r
.last_backfill_bitwise
&&
2357 l
.purged_snaps
== r
.purged_snaps
&&
2358 l
.stats
== r
.stats
&&
2359 l
.history
== r
.history
&&
2360 l
.hit_set
== r
.hit_set
;
2364 : last_epoch_started(0),
2365 last_interval_started(0),
2366 last_user_version(0),
2367 last_backfill(hobject_t::get_max()),
2368 last_backfill_bitwise(false)
2370 // cppcheck-suppress noExplicitConstructor
2373 last_epoch_started(0),
2374 last_interval_started(0),
2375 last_user_version(0),
2376 last_backfill(hobject_t::get_max()),
2377 last_backfill_bitwise(false)
2380 void set_last_backfill(hobject_t pos
) {
2381 last_backfill
= pos
;
2382 last_backfill_bitwise
= true;
2385 bool is_empty() const { return last_update
.version
== 0; }
2386 bool dne() const { return history
.epoch_created
== 0; }
2388 bool is_incomplete() const { return !last_backfill
.is_max(); }
2390 void encode(bufferlist
& bl
) const;
2391 void decode(bufferlist::iterator
& p
);
2392 void dump(Formatter
*f
) const;
2393 bool overlaps_with(const pg_info_t
&oinfo
) const {
2394 return last_update
> oinfo
.log_tail
?
2395 oinfo
.last_update
>= log_tail
:
2396 last_update
>= oinfo
.log_tail
;
2398 static void generate_test_instances(list
<pg_info_t
*>& o
);
2400 WRITE_CLASS_ENCODER(pg_info_t
)
2402 inline ostream
& operator<<(ostream
& out
, const pg_info_t
& pgi
)
2404 out
<< pgi
.pgid
<< "(";
2410 out
<< " v " << pgi
.last_update
;
2411 if (pgi
.last_complete
!= pgi
.last_update
)
2412 out
<< " lc " << pgi
.last_complete
;
2413 out
<< " (" << pgi
.log_tail
<< "," << pgi
.last_update
<< "]";
2415 if (pgi
.is_incomplete())
2416 out
<< " lb " << pgi
.last_backfill
2417 << (pgi
.last_backfill_bitwise
? " (bitwise)" : " (NIBBLEWISE)");
2418 //out << " c " << pgi.epoch_created;
2419 out
<< " local-lis/les=" << pgi
.last_interval_started
2420 << "/" << pgi
.last_epoch_started
;
2421 out
<< " n=" << pgi
.stats
.stats
.sum
.num_objects
;
2422 out
<< " " << pgi
.history
2428 * pg_fast_info_t - common pg_info_t fields
2430 * These are the fields of pg_info_t (and children) that are updated for
2431 * most IO operations.
2434 * Because we rely on these fields to be applied to the normal
2435 * info struct, adding a new field here that is not also new in info
2436 * means that we must set an incompat OSD feature bit!
2438 struct pg_fast_info_t
{
2439 eversion_t last_update
;
2440 eversion_t last_complete
;
2441 version_t last_user_version
;
2442 struct { // pg_stat_t stats
2444 version_t reported_seq
;
2446 utime_t last_active
;
2447 utime_t last_peered
;
2449 utime_t last_unstale
;
2450 utime_t last_undegraded
;
2451 utime_t last_fullsized
;
2452 int64_t log_size
; // (also ondisk_log_size, which has the same value)
2453 struct { // object_stat_collection_t stats;
2454 struct { // objct_stat_sum_t sum
2455 int64_t num_bytes
; // in bytes
2456 int64_t num_objects
;
2457 int64_t num_object_copies
;
2462 int64_t num_objects_dirty
;
2467 void populate_from(const pg_info_t
& info
) {
2468 last_update
= info
.last_update
;
2469 last_complete
= info
.last_complete
;
2470 last_user_version
= info
.last_user_version
;
2471 stats
.version
= info
.stats
.version
;
2472 stats
.reported_seq
= info
.stats
.reported_seq
;
2473 stats
.last_fresh
= info
.stats
.last_fresh
;
2474 stats
.last_active
= info
.stats
.last_active
;
2475 stats
.last_peered
= info
.stats
.last_peered
;
2476 stats
.last_clean
= info
.stats
.last_clean
;
2477 stats
.last_unstale
= info
.stats
.last_unstale
;
2478 stats
.last_undegraded
= info
.stats
.last_undegraded
;
2479 stats
.last_fullsized
= info
.stats
.last_fullsized
;
2480 stats
.log_size
= info
.stats
.log_size
;
2481 stats
.stats
.sum
.num_bytes
= info
.stats
.stats
.sum
.num_bytes
;
2482 stats
.stats
.sum
.num_objects
= info
.stats
.stats
.sum
.num_objects
;
2483 stats
.stats
.sum
.num_object_copies
= info
.stats
.stats
.sum
.num_object_copies
;
2484 stats
.stats
.sum
.num_rd
= info
.stats
.stats
.sum
.num_rd
;
2485 stats
.stats
.sum
.num_rd_kb
= info
.stats
.stats
.sum
.num_rd_kb
;
2486 stats
.stats
.sum
.num_wr
= info
.stats
.stats
.sum
.num_wr
;
2487 stats
.stats
.sum
.num_wr_kb
= info
.stats
.stats
.sum
.num_wr_kb
;
2488 stats
.stats
.sum
.num_objects_dirty
= info
.stats
.stats
.sum
.num_objects_dirty
;
2491 bool try_apply_to(pg_info_t
* info
) {
2492 if (last_update
<= info
->last_update
)
2494 info
->last_update
= last_update
;
2495 info
->last_complete
= last_complete
;
2496 info
->last_user_version
= last_user_version
;
2497 info
->stats
.version
= stats
.version
;
2498 info
->stats
.reported_seq
= stats
.reported_seq
;
2499 info
->stats
.last_fresh
= stats
.last_fresh
;
2500 info
->stats
.last_active
= stats
.last_active
;
2501 info
->stats
.last_peered
= stats
.last_peered
;
2502 info
->stats
.last_clean
= stats
.last_clean
;
2503 info
->stats
.last_unstale
= stats
.last_unstale
;
2504 info
->stats
.last_undegraded
= stats
.last_undegraded
;
2505 info
->stats
.last_fullsized
= stats
.last_fullsized
;
2506 info
->stats
.log_size
= stats
.log_size
;
2507 info
->stats
.ondisk_log_size
= stats
.log_size
;
2508 info
->stats
.stats
.sum
.num_bytes
= stats
.stats
.sum
.num_bytes
;
2509 info
->stats
.stats
.sum
.num_objects
= stats
.stats
.sum
.num_objects
;
2510 info
->stats
.stats
.sum
.num_object_copies
= stats
.stats
.sum
.num_object_copies
;
2511 info
->stats
.stats
.sum
.num_rd
= stats
.stats
.sum
.num_rd
;
2512 info
->stats
.stats
.sum
.num_rd_kb
= stats
.stats
.sum
.num_rd_kb
;
2513 info
->stats
.stats
.sum
.num_wr
= stats
.stats
.sum
.num_wr
;
2514 info
->stats
.stats
.sum
.num_wr_kb
= stats
.stats
.sum
.num_wr_kb
;
2515 info
->stats
.stats
.sum
.num_objects_dirty
= stats
.stats
.sum
.num_objects_dirty
;
2519 void encode(bufferlist
& bl
) const {
2520 ENCODE_START(1, 1, bl
);
2521 ::encode(last_update
, bl
);
2522 ::encode(last_complete
, bl
);
2523 ::encode(last_user_version
, bl
);
2524 ::encode(stats
.version
, bl
);
2525 ::encode(stats
.reported_seq
, bl
);
2526 ::encode(stats
.last_fresh
, bl
);
2527 ::encode(stats
.last_active
, bl
);
2528 ::encode(stats
.last_peered
, bl
);
2529 ::encode(stats
.last_clean
, bl
);
2530 ::encode(stats
.last_unstale
, bl
);
2531 ::encode(stats
.last_undegraded
, bl
);
2532 ::encode(stats
.last_fullsized
, bl
);
2533 ::encode(stats
.log_size
, bl
);
2534 ::encode(stats
.stats
.sum
.num_bytes
, bl
);
2535 ::encode(stats
.stats
.sum
.num_objects
, bl
);
2536 ::encode(stats
.stats
.sum
.num_object_copies
, bl
);
2537 ::encode(stats
.stats
.sum
.num_rd
, bl
);
2538 ::encode(stats
.stats
.sum
.num_rd_kb
, bl
);
2539 ::encode(stats
.stats
.sum
.num_wr
, bl
);
2540 ::encode(stats
.stats
.sum
.num_wr_kb
, bl
);
2541 ::encode(stats
.stats
.sum
.num_objects_dirty
, bl
);
2544 void decode(bufferlist::iterator
& p
) {
2546 ::decode(last_update
, p
);
2547 ::decode(last_complete
, p
);
2548 ::decode(last_user_version
, p
);
2549 ::decode(stats
.version
, p
);
2550 ::decode(stats
.reported_seq
, p
);
2551 ::decode(stats
.last_fresh
, p
);
2552 ::decode(stats
.last_active
, p
);
2553 ::decode(stats
.last_peered
, p
);
2554 ::decode(stats
.last_clean
, p
);
2555 ::decode(stats
.last_unstale
, p
);
2556 ::decode(stats
.last_undegraded
, p
);
2557 ::decode(stats
.last_fullsized
, p
);
2558 ::decode(stats
.log_size
, p
);
2559 ::decode(stats
.stats
.sum
.num_bytes
, p
);
2560 ::decode(stats
.stats
.sum
.num_objects
, p
);
2561 ::decode(stats
.stats
.sum
.num_object_copies
, p
);
2562 ::decode(stats
.stats
.sum
.num_rd
, p
);
2563 ::decode(stats
.stats
.sum
.num_rd_kb
, p
);
2564 ::decode(stats
.stats
.sum
.num_wr
, p
);
2565 ::decode(stats
.stats
.sum
.num_wr_kb
, p
);
2566 ::decode(stats
.stats
.sum
.num_objects_dirty
, p
);
2570 WRITE_CLASS_ENCODER(pg_fast_info_t
)
2573 struct pg_notify_t
{
2574 epoch_t query_epoch
;
2580 query_epoch(0), epoch_sent(0), to(shard_id_t::NO_SHARD
),
2581 from(shard_id_t::NO_SHARD
) {}
2585 epoch_t query_epoch
,
2587 const pg_info_t
&info
)
2588 : query_epoch(query_epoch
),
2589 epoch_sent(epoch_sent
),
2590 info(info
), to(to
), from(from
) {
2591 assert(from
== info
.pgid
.shard
);
2593 void encode(bufferlist
&bl
) const;
2594 void decode(bufferlist::iterator
&p
);
2595 void dump(Formatter
*f
) const;
2596 static void generate_test_instances(list
<pg_notify_t
*> &o
);
2598 WRITE_CLASS_ENCODER(pg_notify_t
)
2599 ostream
&operator<<(ostream
&lhs
, const pg_notify_t
¬ify
);
2604 * PastIntervals -- information needed to determine the PriorSet and
2605 * the might_have_unfound set
2607 class PastIntervals
{
2609 struct pg_interval_t
{
2610 vector
<int32_t> up
, acting
;
2611 epoch_t first
, last
;
2617 : first(0), last(0),
2618 maybe_went_rw(false),
2624 vector
<int32_t> &&up
,
2625 vector
<int32_t> &&acting
,
2631 : up(up
), acting(acting
), first(first
), last(last
),
2632 maybe_went_rw(maybe_went_rw
), primary(primary
), up_primary(up_primary
)
2635 void encode(bufferlist
& bl
) const;
2636 void decode(bufferlist::iterator
& bl
);
2637 void dump(Formatter
*f
) const;
2638 static void generate_test_instances(list
<pg_interval_t
*>& o
);
2641 PastIntervals() = default;
2642 PastIntervals(bool ec_pool
, const OSDMap
&osdmap
) : PastIntervals() {
2643 update_type_from_map(ec_pool
, osdmap
);
2645 PastIntervals(bool ec_pool
, bool compact
) : PastIntervals() {
2646 update_type(ec_pool
, compact
);
2648 PastIntervals(PastIntervals
&&rhs
) = default;
2649 PastIntervals
&operator=(PastIntervals
&&rhs
) = default;
2651 PastIntervals(const PastIntervals
&rhs
);
2652 PastIntervals
&operator=(const PastIntervals
&rhs
);
2654 class interval_rep
{
2656 virtual size_t size() const = 0;
2657 virtual bool empty() const = 0;
2658 virtual void clear() = 0;
2659 virtual pair
<epoch_t
, epoch_t
> get_bounds() const = 0;
2660 virtual set
<pg_shard_t
> get_all_participants(
2661 bool ec_pool
) const = 0;
2662 virtual void add_interval(bool ec_pool
, const pg_interval_t
&interval
) = 0;
2663 virtual unique_ptr
<interval_rep
> clone() const = 0;
2664 virtual ostream
&print(ostream
&out
) const = 0;
2665 virtual void encode(bufferlist
&bl
) const = 0;
2666 virtual void decode(bufferlist::iterator
&bl
) = 0;
2667 virtual void dump(Formatter
*f
) const = 0;
2668 virtual bool is_classic() const = 0;
2669 virtual void iterate_mayberw_back_to(
2672 std::function
<void(epoch_t
, const set
<pg_shard_t
> &)> &&f
) const = 0;
2674 virtual bool has_full_intervals() const { return false; }
2675 virtual void iterate_all_intervals(
2676 std::function
<void(const pg_interval_t
&)> &&f
) const {
2677 assert(!has_full_intervals());
2678 assert(0 == "not valid for this implementation");
2681 virtual ~interval_rep() {}
2683 friend class pi_simple_rep
;
2684 friend class pi_compact_rep
;
2687 unique_ptr
<interval_rep
> past_intervals
;
2689 PastIntervals(interval_rep
*rep
) : past_intervals(rep
) {}
2692 void add_interval(bool ec_pool
, const pg_interval_t
&interval
) {
2693 assert(past_intervals
);
2694 return past_intervals
->add_interval(ec_pool
, interval
);
2697 bool is_classic() const {
2698 assert(past_intervals
);
2699 return past_intervals
->is_classic();
2702 void encode(bufferlist
&bl
) const {
2703 ENCODE_START(1, 1, bl
);
2704 if (past_intervals
) {
2705 __u8 type
= is_classic() ? 1 : 2;
2707 past_intervals
->encode(bl
);
2709 ::encode((__u8
)0, bl
);
2713 void encode_classic(bufferlist
&bl
) const {
2714 if (past_intervals
) {
2715 assert(past_intervals
->is_classic());
2716 past_intervals
->encode(bl
);
2719 ::encode((uint32_t)0, bl
);
2723 void decode(bufferlist::iterator
&bl
);
2724 void decode_classic(bufferlist::iterator
&bl
);
2726 void dump(Formatter
*f
) const {
2727 assert(past_intervals
);
2728 past_intervals
->dump(f
);
2730 static void generate_test_instances(list
<PastIntervals
*> & o
);
2733 * Determines whether there is an interval change
2735 static bool is_new_interval(
2736 int old_acting_primary
,
2737 int new_acting_primary
,
2738 const vector
<int> &old_acting
,
2739 const vector
<int> &new_acting
,
2742 const vector
<int> &old_up
,
2743 const vector
<int> &new_up
,
2748 unsigned old_pg_num
,
2749 unsigned new_pg_num
,
2750 bool old_sort_bitwise
,
2751 bool new_sort_bitwise
,
2752 bool old_recovery_deletes
,
2753 bool new_recovery_deletes
,
2758 * Determines whether there is an interval change
2760 static bool is_new_interval(
2761 int old_acting_primary
, ///< [in] primary as of lastmap
2762 int new_acting_primary
, ///< [in] primary as of lastmap
2763 const vector
<int> &old_acting
, ///< [in] acting as of lastmap
2764 const vector
<int> &new_acting
, ///< [in] acting as of osdmap
2765 int old_up_primary
, ///< [in] up primary of lastmap
2766 int new_up_primary
, ///< [in] up primary of osdmap
2767 const vector
<int> &old_up
, ///< [in] up as of lastmap
2768 const vector
<int> &new_up
, ///< [in] up as of osdmap
2769 ceph::shared_ptr
<const OSDMap
> osdmap
, ///< [in] current map
2770 ceph::shared_ptr
<const OSDMap
> lastmap
, ///< [in] last map
2771 pg_t pgid
///< [in] pgid for pg
2775 * Integrates a new map into *past_intervals, returns true
2776 * if an interval was closed out.
2778 static bool check_new_interval(
2779 int old_acting_primary
, ///< [in] primary as of lastmap
2780 int new_acting_primary
, ///< [in] primary as of osdmap
2781 const vector
<int> &old_acting
, ///< [in] acting as of lastmap
2782 const vector
<int> &new_acting
, ///< [in] acting as of osdmap
2783 int old_up_primary
, ///< [in] up primary of lastmap
2784 int new_up_primary
, ///< [in] up primary of osdmap
2785 const vector
<int> &old_up
, ///< [in] up as of lastmap
2786 const vector
<int> &new_up
, ///< [in] up as of osdmap
2787 epoch_t same_interval_since
, ///< [in] as of osdmap
2788 epoch_t last_epoch_clean
, ///< [in] current
2789 ceph::shared_ptr
<const OSDMap
> osdmap
, ///< [in] current map
2790 ceph::shared_ptr
<const OSDMap
> lastmap
, ///< [in] last map
2791 pg_t pgid
, ///< [in] pgid for pg
2792 IsPGRecoverablePredicate
*could_have_gone_active
, /// [in] predicate whether the pg can be active
2793 PastIntervals
*past_intervals
, ///< [out] intervals
2794 ostream
*out
= 0 ///< [out] debug ostream
2797 friend ostream
& operator<<(ostream
& out
, const PastIntervals
&i
);
2799 template <typename F
>
2800 void iterate_mayberw_back_to(
2804 assert(past_intervals
);
2805 past_intervals
->iterate_mayberw_back_to(ec_pool
, les
, std::forward
<F
>(f
));
2808 assert(past_intervals
);
2809 past_intervals
->clear();
2813 * Should return a value which gives an indication of the amount
2814 * of state contained
2816 size_t size() const {
2817 assert(past_intervals
);
2818 return past_intervals
->size();
2821 bool empty() const {
2822 assert(past_intervals
);
2823 return past_intervals
->empty();
2826 void swap(PastIntervals
&other
) {
2828 swap(other
.past_intervals
, past_intervals
);
2832 * Return all shards which have been in the acting set back to the
2833 * latest epoch to which we have trimmed except for pg_whoami
2835 set
<pg_shard_t
> get_might_have_unfound(
2836 pg_shard_t pg_whoami
,
2837 bool ec_pool
) const {
2838 assert(past_intervals
);
2839 auto ret
= past_intervals
->get_all_participants(ec_pool
);
2840 ret
.erase(pg_whoami
);
2845 * Return all shards which we might want to talk to for peering
2847 set
<pg_shard_t
> get_all_probe(
2848 bool ec_pool
) const {
2849 assert(past_intervals
);
2850 return past_intervals
->get_all_participants(ec_pool
);
2853 /* Return the set of epochs [start, end) represented by the
2854 * past_interval set.
2856 pair
<epoch_t
, epoch_t
> get_bounds() const {
2857 assert(past_intervals
);
2858 return past_intervals
->get_bounds();
2868 bool ec_pool
= false;
2869 set
<pg_shard_t
> probe
; /// current+prior OSDs we need to probe.
2870 set
<int> down
; /// down osds that would normally be in @a probe and might be interesting.
2871 map
<int, epoch_t
> blocked_by
; /// current lost_at values for any OSDs in cur set for which (re)marking them lost would affect cur set
2873 bool pg_down
= false; /// some down osds are included in @a cur; the DOWN pg state bit should be set.
2874 unique_ptr
<IsPGRecoverablePredicate
> pcontdec
;
2876 PriorSet() = default;
2877 PriorSet(PriorSet
&&) = default;
2878 PriorSet
&operator=(PriorSet
&&) = default;
2880 PriorSet
&operator=(const PriorSet
&) = delete;
2881 PriorSet(const PriorSet
&) = delete;
2883 bool operator==(const PriorSet
&rhs
) const {
2884 return (ec_pool
== rhs
.ec_pool
) &&
2885 (probe
== rhs
.probe
) &&
2886 (down
== rhs
.down
) &&
2887 (blocked_by
== rhs
.blocked_by
) &&
2888 (pg_down
== rhs
.pg_down
);
2891 bool affected_by_map(
2892 const OSDMap
&osdmap
,
2893 const DoutPrefixProvider
*dpp
) const;
2895 // For verifying tests
2898 set
<pg_shard_t
> probe
,
2900 map
<int, epoch_t
> blocked_by
,
2902 IsPGRecoverablePredicate
*pcontdec
)
2903 : ec_pool(ec_pool
), probe(probe
), down(down
), blocked_by(blocked_by
),
2904 pg_down(pg_down
), pcontdec(pcontdec
) {}
2907 template <typename F
>
2909 const PastIntervals
&past_intervals
,
2911 epoch_t last_epoch_started
,
2912 IsPGRecoverablePredicate
*c
,
2914 const vector
<int> &up
,
2915 const vector
<int> &acting
,
2916 const DoutPrefixProvider
*dpp
);
2918 friend class PastIntervals
;
2921 void update_type(bool ec_pool
, bool compact
);
2922 void update_type_from_map(bool ec_pool
, const OSDMap
&osdmap
);
2924 template <typename
... Args
>
2925 PriorSet
get_prior_set(Args
&&... args
) const {
2926 return PriorSet(*this, std::forward
<Args
>(args
)...);
2929 WRITE_CLASS_ENCODER(PastIntervals
)
2931 ostream
& operator<<(ostream
& out
, const PastIntervals::pg_interval_t
& i
);
2932 ostream
& operator<<(ostream
& out
, const PastIntervals
&i
);
2933 ostream
& operator<<(ostream
& out
, const PastIntervals::PriorSet
&i
);
2935 template <typename F
>
2936 PastIntervals::PriorSet::PriorSet(
2937 const PastIntervals
&past_intervals
,
2939 epoch_t last_epoch_started
,
2940 IsPGRecoverablePredicate
*c
,
2942 const vector
<int> &up
,
2943 const vector
<int> &acting
,
2944 const DoutPrefixProvider
*dpp
)
2945 : ec_pool(ec_pool
), pg_down(false), pcontdec(c
)
2948 * We have to be careful to gracefully deal with situations like
2949 * so. Say we have a power outage or something that takes out both
2950 * OSDs, but the monitor doesn't mark them down in the same epoch.
2951 * The history may look like
2955 * 3: let's say B dies for good, too (say, from the power spike)
2958 * which makes it look like B may have applied updates to the PG
2959 * that we need in order to proceed. This sucks...
2961 * To minimize the risk of this happening, we CANNOT go active if
2962 * _any_ OSDs in the prior set are down until we send an MOSDAlive
2963 * to the monitor such that the OSDMap sets osd_up_thru to an epoch.
2964 * Then, we have something like
2971 * -> we can ignore B, bc it couldn't have gone active (alive_thru
2982 * -> we must wait for B, bc it was alive through 2, and could have
2983 * written to the pg.
2985 * If B is really dead, then an administrator will need to manually
2986 * intervene by marking the OSD as "lost."
2989 // Include current acting and up nodes... not because they may
2990 // contain old data (this interval hasn't gone active, obviously),
2991 // but because we want their pg_info to inform choose_acting(), and
2992 // so that we know what they do/do not have explicitly before
2993 // sending them any new info/logs/whatever.
2994 for (unsigned i
= 0; i
< acting
.size(); i
++) {
2995 if (acting
[i
] != 0x7fffffff /* CRUSH_ITEM_NONE, can't import crush.h here */)
2996 probe
.insert(pg_shard_t(acting
[i
], ec_pool
? shard_id_t(i
) : shard_id_t::NO_SHARD
));
2998 // It may be possible to exclude the up nodes, but let's keep them in
3000 for (unsigned i
= 0; i
< up
.size(); i
++) {
3001 if (up
[i
] != 0x7fffffff /* CRUSH_ITEM_NONE, can't import crush.h here */)
3002 probe
.insert(pg_shard_t(up
[i
], ec_pool
? shard_id_t(i
) : shard_id_t::NO_SHARD
));
3005 set
<pg_shard_t
> all_probe
= past_intervals
.get_all_probe(ec_pool
);
3006 ldpp_dout(dpp
, 10) << "build_prior all_probe " << all_probe
<< dendl
;
3007 for (auto &&i
: all_probe
) {
3008 switch (f(0, i
.osd
, nullptr)) {
3022 past_intervals
.iterate_mayberw_back_to(
3025 [&](epoch_t start
, const set
<pg_shard_t
> &acting
) {
3026 ldpp_dout(dpp
, 10) << "build_prior maybe_rw interval:" << start
3027 << ", acting: " << acting
<< dendl
;
3029 // look at candidate osds during this interval. each falls into
3030 // one of three categories: up, down (but potentially
3031 // interesting), or lost (down, but we won't wait for it).
3032 set
<pg_shard_t
> up_now
;
3033 map
<int, epoch_t
> candidate_blocked_by
;
3034 // any candidates down now (that might have useful data)
3035 bool any_down_now
= false;
3037 // consider ACTING osds
3038 for (auto &&so
: acting
) {
3039 epoch_t lost_at
= 0;
3040 switch (f(start
, so
.osd
, &lost_at
)) {
3042 // include past acting osds if they are up.
3047 ldpp_dout(dpp
, 10) << "build_prior prior osd." << so
.osd
3048 << " no longer exists" << dendl
;
3052 ldpp_dout(dpp
, 10) << "build_prior prior osd." << so
.osd
3053 << " is down, but lost_at " << lost_at
<< dendl
;
3058 ldpp_dout(dpp
, 10) << "build_prior prior osd." << so
.osd
3059 << " is down" << dendl
;
3060 candidate_blocked_by
[so
.osd
] = lost_at
;
3061 any_down_now
= true;
3067 // if not enough osds survived this interval, and we may have gone rw,
3068 // then we need to wait for one of those osds to recover to
3069 // ensure that we haven't lost any information.
3070 if (!(*pcontdec
)(up_now
) && any_down_now
) {
3071 // fixme: how do we identify a "clean" shutdown anyway?
3072 ldpp_dout(dpp
, 10) << "build_prior possibly went active+rw,"
3073 << " insufficient up; including down osds" << dendl
;
3074 assert(!candidate_blocked_by
.empty());
3077 candidate_blocked_by
.begin(),
3078 candidate_blocked_by
.end());
3082 ldpp_dout(dpp
, 10) << "build_prior final: probe " << probe
3084 << " blocked_by " << blocked_by
3085 << (pg_down
? " pg_down":"")
3090 * pg_query_t - used to ask a peer for information about a pg.
3092 * note: if version=0, type=LOG, then we just provide our full log.
3101 const char *get_type_name() const {
3103 case INFO
: return "info";
3104 case LOG
: return "log";
3105 case MISSING
: return "missing";
3106 case FULLLOG
: return "fulllog";
3107 default: return "???";
3113 pg_history_t history
;
3118 pg_query_t() : type(-1), epoch_sent(0), to(shard_id_t::NO_SHARD
),
3119 from(shard_id_t::NO_SHARD
) {}
3124 const pg_history_t
& h
,
3128 epoch_sent(epoch_sent
),
3129 to(to
), from(from
) {
3137 const pg_history_t
& h
,
3139 : type(t
), since(s
), history(h
),
3140 epoch_sent(epoch_sent
), to(to
), from(from
) {
3144 void encode(bufferlist
&bl
, uint64_t features
) const;
3145 void decode(bufferlist::iterator
&bl
);
3147 void dump(Formatter
*f
) const;
3148 static void generate_test_instances(list
<pg_query_t
*>& o
);
3150 WRITE_CLASS_ENCODER_FEATURES(pg_query_t
)
3152 inline ostream
& operator<<(ostream
& out
, const pg_query_t
& q
) {
3153 out
<< "query(" << q
.get_type_name() << " " << q
.since
;
3154 if (q
.type
== pg_query_t::LOG
)
3155 out
<< " " << q
.history
;
3161 class ObjectModDesc
{
3162 bool can_local_rollback
;
3163 bool rollback_info_completed
;
3165 // version required to decode, reflected in encode/decode version
3166 __u8 max_required_version
= 1;
3170 virtual void append(uint64_t old_offset
) {}
3171 virtual void setattrs(map
<string
, boost::optional
<bufferlist
> > &attrs
) {}
3172 virtual void rmobject(version_t old_version
) {}
3174 * Used to support the unfound_lost_delete log event: if the stashed
3175 * version exists, we unstash it, otherwise, we do nothing. This way
3176 * each replica rolls back to whatever state it had prior to the attempt
3177 * at mark unfound lost delete
3179 virtual void try_rmobject(version_t old_version
) {
3180 rmobject(old_version
);
3182 virtual void create() {}
3183 virtual void update_snaps(const set
<snapid_t
> &old_snaps
) {}
3184 virtual void rollback_extents(
3186 const vector
<pair
<uint64_t, uint64_t> > &extents
) {}
3187 virtual ~Visitor() {}
3189 void visit(Visitor
*visitor
) const;
3190 mutable bufferlist bl
;
3198 ROLLBACK_EXTENTS
= 7
3200 ObjectModDesc() : can_local_rollback(true), rollback_info_completed(false) {
3201 bl
.reassign_to_mempool(mempool::mempool_osd_pglog
);
3203 void claim(ObjectModDesc
&other
) {
3206 can_local_rollback
= other
.can_local_rollback
;
3207 rollback_info_completed
= other
.rollback_info_completed
;
3209 void claim_append(ObjectModDesc
&other
) {
3210 if (!can_local_rollback
|| rollback_info_completed
)
3212 if (!other
.can_local_rollback
) {
3213 mark_unrollbackable();
3216 bl
.claim_append(other
.bl
);
3217 rollback_info_completed
= other
.rollback_info_completed
;
3219 void swap(ObjectModDesc
&other
) {
3223 swap(other
.can_local_rollback
, can_local_rollback
);
3224 swap(other
.rollback_info_completed
, rollback_info_completed
);
3225 swap(other
.max_required_version
, max_required_version
);
3227 void append_id(ModID id
) {
3231 void append(uint64_t old_size
) {
3232 if (!can_local_rollback
|| rollback_info_completed
)
3234 ENCODE_START(1, 1, bl
);
3236 ::encode(old_size
, bl
);
3239 void setattrs(map
<string
, boost::optional
<bufferlist
> > &old_attrs
) {
3240 if (!can_local_rollback
|| rollback_info_completed
)
3242 ENCODE_START(1, 1, bl
);
3243 append_id(SETATTRS
);
3244 ::encode(old_attrs
, bl
);
3247 bool rmobject(version_t deletion_version
) {
3248 if (!can_local_rollback
|| rollback_info_completed
)
3250 ENCODE_START(1, 1, bl
);
3252 ::encode(deletion_version
, bl
);
3254 rollback_info_completed
= true;
3257 bool try_rmobject(version_t deletion_version
) {
3258 if (!can_local_rollback
|| rollback_info_completed
)
3260 ENCODE_START(1, 1, bl
);
3261 append_id(TRY_DELETE
);
3262 ::encode(deletion_version
, bl
);
3264 rollback_info_completed
= true;
3268 if (!can_local_rollback
|| rollback_info_completed
)
3270 rollback_info_completed
= true;
3271 ENCODE_START(1, 1, bl
);
3275 void update_snaps(const set
<snapid_t
> &old_snaps
) {
3276 if (!can_local_rollback
|| rollback_info_completed
)
3278 ENCODE_START(1, 1, bl
);
3279 append_id(UPDATE_SNAPS
);
3280 ::encode(old_snaps
, bl
);
3283 void rollback_extents(
3284 version_t gen
, const vector
<pair
<uint64_t, uint64_t> > &extents
) {
3285 assert(can_local_rollback
);
3286 assert(!rollback_info_completed
);
3287 if (max_required_version
< 2)
3288 max_required_version
= 2;
3289 ENCODE_START(2, 2, bl
);
3290 append_id(ROLLBACK_EXTENTS
);
3292 ::encode(extents
, bl
);
3296 // cannot be rolled back
3297 void mark_unrollbackable() {
3298 can_local_rollback
= false;
3301 bool can_rollback() const {
3302 return can_local_rollback
;
3304 bool empty() const {
3305 return can_local_rollback
&& (bl
.length() == 0);
3308 bool requires_kraken() const {
3309 return max_required_version
>= 2;
3313 * Create fresh copy of bl bytes to avoid keeping large buffers around
3314 * in the case that bl contains ptrs which point into a much larger
3317 void trim_bl() const {
3318 if (bl
.length() > 0)
3321 void encode(bufferlist
&bl
) const;
3322 void decode(bufferlist::iterator
&bl
);
3323 void dump(Formatter
*f
) const;
3324 static void generate_test_instances(list
<ObjectModDesc
*>& o
);
3326 WRITE_CLASS_ENCODER(ObjectModDesc
)
3330 * pg_log_entry_t - single entry/event in pg log
3333 struct pg_log_entry_t
{
3335 MODIFY
= 1, // some unspecified modification (but not *all* modifications)
3336 CLONE
= 2, // cloned object from head
3337 DELETE
= 3, // deleted object
3338 BACKLOG
= 4, // event invented by generate_backlog [deprecated]
3339 LOST_REVERT
= 5, // lost new version, revert to an older version.
3340 LOST_DELETE
= 6, // lost new version, revert to no object (deleted).
3341 LOST_MARK
= 7, // lost new version, now EIO
3342 PROMOTE
= 8, // promoted object from another tier
3343 CLEAN
= 9, // mark an object clean
3344 ERROR
= 10, // write that returned an error
3346 static const char *get_op_name(int op
) {
3372 const char *get_op_name() const {
3373 return get_op_name(op
);
3376 // describes state for a locally-rollbackable entry
3377 ObjectModDesc mod_desc
;
3378 bufferlist snaps
; // only for clone entries
3380 osd_reqid_t reqid
; // caller+tid to uniquely identify request
3381 mempool::osd_pglog::vector
<pair
<osd_reqid_t
, version_t
> > extra_reqids
;
3382 eversion_t version
, prior_version
, reverting_to
;
3383 version_t user_version
; // the user version for this entry
3384 utime_t mtime
; // this is the _user_ mtime, mind you
3385 int32_t return_code
; // only stored for ERRORs for dup detection
3388 bool invalid_hash
; // only when decoding sobject_t based entries
3389 bool invalid_pool
; // only when decoding pool-less hobject based entries
3392 : user_version(0), return_code(0), op(0),
3393 invalid_hash(false), invalid_pool(false) {
3394 snaps
.reassign_to_mempool(mempool::mempool_osd_pglog
);
3396 pg_log_entry_t(int _op
, const hobject_t
& _soid
,
3397 const eversion_t
& v
, const eversion_t
& pv
,
3399 const osd_reqid_t
& rid
, const utime_t
& mt
,
3401 : soid(_soid
), reqid(rid
), version(v
), prior_version(pv
), user_version(uv
),
3402 mtime(mt
), return_code(return_code
), op(_op
),
3403 invalid_hash(false), invalid_pool(false) {
3404 snaps
.reassign_to_mempool(mempool::mempool_osd_pglog
);
3407 bool is_clone() const { return op
== CLONE
; }
3408 bool is_modify() const { return op
== MODIFY
; }
3409 bool is_promote() const { return op
== PROMOTE
; }
3410 bool is_clean() const { return op
== CLEAN
; }
3411 bool is_backlog() const { return op
== BACKLOG
; }
3412 bool is_lost_revert() const { return op
== LOST_REVERT
; }
3413 bool is_lost_delete() const { return op
== LOST_DELETE
; }
3414 bool is_lost_mark() const { return op
== LOST_MARK
; }
3415 bool is_error() const { return op
== ERROR
; }
3417 bool is_update() const {
3419 is_clone() || is_modify() || is_promote() || is_clean() ||
3420 is_backlog() || is_lost_revert() || is_lost_mark();
3422 bool is_delete() const {
3423 return op
== DELETE
|| op
== LOST_DELETE
;
3426 bool can_rollback() const {
3427 return mod_desc
.can_rollback();
3430 void mark_unrollbackable() {
3431 mod_desc
.mark_unrollbackable();
3434 bool requires_kraken() const {
3435 return mod_desc
.requires_kraken();
3438 // Errors are only used for dup detection, whereas
3439 // the index by objects is used by recovery, copy_get,
3440 // and other facilities that don't expect or need to
3441 // be aware of error entries.
3442 bool object_is_indexed() const {
3446 bool reqid_is_indexed() const {
3447 return reqid
!= osd_reqid_t() &&
3448 (op
== MODIFY
|| op
== DELETE
|| op
== ERROR
);
3451 string
get_key_name() const;
3452 void encode_with_checksum(bufferlist
& bl
) const;
3453 void decode_with_checksum(bufferlist::iterator
& p
);
3455 void encode(bufferlist
&bl
) const;
3456 void decode(bufferlist::iterator
&bl
);
3457 void dump(Formatter
*f
) const;
3458 static void generate_test_instances(list
<pg_log_entry_t
*>& o
);
3461 WRITE_CLASS_ENCODER(pg_log_entry_t
)
3463 ostream
& operator<<(ostream
& out
, const pg_log_entry_t
& e
);
3465 struct pg_log_dup_t
{
3466 osd_reqid_t reqid
; // caller+tid to uniquely identify request
3468 version_t user_version
; // the user version for this entry
3469 int32_t return_code
; // only stored for ERRORs for dup detection
3472 : user_version(0), return_code(0)
3474 explicit pg_log_dup_t(const pg_log_entry_t
& entry
)
3475 : reqid(entry
.reqid
), version(entry
.version
),
3476 user_version(entry
.user_version
), return_code(entry
.return_code
)
3478 pg_log_dup_t(const eversion_t
& v
, version_t uv
,
3479 const osd_reqid_t
& rid
, int return_code
)
3480 : reqid(rid
), version(v
), user_version(uv
),
3481 return_code(return_code
)
3484 string
get_key_name() const;
3485 void encode(bufferlist
&bl
) const;
3486 void decode(bufferlist::iterator
&bl
);
3487 void dump(Formatter
*f
) const;
3488 static void generate_test_instances(list
<pg_log_dup_t
*>& o
);
3490 bool operator==(const pg_log_dup_t
&rhs
) const {
3491 return reqid
== rhs
.reqid
&&
3492 version
== rhs
.version
&&
3493 user_version
== rhs
.user_version
&&
3494 return_code
== rhs
.return_code
;
3496 bool operator!=(const pg_log_dup_t
&rhs
) const {
3497 return !(*this == rhs
);
3500 friend std::ostream
& operator<<(std::ostream
& out
, const pg_log_dup_t
& e
);
3502 WRITE_CLASS_ENCODER(pg_log_dup_t
)
3504 std::ostream
& operator<<(std::ostream
& out
, const pg_log_dup_t
& e
);
3507 * pg_log_t - incremental log of recent pg changes.
3509 * serves as a recovery queue for recent changes.
3513 * head - newest entry (update|delete)
3514 * tail - entry previous to oldest (update|delete) for which we have
3515 * complete negative information.
3516 * i.e. we can infer pg contents for any store whose last_update >= tail.
3518 eversion_t head
; // newest entry
3519 eversion_t tail
; // version prior to oldest
3522 // We can rollback rollback-able entries > can_rollback_to
3523 eversion_t can_rollback_to
;
3525 // always <= can_rollback_to, indicates how far stashed rollback
3526 // data can be found
3527 eversion_t rollback_info_trimmed_to
;
3531 mempool::osd_pglog::list
<pg_log_entry_t
> log
;
3533 // entries just for dup op detection ordered oldest to newest
3534 mempool::osd_pglog::list
<pg_log_dup_t
> dups
;
3536 pg_log_t() = default;
3537 pg_log_t(const eversion_t
&last_update
,
3538 const eversion_t
&log_tail
,
3539 const eversion_t
&can_rollback_to
,
3540 const eversion_t
&rollback_info_trimmed_to
,
3541 mempool::osd_pglog::list
<pg_log_entry_t
> &&entries
,
3542 mempool::osd_pglog::list
<pg_log_dup_t
> &&dup_entries
)
3543 : head(last_update
), tail(log_tail
), can_rollback_to(can_rollback_to
),
3544 rollback_info_trimmed_to(rollback_info_trimmed_to
),
3545 log(std::move(entries
)), dups(std::move(dup_entries
)) {}
3546 pg_log_t(const eversion_t
&last_update
,
3547 const eversion_t
&log_tail
,
3548 const eversion_t
&can_rollback_to
,
3549 const eversion_t
&rollback_info_trimmed_to
,
3550 const std::list
<pg_log_entry_t
> &entries
,
3551 const std::list
<pg_log_dup_t
> &dup_entries
)
3552 : head(last_update
), tail(log_tail
), can_rollback_to(can_rollback_to
),
3553 rollback_info_trimmed_to(rollback_info_trimmed_to
) {
3554 for (auto &&entry
: entries
) {
3555 log
.push_back(entry
);
3557 for (auto &&entry
: dup_entries
) {
3558 dups
.push_back(entry
);
3564 rollback_info_trimmed_to
= can_rollback_to
= head
= tail
= z
;
3569 eversion_t
get_rollback_info_trimmed_to() const {
3570 return rollback_info_trimmed_to
;
3572 eversion_t
get_can_rollback_to() const {
3573 return can_rollback_to
;
3577 pg_log_t
split_out_child(pg_t child_pgid
, unsigned split_bits
) {
3578 mempool::osd_pglog::list
<pg_log_entry_t
> oldlog
, childlog
;
3581 eversion_t old_tail
;
3582 unsigned mask
= ~((~0)<<split_bits
);
3583 for (auto i
= oldlog
.begin();
3586 if ((i
->soid
.get_hash() & mask
) == child_pgid
.m_seed
) {
3587 childlog
.push_back(*i
);
3594 // osd_reqid is unique, so it doesn't matter if there are extra
3595 // dup entries in each pg. To avoid storing oid with the dup
3596 // entries, just copy the whole list.
3597 auto childdups(dups
);
3603 rollback_info_trimmed_to
,
3604 std::move(childlog
),
3605 std::move(childdups
));
3608 mempool::osd_pglog::list
<pg_log_entry_t
> rewind_from_head(eversion_t newhead
) {
3609 assert(newhead
>= tail
);
3611 mempool::osd_pglog::list
<pg_log_entry_t
>::iterator p
= log
.end();
3612 mempool::osd_pglog::list
<pg_log_entry_t
> divergent
;
3614 if (p
== log
.begin()) {
3615 // yikes, the whole thing is divergent!
3617 swap(divergent
, log
);
3621 if (p
->version
.version
<= newhead
.version
) {
3623 * look at eversion.version here. we want to avoid a situation like:
3624 * our log: 100'10 (0'0) m 10000004d3a.00000000/head by client4225.1:18529
3625 * new log: 122'10 (0'0) m 10000004d3a.00000000/head by client4225.1:18529
3626 * lower_bound = 100'9
3627 * i.e, same request, different version. If the eversion.version is > the
3628 * lower_bound, we it is divergent.
3631 divergent
.splice(divergent
.begin(), log
, p
, log
.end());
3634 assert(p
->version
> newhead
);
3638 if (can_rollback_to
> newhead
)
3639 can_rollback_to
= newhead
;
3641 if (rollback_info_trimmed_to
> newhead
)
3642 rollback_info_trimmed_to
= newhead
;
3647 bool empty() const {
3652 return head
.version
== 0 && head
.epoch
== 0;
3655 size_t approx_size() const {
3656 return head
.version
- tail
.version
;
3659 static void filter_log(spg_t import_pgid
, const OSDMap
&curmap
,
3660 const string
&hit_set_namespace
, const pg_log_t
&in
,
3661 pg_log_t
&out
, pg_log_t
&reject
);
3664 * copy entries from the tail of another pg_log_t
3666 * @param other pg_log_t to copy from
3667 * @param from copy entries after this version
3669 void copy_after(const pg_log_t
&other
, eversion_t from
);
3672 * copy a range of entries from another pg_log_t
3674 * @param other pg_log_t to copy from
3675 * @param from copy entries after this version
3676 * @param to up to and including this version
3678 void copy_range(const pg_log_t
&other
, eversion_t from
, eversion_t to
);
3681 * copy up to N entries
3683 * @param other source log
3684 * @param max max number of entries to copy
3686 void copy_up_to(const pg_log_t
&other
, int max
);
3688 ostream
& print(ostream
& out
) const;
3690 void encode(bufferlist
&bl
) const;
3691 void decode(bufferlist::iterator
&bl
, int64_t pool
= -1);
3692 void dump(Formatter
*f
) const;
3693 static void generate_test_instances(list
<pg_log_t
*>& o
);
3695 WRITE_CLASS_ENCODER(pg_log_t
)
3697 inline ostream
& operator<<(ostream
& out
, const pg_log_t
& log
)
3699 out
<< "log((" << log
.tail
<< "," << log
.head
<< "], crt="
3700 << log
.get_can_rollback_to() << ")";
3706 * pg_missing_t - summary of missing objects.
3708 * kept in memory, as a supplement to pg_log_t
3709 * also used to pass missing info in messages.
3711 struct pg_missing_item
{
3712 eversion_t need
, have
;
3713 enum missing_flags_t
{
3717 pg_missing_item() : flags(FLAG_NONE
) {}
3718 explicit pg_missing_item(eversion_t n
) : need(n
), flags(FLAG_NONE
) {} // have no old version
3719 pg_missing_item(eversion_t n
, eversion_t h
, bool is_delete
=false) : need(n
), have(h
) {
3720 set_delete(is_delete
);
3723 void encode(bufferlist
& bl
, uint64_t features
) const {
3724 if (HAVE_FEATURE(features
, OSD_RECOVERY_DELETES
)) {
3725 // encoding a zeroed eversion_t to differentiate between this and
3726 // legacy unversioned encoding - a need value of 0'0 is not
3727 // possible. This can be replaced with the legacy encoding
3728 // macros post-luminous.
3733 ::encode(static_cast<uint8_t>(flags
), bl
);
3735 // legacy unversioned encoding
3740 void decode(bufferlist::iterator
& bl
) {
3743 if (e
!= eversion_t()) {
3744 // legacy encoding, this is the need value
3752 flags
= static_cast<missing_flags_t
>(f
);
3756 void set_delete(bool is_delete
) {
3757 flags
= is_delete
? FLAG_DELETE
: FLAG_NONE
;
3760 bool is_delete() const {
3761 return (flags
& FLAG_DELETE
) == FLAG_DELETE
;
3764 string
flag_str() const {
3765 if (flags
== FLAG_NONE
) {
3772 void dump(Formatter
*f
) const {
3773 f
->dump_stream("need") << need
;
3774 f
->dump_stream("have") << have
;
3775 f
->dump_stream("flags") << flag_str();
3777 static void generate_test_instances(list
<pg_missing_item
*>& o
) {
3778 o
.push_back(new pg_missing_item
);
3779 o
.push_back(new pg_missing_item
);
3780 o
.back()->need
= eversion_t(1, 2);
3781 o
.back()->have
= eversion_t(1, 1);
3782 o
.push_back(new pg_missing_item
);
3783 o
.back()->need
= eversion_t(3, 5);
3784 o
.back()->have
= eversion_t(3, 4);
3785 o
.back()->flags
= FLAG_DELETE
;
3787 bool operator==(const pg_missing_item
&rhs
) const {
3788 return need
== rhs
.need
&& have
== rhs
.have
&& flags
== rhs
.flags
;
3790 bool operator!=(const pg_missing_item
&rhs
) const {
3791 return !(*this == rhs
);
3794 WRITE_CLASS_ENCODER_FEATURES(pg_missing_item
)
3795 ostream
& operator<<(ostream
& out
, const pg_missing_item
&item
);
3797 class pg_missing_const_i
{
3799 virtual const map
<hobject_t
, pg_missing_item
> &
3800 get_items() const = 0;
3801 virtual const map
<version_t
, hobject_t
> &get_rmissing() const = 0;
3802 virtual bool get_may_include_deletes() const = 0;
3803 virtual unsigned int num_missing() const = 0;
3804 virtual bool have_missing() const = 0;
3805 virtual bool is_missing(const hobject_t
& oid
, pg_missing_item
*out
= nullptr) const = 0;
3806 virtual bool is_missing(const hobject_t
& oid
, eversion_t v
) const = 0;
3807 virtual eversion_t
have_old(const hobject_t
& oid
) const = 0;
3808 virtual ~pg_missing_const_i() {}
3812 template <bool Track
>
3813 class ChangeTracker
{
3815 void changed(const hobject_t
&obj
) {}
3816 template <typename F
>
3817 void get_changed(F
&&f
) const {}
3819 bool is_clean() const {
3824 class ChangeTracker
<true> {
3825 set
<hobject_t
> _changed
;
3827 void changed(const hobject_t
&obj
) {
3828 _changed
.insert(obj
);
3830 template <typename F
>
3831 void get_changed(F
&&f
) const {
3832 for (auto const &i
: _changed
) {
3839 bool is_clean() const {
3840 return _changed
.empty();
3844 template <bool TrackChanges
>
3845 class pg_missing_set
: public pg_missing_const_i
{
3846 using item
= pg_missing_item
;
3847 map
<hobject_t
, item
> missing
; // oid -> (need v, have v)
3848 map
<version_t
, hobject_t
> rmissing
; // v -> oid
3849 ChangeTracker
<TrackChanges
> tracker
;
3852 pg_missing_set() = default;
3854 template <typename missing_type
>
3855 pg_missing_set(const missing_type
&m
) {
3856 missing
= m
.get_items();
3857 rmissing
= m
.get_rmissing();
3858 may_include_deletes
= m
.get_may_include_deletes();
3859 for (auto &&i
: missing
)
3860 tracker
.changed(i
.first
);
3863 bool may_include_deletes
= false;
3865 const map
<hobject_t
, item
> &get_items() const override
{
3868 const map
<version_t
, hobject_t
> &get_rmissing() const override
{
3871 bool get_may_include_deletes() const override
{
3872 return may_include_deletes
;
3874 unsigned int num_missing() const override
{
3875 return missing
.size();
3877 bool have_missing() const override
{
3878 return !missing
.empty();
3880 bool is_missing(const hobject_t
& oid
, pg_missing_item
*out
= nullptr) const override
{
3881 auto iter
= missing
.find(oid
);
3882 if (iter
== missing
.end())
3885 *out
= iter
->second
;
3888 bool is_missing(const hobject_t
& oid
, eversion_t v
) const override
{
3889 map
<hobject_t
, item
>::const_iterator m
=
3891 if (m
== missing
.end())
3893 const item
&item(m
->second
);
3898 eversion_t
have_old(const hobject_t
& oid
) const override
{
3899 map
<hobject_t
, item
>::const_iterator m
=
3901 if (m
== missing
.end())
3902 return eversion_t();
3903 const item
&item(m
->second
);
3907 void claim(pg_missing_set
& o
) {
3908 static_assert(!TrackChanges
, "Can't use claim with TrackChanges");
3909 missing
.swap(o
.missing
);
3910 rmissing
.swap(o
.rmissing
);
3914 * this needs to be called in log order as we extend the log. it
3915 * assumes missing is accurate up through the previous log entry.
3917 void add_next_event(const pg_log_entry_t
& e
) {
3918 map
<hobject_t
, item
>::iterator missing_it
;
3919 missing_it
= missing
.find(e
.soid
);
3920 bool is_missing_divergent_item
= missing_it
!= missing
.end();
3921 if (e
.prior_version
== eversion_t() || e
.is_clone()) {
3923 if (is_missing_divergent_item
) { // use iterator
3924 rmissing
.erase((missing_it
->second
).need
.version
);
3925 missing_it
->second
= item(e
.version
, eversion_t(), e
.is_delete()); // .have = nil
3926 } else // create new element in missing map
3927 missing
[e
.soid
] = item(e
.version
, eversion_t(), e
.is_delete()); // .have = nil
3928 } else if (is_missing_divergent_item
) {
3929 // already missing (prior).
3930 rmissing
.erase((missing_it
->second
).need
.version
);
3931 (missing_it
->second
).need
= e
.version
; // leave .have unchanged.
3932 missing_it
->second
.set_delete(e
.is_delete());
3933 } else if (e
.is_backlog()) {
3934 // May not have prior version
3935 assert(0 == "these don't exist anymore");
3937 // not missing, we must have prior_version (if any)
3938 assert(!is_missing_divergent_item
);
3939 missing
[e
.soid
] = item(e
.version
, e
.prior_version
, e
.is_delete());
3941 rmissing
[e
.version
.version
] = e
.soid
;
3942 tracker
.changed(e
.soid
);
3945 void revise_need(hobject_t oid
, eversion_t need
, bool is_delete
) {
3946 if (missing
.count(oid
)) {
3947 rmissing
.erase(missing
[oid
].need
.version
);
3948 missing
[oid
].need
= need
; // no not adjust .have
3949 missing
[oid
].set_delete(is_delete
);
3951 missing
[oid
] = item(need
, eversion_t(), is_delete
);
3953 rmissing
[need
.version
] = oid
;
3955 tracker
.changed(oid
);
3958 void revise_have(hobject_t oid
, eversion_t have
) {
3959 if (missing
.count(oid
)) {
3960 tracker
.changed(oid
);
3961 missing
[oid
].have
= have
;
3965 void add(const hobject_t
& oid
, eversion_t need
, eversion_t have
,
3967 missing
[oid
] = item(need
, have
, is_delete
);
3968 rmissing
[need
.version
] = oid
;
3969 tracker
.changed(oid
);
3972 void rm(const hobject_t
& oid
, eversion_t v
) {
3973 std::map
<hobject_t
, item
>::iterator p
= missing
.find(oid
);
3974 if (p
!= missing
.end() && p
->second
.need
<= v
)
3978 void rm(std::map
<hobject_t
, item
>::const_iterator m
) {
3979 tracker
.changed(m
->first
);
3980 rmissing
.erase(m
->second
.need
.version
);
3984 void got(const hobject_t
& oid
, eversion_t v
) {
3985 std::map
<hobject_t
, item
>::iterator p
= missing
.find(oid
);
3986 assert(p
!= missing
.end());
3987 assert(p
->second
.need
<= v
|| p
->second
.is_delete());
3991 void got(std::map
<hobject_t
, item
>::const_iterator m
) {
3992 tracker
.changed(m
->first
);
3993 rmissing
.erase(m
->second
.need
.version
);
3999 unsigned split_bits
,
4000 pg_missing_set
*omissing
) {
4001 omissing
->may_include_deletes
= may_include_deletes
;
4002 unsigned mask
= ~((~0)<<split_bits
);
4003 for (map
<hobject_t
, item
>::iterator i
= missing
.begin();
4006 if ((i
->first
.get_hash() & mask
) == child_pgid
.m_seed
) {
4007 omissing
->add(i
->first
, i
->second
.need
, i
->second
.have
,
4008 i
->second
.is_delete());
4017 for (auto const &i
: missing
)
4018 tracker
.changed(i
.first
);
4023 void encode(bufferlist
&bl
) const {
4024 ENCODE_START(4, 2, bl
);
4025 ::encode(missing
, bl
, may_include_deletes
? CEPH_FEATURE_OSD_RECOVERY_DELETES
: 0);
4026 ::encode(may_include_deletes
, bl
);
4029 void decode(bufferlist::iterator
&bl
, int64_t pool
= -1) {
4030 for (auto const &i
: missing
)
4031 tracker
.changed(i
.first
);
4032 DECODE_START_LEGACY_COMPAT_LEN(4, 2, 2, bl
);
4033 ::decode(missing
, bl
);
4034 if (struct_v
>= 4) {
4035 ::decode(may_include_deletes
, bl
);
4040 // Handle hobject_t upgrade
4041 map
<hobject_t
, item
> tmp
;
4042 for (map
<hobject_t
, item
>::iterator i
=
4046 if (!i
->first
.is_max() && i
->first
.pool
== -1) {
4047 hobject_t
to_insert(i
->first
);
4048 to_insert
.pool
= pool
;
4049 tmp
[to_insert
] = i
->second
;
4055 missing
.insert(tmp
.begin(), tmp
.end());
4058 for (map
<hobject_t
,item
>::iterator it
=
4060 it
!= missing
.end();
4062 rmissing
[it
->second
.need
.version
] = it
->first
;
4063 for (auto const &i
: missing
)
4064 tracker
.changed(i
.first
);
4066 void dump(Formatter
*f
) const {
4067 f
->open_array_section("missing");
4068 for (map
<hobject_t
,item
>::const_iterator p
=
4069 missing
.begin(); p
!= missing
.end(); ++p
) {
4070 f
->open_object_section("item");
4071 f
->dump_stream("object") << p
->first
;
4076 f
->dump_bool("may_include_deletes", may_include_deletes
);
4078 template <typename F
>
4079 void filter_objects(F
&&f
) {
4080 for (auto i
= missing
.begin(); i
!= missing
.end();) {
4088 static void generate_test_instances(list
<pg_missing_set
*>& o
) {
4089 o
.push_back(new pg_missing_set
);
4090 o
.push_back(new pg_missing_set
);
4092 hobject_t(object_t("foo"), "foo", 123, 456, 0, ""),
4093 eversion_t(5, 6), eversion_t(5, 1), false);
4094 o
.push_back(new pg_missing_set
);
4096 hobject_t(object_t("foo"), "foo", 123, 456, 0, ""),
4097 eversion_t(5, 6), eversion_t(5, 1), true);
4098 o
.back()->may_include_deletes
= true;
4100 template <typename F
>
4101 void get_changed(F
&&f
) const {
4102 tracker
.get_changed(f
);
4107 bool is_clean() const {
4108 return tracker
.is_clean();
4110 template <typename missing_t
>
4111 bool debug_verify_from_init(
4112 const missing_t
&init_missing
,
4113 ostream
*oss
) const {
4116 auto check_missing(init_missing
.get_items());
4117 tracker
.get_changed([&](const hobject_t
&hoid
) {
4118 check_missing
.erase(hoid
);
4119 if (missing
.count(hoid
)) {
4120 check_missing
.insert(*(missing
.find(hoid
)));
4124 if (check_missing
.size() != missing
.size()) {
4126 *oss
<< "Size mismatch, check: " << check_missing
.size()
4127 << ", actual: " << missing
.size() << "\n";
4131 for (auto &i
: missing
) {
4132 if (!check_missing
.count(i
.first
)) {
4134 *oss
<< "check_missing missing " << i
.first
<< "\n";
4136 } else if (check_missing
[i
.first
] != i
.second
) {
4138 *oss
<< "check_missing missing item mismatch on " << i
.first
4139 << ", check: " << check_missing
[i
.first
]
4140 << ", actual: " << i
.second
<< "\n";
4145 *oss
<< "check_missing: " << check_missing
<< "\n";
4146 set
<hobject_t
> changed
;
4147 tracker
.get_changed([&](const hobject_t
&hoid
) { changed
.insert(hoid
); });
4148 *oss
<< "changed: " << changed
<< "\n";
4153 template <bool TrackChanges
>
4155 const pg_missing_set
<TrackChanges
> &c
, bufferlist
&bl
, uint64_t features
=0) {
4158 ENCODE_DUMP_POST(cl
);
4160 template <bool TrackChanges
>
4161 void decode(pg_missing_set
<TrackChanges
> &c
, bufferlist::iterator
&p
) {
4164 template <bool TrackChanges
>
4165 ostream
& operator<<(ostream
& out
, const pg_missing_set
<TrackChanges
> &missing
)
4167 out
<< "missing(" << missing
.num_missing()
4168 << " may_include_deletes = " << missing
.may_include_deletes
;
4169 //if (missing.num_lost()) out << ", " << missing.num_lost() << " lost";
4174 using pg_missing_t
= pg_missing_set
<false>;
4175 using pg_missing_tracker_t
= pg_missing_set
<true>;
4179 * pg list objects response format
4182 struct pg_nls_response_t
{
4183 collection_list_handle_t handle
;
4184 list
<librados::ListObjectImpl
> entries
;
4186 void encode(bufferlist
& bl
) const {
4187 ENCODE_START(1, 1, bl
);
4188 ::encode(handle
, bl
);
4189 __u32 n
= (__u32
)entries
.size();
4191 for (list
<librados::ListObjectImpl
>::const_iterator i
= entries
.begin(); i
!= entries
.end(); ++i
) {
4192 ::encode(i
->nspace
, bl
);
4193 ::encode(i
->oid
, bl
);
4194 ::encode(i
->locator
, bl
);
4198 void decode(bufferlist::iterator
& bl
) {
4199 DECODE_START(1, bl
);
4200 ::decode(handle
, bl
);
4205 librados::ListObjectImpl i
;
4206 ::decode(i
.nspace
, bl
);
4207 ::decode(i
.oid
, bl
);
4208 ::decode(i
.locator
, bl
);
4209 entries
.push_back(i
);
4213 void dump(Formatter
*f
) const {
4214 f
->dump_stream("handle") << handle
;
4215 f
->open_array_section("entries");
4216 for (list
<librados::ListObjectImpl
>::const_iterator p
= entries
.begin(); p
!= entries
.end(); ++p
) {
4217 f
->open_object_section("object");
4218 f
->dump_string("namespace", p
->nspace
);
4219 f
->dump_string("object", p
->oid
);
4220 f
->dump_string("key", p
->locator
);
4225 static void generate_test_instances(list
<pg_nls_response_t
*>& o
) {
4226 o
.push_back(new pg_nls_response_t
);
4227 o
.push_back(new pg_nls_response_t
);
4228 o
.back()->handle
= hobject_t(object_t("hi"), "key", 1, 2, -1, "");
4229 o
.back()->entries
.push_back(librados::ListObjectImpl("", "one", ""));
4230 o
.back()->entries
.push_back(librados::ListObjectImpl("", "two", "twokey"));
4231 o
.back()->entries
.push_back(librados::ListObjectImpl("", "three", ""));
4232 o
.push_back(new pg_nls_response_t
);
4233 o
.back()->handle
= hobject_t(object_t("hi"), "key", 3, 4, -1, "");
4234 o
.back()->entries
.push_back(librados::ListObjectImpl("n1", "n1one", ""));
4235 o
.back()->entries
.push_back(librados::ListObjectImpl("n1", "n1two", "n1twokey"));
4236 o
.back()->entries
.push_back(librados::ListObjectImpl("n1", "n1three", ""));
4237 o
.push_back(new pg_nls_response_t
);
4238 o
.back()->handle
= hobject_t(object_t("hi"), "key", 5, 6, -1, "");
4239 o
.back()->entries
.push_back(librados::ListObjectImpl("", "one", ""));
4240 o
.back()->entries
.push_back(librados::ListObjectImpl("", "two", "twokey"));
4241 o
.back()->entries
.push_back(librados::ListObjectImpl("", "three", ""));
4242 o
.back()->entries
.push_back(librados::ListObjectImpl("n1", "n1one", ""));
4243 o
.back()->entries
.push_back(librados::ListObjectImpl("n1", "n1two", "n1twokey"));
4244 o
.back()->entries
.push_back(librados::ListObjectImpl("n1", "n1three", ""));
4248 WRITE_CLASS_ENCODER(pg_nls_response_t
)
4250 // For backwards compatibility with older OSD requests
4251 struct pg_ls_response_t
{
4252 collection_list_handle_t handle
;
4253 list
<pair
<object_t
, string
> > entries
;
4255 void encode(bufferlist
& bl
) const {
4258 ::encode(handle
, bl
);
4259 ::encode(entries
, bl
);
4261 void decode(bufferlist::iterator
& bl
) {
4265 ::decode(handle
, bl
);
4266 ::decode(entries
, bl
);
4268 void dump(Formatter
*f
) const {
4269 f
->dump_stream("handle") << handle
;
4270 f
->open_array_section("entries");
4271 for (list
<pair
<object_t
, string
> >::const_iterator p
= entries
.begin(); p
!= entries
.end(); ++p
) {
4272 f
->open_object_section("object");
4273 f
->dump_stream("object") << p
->first
;
4274 f
->dump_string("key", p
->second
);
4279 static void generate_test_instances(list
<pg_ls_response_t
*>& o
) {
4280 o
.push_back(new pg_ls_response_t
);
4281 o
.push_back(new pg_ls_response_t
);
4282 o
.back()->handle
= hobject_t(object_t("hi"), "key", 1, 2, -1, "");
4283 o
.back()->entries
.push_back(make_pair(object_t("one"), string()));
4284 o
.back()->entries
.push_back(make_pair(object_t("two"), string("twokey")));
4288 WRITE_CLASS_ENCODER(pg_ls_response_t
)
4291 * object_copy_cursor_t
4293 struct object_copy_cursor_t
{
4294 uint64_t data_offset
;
4300 object_copy_cursor_t()
4302 attr_complete(false),
4303 data_complete(false),
4304 omap_complete(false)
4307 bool is_initial() const {
4308 return !attr_complete
&& data_offset
== 0 && omap_offset
.empty();
4310 bool is_complete() const {
4311 return attr_complete
&& data_complete
&& omap_complete
;
4314 static void generate_test_instances(list
<object_copy_cursor_t
*>& o
);
4315 void encode(bufferlist
& bl
) const;
4316 void decode(bufferlist::iterator
&bl
);
4317 void dump(Formatter
*f
) const;
4319 WRITE_CLASS_ENCODER(object_copy_cursor_t
)
4322 * object_copy_data_t
4324 * Return data from a copy request. The semantics are a little strange
4325 * as a result of the encoding's heritage.
4327 * In particular, the sender unconditionally fills in the cursor (from what
4328 * it receives and sends), the size, and the mtime, but is responsible for
4329 * figuring out whether it should put any data in the attrs, data, or
4330 * omap members (corresponding to xattrs, object data, and the omap entries)
4331 * based on external data (the client includes a max amount to return with
4332 * the copy request). The client then looks into the attrs, data, and/or omap
4333 * based on the contents of the cursor.
4335 struct object_copy_data_t
{
4337 FLAG_DATA_DIGEST
= 1<<0,
4338 FLAG_OMAP_DIGEST
= 1<<1,
4340 object_copy_cursor_t cursor
;
4343 uint32_t data_digest
, omap_digest
;
4345 map
<string
, bufferlist
> attrs
;
4347 bufferlist omap_header
;
4348 bufferlist omap_data
;
4350 /// which snaps we are defined for (if a snap and not the head)
4351 vector
<snapid_t
> snaps
;
4352 ///< latest snap seq for the object (if head)
4355 ///< recent reqids on this object
4356 mempool::osd_pglog::vector
<pair
<osd_reqid_t
, version_t
> > reqids
;
4358 uint64_t truncate_seq
;
4359 uint64_t truncate_size
;
4362 object_copy_data_t() :
4363 size((uint64_t)-1), data_digest(-1),
4364 omap_digest(-1), flags(0),
4368 static void generate_test_instances(list
<object_copy_data_t
*>& o
);
4369 void encode(bufferlist
& bl
, uint64_t features
) const;
4370 void decode(bufferlist::iterator
& bl
);
4371 void dump(Formatter
*f
) const;
4373 WRITE_CLASS_ENCODER_FEATURES(object_copy_data_t
)
4378 struct pg_create_t
{
4379 epoch_t created
; // epoch pg created
4380 pg_t parent
; // split from parent (if != pg_t())
4384 : created(0), split_bits(0) {}
4385 pg_create_t(unsigned c
, pg_t p
, int s
)
4386 : created(c
), parent(p
), split_bits(s
) {}
4388 void encode(bufferlist
&bl
) const;
4389 void decode(bufferlist::iterator
&bl
);
4390 void dump(Formatter
*f
) const;
4391 static void generate_test_instances(list
<pg_create_t
*>& o
);
4393 WRITE_CLASS_ENCODER(pg_create_t
)
4395 // -----------------------------------------
4397 struct osd_peer_stat_t
{
4400 osd_peer_stat_t() { }
4402 void encode(bufferlist
&bl
) const;
4403 void decode(bufferlist::iterator
&bl
);
4404 void dump(Formatter
*f
) const;
4405 static void generate_test_instances(list
<osd_peer_stat_t
*>& o
);
4407 WRITE_CLASS_ENCODER(osd_peer_stat_t
)
4409 ostream
& operator<<(ostream
& out
, const osd_peer_stat_t
&stat
);
4412 // -----------------------------------------
4414 class ObjectExtent
{
4416 * ObjectExtents are used for specifying IO behavior against RADOS
4417 * objects when one is using the ObjectCacher.
4419 * To use this in a real system, *every member* must be filled
4420 * out correctly. In particular, make sure to initialize the
4421 * oloc correctly, as its default values are deliberate poison
4422 * and will cause internal ObjectCacher asserts.
4424 * Similarly, your buffer_extents vector *must* specify a total
4425 * size equal to your length. If the buffer_extents inadvertently
4426 * contain less space than the length member specifies, you
4427 * will get unintelligible asserts deep in the ObjectCacher.
4429 * If you are trying to do testing and don't care about actual
4430 * RADOS function, the simplest thing to do is to initialize
4431 * the ObjectExtent (truncate_size can be 0), create a single entry
4432 * in buffer_extents matching the length, and set oloc.pool to 0.
4435 object_t oid
; // object id
4437 uint64_t offset
; // in object
4438 uint64_t length
; // in object
4439 uint64_t truncate_size
; // in object
4441 object_locator_t oloc
; // object locator (pool etc)
4443 vector
<pair
<uint64_t,uint64_t> > buffer_extents
; // off -> len. extents in buffer being mapped (may be fragmented bc of striping!)
4445 ObjectExtent() : objectno(0), offset(0), length(0), truncate_size(0) {}
4446 ObjectExtent(object_t o
, uint64_t ono
, uint64_t off
, uint64_t l
, uint64_t ts
) :
4447 oid(o
), objectno(ono
), offset(off
), length(l
), truncate_size(ts
) { }
4450 inline ostream
& operator<<(ostream
& out
, const ObjectExtent
&ex
)
4452 return out
<< "extent("
4453 << ex
.oid
<< " (" << ex
.objectno
<< ") in " << ex
.oloc
4454 << " " << ex
.offset
<< "~" << ex
.length
4455 << " -> " << ex
.buffer_extents
4460 // ---------------------------------------
4462 class OSDSuperblock
{
4464 uuid_d cluster_fsid
, osd_fsid
;
4465 int32_t whoami
; // my role in this fs.
4466 epoch_t current_epoch
; // most recent epoch
4467 epoch_t oldest_map
, newest_map
; // oldest/newest maps we have.
4470 CompatSet compat_features
;
4472 // last interval over which i mounted and was then active
4473 epoch_t mounted
; // last epoch i mounted
4474 epoch_t clean_thru
; // epoch i was active and clean thru
4478 current_epoch(0), oldest_map(0), newest_map(0), weight(0),
4479 mounted(0), clean_thru(0) {
4482 void encode(bufferlist
&bl
) const;
4483 void decode(bufferlist::iterator
&bl
);
4484 void dump(Formatter
*f
) const;
4485 static void generate_test_instances(list
<OSDSuperblock
*>& o
);
4487 WRITE_CLASS_ENCODER(OSDSuperblock
)
4489 inline ostream
& operator<<(ostream
& out
, const OSDSuperblock
& sb
)
4491 return out
<< "sb(" << sb
.cluster_fsid
4492 << " osd." << sb
.whoami
4493 << " " << sb
.osd_fsid
4494 << " e" << sb
.current_epoch
4495 << " [" << sb
.oldest_map
<< "," << sb
.newest_map
<< "]"
4496 << " lci=[" << sb
.mounted
<< "," << sb
.clean_thru
<< "]"
4509 * attached to object head. describes most recent snap context, and
4510 * set of existing clones.
4515 vector
<snapid_t
> snaps
; // descending
4516 vector
<snapid_t
> clones
; // ascending
4517 map
<snapid_t
, interval_set
<uint64_t> > clone_overlap
; // overlap w/ next newest
4518 map
<snapid_t
, uint64_t> clone_size
;
4519 map
<snapid_t
, vector
<snapid_t
>> clone_snaps
; // descending
4521 SnapSet() : seq(0), head_exists(false) {}
4522 explicit SnapSet(bufferlist
& bl
) {
4523 bufferlist::iterator p
= bl
.begin();
4527 bool is_legacy() const {
4528 return clone_snaps
.size() < clones
.size() || !head_exists
;
4531 /// populate SnapSet from a librados::snap_set_t
4532 void from_snap_set(const librados::snap_set_t
& ss
, bool legacy
);
4534 /// get space accounted to clone
4535 uint64_t get_clone_bytes(snapid_t clone
) const;
4537 void encode(bufferlist
& bl
) const;
4538 void decode(bufferlist::iterator
& bl
);
4539 void dump(Formatter
*f
) const;
4540 static void generate_test_instances(list
<SnapSet
*>& o
);
4542 SnapContext
get_ssc_as_of(snapid_t as_of
) const {
4545 for (vector
<snapid_t
>::const_iterator i
= snaps
.begin();
4549 out
.snaps
.push_back(*i
);
4554 // return min element of snaps > after, return max if no such element
4555 snapid_t
get_first_snap_after(snapid_t after
, snapid_t max
) const {
4556 for (vector
<snapid_t
>::const_reverse_iterator i
= snaps
.rbegin();
4565 SnapSet
get_filtered(const pg_pool_t
&pinfo
) const;
4566 void filter(const pg_pool_t
&pinfo
);
4568 WRITE_CLASS_ENCODER(SnapSet
)
4570 ostream
& operator<<(ostream
& out
, const SnapSet
& cs
);
4575 #define SS_ATTR "snapset"
4577 struct watch_info_t
{
4579 uint32_t timeout_seconds
;
4582 watch_info_t() : cookie(0), timeout_seconds(0) { }
4583 watch_info_t(uint64_t c
, uint32_t t
, const entity_addr_t
& a
) : cookie(c
), timeout_seconds(t
), addr(a
) {}
4585 void encode(bufferlist
& bl
, uint64_t features
) const;
4586 void decode(bufferlist::iterator
& bl
);
4587 void dump(Formatter
*f
) const;
4588 static void generate_test_instances(list
<watch_info_t
*>& o
);
4590 WRITE_CLASS_ENCODER_FEATURES(watch_info_t
)
4592 static inline bool operator==(const watch_info_t
& l
, const watch_info_t
& r
) {
4593 return l
.cookie
== r
.cookie
&& l
.timeout_seconds
== r
.timeout_seconds
4594 && l
.addr
== r
.addr
;
4597 static inline ostream
& operator<<(ostream
& out
, const watch_info_t
& w
) {
4598 return out
<< "watch(cookie " << w
.cookie
<< " " << w
.timeout_seconds
<< "s"
4599 << " " << w
.addr
<< ")";
4602 struct notify_info_t
{
4609 static inline ostream
& operator<<(ostream
& out
, const notify_info_t
& n
) {
4610 return out
<< "notify(cookie " << n
.cookie
4611 << " notify" << n
.notify_id
4612 << " " << n
.timeout
<< "s)";
4615 struct object_info_t
;
4616 struct object_manifest_t
{
4619 TYPE_REDIRECT
= 1, // start with this
4620 TYPE_CHUNKED
= 2, // do this later
4622 uint8_t type
; // redirect, chunked, ...
4623 hobject_t redirect_target
;
4625 object_manifest_t() : type(0) { }
4626 object_manifest_t(uint8_t type
, const hobject_t
& redirect_target
)
4627 : type(type
), redirect_target(redirect_target
) { }
4629 bool is_empty() const {
4630 return type
== TYPE_NONE
;
4632 bool is_redirect() const {
4633 return type
== TYPE_REDIRECT
;
4635 bool is_chunked() const {
4636 return type
== TYPE_CHUNKED
;
4638 static const char *get_type_name(uint8_t m
) {
4640 case TYPE_NONE
: return "none";
4641 case TYPE_REDIRECT
: return "redirect";
4642 case TYPE_CHUNKED
: return "chunked";
4643 default: return "unknown";
4646 const char *get_type_name() const {
4647 return get_type_name(type
);
4649 static void generate_test_instances(list
<object_manifest_t
*>& o
);
4650 void encode(bufferlist
&bl
) const;
4651 void decode(bufferlist::iterator
&bl
);
4652 void dump(Formatter
*f
) const;
4653 friend ostream
& operator<<(ostream
& out
, const object_info_t
& oi
);
4655 WRITE_CLASS_ENCODER(object_manifest_t
)
4656 ostream
& operator<<(ostream
& out
, const object_manifest_t
& oi
);
4658 struct object_info_t
{
4660 eversion_t version
, prior_version
;
4661 version_t user_version
;
4662 osd_reqid_t last_reqid
;
4666 utime_t local_mtime
; // local mtime
4668 // note: these are currently encoded into a total 16 bits; see
4669 // encode()/decode() for the weirdness.
4672 FLAG_WHITEOUT
= 1<<1, // object logically does not exist
4673 FLAG_DIRTY
= 1<<2, // object has been modified since last flushed or undirtied
4674 FLAG_OMAP
= 1 << 3, // has (or may have) some/any omap data
4675 FLAG_DATA_DIGEST
= 1 << 4, // has data crc
4676 FLAG_OMAP_DIGEST
= 1 << 5, // has omap crc
4677 FLAG_CACHE_PIN
= 1 << 6, // pin the object in cache tier
4678 FLAG_MANIFEST
= 1 << 7, // has manifest
4680 FLAG_USES_TMAP
= 1<<8, // deprecated; no longer used.
4685 static string
get_flag_string(flag_t flags
) {
4687 vector
<string
> sv
= get_flag_vector(flags
);
4688 for (auto ss
: sv
) {
4689 s
+= string("|") + ss
;
4695 static vector
<string
> get_flag_vector(flag_t flags
) {
4697 if (flags
& FLAG_LOST
)
4698 sv
.insert(sv
.end(), "lost");
4699 if (flags
& FLAG_WHITEOUT
)
4700 sv
.insert(sv
.end(), "whiteout");
4701 if (flags
& FLAG_DIRTY
)
4702 sv
.insert(sv
.end(), "dirty");
4703 if (flags
& FLAG_USES_TMAP
)
4704 sv
.insert(sv
.end(), "uses_tmap");
4705 if (flags
& FLAG_OMAP
)
4706 sv
.insert(sv
.end(), "omap");
4707 if (flags
& FLAG_DATA_DIGEST
)
4708 sv
.insert(sv
.end(), "data_digest");
4709 if (flags
& FLAG_OMAP_DIGEST
)
4710 sv
.insert(sv
.end(), "omap_digest");
4711 if (flags
& FLAG_CACHE_PIN
)
4712 sv
.insert(sv
.end(), "cache_pin");
4713 if (flags
& FLAG_MANIFEST
)
4714 sv
.insert(sv
.end(), "manifest");
4717 string
get_flag_string() const {
4718 return get_flag_string(flags
);
4721 /// [clone] descending. pre-luminous; moved to SnapSet
4722 vector
<snapid_t
> legacy_snaps
;
4724 uint64_t truncate_seq
, truncate_size
;
4726 map
<pair
<uint64_t, entity_name_t
>, watch_info_t
> watchers
;
4728 // opportunistic checksums; may or may not be present
4729 __u32 data_digest
; ///< data crc32c
4730 __u32 omap_digest
; ///< omap crc32c
4732 // alloc hint attribute
4733 uint64_t expected_object_size
, expected_write_size
;
4734 uint32_t alloc_hint_flags
;
4736 struct object_manifest_t manifest
;
4738 void copy_user_bits(const object_info_t
& other
);
4740 static ps_t
legacy_object_locator_to_ps(const object_t
&oid
,
4741 const object_locator_t
&loc
);
4743 bool test_flag(flag_t f
) const {
4744 return (flags
& f
) == f
;
4746 void set_flag(flag_t f
) {
4747 flags
= (flag_t
)(flags
| f
);
4749 void clear_flag(flag_t f
) {
4750 flags
= (flag_t
)(flags
& ~f
);
4752 bool is_lost() const {
4753 return test_flag(FLAG_LOST
);
4755 bool is_whiteout() const {
4756 return test_flag(FLAG_WHITEOUT
);
4758 bool is_dirty() const {
4759 return test_flag(FLAG_DIRTY
);
4761 bool is_omap() const {
4762 return test_flag(FLAG_OMAP
);
4764 bool is_data_digest() const {
4765 return test_flag(FLAG_DATA_DIGEST
);
4767 bool is_omap_digest() const {
4768 return test_flag(FLAG_OMAP_DIGEST
);
4770 bool is_cache_pinned() const {
4771 return test_flag(FLAG_CACHE_PIN
);
4773 bool has_manifest() const {
4774 return test_flag(FLAG_MANIFEST
);
4777 void set_data_digest(__u32 d
) {
4778 set_flag(FLAG_DATA_DIGEST
);
4781 void set_omap_digest(__u32 d
) {
4782 set_flag(FLAG_OMAP_DIGEST
);
4785 void clear_data_digest() {
4786 clear_flag(FLAG_DATA_DIGEST
);
4789 void clear_omap_digest() {
4790 clear_flag(FLAG_OMAP_DIGEST
);
4794 clear_data_digest();
4795 clear_omap_digest();
4798 void encode(bufferlist
& bl
, uint64_t features
) const;
4799 void decode(bufferlist::iterator
& bl
);
4800 void decode(bufferlist
& bl
) {
4801 bufferlist::iterator p
= bl
.begin();
4804 void dump(Formatter
*f
) const;
4805 static void generate_test_instances(list
<object_info_t
*>& o
);
4807 explicit object_info_t()
4808 : user_version(0), size(0), flags((flag_t
)0),
4809 truncate_seq(0), truncate_size(0),
4810 data_digest(-1), omap_digest(-1),
4811 expected_object_size(0), expected_write_size(0),
4815 explicit object_info_t(const hobject_t
& s
)
4817 user_version(0), size(0), flags((flag_t
)0),
4818 truncate_seq(0), truncate_size(0),
4819 data_digest(-1), omap_digest(-1),
4820 expected_object_size(0), expected_write_size(0),
4824 explicit object_info_t(bufferlist
& bl
) {
4828 WRITE_CLASS_ENCODER_FEATURES(object_info_t
)
4830 ostream
& operator<<(ostream
& out
, const object_info_t
& oi
);
4835 struct ObjectRecoveryInfo
{
4840 SnapSet ss
; // only populated if soid is_snap()
4841 interval_set
<uint64_t> copy_subset
;
4842 map
<hobject_t
, interval_set
<uint64_t>> clone_subset
;
4844 ObjectRecoveryInfo() : size(0) { }
4846 static void generate_test_instances(list
<ObjectRecoveryInfo
*>& o
);
4847 void encode(bufferlist
&bl
, uint64_t features
) const;
4848 void decode(bufferlist::iterator
&bl
, int64_t pool
= -1);
4849 ostream
&print(ostream
&out
) const;
4850 void dump(Formatter
*f
) const;
4852 WRITE_CLASS_ENCODER_FEATURES(ObjectRecoveryInfo
)
4853 ostream
& operator<<(ostream
& out
, const ObjectRecoveryInfo
&inf
);
4855 struct ObjectRecoveryProgress
{
4856 uint64_t data_recovered_to
;
4857 string omap_recovered_to
;
4863 ObjectRecoveryProgress()
4864 : data_recovered_to(0),
4866 data_complete(false), omap_complete(false) { }
4868 bool is_complete(const ObjectRecoveryInfo
& info
) const {
4869 return (data_recovered_to
>= (
4870 info
.copy_subset
.empty() ?
4871 0 : info
.copy_subset
.range_end())) &&
4875 static void generate_test_instances(list
<ObjectRecoveryProgress
*>& o
);
4876 void encode(bufferlist
&bl
) const;
4877 void decode(bufferlist::iterator
&bl
);
4878 ostream
&print(ostream
&out
) const;
4879 void dump(Formatter
*f
) const;
4881 WRITE_CLASS_ENCODER(ObjectRecoveryProgress
)
4882 ostream
& operator<<(ostream
& out
, const ObjectRecoveryProgress
&prog
);
4884 struct PushReplyOp
{
4887 static void generate_test_instances(list
<PushReplyOp
*>& o
);
4888 void encode(bufferlist
&bl
) const;
4889 void decode(bufferlist::iterator
&bl
);
4890 ostream
&print(ostream
&out
) const;
4891 void dump(Formatter
*f
) const;
4893 uint64_t cost(CephContext
*cct
) const;
4895 WRITE_CLASS_ENCODER(PushReplyOp
)
4896 ostream
& operator<<(ostream
& out
, const PushReplyOp
&op
);
4901 ObjectRecoveryInfo recovery_info
;
4902 ObjectRecoveryProgress recovery_progress
;
4904 static void generate_test_instances(list
<PullOp
*>& o
);
4905 void encode(bufferlist
&bl
, uint64_t features
) const;
4906 void decode(bufferlist::iterator
&bl
);
4907 ostream
&print(ostream
&out
) const;
4908 void dump(Formatter
*f
) const;
4910 uint64_t cost(CephContext
*cct
) const;
4912 WRITE_CLASS_ENCODER_FEATURES(PullOp
)
4913 ostream
& operator<<(ostream
& out
, const PullOp
&op
);
4919 interval_set
<uint64_t> data_included
;
4920 bufferlist omap_header
;
4921 map
<string
, bufferlist
> omap_entries
;
4922 map
<string
, bufferlist
> attrset
;
4924 ObjectRecoveryInfo recovery_info
;
4925 ObjectRecoveryProgress before_progress
;
4926 ObjectRecoveryProgress after_progress
;
4928 static void generate_test_instances(list
<PushOp
*>& o
);
4929 void encode(bufferlist
&bl
, uint64_t features
) const;
4930 void decode(bufferlist::iterator
&bl
);
4931 ostream
&print(ostream
&out
) const;
4932 void dump(Formatter
*f
) const;
4934 uint64_t cost(CephContext
*cct
) const;
4936 WRITE_CLASS_ENCODER_FEATURES(PushOp
)
4937 ostream
& operator<<(ostream
& out
, const PushOp
&op
);
4941 * summarize pg contents for purposes of a scrub
4945 map
<string
,bufferptr
> attrs
;
4947 __u32 omap_digest
; ///< omap crc32c
4948 __u32 digest
; ///< data crc32c
4950 bool digest_present
:1;
4951 bool omap_digest_present
:1;
4954 bool ec_hash_mismatch
:1;
4955 bool ec_size_mismatch
:1;
4956 bool large_omap_object_found
:1;
4957 uint64_t large_omap_object_key_count
= 0;
4958 uint64_t large_omap_object_value_size
= 0;
4961 // Init invalid size so it won't match if we get a stat EIO error
4962 size(-1), omap_digest(0), digest(0),
4963 negative(false), digest_present(false), omap_digest_present(false),
4964 read_error(false), stat_error(false), ec_hash_mismatch(false),
4965 ec_size_mismatch(false), large_omap_object_found(false) {}
4967 void encode(bufferlist
& bl
) const;
4968 void decode(bufferlist::iterator
& bl
);
4969 void dump(Formatter
*f
) const;
4970 static void generate_test_instances(list
<object
*>& o
);
4972 WRITE_CLASS_ENCODER(object
)
4974 map
<hobject_t
,object
> objects
;
4975 eversion_t valid_through
;
4976 eversion_t incr_since
;
4977 bool has_large_omap_object_errors
:1;
4978 boost::optional
<bool> has_builtin_csum
;
4980 void merge_incr(const ScrubMap
&l
);
4981 void clear_from(const hobject_t
& start
) {
4982 objects
.erase(objects
.lower_bound(start
), objects
.end());
4984 void insert(const ScrubMap
&r
) {
4985 objects
.insert(r
.objects
.begin(), r
.objects
.end());
4987 void swap(ScrubMap
&r
) {
4989 swap(objects
, r
.objects
);
4990 swap(valid_through
, r
.valid_through
);
4991 swap(incr_since
, r
.incr_since
);
4994 void encode(bufferlist
& bl
) const;
4995 void decode(bufferlist::iterator
& bl
, int64_t pool
=-1);
4996 void dump(Formatter
*f
) const;
4997 static void generate_test_instances(list
<ScrubMap
*>& o
);
4999 WRITE_CLASS_ENCODER(ScrubMap::object
)
5000 WRITE_CLASS_ENCODER(ScrubMap
)
// ScrubMapBuilder: iteration state used while a scrub walks the object
// listing `ls` and accumulates per-object data/omap checksums.
// NOTE(review): this extraction is missing several short lines (the
// `pos`/`omap_pos`/`ret` field declarations, braces, and small method
// bodies referenced below); code is kept byte-identical here.
5002 struct ScrubMapBuilder
{
5004 vector
<hobject_t
> ls
;
// byte offset scanned so far within the current object's data
5006 int64_t data_pos
= 0;
// running hash accumulators for the current object's data and omap
5009 bufferhash data_hash
, omap_hash
; ///< accumulating hash value
5010 uint64_t omap_keys
= 0;
5011 uint64_t omap_bytes
= 0;
// done(): true once every object in `ls` has been processed
5017 return pos
>= ls
.size();
// reset(): reinitialize by assigning a freshly constructed builder
5020 *this = ScrubMapBuilder();
// data_done(): a negative data_pos marks the data scan as finished
5024 return data_pos
< 0;
// advance to the next object in `ls`, resetting per-object state
5027 void next_object() {
// debug/log representation of the current scan position
5035 friend ostream
& operator<<(ostream
& out
, const ScrubMapBuilder
& pos
) {
5036 out
<< "(" << pos
.pos
<< "/" << pos
.ls
.size();
5037 if (pos
.pos
< pos
.ls
.size()) {
5038 out
<< " " << pos
.ls
[pos
.pos
];
5040 if (pos
.data_pos
< 0) {
5041 out
<< " byte " << pos
.data_pos
;
5043 if (!pos
.omap_pos
.empty()) {
5044 out
<< " key " << pos
.omap_pos
;
// the guarding condition for this `ret` output (orig line 5049) is not
// visible in this extraction -- confirm against the full source
5050 out
<< " ret " << pos
.ret
;
// OSDOp: one sub-operation of an OSD request -- the raw ceph_osd_op plus
// its input/output payloads and helpers to (de)multiplex those payloads
// into a single combined buffer.
// NOTE(review): the struct header and the `op`/`soid`/`rval` field lines
// are missing from this extraction; code kept byte-identical.
5060 bufferlist indata
, outdata
;
// default construction zeroes the raw ceph_osd_op
5064 memset(&op
, 0, sizeof(ceph_osd_op
));
5068 * split a bufferlist into constituent indata members of a vector of OSDOps
5070 * @param ops [out] vector of OSDOps
5071 * @param in [in] combined data buffer
5073 static void split_osd_op_vector_in_data(vector
<OSDOp
>& ops
, bufferlist
& in
);
5076 * merge indata members of a vector of OSDOp into a single bufferlist
5078 * Notably this also encodes certain other OSDOp data into the data
5079 * buffer, including the sobject_t soid.
5081 * @param ops [in] vector of OSDOps
5082 * @param out [out] combined data buffer
5084 static void merge_osd_op_vector_in_data(vector
<OSDOp
>& ops
, bufferlist
& out
);
5087 * split a bufferlist into constituent outdata members of a vector of OSDOps
5089 * @param ops [out] vector of OSDOps
5090 * @param in [in] combined data buffer
5092 static void split_osd_op_vector_out_data(vector
<OSDOp
>& ops
, bufferlist
& in
);
5095 * merge outdata members of a vector of OSDOps into a single bufferlist
5097 * @param ops [in] vector of OSDOps
5098 * @param out [out] combined data buffer
5100 static void merge_osd_op_vector_out_data(vector
<OSDOp
>& ops
, bufferlist
& out
);
5103 * Clear data as much as possible, leave minimal data for historical op dump
5105 * @param ops [in] vector of OSDOps
5107 static void clear_data(vector
<OSDOp
>& ops
);
// stream operator for a single OSDOp (defined elsewhere)
5110 ostream
& operator<<(ostream
& out
, const OSDOp
& op
);
5112 struct watch_item_t
{
5115 uint32_t timeout_seconds
;
5118 watch_item_t() : cookie(0), timeout_seconds(0) { }
5119 watch_item_t(entity_name_t name
, uint64_t cookie
, uint32_t timeout
,
5120 const entity_addr_t
& addr
)
5121 : name(name
), cookie(cookie
), timeout_seconds(timeout
),
5124 void encode(bufferlist
&bl
, uint64_t features
) const {
5125 ENCODE_START(2, 1, bl
);
5127 ::encode(cookie
, bl
);
5128 ::encode(timeout_seconds
, bl
);
5129 ::encode(addr
, bl
, features
);
5132 void decode(bufferlist::iterator
&bl
) {
5133 DECODE_START(2, bl
);
5135 ::decode(cookie
, bl
);
5136 ::decode(timeout_seconds
, bl
);
5137 if (struct_v
>= 2) {
5143 WRITE_CLASS_ENCODER_FEATURES(watch_item_t
)
// Pairing of a watched object with one of its watch records; used when
// listing watchers across objects.
// NOTE(review): the member declarations and closing brace of this struct
// are missing from this extraction; header kept byte-identical.
5145 struct obj_watch_item_t
{
5151 * obj list watch response format
5154 struct obj_list_watch_response_t
{
5155 list
<watch_item_t
> entries
;
5157 void encode(bufferlist
& bl
, uint64_t features
) const {
5158 ENCODE_START(1, 1, bl
);
5159 ::encode(entries
, bl
, features
);
5162 void decode(bufferlist::iterator
& bl
) {
5163 DECODE_START(1, bl
);
5164 ::decode(entries
, bl
);
5167 void dump(Formatter
*f
) const {
5168 f
->open_array_section("entries");
5169 for (list
<watch_item_t
>::const_iterator p
= entries
.begin(); p
!= entries
.end(); ++p
) {
5170 f
->open_object_section("watch");
5171 f
->dump_stream("watcher") << p
->name
;
5172 f
->dump_int("cookie", p
->cookie
);
5173 f
->dump_int("timeout", p
->timeout_seconds
);
5174 f
->open_object_section("addr");
5181 static void generate_test_instances(list
<obj_list_watch_response_t
*>& o
) {
5183 o
.push_back(new obj_list_watch_response_t
);
5184 o
.push_back(new obj_list_watch_response_t
);
5185 ea
.set_type(entity_addr_t::TYPE_LEGACY
);
5187 ea
.set_family(AF_INET
);
5188 ea
.set_in4_quad(0, 127);
5189 ea
.set_in4_quad(1, 0);
5190 ea
.set_in4_quad(2, 0);
5191 ea
.set_in4_quad(3, 1);
5193 o
.back()->entries
.push_back(watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT
, 1), 10, 30, ea
));
5195 ea
.set_in4_quad(3, 2);
5197 o
.back()->entries
.push_back(watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT
, 2), 20, 60, ea
));
5200 WRITE_CLASS_ENCODER_FEATURES(obj_list_watch_response_t
)
5204 vector
<snapid_t
> snaps
; // ascending
5205 vector
< pair
<uint64_t,uint64_t> > overlap
;
5208 clone_info() : cloneid(CEPH_NOSNAP
), size(0) {}
5210 void encode(bufferlist
& bl
) const {
5211 ENCODE_START(1, 1, bl
);
5212 ::encode(cloneid
, bl
);
5213 ::encode(snaps
, bl
);
5214 ::encode(overlap
, bl
);
5218 void decode(bufferlist::iterator
& bl
) {
5219 DECODE_START(1, bl
);
5220 ::decode(cloneid
, bl
);
5221 ::decode(snaps
, bl
);
5222 ::decode(overlap
, bl
);
5226 void dump(Formatter
*f
) const {
5227 if (cloneid
== CEPH_NOSNAP
)
5228 f
->dump_string("cloneid", "HEAD");
5230 f
->dump_unsigned("cloneid", cloneid
.val
);
5231 f
->open_array_section("snapshots");
5232 for (vector
<snapid_t
>::const_iterator p
= snaps
.begin(); p
!= snaps
.end(); ++p
) {
5233 f
->open_object_section("snap");
5234 f
->dump_unsigned("id", p
->val
);
5238 f
->open_array_section("overlaps");
5239 for (vector
< pair
<uint64_t,uint64_t> >::const_iterator q
= overlap
.begin();
5240 q
!= overlap
.end(); ++q
) {
5241 f
->open_object_section("overlap");
5242 f
->dump_unsigned("offset", q
->first
);
5243 f
->dump_unsigned("length", q
->second
);
5247 f
->dump_unsigned("size", size
);
5249 static void generate_test_instances(list
<clone_info
*>& o
) {
5250 o
.push_back(new clone_info
);
5251 o
.push_back(new clone_info
);
5252 o
.back()->cloneid
= 1;
5253 o
.back()->snaps
.push_back(1);
5254 o
.back()->overlap
.push_back(pair
<uint64_t,uint64_t>(0,4096));
5255 o
.back()->overlap
.push_back(pair
<uint64_t,uint64_t>(8192,4096));
5256 o
.back()->size
= 16384;
5257 o
.push_back(new clone_info
);
5258 o
.back()->cloneid
= CEPH_NOSNAP
;
5259 o
.back()->size
= 32768;
5262 WRITE_CLASS_ENCODER(clone_info
)
5265 * obj list snaps response format
5268 struct obj_list_snap_response_t
{
5269 vector
<clone_info
> clones
; // ascending
5272 void encode(bufferlist
& bl
) const {
5273 ENCODE_START(2, 1, bl
);
5274 ::encode(clones
, bl
);
5278 void decode(bufferlist::iterator
& bl
) {
5279 DECODE_START(2, bl
);
5280 ::decode(clones
, bl
);
5287 void dump(Formatter
*f
) const {
5288 f
->open_array_section("clones");
5289 for (vector
<clone_info
>::const_iterator p
= clones
.begin(); p
!= clones
.end(); ++p
) {
5290 f
->open_object_section("clone");
5294 f
->dump_unsigned("seq", seq
);
5297 static void generate_test_instances(list
<obj_list_snap_response_t
*>& o
) {
5298 o
.push_back(new obj_list_snap_response_t
);
5299 o
.push_back(new obj_list_snap_response_t
);
5302 cl
.snaps
.push_back(1);
5303 cl
.overlap
.push_back(pair
<uint64_t,uint64_t>(0,4096));
5304 cl
.overlap
.push_back(pair
<uint64_t,uint64_t>(8192,4096));
5306 o
.back()->clones
.push_back(cl
);
5307 cl
.cloneid
= CEPH_NOSNAP
;
5311 o
.back()->clones
.push_back(cl
);
5312 o
.back()->seq
= 123;
5316 WRITE_CLASS_ENCODER(obj_list_snap_response_t
)
struct PromoteCounter {
  std::atomic_ullong attempts{0};  // promotion attempts started
  std::atomic_ullong objects{0};   // promotions completed
  std::atomic_ullong bytes{0};     // bytes promoted

  void attempt() {
    attempts++;
  }

  void finish(uint64_t size) {
    objects++;
    bytes += size;
  }

  /// read the current counters into *a/*o/*b and halve the stored values
  /// (exponential decay of the running totals)
  void sample_and_attenuate(uint64_t *a, uint64_t *o, uint64_t *b) {
    *a = attempts;
    *o = objects;
    *b = bytes;
    attempts = *a / 2;
    objects = *o / 2;
    bytes = *b / 2;
  }
};
5345 * ObjectStore full statfs information
5347 struct store_statfs_t
5349 uint64_t total
= 0; // Total bytes
5350 uint64_t available
= 0; // Free bytes available
5352 int64_t allocated
= 0; // Bytes allocated by the store
5353 int64_t stored
= 0; // Bytes actually stored by the user
5354 int64_t compressed
= 0; // Bytes stored after compression
5355 int64_t compressed_allocated
= 0; // Bytes allocated for compressed data
5356 int64_t compressed_original
= 0; // Bytes that were successfully compressed
5359 *this = store_statfs_t();
5361 bool operator ==(const store_statfs_t
& other
) const;
5362 void dump(Formatter
*f
) const;
5364 ostream
&operator<<(ostream
&lhs
, const store_statfs_t
&rhs
);