1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2011 New Dream Network
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
9 * Author: Loic Dachary <loic@dachary.org>
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
18 #include <boost/assign/list_of.hpp>
20 #include "osd_types.h"
21 #include "include/ceph_features.h"
22 #include "include/stringify.h"
24 #include "crush/hash.h"
// Map a single CEPH_OSD_FLAG_* bit to its lower-case display name.
// Unknown bits fall through to "???" (see the default case below).
// NOTE(review): callers pass one bit at a time (see ceph_osd_flag_string,
// which iterates 1u<<i) — this is not meant for combined flag masks.
28 const char *ceph_osd_flag_name(unsigned flag
)
31 case CEPH_OSD_FLAG_ACK
: return "ack";
32 case CEPH_OSD_FLAG_ONNVRAM
: return "onnvram";
33 case CEPH_OSD_FLAG_ONDISK
: return "ondisk";
34 case CEPH_OSD_FLAG_RETRY
: return "retry";
35 case CEPH_OSD_FLAG_READ
: return "read";
36 case CEPH_OSD_FLAG_WRITE
: return "write";
37 case CEPH_OSD_FLAG_ORDERSNAP
: return "ordersnap";
38 case CEPH_OSD_FLAG_PEERSTAT_OLD
: return "peerstat_old";
39 case CEPH_OSD_FLAG_BALANCE_READS
: return "balance_reads";
40 case CEPH_OSD_FLAG_PARALLELEXEC
: return "parallelexec";
41 case CEPH_OSD_FLAG_PGOP
: return "pgop";
42 case CEPH_OSD_FLAG_EXEC
: return "exec";
43 case CEPH_OSD_FLAG_EXEC_PUBLIC
: return "exec_public";
44 case CEPH_OSD_FLAG_LOCALIZE_READS
: return "localize_reads";
45 case CEPH_OSD_FLAG_RWORDERED
: return "rwordered";
46 case CEPH_OSD_FLAG_IGNORE_CACHE
: return "ignore_cache";
47 case CEPH_OSD_FLAG_SKIPRWLOCKS
: return "skiprwlocks";
48 case CEPH_OSD_FLAG_IGNORE_OVERLAY
: return "ignore_overlay";
49 case CEPH_OSD_FLAG_FLUSH
: return "flush";
50 case CEPH_OSD_FLAG_MAP_SNAP_CLONE
: return "map_snap_clone";
51 case CEPH_OSD_FLAG_ENFORCE_SNAPC
: return "enforce_snapc";
52 case CEPH_OSD_FLAG_REDIRECTED
: return "redirected";
53 case CEPH_OSD_FLAG_KNOWN_REDIR
: return "known_if_redirected";
54 case CEPH_OSD_FLAG_FULL_TRY
: return "full_try";
55 case CEPH_OSD_FLAG_FULL_FORCE
: return "full_force";
56 case CEPH_OSD_FLAG_IGNORE_REDIRECT
: return "ignore_redirect";
// Unrecognized flag bit: keep output parseable rather than asserting.
57 default: return "???";
61 string
ceph_osd_flag_string(unsigned flags
)
64 for (unsigned i
=0; i
<32; ++i
) {
65 if (flags
& (1u<<i
)) {
68 s
+= ceph_osd_flag_name(1u << i
);
// Map a single CEPH_OSD_OP_FLAG_* bit (per-op flags, distinct from the
// message-level CEPH_OSD_FLAG_* handled above) to its display name.
// NOTE(review): callers pass one bit at a time (see ceph_osd_op_flag_string).
76 const char * ceph_osd_op_flag_name(unsigned flag
)
81 case CEPH_OSD_OP_FLAG_EXCL
:
84 case CEPH_OSD_OP_FLAG_FAILOK
:
87 case CEPH_OSD_OP_FLAG_FADVISE_RANDOM
:
88 name
= "fadvise_random";
90 case CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL
:
91 name
= "fadvise_sequential";
93 case CEPH_OSD_OP_FLAG_FADVISE_WILLNEED
:
// NOTE(review): "favise_willneed" below looks like a typo for
// "fadvise_willneed" (cf. the sibling fadvise_* names).  The string is
// emitted to users/tools, so confirm nothing parses the misspelling
// before renaming it.
94 name
= "favise_willneed";
96 case CEPH_OSD_OP_FLAG_FADVISE_DONTNEED
:
97 name
= "fadvise_dontneed";
99 case CEPH_OSD_OP_FLAG_FADVISE_NOCACHE
:
100 name
= "fadvise_nocache";
102 case CEPH_OSD_OP_FLAG_WITH_REFERENCE
:
103 name
= "with_reference";
105 case CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE
:
106 name
= "bypass_clean_cache";
115 string
ceph_osd_op_flag_string(unsigned flags
)
118 for (unsigned i
=0; i
<32; ++i
) {
119 if (flags
& (1u<<i
)) {
122 s
+= ceph_osd_op_flag_name(1u << i
);
130 string
ceph_osd_alloc_hint_flag_string(unsigned flags
)
133 for (unsigned i
=0; i
<32; ++i
) {
134 if (flags
& (1u<<i
)) {
137 s
+= ceph_osd_alloc_hint_flag_name(1u << i
);
145 void pg_shard_t::encode(bufferlist
&bl
) const
147 ENCODE_START(1, 1, bl
);
152 void pg_shard_t::decode(bufferlist::const_iterator
&bl
)
160 ostream
&operator<<(ostream
&lhs
, const pg_shard_t
&rhs
)
162 if (rhs
.is_undefined())
164 if (rhs
.shard
== shard_id_t::NO_SHARD
)
165 return lhs
<< rhs
.get_osd();
166 return lhs
<< rhs
.get_osd() << '(' << (unsigned)(rhs
.shard
) << ')';
169 void dump(Formatter
* f
, const osd_alerts_t
& alerts
)
171 for (auto& a
: alerts
) {
172 string s0
= " osd: ";
173 s0
+= stringify(a
.first
);
175 for (auto& aa
: a
.second
) {
181 f
->dump_string("alert", s
);
187 void osd_reqid_t::dump(Formatter
*f
) const
189 f
->dump_stream("name") << name
;
190 f
->dump_int("inc", inc
);
191 f
->dump_unsigned("tid", tid
);
194 void osd_reqid_t::generate_test_instances(list
<osd_reqid_t
*>& o
)
196 o
.push_back(new osd_reqid_t
);
197 o
.push_back(new osd_reqid_t(entity_name_t::CLIENT(123), 1, 45678));
200 // -- object_locator_t --
202 void object_locator_t::encode(bufferlist
& bl
) const
204 // verify that nobody's corrupted the locator
205 ceph_assert(hash
== -1 || key
.empty());
206 __u8 encode_compat
= 3;
207 ENCODE_START(6, encode_compat
, bl
);
209 int32_t preferred
= -1; // tell old code there is no preferred osd (-1).
210 encode(preferred
, bl
);
215 encode_compat
= std::max
<std::uint8_t>(encode_compat
, 6); // need to interpret the hash
216 ENCODE_FINISH_NEW_COMPAT(bl
, encode_compat
);
219 void object_locator_t::decode(bufferlist::const_iterator
& p
)
221 DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, p
);
231 decode(preferred
, p
);
241 // verify that nobody's corrupted the locator
242 ceph_assert(hash
== -1 || key
.empty());
245 void object_locator_t::dump(Formatter
*f
) const
247 f
->dump_int("pool", pool
);
248 f
->dump_string("key", key
);
249 f
->dump_string("namespace", nspace
);
250 f
->dump_int("hash", hash
);
253 void object_locator_t::generate_test_instances(list
<object_locator_t
*>& o
)
255 o
.push_back(new object_locator_t
);
256 o
.push_back(new object_locator_t(123));
257 o
.push_back(new object_locator_t(123, 876));
258 o
.push_back(new object_locator_t(1, "n2"));
259 o
.push_back(new object_locator_t(1234, "", "key"));
260 o
.push_back(new object_locator_t(12, "n1", "key2"));
263 // -- request_redirect_t --
264 void request_redirect_t::encode(bufferlist
& bl
) const
266 ENCODE_START(1, 1, bl
);
267 encode(redirect_locator
, bl
);
268 encode(redirect_object
, bl
);
269 // legacy of the removed osd_instructions member
270 encode((uint32_t)0, bl
);
274 void request_redirect_t::decode(bufferlist::const_iterator
& bl
)
277 uint32_t legacy_osd_instructions_len
;
278 decode(redirect_locator
, bl
);
279 decode(redirect_object
, bl
);
280 decode(legacy_osd_instructions_len
, bl
);
281 if (legacy_osd_instructions_len
) {
282 bl
.advance(legacy_osd_instructions_len
);
287 void request_redirect_t::dump(Formatter
*f
) const
289 f
->dump_string("object", redirect_object
);
290 f
->open_object_section("locator");
291 redirect_locator
.dump(f
);
292 f
->close_section(); // locator
295 void request_redirect_t::generate_test_instances(list
<request_redirect_t
*>& o
)
297 object_locator_t
loc(1, "redir_obj");
298 o
.push_back(new request_redirect_t());
299 o
.push_back(new request_redirect_t(loc
, 0));
300 o
.push_back(new request_redirect_t(loc
, "redir_obj"));
301 o
.push_back(new request_redirect_t(loc
));
304 void objectstore_perf_stat_t::dump(Formatter
*f
) const
306 // *_ms values just for compatibility.
307 f
->dump_float("commit_latency_ms", os_commit_latency_ns
/ 1000000.0);
308 f
->dump_float("apply_latency_ms", os_apply_latency_ns
/ 1000000.0);
309 f
->dump_unsigned("commit_latency_ns", os_commit_latency_ns
);
310 f
->dump_unsigned("apply_latency_ns", os_apply_latency_ns
);
313 void objectstore_perf_stat_t::encode(bufferlist
&bl
, uint64_t features
) const
315 uint8_t target_v
= 2;
316 if (!HAVE_FEATURE(features
, OS_PERF_STAT_NS
)) {
319 ENCODE_START(target_v
, target_v
, bl
);
321 encode(os_commit_latency_ns
, bl
);
322 encode(os_apply_latency_ns
, bl
);
324 constexpr auto NS_PER_MS
= std::chrono::nanoseconds(1ms
).count();
325 uint32_t commit_latency_ms
= os_commit_latency_ns
/ NS_PER_MS
;
326 uint32_t apply_latency_ms
= os_apply_latency_ns
/ NS_PER_MS
;
327 encode(commit_latency_ms
, bl
); // for compatibility with older monitor.
328 encode(apply_latency_ms
, bl
); // for compatibility with older monitor.
333 void objectstore_perf_stat_t::decode(bufferlist::const_iterator
&bl
)
337 decode(os_commit_latency_ns
, bl
);
338 decode(os_apply_latency_ns
, bl
);
340 uint32_t commit_latency_ms
;
341 uint32_t apply_latency_ms
;
342 decode(commit_latency_ms
, bl
);
343 decode(apply_latency_ms
, bl
);
344 constexpr auto NS_PER_MS
= std::chrono::nanoseconds(1ms
).count();
345 os_commit_latency_ns
= commit_latency_ms
* NS_PER_MS
;
346 os_apply_latency_ns
= apply_latency_ms
* NS_PER_MS
;
351 void objectstore_perf_stat_t::generate_test_instances(std::list
<objectstore_perf_stat_t
*>& o
)
353 o
.push_back(new objectstore_perf_stat_t());
354 o
.push_back(new objectstore_perf_stat_t());
355 o
.back()->os_commit_latency_ns
= 20000000;
356 o
.back()->os_apply_latency_ns
= 30000000;
360 void osd_stat_t::dump(Formatter
*f
) const
362 f
->dump_unsigned("up_from", up_from
);
363 f
->dump_unsigned("seq", seq
);
364 f
->dump_unsigned("num_pgs", num_pgs
);
365 f
->dump_unsigned("num_osds", num_osds
);
366 f
->dump_unsigned("num_per_pool_osds", num_per_pool_osds
);
368 /// dump legacy stats fields to ensure backward compatibility.
369 f
->dump_unsigned("kb", statfs
.kb());
370 f
->dump_unsigned("kb_used", statfs
.kb_used_raw());
371 f
->dump_unsigned("kb_used_data", statfs
.kb_used_data());
372 f
->dump_unsigned("kb_used_omap", statfs
.kb_used_omap());
373 f
->dump_unsigned("kb_used_meta", statfs
.kb_used_internal_metadata());
374 f
->dump_unsigned("kb_avail", statfs
.kb_avail());
377 f
->open_object_section("statfs");
380 f
->open_array_section("hb_peers");
381 for (auto p
: hb_peers
)
382 f
->dump_int("osd", p
);
384 f
->dump_int("snap_trim_queue_len", snap_trim_queue_len
);
385 f
->dump_int("num_snap_trimming", num_snap_trimming
);
386 f
->dump_int("num_shards_repaired", num_shards_repaired
);
387 f
->open_object_section("op_queue_age_hist");
388 op_queue_age_hist
.dump(f
);
390 f
->open_object_section("perf_stat");
391 os_perf_stat
.dump(f
);
393 f
->open_array_section("alerts");
394 ::dump(f
, os_alerts
);
398 void osd_stat_t::encode(bufferlist
&bl
, uint64_t features
) const
400 ENCODE_START(12, 2, bl
);
402 //////// for compatibility ////////
403 int64_t kb
= statfs
.kb();
404 int64_t kb_used
= statfs
.kb_used_raw();
405 int64_t kb_avail
= statfs
.kb_avail();
408 encode(kb_avail
, bl
);
409 ///////////////////////////////////
411 encode(snap_trim_queue_len
, bl
);
412 encode(num_snap_trimming
, bl
);
413 encode(hb_peers
, bl
);
414 encode((uint32_t)0, bl
);
415 encode(op_queue_age_hist
, bl
);
416 encode(os_perf_stat
, bl
, features
);
421 //////// for compatibility ////////
422 int64_t kb_used_data
= statfs
.kb_used_data();
423 int64_t kb_used_omap
= statfs
.kb_used_omap();
424 int64_t kb_used_meta
= statfs
.kb_used_internal_metadata();
425 encode(kb_used_data
, bl
);
426 encode(kb_used_omap
, bl
);
427 encode(kb_used_meta
, bl
);
429 ///////////////////////////////////
430 encode(os_alerts
, bl
);
431 encode(num_shards_repaired
, bl
);
432 encode(num_osds
, bl
);
433 encode(num_per_pool_osds
, bl
);
437 void osd_stat_t::decode(bufferlist::const_iterator
&bl
)
439 int64_t kb
, kb_used
,kb_avail
;
440 int64_t kb_used_data
, kb_used_omap
, kb_used_meta
;
441 DECODE_START_LEGACY_COMPAT_LEN(12, 2, 2, bl
);
444 decode(kb_avail
, bl
);
445 decode(snap_trim_queue_len
, bl
);
446 decode(num_snap_trimming
, bl
);
447 decode(hb_peers
, bl
);
448 vector
<int> num_hb_out
;
449 decode(num_hb_out
, bl
);
451 decode(op_queue_age_hist
, bl
);
453 decode(os_perf_stat
, bl
);
462 decode(kb_used_data
, bl
);
463 decode(kb_used_omap
, bl
);
464 decode(kb_used_meta
, bl
);
466 kb_used_data
= kb_used
;
474 statfs
.total
= kb
<< 10;
475 statfs
.available
= kb_avail
<< 10;
476 // actually it's totally unexpected to have ststfs.total < statfs.available
477 // here but unfortunately legacy generate_test_instances produced such a
478 // case hence inserting some handling rather than assert
479 statfs
.internally_reserved
=
480 statfs
.total
> statfs
.available
? statfs
.total
- statfs
.available
: 0;
482 if ((int64_t)statfs
.internally_reserved
> kb_used
) {
483 statfs
.internally_reserved
-= kb_used
;
485 statfs
.internally_reserved
= 0;
487 statfs
.allocated
= kb_used_data
<< 10;
488 statfs
.omap_allocated
= kb_used_omap
<< 10;
489 statfs
.internal_metadata
= kb_used_meta
<< 10;
491 if (struct_v
>= 10) {
492 decode(os_alerts
, bl
);
496 if (struct_v
>= 11) {
497 decode(num_shards_repaired
, bl
);
499 num_shards_repaired
= 0;
501 if (struct_v
>= 12) {
502 decode(num_osds
, bl
);
503 decode(num_per_pool_osds
, bl
);
506 num_per_pool_osds
= 0;
511 void osd_stat_t::generate_test_instances(std::list
<osd_stat_t
*>& o
)
513 o
.push_back(new osd_stat_t
);
515 o
.push_back(new osd_stat_t
);
516 list
<store_statfs_t
*> ll
;
517 store_statfs_t::generate_test_instances(ll
);
518 o
.back()->statfs
= *ll
.back();
519 o
.back()->hb_peers
.push_back(7);
520 o
.back()->snap_trim_queue_len
= 8;
521 o
.back()->num_snap_trimming
= 99;
522 o
.back()->num_shards_repaired
= 101;
523 o
.back()->os_alerts
[0].emplace(
524 "some alert", "some alert details");
525 o
.back()->os_alerts
[1].emplace(
526 "some alert2", "some alert2 details");
531 int pg_t::print(char *o
, int maxlen
) const
533 return snprintf(o
, maxlen
, "%llu.%x", (unsigned long long)pool(), ps());
536 bool pg_t::parse(const char *s
)
540 int r
= sscanf(s
, "%llu.%x", (long long unsigned *)&ppool
, &pseed
);
548 bool spg_t::parse(const char *s
)
550 shard
= shard_id_t::NO_SHARD
;
554 int r
= sscanf(s
, "%llu.%x", (long long unsigned *)&ppool
, &pseed
);
557 pgid
.set_pool(ppool
);
560 const char *p
= strchr(s
, 's');
562 r
= sscanf(p
, "s%u", &pshard
);
564 shard
= shard_id_t(pshard
);
572 char *spg_t::calc_name(char *buf
, const char *suffix_backwords
) const
574 while (*suffix_backwords
)
575 *--buf
= *suffix_backwords
++;
577 if (!is_no_shard()) {
578 buf
= ritoa
<uint8_t, 10>((uint8_t)shard
.id
, buf
);
582 return pgid
.calc_name(buf
, "");
585 ostream
& operator<<(ostream
& out
, const spg_t
&pg
)
587 char buf
[spg_t::calc_name_buf_size
];
588 buf
[spg_t::calc_name_buf_size
- 1] = '\0';
589 out
<< pg
.calc_name(buf
+ spg_t::calc_name_buf_size
- 1, "");
593 pg_t
pg_t::get_ancestor(unsigned old_pg_num
) const
595 int old_bits
= cbits(old_pg_num
);
596 int old_mask
= (1 << old_bits
) - 1;
598 ret
.m_seed
= ceph_stable_mod(m_seed
, old_pg_num
, old_mask
);
602 bool pg_t::is_split(unsigned old_pg_num
, unsigned new_pg_num
, set
<pg_t
> *children
) const
604 //ceph_assert(m_seed < old_pg_num);
605 if (m_seed
>= old_pg_num
) {
609 if (new_pg_num
<= old_pg_num
)
614 unsigned old_bits
= cbits(old_pg_num
);
615 unsigned old_mask
= (1 << old_bits
) - 1;
616 for (unsigned n
= 1; ; n
++) {
617 unsigned next_bit
= (n
<< (old_bits
-1));
618 unsigned s
= next_bit
| m_seed
;
620 if (s
< old_pg_num
|| s
== m_seed
)
624 if ((unsigned)ceph_stable_mod(s
, old_pg_num
, old_mask
) == m_seed
) {
627 children
->insert(pg_t(s
, m_pool
));
633 int old_bits
= cbits(old_pg_num
);
634 int old_mask
= (1 << old_bits
) - 1;
635 for (unsigned x
= old_pg_num
; x
< new_pg_num
; ++x
) {
636 unsigned o
= ceph_stable_mod(x
, old_pg_num
, old_mask
);
639 children
->insert(pg_t(x
, m_pool
));
646 unsigned pg_t::get_split_bits(unsigned pg_num
) const {
649 ceph_assert(pg_num
> 1);
651 // Find unique p such that pg_num \in [2^(p-1), 2^p)
652 unsigned p
= cbits(pg_num
);
653 ceph_assert(p
); // silence coverity #751330
655 if ((m_seed
% (1<<(p
-1))) < (pg_num
% (1<<(p
-1))))
661 bool pg_t::is_merge_source(
666 if (m_seed
< old_pg_num
&&
667 m_seed
>= new_pg_num
) {
670 while (t
.m_seed
>= new_pg_num
) {
680 pg_t
pg_t::get_parent() const
682 unsigned bits
= cbits(m_seed
);
685 retval
.m_seed
&= ~((~0)<<(bits
- 1));
689 hobject_t
pg_t::get_hobj_start() const
691 return hobject_t(object_t(), string(), 0, m_seed
, m_pool
,
695 hobject_t
pg_t::get_hobj_end(unsigned pg_num
) const
697 // note: this assumes a bitwise sort; with the legacy nibblewise
698 // sort a PG did not always cover a single contiguous range of the
699 // (bit-reversed) hash range.
700 unsigned bits
= get_split_bits(pg_num
);
701 uint64_t rev_start
= hobject_t::_reverse_bits(m_seed
);
702 uint64_t rev_end
= (rev_start
| (0xffffffff >> bits
)) + 1;
703 if (rev_end
>= 0x100000000) {
704 ceph_assert(rev_end
== 0x100000000);
705 return hobject_t::get_max();
707 return hobject_t(object_t(), string(), CEPH_NOSNAP
,
708 hobject_t::_reverse_bits(rev_end
), m_pool
,
713 void pg_t::dump(Formatter
*f
) const
715 f
->dump_unsigned("pool", m_pool
);
716 f
->dump_unsigned("seed", m_seed
);
719 void pg_t::generate_test_instances(list
<pg_t
*>& o
)
721 o
.push_back(new pg_t
);
722 o
.push_back(new pg_t(1, 2));
723 o
.push_back(new pg_t(13123, 3));
724 o
.push_back(new pg_t(131223, 4));
727 char *pg_t::calc_name(char *buf
, const char *suffix_backwords
) const
729 while (*suffix_backwords
)
730 *--buf
= *suffix_backwords
++;
732 buf
= ritoa
<uint32_t, 16>(m_seed
, buf
);
736 return ritoa
<uint64_t, 10>(m_pool
, buf
);
739 ostream
& operator<<(ostream
& out
, const pg_t
&pg
)
741 char buf
[pg_t::calc_name_buf_size
];
742 buf
[pg_t::calc_name_buf_size
- 1] = '\0';
743 out
<< pg
.calc_name(buf
+ pg_t::calc_name_buf_size
- 1, "");
750 void coll_t::calc_str()
754 strcpy(_str_buff
, "meta");
758 _str_buff
[spg_t::calc_name_buf_size
- 1] = '\0';
759 _str
= pgid
.calc_name(_str_buff
+ spg_t::calc_name_buf_size
- 1, "daeh_");
762 _str_buff
[spg_t::calc_name_buf_size
- 1] = '\0';
763 _str
= pgid
.calc_name(_str_buff
+ spg_t::calc_name_buf_size
- 1, "PMET_");
766 ceph_abort_msg("unknown collection type");
770 bool coll_t::parse(const std::string
& s
)
777 ceph_assert(s
== _str
);
780 if (s
.find("_head") == s
.length() - 5 &&
781 pgid
.parse(s
.substr(0, s
.length() - 5))) {
785 ceph_assert(s
== _str
);
788 if (s
.find("_TEMP") == s
.length() - 5 &&
789 pgid
.parse(s
.substr(0, s
.length() - 5))) {
793 ceph_assert(s
== _str
);
799 void coll_t::encode(bufferlist
& bl
) const
802 // when changing this, remember to update encoded_size() too.
804 // can't express this as v2...
806 encode(struct_v
, bl
);
807 encode(to_str(), bl
);
810 encode(struct_v
, bl
);
811 encode((__u8
)type
, bl
);
813 snapid_t snap
= CEPH_NOSNAP
;
818 size_t coll_t::encoded_size() const
820 size_t r
= sizeof(__u8
);
833 r
+= sizeof(ceph_le32
) + 2 * sizeof(__u8
);
835 r
+= sizeof(__u8
) + sizeof(uint64_t) + 2 * sizeof(uint32_t);
839 r
+= sizeof(uint64_t);
845 void coll_t::decode(bufferlist::const_iterator
& bl
)
849 decode(struct_v
, bl
);
858 if (pgid
== spg_t() && snap
== 0) {
874 type
= (type_t
)_type
;
883 bool ok
= parse(str
);
885 throw std::domain_error(std::string("unable to parse pg ") + str
);
892 oss
<< "coll_t::decode(): don't know how to decode version "
894 throw std::domain_error(oss
.str());
899 void coll_t::dump(Formatter
*f
) const
901 f
->dump_unsigned("type_id", (unsigned)type
);
902 if (type
!= TYPE_META
)
903 f
->dump_stream("pgid") << pgid
;
904 f
->dump_string("name", to_str());
907 void coll_t::generate_test_instances(list
<coll_t
*>& o
)
909 o
.push_back(new coll_t());
910 o
.push_back(new coll_t(spg_t(pg_t(1, 0), shard_id_t::NO_SHARD
)));
911 o
.push_back(new coll_t(o
.back()->get_temp()));
912 o
.push_back(new coll_t(spg_t(pg_t(3, 2), shard_id_t(12))));
913 o
.push_back(new coll_t(o
.back()->get_temp()));
914 o
.push_back(new coll_t());
919 std::string
pg_vector_string(const vector
<int32_t> &a
)
923 for (vector
<int32_t>::const_iterator i
= a
.begin(); i
!= a
.end(); ++i
) {
926 if (*i
!= CRUSH_ITEM_NONE
)
935 std::string
pg_state_string(uint64_t state
)
938 if (state
& PG_STATE_STALE
)
940 if (state
& PG_STATE_CREATING
)
942 if (state
& PG_STATE_ACTIVE
)
944 if (state
& PG_STATE_ACTIVATING
)
945 oss
<< "activating+";
946 if (state
& PG_STATE_CLEAN
)
948 if (state
& PG_STATE_RECOVERY_WAIT
)
949 oss
<< "recovery_wait+";
950 if (state
& PG_STATE_RECOVERY_TOOFULL
)
951 oss
<< "recovery_toofull+";
952 if (state
& PG_STATE_RECOVERING
)
953 oss
<< "recovering+";
954 if (state
& PG_STATE_FORCED_RECOVERY
)
955 oss
<< "forced_recovery+";
956 if (state
& PG_STATE_DOWN
)
958 if (state
& PG_STATE_RECOVERY_UNFOUND
)
959 oss
<< "recovery_unfound+";
960 if (state
& PG_STATE_BACKFILL_UNFOUND
)
961 oss
<< "backfill_unfound+";
962 if (state
& PG_STATE_UNDERSIZED
)
963 oss
<< "undersized+";
964 if (state
& PG_STATE_DEGRADED
)
966 if (state
& PG_STATE_REMAPPED
)
968 if (state
& PG_STATE_PREMERGE
)
970 if (state
& PG_STATE_SCRUBBING
)
972 if (state
& PG_STATE_DEEP_SCRUB
)
974 if (state
& PG_STATE_INCONSISTENT
)
975 oss
<< "inconsistent+";
976 if (state
& PG_STATE_PEERING
)
978 if (state
& PG_STATE_REPAIR
)
980 if (state
& PG_STATE_BACKFILL_WAIT
)
981 oss
<< "backfill_wait+";
982 if (state
& PG_STATE_BACKFILLING
)
983 oss
<< "backfilling+";
984 if (state
& PG_STATE_FORCED_BACKFILL
)
985 oss
<< "forced_backfill+";
986 if (state
& PG_STATE_BACKFILL_TOOFULL
)
987 oss
<< "backfill_toofull+";
988 if (state
& PG_STATE_INCOMPLETE
)
989 oss
<< "incomplete+";
990 if (state
& PG_STATE_PEERED
)
992 if (state
& PG_STATE_SNAPTRIM
)
994 if (state
& PG_STATE_SNAPTRIM_WAIT
)
995 oss
<< "snaptrim_wait+";
996 if (state
& PG_STATE_SNAPTRIM_ERROR
)
997 oss
<< "snaptrim_error+";
998 if (state
& PG_STATE_FAILED_REPAIR
)
999 oss
<< "failed_repair+";
1000 string
ret(oss
.str());
1001 if (ret
.length() > 0)
1002 ret
.resize(ret
.length() - 1);
1008 boost::optional
<uint64_t> pg_string_state(const std::string
& state
)
1010 boost::optional
<uint64_t> type
;
1011 if (state
== "active")
1012 type
= PG_STATE_ACTIVE
;
1013 else if (state
== "clean")
1014 type
= PG_STATE_CLEAN
;
1015 else if (state
== "down")
1016 type
= PG_STATE_DOWN
;
1017 else if (state
== "recovery_unfound")
1018 type
= PG_STATE_RECOVERY_UNFOUND
;
1019 else if (state
== "backfill_unfound")
1020 type
= PG_STATE_BACKFILL_UNFOUND
;
1021 else if (state
== "premerge")
1022 type
= PG_STATE_PREMERGE
;
1023 else if (state
== "scrubbing")
1024 type
= PG_STATE_SCRUBBING
;
1025 else if (state
== "degraded")
1026 type
= PG_STATE_DEGRADED
;
1027 else if (state
== "inconsistent")
1028 type
= PG_STATE_INCONSISTENT
;
1029 else if (state
== "peering")
1030 type
= PG_STATE_PEERING
;
1031 else if (state
== "repair")
1032 type
= PG_STATE_REPAIR
;
1033 else if (state
== "recovering")
1034 type
= PG_STATE_RECOVERING
;
1035 else if (state
== "forced_recovery")
1036 type
= PG_STATE_FORCED_RECOVERY
;
1037 else if (state
== "backfill_wait")
1038 type
= PG_STATE_BACKFILL_WAIT
;
1039 else if (state
== "incomplete")
1040 type
= PG_STATE_INCOMPLETE
;
1041 else if (state
== "stale")
1042 type
= PG_STATE_STALE
;
1043 else if (state
== "remapped")
1044 type
= PG_STATE_REMAPPED
;
1045 else if (state
== "deep")
1046 type
= PG_STATE_DEEP_SCRUB
;
1047 else if (state
== "backfilling")
1048 type
= PG_STATE_BACKFILLING
;
1049 else if (state
== "forced_backfill")
1050 type
= PG_STATE_FORCED_BACKFILL
;
1051 else if (state
== "backfill_toofull")
1052 type
= PG_STATE_BACKFILL_TOOFULL
;
1053 else if (state
== "recovery_wait")
1054 type
= PG_STATE_RECOVERY_WAIT
;
1055 else if (state
== "recovery_toofull")
1056 type
= PG_STATE_RECOVERY_TOOFULL
;
1057 else if (state
== "undersized")
1058 type
= PG_STATE_UNDERSIZED
;
1059 else if (state
== "activating")
1060 type
= PG_STATE_ACTIVATING
;
1061 else if (state
== "peered")
1062 type
= PG_STATE_PEERED
;
1063 else if (state
== "snaptrim")
1064 type
= PG_STATE_SNAPTRIM
;
1065 else if (state
== "snaptrim_wait")
1066 type
= PG_STATE_SNAPTRIM_WAIT
;
1067 else if (state
== "snaptrim_error")
1068 type
= PG_STATE_SNAPTRIM_ERROR
;
1069 else if (state
== "creating")
1070 type
= PG_STATE_CREATING
;
1071 else if (state
== "failed_repair")
1072 type
= PG_STATE_FAILED_REPAIR
;
1073 else if (state
== "unknown")
1081 string
eversion_t::get_key_name() const
1083 std::string
key(32, ' ');
1084 get_key_name(&key
[0]);
1085 key
.resize(31); // remove the null terminator
1089 // -- pool_snap_info_t --
1090 void pool_snap_info_t::dump(Formatter
*f
) const
1092 f
->dump_unsigned("snapid", snapid
);
1093 f
->dump_stream("stamp") << stamp
;
1094 f
->dump_string("name", name
);
1097 void pool_snap_info_t::encode(bufferlist
& bl
, uint64_t features
) const
1100 if ((features
& CEPH_FEATURE_PGPOOL3
) == 0) {
1102 encode(struct_v
, bl
);
1108 ENCODE_START(2, 2, bl
);
1115 void pool_snap_info_t::decode(bufferlist::const_iterator
& bl
)
1117 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl
);
1124 void pool_snap_info_t::generate_test_instances(list
<pool_snap_info_t
*>& o
)
1126 o
.push_back(new pool_snap_info_t
);
1127 o
.push_back(new pool_snap_info_t
);
1128 o
.back()->snapid
= 1;
1129 o
.back()->stamp
= utime_t(1, 2);
1130 o
.back()->name
= "foo";
1133 // -- pool_opts_t --
1135 typedef std::map
<std::string
, pool_opts_t::opt_desc_t
> opt_mapping_t
;
1136 static opt_mapping_t opt_mapping
= boost::assign::map_list_of
1137 ("scrub_min_interval", pool_opts_t::opt_desc_t(
1138 pool_opts_t::SCRUB_MIN_INTERVAL
, pool_opts_t::DOUBLE
))
1139 ("scrub_max_interval", pool_opts_t::opt_desc_t(
1140 pool_opts_t::SCRUB_MAX_INTERVAL
, pool_opts_t::DOUBLE
))
1141 ("deep_scrub_interval", pool_opts_t::opt_desc_t(
1142 pool_opts_t::DEEP_SCRUB_INTERVAL
, pool_opts_t::DOUBLE
))
1143 ("recovery_priority", pool_opts_t::opt_desc_t(
1144 pool_opts_t::RECOVERY_PRIORITY
, pool_opts_t::INT
))
1145 ("recovery_op_priority", pool_opts_t::opt_desc_t(
1146 pool_opts_t::RECOVERY_OP_PRIORITY
, pool_opts_t::INT
))
1147 ("scrub_priority", pool_opts_t::opt_desc_t(
1148 pool_opts_t::SCRUB_PRIORITY
, pool_opts_t::INT
))
1149 ("compression_mode", pool_opts_t::opt_desc_t(
1150 pool_opts_t::COMPRESSION_MODE
, pool_opts_t::STR
))
1151 ("compression_algorithm", pool_opts_t::opt_desc_t(
1152 pool_opts_t::COMPRESSION_ALGORITHM
, pool_opts_t::STR
))
1153 ("compression_required_ratio", pool_opts_t::opt_desc_t(
1154 pool_opts_t::COMPRESSION_REQUIRED_RATIO
, pool_opts_t::DOUBLE
))
1155 ("compression_max_blob_size", pool_opts_t::opt_desc_t(
1156 pool_opts_t::COMPRESSION_MAX_BLOB_SIZE
, pool_opts_t::INT
))
1157 ("compression_min_blob_size", pool_opts_t::opt_desc_t(
1158 pool_opts_t::COMPRESSION_MIN_BLOB_SIZE
, pool_opts_t::INT
))
1159 ("csum_type", pool_opts_t::opt_desc_t(
1160 pool_opts_t::CSUM_TYPE
, pool_opts_t::INT
))
1161 ("csum_max_block", pool_opts_t::opt_desc_t(
1162 pool_opts_t::CSUM_MAX_BLOCK
, pool_opts_t::INT
))
1163 ("csum_min_block", pool_opts_t::opt_desc_t(
1164 pool_opts_t::CSUM_MIN_BLOCK
, pool_opts_t::INT
))
1165 ("fingerprint_algorithm", pool_opts_t::opt_desc_t(
1166 pool_opts_t::FINGERPRINT_ALGORITHM
, pool_opts_t::STR
))
1167 ("pg_num_min", pool_opts_t::opt_desc_t(
1168 pool_opts_t::PG_NUM_MIN
, pool_opts_t::INT
))
1169 ("target_size_bytes", pool_opts_t::opt_desc_t(
1170 pool_opts_t::TARGET_SIZE_BYTES
, pool_opts_t::INT
))
1171 ("target_size_ratio", pool_opts_t::opt_desc_t(
1172 pool_opts_t::TARGET_SIZE_RATIO
, pool_opts_t::DOUBLE
))
1173 ("pg_autoscale_bias", pool_opts_t::opt_desc_t(
1174 pool_opts_t::PG_AUTOSCALE_BIAS
, pool_opts_t::DOUBLE
));
1176 bool pool_opts_t::is_opt_name(const std::string
& name
)
1178 return opt_mapping
.count(name
);
1181 pool_opts_t::opt_desc_t
pool_opts_t::get_opt_desc(const std::string
& name
)
1183 opt_mapping_t::iterator i
= opt_mapping
.find(name
);
1184 ceph_assert(i
!= opt_mapping
.end());
1188 bool pool_opts_t::is_set(pool_opts_t::key_t key
) const
1190 return opts
.count(key
);
1193 const pool_opts_t::value_t
& pool_opts_t::get(pool_opts_t::key_t key
) const
1195 opts_t::const_iterator i
= opts
.find(key
);
1196 ceph_assert(i
!= opts
.end());
1200 bool pool_opts_t::unset(pool_opts_t::key_t key
) {
1201 return opts
.erase(key
) > 0;
1204 class pool_opts_dumper_t
: public boost::static_visitor
<> {
1206 pool_opts_dumper_t(const std::string
& name_
, Formatter
* f_
) :
1207 name(name_
.c_str()), f(f_
) {}
1209 void operator()(std::string s
) const {
1210 f
->dump_string(name
, s
);
1212 void operator()(int64_t i
) const {
1213 f
->dump_int(name
, i
);
1215 void operator()(double d
) const {
1216 f
->dump_float(name
, d
);
1224 void pool_opts_t::dump(const std::string
& name
, Formatter
* f
) const
1226 const opt_desc_t
& desc
= get_opt_desc(name
);
1227 opts_t::const_iterator i
= opts
.find(desc
.key
);
1228 if (i
== opts
.end()) {
1231 boost::apply_visitor(pool_opts_dumper_t(name
, f
), i
->second
);
1234 void pool_opts_t::dump(Formatter
* f
) const
1236 for (opt_mapping_t::iterator i
= opt_mapping
.begin(); i
!= opt_mapping
.end();
1238 const std::string
& name
= i
->first
;
1239 const opt_desc_t
& desc
= i
->second
;
1240 opts_t::const_iterator j
= opts
.find(desc
.key
);
1241 if (j
== opts
.end()) {
1244 boost::apply_visitor(pool_opts_dumper_t(name
, f
), j
->second
);
1248 class pool_opts_encoder_t
: public boost::static_visitor
<> {
1250 explicit pool_opts_encoder_t(bufferlist
& bl_
, uint64_t features
)
1252 features(features
) {}
1254 void operator()(const std::string
&s
) const {
1255 encode(static_cast<int32_t>(pool_opts_t::STR
), bl
);
1258 void operator()(int64_t i
) const {
1259 encode(static_cast<int32_t>(pool_opts_t::INT
), bl
);
1260 if (HAVE_FEATURE(features
, SERVER_NAUTILUS
)) {
1263 encode(static_cast<int32_t>(i
), bl
);
1266 void operator()(double d
) const {
1267 encode(static_cast<int32_t>(pool_opts_t::DOUBLE
), bl
);
1276 void pool_opts_t::encode(bufferlist
& bl
, uint64_t features
) const
1279 if (!HAVE_FEATURE(features
, SERVER_NAUTILUS
)) {
1282 ENCODE_START(v
, 1, bl
);
1283 uint32_t n
= static_cast<uint32_t>(opts
.size());
1285 for (opts_t::const_iterator i
= opts
.begin(); i
!= opts
.end(); ++i
) {
1286 encode(static_cast<int32_t>(i
->first
), bl
);
1287 boost::apply_visitor(pool_opts_encoder_t(bl
, features
), i
->second
);
1292 void pool_opts_t::decode(bufferlist::const_iterator
& bl
)
1294 DECODE_START(1, bl
);
1305 opts
[static_cast<key_t
>(k
)] = s
;
1306 } else if (t
== INT
) {
1308 if (struct_v
>= 2) {
1315 opts
[static_cast<key_t
>(k
)] = i
;
1316 } else if (t
== DOUBLE
) {
1319 opts
[static_cast<key_t
>(k
)] = d
;
1321 ceph_assert(!"invalid type");
1327 ostream
& operator<<(ostream
& out
, const pool_opts_t
& opts
)
1329 for (opt_mapping_t::iterator i
= opt_mapping
.begin(); i
!= opt_mapping
.end();
1331 const std::string
& name
= i
->first
;
1332 const pool_opts_t::opt_desc_t
& desc
= i
->second
;
1333 pool_opts_t::opts_t::const_iterator j
= opts
.opts
.find(desc
.key
);
1334 if (j
== opts
.opts
.end()) {
1337 out
<< " " << name
<< " " << j
->second
;
// Well-known pool "application" tags for the built-in Ceph clients.
// NOTE(review): these exact strings appear to be user/ecosystem-visible
// identifiers — confirm against consumers before ever changing them.
1344 const char *pg_pool_t::APPLICATION_NAME_CEPHFS("cephfs");
1345 const char *pg_pool_t::APPLICATION_NAME_RBD("rbd");
1346 const char *pg_pool_t::APPLICATION_NAME_RGW("rgw");
// Dump every pool property to the given Formatter (used by "ceph osd pool
// ls detail -f json" and friends).  Fields are emitted in declaration-ish
// order: identity/flags, pg counts, snaps, quotas, tiering/cache settings,
// hit-set config, and per-application metadata.
// NOTE(review): this extraction is garbled (one token per physical line,
// original line numbers fused into the text) and several original lines
// (e.g. 1379-1382, 1418, 1423, 1430-1435: snap-loop body and close_section
// calls) are missing; code left byte-identical pending recovery of the
// original file.
1348 void pg_pool_t::dump(Formatter
*f
) const
1350 f
->dump_stream("create_time") << get_create_time();
1351 f
->dump_unsigned("flags", get_flags());
1352 f
->dump_string("flags_names", get_flags_string());
1353 f
->dump_int("type", get_type());
1354 f
->dump_int("size", get_size());
1355 f
->dump_int("min_size", get_min_size());
1356 f
->dump_int("crush_rule", get_crush_rule());
1357 f
->dump_int("object_hash", get_object_hash());
1358 f
->dump_string("pg_autoscale_mode",
1359 get_pg_autoscale_mode_name(pg_autoscale_mode
));
1360 f
->dump_unsigned("pg_num", get_pg_num());
1361 f
->dump_unsigned("pg_placement_num", get_pgp_num());
1362 f
->dump_unsigned("pg_placement_num_target", get_pgp_num_target());
1363 f
->dump_unsigned("pg_num_target", get_pg_num_target());
1364 f
->dump_unsigned("pg_num_pending", get_pg_num_pending());
1365 f
->dump_object("last_pg_merge_meta", last_pg_merge_meta
);
1366 f
->dump_stream("last_change") << get_last_change();
1367 f
->dump_stream("last_force_op_resend") << get_last_force_op_resend();
1368 f
->dump_stream("last_force_op_resend_prenautilus")
1369 << get_last_force_op_resend_prenautilus();
1370 f
->dump_stream("last_force_op_resend_preluminous")
1371 << get_last_force_op_resend_preluminous();
1372 f
->dump_unsigned("auid", get_auid());
1373 f
->dump_string("snap_mode", is_pool_snaps_mode() ? "pool" : "selfmanaged");
1374 f
->dump_unsigned("snap_seq", get_snap_seq());
1375 f
->dump_unsigned("snap_epoch", get_snap_epoch());
// One entry per pool-mode snapshot.
1376 f
->open_array_section("pool_snaps");
1377 for (map
<snapid_t
, pool_snap_info_t
>::const_iterator p
= snaps
.begin(); p
!= snaps
.end(); ++p
) {
1378 f
->open_object_section("pool_snap_info");
// NOTE(review): snap dump + close_section lines (1379-1382) missing here.
1383 f
->dump_stream("removed_snaps") << removed_snaps
;
1384 f
->dump_unsigned("quota_max_bytes", quota_max_bytes
);
1385 f
->dump_unsigned("quota_max_objects", quota_max_objects
);
1386 f
->open_array_section("tiers");
1387 for (set
<uint64_t>::const_iterator p
= tiers
.begin(); p
!= tiers
.end(); ++p
)
1388 f
->dump_unsigned("pool_id", *p
);
1390 f
->dump_int("tier_of", tier_of
);
1391 f
->dump_int("read_tier", read_tier
);
1392 f
->dump_int("write_tier", write_tier
);
1393 f
->dump_string("cache_mode", get_cache_mode_name());
1394 f
->dump_unsigned("target_max_bytes", target_max_bytes
);
1395 f
->dump_unsigned("target_max_objects", target_max_objects
);
1396 f
->dump_unsigned("cache_target_dirty_ratio_micro",
1397 cache_target_dirty_ratio_micro
);
1398 f
->dump_unsigned("cache_target_dirty_high_ratio_micro",
1399 cache_target_dirty_high_ratio_micro
);
1400 f
->dump_unsigned("cache_target_full_ratio_micro",
1401 cache_target_full_ratio_micro
);
1402 f
->dump_unsigned("cache_min_flush_age", cache_min_flush_age
);
1403 f
->dump_unsigned("cache_min_evict_age", cache_min_evict_age
);
1404 f
->dump_string("erasure_code_profile", erasure_code_profile
);
1405 f
->open_object_section("hit_set_params");
1406 hit_set_params
.dump(f
);
1407 f
->close_section(); // hit_set_params
1408 f
->dump_unsigned("hit_set_period", hit_set_period
);
1409 f
->dump_unsigned("hit_set_count", hit_set_count
);
1410 f
->dump_bool("use_gmt_hitset", use_gmt_hitset
);
1411 f
->dump_unsigned("min_read_recency_for_promote", min_read_recency_for_promote
);
1412 f
->dump_unsigned("min_write_recency_for_promote", min_write_recency_for_promote
);
1413 f
->dump_unsigned("hit_set_grade_decay_rate", hit_set_grade_decay_rate
);
1414 f
->dump_unsigned("hit_set_search_last_n", hit_set_search_last_n
);
1415 f
->open_array_section("grade_table");
1416 for (unsigned i
= 0; i
< hit_set_count
; ++i
)
1417 f
->dump_unsigned("value", get_grade(i
));
1419 f
->dump_unsigned("stripe_width", get_stripe_width());
1420 f
->dump_unsigned("expected_num_objects", expected_num_objects
);
1421 f
->dump_bool("fast_read", fast_read
);
1422 f
->open_object_section("options");
1424 f
->close_section(); // options
// Nested sections: one per application, containing its key/value metadata.
1425 f
->open_object_section("application_metadata");
1426 for (auto &app_pair
: application_metadata
) {
1427 f
->open_object_section(app_pair
.first
.c_str());
1428 for (auto &kv_pair
: app_pair
.second
) {
1429 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
1431 f
->close_section(); // application
1433 f
->close_section(); // application_metadata
// Convert a raw CRUSH acting/up vector into a set of pg_shard_t.
// CRUSH_ITEM_NONE entries are skipped; for erasure-coded pools the vector
// index becomes the shard id, for replicated pools NO_SHARD is used.
// NOTE(review): the to->insert(pg_shard_t(from[i], ...)) line (original
// 1439-1441) is missing from this extraction; code left byte-identical.
1436 void pg_pool_t::convert_to_pg_shards(const vector
<int> &from
, set
<pg_shard_t
>* to
) const {
1437 for (size_t i
= 0; i
< from
.size(); ++i
) {
1438 if (from
[i
] != CRUSH_ITEM_NONE
) {
1442 is_erasure() ? shard_id_t(i
) : shard_id_t::NO_SHARD
));
1447 void pg_pool_t::calc_pg_masks()
1449 pg_num_mask
= (1 << cbits(pg_num
-1)) - 1;
1450 pgp_num_mask
= (1 << cbits(pgp_num
-1)) - 1;
1453 unsigned pg_pool_t::get_pg_num_divisor(pg_t pgid
) const
1455 if (pg_num
== pg_num_mask
+ 1)
1456 return pg_num
; // power-of-2 split
1457 unsigned mask
= pg_num_mask
>> 1;
1458 if ((pgid
.ps() & mask
) < (pg_num
& mask
))
1459 return pg_num_mask
+ 1; // smaller bin size (already split)
1461 return (pg_num_mask
+ 1) >> 1; // bigger bin (not yet split)
// Return whether pgid participates in a pending PG merge (pg_num_pending
// < pg_num).  A PG is either a merge *source* (its ps falls in the range
// being merged away) or a merge *target* (some source PG's parent); the
// optional out-param distinguishes the two — presumably *target is set
// true for targets and false for sources, TODO confirm (the assignment
// lines are missing from this extraction).
// NOTE(review): original lines 1467-1468, 1470-1474 and 1477-1486
// (return statements and *target assignments) are missing; code left
// byte-identical.
1464 bool pg_pool_t::is_pending_merge(pg_t pgid
, bool *target
) const
1466 if (pg_num_pending
>= pg_num
) {
1469 if (pgid
.ps() >= pg_num_pending
&& pgid
.ps() < pg_num
) {
1475 for (unsigned ps
= pg_num_pending
; ps
< pg_num
; ++ps
) {
1476 if (pg_t(ps
, pgid
.pool()).get_parent() == pgid
) {
1487 * we have two snap modes:
1489 * - snap existence/non-existence defined by snaps[] and snap_seq
1490 * - user managed snaps
1491 * - existence tracked by librados user
1493 bool pg_pool_t::is_pool_snaps_mode() const
1495 return has_flag(FLAG_POOL_SNAPS
);
1498 bool pg_pool_t::is_unmanaged_snaps_mode() const
1500 return has_flag(FLAG_SELFMANAGED_SNAPS
);
1503 bool pg_pool_t::is_removed_snap(snapid_t s
) const
1505 if (is_pool_snaps_mode())
1506 return s
<= get_snap_seq() && snaps
.count(s
) == 0;
1508 return removed_snaps
.contains(s
);
1512 * build set of known-removed sets from either pool snaps or
1513 * explicit removed_snaps set.
// Build the set of known-removed snapshots into rs: in pool-snaps mode,
// every id in [1, snap_seq] that is absent from snaps[]; otherwise
// (self-managed) presumably rs is copied from removed_snaps — TODO
// confirm, the else branch is missing from this extraction.
// NOTE(review): original lines 1518 (rs.clear()), 1521 (rs.insert(s))
// and 1522+ (else branch, closing braces) are missing; code left
// byte-identical.
1515 void pg_pool_t::build_removed_snaps(interval_set
<snapid_t
>& rs
) const
1517 if (is_pool_snaps_mode()) {
1519 for (snapid_t s
= 1; s
<= get_snap_seq(); s
= s
+ 1)
1520 if (snaps
.count(s
) == 0)
// Cheap check whether removed_snaps may differ from a cached copy.  In
// unmanaged-snaps mode only the range_end needs comparing, because
// remove_unmanaged_snap() always increments range_end; emptiness is
// compared first since range_end is undefined on an empty set.
// NOTE(review): the fallthrough return for non-unmanaged mode (original
// 1533-1535, presumably "return true;") is missing from this extraction;
// code left byte-identical.
1527 bool pg_pool_t::maybe_updated_removed_snaps(const interval_set
<snapid_t
>& cached
) const
1529 if (is_unmanaged_snaps_mode()) { // remove_unmanaged_snap increments range_end
1530 if (removed_snaps
.empty() || cached
.empty()) // range_end is undefined
1531 return removed_snaps
.empty() != cached
.empty();
1532 return removed_snaps
.range_end() != cached
.range_end();
// Linear search of pool snapshots by name; returns the matching snapid.
// NOTE(review): the loop's end condition/increment (original 1540-1541)
// and the not-found return (presumably snapid_t(0), original 1544+) are
// missing from this extraction; code left byte-identical.
1537 snapid_t
pg_pool_t::snap_exists(const char *s
) const
1539 for (map
<snapid_t
,pool_snap_info_t
>::const_iterator p
= snaps
.begin();
1542 if (p
->second
.name
== s
)
1543 return p
->second
.snapid
;
// Create a new pool-mode snapshot named n with the given timestamp.
// Switches the pool into pool-snaps mode (asserts it is not already
// self-managed) and allocates the next snapid after snap_seq.
// NOTE(review): original lines 1552 (presumably snap_seq = s;) and 1554
// (presumably snaps[s].name = n;) are missing from this extraction; code
// left byte-identical.
1547 void pg_pool_t::add_snap(const char *n
, utime_t stamp
)
1549 ceph_assert(!is_unmanaged_snaps_mode());
1550 flags
|= FLAG_POOL_SNAPS
;
1551 snapid_t s
= get_snap_seq() + 1;
1553 snaps
[s
].snapid
= s
;
1555 snaps
[s
].stamp
= stamp
;
// Allocate a new self-managed snapid (returned through the out-param).
// Asserts the pool is not in pool-snaps mode, marks it self-managed, and
// bumps snap_seq.  The first allocation also inserts snapid 1 into
// removed_snaps as a pre-mimic compatibility marker (see comment below).
// NOTE(review): original line 1566 (inside the snap_seq==0 branch,
// presumably snap_seq = 1;) is missing from this extraction; code left
// byte-identical.
1558 void pg_pool_t::add_unmanaged_snap(uint64_t& snapid
)
1560 ceph_assert(!is_pool_snaps_mode());
1561 if (snap_seq
== 0) {
1562 // kludge for pre-mimic tracking of pool vs selfmanaged snaps. after
1563 // mimic this field is not decoded but our flag is set; pre-mimic, we
1564 // have a non-empty removed_snaps to signifiy a non-pool-snaps pool.
1565 removed_snaps
.insert(snapid_t(1));
1568 flags
|= FLAG_SELFMANAGED_SNAPS
;
1569 snapid
= snap_seq
= snap_seq
+ 1;
// Delete pool-mode snapshot s.  snap_seq is bumped so the change is
// visible to clients.
// NOTE(review): original line 1575 (presumably snaps.erase(s);) is
// missing from this extraction; code left byte-identical.
1572 void pg_pool_t::remove_snap(snapid_t s
)
1574 ceph_assert(snaps
.count(s
));
1576 snap_seq
= snap_seq
+ 1;
1579 void pg_pool_t::remove_unmanaged_snap(snapid_t s
)
1581 ceph_assert(is_unmanaged_snaps_mode());
1582 removed_snaps
.insert(s
);
1583 snap_seq
= snap_seq
+ 1;
1584 // try to add in the new seq, just to try to keep the interval_set contiguous
1585 if (!removed_snaps
.contains(get_snap_seq())) {
1586 removed_snaps
.insert(get_snap_seq());
// Build a SnapContext for this pool: current snap_seq plus all pool
// snapids.  Iteration is in reverse so the vector is ordered
// newest-first, as SnapContext requires.
// NOTE(review): the loop's end condition and body (original 1595-1597,
// presumably filling s[i++] = p->first) are missing from this
// extraction; code left byte-identical.
1590 SnapContext
pg_pool_t::get_snap_context() const
1592 vector
<snapid_t
> s(snaps
.size());
1594 for (map
<snapid_t
, pool_snap_info_t
>::const_reverse_iterator p
= snaps
.rbegin();
1598 return SnapContext(get_snap_seq(), s
);
// Hash an object key (optionally namespaced) with the pool's configured
// object_hash function.  With a namespace the hashed buffer is
// "<ns><sep><key>" (nsl + 1 + key.length() bytes); the early return at
// 1604 presumably covers the empty-namespace case — TODO confirm, the
// guarding if (original 1602-1603) is missing.
// NOTE(review): original lines 1607 (buf declaration) and 1609 (the
// separator byte at buf[nsl]) are also missing from this extraction;
// code left byte-identical.
1601 uint32_t pg_pool_t::hash_key(const string
& key
, const string
& ns
) const
1604 return ceph_str_hash(object_hash
, key
.data(), key
.length());
1605 int nsl
= ns
.length();
1606 int len
= key
.length() + nsl
+ 1;
1608 memcpy(&buf
[0], ns
.data(), nsl
);
1610 memcpy(&buf
[nsl
+1], key
.data(), key
.length());
1611 return ceph_str_hash(object_hash
, &buf
[0], len
);
1614 uint32_t pg_pool_t::raw_hash_to_pg(uint32_t v
) const
1616 return ceph_stable_mod(v
, pg_num
, pg_num_mask
);
1620 * map a raw pg (with full precision ps) into an actual pg, for storage
// Map a raw pg (full-precision ps) onto an actual pg by stable-modding
// the ps into [0, pg_num).
// NOTE(review): the trailing "return pg;" (original 1625) is missing
// from this extraction; code left byte-identical.
1622 pg_t
pg_pool_t::raw_pg_to_pg(pg_t pg
) const
1624 pg
.set_ps(ceph_stable_mod(pg
.ps(), pg_num
, pg_num_mask
));
1629 * map raw pg (full precision ps) into a placement seed. include
1630 * pool id in that value so that different pools don't use the same
// Map a raw pg onto a placement seed (pps).  With FLAG_HASHPSPOOL the
// pool id is mixed in via crush_hash32_2 so different pools' PGs do not
// overlap; the legacy path simply adds the pool id to the stable-modded
// ps (so pool PGs overlap: 0.5 == 1.4 == ...).
// NOTE(review): the return keywords / pool-id operands of both branches
// (original 1637, 1640-1641, 1645, 1647-1648) are missing from this
// extraction; code left byte-identical.
1633 ps_t
pg_pool_t::raw_pg_to_pps(pg_t pg
) const
1635 if (flags
& FLAG_HASHPSPOOL
) {
1636 // Hash the pool id so that pool PGs do not overlap.
1638 crush_hash32_2(CRUSH_HASH_RJENKINS1
,
1639 ceph_stable_mod(pg
.ps(), pgp_num
, pgp_num_mask
),
1642 // Legacy behavior; add ps and pool together. This is not a great
1643 // idea because the PGs from each pool will essentially overlap on
1644 // top of each other: 0.5 == 1.4 == 2.3 == ...
1646 ceph_stable_mod(pg
.ps(), pgp_num
, pgp_num_mask
) +
// Produce a pseudo-random hash value that maps back onto the given pg.
// Starts from a seeded Jenkins hash, then (for non-power-of-two pg_num)
// adjusts for the split/unsplit bin size of this particular pg.
// NOTE(review): the power-of-two fast path body and the bin-size
// adjustment/return logic (original 1655-1656, 1659-1666) are missing
// from this extraction; code left byte-identical.
1651 uint32_t pg_pool_t::get_random_pg_position(pg_t pg
, uint32_t seed
) const
1653 uint32_t r
= crush_hash32_2(CRUSH_HASH_RJENKINS1
, seed
, 123);
1654 if (pg_num
== pg_num_mask
+ 1) {
1657 unsigned smaller_mask
= pg_num_mask
>> 1;
1658 if ((pg
.ps() & smaller_mask
) < (pg_num
& smaller_mask
)) {
// Feature-dependent wire encoding of pg_pool_t.  There are four layouts,
// selected by the peer's feature bits, oldest first: (1) pre-PGPOOL3
// "struct ceph_pg_pool" compatible, (2) pre-OSDENC, (3) pre-
// OSD_POOLRESEND (v14), and (4) the current versioned encoding whose
// version v is capped per missing SERVER_* feature.  Each legacy branch
// must stay byte-identical across monitors to avoid scrub noise.
// NOTE(review): this extraction is garbled and drops many lines in every
// branch (e.g. size/pg_num/flags encodes, ENCODE_FINISH, returns, the
// version-selection assignments at 1777-1786); code left byte-identical
// pending recovery of the original file.
1668 void pg_pool_t::encode(bufferlist
& bl
, uint64_t features
) const
// Branch 1: peer lacks CEPH_FEATURE_PGPOOL3 — emit the old fixed struct.
1671 if ((features
& CEPH_FEATURE_PGPOOL3
) == 0) {
1672 // this encoding matches the old struct ceph_pg_pool
1674 encode(struct_v
, bl
);
1677 encode(crush_rule
, bl
);
1678 encode(object_hash
, bl
);
1680 encode(pgp_num
, bl
);
1681 __u32 lpg_num
= 0, lpgp_num
= 0; // tell old code that there are no localized pgs.
1682 encode(lpg_num
, bl
);
1683 encode(lpgp_num
, bl
);
1684 encode(last_change
, bl
);
1685 encode(snap_seq
, bl
);
1686 encode(snap_epoch
, bl
);
1688 __u32 n
= snaps
.size();
1690 n
= removed_snaps
.num_intervals();
1695 encode_nohead(snaps
, bl
, features
);
1696 encode_nohead(removed_snaps
, bl
);
// Branch 2: peer lacks CEPH_FEATURE_OSDENC — un-versioned encoding.
1700 if ((features
& CEPH_FEATURE_OSDENC
) == 0) {
1702 encode(struct_v
, bl
);
1705 encode(crush_rule
, bl
);
1706 encode(object_hash
, bl
);
1708 encode(pgp_num
, bl
);
1709 __u32 lpg_num
= 0, lpgp_num
= 0; // tell old code that there are no localized pgs.
1710 encode(lpg_num
, bl
);
1711 encode(lpgp_num
, bl
);
1712 encode(last_change
, bl
);
1713 encode(snap_seq
, bl
);
1714 encode(snap_epoch
, bl
);
1715 encode(snaps
, bl
, features
);
1716 encode(removed_snaps
, bl
);
1719 encode((uint32_t)0, bl
); // crash_replay_interval
// Branch 3: peer lacks CEPH_FEATURE_OSD_POOLRESEND — frozen v14 layout.
1723 if ((features
& CEPH_FEATURE_OSD_POOLRESEND
) == 0) {
1724 // we simply added last_force_op_resend here, which is a fully
1725 // backward compatible change. however, encoding the same map
1726 // differently between monitors triggers scrub noise (even though
1727 // they are decodable without the feature), so let's be pendantic
1729 ENCODE_START(14, 5, bl
);
1732 encode(crush_rule
, bl
);
1733 encode(object_hash
, bl
);
1735 encode(pgp_num
, bl
);
1736 __u32 lpg_num
= 0, lpgp_num
= 0; // tell old code that there are no localized pgs.
1737 encode(lpg_num
, bl
);
1738 encode(lpgp_num
, bl
);
1739 encode(last_change
, bl
);
1740 encode(snap_seq
, bl
);
1741 encode(snap_epoch
, bl
);
1742 encode(snaps
, bl
, features
);
1743 encode(removed_snaps
, bl
);
1746 encode((uint32_t)0, bl
); // crash_replay_interval
1747 encode(min_size
, bl
);
1748 encode(quota_max_bytes
, bl
);
1749 encode(quota_max_objects
, bl
);
1751 encode(tier_of
, bl
);
1752 __u8 c
= cache_mode
;
1754 encode(read_tier
, bl
);
1755 encode(write_tier
, bl
);
1756 encode(properties
, bl
);
1757 encode(hit_set_params
, bl
);
1758 encode(hit_set_period
, bl
);
1759 encode(hit_set_count
, bl
);
1760 encode(stripe_width
, bl
);
1761 encode(target_max_bytes
, bl
);
1762 encode(target_max_objects
, bl
);
1763 encode(cache_target_dirty_ratio_micro
, bl
);
1764 encode(cache_target_full_ratio_micro
, bl
);
1765 encode(cache_min_flush_age
, bl
);
1766 encode(cache_min_evict_age
, bl
);
1767 encode(erasure_code_profile
, bl
);
// Branch 4: current versioned encoding.  v is lowered for peers missing
// newer SERVER_* features (assignments missing from this extraction).
1773 // NOTE: any new encoding dependencies must be reflected by
1774 // SIGNIFICANT_FEATURES
1775 if (!(features
& CEPH_FEATURE_NEW_OSDOP_ENCODING
)) {
1776 // this was the first post-hammer thing we added; if it's missing, encode
1779 } else if (!HAVE_FEATURE(features
, SERVER_LUMINOUS
)) {
1781 } else if (!HAVE_FEATURE(features
, SERVER_MIMIC
)) {
1783 } else if (!HAVE_FEATURE(features
, SERVER_NAUTILUS
)) {
1787 ENCODE_START(v
, 5, bl
);
1790 encode(crush_rule
, bl
);
1791 encode(object_hash
, bl
);
1793 encode(pgp_num
, bl
);
1794 __u32 lpg_num
= 0, lpgp_num
= 0; // tell old code that there are no localized pgs.
1795 encode(lpg_num
, bl
);
1796 encode(lpgp_num
, bl
);
1797 encode(last_change
, bl
);
1798 encode(snap_seq
, bl
);
1799 encode(snap_epoch
, bl
);
1800 encode(snaps
, bl
, features
);
1801 encode(removed_snaps
, bl
);
// Internal-only flags are stripped before encoding for older peers.
1807 tmp
&= ~(FLAG_SELFMANAGED_SNAPS
| FLAG_POOL_SNAPS
| FLAG_CREATING
);
1810 encode((uint32_t)0, bl
); // crash_replay_interval
1811 encode(min_size
, bl
);
1812 encode(quota_max_bytes
, bl
);
1813 encode(quota_max_objects
, bl
);
1815 encode(tier_of
, bl
);
1816 __u8 c
= cache_mode
;
1818 encode(read_tier
, bl
);
1819 encode(write_tier
, bl
);
1820 encode(properties
, bl
);
1821 encode(hit_set_params
, bl
);
1822 encode(hit_set_period
, bl
);
1823 encode(hit_set_count
, bl
);
1824 encode(stripe_width
, bl
);
1825 encode(target_max_bytes
, bl
);
1826 encode(target_max_objects
, bl
);
1827 encode(cache_target_dirty_ratio_micro
, bl
);
1828 encode(cache_target_full_ratio_micro
, bl
);
1829 encode(cache_min_flush_age
, bl
);
1830 encode(cache_min_evict_age
, bl
);
1831 encode(erasure_code_profile
, bl
);
1832 encode(last_force_op_resend_preluminous
, bl
);
1833 encode(min_read_recency_for_promote
, bl
);
1834 encode(expected_num_objects
, bl
);
1836 encode(cache_target_dirty_high_ratio_micro
, bl
);
1839 encode(min_write_recency_for_promote
, bl
);
1842 encode(use_gmt_hitset
, bl
);
1845 encode(fast_read
, bl
);
1848 encode(hit_set_grade_decay_rate
, bl
);
1849 encode(hit_set_search_last_n
, bl
);
1852 encode(opts
, bl
, features
);
1855 encode(last_force_op_resend_prenautilus
, bl
);
1858 encode(application_metadata
, bl
);
1861 encode(create_time
, bl
);
1864 encode(pg_num_target
, bl
);
1865 encode(pgp_num_target
, bl
);
1866 encode(pg_num_pending
, bl
);
1867 encode((epoch_t
)0, bl
); // pg_num_dec_last_epoch_started from 14.1.[01]
1868 encode((epoch_t
)0, bl
); // pg_num_dec_last_epoch_clean from 14.1.[01]
1869 encode(last_force_op_resend
, bl
);
1870 encode(pg_autoscale_mode
, bl
);
1873 encode(last_pg_merge_meta
, bl
);
// Versioned decode of pg_pool_t (up to struct v29, compat 5).  Each
// "struct_v >= N" guard decodes a field added at that version; the else
// branches backfill pre-N defaults so upgraded maps behave sensibly.
// NOTE(review): this extraction is garbled and drops many lines (type/
// size/pg_num/flags decodes, several else branches, the v18/v24 blocks,
// calc_pg_masks()/DECODE_FINISH at the end); code left byte-identical
// pending recovery of the original file.
1878 void pg_pool_t::decode(bufferlist::const_iterator
& bl
)
1880 DECODE_START_LEGACY_COMPAT_LEN(29, 5, 5, bl
);
1883 decode(crush_rule
, bl
);
1884 decode(object_hash
, bl
);
1886 decode(pgp_num
, bl
);
// Localized PGs were removed long ago; their counts are read and dropped.
1888 __u32 lpg_num
, lpgp_num
;
1889 decode(lpg_num
, bl
);
1890 decode(lpgp_num
, bl
);
1892 decode(last_change
, bl
);
1893 decode(snap_seq
, bl
);
1894 decode(snap_epoch
, bl
);
1896 if (struct_v
>= 3) {
1898 decode(removed_snaps
, bl
);
1905 decode_nohead(n
, snaps
, bl
);
1906 decode_nohead(m
, removed_snaps
, bl
);
// crash_replay_interval was removed; old encodings still carry it.
1909 if (struct_v
>= 4) {
1911 uint32_t crash_replay_interval
;
1912 decode(crash_replay_interval
, bl
);
1916 // upgrade path for selfmanaged vs pool snaps
1917 if (snap_seq
> 0 && (flags
& (FLAG_SELFMANAGED_SNAPS
|FLAG_POOL_SNAPS
)) == 0) {
1918 if (!removed_snaps
.empty()) {
1919 flags
|= FLAG_SELFMANAGED_SNAPS
;
1921 flags
|= FLAG_POOL_SNAPS
;
1924 if (struct_v
>= 7) {
1925 decode(min_size
, bl
);
// Pre-v7 default: min_size is a majority of size.
1927 min_size
= size
- size
/2;
1929 if (struct_v
>= 8) {
1930 decode(quota_max_bytes
, bl
);
1931 decode(quota_max_objects
, bl
);
1933 if (struct_v
>= 9) {
1935 decode(tier_of
, bl
);
1938 cache_mode
= (cache_mode_t
)v
;
1939 decode(read_tier
, bl
);
1940 decode(write_tier
, bl
);
1942 if (struct_v
>= 10) {
1943 decode(properties
, bl
);
1945 if (struct_v
>= 11) {
1946 decode(hit_set_params
, bl
);
1947 decode(hit_set_period
, bl
);
1948 decode(hit_set_count
, bl
);
// Pre-v11: fall back to a default-constructed pool's hit-set settings.
1951 hit_set_period
= def
.hit_set_period
;
1952 hit_set_count
= def
.hit_set_count
;
1954 if (struct_v
>= 12) {
1955 decode(stripe_width
, bl
);
1957 set_stripe_width(0);
1959 if (struct_v
>= 13) {
1960 decode(target_max_bytes
, bl
);
1961 decode(target_max_objects
, bl
);
1962 decode(cache_target_dirty_ratio_micro
, bl
);
1963 decode(cache_target_full_ratio_micro
, bl
);
1964 decode(cache_min_flush_age
, bl
);
1965 decode(cache_min_evict_age
, bl
);
1967 target_max_bytes
= 0;
1968 target_max_objects
= 0;
1969 cache_target_dirty_ratio_micro
= 0;
1970 cache_target_full_ratio_micro
= 0;
1971 cache_min_flush_age
= 0;
1972 cache_min_evict_age
= 0;
1974 if (struct_v
>= 14) {
1975 decode(erasure_code_profile
, bl
);
1977 if (struct_v
>= 15) {
1978 decode(last_force_op_resend_preluminous
, bl
);
1980 last_force_op_resend_preluminous
= 0;
1982 if (struct_v
>= 16) {
1983 decode(min_read_recency_for_promote
, bl
);
1985 min_read_recency_for_promote
= 1;
1987 if (struct_v
>= 17) {
1988 decode(expected_num_objects
, bl
);
1990 expected_num_objects
= 0;
1992 if (struct_v
>= 19) {
1993 decode(cache_target_dirty_high_ratio_micro
, bl
);
1995 cache_target_dirty_high_ratio_micro
= cache_target_dirty_ratio_micro
;
1997 if (struct_v
>= 20) {
1998 decode(min_write_recency_for_promote
, bl
);
2000 min_write_recency_for_promote
= 1;
2002 if (struct_v
>= 21) {
2003 decode(use_gmt_hitset
, bl
);
2005 use_gmt_hitset
= false;
2007 if (struct_v
>= 22) {
2008 decode(fast_read
, bl
);
2012 if (struct_v
>= 23) {
2013 decode(hit_set_grade_decay_rate
, bl
);
2014 decode(hit_set_search_last_n
, bl
);
2016 hit_set_grade_decay_rate
= 0;
2017 hit_set_search_last_n
= 1;
2019 if (struct_v
>= 24) {
2022 if (struct_v
>= 25) {
2023 decode(last_force_op_resend_prenautilus
, bl
);
2025 last_force_op_resend_prenautilus
= last_force_op_resend_preluminous
;
2027 if (struct_v
>= 26) {
2028 decode(application_metadata
, bl
);
2030 if (struct_v
>= 27) {
2031 decode(create_time
, bl
);
2033 if (struct_v
>= 28) {
2034 decode(pg_num_target
, bl
);
2035 decode(pgp_num_target
, bl
);
2036 decode(pg_num_pending
, bl
);
// v28's per-merge epochs were folded into last_pg_merge_meta in v29.
2037 epoch_t old_merge_last_epoch_clean
, old_merge_last_epoch_started
;
2038 decode(old_merge_last_epoch_started
, bl
);
2039 decode(old_merge_last_epoch_clean
, bl
);
2040 decode(last_force_op_resend
, bl
);
2041 decode(pg_autoscale_mode
, bl
);
2042 if (struct_v
>= 29) {
2043 decode(last_pg_merge_meta
, bl
);
2045 last_pg_merge_meta
.last_epoch_clean
= old_merge_last_epoch_clean
;
2046 last_pg_merge_meta
.last_epoch_started
= old_merge_last_epoch_started
;
// Pre-v28 maps: no merge/target state on the wire; mirror current values.
2049 pg_num_target
= pg_num
;
2050 pgp_num_target
= pgp_num
;
2051 pg_num_pending
= pg_num
;
2052 last_force_op_resend
= last_force_op_resend_prenautilus
;
2053 pg_autoscale_mode
= PG_AUTOSCALE_MODE_WARN
; // default to warn on upgrade
// Produce a series of progressively richer pg_pool_t instances for
// encode/decode round-trip tests: default, basic replicated with quotas,
// pool-snaps, and self-managed-snaps with full cache-tier settings.
// NOTE(review): this extraction is garbled and drops several lines (the
// declaration of the working instance `a`, some field assignments around
// 2061-2071, 2077, 2080-2083, 2087, 2095, 2097, 2101-2106); code left
// byte-identical pending recovery of the original file.
2060 void pg_pool_t::generate_test_instances(list
<pg_pool_t
*>& o
)
2063 o
.push_back(new pg_pool_t(a
));
2065 a
.create_time
= utime_t(4,5);
2066 a
.type
= TYPE_REPLICATED
;
2072 a
.pgp_num_target
= 4;
2073 a
.pg_num_target
= 5;
2074 a
.pg_num_pending
= 5;
2075 a
.last_pg_merge_meta
.last_epoch_started
= 2;
2076 a
.last_pg_merge_meta
.last_epoch_clean
= 2;
2078 a
.last_force_op_resend
= 123823;
2079 a
.last_force_op_resend_preluminous
= 123824;
2082 a
.flags
= FLAG_POOL_SNAPS
;
2084 a
.quota_max_bytes
= 473;
2085 a
.quota_max_objects
= 474;
2086 o
.push_back(new pg_pool_t(a
));
2088 a
.snaps
[3].name
= "asdf";
2089 a
.snaps
[3].snapid
= 3;
2090 a
.snaps
[3].stamp
= utime_t(123, 4);
2091 a
.snaps
[6].name
= "qwer";
2092 a
.snaps
[6].snapid
= 6;
2093 a
.snaps
[6].stamp
= utime_t(23423, 4);
2094 o
.push_back(new pg_pool_t(a
));
2096 a
.flags
= FLAG_SELFMANAGED_SNAPS
;
2098 a
.removed_snaps
.insert(2);
2099 a
.quota_max_bytes
= 2473;
2100 a
.quota_max_objects
= 4374;
2104 a
.cache_mode
= CACHEMODE_WRITEBACK
;
2107 a
.hit_set_params
= HitSet::Params(new BloomHitSet::Params
);
2108 a
.hit_set_period
= 3600;
2109 a
.hit_set_count
= 8;
2110 a
.min_read_recency_for_promote
= 1;
2111 a
.min_write_recency_for_promote
= 1;
2112 a
.hit_set_grade_decay_rate
= 50;
2113 a
.hit_set_search_last_n
= 1;
2114 a
.calc_grade_table();
2115 a
.set_stripe_width(12345);
2116 a
.target_max_bytes
= 1238132132;
2117 a
.target_max_objects
= 1232132;
2118 a
.cache_target_dirty_ratio_micro
= 187232;
2119 a
.cache_target_dirty_high_ratio_micro
= 309856;
2120 a
.cache_target_full_ratio_micro
= 987222;
2121 a
.cache_min_flush_age
= 231;
2122 a
.cache_min_evict_age
= 2321;
2123 a
.erasure_code_profile
= "profile in osdmap";
2124 a
.expected_num_objects
= 123456;
2125 a
.fast_read
= false;
2126 a
.application_metadata
= {{"rbd", {{"key", "value"}}}};
2127 o
.push_back(new pg_pool_t(a
));
// Human-readable one-line summary of a pool, as shown by "ceph osd dump"
// / "ceph osd pool ls detail".  Optional fields (targets, lfor, quotas,
// tiering, hit-set, recency) are printed only when set or different from
// their implied values.
// NOTE(review): this extraction is garbled and drops several lines
// (pool-id prefix, the final application-name printing and function end
// at 2202-2208); code left byte-identical pending recovery of the
// original file.
2130 ostream
& operator<<(ostream
& out
, const pg_pool_t
& p
)
2132 out
<< p
.get_type_name()
2133 << " size " << p
.get_size()
2134 << " min_size " << p
.get_min_size()
2135 << " crush_rule " << p
.get_crush_rule()
2136 << " object_hash " << p
.get_object_hash_name()
2137 << " pg_num " << p
.get_pg_num()
2138 << " pgp_num " << p
.get_pgp_num();
2139 if (p
.get_pg_num_target() != p
.get_pg_num()) {
2140 out
<< " pg_num_target " << p
.get_pg_num_target();
2142 if (p
.get_pgp_num_target() != p
.get_pgp_num()) {
2143 out
<< " pgp_num_target " << p
.get_pgp_num_target();
2145 if (p
.get_pg_num_pending() != p
.get_pg_num()) {
2146 out
<< " pg_num_pending " << p
.get_pg_num_pending();
2148 if (p
.pg_autoscale_mode
) {
2149 out
<< " autoscale_mode " << p
.get_pg_autoscale_mode_name(p
.pg_autoscale_mode
);
2151 out
<< " last_change " << p
.get_last_change();
2152 if (p
.get_last_force_op_resend() ||
2153 p
.get_last_force_op_resend_prenautilus() ||
2154 p
.get_last_force_op_resend_preluminous())
2155 out
<< " lfor " << p
.get_last_force_op_resend() << "/"
2156 << p
.get_last_force_op_resend_prenautilus() << "/"
2157 << p
.get_last_force_op_resend_preluminous();
2159 out
<< " owner " << p
.get_auid();
2161 out
<< " flags " << p
.get_flags_string();
2162 if (p
.quota_max_bytes
)
2163 out
<< " max_bytes " << p
.quota_max_bytes
;
2164 if (p
.quota_max_objects
)
2165 out
<< " max_objects " << p
.quota_max_objects
;
2166 if (!p
.tiers
.empty())
2167 out
<< " tiers " << p
.tiers
;
2169 out
<< " tier_of " << p
.tier_of
;
2170 if (p
.has_read_tier())
2171 out
<< " read_tier " << p
.read_tier
;
2172 if (p
.has_write_tier())
2173 out
<< " write_tier " << p
.write_tier
;
2175 out
<< " cache_mode " << p
.get_cache_mode_name();
2176 if (p
.target_max_bytes
)
2177 out
<< " target_bytes " << p
.target_max_bytes
;
2178 if (p
.target_max_objects
)
2179 out
<< " target_objects " << p
.target_max_objects
;
2180 if (p
.hit_set_params
.get_type() != HitSet::TYPE_NONE
) {
2181 out
<< " hit_set " << p
.hit_set_params
2182 << " " << p
.hit_set_period
<< "s"
2183 << " x" << p
.hit_set_count
<< " decay_rate "
2184 << p
.hit_set_grade_decay_rate
2185 << " search_last_n " << p
.hit_set_search_last_n
;
2187 if (p
.min_read_recency_for_promote
)
2188 out
<< " min_read_recency_for_promote " << p
.min_read_recency_for_promote
;
2189 if (p
.min_write_recency_for_promote
)
2190 out
<< " min_write_recency_for_promote " << p
.min_write_recency_for_promote
;
2191 out
<< " stripe_width " << p
.get_stripe_width();
2192 if (p
.expected_num_objects
)
2193 out
<< " expected_num_objects " << p
.expected_num_objects
;
2195 out
<< " fast_read " << p
.fast_read
;
2197 if (!p
.application_metadata
.empty()) {
2198 out
<< " application ";
2199 for (auto it
= p
.application_metadata
.begin();
2200 it
!= p
.application_metadata
.end(); ++it
) {
2201 if (it
!= p
.application_metadata
.begin())
2210 // -- object_stat_sum_t --
2212 void object_stat_sum_t::dump(Formatter
*f
) const
2214 f
->dump_int("num_bytes", num_bytes
);
2215 f
->dump_int("num_objects", num_objects
);
2216 f
->dump_int("num_object_clones", num_object_clones
);
2217 f
->dump_int("num_object_copies", num_object_copies
);
2218 f
->dump_int("num_objects_missing_on_primary", num_objects_missing_on_primary
);
2219 f
->dump_int("num_objects_missing", num_objects_missing
);
2220 f
->dump_int("num_objects_degraded", num_objects_degraded
);
2221 f
->dump_int("num_objects_misplaced", num_objects_misplaced
);
2222 f
->dump_int("num_objects_unfound", num_objects_unfound
);
2223 f
->dump_int("num_objects_dirty", num_objects_dirty
);
2224 f
->dump_int("num_whiteouts", num_whiteouts
);
2225 f
->dump_int("num_read", num_rd
);
2226 f
->dump_int("num_read_kb", num_rd_kb
);
2227 f
->dump_int("num_write", num_wr
);
2228 f
->dump_int("num_write_kb", num_wr_kb
);
2229 f
->dump_int("num_scrub_errors", num_scrub_errors
);
2230 f
->dump_int("num_shallow_scrub_errors", num_shallow_scrub_errors
);
2231 f
->dump_int("num_deep_scrub_errors", num_deep_scrub_errors
);
2232 f
->dump_int("num_objects_recovered", num_objects_recovered
);
2233 f
->dump_int("num_bytes_recovered", num_bytes_recovered
);
2234 f
->dump_int("num_keys_recovered", num_keys_recovered
);
2235 f
->dump_int("num_objects_omap", num_objects_omap
);
2236 f
->dump_int("num_objects_hit_set_archive", num_objects_hit_set_archive
);
2237 f
->dump_int("num_bytes_hit_set_archive", num_bytes_hit_set_archive
);
2238 f
->dump_int("num_flush", num_flush
);
2239 f
->dump_int("num_flush_kb", num_flush_kb
);
2240 f
->dump_int("num_evict", num_evict
);
2241 f
->dump_int("num_evict_kb", num_evict_kb
);
2242 f
->dump_int("num_promote", num_promote
);
2243 f
->dump_int("num_flush_mode_high", num_flush_mode_high
);
2244 f
->dump_int("num_flush_mode_low", num_flush_mode_low
);
2245 f
->dump_int("num_evict_mode_some", num_evict_mode_some
);
2246 f
->dump_int("num_evict_mode_full", num_evict_mode_full
);
2247 f
->dump_int("num_objects_pinned", num_objects_pinned
);
2248 f
->dump_int("num_legacy_snapsets", num_legacy_snapsets
);
2249 f
->dump_int("num_large_omap_objects", num_large_omap_objects
);
2250 f
->dump_int("num_objects_manifest", num_objects_manifest
);
2251 f
->dump_int("num_omap_bytes", num_omap_bytes
);
2252 f
->dump_int("num_omap_keys", num_omap_keys
);
2253 f
->dump_int("num_objects_repaired", num_objects_repaired
);
// Encode the stat sum (v20, compat 14).  On little-endian builds the
// whole POD struct is appended raw as a fast path; the field-by-field
// path below is the portable fallback and must match the struct's member
// order exactly (see the raw-copy decode fast path).
// NOTE(review): this extraction drops the #else/#endif around the fast
// path, the encodes of num_rd and num_wr (original 2269/2271), and the
// trailing ENCODE_FINISH/close (2302-2304); code left byte-identical.
2256 void object_stat_sum_t::encode(bufferlist
& bl
) const
2258 ENCODE_START(20, 14, bl
);
2259 #if defined(CEPH_LITTLE_ENDIAN)
2260 bl
.append((char *)(&num_bytes
), sizeof(object_stat_sum_t
));
2262 encode(num_bytes
, bl
);
2263 encode(num_objects
, bl
);
2264 encode(num_object_clones
, bl
);
2265 encode(num_object_copies
, bl
);
2266 encode(num_objects_missing_on_primary
, bl
);
2267 encode(num_objects_degraded
, bl
);
2268 encode(num_objects_unfound
, bl
);
2270 encode(num_rd_kb
, bl
);
2272 encode(num_wr_kb
, bl
);
2273 encode(num_scrub_errors
, bl
);
2274 encode(num_objects_recovered
, bl
);
2275 encode(num_bytes_recovered
, bl
);
2276 encode(num_keys_recovered
, bl
);
2277 encode(num_shallow_scrub_errors
, bl
);
2278 encode(num_deep_scrub_errors
, bl
);
2279 encode(num_objects_dirty
, bl
);
2280 encode(num_whiteouts
, bl
);
2281 encode(num_objects_omap
, bl
);
2282 encode(num_objects_hit_set_archive
, bl
);
2283 encode(num_objects_misplaced
, bl
);
2284 encode(num_bytes_hit_set_archive
, bl
);
2285 encode(num_flush
, bl
);
2286 encode(num_flush_kb
, bl
);
2287 encode(num_evict
, bl
);
2288 encode(num_evict_kb
, bl
);
2289 encode(num_promote
, bl
);
2290 encode(num_flush_mode_high
, bl
);
2291 encode(num_flush_mode_low
, bl
);
2292 encode(num_evict_mode_some
, bl
);
2293 encode(num_evict_mode_full
, bl
);
2294 encode(num_objects_pinned
, bl
);
2295 encode(num_objects_missing
, bl
);
2296 encode(num_legacy_snapsets
, bl
);
2297 encode(num_large_omap_objects
, bl
);
2298 encode(num_objects_manifest
, bl
);
2299 encode(num_omap_bytes
, bl
);
2300 encode(num_omap_keys
, bl
);
2301 encode(num_objects_repaired
, bl
);
// Decode counterpart of object_stat_sum_t::encode.  On little-endian
// builds a current-version (v20) encoding is memcpy'd straight into the
// POD struct; otherwise (big-endian or older encoding) each field is
// decoded individually, with version guards backfilling defaults for
// fields added after the encoder's version.
// NOTE(review): this extraction drops the #endif, the decodes of num_rd
// and num_wr (original 2325/2327), the else for num_legacy_snapsets'
// sibling branch closures, and the trailing DECODE_FINISH (2370-2372);
// code left byte-identical.
2306 void object_stat_sum_t::decode(bufferlist::const_iterator
& bl
)
2308 bool decode_finish
= false;
2309 static const int STAT_SUM_DECODE_VERSION
= 20;
2310 DECODE_START(STAT_SUM_DECODE_VERSION
, bl
);
2311 #if defined(CEPH_LITTLE_ENDIAN)
// Fast path: raw copy only when the encoder used the exact same layout.
2312 if (struct_v
== STAT_SUM_DECODE_VERSION
) {
2313 bl
.copy(sizeof(object_stat_sum_t
), (char*)(&num_bytes
));
2314 decode_finish
= true;
2317 if (!decode_finish
) {
2318 decode(num_bytes
, bl
);
2319 decode(num_objects
, bl
);
2320 decode(num_object_clones
, bl
);
2321 decode(num_object_copies
, bl
);
2322 decode(num_objects_missing_on_primary
, bl
);
2323 decode(num_objects_degraded
, bl
);
2324 decode(num_objects_unfound
, bl
);
2326 decode(num_rd_kb
, bl
);
2328 decode(num_wr_kb
, bl
);
2329 decode(num_scrub_errors
, bl
);
2330 decode(num_objects_recovered
, bl
);
2331 decode(num_bytes_recovered
, bl
);
2332 decode(num_keys_recovered
, bl
);
2333 decode(num_shallow_scrub_errors
, bl
);
2334 decode(num_deep_scrub_errors
, bl
);
2335 decode(num_objects_dirty
, bl
);
2336 decode(num_whiteouts
, bl
);
2337 decode(num_objects_omap
, bl
);
2338 decode(num_objects_hit_set_archive
, bl
);
2339 decode(num_objects_misplaced
, bl
);
2340 decode(num_bytes_hit_set_archive
, bl
);
2341 decode(num_flush
, bl
);
2342 decode(num_flush_kb
, bl
);
2343 decode(num_evict
, bl
);
2344 decode(num_evict_kb
, bl
);
2345 decode(num_promote
, bl
);
2346 decode(num_flush_mode_high
, bl
);
2347 decode(num_flush_mode_low
, bl
);
2348 decode(num_evict_mode_some
, bl
);
2349 decode(num_evict_mode_full
, bl
);
2350 decode(num_objects_pinned
, bl
);
2351 decode(num_objects_missing
, bl
);
2352 if (struct_v
>= 16) {
2353 decode(num_legacy_snapsets
, bl
);
// Pre-v16 encoders did not track this; every clone might be legacy.
2355 num_legacy_snapsets
= num_object_clones
; // upper bound
2357 if (struct_v
>= 17) {
2358 decode(num_large_omap_objects
, bl
);
2360 if (struct_v
>= 18) {
2361 decode(num_objects_manifest
, bl
);
2363 if (struct_v
>= 19) {
2364 decode(num_omap_bytes
, bl
);
2365 decode(num_omap_keys
, bl
);
2367 if (struct_v
>= 20) {
2368 decode(num_objects_repaired
, bl
);
// Produce test instances for encode/decode round-trip tests: implicitly
// a default instance plus one with (nearly) every counter set non-zero.
// NOTE(review): this extraction drops several lines (the push of the
// default instance and num_bytes/num_objects assignments at 2377-2379,
// and the flush/evict counters at 2399-2403); code left byte-identical.
2374 void object_stat_sum_t::generate_test_instances(list
<object_stat_sum_t
*>& o
)
2376 object_stat_sum_t a
;
2380 a
.num_object_clones
= 4;
2381 a
.num_object_copies
= 5;
2382 a
.num_objects_missing_on_primary
= 6;
2383 a
.num_objects_missing
= 123;
2384 a
.num_objects_degraded
= 7;
2385 a
.num_objects_unfound
= 8;
2386 a
.num_rd
= 9; a
.num_rd_kb
= 10;
2387 a
.num_wr
= 11; a
.num_wr_kb
= 12;
2388 a
.num_objects_recovered
= 14;
2389 a
.num_bytes_recovered
= 15;
2390 a
.num_keys_recovered
= 16;
2391 a
.num_deep_scrub_errors
= 17;
2392 a
.num_shallow_scrub_errors
= 18;
// Keep the invariant: total scrub errors = deep + shallow.
2393 a
.num_scrub_errors
= a
.num_deep_scrub_errors
+ a
.num_shallow_scrub_errors
;
2394 a
.num_objects_dirty
= 21;
2395 a
.num_whiteouts
= 22;
2396 a
.num_objects_misplaced
= 1232;
2397 a
.num_objects_hit_set_archive
= 2;
2398 a
.num_bytes_hit_set_archive
= 27;
2404 a
.num_flush_mode_high
= 0;
2405 a
.num_flush_mode_low
= 1;
2406 a
.num_evict_mode_some
= 1;
2407 a
.num_evict_mode_full
= 0;
2408 a
.num_objects_pinned
= 20;
2409 a
.num_large_omap_objects
= 5;
2410 a
.num_objects_manifest
= 2;
2411 a
.num_omap_bytes
= 20000;
2412 a
.num_omap_keys
= 200;
2413 a
.num_objects_repaired
= 300;
2414 o
.push_back(new object_stat_sum_t(a
));
// Element-wise accumulate another stat sum into this one (used when
// aggregating per-PG stats into pool/cluster totals).  Exact inverse of
// sub() below; keep the two field lists in sync.
// NOTE(review): this extraction drops the num_rd and num_wr additions
// (original 2427/2429) and the closing brace; code left byte-identical.
2417 void object_stat_sum_t::add(const object_stat_sum_t
& o
)
2419 num_bytes
+= o
.num_bytes
;
2420 num_objects
+= o
.num_objects
;
2421 num_object_clones
+= o
.num_object_clones
;
2422 num_object_copies
+= o
.num_object_copies
;
2423 num_objects_missing_on_primary
+= o
.num_objects_missing_on_primary
;
2424 num_objects_missing
+= o
.num_objects_missing
;
2425 num_objects_degraded
+= o
.num_objects_degraded
;
2426 num_objects_misplaced
+= o
.num_objects_misplaced
;
2428 num_rd_kb
+= o
.num_rd_kb
;
2430 num_wr_kb
+= o
.num_wr_kb
;
2431 num_objects_unfound
+= o
.num_objects_unfound
;
2432 num_scrub_errors
+= o
.num_scrub_errors
;
2433 num_shallow_scrub_errors
+= o
.num_shallow_scrub_errors
;
2434 num_deep_scrub_errors
+= o
.num_deep_scrub_errors
;
2435 num_objects_recovered
+= o
.num_objects_recovered
;
2436 num_bytes_recovered
+= o
.num_bytes_recovered
;
2437 num_keys_recovered
+= o
.num_keys_recovered
;
2438 num_objects_dirty
+= o
.num_objects_dirty
;
2439 num_whiteouts
+= o
.num_whiteouts
;
2440 num_objects_omap
+= o
.num_objects_omap
;
2441 num_objects_hit_set_archive
+= o
.num_objects_hit_set_archive
;
2442 num_bytes_hit_set_archive
+= o
.num_bytes_hit_set_archive
;
2443 num_flush
+= o
.num_flush
;
2444 num_flush_kb
+= o
.num_flush_kb
;
2445 num_evict
+= o
.num_evict
;
2446 num_evict_kb
+= o
.num_evict_kb
;
2447 num_promote
+= o
.num_promote
;
2448 num_flush_mode_high
+= o
.num_flush_mode_high
;
2449 num_flush_mode_low
+= o
.num_flush_mode_low
;
2450 num_evict_mode_some
+= o
.num_evict_mode_some
;
2451 num_evict_mode_full
+= o
.num_evict_mode_full
;
2452 num_objects_pinned
+= o
.num_objects_pinned
;
2453 num_legacy_snapsets
+= o
.num_legacy_snapsets
;
2454 num_large_omap_objects
+= o
.num_large_omap_objects
;
2455 num_objects_manifest
+= o
.num_objects_manifest
;
2456 num_omap_bytes
+= o
.num_omap_bytes
;
2457 num_omap_keys
+= o
.num_omap_keys
;
2458 num_objects_repaired
+= o
.num_objects_repaired
;
// Element-wise subtract another stat sum from this one; exact inverse of
// add() above — keep the two field lists in sync.
// NOTE(review): this extraction drops the num_rd and num_wr subtractions
// (original 2471/2473) and the closing brace; code left byte-identical.
2461 void object_stat_sum_t::sub(const object_stat_sum_t
& o
)
2463 num_bytes
-= o
.num_bytes
;
2464 num_objects
-= o
.num_objects
;
2465 num_object_clones
-= o
.num_object_clones
;
2466 num_object_copies
-= o
.num_object_copies
;
2467 num_objects_missing_on_primary
-= o
.num_objects_missing_on_primary
;
2468 num_objects_missing
-= o
.num_objects_missing
;
2469 num_objects_degraded
-= o
.num_objects_degraded
;
2470 num_objects_misplaced
-= o
.num_objects_misplaced
;
2472 num_rd_kb
-= o
.num_rd_kb
;
2474 num_wr_kb
-= o
.num_wr_kb
;
2475 num_objects_unfound
-= o
.num_objects_unfound
;
2476 num_scrub_errors
-= o
.num_scrub_errors
;
2477 num_shallow_scrub_errors
-= o
.num_shallow_scrub_errors
;
2478 num_deep_scrub_errors
-= o
.num_deep_scrub_errors
;
2479 num_objects_recovered
-= o
.num_objects_recovered
;
2480 num_bytes_recovered
-= o
.num_bytes_recovered
;
2481 num_keys_recovered
-= o
.num_keys_recovered
;
2482 num_objects_dirty
-= o
.num_objects_dirty
;
2483 num_whiteouts
-= o
.num_whiteouts
;
2484 num_objects_omap
-= o
.num_objects_omap
;
2485 num_objects_hit_set_archive
-= o
.num_objects_hit_set_archive
;
2486 num_bytes_hit_set_archive
-= o
.num_bytes_hit_set_archive
;
2487 num_flush
-= o
.num_flush
;
2488 num_flush_kb
-= o
.num_flush_kb
;
2489 num_evict
-= o
.num_evict
;
2490 num_evict_kb
-= o
.num_evict_kb
;
2491 num_promote
-= o
.num_promote
;
2492 num_flush_mode_high
-= o
.num_flush_mode_high
;
2493 num_flush_mode_low
-= o
.num_flush_mode_low
;
2494 num_evict_mode_some
-= o
.num_evict_mode_some
;
2495 num_evict_mode_full
-= o
.num_evict_mode_full
;
2496 num_objects_pinned
-= o
.num_objects_pinned
;
2497 num_legacy_snapsets
-= o
.num_legacy_snapsets
;
2498 num_large_omap_objects
-= o
.num_large_omap_objects
;
2499 num_objects_manifest
-= o
.num_objects_manifest
;
2500 num_omap_bytes
-= o
.num_omap_bytes
;
2501 num_omap_keys
-= o
.num_omap_keys
;
2502 num_objects_repaired
-= o
.num_objects_repaired
;
2505 bool operator==(const object_stat_sum_t
& l
, const object_stat_sum_t
& r
)
2508 l
.num_bytes
== r
.num_bytes
&&
2509 l
.num_objects
== r
.num_objects
&&
2510 l
.num_object_clones
== r
.num_object_clones
&&
2511 l
.num_object_copies
== r
.num_object_copies
&&
2512 l
.num_objects_missing_on_primary
== r
.num_objects_missing_on_primary
&&
2513 l
.num_objects_missing
== r
.num_objects_missing
&&
2514 l
.num_objects_degraded
== r
.num_objects_degraded
&&
2515 l
.num_objects_misplaced
== r
.num_objects_misplaced
&&
2516 l
.num_objects_unfound
== r
.num_objects_unfound
&&
2517 l
.num_rd
== r
.num_rd
&&
2518 l
.num_rd_kb
== r
.num_rd_kb
&&
2519 l
.num_wr
== r
.num_wr
&&
2520 l
.num_wr_kb
== r
.num_wr_kb
&&
2521 l
.num_scrub_errors
== r
.num_scrub_errors
&&
2522 l
.num_shallow_scrub_errors
== r
.num_shallow_scrub_errors
&&
2523 l
.num_deep_scrub_errors
== r
.num_deep_scrub_errors
&&
2524 l
.num_objects_recovered
== r
.num_objects_recovered
&&
2525 l
.num_bytes_recovered
== r
.num_bytes_recovered
&&
2526 l
.num_keys_recovered
== r
.num_keys_recovered
&&
2527 l
.num_objects_dirty
== r
.num_objects_dirty
&&
2528 l
.num_whiteouts
== r
.num_whiteouts
&&
2529 l
.num_objects_omap
== r
.num_objects_omap
&&
2530 l
.num_objects_hit_set_archive
== r
.num_objects_hit_set_archive
&&
2531 l
.num_bytes_hit_set_archive
== r
.num_bytes_hit_set_archive
&&
2532 l
.num_flush
== r
.num_flush
&&
2533 l
.num_flush_kb
== r
.num_flush_kb
&&
2534 l
.num_evict
== r
.num_evict
&&
2535 l
.num_evict_kb
== r
.num_evict_kb
&&
2536 l
.num_promote
== r
.num_promote
&&
2537 l
.num_flush_mode_high
== r
.num_flush_mode_high
&&
2538 l
.num_flush_mode_low
== r
.num_flush_mode_low
&&
2539 l
.num_evict_mode_some
== r
.num_evict_mode_some
&&
2540 l
.num_evict_mode_full
== r
.num_evict_mode_full
&&
2541 l
.num_objects_pinned
== r
.num_objects_pinned
&&
2542 l
.num_legacy_snapsets
== r
.num_legacy_snapsets
&&
2543 l
.num_large_omap_objects
== r
.num_large_omap_objects
&&
2544 l
.num_objects_manifest
== r
.num_objects_manifest
&&
2545 l
.num_omap_bytes
== r
.num_omap_bytes
&&
2546 l
.num_omap_keys
== r
.num_omap_keys
&&
2547 l
.num_objects_repaired
== r
.num_objects_repaired
;
2550 // -- object_stat_collection_t --
2552 void object_stat_collection_t::dump(Formatter
*f
) const
2554 f
->open_object_section("stat_sum");
2559 void object_stat_collection_t::encode(bufferlist
& bl
) const
2561 ENCODE_START(2, 2, bl
);
2563 encode((__u32
)0, bl
);
2567 void object_stat_collection_t::decode(bufferlist::const_iterator
& bl
)
2569 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl
);
2572 map
<string
,object_stat_sum_t
> cat_sum
;
2573 decode(cat_sum
, bl
);
2578 void object_stat_collection_t::generate_test_instances(list
<object_stat_collection_t
*>& o
)
2580 object_stat_collection_t a
;
2581 o
.push_back(new object_stat_collection_t(a
));
2582 list
<object_stat_sum_t
*> l
;
2583 object_stat_sum_t::generate_test_instances(l
);
2584 for (list
<object_stat_sum_t
*>::iterator p
= l
.begin(); p
!= l
.end(); ++p
) {
2586 o
.push_back(new object_stat_collection_t(a
));
2593 bool pg_stat_t::is_acting_osd(int32_t osd
, bool primary
) const
2595 if (primary
&& osd
== acting_primary
) {
2597 } else if (!primary
) {
2598 for(vector
<int32_t>::const_iterator it
= acting
.begin();
2599 it
!= acting
.end(); ++it
)
2608 void pg_stat_t::dump(Formatter
*f
) const
2610 f
->dump_stream("version") << version
;
2611 f
->dump_stream("reported_seq") << reported_seq
;
2612 f
->dump_stream("reported_epoch") << reported_epoch
;
2613 f
->dump_string("state", pg_state_string(state
));
2614 f
->dump_stream("last_fresh") << last_fresh
;
2615 f
->dump_stream("last_change") << last_change
;
2616 f
->dump_stream("last_active") << last_active
;
2617 f
->dump_stream("last_peered") << last_peered
;
2618 f
->dump_stream("last_clean") << last_clean
;
2619 f
->dump_stream("last_became_active") << last_became_active
;
2620 f
->dump_stream("last_became_peered") << last_became_peered
;
2621 f
->dump_stream("last_unstale") << last_unstale
;
2622 f
->dump_stream("last_undegraded") << last_undegraded
;
2623 f
->dump_stream("last_fullsized") << last_fullsized
;
2624 f
->dump_unsigned("mapping_epoch", mapping_epoch
);
2625 f
->dump_stream("log_start") << log_start
;
2626 f
->dump_stream("ondisk_log_start") << ondisk_log_start
;
2627 f
->dump_unsigned("created", created
);
2628 f
->dump_unsigned("last_epoch_clean", last_epoch_clean
);
2629 f
->dump_stream("parent") << parent
;
2630 f
->dump_unsigned("parent_split_bits", parent_split_bits
);
2631 f
->dump_stream("last_scrub") << last_scrub
;
2632 f
->dump_stream("last_scrub_stamp") << last_scrub_stamp
;
2633 f
->dump_stream("last_deep_scrub") << last_deep_scrub
;
2634 f
->dump_stream("last_deep_scrub_stamp") << last_deep_scrub_stamp
;
2635 f
->dump_stream("last_clean_scrub_stamp") << last_clean_scrub_stamp
;
2636 f
->dump_int("log_size", log_size
);
2637 f
->dump_int("ondisk_log_size", ondisk_log_size
);
2638 f
->dump_bool("stats_invalid", stats_invalid
);
2639 f
->dump_bool("dirty_stats_invalid", dirty_stats_invalid
);
2640 f
->dump_bool("omap_stats_invalid", omap_stats_invalid
);
2641 f
->dump_bool("hitset_stats_invalid", hitset_stats_invalid
);
2642 f
->dump_bool("hitset_bytes_stats_invalid", hitset_bytes_stats_invalid
);
2643 f
->dump_bool("pin_stats_invalid", pin_stats_invalid
);
2644 f
->dump_bool("manifest_stats_invalid", manifest_stats_invalid
);
2645 f
->dump_unsigned("snaptrimq_len", snaptrimq_len
);
2647 f
->open_array_section("up");
2648 for (vector
<int32_t>::const_iterator p
= up
.begin(); p
!= up
.end(); ++p
)
2649 f
->dump_int("osd", *p
);
2651 f
->open_array_section("acting");
2652 for (vector
<int32_t>::const_iterator p
= acting
.begin(); p
!= acting
.end(); ++p
)
2653 f
->dump_int("osd", *p
);
2655 f
->open_array_section("avail_no_missing");
2656 for (auto p
= avail_no_missing
.cbegin(); p
!= avail_no_missing
.cend(); ++p
)
2657 f
->dump_stream("shard") << *p
;
2659 f
->open_array_section("object_location_counts");
2660 for (auto p
= object_location_counts
.cbegin(); p
!= object_location_counts
.cend(); ++p
) {
2661 f
->open_object_section("entry");
2662 f
->dump_stream("shards") << p
->first
;
2663 f
->dump_int("objects", p
->second
);
2667 f
->open_array_section("blocked_by");
2668 for (vector
<int32_t>::const_iterator p
= blocked_by
.begin();
2669 p
!= blocked_by
.end(); ++p
)
2670 f
->dump_int("osd", *p
);
2672 f
->dump_int("up_primary", up_primary
);
2673 f
->dump_int("acting_primary", acting_primary
);
2674 f
->open_array_section("purged_snaps");
2675 for (interval_set
<snapid_t
>::const_iterator i
= purged_snaps
.begin();
2676 i
!= purged_snaps
.end();
2678 f
->open_object_section("interval");
2679 f
->dump_stream("start") << i
.get_start();
2680 f
->dump_stream("length") << i
.get_len();
2686 void pg_stat_t::dump_brief(Formatter
*f
) const
2688 f
->dump_string("state", pg_state_string(state
));
2689 f
->open_array_section("up");
2690 for (vector
<int32_t>::const_iterator p
= up
.begin(); p
!= up
.end(); ++p
)
2691 f
->dump_int("osd", *p
);
2693 f
->open_array_section("acting");
2694 for (vector
<int32_t>::const_iterator p
= acting
.begin(); p
!= acting
.end(); ++p
)
2695 f
->dump_int("osd", *p
);
2697 f
->dump_int("up_primary", up_primary
);
2698 f
->dump_int("acting_primary", acting_primary
);
2701 void pg_stat_t::encode(bufferlist
&bl
) const
2703 ENCODE_START(26, 22, bl
);
2704 encode(version
, bl
);
2705 encode(reported_seq
, bl
);
2706 encode(reported_epoch
, bl
);
2707 encode((__u32
)state
, bl
); // for older peers
2708 encode(log_start
, bl
);
2709 encode(ondisk_log_start
, bl
);
2710 encode(created
, bl
);
2711 encode(last_epoch_clean
, bl
);
2713 encode(parent_split_bits
, bl
);
2714 encode(last_scrub
, bl
);
2715 encode(last_scrub_stamp
, bl
);
2717 encode(log_size
, bl
);
2718 encode(ondisk_log_size
, bl
);
2721 encode(last_fresh
, bl
);
2722 encode(last_change
, bl
);
2723 encode(last_active
, bl
);
2724 encode(last_clean
, bl
);
2725 encode(last_unstale
, bl
);
2726 encode(mapping_epoch
, bl
);
2727 encode(last_deep_scrub
, bl
);
2728 encode(last_deep_scrub_stamp
, bl
);
2729 encode(stats_invalid
, bl
);
2730 encode(last_clean_scrub_stamp
, bl
);
2731 encode(last_became_active
, bl
);
2732 encode(dirty_stats_invalid
, bl
);
2733 encode(up_primary
, bl
);
2734 encode(acting_primary
, bl
);
2735 encode(omap_stats_invalid
, bl
);
2736 encode(hitset_stats_invalid
, bl
);
2737 encode(blocked_by
, bl
);
2738 encode(last_undegraded
, bl
);
2739 encode(last_fullsized
, bl
);
2740 encode(hitset_bytes_stats_invalid
, bl
);
2741 encode(last_peered
, bl
);
2742 encode(last_became_peered
, bl
);
2743 encode(pin_stats_invalid
, bl
);
2744 encode(snaptrimq_len
, bl
);
2745 __u32 top_state
= (state
>> 32);
2746 encode(top_state
, bl
);
2747 encode(purged_snaps
, bl
);
2748 encode(manifest_stats_invalid
, bl
);
2749 encode(avail_no_missing
, bl
);
2750 encode(object_location_counts
, bl
);
2754 void pg_stat_t::decode(bufferlist::const_iterator
&bl
)
2758 DECODE_START(26, bl
);
2759 decode(version
, bl
);
2760 decode(reported_seq
, bl
);
2761 decode(reported_epoch
, bl
);
2762 decode(old_state
, bl
);
2763 decode(log_start
, bl
);
2764 decode(ondisk_log_start
, bl
);
2765 decode(created
, bl
);
2766 decode(last_epoch_clean
, bl
);
2768 decode(parent_split_bits
, bl
);
2769 decode(last_scrub
, bl
);
2770 decode(last_scrub_stamp
, bl
);
2772 decode(log_size
, bl
);
2773 decode(ondisk_log_size
, bl
);
2776 decode(last_fresh
, bl
);
2777 decode(last_change
, bl
);
2778 decode(last_active
, bl
);
2779 decode(last_clean
, bl
);
2780 decode(last_unstale
, bl
);
2781 decode(mapping_epoch
, bl
);
2782 decode(last_deep_scrub
, bl
);
2783 decode(last_deep_scrub_stamp
, bl
);
2785 stats_invalid
= tmp
;
2786 decode(last_clean_scrub_stamp
, bl
);
2787 decode(last_became_active
, bl
);
2789 dirty_stats_invalid
= tmp
;
2790 decode(up_primary
, bl
);
2791 decode(acting_primary
, bl
);
2793 omap_stats_invalid
= tmp
;
2795 hitset_stats_invalid
= tmp
;
2796 decode(blocked_by
, bl
);
2797 decode(last_undegraded
, bl
);
2798 decode(last_fullsized
, bl
);
2800 hitset_bytes_stats_invalid
= tmp
;
2801 decode(last_peered
, bl
);
2802 decode(last_became_peered
, bl
);
2804 pin_stats_invalid
= tmp
;
2805 if (struct_v
>= 23) {
2806 decode(snaptrimq_len
, bl
);
2807 if (struct_v
>= 24) {
2809 decode(top_state
, bl
);
2810 state
= (uint64_t)old_state
| ((uint64_t)top_state
<< 32);
2811 decode(purged_snaps
, bl
);
2815 if (struct_v
>= 25) {
2817 manifest_stats_invalid
= tmp
;
2819 manifest_stats_invalid
= true;
2821 if (struct_v
>= 26) {
2822 decode(avail_no_missing
, bl
);
2823 decode(object_location_counts
, bl
);
2829 void pg_stat_t::generate_test_instances(list
<pg_stat_t
*>& o
)
2832 o
.push_back(new pg_stat_t(a
));
2834 a
.version
= eversion_t(1, 3);
2835 a
.reported_epoch
= 1;
2838 a
.mapping_epoch
= 998;
2839 a
.last_fresh
= utime_t(1002, 1);
2840 a
.last_change
= utime_t(1002, 2);
2841 a
.last_active
= utime_t(1002, 3);
2842 a
.last_clean
= utime_t(1002, 4);
2843 a
.last_unstale
= utime_t(1002, 5);
2844 a
.last_undegraded
= utime_t(1002, 7);
2845 a
.last_fullsized
= utime_t(1002, 8);
2846 a
.log_start
= eversion_t(1, 4);
2847 a
.ondisk_log_start
= eversion_t(1, 5);
2849 a
.last_epoch_clean
= 7;
2850 a
.parent
= pg_t(1, 2);
2851 a
.parent_split_bits
= 12;
2852 a
.last_scrub
= eversion_t(9, 10);
2853 a
.last_scrub_stamp
= utime_t(11, 12);
2854 a
.last_deep_scrub
= eversion_t(13, 14);
2855 a
.last_deep_scrub_stamp
= utime_t(15, 16);
2856 a
.last_clean_scrub_stamp
= utime_t(17, 18);
2857 a
.snaptrimq_len
= 1048576;
2858 list
<object_stat_collection_t
*> l
;
2859 object_stat_collection_t::generate_test_instances(l
);
2860 a
.stats
= *l
.back();
2862 a
.ondisk_log_size
= 88;
2863 a
.up
.push_back(123);
2865 a
.acting
.push_back(456);
2866 a
.avail_no_missing
.push_back(pg_shard_t(456, shard_id_t::NO_SHARD
));
2867 set
<pg_shard_t
> sset
= { pg_shard_t(0), pg_shard_t(1) };
2868 a
.object_location_counts
.insert(make_pair(sset
, 10));
2869 sset
.insert(pg_shard_t(2));
2870 a
.object_location_counts
.insert(make_pair(sset
, 5));
2871 a
.acting_primary
= 456;
2872 o
.push_back(new pg_stat_t(a
));
2874 a
.up
.push_back(124);
2876 a
.acting
.push_back(124);
2877 a
.acting_primary
= 124;
2878 a
.blocked_by
.push_back(155);
2879 a
.blocked_by
.push_back(156);
2880 o
.push_back(new pg_stat_t(a
));
2883 bool operator==(const pg_stat_t
& l
, const pg_stat_t
& r
)
2886 l
.version
== r
.version
&&
2887 l
.reported_seq
== r
.reported_seq
&&
2888 l
.reported_epoch
== r
.reported_epoch
&&
2889 l
.state
== r
.state
&&
2890 l
.last_fresh
== r
.last_fresh
&&
2891 l
.last_change
== r
.last_change
&&
2892 l
.last_active
== r
.last_active
&&
2893 l
.last_peered
== r
.last_peered
&&
2894 l
.last_clean
== r
.last_clean
&&
2895 l
.last_unstale
== r
.last_unstale
&&
2896 l
.last_undegraded
== r
.last_undegraded
&&
2897 l
.last_fullsized
== r
.last_fullsized
&&
2898 l
.log_start
== r
.log_start
&&
2899 l
.ondisk_log_start
== r
.ondisk_log_start
&&
2900 l
.created
== r
.created
&&
2901 l
.last_epoch_clean
== r
.last_epoch_clean
&&
2902 l
.parent
== r
.parent
&&
2903 l
.parent_split_bits
== r
.parent_split_bits
&&
2904 l
.last_scrub
== r
.last_scrub
&&
2905 l
.last_deep_scrub
== r
.last_deep_scrub
&&
2906 l
.last_scrub_stamp
== r
.last_scrub_stamp
&&
2907 l
.last_deep_scrub_stamp
== r
.last_deep_scrub_stamp
&&
2908 l
.last_clean_scrub_stamp
== r
.last_clean_scrub_stamp
&&
2909 l
.stats
== r
.stats
&&
2910 l
.stats_invalid
== r
.stats_invalid
&&
2911 l
.log_size
== r
.log_size
&&
2912 l
.ondisk_log_size
== r
.ondisk_log_size
&&
2914 l
.acting
== r
.acting
&&
2915 l
.avail_no_missing
== r
.avail_no_missing
&&
2916 l
.object_location_counts
== r
.object_location_counts
&&
2917 l
.mapping_epoch
== r
.mapping_epoch
&&
2918 l
.blocked_by
== r
.blocked_by
&&
2919 l
.last_became_active
== r
.last_became_active
&&
2920 l
.last_became_peered
== r
.last_became_peered
&&
2921 l
.dirty_stats_invalid
== r
.dirty_stats_invalid
&&
2922 l
.omap_stats_invalid
== r
.omap_stats_invalid
&&
2923 l
.hitset_stats_invalid
== r
.hitset_stats_invalid
&&
2924 l
.hitset_bytes_stats_invalid
== r
.hitset_bytes_stats_invalid
&&
2925 l
.up_primary
== r
.up_primary
&&
2926 l
.acting_primary
== r
.acting_primary
&&
2927 l
.pin_stats_invalid
== r
.pin_stats_invalid
&&
2928 l
.manifest_stats_invalid
== r
.manifest_stats_invalid
&&
2929 l
.purged_snaps
== r
.purged_snaps
&&
2930 l
.snaptrimq_len
== r
.snaptrimq_len
;
2933 // -- store_statfs_t --
2935 bool store_statfs_t::operator==(const store_statfs_t
& other
) const
2937 return total
== other
.total
2938 && available
== other
.available
2939 && allocated
== other
.allocated
2940 && internally_reserved
== other
.internally_reserved
2941 && data_stored
== other
.data_stored
2942 && data_compressed
== other
.data_compressed
2943 && data_compressed_allocated
== other
.data_compressed_allocated
2944 && data_compressed_original
== other
.data_compressed_original
2945 && omap_allocated
== other
.omap_allocated
2946 && internal_metadata
== other
.internal_metadata
;
2949 void store_statfs_t::dump(Formatter
*f
) const
2951 f
->dump_int("total", total
);
2952 f
->dump_int("available", available
);
2953 f
->dump_int("internally_reserved", internally_reserved
);
2954 f
->dump_int("allocated", allocated
);
2955 f
->dump_int("data_stored", data_stored
);
2956 f
->dump_int("data_compressed", data_compressed
);
2957 f
->dump_int("data_compressed_allocated", data_compressed_allocated
);
2958 f
->dump_int("data_compressed_original", data_compressed_original
);
2959 f
->dump_int("omap_allocated", omap_allocated
);
2960 f
->dump_int("internal_metadata", internal_metadata
);
2963 ostream
& operator<<(ostream
& out
, const store_statfs_t
&s
)
2966 << "store_statfs(0x" << s
.available
2967 << "/0x" << s
.internally_reserved
2969 << ", data 0x" << s
.data_stored
2970 << "/0x" << s
.allocated
2971 << ", compress 0x" << s
.data_compressed
2972 << "/0x" << s
.data_compressed_allocated
2973 << "/0x" << s
.data_compressed_original
2974 << ", omap 0x" << s
.omap_allocated
2975 << ", meta 0x" << s
.internal_metadata
2981 void store_statfs_t::generate_test_instances(list
<store_statfs_t
*>& o
)
2984 o
.push_back(new store_statfs_t(a
));
2987 a
.internally_reserved
= 33;
2990 a
.data_compressed
= 21;
2991 a
.data_compressed_allocated
= 12;
2992 a
.data_compressed_original
= 13;
2993 a
.omap_allocated
= 14;
2994 a
.internal_metadata
= 15;
2995 o
.push_back(new store_statfs_t(a
));
2998 // -- pool_stat_t --
3000 void pool_stat_t::dump(Formatter
*f
) const
3003 f
->open_object_section("store_stats");
3004 store_stats
.dump(f
);
3006 f
->dump_int("log_size", log_size
);
3007 f
->dump_int("ondisk_log_size", ondisk_log_size
);
3008 f
->dump_int("up", up
);
3009 f
->dump_int("acting", acting
);
3010 f
->dump_int("num_store_stats", acting
);
3013 void pool_stat_t::encode(bufferlist
&bl
, uint64_t features
) const
3016 if ((features
& CEPH_FEATURE_OSDENC
) == 0) {
3020 encode(log_size
, bl
);
3021 encode(ondisk_log_size
, bl
);
3025 ENCODE_START(7, 5, bl
);
3027 encode(log_size
, bl
);
3028 encode(ondisk_log_size
, bl
);
3031 encode(store_stats
, bl
);
3032 encode(num_store_stats
, bl
);
3036 void pool_stat_t::decode(bufferlist::const_iterator
&bl
)
3038 DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl
);
3039 if (struct_v
>= 4) {
3041 decode(log_size
, bl
);
3042 decode(ondisk_log_size
, bl
);
3043 if (struct_v
>= 6) {
3050 if (struct_v
>= 7) {
3051 decode(store_stats
, bl
);
3052 decode(num_store_stats
, bl
);
3054 store_stats
.reset();
3055 num_store_stats
= 0;
3059 decode(stats
.sum
.num_bytes
, bl
);
3062 decode(stats
.sum
.num_objects
, bl
);
3063 decode(stats
.sum
.num_object_clones
, bl
);
3064 decode(stats
.sum
.num_object_copies
, bl
);
3065 decode(stats
.sum
.num_objects_missing_on_primary
, bl
);
3066 decode(stats
.sum
.num_objects_degraded
, bl
);
3067 decode(log_size
, bl
);
3068 decode(ondisk_log_size
, bl
);
3069 if (struct_v
>= 2) {
3070 decode(stats
.sum
.num_rd
, bl
);
3071 decode(stats
.sum
.num_rd_kb
, bl
);
3072 decode(stats
.sum
.num_wr
, bl
);
3073 decode(stats
.sum
.num_wr_kb
, bl
);
3075 if (struct_v
>= 3) {
3076 decode(stats
.sum
.num_objects_unfound
, bl
);
3082 void pool_stat_t::generate_test_instances(list
<pool_stat_t
*>& o
)
3085 o
.push_back(new pool_stat_t(a
));
3087 list
<object_stat_collection_t
*> l
;
3088 object_stat_collection_t::generate_test_instances(l
);
3089 list
<store_statfs_t
*> ll
;
3090 store_statfs_t::generate_test_instances(ll
);
3091 a
.stats
= *l
.back();
3092 a
.store_stats
= *ll
.back();
3094 a
.ondisk_log_size
= 456;
3097 a
.num_store_stats
= 1;
3098 o
.push_back(new pool_stat_t(a
));
3102 // -- pg_history_t --
3104 void pg_history_t::encode(bufferlist
&bl
) const
3106 ENCODE_START(9, 4, bl
);
3107 encode(epoch_created
, bl
);
3108 encode(last_epoch_started
, bl
);
3109 encode(last_epoch_clean
, bl
);
3110 encode(last_epoch_split
, bl
);
3111 encode(same_interval_since
, bl
);
3112 encode(same_up_since
, bl
);
3113 encode(same_primary_since
, bl
);
3114 encode(last_scrub
, bl
);
3115 encode(last_scrub_stamp
, bl
);
3116 encode(last_deep_scrub
, bl
);
3117 encode(last_deep_scrub_stamp
, bl
);
3118 encode(last_clean_scrub_stamp
, bl
);
3119 encode(last_epoch_marked_full
, bl
);
3120 encode(last_interval_started
, bl
);
3121 encode(last_interval_clean
, bl
);
3122 encode(epoch_pool_created
, bl
);
3126 void pg_history_t::decode(bufferlist::const_iterator
&bl
)
3128 DECODE_START_LEGACY_COMPAT_LEN(9, 4, 4, bl
);
3129 decode(epoch_created
, bl
);
3130 decode(last_epoch_started
, bl
);
3132 decode(last_epoch_clean
, bl
);
3134 last_epoch_clean
= last_epoch_started
; // careful, it's a lie!
3135 decode(last_epoch_split
, bl
);
3136 decode(same_interval_since
, bl
);
3137 decode(same_up_since
, bl
);
3138 decode(same_primary_since
, bl
);
3139 if (struct_v
>= 2) {
3140 decode(last_scrub
, bl
);
3141 decode(last_scrub_stamp
, bl
);
3143 if (struct_v
>= 5) {
3144 decode(last_deep_scrub
, bl
);
3145 decode(last_deep_scrub_stamp
, bl
);
3147 if (struct_v
>= 6) {
3148 decode(last_clean_scrub_stamp
, bl
);
3150 if (struct_v
>= 7) {
3151 decode(last_epoch_marked_full
, bl
);
3153 if (struct_v
>= 8) {
3154 decode(last_interval_started
, bl
);
3155 decode(last_interval_clean
, bl
);
3157 if (last_epoch_started
>= same_interval_since
) {
3158 last_interval_started
= same_interval_since
;
3160 last_interval_started
= last_epoch_started
; // best guess
3162 if (last_epoch_clean
>= same_interval_since
) {
3163 last_interval_clean
= same_interval_since
;
3165 last_interval_clean
= last_epoch_clean
; // best guess
3168 if (struct_v
>= 9) {
3169 decode(epoch_pool_created
, bl
);
3171 epoch_pool_created
= epoch_created
;
3176 void pg_history_t::dump(Formatter
*f
) const
3178 f
->dump_int("epoch_created", epoch_created
);
3179 f
->dump_int("epoch_pool_created", epoch_pool_created
);
3180 f
->dump_int("last_epoch_started", last_epoch_started
);
3181 f
->dump_int("last_interval_started", last_interval_started
);
3182 f
->dump_int("last_epoch_clean", last_epoch_clean
);
3183 f
->dump_int("last_interval_clean", last_interval_clean
);
3184 f
->dump_int("last_epoch_split", last_epoch_split
);
3185 f
->dump_int("last_epoch_marked_full", last_epoch_marked_full
);
3186 f
->dump_int("same_up_since", same_up_since
);
3187 f
->dump_int("same_interval_since", same_interval_since
);
3188 f
->dump_int("same_primary_since", same_primary_since
);
3189 f
->dump_stream("last_scrub") << last_scrub
;
3190 f
->dump_stream("last_scrub_stamp") << last_scrub_stamp
;
3191 f
->dump_stream("last_deep_scrub") << last_deep_scrub
;
3192 f
->dump_stream("last_deep_scrub_stamp") << last_deep_scrub_stamp
;
3193 f
->dump_stream("last_clean_scrub_stamp") << last_clean_scrub_stamp
;
3196 void pg_history_t::generate_test_instances(list
<pg_history_t
*>& o
)
3198 o
.push_back(new pg_history_t
);
3199 o
.push_back(new pg_history_t
);
3200 o
.back()->epoch_created
= 1;
3201 o
.back()->epoch_pool_created
= 1;
3202 o
.back()->last_epoch_started
= 2;
3203 o
.back()->last_interval_started
= 2;
3204 o
.back()->last_epoch_clean
= 3;
3205 o
.back()->last_interval_clean
= 2;
3206 o
.back()->last_epoch_split
= 4;
3207 o
.back()->same_up_since
= 5;
3208 o
.back()->same_interval_since
= 6;
3209 o
.back()->same_primary_since
= 7;
3210 o
.back()->last_scrub
= eversion_t(8, 9);
3211 o
.back()->last_scrub_stamp
= utime_t(10, 11);
3212 o
.back()->last_deep_scrub
= eversion_t(12, 13);
3213 o
.back()->last_deep_scrub_stamp
= utime_t(14, 15);
3214 o
.back()->last_clean_scrub_stamp
= utime_t(16, 17);
3215 o
.back()->last_epoch_marked_full
= 18;
3221 void pg_info_t::encode(bufferlist
&bl
) const
3223 ENCODE_START(32, 26, bl
);
3224 encode(pgid
.pgid
, bl
);
3225 encode(last_update
, bl
);
3226 encode(last_complete
, bl
);
3227 encode(log_tail
, bl
);
3228 if (last_backfill_bitwise
&& !last_backfill
.is_max()) {
3229 encode(hobject_t(), bl
);
3231 encode(last_backfill
, bl
);
3235 encode(purged_snaps
, bl
);
3236 encode(last_epoch_started
, bl
);
3237 encode(last_user_version
, bl
);
3238 encode(hit_set
, bl
);
3239 encode(pgid
.shard
, bl
);
3240 encode(last_backfill
, bl
);
3241 encode(last_backfill_bitwise
, bl
);
3242 encode(last_interval_started
, bl
);
3246 void pg_info_t::decode(bufferlist::const_iterator
&bl
)
3248 DECODE_START(32, bl
);
3249 decode(pgid
.pgid
, bl
);
3250 decode(last_update
, bl
);
3251 decode(last_complete
, bl
);
3252 decode(log_tail
, bl
);
3254 hobject_t old_last_backfill
;
3255 decode(old_last_backfill
, bl
);
3259 decode(purged_snaps
, bl
);
3260 decode(last_epoch_started
, bl
);
3261 decode(last_user_version
, bl
);
3262 decode(hit_set
, bl
);
3263 decode(pgid
.shard
, bl
);
3264 decode(last_backfill
, bl
);
3265 decode(last_backfill_bitwise
, bl
);
3266 if (struct_v
>= 32) {
3267 decode(last_interval_started
, bl
);
3269 last_interval_started
= last_epoch_started
;
3276 void pg_info_t::dump(Formatter
*f
) const
3278 f
->dump_stream("pgid") << pgid
;
3279 f
->dump_stream("last_update") << last_update
;
3280 f
->dump_stream("last_complete") << last_complete
;
3281 f
->dump_stream("log_tail") << log_tail
;
3282 f
->dump_int("last_user_version", last_user_version
);
3283 f
->dump_stream("last_backfill") << last_backfill
;
3284 f
->dump_int("last_backfill_bitwise", (int)last_backfill_bitwise
);
3285 f
->open_array_section("purged_snaps");
3286 for (interval_set
<snapid_t
>::const_iterator i
=purged_snaps
.begin();
3287 i
!= purged_snaps
.end();
3289 f
->open_object_section("purged_snap_interval");
3290 f
->dump_stream("start") << i
.get_start();
3291 f
->dump_stream("length") << i
.get_len();
3295 f
->open_object_section("history");
3298 f
->open_object_section("stats");
3302 f
->dump_int("empty", is_empty());
3303 f
->dump_int("dne", dne());
3304 f
->dump_int("incomplete", is_incomplete());
3305 f
->dump_int("last_epoch_started", last_epoch_started
);
3307 f
->open_object_section("hit_set_history");
3312 void pg_info_t::generate_test_instances(list
<pg_info_t
*>& o
)
3314 o
.push_back(new pg_info_t
);
3315 o
.push_back(new pg_info_t
);
3316 list
<pg_history_t
*> h
;
3317 pg_history_t::generate_test_instances(h
);
3318 o
.back()->history
= *h
.back();
3319 o
.back()->pgid
= spg_t(pg_t(1, 2), shard_id_t::NO_SHARD
);
3320 o
.back()->last_update
= eversion_t(3, 4);
3321 o
.back()->last_complete
= eversion_t(5, 6);
3322 o
.back()->last_user_version
= 2;
3323 o
.back()->log_tail
= eversion_t(7, 8);
3324 o
.back()->last_backfill
= hobject_t(object_t("objname"), "key", 123, 456, -1, "");
3325 o
.back()->last_backfill_bitwise
= true;
3328 pg_stat_t::generate_test_instances(s
);
3329 o
.back()->stats
= *s
.back();
3332 list
<pg_hit_set_history_t
*> s
;
3333 pg_hit_set_history_t::generate_test_instances(s
);
3334 o
.back()->hit_set
= *s
.back();
3338 // -- pg_notify_t --
3339 void pg_notify_t::encode(bufferlist
&bl
) const
3341 ENCODE_START(2, 2, bl
);
3342 encode(query_epoch
, bl
);
3343 encode(epoch_sent
, bl
);
3350 void pg_notify_t::decode(bufferlist::const_iterator
&bl
)
3352 DECODE_START(2, bl
);
3353 decode(query_epoch
, bl
);
3354 decode(epoch_sent
, bl
);
3361 void pg_notify_t::dump(Formatter
*f
) const
3363 f
->dump_int("from", from
);
3364 f
->dump_int("to", to
);
3365 f
->dump_unsigned("query_epoch", query_epoch
);
3366 f
->dump_unsigned("epoch_sent", epoch_sent
);
3368 f
->open_object_section("info");
3374 void pg_notify_t::generate_test_instances(list
<pg_notify_t
*>& o
)
3376 o
.push_back(new pg_notify_t(shard_id_t(3), shard_id_t::NO_SHARD
, 1, 1, pg_info_t()));
3377 o
.push_back(new pg_notify_t(shard_id_t(0), shard_id_t(0), 3, 10, pg_info_t()));
3380 ostream
&operator<<(ostream
&lhs
, const pg_notify_t
¬ify
)
3382 lhs
<< "(query:" << notify
.query_epoch
3383 << " sent:" << notify
.epoch_sent
3384 << " " << notify
.info
;
3385 if (notify
.from
!= shard_id_t::NO_SHARD
||
3386 notify
.to
!= shard_id_t::NO_SHARD
)
3387 lhs
<< " " << (unsigned)notify
.from
3388 << "->" << (unsigned)notify
.to
;
3392 // -- pg_interval_t --
3394 void PastIntervals::pg_interval_t::encode(bufferlist
& bl
) const
3396 ENCODE_START(4, 2, bl
);
3401 encode(maybe_went_rw
, bl
);
3402 encode(primary
, bl
);
3403 encode(up_primary
, bl
);
3407 void PastIntervals::pg_interval_t::decode(bufferlist::const_iterator
& bl
)
3409 DECODE_START_LEGACY_COMPAT_LEN(4, 2, 2, bl
);
3414 decode(maybe_went_rw
, bl
);
3415 if (struct_v
>= 3) {
3416 decode(primary
, bl
);
3419 primary
= acting
[0];
3421 if (struct_v
>= 4) {
3422 decode(up_primary
, bl
);
3430 void PastIntervals::pg_interval_t::dump(Formatter
*f
) const
3432 f
->dump_unsigned("first", first
);
3433 f
->dump_unsigned("last", last
);
3434 f
->dump_int("maybe_went_rw", maybe_went_rw
? 1 : 0);
3435 f
->open_array_section("up");
3436 for (vector
<int>::const_iterator p
= up
.begin(); p
!= up
.end(); ++p
)
3437 f
->dump_int("osd", *p
);
3439 f
->open_array_section("acting");
3440 for (vector
<int>::const_iterator p
= acting
.begin(); p
!= acting
.end(); ++p
)
3441 f
->dump_int("osd", *p
);
3443 f
->dump_int("primary", primary
);
3444 f
->dump_int("up_primary", up_primary
);
3447 void PastIntervals::pg_interval_t::generate_test_instances(list
<pg_interval_t
*>& o
)
3449 o
.push_back(new pg_interval_t
);
3450 o
.push_back(new pg_interval_t
);
3451 o
.back()->up
.push_back(1);
3452 o
.back()->acting
.push_back(2);
3453 o
.back()->acting
.push_back(3);
3454 o
.back()->first
= 4;
3456 o
.back()->maybe_went_rw
= true;
3459 WRITE_CLASS_ENCODER(PastIntervals::pg_interval_t
)
3465 * PastIntervals only needs to be able to answer two questions:
3466 * 1) Where should the primary look for unfound objects?
3467 * 2) List a set of subsets of the OSDs such that contacting at least
3468 * one from each subset guarantees we speak to at least one witness
3469 * of any completed write.
3471 * Crucially, 2) does not require keeping *all* past intervals. Certainly,
3472 * we don't need to keep any where maybe_went_rw would be false. We also
3473 * needn't keep two intervals where the actingset in one is a subset
3474 * of the other (only need to keep the smaller of the two sets). In order
3475 * to accurately trim the set of intervals as last_epoch_started changes
3476 * without rebuilding the set from scratch, we'll retain the larger set
3477 * if it in an older interval.
3479 struct compact_interval_t
{
3482 set
<pg_shard_t
> acting
;
3483 bool supersedes(const compact_interval_t
&other
) {
3484 for (auto &&i
: acting
) {
3485 if (!other
.acting
.count(i
))
3490 void dump(Formatter
*f
) const {
3491 f
->open_object_section("compact_interval_t");
3492 f
->dump_stream("first") << first
;
3493 f
->dump_stream("last") << last
;
3494 f
->dump_stream("acting") << acting
;
3497 void encode(bufferlist
&bl
) const {
3498 ENCODE_START(1, 1, bl
);
3504 void decode(bufferlist::const_iterator
&bl
) {
3505 DECODE_START(1, bl
);
3511 static void generate_test_instances(list
<compact_interval_t
*> & o
) {
3512 /* Not going to be used, we'll generate pi_compact_rep directly */
3515 ostream
&operator<<(ostream
&o
, const compact_interval_t
&rhs
)
3517 return o
<< "([" << rhs
.first
<< "," << rhs
.last
3518 << "] acting " << rhs
.acting
<< ")";
3520 WRITE_CLASS_ENCODER(compact_interval_t
)
3522 class pi_compact_rep
: public PastIntervals::interval_rep
{
3524 epoch_t last
= 0; // inclusive
3525 set
<pg_shard_t
> all_participants
;
3526 list
<compact_interval_t
> intervals
;
3529 std::list
<PastIntervals::pg_interval_t
> &&intervals
) {
3530 for (auto &&i
: intervals
)
3531 add_interval(ec_pool
, i
);
3534 pi_compact_rep() = default;
3535 pi_compact_rep(const pi_compact_rep
&) = default;
3536 pi_compact_rep(pi_compact_rep
&&) = default;
3537 pi_compact_rep
&operator=(const pi_compact_rep
&) = default;
3538 pi_compact_rep
&operator=(pi_compact_rep
&&) = default;
3540 size_t size() const override
{ return intervals
.size(); }
3541 bool empty() const override
{
3542 return first
> last
|| (first
== 0 && last
== 0);
3544 void clear() override
{
3545 *this = pi_compact_rep();
3547 pair
<epoch_t
, epoch_t
> get_bounds() const override
{
3548 return make_pair(first
, last
+ 1);
3550 void adjust_start_backwards(epoch_t last_epoch_clean
) {
3551 first
= last_epoch_clean
;
3554 set
<pg_shard_t
> get_all_participants(
3555 bool ec_pool
) const override
{
3556 return all_participants
;
3559 bool ec_pool
, const PastIntervals::pg_interval_t
&interval
) override
{
3561 first
= interval
.first
;
3562 ceph_assert(interval
.last
> last
);
3563 last
= interval
.last
;
3564 set
<pg_shard_t
> acting
;
3565 for (unsigned i
= 0; i
< interval
.acting
.size(); ++i
) {
3566 if (interval
.acting
[i
] == CRUSH_ITEM_NONE
)
3571 ec_pool
? shard_id_t(i
) : shard_id_t::NO_SHARD
));
3573 all_participants
.insert(acting
.begin(), acting
.end());
3574 if (!interval
.maybe_went_rw
)
3576 intervals
.push_back(
3577 compact_interval_t
{interval
.first
, interval
.last
, acting
});
3578 auto plast
= intervals
.end();
3580 for (auto cur
= intervals
.begin(); cur
!= plast
; ) {
3581 if (plast
->supersedes(*cur
)) {
3582 intervals
.erase(cur
++);
3588 unique_ptr
<PastIntervals::interval_rep
> clone() const override
{
3589 return unique_ptr
<PastIntervals::interval_rep
>(new pi_compact_rep(*this));
3591 ostream
&print(ostream
&out
) const override
{
3592 return out
<< "([" << first
<< "," << last
3593 << "] intervals=" << intervals
<< ")";
3595 void encode(bufferlist
&bl
) const override
{
3596 ENCODE_START(1, 1, bl
);
3599 encode(all_participants
, bl
);
3600 encode(intervals
, bl
);
3603 void decode(bufferlist::const_iterator
&bl
) override
{
3604 DECODE_START(1, bl
);
3607 decode(all_participants
, bl
);
3608 decode(intervals
, bl
);
3611 void dump(Formatter
*f
) const override
{
3612 f
->open_object_section("PastIntervals::compact_rep");
3613 f
->dump_stream("first") << first
;
3614 f
->dump_stream("last") << last
;
3615 f
->open_array_section("all_participants");
3616 for (auto& i
: all_participants
) {
3617 f
->dump_object("pg_shard", i
);
3620 f
->open_array_section("intervals");
3621 for (auto &&i
: intervals
) {
3627 static void generate_test_instances(list
<pi_compact_rep
*> &o
) {
3628 using ival
= PastIntervals::pg_interval_t
;
3629 using ivallst
= std::list
<ival
>;
3633 { ival
{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
3634 , ival
{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1}
3635 , ival
{{ 2}, { 2}, 31, 35, false, 2, 2}
3636 , ival
{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
3641 { ival
{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
3642 , ival
{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1}
3643 , ival
{{ 2}, { 2}, 31, 35, false, 2, 2}
3644 , ival
{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
3649 { ival
{{2, 1, 0}, {2, 1, 0}, 10, 20, true, 1, 1}
3650 , ival
{{ 0, 2}, { 0, 2}, 21, 30, true, 0, 0}
3651 , ival
{{ 0, 2}, {2, 0}, 31, 35, true, 2, 2}
3652 , ival
{{ 0, 2}, { 0, 2}, 36, 50, true, 0, 0}
3655 void iterate_mayberw_back_to(
3657 std::function
<void(epoch_t
, const set
<pg_shard_t
> &)> &&f
) const override
{
3658 for (auto i
= intervals
.rbegin(); i
!= intervals
.rend(); ++i
) {
3661 f(i
->first
, i
->acting
);
3664 virtual ~pi_compact_rep() override
{}
3666 WRITE_CLASS_ENCODER(pi_compact_rep
)
3668 PastIntervals::PastIntervals()
3670 past_intervals
.reset(new pi_compact_rep
);
3673 PastIntervals::PastIntervals(const PastIntervals
&rhs
)
3674 : past_intervals(rhs
.past_intervals
?
3675 rhs
.past_intervals
->clone() :
3678 PastIntervals
&PastIntervals::operator=(const PastIntervals
&rhs
)
3680 PastIntervals
other(rhs
);
3685 ostream
& operator<<(ostream
& out
, const PastIntervals
&i
)
3687 if (i
.past_intervals
) {
3688 return i
.past_intervals
->print(out
);
3690 return out
<< "(empty)";
3694 ostream
& operator<<(ostream
& out
, const PastIntervals::PriorSet
&i
)
3696 return out
<< "PriorSet("
3697 << "ec_pool: " << i
.ec_pool
3698 << ", probe: " << i
.probe
3699 << ", down: " << i
.down
3700 << ", blocked_by: " << i
.blocked_by
3701 << ", pg_down: " << i
.pg_down
3705 void PastIntervals::decode(bufferlist::const_iterator
&bl
)
3707 DECODE_START(1, bl
);
3714 ceph_abort_msg("pi_simple_rep support removed post-luminous");
3717 past_intervals
.reset(new pi_compact_rep
);
3718 past_intervals
->decode(bl
);
3724 void PastIntervals::generate_test_instances(list
<PastIntervals
*> &o
)
3727 list
<pi_compact_rep
*> compact
;
3728 pi_compact_rep::generate_test_instances(compact
);
3729 for (auto &&i
: compact
) {
3730 // takes ownership of contents
3731 o
.push_back(new PastIntervals(i
));
3737 bool PastIntervals::is_new_interval(
3738 int old_acting_primary
,
3739 int new_acting_primary
,
3740 const vector
<int> &old_acting
,
3741 const vector
<int> &new_acting
,
3744 const vector
<int> &old_up
,
3745 const vector
<int> &new_up
,
3750 unsigned old_pg_num
,
3751 unsigned new_pg_num
,
3752 unsigned old_pg_num_pending
,
3753 unsigned new_pg_num_pending
,
3754 bool old_sort_bitwise
,
3755 bool new_sort_bitwise
,
3756 bool old_recovery_deletes
,
3757 bool new_recovery_deletes
,
3759 return old_acting_primary
!= new_acting_primary
||
3760 new_acting
!= old_acting
||
3761 old_up_primary
!= new_up_primary
||
3763 old_min_size
!= new_min_size
||
3764 old_size
!= new_size
||
3765 pgid
.is_split(old_pg_num
, new_pg_num
, 0) ||
3766 // (is or was) pre-merge source
3767 pgid
.is_merge_source(old_pg_num_pending
, new_pg_num_pending
, 0) ||
3768 pgid
.is_merge_source(new_pg_num_pending
, old_pg_num_pending
, 0) ||
3770 pgid
.is_merge_source(old_pg_num
, new_pg_num
, 0) ||
3771 // (is or was) pre-merge target
3772 pgid
.is_merge_target(old_pg_num_pending
, new_pg_num_pending
) ||
3773 pgid
.is_merge_target(new_pg_num_pending
, old_pg_num_pending
) ||
3775 pgid
.is_merge_target(old_pg_num
, new_pg_num
) ||
3776 old_sort_bitwise
!= new_sort_bitwise
||
3777 old_recovery_deletes
!= new_recovery_deletes
;
3780 bool PastIntervals::is_new_interval(
3781 int old_acting_primary
,
3782 int new_acting_primary
,
3783 const vector
<int> &old_acting
,
3784 const vector
<int> &new_acting
,
3787 const vector
<int> &old_up
,
3788 const vector
<int> &new_up
,
3793 const pg_pool_t
*plast
= lastmap
->get_pg_pool(pgid
.pool());
3795 return false; // after pool is deleted there are no more interval changes
3797 const pg_pool_t
*pi
= osdmap
->get_pg_pool(pgid
.pool());
3799 return true; // pool was deleted this epoch -> (final!) interval change
3802 is_new_interval(old_acting_primary
,
3814 plast
->get_pg_num(),
3816 plast
->get_pg_num_pending(),
3817 pi
->get_pg_num_pending(),
3818 lastmap
->test_flag(CEPH_OSDMAP_SORTBITWISE
),
3819 osdmap
->test_flag(CEPH_OSDMAP_SORTBITWISE
),
3820 lastmap
->test_flag(CEPH_OSDMAP_RECOVERY_DELETES
),
3821 osdmap
->test_flag(CEPH_OSDMAP_RECOVERY_DELETES
),
3825 bool PastIntervals::check_new_interval(
3826 int old_acting_primary
,
3827 int new_acting_primary
,
3828 const vector
<int> &old_acting
,
3829 const vector
<int> &new_acting
,
3832 const vector
<int> &old_up
,
3833 const vector
<int> &new_up
,
3834 epoch_t same_interval_since
,
3835 epoch_t last_epoch_clean
,
3839 IsPGRecoverablePredicate
*could_have_gone_active
,
3840 PastIntervals
*past_intervals
,
3844 * We have to be careful to gracefully deal with situations like
3845 * so. Say we have a power outage or something that takes out both
3846 * OSDs, but the monitor doesn't mark them down in the same epoch.
3847 * The history may look like
3851 * 3: let's say B dies for good, too (say, from the power spike)
3854 * which makes it look like B may have applied updates to the PG
3855 * that we need in order to proceed. This sucks...
3857 * To minimize the risk of this happening, we CANNOT go active if
3858 * _any_ OSDs in the prior set are down until we send an MOSDAlive
3859 * to the monitor such that the OSDMap sets osd_up_thru to an epoch.
3860 * Then, we have something like
3867 * -> we can ignore B, bc it couldn't have gone active (up_thru still 0).
3877 * -> we must wait for B, bc it was alive through 2, and could have
3878 * written to the pg.
3880 * If B is really dead, then an administrator will need to manually
3881 * intervene by marking the OSD as "lost."
3884 // remember past interval
3885 // NOTE: a change in the up set primary triggers an interval
3886 // change, even though the interval members in the pg_interval_t
3888 ceph_assert(past_intervals
);
3889 ceph_assert(past_intervals
->past_intervals
);
3890 if (is_new_interval(
3903 i
.first
= same_interval_since
;
3904 i
.last
= osdmap
->get_epoch() - 1;
3905 ceph_assert(i
.first
<= i
.last
);
3906 i
.acting
= old_acting
;
3908 i
.primary
= old_acting_primary
;
3909 i
.up_primary
= old_up_primary
;
3911 unsigned num_acting
= 0;
3912 for (vector
<int>::const_iterator p
= i
.acting
.begin(); p
!= i
.acting
.end();
3914 if (*p
!= CRUSH_ITEM_NONE
)
3917 ceph_assert(lastmap
->get_pools().count(pgid
.pool()));
3918 const pg_pool_t
& old_pg_pool
= lastmap
->get_pools().find(pgid
.pool())->second
;
3919 set
<pg_shard_t
> old_acting_shards
;
3920 old_pg_pool
.convert_to_pg_shards(old_acting
, &old_acting_shards
);
3924 num_acting
>= old_pg_pool
.min_size
&&
3925 (*could_have_gone_active
)(old_acting_shards
)) {
3927 *out
<< __func__
<< " " << i
3928 << " up_thru " << lastmap
->get_up_thru(i
.primary
)
3929 << " up_from " << lastmap
->get_up_from(i
.primary
)
3930 << " last_epoch_clean " << last_epoch_clean
;
3931 if (lastmap
->get_up_thru(i
.primary
) >= i
.first
&&
3932 lastmap
->get_up_from(i
.primary
) <= i
.first
) {
3933 i
.maybe_went_rw
= true;
3936 << " : primary up " << lastmap
->get_up_from(i
.primary
)
3937 << "-" << lastmap
->get_up_thru(i
.primary
)
3938 << " includes interval"
3940 } else if (last_epoch_clean
>= i
.first
&&
3941 last_epoch_clean
<= i
.last
) {
3942 // If the last_epoch_clean is included in this interval, then
3943 // the pg must have been rw (for recovery to have completed).
3944 // This is important because we won't know the _real_
3945 // first_epoch because we stop at last_epoch_clean, and we
3946 // don't want the oldest interval to randomly have
3947 // maybe_went_rw false depending on the relative up_thru vs
3948 // last_epoch_clean timing.
3949 i
.maybe_went_rw
= true;
3952 << " : includes last_epoch_clean " << last_epoch_clean
3953 << " and presumed to have been rw"
3956 i
.maybe_went_rw
= false;
3959 << " : primary up " << lastmap
->get_up_from(i
.primary
)
3960 << "-" << lastmap
->get_up_thru(i
.primary
)
3961 << " does not include interval"
3965 i
.maybe_went_rw
= false;
3967 *out
<< __func__
<< " " << i
<< " : acting set is too small" << std::endl
;
3969 past_intervals
->past_intervals
->add_interval(old_pg_pool
.is_erasure(), i
);
3977 // true if the given map affects the prior set
3978 bool PastIntervals::PriorSet::affected_by_map(
3979 const OSDMap
&osdmap
,
3980 const DoutPrefixProvider
*dpp
) const
3982 for (set
<pg_shard_t
>::iterator p
= probe
.begin();
3987 // did someone in the prior set go down?
3988 if (osdmap
.is_down(o
) && down
.count(o
) == 0) {
3989 ldpp_dout(dpp
, 10) << "affected_by_map osd." << o
<< " now down" << dendl
;
3993 // did a down osd in cur get (re)marked as lost?
3994 map
<int, epoch_t
>::const_iterator r
= blocked_by
.find(o
);
3995 if (r
!= blocked_by
.end()) {
3996 if (!osdmap
.exists(o
)) {
3997 ldpp_dout(dpp
, 10) << "affected_by_map osd." << o
<< " no longer exists" << dendl
;
4000 if (osdmap
.get_info(o
).lost_at
!= r
->second
) {
4001 ldpp_dout(dpp
, 10) << "affected_by_map osd." << o
<< " (re)marked as lost" << dendl
;
4007 // did someone in the prior down set go up?
4008 for (set
<int>::const_iterator p
= down
.begin();
4013 if (osdmap
.is_up(o
)) {
4014 ldpp_dout(dpp
, 10) << "affected_by_map osd." << o
<< " now up" << dendl
;
4018 // did someone in the prior set get lost or destroyed?
4019 if (!osdmap
.exists(o
)) {
4020 ldpp_dout(dpp
, 10) << "affected_by_map osd." << o
<< " no longer exists" << dendl
;
4023 // did a down osd in down get (re)marked as lost?
4024 map
<int, epoch_t
>::const_iterator r
= blocked_by
.find(o
);
4025 if (r
!= blocked_by
.end()) {
4026 if (osdmap
.get_info(o
).lost_at
!= r
->second
) {
4027 ldpp_dout(dpp
, 10) << "affected_by_map osd." << o
<< " (re)marked as lost" << dendl
;
4036 ostream
& operator<<(ostream
& out
, const PastIntervals::pg_interval_t
& i
)
4038 out
<< "interval(" << i
.first
<< "-" << i
.last
4039 << " up " << i
.up
<< "(" << i
.up_primary
<< ")"
4040 << " acting " << i
.acting
<< "(" << i
.primary
<< ")";
4041 if (i
.maybe_went_rw
)
4042 out
<< " maybe_went_rw";
4051 void pg_query_t::encode(bufferlist
&bl
, uint64_t features
) const {
4052 ENCODE_START(3, 3, bl
);
4056 encode(epoch_sent
, bl
);
4062 void pg_query_t::decode(bufferlist::const_iterator
&bl
) {
4063 DECODE_START(3, bl
);
4067 decode(epoch_sent
, bl
);
4073 void pg_query_t::dump(Formatter
*f
) const
4075 f
->dump_int("from", from
);
4076 f
->dump_int("to", to
);
4077 f
->dump_string("type", get_type_name());
4078 f
->dump_stream("since") << since
;
4079 f
->dump_stream("epoch_sent") << epoch_sent
;
4080 f
->open_object_section("history");
4084 void pg_query_t::generate_test_instances(list
<pg_query_t
*>& o
)
4086 o
.push_back(new pg_query_t());
4087 list
<pg_history_t
*> h
;
4088 pg_history_t::generate_test_instances(h
);
4089 o
.push_back(new pg_query_t(pg_query_t::INFO
, shard_id_t(1), shard_id_t(2), *h
.back(), 4));
4090 o
.push_back(new pg_query_t(pg_query_t::MISSING
, shard_id_t(2), shard_id_t(3), *h
.back(), 4));
4091 o
.push_back(new pg_query_t(pg_query_t::LOG
, shard_id_t(0), shard_id_t(0),
4092 eversion_t(4, 5), *h
.back(), 4));
4093 o
.push_back(new pg_query_t(pg_query_t::FULLLOG
,
4094 shard_id_t::NO_SHARD
, shard_id_t::NO_SHARD
,
4098 // -- ObjectModDesc --
4099 void ObjectModDesc::visit(Visitor
*visitor
) const
4101 auto bp
= bl
.cbegin();
4104 DECODE_START(max_required_version
, bp
);
4111 visitor
->append(size
);
4115 map
<string
, boost::optional
<bufferlist
> > attrs
;
4117 visitor
->setattrs(attrs
);
4121 version_t old_version
;
4122 decode(old_version
, bp
);
4123 visitor
->rmobject(old_version
);
4130 case UPDATE_SNAPS
: {
4131 set
<snapid_t
> snaps
;
4133 visitor
->update_snaps(snaps
);
4137 version_t old_version
;
4138 decode(old_version
, bp
);
4139 visitor
->try_rmobject(old_version
);
4142 case ROLLBACK_EXTENTS
: {
4143 vector
<pair
<uint64_t, uint64_t> > extents
;
4146 decode(extents
, bp
);
4147 visitor
->rollback_extents(gen
,extents
);
4151 ceph_abort_msg("Invalid rollback code");
4156 ceph_abort_msg("Invalid encoding");
4160 struct DumpVisitor
: public ObjectModDesc::Visitor
{
4162 explicit DumpVisitor(Formatter
*f
) : f(f
) {}
4163 void append(uint64_t old_size
) override
{
4164 f
->open_object_section("op");
4165 f
->dump_string("code", "APPEND");
4166 f
->dump_unsigned("old_size", old_size
);
4169 void setattrs(map
<string
, boost::optional
<bufferlist
> > &attrs
) override
{
4170 f
->open_object_section("op");
4171 f
->dump_string("code", "SETATTRS");
4172 f
->open_array_section("attrs");
4173 for (map
<string
, boost::optional
<bufferlist
> >::iterator i
= attrs
.begin();
4176 f
->dump_string("attr_name", i
->first
);
4181 void rmobject(version_t old_version
) override
{
4182 f
->open_object_section("op");
4183 f
->dump_string("code", "RMOBJECT");
4184 f
->dump_unsigned("old_version", old_version
);
4187 void try_rmobject(version_t old_version
) override
{
4188 f
->open_object_section("op");
4189 f
->dump_string("code", "TRY_RMOBJECT");
4190 f
->dump_unsigned("old_version", old_version
);
4193 void create() override
{
4194 f
->open_object_section("op");
4195 f
->dump_string("code", "CREATE");
4198 void update_snaps(const set
<snapid_t
> &snaps
) override
{
4199 f
->open_object_section("op");
4200 f
->dump_string("code", "UPDATE_SNAPS");
4201 f
->dump_stream("snaps") << snaps
;
4204 void rollback_extents(
4206 const vector
<pair
<uint64_t, uint64_t> > &extents
) override
{
4207 f
->open_object_section("op");
4208 f
->dump_string("code", "ROLLBACK_EXTENTS");
4209 f
->dump_unsigned("gen", gen
);
4210 f
->dump_stream("snaps") << extents
;
4215 void ObjectModDesc::dump(Formatter
*f
) const
4217 f
->open_object_section("object_mod_desc");
4218 f
->dump_bool("can_local_rollback", can_local_rollback
);
4219 f
->dump_bool("rollback_info_completed", rollback_info_completed
);
4221 f
->open_array_section("ops");
4229 void ObjectModDesc::generate_test_instances(list
<ObjectModDesc
*>& o
)
4231 map
<string
, boost::optional
<bufferlist
> > attrs
;
4235 o
.push_back(new ObjectModDesc());
4236 o
.back()->append(100);
4237 o
.back()->setattrs(attrs
);
4238 o
.push_back(new ObjectModDesc());
4239 o
.back()->rmobject(1001);
4240 o
.push_back(new ObjectModDesc());
4242 o
.back()->setattrs(attrs
);
4243 o
.push_back(new ObjectModDesc());
4245 o
.back()->setattrs(attrs
);
4246 o
.back()->mark_unrollbackable();
4247 o
.back()->append(1000);
4250 void ObjectModDesc::encode(bufferlist
&_bl
) const
4252 ENCODE_START(max_required_version
, max_required_version
, _bl
);
4253 encode(can_local_rollback
, _bl
);
4254 encode(rollback_info_completed
, _bl
);
4258 void ObjectModDesc::decode(bufferlist::const_iterator
&_bl
)
4260 DECODE_START(2, _bl
);
4261 max_required_version
= struct_v
;
4262 decode(can_local_rollback
, _bl
);
4263 decode(rollback_info_completed
, _bl
);
4265 // ensure bl does not pin a larger buffer in memory
4267 bl
.reassign_to_mempool(mempool::mempool_osd_pglog
);
4271 // -- pg_log_entry_t --
4273 string
pg_log_entry_t::get_key_name() const
4275 return version
.get_key_name();
4278 void pg_log_entry_t::encode_with_checksum(bufferlist
& bl
) const
4281 bufferlist
ebl(sizeof(*this)*2);
4283 __u32 crc
= ebl
.crc32c(0);
4288 void pg_log_entry_t::decode_with_checksum(bufferlist::const_iterator
& p
)
4295 if (crc
!= bl
.crc32c(0))
4296 throw buffer::malformed_input("bad checksum on pg_log_entry_t");
4297 auto q
= bl
.cbegin();
4301 void pg_log_entry_t::encode(bufferlist
&bl
) const
4303 ENCODE_START(12, 4, bl
);
4306 encode(version
, bl
);
4309 * Added with reverting_to:
4310 * Previous code used prior_version to encode
4311 * what we now call reverting_to. This will
4312 * allow older code to decode reverting_to
4313 * into prior_version as expected.
4315 if (op
== LOST_REVERT
)
4316 encode(reverting_to
, bl
);
4318 encode(prior_version
, bl
);
4322 if (op
== LOST_REVERT
)
4323 encode(prior_version
, bl
);
4325 encode(user_version
, bl
);
4326 encode(mod_desc
, bl
);
4327 encode(extra_reqids
, bl
);
4329 encode(return_code
, bl
);
4330 if (!extra_reqids
.empty())
4331 encode(extra_reqid_return_codes
, bl
);
4335 void pg_log_entry_t::decode(bufferlist::const_iterator
&bl
)
4337 DECODE_START_LEGACY_COMPAT_LEN(12, 4, 4, bl
);
4341 decode(old_soid
, bl
);
4342 soid
.oid
= old_soid
.oid
;
4343 soid
.snap
= old_soid
.snap
;
4344 invalid_hash
= true;
4349 invalid_hash
= true;
4350 decode(version
, bl
);
4352 if (struct_v
>= 6 && op
== LOST_REVERT
)
4353 decode(reverting_to
, bl
);
4355 decode(prior_version
, bl
);
4361 invalid_pool
= true;
4363 if (op
== LOST_REVERT
) {
4364 if (struct_v
>= 6) {
4365 decode(prior_version
, bl
);
4367 reverting_to
= prior_version
;
4370 if (struct_v
>= 7 || // for v >= 7, this is for all ops.
4371 op
== CLONE
) { // for v < 7, it's only present for CLONE.
4373 // ensure snaps does not pin a larger buffer in memory
4375 snaps
.reassign_to_mempool(mempool::mempool_osd_pglog
);
4379 decode(user_version
, bl
);
4381 user_version
= version
.version
;
4384 decode(mod_desc
, bl
);
4386 mod_desc
.mark_unrollbackable();
4388 decode(extra_reqids
, bl
);
4389 if (struct_v
>= 11 && op
== ERROR
)
4390 decode(return_code
, bl
);
4391 if (struct_v
>= 12 && !extra_reqids
.empty())
4392 decode(extra_reqid_return_codes
, bl
);
4396 void pg_log_entry_t::dump(Formatter
*f
) const
4398 f
->dump_string("op", get_op_name());
4399 f
->dump_stream("object") << soid
;
4400 f
->dump_stream("version") << version
;
4401 f
->dump_stream("prior_version") << prior_version
;
4402 f
->dump_stream("reqid") << reqid
;
4403 f
->open_array_section("extra_reqids");
4405 for (auto p
= extra_reqids
.begin();
4406 p
!= extra_reqids
.end();
4408 f
->open_object_section("extra_reqid");
4409 f
->dump_stream("reqid") << p
->first
;
4410 f
->dump_stream("user_version") << p
->second
;
4411 auto it
= extra_reqid_return_codes
.find(idx
);
4412 if (it
!= extra_reqid_return_codes
.end()) {
4413 f
->dump_int("return_code", it
->second
);
4418 f
->dump_stream("mtime") << mtime
;
4419 f
->dump_int("return_code", return_code
);
4420 if (snaps
.length() > 0) {
4422 bufferlist c
= snaps
;
4423 auto p
= c
.cbegin();
4430 f
->open_object_section("snaps");
4431 for (vector
<snapid_t
>::iterator p
= v
.begin(); p
!= v
.end(); ++p
)
4432 f
->dump_unsigned("snap", *p
);
4436 f
->open_object_section("mod_desc");
4442 void pg_log_entry_t::generate_test_instances(list
<pg_log_entry_t
*>& o
)
4444 o
.push_back(new pg_log_entry_t());
4445 hobject_t
oid(object_t("objname"), "key", 123, 456, 0, "");
4446 o
.push_back(new pg_log_entry_t(MODIFY
, oid
, eversion_t(1,2), eversion_t(3,4),
4447 1, osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
4449 o
.push_back(new pg_log_entry_t(ERROR
, oid
, eversion_t(1,2), eversion_t(3,4),
4450 1, osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
4451 utime_t(8,9), -ENOENT
));
4454 ostream
& operator<<(ostream
& out
, const pg_log_entry_t
& e
)
4456 out
<< e
.version
<< " (" << e
.prior_version
<< ") "
4457 << std::left
<< std::setw(8) << e
.get_op_name() << ' '
4458 << e
.soid
<< " by " << e
.reqid
<< " " << e
.mtime
4459 << " " << e
.return_code
;
4460 if (e
.snaps
.length()) {
4461 vector
<snapid_t
> snaps
;
4462 bufferlist c
= e
.snaps
;
4463 auto p
= c
.cbegin();
4469 out
<< " snaps " << snaps
;
4474 // -- pg_log_dup_t --
4476 std::string
pg_log_dup_t::get_key_name() const
4478 static const char prefix
[] = "dup_";
4479 std::string
key(36, ' ');
4480 memcpy(&key
[0], prefix
, 4);
4481 version
.get_key_name(&key
[4]);
4482 key
.resize(35); // remove the null terminator
4486 void pg_log_dup_t::encode(bufferlist
&bl
) const
4488 ENCODE_START(1, 1, bl
);
4490 encode(version
, bl
);
4491 encode(user_version
, bl
);
4492 encode(return_code
, bl
);
4496 void pg_log_dup_t::decode(bufferlist::const_iterator
&bl
)
4498 DECODE_START(1, bl
);
4500 decode(version
, bl
);
4501 decode(user_version
, bl
);
4502 decode(return_code
, bl
);
4506 void pg_log_dup_t::dump(Formatter
*f
) const
4508 f
->dump_stream("reqid") << reqid
;
4509 f
->dump_stream("version") << version
;
4510 f
->dump_stream("user_version") << user_version
;
4511 f
->dump_stream("return_code") << return_code
;
4514 void pg_log_dup_t::generate_test_instances(list
<pg_log_dup_t
*>& o
)
4516 o
.push_back(new pg_log_dup_t());
4517 o
.push_back(new pg_log_dup_t(eversion_t(1,2),
4519 osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
4521 o
.push_back(new pg_log_dup_t(eversion_t(1,2),
4523 osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
4528 std::ostream
& operator<<(std::ostream
& out
, const pg_log_dup_t
& e
) {
4529 return out
<< "log_dup(reqid=" << e
.reqid
<<
4530 " v=" << e
.version
<< " uv=" << e
.user_version
<<
4531 " rc=" << e
.return_code
<< ")";
4537 // out: pg_log_t that only has entries that apply to import_pgid using curmap
4538 // reject: Entries rejected from "in" are in the reject.log. Other fields not set.
4539 void pg_log_t::filter_log(spg_t import_pgid
, const OSDMap
&curmap
,
4540 const string
&hit_set_namespace
, const pg_log_t
&in
,
4541 pg_log_t
&out
, pg_log_t
&reject
)
4547 for (list
<pg_log_entry_t
>::const_iterator i
= in
.log
.begin();
4548 i
!= in
.log
.end(); ++i
) {
4550 // Reject pg log entries for temporary objects
4551 if (i
->soid
.is_temp()) {
4552 reject
.log
.push_back(*i
);
4556 if (i
->soid
.nspace
!= hit_set_namespace
) {
4557 object_t oid
= i
->soid
.oid
;
4558 object_locator_t
loc(i
->soid
);
4559 pg_t raw_pgid
= curmap
.object_locator_to_pg(oid
, loc
);
4560 pg_t pgid
= curmap
.raw_pg_to_pg(raw_pgid
);
4562 if (import_pgid
.pgid
== pgid
) {
4563 out
.log
.push_back(*i
);
4565 reject
.log
.push_back(*i
);
4568 out
.log
.push_back(*i
);
4573 void pg_log_t::encode(bufferlist
& bl
) const
4575 ENCODE_START(7, 3, bl
);
4579 encode(can_rollback_to
, bl
);
4580 encode(rollback_info_trimmed_to
, bl
);
4585 void pg_log_t::decode(bufferlist::const_iterator
&bl
, int64_t pool
)
4587 DECODE_START_LEGACY_COMPAT_LEN(7, 3, 3, bl
);
4592 decode(backlog
, bl
);
4596 decode(can_rollback_to
, bl
);
4599 decode(rollback_info_trimmed_to
, bl
);
4601 rollback_info_trimmed_to
= tail
;
4608 // handle hobject_t format change
4610 for (list
<pg_log_entry_t
>::iterator i
= log
.begin();
4613 if (!i
->soid
.is_max() && i
->soid
.pool
== -1)
4614 i
->soid
.pool
= pool
;
4619 void pg_log_t::dump(Formatter
*f
) const
4621 f
->dump_stream("head") << head
;
4622 f
->dump_stream("tail") << tail
;
4623 f
->open_array_section("log");
4624 for (list
<pg_log_entry_t
>::const_iterator p
= log
.begin(); p
!= log
.end(); ++p
) {
4625 f
->open_object_section("entry");
4630 f
->open_array_section("dups");
4631 for (const auto& entry
: dups
) {
4632 f
->open_object_section("entry");
4639 void pg_log_t::generate_test_instances(list
<pg_log_t
*>& o
)
4641 o
.push_back(new pg_log_t
);
4643 // this is nonsensical:
4644 o
.push_back(new pg_log_t
);
4645 o
.back()->head
= eversion_t(1,2);
4646 o
.back()->tail
= eversion_t(3,4);
4647 list
<pg_log_entry_t
*> e
;
4648 pg_log_entry_t::generate_test_instances(e
);
4649 for (list
<pg_log_entry_t
*>::iterator p
= e
.begin(); p
!= e
.end(); ++p
)
4650 o
.back()->log
.push_back(**p
);
4653 static void _handle_dups(CephContext
* cct
, pg_log_t
&target
, const pg_log_t
&other
, unsigned maxdups
)
4655 auto earliest_dup_version
=
4656 target
.head
.version
< maxdups
? 0u : target
.head
.version
- maxdups
+ 1;
4657 lgeneric_subdout(cct
, osd
, 20) << "copy_up_to/copy_after earliest_dup_version " << earliest_dup_version
<< dendl
;
4659 for (auto d
= other
.dups
.cbegin(); d
!= other
.dups
.cend(); ++d
) {
4660 if (d
->version
.version
>= earliest_dup_version
) {
4661 lgeneric_subdout(cct
, osd
, 20)
4662 << "copy_up_to/copy_after copy dup version "
4663 << d
->version
<< dendl
;
4664 target
.dups
.push_back(pg_log_dup_t(*d
));
4668 for (auto i
= other
.log
.cbegin(); i
!= other
.log
.cend(); ++i
) {
4669 ceph_assert(i
->version
> other
.tail
);
4670 if (i
->version
> target
.tail
)
4672 if (i
->version
.version
>= earliest_dup_version
) {
4673 lgeneric_subdout(cct
, osd
, 20)
4674 << "copy_up_to/copy_after copy dup from log version "
4675 << i
->version
<< dendl
;
4676 target
.dups
.push_back(pg_log_dup_t(*i
));
4682 void pg_log_t::copy_after(CephContext
* cct
, const pg_log_t
&other
, eversion_t v
)
4684 can_rollback_to
= other
.can_rollback_to
;
4687 lgeneric_subdout(cct
, osd
, 20) << __func__
<< " v " << v
<< dendl
;
4688 for (list
<pg_log_entry_t
>::const_reverse_iterator i
= other
.log
.rbegin();
4689 i
!= other
.log
.rend();
4691 ceph_assert(i
->version
> other
.tail
);
4692 if (i
->version
<= v
) {
4693 // make tail accurate.
4697 lgeneric_subdout(cct
, osd
, 20) << __func__
<< " copy log version " << i
->version
<< dendl
;
4700 _handle_dups(cct
, *this, other
, cct
->_conf
->osd_pg_log_dups_tracked
);
4703 void pg_log_t::copy_up_to(CephContext
* cct
, const pg_log_t
&other
, int max
)
4705 can_rollback_to
= other
.can_rollback_to
;
4709 lgeneric_subdout(cct
, osd
, 20) << __func__
<< " max " << max
<< dendl
;
4710 for (list
<pg_log_entry_t
>::const_reverse_iterator i
= other
.log
.rbegin();
4711 i
!= other
.log
.rend();
4713 ceph_assert(i
->version
> other
.tail
);
4718 lgeneric_subdout(cct
, osd
, 20) << __func__
<< " copy log version " << i
->version
<< dendl
;
4721 _handle_dups(cct
, *this, other
, cct
->_conf
->osd_pg_log_dups_tracked
);
4724 ostream
& pg_log_t::print(ostream
& out
) const
4726 out
<< *this << std::endl
;
4727 for (list
<pg_log_entry_t
>::const_iterator p
= log
.begin();
4730 out
<< *p
<< std::endl
;
4731 for (const auto& entry
: dups
) {
4732 out
<< " dup entry: " << entry
<< std::endl
;
4737 // -- pg_missing_t --
4739 ostream
& operator<<(ostream
& out
, const pg_missing_item
& i
)
4742 if (i
.have
!= eversion_t())
4743 out
<< "(" << i
.have
<< ")";
4744 out
<< " flags = " << i
.flag_str();
4748 // -- object_copy_cursor_t --
4750 void object_copy_cursor_t::encode(bufferlist
& bl
) const
4752 ENCODE_START(1, 1, bl
);
4753 encode(attr_complete
, bl
);
4754 encode(data_offset
, bl
);
4755 encode(data_complete
, bl
);
4756 encode(omap_offset
, bl
);
4757 encode(omap_complete
, bl
);
4761 void object_copy_cursor_t::decode(bufferlist::const_iterator
&bl
)
4763 DECODE_START(1, bl
);
4764 decode(attr_complete
, bl
);
4765 decode(data_offset
, bl
);
4766 decode(data_complete
, bl
);
4767 decode(omap_offset
, bl
);
4768 decode(omap_complete
, bl
);
4772 void object_copy_cursor_t::dump(Formatter
*f
) const
4774 f
->dump_unsigned("attr_complete", (int)attr_complete
);
4775 f
->dump_unsigned("data_offset", data_offset
);
4776 f
->dump_unsigned("data_complete", (int)data_complete
);
4777 f
->dump_string("omap_offset", omap_offset
);
4778 f
->dump_unsigned("omap_complete", (int)omap_complete
);
4781 void object_copy_cursor_t::generate_test_instances(list
<object_copy_cursor_t
*>& o
)
4783 o
.push_back(new object_copy_cursor_t
);
4784 o
.push_back(new object_copy_cursor_t
);
4785 o
.back()->attr_complete
= true;
4786 o
.back()->data_offset
= 123;
4787 o
.push_back(new object_copy_cursor_t
);
4788 o
.back()->attr_complete
= true;
4789 o
.back()->data_complete
= true;
4790 o
.back()->omap_offset
= "foo";
4791 o
.push_back(new object_copy_cursor_t
);
4792 o
.back()->attr_complete
= true;
4793 o
.back()->data_complete
= true;
4794 o
.back()->omap_complete
= true;
4797 // -- object_copy_data_t --
4799 void object_copy_data_t::encode(bufferlist
& bl
, uint64_t features
) const
4801 ENCODE_START(8, 5, bl
);
4806 encode(omap_data
, bl
);
4808 encode(omap_header
, bl
);
4810 encode(snap_seq
, bl
);
4812 encode(data_digest
, bl
);
4813 encode(omap_digest
, bl
);
4815 encode(truncate_seq
, bl
);
4816 encode(truncate_size
, bl
);
4817 encode(reqid_return_codes
, bl
);
4821 void object_copy_data_t::decode(bufferlist::const_iterator
& bl
)
4823 DECODE_START(7, bl
);
4830 decode(category
, bl
); // no longer used
4835 map
<string
,bufferlist
> omap
;
4838 if (!omap
.empty()) {
4840 encode(omap
, omap_data
);
4845 decode(omap_header
, bl
);
4846 if (struct_v
>= 3) {
4848 decode(snap_seq
, bl
);
4853 if (struct_v
>= 4) {
4855 decode(data_digest
, bl
);
4856 decode(omap_digest
, bl
);
4864 decode(omap_data
, bl
);
4866 decode(omap_header
, bl
);
4868 decode(snap_seq
, bl
);
4869 if (struct_v
>= 4) {
4871 decode(data_digest
, bl
);
4872 decode(omap_digest
, bl
);
4874 if (struct_v
>= 6) {
4877 if (struct_v
>= 7) {
4878 decode(truncate_seq
, bl
);
4879 decode(truncate_size
, bl
);
4881 if (struct_v
>= 8) {
4882 decode(reqid_return_codes
, bl
);
4888 void object_copy_data_t::generate_test_instances(list
<object_copy_data_t
*>& o
)
4890 o
.push_back(new object_copy_data_t());
4892 list
<object_copy_cursor_t
*> cursors
;
4893 object_copy_cursor_t::generate_test_instances(cursors
);
4894 list
<object_copy_cursor_t
*>::iterator ci
= cursors
.begin();
4895 o
.back()->cursor
= **(ci
++);
4897 o
.push_back(new object_copy_data_t());
4898 o
.back()->cursor
= **(ci
++);
4900 o
.push_back(new object_copy_data_t());
4901 o
.back()->size
= 1234;
4902 o
.back()->mtime
.set_from_double(1234);
4903 bufferptr
bp("there", 5);
4906 o
.back()->attrs
["hello"] = bl
;
4907 bufferptr
bp2("not", 3);
4910 map
<string
,bufferlist
> omap
;
4913 encode(omap
, o
.back()->omap_data
);
4914 bufferptr
databp("iamsomedatatocontain", 20);
4915 o
.back()->data
.push_back(databp
);
4916 o
.back()->omap_header
.append("this is an omap header");
4917 o
.back()->snaps
.push_back(123);
4918 o
.back()->reqids
.push_back(make_pair(osd_reqid_t(), version_t()));
4921 void object_copy_data_t::dump(Formatter
*f
) const
4923 f
->open_object_section("cursor");
4925 f
->close_section(); // cursor
4926 f
->dump_int("size", size
);
4927 f
->dump_stream("mtime") << mtime
;
4928 /* we should really print out the attrs here, but bufferlist
4929 const-correctness prevents that */
4930 f
->dump_int("attrs_size", attrs
.size());
4931 f
->dump_int("flags", flags
);
4932 f
->dump_unsigned("data_digest", data_digest
);
4933 f
->dump_unsigned("omap_digest", omap_digest
);
4934 f
->dump_int("omap_data_length", omap_data
.length());
4935 f
->dump_int("omap_header_length", omap_header
.length());
4936 f
->dump_int("data_length", data
.length());
4937 f
->open_array_section("snaps");
4938 for (vector
<snapid_t
>::const_iterator p
= snaps
.begin();
4939 p
!= snaps
.end(); ++p
)
4940 f
->dump_unsigned("snap", *p
);
4942 f
->open_array_section("reqids");
4944 for (auto p
= reqids
.begin();
4947 f
->open_object_section("extra_reqid");
4948 f
->dump_stream("reqid") << p
->first
;
4949 f
->dump_stream("user_version") << p
->second
;
4950 auto it
= reqid_return_codes
.find(idx
);
4951 if (it
!= reqid_return_codes
.end()) {
4952 f
->dump_int("return_code", it
->second
);
4959 // -- pg_create_t --
4961 void pg_create_t::encode(bufferlist
&bl
) const
4963 ENCODE_START(1, 1, bl
);
4964 encode(created
, bl
);
4966 encode(split_bits
, bl
);
4970 void pg_create_t::decode(bufferlist::const_iterator
&bl
)
4972 DECODE_START(1, bl
);
4973 decode(created
, bl
);
4975 decode(split_bits
, bl
);
4979 void pg_create_t::dump(Formatter
*f
) const
4981 f
->dump_unsigned("created", created
);
4982 f
->dump_stream("parent") << parent
;
4983 f
->dump_int("split_bits", split_bits
);
4986 void pg_create_t::generate_test_instances(list
<pg_create_t
*>& o
)
4988 o
.push_back(new pg_create_t
);
4989 o
.push_back(new pg_create_t(1, pg_t(3, 4), 2));
4993 // -- pg_hit_set_info_t --
4995 void pg_hit_set_info_t::encode(bufferlist
& bl
) const
4997 ENCODE_START(2, 1, bl
);
5000 encode(version
, bl
);
5001 encode(using_gmt
, bl
);
5005 void pg_hit_set_info_t::decode(bufferlist::const_iterator
& p
)
5011 if (struct_v
>= 2) {
5012 decode(using_gmt
, p
);
5019 void pg_hit_set_info_t::dump(Formatter
*f
) const
5021 f
->dump_stream("begin") << begin
;
5022 f
->dump_stream("end") << end
;
5023 f
->dump_stream("version") << version
;
5024 f
->dump_stream("using_gmt") << using_gmt
;
5027 void pg_hit_set_info_t::generate_test_instances(list
<pg_hit_set_info_t
*>& ls
)
5029 ls
.push_back(new pg_hit_set_info_t
);
5030 ls
.push_back(new pg_hit_set_info_t
);
5031 ls
.back()->begin
= utime_t(1, 2);
5032 ls
.back()->end
= utime_t(3, 4);
5036 // -- pg_hit_set_history_t --
5038 void pg_hit_set_history_t::encode(bufferlist
& bl
) const
5040 ENCODE_START(1, 1, bl
);
5041 encode(current_last_update
, bl
);
5043 utime_t dummy_stamp
;
5044 encode(dummy_stamp
, bl
);
5047 pg_hit_set_info_t dummy_info
;
5048 encode(dummy_info
, bl
);
5050 encode(history
, bl
);
5054 void pg_hit_set_history_t::decode(bufferlist::const_iterator
& p
)
5057 decode(current_last_update
, p
);
5059 utime_t dummy_stamp
;
5060 decode(dummy_stamp
, p
);
5063 pg_hit_set_info_t dummy_info
;
5064 decode(dummy_info
, p
);
5070 void pg_hit_set_history_t::dump(Formatter
*f
) const
5072 f
->dump_stream("current_last_update") << current_last_update
;
5073 f
->open_array_section("history");
5074 for (list
<pg_hit_set_info_t
>::const_iterator p
= history
.begin();
5075 p
!= history
.end(); ++p
) {
5076 f
->open_object_section("info");
5083 void pg_hit_set_history_t::generate_test_instances(list
<pg_hit_set_history_t
*>& ls
)
5085 ls
.push_back(new pg_hit_set_history_t
);
5086 ls
.push_back(new pg_hit_set_history_t
);
5087 ls
.back()->current_last_update
= eversion_t(1, 2);
5088 ls
.back()->history
.push_back(pg_hit_set_info_t());
5091 // -- OSDSuperblock --
5093 void OSDSuperblock::encode(bufferlist
&bl
) const
5095 ENCODE_START(8, 5, bl
);
5096 encode(cluster_fsid
, bl
);
5098 encode(current_epoch
, bl
);
5099 encode(oldest_map
, bl
);
5100 encode(newest_map
, bl
);
5102 compat_features
.encode(bl
);
5103 encode(clean_thru
, bl
);
5104 encode(mounted
, bl
);
5105 encode(osd_fsid
, bl
);
5106 encode((epoch_t
)0, bl
); // epoch_t last_epoch_marked_full
5107 encode((uint32_t)0, bl
); // map<int64_t,epoch_t> pool_last_epoch_marked_full
5111 void OSDSuperblock::decode(bufferlist::const_iterator
&bl
)
5113 DECODE_START_LEGACY_COMPAT_LEN(8, 5, 5, bl
);
5118 decode(cluster_fsid
, bl
);
5120 decode(current_epoch
, bl
);
5121 decode(oldest_map
, bl
);
5122 decode(newest_map
, bl
);
5124 if (struct_v
>= 2) {
5125 compat_features
.decode(bl
);
5126 } else { //upgrade it!
5127 compat_features
.incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE
);
5129 decode(clean_thru
, bl
);
5130 decode(mounted
, bl
);
5132 decode(osd_fsid
, bl
);
5133 if (struct_v
>= 6) {
5134 epoch_t last_map_marked_full
;
5135 decode(last_map_marked_full
, bl
);
5137 if (struct_v
>= 7) {
5138 map
<int64_t,epoch_t
> pool_last_map_marked_full
;
5139 decode(pool_last_map_marked_full
, bl
);
5144 void OSDSuperblock::dump(Formatter
*f
) const
5146 f
->dump_stream("cluster_fsid") << cluster_fsid
;
5147 f
->dump_stream("osd_fsid") << osd_fsid
;
5148 f
->dump_int("whoami", whoami
);
5149 f
->dump_int("current_epoch", current_epoch
);
5150 f
->dump_int("oldest_map", oldest_map
);
5151 f
->dump_int("newest_map", newest_map
);
5152 f
->dump_float("weight", weight
);
5153 f
->open_object_section("compat");
5154 compat_features
.dump(f
);
5156 f
->dump_int("clean_thru", clean_thru
);
5157 f
->dump_int("last_epoch_mounted", mounted
);
5160 void OSDSuperblock::generate_test_instances(list
<OSDSuperblock
*>& o
)
5163 o
.push_back(new OSDSuperblock(z
));
5164 z
.cluster_fsid
.parse("01010101-0101-0101-0101-010101010101");
5165 z
.osd_fsid
.parse("02020202-0202-0202-0202-020202020202");
5167 z
.current_epoch
= 4;
5172 o
.push_back(new OSDSuperblock(z
));
5173 o
.push_back(new OSDSuperblock(z
));
5178 void SnapSet::encode(bufferlist
& bl
) const
5180 ENCODE_START(3, 2, bl
);
5182 encode(true, bl
); // head_exists
5185 encode(clone_overlap
, bl
);
5186 encode(clone_size
, bl
);
5187 encode(clone_snaps
, bl
);
5191 void SnapSet::decode(bufferlist::const_iterator
& bl
)
5193 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl
);
5195 bl
.advance(1u); // skip legacy head_exists (always true)
5198 decode(clone_overlap
, bl
);
5199 decode(clone_size
, bl
);
5200 if (struct_v
>= 3) {
5201 decode(clone_snaps
, bl
);
5203 clone_snaps
.clear();
5208 void SnapSet::dump(Formatter
*f
) const
5210 SnapContext
sc(seq
, snaps
);
5211 f
->open_object_section("snap_context");
5214 f
->open_array_section("clones");
5215 for (vector
<snapid_t
>::const_iterator p
= clones
.begin(); p
!= clones
.end(); ++p
) {
5216 f
->open_object_section("clone");
5217 f
->dump_unsigned("snap", *p
);
5218 auto cs
= clone_size
.find(*p
);
5219 if (cs
!= clone_size
.end())
5220 f
->dump_unsigned("size", cs
->second
);
5222 f
->dump_string("size", "????");
5223 auto co
= clone_overlap
.find(*p
);
5224 if (co
!= clone_overlap
.end())
5225 f
->dump_stream("overlap") << co
->second
;
5227 f
->dump_stream("overlap") << "????";
5228 auto q
= clone_snaps
.find(*p
);
5229 if (q
!= clone_snaps
.end()) {
5230 f
->open_array_section("snaps");
5231 for (auto s
: q
->second
) {
5232 f
->dump_unsigned("snap", s
);
5241 void SnapSet::generate_test_instances(list
<SnapSet
*>& o
)
5243 o
.push_back(new SnapSet
);
5244 o
.push_back(new SnapSet
);
5245 o
.back()->seq
= 123;
5246 o
.back()->snaps
.push_back(123);
5247 o
.back()->snaps
.push_back(12);
5248 o
.push_back(new SnapSet
);
5249 o
.back()->seq
= 123;
5250 o
.back()->snaps
.push_back(123);
5251 o
.back()->snaps
.push_back(12);
5252 o
.back()->clones
.push_back(12);
5253 o
.back()->clone_size
[12] = 12345;
5254 o
.back()->clone_overlap
[12];
5255 o
.back()->clone_snaps
[12] = {12, 10, 8};
5258 ostream
& operator<<(ostream
& out
, const SnapSet
& cs
)
5260 return out
<< cs
.seq
<< "=" << cs
.snaps
<< ":"
5264 void SnapSet::from_snap_set(const librados::snap_set_t
& ss
, bool legacy
)
5266 // NOTE: our reconstruction of snaps (and the snapc) is not strictly
5267 // correct: it will not include snaps that still logically exist
5268 // but for which there was no clone that is defined. For all
5269 // practical purposes this doesn't matter, since we only use that
5270 // information to clone on the OSD, and we have already moved
5271 // forward past that part of the object history.
5274 set
<snapid_t
> _snaps
;
5275 set
<snapid_t
> _clones
;
5276 for (vector
<librados::clone_info_t
>::const_iterator p
= ss
.clones
.begin();
5277 p
!= ss
.clones
.end();
5279 if (p
->cloneid
!= librados::SNAP_HEAD
) {
5280 _clones
.insert(p
->cloneid
);
5281 _snaps
.insert(p
->snaps
.begin(), p
->snaps
.end());
5282 clone_size
[p
->cloneid
] = p
->size
;
5283 clone_overlap
[p
->cloneid
]; // the entry must exist, even if it's empty.
5284 for (vector
<pair
<uint64_t, uint64_t> >::const_iterator q
=
5285 p
->overlap
.begin(); q
!= p
->overlap
.end(); ++q
)
5286 clone_overlap
[p
->cloneid
].insert(q
->first
, q
->second
);
5288 // p->snaps is ascending; clone_snaps is descending
5289 vector
<snapid_t
>& v
= clone_snaps
[p
->cloneid
];
5290 for (auto q
= p
->snaps
.rbegin(); q
!= p
->snaps
.rend(); ++q
) {
5299 clones
.reserve(_clones
.size());
5300 for (set
<snapid_t
>::iterator p
= _clones
.begin(); p
!= _clones
.end(); ++p
)
5301 clones
.push_back(*p
);
5305 snaps
.reserve(_snaps
.size());
5306 for (set
<snapid_t
>::reverse_iterator p
= _snaps
.rbegin();
5307 p
!= _snaps
.rend(); ++p
)
5308 snaps
.push_back(*p
);
5311 uint64_t SnapSet::get_clone_bytes(snapid_t clone
) const
5313 ceph_assert(clone_size
.count(clone
));
5314 uint64_t size
= clone_size
.find(clone
)->second
;
5315 ceph_assert(clone_overlap
.count(clone
));
5316 const interval_set
<uint64_t> &overlap
= clone_overlap
.find(clone
)->second
;
5317 ceph_assert(size
>= (uint64_t)overlap
.size());
5318 return size
- overlap
.size();
5321 void SnapSet::filter(const pg_pool_t
&pinfo
)
5323 vector
<snapid_t
> oldsnaps
;
5324 oldsnaps
.swap(snaps
);
5325 for (vector
<snapid_t
>::const_iterator i
= oldsnaps
.begin();
5326 i
!= oldsnaps
.end();
5328 if (!pinfo
.is_removed_snap(*i
))
5329 snaps
.push_back(*i
);
5333 SnapSet
SnapSet::get_filtered(const pg_pool_t
&pinfo
) const
5340 // -- watch_info_t --
5342 void watch_info_t::encode(bufferlist
& bl
, uint64_t features
) const
5344 ENCODE_START(4, 3, bl
);
5346 encode(timeout_seconds
, bl
);
5347 encode(addr
, bl
, features
);
5351 void watch_info_t::decode(bufferlist::const_iterator
& bl
)
5353 DECODE_START_LEGACY_COMPAT_LEN(4, 3, 3, bl
);
5359 decode(timeout_seconds
, bl
);
5360 if (struct_v
>= 4) {
5366 void watch_info_t::dump(Formatter
*f
) const
5368 f
->dump_unsigned("cookie", cookie
);
5369 f
->dump_unsigned("timeout_seconds", timeout_seconds
);
5370 f
->open_object_section("addr");
5375 void watch_info_t::generate_test_instances(list
<watch_info_t
*>& o
)
5377 o
.push_back(new watch_info_t
);
5378 o
.push_back(new watch_info_t
);
5379 o
.back()->cookie
= 123;
5380 o
.back()->timeout_seconds
= 99;
5382 ea
.set_type(entity_addr_t::TYPE_LEGACY
);
5384 ea
.set_family(AF_INET
);
5385 ea
.set_in4_quad(0, 127);
5386 ea
.set_in4_quad(1, 0);
5387 ea
.set_in4_quad(2, 1);
5388 ea
.set_in4_quad(3, 2);
5390 o
.back()->addr
= ea
;
5393 // -- chunk_info_t --
5395 void chunk_info_t::encode(bufferlist
& bl
) const
5397 ENCODE_START(1, 1, bl
);
5401 __u32 _flags
= flags
;
5406 void chunk_info_t::decode(bufferlist::const_iterator
& bl
)
5408 DECODE_START(1, bl
);
5414 flags
= (cflag_t
)_flags
;
5418 void chunk_info_t::dump(Formatter
*f
) const
5420 f
->dump_unsigned("length", length
);
5421 f
->open_object_section("oid");
5424 f
->dump_unsigned("flags", flags
);
5427 ostream
& operator<<(ostream
& out
, const chunk_info_t
& ci
)
5429 return out
<< "(len: " << ci
.length
<< " oid: " << ci
.oid
5430 << " offset: " << ci
.offset
5431 << " flags: " << ci
.get_flag_string(ci
.flags
) << ")";
5434 // -- object_manifest_t --
5436 void object_manifest_t::encode(bufferlist
& bl
) const
5438 ENCODE_START(1, 1, bl
);
5441 case TYPE_NONE
: break;
5443 encode(redirect_target
, bl
);
5446 encode(chunk_map
, bl
);
5454 void object_manifest_t::decode(bufferlist::const_iterator
& bl
)
5456 DECODE_START(1, bl
);
5459 case TYPE_NONE
: break;
5461 decode(redirect_target
, bl
);
5464 decode(chunk_map
, bl
);
5472 void object_manifest_t::dump(Formatter
*f
) const
5474 f
->dump_unsigned("type", type
);
5475 if (type
== TYPE_REDIRECT
) {
5476 f
->open_object_section("redirect_target");
5477 redirect_target
.dump(f
);
5479 } else if (type
== TYPE_CHUNKED
) {
5480 f
->open_array_section("chunk_map");
5481 for (auto& p
: chunk_map
) {
5482 f
->open_object_section("chunk");
5483 f
->dump_unsigned("offset", p
.first
);
5491 void object_manifest_t::generate_test_instances(list
<object_manifest_t
*>& o
)
5493 o
.push_back(new object_manifest_t());
5494 o
.back()->type
= TYPE_REDIRECT
;
5497 ostream
& operator<<(ostream
& out
, const object_manifest_t
& om
)
5499 out
<< "manifest(" << om
.get_type_name();
5500 if (om
.is_redirect()) {
5501 out
<< " " << om
.redirect_target
;
5502 } else if (om
.is_chunked()) {
5503 out
<< " " << om
.chunk_map
;
5509 // -- object_info_t --
5511 void object_info_t::copy_user_bits(const object_info_t
& other
)
5513 // these bits are copied from head->clone.
5515 mtime
= other
.mtime
;
5516 local_mtime
= other
.local_mtime
;
5517 last_reqid
= other
.last_reqid
;
5518 truncate_seq
= other
.truncate_seq
;
5519 truncate_size
= other
.truncate_size
;
5520 flags
= other
.flags
;
5521 user_version
= other
.user_version
;
5522 data_digest
= other
.data_digest
;
5523 omap_digest
= other
.omap_digest
;
5526 void object_info_t::encode(bufferlist
& bl
, uint64_t features
) const
5528 object_locator_t
myoloc(soid
);
5529 map
<entity_name_t
, watch_info_t
> old_watchers
;
5530 for (map
<pair
<uint64_t, entity_name_t
>, watch_info_t
>::const_iterator i
=
5532 i
!= watchers
.end();
5534 old_watchers
.insert(make_pair(i
->first
.second
, i
->second
));
5536 ENCODE_START(17, 8, bl
);
5538 encode(myoloc
, bl
); //Retained for compatibility
5539 encode((__u32
)0, bl
); // was category, no longer used
5540 encode(version
, bl
);
5541 encode(prior_version
, bl
);
5542 encode(last_reqid
, bl
);
5545 if (soid
.snap
== CEPH_NOSNAP
)
5546 encode(osd_reqid_t(), bl
); // used to be wrlock_by
5548 encode((uint32_t)0, bl
); // was legacy_snaps
5549 encode(truncate_seq
, bl
);
5550 encode(truncate_size
, bl
);
5551 encode(is_lost(), bl
);
5552 encode(old_watchers
, bl
, features
);
5553 /* shenanigans to avoid breaking backwards compatibility in the disk format.
5554 * When we can, switch this out for simply putting the version_t on disk. */
5555 eversion_t
user_eversion(0, user_version
);
5556 encode(user_eversion
, bl
);
5557 encode(test_flag(FLAG_USES_TMAP
), bl
);
5558 encode(watchers
, bl
, features
);
5559 __u32 _flags
= flags
;
5561 encode(local_mtime
, bl
);
5562 encode(data_digest
, bl
);
5563 encode(omap_digest
, bl
);
5564 encode(expected_object_size
, bl
);
5565 encode(expected_write_size
, bl
);
5566 encode(alloc_hint_flags
, bl
);
5567 if (has_manifest()) {
5568 encode(manifest
, bl
);
5573 void object_info_t::decode(bufferlist::const_iterator
& bl
)
5575 object_locator_t myoloc
;
5576 DECODE_START_LEGACY_COMPAT_LEN(17, 8, 8, bl
);
5577 map
<entity_name_t
, watch_info_t
> old_watchers
;
5582 decode(category
, bl
); // no longer used
5584 decode(version
, bl
);
5585 decode(prior_version
, bl
);
5586 decode(last_reqid
, bl
);
5589 if (soid
.snap
== CEPH_NOSNAP
) {
5590 osd_reqid_t wrlock_by
;
5591 decode(wrlock_by
, bl
);
5593 vector
<snapid_t
> legacy_snaps
;
5594 decode(legacy_snaps
, bl
);
5596 decode(truncate_seq
, bl
);
5597 decode(truncate_size
, bl
);
5599 // if this is struct_v >= 13, we will overwrite this
5600 // below since this field is just here for backwards
5606 decode(old_watchers
, bl
);
5607 eversion_t user_eversion
;
5608 decode(user_eversion
, bl
);
5609 user_version
= user_eversion
.version
;
5611 if (struct_v
>= 9) {
5612 bool uses_tmap
= false;
5613 decode(uses_tmap
, bl
);
5615 set_flag(FLAG_USES_TMAP
);
5617 set_flag(FLAG_USES_TMAP
);
5620 soid
.pool
= myoloc
.pool
;
5621 if (struct_v
>= 11) {
5622 decode(watchers
, bl
);
5624 for (map
<entity_name_t
, watch_info_t
>::iterator i
= old_watchers
.begin();
5625 i
!= old_watchers
.end();
5629 make_pair(i
->second
.cookie
, i
->first
), i
->second
));
5632 if (struct_v
>= 13) {
5635 flags
= (flag_t
)_flags
;
5637 if (struct_v
>= 14) {
5638 decode(local_mtime
, bl
);
5640 local_mtime
= utime_t();
5642 if (struct_v
>= 15) {
5643 decode(data_digest
, bl
);
5644 decode(omap_digest
, bl
);
5646 data_digest
= omap_digest
= -1;
5647 clear_flag(FLAG_DATA_DIGEST
);
5648 clear_flag(FLAG_OMAP_DIGEST
);
5650 if (struct_v
>= 16) {
5651 decode(expected_object_size
, bl
);
5652 decode(expected_write_size
, bl
);
5653 decode(alloc_hint_flags
, bl
);
5655 expected_object_size
= 0;
5656 expected_write_size
= 0;
5657 alloc_hint_flags
= 0;
5659 if (struct_v
>= 17) {
5660 if (has_manifest()) {
5661 decode(manifest
, bl
);
5667 void object_info_t::dump(Formatter
*f
) const
5669 f
->open_object_section("oid");
5672 f
->dump_stream("version") << version
;
5673 f
->dump_stream("prior_version") << prior_version
;
5674 f
->dump_stream("last_reqid") << last_reqid
;
5675 f
->dump_unsigned("user_version", user_version
);
5676 f
->dump_unsigned("size", size
);
5677 f
->dump_stream("mtime") << mtime
;
5678 f
->dump_stream("local_mtime") << local_mtime
;
5679 f
->dump_unsigned("lost", (int)is_lost());
5680 vector
<string
> sv
= get_flag_vector(flags
);
5681 f
->open_array_section("flags");
5683 f
->dump_string("flags", str
);
5685 f
->dump_unsigned("truncate_seq", truncate_seq
);
5686 f
->dump_unsigned("truncate_size", truncate_size
);
5687 f
->dump_format("data_digest", "0x%08x", data_digest
);
5688 f
->dump_format("omap_digest", "0x%08x", omap_digest
);
5689 f
->dump_unsigned("expected_object_size", expected_object_size
);
5690 f
->dump_unsigned("expected_write_size", expected_write_size
);
5691 f
->dump_unsigned("alloc_hint_flags", alloc_hint_flags
);
5692 f
->dump_object("manifest", manifest
);
5693 f
->open_object_section("watchers");
5694 for (map
<pair
<uint64_t, entity_name_t
>,watch_info_t
>::const_iterator p
=
5695 watchers
.begin(); p
!= watchers
.end(); ++p
) {
5697 ss
<< p
->first
.second
;
5698 f
->open_object_section(ss
.str().c_str());
5705 void object_info_t::generate_test_instances(list
<object_info_t
*>& o
)
5707 o
.push_back(new object_info_t());
5713 ostream
& operator<<(ostream
& out
, const object_info_t
& oi
)
5715 out
<< oi
.soid
<< "(" << oi
.version
5716 << " " << oi
.last_reqid
;
5718 out
<< " " << oi
.get_flag_string();
5719 out
<< " s " << oi
.size
;
5720 out
<< " uv " << oi
.user_version
;
5721 if (oi
.is_data_digest())
5722 out
<< " dd " << std::hex
<< oi
.data_digest
<< std::dec
;
5723 if (oi
.is_omap_digest())
5724 out
<< " od " << std::hex
<< oi
.omap_digest
<< std::dec
;
5725 out
<< " alloc_hint [" << oi
.expected_object_size
5726 << " " << oi
.expected_write_size
5727 << " " << oi
.alloc_hint_flags
<< "]";
5728 if (oi
.has_manifest())
5729 out
<< " " << oi
.manifest
;
5734 // -- ObjectRecovery --
5735 void ObjectRecoveryProgress::encode(bufferlist
&bl
) const
5737 ENCODE_START(1, 1, bl
);
5739 encode(data_complete
, bl
);
5740 encode(data_recovered_to
, bl
);
5741 encode(omap_recovered_to
, bl
);
5742 encode(omap_complete
, bl
);
5746 void ObjectRecoveryProgress::decode(bufferlist::const_iterator
&bl
)
5748 DECODE_START(1, bl
);
5750 decode(data_complete
, bl
);
5751 decode(data_recovered_to
, bl
);
5752 decode(omap_recovered_to
, bl
);
5753 decode(omap_complete
, bl
);
5757 ostream
&operator<<(ostream
&out
, const ObjectRecoveryProgress
&prog
)
5759 return prog
.print(out
);
5762 void ObjectRecoveryProgress::generate_test_instances(
5763 list
<ObjectRecoveryProgress
*>& o
)
5765 o
.push_back(new ObjectRecoveryProgress
);
5766 o
.back()->first
= false;
5767 o
.back()->data_complete
= true;
5768 o
.back()->omap_complete
= true;
5769 o
.back()->data_recovered_to
= 100;
5771 o
.push_back(new ObjectRecoveryProgress
);
5772 o
.back()->first
= true;
5773 o
.back()->data_complete
= false;
5774 o
.back()->omap_complete
= false;
5775 o
.back()->data_recovered_to
= 0;
5778 ostream
&ObjectRecoveryProgress::print(ostream
&out
) const
5780 return out
<< "ObjectRecoveryProgress("
5781 << ( first
? "" : "!" ) << "first, "
5782 << "data_recovered_to:" << data_recovered_to
5783 << ", data_complete:" << ( data_complete
? "true" : "false" )
5784 << ", omap_recovered_to:" << omap_recovered_to
5785 << ", omap_complete:" << ( omap_complete
? "true" : "false" )
5786 << ", error:" << ( error
? "true" : "false" )
5790 void ObjectRecoveryProgress::dump(Formatter
*f
) const
5792 f
->dump_int("first?", first
);
5793 f
->dump_int("data_complete?", data_complete
);
5794 f
->dump_unsigned("data_recovered_to", data_recovered_to
);
5795 f
->dump_int("omap_complete?", omap_complete
);
5796 f
->dump_string("omap_recovered_to", omap_recovered_to
);
5799 void ObjectRecoveryInfo::encode(bufferlist
&bl
, uint64_t features
) const
5801 ENCODE_START(2, 1, bl
);
5803 encode(version
, bl
);
5805 encode(oi
, bl
, features
);
5807 encode(copy_subset
, bl
);
5808 encode(clone_subset
, bl
);
5812 void ObjectRecoveryInfo::decode(bufferlist::const_iterator
&bl
,
5815 DECODE_START(2, bl
);
5817 decode(version
, bl
);
5821 decode(copy_subset
, bl
);
5822 decode(clone_subset
, bl
);
5826 if (!soid
.is_max() && soid
.pool
== -1)
5828 map
<hobject_t
, interval_set
<uint64_t>> tmp
;
5829 tmp
.swap(clone_subset
);
5830 for (map
<hobject_t
, interval_set
<uint64_t>>::iterator i
= tmp
.begin();
5833 hobject_t
first(i
->first
);
5834 if (!first
.is_max() && first
.pool
== -1)
5836 clone_subset
[first
].swap(i
->second
);
5841 void ObjectRecoveryInfo::generate_test_instances(
5842 list
<ObjectRecoveryInfo
*>& o
)
5844 o
.push_back(new ObjectRecoveryInfo
);
5845 o
.back()->soid
= hobject_t(sobject_t("key", CEPH_NOSNAP
));
5846 o
.back()->version
= eversion_t(0,0);
5847 o
.back()->size
= 100;
5851 void ObjectRecoveryInfo::dump(Formatter
*f
) const
5853 f
->dump_stream("object") << soid
;
5854 f
->dump_stream("at_version") << version
;
5855 f
->dump_stream("size") << size
;
5857 f
->open_object_section("object_info");
5862 f
->open_object_section("snapset");
5866 f
->dump_stream("copy_subset") << copy_subset
;
5867 f
->dump_stream("clone_subset") << clone_subset
;
5870 ostream
& operator<<(ostream
& out
, const ObjectRecoveryInfo
&inf
)
5872 return inf
.print(out
);
5875 ostream
&ObjectRecoveryInfo::print(ostream
&out
) const
5877 return out
<< "ObjectRecoveryInfo("
5878 << soid
<< "@" << version
5879 << ", size: " << size
5880 << ", copy_subset: " << copy_subset
5881 << ", clone_subset: " << clone_subset
5882 << ", snapset: " << ss
5886 // -- PushReplyOp --
5887 void PushReplyOp::generate_test_instances(list
<PushReplyOp
*> &o
)
5889 o
.push_back(new PushReplyOp
);
5890 o
.push_back(new PushReplyOp
);
5891 o
.back()->soid
= hobject_t(sobject_t("asdf", 2));
5892 o
.push_back(new PushReplyOp
);
5893 o
.back()->soid
= hobject_t(sobject_t("asdf", CEPH_NOSNAP
));
5896 void PushReplyOp::encode(bufferlist
&bl
) const
5898 ENCODE_START(1, 1, bl
);
5903 void PushReplyOp::decode(bufferlist::const_iterator
&bl
)
5905 DECODE_START(1, bl
);
5910 void PushReplyOp::dump(Formatter
*f
) const
5912 f
->dump_stream("soid") << soid
;
5915 ostream
&PushReplyOp::print(ostream
&out
) const
5918 << "PushReplyOp(" << soid
5922 ostream
& operator<<(ostream
& out
, const PushReplyOp
&op
)
5924 return op
.print(out
);
5927 uint64_t PushReplyOp::cost(CephContext
*cct
) const
5930 return cct
->_conf
->osd_push_per_object_cost
+
5931 cct
->_conf
->osd_recovery_max_chunk
;
5935 void PullOp::generate_test_instances(list
<PullOp
*> &o
)
5937 o
.push_back(new PullOp
);
5938 o
.push_back(new PullOp
);
5939 o
.back()->soid
= hobject_t(sobject_t("asdf", 2));
5940 o
.back()->recovery_info
.version
= eversion_t(3, 10);
5941 o
.push_back(new PullOp
);
5942 o
.back()->soid
= hobject_t(sobject_t("asdf", CEPH_NOSNAP
));
5943 o
.back()->recovery_info
.version
= eversion_t(0, 0);
5946 void PullOp::encode(bufferlist
&bl
, uint64_t features
) const
5948 ENCODE_START(1, 1, bl
);
5950 encode(recovery_info
, bl
, features
);
5951 encode(recovery_progress
, bl
);
5955 void PullOp::decode(bufferlist::const_iterator
&bl
)
5957 DECODE_START(1, bl
);
5959 decode(recovery_info
, bl
);
5960 decode(recovery_progress
, bl
);
5964 void PullOp::dump(Formatter
*f
) const
5966 f
->dump_stream("soid") << soid
;
5968 f
->open_object_section("recovery_info");
5969 recovery_info
.dump(f
);
5973 f
->open_object_section("recovery_progress");
5974 recovery_progress
.dump(f
);
5979 ostream
&PullOp::print(ostream
&out
) const
5982 << "PullOp(" << soid
5983 << ", recovery_info: " << recovery_info
5984 << ", recovery_progress: " << recovery_progress
5988 ostream
& operator<<(ostream
& out
, const PullOp
&op
)
5990 return op
.print(out
);
5993 uint64_t PullOp::cost(CephContext
*cct
) const
5995 return cct
->_conf
->osd_push_per_object_cost
+
5996 cct
->_conf
->osd_recovery_max_chunk
;
6000 void PushOp::generate_test_instances(list
<PushOp
*> &o
)
6002 o
.push_back(new PushOp
);
6003 o
.push_back(new PushOp
);
6004 o
.back()->soid
= hobject_t(sobject_t("asdf", 2));
6005 o
.back()->version
= eversion_t(3, 10);
6006 o
.push_back(new PushOp
);
6007 o
.back()->soid
= hobject_t(sobject_t("asdf", CEPH_NOSNAP
));
6008 o
.back()->version
= eversion_t(0, 0);
6011 void PushOp::encode(bufferlist
&bl
, uint64_t features
) const
6013 ENCODE_START(1, 1, bl
);
6015 encode(version
, bl
);
6017 encode(data_included
, bl
);
6018 encode(omap_header
, bl
);
6019 encode(omap_entries
, bl
);
6020 encode(attrset
, bl
);
6021 encode(recovery_info
, bl
, features
);
6022 encode(after_progress
, bl
);
6023 encode(before_progress
, bl
);
6027 void PushOp::decode(bufferlist::const_iterator
&bl
)
6029 DECODE_START(1, bl
);
6031 decode(version
, bl
);
6033 decode(data_included
, bl
);
6034 decode(omap_header
, bl
);
6035 decode(omap_entries
, bl
);
6036 decode(attrset
, bl
);
6037 decode(recovery_info
, bl
);
6038 decode(after_progress
, bl
);
6039 decode(before_progress
, bl
);
6043 void PushOp::dump(Formatter
*f
) const
6045 f
->dump_stream("soid") << soid
;
6046 f
->dump_stream("version") << version
;
6047 f
->dump_int("data_len", data
.length());
6048 f
->dump_stream("data_included") << data_included
;
6049 f
->dump_int("omap_header_len", omap_header
.length());
6050 f
->dump_int("omap_entries_len", omap_entries
.size());
6051 f
->dump_int("attrset_len", attrset
.size());
6053 f
->open_object_section("recovery_info");
6054 recovery_info
.dump(f
);
6058 f
->open_object_section("after_progress");
6059 after_progress
.dump(f
);
6063 f
->open_object_section("before_progress");
6064 before_progress
.dump(f
);
6069 ostream
&PushOp::print(ostream
&out
) const
6072 << "PushOp(" << soid
6073 << ", version: " << version
6074 << ", data_included: " << data_included
6075 << ", data_size: " << data
.length()
6076 << ", omap_header_size: " << omap_header
.length()
6077 << ", omap_entries_size: " << omap_entries
.size()
6078 << ", attrset_size: " << attrset
.size()
6079 << ", recovery_info: " << recovery_info
6080 << ", after_progress: " << after_progress
6081 << ", before_progress: " << before_progress
6085 ostream
& operator<<(ostream
& out
, const PushOp
&op
)
6087 return op
.print(out
);
6090 uint64_t PushOp::cost(CephContext
*cct
) const
6092 uint64_t cost
= data_included
.size();
6093 for (map
<string
, bufferlist
>::const_iterator i
=
6094 omap_entries
.begin();
6095 i
!= omap_entries
.end();
6097 cost
+= i
->second
.length();
6099 cost
+= cct
->_conf
->osd_push_per_object_cost
;
6105 void ScrubMap::merge_incr(const ScrubMap
&l
)
6107 ceph_assert(valid_through
== l
.incr_since
);
6108 valid_through
= l
.valid_through
;
6110 for (map
<hobject_t
,object
>::const_iterator p
= l
.objects
.begin();
6111 p
!= l
.objects
.end();
6113 if (p
->second
.negative
) {
6114 map
<hobject_t
,object
>::iterator q
= objects
.find(p
->first
);
6115 if (q
!= objects
.end()) {
6119 objects
[p
->first
] = p
->second
;
6124 void ScrubMap::encode(bufferlist
& bl
) const
6126 ENCODE_START(3, 2, bl
);
6127 encode(objects
, bl
);
6128 encode((__u32
)0, bl
); // used to be attrs; now deprecated
6129 bufferlist old_logbl
; // not used
6130 encode(old_logbl
, bl
);
6131 encode(valid_through
, bl
);
6132 encode(incr_since
, bl
);
6136 void ScrubMap::decode(bufferlist::const_iterator
& bl
, int64_t pool
)
6138 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl
);
6139 decode(objects
, bl
);
6141 map
<string
,string
> attrs
; // deprecated
6144 bufferlist old_logbl
; // not used
6145 decode(old_logbl
, bl
);
6146 decode(valid_through
, bl
);
6147 decode(incr_since
, bl
);
6150 // handle hobject_t upgrade
6152 map
<hobject_t
, object
> tmp
;
6154 for (map
<hobject_t
, object
>::iterator i
= tmp
.begin();
6157 hobject_t
first(i
->first
);
6158 if (!first
.is_max() && first
.pool
== -1)
6160 objects
[first
] = i
->second
;
6165 void ScrubMap::dump(Formatter
*f
) const
6167 f
->dump_stream("valid_through") << valid_through
;
6168 f
->dump_stream("incremental_since") << incr_since
;
6169 f
->open_array_section("objects");
6170 for (map
<hobject_t
,object
>::const_iterator p
= objects
.begin(); p
!= objects
.end(); ++p
) {
6171 f
->open_object_section("object");
6172 f
->dump_string("name", p
->first
.oid
.name
);
6173 f
->dump_unsigned("hash", p
->first
.get_hash());
6174 f
->dump_string("key", p
->first
.get_key());
6175 f
->dump_int("snapid", p
->first
.snap
);
6182 void ScrubMap::generate_test_instances(list
<ScrubMap
*>& o
)
6184 o
.push_back(new ScrubMap
);
6185 o
.push_back(new ScrubMap
);
6186 o
.back()->valid_through
= eversion_t(1, 2);
6187 o
.back()->incr_since
= eversion_t(3, 4);
6189 object::generate_test_instances(obj
);
6190 o
.back()->objects
[hobject_t(object_t("foo"), "fookey", 123, 456, 0, "")] = *obj
.back();
6192 o
.back()->objects
[hobject_t(object_t("bar"), string(), 123, 456, 0, "")] = *obj
.back();
6195 // -- ScrubMap::object --
6197 void ScrubMap::object::encode(bufferlist
& bl
) const
6199 bool compat_read_error
= read_error
|| ec_hash_mismatch
|| ec_size_mismatch
;
6200 ENCODE_START(10, 7, bl
);
6202 encode(negative
, bl
);
6205 encode(digest_present
, bl
);
6206 encode((uint32_t)0, bl
); // obsolete nlinks
6207 encode((uint32_t)0, bl
); // snapcolls
6208 encode(omap_digest
, bl
);
6209 encode(omap_digest_present
, bl
);
6210 encode(compat_read_error
, bl
);
6211 encode(stat_error
, bl
);
6212 encode(read_error
, bl
);
6213 encode(ec_hash_mismatch
, bl
);
6214 encode(ec_size_mismatch
, bl
);
6215 encode(large_omap_object_found
, bl
);
6216 encode(large_omap_object_key_count
, bl
);
6217 encode(large_omap_object_value_size
, bl
);
6218 encode(object_omap_bytes
, bl
);
6219 encode(object_omap_keys
, bl
);
6223 void ScrubMap::object::decode(bufferlist::const_iterator
& bl
)
6225 DECODE_START(10, bl
);
6227 bool tmp
, compat_read_error
= false;
6233 digest_present
= tmp
;
6237 set
<snapid_t
> snapcolls
;
6238 decode(snapcolls
, bl
);
6240 decode(omap_digest
, bl
);
6242 omap_digest_present
= tmp
;
6243 decode(compat_read_error
, bl
);
6246 if (struct_v
>= 8) {
6250 ec_hash_mismatch
= tmp
;
6252 ec_size_mismatch
= tmp
;
6254 // If older encoder found a read_error, set read_error
6255 if (compat_read_error
&& !read_error
&& !ec_hash_mismatch
&& !ec_size_mismatch
)
6257 if (struct_v
>= 9) {
6259 large_omap_object_found
= tmp
;
6260 decode(large_omap_object_key_count
, bl
);
6261 decode(large_omap_object_value_size
, bl
);
6263 if (struct_v
>= 10) {
6264 decode(object_omap_bytes
, bl
);
6265 decode(object_omap_keys
, bl
);
6270 void ScrubMap::object::dump(Formatter
*f
) const
6272 f
->dump_int("size", size
);
6273 f
->dump_int("negative", negative
);
6274 f
->open_array_section("attrs");
6275 for (map
<string
,bufferptr
>::const_iterator p
= attrs
.begin(); p
!= attrs
.end(); ++p
) {
6276 f
->open_object_section("attr");
6277 f
->dump_string("name", p
->first
);
6278 f
->dump_int("length", p
->second
.length());
6284 void ScrubMap::object::generate_test_instances(list
<object
*>& o
)
6286 o
.push_back(new object
);
6287 o
.push_back(new object
);
6288 o
.back()->negative
= true;
6289 o
.push_back(new object
);
6290 o
.back()->size
= 123;
6291 o
.back()->attrs
["foo"] = buffer::copy("foo", 3);
6292 o
.back()->attrs
["bar"] = buffer::copy("barval", 6);
6297 ostream
& operator<<(ostream
& out
, const OSDOp
& op
)
6299 out
<< ceph_osd_op_name(op
.op
.op
);
6300 if (ceph_osd_op_type_data(op
.op
.op
)) {
6303 case CEPH_OSD_OP_ASSERT_VER
:
6304 out
<< " v" << op
.op
.assert_ver
.ver
;
6306 case CEPH_OSD_OP_TRUNCATE
:
6307 out
<< " " << op
.op
.extent
.offset
;
6309 case CEPH_OSD_OP_MASKTRUNC
:
6310 case CEPH_OSD_OP_TRIMTRUNC
:
6311 out
<< " " << op
.op
.extent
.truncate_seq
<< "@"
6312 << (int64_t)op
.op
.extent
.truncate_size
;
6314 case CEPH_OSD_OP_ROLLBACK
:
6315 out
<< " " << snapid_t(op
.op
.snap
.snapid
);
6317 case CEPH_OSD_OP_WATCH
:
6318 out
<< " " << ceph_osd_watch_op_name(op
.op
.watch
.op
)
6319 << " cookie " << op
.op
.watch
.cookie
;
6320 if (op
.op
.watch
.gen
)
6321 out
<< " gen " << op
.op
.watch
.gen
;
6323 case CEPH_OSD_OP_NOTIFY
:
6324 out
<< " cookie " << op
.op
.notify
.cookie
;
6326 case CEPH_OSD_OP_COPY_GET
:
6327 out
<< " max " << op
.op
.copy_get
.max
;
6329 case CEPH_OSD_OP_COPY_FROM
:
6330 out
<< " ver " << op
.op
.copy_from
.src_version
;
6332 case CEPH_OSD_OP_SETALLOCHINT
:
6333 out
<< " object_size " << op
.op
.alloc_hint
.expected_object_size
6334 << " write_size " << op
.op
.alloc_hint
.expected_write_size
;
6336 case CEPH_OSD_OP_READ
:
6337 case CEPH_OSD_OP_SPARSE_READ
:
6338 case CEPH_OSD_OP_SYNC_READ
:
6339 case CEPH_OSD_OP_WRITE
:
6340 case CEPH_OSD_OP_WRITEFULL
:
6341 case CEPH_OSD_OP_ZERO
:
6342 case CEPH_OSD_OP_APPEND
:
6343 case CEPH_OSD_OP_MAPEXT
:
6344 case CEPH_OSD_OP_CMPEXT
:
6345 out
<< " " << op
.op
.extent
.offset
<< "~" << op
.op
.extent
.length
;
6346 if (op
.op
.extent
.truncate_seq
)
6347 out
<< " [" << op
.op
.extent
.truncate_seq
<< "@"
6348 << (int64_t)op
.op
.extent
.truncate_size
<< "]";
6350 out
<< " [" << ceph_osd_op_flag_string(op
.op
.flags
) << "]";
6352 // don't show any arg info
6355 } else if (ceph_osd_op_type_attr(op
.op
.op
)) {
6357 if (op
.op
.xattr
.name_len
&& op
.indata
.length()) {
6359 op
.indata
.write(0, op
.op
.xattr
.name_len
, out
);
6361 if (op
.op
.xattr
.value_len
)
6362 out
<< " (" << op
.op
.xattr
.value_len
<< ")";
6363 if (op
.op
.op
== CEPH_OSD_OP_CMPXATTR
)
6364 out
<< " op " << (int)op
.op
.xattr
.cmp_op
6365 << " mode " << (int)op
.op
.xattr
.cmp_mode
;
6366 } else if (ceph_osd_op_type_exec(op
.op
.op
)) {
6368 if (op
.op
.cls
.class_len
&& op
.indata
.length()) {
6370 op
.indata
.write(0, op
.op
.cls
.class_len
, out
);
6372 op
.indata
.write(op
.op
.cls
.class_len
, op
.op
.cls
.method_len
, out
);
6374 } else if (ceph_osd_op_type_pg(op
.op
.op
)) {
6376 case CEPH_OSD_OP_PGLS
:
6377 case CEPH_OSD_OP_PGLS_FILTER
:
6378 case CEPH_OSD_OP_PGNLS
:
6379 case CEPH_OSD_OP_PGNLS_FILTER
:
6380 out
<< " start_epoch " << op
.op
.pgls
.start_epoch
;
6382 case CEPH_OSD_OP_PG_HITSET_LS
:
6384 case CEPH_OSD_OP_PG_HITSET_GET
:
6385 out
<< " " << utime_t(op
.op
.hit_set_get
.stamp
);
6387 case CEPH_OSD_OP_SCRUBLS
:
6395 void OSDOp::split_osd_op_vector_in_data(vector
<OSDOp
>& ops
, bufferlist
& in
)
6397 bufferlist::iterator datap
= in
.begin();
6398 for (unsigned i
= 0; i
< ops
.size(); i
++) {
6399 if (ops
[i
].op
.payload_len
) {
6400 datap
.copy(ops
[i
].op
.payload_len
, ops
[i
].indata
);
6405 void OSDOp::merge_osd_op_vector_in_data(vector
<OSDOp
>& ops
, bufferlist
& out
)
6407 for (unsigned i
= 0; i
< ops
.size(); i
++) {
6408 if (ops
[i
].indata
.length()) {
6409 ops
[i
].op
.payload_len
= ops
[i
].indata
.length();
6410 out
.append(ops
[i
].indata
);
6415 void OSDOp::split_osd_op_vector_out_data(vector
<OSDOp
>& ops
, bufferlist
& in
)
6417 bufferlist::iterator datap
= in
.begin();
6418 for (unsigned i
= 0; i
< ops
.size(); i
++) {
6419 if (ops
[i
].op
.payload_len
) {
6420 datap
.copy(ops
[i
].op
.payload_len
, ops
[i
].outdata
);
6425 void OSDOp::merge_osd_op_vector_out_data(vector
<OSDOp
>& ops
, bufferlist
& out
)
6427 for (unsigned i
= 0; i
< ops
.size(); i
++) {
6428 if (ops
[i
].outdata
.length()) {
6429 ops
[i
].op
.payload_len
= ops
[i
].outdata
.length();
6430 out
.append(ops
[i
].outdata
);
6435 void OSDOp::clear_data(vector
<OSDOp
>& ops
)
6437 for (unsigned i
= 0; i
< ops
.size(); i
++) {
6440 if (ceph_osd_op_type_attr(op
.op
.op
) &&
6441 op
.op
.xattr
.name_len
&&
6442 op
.indata
.length() >= op
.op
.xattr
.name_len
) {
6443 bufferptr
bp(op
.op
.xattr
.name_len
);
6446 bl
.copy_in(0, op
.op
.xattr
.name_len
, op
.indata
);
6447 op
.indata
.claim(bl
);
6448 } else if (ceph_osd_op_type_exec(op
.op
.op
) &&
6449 op
.op
.cls
.class_len
&&
6450 op
.indata
.length() >
6451 (op
.op
.cls
.class_len
+ op
.op
.cls
.method_len
)) {
6452 __u8 len
= op
.op
.cls
.class_len
+ op
.op
.cls
.method_len
;
6456 bl
.copy_in(0, len
, op
.indata
);
6457 op
.indata
.claim(bl
);