1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2011 New Dream Network
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
9 * Author: Loic Dachary <loic@dachary.org>
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
18 #include <boost/assign/list_of.hpp>
20 #include "osd_types.h"
21 #include "include/ceph_features.h"
23 #include "crush/hash.h"
27 #include "PGBackend.h"
29 const char *ceph_osd_flag_name(unsigned flag
)
32 case CEPH_OSD_FLAG_ACK
: return "ack";
33 case CEPH_OSD_FLAG_ONNVRAM
: return "onnvram";
34 case CEPH_OSD_FLAG_ONDISK
: return "ondisk";
35 case CEPH_OSD_FLAG_RETRY
: return "retry";
36 case CEPH_OSD_FLAG_READ
: return "read";
37 case CEPH_OSD_FLAG_WRITE
: return "write";
38 case CEPH_OSD_FLAG_ORDERSNAP
: return "ordersnap";
39 case CEPH_OSD_FLAG_PEERSTAT_OLD
: return "peerstat_old";
40 case CEPH_OSD_FLAG_BALANCE_READS
: return "balance_reads";
41 case CEPH_OSD_FLAG_PARALLELEXEC
: return "parallelexec";
42 case CEPH_OSD_FLAG_PGOP
: return "pgop";
43 case CEPH_OSD_FLAG_EXEC
: return "exec";
44 case CEPH_OSD_FLAG_EXEC_PUBLIC
: return "exec_public";
45 case CEPH_OSD_FLAG_LOCALIZE_READS
: return "localize_reads";
46 case CEPH_OSD_FLAG_RWORDERED
: return "rwordered";
47 case CEPH_OSD_FLAG_IGNORE_CACHE
: return "ignore_cache";
48 case CEPH_OSD_FLAG_SKIPRWLOCKS
: return "skiprwlocks";
49 case CEPH_OSD_FLAG_IGNORE_OVERLAY
: return "ignore_overlay";
50 case CEPH_OSD_FLAG_FLUSH
: return "flush";
51 case CEPH_OSD_FLAG_MAP_SNAP_CLONE
: return "map_snap_clone";
52 case CEPH_OSD_FLAG_ENFORCE_SNAPC
: return "enforce_snapc";
53 case CEPH_OSD_FLAG_REDIRECTED
: return "redirected";
54 case CEPH_OSD_FLAG_KNOWN_REDIR
: return "known_if_redirected";
55 case CEPH_OSD_FLAG_FULL_TRY
: return "full_try";
56 case CEPH_OSD_FLAG_FULL_FORCE
: return "full_force";
57 case CEPH_OSD_FLAG_IGNORE_REDIRECT
: return "ignore_redirect";
58 default: return "???";
62 string
ceph_osd_flag_string(unsigned flags
)
65 for (unsigned i
=0; i
<32; ++i
) {
66 if (flags
& (1u<<i
)) {
69 s
+= ceph_osd_flag_name(1u << i
);
77 const char * ceph_osd_op_flag_name(unsigned flag
)
82 case CEPH_OSD_OP_FLAG_EXCL
:
85 case CEPH_OSD_OP_FLAG_FAILOK
:
88 case CEPH_OSD_OP_FLAG_FADVISE_RANDOM
:
89 name
= "fadvise_random";
91 case CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL
:
92 name
= "fadvise_sequential";
94 case CEPH_OSD_OP_FLAG_FADVISE_WILLNEED
:
95 name
= "favise_willneed";
97 case CEPH_OSD_OP_FLAG_FADVISE_DONTNEED
:
98 name
= "fadvise_dontneed";
100 case CEPH_OSD_OP_FLAG_FADVISE_NOCACHE
:
101 name
= "fadvise_nocache";
103 case CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE
:
104 name
= "bypass_clean_cache";
113 string
ceph_osd_op_flag_string(unsigned flags
)
116 for (unsigned i
=0; i
<32; ++i
) {
117 if (flags
& (1u<<i
)) {
120 s
+= ceph_osd_op_flag_name(1u << i
);
128 string
ceph_osd_alloc_hint_flag_string(unsigned flags
)
131 for (unsigned i
=0; i
<32; ++i
) {
132 if (flags
& (1u<<i
)) {
135 s
+= ceph_osd_alloc_hint_flag_name(1u << i
);
143 void pg_shard_t::encode(bufferlist
&bl
) const
145 ENCODE_START(1, 1, bl
);
150 void pg_shard_t::decode(bufferlist::iterator
&bl
)
158 ostream
&operator<<(ostream
&lhs
, const pg_shard_t
&rhs
)
160 if (rhs
.is_undefined())
162 if (rhs
.shard
== shard_id_t::NO_SHARD
)
163 return lhs
<< rhs
.get_osd();
164 return lhs
<< rhs
.get_osd() << '(' << (unsigned)(rhs
.shard
) << ')';
168 void osd_reqid_t::dump(Formatter
*f
) const
170 f
->dump_stream("name") << name
;
171 f
->dump_int("inc", inc
);
172 f
->dump_unsigned("tid", tid
);
175 void osd_reqid_t::generate_test_instances(list
<osd_reqid_t
*>& o
)
177 o
.push_back(new osd_reqid_t
);
178 o
.push_back(new osd_reqid_t(entity_name_t::CLIENT(123), 1, 45678));
181 // -- object_locator_t --
183 void object_locator_t::encode(bufferlist
& bl
) const
185 // verify that nobody's corrupted the locator
186 assert(hash
== -1 || key
.empty());
187 __u8 encode_compat
= 3;
188 ENCODE_START(6, encode_compat
, bl
);
190 int32_t preferred
= -1; // tell old code there is no preferred osd (-1).
191 ::encode(preferred
, bl
);
193 ::encode(nspace
, bl
);
196 encode_compat
= MAX(encode_compat
, 6); // need to interpret the hash
197 ENCODE_FINISH_NEW_COMPAT(bl
, encode_compat
);
200 void object_locator_t::decode(bufferlist::iterator
& p
)
202 DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, p
);
212 ::decode(preferred
, p
);
222 // verify that nobody's corrupted the locator
223 assert(hash
== -1 || key
.empty());
226 void object_locator_t::dump(Formatter
*f
) const
228 f
->dump_int("pool", pool
);
229 f
->dump_string("key", key
);
230 f
->dump_string("namespace", nspace
);
231 f
->dump_int("hash", hash
);
234 void object_locator_t::generate_test_instances(list
<object_locator_t
*>& o
)
236 o
.push_back(new object_locator_t
);
237 o
.push_back(new object_locator_t(123));
238 o
.push_back(new object_locator_t(123, 876));
239 o
.push_back(new object_locator_t(1, "n2"));
240 o
.push_back(new object_locator_t(1234, "", "key"));
241 o
.push_back(new object_locator_t(12, "n1", "key2"));
244 // -- request_redirect_t --
245 void request_redirect_t::encode(bufferlist
& bl
) const
247 ENCODE_START(1, 1, bl
);
248 ::encode(redirect_locator
, bl
);
249 ::encode(redirect_object
, bl
);
250 ::encode(osd_instructions
, bl
);
254 void request_redirect_t::decode(bufferlist::iterator
& bl
)
257 ::decode(redirect_locator
, bl
);
258 ::decode(redirect_object
, bl
);
259 ::decode(osd_instructions
, bl
);
263 void request_redirect_t::dump(Formatter
*f
) const
265 f
->dump_string("object", redirect_object
);
266 f
->open_object_section("locator");
267 redirect_locator
.dump(f
);
268 f
->close_section(); // locator
271 void request_redirect_t::generate_test_instances(list
<request_redirect_t
*>& o
)
273 object_locator_t
loc(1, "redir_obj");
274 o
.push_back(new request_redirect_t());
275 o
.push_back(new request_redirect_t(loc
, 0));
276 o
.push_back(new request_redirect_t(loc
, "redir_obj"));
277 o
.push_back(new request_redirect_t(loc
));
280 void objectstore_perf_stat_t::dump(Formatter
*f
) const
282 f
->dump_unsigned("commit_latency_ms", os_commit_latency
);
283 f
->dump_unsigned("apply_latency_ms", os_apply_latency
);
286 void objectstore_perf_stat_t::encode(bufferlist
&bl
) const
288 ENCODE_START(1, 1, bl
);
289 ::encode(os_commit_latency
, bl
);
290 ::encode(os_apply_latency
, bl
);
294 void objectstore_perf_stat_t::decode(bufferlist::iterator
&bl
)
297 ::decode(os_commit_latency
, bl
);
298 ::decode(os_apply_latency
, bl
);
302 void objectstore_perf_stat_t::generate_test_instances(std::list
<objectstore_perf_stat_t
*>& o
)
304 o
.push_back(new objectstore_perf_stat_t());
305 o
.push_back(new objectstore_perf_stat_t());
306 o
.back()->os_commit_latency
= 20;
307 o
.back()->os_apply_latency
= 30;
311 void osd_stat_t::dump(Formatter
*f
) const
313 f
->dump_unsigned("up_from", up_from
);
314 f
->dump_unsigned("seq", seq
);
315 f
->dump_unsigned("num_pgs", num_pgs
);
316 f
->dump_unsigned("kb", kb
);
317 f
->dump_unsigned("kb_used", kb_used
);
318 f
->dump_unsigned("kb_avail", kb_avail
);
319 f
->open_array_section("hb_peers");
320 for (auto p
: hb_peers
)
321 f
->dump_int("osd", p
);
323 f
->dump_int("snap_trim_queue_len", snap_trim_queue_len
);
324 f
->dump_int("num_snap_trimming", num_snap_trimming
);
325 f
->open_object_section("op_queue_age_hist");
326 op_queue_age_hist
.dump(f
);
328 f
->open_object_section("perf_stat");
329 os_perf_stat
.dump(f
);
333 void osd_stat_t::encode(bufferlist
&bl
) const
335 ENCODE_START(7, 2, bl
);
337 ::encode(kb_used
, bl
);
338 ::encode(kb_avail
, bl
);
339 ::encode(snap_trim_queue_len
, bl
);
340 ::encode(num_snap_trimming
, bl
);
341 ::encode(hb_peers
, bl
);
342 ::encode((uint32_t)0, bl
);
343 ::encode(op_queue_age_hist
, bl
);
344 ::encode(os_perf_stat
, bl
);
345 ::encode(up_from
, bl
);
347 ::encode(num_pgs
, bl
);
351 void osd_stat_t::decode(bufferlist::iterator
&bl
)
353 DECODE_START_LEGACY_COMPAT_LEN(6, 2, 2, bl
);
355 ::decode(kb_used
, bl
);
356 ::decode(kb_avail
, bl
);
357 ::decode(snap_trim_queue_len
, bl
);
358 ::decode(num_snap_trimming
, bl
);
359 ::decode(hb_peers
, bl
);
360 vector
<int> num_hb_out
;
361 ::decode(num_hb_out
, bl
);
363 ::decode(op_queue_age_hist
, bl
);
365 ::decode(os_perf_stat
, bl
);
367 ::decode(up_from
, bl
);
371 ::decode(num_pgs
, bl
);
376 void osd_stat_t::generate_test_instances(std::list
<osd_stat_t
*>& o
)
378 o
.push_back(new osd_stat_t
);
380 o
.push_back(new osd_stat_t
);
382 o
.back()->kb_used
= 2;
383 o
.back()->kb_avail
= 3;
384 o
.back()->hb_peers
.push_back(7);
385 o
.back()->snap_trim_queue_len
= 8;
386 o
.back()->num_snap_trimming
= 99;
391 int pg_t::print(char *o
, int maxlen
) const
393 if (preferred() >= 0)
394 return snprintf(o
, maxlen
, "%llu.%xp%d", (unsigned long long)pool(), ps(), preferred());
396 return snprintf(o
, maxlen
, "%llu.%x", (unsigned long long)pool(), ps());
399 bool pg_t::parse(const char *s
)
404 int r
= sscanf(s
, "%llu.%xp%d", (long long unsigned *)&ppool
, &pseed
, &pref
);
416 bool spg_t::parse(const char *s
)
418 pgid
.set_preferred(-1);
419 shard
= shard_id_t::NO_SHARD
;
424 int r
= sscanf(s
, "%llu.%x", (long long unsigned *)&ppool
, &pseed
);
427 pgid
.set_pool(ppool
);
430 const char *p
= strchr(s
, 'p');
432 r
= sscanf(p
, "p%d", &pref
);
434 pgid
.set_preferred(pref
);
442 r
= sscanf(p
, "s%d", &pshard
);
444 shard
= shard_id_t(pshard
);
452 char *spg_t::calc_name(char *buf
, const char *suffix_backwords
) const
454 while (*suffix_backwords
)
455 *--buf
= *suffix_backwords
++;
457 if (!is_no_shard()) {
458 buf
= ritoa
<uint8_t, 10>((uint8_t)shard
.id
, buf
);
462 return pgid
.calc_name(buf
, "");
465 ostream
& operator<<(ostream
& out
, const spg_t
&pg
)
467 char buf
[spg_t::calc_name_buf_size
];
468 buf
[spg_t::calc_name_buf_size
- 1] = '\0';
469 out
<< pg
.calc_name(buf
+ spg_t::calc_name_buf_size
- 1, "");
473 pg_t
pg_t::get_ancestor(unsigned old_pg_num
) const
475 int old_bits
= cbits(old_pg_num
);
476 int old_mask
= (1 << old_bits
) - 1;
478 ret
.m_seed
= ceph_stable_mod(m_seed
, old_pg_num
, old_mask
);
482 bool pg_t::is_split(unsigned old_pg_num
, unsigned new_pg_num
, set
<pg_t
> *children
) const
484 assert(m_seed
< old_pg_num
);
485 if (new_pg_num
<= old_pg_num
)
490 unsigned old_bits
= cbits(old_pg_num
);
491 unsigned old_mask
= (1 << old_bits
) - 1;
492 for (unsigned n
= 1; ; n
++) {
493 unsigned next_bit
= (n
<< (old_bits
-1));
494 unsigned s
= next_bit
| m_seed
;
496 if (s
< old_pg_num
|| s
== m_seed
)
500 if ((unsigned)ceph_stable_mod(s
, old_pg_num
, old_mask
) == m_seed
) {
503 children
->insert(pg_t(s
, m_pool
, m_preferred
));
509 int old_bits
= cbits(old_pg_num
);
510 int old_mask
= (1 << old_bits
) - 1;
511 for (unsigned x
= old_pg_num
; x
< new_pg_num
; ++x
) {
512 unsigned o
= ceph_stable_mod(x
, old_pg_num
, old_mask
);
515 children
->insert(pg_t(x
, m_pool
, m_preferred
));
522 unsigned pg_t::get_split_bits(unsigned pg_num
) const {
527 // Find unique p such that pg_num \in [2^(p-1), 2^p)
528 unsigned p
= cbits(pg_num
);
529 assert(p
); // silence coverity #751330
531 if ((m_seed
% (1<<(p
-1))) < (pg_num
% (1<<(p
-1))))
537 pg_t
pg_t::get_parent() const
539 unsigned bits
= cbits(m_seed
);
542 retval
.m_seed
&= ~((~0)<<(bits
- 1));
546 hobject_t
pg_t::get_hobj_start() const
548 return hobject_t(object_t(), string(), CEPH_NOSNAP
, m_seed
, m_pool
,
552 hobject_t
pg_t::get_hobj_end(unsigned pg_num
) const
554 // note: this assumes a bitwise sort; with the legacy nibblewise
555 // sort a PG did not always cover a single contiguous range of the
556 // (bit-reversed) hash range.
557 unsigned bits
= get_split_bits(pg_num
);
558 uint64_t rev_start
= hobject_t::_reverse_bits(m_seed
);
559 uint64_t rev_end
= (rev_start
| (0xffffffff >> bits
)) + 1;
560 if (rev_end
>= 0x100000000) {
561 assert(rev_end
== 0x100000000);
562 return hobject_t::get_max();
564 return hobject_t(object_t(), string(), CEPH_NOSNAP
,
565 hobject_t::_reverse_bits(rev_end
), m_pool
,
570 void pg_t::dump(Formatter
*f
) const
572 f
->dump_unsigned("pool", m_pool
);
573 f
->dump_unsigned("seed", m_seed
);
574 f
->dump_int("preferred_osd", m_preferred
);
577 void pg_t::generate_test_instances(list
<pg_t
*>& o
)
579 o
.push_back(new pg_t
);
580 o
.push_back(new pg_t(1, 2, -1));
581 o
.push_back(new pg_t(13123, 3, -1));
582 o
.push_back(new pg_t(131223, 4, 23));
585 char *pg_t::calc_name(char *buf
, const char *suffix_backwords
) const
587 while (*suffix_backwords
)
588 *--buf
= *suffix_backwords
++;
590 if (m_preferred
>= 0)
593 buf
= ritoa
<uint32_t, 16>(m_seed
, buf
);
597 return ritoa
<uint64_t, 10>(m_pool
, buf
);
600 ostream
& operator<<(ostream
& out
, const pg_t
&pg
)
602 char buf
[pg_t::calc_name_buf_size
];
603 buf
[pg_t::calc_name_buf_size
- 1] = '\0';
604 out
<< pg
.calc_name(buf
+ pg_t::calc_name_buf_size
- 1, "");
611 void coll_t::calc_str()
615 strcpy(_str_buff
, "meta");
619 _str_buff
[spg_t::calc_name_buf_size
- 1] = '\0';
620 _str
= pgid
.calc_name(_str_buff
+ spg_t::calc_name_buf_size
- 1, "daeh_");
623 _str_buff
[spg_t::calc_name_buf_size
- 1] = '\0';
624 _str
= pgid
.calc_name(_str_buff
+ spg_t::calc_name_buf_size
- 1, "PMET_");
627 assert(0 == "unknown collection type");
631 bool coll_t::parse(const std::string
& s
)
641 if (s
.find("_head") == s
.length() - 5 &&
642 pgid
.parse(s
.substr(0, s
.length() - 5))) {
649 if (s
.find("_TEMP") == s
.length() - 5 &&
650 pgid
.parse(s
.substr(0, s
.length() - 5))) {
660 void coll_t::encode(bufferlist
& bl
) const
662 // when changing this, remember to update encoded_size() too.
664 // can't express this as v2...
666 ::encode(struct_v
, bl
);
667 ::encode(to_str(), bl
);
670 ::encode(struct_v
, bl
);
671 ::encode((__u8
)type
, bl
);
673 snapid_t snap
= CEPH_NOSNAP
;
678 size_t coll_t::encoded_size() const
680 size_t r
= sizeof(__u8
);
693 r
+= sizeof(ceph_le32
) + 2 * sizeof(__u8
);
695 r
+= sizeof(__u8
) + sizeof(uint64_t) + 2 * sizeof(uint32_t);
699 r
+= sizeof(uint64_t);
705 void coll_t::decode(bufferlist::iterator
& bl
)
708 ::decode(struct_v
, bl
);
717 if (pgid
== spg_t() && snap
== 0) {
733 type
= (type_t
)_type
;
742 bool ok
= parse(str
);
744 throw std::domain_error(std::string("unable to parse pg ") + str
);
751 oss
<< "coll_t::decode(): don't know how to decode version "
753 throw std::domain_error(oss
.str());
758 void coll_t::dump(Formatter
*f
) const
760 f
->dump_unsigned("type_id", (unsigned)type
);
761 if (type
!= TYPE_META
)
762 f
->dump_stream("pgid") << pgid
;
763 f
->dump_string("name", to_str());
766 void coll_t::generate_test_instances(list
<coll_t
*>& o
)
768 o
.push_back(new coll_t());
769 o
.push_back(new coll_t(spg_t(pg_t(1, 0), shard_id_t::NO_SHARD
)));
770 o
.push_back(new coll_t(o
.back()->get_temp()));
771 o
.push_back(new coll_t(spg_t(pg_t(3, 2), shard_id_t(12))));
772 o
.push_back(new coll_t(o
.back()->get_temp()));
773 o
.push_back(new coll_t());
778 std::string
pg_vector_string(const vector
<int32_t> &a
)
782 for (vector
<int32_t>::const_iterator i
= a
.begin(); i
!= a
.end(); ++i
) {
785 if (*i
!= CRUSH_ITEM_NONE
)
794 std::string
pg_state_string(int state
)
797 if (state
& PG_STATE_STALE
)
799 if (state
& PG_STATE_CREATING
)
801 if (state
& PG_STATE_ACTIVE
)
803 if (state
& PG_STATE_ACTIVATING
)
804 oss
<< "activating+";
805 if (state
& PG_STATE_CLEAN
)
807 if (state
& PG_STATE_RECOVERY_WAIT
)
808 oss
<< "recovery_wait+";
809 if (state
& PG_STATE_RECOVERY_TOOFULL
)
810 oss
<< "recovery_toofull+";
811 if (state
& PG_STATE_RECOVERING
)
812 oss
<< "recovering+";
813 if (state
& PG_STATE_FORCED_RECOVERY
)
814 oss
<< "forced_recovery+";
815 if (state
& PG_STATE_DOWN
)
817 if (state
& PG_STATE_RECOVERY_UNFOUND
)
818 oss
<< "recovery_unfound+";
819 if (state
& PG_STATE_BACKFILL_UNFOUND
)
820 oss
<< "backfill_unfound+";
821 if (state
& PG_STATE_UNDERSIZED
)
822 oss
<< "undersized+";
823 if (state
& PG_STATE_DEGRADED
)
825 if (state
& PG_STATE_REMAPPED
)
827 if (state
& PG_STATE_SCRUBBING
)
829 if (state
& PG_STATE_DEEP_SCRUB
)
831 if (state
& PG_STATE_INCONSISTENT
)
832 oss
<< "inconsistent+";
833 if (state
& PG_STATE_PEERING
)
835 if (state
& PG_STATE_REPAIR
)
837 if (state
& PG_STATE_BACKFILL_WAIT
)
838 oss
<< "backfill_wait+";
839 if (state
& PG_STATE_BACKFILLING
)
840 oss
<< "backfilling+";
841 if (state
& PG_STATE_FORCED_BACKFILL
)
842 oss
<< "forced_backfill+";
843 if (state
& PG_STATE_BACKFILL_TOOFULL
)
844 oss
<< "backfill_toofull+";
845 if (state
& PG_STATE_INCOMPLETE
)
846 oss
<< "incomplete+";
847 if (state
& PG_STATE_PEERED
)
849 if (state
& PG_STATE_SNAPTRIM
)
851 if (state
& PG_STATE_SNAPTRIM_WAIT
)
852 oss
<< "snaptrim_wait+";
853 if (state
& PG_STATE_SNAPTRIM_ERROR
)
854 oss
<< "snaptrim_error+";
855 string
ret(oss
.str());
856 if (ret
.length() > 0)
857 ret
.resize(ret
.length() - 1);
863 boost::optional
<uint64_t> pg_string_state(const std::string
& state
)
865 boost::optional
<uint64_t> type
;
866 if (state
== "active")
867 type
= PG_STATE_ACTIVE
;
868 else if (state
== "clean")
869 type
= PG_STATE_CLEAN
;
870 else if (state
== "down")
871 type
= PG_STATE_DOWN
;
872 else if (state
== "recovery_unfound")
873 type
= PG_STATE_RECOVERY_UNFOUND
;
874 else if (state
== "backfill_unfound")
875 type
= PG_STATE_BACKFILL_UNFOUND
;
876 else if (state
== "scrubbing")
877 type
= PG_STATE_SCRUBBING
;
878 else if (state
== "degraded")
879 type
= PG_STATE_DEGRADED
;
880 else if (state
== "inconsistent")
881 type
= PG_STATE_INCONSISTENT
;
882 else if (state
== "peering")
883 type
= PG_STATE_PEERING
;
884 else if (state
== "repair")
885 type
= PG_STATE_REPAIR
;
886 else if (state
== "recovering")
887 type
= PG_STATE_RECOVERING
;
888 else if (state
== "forced_recovery")
889 type
= PG_STATE_FORCED_RECOVERY
;
890 else if (state
== "backfill_wait")
891 type
= PG_STATE_BACKFILL_WAIT
;
892 else if (state
== "incomplete")
893 type
= PG_STATE_INCOMPLETE
;
894 else if (state
== "stale")
895 type
= PG_STATE_STALE
;
896 else if (state
== "remapped")
897 type
= PG_STATE_REMAPPED
;
898 else if (state
== "deep")
899 type
= PG_STATE_DEEP_SCRUB
;
900 else if (state
== "backfilling")
901 type
= PG_STATE_BACKFILLING
;
902 else if (state
== "forced_backfill")
903 type
= PG_STATE_FORCED_BACKFILL
;
904 else if (state
== "backfill_toofull")
905 type
= PG_STATE_BACKFILL_TOOFULL
;
906 else if (state
== "recovery_wait")
907 type
= PG_STATE_RECOVERY_WAIT
;
908 else if (state
== "recovery_toofull")
909 type
= PG_STATE_RECOVERY_TOOFULL
;
910 else if (state
== "undersized")
911 type
= PG_STATE_UNDERSIZED
;
912 else if (state
== "activating")
913 type
= PG_STATE_ACTIVATING
;
914 else if (state
== "peered")
915 type
= PG_STATE_PEERED
;
916 else if (state
== "snaptrim")
917 type
= PG_STATE_SNAPTRIM
;
918 else if (state
== "snaptrim_wait")
919 type
= PG_STATE_SNAPTRIM_WAIT
;
920 else if (state
== "snaptrim_error")
921 type
= PG_STATE_SNAPTRIM_ERROR
;
922 else if (state
== "creating")
923 type
= PG_STATE_CREATING
;
930 string
eversion_t::get_key_name() const
933 // Below is equivalent of sprintf("%010u.%020llu");
935 ritoa
<uint64_t, 10, 20>(version
, key
+ 31);
937 ritoa
<uint32_t, 10, 10>(epoch
, key
+ 10);
942 // -- pool_snap_info_t --
943 void pool_snap_info_t::dump(Formatter
*f
) const
945 f
->dump_unsigned("snapid", snapid
);
946 f
->dump_stream("stamp") << stamp
;
947 f
->dump_string("name", name
);
950 void pool_snap_info_t::encode(bufferlist
& bl
, uint64_t features
) const
952 if ((features
& CEPH_FEATURE_PGPOOL3
) == 0) {
954 ::encode(struct_v
, bl
);
955 ::encode(snapid
, bl
);
960 ENCODE_START(2, 2, bl
);
961 ::encode(snapid
, bl
);
967 void pool_snap_info_t::decode(bufferlist::iterator
& bl
)
969 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl
);
970 ::decode(snapid
, bl
);
976 void pool_snap_info_t::generate_test_instances(list
<pool_snap_info_t
*>& o
)
978 o
.push_back(new pool_snap_info_t
);
979 o
.push_back(new pool_snap_info_t
);
980 o
.back()->snapid
= 1;
981 o
.back()->stamp
= utime_t(1, 2);
982 o
.back()->name
= "foo";
987 typedef std::map
<std::string
, pool_opts_t::opt_desc_t
> opt_mapping_t
;
988 static opt_mapping_t opt_mapping
= boost::assign::map_list_of
989 ("scrub_min_interval", pool_opts_t::opt_desc_t(
990 pool_opts_t::SCRUB_MIN_INTERVAL
, pool_opts_t::DOUBLE
))
991 ("scrub_max_interval", pool_opts_t::opt_desc_t(
992 pool_opts_t::SCRUB_MAX_INTERVAL
, pool_opts_t::DOUBLE
))
993 ("deep_scrub_interval", pool_opts_t::opt_desc_t(
994 pool_opts_t::DEEP_SCRUB_INTERVAL
, pool_opts_t::DOUBLE
))
995 ("recovery_priority", pool_opts_t::opt_desc_t(
996 pool_opts_t::RECOVERY_PRIORITY
, pool_opts_t::INT
))
997 ("recovery_op_priority", pool_opts_t::opt_desc_t(
998 pool_opts_t::RECOVERY_OP_PRIORITY
, pool_opts_t::INT
))
999 ("scrub_priority", pool_opts_t::opt_desc_t(
1000 pool_opts_t::SCRUB_PRIORITY
, pool_opts_t::INT
))
1001 ("compression_mode", pool_opts_t::opt_desc_t(
1002 pool_opts_t::COMPRESSION_MODE
, pool_opts_t::STR
))
1003 ("compression_algorithm", pool_opts_t::opt_desc_t(
1004 pool_opts_t::COMPRESSION_ALGORITHM
, pool_opts_t::STR
))
1005 ("compression_required_ratio", pool_opts_t::opt_desc_t(
1006 pool_opts_t::COMPRESSION_REQUIRED_RATIO
, pool_opts_t::DOUBLE
))
1007 ("compression_max_blob_size", pool_opts_t::opt_desc_t(
1008 pool_opts_t::COMPRESSION_MAX_BLOB_SIZE
, pool_opts_t::INT
))
1009 ("compression_min_blob_size", pool_opts_t::opt_desc_t(
1010 pool_opts_t::COMPRESSION_MIN_BLOB_SIZE
, pool_opts_t::INT
))
1011 ("csum_type", pool_opts_t::opt_desc_t(
1012 pool_opts_t::CSUM_TYPE
, pool_opts_t::INT
))
1013 ("csum_max_block", pool_opts_t::opt_desc_t(
1014 pool_opts_t::CSUM_MAX_BLOCK
, pool_opts_t::INT
))
1015 ("csum_min_block", pool_opts_t::opt_desc_t(
1016 pool_opts_t::CSUM_MIN_BLOCK
, pool_opts_t::INT
));
1018 bool pool_opts_t::is_opt_name(const std::string
& name
) {
1019 return opt_mapping
.count(name
);
1022 pool_opts_t::opt_desc_t
pool_opts_t::get_opt_desc(const std::string
& name
) {
1023 opt_mapping_t::iterator i
= opt_mapping
.find(name
);
1024 assert(i
!= opt_mapping
.end());
1028 bool pool_opts_t::is_set(pool_opts_t::key_t key
) const {
1029 return opts
.count(key
);
1032 const pool_opts_t::value_t
& pool_opts_t::get(pool_opts_t::key_t key
) const {
1033 opts_t::const_iterator i
= opts
.find(key
);
1034 assert(i
!= opts
.end());
1038 bool pool_opts_t::unset(pool_opts_t::key_t key
) {
1039 return opts
.erase(key
) > 0;
1042 class pool_opts_dumper_t
: public boost::static_visitor
<>
1045 pool_opts_dumper_t(const std::string
& name_
, Formatter
* f_
) :
1046 name(name_
.c_str()), f(f_
) {}
1048 void operator()(std::string s
) const {
1049 f
->dump_string(name
, s
);
1051 void operator()(int i
) const {
1052 f
->dump_int(name
, i
);
1054 void operator()(double d
) const {
1055 f
->dump_float(name
, d
);
1063 void pool_opts_t::dump(const std::string
& name
, Formatter
* f
) const
1065 const opt_desc_t
& desc
= get_opt_desc(name
);
1066 opts_t::const_iterator i
= opts
.find(desc
.key
);
1067 if (i
== opts
.end()) {
1070 boost::apply_visitor(pool_opts_dumper_t(name
, f
), i
->second
);
1073 void pool_opts_t::dump(Formatter
* f
) const
1075 for (opt_mapping_t::iterator i
= opt_mapping
.begin(); i
!= opt_mapping
.end();
1077 const std::string
& name
= i
->first
;
1078 const opt_desc_t
& desc
= i
->second
;
1079 opts_t::const_iterator j
= opts
.find(desc
.key
);
1080 if (j
== opts
.end()) {
1083 boost::apply_visitor(pool_opts_dumper_t(name
, f
), j
->second
);
1087 class pool_opts_encoder_t
: public boost::static_visitor
<>
1090 explicit pool_opts_encoder_t(bufferlist
& bl_
) : bl(bl_
) {}
1092 void operator()(std::string s
) const {
1093 ::encode(static_cast<int32_t>(pool_opts_t::STR
), bl
);
1096 void operator()(int i
) const {
1097 ::encode(static_cast<int32_t>(pool_opts_t::INT
), bl
);
1100 void operator()(double d
) const {
1101 ::encode(static_cast<int32_t>(pool_opts_t::DOUBLE
), bl
);
1109 void pool_opts_t::encode(bufferlist
& bl
) const {
1110 ENCODE_START(1, 1, bl
);
1111 uint32_t n
= static_cast<uint32_t>(opts
.size());
1113 for (opts_t::const_iterator i
= opts
.begin(); i
!= opts
.end(); ++i
) {
1114 ::encode(static_cast<int32_t>(i
->first
), bl
);
1115 boost::apply_visitor(pool_opts_encoder_t(bl
), i
->second
);
1120 void pool_opts_t::decode(bufferlist::iterator
& bl
) {
1121 DECODE_START(1, bl
);
1132 opts
[static_cast<key_t
>(k
)] = s
;
1133 } else if (t
== INT
) {
1136 opts
[static_cast<key_t
>(k
)] = i
;
1137 } else if (t
== DOUBLE
) {
1140 opts
[static_cast<key_t
>(k
)] = d
;
1142 assert(!"invalid type");
1148 ostream
& operator<<(ostream
& out
, const pool_opts_t
& opts
)
1150 for (opt_mapping_t::iterator i
= opt_mapping
.begin(); i
!= opt_mapping
.end();
1152 const std::string
& name
= i
->first
;
1153 const pool_opts_t::opt_desc_t
& desc
= i
->second
;
1154 pool_opts_t::opts_t::const_iterator j
= opts
.opts
.find(desc
.key
);
1155 if (j
== opts
.opts
.end()) {
1158 out
<< " " << name
<< " " << j
->second
;
1165 const char *pg_pool_t::APPLICATION_NAME_CEPHFS("cephfs");
1166 const char *pg_pool_t::APPLICATION_NAME_RBD("rbd");
1167 const char *pg_pool_t::APPLICATION_NAME_RGW("rgw");
1169 void pg_pool_t::dump(Formatter
*f
) const
1171 f
->dump_unsigned("flags", get_flags());
1172 f
->dump_string("flags_names", get_flags_string());
1173 f
->dump_int("type", get_type());
1174 f
->dump_int("size", get_size());
1175 f
->dump_int("min_size", get_min_size());
1176 f
->dump_int("crush_rule", get_crush_rule());
1177 f
->dump_int("object_hash", get_object_hash());
1178 f
->dump_unsigned("pg_num", get_pg_num());
1179 f
->dump_unsigned("pg_placement_num", get_pgp_num());
1180 f
->dump_unsigned("crash_replay_interval", get_crash_replay_interval());
1181 f
->dump_stream("last_change") << get_last_change();
1182 f
->dump_stream("last_force_op_resend") << get_last_force_op_resend();
1183 f
->dump_stream("last_force_op_resend_preluminous")
1184 << get_last_force_op_resend_preluminous();
1185 f
->dump_unsigned("auid", get_auid());
1186 f
->dump_string("snap_mode", is_pool_snaps_mode() ? "pool" : "selfmanaged");
1187 f
->dump_unsigned("snap_seq", get_snap_seq());
1188 f
->dump_unsigned("snap_epoch", get_snap_epoch());
1189 f
->open_array_section("pool_snaps");
1190 for (map
<snapid_t
, pool_snap_info_t
>::const_iterator p
= snaps
.begin(); p
!= snaps
.end(); ++p
) {
1191 f
->open_object_section("pool_snap_info");
1196 f
->dump_stream("removed_snaps") << removed_snaps
;
1197 f
->dump_unsigned("quota_max_bytes", quota_max_bytes
);
1198 f
->dump_unsigned("quota_max_objects", quota_max_objects
);
1199 f
->open_array_section("tiers");
1200 for (set
<uint64_t>::const_iterator p
= tiers
.begin(); p
!= tiers
.end(); ++p
)
1201 f
->dump_unsigned("pool_id", *p
);
1203 f
->dump_int("tier_of", tier_of
);
1204 f
->dump_int("read_tier", read_tier
);
1205 f
->dump_int("write_tier", write_tier
);
1206 f
->dump_string("cache_mode", get_cache_mode_name());
1207 f
->dump_unsigned("target_max_bytes", target_max_bytes
);
1208 f
->dump_unsigned("target_max_objects", target_max_objects
);
1209 f
->dump_unsigned("cache_target_dirty_ratio_micro",
1210 cache_target_dirty_ratio_micro
);
1211 f
->dump_unsigned("cache_target_dirty_high_ratio_micro",
1212 cache_target_dirty_high_ratio_micro
);
1213 f
->dump_unsigned("cache_target_full_ratio_micro",
1214 cache_target_full_ratio_micro
);
1215 f
->dump_unsigned("cache_min_flush_age", cache_min_flush_age
);
1216 f
->dump_unsigned("cache_min_evict_age", cache_min_evict_age
);
1217 f
->dump_string("erasure_code_profile", erasure_code_profile
);
1218 f
->open_object_section("hit_set_params");
1219 hit_set_params
.dump(f
);
1220 f
->close_section(); // hit_set_params
1221 f
->dump_unsigned("hit_set_period", hit_set_period
);
1222 f
->dump_unsigned("hit_set_count", hit_set_count
);
1223 f
->dump_bool("use_gmt_hitset", use_gmt_hitset
);
1224 f
->dump_unsigned("min_read_recency_for_promote", min_read_recency_for_promote
);
1225 f
->dump_unsigned("min_write_recency_for_promote", min_write_recency_for_promote
);
1226 f
->dump_unsigned("hit_set_grade_decay_rate", hit_set_grade_decay_rate
);
1227 f
->dump_unsigned("hit_set_search_last_n", hit_set_search_last_n
);
1228 f
->open_array_section("grade_table");
1229 for (unsigned i
= 0; i
< hit_set_count
; ++i
)
1230 f
->dump_unsigned("value", get_grade(i
));
1232 f
->dump_unsigned("stripe_width", get_stripe_width());
1233 f
->dump_unsigned("expected_num_objects", expected_num_objects
);
1234 f
->dump_bool("fast_read", fast_read
);
1235 f
->open_object_section("options");
1237 f
->close_section(); // options
1238 f
->open_object_section("application_metadata");
1239 for (auto &app_pair
: application_metadata
) {
1240 f
->open_object_section(app_pair
.first
.c_str());
1241 for (auto &kv_pair
: app_pair
.second
) {
1242 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
1244 f
->close_section(); // application
1246 f
->close_section(); // application_metadata
1249 void pg_pool_t::convert_to_pg_shards(const vector
<int> &from
, set
<pg_shard_t
>* to
) const {
1250 for (size_t i
= 0; i
< from
.size(); ++i
) {
1251 if (from
[i
] != CRUSH_ITEM_NONE
) {
1255 ec_pool() ? shard_id_t(i
) : shard_id_t::NO_SHARD
));
1260 void pg_pool_t::calc_pg_masks()
1262 pg_num_mask
= (1 << cbits(pg_num
-1)) - 1;
1263 pgp_num_mask
= (1 << cbits(pgp_num
-1)) - 1;
1266 unsigned pg_pool_t::get_pg_num_divisor(pg_t pgid
) const
1268 if (pg_num
== pg_num_mask
+ 1)
1269 return pg_num
; // power-of-2 split
1270 unsigned mask
= pg_num_mask
>> 1;
1271 if ((pgid
.ps() & mask
) < (pg_num
& mask
))
1272 return pg_num_mask
+ 1; // smaller bin size (already split)
1274 return (pg_num_mask
+ 1) >> 1; // bigger bin (not yet split)
1278 * we have two snap modes:
1279 * - pool global snaps
1280 * - snap existence/non-existence defined by snaps[] and snap_seq
1281 * - user managed snaps
1282 * - removal governed by removed_snaps
1284 * we know which mode we're using based on whether removed_snaps is empty.
1286 bool pg_pool_t::is_pool_snaps_mode() const
1288 return removed_snaps
.empty() && get_snap_seq() > 0;
1291 bool pg_pool_t::is_unmanaged_snaps_mode() const
1293 return removed_snaps
.size() && get_snap_seq() > 0;
1296 bool pg_pool_t::is_removed_snap(snapid_t s
) const
1298 if (is_pool_snaps_mode())
1299 return s
<= get_snap_seq() && snaps
.count(s
) == 0;
1301 return removed_snaps
.contains(s
);
1305 * build set of known-removed sets from either pool snaps or
1306 * explicit removed_snaps set.
1308 void pg_pool_t::build_removed_snaps(interval_set
<snapid_t
>& rs
) const
1310 if (is_pool_snaps_mode()) {
1312 for (snapid_t s
= 1; s
<= get_snap_seq(); s
= s
+ 1)
1313 if (snaps
.count(s
) == 0)
1320 bool pg_pool_t::maybe_updated_removed_snaps(const interval_set
<snapid_t
>& cached
) const
1322 if (is_unmanaged_snaps_mode()) { // remove_unmanaged_snap increments range_end
1323 if (removed_snaps
.empty() || cached
.empty()) // range_end is undefined
1324 return removed_snaps
.empty() != cached
.empty();
1325 return removed_snaps
.range_end() != cached
.range_end();
1330 snapid_t
pg_pool_t::snap_exists(const char *s
) const
1332 for (map
<snapid_t
,pool_snap_info_t
>::const_iterator p
= snaps
.begin();
1335 if (p
->second
.name
== s
)
1336 return p
->second
.snapid
;
1340 void pg_pool_t::add_snap(const char *n
, utime_t stamp
)
1342 assert(!is_unmanaged_snaps_mode());
1343 snapid_t s
= get_snap_seq() + 1;
1345 snaps
[s
].snapid
= s
;
1347 snaps
[s
].stamp
= stamp
;
1350 void pg_pool_t::add_unmanaged_snap(uint64_t& snapid
)
1352 if (removed_snaps
.empty()) {
1353 assert(!is_pool_snaps_mode());
1354 removed_snaps
.insert(snapid_t(1));
1357 snapid
= snap_seq
= snap_seq
+ 1;
1360 void pg_pool_t::remove_snap(snapid_t s
)
1362 assert(snaps
.count(s
));
1364 snap_seq
= snap_seq
+ 1;
1367 void pg_pool_t::remove_unmanaged_snap(snapid_t s
)
1369 assert(is_unmanaged_snaps_mode());
1370 removed_snaps
.insert(s
);
1371 snap_seq
= snap_seq
+ 1;
1372 // try to add in the new seq, just to try to keep the interval_set contiguous
1373 if (!removed_snaps
.contains(get_snap_seq())) {
1374 removed_snaps
.insert(get_snap_seq());
1378 SnapContext
pg_pool_t::get_snap_context() const
1380 vector
<snapid_t
> s(snaps
.size());
1382 for (map
<snapid_t
, pool_snap_info_t
>::const_reverse_iterator p
= snaps
.rbegin();
1386 return SnapContext(get_snap_seq(), s
);
1389 uint32_t pg_pool_t::hash_key(const string
& key
, const string
& ns
) const
1392 return ceph_str_hash(object_hash
, key
.data(), key
.length());
1393 int nsl
= ns
.length();
1394 int len
= key
.length() + nsl
+ 1;
1396 memcpy(&buf
[0], ns
.data(), nsl
);
1398 memcpy(&buf
[nsl
+1], key
.data(), key
.length());
1399 return ceph_str_hash(object_hash
, &buf
[0], len
);
1402 uint32_t pg_pool_t::raw_hash_to_pg(uint32_t v
) const
1404 return ceph_stable_mod(v
, pg_num
, pg_num_mask
);
1408 * map a raw pg (with full precision ps) into an actual pg, for storage
1410 pg_t
pg_pool_t::raw_pg_to_pg(pg_t pg
) const
1412 pg
.set_ps(ceph_stable_mod(pg
.ps(), pg_num
, pg_num_mask
));
1417 * map raw pg (full precision ps) into a placement seed. include
1418 * pool id in that value so that different pools don't use the same
1421 ps_t
pg_pool_t::raw_pg_to_pps(pg_t pg
) const
1423 if (flags
& FLAG_HASHPSPOOL
) {
1424 // Hash the pool id so that pool PGs do not overlap.
1426 crush_hash32_2(CRUSH_HASH_RJENKINS1
,
1427 ceph_stable_mod(pg
.ps(), pgp_num
, pgp_num_mask
),
1430 // Legacy behavior; add ps and pool together. This is not a great
1431 // idea because the PGs from each pool will essentially overlap on
1432 // top of each other: 0.5 == 1.4 == 2.3 == ...
1434 ceph_stable_mod(pg
.ps(), pgp_num
, pgp_num_mask
) +
1439 uint32_t pg_pool_t::get_random_pg_position(pg_t pg
, uint32_t seed
) const
1441 uint32_t r
= crush_hash32_2(CRUSH_HASH_RJENKINS1
, seed
, 123);
1442 if (pg_num
== pg_num_mask
+ 1) {
1445 unsigned smaller_mask
= pg_num_mask
>> 1;
1446 if ((pg
.ps() & smaller_mask
) < (pg_num
& smaller_mask
)) {
1456 void pg_pool_t::encode(bufferlist
& bl
, uint64_t features
) const
1458 if ((features
& CEPH_FEATURE_PGPOOL3
) == 0) {
1459 // this encoding matches the old struct ceph_pg_pool
1461 ::encode(struct_v
, bl
);
1464 ::encode(crush_rule
, bl
);
1465 ::encode(object_hash
, bl
);
1466 ::encode(pg_num
, bl
);
1467 ::encode(pgp_num
, bl
);
1468 __u32 lpg_num
= 0, lpgp_num
= 0; // tell old code that there are no localized pgs.
1469 ::encode(lpg_num
, bl
);
1470 ::encode(lpgp_num
, bl
);
1471 ::encode(last_change
, bl
);
1472 ::encode(snap_seq
, bl
);
1473 ::encode(snap_epoch
, bl
);
1475 __u32 n
= snaps
.size();
1477 n
= removed_snaps
.num_intervals();
1482 ::encode_nohead(snaps
, bl
, features
);
1483 ::encode_nohead(removed_snaps
, bl
);
1487 if ((features
& CEPH_FEATURE_OSDENC
) == 0) {
1489 ::encode(struct_v
, bl
);
1492 ::encode(crush_rule
, bl
);
1493 ::encode(object_hash
, bl
);
1494 ::encode(pg_num
, bl
);
1495 ::encode(pgp_num
, bl
);
1496 __u32 lpg_num
= 0, lpgp_num
= 0; // tell old code that there are no localized pgs.
1497 ::encode(lpg_num
, bl
);
1498 ::encode(lpgp_num
, bl
);
1499 ::encode(last_change
, bl
);
1500 ::encode(snap_seq
, bl
);
1501 ::encode(snap_epoch
, bl
);
1502 ::encode(snaps
, bl
, features
);
1503 ::encode(removed_snaps
, bl
);
1505 ::encode(flags
, bl
);
1506 ::encode(crash_replay_interval
, bl
);
1510 if ((features
& CEPH_FEATURE_OSD_POOLRESEND
) == 0) {
1511 // we simply added last_force_op_resend here, which is a fully
1512 // backward compatible change. however, encoding the same map
1513 // differently between monitors triggers scrub noise (even though
1514 // they are decodable without the feature), so let's be pedantic
1516 ENCODE_START(14, 5, bl
);
1519 ::encode(crush_rule
, bl
);
1520 ::encode(object_hash
, bl
);
1521 ::encode(pg_num
, bl
);
1522 ::encode(pgp_num
, bl
);
1523 __u32 lpg_num
= 0, lpgp_num
= 0; // tell old code that there are no localized pgs.
1524 ::encode(lpg_num
, bl
);
1525 ::encode(lpgp_num
, bl
);
1526 ::encode(last_change
, bl
);
1527 ::encode(snap_seq
, bl
);
1528 ::encode(snap_epoch
, bl
);
1529 ::encode(snaps
, bl
, features
);
1530 ::encode(removed_snaps
, bl
);
1532 ::encode(flags
, bl
);
1533 ::encode(crash_replay_interval
, bl
);
1534 ::encode(min_size
, bl
);
1535 ::encode(quota_max_bytes
, bl
);
1536 ::encode(quota_max_objects
, bl
);
1537 ::encode(tiers
, bl
);
1538 ::encode(tier_of
, bl
);
1539 __u8 c
= cache_mode
;
1541 ::encode(read_tier
, bl
);
1542 ::encode(write_tier
, bl
);
1543 ::encode(properties
, bl
);
1544 ::encode(hit_set_params
, bl
);
1545 ::encode(hit_set_period
, bl
);
1546 ::encode(hit_set_count
, bl
);
1547 ::encode(stripe_width
, bl
);
1548 ::encode(target_max_bytes
, bl
);
1549 ::encode(target_max_objects
, bl
);
1550 ::encode(cache_target_dirty_ratio_micro
, bl
);
1551 ::encode(cache_target_full_ratio_micro
, bl
);
1552 ::encode(cache_min_flush_age
, bl
);
1553 ::encode(cache_min_evict_age
, bl
);
1554 ::encode(erasure_code_profile
, bl
);
1560 // NOTE: any new encoding dependencies must be reflected by
1561 // SIGNIFICANT_FEATURES
1562 if (!(features
& CEPH_FEATURE_NEW_OSDOP_ENCODING
)) {
1563 // this was the first post-hammer thing we added; if it's missing, encode
1566 } else if (!HAVE_FEATURE(features
, SERVER_LUMINOUS
)) {
1570 ENCODE_START(v
, 5, bl
);
1573 ::encode(crush_rule
, bl
);
1574 ::encode(object_hash
, bl
);
1575 ::encode(pg_num
, bl
);
1576 ::encode(pgp_num
, bl
);
1577 __u32 lpg_num
= 0, lpgp_num
= 0; // tell old code that there are no localized pgs.
1578 ::encode(lpg_num
, bl
);
1579 ::encode(lpgp_num
, bl
);
1580 ::encode(last_change
, bl
);
1581 ::encode(snap_seq
, bl
);
1582 ::encode(snap_epoch
, bl
);
1583 ::encode(snaps
, bl
, features
);
1584 ::encode(removed_snaps
, bl
);
1586 ::encode(flags
, bl
);
1587 ::encode(crash_replay_interval
, bl
);
1588 ::encode(min_size
, bl
);
1589 ::encode(quota_max_bytes
, bl
);
1590 ::encode(quota_max_objects
, bl
);
1591 ::encode(tiers
, bl
);
1592 ::encode(tier_of
, bl
);
1593 __u8 c
= cache_mode
;
1595 ::encode(read_tier
, bl
);
1596 ::encode(write_tier
, bl
);
1597 ::encode(properties
, bl
);
1598 ::encode(hit_set_params
, bl
);
1599 ::encode(hit_set_period
, bl
);
1600 ::encode(hit_set_count
, bl
);
1601 ::encode(stripe_width
, bl
);
1602 ::encode(target_max_bytes
, bl
);
1603 ::encode(target_max_objects
, bl
);
1604 ::encode(cache_target_dirty_ratio_micro
, bl
);
1605 ::encode(cache_target_full_ratio_micro
, bl
);
1606 ::encode(cache_min_flush_age
, bl
);
1607 ::encode(cache_min_evict_age
, bl
);
1608 ::encode(erasure_code_profile
, bl
);
1609 ::encode(last_force_op_resend_preluminous
, bl
);
1610 ::encode(min_read_recency_for_promote
, bl
);
1611 ::encode(expected_num_objects
, bl
);
1613 ::encode(cache_target_dirty_high_ratio_micro
, bl
);
1616 ::encode(min_write_recency_for_promote
, bl
);
1619 ::encode(use_gmt_hitset
, bl
);
1622 ::encode(fast_read
, bl
);
1625 ::encode(hit_set_grade_decay_rate
, bl
);
1626 ::encode(hit_set_search_last_n
, bl
);
1632 ::encode(last_force_op_resend
, bl
);
1635 ::encode(application_metadata
, bl
);
1640 void pg_pool_t::decode(bufferlist::iterator
& bl
)
1642 DECODE_START_LEGACY_COMPAT_LEN(26, 5, 5, bl
);
1645 ::decode(crush_rule
, bl
);
1646 ::decode(object_hash
, bl
);
1647 ::decode(pg_num
, bl
);
1648 ::decode(pgp_num
, bl
);
1650 __u32 lpg_num
, lpgp_num
;
1651 ::decode(lpg_num
, bl
);
1652 ::decode(lpgp_num
, bl
);
1654 ::decode(last_change
, bl
);
1655 ::decode(snap_seq
, bl
);
1656 ::decode(snap_epoch
, bl
);
1658 if (struct_v
>= 3) {
1659 ::decode(snaps
, bl
);
1660 ::decode(removed_snaps
, bl
);
1667 ::decode_nohead(n
, snaps
, bl
);
1668 ::decode_nohead(m
, removed_snaps
, bl
);
1671 if (struct_v
>= 4) {
1672 ::decode(flags
, bl
);
1673 ::decode(crash_replay_interval
, bl
);
1677 // if this looks like the 'data' pool, set the
1678 // crash_replay_interval appropriately. unfortunately, we can't
1679 // be precise here. this should be good enough to preserve replay
1680 // on the data pool for the majority of cluster upgrades, though.
1681 if (crush_rule
== 0 && auid
== 0)
1682 crash_replay_interval
= 60;
1684 crash_replay_interval
= 0;
1686 if (struct_v
>= 7) {
1687 ::decode(min_size
, bl
);
1689 min_size
= size
- size
/2;
1691 if (struct_v
>= 8) {
1692 ::decode(quota_max_bytes
, bl
);
1693 ::decode(quota_max_objects
, bl
);
1695 if (struct_v
>= 9) {
1696 ::decode(tiers
, bl
);
1697 ::decode(tier_of
, bl
);
1700 cache_mode
= (cache_mode_t
)v
;
1701 ::decode(read_tier
, bl
);
1702 ::decode(write_tier
, bl
);
1704 if (struct_v
>= 10) {
1705 ::decode(properties
, bl
);
1707 if (struct_v
>= 11) {
1708 ::decode(hit_set_params
, bl
);
1709 ::decode(hit_set_period
, bl
);
1710 ::decode(hit_set_count
, bl
);
1713 hit_set_period
= def
.hit_set_period
;
1714 hit_set_count
= def
.hit_set_count
;
1716 if (struct_v
>= 12) {
1717 ::decode(stripe_width
, bl
);
1719 set_stripe_width(0);
1721 if (struct_v
>= 13) {
1722 ::decode(target_max_bytes
, bl
);
1723 ::decode(target_max_objects
, bl
);
1724 ::decode(cache_target_dirty_ratio_micro
, bl
);
1725 ::decode(cache_target_full_ratio_micro
, bl
);
1726 ::decode(cache_min_flush_age
, bl
);
1727 ::decode(cache_min_evict_age
, bl
);
1729 target_max_bytes
= 0;
1730 target_max_objects
= 0;
1731 cache_target_dirty_ratio_micro
= 0;
1732 cache_target_full_ratio_micro
= 0;
1733 cache_min_flush_age
= 0;
1734 cache_min_evict_age
= 0;
1736 if (struct_v
>= 14) {
1737 ::decode(erasure_code_profile
, bl
);
1739 if (struct_v
>= 15) {
1740 ::decode(last_force_op_resend_preluminous
, bl
);
1742 last_force_op_resend_preluminous
= 0;
1744 if (struct_v
>= 16) {
1745 ::decode(min_read_recency_for_promote
, bl
);
1747 min_read_recency_for_promote
= 1;
1749 if (struct_v
>= 17) {
1750 ::decode(expected_num_objects
, bl
);
1752 expected_num_objects
= 0;
1754 if (struct_v
>= 19) {
1755 ::decode(cache_target_dirty_high_ratio_micro
, bl
);
1757 cache_target_dirty_high_ratio_micro
= cache_target_dirty_ratio_micro
;
1759 if (struct_v
>= 20) {
1760 ::decode(min_write_recency_for_promote
, bl
);
1762 min_write_recency_for_promote
= 1;
1764 if (struct_v
>= 21) {
1765 ::decode(use_gmt_hitset
, bl
);
1767 use_gmt_hitset
= false;
1769 if (struct_v
>= 22) {
1770 ::decode(fast_read
, bl
);
1774 if (struct_v
>= 23) {
1775 ::decode(hit_set_grade_decay_rate
, bl
);
1776 ::decode(hit_set_search_last_n
, bl
);
1778 hit_set_grade_decay_rate
= 0;
1779 hit_set_search_last_n
= 1;
1781 if (struct_v
>= 24) {
1784 if (struct_v
>= 25) {
1785 ::decode(last_force_op_resend
, bl
);
1787 last_force_op_resend
= last_force_op_resend_preluminous
;
1789 if (struct_v
>= 26) {
1790 ::decode(application_metadata
, bl
);
1797 void pg_pool_t::generate_test_instances(list
<pg_pool_t
*>& o
)
1800 o
.push_back(new pg_pool_t(a
));
1802 a
.type
= TYPE_REPLICATED
;
1809 a
.last_force_op_resend
= 123823;
1810 a
.last_force_op_resend_preluminous
= 123824;
1814 a
.crash_replay_interval
= 13;
1815 a
.quota_max_bytes
= 473;
1816 a
.quota_max_objects
= 474;
1817 o
.push_back(new pg_pool_t(a
));
1819 a
.snaps
[3].name
= "asdf";
1820 a
.snaps
[3].snapid
= 3;
1821 a
.snaps
[3].stamp
= utime_t(123, 4);
1822 a
.snaps
[6].name
= "qwer";
1823 a
.snaps
[6].snapid
= 6;
1824 a
.snaps
[6].stamp
= utime_t(23423, 4);
1825 o
.push_back(new pg_pool_t(a
));
1827 a
.removed_snaps
.insert(2); // not quite valid to combine with snaps!
1828 a
.quota_max_bytes
= 2473;
1829 a
.quota_max_objects
= 4374;
1833 a
.cache_mode
= CACHEMODE_WRITEBACK
;
1836 a
.hit_set_params
= HitSet::Params(new BloomHitSet::Params
);
1837 a
.hit_set_period
= 3600;
1838 a
.hit_set_count
= 8;
1839 a
.min_read_recency_for_promote
= 1;
1840 a
.min_write_recency_for_promote
= 1;
1841 a
.hit_set_grade_decay_rate
= 50;
1842 a
.hit_set_search_last_n
= 1;
1843 a
.calc_grade_table();
1844 a
.set_stripe_width(12345);
1845 a
.target_max_bytes
= 1238132132;
1846 a
.target_max_objects
= 1232132;
1847 a
.cache_target_dirty_ratio_micro
= 187232;
1848 a
.cache_target_dirty_high_ratio_micro
= 309856;
1849 a
.cache_target_full_ratio_micro
= 987222;
1850 a
.cache_min_flush_age
= 231;
1851 a
.cache_min_evict_age
= 2321;
1852 a
.erasure_code_profile
= "profile in osdmap";
1853 a
.expected_num_objects
= 123456;
1854 a
.fast_read
= false;
1855 a
.application_metadata
= {{"rbd", {{"key", "value"}}}};
1856 o
.push_back(new pg_pool_t(a
));
1859 ostream
& operator<<(ostream
& out
, const pg_pool_t
& p
)
1861 out
<< p
.get_type_name()
1862 << " size " << p
.get_size()
1863 << " min_size " << p
.get_min_size()
1864 << " crush_rule " << p
.get_crush_rule()
1865 << " object_hash " << p
.get_object_hash_name()
1866 << " pg_num " << p
.get_pg_num()
1867 << " pgp_num " << p
.get_pgp_num()
1868 << " last_change " << p
.get_last_change();
1869 if (p
.get_last_force_op_resend() ||
1870 p
.get_last_force_op_resend_preluminous())
1871 out
<< " lfor " << p
.get_last_force_op_resend() << "/"
1872 << p
.get_last_force_op_resend_preluminous();
1874 out
<< " owner " << p
.get_auid();
1876 out
<< " flags " << p
.get_flags_string();
1877 if (p
.crash_replay_interval
)
1878 out
<< " crash_replay_interval " << p
.crash_replay_interval
;
1879 if (p
.quota_max_bytes
)
1880 out
<< " max_bytes " << p
.quota_max_bytes
;
1881 if (p
.quota_max_objects
)
1882 out
<< " max_objects " << p
.quota_max_objects
;
1883 if (!p
.tiers
.empty())
1884 out
<< " tiers " << p
.tiers
;
1886 out
<< " tier_of " << p
.tier_of
;
1887 if (p
.has_read_tier())
1888 out
<< " read_tier " << p
.read_tier
;
1889 if (p
.has_write_tier())
1890 out
<< " write_tier " << p
.write_tier
;
1892 out
<< " cache_mode " << p
.get_cache_mode_name();
1893 if (p
.target_max_bytes
)
1894 out
<< " target_bytes " << p
.target_max_bytes
;
1895 if (p
.target_max_objects
)
1896 out
<< " target_objects " << p
.target_max_objects
;
1897 if (p
.hit_set_params
.get_type() != HitSet::TYPE_NONE
) {
1898 out
<< " hit_set " << p
.hit_set_params
1899 << " " << p
.hit_set_period
<< "s"
1900 << " x" << p
.hit_set_count
<< " decay_rate "
1901 << p
.hit_set_grade_decay_rate
1902 << " search_last_n " << p
.hit_set_search_last_n
;
1904 if (p
.min_read_recency_for_promote
)
1905 out
<< " min_read_recency_for_promote " << p
.min_read_recency_for_promote
;
1906 if (p
.min_write_recency_for_promote
)
1907 out
<< " min_write_recency_for_promote " << p
.min_write_recency_for_promote
;
1908 out
<< " stripe_width " << p
.get_stripe_width();
1909 if (p
.expected_num_objects
)
1910 out
<< " expected_num_objects " << p
.expected_num_objects
;
1912 out
<< " fast_read " << p
.fast_read
;
1914 if (!p
.application_metadata
.empty()) {
1915 out
<< " application ";
1916 for (auto it
= p
.application_metadata
.begin();
1917 it
!= p
.application_metadata
.end(); ++it
) {
1918 if (it
!= p
.application_metadata
.begin())
1927 // -- object_stat_sum_t --
1929 void object_stat_sum_t::dump(Formatter
*f
) const
1931 f
->dump_int("num_bytes", num_bytes
);
1932 f
->dump_int("num_objects", num_objects
);
1933 f
->dump_int("num_object_clones", num_object_clones
);
1934 f
->dump_int("num_object_copies", num_object_copies
);
1935 f
->dump_int("num_objects_missing_on_primary", num_objects_missing_on_primary
);
1936 f
->dump_int("num_objects_missing", num_objects_missing
);
1937 f
->dump_int("num_objects_degraded", num_objects_degraded
);
1938 f
->dump_int("num_objects_misplaced", num_objects_misplaced
);
1939 f
->dump_int("num_objects_unfound", num_objects_unfound
);
1940 f
->dump_int("num_objects_dirty", num_objects_dirty
);
1941 f
->dump_int("num_whiteouts", num_whiteouts
);
1942 f
->dump_int("num_read", num_rd
);
1943 f
->dump_int("num_read_kb", num_rd_kb
);
1944 f
->dump_int("num_write", num_wr
);
1945 f
->dump_int("num_write_kb", num_wr_kb
);
1946 f
->dump_int("num_scrub_errors", num_scrub_errors
);
1947 f
->dump_int("num_shallow_scrub_errors", num_shallow_scrub_errors
);
1948 f
->dump_int("num_deep_scrub_errors", num_deep_scrub_errors
);
1949 f
->dump_int("num_objects_recovered", num_objects_recovered
);
1950 f
->dump_int("num_bytes_recovered", num_bytes_recovered
);
1951 f
->dump_int("num_keys_recovered", num_keys_recovered
);
1952 f
->dump_int("num_objects_omap", num_objects_omap
);
1953 f
->dump_int("num_objects_hit_set_archive", num_objects_hit_set_archive
);
1954 f
->dump_int("num_bytes_hit_set_archive", num_bytes_hit_set_archive
);
1955 f
->dump_int("num_flush", num_flush
);
1956 f
->dump_int("num_flush_kb", num_flush_kb
);
1957 f
->dump_int("num_evict", num_evict
);
1958 f
->dump_int("num_evict_kb", num_evict_kb
);
1959 f
->dump_int("num_promote", num_promote
);
1960 f
->dump_int("num_flush_mode_high", num_flush_mode_high
);
1961 f
->dump_int("num_flush_mode_low", num_flush_mode_low
);
1962 f
->dump_int("num_evict_mode_some", num_evict_mode_some
);
1963 f
->dump_int("num_evict_mode_full", num_evict_mode_full
);
1964 f
->dump_int("num_objects_pinned", num_objects_pinned
);
1965 f
->dump_int("num_legacy_snapsets", num_legacy_snapsets
);
1966 f
->dump_int("num_large_omap_objects", num_large_omap_objects
);
1969 void object_stat_sum_t::encode(bufferlist
& bl
) const
1971 ENCODE_START(17, 14, bl
);
1972 #if defined(CEPH_LITTLE_ENDIAN)
1973 bl
.append((char *)(&num_bytes
), sizeof(object_stat_sum_t
));
1975 ::encode(num_bytes
, bl
);
1976 ::encode(num_objects
, bl
);
1977 ::encode(num_object_clones
, bl
);
1978 ::encode(num_object_copies
, bl
);
1979 ::encode(num_objects_missing_on_primary
, bl
);
1980 ::encode(num_objects_degraded
, bl
);
1981 ::encode(num_objects_unfound
, bl
);
1982 ::encode(num_rd
, bl
);
1983 ::encode(num_rd_kb
, bl
);
1984 ::encode(num_wr
, bl
);
1985 ::encode(num_wr_kb
, bl
);
1986 ::encode(num_scrub_errors
, bl
);
1987 ::encode(num_objects_recovered
, bl
);
1988 ::encode(num_bytes_recovered
, bl
);
1989 ::encode(num_keys_recovered
, bl
);
1990 ::encode(num_shallow_scrub_errors
, bl
);
1991 ::encode(num_deep_scrub_errors
, bl
);
1992 ::encode(num_objects_dirty
, bl
);
1993 ::encode(num_whiteouts
, bl
);
1994 ::encode(num_objects_omap
, bl
);
1995 ::encode(num_objects_hit_set_archive
, bl
);
1996 ::encode(num_objects_misplaced
, bl
);
1997 ::encode(num_bytes_hit_set_archive
, bl
);
1998 ::encode(num_flush
, bl
);
1999 ::encode(num_flush_kb
, bl
);
2000 ::encode(num_evict
, bl
);
2001 ::encode(num_evict_kb
, bl
);
2002 ::encode(num_promote
, bl
);
2003 ::encode(num_flush_mode_high
, bl
);
2004 ::encode(num_flush_mode_low
, bl
);
2005 ::encode(num_evict_mode_some
, bl
);
2006 ::encode(num_evict_mode_full
, bl
);
2007 ::encode(num_objects_pinned
, bl
);
2008 ::encode(num_objects_missing
, bl
);
2009 ::encode(num_legacy_snapsets
, bl
);
2010 ::encode(num_large_omap_objects
, bl
);
2015 void object_stat_sum_t::decode(bufferlist::iterator
& bl
)
2017 bool decode_finish
= false;
2018 DECODE_START(17, bl
); // make sure to also update fast decode below
2019 #if defined(CEPH_LITTLE_ENDIAN)
2020 if (struct_v
>= 17) { // this must match newest decode version
2021 bl
.copy(sizeof(object_stat_sum_t
), (char*)(&num_bytes
));
2022 decode_finish
= true;
2025 if (!decode_finish
) {
2026 ::decode(num_bytes
, bl
);
2027 ::decode(num_objects
, bl
);
2028 ::decode(num_object_clones
, bl
);
2029 ::decode(num_object_copies
, bl
);
2030 ::decode(num_objects_missing_on_primary
, bl
);
2031 ::decode(num_objects_degraded
, bl
);
2032 ::decode(num_objects_unfound
, bl
);
2033 ::decode(num_rd
, bl
);
2034 ::decode(num_rd_kb
, bl
);
2035 ::decode(num_wr
, bl
);
2036 ::decode(num_wr_kb
, bl
);
2037 ::decode(num_scrub_errors
, bl
);
2038 ::decode(num_objects_recovered
, bl
);
2039 ::decode(num_bytes_recovered
, bl
);
2040 ::decode(num_keys_recovered
, bl
);
2041 ::decode(num_shallow_scrub_errors
, bl
);
2042 ::decode(num_deep_scrub_errors
, bl
);
2043 ::decode(num_objects_dirty
, bl
);
2044 ::decode(num_whiteouts
, bl
);
2045 ::decode(num_objects_omap
, bl
);
2046 ::decode(num_objects_hit_set_archive
, bl
);
2047 ::decode(num_objects_misplaced
, bl
);
2048 ::decode(num_bytes_hit_set_archive
, bl
);
2049 ::decode(num_flush
, bl
);
2050 ::decode(num_flush_kb
, bl
);
2051 ::decode(num_evict
, bl
);
2052 ::decode(num_evict_kb
, bl
);
2053 ::decode(num_promote
, bl
);
2054 ::decode(num_flush_mode_high
, bl
);
2055 ::decode(num_flush_mode_low
, bl
);
2056 ::decode(num_evict_mode_some
, bl
);
2057 ::decode(num_evict_mode_full
, bl
);
2058 ::decode(num_objects_pinned
, bl
);
2059 ::decode(num_objects_missing
, bl
);
2060 if (struct_v
>= 16) {
2061 ::decode(num_legacy_snapsets
, bl
);
2063 num_legacy_snapsets
= num_object_clones
; // upper bound
2065 if (struct_v
>= 17) {
2066 ::decode(num_large_omap_objects
, bl
);
2072 void object_stat_sum_t::generate_test_instances(list
<object_stat_sum_t
*>& o
)
2074 object_stat_sum_t a
;
2078 a
.num_object_clones
= 4;
2079 a
.num_object_copies
= 5;
2080 a
.num_objects_missing_on_primary
= 6;
2081 a
.num_objects_missing
= 123;
2082 a
.num_objects_degraded
= 7;
2083 a
.num_objects_unfound
= 8;
2084 a
.num_rd
= 9; a
.num_rd_kb
= 10;
2085 a
.num_wr
= 11; a
.num_wr_kb
= 12;
2086 a
.num_objects_recovered
= 14;
2087 a
.num_bytes_recovered
= 15;
2088 a
.num_keys_recovered
= 16;
2089 a
.num_deep_scrub_errors
= 17;
2090 a
.num_shallow_scrub_errors
= 18;
2091 a
.num_scrub_errors
= a
.num_deep_scrub_errors
+ a
.num_shallow_scrub_errors
;
2092 a
.num_objects_dirty
= 21;
2093 a
.num_whiteouts
= 22;
2094 a
.num_objects_misplaced
= 1232;
2095 a
.num_objects_hit_set_archive
= 2;
2096 a
.num_bytes_hit_set_archive
= 27;
2102 a
.num_flush_mode_high
= 0;
2103 a
.num_flush_mode_low
= 1;
2104 a
.num_evict_mode_some
= 1;
2105 a
.num_evict_mode_full
= 0;
2106 a
.num_objects_pinned
= 20;
2107 a
.num_large_omap_objects
= 5;
2108 o
.push_back(new object_stat_sum_t(a
));
2111 void object_stat_sum_t::add(const object_stat_sum_t
& o
)
2113 num_bytes
+= o
.num_bytes
;
2114 num_objects
+= o
.num_objects
;
2115 num_object_clones
+= o
.num_object_clones
;
2116 num_object_copies
+= o
.num_object_copies
;
2117 num_objects_missing_on_primary
+= o
.num_objects_missing_on_primary
;
2118 num_objects_missing
+= o
.num_objects_missing
;
2119 num_objects_degraded
+= o
.num_objects_degraded
;
2120 num_objects_misplaced
+= o
.num_objects_misplaced
;
2122 num_rd_kb
+= o
.num_rd_kb
;
2124 num_wr_kb
+= o
.num_wr_kb
;
2125 num_objects_unfound
+= o
.num_objects_unfound
;
2126 num_scrub_errors
+= o
.num_scrub_errors
;
2127 num_shallow_scrub_errors
+= o
.num_shallow_scrub_errors
;
2128 num_deep_scrub_errors
+= o
.num_deep_scrub_errors
;
2129 num_objects_recovered
+= o
.num_objects_recovered
;
2130 num_bytes_recovered
+= o
.num_bytes_recovered
;
2131 num_keys_recovered
+= o
.num_keys_recovered
;
2132 num_objects_dirty
+= o
.num_objects_dirty
;
2133 num_whiteouts
+= o
.num_whiteouts
;
2134 num_objects_omap
+= o
.num_objects_omap
;
2135 num_objects_hit_set_archive
+= o
.num_objects_hit_set_archive
;
2136 num_bytes_hit_set_archive
+= o
.num_bytes_hit_set_archive
;
2137 num_flush
+= o
.num_flush
;
2138 num_flush_kb
+= o
.num_flush_kb
;
2139 num_evict
+= o
.num_evict
;
2140 num_evict_kb
+= o
.num_evict_kb
;
2141 num_promote
+= o
.num_promote
;
2142 num_flush_mode_high
+= o
.num_flush_mode_high
;
2143 num_flush_mode_low
+= o
.num_flush_mode_low
;
2144 num_evict_mode_some
+= o
.num_evict_mode_some
;
2145 num_evict_mode_full
+= o
.num_evict_mode_full
;
2146 num_objects_pinned
+= o
.num_objects_pinned
;
2147 num_legacy_snapsets
+= o
.num_legacy_snapsets
;
2148 num_large_omap_objects
+= o
.num_large_omap_objects
;
2151 void object_stat_sum_t::sub(const object_stat_sum_t
& o
)
2153 num_bytes
-= o
.num_bytes
;
2154 num_objects
-= o
.num_objects
;
2155 num_object_clones
-= o
.num_object_clones
;
2156 num_object_copies
-= o
.num_object_copies
;
2157 num_objects_missing_on_primary
-= o
.num_objects_missing_on_primary
;
2158 num_objects_missing
-= o
.num_objects_missing
;
2159 num_objects_degraded
-= o
.num_objects_degraded
;
2160 num_objects_misplaced
-= o
.num_objects_misplaced
;
2162 num_rd_kb
-= o
.num_rd_kb
;
2164 num_wr_kb
-= o
.num_wr_kb
;
2165 num_objects_unfound
-= o
.num_objects_unfound
;
2166 num_scrub_errors
-= o
.num_scrub_errors
;
2167 num_shallow_scrub_errors
-= o
.num_shallow_scrub_errors
;
2168 num_deep_scrub_errors
-= o
.num_deep_scrub_errors
;
2169 num_objects_recovered
-= o
.num_objects_recovered
;
2170 num_bytes_recovered
-= o
.num_bytes_recovered
;
2171 num_keys_recovered
-= o
.num_keys_recovered
;
2172 num_objects_dirty
-= o
.num_objects_dirty
;
2173 num_whiteouts
-= o
.num_whiteouts
;
2174 num_objects_omap
-= o
.num_objects_omap
;
2175 num_objects_hit_set_archive
-= o
.num_objects_hit_set_archive
;
2176 num_bytes_hit_set_archive
-= o
.num_bytes_hit_set_archive
;
2177 num_flush
-= o
.num_flush
;
2178 num_flush_kb
-= o
.num_flush_kb
;
2179 num_evict
-= o
.num_evict
;
2180 num_evict_kb
-= o
.num_evict_kb
;
2181 num_promote
-= o
.num_promote
;
2182 num_flush_mode_high
-= o
.num_flush_mode_high
;
2183 num_flush_mode_low
-= o
.num_flush_mode_low
;
2184 num_evict_mode_some
-= o
.num_evict_mode_some
;
2185 num_evict_mode_full
-= o
.num_evict_mode_full
;
2186 num_objects_pinned
-= o
.num_objects_pinned
;
2187 num_legacy_snapsets
-= o
.num_legacy_snapsets
;
2188 num_large_omap_objects
-= o
.num_large_omap_objects
;
2191 bool operator==(const object_stat_sum_t
& l
, const object_stat_sum_t
& r
)
2194 l
.num_bytes
== r
.num_bytes
&&
2195 l
.num_objects
== r
.num_objects
&&
2196 l
.num_object_clones
== r
.num_object_clones
&&
2197 l
.num_object_copies
== r
.num_object_copies
&&
2198 l
.num_objects_missing_on_primary
== r
.num_objects_missing_on_primary
&&
2199 l
.num_objects_missing
== r
.num_objects_missing
&&
2200 l
.num_objects_degraded
== r
.num_objects_degraded
&&
2201 l
.num_objects_misplaced
== r
.num_objects_misplaced
&&
2202 l
.num_objects_unfound
== r
.num_objects_unfound
&&
2203 l
.num_rd
== r
.num_rd
&&
2204 l
.num_rd_kb
== r
.num_rd_kb
&&
2205 l
.num_wr
== r
.num_wr
&&
2206 l
.num_wr_kb
== r
.num_wr_kb
&&
2207 l
.num_scrub_errors
== r
.num_scrub_errors
&&
2208 l
.num_shallow_scrub_errors
== r
.num_shallow_scrub_errors
&&
2209 l
.num_deep_scrub_errors
== r
.num_deep_scrub_errors
&&
2210 l
.num_objects_recovered
== r
.num_objects_recovered
&&
2211 l
.num_bytes_recovered
== r
.num_bytes_recovered
&&
2212 l
.num_keys_recovered
== r
.num_keys_recovered
&&
2213 l
.num_objects_dirty
== r
.num_objects_dirty
&&
2214 l
.num_whiteouts
== r
.num_whiteouts
&&
2215 l
.num_objects_omap
== r
.num_objects_omap
&&
2216 l
.num_objects_hit_set_archive
== r
.num_objects_hit_set_archive
&&
2217 l
.num_bytes_hit_set_archive
== r
.num_bytes_hit_set_archive
&&
2218 l
.num_flush
== r
.num_flush
&&
2219 l
.num_flush_kb
== r
.num_flush_kb
&&
2220 l
.num_evict
== r
.num_evict
&&
2221 l
.num_evict_kb
== r
.num_evict_kb
&&
2222 l
.num_promote
== r
.num_promote
&&
2223 l
.num_flush_mode_high
== r
.num_flush_mode_high
&&
2224 l
.num_flush_mode_low
== r
.num_flush_mode_low
&&
2225 l
.num_evict_mode_some
== r
.num_evict_mode_some
&&
2226 l
.num_evict_mode_full
== r
.num_evict_mode_full
&&
2227 l
.num_objects_pinned
== r
.num_objects_pinned
&&
2228 l
.num_legacy_snapsets
== r
.num_legacy_snapsets
&&
2229 l
.num_large_omap_objects
== r
.num_large_omap_objects
;
2232 // -- object_stat_collection_t --
2234 void object_stat_collection_t::dump(Formatter
*f
) const
2236 f
->open_object_section("stat_sum");
2241 void object_stat_collection_t::encode(bufferlist
& bl
) const
2243 ENCODE_START(2, 2, bl
);
2245 ::encode((__u32
)0, bl
);
2249 void object_stat_collection_t::decode(bufferlist::iterator
& bl
)
2251 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl
);
2254 map
<string
,object_stat_sum_t
> cat_sum
;
2255 ::decode(cat_sum
, bl
);
2260 void object_stat_collection_t::generate_test_instances(list
<object_stat_collection_t
*>& o
)
2262 object_stat_collection_t a
;
2263 o
.push_back(new object_stat_collection_t(a
));
2264 list
<object_stat_sum_t
*> l
;
2265 object_stat_sum_t::generate_test_instances(l
);
2266 for (list
<object_stat_sum_t
*>::iterator p
= l
.begin(); p
!= l
.end(); ++p
) {
2268 o
.push_back(new object_stat_collection_t(a
));
2275 bool pg_stat_t::is_acting_osd(int32_t osd
, bool primary
) const
2277 if (primary
&& osd
== acting_primary
) {
2279 } else if (!primary
) {
2280 for(vector
<int32_t>::const_iterator it
= acting
.begin();
2281 it
!= acting
.end(); ++it
)
2290 void pg_stat_t::dump(Formatter
*f
) const
2292 f
->dump_stream("version") << version
;
2293 f
->dump_stream("reported_seq") << reported_seq
;
2294 f
->dump_stream("reported_epoch") << reported_epoch
;
2295 f
->dump_string("state", pg_state_string(state
));
2296 f
->dump_stream("last_fresh") << last_fresh
;
2297 f
->dump_stream("last_change") << last_change
;
2298 f
->dump_stream("last_active") << last_active
;
2299 f
->dump_stream("last_peered") << last_peered
;
2300 f
->dump_stream("last_clean") << last_clean
;
2301 f
->dump_stream("last_became_active") << last_became_active
;
2302 f
->dump_stream("last_became_peered") << last_became_peered
;
2303 f
->dump_stream("last_unstale") << last_unstale
;
2304 f
->dump_stream("last_undegraded") << last_undegraded
;
2305 f
->dump_stream("last_fullsized") << last_fullsized
;
2306 f
->dump_unsigned("mapping_epoch", mapping_epoch
);
2307 f
->dump_stream("log_start") << log_start
;
2308 f
->dump_stream("ondisk_log_start") << ondisk_log_start
;
2309 f
->dump_unsigned("created", created
);
2310 f
->dump_unsigned("last_epoch_clean", last_epoch_clean
);
2311 f
->dump_stream("parent") << parent
;
2312 f
->dump_unsigned("parent_split_bits", parent_split_bits
);
2313 f
->dump_stream("last_scrub") << last_scrub
;
2314 f
->dump_stream("last_scrub_stamp") << last_scrub_stamp
;
2315 f
->dump_stream("last_deep_scrub") << last_deep_scrub
;
2316 f
->dump_stream("last_deep_scrub_stamp") << last_deep_scrub_stamp
;
2317 f
->dump_stream("last_clean_scrub_stamp") << last_clean_scrub_stamp
;
2318 f
->dump_int("log_size", log_size
);
2319 f
->dump_int("ondisk_log_size", ondisk_log_size
);
2320 f
->dump_bool("stats_invalid", stats_invalid
);
2321 f
->dump_bool("dirty_stats_invalid", dirty_stats_invalid
);
2322 f
->dump_bool("omap_stats_invalid", omap_stats_invalid
);
2323 f
->dump_bool("hitset_stats_invalid", hitset_stats_invalid
);
2324 f
->dump_bool("hitset_bytes_stats_invalid", hitset_bytes_stats_invalid
);
2325 f
->dump_bool("pin_stats_invalid", pin_stats_invalid
);
2326 f
->dump_unsigned("snaptrimq_len", snaptrimq_len
);
2328 f
->open_array_section("up");
2329 for (vector
<int32_t>::const_iterator p
= up
.begin(); p
!= up
.end(); ++p
)
2330 f
->dump_int("osd", *p
);
2332 f
->open_array_section("acting");
2333 for (vector
<int32_t>::const_iterator p
= acting
.begin(); p
!= acting
.end(); ++p
)
2334 f
->dump_int("osd", *p
);
2336 f
->open_array_section("blocked_by");
2337 for (vector
<int32_t>::const_iterator p
= blocked_by
.begin();
2338 p
!= blocked_by
.end(); ++p
)
2339 f
->dump_int("osd", *p
);
2341 f
->dump_int("up_primary", up_primary
);
2342 f
->dump_int("acting_primary", acting_primary
);
2345 void pg_stat_t::dump_brief(Formatter
*f
) const
2347 f
->dump_string("state", pg_state_string(state
));
2348 f
->open_array_section("up");
2349 for (vector
<int32_t>::const_iterator p
= up
.begin(); p
!= up
.end(); ++p
)
2350 f
->dump_int("osd", *p
);
2352 f
->open_array_section("acting");
2353 for (vector
<int32_t>::const_iterator p
= acting
.begin(); p
!= acting
.end(); ++p
)
2354 f
->dump_int("osd", *p
);
2356 f
->dump_int("up_primary", up_primary
);
2357 f
->dump_int("acting_primary", acting_primary
);
2360 void pg_stat_t::encode(bufferlist
&bl
) const
2362 ENCODE_START(23, 22, bl
);
2363 ::encode(version
, bl
);
2364 ::encode(reported_seq
, bl
);
2365 ::encode(reported_epoch
, bl
);
2366 ::encode(state
, bl
);
2367 ::encode(log_start
, bl
);
2368 ::encode(ondisk_log_start
, bl
);
2369 ::encode(created
, bl
);
2370 ::encode(last_epoch_clean
, bl
);
2371 ::encode(parent
, bl
);
2372 ::encode(parent_split_bits
, bl
);
2373 ::encode(last_scrub
, bl
);
2374 ::encode(last_scrub_stamp
, bl
);
2375 ::encode(stats
, bl
);
2376 ::encode(log_size
, bl
);
2377 ::encode(ondisk_log_size
, bl
);
2379 ::encode(acting
, bl
);
2380 ::encode(last_fresh
, bl
);
2381 ::encode(last_change
, bl
);
2382 ::encode(last_active
, bl
);
2383 ::encode(last_clean
, bl
);
2384 ::encode(last_unstale
, bl
);
2385 ::encode(mapping_epoch
, bl
);
2386 ::encode(last_deep_scrub
, bl
);
2387 ::encode(last_deep_scrub_stamp
, bl
);
2388 ::encode(stats_invalid
, bl
);
2389 ::encode(last_clean_scrub_stamp
, bl
);
2390 ::encode(last_became_active
, bl
);
2391 ::encode(dirty_stats_invalid
, bl
);
2392 ::encode(up_primary
, bl
);
2393 ::encode(acting_primary
, bl
);
2394 ::encode(omap_stats_invalid
, bl
);
2395 ::encode(hitset_stats_invalid
, bl
);
2396 ::encode(blocked_by
, bl
);
2397 ::encode(last_undegraded
, bl
);
2398 ::encode(last_fullsized
, bl
);
2399 ::encode(hitset_bytes_stats_invalid
, bl
);
2400 ::encode(last_peered
, bl
);
2401 ::encode(last_became_peered
, bl
);
2402 ::encode(pin_stats_invalid
, bl
);
2403 ::encode(snaptrimq_len
, bl
);
2407 void pg_stat_t::decode(bufferlist::iterator
&bl
)
2410 DECODE_START(22, bl
);
2411 ::decode(version
, bl
);
2412 ::decode(reported_seq
, bl
);
2413 ::decode(reported_epoch
, bl
);
2414 ::decode(state
, bl
);
2415 ::decode(log_start
, bl
);
2416 ::decode(ondisk_log_start
, bl
);
2417 ::decode(created
, bl
);
2418 ::decode(last_epoch_clean
, bl
);
2419 ::decode(parent
, bl
);
2420 ::decode(parent_split_bits
, bl
);
2421 ::decode(last_scrub
, bl
);
2422 ::decode(last_scrub_stamp
, bl
);
2423 ::decode(stats
, bl
);
2424 ::decode(log_size
, bl
);
2425 ::decode(ondisk_log_size
, bl
);
2427 ::decode(acting
, bl
);
2428 ::decode(last_fresh
, bl
);
2429 ::decode(last_change
, bl
);
2430 ::decode(last_active
, bl
);
2431 ::decode(last_clean
, bl
);
2432 ::decode(last_unstale
, bl
);
2433 ::decode(mapping_epoch
, bl
);
2434 ::decode(last_deep_scrub
, bl
);
2435 ::decode(last_deep_scrub_stamp
, bl
);
2437 stats_invalid
= tmp
;
2438 ::decode(last_clean_scrub_stamp
, bl
);
2439 ::decode(last_became_active
, bl
);
2441 dirty_stats_invalid
= tmp
;
2442 ::decode(up_primary
, bl
);
2443 ::decode(acting_primary
, bl
);
2445 omap_stats_invalid
= tmp
;
2447 hitset_stats_invalid
= tmp
;
2448 ::decode(blocked_by
, bl
);
2449 ::decode(last_undegraded
, bl
);
2450 ::decode(last_fullsized
, bl
);
2452 hitset_bytes_stats_invalid
= tmp
;
2453 ::decode(last_peered
, bl
);
2454 ::decode(last_became_peered
, bl
);
2456 pin_stats_invalid
= tmp
;
2457 if (struct_v
>= 23) {
2458 ::decode(snaptrimq_len
, bl
);
2463 void pg_stat_t::generate_test_instances(list
<pg_stat_t
*>& o
)
2466 o
.push_back(new pg_stat_t(a
));
2468 a
.version
= eversion_t(1, 3);
2469 a
.reported_epoch
= 1;
2472 a
.mapping_epoch
= 998;
2473 a
.last_fresh
= utime_t(1002, 1);
2474 a
.last_change
= utime_t(1002, 2);
2475 a
.last_active
= utime_t(1002, 3);
2476 a
.last_clean
= utime_t(1002, 4);
2477 a
.last_unstale
= utime_t(1002, 5);
2478 a
.last_undegraded
= utime_t(1002, 7);
2479 a
.last_fullsized
= utime_t(1002, 8);
2480 a
.log_start
= eversion_t(1, 4);
2481 a
.ondisk_log_start
= eversion_t(1, 5);
2483 a
.last_epoch_clean
= 7;
2484 a
.parent
= pg_t(1, 2, 3);
2485 a
.parent_split_bits
= 12;
2486 a
.last_scrub
= eversion_t(9, 10);
2487 a
.last_scrub_stamp
= utime_t(11, 12);
2488 a
.last_deep_scrub
= eversion_t(13, 14);
2489 a
.last_deep_scrub_stamp
= utime_t(15, 16);
2490 a
.last_clean_scrub_stamp
= utime_t(17, 18);
2491 a
.snaptrimq_len
= 1048576;
2492 list
<object_stat_collection_t
*> l
;
2493 object_stat_collection_t::generate_test_instances(l
);
2494 a
.stats
= *l
.back();
2496 a
.ondisk_log_size
= 88;
2497 a
.up
.push_back(123);
2499 a
.acting
.push_back(456);
2500 a
.acting_primary
= 456;
2501 o
.push_back(new pg_stat_t(a
));
2503 a
.up
.push_back(124);
2505 a
.acting
.push_back(124);
2506 a
.acting_primary
= 124;
2507 a
.blocked_by
.push_back(155);
2508 a
.blocked_by
.push_back(156);
2509 o
.push_back(new pg_stat_t(a
));
2512 bool operator==(const pg_stat_t
& l
, const pg_stat_t
& r
)
2515 l
.version
== r
.version
&&
2516 l
.reported_seq
== r
.reported_seq
&&
2517 l
.reported_epoch
== r
.reported_epoch
&&
2518 l
.state
== r
.state
&&
2519 l
.last_fresh
== r
.last_fresh
&&
2520 l
.last_change
== r
.last_change
&&
2521 l
.last_active
== r
.last_active
&&
2522 l
.last_peered
== r
.last_peered
&&
2523 l
.last_clean
== r
.last_clean
&&
2524 l
.last_unstale
== r
.last_unstale
&&
2525 l
.last_undegraded
== r
.last_undegraded
&&
2526 l
.last_fullsized
== r
.last_fullsized
&&
2527 l
.log_start
== r
.log_start
&&
2528 l
.ondisk_log_start
== r
.ondisk_log_start
&&
2529 l
.created
== r
.created
&&
2530 l
.last_epoch_clean
== r
.last_epoch_clean
&&
2531 l
.parent
== r
.parent
&&
2532 l
.parent_split_bits
== r
.parent_split_bits
&&
2533 l
.last_scrub
== r
.last_scrub
&&
2534 l
.last_deep_scrub
== r
.last_deep_scrub
&&
2535 l
.last_scrub_stamp
== r
.last_scrub_stamp
&&
2536 l
.last_deep_scrub_stamp
== r
.last_deep_scrub_stamp
&&
2537 l
.last_clean_scrub_stamp
== r
.last_clean_scrub_stamp
&&
2538 l
.stats
== r
.stats
&&
2539 l
.stats_invalid
== r
.stats_invalid
&&
2540 l
.log_size
== r
.log_size
&&
2541 l
.ondisk_log_size
== r
.ondisk_log_size
&&
2543 l
.acting
== r
.acting
&&
2544 l
.mapping_epoch
== r
.mapping_epoch
&&
2545 l
.blocked_by
== r
.blocked_by
&&
2546 l
.last_became_active
== r
.last_became_active
&&
2547 l
.last_became_peered
== r
.last_became_peered
&&
2548 l
.dirty_stats_invalid
== r
.dirty_stats_invalid
&&
2549 l
.omap_stats_invalid
== r
.omap_stats_invalid
&&
2550 l
.hitset_stats_invalid
== r
.hitset_stats_invalid
&&
2551 l
.hitset_bytes_stats_invalid
== r
.hitset_bytes_stats_invalid
&&
2552 l
.up_primary
== r
.up_primary
&&
2553 l
.acting_primary
== r
.acting_primary
&&
2554 l
.pin_stats_invalid
== r
.pin_stats_invalid
&&
2555 l
.snaptrimq_len
== r
.snaptrimq_len
;
2558 // -- pool_stat_t --
2560 void pool_stat_t::dump(Formatter
*f
) const
2563 f
->dump_int("log_size", log_size
);
2564 f
->dump_int("ondisk_log_size", ondisk_log_size
);
2565 f
->dump_int("up", up
);
2566 f
->dump_int("acting", acting
);
2569 void pool_stat_t::encode(bufferlist
&bl
, uint64_t features
) const
2571 if ((features
& CEPH_FEATURE_OSDENC
) == 0) {
2574 ::encode(stats
, bl
);
2575 ::encode(log_size
, bl
);
2576 ::encode(ondisk_log_size
, bl
);
2580 ENCODE_START(6, 5, bl
);
2581 ::encode(stats
, bl
);
2582 ::encode(log_size
, bl
);
2583 ::encode(ondisk_log_size
, bl
);
2585 ::encode(acting
, bl
);
2589 void pool_stat_t::decode(bufferlist::iterator
&bl
)
2591 DECODE_START_LEGACY_COMPAT_LEN(6, 5, 5, bl
);
2592 if (struct_v
>= 4) {
2593 ::decode(stats
, bl
);
2594 ::decode(log_size
, bl
);
2595 ::decode(ondisk_log_size
, bl
);
2596 if (struct_v
>= 6) {
2598 ::decode(acting
, bl
);
2604 ::decode(stats
.sum
.num_bytes
, bl
);
2606 ::decode(num_kb
, bl
);
2607 ::decode(stats
.sum
.num_objects
, bl
);
2608 ::decode(stats
.sum
.num_object_clones
, bl
);
2609 ::decode(stats
.sum
.num_object_copies
, bl
);
2610 ::decode(stats
.sum
.num_objects_missing_on_primary
, bl
);
2611 ::decode(stats
.sum
.num_objects_degraded
, bl
);
2612 ::decode(log_size
, bl
);
2613 ::decode(ondisk_log_size
, bl
);
2614 if (struct_v
>= 2) {
2615 ::decode(stats
.sum
.num_rd
, bl
);
2616 ::decode(stats
.sum
.num_rd_kb
, bl
);
2617 ::decode(stats
.sum
.num_wr
, bl
);
2618 ::decode(stats
.sum
.num_wr_kb
, bl
);
2620 if (struct_v
>= 3) {
2621 ::decode(stats
.sum
.num_objects_unfound
, bl
);
2627 void pool_stat_t::generate_test_instances(list
<pool_stat_t
*>& o
)
2630 o
.push_back(new pool_stat_t(a
));
2632 list
<object_stat_collection_t
*> l
;
2633 object_stat_collection_t::generate_test_instances(l
);
2634 a
.stats
= *l
.back();
2636 a
.ondisk_log_size
= 456;
2639 o
.push_back(new pool_stat_t(a
));
2643 // -- pg_history_t --
2645 void pg_history_t::encode(bufferlist
&bl
) const
2647 ENCODE_START(9, 4, bl
);
2648 ::encode(epoch_created
, bl
);
2649 ::encode(last_epoch_started
, bl
);
2650 ::encode(last_epoch_clean
, bl
);
2651 ::encode(last_epoch_split
, bl
);
2652 ::encode(same_interval_since
, bl
);
2653 ::encode(same_up_since
, bl
);
2654 ::encode(same_primary_since
, bl
);
2655 ::encode(last_scrub
, bl
);
2656 ::encode(last_scrub_stamp
, bl
);
2657 ::encode(last_deep_scrub
, bl
);
2658 ::encode(last_deep_scrub_stamp
, bl
);
2659 ::encode(last_clean_scrub_stamp
, bl
);
2660 ::encode(last_epoch_marked_full
, bl
);
2661 ::encode(last_interval_started
, bl
);
2662 ::encode(last_interval_clean
, bl
);
2663 ::encode(epoch_pool_created
, bl
);
2667 void pg_history_t::decode(bufferlist::iterator
&bl
)
2669 DECODE_START_LEGACY_COMPAT_LEN(9, 4, 4, bl
);
2670 ::decode(epoch_created
, bl
);
2671 ::decode(last_epoch_started
, bl
);
2673 ::decode(last_epoch_clean
, bl
);
2675 last_epoch_clean
= last_epoch_started
; // careful, it's a lie!
2676 ::decode(last_epoch_split
, bl
);
2677 ::decode(same_interval_since
, bl
);
2678 ::decode(same_up_since
, bl
);
2679 ::decode(same_primary_since
, bl
);
2680 if (struct_v
>= 2) {
2681 ::decode(last_scrub
, bl
);
2682 ::decode(last_scrub_stamp
, bl
);
2684 if (struct_v
>= 5) {
2685 ::decode(last_deep_scrub
, bl
);
2686 ::decode(last_deep_scrub_stamp
, bl
);
2688 if (struct_v
>= 6) {
2689 ::decode(last_clean_scrub_stamp
, bl
);
2691 if (struct_v
>= 7) {
2692 ::decode(last_epoch_marked_full
, bl
);
2694 if (struct_v
>= 8) {
2695 ::decode(last_interval_started
, bl
);
2696 ::decode(last_interval_clean
, bl
);
2698 if (last_epoch_started
>= same_interval_since
) {
2699 last_interval_started
= same_interval_since
;
2701 last_interval_started
= last_epoch_started
; // best guess
2703 if (last_epoch_clean
>= same_interval_since
) {
2704 last_interval_clean
= same_interval_since
;
2706 last_interval_clean
= last_epoch_clean
; // best guess
2709 if (struct_v
>= 9) {
2710 ::decode(epoch_pool_created
, bl
);
2712 epoch_pool_created
= epoch_created
;
2717 void pg_history_t::dump(Formatter
*f
) const
2719 f
->dump_int("epoch_created", epoch_created
);
2720 f
->dump_int("epoch_pool_created", epoch_pool_created
);
2721 f
->dump_int("last_epoch_started", last_epoch_started
);
2722 f
->dump_int("last_interval_started", last_interval_started
);
2723 f
->dump_int("last_epoch_clean", last_epoch_clean
);
2724 f
->dump_int("last_interval_clean", last_interval_clean
);
2725 f
->dump_int("last_epoch_split", last_epoch_split
);
2726 f
->dump_int("last_epoch_marked_full", last_epoch_marked_full
);
2727 f
->dump_int("same_up_since", same_up_since
);
2728 f
->dump_int("same_interval_since", same_interval_since
);
2729 f
->dump_int("same_primary_since", same_primary_since
);
2730 f
->dump_stream("last_scrub") << last_scrub
;
2731 f
->dump_stream("last_scrub_stamp") << last_scrub_stamp
;
2732 f
->dump_stream("last_deep_scrub") << last_deep_scrub
;
2733 f
->dump_stream("last_deep_scrub_stamp") << last_deep_scrub_stamp
;
2734 f
->dump_stream("last_clean_scrub_stamp") << last_clean_scrub_stamp
;
2737 void pg_history_t::generate_test_instances(list
<pg_history_t
*>& o
)
2739 o
.push_back(new pg_history_t
);
2740 o
.push_back(new pg_history_t
);
2741 o
.back()->epoch_created
= 1;
2742 o
.back()->epoch_pool_created
= 1;
2743 o
.back()->last_epoch_started
= 2;
2744 o
.back()->last_interval_started
= 2;
2745 o
.back()->last_epoch_clean
= 3;
2746 o
.back()->last_interval_clean
= 2;
2747 o
.back()->last_epoch_split
= 4;
2748 o
.back()->same_up_since
= 5;
2749 o
.back()->same_interval_since
= 6;
2750 o
.back()->same_primary_since
= 7;
2751 o
.back()->last_scrub
= eversion_t(8, 9);
2752 o
.back()->last_scrub_stamp
= utime_t(10, 11);
2753 o
.back()->last_deep_scrub
= eversion_t(12, 13);
2754 o
.back()->last_deep_scrub_stamp
= utime_t(14, 15);
2755 o
.back()->last_clean_scrub_stamp
= utime_t(16, 17);
2756 o
.back()->last_epoch_marked_full
= 18;
2762 void pg_info_t::encode(bufferlist
&bl
) const
2764 ENCODE_START(32, 26, bl
);
2765 ::encode(pgid
.pgid
, bl
);
2766 ::encode(last_update
, bl
);
2767 ::encode(last_complete
, bl
);
2768 ::encode(log_tail
, bl
);
2769 if (last_backfill_bitwise
&& !last_backfill
.is_max()) {
2770 ::encode(hobject_t(), bl
);
2772 ::encode(last_backfill
, bl
);
2774 ::encode(stats
, bl
);
2776 ::encode(purged_snaps
, bl
);
2777 ::encode(last_epoch_started
, bl
);
2778 ::encode(last_user_version
, bl
);
2779 ::encode(hit_set
, bl
);
2780 ::encode(pgid
.shard
, bl
);
2781 ::encode(last_backfill
, bl
);
2782 ::encode(last_backfill_bitwise
, bl
);
2783 ::encode(last_interval_started
, bl
);
2787 void pg_info_t::decode(bufferlist::iterator
&bl
)
2789 DECODE_START(32, bl
);
2790 ::decode(pgid
.pgid
, bl
);
2791 ::decode(last_update
, bl
);
2792 ::decode(last_complete
, bl
);
2793 ::decode(log_tail
, bl
);
2795 hobject_t old_last_backfill
;
2796 ::decode(old_last_backfill
, bl
);
2798 ::decode(stats
, bl
);
2800 ::decode(purged_snaps
, bl
);
2801 ::decode(last_epoch_started
, bl
);
2802 ::decode(last_user_version
, bl
);
2803 ::decode(hit_set
, bl
);
2804 ::decode(pgid
.shard
, bl
);
2805 ::decode(last_backfill
, bl
);
2806 ::decode(last_backfill_bitwise
, bl
);
2807 if (struct_v
>= 32) {
2808 ::decode(last_interval_started
, bl
);
2810 last_interval_started
= last_epoch_started
;
2817 void pg_info_t::dump(Formatter
*f
) const
2819 f
->dump_stream("pgid") << pgid
;
2820 f
->dump_stream("last_update") << last_update
;
2821 f
->dump_stream("last_complete") << last_complete
;
2822 f
->dump_stream("log_tail") << log_tail
;
2823 f
->dump_int("last_user_version", last_user_version
);
2824 f
->dump_stream("last_backfill") << last_backfill
;
2825 f
->dump_int("last_backfill_bitwise", (int)last_backfill_bitwise
);
2826 f
->open_array_section("purged_snaps");
2827 for (interval_set
<snapid_t
>::const_iterator i
=purged_snaps
.begin();
2828 i
!= purged_snaps
.end();
2830 f
->open_object_section("purged_snap_interval");
2831 f
->dump_stream("start") << i
.get_start();
2832 f
->dump_stream("length") << i
.get_len();
2836 f
->open_object_section("history");
2839 f
->open_object_section("stats");
2843 f
->dump_int("empty", is_empty());
2844 f
->dump_int("dne", dne());
2845 f
->dump_int("incomplete", is_incomplete());
2846 f
->dump_int("last_epoch_started", last_epoch_started
);
2848 f
->open_object_section("hit_set_history");
2853 void pg_info_t::generate_test_instances(list
<pg_info_t
*>& o
)
2855 o
.push_back(new pg_info_t
);
2856 o
.push_back(new pg_info_t
);
2857 list
<pg_history_t
*> h
;
2858 pg_history_t::generate_test_instances(h
);
2859 o
.back()->history
= *h
.back();
2860 o
.back()->pgid
= spg_t(pg_t(1, 2, -1), shard_id_t::NO_SHARD
);
2861 o
.back()->last_update
= eversion_t(3, 4);
2862 o
.back()->last_complete
= eversion_t(5, 6);
2863 o
.back()->last_user_version
= 2;
2864 o
.back()->log_tail
= eversion_t(7, 8);
2865 o
.back()->last_backfill
= hobject_t(object_t("objname"), "key", 123, 456, -1, "");
2866 o
.back()->last_backfill_bitwise
= true;
2869 pg_stat_t::generate_test_instances(s
);
2870 o
.back()->stats
= *s
.back();
2873 list
<pg_hit_set_history_t
*> s
;
2874 pg_hit_set_history_t::generate_test_instances(s
);
2875 o
.back()->hit_set
= *s
.back();
2879 // -- pg_notify_t --
2880 void pg_notify_t::encode(bufferlist
&bl
) const
2882 ENCODE_START(2, 2, bl
);
2883 ::encode(query_epoch
, bl
);
2884 ::encode(epoch_sent
, bl
);
2891 void pg_notify_t::decode(bufferlist::iterator
&bl
)
2893 DECODE_START(2, bl
);
2894 ::decode(query_epoch
, bl
);
2895 ::decode(epoch_sent
, bl
);
2902 void pg_notify_t::dump(Formatter
*f
) const
2904 f
->dump_int("from", from
);
2905 f
->dump_int("to", to
);
2906 f
->dump_unsigned("query_epoch", query_epoch
);
2907 f
->dump_unsigned("epoch_sent", epoch_sent
);
2909 f
->open_object_section("info");
2915 void pg_notify_t::generate_test_instances(list
<pg_notify_t
*>& o
)
2917 o
.push_back(new pg_notify_t(shard_id_t(3), shard_id_t::NO_SHARD
, 1, 1, pg_info_t()));
2918 o
.push_back(new pg_notify_t(shard_id_t(0), shard_id_t(0), 3, 10, pg_info_t()));
2921 ostream
&operator<<(ostream
&lhs
, const pg_notify_t
¬ify
)
2923 lhs
<< "(query:" << notify
.query_epoch
2924 << " sent:" << notify
.epoch_sent
2925 << " " << notify
.info
;
2926 if (notify
.from
!= shard_id_t::NO_SHARD
||
2927 notify
.to
!= shard_id_t::NO_SHARD
)
2928 lhs
<< " " << (unsigned)notify
.from
2929 << "->" << (unsigned)notify
.to
;
2933 // -- pg_interval_t --
2935 void PastIntervals::pg_interval_t::encode(bufferlist
& bl
) const
2937 ENCODE_START(4, 2, bl
);
2938 ::encode(first
, bl
);
2941 ::encode(acting
, bl
);
2942 ::encode(maybe_went_rw
, bl
);
2943 ::encode(primary
, bl
);
2944 ::encode(up_primary
, bl
);
2948 void PastIntervals::pg_interval_t::decode(bufferlist::iterator
& bl
)
2950 DECODE_START_LEGACY_COMPAT_LEN(4, 2, 2, bl
);
2951 ::decode(first
, bl
);
2954 ::decode(acting
, bl
);
2955 ::decode(maybe_went_rw
, bl
);
2956 if (struct_v
>= 3) {
2957 ::decode(primary
, bl
);
2960 primary
= acting
[0];
2962 if (struct_v
>= 4) {
2963 ::decode(up_primary
, bl
);
2971 void PastIntervals::pg_interval_t::dump(Formatter
*f
) const
2973 f
->dump_unsigned("first", first
);
2974 f
->dump_unsigned("last", last
);
2975 f
->dump_int("maybe_went_rw", maybe_went_rw
? 1 : 0);
2976 f
->open_array_section("up");
2977 for (vector
<int>::const_iterator p
= up
.begin(); p
!= up
.end(); ++p
)
2978 f
->dump_int("osd", *p
);
2980 f
->open_array_section("acting");
2981 for (vector
<int>::const_iterator p
= acting
.begin(); p
!= acting
.end(); ++p
)
2982 f
->dump_int("osd", *p
);
2984 f
->dump_int("primary", primary
);
2985 f
->dump_int("up_primary", up_primary
);
2988 void PastIntervals::pg_interval_t::generate_test_instances(list
<pg_interval_t
*>& o
)
2990 o
.push_back(new pg_interval_t
);
2991 o
.push_back(new pg_interval_t
);
2992 o
.back()->up
.push_back(1);
2993 o
.back()->acting
.push_back(2);
2994 o
.back()->acting
.push_back(3);
2995 o
.back()->first
= 4;
2997 o
.back()->maybe_went_rw
= true;
3000 WRITE_CLASS_ENCODER(PastIntervals::pg_interval_t
)
3002 class pi_simple_rep
: public PastIntervals::interval_rep
{
3003 map
<epoch_t
, PastIntervals::pg_interval_t
> interval_map
;
3007 std::list
<PastIntervals::pg_interval_t
> &&intervals
) {
3008 for (auto &&i
: intervals
)
3009 add_interval(ec_pool
, i
);
3013 pi_simple_rep() = default;
3014 pi_simple_rep(const pi_simple_rep
&) = default;
3015 pi_simple_rep(pi_simple_rep
&&) = default;
3016 pi_simple_rep
&operator=(pi_simple_rep
&&) = default;
3017 pi_simple_rep
&operator=(const pi_simple_rep
&) = default;
3019 size_t size() const override
{ return interval_map
.size(); }
3020 bool empty() const override
{ return interval_map
.empty(); }
3021 void clear() override
{ interval_map
.clear(); }
3022 pair
<epoch_t
, epoch_t
> get_bounds() const override
{
3023 auto iter
= interval_map
.begin();
3024 if (iter
!= interval_map
.end()) {
3025 auto riter
= interval_map
.rbegin();
3028 riter
->second
.last
+ 1);
3030 return make_pair(0, 0);
3033 set
<pg_shard_t
> get_all_participants(
3034 bool ec_pool
) const override
{
3035 set
<pg_shard_t
> all_participants
;
3037 // We need to decide who might have unfound objects that we need
3038 auto p
= interval_map
.rbegin();
3039 auto end
= interval_map
.rend();
3040 for (; p
!= end
; ++p
) {
3041 const PastIntervals::pg_interval_t
&interval(p
->second
);
3042 // If nothing changed, we don't care about this interval.
3043 if (!interval
.maybe_went_rw
)
3047 std::vector
<int>::const_iterator a
= interval
.acting
.begin();
3048 std::vector
<int>::const_iterator a_end
= interval
.acting
.end();
3049 for (; a
!= a_end
; ++a
, ++i
) {
3050 pg_shard_t
shard(*a
, ec_pool
? shard_id_t(i
) : shard_id_t::NO_SHARD
);
3051 if (*a
!= CRUSH_ITEM_NONE
)
3052 all_participants
.insert(shard
);
3055 return all_participants
;
3059 const PastIntervals::pg_interval_t
&interval
) override
{
3060 interval_map
[interval
.first
] = interval
;
3062 unique_ptr
<PastIntervals::interval_rep
> clone() const override
{
3063 return unique_ptr
<PastIntervals::interval_rep
>(new pi_simple_rep(*this));
3065 ostream
&print(ostream
&out
) const override
{
3066 return out
<< interval_map
;
3068 void encode(bufferlist
&bl
) const override
{
3069 ::encode(interval_map
, bl
);
3071 void decode(bufferlist::iterator
&bl
) override
{
3072 ::decode(interval_map
, bl
);
3074 void dump(Formatter
*f
) const override
{
3075 f
->open_array_section("PastIntervals::compat_rep");
3076 for (auto &&i
: interval_map
) {
3077 f
->open_object_section("pg_interval_t");
3078 f
->dump_int("epoch", i
.first
);
3079 f
->open_object_section("interval");
3086 bool is_classic() const override
{
3089 static void generate_test_instances(list
<pi_simple_rep
*> &o
) {
3090 using ival
= PastIntervals::pg_interval_t
;
3091 using ivallst
= std::list
<ival
>;
3095 { ival
{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
3096 , ival
{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1}
3097 , ival
{{ 2}, { 2}, 31, 35, false, 2, 2}
3098 , ival
{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
3103 { ival
{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
3104 , ival
{{ 1, 2}, { 1, 2}, 20, 30, true, 1, 1}
3105 , ival
{{ 2}, { 2}, 31, 35, false, 2, 2}
3106 , ival
{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
3111 { ival
{{2, 1, 0}, {2, 1, 0}, 10, 20, true, 1, 1}
3112 , ival
{{ 0, 2}, { 0, 2}, 21, 30, true, 0, 0}
3113 , ival
{{ 0, 2}, {2, 0}, 31, 35, true, 2, 2}
3114 , ival
{{ 0, 2}, { 0, 2}, 36, 50, true, 0, 0}
3118 void iterate_mayberw_back_to(
3121 std::function
<void(epoch_t
, const set
<pg_shard_t
> &)> &&f
) const override
{
3122 for (auto i
= interval_map
.rbegin(); i
!= interval_map
.rend(); ++i
) {
3123 if (!i
->second
.maybe_went_rw
)
3125 if (i
->second
.last
< les
)
3127 set
<pg_shard_t
> actingset
;
3128 for (unsigned j
= 0; j
< i
->second
.acting
.size(); ++j
) {
3129 if (i
->second
.acting
[j
] == CRUSH_ITEM_NONE
)
3133 i
->second
.acting
[j
],
3134 ec_pool
? shard_id_t(j
) : shard_id_t::NO_SHARD
));
3136 f(i
->second
.first
, actingset
);
3140 bool has_full_intervals() const override
{ return true; }
3141 void iterate_all_intervals(
3142 std::function
<void(const PastIntervals::pg_interval_t
&)> &&f
3144 for (auto &&i
: interval_map
) {
3148 virtual ~pi_simple_rep() override
{}
/**
 * PastIntervals only needs to be able to answer two questions:
 * 1) Where should the primary look for unfound objects?
 * 2) List a set of subsets of the OSDs such that contacting at least
 *    one from each subset guarantees we speak to at least one witness
 *    of any completed write.
 *
 * Crucially, 2) does not require keeping *all* past intervals.  Certainly,
 * we don't need to keep any where maybe_went_rw would be false.  We also
 * needn't keep two intervals where the actingset in one is a subset
 * of the other (only need to keep the smaller of the two sets).  In order
 * to accurately trim the set of intervals as last_epoch_started changes
 * without rebuilding the set from scratch, we'll retain the larger set
 * if the smaller one is in an older interval.
 */
3168 struct compact_interval_t
{
3171 set
<pg_shard_t
> acting
;
3172 bool supersedes(const compact_interval_t
&other
) {
3173 for (auto &&i
: acting
) {
3174 if (!other
.acting
.count(i
))
3179 void dump(Formatter
*f
) const {
3180 f
->open_object_section("compact_interval_t");
3181 f
->dump_stream("first") << first
;
3182 f
->dump_stream("last") << last
;
3183 f
->dump_stream("acting") << acting
;
3186 void encode(bufferlist
&bl
) const {
3187 ENCODE_START(1, 1, bl
);
3188 ::encode(first
, bl
);
3190 ::encode(acting
, bl
);
3193 void decode(bufferlist::iterator
&bl
) {
3194 DECODE_START(1, bl
);
3195 ::decode(first
, bl
);
3197 ::decode(acting
, bl
);
3200 static void generate_test_instances(list
<compact_interval_t
*> & o
) {
3201 /* Not going to be used, we'll generate pi_compact_rep directly */
3204 ostream
&operator<<(ostream
&o
, const compact_interval_t
&rhs
)
3206 return o
<< "([" << rhs
.first
<< "," << rhs
.last
3207 << "] acting " << rhs
.acting
<< ")";
3209 WRITE_CLASS_ENCODER(compact_interval_t
)
3211 class pi_compact_rep
: public PastIntervals::interval_rep
{
3213 epoch_t last
= 0; // inclusive
3214 set
<pg_shard_t
> all_participants
;
3215 list
<compact_interval_t
> intervals
;
3218 std::list
<PastIntervals::pg_interval_t
> &&intervals
) {
3219 for (auto &&i
: intervals
)
3220 add_interval(ec_pool
, i
);
3223 pi_compact_rep() = default;
3224 pi_compact_rep(const pi_compact_rep
&) = default;
3225 pi_compact_rep(pi_compact_rep
&&) = default;
3226 pi_compact_rep
&operator=(const pi_compact_rep
&) = default;
3227 pi_compact_rep
&operator=(pi_compact_rep
&&) = default;
3229 size_t size() const override
{ return intervals
.size(); }
3230 bool empty() const override
{
3231 return first
> last
|| (first
== 0 && last
== 0);
3233 void clear() override
{
3234 *this = pi_compact_rep();
3236 pair
<epoch_t
, epoch_t
> get_bounds() const override
{
3237 return make_pair(first
, last
+ 1);
3239 set
<pg_shard_t
> get_all_participants(
3240 bool ec_pool
) const override
{
3241 return all_participants
;
3244 bool ec_pool
, const PastIntervals::pg_interval_t
&interval
) override
{
3246 first
= interval
.first
;
3247 assert(interval
.last
> last
);
3248 last
= interval
.last
;
3249 set
<pg_shard_t
> acting
;
3250 for (unsigned i
= 0; i
< interval
.acting
.size(); ++i
) {
3251 if (interval
.acting
[i
] == CRUSH_ITEM_NONE
)
3256 ec_pool
? shard_id_t(i
) : shard_id_t::NO_SHARD
));
3258 all_participants
.insert(acting
.begin(), acting
.end());
3259 if (!interval
.maybe_went_rw
)
3261 intervals
.push_back(
3262 compact_interval_t
{interval
.first
, interval
.last
, acting
});
3263 auto plast
= intervals
.end();
3265 for (auto cur
= intervals
.begin(); cur
!= plast
; ) {
3266 if (plast
->supersedes(*cur
)) {
3267 intervals
.erase(cur
++);
3273 unique_ptr
<PastIntervals::interval_rep
> clone() const override
{
3274 return unique_ptr
<PastIntervals::interval_rep
>(new pi_compact_rep(*this));
3276 ostream
&print(ostream
&out
) const override
{
3277 return out
<< "([" << first
<< "," << last
3278 << "] intervals=" << intervals
<< ")";
3280 void encode(bufferlist
&bl
) const override
{
3281 ENCODE_START(1, 1, bl
);
3282 ::encode(first
, bl
);
3284 ::encode(all_participants
, bl
);
3285 ::encode(intervals
, bl
);
3288 void decode(bufferlist::iterator
&bl
) override
{
3289 DECODE_START(1, bl
);
3290 ::decode(first
, bl
);
3292 ::decode(all_participants
, bl
);
3293 ::decode(intervals
, bl
);
3296 void dump(Formatter
*f
) const override
{
3297 f
->open_object_section("PastIntervals::compact_rep");
3298 f
->dump_stream("first") << first
;
3299 f
->dump_stream("last") << last
;
3300 f
->open_array_section("all_participants");
3301 for (auto& i
: all_participants
) {
3302 f
->dump_object("pg_shard", i
);
3305 f
->open_array_section("intervals");
3306 for (auto &&i
: intervals
) {
3312 bool is_classic() const override
{
3315 static void generate_test_instances(list
<pi_compact_rep
*> &o
) {
3316 using ival
= PastIntervals::pg_interval_t
;
3317 using ivallst
= std::list
<ival
>;
3321 { ival
{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
3322 , ival
{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1}
3323 , ival
{{ 2}, { 2}, 31, 35, false, 2, 2}
3324 , ival
{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
3329 { ival
{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
3330 , ival
{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1}
3331 , ival
{{ 2}, { 2}, 31, 35, false, 2, 2}
3332 , ival
{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
3337 { ival
{{2, 1, 0}, {2, 1, 0}, 10, 20, true, 1, 1}
3338 , ival
{{ 0, 2}, { 0, 2}, 21, 30, true, 0, 0}
3339 , ival
{{ 0, 2}, {2, 0}, 31, 35, true, 2, 2}
3340 , ival
{{ 0, 2}, { 0, 2}, 36, 50, true, 0, 0}
3343 void iterate_mayberw_back_to(
3346 std::function
<void(epoch_t
, const set
<pg_shard_t
> &)> &&f
) const override
{
3347 for (auto i
= intervals
.rbegin(); i
!= intervals
.rend(); ++i
) {
3350 f(i
->first
, i
->acting
);
3353 virtual ~pi_compact_rep() override
{}
3355 WRITE_CLASS_ENCODER(pi_compact_rep
)
3357 PastIntervals::PastIntervals(const PastIntervals
&rhs
)
3358 : past_intervals(rhs
.past_intervals
?
3359 rhs
.past_intervals
->clone() :
3362 PastIntervals
&PastIntervals::operator=(const PastIntervals
&rhs
)
3364 PastIntervals
other(rhs
);
3369 ostream
& operator<<(ostream
& out
, const PastIntervals
&i
)
3371 if (i
.past_intervals
) {
3372 return i
.past_intervals
->print(out
);
3374 return out
<< "(empty)";
3378 ostream
& operator<<(ostream
& out
, const PastIntervals::PriorSet
&i
)
3380 return out
<< "PriorSet("
3381 << "ec_pool: " << i
.ec_pool
3382 << ", probe: " << i
.probe
3383 << ", down: " << i
.down
3384 << ", blocked_by: " << i
.blocked_by
3385 << ", pg_down: " << i
.pg_down
3389 void PastIntervals::decode(bufferlist::iterator
&bl
)
3391 DECODE_START(1, bl
);
3398 past_intervals
.reset(new pi_simple_rep
);
3399 past_intervals
->decode(bl
);
3402 past_intervals
.reset(new pi_compact_rep
);
3403 past_intervals
->decode(bl
);
3409 void PastIntervals::decode_classic(bufferlist::iterator
&bl
)
3411 past_intervals
.reset(new pi_simple_rep
);
3412 past_intervals
->decode(bl
);
3415 void PastIntervals::generate_test_instances(list
<PastIntervals
*> &o
)
3418 list
<pi_simple_rep
*> simple
;
3419 pi_simple_rep::generate_test_instances(simple
);
3420 for (auto &&i
: simple
) {
3421 // takes ownership of contents
3422 o
.push_back(new PastIntervals(i
));
3426 list
<pi_compact_rep
*> compact
;
3427 pi_compact_rep::generate_test_instances(compact
);
3428 for (auto &&i
: compact
) {
3429 // takes ownership of contents
3430 o
.push_back(new PastIntervals(i
));
3436 void PastIntervals::update_type(bool ec_pool
, bool compact
)
3439 if (!past_intervals
) {
3440 past_intervals
.reset(new pi_simple_rep
);
3442 // we never convert from compact back to classic
3443 assert(is_classic());
3446 if (!past_intervals
) {
3447 past_intervals
.reset(new pi_compact_rep
);
3448 } else if (is_classic()) {
3449 auto old
= std::move(past_intervals
);
3450 past_intervals
.reset(new pi_compact_rep
);
3451 assert(old
->has_full_intervals());
3452 old
->iterate_all_intervals([&](const pg_interval_t
&i
) {
3453 past_intervals
->add_interval(ec_pool
, i
);
3459 void PastIntervals::update_type_from_map(bool ec_pool
, const OSDMap
&osdmap
)
3461 update_type(ec_pool
, osdmap
.require_osd_release
>= CEPH_RELEASE_LUMINOUS
);
3464 bool PastIntervals::is_new_interval(
3465 int old_acting_primary
,
3466 int new_acting_primary
,
3467 const vector
<int> &old_acting
,
3468 const vector
<int> &new_acting
,
3471 const vector
<int> &old_up
,
3472 const vector
<int> &new_up
,
3477 unsigned old_pg_num
,
3478 unsigned new_pg_num
,
3479 bool old_sort_bitwise
,
3480 bool new_sort_bitwise
,
3481 bool old_recovery_deletes
,
3482 bool new_recovery_deletes
,
3484 return old_acting_primary
!= new_acting_primary
||
3485 new_acting
!= old_acting
||
3486 old_up_primary
!= new_up_primary
||
3488 old_min_size
!= new_min_size
||
3489 old_size
!= new_size
||
3490 pgid
.is_split(old_pg_num
, new_pg_num
, 0) ||
3491 old_sort_bitwise
!= new_sort_bitwise
||
3492 old_recovery_deletes
!= new_recovery_deletes
;
3495 bool PastIntervals::is_new_interval(
3496 int old_acting_primary
,
3497 int new_acting_primary
,
3498 const vector
<int> &old_acting
,
3499 const vector
<int> &new_acting
,
3502 const vector
<int> &old_up
,
3503 const vector
<int> &new_up
,
3507 return !(lastmap
->get_pools().count(pgid
.pool())) ||
3508 is_new_interval(old_acting_primary
,
3516 lastmap
->get_pools().find(pgid
.pool())->second
.size
,
3517 osdmap
->get_pools().find(pgid
.pool())->second
.size
,
3518 lastmap
->get_pools().find(pgid
.pool())->second
.min_size
,
3519 osdmap
->get_pools().find(pgid
.pool())->second
.min_size
,
3520 lastmap
->get_pg_num(pgid
.pool()),
3521 osdmap
->get_pg_num(pgid
.pool()),
3522 lastmap
->test_flag(CEPH_OSDMAP_SORTBITWISE
),
3523 osdmap
->test_flag(CEPH_OSDMAP_SORTBITWISE
),
3524 lastmap
->test_flag(CEPH_OSDMAP_RECOVERY_DELETES
),
3525 osdmap
->test_flag(CEPH_OSDMAP_RECOVERY_DELETES
),
3529 bool PastIntervals::check_new_interval(
3530 int old_acting_primary
,
3531 int new_acting_primary
,
3532 const vector
<int> &old_acting
,
3533 const vector
<int> &new_acting
,
3536 const vector
<int> &old_up
,
3537 const vector
<int> &new_up
,
3538 epoch_t same_interval_since
,
3539 epoch_t last_epoch_clean
,
3543 IsPGRecoverablePredicate
*could_have_gone_active
,
3544 PastIntervals
*past_intervals
,
3548 * We have to be careful to gracefully deal with situations like
3549 * so. Say we have a power outage or something that takes out both
3550 * OSDs, but the monitor doesn't mark them down in the same epoch.
3551 * The history may look like
3555 * 3: let's say B dies for good, too (say, from the power spike)
3558 * which makes it look like B may have applied updates to the PG
3559 * that we need in order to proceed. This sucks...
3561 * To minimize the risk of this happening, we CANNOT go active if
3562 * _any_ OSDs in the prior set are down until we send an MOSDAlive
3563 * to the monitor such that the OSDMap sets osd_up_thru to an epoch.
3564 * Then, we have something like
3571 * -> we can ignore B, bc it couldn't have gone active (up_thru still 0).
3581 * -> we must wait for B, bc it was alive through 2, and could have
3582 * written to the pg.
3584 * If B is really dead, then an administrator will need to manually
3585 * intervene by marking the OSD as "lost."
3588 // remember past interval
3589 // NOTE: a change in the up set primary triggers an interval
3590 // change, even though the interval members in the pg_interval_t
3592 assert(past_intervals
);
3593 assert(past_intervals
->past_intervals
);
3594 if (is_new_interval(
3607 i
.first
= same_interval_since
;
3608 i
.last
= osdmap
->get_epoch() - 1;
3609 assert(i
.first
<= i
.last
);
3610 i
.acting
= old_acting
;
3612 i
.primary
= old_acting_primary
;
3613 i
.up_primary
= old_up_primary
;
3615 unsigned num_acting
= 0;
3616 for (vector
<int>::const_iterator p
= i
.acting
.begin(); p
!= i
.acting
.end();
3618 if (*p
!= CRUSH_ITEM_NONE
)
3621 assert(lastmap
->get_pools().count(pgid
.pool()));
3622 const pg_pool_t
& old_pg_pool
= lastmap
->get_pools().find(pgid
.pool())->second
;
3623 set
<pg_shard_t
> old_acting_shards
;
3624 old_pg_pool
.convert_to_pg_shards(old_acting
, &old_acting_shards
);
3628 num_acting
>= old_pg_pool
.min_size
&&
3629 (*could_have_gone_active
)(old_acting_shards
)) {
3631 *out
<< __func__
<< " " << i
3633 << " up_thru " << lastmap
->get_up_thru(i
.primary
)
3634 << " up_from " << lastmap
->get_up_from(i
.primary
)
3635 << " last_epoch_clean " << last_epoch_clean
3637 if (lastmap
->get_up_thru(i
.primary
) >= i
.first
&&
3638 lastmap
->get_up_from(i
.primary
) <= i
.first
) {
3639 i
.maybe_went_rw
= true;
3641 *out
<< __func__
<< " " << i
3642 << " : primary up " << lastmap
->get_up_from(i
.primary
)
3643 << "-" << lastmap
->get_up_thru(i
.primary
)
3644 << " includes interval"
3646 } else if (last_epoch_clean
>= i
.first
&&
3647 last_epoch_clean
<= i
.last
) {
3648 // If the last_epoch_clean is included in this interval, then
3649 // the pg must have been rw (for recovery to have completed).
3650 // This is important because we won't know the _real_
3651 // first_epoch because we stop at last_epoch_clean, and we
3652 // don't want the oldest interval to randomly have
3653 // maybe_went_rw false depending on the relative up_thru vs
3654 // last_epoch_clean timing.
3655 i
.maybe_went_rw
= true;
3657 *out
<< __func__
<< " " << i
3658 << " : includes last_epoch_clean " << last_epoch_clean
3659 << " and presumed to have been rw"
3662 i
.maybe_went_rw
= false;
3664 *out
<< __func__
<< " " << i
3665 << " : primary up " << lastmap
->get_up_from(i
.primary
)
3666 << "-" << lastmap
->get_up_thru(i
.primary
)
3667 << " does not include interval"
3671 i
.maybe_went_rw
= false;
3673 *out
<< __func__
<< " " << i
<< " : acting set is too small" << std::endl
;
3675 past_intervals
->past_intervals
->add_interval(old_pg_pool
.ec_pool(), i
);
3683 // true if the given map affects the prior set
3684 bool PastIntervals::PriorSet::affected_by_map(
3685 const OSDMap
&osdmap
,
3686 const DoutPrefixProvider
*dpp
) const
3688 for (set
<pg_shard_t
>::iterator p
= probe
.begin();
3693 // did someone in the prior set go down?
3694 if (osdmap
.is_down(o
) && down
.count(o
) == 0) {
3695 ldpp_dout(dpp
, 10) << "affected_by_map osd." << o
<< " now down" << dendl
;
3699 // did a down osd in cur get (re)marked as lost?
3700 map
<int, epoch_t
>::const_iterator r
= blocked_by
.find(o
);
3701 if (r
!= blocked_by
.end()) {
3702 if (!osdmap
.exists(o
)) {
3703 ldpp_dout(dpp
, 10) << "affected_by_map osd." << o
<< " no longer exists" << dendl
;
3706 if (osdmap
.get_info(o
).lost_at
!= r
->second
) {
3707 ldpp_dout(dpp
, 10) << "affected_by_map osd." << o
<< " (re)marked as lost" << dendl
;
3713 // did someone in the prior down set go up?
3714 for (set
<int>::const_iterator p
= down
.begin();
3719 if (osdmap
.is_up(o
)) {
3720 ldpp_dout(dpp
, 10) << "affected_by_map osd." << o
<< " now up" << dendl
;
3724 // did someone in the prior set get lost or destroyed?
3725 if (!osdmap
.exists(o
)) {
3726 ldpp_dout(dpp
, 10) << "affected_by_map osd." << o
<< " no longer exists" << dendl
;
3729 // did a down osd in down get (re)marked as lost?
3730 map
<int, epoch_t
>::const_iterator r
= blocked_by
.find(o
);
3731 if (r
!= blocked_by
.end()) {
3732 if (osdmap
.get_info(o
).lost_at
!= r
->second
) {
3733 ldpp_dout(dpp
, 10) << "affected_by_map osd." << o
<< " (re)marked as lost" << dendl
;
3742 ostream
& operator<<(ostream
& out
, const PastIntervals::pg_interval_t
& i
)
3744 out
<< "interval(" << i
.first
<< "-" << i
.last
3745 << " up " << i
.up
<< "(" << i
.up_primary
<< ")"
3746 << " acting " << i
.acting
<< "(" << i
.primary
<< ")";
3747 if (i
.maybe_went_rw
)
3748 out
<< " maybe_went_rw";
3757 void pg_query_t::encode(bufferlist
&bl
, uint64_t features
) const {
3758 ENCODE_START(3, 3, bl
);
3760 ::encode(since
, bl
);
3762 ::encode(epoch_sent
, bl
);
3768 void pg_query_t::decode(bufferlist::iterator
&bl
) {
3769 DECODE_START(3, bl
);
3771 ::decode(since
, bl
);
3773 ::decode(epoch_sent
, bl
);
3779 void pg_query_t::dump(Formatter
*f
) const
3781 f
->dump_int("from", from
);
3782 f
->dump_int("to", to
);
3783 f
->dump_string("type", get_type_name());
3784 f
->dump_stream("since") << since
;
3785 f
->dump_stream("epoch_sent") << epoch_sent
;
3786 f
->open_object_section("history");
3790 void pg_query_t::generate_test_instances(list
<pg_query_t
*>& o
)
3792 o
.push_back(new pg_query_t());
3793 list
<pg_history_t
*> h
;
3794 pg_history_t::generate_test_instances(h
);
3795 o
.push_back(new pg_query_t(pg_query_t::INFO
, shard_id_t(1), shard_id_t(2), *h
.back(), 4));
3796 o
.push_back(new pg_query_t(pg_query_t::MISSING
, shard_id_t(2), shard_id_t(3), *h
.back(), 4));
3797 o
.push_back(new pg_query_t(pg_query_t::LOG
, shard_id_t(0), shard_id_t(0),
3798 eversion_t(4, 5), *h
.back(), 4));
3799 o
.push_back(new pg_query_t(pg_query_t::FULLLOG
,
3800 shard_id_t::NO_SHARD
, shard_id_t::NO_SHARD
,
3804 // -- ObjectModDesc --
3805 void ObjectModDesc::visit(Visitor
*visitor
) const
3807 bufferlist::iterator bp
= bl
.begin();
3810 DECODE_START(max_required_version
, bp
);
3817 visitor
->append(size
);
3821 map
<string
, boost::optional
<bufferlist
> > attrs
;
3822 ::decode(attrs
, bp
);
3823 visitor
->setattrs(attrs
);
3827 version_t old_version
;
3828 ::decode(old_version
, bp
);
3829 visitor
->rmobject(old_version
);
3836 case UPDATE_SNAPS
: {
3837 set
<snapid_t
> snaps
;
3838 ::decode(snaps
, bp
);
3839 visitor
->update_snaps(snaps
);
3843 version_t old_version
;
3844 ::decode(old_version
, bp
);
3845 visitor
->try_rmobject(old_version
);
3848 case ROLLBACK_EXTENTS
: {
3849 vector
<pair
<uint64_t, uint64_t> > extents
;
3852 ::decode(extents
, bp
);
3853 visitor
->rollback_extents(gen
,extents
);
3857 assert(0 == "Invalid rollback code");
3862 assert(0 == "Invalid encoding");
3866 struct DumpVisitor
: public ObjectModDesc::Visitor
{
3868 explicit DumpVisitor(Formatter
*f
) : f(f
) {}
3869 void append(uint64_t old_size
) override
{
3870 f
->open_object_section("op");
3871 f
->dump_string("code", "APPEND");
3872 f
->dump_unsigned("old_size", old_size
);
3875 void setattrs(map
<string
, boost::optional
<bufferlist
> > &attrs
) override
{
3876 f
->open_object_section("op");
3877 f
->dump_string("code", "SETATTRS");
3878 f
->open_array_section("attrs");
3879 for (map
<string
, boost::optional
<bufferlist
> >::iterator i
= attrs
.begin();
3882 f
->dump_string("attr_name", i
->first
);
3887 void rmobject(version_t old_version
) override
{
3888 f
->open_object_section("op");
3889 f
->dump_string("code", "RMOBJECT");
3890 f
->dump_unsigned("old_version", old_version
);
3893 void try_rmobject(version_t old_version
) override
{
3894 f
->open_object_section("op");
3895 f
->dump_string("code", "TRY_RMOBJECT");
3896 f
->dump_unsigned("old_version", old_version
);
3899 void create() override
{
3900 f
->open_object_section("op");
3901 f
->dump_string("code", "CREATE");
3904 void update_snaps(const set
<snapid_t
> &snaps
) override
{
3905 f
->open_object_section("op");
3906 f
->dump_string("code", "UPDATE_SNAPS");
3907 f
->dump_stream("snaps") << snaps
;
3910 void rollback_extents(
3912 const vector
<pair
<uint64_t, uint64_t> > &extents
) override
{
3913 f
->open_object_section("op");
3914 f
->dump_string("code", "ROLLBACK_EXTENTS");
3915 f
->dump_unsigned("gen", gen
);
3916 f
->dump_stream("snaps") << extents
;
3921 void ObjectModDesc::dump(Formatter
*f
) const
3923 f
->open_object_section("object_mod_desc");
3924 f
->dump_bool("can_local_rollback", can_local_rollback
);
3925 f
->dump_bool("rollback_info_completed", rollback_info_completed
);
3927 f
->open_array_section("ops");
3935 void ObjectModDesc::generate_test_instances(list
<ObjectModDesc
*>& o
)
3937 map
<string
, boost::optional
<bufferlist
> > attrs
;
3941 o
.push_back(new ObjectModDesc());
3942 o
.back()->append(100);
3943 o
.back()->setattrs(attrs
);
3944 o
.push_back(new ObjectModDesc());
3945 o
.back()->rmobject(1001);
3946 o
.push_back(new ObjectModDesc());
3948 o
.back()->setattrs(attrs
);
3949 o
.push_back(new ObjectModDesc());
3951 o
.back()->setattrs(attrs
);
3952 o
.back()->mark_unrollbackable();
3953 o
.back()->append(1000);
3956 void ObjectModDesc::encode(bufferlist
&_bl
) const
3958 ENCODE_START(max_required_version
, max_required_version
, _bl
);
3959 ::encode(can_local_rollback
, _bl
);
3960 ::encode(rollback_info_completed
, _bl
);
3964 void ObjectModDesc::decode(bufferlist::iterator
&_bl
)
3966 DECODE_START(2, _bl
);
3967 max_required_version
= struct_v
;
3968 ::decode(can_local_rollback
, _bl
);
3969 ::decode(rollback_info_completed
, _bl
);
3971 // ensure bl does not pin a larger buffer in memory
3973 bl
.reassign_to_mempool(mempool::mempool_osd_pglog
);
3977 // -- pg_log_entry_t --
3979 string
pg_log_entry_t::get_key_name() const
3981 return version
.get_key_name();
3984 void pg_log_entry_t::encode_with_checksum(bufferlist
& bl
) const
3986 bufferlist
ebl(sizeof(*this)*2);
3988 __u32 crc
= ebl
.crc32c(0);
3993 void pg_log_entry_t::decode_with_checksum(bufferlist::iterator
& p
)
3999 if (crc
!= bl
.crc32c(0))
4000 throw buffer::malformed_input("bad checksum on pg_log_entry_t");
4001 bufferlist::iterator q
= bl
.begin();
4005 void pg_log_entry_t::encode(bufferlist
&bl
) const
4007 ENCODE_START(11, 4, bl
);
4010 ::encode(version
, bl
);
4013 * Added with reverting_to:
4014 * Previous code used prior_version to encode
4015 * what we now call reverting_to. This will
4016 * allow older code to decode reverting_to
4017 * into prior_version as expected.
4019 if (op
== LOST_REVERT
)
4020 ::encode(reverting_to
, bl
);
4022 ::encode(prior_version
, bl
);
4024 ::encode(reqid
, bl
);
4025 ::encode(mtime
, bl
);
4026 if (op
== LOST_REVERT
)
4027 ::encode(prior_version
, bl
);
4028 ::encode(snaps
, bl
);
4029 ::encode(user_version
, bl
);
4030 ::encode(mod_desc
, bl
);
4031 ::encode(extra_reqids
, bl
);
4033 ::encode(return_code
, bl
);
4037 void pg_log_entry_t::decode(bufferlist::iterator
&bl
)
4039 DECODE_START_LEGACY_COMPAT_LEN(11, 4, 4, bl
);
4043 ::decode(old_soid
, bl
);
4044 soid
.oid
= old_soid
.oid
;
4045 soid
.snap
= old_soid
.snap
;
4046 invalid_hash
= true;
4051 invalid_hash
= true;
4052 ::decode(version
, bl
);
4054 if (struct_v
>= 6 && op
== LOST_REVERT
)
4055 ::decode(reverting_to
, bl
);
4057 ::decode(prior_version
, bl
);
4059 ::decode(reqid
, bl
);
4061 ::decode(mtime
, bl
);
4063 invalid_pool
= true;
4065 if (op
== LOST_REVERT
) {
4066 if (struct_v
>= 6) {
4067 ::decode(prior_version
, bl
);
4069 reverting_to
= prior_version
;
4072 if (struct_v
>= 7 || // for v >= 7, this is for all ops.
4073 op
== CLONE
) { // for v < 7, it's only present for CLONE.
4074 ::decode(snaps
, bl
);
4075 // ensure snaps does not pin a larger buffer in memory
4077 snaps
.reassign_to_mempool(mempool::mempool_osd_pglog
);
4081 ::decode(user_version
, bl
);
4083 user_version
= version
.version
;
4086 ::decode(mod_desc
, bl
);
4088 mod_desc
.mark_unrollbackable();
4090 ::decode(extra_reqids
, bl
);
4091 if (struct_v
>= 11 && op
== ERROR
)
4092 ::decode(return_code
, bl
);
4096 void pg_log_entry_t::dump(Formatter
*f
) const
4098 f
->dump_string("op", get_op_name());
4099 f
->dump_stream("object") << soid
;
4100 f
->dump_stream("version") << version
;
4101 f
->dump_stream("prior_version") << prior_version
;
4102 f
->dump_stream("reqid") << reqid
;
4103 f
->open_array_section("extra_reqids");
4104 for (auto p
= extra_reqids
.begin();
4105 p
!= extra_reqids
.end();
4107 f
->open_object_section("extra_reqid");
4108 f
->dump_stream("reqid") << p
->first
;
4109 f
->dump_stream("user_version") << p
->second
;
4113 f
->dump_stream("mtime") << mtime
;
4114 f
->dump_int("return_code", return_code
);
4115 if (snaps
.length() > 0) {
4117 bufferlist c
= snaps
;
4118 bufferlist::iterator p
= c
.begin();
4124 f
->open_object_section("snaps");
4125 for (vector
<snapid_t
>::iterator p
= v
.begin(); p
!= v
.end(); ++p
)
4126 f
->dump_unsigned("snap", *p
);
4130 f
->open_object_section("mod_desc");
4136 void pg_log_entry_t::generate_test_instances(list
<pg_log_entry_t
*>& o
)
4138 o
.push_back(new pg_log_entry_t());
4139 hobject_t
oid(object_t("objname"), "key", 123, 456, 0, "");
4140 o
.push_back(new pg_log_entry_t(MODIFY
, oid
, eversion_t(1,2), eversion_t(3,4),
4141 1, osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
4143 o
.push_back(new pg_log_entry_t(ERROR
, oid
, eversion_t(1,2), eversion_t(3,4),
4144 1, osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
4145 utime_t(8,9), -ENOENT
));
4148 ostream
& operator<<(ostream
& out
, const pg_log_entry_t
& e
)
4150 out
<< e
.version
<< " (" << e
.prior_version
<< ") "
4151 << std::left
<< std::setw(8) << e
.get_op_name() << ' '
4152 << e
.soid
<< " by " << e
.reqid
<< " " << e
.mtime
4153 << " " << e
.return_code
;
4154 if (e
.snaps
.length()) {
4155 vector
<snapid_t
> snaps
;
4156 bufferlist c
= e
.snaps
;
4157 bufferlist::iterator p
= c
.begin();
4163 out
<< " snaps " << snaps
;
4168 // -- pg_log_dup_t --
4170 string
pg_log_dup_t::get_key_name() const
4172 return "dup_" + version
.get_key_name();
4175 void pg_log_dup_t::encode(bufferlist
&bl
) const
4177 ENCODE_START(1, 1, bl
);
4178 ::encode(reqid
, bl
);
4179 ::encode(version
, bl
);
4180 ::encode(user_version
, bl
);
4181 ::encode(return_code
, bl
);
4185 void pg_log_dup_t::decode(bufferlist::iterator
&bl
)
4187 DECODE_START(1, bl
);
4188 ::decode(reqid
, bl
);
4189 ::decode(version
, bl
);
4190 ::decode(user_version
, bl
);
4191 ::decode(return_code
, bl
);
4195 void pg_log_dup_t::dump(Formatter
*f
) const
4197 f
->dump_stream("reqid") << reqid
;
4198 f
->dump_stream("version") << version
;
4199 f
->dump_stream("user_version") << user_version
;
4200 f
->dump_stream("return_code") << return_code
;
4203 void pg_log_dup_t::generate_test_instances(list
<pg_log_dup_t
*>& o
)
4205 o
.push_back(new pg_log_dup_t());
4206 o
.push_back(new pg_log_dup_t(eversion_t(1,2),
4208 osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
4210 o
.push_back(new pg_log_dup_t(eversion_t(1,2),
4212 osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
4217 std::ostream
& operator<<(std::ostream
& out
, const pg_log_dup_t
& e
) {
4218 return out
<< "log_dup(reqid=" << e
.reqid
<<
4219 " v=" << e
.version
<< " uv=" << e
.user_version
<<
4220 " rc=" << e
.return_code
<< ")";
4226 // out: pg_log_t that only has entries that apply to import_pgid using curmap
4227 // reject: Entries rejected from "in" are in the reject.log. Other fields not set.
4228 void pg_log_t::filter_log(spg_t import_pgid
, const OSDMap
&curmap
,
4229 const string
&hit_set_namespace
, const pg_log_t
&in
,
4230 pg_log_t
&out
, pg_log_t
&reject
)
4236 for (list
<pg_log_entry_t
>::const_iterator i
= in
.log
.begin();
4237 i
!= in
.log
.end(); ++i
) {
4239 // Reject pg log entries for temporary objects
4240 if (i
->soid
.is_temp()) {
4241 reject
.log
.push_back(*i
);
4245 if (i
->soid
.nspace
!= hit_set_namespace
) {
4246 object_t oid
= i
->soid
.oid
;
4247 object_locator_t
loc(i
->soid
);
4248 pg_t raw_pgid
= curmap
.object_locator_to_pg(oid
, loc
);
4249 pg_t pgid
= curmap
.raw_pg_to_pg(raw_pgid
);
4251 if (import_pgid
.pgid
== pgid
) {
4252 out
.log
.push_back(*i
);
4254 reject
.log
.push_back(*i
);
4257 out
.log
.push_back(*i
);
4262 void pg_log_t::encode(bufferlist
& bl
) const
4264 ENCODE_START(7, 3, bl
);
4268 ::encode(can_rollback_to
, bl
);
4269 ::encode(rollback_info_trimmed_to
, bl
);
4274 void pg_log_t::decode(bufferlist::iterator
&bl
, int64_t pool
)
4276 DECODE_START_LEGACY_COMPAT_LEN(7, 3, 3, bl
);
4281 ::decode(backlog
, bl
);
4285 ::decode(can_rollback_to
, bl
);
4288 ::decode(rollback_info_trimmed_to
, bl
);
4290 rollback_info_trimmed_to
= tail
;
4297 // handle hobject_t format change
4299 for (list
<pg_log_entry_t
>::iterator i
= log
.begin();
4302 if (!i
->soid
.is_max() && i
->soid
.pool
== -1)
4303 i
->soid
.pool
= pool
;
4308 void pg_log_t::dump(Formatter
*f
) const
4310 f
->dump_stream("head") << head
;
4311 f
->dump_stream("tail") << tail
;
4312 f
->open_array_section("log");
4313 for (list
<pg_log_entry_t
>::const_iterator p
= log
.begin(); p
!= log
.end(); ++p
) {
4314 f
->open_object_section("entry");
4319 f
->open_array_section("dups");
4320 for (const auto& entry
: dups
) {
4321 f
->open_object_section("entry");
4328 void pg_log_t::generate_test_instances(list
<pg_log_t
*>& o
)
4330 o
.push_back(new pg_log_t
);
4332 // this is nonsensical:
4333 o
.push_back(new pg_log_t
);
4334 o
.back()->head
= eversion_t(1,2);
4335 o
.back()->tail
= eversion_t(3,4);
4336 list
<pg_log_entry_t
*> e
;
4337 pg_log_entry_t::generate_test_instances(e
);
4338 for (list
<pg_log_entry_t
*>::iterator p
= e
.begin(); p
!= e
.end(); ++p
)
4339 o
.back()->log
.push_back(**p
);
4342 void pg_log_t::copy_after(const pg_log_t
&other
, eversion_t v
)
4344 can_rollback_to
= other
.can_rollback_to
;
4347 for (list
<pg_log_entry_t
>::const_reverse_iterator i
= other
.log
.rbegin();
4348 i
!= other
.log
.rend();
4350 assert(i
->version
> other
.tail
);
4351 if (i
->version
<= v
) {
4352 // make tail accurate.
4360 void pg_log_t::copy_range(const pg_log_t
&other
, eversion_t from
, eversion_t to
)
4362 can_rollback_to
= other
.can_rollback_to
;
4363 list
<pg_log_entry_t
>::const_reverse_iterator i
= other
.log
.rbegin();
4364 assert(i
!= other
.log
.rend());
4365 while (i
->version
> to
) {
4367 assert(i
!= other
.log
.rend());
4369 assert(i
->version
== to
);
4371 for ( ; i
!= other
.log
.rend(); ++i
) {
4372 if (i
->version
<= from
) {
4380 void pg_log_t::copy_up_to(const pg_log_t
&other
, int max
)
4382 can_rollback_to
= other
.can_rollback_to
;
4386 for (list
<pg_log_entry_t
>::const_reverse_iterator i
= other
.log
.rbegin();
4387 i
!= other
.log
.rend();
4397 ostream
& pg_log_t::print(ostream
& out
) const
4399 out
<< *this << std::endl
;
4400 for (list
<pg_log_entry_t
>::const_iterator p
= log
.begin();
4403 out
<< *p
<< std::endl
;
4404 for (const auto& entry
: dups
) {
4405 out
<< " dup entry: " << entry
<< std::endl
;
4410 // -- pg_missing_t --
4412 ostream
& operator<<(ostream
& out
, const pg_missing_item
& i
)
4415 if (i
.have
!= eversion_t())
4416 out
<< "(" << i
.have
<< ")";
4417 out
<< " flags = " << i
.flag_str();
4421 // -- object_copy_cursor_t --
4423 void object_copy_cursor_t::encode(bufferlist
& bl
) const
4425 ENCODE_START(1, 1, bl
);
4426 ::encode(attr_complete
, bl
);
4427 ::encode(data_offset
, bl
);
4428 ::encode(data_complete
, bl
);
4429 ::encode(omap_offset
, bl
);
4430 ::encode(omap_complete
, bl
);
4434 void object_copy_cursor_t::decode(bufferlist::iterator
&bl
)
4436 DECODE_START(1, bl
);
4437 ::decode(attr_complete
, bl
);
4438 ::decode(data_offset
, bl
);
4439 ::decode(data_complete
, bl
);
4440 ::decode(omap_offset
, bl
);
4441 ::decode(omap_complete
, bl
);
4445 void object_copy_cursor_t::dump(Formatter
*f
) const
4447 f
->dump_unsigned("attr_complete", (int)attr_complete
);
4448 f
->dump_unsigned("data_offset", data_offset
);
4449 f
->dump_unsigned("data_complete", (int)data_complete
);
4450 f
->dump_string("omap_offset", omap_offset
);
4451 f
->dump_unsigned("omap_complete", (int)omap_complete
);
4454 void object_copy_cursor_t::generate_test_instances(list
<object_copy_cursor_t
*>& o
)
4456 o
.push_back(new object_copy_cursor_t
);
4457 o
.push_back(new object_copy_cursor_t
);
4458 o
.back()->attr_complete
= true;
4459 o
.back()->data_offset
= 123;
4460 o
.push_back(new object_copy_cursor_t
);
4461 o
.back()->attr_complete
= true;
4462 o
.back()->data_complete
= true;
4463 o
.back()->omap_offset
= "foo";
4464 o
.push_back(new object_copy_cursor_t
);
4465 o
.back()->attr_complete
= true;
4466 o
.back()->data_complete
= true;
4467 o
.back()->omap_complete
= true;
4470 // -- object_copy_data_t --
4472 void object_copy_data_t::encode(bufferlist
& bl
, uint64_t features
) const
4474 ENCODE_START(7, 5, bl
);
4476 ::encode(mtime
, bl
);
4477 ::encode(attrs
, bl
);
4479 ::encode(omap_data
, bl
);
4480 ::encode(cursor
, bl
);
4481 ::encode(omap_header
, bl
);
4482 ::encode(snaps
, bl
);
4483 ::encode(snap_seq
, bl
);
4484 ::encode(flags
, bl
);
4485 ::encode(data_digest
, bl
);
4486 ::encode(omap_digest
, bl
);
4487 ::encode(reqids
, bl
);
4488 ::encode(truncate_seq
, bl
);
4489 ::encode(truncate_size
, bl
);
4493 void object_copy_data_t::decode(bufferlist::iterator
& bl
)
4495 DECODE_START(7, bl
);
4499 ::decode(mtime
, bl
);
4502 ::decode(category
, bl
); // no longer used
4504 ::decode(attrs
, bl
);
4507 map
<string
,bufferlist
> omap
;
4511 ::encode(omap
, omap_data
);
4513 ::decode(cursor
, bl
);
4515 ::decode(omap_header
, bl
);
4516 if (struct_v
>= 3) {
4517 ::decode(snaps
, bl
);
4518 ::decode(snap_seq
, bl
);
4523 if (struct_v
>= 4) {
4524 ::decode(flags
, bl
);
4525 ::decode(data_digest
, bl
);
4526 ::decode(omap_digest
, bl
);
4531 ::decode(mtime
, bl
);
4532 ::decode(attrs
, bl
);
4534 ::decode(omap_data
, bl
);
4535 ::decode(cursor
, bl
);
4536 ::decode(omap_header
, bl
);
4537 ::decode(snaps
, bl
);
4538 ::decode(snap_seq
, bl
);
4539 if (struct_v
>= 4) {
4540 ::decode(flags
, bl
);
4541 ::decode(data_digest
, bl
);
4542 ::decode(omap_digest
, bl
);
4544 if (struct_v
>= 6) {
4545 ::decode(reqids
, bl
);
4547 if (struct_v
>= 7) {
4548 ::decode(truncate_seq
, bl
);
4549 ::decode(truncate_size
, bl
);
4555 void object_copy_data_t::generate_test_instances(list
<object_copy_data_t
*>& o
)
4557 o
.push_back(new object_copy_data_t());
4559 list
<object_copy_cursor_t
*> cursors
;
4560 object_copy_cursor_t::generate_test_instances(cursors
);
4561 list
<object_copy_cursor_t
*>::iterator ci
= cursors
.begin();
4562 o
.back()->cursor
= **(ci
++);
4564 o
.push_back(new object_copy_data_t());
4565 o
.back()->cursor
= **(ci
++);
4567 o
.push_back(new object_copy_data_t());
4568 o
.back()->size
= 1234;
4569 o
.back()->mtime
.set_from_double(1234);
4570 bufferptr
bp("there", 5);
4573 o
.back()->attrs
["hello"] = bl
;
4574 bufferptr
bp2("not", 3);
4577 map
<string
,bufferlist
> omap
;
4579 ::encode(omap
, o
.back()->omap_data
);
4580 bufferptr
databp("iamsomedatatocontain", 20);
4581 o
.back()->data
.push_back(databp
);
4582 o
.back()->omap_header
.append("this is an omap header");
4583 o
.back()->snaps
.push_back(123);
4584 o
.back()->reqids
.push_back(make_pair(osd_reqid_t(), version_t()));
4587 void object_copy_data_t::dump(Formatter
*f
) const
4589 f
->open_object_section("cursor");
4591 f
->close_section(); // cursor
4592 f
->dump_int("size", size
);
4593 f
->dump_stream("mtime") << mtime
;
4594 /* we should really print out the attrs here, but bufferlist
4595 const-correctness prevents that */
4596 f
->dump_int("attrs_size", attrs
.size());
4597 f
->dump_int("flags", flags
);
4598 f
->dump_unsigned("data_digest", data_digest
);
4599 f
->dump_unsigned("omap_digest", omap_digest
);
4600 f
->dump_int("omap_data_length", omap_data
.length());
4601 f
->dump_int("omap_header_length", omap_header
.length());
4602 f
->dump_int("data_length", data
.length());
4603 f
->open_array_section("snaps");
4604 for (vector
<snapid_t
>::const_iterator p
= snaps
.begin();
4605 p
!= snaps
.end(); ++p
)
4606 f
->dump_unsigned("snap", *p
);
4608 f
->open_array_section("reqids");
4609 for (auto p
= reqids
.begin();
4612 f
->open_object_section("extra_reqid");
4613 f
->dump_stream("reqid") << p
->first
;
4614 f
->dump_stream("user_version") << p
->second
;
4620 // -- pg_create_t --
4622 void pg_create_t::encode(bufferlist
&bl
) const
4624 ENCODE_START(1, 1, bl
);
4625 ::encode(created
, bl
);
4626 ::encode(parent
, bl
);
4627 ::encode(split_bits
, bl
);
4631 void pg_create_t::decode(bufferlist::iterator
&bl
)
4633 DECODE_START(1, bl
);
4634 ::decode(created
, bl
);
4635 ::decode(parent
, bl
);
4636 ::decode(split_bits
, bl
);
4640 void pg_create_t::dump(Formatter
*f
) const
4642 f
->dump_unsigned("created", created
);
4643 f
->dump_stream("parent") << parent
;
4644 f
->dump_int("split_bits", split_bits
);
4647 void pg_create_t::generate_test_instances(list
<pg_create_t
*>& o
)
4649 o
.push_back(new pg_create_t
);
4650 o
.push_back(new pg_create_t(1, pg_t(3, 4, -1), 2));
4654 // -- pg_hit_set_info_t --
4656 void pg_hit_set_info_t::encode(bufferlist
& bl
) const
4658 ENCODE_START(2, 1, bl
);
4659 ::encode(begin
, bl
);
4661 ::encode(version
, bl
);
4662 ::encode(using_gmt
, bl
);
4666 void pg_hit_set_info_t::decode(bufferlist::iterator
& p
)
4671 ::decode(version
, p
);
4672 if (struct_v
>= 2) {
4673 ::decode(using_gmt
, p
);
4680 void pg_hit_set_info_t::dump(Formatter
*f
) const
4682 f
->dump_stream("begin") << begin
;
4683 f
->dump_stream("end") << end
;
4684 f
->dump_stream("version") << version
;
4685 f
->dump_stream("using_gmt") << using_gmt
;
4688 void pg_hit_set_info_t::generate_test_instances(list
<pg_hit_set_info_t
*>& ls
)
4690 ls
.push_back(new pg_hit_set_info_t
);
4691 ls
.push_back(new pg_hit_set_info_t
);
4692 ls
.back()->begin
= utime_t(1, 2);
4693 ls
.back()->end
= utime_t(3, 4);
4697 // -- pg_hit_set_history_t --
4699 void pg_hit_set_history_t::encode(bufferlist
& bl
) const
4701 ENCODE_START(1, 1, bl
);
4702 ::encode(current_last_update
, bl
);
4704 utime_t dummy_stamp
;
4705 ::encode(dummy_stamp
, bl
);
4708 pg_hit_set_info_t dummy_info
;
4709 ::encode(dummy_info
, bl
);
4711 ::encode(history
, bl
);
4715 void pg_hit_set_history_t::decode(bufferlist::iterator
& p
)
4718 ::decode(current_last_update
, p
);
4720 utime_t dummy_stamp
;
4721 ::decode(dummy_stamp
, p
);
4724 pg_hit_set_info_t dummy_info
;
4725 ::decode(dummy_info
, p
);
4727 ::decode(history
, p
);
4731 void pg_hit_set_history_t::dump(Formatter
*f
) const
4733 f
->dump_stream("current_last_update") << current_last_update
;
4734 f
->open_array_section("history");
4735 for (list
<pg_hit_set_info_t
>::const_iterator p
= history
.begin();
4736 p
!= history
.end(); ++p
) {
4737 f
->open_object_section("info");
4744 void pg_hit_set_history_t::generate_test_instances(list
<pg_hit_set_history_t
*>& ls
)
4746 ls
.push_back(new pg_hit_set_history_t
);
4747 ls
.push_back(new pg_hit_set_history_t
);
4748 ls
.back()->current_last_update
= eversion_t(1, 2);
4749 ls
.back()->history
.push_back(pg_hit_set_info_t());
4752 // -- osd_peer_stat_t --
4754 void osd_peer_stat_t::encode(bufferlist
& bl
) const
4756 ENCODE_START(1, 1, bl
);
4757 ::encode(stamp
, bl
);
4761 void osd_peer_stat_t::decode(bufferlist::iterator
& bl
)
4763 DECODE_START(1, bl
);
4764 ::decode(stamp
, bl
);
4768 void osd_peer_stat_t::dump(Formatter
*f
) const
4770 f
->dump_stream("stamp") << stamp
;
4773 void osd_peer_stat_t::generate_test_instances(list
<osd_peer_stat_t
*>& o
)
4775 o
.push_back(new osd_peer_stat_t
);
4776 o
.push_back(new osd_peer_stat_t
);
4777 o
.back()->stamp
= utime_t(1, 2);
4780 ostream
& operator<<(ostream
& out
, const osd_peer_stat_t
&stat
)
4782 return out
<< "stat(" << stat
.stamp
<< ")";
4786 // -- OSDSuperblock --
4788 void OSDSuperblock::encode(bufferlist
&bl
) const
4790 ENCODE_START(8, 5, bl
);
4791 ::encode(cluster_fsid
, bl
);
4792 ::encode(whoami
, bl
);
4793 ::encode(current_epoch
, bl
);
4794 ::encode(oldest_map
, bl
);
4795 ::encode(newest_map
, bl
);
4796 ::encode(weight
, bl
);
4797 compat_features
.encode(bl
);
4798 ::encode(clean_thru
, bl
);
4799 ::encode(mounted
, bl
);
4800 ::encode(osd_fsid
, bl
);
4801 ::encode((epoch_t
)0, bl
); // epoch_t last_epoch_marked_full
4802 ::encode((uint32_t)0, bl
); // map<int64_t,epoch_t> pool_last_epoch_marked_full
4806 void OSDSuperblock::decode(bufferlist::iterator
&bl
)
4808 DECODE_START_LEGACY_COMPAT_LEN(8, 5, 5, bl
);
4811 ::decode(magic
, bl
);
4813 ::decode(cluster_fsid
, bl
);
4814 ::decode(whoami
, bl
);
4815 ::decode(current_epoch
, bl
);
4816 ::decode(oldest_map
, bl
);
4817 ::decode(newest_map
, bl
);
4818 ::decode(weight
, bl
);
4819 if (struct_v
>= 2) {
4820 compat_features
.decode(bl
);
4821 } else { //upgrade it!
4822 compat_features
.incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE
);
4824 ::decode(clean_thru
, bl
);
4825 ::decode(mounted
, bl
);
4827 ::decode(osd_fsid
, bl
);
4828 if (struct_v
>= 6) {
4829 epoch_t last_map_marked_full
;
4830 ::decode(last_map_marked_full
, bl
);
4832 if (struct_v
>= 7) {
4833 map
<int64_t,epoch_t
> pool_last_map_marked_full
;
4834 ::decode(pool_last_map_marked_full
, bl
);
4839 void OSDSuperblock::dump(Formatter
*f
) const
4841 f
->dump_stream("cluster_fsid") << cluster_fsid
;
4842 f
->dump_stream("osd_fsid") << osd_fsid
;
4843 f
->dump_int("whoami", whoami
);
4844 f
->dump_int("current_epoch", current_epoch
);
4845 f
->dump_int("oldest_map", oldest_map
);
4846 f
->dump_int("newest_map", newest_map
);
4847 f
->dump_float("weight", weight
);
4848 f
->open_object_section("compat");
4849 compat_features
.dump(f
);
4851 f
->dump_int("clean_thru", clean_thru
);
4852 f
->dump_int("last_epoch_mounted", mounted
);
4855 void OSDSuperblock::generate_test_instances(list
<OSDSuperblock
*>& o
)
4858 o
.push_back(new OSDSuperblock(z
));
4859 memset(&z
.cluster_fsid
, 1, sizeof(z
.cluster_fsid
));
4860 memset(&z
.osd_fsid
, 2, sizeof(z
.osd_fsid
));
4862 z
.current_epoch
= 4;
4867 o
.push_back(new OSDSuperblock(z
));
4868 o
.push_back(new OSDSuperblock(z
));
4873 void SnapSet::encode(bufferlist
& bl
) const
4875 ENCODE_START(3, 2, bl
);
4877 ::encode(head_exists
, bl
);
4878 ::encode(snaps
, bl
);
4879 ::encode(clones
, bl
);
4880 ::encode(clone_overlap
, bl
);
4881 ::encode(clone_size
, bl
);
4882 ::encode(clone_snaps
, bl
);
4886 void SnapSet::decode(bufferlist::iterator
& bl
)
4888 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl
);
4890 ::decode(head_exists
, bl
);
4891 ::decode(snaps
, bl
);
4892 ::decode(clones
, bl
);
4893 ::decode(clone_overlap
, bl
);
4894 ::decode(clone_size
, bl
);
4895 if (struct_v
>= 3) {
4896 ::decode(clone_snaps
, bl
);
4898 clone_snaps
.clear();
4903 void SnapSet::dump(Formatter
*f
) const
4905 SnapContext
sc(seq
, snaps
);
4906 f
->open_object_section("snap_context");
4909 f
->dump_int("head_exists", head_exists
);
4910 f
->open_array_section("clones");
4911 for (vector
<snapid_t
>::const_iterator p
= clones
.begin(); p
!= clones
.end(); ++p
) {
4912 f
->open_object_section("clone");
4913 f
->dump_unsigned("snap", *p
);
4914 auto cs
= clone_size
.find(*p
);
4915 if (cs
!= clone_size
.end())
4916 f
->dump_unsigned("size", cs
->second
);
4918 f
->dump_string("size", "????");
4919 auto co
= clone_overlap
.find(*p
);
4920 if (co
!= clone_overlap
.end())
4921 f
->dump_stream("overlap") << co
->second
;
4923 f
->dump_stream("overlap") << "????";
4924 auto q
= clone_snaps
.find(*p
);
4925 if (q
!= clone_snaps
.end()) {
4926 f
->open_array_section("snaps");
4927 for (auto s
: q
->second
) {
4928 f
->dump_unsigned("snap", s
);
4937 void SnapSet::generate_test_instances(list
<SnapSet
*>& o
)
4939 o
.push_back(new SnapSet
);
4940 o
.push_back(new SnapSet
);
4941 o
.back()->head_exists
= true;
4942 o
.back()->seq
= 123;
4943 o
.back()->snaps
.push_back(123);
4944 o
.back()->snaps
.push_back(12);
4945 o
.push_back(new SnapSet
);
4946 o
.back()->head_exists
= true;
4947 o
.back()->seq
= 123;
4948 o
.back()->snaps
.push_back(123);
4949 o
.back()->snaps
.push_back(12);
4950 o
.back()->clones
.push_back(12);
4951 o
.back()->clone_size
[12] = 12345;
4952 o
.back()->clone_overlap
[12];
4953 o
.back()->clone_snaps
[12] = {12, 10, 8};
// Stream a one-line summary of a SnapSet.  Both branches start with
// "<seq>=<snaps>:"; the legacy branch appends "+head" when head_exists is set
// and "+stray_clone_snaps=<clone_snaps>" when clone_snaps is non-empty.
// NOTE(review): embedded lines 4960 and 4968 (continuations of the two stream
// expressions) are missing from this view, so what follows the ":" in each
// branch is not shown here.
4956 ostream
& operator<<(ostream
& out
, const SnapSet
& cs
)
4958 if (cs
.is_legacy()) {
4959 out
<< cs
.seq
<< "=" << cs
.snaps
<< ":"
4961 << (cs
.head_exists
? "+head":"");
4962 if (!cs
.clone_snaps
.empty()) {
4963 out
<< "+stray_clone_snaps=" << cs
.clone_snaps
;
4967 return out
<< cs
.seq
<< "=" << cs
.snaps
<< ":"
// Rebuild this SnapSet from a librados::snap_set_t.
// For every entry of ss.clones other than the SNAP_HEAD one, the clone id and
// its snaps are collected into temporary sets, clone_size / clone_overlap are
// filled in, and clone_snaps gets the clone's snaps in reverse (descending)
// order.  `clones` is then filled from the id set in ascending order and
// `snaps` from the snap set in descending order (reverse iteration).
// NOTE(review): several embedded lines are missing from this view (4981,
// 4987, 4989-4990, 4998, 5002-5009, 5013-5015), including the SNAP_HEAD branch
// body, whatever guards the clone_snaps fill (`legacy` is never referenced in
// the visible text) and the statement inside the reverse loop -- confirm
// against the full file.
4972 void SnapSet::from_snap_set(const librados::snap_set_t
& ss
, bool legacy
)
4974 // NOTE: our reconstruction of snaps (and the snapc) is not strictly
4975 // correct: it will not include snaps that still logically exist
4976 // but for which there was no clone that is defined. For all
4977 // practical purposes this doesn't matter, since we only use that
4978 // information to clone on the OSD, and we have already moved
4979 // forward past that part of the object history.
4982 set
<snapid_t
> _snaps
;
4983 set
<snapid_t
> _clones
;
4984 head_exists
= false;
4985 for (vector
<librados::clone_info_t
>::const_iterator p
= ss
.clones
.begin();
4986 p
!= ss
.clones
.end();
4988 if (p
->cloneid
== librados::SNAP_HEAD
) {
4991 _clones
.insert(p
->cloneid
);
4992 _snaps
.insert(p
->snaps
.begin(), p
->snaps
.end());
4993 clone_size
[p
->cloneid
] = p
->size
;
4994 clone_overlap
[p
->cloneid
]; // the entry must exist, even if it's empty.
4995 for (vector
<pair
<uint64_t, uint64_t> >::const_iterator q
=
4996 p
->overlap
.begin(); q
!= p
->overlap
.end(); ++q
)
4997 clone_overlap
[p
->cloneid
].insert(q
->first
, q
->second
);
4999 // p->snaps is ascending; clone_snaps is descending
5000 vector
<snapid_t
>& v
= clone_snaps
[p
->cloneid
];
5001 for (auto q
= p
->snaps
.rbegin(); q
!= p
->snaps
.rend(); ++q
) {
5010 clones
.reserve(_clones
.size());
5011 for (set
<snapid_t
>::iterator p
= _clones
.begin(); p
!= _clones
.end(); ++p
)
5012 clones
.push_back(*p
);
5016 snaps
.reserve(_snaps
.size());
5017 for (set
<snapid_t
>::reverse_iterator p
= _snaps
.rbegin();
5018 p
!= _snaps
.rend(); ++p
)
5019 snaps
.push_back(*p
);
5022 uint64_t SnapSet::get_clone_bytes(snapid_t clone
) const
5024 assert(clone_size
.count(clone
));
5025 uint64_t size
= clone_size
.find(clone
)->second
;
5026 assert(clone_overlap
.count(clone
));
5027 const interval_set
<uint64_t> &overlap
= clone_overlap
.find(clone
)->second
;
5028 for (interval_set
<uint64_t>::const_iterator i
= overlap
.begin();
5031 assert(size
>= i
.get_len());
5032 size
-= i
.get_len();
5037 void SnapSet::filter(const pg_pool_t
&pinfo
)
5039 vector
<snapid_t
> oldsnaps
;
5040 oldsnaps
.swap(snaps
);
5041 for (vector
<snapid_t
>::const_iterator i
= oldsnaps
.begin();
5042 i
!= oldsnaps
.end();
5044 if (!pinfo
.is_removed_snap(*i
))
5045 snaps
.push_back(*i
);
// Const member returning a SnapSet by value, given the pool `pinfo`.
// NOTE(review): the body (embedded lines 5050-5054) is not visible in this
// view; presumably it returns a copy of *this with filter(pinfo) applied --
// confirm against the full file.
5049 SnapSet
SnapSet::get_filtered(const pg_pool_t
&pinfo
) const
5056 // -- watch_info_t --
5058 void watch_info_t::encode(bufferlist
& bl
, uint64_t features
) const
5060 ENCODE_START(4, 3, bl
);
5061 ::encode(cookie
, bl
);
5062 ::encode(timeout_seconds
, bl
);
5063 ::encode(addr
, bl
, features
);
// Deserialize a watch_info_t (DECODE_START_LEGACY_COMPAT_LEN(4, 3, 3)):
// cookie, then timeout_seconds, then a struct_v >= 4 guarded section.
// NOTE(review): embedded lines 5071-5074 (between the cookie and
// timeout_seconds decodes) and 5077 onward (the body of the struct_v >= 4
// branch, presumably the addr decode) are missing from this view.
5067 void watch_info_t::decode(bufferlist::iterator
& bl
)
5069 DECODE_START_LEGACY_COMPAT_LEN(4, 3, 3, bl
);
5070 ::decode(cookie
, bl
);
5075 ::decode(timeout_seconds
, bl
);
5076 if (struct_v
>= 4) {
5082 void watch_info_t::dump(Formatter
*f
) const
5084 f
->dump_unsigned("cookie", cookie
);
5085 f
->dump_unsigned("timeout_seconds", timeout_seconds
);
5086 f
->open_object_section("addr");
// Append two watch_info_t test instances to `o`: a default one, and one with
// cookie 123, timeout 99 and an address `ea` configured as a legacy-type
// AF_INET address whose four quads are set to 127, 0, 1, 2.
// NOTE(review): embedded lines 5097, 5099 and 5105 are missing from this view
// (the declaration of `ea` and possibly further setters) -- confirm.
5091 void watch_info_t::generate_test_instances(list
<watch_info_t
*>& o
)
5093 o
.push_back(new watch_info_t
);
5094 o
.push_back(new watch_info_t
);
5095 o
.back()->cookie
= 123;
5096 o
.back()->timeout_seconds
= 99;
5098 ea
.set_type(entity_addr_t::TYPE_LEGACY
);
5100 ea
.set_family(AF_INET
);
5101 ea
.set_in4_quad(0, 127);
5102 ea
.set_in4_quad(1, 0);
5103 ea
.set_in4_quad(2, 1);
5104 ea
.set_in4_quad(3, 2);
5106 o
.back()->addr
= ea
;
5109 // -- object_manifest_t --
// Serialize this manifest (struct v1).  The visible cases show TYPE_NONE
// writing no payload and another case writing redirect_target.
// NOTE(review): embedded lines 5114-5115, 5117 and 5119-5124 are missing from
// this view (the switch head, the case label guarding redirect_target, and
// any default handling) -- confirm against the full file.
5111 void object_manifest_t::encode(bufferlist
& bl
) const
5113 ENCODE_START(1, 1, bl
);
5116 case TYPE_NONE
: break;
5118 ::encode(redirect_target
, bl
);
// Deserialize a manifest (struct v1).  The visible cases show TYPE_NONE
// reading no payload and another case reading redirect_target.
// NOTE(review): embedded lines 5129-5130, 5132 and 5134-5139 are missing from
// this view (the switch head, the case label guarding redirect_target, and
// any default handling) -- confirm against the full file.
5126 void object_manifest_t::decode(bufferlist::iterator
& bl
)
5128 DECODE_START(1, bl
);
5131 case TYPE_NONE
: break;
5133 ::decode(redirect_target
, bl
);
5141 void object_manifest_t::dump(Formatter
*f
) const
5143 f
->dump_unsigned("type", type
);
5144 f
->open_object_section("redirect_target");
5145 redirect_target
.dump(f
);
5149 void object_manifest_t::generate_test_instances(list
<object_manifest_t
*>& o
)
5151 o
.push_back(new object_manifest_t());
5152 o
.back()->type
= TYPE_REDIRECT
;
5155 ostream
& operator<<(ostream
& out
, const object_manifest_t
& om
)
5157 return out
<< "type:" << om
.type
<< " redirect_target:" << om
.redirect_target
;
5160 // -- object_info_t --
// Copy the user-visible metadata from `other` into this object_info_t:
// mtime, local_mtime, last_reqid, truncate_seq, truncate_size, flags,
// user_version, data_digest and omap_digest.
// NOTE(review): embedded line 5165 (right after the comment below) is missing
// from this view, so a further field may be copied there.
5162 void object_info_t::copy_user_bits(const object_info_t
& other
)
5164 // these bits are copied from head->clone.
5166 mtime
= other
.mtime
;
5167 local_mtime
= other
.local_mtime
;
5168 last_reqid
= other
.last_reqid
;
5169 truncate_seq
= other
.truncate_seq
;
5170 truncate_size
= other
.truncate_size
;
5171 flags
= other
.flags
;
5172 user_version
= other
.user_version
;
5173 data_digest
= other
.data_digest
;
5174 omap_digest
= other
.omap_digest
;
5177 ps_t
object_info_t::legacy_object_locator_to_ps(const object_t
&oid
,
5178 const object_locator_t
&loc
) {
5180 if (loc
.key
.length())
5181 // Hack, we don't have the osd map, so we don't really know the hash...
5182 ps
= ceph_str_hash(CEPH_STR_HASH_RJENKINS
, loc
.key
.c_str(),
5185 ps
= ceph_str_hash(CEPH_STR_HASH_RJENKINS
, oid
.name
.c_str(),
// Serialize this object_info_t under ENCODE_START(17, 8, ...).
// Before encoding, `old_watchers` is built from `watchers`, keyed by the
// entity_name_t half of each key; both the old map and the current `watchers`
// map are written.  Several fields exist only for on-disk compatibility: the
// locator `myoloc`, a zero __u32 where "category" used to be, an empty
// osd_reqid_t where wrlock_by used to be (head objects only), and
// user_version wrapped in an eversion_t.  The manifest is written only when
// has_manifest() is true.
// NOTE(review): embedded lines 5195, 5197, 5199, 5201, 5207 and 5211 are
// missing from this view (loop header pieces, an `else`, and at least two
// statements between the visible encodes) -- the field order shown here is
// therefore incomplete.
5190 void object_info_t::encode(bufferlist
& bl
, uint64_t features
) const
5192 object_locator_t
myoloc(soid
);
5193 map
<entity_name_t
, watch_info_t
> old_watchers
;
5194 for (map
<pair
<uint64_t, entity_name_t
>, watch_info_t
>::const_iterator i
=
5196 i
!= watchers
.end();
5198 old_watchers
.insert(make_pair(i
->first
.second
, i
->second
));
5200 ENCODE_START(17, 8, bl
);
5202 ::encode(myoloc
, bl
); //Retained for compatibility
5203 ::encode((__u32
)0, bl
); // was category, no longer used
5204 ::encode(version
, bl
);
5205 ::encode(prior_version
, bl
);
5206 ::encode(last_reqid
, bl
);
5208 ::encode(mtime
, bl
);
5209 if (soid
.snap
== CEPH_NOSNAP
)
5210 ::encode(osd_reqid_t(), bl
); // used to be wrlock_by
5212 ::encode(legacy_snaps
, bl
);
5213 ::encode(truncate_seq
, bl
);
5214 ::encode(truncate_size
, bl
);
5215 ::encode(is_lost(), bl
);
5216 ::encode(old_watchers
, bl
, features
);
5217 /* shenanigans to avoid breaking backwards compatibility in the disk format.
5218 * When we can, switch this out for simply putting the version_t on disk. */
5219 eversion_t
user_eversion(0, user_version
);
5220 ::encode(user_eversion
, bl
);
5221 ::encode(test_flag(FLAG_USES_TMAP
), bl
);
5222 ::encode(watchers
, bl
, features
);
5223 __u32 _flags
= flags
;
5224 ::encode(_flags
, bl
);
5225 ::encode(local_mtime
, bl
);
5226 ::encode(data_digest
, bl
);
5227 ::encode(omap_digest
, bl
);
5228 ::encode(expected_object_size
, bl
);
5229 ::encode(expected_write_size
, bl
);
5230 ::encode(alloc_hint_flags
, bl
);
5231 if (has_manifest()) {
5232 ::encode(manifest
, bl
);
// Deserialize an object_info_t (DECODE_START_LEGACY_COMPAT_LEN(17, 8, 8)).
// Version-gated sections visible here:
//   struct_v >= 9  : a uses_tmap bool is read; FLAG_USES_TMAP may be set
//   struct_v >= 11 : `watchers` is read directly; the other visible path
//                    rebuilds it from `old_watchers`, keyed by (cookie, name)
//   struct_v >= 13 : flags are read as a 32-bit value
//   struct_v >= 14 : local_mtime is read (reset to utime_t() otherwise)
//   struct_v >= 15 : data/omap digests are read (otherwise both set to -1 and
//                    their flags cleared)
//   struct_v >= 16 : allocation hints are read (zeroed otherwise)
//   struct_v >= 17 : manifest is read when has_manifest() is true
// NOTE(review): many embedded lines are missing from this view (e.g. 5242,
// 5244-5245, 5251, 5264-5268, 5277, 5279, 5282, 5286, 5289-5291, 5296 and the
// `else` lines), including declarations of `category` and `_flags` and the
// guards around the FLAG_USES_TMAP / soid.pool assignments -- confirm the
// exact conditions against the full file.
5237 void object_info_t::decode(bufferlist::iterator
& bl
)
5239 object_locator_t myoloc
;
5240 DECODE_START_LEGACY_COMPAT_LEN(17, 8, 8, bl
);
5241 map
<entity_name_t
, watch_info_t
> old_watchers
;
5243 ::decode(myoloc
, bl
);
5246 ::decode(category
, bl
); // no longer used
5248 ::decode(version
, bl
);
5249 ::decode(prior_version
, bl
);
5250 ::decode(last_reqid
, bl
);
5252 ::decode(mtime
, bl
);
5253 if (soid
.snap
== CEPH_NOSNAP
) {
5254 osd_reqid_t wrlock_by
;
5255 ::decode(wrlock_by
, bl
);
5257 ::decode(legacy_snaps
, bl
);
5259 ::decode(truncate_seq
, bl
);
5260 ::decode(truncate_size
, bl
);
5262 // if this is struct_v >= 13, we will overwrite this
5263 // below since this field is just here for backwards
5269 ::decode(old_watchers
, bl
);
5270 eversion_t user_eversion
;
5271 ::decode(user_eversion
, bl
);
5272 user_version
= user_eversion
.version
;
5274 if (struct_v
>= 9) {
5275 bool uses_tmap
= false;
5276 ::decode(uses_tmap
, bl
);
5278 set_flag(FLAG_USES_TMAP
);
5280 set_flag(FLAG_USES_TMAP
);
5283 soid
.pool
= myoloc
.pool
;
5284 if (struct_v
>= 11) {
5285 ::decode(watchers
, bl
);
5287 for (map
<entity_name_t
, watch_info_t
>::iterator i
= old_watchers
.begin();
5288 i
!= old_watchers
.end();
5292 make_pair(i
->second
.cookie
, i
->first
), i
->second
));
5295 if (struct_v
>= 13) {
5297 ::decode(_flags
, bl
);
5298 flags
= (flag_t
)_flags
;
5300 if (struct_v
>= 14) {
5301 ::decode(local_mtime
, bl
);
5303 local_mtime
= utime_t();
5305 if (struct_v
>= 15) {
5306 ::decode(data_digest
, bl
);
5307 ::decode(omap_digest
, bl
);
5309 data_digest
= omap_digest
= -1;
5310 clear_flag(FLAG_DATA_DIGEST
);
5311 clear_flag(FLAG_OMAP_DIGEST
);
5313 if (struct_v
>= 16) {
5314 ::decode(expected_object_size
, bl
);
5315 ::decode(expected_write_size
, bl
);
5316 ::decode(alloc_hint_flags
, bl
);
5318 expected_object_size
= 0;
5319 expected_write_size
= 0;
5320 alloc_hint_flags
= 0;
5322 if (struct_v
>= 17) {
5323 if (has_manifest()) {
5324 ::decode(manifest
, bl
);
5330 void object_info_t::dump(Formatter
*f
) const
5332 f
->open_object_section("oid");
5335 f
->dump_stream("version") << version
;
5336 f
->dump_stream("prior_version") << prior_version
;
5337 f
->dump_stream("last_reqid") << last_reqid
;
5338 f
->dump_unsigned("user_version", user_version
);
5339 f
->dump_unsigned("size", size
);
5340 f
->dump_stream("mtime") << mtime
;
5341 f
->dump_stream("local_mtime") << local_mtime
;
5342 f
->dump_unsigned("lost", (int)is_lost());
5343 vector
<string
> sv
= get_flag_vector(flags
);
5344 f
->open_array_section("flags");
5346 f
->dump_string("flags", str
);
5348 f
->open_array_section("legacy_snaps");
5349 for (auto s
: legacy_snaps
) {
5350 f
->dump_unsigned("snap", s
);
5353 f
->dump_unsigned("truncate_seq", truncate_seq
);
5354 f
->dump_unsigned("truncate_size", truncate_size
);
5355 f
->dump_format("data_digest", "0x%08x", data_digest
);
5356 f
->dump_format("omap_digest", "0x%08x", omap_digest
);
5357 f
->dump_unsigned("expected_object_size", expected_object_size
);
5358 f
->dump_unsigned("expected_write_size", expected_write_size
);
5359 f
->dump_unsigned("alloc_hint_flags", alloc_hint_flags
);
5360 f
->dump_object("manifest", manifest
);
5361 f
->open_object_section("watchers");
5362 for (map
<pair
<uint64_t, entity_name_t
>,watch_info_t
>::const_iterator p
=
5363 watchers
.begin(); p
!= watchers
.end(); ++p
) {
5365 ss
<< p
->first
.second
;
5366 f
->open_object_section(ss
.str().c_str());
5373 void object_info_t::generate_test_instances(list
<object_info_t
*>& o
)
5375 o
.push_back(new object_info_t());
// Stream a compact summary of an object_info_t: "<soid>(<version>
// <last_reqid>", the legacy snaps for non-head objects that have any, the flag
// string, " s <size>", " uv <user_version>", the data/omap digests in hex when
// their flags are set, the allocation hints in brackets, and the manifest
// when present.
// NOTE(review): embedded line 5387 (just before the flag-string output) and
// 5400-5403 (after the manifest output, presumably the closing ")" and the
// return) are missing from this view -- confirm.
5381 ostream
& operator<<(ostream
& out
, const object_info_t
& oi
)
5383 out
<< oi
.soid
<< "(" << oi
.version
5384 << " " << oi
.last_reqid
;
5385 if (oi
.soid
.snap
!= CEPH_NOSNAP
&& !oi
.legacy_snaps
.empty())
5386 out
<< " " << oi
.legacy_snaps
;
5388 out
<< " " << oi
.get_flag_string();
5389 out
<< " s " << oi
.size
;
5390 out
<< " uv " << oi
.user_version
;
5391 if (oi
.is_data_digest())
5392 out
<< " dd " << std::hex
<< oi
.data_digest
<< std::dec
;
5393 if (oi
.is_omap_digest())
5394 out
<< " od " << std::hex
<< oi
.omap_digest
<< std::dec
;
5395 out
<< " alloc_hint [" << oi
.expected_object_size
5396 << " " << oi
.expected_write_size
5397 << " " << oi
.alloc_hint_flags
<< "]";
5398 if (oi
.has_manifest())
5399 out
<< " " << oi
.manifest
;
5405 // -- ObjectRecovery --
5406 void ObjectRecoveryProgress::encode(bufferlist
&bl
) const
5408 ENCODE_START(1, 1, bl
);
5409 ::encode(first
, bl
);
5410 ::encode(data_complete
, bl
);
5411 ::encode(data_recovered_to
, bl
);
5412 ::encode(omap_recovered_to
, bl
);
5413 ::encode(omap_complete
, bl
);
5417 void ObjectRecoveryProgress::decode(bufferlist::iterator
&bl
)
5419 DECODE_START(1, bl
);
5420 ::decode(first
, bl
);
5421 ::decode(data_complete
, bl
);
5422 ::decode(data_recovered_to
, bl
);
5423 ::decode(omap_recovered_to
, bl
);
5424 ::decode(omap_complete
, bl
);
5428 ostream
&operator<<(ostream
&out
, const ObjectRecoveryProgress
&prog
)
5430 return prog
.print(out
);
5433 void ObjectRecoveryProgress::generate_test_instances(
5434 list
<ObjectRecoveryProgress
*>& o
)
5436 o
.push_back(new ObjectRecoveryProgress
);
5437 o
.back()->first
= false;
5438 o
.back()->data_complete
= true;
5439 o
.back()->omap_complete
= true;
5440 o
.back()->data_recovered_to
= 100;
5442 o
.push_back(new ObjectRecoveryProgress
);
5443 o
.back()->first
= true;
5444 o
.back()->data_complete
= false;
5445 o
.back()->omap_complete
= false;
5446 o
.back()->data_recovered_to
= 0;
5449 ostream
&ObjectRecoveryProgress::print(ostream
&out
) const
5451 return out
<< "ObjectRecoveryProgress("
5452 << ( first
? "" : "!" ) << "first, "
5453 << "data_recovered_to:" << data_recovered_to
5454 << ", data_complete:" << ( data_complete
? "true" : "false" )
5455 << ", omap_recovered_to:" << omap_recovered_to
5456 << ", omap_complete:" << ( omap_complete
? "true" : "false" )
5457 << ", error:" << ( error
? "true" : "false" )
5461 void ObjectRecoveryProgress::dump(Formatter
*f
) const
5463 f
->dump_int("first?", first
);
5464 f
->dump_int("data_complete?", data_complete
);
5465 f
->dump_unsigned("data_recovered_to", data_recovered_to
);
5466 f
->dump_int("omap_complete?", omap_complete
);
5467 f
->dump_string("omap_recovered_to", omap_recovered_to
);
// Serialize this recovery info under ENCODE_START(2, 1, ...).  Visible
// fields, in order: version, oi (feature-dependent), copy_subset,
// clone_subset.
// NOTE(review): embedded lines 5473, 5475 and 5477 are missing from this view,
// so additional fields are presumably written between the visible ones
// (dump() and print() reference soid, size and ss) -- confirm.
5470 void ObjectRecoveryInfo::encode(bufferlist
&bl
, uint64_t features
) const
5472 ENCODE_START(2, 1, bl
);
5474 ::encode(version
, bl
);
5476 ::encode(oi
, bl
, features
);
5478 ::encode(copy_subset
, bl
);
5479 ::encode(clone_subset
, bl
);
// Deserialize recovery info (DECODE_START(2, ...)): version, copy_subset and
// clone_subset are visible.  Afterwards there is fix-up logic for objects
// whose pool is -1 (and which are not "max"): clone_subset is swapped into a
// temporary map and re-inserted under a copy of each key.
// NOTE(review): the rest of the signature (embedded 5484) and lines 5487,
// 5489-5491, 5494-5496, 5498, 5502-5503, 5506 are missing from this view --
// including the statements executed when the pool == -1 tests succeed
// (presumably assigning a pool) and any struct_v guard.  Confirm.
5483 void ObjectRecoveryInfo::decode(bufferlist::iterator
&bl
,
5486 DECODE_START(2, bl
);
5488 ::decode(version
, bl
);
5492 ::decode(copy_subset
, bl
);
5493 ::decode(clone_subset
, bl
);
5497 if (!soid
.is_max() && soid
.pool
== -1)
5499 map
<hobject_t
, interval_set
<uint64_t>> tmp
;
5500 tmp
.swap(clone_subset
);
5501 for (map
<hobject_t
, interval_set
<uint64_t>>::iterator i
= tmp
.begin();
5504 hobject_t
first(i
->first
);
5505 if (!first
.is_max() && first
.pool
== -1)
5507 clone_subset
[first
].swap(i
->second
);
5512 void ObjectRecoveryInfo::generate_test_instances(
5513 list
<ObjectRecoveryInfo
*>& o
)
5515 o
.push_back(new ObjectRecoveryInfo
);
5516 o
.back()->soid
= hobject_t(sobject_t("key", CEPH_NOSNAP
));
5517 o
.back()->version
= eversion_t(0,0);
5518 o
.back()->size
= 100;
5522 void ObjectRecoveryInfo::dump(Formatter
*f
) const
5524 f
->dump_stream("object") << soid
;
5525 f
->dump_stream("at_version") << version
;
5526 f
->dump_stream("size") << size
;
5528 f
->open_object_section("object_info");
5533 f
->open_object_section("snapset");
5537 f
->dump_stream("copy_subset") << copy_subset
;
5538 f
->dump_stream("clone_subset") << clone_subset
;
5541 ostream
& operator<<(ostream
& out
, const ObjectRecoveryInfo
&inf
)
5543 return inf
.print(out
);
5546 ostream
&ObjectRecoveryInfo::print(ostream
&out
) const
5548 return out
<< "ObjectRecoveryInfo("
5549 << soid
<< "@" << version
5550 << ", size: " << size
5551 << ", copy_subset: " << copy_subset
5552 << ", clone_subset: " << clone_subset
5553 << ", snapset: " << ss
5557 // -- PushReplyOp --
5558 void PushReplyOp::generate_test_instances(list
<PushReplyOp
*> &o
)
5560 o
.push_back(new PushReplyOp
);
5561 o
.push_back(new PushReplyOp
);
5562 o
.back()->soid
= hobject_t(sobject_t("asdf", 2));
5563 o
.push_back(new PushReplyOp
);
5564 o
.back()->soid
= hobject_t(sobject_t("asdf", CEPH_NOSNAP
));
// Serialize this reply under ENCODE_START(1, 1, ...).
// NOTE(review): the payload statement(s) (embedded 5570 onward) are missing
// from this view; dump() and print() reference only `soid`, so that is
// presumably the single field written -- confirm.
5567 void PushReplyOp::encode(bufferlist
&bl
) const
5569 ENCODE_START(1, 1, bl
);
// Deserialize a reply (DECODE_START(1, ...)).
// NOTE(review): the payload statement(s) (embedded 5577 onward) are missing
// from this view; presumably the mirror of encode() -- confirm.
5574 void PushReplyOp::decode(bufferlist::iterator
&bl
)
5576 DECODE_START(1, bl
);
5581 void PushReplyOp::dump(Formatter
*f
) const
5583 f
->dump_stream("soid") << soid
;
5586 ostream
&PushReplyOp::print(ostream
&out
) const
5589 << "PushReplyOp(" << soid
5593 ostream
& operator<<(ostream
& out
, const PushReplyOp
&op
)
5595 return op
.print(out
);
5598 uint64_t PushReplyOp::cost(CephContext
*cct
) const
5601 return cct
->_conf
->osd_push_per_object_cost
+
5602 cct
->_conf
->osd_recovery_max_chunk
;
5606 void PullOp::generate_test_instances(list
<PullOp
*> &o
)
5608 o
.push_back(new PullOp
);
5609 o
.push_back(new PullOp
);
5610 o
.back()->soid
= hobject_t(sobject_t("asdf", 2));
5611 o
.back()->recovery_info
.version
= eversion_t(3, 10);
5612 o
.push_back(new PullOp
);
5613 o
.back()->soid
= hobject_t(sobject_t("asdf", CEPH_NOSNAP
));
5614 o
.back()->recovery_info
.version
= eversion_t(0, 0);
// Serialize this pull request under ENCODE_START(1, 1, ...).  Visible fields:
// recovery_info (feature-dependent) followed by recovery_progress.
// NOTE(review): embedded line 5620 (before recovery_info) is missing from this
// view; dump() emits `soid` first, so presumably that is written there --
// confirm.
5617 void PullOp::encode(bufferlist
&bl
, uint64_t features
) const
5619 ENCODE_START(1, 1, bl
);
5621 ::encode(recovery_info
, bl
, features
);
5622 ::encode(recovery_progress
, bl
);
// Deserialize a pull request (DECODE_START(1, ...)).  Visible fields:
// recovery_info followed by recovery_progress.
// NOTE(review): embedded line 5629 (before recovery_info) is missing from this
// view; presumably the mirror of encode() -- confirm.
5626 void PullOp::decode(bufferlist::iterator
&bl
)
5628 DECODE_START(1, bl
);
5630 ::decode(recovery_info
, bl
);
5631 ::decode(recovery_progress
, bl
);
5635 void PullOp::dump(Formatter
*f
) const
5637 f
->dump_stream("soid") << soid
;
5639 f
->open_object_section("recovery_info");
5640 recovery_info
.dump(f
);
5644 f
->open_object_section("recovery_progress");
5645 recovery_progress
.dump(f
);
5650 ostream
&PullOp::print(ostream
&out
) const
5653 << "PullOp(" << soid
5654 << ", recovery_info: " << recovery_info
5655 << ", recovery_progress: " << recovery_progress
5659 ostream
& operator<<(ostream
& out
, const PullOp
&op
)
5661 return op
.print(out
);
5664 uint64_t PullOp::cost(CephContext
*cct
) const
5666 return cct
->_conf
->osd_push_per_object_cost
+
5667 cct
->_conf
->osd_recovery_max_chunk
;
5671 void PushOp::generate_test_instances(list
<PushOp
*> &o
)
5673 o
.push_back(new PushOp
);
5674 o
.push_back(new PushOp
);
5675 o
.back()->soid
= hobject_t(sobject_t("asdf", 2));
5676 o
.back()->version
= eversion_t(3, 10);
5677 o
.push_back(new PushOp
);
5678 o
.back()->soid
= hobject_t(sobject_t("asdf", CEPH_NOSNAP
));
5679 o
.back()->version
= eversion_t(0, 0);
// Serialize this push under ENCODE_START(1, 1, ...).  Visible fields, in
// order: version, data_included, omap_header, omap_entries, attrset,
// recovery_info (feature-dependent), after_progress, before_progress.
// NOTE(review): embedded lines 5685 and 5687 are missing from this view;
// dump() references `soid` and `data`, so those are presumably written at the
// gaps -- confirm.
5682 void PushOp::encode(bufferlist
&bl
, uint64_t features
) const
5684 ENCODE_START(1, 1, bl
);
5686 ::encode(version
, bl
);
5688 ::encode(data_included
, bl
);
5689 ::encode(omap_header
, bl
);
5690 ::encode(omap_entries
, bl
);
5691 ::encode(attrset
, bl
);
5692 ::encode(recovery_info
, bl
, features
);
5693 ::encode(after_progress
, bl
);
5694 ::encode(before_progress
, bl
);
// Deserialize a push (DECODE_START(1, ...)).  Visible fields, in order:
// version, data_included, omap_header, omap_entries, attrset, recovery_info,
// after_progress, before_progress.
// NOTE(review): embedded lines 5701 and 5703 are missing from this view;
// presumably the mirror of the gaps in encode() -- confirm.
5698 void PushOp::decode(bufferlist::iterator
&bl
)
5700 DECODE_START(1, bl
);
5702 ::decode(version
, bl
);
5704 ::decode(data_included
, bl
);
5705 ::decode(omap_header
, bl
);
5706 ::decode(omap_entries
, bl
);
5707 ::decode(attrset
, bl
);
5708 ::decode(recovery_info
, bl
);
5709 ::decode(after_progress
, bl
);
5710 ::decode(before_progress
, bl
);
5714 void PushOp::dump(Formatter
*f
) const
5716 f
->dump_stream("soid") << soid
;
5717 f
->dump_stream("version") << version
;
5718 f
->dump_int("data_len", data
.length());
5719 f
->dump_stream("data_included") << data_included
;
5720 f
->dump_int("omap_header_len", omap_header
.length());
5721 f
->dump_int("omap_entries_len", omap_entries
.size());
5722 f
->dump_int("attrset_len", attrset
.size());
5724 f
->open_object_section("recovery_info");
5725 recovery_info
.dump(f
);
5729 f
->open_object_section("after_progress");
5730 after_progress
.dump(f
);
5734 f
->open_object_section("before_progress");
5735 before_progress
.dump(f
);
5740 ostream
&PushOp::print(ostream
&out
) const
5743 << "PushOp(" << soid
5744 << ", version: " << version
5745 << ", data_included: " << data_included
5746 << ", data_size: " << data
.length()
5747 << ", omap_header_size: " << omap_header
.length()
5748 << ", omap_entries_size: " << omap_entries
.size()
5749 << ", attrset_size: " << attrset
.size()
5750 << ", recovery_info: " << recovery_info
5751 << ", after_progress: " << after_progress
5752 << ", before_progress: " << before_progress
5756 ostream
& operator<<(ostream
& out
, const PushOp
&op
)
5758 return op
.print(out
);
5761 uint64_t PushOp::cost(CephContext
*cct
) const
5763 uint64_t cost
= data_included
.size();
5764 for (map
<string
, bufferlist
>::const_iterator i
=
5765 omap_entries
.begin();
5766 i
!= omap_entries
.end();
5768 cost
+= i
->second
.length();
5770 cost
+= cct
->_conf
->osd_push_per_object_cost
;
5776 void ScrubMap::merge_incr(const ScrubMap
&l
)
5778 assert(valid_through
== l
.incr_since
);
5779 valid_through
= l
.valid_through
;
5781 for (map
<hobject_t
,object
>::const_iterator p
= l
.objects
.begin();
5782 p
!= l
.objects
.end();
5784 if (p
->second
.negative
) {
5785 map
<hobject_t
,object
>::iterator q
= objects
.find(p
->first
);
5786 if (q
!= objects
.end()) {
5790 objects
[p
->first
] = p
->second
;
5795 void ScrubMap::encode(bufferlist
& bl
) const
5797 ENCODE_START(3, 2, bl
);
5798 ::encode(objects
, bl
);
5799 ::encode((__u32
)0, bl
); // used to be attrs; now deprecated
5800 bufferlist old_logbl
; // not used
5801 ::encode(old_logbl
, bl
);
5802 ::encode(valid_through
, bl
);
5803 ::encode(incr_since
, bl
);
// Deserialize a scrub map (DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2)): objects,
// the deprecated attrs map and old log buffer (read and discarded), then
// valid_through and incr_since.  An "hobject_t upgrade" step follows that
// re-inserts objects under a copy of their key, with special handling for
// keys whose pool is -1 and which are not "max".
// NOTE(review): embedded lines 5819-5820, 5822, 5824, 5826-5827 and 5830 are
// missing from this view -- including the condition guarding the upgrade, how
// `tmp` is populated, and the statement run when pool == -1 (presumably
// assigning the `pool` argument).  Confirm against the full file.
5807 void ScrubMap::decode(bufferlist::iterator
& bl
, int64_t pool
)
5809 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl
);
5810 ::decode(objects
, bl
);
5812 map
<string
,string
> attrs
; // deprecated
5813 ::decode(attrs
, bl
);
5815 bufferlist old_logbl
; // not used
5816 ::decode(old_logbl
, bl
);
5817 ::decode(valid_through
, bl
);
5818 ::decode(incr_since
, bl
);
5821 // handle hobject_t upgrade
5823 map
<hobject_t
, object
> tmp
;
5825 for (map
<hobject_t
, object
>::iterator i
= tmp
.begin();
5828 hobject_t
first(i
->first
);
5829 if (!first
.is_max() && first
.pool
== -1)
5831 objects
[first
] = i
->second
;
5836 void ScrubMap::dump(Formatter
*f
) const
5838 f
->dump_stream("valid_through") << valid_through
;
5839 f
->dump_stream("incremental_since") << incr_since
;
5840 f
->open_array_section("objects");
5841 for (map
<hobject_t
,object
>::const_iterator p
= objects
.begin(); p
!= objects
.end(); ++p
) {
5842 f
->open_object_section("object");
5843 f
->dump_string("name", p
->first
.oid
.name
);
5844 f
->dump_unsigned("hash", p
->first
.get_hash());
5845 f
->dump_string("key", p
->first
.get_key());
5846 f
->dump_int("snapid", p
->first
.snap
);
// Append two ScrubMap test instances to `o`: a default one, and one with
// valid_through (1,2), incr_since (3,4) and two objects ("foo" with key
// "fookey", and "bar" with an empty key) whose values are taken from the back
// of the list filled by object::generate_test_instances().
// NOTE(review): embedded lines 5859 and 5862 are missing from this view (the
// declaration of `obj`, and a statement between the two insertions that may
// change which element obj.back() refers to) -- confirm.
5853 void ScrubMap::generate_test_instances(list
<ScrubMap
*>& o
)
5855 o
.push_back(new ScrubMap
);
5856 o
.push_back(new ScrubMap
);
5857 o
.back()->valid_through
= eversion_t(1, 2);
5858 o
.back()->incr_since
= eversion_t(3, 4);
5860 object::generate_test_instances(obj
);
5861 o
.back()->objects
[hobject_t(object_t("foo"), "fookey", 123, 456, 0, "")] = *obj
.back();
5863 o
.back()->objects
[hobject_t(object_t("bar"), string(), 123, 456, 0, "")] = *obj
.back();
5866 // -- ScrubMap::object --
// Serialize one scrub-map object under ENCODE_START(9, 7, ...).
// `compat_read_error` is the OR of read_error, ec_hash_mismatch and
// ec_size_mismatch and is written ahead of the three individual flags.  Two
// zero uint32 placeholders stand in for the obsolete nlinks and snapcolls
// fields.  The large-omap fields are written last.
// NOTE(review): embedded line 5872 (between ENCODE_START and `negative`) is
// missing from this view; dump() emits `size`, so presumably that is written
// there -- confirm.
5868 void ScrubMap::object::encode(bufferlist
& bl
) const
5870 bool compat_read_error
= read_error
|| ec_hash_mismatch
|| ec_size_mismatch
;
5871 ENCODE_START(9, 7, bl
);
5873 ::encode(negative
, bl
);
5874 ::encode(attrs
, bl
);
5875 ::encode(digest
, bl
);
5876 ::encode(digest_present
, bl
);
5877 ::encode((uint32_t)0, bl
); // obsolete nlinks
5878 ::encode((uint32_t)0, bl
); // snapcolls
5879 ::encode(omap_digest
, bl
);
5880 ::encode(omap_digest_present
, bl
);
5881 ::encode(compat_read_error
, bl
);
5882 ::encode(stat_error
, bl
);
5883 ::encode(read_error
, bl
);
5884 ::encode(ec_hash_mismatch
, bl
);
5885 ::encode(ec_size_mismatch
, bl
);
5886 ::encode(large_omap_object_found
, bl
);
5887 ::encode(large_omap_object_key_count
, bl
);
5888 ::encode(large_omap_object_value_size
, bl
);
// Deserialize one scrub-map object (DECODE_START(9, ...)).  Boolean members
// are assigned from the local `tmp` after it has been decoded; the obsolete
// nlinks and snapcolls values are read and discarded.  Fields added in
// struct_v 8 and 9 are guarded by version checks, and a read error recorded
// only in `compat_read_error` by an older encoder is carried over.
// NOTE(review): many embedded lines are missing from this view (5895,
// 5897-5898, 5901, 5903-5904, 5908, 5910, 5913-5914, 5916-5918, 5920, 5922,
// 5925, 5927), including every `::decode(tmp, bl)`, the declaration of
// `nlinks`, and the body of the compat_read_error check -- confirm.
5892 void ScrubMap::object::decode(bufferlist::iterator
& bl
)
5894 DECODE_START(9, bl
);
5896 bool tmp
, compat_read_error
= false;
5899 ::decode(attrs
, bl
);
5900 ::decode(digest
, bl
);
5902 digest_present
= tmp
;
5905 ::decode(nlinks
, bl
);
5906 set
<snapid_t
> snapcolls
;
5907 ::decode(snapcolls
, bl
);
5909 ::decode(omap_digest
, bl
);
5911 omap_digest_present
= tmp
;
5912 ::decode(compat_read_error
, bl
);
5915 if (struct_v
>= 8) {
5919 ec_hash_mismatch
= tmp
;
5921 ec_size_mismatch
= tmp
;
5923 // If older encoder found a read_error, set read_error
5924 if (compat_read_error
&& !read_error
&& !ec_hash_mismatch
&& !ec_size_mismatch
)
5926 if (struct_v
>= 9) {
5928 large_omap_object_found
= tmp
;
5929 ::decode(large_omap_object_key_count
, bl
);
5930 ::decode(large_omap_object_value_size
, bl
);
5935 void ScrubMap::object::dump(Formatter
*f
) const
5937 f
->dump_int("size", size
);
5938 f
->dump_int("negative", negative
);
5939 f
->open_array_section("attrs");
5940 for (map
<string
,bufferptr
>::const_iterator p
= attrs
.begin(); p
!= attrs
.end(); ++p
) {
5941 f
->open_object_section("attr");
5942 f
->dump_string("name", p
->first
);
5943 f
->dump_int("length", p
->second
.length());
5949 void ScrubMap::object::generate_test_instances(list
<object
*>& o
)
5951 o
.push_back(new object
);
5952 o
.push_back(new object
);
5953 o
.back()->negative
= true;
5954 o
.push_back(new object
);
5955 o
.back()->size
= 123;
5956 o
.back()->attrs
["foo"] = buffer::copy("foo", 3);
5957 o
.back()->attrs
["bar"] = buffer::copy("barval", 6);
// Stream a human-readable description of an OSD op: the op name, then
// details chosen by op class:
//  - data ops: per-opcode fields (assert version, truncate offset,
//    truncate_seq@truncate_size, rollback snapid, watch sub-op/cookie/gen,
//    notify cookie, copy-get max, copy-from source version, alloc hints,
//    offset~length extents with optional truncate info and op flags)
//  - attr ops: the xattr name taken from the first name_len bytes of indata,
//    the value length, and for CMPXATTR the comparison op and mode
//  - exec ops: class and method names taken from indata
//  - pg ops: start epoch for the PGLS family, the stamp for PG_HITSET_GET
// NOTE(review): a large number of embedded lines are missing from this view
// (switch heads, `break`/`default` lines, separators written around the
// names, and the function tail), so case fall-through and exact spacing
// cannot be confirmed from here.
5962 ostream
& operator<<(ostream
& out
, const OSDOp
& op
)
5964 out
<< ceph_osd_op_name(op
.op
.op
);
5965 if (ceph_osd_op_type_data(op
.op
.op
)) {
5968 case CEPH_OSD_OP_ASSERT_VER
:
5969 out
<< " v" << op
.op
.assert_ver
.ver
;
5971 case CEPH_OSD_OP_TRUNCATE
:
5972 out
<< " " << op
.op
.extent
.offset
;
5974 case CEPH_OSD_OP_MASKTRUNC
:
5975 case CEPH_OSD_OP_TRIMTRUNC
:
5976 out
<< " " << op
.op
.extent
.truncate_seq
<< "@"
5977 << (int64_t)op
.op
.extent
.truncate_size
;
5979 case CEPH_OSD_OP_ROLLBACK
:
5980 out
<< " " << snapid_t(op
.op
.snap
.snapid
);
5982 case CEPH_OSD_OP_WATCH
:
5983 out
<< " " << ceph_osd_watch_op_name(op
.op
.watch
.op
)
5984 << " cookie " << op
.op
.watch
.cookie
;
5985 if (op
.op
.watch
.gen
)
5986 out
<< " gen " << op
.op
.watch
.gen
;
5988 case CEPH_OSD_OP_NOTIFY
:
5989 case CEPH_OSD_OP_NOTIFY_ACK
:
5990 out
<< " cookie " << op
.op
.notify
.cookie
;
5992 case CEPH_OSD_OP_COPY_GET
:
5993 out
<< " max " << op
.op
.copy_get
.max
;
5995 case CEPH_OSD_OP_COPY_FROM
:
5996 out
<< " ver " << op
.op
.copy_from
.src_version
;
5998 case CEPH_OSD_OP_SETALLOCHINT
:
5999 out
<< " object_size " << op
.op
.alloc_hint
.expected_object_size
6000 << " write_size " << op
.op
.alloc_hint
.expected_write_size
;
6002 case CEPH_OSD_OP_READ
:
6003 case CEPH_OSD_OP_SPARSE_READ
:
6004 case CEPH_OSD_OP_SYNC_READ
:
6005 case CEPH_OSD_OP_WRITE
:
6006 case CEPH_OSD_OP_WRITEFULL
:
6007 case CEPH_OSD_OP_ZERO
:
6008 case CEPH_OSD_OP_APPEND
:
6009 case CEPH_OSD_OP_MAPEXT
:
6010 out
<< " " << op
.op
.extent
.offset
<< "~" << op
.op
.extent
.length
;
6011 if (op
.op
.extent
.truncate_seq
)
6012 out
<< " [" << op
.op
.extent
.truncate_seq
<< "@"
6013 << (int64_t)op
.op
.extent
.truncate_size
<< "]";
6015 out
<< " [" << ceph_osd_op_flag_string(op
.op
.flags
) << "]";
6017 // don't show any arg info
6020 } else if (ceph_osd_op_type_attr(op
.op
.op
)) {
6022 if (op
.op
.xattr
.name_len
&& op
.indata
.length()) {
6024 op
.indata
.write(0, op
.op
.xattr
.name_len
, out
);
6026 if (op
.op
.xattr
.value_len
)
6027 out
<< " (" << op
.op
.xattr
.value_len
<< ")";
6028 if (op
.op
.op
== CEPH_OSD_OP_CMPXATTR
)
6029 out
<< " op " << (int)op
.op
.xattr
.cmp_op
6030 << " mode " << (int)op
.op
.xattr
.cmp_mode
;
6031 } else if (ceph_osd_op_type_exec(op
.op
.op
)) {
6033 if (op
.op
.cls
.class_len
&& op
.indata
.length()) {
6035 op
.indata
.write(0, op
.op
.cls
.class_len
, out
);
6037 op
.indata
.write(op
.op
.cls
.class_len
, op
.op
.cls
.method_len
, out
);
6039 } else if (ceph_osd_op_type_pg(op
.op
.op
)) {
6041 case CEPH_OSD_OP_PGLS
:
6042 case CEPH_OSD_OP_PGLS_FILTER
:
6043 case CEPH_OSD_OP_PGNLS
:
6044 case CEPH_OSD_OP_PGNLS_FILTER
:
6045 out
<< " start_epoch " << op
.op
.pgls
.start_epoch
;
6047 case CEPH_OSD_OP_PG_HITSET_LS
:
6049 case CEPH_OSD_OP_PG_HITSET_GET
:
6050 out
<< " " << utime_t(op
.op
.hit_set_get
.stamp
);
6052 case CEPH_OSD_OP_SCRUBLS
:
6060 void OSDOp::split_osd_op_vector_in_data(vector
<OSDOp
>& ops
, bufferlist
& in
)
6062 bufferlist::iterator datap
= in
.begin();
6063 for (unsigned i
= 0; i
< ops
.size(); i
++) {
6064 if (ops
[i
].op
.payload_len
) {
6065 datap
.copy(ops
[i
].op
.payload_len
, ops
[i
].indata
);
6070 void OSDOp::merge_osd_op_vector_in_data(vector
<OSDOp
>& ops
, bufferlist
& out
)
6072 for (unsigned i
= 0; i
< ops
.size(); i
++) {
6073 if (ops
[i
].indata
.length()) {
6074 ops
[i
].op
.payload_len
= ops
[i
].indata
.length();
6075 out
.append(ops
[i
].indata
);
6080 void OSDOp::split_osd_op_vector_out_data(vector
<OSDOp
>& ops
, bufferlist
& in
)
6082 bufferlist::iterator datap
= in
.begin();
6083 for (unsigned i
= 0; i
< ops
.size(); i
++) {
6084 if (ops
[i
].op
.payload_len
) {
6085 datap
.copy(ops
[i
].op
.payload_len
, ops
[i
].outdata
);
6090 void OSDOp::merge_osd_op_vector_out_data(vector
<OSDOp
>& ops
, bufferlist
& out
)
6092 for (unsigned i
= 0; i
< ops
.size(); i
++) {
6093 if (ops
[i
].outdata
.length()) {
6094 ops
[i
].op
.payload_len
= ops
[i
].outdata
.length();
6095 out
.append(ops
[i
].outdata
);
6100 bool store_statfs_t::operator==(const store_statfs_t
& other
) const
6102 return total
== other
.total
6103 && available
== other
.available
6104 && allocated
== other
.allocated
6105 && stored
== other
.stored
6106 && compressed
== other
.compressed
6107 && compressed_allocated
== other
.compressed_allocated
6108 && compressed_original
== other
.compressed_original
;
6111 void store_statfs_t::dump(Formatter
*f
) const
6113 f
->dump_int("total", total
);
6114 f
->dump_int("available", available
);
6115 f
->dump_int("allocated", allocated
);
6116 f
->dump_int("stored", stored
);
6117 f
->dump_int("compressed", compressed
);
6118 f
->dump_int("compressed_allocated", compressed_allocated
);
6119 f
->dump_int("compressed_original", compressed_original
);
// Stream a store_statfs_t as "store_statfs(0x<available>..., stored
// 0x<stored>/0x<allocated>, compress 0x<compressed>/0x<compressed_allocated>
// /0x<compressed_original>...".
// NOTE(review): embedded lines 6124, 6126 and 6132-6135 are missing from this
// view (the head of the stream expression -- the "0x" prefixes suggest a hex
// manipulator is applied there -- a term after `available`, and the tail with
// the closing text and return).  Confirm against the full file.
6122 ostream
& operator<<(ostream
& out
, const store_statfs_t
&s
)
6125 << "store_statfs(0x" << s
.available
6127 << ", stored 0x" << s
.stored
6128 << "/0x" << s
.allocated
6129 << ", compress 0x" << s
.compressed
6130 << "/0x" << s
.compressed_allocated
6131 << "/0x" << s
.compressed_original
6137 void OSDOp::clear_data(vector
<OSDOp
>& ops
)
6139 for (unsigned i
= 0; i
< ops
.size(); i
++) {
6142 if (ceph_osd_op_type_attr(op
.op
.op
) &&
6143 op
.op
.xattr
.name_len
&&
6144 op
.indata
.length() >= op
.op
.xattr
.name_len
) {
6145 bufferptr
bp(op
.op
.xattr
.name_len
);
6148 bl
.copy_in(0, op
.op
.xattr
.name_len
, op
.indata
);
6149 op
.indata
.claim(bl
);
6150 } else if (ceph_osd_op_type_exec(op
.op
.op
) &&
6151 op
.op
.cls
.class_len
&&
6152 op
.indata
.length() >
6153 (op
.op
.cls
.class_len
+ op
.op
.cls
.method_len
)) {
6154 __u8 len
= op
.op
.cls
.class_len
+ op
.op
.cls
.method_len
;
6158 bl
.copy_in(0, len
, op
.indata
);
6159 op
.indata
.claim(bl
);