1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2011 New Dream Network
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
9 * Author: Loic Dachary <loic@dachary.org>
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
28 #include <boost/assign/list_of.hpp>
30 #include "include/ceph_features.h"
31 #include "include/encoding.h"
32 #include "include/stringify.h"
34 #include "crush/hash.h"
37 #include "common/Formatter.h"
38 #include "common/StackStringStream.h"
39 #include "include/utime_fmt.h"
41 #include "osd_types.h"
42 #include "osd_types_fmt.h"
43 #include "os/Transaction.h"
51 using std::shared_ptr
;
53 using std::unique_ptr
;
56 using ceph::bufferlist
;
58 using ceph::decode_nohead
;
60 using ceph::encode_nohead
;
61 using ceph::Formatter
;
62 using ceph::make_timespan
;
63 using ceph::JSONFormatter
;
65 using namespace std::literals
;
// Translate one CEPH_OSD_FLAG_* value into its short lowercase label.
// Every recognised flag returns a string literal; any other value reaches
// the default label and yields "???".
67 const char *ceph_osd_flag_name(unsigned flag
)
70 case CEPH_OSD_FLAG_ACK
: return "ack";
71 case CEPH_OSD_FLAG_ONNVRAM
: return "onnvram";
72 case CEPH_OSD_FLAG_ONDISK
: return "ondisk";
73 case CEPH_OSD_FLAG_RETRY
: return "retry";
74 case CEPH_OSD_FLAG_READ
: return "read";
75 case CEPH_OSD_FLAG_WRITE
: return "write";
76 case CEPH_OSD_FLAG_ORDERSNAP
: return "ordersnap";
77 case CEPH_OSD_FLAG_PEERSTAT_OLD
: return "peerstat_old";
78 case CEPH_OSD_FLAG_BALANCE_READS
: return "balance_reads";
79 case CEPH_OSD_FLAG_PARALLELEXEC
: return "parallelexec";
80 case CEPH_OSD_FLAG_PGOP
: return "pgop";
81 case CEPH_OSD_FLAG_EXEC
: return "exec";
82 case CEPH_OSD_FLAG_EXEC_PUBLIC
: return "exec_public";
83 case CEPH_OSD_FLAG_LOCALIZE_READS
: return "localize_reads";
84 case CEPH_OSD_FLAG_RWORDERED
: return "rwordered";
85 case CEPH_OSD_FLAG_IGNORE_CACHE
: return "ignore_cache";
86 case CEPH_OSD_FLAG_SKIPRWLOCKS
: return "skiprwlocks";
87 case CEPH_OSD_FLAG_IGNORE_OVERLAY
: return "ignore_overlay";
88 case CEPH_OSD_FLAG_FLUSH
: return "flush";
89 case CEPH_OSD_FLAG_MAP_SNAP_CLONE
: return "map_snap_clone";
90 case CEPH_OSD_FLAG_ENFORCE_SNAPC
: return "enforce_snapc";
91 case CEPH_OSD_FLAG_REDIRECTED
: return "redirected";
// Note: the label here is "known_if_redirected", which is not a plain
// lowercase copy of the constant's suffix like most of the others.
92 case CEPH_OSD_FLAG_KNOWN_REDIR
: return "known_if_redirected";
93 case CEPH_OSD_FLAG_FULL_TRY
: return "full_try";
94 case CEPH_OSD_FLAG_FULL_FORCE
: return "full_force";
95 case CEPH_OSD_FLAG_IGNORE_REDIRECT
: return "ignore_redirect";
96 case CEPH_OSD_FLAG_RETURNVEC
: return "returnvec";
97 case CEPH_OSD_FLAG_SUPPORTSPOOLEIO
: return "supports_pool_eio";
98 default: return "???";
// Describe an OSD flag bitmask as text: bits 0..31 are tested in ascending
// order and ceph_osd_flag_name() of each set bit is appended to the result.
// NOTE(review): the separator between names and the value returned for an
// empty mask are not shown here -- confirm before depending on the format.
102 string
ceph_osd_flag_string(unsigned flags
)
105 for (unsigned i
=0; i
<32; ++i
) {
106 if (flags
& (1u<<i
)) {
109 s
+= ceph_osd_flag_name(1u << i
);
// Translate one CEPH_OSD_OP_FLAG_* value into a short label. Unlike
// ceph_osd_flag_name(), the visible cases assign the label to `name`
// rather than returning directly from each case.
117 const char * ceph_osd_op_flag_name(unsigned flag
)
122 case CEPH_OSD_OP_FLAG_EXCL
:
125 case CEPH_OSD_OP_FLAG_FAILOK
:
128 case CEPH_OSD_OP_FLAG_FADVISE_RANDOM
:
129 name
= "fadvise_random";
131 case CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL
:
132 name
= "fadvise_sequential";
134 case CEPH_OSD_OP_FLAG_FADVISE_WILLNEED
:
// NOTE(review): "favise_willneed" looks like a misspelling of
// "fadvise_willneed" (every sibling label starts with "fadvise_").
// The string is emitted at runtime, so check for consumers that match on
// the current spelling before correcting it.
135 name
= "favise_willneed";
137 case CEPH_OSD_OP_FLAG_FADVISE_DONTNEED
:
138 name
= "fadvise_dontneed";
140 case CEPH_OSD_OP_FLAG_FADVISE_NOCACHE
:
141 name
= "fadvise_nocache";
143 case CEPH_OSD_OP_FLAG_WITH_REFERENCE
:
144 name
= "with_reference";
146 case CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE
:
147 name
= "bypass_clean_cache";
// Describe a per-op flag bitmask as text: bits 0..31 are tested in ascending
// order and ceph_osd_op_flag_name() of each set bit is appended.
// NOTE(review): separator and empty-mask result are not shown here --
// presumably they mirror ceph_osd_flag_string(); confirm.
156 string
ceph_osd_op_flag_string(unsigned flags
)
159 for (unsigned i
=0; i
<32; ++i
) {
160 if (flags
& (1u<<i
)) {
163 s
+= ceph_osd_op_flag_name(1u << i
);
// Describe an alloc-hint flag bitmask as text: bits 0..31 are tested in
// ascending order and ceph_osd_alloc_hint_flag_name() of each set bit is
// appended.
// NOTE(review): separator and empty-mask result are not shown here --
// presumably they mirror ceph_osd_flag_string(); confirm.
171 string
ceph_osd_alloc_hint_flag_string(unsigned flags
)
174 for (unsigned i
=0; i
<32; ++i
) {
175 if (flags
& (1u<<i
)) {
178 s
+= ceph_osd_alloc_hint_flag_name(1u << i
);
// Serialise this pg_shard_t into `bl`. The payload is opened with
// ENCODE_START(1, 1, bl), i.e. struct version 1 with compat version 1.
186 void pg_shard_t::encode(ceph::buffer::list
&bl
) const
188 ENCODE_START(1, 1, bl
);
// Deserialise a pg_shard_t from the buffer iterator `bl`
// (the counterpart of pg_shard_t::encode()).
193 void pg_shard_t::decode(ceph::buffer::list::const_iterator
&bl
)
// Stream a pg_shard_t.
//  - an undefined shard (rhs.is_undefined()) is handled first;
//  - a shard equal to shard_id_t::NO_SHARD prints only the OSD id;
//  - otherwise prints "<osd>(<shard>)" with the shard cast to unsigned.
201 ostream
&operator<<(ostream
&lhs
, const pg_shard_t
&rhs
)
203 if (rhs
.is_undefined())
205 if (rhs
.shard
== shard_id_t::NO_SHARD
)
206 return lhs
<< rhs
.get_osd();
207 return lhs
<< rhs
.get_osd() << '(' << (unsigned)(rhs
.shard
) << ')';
// Dump OSD alerts through `f`. The outer loop visits each entry of `alerts`
// and builds a prefix " osd: " followed by stringify(a.first); the inner
// loop visits each element of a.second and emits one "alert" string field
// per element.
// NOTE(review): how the per-alert text `s` is assembled from the prefix and
// the inner element is not shown here.
210 void dump(Formatter
* f
, const osd_alerts_t
& alerts
)
212 for (auto& a
: alerts
) {
213 string s0
= " osd: ";
214 s0
+= stringify(a
.first
);
216 for (auto& aa
: a
.second
) {
222 f
->dump_string("alert", s
);
228 void osd_reqid_t::dump(Formatter
*f
) const
230 f
->dump_stream("name") << name
;
231 f
->dump_int("inc", inc
);
232 f
->dump_unsigned("tid", tid
);
235 void osd_reqid_t::generate_test_instances(list
<osd_reqid_t
*>& o
)
237 o
.push_back(new osd_reqid_t
);
238 o
.push_back(new osd_reqid_t(entity_name_t::CLIENT(123), 1, 45678));
241 // -- object_locator_t --
// Serialise this locator into `bl` as struct version 6.
// The compat version starts at 3 and can be raised to 6 (the std::max line
// below) so that readers "need to interpret the hash"; the payload is closed
// with ENCODE_FINISH_NEW_COMPAT using the final compat value.
// NOTE(review): the condition under which compat is raised is not shown
// here -- presumably only when `hash` is set; confirm.
243 void object_locator_t::encode(ceph::buffer::list
& bl
) const
245 // verify that nobody's corrupted the locator
246 ceph_assert(hash
== -1 || key
.empty());
247 __u8 encode_compat
= 3;
248 ENCODE_START(6, encode_compat
, bl
);
250 int32_t preferred
= -1; // tell old code there is no preferred osd (-1).
251 encode(preferred
, bl
);
256 encode_compat
= std::max
<std::uint8_t>(encode_compat
, 6); // need to interpret the hash
257 ENCODE_FINISH_NEW_COMPAT(bl
, encode_compat
);
// Deserialise a locator from `p`. Decoding opens with
// DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, p), reads (among fields not shown
// here) the legacy `preferred` value, and finishes by asserting the same
// invariant encode() checks: either hash == -1 or key is empty.
260 void object_locator_t::decode(ceph::buffer::list::const_iterator
& p
)
262 DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, p
);
272 decode(preferred
, p
);
282 // verify that nobody's corrupted the locator
283 ceph_assert(hash
== -1 || key
.empty());
286 void object_locator_t::dump(Formatter
*f
) const
288 f
->dump_int("pool", pool
);
289 f
->dump_string("key", key
);
290 f
->dump_string("namespace", nspace
);
291 f
->dump_int("hash", hash
);
294 void object_locator_t::generate_test_instances(list
<object_locator_t
*>& o
)
296 o
.push_back(new object_locator_t
);
297 o
.push_back(new object_locator_t(123));
298 o
.push_back(new object_locator_t(123, 876));
299 o
.push_back(new object_locator_t(1, "n2"));
300 o
.push_back(new object_locator_t(1234, "", "key"));
301 o
.push_back(new object_locator_t(12, "n1", "key2"));
304 // -- request_redirect_t --
// Serialise this redirect into `bl` (struct version 1, compat 1):
// the redirect locator, the redirect object name, then a zero uint32_t that
// stands in for the length of the removed osd_instructions member so the
// wire layout stays unchanged.
305 void request_redirect_t::encode(ceph::buffer::list
& bl
) const
307 ENCODE_START(1, 1, bl
);
308 encode(redirect_locator
, bl
);
309 encode(redirect_object
, bl
);
310 // legacy of the removed osd_instructions member
311 encode((uint32_t)0, bl
);
// Deserialise a redirect from `bl`: the locator, the object name, then the
// legacy osd_instructions length. When that length is non-zero the iterator
// is advanced by that many bytes so the obsolete payload is skipped rather
// than interpreted.
315 void request_redirect_t::decode(ceph::buffer::list::const_iterator
& bl
)
318 uint32_t legacy_osd_instructions_len
;
319 decode(redirect_locator
, bl
);
320 decode(redirect_object
, bl
);
321 decode(legacy_osd_instructions_len
, bl
);
322 if (legacy_osd_instructions_len
) {
323 bl
+= legacy_osd_instructions_len
;
328 void request_redirect_t::dump(Formatter
*f
) const
330 f
->dump_string("object", redirect_object
);
331 f
->open_object_section("locator");
332 redirect_locator
.dump(f
);
333 f
->close_section(); // locator
336 void request_redirect_t::generate_test_instances(list
<request_redirect_t
*>& o
)
338 object_locator_t
loc(1, "redir_obj");
339 o
.push_back(new request_redirect_t());
340 o
.push_back(new request_redirect_t(loc
, 0));
341 o
.push_back(new request_redirect_t(loc
, "redir_obj"));
342 o
.push_back(new request_redirect_t(loc
));
345 void objectstore_perf_stat_t::dump(Formatter
*f
) const
347 // *_ms values just for compatibility.
348 f
->dump_float("commit_latency_ms", os_commit_latency_ns
/ 1000000.0);
349 f
->dump_float("apply_latency_ms", os_apply_latency_ns
/ 1000000.0);
350 f
->dump_unsigned("commit_latency_ns", os_commit_latency_ns
);
351 f
->dump_unsigned("apply_latency_ns", os_apply_latency_ns
);
// Serialise the latencies into `bl`, choosing the layout from `features`.
// target_v starts at 2 and is used as both struct and compat version in
// ENCODE_START. Two layouts are visible: the nanosecond fields encoded
// directly, and a millisecond layout (ns / NS_PER_MS, stored in uint32_t)
// kept "for compatibility with older monitor".
// NOTE(review): the body of the !HAVE_FEATURE(OS_PERF_STAT_NS) branch and
// the condition selecting between the two layouts are not shown here --
// presumably peers lacking the feature get the millisecond layout; confirm.
354 void objectstore_perf_stat_t::encode(ceph::buffer::list
&bl
, uint64_t features
) const
356 uint8_t target_v
= 2;
357 if (!HAVE_FEATURE(features
, OS_PERF_STAT_NS
)) {
360 ENCODE_START(target_v
, target_v
, bl
);
362 encode(os_commit_latency_ns
, bl
);
363 encode(os_apply_latency_ns
, bl
);
// Millisecond layout: integer division truncates sub-millisecond latency.
365 constexpr auto NS_PER_MS
= std::chrono::nanoseconds(1ms
).count();
366 uint32_t commit_latency_ms
= os_commit_latency_ns
/ NS_PER_MS
;
367 uint32_t apply_latency_ms
= os_apply_latency_ns
/ NS_PER_MS
;
368 encode(commit_latency_ms
, bl
); // for compatibility with older monitor.
369 encode(apply_latency_ms
, bl
); // for compatibility with older monitor.
// Deserialise the latencies from `bl`. Two layouts are visible: the
// nanosecond fields decoded directly, and a legacy layout where two uint32_t
// millisecond values are decoded and multiplied by NS_PER_MS to populate the
// nanosecond members.
// NOTE(review): the version check selecting between the layouts is not
// shown here.
374 void objectstore_perf_stat_t::decode(ceph::buffer::list::const_iterator
&bl
)
378 decode(os_commit_latency_ns
, bl
);
379 decode(os_apply_latency_ns
, bl
);
381 uint32_t commit_latency_ms
;
382 uint32_t apply_latency_ms
;
383 decode(commit_latency_ms
, bl
);
384 decode(apply_latency_ms
, bl
);
385 constexpr auto NS_PER_MS
= std::chrono::nanoseconds(1ms
).count();
386 os_commit_latency_ns
= commit_latency_ms
* NS_PER_MS
;
387 os_apply_latency_ns
= apply_latency_ms
* NS_PER_MS
;
392 void objectstore_perf_stat_t::generate_test_instances(std::list
<objectstore_perf_stat_t
*>& o
)
394 o
.push_back(new objectstore_perf_stat_t());
395 o
.push_back(new objectstore_perf_stat_t());
396 o
.back()->os_commit_latency_ns
= 20000000;
397 o
.back()->os_apply_latency_ns
= 30000000;
// Emit OSD statistics through `f`. Visible output, in order: scalar counters
// (up_from, seq, num_pgs, num_osds, num_per_pool_osds,
// num_per_pool_omap_osds), the legacy kb_* fields derived from `statfs`,
// a "statfs" object section, an "hb_peers" array with one "osd" entry per
// peer, snap-trim / repair counters, then "op_queue_age_hist", "perf_stat"
// and "alerts" sections.
// NOTE(review): the use of `with_net` is not shown here -- presumably it
// gates the network ping-time dump; confirm.
401 void osd_stat_t::dump(Formatter
*f
, bool with_net
) const
403 f
->dump_unsigned("up_from", up_from
);
404 f
->dump_unsigned("seq", seq
);
405 f
->dump_unsigned("num_pgs", num_pgs
);
406 f
->dump_unsigned("num_osds", num_osds
);
407 f
->dump_unsigned("num_per_pool_osds", num_per_pool_osds
);
408 f
->dump_unsigned("num_per_pool_omap_osds", num_per_pool_omap_osds
);
410 /// dump legacy stats fields to ensure backward compatibility.
411 f
->dump_unsigned("kb", statfs
.kb());
412 f
->dump_unsigned("kb_used", statfs
.kb_used_raw());
413 f
->dump_unsigned("kb_used_data", statfs
.kb_used_data());
414 f
->dump_unsigned("kb_used_omap", statfs
.kb_used_omap());
415 f
->dump_unsigned("kb_used_meta", statfs
.kb_used_internal_metadata());
416 f
->dump_unsigned("kb_avail", statfs
.kb_avail());
419 f
->open_object_section("statfs");
422 f
->open_array_section("hb_peers");
423 for (auto p
: hb_peers
)
424 f
->dump_int("osd", p
);
426 f
->dump_int("snap_trim_queue_len", snap_trim_queue_len
);
427 f
->dump_int("num_snap_trimming", num_snap_trimming
);
428 f
->dump_int("num_shards_repaired", num_shards_repaired
);
429 f
->open_object_section("op_queue_age_hist");
430 op_queue_age_hist
.dump(f
);
432 f
->open_object_section("perf_stat");
433 os_perf_stat
.dump(f
);
435 f
->open_array_section("alerts");
// Delegates to the file-scope dump(Formatter*, const osd_alerts_t&).
436 ::dump(f
, os_alerts
);
// Emit heartbeat ping statistics through `f` as a "network_ping_times"
// array. Each hb_pingtime element becomes an "entry" object holding the peer
// "osd" id, a human-readable "last update" timestamp and an "interfaces"
// array. The "back" interface is always emitted; the "front" interface is
// emitted only when front_pingtime[0] is non-zero. Each interface carries
// "average", "min" and "max" objects (1min/5min/15min) plus a "last" value.
// Every stored value is divided by 1000.0 before being dumped.
// NOTE(review): presumably stored values are microseconds reported as
// milliseconds -- confirm the unit.
443 void osd_stat_t::dump_ping_time(Formatter
*f
) const
445 f
->open_array_section("network_ping_times");
446 for (auto &i
: hb_pingtime
) {
447 f
->open_object_section("entry");
448 f
->dump_int("osd", i
.first
);
449 const time_t lu(i
.second
.last_update
);
// NOTE(review): ctime_r() can return NULL on failure; its result is passed
// straight to the string constructor without a check.
451 string
lustr(ctime_r(&lu
, buffer
));
452 lustr
.pop_back(); // Remove trailing \n
453 f
->dump_string("last update", lustr
);
454 f
->open_array_section("interfaces");
// "back" interface: always present.
455 f
->open_object_section("interface");
456 f
->dump_string("interface", "back");
457 f
->open_object_section("average");
458 f
->dump_float("1min", i
.second
.back_pingtime
[0]/1000.0);
459 f
->dump_float("5min", i
.second
.back_pingtime
[1]/1000.0);
460 f
->dump_float("15min", i
.second
.back_pingtime
[2]/1000.0);
461 f
->close_section(); // average
462 f
->open_object_section("min");
463 f
->dump_float("1min", i
.second
.back_min
[0]/1000.0);
464 f
->dump_float("5min", i
.second
.back_min
[1]/1000.0);
465 f
->dump_float("15min", i
.second
.back_min
[2]/1000.0);
466 f
->close_section(); // min
467 f
->open_object_section("max");
468 f
->dump_float("1min", i
.second
.back_max
[0]/1000.0);
469 f
->dump_float("5min", i
.second
.back_max
[1]/1000.0);
470 f
->dump_float("15min", i
.second
.back_max
[2]/1000.0);
471 f
->close_section(); // max
472 f
->dump_float("last", i
.second
.back_last
/1000.0);
473 f
->close_section(); // interface
// "front" interface: only when a front 1-minute average was recorded.
475 if (i
.second
.front_pingtime
[0] != 0) {
476 f
->open_object_section("interface");
477 f
->dump_string("interface", "front");
478 f
->open_object_section("average");
479 f
->dump_float("1min", i
.second
.front_pingtime
[0]/1000.0);
480 f
->dump_float("5min", i
.second
.front_pingtime
[1]/1000.0);
481 f
->dump_float("15min", i
.second
.front_pingtime
[2]/1000.0);
482 f
->close_section(); // average
483 f
->open_object_section("min");
484 f
->dump_float("1min", i
.second
.front_min
[0]/1000.0);
485 f
->dump_float("5min", i
.second
.front_min
[1]/1000.0);
486 f
->dump_float("15min", i
.second
.front_min
[2]/1000.0);
487 f
->close_section(); // min
488 f
->open_object_section("max");
489 f
->dump_float("1min", i
.second
.front_max
[0]/1000.0);
490 f
->dump_float("5min", i
.second
.front_max
[1]/1000.0);
491 f
->dump_float("15min", i
.second
.front_max
[2]/1000.0);
492 f
->close_section(); // max
493 f
->dump_float("last", i
.second
.front_last
/1000.0);
494 f
->close_section(); // interface
496 f
->close_section(); // interfaces
497 f
->close_section(); // entry
499 f
->close_section(); // network_ping_times
// Serialise OSD statistics into `bl` as struct version 14 (compat 2).
// Legacy kb_* values are recomputed from `statfs` and written for older
// readers. A zero uint32_t is written after hb_peers; the matching
// decode() reads a vector<int> at that position and does not store it.
// `features` is forwarded to the os_perf_stat encoder. The heartbeat
// ping-time map is written last as an int count followed, per entry, by the
// osd id, last_update, and the back/front pingtime, min, max (three values
// each) and last fields in a fixed order that decode() mirrors.
502 void osd_stat_t::encode(ceph::buffer::list
&bl
, uint64_t features
) const
504 ENCODE_START(14, 2, bl
);
506 //////// for compatibility ////////
507 int64_t kb
= statfs
.kb();
508 int64_t kb_used
= statfs
.kb_used_raw();
509 int64_t kb_avail
= statfs
.kb_avail();
512 encode(kb_avail
, bl
);
513 ///////////////////////////////////
515 encode(snap_trim_queue_len
, bl
);
516 encode(num_snap_trimming
, bl
);
517 encode(hb_peers
, bl
);
518 encode((uint32_t)0, bl
);
519 encode(op_queue_age_hist
, bl
);
520 encode(os_perf_stat
, bl
, features
);
525 //////// for compatibility ////////
526 int64_t kb_used_data
= statfs
.kb_used_data();
527 int64_t kb_used_omap
= statfs
.kb_used_omap();
528 int64_t kb_used_meta
= statfs
.kb_used_internal_metadata();
529 encode(kb_used_data
, bl
);
530 encode(kb_used_omap
, bl
);
531 encode(kb_used_meta
, bl
);
533 ///////////////////////////////////
534 encode(os_alerts
, bl
);
535 encode(num_shards_repaired
, bl
);
536 encode(num_osds
, bl
);
537 encode(num_per_pool_osds
, bl
);
538 encode(num_per_pool_omap_osds
, bl
);
541 encode((int)hb_pingtime
.size(), bl
);
// NOTE(review): `auto i` copies every map element on each iteration;
// `const auto&` would avoid the copy without changing the encoded bytes.
542 for (auto i
: hb_pingtime
) {
543 encode(i
.first
, bl
); // osd
544 encode(i
.second
.last_update
, bl
);
545 encode(i
.second
.back_pingtime
[0], bl
);
546 encode(i
.second
.back_pingtime
[1], bl
);
547 encode(i
.second
.back_pingtime
[2], bl
);
548 encode(i
.second
.back_min
[0], bl
);
549 encode(i
.second
.back_min
[1], bl
);
550 encode(i
.second
.back_min
[2], bl
);
551 encode(i
.second
.back_max
[0], bl
);
552 encode(i
.second
.back_max
[1], bl
);
553 encode(i
.second
.back_max
[2], bl
);
554 encode(i
.second
.back_last
, bl
);
555 encode(i
.second
.front_pingtime
[0], bl
);
556 encode(i
.second
.front_pingtime
[1], bl
);
557 encode(i
.second
.front_pingtime
[2], bl
);
558 encode(i
.second
.front_min
[0], bl
);
559 encode(i
.second
.front_min
[1], bl
);
560 encode(i
.second
.front_min
[2], bl
);
561 encode(i
.second
.front_max
[0], bl
);
562 encode(i
.second
.front_max
[1], bl
);
563 encode(i
.second
.front_max
[2], bl
);
564 encode(i
.second
.front_last
, bl
);
// Deserialise OSD statistics from `bl` (understands up to struct version 14,
// legacy compat 2). Legacy kb_* values are decoded into locals and used to
// rebuild `statfs` (kilobytes shifted left by 10 to obtain bytes). Newer
// members are read only when struct_v is high enough (os_alerts >= 10,
// num_shards_repaired >= 11, num_osds / num_per_pool_osds >= 12,
// num_per_pool_omap_osds >= 13, heartbeat ping times >= 14); the visible
// fallback assignments reset the corresponding counters to 0.
569 void osd_stat_t::decode(ceph::buffer::list::const_iterator
&bl
)
571 int64_t kb
, kb_used
,kb_avail
;
572 int64_t kb_used_data
, kb_used_omap
, kb_used_meta
;
573 DECODE_START_LEGACY_COMPAT_LEN(14, 2, 2, bl
);
576 decode(kb_avail
, bl
);
577 decode(snap_trim_queue_len
, bl
);
578 decode(num_snap_trimming
, bl
);
579 decode(hb_peers
, bl
);
// Decoded only to consume the bytes at this position; not stored.
580 vector
<int> num_hb_out
;
581 decode(num_hb_out
, bl
);
583 decode(op_queue_age_hist
, bl
);
585 decode(os_perf_stat
, bl
);
594 decode(kb_used_data
, bl
);
595 decode(kb_used_omap
, bl
);
596 decode(kb_used_meta
, bl
);
598 kb_used_data
= kb_used
;
606 statfs
.total
= kb
<< 10;
607 statfs
.available
= kb_avail
<< 10;
608 // actually it's totally unexpected to have statfs.total < statfs.available
609 // here but unfortunately legacy generate_test_instances produced such a
610 // case hence inserting some handling rather than assert
611 statfs
.internally_reserved
=
612 statfs
.total
> statfs
.available
? statfs
.total
- statfs
.available
: 0;
614 if ((int64_t)statfs
.internally_reserved
> kb_used
) {
615 statfs
.internally_reserved
-= kb_used
;
617 statfs
.internally_reserved
= 0;
619 statfs
.allocated
= kb_used_data
<< 10;
620 statfs
.omap_allocated
= kb_used_omap
<< 10;
621 statfs
.internal_metadata
= kb_used_meta
<< 10;
623 if (struct_v
>= 10) {
624 decode(os_alerts
, bl
);
628 if (struct_v
>= 11) {
629 decode(num_shards_repaired
, bl
);
631 num_shards_repaired
= 0;
633 if (struct_v
>= 12) {
634 decode(num_osds
, bl
);
635 decode(num_per_pool_osds
, bl
);
638 num_per_pool_osds
= 0;
640 if (struct_v
>= 13) {
641 decode(num_per_pool_omap_osds
, bl
);
643 num_per_pool_omap_osds
= 0;
646 if (struct_v
>= 14) {
// One Interfaces record per peer, fields in the order encode() wrote them.
649 for (int i
= 0 ; i
< count
; i
++) {
652 struct Interfaces ifs
;
653 decode(ifs
.last_update
, bl
);
654 decode(ifs
.back_pingtime
[0],bl
);
655 decode(ifs
.back_pingtime
[1], bl
);
656 decode(ifs
.back_pingtime
[2], bl
);
657 decode(ifs
.back_min
[0],bl
);
658 decode(ifs
.back_min
[1], bl
);
659 decode(ifs
.back_min
[2], bl
);
660 decode(ifs
.back_max
[0],bl
);
661 decode(ifs
.back_max
[1], bl
);
662 decode(ifs
.back_max
[2], bl
);
663 decode(ifs
.back_last
, bl
);
664 decode(ifs
.front_pingtime
[0], bl
);
665 decode(ifs
.front_pingtime
[1], bl
);
666 decode(ifs
.front_pingtime
[2], bl
);
667 decode(ifs
.front_min
[0], bl
);
668 decode(ifs
.front_min
[1], bl
);
669 decode(ifs
.front_min
[2], bl
);
670 decode(ifs
.front_max
[0], bl
);
671 decode(ifs
.front_max
[1], bl
);
672 decode(ifs
.front_max
[2], bl
);
673 decode(ifs
.front_last
, bl
);
674 hb_pingtime
[osd
] = ifs
;
// Append sample instances to `o`: a default-constructed osd_stat_t and a
// populated one. The populated instance takes its statfs from the last
// store_statfs_t test instance, gets one heartbeat peer, non-zero snap-trim
// and repair counters, two alert entries (keys 0 and 1) and two ping-time
// records (keys 20 and 30).
// NOTE(review): `ll` is filled with heap-allocated store_statfs_t objects;
// no release of them is visible here -- confirm whether they are freed.
680 void osd_stat_t::generate_test_instances(std::list
<osd_stat_t
*>& o
)
682 o
.push_back(new osd_stat_t
);
684 o
.push_back(new osd_stat_t
);
685 list
<store_statfs_t
*> ll
;
686 store_statfs_t::generate_test_instances(ll
);
687 o
.back()->statfs
= *ll
.back();
688 o
.back()->hb_peers
.push_back(7);
689 o
.back()->snap_trim_queue_len
= 8;
690 o
.back()->num_snap_trimming
= 99;
691 o
.back()->num_shards_repaired
= 101;
692 o
.back()->os_alerts
[0].emplace(
693 "some alert", "some alert details");
694 o
.back()->os_alerts
[1].emplace(
695 "some alert2", "some alert2 details");
// First record: both the back and the front value groups are supplied.
696 struct Interfaces gen_interfaces
= {
697 123456789, { 1000, 900, 800 }, { 990, 890, 790 }, { 1010, 910, 810 }, 1001,
698 { 1100, 1000, 900 }, { 1090, 990, 890 }, { 1110, 1010, 910 }, 1101 };
699 o
.back()->hb_pingtime
[20] = gen_interfaces
;
// Second record: only the first group of values is supplied.
701 987654321, { 100, 200, 300 }, { 90, 190, 290 }, { 110, 210, 310 }, 101 };
702 o
.back()->hb_pingtime
[30] = gen_interfaces
;
707 int pg_t::print(char *o
, int maxlen
) const
709 return snprintf(o
, maxlen
, "%llu.%x", (unsigned long long)pool(), ps());
// Parse a pg id from text of the form "<pool>.<seed>" (pool decimal, seed
// hex), the same format pg_t::print() writes. The sscanf() result `r` is the
// number of fields converted.
// NOTE(review): how `r` is validated and how the members are assigned is
// not shown here.
712 bool pg_t::parse(const char *s
)
716 int r
= sscanf(s
, "%llu.%x", (long long unsigned *)&ppool
, &pseed
);
// Parse a sharded pg id. The shard is first reset to shard_id_t::NO_SHARD;
// the "<pool>.<seed>" prefix is scanned like pg_t::parse(); then, if the
// text contains an 's', the number following it ("s%u") is scanned and
// stored as the shard.
// NOTE(review): validation of the sscanf() results and the handling of a
// missing 's' are not shown here.
724 bool spg_t::parse(const char *s
)
726 shard
= shard_id_t::NO_SHARD
;
730 int r
= sscanf(s
, "%llu.%x", (long long unsigned *)&ppool
, &pseed
);
733 pgid
.set_pool(ppool
);
// strchr() finds the FIRST 's' anywhere in the input.
736 const char *p
= strchr(s
, 's');
738 r
= sscanf(p
, "s%u", &pshard
);
740 shard
= shard_id_t(pshard
);
// Write this spg's name backwards into a buffer. `buf` points just past the
// position of the last character to be written and is decremented before
// every store, so the text grows towards lower addresses.
// `suffix_backwords` must be supplied already reversed: it is copied in
// reading order while `buf` moves down. When the pg has a shard, the shard
// id is rendered in decimal via ritoa; the pg part is delegated to
// pgid.calc_name(), whose return value is returned.
748 char *spg_t::calc_name(char *buf
, const char *suffix_backwords
) const
750 while (*suffix_backwords
)
751 *--buf
= *suffix_backwords
++;
753 if (!is_no_shard()) {
754 buf
= ritoa
<uint8_t, 10>((uint8_t)shard
.id
, buf
);
758 return pgid
.calc_name(buf
, "");
// Stream an spg_t. A stack buffer of spg_t::calc_name_buf_size bytes is
// NUL-terminated at its last byte; calc_name() then fills it backwards from
// that terminator and the pointer it returns is streamed.
761 ostream
& operator<<(ostream
& out
, const spg_t
&pg
)
763 char buf
[spg_t::calc_name_buf_size
];
764 buf
[spg_t::calc_name_buf_size
- 1] = '\0';
765 out
<< pg
.calc_name(buf
+ spg_t::calc_name_buf_size
- 1, "");
// Compute the pg this one maps to when the pool had `old_pg_num` pgs:
// the result's seed is ceph_stable_mod(m_seed, old_pg_num, old_mask), where
// old_mask is (1 << cbits(old_pg_num)) - 1.
769 pg_t
pg_t::get_ancestor(unsigned old_pg_num
) const
771 int old_bits
= cbits(old_pg_num
);
772 int old_mask
= (1 << old_bits
) - 1;
774 ret
.m_seed
= ceph_stable_mod(m_seed
, old_pg_num
, old_mask
);
// Determine whether this pg splits when the pool grows from `old_pg_num` to
// `new_pg_num`, inserting the resulting child pgs into `children`.
// Two child-enumeration loops are visible: one that generates candidate
// seeds by OR-ing progressively higher bits onto m_seed, and one that scans
// every seed in [old_pg_num, new_pg_num). Both rely on ceph_stable_mod().
// NOTE(review): the guards selecting between the two loops, the early-exit
// bodies, the loop termination and any null check on `children` are not
// shown here.
778 bool pg_t::is_split(unsigned old_pg_num
, unsigned new_pg_num
, set
<pg_t
> *children
) const
780 //ceph_assert(m_seed < old_pg_num);
781 if (m_seed
>= old_pg_num
) {
785 if (new_pg_num
<= old_pg_num
)
790 unsigned old_bits
= cbits(old_pg_num
);
791 unsigned old_mask
= (1 << old_bits
) - 1;
792 for (unsigned n
= 1; ; n
++) {
793 unsigned next_bit
= (n
<< (old_bits
-1));
794 unsigned s
= next_bit
| m_seed
;
796 if (s
< old_pg_num
|| s
== m_seed
)
800 if ((unsigned)ceph_stable_mod(s
, old_pg_num
, old_mask
) == m_seed
) {
803 children
->insert(pg_t(s
, m_pool
));
809 int old_bits
= cbits(old_pg_num
);
810 int old_mask
= (1 << old_bits
) - 1;
811 for (unsigned x
= old_pg_num
; x
< new_pg_num
; ++x
) {
812 unsigned o
= ceph_stable_mod(x
, old_pg_num
, old_mask
);
815 children
->insert(pg_t(x
, m_pool
));
// Compute how many seed bits identify this pg in a pool of `pg_num` pgs.
// Requires pg_num > 1 (asserted). With p = cbits(pg_num), the visible test
// compares m_seed and pg_num modulo 2^(p-1).
// NOTE(review): the values returned on each side of that test are not
// shown here.
822 unsigned pg_t::get_split_bits(unsigned pg_num
) const {
825 ceph_assert(pg_num
> 1);
827 // Find unique p such that pg_num \in [2^(p-1), 2^p)
828 unsigned p
= cbits(pg_num
);
829 ceph_assert(p
); // silence coverity #751330
831 if ((m_seed
% (1<<(p
-1))) < (pg_num
% (1<<(p
-1))))
// Determine whether this pg is a merge source when the pool shrinks. The
// visible condition holds when m_seed lies in [new_pg_num, old_pg_num); in
// that case a pg `t` is stepped while its seed is still >= new_pg_num.
// NOTE(review): the parameter list, the loop body and the returned values
// are not shown here.
837 bool pg_t::is_merge_source(
842 if (m_seed
< old_pg_num
&&
843 m_seed
>= new_pg_num
) {
846 while (t
.m_seed
>= new_pg_num
) {
// Compute this pg's parent by clearing the highest set bit of the seed:
// with bits = cbits(m_seed), the seed is masked with the low (bits - 1)
// bits.
// NOTE(review): `(~0) << (bits - 1)` left-shifts a negative int, which is
// undefined behaviour before C++20, and bits == 0 would make the shift
// count negative -- confirm a guard exists in the lines not shown here.
856 pg_t
pg_t::get_parent() const
858 unsigned bits
= cbits(m_seed
);
861 retval
.m_seed
&= ~((~0)<<(bits
- 1));
// Build the hobject_t marking the start of this pg's hash range: an empty
// object name and key, third argument 0, with this pg's seed and pool.
// NOTE(review): the remaining constructor arguments are not shown here.
865 hobject_t
pg_t::get_hobj_start() const
867 return hobject_t(object_t(), string(), 0, m_seed
, m_pool
,
// Build the hobject_t marking the end (exclusive) of this pg's hash range
// in a pool of `pg_num` pgs. The seed is bit-reversed, the low
// (32 - split_bits) bits are filled with ones and 1 is added, giving the
// first reversed hash past this pg. If that value reaches 2^32 the range
// runs to the end of the hash space and hobject_t::get_max() is returned;
// otherwise the value is reversed back and used as the boundary hash.
871 hobject_t
pg_t::get_hobj_end(unsigned pg_num
) const
873 // note: this assumes a bitwise sort; with the legacy nibblewise
874 // sort a PG did not always cover a single contiguous range of the
875 // (bit-reversed) hash range.
876 unsigned bits
= get_split_bits(pg_num
);
877 uint64_t rev_start
= hobject_t::_reverse_bits(m_seed
);
// 64-bit arithmetic so the "+ 1" can reach 0x100000000 without wrapping.
878 uint64_t rev_end
= (rev_start
| (0xffffffff >> bits
)) + 1;
879 if (rev_end
>= 0x100000000) {
880 ceph_assert(rev_end
== 0x100000000);
881 return hobject_t::get_max();
883 return hobject_t(object_t(), string(), CEPH_NOSNAP
,
884 hobject_t::_reverse_bits(rev_end
), m_pool
,
889 void pg_t::dump(Formatter
*f
) const
891 f
->dump_unsigned("pool", m_pool
);
892 f
->dump_unsigned("seed", m_seed
);
895 void pg_t::generate_test_instances(list
<pg_t
*>& o
)
897 o
.push_back(new pg_t
);
898 o
.push_back(new pg_t(1, 2));
899 o
.push_back(new pg_t(13123, 3));
900 o
.push_back(new pg_t(131223, 4));
// Write this pg's name backwards into a buffer. `buf` points just past the
// position of the last character to be written and is decremented before
// every store. `suffix_backwords` must be supplied already reversed. After
// the suffix, the seed is rendered in hex and the pool in decimal via ritoa;
// the pointer returned by the final ritoa call is the function's result.
// NOTE(review): the step that writes the separator between seed and pool is
// not shown here.
903 char *pg_t::calc_name(char *buf
, const char *suffix_backwords
) const
905 while (*suffix_backwords
)
906 *--buf
= *suffix_backwords
++;
908 buf
= ritoa
<uint32_t, 16>(m_seed
, buf
);
912 return ritoa
<uint64_t, 10>(m_pool
, buf
);
// Stream a pg_t. A stack buffer of pg_t::calc_name_buf_size bytes is
// NUL-terminated at its last byte; calc_name() then fills it backwards from
// that terminator and the pointer it returns is streamed.
915 ostream
& operator<<(ostream
& out
, const pg_t
&pg
)
917 char buf
[pg_t::calc_name_buf_size
];
918 buf
[pg_t::calc_name_buf_size
- 1] = '\0';
919 out
<< pg
.calc_name(buf
+ pg_t::calc_name_buf_size
- 1, "");
// Recompute the cached string form (_str, backed by _str_buff) of this
// collection. Visible cases: the name "meta" is copied in directly; two
// pg-based cases build the name backwards through spg_t::calc_name() with
// the reversed suffixes "daeh_" and "PMET_" (i.e. "_head" and "_TEMP" once
// written back-to-front); anything else aborts with
// "unknown collection type".
// NOTE(review): the switch on the collection type and its case labels are
// not shown here.
926 void coll_t::calc_str()
930 strcpy(_str_buff
, "meta");
934 _str_buff
[spg_t::calc_name_buf_size
- 1] = '\0';
935 _str
= pgid
.calc_name(_str_buff
+ spg_t::calc_name_buf_size
- 1, "daeh_");
938 _str_buff
[spg_t::calc_name_buf_size
- 1] = '\0';
939 _str
= pgid
.calc_name(_str_buff
+ spg_t::calc_name_buf_size
- 1, "PMET_");
942 ceph_abort_msg("unknown collection type");
// Parse a collection name. Visible cases: names ending in "_head" and names
// ending in "_TEMP", whose leading part must parse as an spg_t. After each
// successful parse the regenerated string form is asserted to equal the
// input.
// NOTE(review): `s.length() - 5` is unsigned and wraps when s is shorter
// than 5 characters. For a 4-character string it equals std::string::npos,
// which is also what find() returns on no match, so the suffix test passes
// and substr(0, npos) hands the whole string to pgid.parse(). Confirm short
// inputs are rejected elsewhere or add an explicit length check.
// NOTE(review): find() reports the first occurrence, so the test matches
// only when the suffix does not also appear earlier in the name.
946 bool coll_t::parse(const std::string
& s
)
953 ceph_assert(s
== _str
);
956 if (s
.find("_head") == s
.length() - 5 &&
957 pgid
.parse(s
.substr(0, s
.length() - 5))) {
961 ceph_assert(s
== _str
);
964 if (s
.find("_TEMP") == s
.length() - 5 &&
965 pgid
.parse(s
.substr(0, s
.length() - 5))) {
969 ceph_assert(s
== _str
);
// Serialise this collection id into `bl`. Two layouts are visible: one that
// writes struct_v followed by the string form (to_str()), and one that
// writes struct_v followed by the type as a single byte plus further fields
// (a local snapid_t initialised to CEPH_NOSNAP is prepared for that path).
// Any change here must be mirrored in encoded_size().
// NOTE(review): the condition selecting the layout and the struct_v values
// are not shown here.
975 void coll_t::encode(ceph::buffer::list
& bl
) const
978 // when changing this, remember to update encoded_size() too.
980 // can't express this as v2...
982 encode(struct_v
, bl
);
983 encode(to_str(), bl
);
986 encode(struct_v
, bl
);
987 encode((__u8
)type
, bl
);
989 snapid_t snap
= CEPH_NOSNAP
;
// Compute the number of bytes coll_t::encode() produces, so it must be kept
// in step with that function. The total starts with one byte and then adds
// fixed-size contributions for an encoding header and further fields.
// NOTE(review): which branch each contribution belongs to is not shown
// here.
994 size_t coll_t::encoded_size() const
996 size_t r
= sizeof(__u8
);
1008 // - encoding header
1009 r
+= sizeof(ceph_le32
) + 2 * sizeof(__u8
);
1011 r
+= sizeof(__u8
) + sizeof(uint64_t) + 2 * sizeof(uint32_t);
1013 r
+= sizeof(int8_t);
1015 r
+= sizeof(uint64_t);
// Deserialise a collection id from `bl`, dispatching on the leading
// struct_v. Visible behaviour: one path assigns the type from a decoded
// integer; one path parses a string form and throws std::domain_error
// ("unable to parse pg <str>") when parse() fails; an unrecognised version
// throws std::domain_error with a message beginning
// "coll_t::decode(): don't know how to decode version ".
// NOTE(review): the version values for each path are not shown here.
1021 void coll_t::decode(ceph::buffer::list::const_iterator
& bl
)
1025 decode(struct_v
, bl
);
1034 if (pgid
== spg_t() && snap
== 0) {
1050 type
= (type_t
)_type
;
1059 bool ok
= parse(str
);
1061 throw std::domain_error(std::string("unable to parse pg ") + str
);
1067 CachedStackStringStream css
;
1068 *css
<< "coll_t::decode(): don't know how to decode version "
1070 throw std::domain_error(css
->str());
1075 void coll_t::dump(Formatter
*f
) const
1077 f
->dump_unsigned("type_id", (unsigned)type
);
1078 if (type
!= TYPE_META
)
1079 f
->dump_stream("pgid") << pgid
;
1080 f
->dump_string("name", to_str());
1083 void coll_t::generate_test_instances(list
<coll_t
*>& o
)
1085 o
.push_back(new coll_t());
1086 o
.push_back(new coll_t(spg_t(pg_t(1, 0), shard_id_t::NO_SHARD
)));
1087 o
.push_back(new coll_t(o
.back()->get_temp()));
1088 o
.push_back(new coll_t(spg_t(pg_t(3, 2), shard_id_t(12))));
1089 o
.push_back(new coll_t(o
.back()->get_temp()));
1090 o
.push_back(new coll_t());
// Render a vector of OSD ids as text using a CachedStackStringStream.
// Elements are visited in order and CRUSH_ITEM_NONE entries are treated
// differently from real ids.
// NOTE(review): the delimiters and the text used for CRUSH_ITEM_NONE are
// not shown here.
1095 std::string
pg_vector_string(const vector
<int32_t> &a
)
1097 CachedStackStringStream css
;
1099 for (auto i
= a
.cbegin(); i
!= a
.cend(); ++i
) {
1102 if (*i
!= CRUSH_ITEM_NONE
)
// Render a PG state bitmask as text. Each PG_STATE_* bit is tested in the
// fixed order below and, when set, its lowercase name followed by '+' is
// appended to the stream. If the accumulated text is non-empty its last
// character (the trailing '+') is removed.
// NOTE(review): the value returned for a mask with no known bits is not
// shown here.
1111 std::string
pg_state_string(uint64_t state
)
1113 CachedStackStringStream css
;
1114 if (state
& PG_STATE_STALE
)
1116 if (state
& PG_STATE_CREATING
)
1117 *css
<< "creating+";
1118 if (state
& PG_STATE_ACTIVE
)
1120 if (state
& PG_STATE_ACTIVATING
)
1121 *css
<< "activating+";
1122 if (state
& PG_STATE_CLEAN
)
1124 if (state
& PG_STATE_RECOVERY_WAIT
)
1125 *css
<< "recovery_wait+";
1126 if (state
& PG_STATE_RECOVERY_TOOFULL
)
1127 *css
<< "recovery_toofull+";
1128 if (state
& PG_STATE_RECOVERING
)
1129 *css
<< "recovering+";
1130 if (state
& PG_STATE_FORCED_RECOVERY
)
1131 *css
<< "forced_recovery+";
1132 if (state
& PG_STATE_DOWN
)
1134 if (state
& PG_STATE_RECOVERY_UNFOUND
)
1135 *css
<< "recovery_unfound+";
1136 if (state
& PG_STATE_BACKFILL_UNFOUND
)
1137 *css
<< "backfill_unfound+";
1138 if (state
& PG_STATE_UNDERSIZED
)
1139 *css
<< "undersized+";
1140 if (state
& PG_STATE_DEGRADED
)
1141 *css
<< "degraded+";
1142 if (state
& PG_STATE_REMAPPED
)
1143 *css
<< "remapped+";
1144 if (state
& PG_STATE_PREMERGE
)
1145 *css
<< "premerge+";
1146 if (state
& PG_STATE_SCRUBBING
)
1147 *css
<< "scrubbing+";
1148 if (state
& PG_STATE_DEEP_SCRUB
)
1150 if (state
& PG_STATE_INCONSISTENT
)
1151 *css
<< "inconsistent+";
1152 if (state
& PG_STATE_PEERING
)
1154 if (state
& PG_STATE_REPAIR
)
1156 if (state
& PG_STATE_BACKFILL_WAIT
)
1157 *css
<< "backfill_wait+";
1158 if (state
& PG_STATE_BACKFILLING
)
1159 *css
<< "backfilling+";
1160 if (state
& PG_STATE_FORCED_BACKFILL
)
1161 *css
<< "forced_backfill+";
1162 if (state
& PG_STATE_BACKFILL_TOOFULL
)
1163 *css
<< "backfill_toofull+";
1164 if (state
& PG_STATE_INCOMPLETE
)
1165 *css
<< "incomplete+";
1166 if (state
& PG_STATE_PEERED
)
1168 if (state
& PG_STATE_SNAPTRIM
)
1169 *css
<< "snaptrim+";
1170 if (state
& PG_STATE_SNAPTRIM_WAIT
)
1171 *css
<< "snaptrim_wait+";
1172 if (state
& PG_STATE_SNAPTRIM_ERROR
)
1173 *css
<< "snaptrim_error+";
1174 if (state
& PG_STATE_FAILED_REPAIR
)
1175 *css
<< "failed_repair+";
1176 if (state
& PG_STATE_LAGGY
)
1178 if (state
& PG_STATE_WAIT
)
// Drop the trailing '+' left by the last appended name.
1180 auto ret
= css
->str();
1181 if (ret
.length() > 0)
1182 ret
.resize(ret
.length() - 1);
// Map a single PG state name to its PG_STATE_* bit. The result is a
// std::optional<uint64_t>; one branch assigns std::nullopt.
// Matching is an exact, case-sensitive string comparison against the names
// below. The deep-scrub bit is looked up under the name "deep".
// NOTE(review): the value assigned for "unknown" and the branch that
// assigns std::nullopt are only partly shown here -- presumably an
// unrecognised name yields std::nullopt; confirm.
1188 std::optional
<uint64_t> pg_string_state(const std::string
& state
)
1190 std::optional
<uint64_t> type
;
1191 if (state
== "active")
1192 type
= PG_STATE_ACTIVE
;
1193 else if (state
== "clean")
1194 type
= PG_STATE_CLEAN
;
1195 else if (state
== "down")
1196 type
= PG_STATE_DOWN
;
1197 else if (state
== "recovery_unfound")
1198 type
= PG_STATE_RECOVERY_UNFOUND
;
1199 else if (state
== "backfill_unfound")
1200 type
= PG_STATE_BACKFILL_UNFOUND
;
1201 else if (state
== "premerge")
1202 type
= PG_STATE_PREMERGE
;
1203 else if (state
== "scrubbing")
1204 type
= PG_STATE_SCRUBBING
;
1205 else if (state
== "degraded")
1206 type
= PG_STATE_DEGRADED
;
1207 else if (state
== "inconsistent")
1208 type
= PG_STATE_INCONSISTENT
;
1209 else if (state
== "peering")
1210 type
= PG_STATE_PEERING
;
1211 else if (state
== "repair")
1212 type
= PG_STATE_REPAIR
;
1213 else if (state
== "recovering")
1214 type
= PG_STATE_RECOVERING
;
1215 else if (state
== "forced_recovery")
1216 type
= PG_STATE_FORCED_RECOVERY
;
1217 else if (state
== "backfill_wait")
1218 type
= PG_STATE_BACKFILL_WAIT
;
1219 else if (state
== "incomplete")
1220 type
= PG_STATE_INCOMPLETE
;
1221 else if (state
== "stale")
1222 type
= PG_STATE_STALE
;
1223 else if (state
== "remapped")
1224 type
= PG_STATE_REMAPPED
;
1225 else if (state
== "deep")
1226 type
= PG_STATE_DEEP_SCRUB
;
1227 else if (state
== "backfilling")
1228 type
= PG_STATE_BACKFILLING
;
1229 else if (state
== "forced_backfill")
1230 type
= PG_STATE_FORCED_BACKFILL
;
1231 else if (state
== "backfill_toofull")
1232 type
= PG_STATE_BACKFILL_TOOFULL
;
1233 else if (state
== "recovery_wait")
1234 type
= PG_STATE_RECOVERY_WAIT
;
1235 else if (state
== "recovery_toofull")
1236 type
= PG_STATE_RECOVERY_TOOFULL
;
1237 else if (state
== "undersized")
1238 type
= PG_STATE_UNDERSIZED
;
1239 else if (state
== "activating")
1240 type
= PG_STATE_ACTIVATING
;
1241 else if (state
== "peered")
1242 type
= PG_STATE_PEERED
;
1243 else if (state
== "snaptrim")
1244 type
= PG_STATE_SNAPTRIM
;
1245 else if (state
== "snaptrim_wait")
1246 type
= PG_STATE_SNAPTRIM_WAIT
;
1247 else if (state
== "snaptrim_error")
1248 type
= PG_STATE_SNAPTRIM_ERROR
;
1249 else if (state
== "creating")
1250 type
= PG_STATE_CREATING
;
1251 else if (state
== "failed_repair")
1252 type
= PG_STATE_FAILED_REPAIR
;
1253 else if (state
== "laggy")
1254 type
= PG_STATE_LAGGY
;
1255 else if (state
== "wait")
1256 type
= PG_STATE_WAIT
;
1257 else if (state
== "unknown")
1260 type
= std::nullopt
;
// Build this version's key name as a string. A 32-character scratch string
// is allocated, the char* overload of get_key_name() writes into it, and
// the string is shortened to 31 characters to drop the NUL terminator that
// overload wrote.
1265 string
eversion_t::get_key_name() const
1267 std::string
key(32, ' ');
1268 get_key_name(&key
[0]);
1269 key
.resize(31); // remove the null terminator
1273 // -- pool_snap_info_t --
1274 void pool_snap_info_t::dump(Formatter
*f
) const
1276 f
->dump_unsigned("snapid", snapid
);
1277 f
->dump_stream("stamp") << stamp
;
1278 f
->dump_string("name", name
);
// Serialise this pool snapshot record into `bl`. When `features` lacks
// CEPH_FEATURE_PGPOOL3 a legacy layout beginning with a bare struct_v is
// written; otherwise the record is wrapped in ENCODE_START(2, 2, bl).
// NOTE(review): the fields written in each layout are not shown here.
1281 void pool_snap_info_t::encode(ceph::buffer::list
& bl
, uint64_t features
) const
1284 if ((features
& CEPH_FEATURE_PGPOOL3
) == 0) {
1286 encode(struct_v
, bl
);
1292 ENCODE_START(2, 2, bl
);
// Deserialise a pool snapshot record from `bl`; decoding opens with
// DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl), so both the legacy and the
// versioned layout written by encode() are accepted.
1299 void pool_snap_info_t::decode(ceph::buffer::list::const_iterator
& bl
)
1301 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl
);
1308 void pool_snap_info_t::generate_test_instances(list
<pool_snap_info_t
*>& o
)
1310 o
.push_back(new pool_snap_info_t
);
1311 o
.push_back(new pool_snap_info_t
);
1312 o
.back()->snapid
= 1;
1313 o
.back()->stamp
= utime_t(1, 2);
1314 o
.back()->name
= "foo";
1317 // -- pool_opts_t --
1319 typedef std::map
<std::string
, pool_opts_t::opt_desc_t
> opt_mapping_t
;
1320 static opt_mapping_t opt_mapping
= boost::assign::map_list_of
1321 ("scrub_min_interval", pool_opts_t::opt_desc_t(
1322 pool_opts_t::SCRUB_MIN_INTERVAL
, pool_opts_t::DOUBLE
))
1323 ("scrub_max_interval", pool_opts_t::opt_desc_t(
1324 pool_opts_t::SCRUB_MAX_INTERVAL
, pool_opts_t::DOUBLE
))
1325 ("deep_scrub_interval", pool_opts_t::opt_desc_t(
1326 pool_opts_t::DEEP_SCRUB_INTERVAL
, pool_opts_t::DOUBLE
))
1327 ("recovery_priority", pool_opts_t::opt_desc_t(
1328 pool_opts_t::RECOVERY_PRIORITY
, pool_opts_t::INT
))
1329 ("recovery_op_priority", pool_opts_t::opt_desc_t(
1330 pool_opts_t::RECOVERY_OP_PRIORITY
, pool_opts_t::INT
))
1331 ("scrub_priority", pool_opts_t::opt_desc_t(
1332 pool_opts_t::SCRUB_PRIORITY
, pool_opts_t::INT
))
1333 ("compression_mode", pool_opts_t::opt_desc_t(
1334 pool_opts_t::COMPRESSION_MODE
, pool_opts_t::STR
))
1335 ("compression_algorithm", pool_opts_t::opt_desc_t(
1336 pool_opts_t::COMPRESSION_ALGORITHM
, pool_opts_t::STR
))
1337 ("compression_required_ratio", pool_opts_t::opt_desc_t(
1338 pool_opts_t::COMPRESSION_REQUIRED_RATIO
, pool_opts_t::DOUBLE
))
1339 ("compression_max_blob_size", pool_opts_t::opt_desc_t(
1340 pool_opts_t::COMPRESSION_MAX_BLOB_SIZE
, pool_opts_t::INT
))
1341 ("compression_min_blob_size", pool_opts_t::opt_desc_t(
1342 pool_opts_t::COMPRESSION_MIN_BLOB_SIZE
, pool_opts_t::INT
))
1343 ("csum_type", pool_opts_t::opt_desc_t(
1344 pool_opts_t::CSUM_TYPE
, pool_opts_t::INT
))
1345 ("csum_max_block", pool_opts_t::opt_desc_t(
1346 pool_opts_t::CSUM_MAX_BLOCK
, pool_opts_t::INT
))
1347 ("csum_min_block", pool_opts_t::opt_desc_t(
1348 pool_opts_t::CSUM_MIN_BLOCK
, pool_opts_t::INT
))
1349 ("fingerprint_algorithm", pool_opts_t::opt_desc_t(
1350 pool_opts_t::FINGERPRINT_ALGORITHM
, pool_opts_t::STR
))
1351 ("pg_num_min", pool_opts_t::opt_desc_t(
1352 pool_opts_t::PG_NUM_MIN
, pool_opts_t::INT
))
1353 ("pg_num_max", pool_opts_t::opt_desc_t(
1354 pool_opts_t::PG_NUM_MAX
, pool_opts_t::INT
))
1355 ("target_size_bytes", pool_opts_t::opt_desc_t(
1356 pool_opts_t::TARGET_SIZE_BYTES
, pool_opts_t::INT
))
1357 ("target_size_ratio", pool_opts_t::opt_desc_t(
1358 pool_opts_t::TARGET_SIZE_RATIO
, pool_opts_t::DOUBLE
))
1359 ("pg_autoscale_bias", pool_opts_t::opt_desc_t(
1360 pool_opts_t::PG_AUTOSCALE_BIAS
, pool_opts_t::DOUBLE
))
1361 ("read_lease_interval", pool_opts_t::opt_desc_t(
1362 pool_opts_t::READ_LEASE_INTERVAL
, pool_opts_t::DOUBLE
))
1363 ("dedup_tier", pool_opts_t::opt_desc_t(
1364 pool_opts_t::DEDUP_TIER
, pool_opts_t::INT
))
1365 ("dedup_chunk_algorithm", pool_opts_t::opt_desc_t(
1366 pool_opts_t::DEDUP_CHUNK_ALGORITHM
, pool_opts_t::STR
))
1367 ("dedup_cdc_chunk_size", pool_opts_t::opt_desc_t(
1368 pool_opts_t::DEDUP_CDC_CHUNK_SIZE
, pool_opts_t::INT
));
1370 bool pool_opts_t::is_opt_name(const std::string
& name
)
1372 return opt_mapping
.count(name
);
1375 pool_opts_t::opt_desc_t
pool_opts_t::get_opt_desc(const std::string
& name
)
1377 auto i
= opt_mapping
.find(name
);
1378 ceph_assert(i
!= opt_mapping
.end());
1382 bool pool_opts_t::is_set(pool_opts_t::key_t key
) const
1384 return opts
.count(key
);
1387 const pool_opts_t::value_t
& pool_opts_t::get(pool_opts_t::key_t key
) const
1389 auto i
= opts
.find(key
);
1390 ceph_assert(i
!= opts
.end());
1394 bool pool_opts_t::unset(pool_opts_t::key_t key
) {
1395 return opts
.erase(key
) > 0;
1398 class pool_opts_dumper_t
: public boost::static_visitor
<> {
1400 pool_opts_dumper_t(const std::string
& name_
, Formatter
* f_
) :
1401 name(name_
.c_str()), f(f_
) {}
1403 void operator()(std::string s
) const {
1404 f
->dump_string(name
, s
);
1406 void operator()(int64_t i
) const {
1407 f
->dump_int(name
, i
);
1409 void operator()(double d
) const {
1410 f
->dump_float(name
, d
);
1418 void pool_opts_t::dump(const std::string
& name
, Formatter
* f
) const
1420 const opt_desc_t
& desc
= get_opt_desc(name
);
1421 auto i
= opts
.find(desc
.key
);
1422 if (i
== opts
.end()) {
1425 boost::apply_visitor(pool_opts_dumper_t(name
, f
), i
->second
);
1428 void pool_opts_t::dump(Formatter
* f
) const
1430 for (auto i
= opt_mapping
.cbegin(); i
!= opt_mapping
.cend(); ++i
) {
1431 const std::string
& name
= i
->first
;
1432 const opt_desc_t
& desc
= i
->second
;
1433 auto j
= opts
.find(desc
.key
);
1434 if (j
== opts
.end()) {
1437 boost::apply_visitor(pool_opts_dumper_t(name
, f
), j
->second
);
1441 class pool_opts_encoder_t
: public boost::static_visitor
<> {
1443 explicit pool_opts_encoder_t(ceph::buffer::list
& bl_
, uint64_t features
)
1445 features(features
) {}
1447 void operator()(const std::string
&s
) const {
1448 encode(static_cast<int32_t>(pool_opts_t::STR
), bl
);
1451 void operator()(int64_t i
) const {
1452 encode(static_cast<int32_t>(pool_opts_t::INT
), bl
);
1453 if (HAVE_FEATURE(features
, SERVER_NAUTILUS
)) {
1456 encode(static_cast<int32_t>(i
), bl
);
1459 void operator()(double d
) const {
1460 encode(static_cast<int32_t>(pool_opts_t::DOUBLE
), bl
);
1465 ceph::buffer::list
& bl
;
1469 void pool_opts_t::encode(ceph::buffer::list
& bl
, uint64_t features
) const
1472 if (!HAVE_FEATURE(features
, SERVER_NAUTILUS
)) {
1475 ENCODE_START(v
, 1, bl
);
1476 uint32_t n
= static_cast<uint32_t>(opts
.size());
1478 for (auto i
= opts
.cbegin(); i
!= opts
.cend(); ++i
) {
1479 encode(static_cast<int32_t>(i
->first
), bl
);
1480 boost::apply_visitor(pool_opts_encoder_t(bl
, features
), i
->second
);
// Decode options from bl into opts.  Each decoded value is stored under its
// key k cast to key_t; the type tag t selects the string, INT or DOUBLE
// handling.  NOTE(review): the declarations and the reads of k, t, s, i and d
// are not visible in this excerpt.
1485 void pool_opts_t::decode(ceph::buffer::list::const_iterator
& bl
)
1487 DECODE_START(1, bl
);
// Value s stored under key k (the branch condition is not visible here).
1498 opts
[static_cast<key_t
>(k
)] = s
;
1499 } else if (t
== INT
) {
// The integer is read differently depending on whether struct_v >= 2.
1501 if (struct_v
>= 2) {
1508 opts
[static_cast<key_t
>(k
)] = i
;
1509 } else if (t
== DOUBLE
) {
1512 opts
[static_cast<key_t
>(k
)] = d
;
// Reached for a tag that is none of the handled types: always asserts.
// NOTE(review): this aborts on an unrecognized tag in the input -- confirm
// that is the intended handling for malformed data.
1514 ceph_assert(!"invalid type");
1520 ostream
& operator<<(ostream
& out
, const pool_opts_t
& opts
)
1522 for (auto i
= opt_mapping
.begin(); i
!= opt_mapping
.end(); ++i
) {
1523 const std::string
& name
= i
->first
;
1524 const pool_opts_t::opt_desc_t
& desc
= i
->second
;
1525 auto j
= opts
.opts
.find(desc
.key
);
1526 if (j
== opts
.opts
.end()) {
1529 out
<< " " << name
<< " " << j
->second
;
1536 const char *pg_pool_t::APPLICATION_NAME_CEPHFS("cephfs");
1537 const char *pg_pool_t::APPLICATION_NAME_RBD("rbd");
1538 const char *pg_pool_t::APPLICATION_NAME_RGW("rgw");
1540 void pg_pool_t::dump(Formatter
*f
) const
1542 f
->dump_stream("create_time") << get_create_time();
1543 f
->dump_unsigned("flags", get_flags());
1544 f
->dump_string("flags_names", get_flags_string());
1545 f
->dump_int("type", get_type());
1546 f
->dump_int("size", get_size());
1547 f
->dump_int("min_size", get_min_size());
1548 f
->dump_int("crush_rule", get_crush_rule());
1549 f
->dump_int("peering_crush_bucket_count", peering_crush_bucket_count
);
1550 f
->dump_int("peering_crush_bucket_target", peering_crush_bucket_target
);
1551 f
->dump_int("peering_crush_bucket_barrier", peering_crush_bucket_barrier
);
1552 f
->dump_int("peering_crush_bucket_mandatory_member", peering_crush_mandatory_member
);
1553 f
->dump_int("object_hash", get_object_hash());
1554 f
->dump_string("pg_autoscale_mode",
1555 get_pg_autoscale_mode_name(pg_autoscale_mode
));
1556 f
->dump_unsigned("pg_num", get_pg_num());
1557 f
->dump_unsigned("pg_placement_num", get_pgp_num());
1558 f
->dump_unsigned("pg_placement_num_target", get_pgp_num_target());
1559 f
->dump_unsigned("pg_num_target", get_pg_num_target());
1560 f
->dump_unsigned("pg_num_pending", get_pg_num_pending());
1561 f
->dump_object("last_pg_merge_meta", last_pg_merge_meta
);
1562 f
->dump_stream("last_change") << get_last_change();
1563 f
->dump_stream("last_force_op_resend") << get_last_force_op_resend();
1564 f
->dump_stream("last_force_op_resend_prenautilus")
1565 << get_last_force_op_resend_prenautilus();
1566 f
->dump_stream("last_force_op_resend_preluminous")
1567 << get_last_force_op_resend_preluminous();
1568 f
->dump_unsigned("auid", get_auid());
1569 f
->dump_string("snap_mode", is_pool_snaps_mode() ? "pool" : "selfmanaged");
1570 f
->dump_unsigned("snap_seq", get_snap_seq());
1571 f
->dump_unsigned("snap_epoch", get_snap_epoch());
1572 f
->open_array_section("pool_snaps");
1573 for (auto p
= snaps
.cbegin(); p
!= snaps
.cend(); ++p
) {
1574 f
->open_object_section("pool_snap_info");
1579 f
->dump_stream("removed_snaps") << removed_snaps
;
1580 f
->dump_unsigned("quota_max_bytes", quota_max_bytes
);
1581 f
->dump_unsigned("quota_max_objects", quota_max_objects
);
1582 f
->open_array_section("tiers");
1583 for (auto p
= tiers
.cbegin(); p
!= tiers
.cend(); ++p
)
1584 f
->dump_unsigned("pool_id", *p
);
1586 f
->dump_int("tier_of", tier_of
);
1587 f
->dump_int("read_tier", read_tier
);
1588 f
->dump_int("write_tier", write_tier
);
1589 f
->dump_string("cache_mode", get_cache_mode_name());
1590 f
->dump_unsigned("target_max_bytes", target_max_bytes
);
1591 f
->dump_unsigned("target_max_objects", target_max_objects
);
1592 f
->dump_unsigned("cache_target_dirty_ratio_micro",
1593 cache_target_dirty_ratio_micro
);
1594 f
->dump_unsigned("cache_target_dirty_high_ratio_micro",
1595 cache_target_dirty_high_ratio_micro
);
1596 f
->dump_unsigned("cache_target_full_ratio_micro",
1597 cache_target_full_ratio_micro
);
1598 f
->dump_unsigned("cache_min_flush_age", cache_min_flush_age
);
1599 f
->dump_unsigned("cache_min_evict_age", cache_min_evict_age
);
1600 f
->dump_string("erasure_code_profile", erasure_code_profile
);
1601 f
->open_object_section("hit_set_params");
1602 hit_set_params
.dump(f
);
1603 f
->close_section(); // hit_set_params
1604 f
->dump_unsigned("hit_set_period", hit_set_period
);
1605 f
->dump_unsigned("hit_set_count", hit_set_count
);
1606 f
->dump_bool("use_gmt_hitset", use_gmt_hitset
);
1607 f
->dump_unsigned("min_read_recency_for_promote", min_read_recency_for_promote
);
1608 f
->dump_unsigned("min_write_recency_for_promote", min_write_recency_for_promote
);
1609 f
->dump_unsigned("hit_set_grade_decay_rate", hit_set_grade_decay_rate
);
1610 f
->dump_unsigned("hit_set_search_last_n", hit_set_search_last_n
);
1611 f
->open_array_section("grade_table");
1612 for (unsigned i
= 0; i
< hit_set_count
; ++i
)
1613 f
->dump_unsigned("value", get_grade(i
));
1615 f
->dump_unsigned("stripe_width", get_stripe_width());
1616 f
->dump_unsigned("expected_num_objects", expected_num_objects
);
1617 f
->dump_bool("fast_read", fast_read
);
1618 f
->open_object_section("options");
1620 f
->close_section(); // options
1621 f
->open_object_section("application_metadata");
1622 for (auto &app_pair
: application_metadata
) {
1623 f
->open_object_section(app_pair
.first
.c_str());
1624 for (auto &kv_pair
: app_pair
.second
) {
1625 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
1627 f
->close_section(); // application
1629 f
->close_section(); // application_metadata
1632 void pg_pool_t::convert_to_pg_shards(const vector
<int> &from
, set
<pg_shard_t
>* to
) const {
1633 for (size_t i
= 0; i
< from
.size(); ++i
) {
1634 if (from
[i
] != CRUSH_ITEM_NONE
) {
1638 is_erasure() ? shard_id_t(i
) : shard_id_t::NO_SHARD
));
1643 void pg_pool_t::calc_pg_masks()
1645 pg_num_mask
= (1 << cbits(pg_num
-1)) - 1;
1646 pgp_num_mask
= (1 << cbits(pgp_num
-1)) - 1;
1649 unsigned pg_pool_t::get_pg_num_divisor(pg_t pgid
) const
1651 if (pg_num
== pg_num_mask
+ 1)
1652 return pg_num
; // power-of-2 split
1653 unsigned mask
= pg_num_mask
>> 1;
1654 if ((pgid
.ps() & mask
) < (pg_num
& mask
))
1655 return pg_num_mask
+ 1; // smaller bin size (already split)
1657 return (pg_num_mask
+ 1) >> 1; // bigger bin (not yet split)
// Test pgid against the range [pg_num_pending, pg_num).
// NOTE(review): the return statements and every use of the `target`
// out-parameter are not visible in this excerpt; only the visible tests are
// described below.
1660 bool pg_pool_t::is_pending_merge(pg_t pgid
, bool *target
) const
// Case: pg_num_pending is not below pg_num.
1662 if (pg_num_pending
>= pg_num
) {
// Case: pgid's own ps lies in [pg_num_pending, pg_num).
1665 if (pgid
.ps() >= pg_num_pending
&& pgid
.ps() < pg_num
) {
// Otherwise scan each ps in [pg_num_pending, pg_num) and check whether the
// parent of that pg (in pgid's pool) is pgid.
1671 for (unsigned ps
= pg_num_pending
; ps
< pg_num
; ++ps
) {
1672 if (pg_t(ps
, pgid
.pool()).get_parent() == pgid
) {
1683 * we have two snap modes:
1685 * - snap existence/non-existence defined by snaps[] and snap_seq
1686 * - user managed snaps
1687 * - existence tracked by librados user
1689 bool pg_pool_t::is_pool_snaps_mode() const
1691 return has_flag(FLAG_POOL_SNAPS
);
1694 bool pg_pool_t::is_unmanaged_snaps_mode() const
1696 return has_flag(FLAG_SELFMANAGED_SNAPS
);
1699 bool pg_pool_t::is_removed_snap(snapid_t s
) const
1701 if (is_pool_snaps_mode())
1702 return s
<= get_snap_seq() && snaps
.count(s
) == 0;
1704 return removed_snaps
.contains(s
);
// Look up a pool snapshot by name: scan snaps in order and return the snapid
// of the first entry whose name equals s.
// NOTE(review): the value returned when no entry matches is not visible in
// this excerpt.
1707 snapid_t
pg_pool_t::snap_exists(std::string_view s
) const
1709 for (auto p
= snaps
.cbegin(); p
!= snaps
.cend(); ++p
)
1710 if (p
->second
.name
== s
)
1711 return p
->second
.snapid
;
// Add a pool snapshot.  Asserts the pool is not in unmanaged-snaps mode,
// sets FLAG_POOL_SNAPS, and fills in the snaps entry for the new id
// (get_snap_seq() + 1) with that id and the given stamp.
// NOTE(review): the statements that use n and that update snap_seq are not
// visible in this excerpt.
1715 void pg_pool_t::add_snap(const char *n
, utime_t stamp
)
1717 ceph_assert(!is_unmanaged_snaps_mode());
1718 flags
|= FLAG_POOL_SNAPS
;
// New snapshot id: one past the current sequence number.
1719 snapid_t s
= get_snap_seq() + 1;
1721 snaps
[s
].snapid
= s
;
1723 snaps
[s
].stamp
= stamp
;
// Allocate a self-managed snapshot id.  Asserts the pool is not in
// pool-snaps mode, sets FLAG_SELFMANAGED_SNAPS and advances snap_seq by one.
// When snap_seq is still 0 and preoctopus_compat is set, snapid 1 is first
// inserted into removed_snaps (see the kludge comment below).
// NOTE(review): the remainder of the snap_seq == 0 branch and the return
// statement are not visible in this excerpt.
1726 uint64_t pg_pool_t::add_unmanaged_snap(bool preoctopus_compat
)
1728 ceph_assert(!is_pool_snaps_mode());
1729 if (snap_seq
== 0) {
1730 if (preoctopus_compat
) {
1731 // kludge for pre-mimic tracking of pool vs selfmanaged snaps. after
1732 // mimic this field is not decoded but our flag is set; pre-mimic, we
1733 // have a non-empty removed_snaps to signify a non-pool-snaps pool.
1734 removed_snaps
.insert(snapid_t(1));
1738 flags
|= FLAG_SELFMANAGED_SNAPS
;
1739 snap_seq
= snap_seq
+ 1;
// Remove pool snapshot s.  Asserts that s has an entry in snaps and advances
// snap_seq by one.
// NOTE(review): the statement that drops the entry from snaps is not visible
// in this excerpt.
1743 void pg_pool_t::remove_snap(snapid_t s
)
1745 ceph_assert(snaps
.count(s
));
1747 snap_seq
= snap_seq
+ 1;
// Remove self-managed snapshot s.  Asserts the pool is in unmanaged-snaps
// mode.  With preoctopus_compat set, s is inserted into removed_snaps, and
// the current snap_seq is inserted as well if it is not already contained.
// NOTE(review): at least one statement between the assert and the
// preoctopus_compat test is not visible in this excerpt.
1750 void pg_pool_t::remove_unmanaged_snap(snapid_t s
, bool preoctopus_compat
)
1752 ceph_assert(is_unmanaged_snaps_mode());
1754 if (preoctopus_compat
) {
1755 removed_snaps
.insert(s
);
1756 // try to add in the new seq, just to try to keep the interval_set contiguous
1757 if (!removed_snaps
.contains(get_snap_seq())) {
1758 removed_snaps
.insert(get_snap_seq());
// Build a SnapContext from the current snap_seq and a vector of snapids with
// one slot per entry of snaps.  The snaps map is walked in reverse order.
// NOTE(review): the loop body that fills the vector is not visible in this
// excerpt.
1763 SnapContext
pg_pool_t::get_snap_context() const
// One slot per existing pool snapshot.
1765 vector
<snapid_t
> s(snaps
.size());
1767 for (auto p
= snaps
.crbegin(); p
!= snaps
.crend(); ++p
)
1769 return SnapContext(get_snap_seq(), s
);
// Hash an object key, optionally combined with a namespace, using
// ceph_str_hash with this pool's object_hash.
// NOTE(review): the condition guarding the first return, the declaration of
// buf and any write to buf[nsl] are not visible in this excerpt.
1772 uint32_t pg_pool_t::hash_key(const string
& key
, const string
& ns
) const
// Key-only form: hash the bytes of key directly.
1775 return ceph_str_hash(object_hash
, key
.data(), key
.length());
// Combined form: buffer of ns.length() + 1 + key.length() bytes.
1776 int nsl
= ns
.length();
1777 int len
= key
.length() + nsl
+ 1;
// NOTE(review): if buf is a stack array sized by len, a very long key or
// namespace could exhaust the stack -- confirm against the declaration.
// ns occupies bytes [0, nsl); key starts at offset nsl + 1.
1779 memcpy(&buf
[0], ns
.data(), nsl
);
1781 memcpy(&buf
[nsl
+1], key
.data(), key
.length());
1782 return ceph_str_hash(object_hash
, &buf
[0], len
);
1785 uint32_t pg_pool_t::raw_hash_to_pg(uint32_t v
) const
1787 return ceph_stable_mod(v
, pg_num
, pg_num_mask
);
1791 * map a raw pg (with full precision ps) into an actual pg, for storage
1793 pg_t
pg_pool_t::raw_pg_to_pg(pg_t pg
) const
1795 pg
.set_ps(ceph_stable_mod(pg
.ps(), pg_num
, pg_num_mask
));
1800 * map raw pg (full precision ps) into a placement seed. include
1801 * pool id in that value so that different pools don't use the same
// Map a raw pg to a placement seed.  Both branches start from
// ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask).
// NOTE(review): the return statements and the second operand of each
// expression are not visible in this excerpt.
1804 ps_t
pg_pool_t::raw_pg_to_pps(pg_t pg
) const
// Branch on FLAG_HASHPSPOOL: hashed form, built with crush_hash32_2 and
// CRUSH_HASH_RJENKINS1.
1806 if (flags
& FLAG_HASHPSPOOL
) {
1807 // Hash the pool id so that pool PGs do not overlap.
1809 crush_hash32_2(CRUSH_HASH_RJENKINS1
,
1810 ceph_stable_mod(pg
.ps(), pgp_num
, pgp_num_mask
),
1813 // Legacy behavior; add ps and pool together. This is not a great
1814 // idea because the PGs from each pool will essentially overlap on
1815 // top of each other: 0.5 == 1.4 == 2.3 == ...
1817 ceph_stable_mod(pg
.ps(), pgp_num
, pgp_num_mask
) +
// Derive a position for pg from seed.
// NOTE(review): what each branch does with r, and the return statement, are
// not visible in this excerpt; only the visible tests are described.
1822 uint32_t pg_pool_t::get_random_pg_position(pg_t pg
, uint32_t seed
) const
// r is a hash of the seed (crush_hash32_2 with CRUSH_HASH_RJENKINS1 and the
// constant 123), so it depends only on seed.
1824 uint32_t r
= crush_hash32_2(CRUSH_HASH_RJENKINS1
, seed
, 123);
// Case: pg_num is exactly pg_num_mask + 1.
1825 if (pg_num
== pg_num_mask
+ 1) {
// Otherwise compare pg's ps against pg_num under the halved mask.
1828 unsigned smaller_mask
= pg_num_mask
>> 1;
1829 if ((pg
.ps() & smaller_mask
) < (pg_num
& smaller_mask
)) {
1839 void pg_pool_t::encode(ceph::buffer::list
& bl
, uint64_t features
) const
1842 if ((features
& CEPH_FEATURE_PGPOOL3
) == 0) {
1843 // this encoding matches the old struct ceph_pg_pool
1845 encode(struct_v
, bl
);
1848 encode(crush_rule
, bl
);
1849 encode(object_hash
, bl
);
1851 encode(pgp_num
, bl
);
1852 __u32 lpg_num
= 0, lpgp_num
= 0; // tell old code that there are no localized pgs.
1853 encode(lpg_num
, bl
);
1854 encode(lpgp_num
, bl
);
1855 encode(last_change
, bl
);
1856 encode(snap_seq
, bl
);
1857 encode(snap_epoch
, bl
);
1859 __u32 n
= snaps
.size();
1861 n
= removed_snaps
.num_intervals();
1866 encode_nohead(snaps
, bl
, features
);
1867 encode_nohead(removed_snaps
, bl
);
1871 if ((features
& CEPH_FEATURE_OSDENC
) == 0) {
1873 encode(struct_v
, bl
);
1876 encode(crush_rule
, bl
);
1877 encode(object_hash
, bl
);
1879 encode(pgp_num
, bl
);
1880 __u32 lpg_num
= 0, lpgp_num
= 0; // tell old code that there are no localized pgs.
1881 encode(lpg_num
, bl
);
1882 encode(lpgp_num
, bl
);
1883 encode(last_change
, bl
);
1884 encode(snap_seq
, bl
);
1885 encode(snap_epoch
, bl
);
1886 encode(snaps
, bl
, features
);
1887 encode(removed_snaps
, bl
);
1890 encode((uint32_t)0, bl
); // crash_replay_interval
1894 if ((features
& CEPH_FEATURE_OSD_POOLRESEND
) == 0) {
1895 // we simply added last_force_op_resend here, which is a fully
1896 // backward compatible change. however, encoding the same map
1897 // differently between monitors triggers scrub noise (even though
1898 // they are decodable without the feature), so let's be pedantic
1900 ENCODE_START(14, 5, bl
);
1903 encode(crush_rule
, bl
);
1904 encode(object_hash
, bl
);
1906 encode(pgp_num
, bl
);
1907 __u32 lpg_num
= 0, lpgp_num
= 0; // tell old code that there are no localized pgs.
1908 encode(lpg_num
, bl
);
1909 encode(lpgp_num
, bl
);
1910 encode(last_change
, bl
);
1911 encode(snap_seq
, bl
);
1912 encode(snap_epoch
, bl
);
1913 encode(snaps
, bl
, features
);
1914 encode(removed_snaps
, bl
);
1917 encode((uint32_t)0, bl
); // crash_replay_interval
1918 encode(min_size
, bl
);
1919 encode(quota_max_bytes
, bl
);
1920 encode(quota_max_objects
, bl
);
1922 encode(tier_of
, bl
);
1923 __u8 c
= cache_mode
;
1925 encode(read_tier
, bl
);
1926 encode(write_tier
, bl
);
1927 encode(properties
, bl
);
1928 encode(hit_set_params
, bl
);
1929 encode(hit_set_period
, bl
);
1930 encode(hit_set_count
, bl
);
1931 encode(stripe_width
, bl
);
1932 encode(target_max_bytes
, bl
);
1933 encode(target_max_objects
, bl
);
1934 encode(cache_target_dirty_ratio_micro
, bl
);
1935 encode(cache_target_full_ratio_micro
, bl
);
1936 encode(cache_min_flush_age
, bl
);
1937 encode(cache_min_evict_age
, bl
);
1938 encode(erasure_code_profile
, bl
);
1944 // NOTE: any new encoding dependencies must be reflected by
1945 // SIGNIFICANT_FEATURES
1946 if (!(features
& CEPH_FEATURE_NEW_OSDOP_ENCODING
)) {
1947 // this was the first post-hammer thing we added; if it's missing, encode
1950 } else if (!HAVE_FEATURE(features
, SERVER_LUMINOUS
)) {
1952 } else if (!HAVE_FEATURE(features
, SERVER_MIMIC
)) {
1954 } else if (!HAVE_FEATURE(features
, SERVER_NAUTILUS
)) {
1956 } else if (!is_stretch_pool()) {
1960 ENCODE_START(v
, 5, bl
);
1963 encode(crush_rule
, bl
);
1964 encode(object_hash
, bl
);
1966 encode(pgp_num
, bl
);
1967 __u32 lpg_num
= 0, lpgp_num
= 0; // tell old code that there are no localized pgs.
1968 encode(lpg_num
, bl
);
1969 encode(lpgp_num
, bl
);
1970 encode(last_change
, bl
);
1971 encode(snap_seq
, bl
);
1972 encode(snap_epoch
, bl
);
1973 encode(snaps
, bl
, features
);
1974 encode(removed_snaps
, bl
);
1980 tmp
&= ~(FLAG_SELFMANAGED_SNAPS
| FLAG_POOL_SNAPS
| FLAG_CREATING
);
1983 encode((uint32_t)0, bl
); // crash_replay_interval
1984 encode(min_size
, bl
);
1985 encode(quota_max_bytes
, bl
);
1986 encode(quota_max_objects
, bl
);
1988 encode(tier_of
, bl
);
1989 __u8 c
= cache_mode
;
1991 encode(read_tier
, bl
);
1992 encode(write_tier
, bl
);
1993 encode(properties
, bl
);
1994 encode(hit_set_params
, bl
);
1995 encode(hit_set_period
, bl
);
1996 encode(hit_set_count
, bl
);
1997 encode(stripe_width
, bl
);
1998 encode(target_max_bytes
, bl
);
1999 encode(target_max_objects
, bl
);
2000 encode(cache_target_dirty_ratio_micro
, bl
);
2001 encode(cache_target_full_ratio_micro
, bl
);
2002 encode(cache_min_flush_age
, bl
);
2003 encode(cache_min_evict_age
, bl
);
2004 encode(erasure_code_profile
, bl
);
2005 encode(last_force_op_resend_preluminous
, bl
);
2006 encode(min_read_recency_for_promote
, bl
);
2007 encode(expected_num_objects
, bl
);
2009 encode(cache_target_dirty_high_ratio_micro
, bl
);
2012 encode(min_write_recency_for_promote
, bl
);
2015 encode(use_gmt_hitset
, bl
);
2018 encode(fast_read
, bl
);
2021 encode(hit_set_grade_decay_rate
, bl
);
2022 encode(hit_set_search_last_n
, bl
);
2025 encode(opts
, bl
, features
);
2028 encode(last_force_op_resend_prenautilus
, bl
);
2031 encode(application_metadata
, bl
);
2034 encode(create_time
, bl
);
2037 encode(pg_num_target
, bl
);
2038 encode(pgp_num_target
, bl
);
2039 encode(pg_num_pending
, bl
);
2040 encode((epoch_t
)0, bl
); // pg_num_dec_last_epoch_started from 14.1.[01]
2041 encode((epoch_t
)0, bl
); // pg_num_dec_last_epoch_clean from 14.1.[01]
2042 encode(last_force_op_resend
, bl
);
2043 encode(pg_autoscale_mode
, bl
);
2046 encode(last_pg_merge_meta
, bl
);
2049 encode(peering_crush_bucket_count
, bl
);
2050 encode(peering_crush_bucket_target
, bl
);
2051 encode(peering_crush_bucket_barrier
, bl
);
2052 encode(peering_crush_mandatory_member
, bl
);
2057 void pg_pool_t::decode(ceph::buffer::list::const_iterator
& bl
)
2059 DECODE_START_LEGACY_COMPAT_LEN(30, 5, 5, bl
);
2062 decode(crush_rule
, bl
);
2063 decode(object_hash
, bl
);
2065 decode(pgp_num
, bl
);
2067 __u32 lpg_num
, lpgp_num
;
2068 decode(lpg_num
, bl
);
2069 decode(lpgp_num
, bl
);
2071 decode(last_change
, bl
);
2072 decode(snap_seq
, bl
);
2073 decode(snap_epoch
, bl
);
2075 if (struct_v
>= 3) {
2077 decode(removed_snaps
, bl
);
2084 decode_nohead(n
, snaps
, bl
);
2085 decode_nohead(m
, removed_snaps
, bl
);
2088 if (struct_v
>= 4) {
2090 uint32_t crash_replay_interval
;
2091 decode(crash_replay_interval
, bl
);
2095 // upgrade path for selfmanaged vs pool snaps
2096 if (snap_seq
> 0 && (flags
& (FLAG_SELFMANAGED_SNAPS
|FLAG_POOL_SNAPS
)) == 0) {
2097 if (!removed_snaps
.empty()) {
2098 flags
|= FLAG_SELFMANAGED_SNAPS
;
2100 flags
|= FLAG_POOL_SNAPS
;
2103 if (struct_v
>= 7) {
2104 decode(min_size
, bl
);
2106 min_size
= size
- size
/2;
2108 if (struct_v
>= 8) {
2109 decode(quota_max_bytes
, bl
);
2110 decode(quota_max_objects
, bl
);
2112 if (struct_v
>= 9) {
2114 decode(tier_of
, bl
);
2117 cache_mode
= (cache_mode_t
)v
;
2118 decode(read_tier
, bl
);
2119 decode(write_tier
, bl
);
2121 if (struct_v
>= 10) {
2122 decode(properties
, bl
);
2124 if (struct_v
>= 11) {
2125 decode(hit_set_params
, bl
);
2126 decode(hit_set_period
, bl
);
2127 decode(hit_set_count
, bl
);
2130 hit_set_period
= def
.hit_set_period
;
2131 hit_set_count
= def
.hit_set_count
;
2133 if (struct_v
>= 12) {
2134 decode(stripe_width
, bl
);
2136 set_stripe_width(0);
2138 if (struct_v
>= 13) {
2139 decode(target_max_bytes
, bl
);
2140 decode(target_max_objects
, bl
);
2141 decode(cache_target_dirty_ratio_micro
, bl
);
2142 decode(cache_target_full_ratio_micro
, bl
);
2143 decode(cache_min_flush_age
, bl
);
2144 decode(cache_min_evict_age
, bl
);
2146 target_max_bytes
= 0;
2147 target_max_objects
= 0;
2148 cache_target_dirty_ratio_micro
= 0;
2149 cache_target_full_ratio_micro
= 0;
2150 cache_min_flush_age
= 0;
2151 cache_min_evict_age
= 0;
2153 if (struct_v
>= 14) {
2154 decode(erasure_code_profile
, bl
);
2156 if (struct_v
>= 15) {
2157 decode(last_force_op_resend_preluminous
, bl
);
2159 last_force_op_resend_preluminous
= 0;
2161 if (struct_v
>= 16) {
2162 decode(min_read_recency_for_promote
, bl
);
2164 min_read_recency_for_promote
= 1;
2166 if (struct_v
>= 17) {
2167 decode(expected_num_objects
, bl
);
2169 expected_num_objects
= 0;
2171 if (struct_v
>= 19) {
2172 decode(cache_target_dirty_high_ratio_micro
, bl
);
2174 cache_target_dirty_high_ratio_micro
= cache_target_dirty_ratio_micro
;
2176 if (struct_v
>= 20) {
2177 decode(min_write_recency_for_promote
, bl
);
2179 min_write_recency_for_promote
= 1;
2181 if (struct_v
>= 21) {
2182 decode(use_gmt_hitset
, bl
);
2184 use_gmt_hitset
= false;
2186 if (struct_v
>= 22) {
2187 decode(fast_read
, bl
);
2191 if (struct_v
>= 23) {
2192 decode(hit_set_grade_decay_rate
, bl
);
2193 decode(hit_set_search_last_n
, bl
);
2195 hit_set_grade_decay_rate
= 0;
2196 hit_set_search_last_n
= 1;
2198 if (struct_v
>= 24) {
2201 if (struct_v
>= 25) {
2202 decode(last_force_op_resend_prenautilus
, bl
);
2204 last_force_op_resend_prenautilus
= last_force_op_resend_preluminous
;
2206 if (struct_v
>= 26) {
2207 decode(application_metadata
, bl
);
2209 if (struct_v
>= 27) {
2210 decode(create_time
, bl
);
2212 if (struct_v
>= 28) {
2213 decode(pg_num_target
, bl
);
2214 decode(pgp_num_target
, bl
);
2215 decode(pg_num_pending
, bl
);
2216 epoch_t old_merge_last_epoch_clean
, old_merge_last_epoch_started
;
2217 decode(old_merge_last_epoch_started
, bl
);
2218 decode(old_merge_last_epoch_clean
, bl
);
2219 decode(last_force_op_resend
, bl
);
2220 decode(pg_autoscale_mode
, bl
);
2221 if (struct_v
>= 29) {
2222 decode(last_pg_merge_meta
, bl
);
2224 last_pg_merge_meta
.last_epoch_clean
= old_merge_last_epoch_clean
;
2225 last_pg_merge_meta
.last_epoch_started
= old_merge_last_epoch_started
;
2228 pg_num_target
= pg_num
;
2229 pgp_num_target
= pgp_num
;
2230 pg_num_pending
= pg_num
;
2231 last_force_op_resend
= last_force_op_resend_prenautilus
;
2232 pg_autoscale_mode
= pg_autoscale_mode_t::WARN
; // default to warn on upgrade
2234 if (struct_v
>= 30) {
2235 decode(peering_crush_bucket_count
, bl
);
2236 decode(peering_crush_bucket_target
, bl
);
2237 decode(peering_crush_bucket_barrier
, bl
);
2238 decode(peering_crush_mandatory_member
, bl
);
// Check whether the OSDs in `want` satisfy this pool's stretch peering
// constraints.  Pools that are not stretch pools always pass.
// Otherwise the crush ancestor (of type peering_crush_bucket_barrier) of each
// wanted OSD is collected into `ancestors`, and two conditions are tested:
// fewer distinct ancestors than peering_crush_bucket_count, or a configured
// mandatory member that is missing from the set.  Each case writes a
// diagnostic through `out`.
// NOTE(review): the declaration of `ancestors`, any null check on `out`, the
// tail of the get_parent_of_type() call and the return statements are not
// visible in this excerpt.
2245 bool pg_pool_t::stretch_set_can_peer(const set
<int>& want
, const OSDMap
& osdmap
,
2246 std::ostream
* out
) const
2248 if (!is_stretch_pool()) return true;
2249 const uint32_t barrier_id
= peering_crush_bucket_barrier
;
2250 const uint32_t barrier_count
= peering_crush_bucket_count
;
2252 const shared_ptr
<CrushWrapper
>& crush
= osdmap
.crush
;
// Collect the barrier-type ancestor of every wanted OSD.
2253 for (int osdid
: want
) {
2254 int ancestor
= crush
->get_parent_of_type(osdid
, barrier_id
,
2256 ancestors
.insert(ancestor
);
// Too few distinct buckets represented in the want set.
2258 if (ancestors
.size() < barrier_count
) {
2260 *out
<< __func__
<< ": not enough crush buckets with OSDs in want set "
// A mandatory member is configured but absent from the ancestors.
2264 } else if (peering_crush_mandatory_member
!= CRUSH_ITEM_NONE
&&
2265 !ancestors
.count(peering_crush_mandatory_member
)) {
2267 *out
<< __func__
<< ": missing mandatory crush bucket member "
2268 << peering_crush_mandatory_member
;
2275 void pg_pool_t::generate_test_instances(list
<pg_pool_t
*>& o
)
2278 o
.push_back(new pg_pool_t(a
));
2280 a
.create_time
= utime_t(4,5);
2281 a
.type
= TYPE_REPLICATED
;
2287 a
.pgp_num_target
= 4;
2288 a
.pg_num_target
= 5;
2289 a
.pg_num_pending
= 5;
2290 a
.last_pg_merge_meta
.last_epoch_started
= 2;
2291 a
.last_pg_merge_meta
.last_epoch_clean
= 2;
2293 a
.last_force_op_resend
= 123823;
2294 a
.last_force_op_resend_preluminous
= 123824;
2297 a
.flags
= FLAG_POOL_SNAPS
;
2299 a
.quota_max_bytes
= 473;
2300 a
.quota_max_objects
= 474;
2301 o
.push_back(new pg_pool_t(a
));
2303 a
.snaps
[3].name
= "asdf";
2304 a
.snaps
[3].snapid
= 3;
2305 a
.snaps
[3].stamp
= utime_t(123, 4);
2306 a
.snaps
[6].name
= "qwer";
2307 a
.snaps
[6].snapid
= 6;
2308 a
.snaps
[6].stamp
= utime_t(23423, 4);
2309 o
.push_back(new pg_pool_t(a
));
2311 a
.flags
= FLAG_SELFMANAGED_SNAPS
;
2313 a
.removed_snaps
.insert(2);
2314 a
.quota_max_bytes
= 2473;
2315 a
.quota_max_objects
= 4374;
2319 a
.cache_mode
= CACHEMODE_WRITEBACK
;
2322 a
.hit_set_params
= HitSet::Params(new BloomHitSet::Params
);
2323 a
.hit_set_period
= 3600;
2324 a
.hit_set_count
= 8;
2325 a
.min_read_recency_for_promote
= 1;
2326 a
.min_write_recency_for_promote
= 1;
2327 a
.hit_set_grade_decay_rate
= 50;
2328 a
.hit_set_search_last_n
= 1;
2329 a
.calc_grade_table();
2330 a
.set_stripe_width(12345);
2331 a
.target_max_bytes
= 1238132132;
2332 a
.target_max_objects
= 1232132;
2333 a
.cache_target_dirty_ratio_micro
= 187232;
2334 a
.cache_target_dirty_high_ratio_micro
= 309856;
2335 a
.cache_target_full_ratio_micro
= 987222;
2336 a
.cache_min_flush_age
= 231;
2337 a
.cache_min_evict_age
= 2321;
2338 a
.erasure_code_profile
= "profile in osdmap";
2339 a
.expected_num_objects
= 123456;
2340 a
.fast_read
= false;
2341 a
.application_metadata
= {{"rbd", {{"key", "value"}}}};
2342 o
.push_back(new pg_pool_t(a
));
2345 ostream
& operator<<(ostream
& out
, const pg_pool_t
& p
)
2347 out
<< p
.get_type_name();
2348 if (p
.get_type_name() == "erasure") {
2349 out
<< " profile " << p
.erasure_code_profile
;
2351 out
<< " size " << p
.get_size()
2352 << " min_size " << p
.get_min_size()
2353 << " crush_rule " << p
.get_crush_rule()
2354 << " object_hash " << p
.get_object_hash_name()
2355 << " pg_num " << p
.get_pg_num()
2356 << " pgp_num " << p
.get_pgp_num();
2357 if (p
.get_pg_num_target() != p
.get_pg_num()) {
2358 out
<< " pg_num_target " << p
.get_pg_num_target();
2360 if (p
.get_pgp_num_target() != p
.get_pgp_num()) {
2361 out
<< " pgp_num_target " << p
.get_pgp_num_target();
2363 if (p
.get_pg_num_pending() != p
.get_pg_num()) {
2364 out
<< " pg_num_pending " << p
.get_pg_num_pending();
2366 if (p
.pg_autoscale_mode
!= pg_pool_t::pg_autoscale_mode_t::UNKNOWN
) {
2367 out
<< " autoscale_mode " << p
.get_pg_autoscale_mode_name(p
.pg_autoscale_mode
);
2369 out
<< " last_change " << p
.get_last_change();
2370 if (p
.get_last_force_op_resend() ||
2371 p
.get_last_force_op_resend_prenautilus() ||
2372 p
.get_last_force_op_resend_preluminous())
2373 out
<< " lfor " << p
.get_last_force_op_resend() << "/"
2374 << p
.get_last_force_op_resend_prenautilus() << "/"
2375 << p
.get_last_force_op_resend_preluminous();
2377 out
<< " owner " << p
.get_auid();
2379 out
<< " flags " << p
.get_flags_string();
2380 if (p
.quota_max_bytes
)
2381 out
<< " max_bytes " << p
.quota_max_bytes
;
2382 if (p
.quota_max_objects
)
2383 out
<< " max_objects " << p
.quota_max_objects
;
2384 if (!p
.tiers
.empty())
2385 out
<< " tiers " << p
.tiers
;
2387 out
<< " tier_of " << p
.tier_of
;
2388 if (p
.has_read_tier())
2389 out
<< " read_tier " << p
.read_tier
;
2390 if (p
.has_write_tier())
2391 out
<< " write_tier " << p
.write_tier
;
2393 out
<< " cache_mode " << p
.get_cache_mode_name();
2394 if (p
.target_max_bytes
)
2395 out
<< " target_bytes " << p
.target_max_bytes
;
2396 if (p
.target_max_objects
)
2397 out
<< " target_objects " << p
.target_max_objects
;
2398 if (p
.hit_set_params
.get_type() != HitSet::TYPE_NONE
) {
2399 out
<< " hit_set " << p
.hit_set_params
2400 << " " << p
.hit_set_period
<< "s"
2401 << " x" << p
.hit_set_count
<< " decay_rate "
2402 << p
.hit_set_grade_decay_rate
2403 << " search_last_n " << p
.hit_set_search_last_n
;
2405 if (p
.min_read_recency_for_promote
)
2406 out
<< " min_read_recency_for_promote " << p
.min_read_recency_for_promote
;
2407 if (p
.min_write_recency_for_promote
)
2408 out
<< " min_write_recency_for_promote " << p
.min_write_recency_for_promote
;
2409 out
<< " stripe_width " << p
.get_stripe_width();
2410 if (p
.expected_num_objects
)
2411 out
<< " expected_num_objects " << p
.expected_num_objects
;
2413 out
<< " fast_read " << p
.fast_read
;
2415 if (!p
.application_metadata
.empty()) {
2416 out
<< " application ";
2417 for (auto it
= p
.application_metadata
.begin();
2418 it
!= p
.application_metadata
.end(); ++it
) {
2419 if (it
!= p
.application_metadata
.begin())
2428 // -- object_stat_sum_t --
2430 void object_stat_sum_t::dump(Formatter
*f
) const
2432 f
->dump_int("num_bytes", num_bytes
);
2433 f
->dump_int("num_objects", num_objects
);
2434 f
->dump_int("num_object_clones", num_object_clones
);
2435 f
->dump_int("num_object_copies", num_object_copies
);
2436 f
->dump_int("num_objects_missing_on_primary", num_objects_missing_on_primary
);
2437 f
->dump_int("num_objects_missing", num_objects_missing
);
2438 f
->dump_int("num_objects_degraded", num_objects_degraded
);
2439 f
->dump_int("num_objects_misplaced", num_objects_misplaced
);
2440 f
->dump_int("num_objects_unfound", num_objects_unfound
);
2441 f
->dump_int("num_objects_dirty", num_objects_dirty
);
2442 f
->dump_int("num_whiteouts", num_whiteouts
);
2443 f
->dump_int("num_read", num_rd
);
2444 f
->dump_int("num_read_kb", num_rd_kb
);
2445 f
->dump_int("num_write", num_wr
);
2446 f
->dump_int("num_write_kb", num_wr_kb
);
2447 f
->dump_int("num_scrub_errors", num_scrub_errors
);
2448 f
->dump_int("num_shallow_scrub_errors", num_shallow_scrub_errors
);
2449 f
->dump_int("num_deep_scrub_errors", num_deep_scrub_errors
);
2450 f
->dump_int("num_objects_recovered", num_objects_recovered
);
2451 f
->dump_int("num_bytes_recovered", num_bytes_recovered
);
2452 f
->dump_int("num_keys_recovered", num_keys_recovered
);
2453 f
->dump_int("num_objects_omap", num_objects_omap
);
2454 f
->dump_int("num_objects_hit_set_archive", num_objects_hit_set_archive
);
2455 f
->dump_int("num_bytes_hit_set_archive", num_bytes_hit_set_archive
);
2456 f
->dump_int("num_flush", num_flush
);
2457 f
->dump_int("num_flush_kb", num_flush_kb
);
2458 f
->dump_int("num_evict", num_evict
);
2459 f
->dump_int("num_evict_kb", num_evict_kb
);
2460 f
->dump_int("num_promote", num_promote
);
2461 f
->dump_int("num_flush_mode_high", num_flush_mode_high
);
2462 f
->dump_int("num_flush_mode_low", num_flush_mode_low
);
2463 f
->dump_int("num_evict_mode_some", num_evict_mode_some
);
2464 f
->dump_int("num_evict_mode_full", num_evict_mode_full
);
2465 f
->dump_int("num_objects_pinned", num_objects_pinned
);
2466 f
->dump_int("num_legacy_snapsets", num_legacy_snapsets
);
2467 f
->dump_int("num_large_omap_objects", num_large_omap_objects
);
2468 f
->dump_int("num_objects_manifest", num_objects_manifest
);
2469 f
->dump_int("num_omap_bytes", num_omap_bytes
);
2470 f
->dump_int("num_omap_keys", num_omap_keys
);
2471 f
->dump_int("num_objects_repaired", num_objects_repaired
);
// Serialize the counters into bl inside an ENCODE_START(20, 14, bl) section.
// Under CEPH_LITTLE_ENDIAN the listing appends sizeof(object_stat_sum_t) raw
// bytes starting at &num_bytes; the field-by-field encode() calls come after.
// NOTE(review): this excerpt shows no #else/#endif, no encode() for num_rd or
// num_wr (although num_rd_kb / num_wr_kb are present) and no closing of the
// section -- confirm against the complete source before relying on the field
// order shown here.
2474 void object_stat_sum_t::encode(ceph::buffer::list
& bl
) const
2476 ENCODE_START(20, 14, bl
);
2477 #if defined(CEPH_LITTLE_ENDIAN)
// Raw copy of the object's bytes, beginning at the first counter.
2478 bl
.append((char *)(&num_bytes
), sizeof(object_stat_sum_t
));
// Field-by-field form of the same data.
2480 encode(num_bytes
, bl
);
2481 encode(num_objects
, bl
);
2482 encode(num_object_clones
, bl
);
2483 encode(num_object_copies
, bl
);
2484 encode(num_objects_missing_on_primary
, bl
);
2485 encode(num_objects_degraded
, bl
);
2486 encode(num_objects_unfound
, bl
);
2488 encode(num_rd_kb
, bl
);
2490 encode(num_wr_kb
, bl
);
2491 encode(num_scrub_errors
, bl
);
2492 encode(num_objects_recovered
, bl
);
2493 encode(num_bytes_recovered
, bl
);
2494 encode(num_keys_recovered
, bl
);
2495 encode(num_shallow_scrub_errors
, bl
);
2496 encode(num_deep_scrub_errors
, bl
);
2497 encode(num_objects_dirty
, bl
);
2498 encode(num_whiteouts
, bl
);
2499 encode(num_objects_omap
, bl
);
2500 encode(num_objects_hit_set_archive
, bl
);
2501 encode(num_objects_misplaced
, bl
);
2502 encode(num_bytes_hit_set_archive
, bl
);
2503 encode(num_flush
, bl
);
2504 encode(num_flush_kb
, bl
);
2505 encode(num_evict
, bl
);
2506 encode(num_evict_kb
, bl
);
2507 encode(num_promote
, bl
);
2508 encode(num_flush_mode_high
, bl
);
2509 encode(num_flush_mode_low
, bl
);
2510 encode(num_evict_mode_some
, bl
);
2511 encode(num_evict_mode_full
, bl
);
2512 encode(num_objects_pinned
, bl
);
2513 encode(num_objects_missing
, bl
);
2514 encode(num_legacy_snapsets
, bl
);
2515 encode(num_large_omap_objects
, bl
);
2516 encode(num_objects_manifest
, bl
);
2517 encode(num_omap_bytes
, bl
);
2518 encode(num_omap_keys
, bl
);
2519 encode(num_objects_repaired
, bl
);
// Decode the counters from bl inside DECODE_START(STAT_SUM_DECODE_VERSION, bl).
// On CEPH_LITTLE_ENDIAN builds, when struct_v equals STAT_SUM_DECODE_VERSION
// (20), sizeof(object_stat_sum_t) bytes are copied over the object starting at
// &num_bytes and decode_finish is set. While decode_finish is still false the
// fields are decoded one by one, the later ones only when struct_v is high
// enough.
// NOTE(review): this excerpt shows no decode() for num_rd or num_wr, and the
// #endif, else-branches and the end of the decode section are not visible --
// confirm against the complete source.
2524 void object_stat_sum_t::decode(ceph::buffer::list::const_iterator
& bl
)
2526 bool decode_finish
= false;
2527 static const int STAT_SUM_DECODE_VERSION
= 20;
2528 DECODE_START(STAT_SUM_DECODE_VERSION
, bl
);
2529 #if defined(CEPH_LITTLE_ENDIAN)
// Same-version input: take the raw byte image in one copy.
2530 if (struct_v
== STAT_SUM_DECODE_VERSION
) {
2531 bl
.copy(sizeof(object_stat_sum_t
), (char*)(&num_bytes
));
2532 decode_finish
= true;
// Field-by-field path, used whenever the raw copy above did not run.
2535 if (!decode_finish
) {
2536 decode(num_bytes
, bl
);
2537 decode(num_objects
, bl
);
2538 decode(num_object_clones
, bl
);
2539 decode(num_object_copies
, bl
);
2540 decode(num_objects_missing_on_primary
, bl
);
2541 decode(num_objects_degraded
, bl
);
2542 decode(num_objects_unfound
, bl
);
2544 decode(num_rd_kb
, bl
);
2546 decode(num_wr_kb
, bl
);
2547 decode(num_scrub_errors
, bl
);
2548 decode(num_objects_recovered
, bl
);
2549 decode(num_bytes_recovered
, bl
);
2550 decode(num_keys_recovered
, bl
);
2551 decode(num_shallow_scrub_errors
, bl
);
2552 decode(num_deep_scrub_errors
, bl
);
2553 decode(num_objects_dirty
, bl
);
2554 decode(num_whiteouts
, bl
);
2555 decode(num_objects_omap
, bl
);
2556 decode(num_objects_hit_set_archive
, bl
);
2557 decode(num_objects_misplaced
, bl
);
2558 decode(num_bytes_hit_set_archive
, bl
);
2559 decode(num_flush
, bl
);
2560 decode(num_flush_kb
, bl
);
2561 decode(num_evict
, bl
);
2562 decode(num_evict_kb
, bl
);
2563 decode(num_promote
, bl
);
2564 decode(num_flush_mode_high
, bl
);
2565 decode(num_flush_mode_low
, bl
);
2566 decode(num_evict_mode_some
, bl
);
2567 decode(num_evict_mode_full
, bl
);
2568 decode(num_objects_pinned
, bl
);
2569 decode(num_objects_missing
, bl
);
// The remaining fields are read only when struct_v reaches the stated value.
// num_legacy_snapsets also has a substitute assignment from num_object_clones
// (presumably the else-branch of the struct_v >= 16 test; that line is not
// visible here).
2570 if (struct_v
>= 16) {
2571 decode(num_legacy_snapsets
, bl
);
2573 num_legacy_snapsets
= num_object_clones
; // upper bound
2575 if (struct_v
>= 17) {
2576 decode(num_large_omap_objects
, bl
);
2578 if (struct_v
>= 18) {
2579 decode(num_objects_manifest
, bl
);
2581 if (struct_v
>= 19) {
2582 decode(num_omap_bytes
, bl
);
2583 decode(num_omap_keys
, bl
);
2585 if (struct_v
>= 20) {
2586 decode(num_objects_repaired
, bl
);
// Build one populated object_stat_sum_t and append a heap-allocated copy of it
// to o. num_scrub_errors is set to the sum of the deep and shallow scrub error
// counts assigned just before it.
// NOTE(review): assignments for some counters (for example num_bytes,
// num_objects, num_flush, num_evict, num_promote) are not visible in this
// excerpt -- confirm against the complete source.
2592 void object_stat_sum_t::generate_test_instances(list
<object_stat_sum_t
*>& o
)
2594 object_stat_sum_t a
;
2598 a
.num_object_clones
= 4;
2599 a
.num_object_copies
= 5;
2600 a
.num_objects_missing_on_primary
= 6;
2601 a
.num_objects_missing
= 123;
2602 a
.num_objects_degraded
= 7;
2603 a
.num_objects_unfound
= 8;
2604 a
.num_rd
= 9; a
.num_rd_kb
= 10;
2605 a
.num_wr
= 11; a
.num_wr_kb
= 12;
2606 a
.num_objects_recovered
= 14;
2607 a
.num_bytes_recovered
= 15;
2608 a
.num_keys_recovered
= 16;
2609 a
.num_deep_scrub_errors
= 17;
2610 a
.num_shallow_scrub_errors
= 18;
// Total is kept consistent with the two per-kind counts above.
2611 a
.num_scrub_errors
= a
.num_deep_scrub_errors
+ a
.num_shallow_scrub_errors
;
2612 a
.num_objects_dirty
= 21;
2613 a
.num_whiteouts
= 22;
2614 a
.num_objects_misplaced
= 1232;
2615 a
.num_objects_hit_set_archive
= 2;
2616 a
.num_bytes_hit_set_archive
= 27;
2622 a
.num_flush_mode_high
= 0;
2623 a
.num_flush_mode_low
= 1;
2624 a
.num_evict_mode_some
= 1;
2625 a
.num_evict_mode_full
= 0;
2626 a
.num_objects_pinned
= 20;
2627 a
.num_large_omap_objects
= 5;
2628 a
.num_objects_manifest
= 2;
2629 a
.num_omap_bytes
= 20000;
2630 a
.num_omap_keys
= 200;
2631 a
.num_objects_repaired
= 300;
// The list receives a raw pointer to a new copy of a.
2632 o
.push_back(new object_stat_sum_t(a
));
// Accumulate o into this object: each counter listed below is increased by the
// counter of the same name in o.
// NOTE(review): num_rd and num_wr are not among the lines visible in this
// excerpt (num_rd_kb / num_wr_kb are) -- confirm against the complete source.
2635 void object_stat_sum_t::add(const object_stat_sum_t
& o
)
2637 num_bytes
+= o
.num_bytes
;
2638 num_objects
+= o
.num_objects
;
2639 num_object_clones
+= o
.num_object_clones
;
2640 num_object_copies
+= o
.num_object_copies
;
2641 num_objects_missing_on_primary
+= o
.num_objects_missing_on_primary
;
2642 num_objects_missing
+= o
.num_objects_missing
;
2643 num_objects_degraded
+= o
.num_objects_degraded
;
2644 num_objects_misplaced
+= o
.num_objects_misplaced
;
2646 num_rd_kb
+= o
.num_rd_kb
;
2648 num_wr_kb
+= o
.num_wr_kb
;
2649 num_objects_unfound
+= o
.num_objects_unfound
;
2650 num_scrub_errors
+= o
.num_scrub_errors
;
2651 num_shallow_scrub_errors
+= o
.num_shallow_scrub_errors
;
2652 num_deep_scrub_errors
+= o
.num_deep_scrub_errors
;
2653 num_objects_recovered
+= o
.num_objects_recovered
;
2654 num_bytes_recovered
+= o
.num_bytes_recovered
;
2655 num_keys_recovered
+= o
.num_keys_recovered
;
2656 num_objects_dirty
+= o
.num_objects_dirty
;
2657 num_whiteouts
+= o
.num_whiteouts
;
2658 num_objects_omap
+= o
.num_objects_omap
;
2659 num_objects_hit_set_archive
+= o
.num_objects_hit_set_archive
;
2660 num_bytes_hit_set_archive
+= o
.num_bytes_hit_set_archive
;
2661 num_flush
+= o
.num_flush
;
2662 num_flush_kb
+= o
.num_flush_kb
;
2663 num_evict
+= o
.num_evict
;
2664 num_evict_kb
+= o
.num_evict_kb
;
2665 num_promote
+= o
.num_promote
;
2666 num_flush_mode_high
+= o
.num_flush_mode_high
;
2667 num_flush_mode_low
+= o
.num_flush_mode_low
;
2668 num_evict_mode_some
+= o
.num_evict_mode_some
;
2669 num_evict_mode_full
+= o
.num_evict_mode_full
;
2670 num_objects_pinned
+= o
.num_objects_pinned
;
2671 num_legacy_snapsets
+= o
.num_legacy_snapsets
;
2672 num_large_omap_objects
+= o
.num_large_omap_objects
;
2673 num_objects_manifest
+= o
.num_objects_manifest
;
2674 num_omap_bytes
+= o
.num_omap_bytes
;
2675 num_omap_keys
+= o
.num_omap_keys
;
2676 num_objects_repaired
+= o
.num_objects_repaired
;
// Subtract o from this object: each counter listed below is decreased by the
// counter of the same name in o. No clamping is applied in the lines shown.
// NOTE(review): num_rd and num_wr are not among the lines visible in this
// excerpt (num_rd_kb / num_wr_kb are) -- confirm against the complete source.
2679 void object_stat_sum_t::sub(const object_stat_sum_t
& o
)
2681 num_bytes
-= o
.num_bytes
;
2682 num_objects
-= o
.num_objects
;
2683 num_object_clones
-= o
.num_object_clones
;
2684 num_object_copies
-= o
.num_object_copies
;
2685 num_objects_missing_on_primary
-= o
.num_objects_missing_on_primary
;
2686 num_objects_missing
-= o
.num_objects_missing
;
2687 num_objects_degraded
-= o
.num_objects_degraded
;
2688 num_objects_misplaced
-= o
.num_objects_misplaced
;
2690 num_rd_kb
-= o
.num_rd_kb
;
2692 num_wr_kb
-= o
.num_wr_kb
;
2693 num_objects_unfound
-= o
.num_objects_unfound
;
2694 num_scrub_errors
-= o
.num_scrub_errors
;
2695 num_shallow_scrub_errors
-= o
.num_shallow_scrub_errors
;
2696 num_deep_scrub_errors
-= o
.num_deep_scrub_errors
;
2697 num_objects_recovered
-= o
.num_objects_recovered
;
2698 num_bytes_recovered
-= o
.num_bytes_recovered
;
2699 num_keys_recovered
-= o
.num_keys_recovered
;
2700 num_objects_dirty
-= o
.num_objects_dirty
;
2701 num_whiteouts
-= o
.num_whiteouts
;
2702 num_objects_omap
-= o
.num_objects_omap
;
2703 num_objects_hit_set_archive
-= o
.num_objects_hit_set_archive
;
2704 num_bytes_hit_set_archive
-= o
.num_bytes_hit_set_archive
;
2705 num_flush
-= o
.num_flush
;
2706 num_flush_kb
-= o
.num_flush_kb
;
2707 num_evict
-= o
.num_evict
;
2708 num_evict_kb
-= o
.num_evict_kb
;
2709 num_promote
-= o
.num_promote
;
2710 num_flush_mode_high
-= o
.num_flush_mode_high
;
2711 num_flush_mode_low
-= o
.num_flush_mode_low
;
2712 num_evict_mode_some
-= o
.num_evict_mode_some
;
2713 num_evict_mode_full
-= o
.num_evict_mode_full
;
2714 num_objects_pinned
-= o
.num_objects_pinned
;
2715 num_legacy_snapsets
-= o
.num_legacy_snapsets
;
2716 num_large_omap_objects
-= o
.num_large_omap_objects
;
2717 num_objects_manifest
-= o
.num_objects_manifest
;
2718 num_omap_bytes
-= o
.num_omap_bytes
;
2719 num_omap_keys
-= o
.num_omap_keys
;
2720 num_objects_repaired
-= o
.num_objects_repaired
;
2723 bool operator==(const object_stat_sum_t
& l
, const object_stat_sum_t
& r
)
2726 l
.num_bytes
== r
.num_bytes
&&
2727 l
.num_objects
== r
.num_objects
&&
2728 l
.num_object_clones
== r
.num_object_clones
&&
2729 l
.num_object_copies
== r
.num_object_copies
&&
2730 l
.num_objects_missing_on_primary
== r
.num_objects_missing_on_primary
&&
2731 l
.num_objects_missing
== r
.num_objects_missing
&&
2732 l
.num_objects_degraded
== r
.num_objects_degraded
&&
2733 l
.num_objects_misplaced
== r
.num_objects_misplaced
&&
2734 l
.num_objects_unfound
== r
.num_objects_unfound
&&
2735 l
.num_rd
== r
.num_rd
&&
2736 l
.num_rd_kb
== r
.num_rd_kb
&&
2737 l
.num_wr
== r
.num_wr
&&
2738 l
.num_wr_kb
== r
.num_wr_kb
&&
2739 l
.num_scrub_errors
== r
.num_scrub_errors
&&
2740 l
.num_shallow_scrub_errors
== r
.num_shallow_scrub_errors
&&
2741 l
.num_deep_scrub_errors
== r
.num_deep_scrub_errors
&&
2742 l
.num_objects_recovered
== r
.num_objects_recovered
&&
2743 l
.num_bytes_recovered
== r
.num_bytes_recovered
&&
2744 l
.num_keys_recovered
== r
.num_keys_recovered
&&
2745 l
.num_objects_dirty
== r
.num_objects_dirty
&&
2746 l
.num_whiteouts
== r
.num_whiteouts
&&
2747 l
.num_objects_omap
== r
.num_objects_omap
&&
2748 l
.num_objects_hit_set_archive
== r
.num_objects_hit_set_archive
&&
2749 l
.num_bytes_hit_set_archive
== r
.num_bytes_hit_set_archive
&&
2750 l
.num_flush
== r
.num_flush
&&
2751 l
.num_flush_kb
== r
.num_flush_kb
&&
2752 l
.num_evict
== r
.num_evict
&&
2753 l
.num_evict_kb
== r
.num_evict_kb
&&
2754 l
.num_promote
== r
.num_promote
&&
2755 l
.num_flush_mode_high
== r
.num_flush_mode_high
&&
2756 l
.num_flush_mode_low
== r
.num_flush_mode_low
&&
2757 l
.num_evict_mode_some
== r
.num_evict_mode_some
&&
2758 l
.num_evict_mode_full
== r
.num_evict_mode_full
&&
2759 l
.num_objects_pinned
== r
.num_objects_pinned
&&
2760 l
.num_legacy_snapsets
== r
.num_legacy_snapsets
&&
2761 l
.num_large_omap_objects
== r
.num_large_omap_objects
&&
2762 l
.num_objects_manifest
== r
.num_objects_manifest
&&
2763 l
.num_omap_bytes
== r
.num_omap_bytes
&&
2764 l
.num_omap_keys
== r
.num_omap_keys
&&
2765 l
.num_objects_repaired
== r
.num_objects_repaired
;
2768 // -- object_stat_collection_t --
// Dump this collection to f; the visible part opens an object section named
// "stat_sum".
// NOTE(review): the statements that fill and close that section are not
// visible in this excerpt -- confirm against the complete source.
2770 void object_stat_collection_t::dump(Formatter
*f
) const
2772 f
->open_object_section("stat_sum");
// Encode this collection into bl inside an ENCODE_START(2, 2, bl) section.
// A literal (__u32)0 is written; presumably it is the element count of an
// empty map matching the cat_sum map that decode() reads -- TODO confirm.
// NOTE(review): any other encoded member and the end of the section are not
// visible in this excerpt.
2777 void object_stat_collection_t::encode(ceph::buffer::list
& bl
) const
2779 ENCODE_START(2, 2, bl
);
2781 encode((__u32
)0, bl
);
// Decode this collection from bl using DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl).
// A map<string,object_stat_sum_t> is decoded into the local cat_sum; nothing
// in the visible lines uses it afterwards, so it looks like it is read only to
// consume the bytes -- TODO confirm.
// NOTE(review): the remaining decode statements and the end of the section are
// not visible in this excerpt.
2785 void object_stat_collection_t::decode(ceph::buffer::list::const_iterator
& bl
)
2787 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl
);
2790 map
<string
,object_stat_sum_t
> cat_sum
;
2791 decode(cat_sum
, bl
);
// Append test instances to o: first a copy of a default-constructed
// collection, then one more copy of a for every entry produced by
// object_stat_sum_t::generate_test_instances().
// NOTE(review): the loop statement that would update a from *p is not visible
// in this excerpt -- confirm against the complete source.
2796 void object_stat_collection_t::generate_test_instances(list
<object_stat_collection_t
*>& o
)
2798 object_stat_collection_t a
;
2799 o
.push_back(new object_stat_collection_t(a
));
2800 list
<object_stat_sum_t
*> l
;
2801 object_stat_sum_t::generate_test_instances(l
);
2802 for (auto p
= l
.begin(); p
!= l
.end(); ++p
) {
2804 o
.push_back(new object_stat_collection_t(a
));
// Test osd against this PG's acting set. With primary set, osd is compared to
// acting_primary; with primary unset, the acting container is iterated.
// NOTE(review): the return statements and the loop body are not visible in
// this excerpt, so the exact result for each branch must be confirmed against
// the complete source.
2811 bool pg_stat_t::is_acting_osd(int32_t osd
, bool primary
) const
2813 if (primary
&& osd
== acting_primary
) {
2815 } else if (!primary
) {
2816 for(auto it
= acting
.cbegin(); it
!= acting
.cend(); ++it
)
// Dump the full PG statistics record to f. Scalar members are written with
// dump_stream / dump_unsigned / dump_int / dump_bool; "state" is written as
// the string returned by pg_state_string(state) and "scrub_schedule" as the
// string returned by dump_scrub_schedule(). The container members follow as
// array sections ("up", "acting", "avail_no_missing",
// "object_location_counts", "blocked_by", "purged_snaps").
// NOTE(review): the close_section() calls and any nested dump between
// "scrub_schedule" and "up" are not visible in this excerpt.
2825 void pg_stat_t::dump(Formatter
*f
) const
2827 f
->dump_stream("version") << version
;
2828 f
->dump_unsigned("reported_seq", reported_seq
);
2829 f
->dump_unsigned("reported_epoch", reported_epoch
);
2830 f
->dump_string("state", pg_state_string(state
));
2831 f
->dump_stream("last_fresh") << last_fresh
;
2832 f
->dump_stream("last_change") << last_change
;
2833 f
->dump_stream("last_active") << last_active
;
2834 f
->dump_stream("last_peered") << last_peered
;
2835 f
->dump_stream("last_clean") << last_clean
;
2836 f
->dump_stream("last_became_active") << last_became_active
;
2837 f
->dump_stream("last_became_peered") << last_became_peered
;
2838 f
->dump_stream("last_unstale") << last_unstale
;
2839 f
->dump_stream("last_undegraded") << last_undegraded
;
2840 f
->dump_stream("last_fullsized") << last_fullsized
;
2841 f
->dump_unsigned("mapping_epoch", mapping_epoch
);
2842 f
->dump_stream("log_start") << log_start
;
2843 f
->dump_stream("ondisk_log_start") << ondisk_log_start
;
2844 f
->dump_unsigned("created", created
);
2845 f
->dump_unsigned("last_epoch_clean", last_epoch_clean
);
2846 f
->dump_stream("parent") << parent
;
2847 f
->dump_unsigned("parent_split_bits", parent_split_bits
);
2848 f
->dump_stream("last_scrub") << last_scrub
;
2849 f
->dump_stream("last_scrub_stamp") << last_scrub_stamp
;
2850 f
->dump_stream("last_deep_scrub") << last_deep_scrub
;
2851 f
->dump_stream("last_deep_scrub_stamp") << last_deep_scrub_stamp
;
2852 f
->dump_stream("last_clean_scrub_stamp") << last_clean_scrub_stamp
;
2853 f
->dump_int("objects_scrubbed", objects_scrubbed
);
2854 f
->dump_int("log_size", log_size
);
2855 f
->dump_int("ondisk_log_size", ondisk_log_size
);
2856 f
->dump_bool("stats_invalid", stats_invalid
);
2857 f
->dump_bool("dirty_stats_invalid", dirty_stats_invalid
);
2858 f
->dump_bool("omap_stats_invalid", omap_stats_invalid
);
2859 f
->dump_bool("hitset_stats_invalid", hitset_stats_invalid
);
2860 f
->dump_bool("hitset_bytes_stats_invalid", hitset_bytes_stats_invalid
);
2861 f
->dump_bool("pin_stats_invalid", pin_stats_invalid
);
2862 f
->dump_bool("manifest_stats_invalid", manifest_stats_invalid
);
2863 f
->dump_unsigned("snaptrimq_len", snaptrimq_len
);
2864 f
->dump_int("last_scrub_duration", last_scrub_duration
);
2865 f
->dump_string("scrub_schedule", dump_scrub_schedule());
// Each element of up / acting / blocked_by is written under the key "osd".
2867 f
->open_array_section("up");
2868 for (auto p
= up
.cbegin(); p
!= up
.cend(); ++p
)
2869 f
->dump_int("osd", *p
);
2871 f
->open_array_section("acting");
2872 for (auto p
= acting
.cbegin(); p
!= acting
.cend(); ++p
)
2873 f
->dump_int("osd", *p
);
2875 f
->open_array_section("avail_no_missing");
2876 for (auto p
= avail_no_missing
.cbegin(); p
!= avail_no_missing
.cend(); ++p
)
2877 f
->dump_stream("shard") << *p
;
// One "entry" object per map element: the key is streamed as "shards", the
// mapped value is written as "objects".
2879 f
->open_array_section("object_location_counts");
2880 for (auto p
= object_location_counts
.cbegin(); p
!= object_location_counts
.cend(); ++p
) {
2881 f
->open_object_section("entry");
2882 f
->dump_stream("shards") << p
->first
;
2883 f
->dump_int("objects", p
->second
);
2887 f
->open_array_section("blocked_by");
2888 for (auto p
= blocked_by
.cbegin(); p
!= blocked_by
.cend(); ++p
)
2889 f
->dump_int("osd", *p
);
2891 f
->dump_int("up_primary", up_primary
);
2892 f
->dump_int("acting_primary", acting_primary
);
// One "interval" object per purged_snaps entry, carrying its start and length.
2893 f
->open_array_section("purged_snaps");
2894 for (auto i
= purged_snaps
.begin(); i
!= purged_snaps
.end(); ++i
) {
2895 f
->open_object_section("interval");
2896 f
->dump_stream("start") << i
.get_start();
2897 f
->dump_stream("length") << i
.get_len();
// Dump a short form of the PG statistics to f: the state string, the "up" and
// "acting" arrays (elements keyed "osd"), and the two primaries.
// NOTE(review): the close_section() calls for the two arrays are not visible
// in this excerpt.
2903 void pg_stat_t::dump_brief(Formatter
*f
) const
2905 f
->dump_string("state", pg_state_string(state
));
2906 f
->open_array_section("up");
2907 for (auto p
= up
.cbegin(); p
!= up
.cend(); ++p
)
2908 f
->dump_int("osd", *p
);
2910 f
->open_array_section("acting");
2911 for (auto p
= acting
.cbegin(); p
!= acting
.cend(); ++p
)
2912 f
->dump_int("osd", *p
);
2914 f
->dump_int("up_primary", up_primary
);
2915 f
->dump_int("acting_primary", acting_primary
);
// Build the text describing this PG's scrub state from scrub_sched_status;
// pg_stat_t::dump() writes the result under the key "scrub_schedule".
// When m_is_active is set the "{}scrubbing for {}s" pattern is used with an
// optional "deep " prefix and m_duration_seconds. Otherwise the text is chosen
// by m_sched_status, and "SCRUB STATE MISMATCH!" is returned past the switch.
// NOTE(review): the calls that consume the "{}" patterns (presumably a
// format call -- confirm) and the result for the `unknown` case are not
// visible in this excerpt.
2918 std::string
pg_stat_t::dump_scrub_schedule() const
2920 if (scrub_sched_status
.m_is_active
) {
2922 "{}scrubbing for {}s",
2923 ((scrub_sched_status
.m_is_deep
== scrub_level_t::deep
) ? "deep " : ""),
2924 scrub_sched_status
.m_duration_seconds
);
2926 switch (scrub_sched_status
.m_sched_status
) {
2927 case pg_scrub_sched_status_t::unknown
:
2928 // no reported scrub schedule yet
2930 case pg_scrub_sched_status_t::not_queued
:
2931 return "no scrub is scheduled"s
;
2932 case pg_scrub_sched_status_t::scheduled
:
// Arguments: origin ("periodic" / "user requested"), optional "deep " prefix,
// then m_scheduled_at.
2934 "{} {}scrub scheduled @ {}",
2935 (scrub_sched_status
.m_is_periodic
? "periodic" : "user requested"),
2936 ((scrub_sched_status
.m_is_deep
== scrub_level_t::deep
) ? "deep " : ""),
2937 scrub_sched_status
.m_scheduled_at
);
2938 case pg_scrub_sched_status_t::queued
:
2940 "queued for {}scrub",
2941 ((scrub_sched_status
.m_is_deep
== scrub_level_t::deep
) ? "deep " : ""));
2944 return "SCRUB STATE MISMATCH!"s
;
2948 bool operator==(const pg_scrubbing_status_t
& l
, const pg_scrubbing_status_t
& r
)
2951 l
.m_sched_status
== r
.m_sched_status
&&
2952 l
.m_scheduled_at
== r
.m_scheduled_at
&&
2953 l
.m_duration_seconds
== r
.m_duration_seconds
&&
2954 l
.m_is_active
== r
.m_is_active
&&
2955 l
.m_is_deep
== r
.m_is_deep
&&
2956 l
.m_is_periodic
== r
.m_is_periodic
;
// Encode this pg_stat_t into bl inside an ENCODE_START(27, 22, bl) section.
// state is written in two pieces: its low 32 bits early on as (__u32)state
// ("for older peers") and its high 32 bits later as top_state. The scrub
// schedule is written member by member, with m_sched_status cast to __u16 and
// m_is_deep reduced to a bool (true when it equals scrub_level_t::deep).
// NOTE(review): several encode() lines (for example parent, stats, up, acting)
// and the end of the section are not visible in this excerpt -- confirm the
// full field order against the complete source.
2959 void pg_stat_t::encode(ceph::buffer::list
&bl
) const
2961 ENCODE_START(27, 22, bl
);
2962 encode(version
, bl
);
2963 encode(reported_seq
, bl
);
2964 encode(reported_epoch
, bl
);
2965 encode((__u32
)state
, bl
); // for older peers
2966 encode(log_start
, bl
);
2967 encode(ondisk_log_start
, bl
);
2968 encode(created
, bl
);
2969 encode(last_epoch_clean
, bl
);
2971 encode(parent_split_bits
, bl
);
2972 encode(last_scrub
, bl
);
2973 encode(last_scrub_stamp
, bl
);
2975 encode(log_size
, bl
);
2976 encode(ondisk_log_size
, bl
);
2979 encode(last_fresh
, bl
);
2980 encode(last_change
, bl
);
2981 encode(last_active
, bl
);
2982 encode(last_clean
, bl
);
2983 encode(last_unstale
, bl
);
2984 encode(mapping_epoch
, bl
);
2985 encode(last_deep_scrub
, bl
);
2986 encode(last_deep_scrub_stamp
, bl
);
2987 encode(stats_invalid
, bl
);
2988 encode(last_clean_scrub_stamp
, bl
);
2989 encode(last_became_active
, bl
);
2990 encode(dirty_stats_invalid
, bl
);
2991 encode(up_primary
, bl
);
2992 encode(acting_primary
, bl
);
2993 encode(omap_stats_invalid
, bl
);
2994 encode(hitset_stats_invalid
, bl
);
2995 encode(blocked_by
, bl
);
2996 encode(last_undegraded
, bl
);
2997 encode(last_fullsized
, bl
);
2998 encode(hitset_bytes_stats_invalid
, bl
);
2999 encode(last_peered
, bl
);
3000 encode(last_became_peered
, bl
);
3001 encode(pin_stats_invalid
, bl
);
3002 encode(snaptrimq_len
, bl
);
// High 32 bits of state; the low 32 bits were written above as (__u32)state.
3003 __u32 top_state
= (state
>> 32);
3004 encode(top_state
, bl
);
3005 encode(purged_snaps
, bl
);
3006 encode(manifest_stats_invalid
, bl
);
3007 encode(avail_no_missing
, bl
);
3008 encode(object_location_counts
, bl
);
3009 encode(last_scrub_duration
, bl
);
3010 encode(scrub_sched_status
.m_scheduled_at
, bl
);
3011 encode(scrub_sched_status
.m_duration_seconds
, bl
);
3012 encode((__u16
)scrub_sched_status
.m_sched_status
, bl
);
3013 encode(scrub_sched_status
.m_is_active
, bl
);
3014 encode((scrub_sched_status
.m_is_deep
==scrub_level_t::deep
), bl
);
3015 encode(scrub_sched_status
.m_is_periodic
, bl
);
3016 encode(objects_scrubbed
, bl
);
// Decode this pg_stat_t from bl inside DECODE_START(27, bl). The *_invalid
// flags and some scrub-schedule members are assigned from a temporary named
// tmp. Members added later are gated on struct_v (23, 24, 25, 26, 27);
// manifest_stats_invalid also has an assignment to true outside the decoded
// path.
// NOTE(review): the declarations of tmp, old_state and top_state, the
// decode(tmp, bl) calls, several member decodes (for example parent, stats,
// up, acting) and the else-branches are not visible in this excerpt --
// confirm against the complete source.
3021 void pg_stat_t::decode(ceph::buffer::list::const_iterator
&bl
)
3025 DECODE_START(27, bl
);
3026 decode(version
, bl
);
3027 decode(reported_seq
, bl
);
3028 decode(reported_epoch
, bl
);
3029 decode(old_state
, bl
);
3030 decode(log_start
, bl
);
3031 decode(ondisk_log_start
, bl
);
3032 decode(created
, bl
);
3033 decode(last_epoch_clean
, bl
);
3035 decode(parent_split_bits
, bl
);
3036 decode(last_scrub
, bl
);
3037 decode(last_scrub_stamp
, bl
);
3039 decode(log_size
, bl
);
3040 decode(ondisk_log_size
, bl
);
3043 decode(last_fresh
, bl
);
3044 decode(last_change
, bl
);
3045 decode(last_active
, bl
);
3046 decode(last_clean
, bl
);
3047 decode(last_unstale
, bl
);
3048 decode(mapping_epoch
, bl
);
3049 decode(last_deep_scrub
, bl
);
3050 decode(last_deep_scrub_stamp
, bl
);
3052 stats_invalid
= tmp
;
3053 decode(last_clean_scrub_stamp
, bl
);
3054 decode(last_became_active
, bl
);
3056 dirty_stats_invalid
= tmp
;
3057 decode(up_primary
, bl
);
3058 decode(acting_primary
, bl
);
3060 omap_stats_invalid
= tmp
;
3062 hitset_stats_invalid
= tmp
;
3063 decode(blocked_by
, bl
);
3064 decode(last_undegraded
, bl
);
3065 decode(last_fullsized
, bl
);
3067 hitset_bytes_stats_invalid
= tmp
;
3068 decode(last_peered
, bl
);
3069 decode(last_became_peered
, bl
);
3071 pin_stats_invalid
= tmp
;
3072 if (struct_v
>= 23) {
3073 decode(snaptrimq_len
, bl
);
3074 if (struct_v
>= 24) {
// state is rebuilt from the low word decoded earlier (old_state) and the high
// word decoded here (top_state).
3076 decode(top_state
, bl
);
3077 state
= (uint64_t)old_state
| ((uint64_t)top_state
<< 32);
3078 decode(purged_snaps
, bl
);
3082 if (struct_v
>= 25) {
3084 manifest_stats_invalid
= tmp
;
3086 manifest_stats_invalid
= true;
3088 if (struct_v
>= 26) {
3089 decode(avail_no_missing
, bl
);
3090 decode(object_location_counts
, bl
);
3092 if (struct_v
>= 27) {
3093 decode(last_scrub_duration
, bl
);
3094 decode(scrub_sched_status
.m_scheduled_at
, bl
);
3095 decode(scrub_sched_status
.m_duration_seconds
, bl
);
// m_sched_status travels as a __u16 and is cast back to the enum here.
3096 __u16 scrub_sched_as_u16
;
3097 decode(scrub_sched_as_u16
, bl
);
3098 scrub_sched_status
.m_sched_status
= (pg_scrub_sched_status_t
)(scrub_sched_as_u16
);
3100 scrub_sched_status
.m_is_active
= tmp
;
3102 scrub_sched_status
.m_is_deep
= tmp
? scrub_level_t::deep
: scrub_level_t::shallow
;
3104 scrub_sched_status
.m_is_periodic
= tmp
;
3105 decode(objects_scrubbed
, bl
);
// Append test instances to o. The same object a is pushed (by copy) three
// times: once before the assignments below, once after the first batch of
// assignments, and once more after further up / acting / blocked_by entries
// are added and acting_primary is changed to 124.
// a.stats is taken from the last element produced by
// object_stat_collection_t::generate_test_instances().
// NOTE(review): the declaration of a and several short assignments are not
// visible in this excerpt -- confirm against the complete source.
3111 void pg_stat_t::generate_test_instances(list
<pg_stat_t
*>& o
)
3114 o
.push_back(new pg_stat_t(a
));
3116 a
.version
= eversion_t(1, 3);
3117 a
.reported_epoch
= 1;
3120 a
.mapping_epoch
= 998;
3121 a
.last_fresh
= utime_t(1002, 1);
3122 a
.last_change
= utime_t(1002, 2);
3123 a
.last_active
= utime_t(1002, 3);
3124 a
.last_clean
= utime_t(1002, 4);
3125 a
.last_unstale
= utime_t(1002, 5);
3126 a
.last_undegraded
= utime_t(1002, 7);
3127 a
.last_fullsized
= utime_t(1002, 8);
3128 a
.log_start
= eversion_t(1, 4);
3129 a
.ondisk_log_start
= eversion_t(1, 5);
3131 a
.last_epoch_clean
= 7;
3132 a
.parent
= pg_t(1, 2);
3133 a
.parent_split_bits
= 12;
3134 a
.last_scrub
= eversion_t(9, 10);
3135 a
.last_scrub_stamp
= utime_t(11, 12);
3136 a
.last_deep_scrub
= eversion_t(13, 14);
3137 a
.last_deep_scrub_stamp
= utime_t(15, 16);
3138 a
.last_clean_scrub_stamp
= utime_t(17, 18);
3139 a
.last_scrub_duration
= 3617;
3140 a
.snaptrimq_len
= 1048576;
3141 a
.objects_scrubbed
= 0;
3142 list
<object_stat_collection_t
*> l
;
3143 object_stat_collection_t::generate_test_instances(l
);
3144 a
.stats
= *l
.back();
3146 a
.ondisk_log_size
= 88;
3147 a
.up
.push_back(123);
3149 a
.acting
.push_back(456);
3150 a
.avail_no_missing
.push_back(pg_shard_t(456, shard_id_t::NO_SHARD
));
// Two object_location_counts entries: shard set {0,1} -> 10, then the same
// set extended with shard 2 -> 5.
3151 set
<pg_shard_t
> sset
= { pg_shard_t(0), pg_shard_t(1) };
3152 a
.object_location_counts
.insert(make_pair(sset
, 10));
3153 sset
.insert(pg_shard_t(2));
3154 a
.object_location_counts
.insert(make_pair(sset
, 5));
3155 a
.acting_primary
= 456;
3156 o
.push_back(new pg_stat_t(a
));
3158 a
.up
.push_back(124);
3160 a
.acting
.push_back(124);
3161 a
.acting_primary
= 124;
3162 a
.blocked_by
.push_back(155);
3163 a
.blocked_by
.push_back(156);
3164 o
.push_back(new pg_stat_t(a
));
// Equality for pg_stat_t: a conjunction of member-by-member comparisons,
// including the nested stats and scrub_sched_status members (compared with
// their own operator==).
// NOTE(review): `up` does not appear among the compared members visible in
// this excerpt (`acting` does), and the `return` line is not visible --
// confirm against the complete source.
3167 bool operator==(const pg_stat_t
& l
, const pg_stat_t
& r
)
3170 l
.version
== r
.version
&&
3171 l
.reported_seq
== r
.reported_seq
&&
3172 l
.reported_epoch
== r
.reported_epoch
&&
3173 l
.state
== r
.state
&&
3174 l
.last_fresh
== r
.last_fresh
&&
3175 l
.last_change
== r
.last_change
&&
3176 l
.last_active
== r
.last_active
&&
3177 l
.last_peered
== r
.last_peered
&&
3178 l
.last_clean
== r
.last_clean
&&
3179 l
.last_unstale
== r
.last_unstale
&&
3180 l
.last_undegraded
== r
.last_undegraded
&&
3181 l
.last_fullsized
== r
.last_fullsized
&&
3182 l
.log_start
== r
.log_start
&&
3183 l
.ondisk_log_start
== r
.ondisk_log_start
&&
3184 l
.created
== r
.created
&&
3185 l
.last_epoch_clean
== r
.last_epoch_clean
&&
3186 l
.parent
== r
.parent
&&
3187 l
.parent_split_bits
== r
.parent_split_bits
&&
3188 l
.last_scrub
== r
.last_scrub
&&
3189 l
.last_deep_scrub
== r
.last_deep_scrub
&&
3190 l
.last_scrub_stamp
== r
.last_scrub_stamp
&&
3191 l
.last_deep_scrub_stamp
== r
.last_deep_scrub_stamp
&&
3192 l
.last_clean_scrub_stamp
== r
.last_clean_scrub_stamp
&&
3193 l
.stats
== r
.stats
&&
3194 l
.stats_invalid
== r
.stats_invalid
&&
3195 l
.log_size
== r
.log_size
&&
3196 l
.ondisk_log_size
== r
.ondisk_log_size
&&
3198 l
.acting
== r
.acting
&&
3199 l
.avail_no_missing
== r
.avail_no_missing
&&
3200 l
.object_location_counts
== r
.object_location_counts
&&
3201 l
.mapping_epoch
== r
.mapping_epoch
&&
3202 l
.blocked_by
== r
.blocked_by
&&
3203 l
.last_became_active
== r
.last_became_active
&&
3204 l
.last_became_peered
== r
.last_became_peered
&&
3205 l
.dirty_stats_invalid
== r
.dirty_stats_invalid
&&
3206 l
.omap_stats_invalid
== r
.omap_stats_invalid
&&
3207 l
.hitset_stats_invalid
== r
.hitset_stats_invalid
&&
3208 l
.hitset_bytes_stats_invalid
== r
.hitset_bytes_stats_invalid
&&
3209 l
.up_primary
== r
.up_primary
&&
3210 l
.acting_primary
== r
.acting_primary
&&
3211 l
.pin_stats_invalid
== r
.pin_stats_invalid
&&
3212 l
.manifest_stats_invalid
== r
.manifest_stats_invalid
&&
3213 l
.purged_snaps
== r
.purged_snaps
&&
3214 l
.snaptrimq_len
== r
.snaptrimq_len
&&
3215 l
.last_scrub_duration
== r
.last_scrub_duration
&&
3216 l
.scrub_sched_status
== r
.scrub_sched_status
&&
3217 l
.objects_scrubbed
== r
.objects_scrubbed
;
3220 // -- store_statfs_t --
3222 bool store_statfs_t::operator==(const store_statfs_t
& other
) const
3224 return total
== other
.total
3225 && available
== other
.available
3226 && allocated
== other
.allocated
3227 && internally_reserved
== other
.internally_reserved
3228 && data_stored
== other
.data_stored
3229 && data_compressed
== other
.data_compressed
3230 && data_compressed_allocated
== other
.data_compressed_allocated
3231 && data_compressed_original
== other
.data_compressed_original
3232 && omap_allocated
== other
.omap_allocated
3233 && internal_metadata
== other
.internal_metadata
;
3236 void store_statfs_t::dump(Formatter
*f
) const
3238 f
->dump_int("total", total
);
3239 f
->dump_int("available", available
);
3240 f
->dump_int("internally_reserved", internally_reserved
);
3241 f
->dump_int("allocated", allocated
);
3242 f
->dump_int("data_stored", data_stored
);
3243 f
->dump_int("data_compressed", data_compressed
);
3244 f
->dump_int("data_compressed_allocated", data_compressed_allocated
);
3245 f
->dump_int("data_compressed_original", data_compressed_original
);
3246 f
->dump_int("omap_allocated", omap_allocated
);
3247 f
->dump_int("internal_metadata", internal_metadata
);
// Stream a one-line summary of s to out, starting with "store_statfs(0x" and
// listing the members separated by "/0x" and labelled groups (", data 0x",
// ", compress 0x", ", omap 0x", ", meta 0x").
// NOTE(review): the literal "0x" prefixes suggest hexadecimal output, but the
// stream manipulators, the head of the statement, at least one term between
// internally_reserved and data_stored, and the tail (closing text and return)
// are not visible in this excerpt -- confirm against the complete source.
3250 ostream
& operator<<(ostream
& out
, const store_statfs_t
&s
)
3253 << "store_statfs(0x" << s
.available
3254 << "/0x" << s
.internally_reserved
3256 << ", data 0x" << s
.data_stored
3257 << "/0x" << s
.allocated
3258 << ", compress 0x" << s
.data_compressed
3259 << "/0x" << s
.data_compressed_allocated
3260 << "/0x" << s
.data_compressed_original
3261 << ", omap 0x" << s
.omap_allocated
3262 << ", meta 0x" << s
.internal_metadata
// Append two test instances to o: a copy of a as it stands before the
// assignments below, and a copy after them.
// NOTE(review): the declaration of a and some assignments (the inner numbering
// skips lines between internally_reserved and data_compressed) are not visible
// in this excerpt -- confirm against the complete source.
3268 void store_statfs_t::generate_test_instances(list
<store_statfs_t
*>& o
)
3271 o
.push_back(new store_statfs_t(a
));
3274 a
.internally_reserved
= 33;
3277 a
.data_compressed
= 21;
3278 a
.data_compressed_allocated
= 12;
3279 a
.data_compressed_original
= 13;
3280 a
.omap_allocated
= 14;
3281 a
.internal_metadata
= 15;
3282 o
.push_back(new store_statfs_t(a
));
3285 // -- pool_stat_t --
// Dump this pool_stat_t to f: store_stats is dumped inside an object section
// named "store_stats", followed by the integer members log_size,
// ondisk_log_size, up, acting and num_store_stats.
// NOTE(review): whatever is written before "store_stats" and the
// close_section() call are not visible in this excerpt.
3287 void pool_stat_t::dump(Formatter
*f
) const
3290 f
->open_object_section("store_stats");
3291 store_stats
.dump(f
);
3293 f
->dump_int("log_size", log_size
);
3294 f
->dump_int("ondisk_log_size", ondisk_log_size
);
3295 f
->dump_int("up", up
);
3296 f
->dump_int("acting", acting
);
3297 f
->dump_int("num_store_stats", num_store_stats
);
// Encode this pool_stat_t into bl. Two layouts are visible: one taken when
// features lacks CEPH_FEATURE_OSDENC, which writes log_size and
// ondisk_log_size without an ENCODE_START in the visible lines, and one inside
// ENCODE_START(7, 5, bl) that additionally writes store_stats and
// num_store_stats.
// NOTE(review): the remaining statements of both layouts (including how the
// first one ends) are not visible in this excerpt -- confirm against the
// complete source.
3300 void pool_stat_t::encode(ceph::buffer::list
&bl
, uint64_t features
) const
3303 if ((features
& CEPH_FEATURE_OSDENC
) == 0) {
3307 encode(log_size
, bl
);
3308 encode(ondisk_log_size
, bl
);
3312 ENCODE_START(7, 5, bl
);
3314 encode(log_size
, bl
);
3315 encode(ondisk_log_size
, bl
);
3318 encode(store_stats
, bl
);
3319 encode(num_store_stats
, bl
);
// Decode this pool_stat_t from bl using DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl).
// For struct_v >= 4 the visible lines read log_size and ondisk_log_size, with
// further members gated on struct_v >= 6 and >= 7; store_stats and
// num_store_stats are decoded for struct_v >= 7 and otherwise reset / zeroed.
// The later part decodes individual stats.sum counters directly, with num_rd /
// num_rd_kb / num_wr / num_wr_kb gated on struct_v >= 2 and
// num_objects_unfound on struct_v >= 3.
// NOTE(review): the else lines separating these paths and several decode
// statements are not visible in this excerpt -- confirm against the complete
// source.
3323 void pool_stat_t::decode(ceph::buffer::list::const_iterator
&bl
)
3325 DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl
);
3326 if (struct_v
>= 4) {
3328 decode(log_size
, bl
);
3329 decode(ondisk_log_size
, bl
);
3330 if (struct_v
>= 6) {
3337 if (struct_v
>= 7) {
3338 decode(store_stats
, bl
);
3339 decode(num_store_stats
, bl
);
3341 store_stats
.reset();
3342 num_store_stats
= 0;
// Counter-by-counter layout (presumably the branch for struct_v < 4 -- the
// else line is not visible here).
3346 decode(stats
.sum
.num_bytes
, bl
);
3349 decode(stats
.sum
.num_objects
, bl
);
3350 decode(stats
.sum
.num_object_clones
, bl
);
3351 decode(stats
.sum
.num_object_copies
, bl
);
3352 decode(stats
.sum
.num_objects_missing_on_primary
, bl
);
3353 decode(stats
.sum
.num_objects_degraded
, bl
);
3354 decode(log_size
, bl
);
3355 decode(ondisk_log_size
, bl
);
3356 if (struct_v
>= 2) {
3357 decode(stats
.sum
.num_rd
, bl
);
3358 decode(stats
.sum
.num_rd_kb
, bl
);
3359 decode(stats
.sum
.num_wr
, bl
);
3360 decode(stats
.sum
.num_wr_kb
, bl
);
3362 if (struct_v
>= 3) {
3363 decode(stats
.sum
.num_objects_unfound
, bl
);
// Append two test instances to o: a copy of a before the assignments below and
// a copy after them. a.stats and a.store_stats are taken from the last element
// produced by the generate_test_instances() of object_stat_collection_t and
// store_statfs_t respectively.
// NOTE(review): the declaration of a and some short assignments are not
// visible in this excerpt -- confirm against the complete source.
3369 void pool_stat_t::generate_test_instances(list
<pool_stat_t
*>& o
)
3372 o
.push_back(new pool_stat_t(a
));
3374 list
<object_stat_collection_t
*> l
;
3375 object_stat_collection_t::generate_test_instances(l
);
3376 list
<store_statfs_t
*> ll
;
3377 store_statfs_t::generate_test_instances(ll
);
3378 a
.stats
= *l
.back();
3379 a
.store_stats
= *ll
.back();
3381 a
.ondisk_log_size
= 456;
3384 a
.num_store_stats
= 1;
3385 o
.push_back(new pool_stat_t(a
));
3389 // -- pg_history_t --
// Encode this pg_history_t into bl inside an ENCODE_START(10, 4, bl) section,
// writing the members in the order listed below.
// NOTE(review): the end of the section is not visible in this excerpt.
3391 void pg_history_t::encode(ceph::buffer::list
&bl
) const
3393 ENCODE_START(10, 4, bl
);
3394 encode(epoch_created
, bl
);
3395 encode(last_epoch_started
, bl
);
3396 encode(last_epoch_clean
, bl
);
3397 encode(last_epoch_split
, bl
);
3398 encode(same_interval_since
, bl
);
3399 encode(same_up_since
, bl
);
3400 encode(same_primary_since
, bl
);
3401 encode(last_scrub
, bl
);
3402 encode(last_scrub_stamp
, bl
);
3403 encode(last_deep_scrub
, bl
);
3404 encode(last_deep_scrub_stamp
, bl
);
3405 encode(last_clean_scrub_stamp
, bl
);
3406 encode(last_epoch_marked_full
, bl
);
3407 encode(last_interval_started
, bl
);
3408 encode(last_interval_clean
, bl
);
3409 encode(epoch_pool_created
, bl
);
3410 encode(prior_readable_until_ub
, bl
);
// Decode this pg_history_t from bl using DECODE_START_LEGACY_COMPAT_LEN(10, 4, 4, bl).
// Members added over time are read only when struct_v is high enough (2, 5, 6,
// 7, 8, 9, 10). Substitute values are visible for members that may be absent:
// last_epoch_clean from last_epoch_started, last_interval_started /
// last_interval_clean from same_interval_since or the matching last_epoch_*
// value, and epoch_pool_created from epoch_created.
// NOTE(review): the condition guarding the first last_epoch_clean decode and
// the else lines that select the substitute values are not visible in this
// excerpt -- confirm against the complete source.
3414 void pg_history_t::decode(ceph::buffer::list::const_iterator
&bl
)
3416 DECODE_START_LEGACY_COMPAT_LEN(10, 4, 4, bl
);
3417 decode(epoch_created
, bl
);
3418 decode(last_epoch_started
, bl
);
3420 decode(last_epoch_clean
, bl
);
3422 last_epoch_clean
= last_epoch_started
; // careful, it's a lie!
3423 decode(last_epoch_split
, bl
);
3424 decode(same_interval_since
, bl
);
3425 decode(same_up_since
, bl
);
3426 decode(same_primary_since
, bl
);
3427 if (struct_v
>= 2) {
3428 decode(last_scrub
, bl
);
3429 decode(last_scrub_stamp
, bl
);
3431 if (struct_v
>= 5) {
3432 decode(last_deep_scrub
, bl
);
3433 decode(last_deep_scrub_stamp
, bl
);
3435 if (struct_v
>= 6) {
3436 decode(last_clean_scrub_stamp
, bl
);
3438 if (struct_v
>= 7) {
3439 decode(last_epoch_marked_full
, bl
);
3441 if (struct_v
>= 8) {
3442 decode(last_interval_started
, bl
);
3443 decode(last_interval_clean
, bl
);
// Estimates for the two interval members: same_interval_since when the
// corresponding last_epoch_* value is at least that large, otherwise the
// last_epoch_* value itself.
3445 if (last_epoch_started
>= same_interval_since
) {
3446 last_interval_started
= same_interval_since
;
3448 last_interval_started
= last_epoch_started
; // best guess
3450 if (last_epoch_clean
>= same_interval_since
) {
3451 last_interval_clean
= same_interval_since
;
3453 last_interval_clean
= last_epoch_clean
; // best guess
3456 if (struct_v
>= 9) {
3457 decode(epoch_pool_created
, bl
);
3459 epoch_pool_created
= epoch_created
;
3461 if (struct_v
>= 10) {
3462 decode(prior_readable_until_ub
, bl
);
// Writes the history fields to the Formatter `f`: epoch values via dump_int,
// scrub versions and timestamps via dump_stream.
3467 void pg_history_t::dump(Formatter
*f
) const
3469 f
->dump_int("epoch_created", epoch_created
);
3470 f
->dump_int("epoch_pool_created", epoch_pool_created
);
3471 f
->dump_int("last_epoch_started", last_epoch_started
);
3472 f
->dump_int("last_interval_started", last_interval_started
);
3473 f
->dump_int("last_epoch_clean", last_epoch_clean
);
3474 f
->dump_int("last_interval_clean", last_interval_clean
);
3475 f
->dump_int("last_epoch_split", last_epoch_split
);
3476 f
->dump_int("last_epoch_marked_full", last_epoch_marked_full
);
3477 f
->dump_int("same_up_since", same_up_since
);
3478 f
->dump_int("same_interval_since", same_interval_since
);
3479 f
->dump_int("same_primary_since", same_primary_since
);
3480 f
->dump_stream("last_scrub") << last_scrub
;
3481 f
->dump_stream("last_scrub_stamp") << last_scrub_stamp
;
3482 f
->dump_stream("last_deep_scrub") << last_deep_scrub
;
3483 f
->dump_stream("last_deep_scrub_stamp") << last_deep_scrub_stamp
;
3484 f
->dump_stream("last_clean_scrub_stamp") << last_clean_scrub_stamp
;
// prior_readable_until_ub is converted to a double-valued duration count
// (seconds for the default std::chrono::duration<double> period).  The call
// that receives these arguments is on a line missing from this listing.
3486 "prior_readable_until_ub",
3487 std::chrono::duration
<double>(prior_readable_until_ub
).count());
// Test-instance generator: appends a default-constructed pg_history_t and a
// second one whose fields are set to distinct literal values.
3490 void pg_history_t::generate_test_instances(list
<pg_history_t
*>& o
)
3492 o
.push_back(new pg_history_t
);
3493 o
.push_back(new pg_history_t
);
// Everything below populates the second (last) instance.
3494 o
.back()->epoch_created
= 1;
3495 o
.back()->epoch_pool_created
= 1;
3496 o
.back()->last_epoch_started
= 2;
3497 o
.back()->last_interval_started
= 2;
3498 o
.back()->last_epoch_clean
= 3;
3499 o
.back()->last_interval_clean
= 2;
3500 o
.back()->last_epoch_split
= 4;
3501 o
.back()->prior_readable_until_ub
= make_timespan(3.1415);
3502 o
.back()->same_up_since
= 5;
3503 o
.back()->same_interval_since
= 6;
3504 o
.back()->same_primary_since
= 7;
3505 o
.back()->last_scrub
= eversion_t(8, 9);
3506 o
.back()->last_scrub_stamp
= utime_t(10, 11);
3507 o
.back()->last_deep_scrub
= eversion_t(12, 13);
3508 o
.back()->last_deep_scrub_stamp
= utime_t(14, 15);
3509 o
.back()->last_clean_scrub_stamp
= utime_t(16, 17);
3510 o
.back()->last_epoch_marked_full
= 18;
// Serializes this pg_info_t into `bl` (ENCODE_START(32, 26, ...)).  Two slots
// are kept only for wire compatibility: an empty hobject_t where the old
// nibblewise last_backfill used to be, and a constant `true` where
// last_backfill_bitwise used to be.
// NOTE(review): embedded lines 3524-3525 are missing from this listing, so
// additional fields may be encoded between the placeholder and purged_snaps.
3516 void pg_info_t::encode(ceph::buffer::list
&bl
) const
3518 ENCODE_START(32, 26, bl
);
3519 encode(pgid
.pgid
, bl
);
3520 encode(last_update
, bl
);
3521 encode(last_complete
, bl
);
3522 encode(log_tail
, bl
);
3523 encode(hobject_t(), bl
); // old (nibblewise) last_backfill
3526 encode(purged_snaps
, bl
);
3527 encode(last_epoch_started
, bl
);
3528 encode(last_user_version
, bl
);
3529 encode(hit_set
, bl
);
3530 encode(pgid
.shard
, bl
);
3531 encode(last_backfill
, bl
);
3532 encode(true, bl
); // was last_backfill_bitwise
3533 encode(last_interval_started
, bl
);
// Deserializes a pg_info_t from `bl`, mirroring encode().  The two legacy
// slots are decoded into locals (`old_last_backfill`, `last_backfill_bitwise`);
// no use of either local appears in the visible lines.
// NOTE(review): some lines are missing from this listing (e.g. between the
// old_last_backfill and purged_snaps decodes).
3537 void pg_info_t::decode(ceph::buffer::list::const_iterator
&bl
)
3539 DECODE_START(32, bl
);
3540 decode(pgid
.pgid
, bl
);
3541 decode(last_update
, bl
);
3542 decode(last_complete
, bl
);
3543 decode(log_tail
, bl
);
// Consume the placeholder written for the old last_backfill.
3545 hobject_t old_last_backfill
;
3546 decode(old_last_backfill
, bl
);
3550 decode(purged_snaps
, bl
);
3551 decode(last_epoch_started
, bl
);
3552 decode(last_user_version
, bl
);
3553 decode(hit_set
, bl
);
3554 decode(pgid
.shard
, bl
);
3555 decode(last_backfill
, bl
);
3557 bool last_backfill_bitwise
;
3558 decode(last_backfill_bitwise
, bl
);
3559 // note: we may see a false value here since the default value for
3560 // the member was false, so it often didn't get set to true until
3561 // peering progressed.
// last_interval_started is on the wire from struct version 32; the assignment
// from last_epoch_started below is presumably the fallback for older
// encodings (the `else` line is not visible).
3563 if (struct_v
>= 32) {
3564 decode(last_interval_started
, bl
);
3566 last_interval_started
= last_epoch_started
;
// Writes this pg_info_t to the Formatter `f`: the identifying versions, the
// purged_snaps intervals as an array of {start, length} objects, and nested
// "history", "stats" and "hit_set_history" sections.
// NOTE(review): the calls that fill and close the nested sections are on lines
// missing from this listing.
3573 void pg_info_t::dump(Formatter
*f
) const
3575 f
->dump_stream("pgid") << pgid
;
3576 f
->dump_stream("last_update") << last_update
;
3577 f
->dump_stream("last_complete") << last_complete
;
3578 f
->dump_stream("log_tail") << log_tail
;
3579 f
->dump_int("last_user_version", last_user_version
);
3580 f
->dump_stream("last_backfill") << last_backfill
;
3581 f
->open_array_section("purged_snaps");
// One object per purged snap interval.
3582 for (interval_set
<snapid_t
>::const_iterator i
=purged_snaps
.begin();
3583 i
!= purged_snaps
.end();
3585 f
->open_object_section("purged_snap_interval");
3586 f
->dump_stream("start") << i
.get_start();
3587 f
->dump_stream("length") << i
.get_len();
3591 f
->open_object_section("history");
3594 f
->open_object_section("stats");
// Boolean predicates are emitted as ints.
3598 f
->dump_int("empty", is_empty());
3599 f
->dump_int("dne", dne());
3600 f
->dump_int("incomplete", is_incomplete());
3601 f
->dump_int("last_epoch_started", last_epoch_started
);
3603 f
->open_object_section("hit_set_history");
// Test-instance generator: appends a default pg_info_t and a second one whose
// members are filled from literals and from the last instances produced by the
// pg_history_t, pg_stat_t and pg_hit_set_history_t generators.
// NOTE(review): `s` is used for pg_stat_t before the visible declaration of a
// pg_hit_set_history_t list also named `s`; the first declaration and the
// scoping braces are presumably on lines missing from this listing.
3608 void pg_info_t::generate_test_instances(list
<pg_info_t
*>& o
)
3610 o
.push_back(new pg_info_t
);
3611 o
.push_back(new pg_info_t
);
3612 list
<pg_history_t
*> h
;
3613 pg_history_t::generate_test_instances(h
);
3614 o
.back()->history
= *h
.back();
3615 o
.back()->pgid
= spg_t(pg_t(1, 2), shard_id_t::NO_SHARD
);
3616 o
.back()->last_update
= eversion_t(3, 4);
3617 o
.back()->last_complete
= eversion_t(5, 6);
3618 o
.back()->last_user_version
= 2;
3619 o
.back()->log_tail
= eversion_t(7, 8);
3620 o
.back()->last_backfill
= hobject_t(object_t("objname"), "key", 123, 456, -1, "");
3623 pg_stat_t::generate_test_instances(s
);
3624 o
.back()->stats
= *s
.back();
3627 list
<pg_hit_set_history_t
*> s
;
3628 pg_hit_set_history_t::generate_test_instances(s
);
3629 o
.back()->hit_set
= *s
.back();
3633 // -- pg_notify_t --
// Serializes this pg_notify_t into `bl` (ENCODE_START(3, 2, ...)).  Visible
// fields: query_epoch, epoch_sent, past_intervals.
// NOTE(review): embedded lines 3639-3641 are missing from this listing, so
// further fields may be encoded between epoch_sent and past_intervals.
3634 void pg_notify_t::encode(ceph::buffer::list
&bl
) const
3636 ENCODE_START(3, 2, bl
);
3637 encode(query_epoch
, bl
);
3638 encode(epoch_sent
, bl
);
3642 encode(past_intervals
, bl
);
// Deserializes a pg_notify_t from `bl`.  past_intervals is read only when the
// encoded struct version is at least 3.
// NOTE(review): embedded lines 3651-3653 are missing from this listing.
3646 void pg_notify_t::decode(ceph::buffer::list::const_iterator
&bl
)
3648 DECODE_START(3, bl
);
3649 decode(query_epoch
, bl
);
3650 decode(epoch_sent
, bl
);
3654 if (struct_v
>= 3) {
3655 decode(past_intervals
, bl
);
// Writes this pg_notify_t to the Formatter `f`: from/to shard ids as ints, the
// two epochs as unsigned values, an "info" object section and past_intervals.
// NOTE(review): the lines that fill and close the "info" section are missing
// from this listing.
3660 void pg_notify_t::dump(Formatter
*f
) const
3662 f
->dump_int("from", from
);
3663 f
->dump_int("to", to
);
3664 f
->dump_unsigned("query_epoch", query_epoch
);
3665 f
->dump_unsigned("epoch_sent", epoch_sent
);
3667 f
->open_object_section("info");
3671 f
->dump_object("past_intervals", past_intervals
);
// Test-instance generator: appends two pg_notify_t objects built from literal
// shard ids and epochs with default pg_info_t / PastIntervals payloads.  The
// first uses shard_id_t::NO_SHARD for its second shard argument.
3674 void pg_notify_t::generate_test_instances(list
<pg_notify_t
*>& o
)
3676 o
.push_back(new pg_notify_t(shard_id_t(3), shard_id_t::NO_SHARD
, 1, 1,
3677 pg_info_t(), PastIntervals()));
3678 o
.push_back(new pg_notify_t(shard_id_t(0), shard_id_t(0), 3, 10,
3679 pg_info_t(), PastIntervals()));
// Stream-insertion operator for pg_notify_t.  Prints the query and sent
// epochs and the pg info; the "from->to" shard pair is added only when at
// least one of the two is not shard_id_t::NO_SHARD; past_intervals follows.
// The second parameter is `const pg_notify_t &notify`: the listing had the
// `&not` prefix of `&notify` mis-decoded as the character U+00AC, which left
// the parameter unnamed-as-used and the declaration ill-formed.
// NOTE(review): the opening brace and the final return statement are on lines
// missing from this listing.
3682 ostream
&operator<<(ostream
&lhs
, const pg_notify_t
&notify
)
3684 lhs
<< "(query:" << notify
.query_epoch
3685 << " sent:" << notify
.epoch_sent
3686 << " " << notify
.info
;
// Shard ids are printed as unsigned numbers.
3687 if (notify
.from
!= shard_id_t::NO_SHARD
||
3688 notify
.to
!= shard_id_t::NO_SHARD
)
3689 lhs
<< " " << (unsigned)notify
.from
3690 << "->" << (unsigned)notify
.to
;
3691 lhs
<< " " << notify
.past_intervals
;
3695 // -- pg_interval_t --
// Serializes this pg_interval_t into `bl` (ENCODE_START(4, 2, ...)).  Visible
// fields: maybe_went_rw, primary, up_primary.
// NOTE(review): embedded lines 3700-3703 are missing from this listing, so the
// fields encoded before maybe_went_rw are not visible here.
3697 void PastIntervals::pg_interval_t::encode(ceph::buffer::list
& bl
) const
3699 ENCODE_START(4, 2, bl
);
3704 encode(maybe_went_rw
, bl
);
3705 encode(primary
, bl
);
3706 encode(up_primary
, bl
);
// Deserializes a pg_interval_t from `bl`.  `primary` is on the wire from
// struct version 3 and `up_primary` from struct version 4.
// NOTE(review): the guards around `primary = acting[0]` are not visible in this
// listing; it is presumably the pre-v3 fallback and must only run when
// `acting` is non-empty -- confirm against the full source.
3710 void PastIntervals::pg_interval_t::decode(ceph::buffer::list::const_iterator
& bl
)
3712 DECODE_START_LEGACY_COMPAT_LEN(4, 2, 2, bl
);
3717 decode(maybe_went_rw
, bl
);
3718 if (struct_v
>= 3) {
3719 decode(primary
, bl
);
3722 primary
= acting
[0];
3724 if (struct_v
>= 4) {
3725 decode(up_primary
, bl
);
// Writes this interval to the Formatter `f`: first/last as unsigned values,
// maybe_went_rw as 1 or 0, the `up` and `acting` vectors as arrays of "osd"
// ints, then primary and up_primary.
3733 void PastIntervals::pg_interval_t::dump(Formatter
*f
) const
3735 f
->dump_unsigned("first", first
);
3736 f
->dump_unsigned("last", last
);
3737 f
->dump_int("maybe_went_rw", maybe_went_rw
? 1 : 0);
3738 f
->open_array_section("up");
3739 for (auto p
= up
.cbegin(); p
!= up
.cend(); ++p
)
3740 f
->dump_int("osd", *p
);
3742 f
->open_array_section("acting");
3743 for (auto p
= acting
.cbegin(); p
!= acting
.cend(); ++p
)
3744 f
->dump_int("osd", *p
);
3746 f
->dump_int("primary", primary
);
3747 f
->dump_int("up_primary", up_primary
);
// Test-instance generator: appends a default pg_interval_t and a second one
// with one `up` entry, two `acting` entries, first = 4 and maybe_went_rw set.
// NOTE(review): embedded line 3758 is missing from this listing.
3750 void PastIntervals::pg_interval_t::generate_test_instances(list
<pg_interval_t
*>& o
)
3752 o
.push_back(new pg_interval_t
);
3753 o
.push_back(new pg_interval_t
);
3754 o
.back()->up
.push_back(1);
3755 o
.back()->acting
.push_back(2);
3756 o
.back()->acting
.push_back(3);
3757 o
.back()->first
= 4;
3759 o
.back()->maybe_went_rw
= true;
3762 WRITE_CLASS_ENCODER(PastIntervals::pg_interval_t
)
3768 * PastIntervals only needs to be able to answer two questions:
3769 * 1) Where should the primary look for unfound objects?
3770 * 2) List a set of subsets of the OSDs such that contacting at least
3771 * one from each subset guarantees we speak to at least one witness
3772 * of any completed write.
3774 * Crucially, 2) does not require keeping *all* past intervals. Certainly,
3775 * we don't need to keep any where maybe_went_rw would be false. We also
3776 * needn't keep two intervals where the acting set in one is a subset
3777 * of the other (only need to keep the smaller of the two sets). In order
3778 * to accurately trim the set of intervals as last_epoch_started changes
3779 * without rebuilding the set from scratch, we'll retain the larger set
3780 * if it is in an older interval.
// Compact record of one past interval: the acting set of pg shards together
// with the `first` and `last` values printed by dump().
// NOTE(review): the declarations of `first` and `last`, and the bodies of
// encode()/decode(), are on lines missing from this listing.
3782 struct compact_interval_t
{
3785 set
<pg_shard_t
> acting
;
// Walks this interval's acting set and tests each member against
// other.acting; the statements executed on a miss and at the end are not
// visible here.
3786 bool supersedes(const compact_interval_t
&other
) {
3787 for (auto &&i
: acting
) {
3788 if (!other
.acting
.count(i
))
// Dumps first, last and acting as streamed values inside one object section.
3793 void dump(Formatter
*f
) const {
3794 f
->open_object_section("compact_interval_t");
3795 f
->dump_stream("first") << first
;
3796 f
->dump_stream("last") << last
;
3797 f
->dump_stream("acting") << acting
;
3800 void encode(ceph::buffer::list
&bl
) const {
3801 ENCODE_START(1, 1, bl
);
3807 void decode(ceph::buffer::list::const_iterator
&bl
) {
3808 DECODE_START(1, bl
);
3814 static void generate_test_instances(list
<compact_interval_t
*> & o
) {
3815 /* Not going to be used, we'll generate pi_compact_rep directly */
// Stream-insertion operator for compact_interval_t; prints
// "([first,last] acting <set>)".
3818 ostream
&operator<<(ostream
&o
, const compact_interval_t
&rhs
)
3820 return o
<< "([" << rhs
.first
<< "," << rhs
.last
3821 << "] acting " << rhs
.acting
<< ")";
3823 WRITE_CLASS_ENCODER(compact_interval_t
)
// Concrete PastIntervals::interval_rep that keeps a compacted list of past
// intervals: the covered epoch range (`first`..`last`, `last` inclusive), the
// union of every shard seen in any acting set, and the retained
// compact_interval_t entries.
// NOTE(review): many lines of this class (including the declaration of
// `first`, access specifiers and several closing statements) are missing from
// this listing; comments below describe only what is visible.
3825 class pi_compact_rep
: public PastIntervals::interval_rep
{
3827 epoch_t last
= 0; // inclusive
3828 set
<pg_shard_t
> all_participants
;
3829 list
<compact_interval_t
> intervals
;
// Constructor tail: feeds every supplied pg_interval_t through add_interval().
3832 std::list
<PastIntervals::pg_interval_t
> &&intervals
) {
3833 for (auto &&i
: intervals
)
3834 add_interval(ec_pool
, i
);
3837 pi_compact_rep() = default;
3838 pi_compact_rep(const pi_compact_rep
&) = default;
3839 pi_compact_rep(pi_compact_rep
&&) = default;
3840 pi_compact_rep
&operator=(const pi_compact_rep
&) = default;
3841 pi_compact_rep
&operator=(pi_compact_rep
&&) = default;
// Number of retained compact intervals.
3843 size_t size() const override
{ return intervals
.size(); }
// Empty when the range is inverted or both bounds are still zero.
3844 bool empty() const override
{
3845 return first
> last
|| (first
== 0 && last
== 0);
// Resets to a default-constructed state.
3847 void clear() override
{
3848 *this = pi_compact_rep();
// Returns (first, last + 1): `last` is inclusive, so the second element is one
// past it.
3850 pair
<epoch_t
, epoch_t
> get_bounds() const override
{
3851 return make_pair(first
, last
+ 1);
3853 void adjust_start_backwards(epoch_t last_epoch_clean
) override
{
3854 first
= last_epoch_clean
;
// Returns a copy of all_participants; `ec_pool` is not used in the visible
// lines.
3857 set
<pg_shard_t
> get_all_participants(
3858 bool ec_pool
) const override
{
3859 return all_participants
;
// add_interval: extends the covered range to interval.last (asserting it moves
// forward), records the interval's acting shards in all_participants, and --
// for intervals that may have gone rw -- appends a compact entry and erases
// older entries that the new one supersedes.
// NOTE(review): the condition guarding `first = interval.first`, the statement
// taken when maybe_went_rw is false, and the adjustment of `plast` after
// intervals.end() are on lines missing from this listing.  `plast` is
// dereferenced below, so it is presumably stepped back to the new last
// element -- confirm.
3862 bool ec_pool
, const PastIntervals::pg_interval_t
&interval
) override
{
3864 first
= interval
.first
;
3865 ceph_assert(interval
.last
> last
);
3866 last
= interval
.last
;
3867 set
<pg_shard_t
> acting
;
3868 for (unsigned i
= 0; i
< interval
.acting
.size(); ++i
) {
3869 if (interval
.acting
[i
] == CRUSH_ITEM_NONE
)
// The shard id is the acting-vector index for EC pools, NO_SHARD otherwise.
3874 ec_pool
? shard_id_t(i
) : shard_id_t::NO_SHARD
));
3876 all_participants
.insert(acting
.begin(), acting
.end());
3877 if (!interval
.maybe_went_rw
)
3879 intervals
.push_back(
3880 compact_interval_t
{interval
.first
, interval
.last
, acting
});
3881 auto plast
= intervals
.end();
3883 for (auto cur
= intervals
.begin(); cur
!= plast
; ) {
3884 if (plast
->supersedes(*cur
)) {
3885 intervals
.erase(cur
++);
// Polymorphic copy.
3891 unique_ptr
<PastIntervals::interval_rep
> clone() const override
{
3892 return unique_ptr
<PastIntervals::interval_rep
>(new pi_compact_rep(*this));
3894 ostream
&print(ostream
&out
) const override
{
3895 return out
<< "([" << first
<< "," << last
3896 << "] all_participants=" << all_participants
3897 << " intervals=" << intervals
<< ")";
// encode/decode: versioned envelope (1, 1); all_participants and intervals are
// the visible payload (embedded lines 3901-3902 / 3909-3910 are missing).
3899 void encode(ceph::buffer::list
&bl
) const override
{
3900 ENCODE_START(1, 1, bl
);
3903 encode(all_participants
, bl
);
3904 encode(intervals
, bl
);
3907 void decode(ceph::buffer::list::const_iterator
&bl
) override
{
3908 DECODE_START(1, bl
);
3911 decode(all_participants
, bl
);
3912 decode(intervals
, bl
);
3915 void dump(Formatter
*f
) const override
{
3916 f
->open_object_section("PastIntervals::compact_rep");
3917 f
->dump_stream("first") << first
;
3918 f
->dump_stream("last") << last
;
3919 f
->open_array_section("all_participants");
3920 for (auto& i
: all_participants
) {
3921 f
->dump_object("pg_shard", i
);
3924 f
->open_array_section("intervals");
3925 for (auto &&i
: intervals
) {
// Test-instance generator: three groups of four pg_interval_t literals.  The
// statements that wrap each group are on lines missing from this listing.
3931 static void generate_test_instances(list
<pi_compact_rep
*> &o
) {
3932 using ival
= PastIntervals::pg_interval_t
;
3933 using ivallst
= std::list
<ival
>;
3937 { ival
{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
3938 , ival
{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1}
3939 , ival
{{ 2}, { 2}, 31, 35, false, 2, 2}
3940 , ival
{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
3945 { ival
{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
3946 , ival
{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1}
3947 , ival
{{ 2}, { 2}, 31, 35, false, 2, 2}
3948 , ival
{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
3953 { ival
{{2, 1, 0}, {2, 1, 0}, 10, 20, true, 1, 1}
3954 , ival
{{ 0, 2}, { 0, 2}, 21, 30, true, 0, 0}
3955 , ival
{{ 0, 2}, {2, 0}, 31, 35, true, 2, 2}
3956 , ival
{{ 0, 2}, { 0, 2}, 36, 50, true, 0, 0}
// Visits the retained intervals newest-first, calling f(first, acting) for
// each; the stop condition is on lines missing from this listing.
3959 void iterate_mayberw_back_to(
3961 std::function
<void(epoch_t
, const set
<pg_shard_t
> &)> &&f
) const override
{
3962 for (auto i
= intervals
.rbegin(); i
!= intervals
.rend(); ++i
) {
3965 f(i
->first
, i
->acting
);
3968 virtual ~pi_compact_rep() override
{}
3970 WRITE_CLASS_ENCODER(pi_compact_rep
)
// Default constructor: installs a fresh pi_compact_rep as the representation.
3972 PastIntervals::PastIntervals()
3974 past_intervals
.reset(new pi_compact_rep
);
// Copy constructor: deep-copies rhs's representation through clone() when rhs
// has one.  The alternative arm of the conditional is on a line missing from
// this listing.
3977 PastIntervals::PastIntervals(const PastIntervals
&rhs
)
3978 : past_intervals(rhs
.past_intervals
?
3979 rhs
.past_intervals
->clone() :
// Copy assignment: first builds a temporary copy of rhs.  The statements that
// transfer `other` into *this and return are on lines missing from this
// listing (presumably a swap -- confirm).
3982 PastIntervals
&PastIntervals::operator=(const PastIntervals
&rhs
)
3984 PastIntervals
other(rhs
);
// Stream-insertion operator for PastIntervals: delegates to the
// representation's print() when one is set, otherwise prints "(empty)".
3989 ostream
& operator<<(ostream
& out
, const PastIntervals
&i
)
3991 if (i
.past_intervals
) {
3992 return i
.past_intervals
->print(out
);
3994 return out
<< "(empty)";
// Stream-insertion operator for PastIntervals::PriorSet: prints ec_pool,
// probe, down, blocked_by and pg_down as a labelled "PriorSet(...)" list.  The
// closing part of the expression is on a line missing from this listing.
3998 ostream
& operator<<(ostream
& out
, const PastIntervals::PriorSet
&i
)
4000 return out
<< "PriorSet("
4001 << "ec_pool: " << i
.ec_pool
4002 << ", probe: " << i
.probe
4003 << ", down: " << i
.down
4004 << ", blocked_by: " << i
.blocked_by
4005 << ", pg_down: " << i
.pg_down
// Deserializes a PastIntervals from `bl`.  In the visible lines one path
// aborts with "pi_simple_rep support removed post-luminous" and another
// installs a new pi_compact_rep and lets it decode itself.
// NOTE(review): the type tag that selects between these paths is read on lines
// missing from this listing.
4009 void PastIntervals::decode(ceph::buffer::list::const_iterator
&bl
)
4011 DECODE_START(1, bl
);
4018 ceph_abort_msg("pi_simple_rep support removed post-luminous");
4021 past_intervals
.reset(new pi_compact_rep
);
4022 past_intervals
->decode(bl
);
// Test-instance generator: wraps each pi_compact_rep produced by
// pi_compact_rep::generate_test_instances() in a new PastIntervals.  Per the
// comment below, each PastIntervals takes ownership of the rep handed to it.
4028 void PastIntervals::generate_test_instances(list
<PastIntervals
*> &o
)
4031 list
<pi_compact_rep
*> compact
;
4032 pi_compact_rep::generate_test_instances(compact
);
4033 for (auto &&i
: compact
) {
4034 // takes ownership of contents
4035 o
.push_back(new PastIntervals(i
));
// Decides whether the move from the "old" to the "new" values starts a new
// interval.  Returns true as soon as any compared pair differs (acting
// primary, acting set, up primary, min_size, size, sort_bitwise,
// recovery_deletes, the four crush peering values) or when `pgid` is a split,
// a merge source or a merge target between the old/new pg_num or
// pg_num_pending values.
// NOTE(review): some parameters (e.g. the up primaries, sizes and `pgid`) and
// some terms of the expression are on lines missing from this listing.
4041 bool PastIntervals::is_new_interval(
4042 int old_acting_primary
,
4043 int new_acting_primary
,
4044 const vector
<int> &old_acting
,
4045 const vector
<int> &new_acting
,
4048 const vector
<int> &old_up
,
4049 const vector
<int> &new_up
,
4054 unsigned old_pg_num
,
4055 unsigned new_pg_num
,
4056 unsigned old_pg_num_pending
,
4057 unsigned new_pg_num_pending
,
4058 bool old_sort_bitwise
,
4059 bool new_sort_bitwise
,
4060 bool old_recovery_deletes
,
4061 bool new_recovery_deletes
,
4062 uint32_t old_crush_count
,
4063 uint32_t new_crush_count
,
4064 uint32_t old_crush_target
,
4065 uint32_t new_crush_target
,
4066 uint32_t old_crush_barrier
,
4067 uint32_t new_crush_barrier
,
4068 int32_t old_crush_member
,
4069 int32_t new_crush_member
,
4071 return old_acting_primary
!= new_acting_primary
||
4072 new_acting
!= old_acting
||
4073 old_up_primary
!= new_up_primary
||
4075 old_min_size
!= new_min_size
||
4076 old_size
!= new_size
||
4077 pgid
.is_split(old_pg_num
, new_pg_num
, 0) ||
4078 // (is or was) pre-merge source
// Checked in both directions (old->new and new->old pending values).
4079 pgid
.is_merge_source(old_pg_num_pending
, new_pg_num_pending
, 0) ||
4080 pgid
.is_merge_source(new_pg_num_pending
, old_pg_num_pending
, 0) ||
4082 pgid
.is_merge_source(old_pg_num
, new_pg_num
, 0) ||
4083 // (is or was) pre-merge target
4084 pgid
.is_merge_target(old_pg_num_pending
, new_pg_num_pending
) ||
4085 pgid
.is_merge_target(new_pg_num_pending
, old_pg_num_pending
) ||
4087 pgid
.is_merge_target(old_pg_num
, new_pg_num
) ||
4088 old_sort_bitwise
!= new_sort_bitwise
||
4089 old_recovery_deletes
!= new_recovery_deletes
||
4090 old_crush_count
!= new_crush_count
||
4091 old_crush_target
!= new_crush_target
||
4092 old_crush_barrier
!= new_crush_barrier
||
4093 old_crush_member
!= new_crush_member
;
// OSDMap-based overload: looks up the pg's pool in `lastmap` and `osdmap` and
// forwards the pool and map-flag values of both to the value-based
// is_new_interval() overload.  Two early returns are visible: false (per its
// comment, no more interval changes once the pool is deleted) and true (pool
// deleted in this epoch).
// NOTE(review): the null checks guarding those returns and part of the
// forwarded argument list are on lines missing from this listing.
4096 bool PastIntervals::is_new_interval(
4097 int old_acting_primary
,
4098 int new_acting_primary
,
4099 const vector
<int> &old_acting
,
4100 const vector
<int> &new_acting
,
4103 const vector
<int> &old_up
,
4104 const vector
<int> &new_up
,
4105 const OSDMap
*osdmap
,
4106 const OSDMap
*lastmap
,
4109 const pg_pool_t
*plast
= lastmap
->get_pg_pool(pgid
.pool());
4111 return false; // after pool is deleted there are no more interval changes
4113 const pg_pool_t
*pi
= osdmap
->get_pg_pool(pgid
.pool());
4115 return true; // pool was deleted this epoch -> (final!) interval change
4118 is_new_interval(old_acting_primary
,
4130 plast
->get_pg_num(),
4132 plast
->get_pg_num_pending(),
4133 pi
->get_pg_num_pending(),
// "old" values come from lastmap/plast, "new" values from osdmap/pi.
4134 lastmap
->test_flag(CEPH_OSDMAP_SORTBITWISE
),
4135 osdmap
->test_flag(CEPH_OSDMAP_SORTBITWISE
),
4136 lastmap
->test_flag(CEPH_OSDMAP_RECOVERY_DELETES
),
4137 osdmap
->test_flag(CEPH_OSDMAP_RECOVERY_DELETES
),
4138 plast
->peering_crush_bucket_count
, pi
->peering_crush_bucket_count
,
4139 plast
->peering_crush_bucket_target
, pi
->peering_crush_bucket_target
,
4140 plast
->peering_crush_bucket_barrier
, pi
->peering_crush_bucket_barrier
,
4141 plast
->peering_crush_mandatory_member
, pi
->peering_crush_mandatory_member
,
// Checks whether the change from `lastmap` to `osdmap` starts a new interval
// and, if so, records the interval that just ended in `past_intervals`.  The
// recorded interval covers same_interval_since .. osdmap->get_epoch() - 1 and
// carries the old acting set and primaries; its maybe_went_rw flag is decided
// from the old pool's min_size, the stretch-pool peer check,
// could_have_gone_active, and the old primary's up_from/up_thru (or
// last_epoch_clean falling inside the interval).
// NOTE(review): several parameters (e.g. `pgid`, `out`, the up primaries), the
// declaration of `i`, the `if (out)` guards and the return statements are on
// lines missing from this listing.
4145 bool PastIntervals::check_new_interval(
4146 int old_acting_primary
,
4147 int new_acting_primary
,
4148 const vector
<int> &old_acting
,
4149 const vector
<int> &new_acting
,
4152 const vector
<int> &old_up
,
4153 const vector
<int> &new_up
,
4154 epoch_t same_interval_since
,
4155 epoch_t last_epoch_clean
,
4156 const OSDMap
*osdmap
,
4157 const OSDMap
*lastmap
,
4159 const IsPGRecoverablePredicate
&could_have_gone_active
,
4160 PastIntervals
*past_intervals
,
4164 * We have to be careful to gracefully deal with situations like
4165 * so. Say we have a power outage or something that takes out both
4166 * OSDs, but the monitor doesn't mark them down in the same epoch.
4167 * The history may look like
4171 * 3: let's say B dies for good, too (say, from the power spike)
4174 * which makes it look like B may have applied updates to the PG
4175 * that we need in order to proceed. This sucks...
4177 * To minimize the risk of this happening, we CANNOT go active if
4178 * _any_ OSDs in the prior set are down until we send an MOSDAlive
4179 * to the monitor such that the OSDMap sets osd_up_thru to an epoch.
4180 * Then, we have something like
4187 * -> we can ignore B, bc it couldn't have gone active (up_thru still 0).
4197 * -> we must wait for B, bc it was alive through 2, and could have
4198 * written to the pg.
4200 * If B is really dead, then an administrator will need to manually
4201 * intervene by marking the OSD as "lost."
4204 // remember past interval
4205 // NOTE: a change in the up set primary triggers an interval
4206 // change, even though the interval members in the pg_interval_t
4208 ceph_assert(past_intervals
);
4209 ceph_assert(past_intervals
->past_intervals
);
4210 if (is_new_interval(
// The ended interval runs up to the epoch just before the new map.
4223 i
.first
= same_interval_since
;
4224 i
.last
= osdmap
->get_epoch() - 1;
4225 ceph_assert(i
.first
<= i
.last
);
4226 i
.acting
= old_acting
;
4228 i
.primary
= old_acting_primary
;
4229 i
.up_primary
= old_up_primary
;
// Loop over the acting entries that are not CRUSH_ITEM_NONE; the statement
// that updates num_acting is on a line missing from this listing.
4231 unsigned num_acting
= 0;
4232 for (auto p
= i
.acting
.cbegin(); p
!= i
.acting
.cend(); ++p
)
4233 if (*p
!= CRUSH_ITEM_NONE
)
4236 ceph_assert(lastmap
->get_pools().count(pgid
.pool()));
4237 const pg_pool_t
& old_pg_pool
= lastmap
->get_pools().find(pgid
.pool())->second
;
4238 set
<pg_shard_t
> old_acting_shards
;
4239 old_pg_pool
.convert_to_pg_shards(old_acting
, &old_acting_shards
);
// Preconditions for the interval to possibly have gone read-write.
4243 num_acting
>= old_pg_pool
.min_size
&&
4244 (!old_pg_pool
.is_stretch_pool() ||
4245 old_pg_pool
.stretch_set_can_peer(old_acting
, *lastmap
, out
)) &&
4246 could_have_gone_active(old_acting_shards
)) {
4248 *out
<< __func__
<< " " << i
4249 << " up_thru " << lastmap
->get_up_thru(i
.primary
)
4250 << " up_from " << lastmap
->get_up_from(i
.primary
)
4251 << " last_epoch_clean " << last_epoch_clean
;
// The primary's up_from..up_thru window must include the interval's start.
4252 if (lastmap
->get_up_thru(i
.primary
) >= i
.first
&&
4253 lastmap
->get_up_from(i
.primary
) <= i
.first
) {
4254 i
.maybe_went_rw
= true;
4257 << " : primary up " << lastmap
->get_up_from(i
.primary
)
4258 << "-" << lastmap
->get_up_thru(i
.primary
)
4259 << " includes interval"
4261 } else if (last_epoch_clean
>= i
.first
&&
4262 last_epoch_clean
<= i
.last
) {
4263 // If the last_epoch_clean is included in this interval, then
4264 // the pg must have been rw (for recovery to have completed).
4265 // This is important because we won't know the _real_
4266 // first_epoch because we stop at last_epoch_clean, and we
4267 // don't want the oldest interval to randomly have
4268 // maybe_went_rw false depending on the relative up_thru vs
4269 // last_epoch_clean timing.
4270 i
.maybe_went_rw
= true;
4273 << " : includes last_epoch_clean " << last_epoch_clean
4274 << " and presumed to have been rw"
4277 i
.maybe_went_rw
= false;
4280 << " : primary up " << lastmap
->get_up_from(i
.primary
)
4281 << "-" << lastmap
->get_up_thru(i
.primary
)
4282 << " does not include interval"
4286 i
.maybe_went_rw
= false;
4288 *out
<< __func__
<< " " << i
<< " : acting set is too small" << std::endl
;
// Hand the finished interval to the representation.
4290 past_intervals
->past_intervals
->add_interval(old_pg_pool
.is_erasure(), i
);
4297 // true if the given map affects the prior set
// Scans the `probe` set and then the `down` set against `osdmap`, logging at
// debug level 10 each condition that changes the prior set: a probed osd that
// went down, a blocking osd that no longer exists or whose lost_at changed,
// and a down osd that is now up, gone, or (re)marked lost.
// NOTE(review): the declarations of `o`, the `return` statements after each
// log line and the final return are on lines missing from this listing.
4298 bool PastIntervals::PriorSet::affected_by_map(
4299 const OSDMap
&osdmap
,
4300 const DoutPrefixProvider
*dpp
) const
4302 for (auto p
= probe
.begin(); p
!= probe
.end(); ++p
) {
4305 // did someone in the prior set go down?
4306 if (osdmap
.is_down(o
) && down
.count(o
) == 0) {
4307 ldpp_dout(dpp
, 10) << "affected_by_map osd." << o
<< " now down" << dendl
;
4311 // did a down osd in cur get (re)marked as lost?
// blocked_by maps an osd to the lost_at value it is compared with below.
4312 auto r
= blocked_by
.find(o
);
4313 if (r
!= blocked_by
.end()) {
4314 if (!osdmap
.exists(o
)) {
4315 ldpp_dout(dpp
, 10) << "affected_by_map osd." << o
<< " no longer exists" << dendl
;
4318 if (osdmap
.get_info(o
).lost_at
!= r
->second
) {
4319 ldpp_dout(dpp
, 10) << "affected_by_map osd." << o
<< " (re)marked as lost" << dendl
;
4325 // did someone in the prior down set go up?
4326 for (auto p
= down
.cbegin(); p
!= down
.cend(); ++p
) {
4329 if (osdmap
.is_up(o
)) {
4330 ldpp_dout(dpp
, 10) << "affected_by_map osd." << o
<< " now up" << dendl
;
4334 // did someone in the prior set get lost or destroyed?
4335 if (!osdmap
.exists(o
)) {
4336 ldpp_dout(dpp
, 10) << "affected_by_map osd." << o
<< " no longer exists" << dendl
;
4339 // did a down osd in down get (re)marked as lost?
4340 auto r
= blocked_by
.find(o
);
4341 if (r
!= blocked_by
.end()) {
4342 if (osdmap
.get_info(o
).lost_at
!= r
->second
) {
4343 ldpp_dout(dpp
, 10) << "affected_by_map osd." << o
<< " (re)marked as lost" << dendl
;
// Stream-insertion operator for pg_interval_t: prints
// "interval(first-last up <up>(<up_primary>) acting <acting>(<primary>)" and
// appends " maybe_went_rw" when that flag is set.  The closing text and the
// return statement are on lines missing from this listing.
4352 ostream
& operator<<(ostream
& out
, const PastIntervals::pg_interval_t
& i
)
4354 out
<< "interval(" << i
.first
<< "-" << i
.last
4355 << " up " << i
.up
<< "(" << i
.up_primary
<< ")"
4356 << " acting " << i
.acting
<< "(" << i
.primary
<< ")";
4357 if (i
.maybe_went_rw
)
4358 out
<< " maybe_went_rw";
// Serializes this pg_query_t into `bl` (ENCODE_START(3, 3, ...)).  Only
// epoch_sent is visible; the other fields are encoded on lines missing from
// this listing.  `features` is not used in the visible lines.
4367 void pg_query_t::encode(ceph::buffer::list
&bl
, uint64_t features
) const {
4368 ENCODE_START(3, 3, bl
);
4372 encode(epoch_sent
, bl
);
// Deserializes a pg_query_t from `bl`.  Only the epoch_sent decode is visible;
// the other fields are decoded on lines missing from this listing.
4378 void pg_query_t::decode(ceph::buffer::list::const_iterator
&bl
) {
4379 DECODE_START(3, bl
);
4383 decode(epoch_sent
, bl
);
// Writes this query to the Formatter `f`: from/to as ints, the type name as a
// string, `since` and `epoch_sent` as streamed values, then a "history" object
// section (its contents and close are on lines missing from this listing).
4389 void pg_query_t::dump(Formatter
*f
) const
4391 f
->dump_int("from", from
);
4392 f
->dump_int("to", to
);
4393 f
->dump_string("type", get_type_name());
4394 f
->dump_stream("since") << since
;
4395 f
->dump_stream("epoch_sent") << epoch_sent
;
4396 f
->open_object_section("history");
// Test-instance generator: appends a default pg_query_t plus one query per
// type (INFO, MISSING, LOG, FULLLOG), each using the last generated
// pg_history_t.  The trailing arguments of the FULLLOG instance are on lines
// missing from this listing.
// NOTE(review): nothing visible here releases the pointers collected in `h` --
// confirm ownership.
4400 void pg_query_t::generate_test_instances(list
<pg_query_t
*>& o
)
4402 o
.push_back(new pg_query_t());
4403 list
<pg_history_t
*> h
;
4404 pg_history_t::generate_test_instances(h
);
4405 o
.push_back(new pg_query_t(pg_query_t::INFO
, shard_id_t(1), shard_id_t(2), *h
.back(), 4));
4406 o
.push_back(new pg_query_t(pg_query_t::MISSING
, shard_id_t(2), shard_id_t(3), *h
.back(), 4));
4407 o
.push_back(new pg_query_t(pg_query_t::LOG
, shard_id_t(0), shard_id_t(0),
4408 eversion_t(4, 5), *h
.back(), 4));
4409 o
.push_back(new pg_query_t(pg_query_t::FULLLOG
,
4410 shard_id_t::NO_SHARD
, shard_id_t::NO_SHARD
,
// Serializes this pg_lease_t into `bl` (ENCODE_START(1, 1, ...)):
// readable_until, readable_until_ub, interval, in that order.
4416 void pg_lease_t::encode(bufferlist
& bl
) const
4418 ENCODE_START(1, 1, bl
);
4419 encode(readable_until
, bl
);
4420 encode(readable_until_ub
, bl
);
4421 encode(interval
, bl
);
// Deserializes a pg_lease_t from iterator `p`, reading the fields in the order
// encode() writes them.  The DECODE_START line is not visible in this listing.
4425 void pg_lease_t::decode(bufferlist::const_iterator
& p
)
4428 decode(readable_until
, p
);
4429 decode(readable_until_ub
, p
);
4430 decode(interval
, p
);
// Writes the three lease fields to the Formatter `f` as streamed values.
4434 void pg_lease_t::dump(Formatter
*f
) const
4436 f
->dump_stream("readable_until") << readable_until
;
4437 f
->dump_stream("readable_until_ub") << readable_until_ub
;
4438 f
->dump_stream("interval") << interval
;
// Test-instance generator: appends a default pg_lease_t and a second one with
// its three fields set from make_timespan() literals.
4441 void pg_lease_t::generate_test_instances(std::list
<pg_lease_t
*>& o
)
4443 o
.push_back(new pg_lease_t());
4444 o
.push_back(new pg_lease_t());
4445 o
.back()->readable_until
= make_timespan(1.5);
4446 o
.back()->readable_until_ub
= make_timespan(3.4);
4447 o
.back()->interval
= make_timespan(1.0);
4450 // -- pg_lease_ack_t --
// Serializes this pg_lease_ack_t into `bl` (ENCODE_START(1, 1, ...)); the only
// visible field is readable_until_ub.
4452 void pg_lease_ack_t::encode(bufferlist
& bl
) const
4454 ENCODE_START(1, 1, bl
);
4455 encode(readable_until_ub
, bl
);
// Deserializes a pg_lease_ack_t from iterator `p`, reading readable_until_ub.
// The DECODE_START line is not visible in this listing.
4459 void pg_lease_ack_t::decode(bufferlist::const_iterator
& p
)
4462 decode(readable_until_ub
, p
);
// Writes readable_until_ub to the Formatter `f` as a streamed value.
4466 void pg_lease_ack_t::dump(Formatter
*f
) const
4468 f
->dump_stream("readable_until_ub") << readable_until_ub
;
// Test-instance generator: appends a default pg_lease_ack_t and a second one
// with readable_until_ub set to make_timespan(3.4).
4471 void pg_lease_ack_t::generate_test_instances(std::list
<pg_lease_ack_t
*>& o
)
4473 o
.push_back(new pg_lease_ack_t());
4474 o
.push_back(new pg_lease_ack_t());
4475 o
.back()->readable_until_ub
= make_timespan(3.4);
4479 // -- ObjectModDesc --
// Replays the modification records stored in `bl` to `visitor`: each record is
// decoded from the iterator `bp` and forwarded to the matching Visitor
// callback (append, setattrs, rmobject, update_snaps, try_rmobject,
// rollback_extents).  An unrecognised record code aborts with
// "Invalid rollback code"; the "Invalid encoding" abort is presumably the
// handler for a decode failure (its enclosing construct is not visible).
// NOTE(review): the loop, the switch header, most `case` labels and several
// decode calls are on lines missing from this listing.
4480 void ObjectModDesc::visit(Visitor
*visitor
) const
4482 auto bp
= bl
.cbegin();
4485 DECODE_START(max_required_version
, bp
);
4492 visitor
->append(size
);
4496 map
<string
, std::optional
<ceph::buffer::list
> > attrs
;
4498 visitor
->setattrs(attrs
);
4502 version_t old_version
;
4503 decode(old_version
, bp
);
4504 visitor
->rmobject(old_version
);
4511 case UPDATE_SNAPS
: {
4512 set
<snapid_t
> snaps
;
4514 visitor
->update_snaps(snaps
);
4518 version_t old_version
;
4519 decode(old_version
, bp
);
4520 visitor
->try_rmobject(old_version
);
4523 case ROLLBACK_EXTENTS
: {
4524 vector
<pair
<uint64_t, uint64_t> > extents
;
4527 decode(extents
, bp
);
4528 visitor
->rollback_extents(gen
,extents
);
4532 ceph_abort_msg("Invalid rollback code");
4537 ceph_abort_msg("Invalid encoding");
// ObjectModDesc::Visitor that renders each visited record as an "op" object
// section on a Formatter, with a "code" string naming the record type and the
// record's payload fields.
// NOTE(review): the declaration of member `f` and the close_section() calls
// are on lines missing from this listing.
4541 struct DumpVisitor
: public ObjectModDesc::Visitor
{
4543 explicit DumpVisitor(Formatter
*f
) : f(f
) {}
4544 void append(uint64_t old_size
) override
{
4545 f
->open_object_section("op");
4546 f
->dump_string("code", "APPEND");
4547 f
->dump_unsigned("old_size", old_size
);
// Only attribute names are dumped in the visible lines, not their values.
4550 void setattrs(map
<string
, std::optional
<ceph::buffer::list
> > &attrs
) override
{
4551 f
->open_object_section("op");
4552 f
->dump_string("code", "SETATTRS");
4553 f
->open_array_section("attrs");
4554 for (auto i
= attrs
.begin(); i
!= attrs
.end(); ++i
) {
4555 f
->dump_string("attr_name", i
->first
);
4560 void rmobject(version_t old_version
) override
{
4561 f
->open_object_section("op");
4562 f
->dump_string("code", "RMOBJECT");
4563 f
->dump_unsigned("old_version", old_version
);
4566 void try_rmobject(version_t old_version
) override
{
4567 f
->open_object_section("op");
4568 f
->dump_string("code", "TRY_RMOBJECT");
4569 f
->dump_unsigned("old_version", old_version
);
4572 void create() override
{
4573 f
->open_object_section("op");
4574 f
->dump_string("code", "CREATE");
4577 void update_snaps(const set
<snapid_t
> &snaps
) override
{
4578 f
->open_object_section("op");
4579 f
->dump_string("code", "UPDATE_SNAPS");
4580 f
->dump_stream("snaps") << snaps
;
4583 void rollback_extents(
4585 const vector
<pair
<uint64_t, uint64_t> > &extents
) override
{
4586 f
->open_object_section("op");
4587 f
->dump_string("code", "ROLLBACK_EXTENTS");
4588 f
->dump_unsigned("gen", gen
);
// NOTE(review): the extents are streamed under the key "snaps", the same key
// update_snaps() uses -- looks like a copy/paste.  Renaming it would change
// the dumped output, so confirm with consumers before touching it.
4589 f
->dump_stream("snaps") << extents
;
// Writes this ObjectModDesc to the Formatter `f`: the two rollback flags as
// booleans followed by an "ops" array.  The lines that fill and close the
// array are missing from this listing.
4594 void ObjectModDesc::dump(Formatter
*f
) const
4596 f
->open_object_section("object_mod_desc");
4597 f
->dump_bool("can_local_rollback", can_local_rollback
);
4598 f
->dump_bool("rollback_info_completed", rollback_info_completed
);
4600 f
->open_array_section("ops");
// Test-instance generator: appends four ObjectModDesc objects exercising
// append, setattrs, rmobject and mark_unrollbackable.  The last instance calls
// append(1000) after mark_unrollbackable().
// NOTE(review): the lines that populate `attrs` and some calls on the third
// and fourth instances are missing from this listing.
4608 void ObjectModDesc::generate_test_instances(list
<ObjectModDesc
*>& o
)
4610 map
<string
, std::optional
<ceph::buffer::list
> > attrs
;
4614 o
.push_back(new ObjectModDesc());
4615 o
.back()->append(100);
4616 o
.back()->setattrs(attrs
);
4617 o
.push_back(new ObjectModDesc());
4618 o
.back()->rmobject(1001);
4619 o
.push_back(new ObjectModDesc());
4621 o
.back()->setattrs(attrs
);
4622 o
.push_back(new ObjectModDesc());
4624 o
.back()->setattrs(attrs
);
4625 o
.back()->mark_unrollbackable();
4626 o
.back()->append(1000);
// Serializes this ObjectModDesc into `_bl`.  max_required_version is passed as
// both the version and the compat argument of ENCODE_START, then the two
// rollback flags follow.  The encoding of the record buffer is on lines
// missing from this listing.
4629 void ObjectModDesc::encode(ceph::buffer::list
&_bl
) const
4631 ENCODE_START(max_required_version
, max_required_version
, _bl
);
4632 encode(can_local_rollback
, _bl
);
4633 encode(rollback_info_completed
, _bl
);
// Deserializes an ObjectModDesc from `_bl`.  The decoded struct version is
// remembered in max_required_version, the two rollback flags are read, and the
// member buffer `bl` is reassigned to the osd_pglog mempool.
// NOTE(review): the lines that decode `bl` and (per the comment below) keep it
// from pinning a larger buffer are missing from this listing.
4637 void ObjectModDesc::decode(ceph::buffer::list::const_iterator
&_bl
)
4639 DECODE_START(2, _bl
);
4640 max_required_version
= struct_v
;
4641 decode(can_local_rollback
, _bl
);
4642 decode(rollback_info_completed
, _bl
);
4644 // ensure bl does not pin a larger ceph::buffer in memory
4646 bl
.reassign_to_mempool(mempool::mempool_osd_pglog
);
// Definition of the static atomic limit that trim() compares
// clean_offsets.num_intervals() against; initialised to 10.
4650 std::atomic
<uint32_t> ObjectCleanRegions::max_num_intervals
= {10};
// Stores `num` into the static atomic max_num_intervals.
4652 void ObjectCleanRegions::set_max_num_intervals(uint32_t num
)
4654 max_num_intervals
= num
;
// While clean_offsets holds more intervals than max_num_intervals, finds the
// interval with the smallest length (the first one wins ties, since the
// comparison is strict) and erases it.
// NOTE(review): the statement taken when clean_offsets is empty and the loop
// increment are on lines missing from this listing.
4657 void ObjectCleanRegions::trim()
4659 while(clean_offsets
.num_intervals() > max_num_intervals
) {
4660 typename interval_set
<uint64_t>::iterator shortest_interval
= clean_offsets
.begin();
4661 if (shortest_interval
== clean_offsets
.end())
4663 for (typename interval_set
<uint64_t>::iterator it
= clean_offsets
.begin();
4664 it
!= clean_offsets
.end();
4666 if (it
.get_len() < shortest_interval
.get_len())
4667 shortest_interval
= it
;
4669 clean_offsets
.erase(shortest_interval
);
4673 void ObjectCleanRegions::merge(const ObjectCleanRegions
&other
)
4675 clean_offsets
.intersection_of(other
.clean_offsets
);
4676 clean_omap
= clean_omap
&& other
.clean_omap
;
4680 void ObjectCleanRegions::mark_data_region_dirty(uint64_t offset
, uint64_t len
)
4682 interval_set
<uint64_t> clean_region
;
4683 clean_region
.insert(0, (uint64_t)-1);
4684 clean_region
.erase(offset
, len
);
4685 clean_offsets
.intersection_of(clean_region
);
4689 bool ObjectCleanRegions::is_clean_region(uint64_t offset
, uint64_t len
) const
4691 return clean_offsets
.contains(offset
, len
);
4694 void ObjectCleanRegions::mark_omap_dirty()
4699 void ObjectCleanRegions::mark_object_new()
4704 void ObjectCleanRegions::mark_fully_dirty()
4706 mark_data_region_dirty(0, (uint64_t)-1);
4711 interval_set
<uint64_t> ObjectCleanRegions::get_dirty_regions() const
4713 interval_set
<uint64_t> dirty_region
;
4714 dirty_region
.insert(0, (uint64_t)-1);
4715 dirty_region
.subtract(clean_offsets
);
4716 return dirty_region
;
4719 bool ObjectCleanRegions::omap_is_dirty() const
4724 bool ObjectCleanRegions::object_is_exist() const
4729 void ObjectCleanRegions::encode(bufferlist
&bl
) const
4731 ENCODE_START(1, 1, bl
);
4733 encode(clean_offsets
, bl
);
4734 encode(clean_omap
, bl
);
4735 encode(new_object
, bl
);
4739 void ObjectCleanRegions::decode(bufferlist::const_iterator
&bl
)
4741 DECODE_START(1, bl
);
4743 decode(clean_offsets
, bl
);
4744 decode(clean_omap
, bl
);
4745 decode(new_object
, bl
);
4749 void ObjectCleanRegions::dump(Formatter
*f
) const
4751 f
->open_object_section("object_clean_regions");
4752 f
->dump_stream("clean_offsets") << clean_offsets
;
4753 f
->dump_bool("clean_omap", clean_omap
);
4754 f
->dump_bool("new_object", new_object
);
4758 void ObjectCleanRegions::generate_test_instances(list
<ObjectCleanRegions
*>& o
)
4760 o
.push_back(new ObjectCleanRegions());
4761 o
.push_back(new ObjectCleanRegions());
4762 o
.back()->mark_data_region_dirty(4096, 40960);
4763 o
.back()->mark_omap_dirty();
4764 o
.back()->mark_object_new();
4767 ostream
& operator<<(ostream
& out
, const ObjectCleanRegions
& ocr
)
4769 return out
<< "clean_offsets: " << ocr
.clean_offsets
4770 << ", clean_omap: " << ocr
.clean_omap
4771 << ", new_object: " << ocr
.new_object
;
4774 // -- pg_log_entry_t --
4776 string
pg_log_entry_t::get_key_name() const
4778 return version
.get_key_name();
4781 void pg_log_entry_t::encode_with_checksum(ceph::buffer::list
& bl
) const
4784 ceph::buffer::list
ebl(sizeof(*this)*2);
4786 __u32 crc
= ebl
.crc32c(0);
4791 void pg_log_entry_t::decode_with_checksum(ceph::buffer::list::const_iterator
& p
)
4794 ceph::buffer::list bl
;
4798 if (crc
!= bl
.crc32c(0))
4799 throw ceph::buffer::malformed_input("bad checksum on pg_log_entry_t");
4800 auto q
= bl
.cbegin();
4804 void pg_log_entry_t::encode(ceph::buffer::list
&bl
) const
4806 ENCODE_START(14, 4, bl
);
4809 encode(version
, bl
);
4812 * Added with reverting_to:
4813 * Previous code used prior_version to encode
4814 * what we now call reverting_to. This will
4815 * allow older code to decode reverting_to
4816 * into prior_version as expected.
4818 if (op
== LOST_REVERT
)
4819 encode(reverting_to
, bl
);
4821 encode(prior_version
, bl
);
4825 if (op
== LOST_REVERT
)
4826 encode(prior_version
, bl
);
4828 encode(user_version
, bl
);
4829 encode(mod_desc
, bl
);
4830 encode(extra_reqids
, bl
);
4832 encode(return_code
, bl
);
4833 if (!extra_reqids
.empty())
4834 encode(extra_reqid_return_codes
, bl
);
4835 encode(clean_regions
, bl
);
4837 encode(return_code
, bl
);
4838 encode(op_returns
, bl
);
4842 void pg_log_entry_t::decode(ceph::buffer::list::const_iterator
&bl
)
4844 DECODE_START_LEGACY_COMPAT_LEN(14, 4, 4, bl
);
4848 decode(old_soid
, bl
);
4849 soid
.oid
= old_soid
.oid
;
4850 soid
.snap
= old_soid
.snap
;
4851 invalid_hash
= true;
4856 invalid_hash
= true;
4857 decode(version
, bl
);
4859 if (struct_v
>= 6 && op
== LOST_REVERT
)
4860 decode(reverting_to
, bl
);
4862 decode(prior_version
, bl
);
4868 invalid_pool
= true;
4870 if (op
== LOST_REVERT
) {
4871 if (struct_v
>= 6) {
4872 decode(prior_version
, bl
);
4874 reverting_to
= prior_version
;
4877 if (struct_v
>= 7 || // for v >= 7, this is for all ops.
4878 op
== CLONE
) { // for v < 7, it's only present for CLONE.
4880 // ensure snaps does not pin a larger ceph::buffer in memory
4882 snaps
.reassign_to_mempool(mempool::mempool_osd_pglog
);
4886 decode(user_version
, bl
);
4888 user_version
= version
.version
;
4891 decode(mod_desc
, bl
);
4893 mod_desc
.mark_unrollbackable();
4895 decode(extra_reqids
, bl
);
4896 if (struct_v
>= 11 && op
== ERROR
)
4897 decode(return_code
, bl
);
4898 if (struct_v
>= 12 && !extra_reqids
.empty())
4899 decode(extra_reqid_return_codes
, bl
);
4901 decode(clean_regions
, bl
);
4903 clean_regions
.mark_fully_dirty();
4904 if (struct_v
>= 14) {
4906 decode(return_code
, bl
);
4908 decode(op_returns
, bl
);
4913 void pg_log_entry_t::dump(Formatter
*f
) const
4915 f
->dump_string("op", get_op_name());
4916 f
->dump_stream("object") << soid
;
4917 f
->dump_stream("version") << version
;
4918 f
->dump_stream("prior_version") << prior_version
;
4919 f
->dump_stream("reqid") << reqid
;
4920 f
->open_array_section("extra_reqids");
4922 for (auto p
= extra_reqids
.begin();
4923 p
!= extra_reqids
.end();
4925 f
->open_object_section("extra_reqid");
4926 f
->dump_stream("reqid") << p
->first
;
4927 f
->dump_stream("user_version") << p
->second
;
4928 auto it
= extra_reqid_return_codes
.find(idx
);
4929 if (it
!= extra_reqid_return_codes
.end()) {
4930 f
->dump_int("return_code", it
->second
);
4935 f
->dump_stream("mtime") << mtime
;
4936 f
->dump_int("return_code", return_code
);
4937 if (!op_returns
.empty()) {
4938 f
->open_array_section("op_returns");
4939 for (auto& i
: op_returns
) {
4940 f
->dump_object("op", i
);
4944 if (snaps
.length() > 0) {
4946 ceph::buffer::list c
= snaps
;
4947 auto p
= c
.cbegin();
4954 f
->open_object_section("snaps");
4955 for (auto p
= v
.begin(); p
!= v
.end(); ++p
)
4956 f
->dump_unsigned("snap", *p
);
4960 f
->open_object_section("mod_desc");
4965 f
->open_object_section("clean_regions");
4966 clean_regions
.dump(f
);
4971 void pg_log_entry_t::generate_test_instances(list
<pg_log_entry_t
*>& o
)
4973 o
.push_back(new pg_log_entry_t());
4974 hobject_t
oid(object_t("objname"), "key", 123, 456, 0, "");
4975 o
.push_back(new pg_log_entry_t(MODIFY
, oid
, eversion_t(1,2), eversion_t(3,4),
4976 1, osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
4978 o
.push_back(new pg_log_entry_t(ERROR
, oid
, eversion_t(1,2), eversion_t(3,4),
4979 1, osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
4980 utime_t(8,9), -ENOENT
));
4983 ostream
& operator<<(ostream
& out
, const pg_log_entry_t
& e
)
4985 out
<< e
.version
<< " (" << e
.prior_version
<< ") "
4986 << std::left
<< std::setw(8) << e
.get_op_name() << ' '
4987 << e
.soid
<< " by " << e
.reqid
<< " " << e
.mtime
4988 << " " << e
.return_code
;
4989 if (!e
.op_returns
.empty()) {
4990 out
<< " " << e
.op_returns
;
4992 if (e
.snaps
.length()) {
4993 vector
<snapid_t
> snaps
;
4994 ceph::buffer::list c
= e
.snaps
;
4995 auto p
= c
.cbegin();
5001 out
<< " snaps " << snaps
;
5003 out
<< " ObjectCleanRegions " << e
.clean_regions
;
5007 // -- pg_log_dup_t --
5009 std::string
pg_log_dup_t::get_key_name() const
5011 static const char prefix
[] = "dup_";
5012 std::string
key(36, ' ');
5013 memcpy(&key
[0], prefix
, 4);
5014 version
.get_key_name(&key
[4]);
5015 key
.resize(35); // remove the null terminator
5019 void pg_log_dup_t::encode(ceph::buffer::list
&bl
) const
5021 ENCODE_START(2, 1, bl
);
5023 encode(version
, bl
);
5024 encode(user_version
, bl
);
5025 encode(return_code
, bl
);
5026 encode(op_returns
, bl
);
5030 void pg_log_dup_t::decode(ceph::buffer::list::const_iterator
&bl
)
5032 DECODE_START(2, bl
);
5034 decode(version
, bl
);
5035 decode(user_version
, bl
);
5036 decode(return_code
, bl
);
5037 if (struct_v
>= 2) {
5038 decode(op_returns
, bl
);
5043 void pg_log_dup_t::dump(Formatter
*f
) const
5045 f
->dump_stream("reqid") << reqid
;
5046 f
->dump_stream("version") << version
;
5047 f
->dump_stream("user_version") << user_version
;
5048 f
->dump_stream("return_code") << return_code
;
5049 if (!op_returns
.empty()) {
5050 f
->open_array_section("op_returns");
5051 for (auto& i
: op_returns
) {
5052 f
->dump_object("op", i
);
5058 void pg_log_dup_t::generate_test_instances(list
<pg_log_dup_t
*>& o
)
5060 o
.push_back(new pg_log_dup_t());
5061 o
.push_back(new pg_log_dup_t(eversion_t(1,2),
5063 osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
5065 o
.push_back(new pg_log_dup_t(eversion_t(1,2),
5067 osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
5072 std::ostream
& operator<<(std::ostream
& out
, const pg_log_dup_t
& e
) {
5073 out
<< "log_dup(reqid=" << e
.reqid
<<
5074 " v=" << e
.version
<< " uv=" << e
.user_version
<<
5075 " rc=" << e
.return_code
;
5076 if (!e
.op_returns
.empty()) {
5077 out
<< " " << e
.op_returns
;
5085 // out: pg_log_t that only has entries that apply to import_pgid using curmap
5086 // reject: Entries rejected from "in" are in the reject.log. Other fields not set.
5087 void pg_log_t::filter_log(spg_t import_pgid
, const OSDMap
&curmap
,
5088 const string
&hit_set_namespace
, const pg_log_t
&in
,
5089 pg_log_t
&out
, pg_log_t
&reject
)
5095 for (auto i
= in
.log
.cbegin(); i
!= in
.log
.cend(); ++i
) {
5097 // Reject pg log entries for temporary objects
5098 if (i
->soid
.is_temp()) {
5099 reject
.log
.push_back(*i
);
5103 if (i
->soid
.nspace
!= hit_set_namespace
) {
5104 object_t oid
= i
->soid
.oid
;
5105 object_locator_t
loc(i
->soid
);
5106 pg_t raw_pgid
= curmap
.object_locator_to_pg(oid
, loc
);
5107 pg_t pgid
= curmap
.raw_pg_to_pg(raw_pgid
);
5109 if (import_pgid
.pgid
== pgid
) {
5110 out
.log
.push_back(*i
);
5112 reject
.log
.push_back(*i
);
5115 out
.log
.push_back(*i
);
5120 void pg_log_t::encode(ceph::buffer::list
& bl
) const
5122 ENCODE_START(7, 3, bl
);
5126 encode(can_rollback_to
, bl
);
5127 encode(rollback_info_trimmed_to
, bl
);
5132 void pg_log_t::decode(ceph::buffer::list::const_iterator
&bl
, int64_t pool
)
5134 DECODE_START_LEGACY_COMPAT_LEN(7, 3, 3, bl
);
5139 decode(backlog
, bl
);
5143 decode(can_rollback_to
, bl
);
5146 decode(rollback_info_trimmed_to
, bl
);
5148 rollback_info_trimmed_to
= tail
;
5155 // handle hobject_t format change
5157 for (auto i
= log
.begin(); i
!= log
.end(); ++i
) {
5158 if (!i
->soid
.is_max() && i
->soid
.pool
== -1)
5159 i
->soid
.pool
= pool
;
5164 void pg_log_t::dump(Formatter
*f
) const
5166 f
->dump_stream("head") << head
;
5167 f
->dump_stream("tail") << tail
;
5168 f
->open_array_section("log");
5169 for (auto p
= log
.cbegin(); p
!= log
.cend(); ++p
) {
5170 f
->open_object_section("entry");
5175 f
->open_array_section("dups");
5176 for (const auto& entry
: dups
) {
5177 f
->open_object_section("entry");
5184 void pg_log_t::generate_test_instances(list
<pg_log_t
*>& o
)
5186 o
.push_back(new pg_log_t
);
5188 // this is nonsensical:
5189 o
.push_back(new pg_log_t
);
5190 o
.back()->head
= eversion_t(1,2);
5191 o
.back()->tail
= eversion_t(3,4);
5192 list
<pg_log_entry_t
*> e
;
5193 pg_log_entry_t::generate_test_instances(e
);
5194 for (auto p
= e
.begin(); p
!= e
.end(); ++p
)
5195 o
.back()->log
.push_back(**p
);
5198 static void _handle_dups(CephContext
* cct
, pg_log_t
&target
, const pg_log_t
&other
, unsigned maxdups
)
5200 auto earliest_dup_version
=
5201 target
.head
.version
< maxdups
? 0u : target
.head
.version
- maxdups
+ 1;
5202 lgeneric_subdout(cct
, osd
, 20) << "copy_up_to/copy_after earliest_dup_version " << earliest_dup_version
<< dendl
;
5204 for (auto d
= other
.dups
.cbegin(); d
!= other
.dups
.cend(); ++d
) {
5205 if (d
->version
.version
>= earliest_dup_version
) {
5206 lgeneric_subdout(cct
, osd
, 20)
5207 << "copy_up_to/copy_after copy dup version "
5208 << d
->version
<< dendl
;
5209 target
.dups
.push_back(pg_log_dup_t(*d
));
5213 for (auto i
= other
.log
.cbegin(); i
!= other
.log
.cend(); ++i
) {
5214 ceph_assert(i
->version
> other
.tail
);
5215 if (i
->version
> target
.tail
)
5217 if (i
->version
.version
>= earliest_dup_version
) {
5218 lgeneric_subdout(cct
, osd
, 20)
5219 << "copy_up_to/copy_after copy dup from log version "
5220 << i
->version
<< dendl
;
5221 target
.dups
.push_back(pg_log_dup_t(*i
));
5227 void pg_log_t::copy_after(CephContext
* cct
, const pg_log_t
&other
, eversion_t v
)
5229 can_rollback_to
= other
.can_rollback_to
;
5232 lgeneric_subdout(cct
, osd
, 20) << __func__
<< " v " << v
<< dendl
;
5233 for (auto i
= other
.log
.crbegin(); i
!= other
.log
.crend(); ++i
) {
5234 ceph_assert(i
->version
> other
.tail
);
5235 if (i
->version
<= v
) {
5236 // make tail accurate.
5240 lgeneric_subdout(cct
, osd
, 20) << __func__
<< " copy log version " << i
->version
<< dendl
;
5243 _handle_dups(cct
, *this, other
, cct
->_conf
->osd_pg_log_dups_tracked
);
5246 void pg_log_t::copy_up_to(CephContext
* cct
, const pg_log_t
&other
, int max
)
5248 can_rollback_to
= other
.can_rollback_to
;
5252 lgeneric_subdout(cct
, osd
, 20) << __func__
<< " max " << max
<< dendl
;
5253 for (auto i
= other
.log
.crbegin(); i
!= other
.log
.crend(); ++i
) {
5254 ceph_assert(i
->version
> other
.tail
);
5259 lgeneric_subdout(cct
, osd
, 20) << __func__
<< " copy log version " << i
->version
<< dendl
;
5262 _handle_dups(cct
, *this, other
, cct
->_conf
->osd_pg_log_dups_tracked
);
5265 ostream
& pg_log_t::print(ostream
& out
) const
5267 out
<< *this << std::endl
;
5268 for (auto p
= log
.cbegin(); p
!= log
.cend(); ++p
)
5269 out
<< *p
<< std::endl
;
5270 for (const auto& entry
: dups
) {
5271 out
<< " dup entry: " << entry
<< std::endl
;
5276 // -- pg_missing_t --
5278 ostream
& operator<<(ostream
& out
, const pg_missing_item
& i
)
5281 if (i
.have
!= eversion_t())
5282 out
<< "(" << i
.have
<< ")";
5283 out
<< " flags = " << i
.flag_str()
5284 << " " << i
.clean_regions
;
5288 // -- object_copy_cursor_t --
5290 void object_copy_cursor_t::encode(ceph::buffer::list
& bl
) const
5292 ENCODE_START(1, 1, bl
);
5293 encode(attr_complete
, bl
);
5294 encode(data_offset
, bl
);
5295 encode(data_complete
, bl
);
5296 encode(omap_offset
, bl
);
5297 encode(omap_complete
, bl
);
5301 void object_copy_cursor_t::decode(ceph::buffer::list::const_iterator
&bl
)
5303 DECODE_START(1, bl
);
5304 decode(attr_complete
, bl
);
5305 decode(data_offset
, bl
);
5306 decode(data_complete
, bl
);
5307 decode(omap_offset
, bl
);
5308 decode(omap_complete
, bl
);
5312 void object_copy_cursor_t::dump(Formatter
*f
) const
5314 f
->dump_unsigned("attr_complete", (int)attr_complete
);
5315 f
->dump_unsigned("data_offset", data_offset
);
5316 f
->dump_unsigned("data_complete", (int)data_complete
);
5317 f
->dump_string("omap_offset", omap_offset
);
5318 f
->dump_unsigned("omap_complete", (int)omap_complete
);
5321 void object_copy_cursor_t::generate_test_instances(list
<object_copy_cursor_t
*>& o
)
5323 o
.push_back(new object_copy_cursor_t
);
5324 o
.push_back(new object_copy_cursor_t
);
5325 o
.back()->attr_complete
= true;
5326 o
.back()->data_offset
= 123;
5327 o
.push_back(new object_copy_cursor_t
);
5328 o
.back()->attr_complete
= true;
5329 o
.back()->data_complete
= true;
5330 o
.back()->omap_offset
= "foo";
5331 o
.push_back(new object_copy_cursor_t
);
5332 o
.back()->attr_complete
= true;
5333 o
.back()->data_complete
= true;
5334 o
.back()->omap_complete
= true;
5337 // -- object_copy_data_t --
5339 void object_copy_data_t::encode(ceph::buffer::list
& bl
, uint64_t features
) const
5341 ENCODE_START(8, 5, bl
);
5346 encode(omap_data
, bl
);
5348 encode(omap_header
, bl
);
5350 encode(snap_seq
, bl
);
5352 encode(data_digest
, bl
);
5353 encode(omap_digest
, bl
);
5355 encode(truncate_seq
, bl
);
5356 encode(truncate_size
, bl
);
5357 encode(reqid_return_codes
, bl
);
5361 void object_copy_data_t::decode(ceph::buffer::list::const_iterator
& bl
)
5363 DECODE_START(8, bl
);
5370 decode(category
, bl
); // no longer used
5375 map
<string
,ceph::buffer::list
> omap
;
5378 if (!omap
.empty()) {
5380 encode(omap
, omap_data
);
5385 decode(omap_header
, bl
);
5386 if (struct_v
>= 3) {
5388 decode(snap_seq
, bl
);
5393 if (struct_v
>= 4) {
5395 decode(data_digest
, bl
);
5396 decode(omap_digest
, bl
);
5404 decode(omap_data
, bl
);
5406 decode(omap_header
, bl
);
5408 decode(snap_seq
, bl
);
5409 if (struct_v
>= 4) {
5411 decode(data_digest
, bl
);
5412 decode(omap_digest
, bl
);
5414 if (struct_v
>= 6) {
5417 if (struct_v
>= 7) {
5418 decode(truncate_seq
, bl
);
5419 decode(truncate_size
, bl
);
5421 if (struct_v
>= 8) {
5422 decode(reqid_return_codes
, bl
);
5428 void object_copy_data_t::generate_test_instances(list
<object_copy_data_t
*>& o
)
5430 o
.push_back(new object_copy_data_t());
5432 list
<object_copy_cursor_t
*> cursors
;
5433 object_copy_cursor_t::generate_test_instances(cursors
);
5434 auto ci
= cursors
.begin();
5435 o
.back()->cursor
= **(ci
++);
5437 o
.push_back(new object_copy_data_t());
5438 o
.back()->cursor
= **(ci
++);
5440 o
.push_back(new object_copy_data_t());
5441 o
.back()->size
= 1234;
5442 o
.back()->mtime
.set_from_double(1234);
5443 ceph::buffer::ptr
bp("there", 5);
5444 ceph::buffer::list bl
;
5446 o
.back()->attrs
["hello"] = bl
;
5447 ceph::buffer::ptr
bp2("not", 3);
5448 ceph::buffer::list bl2
;
5450 map
<string
,ceph::buffer::list
> omap
;
5453 encode(omap
, o
.back()->omap_data
);
5454 ceph::buffer::ptr
databp("iamsomedatatocontain", 20);
5455 o
.back()->data
.push_back(databp
);
5456 o
.back()->omap_header
.append("this is an omap header");
5457 o
.back()->snaps
.push_back(123);
5458 o
.back()->reqids
.push_back(make_pair(osd_reqid_t(), version_t()));
5461 void object_copy_data_t::dump(Formatter
*f
) const
5463 f
->open_object_section("cursor");
5465 f
->close_section(); // cursor
5466 f
->dump_int("size", size
);
5467 f
->dump_stream("mtime") << mtime
;
5468 /* we should really print out the attrs here, but ceph::buffer::list
5469 const-correctness prevents that */
5470 f
->dump_int("attrs_size", attrs
.size());
5471 f
->dump_int("flags", flags
);
5472 f
->dump_unsigned("data_digest", data_digest
);
5473 f
->dump_unsigned("omap_digest", omap_digest
);
5474 f
->dump_int("omap_data_length", omap_data
.length());
5475 f
->dump_int("omap_header_length", omap_header
.length());
5476 f
->dump_int("data_length", data
.length());
5477 f
->open_array_section("snaps");
5478 for (auto p
= snaps
.cbegin(); p
!= snaps
.cend(); ++p
)
5479 f
->dump_unsigned("snap", *p
);
5481 f
->open_array_section("reqids");
5483 for (auto p
= reqids
.begin();
5486 f
->open_object_section("extra_reqid");
5487 f
->dump_stream("reqid") << p
->first
;
5488 f
->dump_stream("user_version") << p
->second
;
5489 auto it
= reqid_return_codes
.find(idx
);
5490 if (it
!= reqid_return_codes
.end()) {
5491 f
->dump_int("return_code", it
->second
);
5498 // -- pg_create_t --
5500 void pg_create_t::encode(ceph::buffer::list
&bl
) const
5502 ENCODE_START(1, 1, bl
);
5503 encode(created
, bl
);
5505 encode(split_bits
, bl
);
5509 void pg_create_t::decode(ceph::buffer::list::const_iterator
&bl
)
5511 DECODE_START(1, bl
);
5512 decode(created
, bl
);
5514 decode(split_bits
, bl
);
5518 void pg_create_t::dump(Formatter
*f
) const
5520 f
->dump_unsigned("created", created
);
5521 f
->dump_stream("parent") << parent
;
5522 f
->dump_int("split_bits", split_bits
);
5525 void pg_create_t::generate_test_instances(list
<pg_create_t
*>& o
)
5527 o
.push_back(new pg_create_t
);
5528 o
.push_back(new pg_create_t(1, pg_t(3, 4), 2));
5532 // -- pg_hit_set_info_t --
5534 void pg_hit_set_info_t::encode(ceph::buffer::list
& bl
) const
5536 ENCODE_START(2, 1, bl
);
5539 encode(version
, bl
);
5540 encode(using_gmt
, bl
);
5544 void pg_hit_set_info_t::decode(ceph::buffer::list::const_iterator
& p
)
5550 if (struct_v
>= 2) {
5551 decode(using_gmt
, p
);
5558 void pg_hit_set_info_t::dump(Formatter
*f
) const
5560 f
->dump_stream("begin") << begin
;
5561 f
->dump_stream("end") << end
;
5562 f
->dump_stream("version") << version
;
5563 f
->dump_stream("using_gmt") << using_gmt
;
5566 void pg_hit_set_info_t::generate_test_instances(list
<pg_hit_set_info_t
*>& ls
)
5568 ls
.push_back(new pg_hit_set_info_t
);
5569 ls
.push_back(new pg_hit_set_info_t
);
5570 ls
.back()->begin
= utime_t(1, 2);
5571 ls
.back()->end
= utime_t(3, 4);
5575 // -- pg_hit_set_history_t --
5577 void pg_hit_set_history_t::encode(ceph::buffer::list
& bl
) const
5579 ENCODE_START(1, 1, bl
);
5580 encode(current_last_update
, bl
);
5582 utime_t dummy_stamp
;
5583 encode(dummy_stamp
, bl
);
5586 pg_hit_set_info_t dummy_info
;
5587 encode(dummy_info
, bl
);
5589 encode(history
, bl
);
5593 void pg_hit_set_history_t::decode(ceph::buffer::list::const_iterator
& p
)
5596 decode(current_last_update
, p
);
5598 utime_t dummy_stamp
;
5599 decode(dummy_stamp
, p
);
5602 pg_hit_set_info_t dummy_info
;
5603 decode(dummy_info
, p
);
5609 void pg_hit_set_history_t::dump(Formatter
*f
) const
5611 f
->dump_stream("current_last_update") << current_last_update
;
5612 f
->open_array_section("history");
5613 for (auto p
= history
.cbegin(); p
!= history
.cend(); ++p
) {
5614 f
->open_object_section("info");
5621 void pg_hit_set_history_t::generate_test_instances(list
<pg_hit_set_history_t
*>& ls
)
5623 ls
.push_back(new pg_hit_set_history_t
);
5624 ls
.push_back(new pg_hit_set_history_t
);
5625 ls
.back()->current_last_update
= eversion_t(1, 2);
5626 ls
.back()->history
.push_back(pg_hit_set_info_t());
5629 // -- OSDSuperblock --
5631 void OSDSuperblock::encode(ceph::buffer::list
&bl
) const
5633 ENCODE_START(9, 5, bl
);
5634 encode(cluster_fsid
, bl
);
5636 encode(current_epoch
, bl
);
5637 encode(oldest_map
, bl
);
5638 encode(newest_map
, bl
);
5640 compat_features
.encode(bl
);
5641 encode(clean_thru
, bl
);
5642 encode(mounted
, bl
);
5643 encode(osd_fsid
, bl
);
5644 encode((epoch_t
)0, bl
); // epoch_t last_epoch_marked_full
5645 encode((uint32_t)0, bl
); // map<int64_t,epoch_t> pool_last_epoch_marked_full
5646 encode(purged_snaps_last
, bl
);
5647 encode(last_purged_snaps_scrub
, bl
);
5651 void OSDSuperblock::decode(ceph::buffer::list::const_iterator
&bl
)
5653 DECODE_START_LEGACY_COMPAT_LEN(9, 5, 5, bl
);
5658 decode(cluster_fsid
, bl
);
5660 decode(current_epoch
, bl
);
5661 decode(oldest_map
, bl
);
5662 decode(newest_map
, bl
);
5664 if (struct_v
>= 2) {
5665 compat_features
.decode(bl
);
5666 } else { //upgrade it!
5667 compat_features
.incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE
);
5669 decode(clean_thru
, bl
);
5670 decode(mounted
, bl
);
5672 decode(osd_fsid
, bl
);
5673 if (struct_v
>= 6) {
5674 epoch_t last_map_marked_full
;
5675 decode(last_map_marked_full
, bl
);
5677 if (struct_v
>= 7) {
5678 map
<int64_t,epoch_t
> pool_last_map_marked_full
;
5679 decode(pool_last_map_marked_full
, bl
);
5681 if (struct_v
>= 9) {
5682 decode(purged_snaps_last
, bl
);
5683 decode(last_purged_snaps_scrub
, bl
);
5685 purged_snaps_last
= 0;
5690 void OSDSuperblock::dump(Formatter
*f
) const
5692 f
->dump_stream("cluster_fsid") << cluster_fsid
;
5693 f
->dump_stream("osd_fsid") << osd_fsid
;
5694 f
->dump_int("whoami", whoami
);
5695 f
->dump_int("current_epoch", current_epoch
);
5696 f
->dump_int("oldest_map", oldest_map
);
5697 f
->dump_int("newest_map", newest_map
);
5698 f
->dump_float("weight", weight
);
5699 f
->open_object_section("compat");
5700 compat_features
.dump(f
);
5702 f
->dump_int("clean_thru", clean_thru
);
5703 f
->dump_int("last_epoch_mounted", mounted
);
5704 f
->dump_unsigned("purged_snaps_last", purged_snaps_last
);
5705 f
->dump_stream("last_purged_snaps_scrub") << last_purged_snaps_scrub
;
5708 void OSDSuperblock::generate_test_instances(list
<OSDSuperblock
*>& o
)
5711 o
.push_back(new OSDSuperblock(z
));
5712 z
.cluster_fsid
.parse("01010101-0101-0101-0101-010101010101");
5713 z
.osd_fsid
.parse("02020202-0202-0202-0202-020202020202");
5715 z
.current_epoch
= 4;
5720 o
.push_back(new OSDSuperblock(z
));
5721 o
.push_back(new OSDSuperblock(z
));
5726 void SnapSet::encode(ceph::buffer::list
& bl
) const
5728 ENCODE_START(3, 2, bl
);
5730 encode(true, bl
); // head_exists
5733 encode(clone_overlap
, bl
);
5734 encode(clone_size
, bl
);
5735 encode(clone_snaps
, bl
);
5739 void SnapSet::decode(ceph::buffer::list::const_iterator
& bl
)
5741 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl
);
5743 bl
+= 1u; // skip legacy head_exists (always true)
5746 decode(clone_overlap
, bl
);
5747 decode(clone_size
, bl
);
5748 if (struct_v
>= 3) {
5749 decode(clone_snaps
, bl
);
5751 clone_snaps
.clear();
5756 void SnapSet::dump(Formatter
*f
) const
5758 f
->dump_unsigned("seq", seq
);
5759 f
->open_array_section("clones");
5760 for (auto p
= clones
.cbegin(); p
!= clones
.cend(); ++p
) {
5761 f
->open_object_section("clone");
5762 f
->dump_unsigned("snap", *p
);
5763 auto cs
= clone_size
.find(*p
);
5764 if (cs
!= clone_size
.end())
5765 f
->dump_unsigned("size", cs
->second
);
5767 f
->dump_string("size", "????");
5768 auto co
= clone_overlap
.find(*p
);
5769 if (co
!= clone_overlap
.end())
5770 f
->dump_stream("overlap") << co
->second
;
5772 f
->dump_stream("overlap") << "????";
5773 auto q
= clone_snaps
.find(*p
);
5774 if (q
!= clone_snaps
.end()) {
5775 f
->open_array_section("snaps");
5776 for (auto s
: q
->second
) {
5777 f
->dump_unsigned("snap", s
);
5786 void SnapSet::generate_test_instances(list
<SnapSet
*>& o
)
5788 o
.push_back(new SnapSet
);
5789 o
.push_back(new SnapSet
);
5790 o
.back()->seq
= 123;
5791 o
.back()->snaps
.push_back(123);
5792 o
.back()->snaps
.push_back(12);
5793 o
.push_back(new SnapSet
);
5794 o
.back()->seq
= 123;
5795 o
.back()->snaps
.push_back(123);
5796 o
.back()->snaps
.push_back(12);
5797 o
.back()->clones
.push_back(12);
5798 o
.back()->clone_size
[12] = 12345;
5799 o
.back()->clone_overlap
[12];
5800 o
.back()->clone_snaps
[12] = {12, 10, 8};
5803 ostream
& operator<<(ostream
& out
, const SnapSet
& cs
)
5805 return out
<< cs
.seq
<< "=" << cs
.snaps
<< ":"
5809 void SnapSet::from_snap_set(const librados::snap_set_t
& ss
, bool legacy
)
5811 // NOTE: our reconstruction of snaps (and the snapc) is not strictly
5812 // correct: it will not include snaps that still logically exist
5813 // but for which there was no clone that is defined. For all
5814 // practical purposes this doesn't matter, since we only use that
5815 // information to clone on the OSD, and we have already moved
5816 // forward past that part of the object history.
5819 set
<snapid_t
> _snaps
;
5820 set
<snapid_t
> _clones
;
5821 for (auto p
= ss
.clones
.cbegin(); p
!= ss
.clones
.cend(); ++p
) {
5822 if (p
->cloneid
!= librados::SNAP_HEAD
) {
5823 _clones
.insert(p
->cloneid
);
5824 _snaps
.insert(p
->snaps
.begin(), p
->snaps
.end());
5825 clone_size
[p
->cloneid
] = p
->size
;
5826 clone_overlap
[p
->cloneid
]; // the entry must exist, even if it's empty.
5827 for (auto q
= p
->overlap
.cbegin(); q
!= p
->overlap
.cend(); ++q
)
5828 clone_overlap
[p
->cloneid
].insert(q
->first
, q
->second
);
5830 // p->snaps is ascending; clone_snaps is descending
5831 vector
<snapid_t
>& v
= clone_snaps
[p
->cloneid
];
5832 for (auto q
= p
->snaps
.rbegin(); q
!= p
->snaps
.rend(); ++q
) {
5841 clones
.reserve(_clones
.size());
5842 for (auto p
= _clones
.begin(); p
!= _clones
.end(); ++p
)
5843 clones
.push_back(*p
);
5847 snaps
.reserve(_snaps
.size());
5848 for (auto p
= _snaps
.rbegin();
5849 p
!= _snaps
.rend(); ++p
)
5850 snaps
.push_back(*p
);
5853 uint64_t SnapSet::get_clone_bytes(snapid_t clone
) const
5855 ceph_assert(clone_size
.count(clone
));
5856 uint64_t size
= clone_size
.find(clone
)->second
;
5857 ceph_assert(clone_overlap
.count(clone
));
5858 const interval_set
<uint64_t> &overlap
= clone_overlap
.find(clone
)->second
;
5859 ceph_assert(size
>= (uint64_t)overlap
.size());
5860 return size
- overlap
.size();
5863 void SnapSet::filter(const pg_pool_t
&pinfo
)
5865 vector
<snapid_t
> oldsnaps
;
5866 oldsnaps
.swap(snaps
);
5867 for (auto i
= oldsnaps
.cbegin(); i
!= oldsnaps
.cend(); ++i
) {
5868 if (!pinfo
.is_removed_snap(*i
))
5869 snaps
.push_back(*i
);
5873 SnapSet
SnapSet::get_filtered(const pg_pool_t
&pinfo
) const
5880 // -- watch_info_t --
5882 void watch_info_t::encode(ceph::buffer::list
& bl
, uint64_t features
) const
5884 ENCODE_START(4, 3, bl
);
5886 encode(timeout_seconds
, bl
);
5887 encode(addr
, bl
, features
);
5891 void watch_info_t::decode(ceph::buffer::list::const_iterator
& bl
)
5893 DECODE_START_LEGACY_COMPAT_LEN(4, 3, 3, bl
);
5899 decode(timeout_seconds
, bl
);
5900 if (struct_v
>= 4) {
5906 void watch_info_t::dump(Formatter
*f
) const
5908 f
->dump_unsigned("cookie", cookie
);
5909 f
->dump_unsigned("timeout_seconds", timeout_seconds
);
5910 f
->open_object_section("addr");
5915 void watch_info_t::generate_test_instances(list
<watch_info_t
*>& o
)
5917 o
.push_back(new watch_info_t
);
5918 o
.push_back(new watch_info_t
);
5919 o
.back()->cookie
= 123;
5920 o
.back()->timeout_seconds
= 99;
5922 ea
.set_type(entity_addr_t::TYPE_LEGACY
);
5924 ea
.set_family(AF_INET
);
5925 ea
.set_in4_quad(0, 127);
5926 ea
.set_in4_quad(1, 0);
5927 ea
.set_in4_quad(2, 1);
5928 ea
.set_in4_quad(3, 2);
5930 o
.back()->addr
= ea
;
5933 // -- chunk_info_t --
5935 void chunk_info_t::encode(ceph::buffer::list
& bl
) const
5937 ENCODE_START(1, 1, bl
);
5941 __u32 _flags
= flags
;
5946 void chunk_info_t::decode(ceph::buffer::list::const_iterator
& bl
)
5948 DECODE_START(1, bl
);
5954 flags
= (cflag_t
)_flags
;
5958 void chunk_info_t::dump(Formatter
*f
) const
5960 f
->dump_unsigned("length", length
);
5961 f
->open_object_section("oid");
5964 f
->dump_unsigned("flags", flags
);
5968 bool chunk_info_t::operator==(const chunk_info_t
& cit
) const
5970 if (has_fingerprint()) {
5971 if (oid
.oid
.name
== cit
.oid
.oid
.name
) {
5975 if (offset
== cit
.offset
&& length
== cit
.length
&&
5976 oid
.oid
.name
== cit
.oid
.oid
.name
) {
5984 bool operator==(const std::pair
<const long unsigned int, chunk_info_t
> & l
,
5985 const std::pair
<const long unsigned int, chunk_info_t
> & r
)
5987 return l
.first
== r
.first
&&
5988 l
.second
== r
.second
;
5991 ostream
& operator<<(ostream
& out
, const chunk_info_t
& ci
)
5993 return out
<< "(len: " << ci
.length
<< " oid: " << ci
.oid
5994 << " offset: " << ci
.offset
5995 << " flags: " << ci
.get_flag_string(ci
.flags
) << ")";
5998 // -- object_manifest_t --
6000 std::ostream
& operator<<(std::ostream
& out
, const object_ref_delta_t
& ci
)
6002 return out
<< ci
.ref_delta
<< std::endl
;
6005 void object_manifest_t::calc_refs_to_inc_on_set(
6006 const object_manifest_t
* _g
,
6007 const object_manifest_t
* _l
,
6008 object_ref_delta_t
&refs
) const
6010 /* avoid to increment the same reference on adjacent clones */
6011 auto iter
= chunk_map
.begin();
6012 auto find_chunk
= [](decltype(iter
) &i
, const object_manifest_t
* cur
)
6015 auto c
= cur
->chunk_map
.find(i
->first
);
6016 if (c
!= cur
->chunk_map
.end() && c
->second
== i
->second
) {
6024 /* If at least a same chunk exists on either _g or _l, do not increment
6027 * head: [0, 2) ccc, [6, 2) bbb, [8, 2) ccc
6028 * 20: [0, 2) aaa, <- set_chunk
6029 * 30: [0, 2) abc, [6, 2) bbb, [8, 2) ccc
6030 * --> incremnt the reference
6032 * head: [0, 2) ccc, [6, 2) bbb, [8, 2) ccc
6033 * 20: [0, 2) ccc, <- set_chunk
6034 * 30: [0, 2) abc, [6, 2) bbb, [8, 2) ccc
6035 * --> do not need to increment
6037 * head: [0, 2) ccc, [6, 2) bbb, [8, 2) ccc
6038 * 20: [0, 2) ccc, <- set_chunk
6039 * 30: [0, 2) ccc, [6, 2) bbb, [8, 2) ccc
6040 * --> decrement the reference of ccc
6043 for (; iter
!= chunk_map
.end(); ++iter
) {
6044 auto found_g
= find_chunk(iter
, _g
);
6045 auto found_l
= find_chunk(iter
, _l
);
6046 if (!found_g
&& !found_l
) {
6047 refs
.inc_ref(iter
->second
.oid
);
6048 } else if (found_g
&& found_l
) {
6049 refs
.dec_ref(iter
->second
.oid
);
6054 void object_manifest_t::calc_refs_to_drop_on_modify(
6055 const object_manifest_t
* _l
,
6056 const ObjectCleanRegions
& clean_regions
,
6057 object_ref_delta_t
&refs
) const
6059 for (auto &p
: chunk_map
) {
6060 if (!clean_regions
.is_clean_region(p
.first
, p
.second
.length
)) {
6061 // has previous snapshot
6064 * Let's assume that there is a manifest snapshotted object which has three chunks
6065 * head: [0, 2) aaa, [6, 2) bbb, [8, 2) ccc
6066 * 20: [0, 2) aaa, [6, 2) bbb, [8, 2) ccc
6068 * If we modify [6, 2) at head, we shouldn't decrement bbb's refcount because
6069 * 20 has the reference for bbb. Therefore, we only drop the reference if two chunks
6070 * (head: [6, 2) and 20: [6, 2)) are different.
6073 auto c
= _l
->chunk_map
.find(p
.first
);
6074 if (c
!= _l
->chunk_map
.end()) {
6075 if (p
.second
== c
->second
) {
6079 refs
.dec_ref(p
.second
.oid
);
6081 // decrement the reference of the updated chunks if the manifest object has no snapshot
6082 refs
.dec_ref(p
.second
.oid
);
6088 void object_manifest_t::calc_refs_to_drop_on_removal(
6089 const object_manifest_t
* _g
,
6090 const object_manifest_t
* _l
,
6091 object_ref_delta_t
&refs
) const
6093 /* At a high level, the rule is that consecutive clones with the same reference
6094 * at the same offset share a reference. As such, removing *this may result
6095 * in removing references in two cases:
6096 * 1) *this has a reference which it shares with neither _g nor _l
6097 * 2) _g and _l have a reference which they share with each other but not
6100 * For a particular offset, both 1 and 2 can happen.
6102 * Notably, this means that to evaluate the reference change from removing
6103 * the object with *this, we only need to look at the two adjacent clones.
6106 // Paper over possibly missing _g or _l -- nullopt is semantically the same
6107 // as an empty chunk_map
6108 static const object_manifest_t empty
;
6109 const object_manifest_t
&g
= _g
? *_g
: empty
;
6110 const object_manifest_t
&l
= _l
? *_l
: empty
;
6112 auto giter
= g
.chunk_map
.begin();
6113 auto iter
= chunk_map
.begin();
6114 auto liter
= l
.chunk_map
.begin();
6116 // Translate iter, map pair to the current offset, end() -> max
6117 auto get_offset
= [](decltype(iter
) &i
, const object_manifest_t
&manifest
)
6119 return i
== manifest
.chunk_map
.end() ?
6120 std::numeric_limits
<uint64_t>::max() : i
->first
;
6123 /* If current matches the offset at iter, returns the chunk at *iter
6124 * and increments iter. Otherwise, returns nullptr.
6126 * current will always be derived from the min of *giter, *iter, and
6127 * *liter on each cycle, so the result will be that each loop iteration
6128 * will pick up all chunks at the offset being considered, each offset
6129 * will be considered once, and all offsets will be considered.
6131 auto get_chunk
= [](
6132 uint64_t current
, decltype(iter
) &i
, const object_manifest_t
&manifest
)
6133 -> const chunk_info_t
* {
6134 if (i
== manifest
.chunk_map
.end() || current
!= i
->first
) {
6137 return &(i
++)->second
;
6141 while (giter
!= g
.chunk_map
.end() ||
6142 iter
!= chunk_map
.end() ||
6143 liter
!= l
.chunk_map
.end()) {
6144 auto current
= std::min(
6145 std::min(get_offset(giter
, g
), get_offset(iter
, *this)),
6146 get_offset(liter
, l
));
6148 auto gchunk
= get_chunk(current
, giter
, g
);
6149 auto chunk
= get_chunk(current
, iter
, *this);
6150 auto lchunk
= get_chunk(current
, liter
, l
);
6152 if (gchunk
&& lchunk
&& *gchunk
== *lchunk
&&
6153 (!chunk
|| *gchunk
!= *chunk
)) {
6154 // case 2 from above: l and g match, chunk does not
6155 refs
.dec_ref(gchunk
->oid
);
6159 (!gchunk
|| chunk
->oid
!= gchunk
->oid
) &&
6160 (!lchunk
|| chunk
->oid
!= lchunk
->oid
)) {
6161 // case 1 from above: *this matches neither
6162 refs
.dec_ref(chunk
->oid
);
6167 void object_manifest_t::encode(ceph::buffer::list
& bl
) const
6169 ENCODE_START(1, 1, bl
);
6172 case TYPE_NONE
: break;
6174 encode(redirect_target
, bl
);
6177 encode(chunk_map
, bl
);
6185 void object_manifest_t::decode(ceph::buffer::list::const_iterator
& bl
)
6187 DECODE_START(1, bl
);
6190 case TYPE_NONE
: break;
6192 decode(redirect_target
, bl
);
6195 decode(chunk_map
, bl
);
6203 void object_manifest_t::dump(Formatter
*f
) const
6205 f
->dump_unsigned("type", type
);
6206 if (type
== TYPE_REDIRECT
) {
6207 f
->open_object_section("redirect_target");
6208 redirect_target
.dump(f
);
6210 } else if (type
== TYPE_CHUNKED
) {
6211 f
->open_array_section("chunk_map");
6212 for (auto& p
: chunk_map
) {
6213 f
->open_object_section("chunk");
6214 f
->dump_unsigned("offset", p
.first
);
6222 void object_manifest_t::generate_test_instances(list
<object_manifest_t
*>& o
)
6224 o
.push_back(new object_manifest_t());
6225 o
.back()->type
= TYPE_REDIRECT
;
6228 ostream
& operator<<(ostream
& out
, const object_manifest_t
& om
)
6230 out
<< "manifest(" << om
.get_type_name();
6231 if (om
.is_redirect()) {
6232 out
<< " " << om
.redirect_target
;
6233 } else if (om
.is_chunked()) {
6234 out
<< " " << om
.chunk_map
;
6240 // -- object_info_t --
6242 void object_info_t::copy_user_bits(const object_info_t
& other
)
6244 // these bits are copied from head->clone.
6246 mtime
= other
.mtime
;
6247 local_mtime
= other
.local_mtime
;
6248 last_reqid
= other
.last_reqid
;
6249 truncate_seq
= other
.truncate_seq
;
6250 truncate_size
= other
.truncate_size
;
6251 flags
= other
.flags
;
6252 user_version
= other
.user_version
;
6253 data_digest
= other
.data_digest
;
6254 omap_digest
= other
.omap_digest
;
6257 void object_info_t::encode(ceph::buffer::list
& bl
, uint64_t features
) const
6259 object_locator_t
myoloc(soid
);
6260 map
<entity_name_t
, watch_info_t
> old_watchers
;
6261 for (auto i
= watchers
.cbegin(); i
!= watchers
.cend(); ++i
) {
6262 old_watchers
.insert(make_pair(i
->first
.second
, i
->second
));
6264 ENCODE_START(17, 8, bl
);
6266 encode(myoloc
, bl
); //Retained for compatibility
6267 encode((__u32
)0, bl
); // was category, no longer used
6268 encode(version
, bl
);
6269 encode(prior_version
, bl
);
6270 encode(last_reqid
, bl
);
6273 if (soid
.snap
== CEPH_NOSNAP
)
6274 encode(osd_reqid_t(), bl
); // used to be wrlock_by
6276 encode((uint32_t)0, bl
); // was legacy_snaps
6277 encode(truncate_seq
, bl
);
6278 encode(truncate_size
, bl
);
6279 encode(is_lost(), bl
);
6280 encode(old_watchers
, bl
, features
);
6281 /* shenanigans to avoid breaking backwards compatibility in the disk format.
6282 * When we can, switch this out for simply putting the version_t on disk. */
6283 eversion_t
user_eversion(0, user_version
);
6284 encode(user_eversion
, bl
);
6285 encode(test_flag(FLAG_USES_TMAP
), bl
);
6286 encode(watchers
, bl
, features
);
6287 __u32 _flags
= flags
;
6289 encode(local_mtime
, bl
);
6290 encode(data_digest
, bl
);
6291 encode(omap_digest
, bl
);
6292 encode(expected_object_size
, bl
);
6293 encode(expected_write_size
, bl
);
6294 encode(alloc_hint_flags
, bl
);
6295 if (has_manifest()) {
6296 encode(manifest
, bl
);
6301 void object_info_t::decode(ceph::buffer::list::const_iterator
& bl
)
6303 object_locator_t myoloc
;
6304 DECODE_START_LEGACY_COMPAT_LEN(17, 8, 8, bl
);
6305 map
<entity_name_t
, watch_info_t
> old_watchers
;
6310 decode(category
, bl
); // no longer used
6312 decode(version
, bl
);
6313 decode(prior_version
, bl
);
6314 decode(last_reqid
, bl
);
6317 if (soid
.snap
== CEPH_NOSNAP
) {
6318 osd_reqid_t wrlock_by
;
6319 decode(wrlock_by
, bl
);
6321 vector
<snapid_t
> legacy_snaps
;
6322 decode(legacy_snaps
, bl
);
6324 decode(truncate_seq
, bl
);
6325 decode(truncate_size
, bl
);
6327 // if this is struct_v >= 13, we will overwrite this
6328 // below since this field is just here for backwards
6334 decode(old_watchers
, bl
);
6335 eversion_t user_eversion
;
6336 decode(user_eversion
, bl
);
6337 user_version
= user_eversion
.version
;
6339 if (struct_v
>= 9) {
6340 bool uses_tmap
= false;
6341 decode(uses_tmap
, bl
);
6343 set_flag(FLAG_USES_TMAP
);
6345 set_flag(FLAG_USES_TMAP
);
6348 soid
.pool
= myoloc
.pool
;
6349 if (struct_v
>= 11) {
6350 decode(watchers
, bl
);
6352 for (auto i
= old_watchers
.begin(); i
!= old_watchers
.end(); ++i
) {
6355 make_pair(i
->second
.cookie
, i
->first
), i
->second
));
6358 if (struct_v
>= 13) {
6361 flags
= (flag_t
)_flags
;
6363 if (struct_v
>= 14) {
6364 decode(local_mtime
, bl
);
6366 local_mtime
= utime_t();
6368 if (struct_v
>= 15) {
6369 decode(data_digest
, bl
);
6370 decode(omap_digest
, bl
);
6372 data_digest
= omap_digest
= -1;
6373 clear_flag(FLAG_DATA_DIGEST
);
6374 clear_flag(FLAG_OMAP_DIGEST
);
6376 if (struct_v
>= 16) {
6377 decode(expected_object_size
, bl
);
6378 decode(expected_write_size
, bl
);
6379 decode(alloc_hint_flags
, bl
);
6381 expected_object_size
= 0;
6382 expected_write_size
= 0;
6383 alloc_hint_flags
= 0;
6385 if (struct_v
>= 17) {
6386 if (has_manifest()) {
6387 decode(manifest
, bl
);
6393 void object_info_t::dump(Formatter
*f
) const
6395 f
->open_object_section("oid");
6398 f
->dump_stream("version") << version
;
6399 f
->dump_stream("prior_version") << prior_version
;
6400 f
->dump_stream("last_reqid") << last_reqid
;
6401 f
->dump_unsigned("user_version", user_version
);
6402 f
->dump_unsigned("size", size
);
6403 f
->dump_stream("mtime") << mtime
;
6404 f
->dump_stream("local_mtime") << local_mtime
;
6405 f
->dump_unsigned("lost", (int)is_lost());
6406 vector
<string
> sv
= get_flag_vector(flags
);
6407 f
->open_array_section("flags");
6408 for (const auto& str
: sv
) {
6409 f
->dump_string("flags", str
);
6412 f
->dump_unsigned("truncate_seq", truncate_seq
);
6413 f
->dump_unsigned("truncate_size", truncate_size
);
6414 f
->dump_format("data_digest", "0x%08x", data_digest
);
6415 f
->dump_format("omap_digest", "0x%08x", omap_digest
);
6416 f
->dump_unsigned("expected_object_size", expected_object_size
);
6417 f
->dump_unsigned("expected_write_size", expected_write_size
);
6418 f
->dump_unsigned("alloc_hint_flags", alloc_hint_flags
);
6419 f
->dump_object("manifest", manifest
);
6420 f
->open_object_section("watchers");
6421 for (auto p
= watchers
.cbegin(); p
!= watchers
.cend(); ++p
) {
6422 CachedStackStringStream css
;
6423 *css
<< p
->first
.second
;
6424 f
->open_object_section(css
->strv());
6431 void object_info_t::generate_test_instances(list
<object_info_t
*>& o
)
6433 o
.push_back(new object_info_t());
6439 ostream
& operator<<(ostream
& out
, const object_info_t
& oi
)
6441 out
<< oi
.soid
<< "(" << oi
.version
6442 << " " << oi
.last_reqid
;
6444 out
<< " " << oi
.get_flag_string();
6445 out
<< " s " << oi
.size
;
6446 out
<< " uv " << oi
.user_version
;
6447 if (oi
.is_data_digest())
6448 out
<< " dd " << std::hex
<< oi
.data_digest
<< std::dec
;
6449 if (oi
.is_omap_digest())
6450 out
<< " od " << std::hex
<< oi
.omap_digest
<< std::dec
;
6451 out
<< " alloc_hint [" << oi
.expected_object_size
6452 << " " << oi
.expected_write_size
6453 << " " << oi
.alloc_hint_flags
<< "]";
6454 if (oi
.has_manifest())
6455 out
<< " " << oi
.manifest
;
6460 // -- ObjectRecovery --
6461 void ObjectRecoveryProgress::encode(ceph::buffer::list
&bl
) const
6463 ENCODE_START(1, 1, bl
);
6465 encode(data_complete
, bl
);
6466 encode(data_recovered_to
, bl
);
6467 encode(omap_recovered_to
, bl
);
6468 encode(omap_complete
, bl
);
6472 void ObjectRecoveryProgress::decode(ceph::buffer::list::const_iterator
&bl
)
6474 DECODE_START(1, bl
);
6476 decode(data_complete
, bl
);
6477 decode(data_recovered_to
, bl
);
6478 decode(omap_recovered_to
, bl
);
6479 decode(omap_complete
, bl
);
6483 ostream
&operator<<(ostream
&out
, const ObjectRecoveryProgress
&prog
)
6485 return prog
.print(out
);
6488 void ObjectRecoveryProgress::generate_test_instances(
6489 list
<ObjectRecoveryProgress
*>& o
)
6491 o
.push_back(new ObjectRecoveryProgress
);
6492 o
.back()->first
= false;
6493 o
.back()->data_complete
= true;
6494 o
.back()->omap_complete
= true;
6495 o
.back()->data_recovered_to
= 100;
6497 o
.push_back(new ObjectRecoveryProgress
);
6498 o
.back()->first
= true;
6499 o
.back()->data_complete
= false;
6500 o
.back()->omap_complete
= false;
6501 o
.back()->data_recovered_to
= 0;
6504 ostream
&ObjectRecoveryProgress::print(ostream
&out
) const
6506 return out
<< "ObjectRecoveryProgress("
6507 << ( first
? "" : "!" ) << "first, "
6508 << "data_recovered_to:" << data_recovered_to
6509 << ", data_complete:" << ( data_complete
? "true" : "false" )
6510 << ", omap_recovered_to:" << omap_recovered_to
6511 << ", omap_complete:" << ( omap_complete
? "true" : "false" )
6512 << ", error:" << ( error
? "true" : "false" )
6516 void ObjectRecoveryProgress::dump(Formatter
*f
) const
6518 f
->dump_int("first?", first
);
6519 f
->dump_int("data_complete?", data_complete
);
6520 f
->dump_unsigned("data_recovered_to", data_recovered_to
);
6521 f
->dump_int("omap_complete?", omap_complete
);
6522 f
->dump_string("omap_recovered_to", omap_recovered_to
);
6525 void ObjectRecoveryInfo::encode(ceph::buffer::list
&bl
, uint64_t features
) const
6527 ENCODE_START(3, 1, bl
);
6529 encode(version
, bl
);
6531 encode(oi
, bl
, features
);
6533 encode(copy_subset
, bl
);
6534 encode(clone_subset
, bl
);
6535 encode(object_exist
, bl
);
6539 void ObjectRecoveryInfo::decode(ceph::buffer::list::const_iterator
&bl
,
6542 DECODE_START(3, bl
);
6544 decode(version
, bl
);
6548 decode(copy_subset
, bl
);
6549 decode(clone_subset
, bl
);
6551 decode(object_exist
, bl
);
6553 object_exist
= false;
6556 if (!soid
.is_max() && soid
.pool
== -1)
6558 map
<hobject_t
, interval_set
<uint64_t>> tmp
;
6559 tmp
.swap(clone_subset
);
6560 for (auto i
= tmp
.begin(); i
!= tmp
.end(); ++i
) {
6561 hobject_t
first(i
->first
);
6562 if (!first
.is_max() && first
.pool
== -1)
6564 clone_subset
[first
].swap(i
->second
);
6569 void ObjectRecoveryInfo::generate_test_instances(
6570 list
<ObjectRecoveryInfo
*>& o
)
6572 o
.push_back(new ObjectRecoveryInfo
);
6573 o
.back()->soid
= hobject_t(sobject_t("key", CEPH_NOSNAP
));
6574 o
.back()->version
= eversion_t(0,0);
6575 o
.back()->size
= 100;
6576 o
.back()->object_exist
= false;
6580 void ObjectRecoveryInfo::dump(Formatter
*f
) const
6582 f
->dump_stream("object") << soid
;
6583 f
->dump_stream("at_version") << version
;
6584 f
->dump_stream("size") << size
;
6586 f
->open_object_section("object_info");
6591 f
->open_object_section("snapset");
6595 f
->dump_stream("copy_subset") << copy_subset
;
6596 f
->dump_stream("clone_subset") << clone_subset
;
6597 f
->dump_stream("object_exist") << object_exist
;
6600 ostream
& operator<<(ostream
& out
, const ObjectRecoveryInfo
&inf
)
6602 return inf
.print(out
);
6605 ostream
&ObjectRecoveryInfo::print(ostream
&out
) const
6607 return out
<< "ObjectRecoveryInfo("
6608 << soid
<< "@" << version
6609 << ", size: " << size
6610 << ", copy_subset: " << copy_subset
6611 << ", clone_subset: " << clone_subset
6612 << ", snapset: " << ss
6613 << ", object_exist: " << object_exist
6617 // -- PushReplyOp --
6618 void PushReplyOp::generate_test_instances(list
<PushReplyOp
*> &o
)
6620 o
.push_back(new PushReplyOp
);
6621 o
.push_back(new PushReplyOp
);
6622 o
.back()->soid
= hobject_t(sobject_t("asdf", 2));
6623 o
.push_back(new PushReplyOp
);
6624 o
.back()->soid
= hobject_t(sobject_t("asdf", CEPH_NOSNAP
));
6627 void PushReplyOp::encode(ceph::buffer::list
&bl
) const
6629 ENCODE_START(1, 1, bl
);
6634 void PushReplyOp::decode(ceph::buffer::list::const_iterator
&bl
)
6636 DECODE_START(1, bl
);
6641 void PushReplyOp::dump(Formatter
*f
) const
6643 f
->dump_stream("soid") << soid
;
6646 ostream
&PushReplyOp::print(ostream
&out
) const
6649 << "PushReplyOp(" << soid
6653 ostream
& operator<<(ostream
& out
, const PushReplyOp
&op
)
6655 return op
.print(out
);
6658 uint64_t PushReplyOp::cost(CephContext
*cct
) const
6661 return cct
->_conf
->osd_push_per_object_cost
+
6662 cct
->_conf
->osd_recovery_max_chunk
;
6666 void PullOp::generate_test_instances(list
<PullOp
*> &o
)
6668 o
.push_back(new PullOp
);
6669 o
.push_back(new PullOp
);
6670 o
.back()->soid
= hobject_t(sobject_t("asdf", 2));
6671 o
.back()->recovery_info
.version
= eversion_t(3, 10);
6672 o
.push_back(new PullOp
);
6673 o
.back()->soid
= hobject_t(sobject_t("asdf", CEPH_NOSNAP
));
6674 o
.back()->recovery_info
.version
= eversion_t(0, 0);
6677 void PullOp::encode(ceph::buffer::list
&bl
, uint64_t features
) const
6679 ENCODE_START(1, 1, bl
);
6681 encode(recovery_info
, bl
, features
);
6682 encode(recovery_progress
, bl
);
6686 void PullOp::decode(ceph::buffer::list::const_iterator
&bl
)
6688 DECODE_START(1, bl
);
6690 decode(recovery_info
, bl
);
6691 decode(recovery_progress
, bl
);
6695 void PullOp::dump(Formatter
*f
) const
6697 f
->dump_stream("soid") << soid
;
6699 f
->open_object_section("recovery_info");
6700 recovery_info
.dump(f
);
6704 f
->open_object_section("recovery_progress");
6705 recovery_progress
.dump(f
);
6710 ostream
&PullOp::print(ostream
&out
) const
6713 << "PullOp(" << soid
6714 << ", recovery_info: " << recovery_info
6715 << ", recovery_progress: " << recovery_progress
6719 ostream
& operator<<(ostream
& out
, const PullOp
&op
)
6721 return op
.print(out
);
6724 uint64_t PullOp::cost(CephContext
*cct
) const
6726 return cct
->_conf
->osd_push_per_object_cost
+
6727 cct
->_conf
->osd_recovery_max_chunk
;
6731 void PushOp::generate_test_instances(list
<PushOp
*> &o
)
6733 o
.push_back(new PushOp
);
6734 o
.push_back(new PushOp
);
6735 o
.back()->soid
= hobject_t(sobject_t("asdf", 2));
6736 o
.back()->version
= eversion_t(3, 10);
6737 o
.push_back(new PushOp
);
6738 o
.back()->soid
= hobject_t(sobject_t("asdf", CEPH_NOSNAP
));
6739 o
.back()->version
= eversion_t(0, 0);
6742 void PushOp::encode(ceph::buffer::list
&bl
, uint64_t features
) const
6744 ENCODE_START(1, 1, bl
);
6746 encode(version
, bl
);
6748 encode(data_included
, bl
);
6749 encode(omap_header
, bl
);
6750 encode(omap_entries
, bl
);
6751 encode(attrset
, bl
);
6752 encode(recovery_info
, bl
, features
);
6753 encode(after_progress
, bl
);
6754 encode(before_progress
, bl
);
6758 void PushOp::decode(ceph::buffer::list::const_iterator
&bl
)
6760 DECODE_START(1, bl
);
6762 decode(version
, bl
);
6764 decode(data_included
, bl
);
6765 decode(omap_header
, bl
);
6766 decode(omap_entries
, bl
);
6767 decode(attrset
, bl
);
6768 decode(recovery_info
, bl
);
6769 decode(after_progress
, bl
);
6770 decode(before_progress
, bl
);
6774 void PushOp::dump(Formatter
*f
) const
6776 f
->dump_stream("soid") << soid
;
6777 f
->dump_stream("version") << version
;
6778 f
->dump_int("data_len", data
.length());
6779 f
->dump_stream("data_included") << data_included
;
6780 f
->dump_int("omap_header_len", omap_header
.length());
6781 f
->dump_int("omap_entries_len", omap_entries
.size());
6782 f
->dump_int("attrset_len", attrset
.size());
6784 f
->open_object_section("recovery_info");
6785 recovery_info
.dump(f
);
6789 f
->open_object_section("after_progress");
6790 after_progress
.dump(f
);
6794 f
->open_object_section("before_progress");
6795 before_progress
.dump(f
);
6800 ostream
&PushOp::print(ostream
&out
) const
6803 << "PushOp(" << soid
6804 << ", version: " << version
6805 << ", data_included: " << data_included
6806 << ", data_size: " << data
.length()
6807 << ", omap_header_size: " << omap_header
.length()
6808 << ", omap_entries_size: " << omap_entries
.size()
6809 << ", attrset_size: " << attrset
.size()
6810 << ", recovery_info: " << recovery_info
6811 << ", after_progress: " << after_progress
6812 << ", before_progress: " << before_progress
6816 ostream
& operator<<(ostream
& out
, const PushOp
&op
)
6818 return op
.print(out
);
6821 uint64_t PushOp::cost(CephContext
*cct
) const
6823 uint64_t cost
= data_included
.size();
6824 for (auto i
= omap_entries
.cbegin(); i
!= omap_entries
.cend(); ++i
) {
6825 cost
+= i
->second
.length();
6827 cost
+= cct
->_conf
->osd_push_per_object_cost
;
6833 void ScrubMap::merge_incr(const ScrubMap
&l
)
6835 ceph_assert(valid_through
== l
.incr_since
);
6836 valid_through
= l
.valid_through
;
6838 for (auto p
= l
.objects
.cbegin(); p
!= l
.objects
.cend(); ++p
){
6839 if (p
->second
.negative
) {
6840 auto q
= objects
.find(p
->first
);
6841 if (q
!= objects
.end()) {
6845 objects
[p
->first
] = p
->second
;
6850 void ScrubMap::encode(ceph::buffer::list
& bl
) const
6852 ENCODE_START(3, 2, bl
);
6853 encode(objects
, bl
);
6854 encode((__u32
)0, bl
); // used to be attrs; now deprecated
6855 ceph::buffer::list old_logbl
; // not used
6856 encode(old_logbl
, bl
);
6857 encode(valid_through
, bl
);
6858 encode(incr_since
, bl
);
6862 void ScrubMap::decode(ceph::buffer::list::const_iterator
& bl
, int64_t pool
)
6864 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl
);
6865 decode(objects
, bl
);
6867 map
<string
,string
> attrs
; // deprecated
6870 ceph::buffer::list old_logbl
; // not used
6871 decode(old_logbl
, bl
);
6872 decode(valid_through
, bl
);
6873 decode(incr_since
, bl
);
6876 // handle hobject_t upgrade
6878 map
<hobject_t
, object
> tmp
;
6880 for (auto i
= tmp
.begin(); i
!= tmp
.end(); ++i
) {
6881 hobject_t
first(i
->first
);
6882 if (!first
.is_max() && first
.pool
== -1)
6884 objects
[first
] = i
->second
;
6889 void ScrubMap::dump(Formatter
*f
) const
6891 f
->dump_stream("valid_through") << valid_through
;
6892 f
->dump_stream("incremental_since") << incr_since
;
6893 f
->open_array_section("objects");
6894 for (auto p
= objects
.cbegin(); p
!= objects
.cend(); ++p
) {
6895 f
->open_object_section("object");
6896 f
->dump_string("name", p
->first
.oid
.name
);
6897 f
->dump_unsigned("hash", p
->first
.get_hash());
6898 f
->dump_string("key", p
->first
.get_key());
6899 f
->dump_int("snapid", p
->first
.snap
);
6906 void ScrubMap::generate_test_instances(list
<ScrubMap
*>& o
)
6908 o
.push_back(new ScrubMap
);
6909 o
.push_back(new ScrubMap
);
6910 o
.back()->valid_through
= eversion_t(1, 2);
6911 o
.back()->incr_since
= eversion_t(3, 4);
6913 object::generate_test_instances(obj
);
6914 o
.back()->objects
[hobject_t(object_t("foo"), "fookey", 123, 456, 0, "")] = *obj
.back();
6916 o
.back()->objects
[hobject_t(object_t("bar"), string(), 123, 456, 0, "")] = *obj
.back();
6919 // -- ScrubMap::object --
6921 void ScrubMap::object::encode(ceph::buffer::list
& bl
) const
6923 bool compat_read_error
= read_error
|| ec_hash_mismatch
|| ec_size_mismatch
;
6924 ENCODE_START(10, 7, bl
);
6926 encode(negative
, bl
);
6929 encode(digest_present
, bl
);
6930 encode((uint32_t)0, bl
); // obsolete nlinks
6931 encode((uint32_t)0, bl
); // snapcolls
6932 encode(omap_digest
, bl
);
6933 encode(omap_digest_present
, bl
);
6934 encode(compat_read_error
, bl
);
6935 encode(stat_error
, bl
);
6936 encode(read_error
, bl
);
6937 encode(ec_hash_mismatch
, bl
);
6938 encode(ec_size_mismatch
, bl
);
6939 encode(large_omap_object_found
, bl
);
6940 encode(large_omap_object_key_count
, bl
);
6941 encode(large_omap_object_value_size
, bl
);
6942 encode(object_omap_bytes
, bl
);
6943 encode(object_omap_keys
, bl
);
6947 void ScrubMap::object::decode(ceph::buffer::list::const_iterator
& bl
)
6949 DECODE_START(10, bl
);
6951 bool tmp
, compat_read_error
= false;
6957 digest_present
= tmp
;
6961 set
<snapid_t
> snapcolls
;
6962 decode(snapcolls
, bl
);
6964 decode(omap_digest
, bl
);
6966 omap_digest_present
= tmp
;
6967 decode(compat_read_error
, bl
);
6970 if (struct_v
>= 8) {
6974 ec_hash_mismatch
= tmp
;
6976 ec_size_mismatch
= tmp
;
6978 // If older encoder found a read_error, set read_error
6979 if (compat_read_error
&& !read_error
&& !ec_hash_mismatch
&& !ec_size_mismatch
)
6981 if (struct_v
>= 9) {
6983 large_omap_object_found
= tmp
;
6984 decode(large_omap_object_key_count
, bl
);
6985 decode(large_omap_object_value_size
, bl
);
6987 if (struct_v
>= 10) {
6988 decode(object_omap_bytes
, bl
);
6989 decode(object_omap_keys
, bl
);
6994 void ScrubMap::object::dump(Formatter
*f
) const
6996 f
->dump_int("size", size
);
6997 f
->dump_int("negative", negative
);
6998 f
->open_array_section("attrs");
6999 for (auto p
= attrs
.cbegin(); p
!= attrs
.cend(); ++p
) {
7000 f
->open_object_section("attr");
7001 f
->dump_string("name", p
->first
);
7002 f
->dump_int("length", p
->second
.length());
7008 void ScrubMap::object::generate_test_instances(list
<object
*>& o
)
7010 o
.push_back(new object
);
7011 o
.push_back(new object
);
7012 o
.back()->negative
= true;
7013 o
.push_back(new object
);
7014 o
.back()->size
= 123;
7015 o
.back()->attrs
["foo"] = ceph::buffer::copy("foo", 3);
7016 o
.back()->attrs
["bar"] = ceph::buffer::copy("barval", 6);
7021 ostream
& operator<<(ostream
& out
, const OSDOp
& op
)
7023 out
<< ceph_osd_op_name(op
.op
.op
);
7024 if (ceph_osd_op_type_data(op
.op
.op
)) {
7027 case CEPH_OSD_OP_ASSERT_VER
:
7028 out
<< " v" << op
.op
.assert_ver
.ver
;
7030 case CEPH_OSD_OP_TRUNCATE
:
7031 out
<< " " << op
.op
.extent
.offset
;
7033 case CEPH_OSD_OP_MASKTRUNC
:
7034 case CEPH_OSD_OP_TRIMTRUNC
:
7035 out
<< " " << op
.op
.extent
.truncate_seq
<< "@"
7036 << (int64_t)op
.op
.extent
.truncate_size
;
7038 case CEPH_OSD_OP_ROLLBACK
:
7039 out
<< " " << snapid_t(op
.op
.snap
.snapid
);
7041 case CEPH_OSD_OP_WATCH
:
7042 out
<< " " << ceph_osd_watch_op_name(op
.op
.watch
.op
)
7043 << " cookie " << op
.op
.watch
.cookie
;
7044 if (op
.op
.watch
.gen
)
7045 out
<< " gen " << op
.op
.watch
.gen
;
7047 case CEPH_OSD_OP_NOTIFY
:
7048 out
<< " cookie " << op
.op
.notify
.cookie
;
7050 case CEPH_OSD_OP_COPY_GET
:
7051 out
<< " max " << op
.op
.copy_get
.max
;
7053 case CEPH_OSD_OP_COPY_FROM
:
7054 out
<< " ver " << op
.op
.copy_from
.src_version
;
7056 case CEPH_OSD_OP_SETALLOCHINT
:
7057 out
<< " object_size " << op
.op
.alloc_hint
.expected_object_size
7058 << " write_size " << op
.op
.alloc_hint
.expected_write_size
;
7060 case CEPH_OSD_OP_READ
:
7061 case CEPH_OSD_OP_SPARSE_READ
:
7062 case CEPH_OSD_OP_SYNC_READ
:
7063 case CEPH_OSD_OP_WRITE
:
7064 case CEPH_OSD_OP_WRITEFULL
:
7065 case CEPH_OSD_OP_ZERO
:
7066 case CEPH_OSD_OP_APPEND
:
7067 case CEPH_OSD_OP_MAPEXT
:
7068 case CEPH_OSD_OP_CMPEXT
:
7069 out
<< " " << op
.op
.extent
.offset
<< "~" << op
.op
.extent
.length
;
7070 if (op
.op
.extent
.truncate_seq
)
7071 out
<< " [" << op
.op
.extent
.truncate_seq
<< "@"
7072 << (int64_t)op
.op
.extent
.truncate_size
<< "]";
7074 out
<< " [" << ceph_osd_op_flag_string(op
.op
.flags
) << "]";
7076 // don't show any arg info
7079 } else if (ceph_osd_op_type_attr(op
.op
.op
)) {
7081 if (op
.op
.xattr
.name_len
&& op
.indata
.length()) {
7083 op
.indata
.write(0, op
.op
.xattr
.name_len
, out
);
7085 if (op
.op
.xattr
.value_len
)
7086 out
<< " (" << op
.op
.xattr
.value_len
<< ")";
7087 if (op
.op
.op
== CEPH_OSD_OP_CMPXATTR
)
7088 out
<< " op " << (int)op
.op
.xattr
.cmp_op
7089 << " mode " << (int)op
.op
.xattr
.cmp_mode
;
7090 } else if (ceph_osd_op_type_exec(op
.op
.op
)) {
7092 if (op
.op
.cls
.class_len
&& op
.indata
.length()) {
7094 op
.indata
.write(0, op
.op
.cls
.class_len
, out
);
7096 op
.indata
.write(op
.op
.cls
.class_len
, op
.op
.cls
.method_len
, out
);
7098 } else if (ceph_osd_op_type_pg(op
.op
.op
)) {
7100 case CEPH_OSD_OP_PGLS
:
7101 case CEPH_OSD_OP_PGLS_FILTER
:
7102 case CEPH_OSD_OP_PGNLS
:
7103 case CEPH_OSD_OP_PGNLS_FILTER
:
7104 out
<< " start_epoch " << op
.op
.pgls
.start_epoch
;
7106 case CEPH_OSD_OP_PG_HITSET_LS
:
7108 case CEPH_OSD_OP_PG_HITSET_GET
:
7109 out
<< " " << utime_t(op
.op
.hit_set_get
.stamp
);
7111 case CEPH_OSD_OP_SCRUBLS
:
7115 if (op
.indata
.length()) {
7116 out
<< " in=" << op
.indata
.length() << "b";
7118 if (op
.outdata
.length()) {
7119 out
<< " out=" << op
.outdata
.length() << "b";
7125 void OSDOp::split_osd_op_vector_out_data(vector
<OSDOp
>& ops
, ceph::buffer::list
& in
)
7127 auto datap
= in
.begin();
7128 for (unsigned i
= 0; i
< ops
.size(); i
++) {
7129 if (ops
[i
].op
.payload_len
) {
7130 datap
.copy(ops
[i
].op
.payload_len
, ops
[i
].outdata
);
7135 void OSDOp::merge_osd_op_vector_out_data(vector
<OSDOp
>& ops
, ceph::buffer::list
& out
)
7137 for (unsigned i
= 0; i
< ops
.size(); i
++) {
7138 ops
[i
].op
.payload_len
= ops
[i
].outdata
.length();
7139 if (ops
[i
].outdata
.length()) {
7140 out
.append(ops
[i
].outdata
);
7145 int prepare_info_keymap(
7147 map
<string
,bufferlist
> *km
,
7148 string
*key_to_remove
,
7151 pg_info_t
&last_written_info
,
7152 PastIntervals
&past_intervals
,
7153 bool dirty_big_info
,
7156 PerfCounters
*logger
,
7157 DoutPrefixProvider
*dpp
)
7160 encode(epoch
, (*km
)[string(epoch_key
)]);
7164 logger
->inc(l_osd_pg_info
);
7166 // try to do info efficiently?
7167 if (!dirty_big_info
&& try_fast_info
&&
7168 info
.last_update
> last_written_info
.last_update
) {
7169 pg_fast_info_t fast
;
7170 fast
.populate_from(info
);
7171 bool did
= fast
.try_apply_to(&last_written_info
);
7172 ceph_assert(did
); // we verified last_update increased above
7173 if (info
== last_written_info
) {
7174 encode(fast
, (*km
)[string(fastinfo_key
)]);
7176 logger
->inc(l_osd_pg_fastinfo
);
7180 ldpp_dout(dpp
, 30) << __func__
<< " fastinfo failed, info:\n";
7182 JSONFormatter
jf(true);
7183 jf
.dump_object("info", info
);
7187 *_dout
<< "\nlast_written_info:\n";
7188 JSONFormatter
jf(true);
7189 jf
.dump_object("last_written_info", last_written_info
);
7194 } else if (info
.last_update
<= last_written_info
.last_update
) {
7195 // clean up any potentially stale fastinfo key resulting from last_update
7196 // not moving forwards (e.g., a backwards jump during peering)
7197 *key_to_remove
= fastinfo_key
;
7200 last_written_info
= info
;
7202 // info. store purged_snaps separately.
7203 interval_set
<snapid_t
> purged_snaps
;
7204 purged_snaps
.swap(info
.purged_snaps
);
7205 encode(info
, (*km
)[string(info_key
)]);
7206 purged_snaps
.swap(info
.purged_snaps
);
7208 if (dirty_big_info
) {
7209 // potentially big stuff
7210 bufferlist
& bigbl
= (*km
)[string(biginfo_key
)];
7211 encode(past_intervals
, bigbl
);
7212 encode(info
.purged_snaps
, bigbl
);
7213 //dout(20) << "write_info bigbl " << bigbl.length() << dendl;
7215 logger
->inc(l_osd_pg_biginfo
);
7221 void create_pg_collection(
7222 ceph::os::Transaction
& t
, spg_t pgid
, int bits
)
7225 t
.create_collection(coll
, bits
);
7228 void init_pg_ondisk(
7229 ceph::os::Transaction
& t
,
7231 const pg_pool_t
*pool
)
7235 // Give a hint to the PG collection
7237 uint32_t pg_num
= pool
->get_pg_num();
7238 uint64_t expected_num_objects_pg
= pool
->expected_num_objects
/ pg_num
;
7239 encode(pg_num
, hint
);
7240 encode(expected_num_objects_pg
, hint
);
7241 uint32_t hint_type
= ceph::os::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS
;
7242 t
.collection_hint(coll
, hint_type
, hint
);
7245 ghobject_t
pgmeta_oid(pgid
.make_pgmeta_oid());
7246 t
.touch(coll
, pgmeta_oid
);
7247 map
<string
,bufferlist
> values
;
7248 __u8 struct_v
= pg_latest_struct_v
;
7249 encode(struct_v
, values
[string(infover_key
)]);
7250 t
.omap_setkeys(coll
, pgmeta_oid
, values
);
7253 PGLSFilter::PGLSFilter() : cct(nullptr)
7257 PGLSFilter::~PGLSFilter()
7261 int PGLSPlainFilter::init(ceph::bufferlist::const_iterator
¶ms
)
7264 decode(xattr
, params
);
7265 decode(val
, params
);
7266 } catch (ceph::buffer::error
&e
) {
7272 bool PGLSPlainFilter::filter(const hobject_t
& obj
,
7273 const ceph::bufferlist
& xattr_data
) const
7275 return xattr_data
.contents_equal(val
.c_str(), val
.size());