1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2011 New Dream Network
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
9 * Author: Loic Dachary <loic@dachary.org>
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
28 #include <boost/assign/list_of.hpp>
30 #include "include/ceph_features.h"
31 #include "include/encoding.h"
32 #include "include/stringify.h"
34 #include "crush/hash.h"
37 #include "common/Formatter.h"
38 #include "common/StackStringStream.h"
39 #include "include/utime_fmt.h"
41 #include "osd_types.h"
42 #include "osd_types_fmt.h"
43 #include "os/Transaction.h"
51 using std::shared_ptr
;
53 using std::unique_ptr
;
56 using ceph::bufferlist
;
58 using ceph::decode_nohead
;
60 using ceph::encode_nohead
;
61 using ceph::Formatter
;
62 using ceph::make_timespan
;
63 using ceph::JSONFormatter
;
65 using namespace std::literals
;
67 const char *ceph_osd_flag_name(unsigned flag
)
70 case CEPH_OSD_FLAG_ACK
: return "ack";
71 case CEPH_OSD_FLAG_ONNVRAM
: return "onnvram";
72 case CEPH_OSD_FLAG_ONDISK
: return "ondisk";
73 case CEPH_OSD_FLAG_RETRY
: return "retry";
74 case CEPH_OSD_FLAG_READ
: return "read";
75 case CEPH_OSD_FLAG_WRITE
: return "write";
76 case CEPH_OSD_FLAG_ORDERSNAP
: return "ordersnap";
77 case CEPH_OSD_FLAG_PEERSTAT_OLD
: return "peerstat_old";
78 case CEPH_OSD_FLAG_BALANCE_READS
: return "balance_reads";
79 case CEPH_OSD_FLAG_PARALLELEXEC
: return "parallelexec";
80 case CEPH_OSD_FLAG_PGOP
: return "pgop";
81 case CEPH_OSD_FLAG_EXEC
: return "exec";
82 case CEPH_OSD_FLAG_EXEC_PUBLIC
: return "exec_public";
83 case CEPH_OSD_FLAG_LOCALIZE_READS
: return "localize_reads";
84 case CEPH_OSD_FLAG_RWORDERED
: return "rwordered";
85 case CEPH_OSD_FLAG_IGNORE_CACHE
: return "ignore_cache";
86 case CEPH_OSD_FLAG_SKIPRWLOCKS
: return "skiprwlocks";
87 case CEPH_OSD_FLAG_IGNORE_OVERLAY
: return "ignore_overlay";
88 case CEPH_OSD_FLAG_FLUSH
: return "flush";
89 case CEPH_OSD_FLAG_MAP_SNAP_CLONE
: return "map_snap_clone";
90 case CEPH_OSD_FLAG_ENFORCE_SNAPC
: return "enforce_snapc";
91 case CEPH_OSD_FLAG_REDIRECTED
: return "redirected";
92 case CEPH_OSD_FLAG_KNOWN_REDIR
: return "known_if_redirected";
93 case CEPH_OSD_FLAG_FULL_TRY
: return "full_try";
94 case CEPH_OSD_FLAG_FULL_FORCE
: return "full_force";
95 case CEPH_OSD_FLAG_IGNORE_REDIRECT
: return "ignore_redirect";
96 case CEPH_OSD_FLAG_RETURNVEC
: return "returnvec";
97 case CEPH_OSD_FLAG_SUPPORTSPOOLEIO
: return "supports_pool_eio";
98 default: return "???";
102 string
ceph_osd_flag_string(unsigned flags
)
105 for (unsigned i
=0; i
<32; ++i
) {
106 if (flags
& (1u<<i
)) {
109 s
+= ceph_osd_flag_name(1u << i
);
117 const char * ceph_osd_op_flag_name(unsigned flag
)
122 case CEPH_OSD_OP_FLAG_EXCL
:
125 case CEPH_OSD_OP_FLAG_FAILOK
:
128 case CEPH_OSD_OP_FLAG_FADVISE_RANDOM
:
129 name
= "fadvise_random";
131 case CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL
:
132 name
= "fadvise_sequential";
134 case CEPH_OSD_OP_FLAG_FADVISE_WILLNEED
:
135 name
= "favise_willneed";
137 case CEPH_OSD_OP_FLAG_FADVISE_DONTNEED
:
138 name
= "fadvise_dontneed";
140 case CEPH_OSD_OP_FLAG_FADVISE_NOCACHE
:
141 name
= "fadvise_nocache";
143 case CEPH_OSD_OP_FLAG_WITH_REFERENCE
:
144 name
= "with_reference";
146 case CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE
:
147 name
= "bypass_clean_cache";
156 string
ceph_osd_op_flag_string(unsigned flags
)
159 for (unsigned i
=0; i
<32; ++i
) {
160 if (flags
& (1u<<i
)) {
163 s
+= ceph_osd_op_flag_name(1u << i
);
171 string
ceph_osd_alloc_hint_flag_string(unsigned flags
)
174 for (unsigned i
=0; i
<32; ++i
) {
175 if (flags
& (1u<<i
)) {
178 s
+= ceph_osd_alloc_hint_flag_name(1u << i
);
186 void pg_shard_t::encode(ceph::buffer::list
&bl
) const
188 ENCODE_START(1, 1, bl
);
193 void pg_shard_t::decode(ceph::buffer::list::const_iterator
&bl
)
201 ostream
&operator<<(ostream
&lhs
, const pg_shard_t
&rhs
)
203 if (rhs
.is_undefined())
205 if (rhs
.shard
== shard_id_t::NO_SHARD
)
206 return lhs
<< rhs
.get_osd();
207 return lhs
<< rhs
.get_osd() << '(' << (unsigned)(rhs
.shard
) << ')';
210 void dump(Formatter
* f
, const osd_alerts_t
& alerts
)
212 for (auto& a
: alerts
) {
213 string s0
= " osd: ";
214 s0
+= stringify(a
.first
);
216 for (auto& aa
: a
.second
) {
222 f
->dump_string("alert", s
);
228 void osd_reqid_t::dump(Formatter
*f
) const
230 f
->dump_stream("name") << name
;
231 f
->dump_int("inc", inc
);
232 f
->dump_unsigned("tid", tid
);
235 void osd_reqid_t::generate_test_instances(list
<osd_reqid_t
*>& o
)
237 o
.push_back(new osd_reqid_t
);
238 o
.push_back(new osd_reqid_t(entity_name_t::CLIENT(123), 1, 45678));
241 // -- object_locator_t --
243 void object_locator_t::encode(ceph::buffer::list
& bl
) const
245 // verify that nobody's corrupted the locator
246 ceph_assert(hash
== -1 || key
.empty());
247 __u8 encode_compat
= 3;
248 ENCODE_START(6, encode_compat
, bl
);
250 int32_t preferred
= -1; // tell old code there is no preferred osd (-1).
251 encode(preferred
, bl
);
256 encode_compat
= std::max
<std::uint8_t>(encode_compat
, 6); // need to interpret the hash
257 ENCODE_FINISH_NEW_COMPAT(bl
, encode_compat
);
260 void object_locator_t::decode(ceph::buffer::list::const_iterator
& p
)
262 DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, p
);
272 decode(preferred
, p
);
282 // verify that nobody's corrupted the locator
283 ceph_assert(hash
== -1 || key
.empty());
286 void object_locator_t::dump(Formatter
*f
) const
288 f
->dump_int("pool", pool
);
289 f
->dump_string("key", key
);
290 f
->dump_string("namespace", nspace
);
291 f
->dump_int("hash", hash
);
294 void object_locator_t::generate_test_instances(list
<object_locator_t
*>& o
)
296 o
.push_back(new object_locator_t
);
297 o
.push_back(new object_locator_t(123));
298 o
.push_back(new object_locator_t(123, 876));
299 o
.push_back(new object_locator_t(1, "n2"));
300 o
.push_back(new object_locator_t(1234, "", "key"));
301 o
.push_back(new object_locator_t(12, "n1", "key2"));
304 // -- request_redirect_t --
305 void request_redirect_t::encode(ceph::buffer::list
& bl
) const
307 ENCODE_START(1, 1, bl
);
308 encode(redirect_locator
, bl
);
309 encode(redirect_object
, bl
);
310 // legacy of the removed osd_instructions member
311 encode((uint32_t)0, bl
);
315 void request_redirect_t::decode(ceph::buffer::list::const_iterator
& bl
)
318 uint32_t legacy_osd_instructions_len
;
319 decode(redirect_locator
, bl
);
320 decode(redirect_object
, bl
);
321 decode(legacy_osd_instructions_len
, bl
);
322 if (legacy_osd_instructions_len
) {
323 bl
+= legacy_osd_instructions_len
;
328 void request_redirect_t::dump(Formatter
*f
) const
330 f
->dump_string("object", redirect_object
);
331 f
->open_object_section("locator");
332 redirect_locator
.dump(f
);
333 f
->close_section(); // locator
336 void request_redirect_t::generate_test_instances(list
<request_redirect_t
*>& o
)
338 object_locator_t
loc(1, "redir_obj");
339 o
.push_back(new request_redirect_t());
340 o
.push_back(new request_redirect_t(loc
, 0));
341 o
.push_back(new request_redirect_t(loc
, "redir_obj"));
342 o
.push_back(new request_redirect_t(loc
));
345 void objectstore_perf_stat_t::dump(Formatter
*f
) const
347 // *_ms values just for compatibility.
348 f
->dump_float("commit_latency_ms", os_commit_latency_ns
/ 1000000.0);
349 f
->dump_float("apply_latency_ms", os_apply_latency_ns
/ 1000000.0);
350 f
->dump_unsigned("commit_latency_ns", os_commit_latency_ns
);
351 f
->dump_unsigned("apply_latency_ns", os_apply_latency_ns
);
354 void objectstore_perf_stat_t::encode(ceph::buffer::list
&bl
, uint64_t features
) const
356 uint8_t target_v
= 2;
357 if (!HAVE_FEATURE(features
, OS_PERF_STAT_NS
)) {
360 ENCODE_START(target_v
, target_v
, bl
);
362 encode(os_commit_latency_ns
, bl
);
363 encode(os_apply_latency_ns
, bl
);
365 constexpr auto NS_PER_MS
= std::chrono::nanoseconds(1ms
).count();
366 uint32_t commit_latency_ms
= os_commit_latency_ns
/ NS_PER_MS
;
367 uint32_t apply_latency_ms
= os_apply_latency_ns
/ NS_PER_MS
;
368 encode(commit_latency_ms
, bl
); // for compatibility with older monitor.
369 encode(apply_latency_ms
, bl
); // for compatibility with older monitor.
374 void objectstore_perf_stat_t::decode(ceph::buffer::list::const_iterator
&bl
)
378 decode(os_commit_latency_ns
, bl
);
379 decode(os_apply_latency_ns
, bl
);
381 uint32_t commit_latency_ms
;
382 uint32_t apply_latency_ms
;
383 decode(commit_latency_ms
, bl
);
384 decode(apply_latency_ms
, bl
);
385 constexpr auto NS_PER_MS
= std::chrono::nanoseconds(1ms
).count();
386 os_commit_latency_ns
= commit_latency_ms
* NS_PER_MS
;
387 os_apply_latency_ns
= apply_latency_ms
* NS_PER_MS
;
392 void objectstore_perf_stat_t::generate_test_instances(std::list
<objectstore_perf_stat_t
*>& o
)
394 o
.push_back(new objectstore_perf_stat_t());
395 o
.push_back(new objectstore_perf_stat_t());
396 o
.back()->os_commit_latency_ns
= 20000000;
397 o
.back()->os_apply_latency_ns
= 30000000;
401 void osd_stat_t::dump(Formatter
*f
, bool with_net
) const
403 f
->dump_unsigned("up_from", up_from
);
404 f
->dump_unsigned("seq", seq
);
405 f
->dump_unsigned("num_pgs", num_pgs
);
406 f
->dump_unsigned("num_osds", num_osds
);
407 f
->dump_unsigned("num_per_pool_osds", num_per_pool_osds
);
408 f
->dump_unsigned("num_per_pool_omap_osds", num_per_pool_omap_osds
);
410 /// dump legacy stats fields to ensure backward compatibility.
411 f
->dump_unsigned("kb", statfs
.kb());
412 f
->dump_unsigned("kb_used", statfs
.kb_used_raw());
413 f
->dump_unsigned("kb_used_data", statfs
.kb_used_data());
414 f
->dump_unsigned("kb_used_omap", statfs
.kb_used_omap());
415 f
->dump_unsigned("kb_used_meta", statfs
.kb_used_internal_metadata());
416 f
->dump_unsigned("kb_avail", statfs
.kb_avail());
419 f
->open_object_section("statfs");
422 f
->open_array_section("hb_peers");
423 for (auto p
: hb_peers
)
424 f
->dump_int("osd", p
);
426 f
->dump_int("snap_trim_queue_len", snap_trim_queue_len
);
427 f
->dump_int("num_snap_trimming", num_snap_trimming
);
428 f
->dump_int("num_shards_repaired", num_shards_repaired
);
429 f
->open_object_section("op_queue_age_hist");
430 op_queue_age_hist
.dump(f
);
432 f
->open_object_section("perf_stat");
433 os_perf_stat
.dump(f
);
435 f
->open_array_section("alerts");
436 ::dump(f
, os_alerts
);
443 void osd_stat_t::dump_ping_time(Formatter
*f
) const
445 f
->open_array_section("network_ping_times");
446 for (auto &i
: hb_pingtime
) {
447 f
->open_object_section("entry");
448 f
->dump_int("osd", i
.first
);
449 const time_t lu(i
.second
.last_update
);
451 string
lustr(ctime_r(&lu
, buffer
));
452 lustr
.pop_back(); // Remove trailing \n
453 f
->dump_string("last update", lustr
);
454 f
->open_array_section("interfaces");
455 f
->open_object_section("interface");
456 f
->dump_string("interface", "back");
457 f
->open_object_section("average");
458 f
->dump_float("1min", i
.second
.back_pingtime
[0]/1000.0);
459 f
->dump_float("5min", i
.second
.back_pingtime
[1]/1000.0);
460 f
->dump_float("15min", i
.second
.back_pingtime
[2]/1000.0);
461 f
->close_section(); // average
462 f
->open_object_section("min");
463 f
->dump_float("1min", i
.second
.back_min
[0]/1000.0);
464 f
->dump_float("5min", i
.second
.back_min
[1]/1000.0);
465 f
->dump_float("15min", i
.second
.back_min
[2]/1000.0);
466 f
->close_section(); // min
467 f
->open_object_section("max");
468 f
->dump_float("1min", i
.second
.back_max
[0]/1000.0);
469 f
->dump_float("5min", i
.second
.back_max
[1]/1000.0);
470 f
->dump_float("15min", i
.second
.back_max
[2]/1000.0);
471 f
->close_section(); // max
472 f
->dump_float("last", i
.second
.back_last
/1000.0);
473 f
->close_section(); // interface
475 if (i
.second
.front_pingtime
[0] != 0) {
476 f
->open_object_section("interface");
477 f
->dump_string("interface", "front");
478 f
->open_object_section("average");
479 f
->dump_float("1min", i
.second
.front_pingtime
[0]/1000.0);
480 f
->dump_float("5min", i
.second
.front_pingtime
[1]/1000.0);
481 f
->dump_float("15min", i
.second
.front_pingtime
[2]/1000.0);
482 f
->close_section(); // average
483 f
->open_object_section("min");
484 f
->dump_float("1min", i
.second
.front_min
[0]/1000.0);
485 f
->dump_float("5min", i
.second
.front_min
[1]/1000.0);
486 f
->dump_float("15min", i
.second
.front_min
[2]/1000.0);
487 f
->close_section(); // min
488 f
->open_object_section("max");
489 f
->dump_float("1min", i
.second
.front_max
[0]/1000.0);
490 f
->dump_float("5min", i
.second
.front_max
[1]/1000.0);
491 f
->dump_float("15min", i
.second
.front_max
[2]/1000.0);
492 f
->close_section(); // max
493 f
->dump_float("last", i
.second
.front_last
/1000.0);
494 f
->close_section(); // interface
496 f
->close_section(); // interfaces
497 f
->close_section(); // entry
499 f
->close_section(); // network_ping_time
502 void osd_stat_t::encode(ceph::buffer::list
&bl
, uint64_t features
) const
504 ENCODE_START(14, 2, bl
);
506 //////// for compatibility ////////
507 int64_t kb
= statfs
.kb();
508 int64_t kb_used
= statfs
.kb_used_raw();
509 int64_t kb_avail
= statfs
.kb_avail();
512 encode(kb_avail
, bl
);
513 ///////////////////////////////////
515 encode(snap_trim_queue_len
, bl
);
516 encode(num_snap_trimming
, bl
);
517 encode(hb_peers
, bl
);
518 encode((uint32_t)0, bl
);
519 encode(op_queue_age_hist
, bl
);
520 encode(os_perf_stat
, bl
, features
);
525 //////// for compatibility ////////
526 int64_t kb_used_data
= statfs
.kb_used_data();
527 int64_t kb_used_omap
= statfs
.kb_used_omap();
528 int64_t kb_used_meta
= statfs
.kb_used_internal_metadata();
529 encode(kb_used_data
, bl
);
530 encode(kb_used_omap
, bl
);
531 encode(kb_used_meta
, bl
);
533 ///////////////////////////////////
534 encode(os_alerts
, bl
);
535 encode(num_shards_repaired
, bl
);
536 encode(num_osds
, bl
);
537 encode(num_per_pool_osds
, bl
);
538 encode(num_per_pool_omap_osds
, bl
);
541 encode((int)hb_pingtime
.size(), bl
);
542 for (auto i
: hb_pingtime
) {
543 encode(i
.first
, bl
); // osd
544 encode(i
.second
.last_update
, bl
);
545 encode(i
.second
.back_pingtime
[0], bl
);
546 encode(i
.second
.back_pingtime
[1], bl
);
547 encode(i
.second
.back_pingtime
[2], bl
);
548 encode(i
.second
.back_min
[0], bl
);
549 encode(i
.second
.back_min
[1], bl
);
550 encode(i
.second
.back_min
[2], bl
);
551 encode(i
.second
.back_max
[0], bl
);
552 encode(i
.second
.back_max
[1], bl
);
553 encode(i
.second
.back_max
[2], bl
);
554 encode(i
.second
.back_last
, bl
);
555 encode(i
.second
.front_pingtime
[0], bl
);
556 encode(i
.second
.front_pingtime
[1], bl
);
557 encode(i
.second
.front_pingtime
[2], bl
);
558 encode(i
.second
.front_min
[0], bl
);
559 encode(i
.second
.front_min
[1], bl
);
560 encode(i
.second
.front_min
[2], bl
);
561 encode(i
.second
.front_max
[0], bl
);
562 encode(i
.second
.front_max
[1], bl
);
563 encode(i
.second
.front_max
[2], bl
);
564 encode(i
.second
.front_last
, bl
);
569 void osd_stat_t::decode(ceph::buffer::list::const_iterator
&bl
)
571 int64_t kb
, kb_used
,kb_avail
;
572 int64_t kb_used_data
, kb_used_omap
, kb_used_meta
;
573 DECODE_START_LEGACY_COMPAT_LEN(14, 2, 2, bl
);
576 decode(kb_avail
, bl
);
577 decode(snap_trim_queue_len
, bl
);
578 decode(num_snap_trimming
, bl
);
579 decode(hb_peers
, bl
);
580 vector
<int> num_hb_out
;
581 decode(num_hb_out
, bl
);
583 decode(op_queue_age_hist
, bl
);
585 decode(os_perf_stat
, bl
);
594 decode(kb_used_data
, bl
);
595 decode(kb_used_omap
, bl
);
596 decode(kb_used_meta
, bl
);
598 kb_used_data
= kb_used
;
606 statfs
.total
= kb
<< 10;
607 statfs
.available
= kb_avail
<< 10;
608 // actually it's totally unexpected to have ststfs.total < statfs.available
609 // here but unfortunately legacy generate_test_instances produced such a
610 // case hence inserting some handling rather than assert
611 statfs
.internally_reserved
=
612 statfs
.total
> statfs
.available
? statfs
.total
- statfs
.available
: 0;
614 if ((int64_t)statfs
.internally_reserved
> kb_used
) {
615 statfs
.internally_reserved
-= kb_used
;
617 statfs
.internally_reserved
= 0;
619 statfs
.allocated
= kb_used_data
<< 10;
620 statfs
.omap_allocated
= kb_used_omap
<< 10;
621 statfs
.internal_metadata
= kb_used_meta
<< 10;
623 if (struct_v
>= 10) {
624 decode(os_alerts
, bl
);
628 if (struct_v
>= 11) {
629 decode(num_shards_repaired
, bl
);
631 num_shards_repaired
= 0;
633 if (struct_v
>= 12) {
634 decode(num_osds
, bl
);
635 decode(num_per_pool_osds
, bl
);
638 num_per_pool_osds
= 0;
640 if (struct_v
>= 13) {
641 decode(num_per_pool_omap_osds
, bl
);
643 num_per_pool_omap_osds
= 0;
646 if (struct_v
>= 14) {
649 for (int i
= 0 ; i
< count
; i
++) {
652 struct Interfaces ifs
;
653 decode(ifs
.last_update
, bl
);
654 decode(ifs
.back_pingtime
[0],bl
);
655 decode(ifs
.back_pingtime
[1], bl
);
656 decode(ifs
.back_pingtime
[2], bl
);
657 decode(ifs
.back_min
[0],bl
);
658 decode(ifs
.back_min
[1], bl
);
659 decode(ifs
.back_min
[2], bl
);
660 decode(ifs
.back_max
[0],bl
);
661 decode(ifs
.back_max
[1], bl
);
662 decode(ifs
.back_max
[2], bl
);
663 decode(ifs
.back_last
, bl
);
664 decode(ifs
.front_pingtime
[0], bl
);
665 decode(ifs
.front_pingtime
[1], bl
);
666 decode(ifs
.front_pingtime
[2], bl
);
667 decode(ifs
.front_min
[0], bl
);
668 decode(ifs
.front_min
[1], bl
);
669 decode(ifs
.front_min
[2], bl
);
670 decode(ifs
.front_max
[0], bl
);
671 decode(ifs
.front_max
[1], bl
);
672 decode(ifs
.front_max
[2], bl
);
673 decode(ifs
.front_last
, bl
);
674 hb_pingtime
[osd
] = ifs
;
680 void osd_stat_t::generate_test_instances(std::list
<osd_stat_t
*>& o
)
682 o
.push_back(new osd_stat_t
);
684 o
.push_back(new osd_stat_t
);
685 list
<store_statfs_t
*> ll
;
686 store_statfs_t::generate_test_instances(ll
);
687 o
.back()->statfs
= *ll
.back();
688 o
.back()->hb_peers
.push_back(7);
689 o
.back()->snap_trim_queue_len
= 8;
690 o
.back()->num_snap_trimming
= 99;
691 o
.back()->num_shards_repaired
= 101;
692 o
.back()->os_alerts
[0].emplace(
693 "some alert", "some alert details");
694 o
.back()->os_alerts
[1].emplace(
695 "some alert2", "some alert2 details");
696 struct Interfaces gen_interfaces
= {
697 123456789, { 1000, 900, 800 }, { 990, 890, 790 }, { 1010, 910, 810 }, 1001,
698 { 1100, 1000, 900 }, { 1090, 990, 890 }, { 1110, 1010, 910 }, 1101 };
699 o
.back()->hb_pingtime
[20] = gen_interfaces
;
701 987654321, { 100, 200, 300 }, { 90, 190, 290 }, { 110, 210, 310 }, 101 };
702 o
.back()->hb_pingtime
[30] = gen_interfaces
;
707 int pg_t::print(char *o
, int maxlen
) const
709 return snprintf(o
, maxlen
, "%llu.%x", (unsigned long long)pool(), ps());
712 bool pg_t::parse(const char *s
)
716 int r
= sscanf(s
, "%llu.%x", (long long unsigned *)&ppool
, &pseed
);
724 bool spg_t::parse(const char *s
)
726 shard
= shard_id_t::NO_SHARD
;
730 int r
= sscanf(s
, "%llu.%x", (long long unsigned *)&ppool
, &pseed
);
733 pgid
.set_pool(ppool
);
736 const char *p
= strchr(s
, 's');
738 r
= sscanf(p
, "s%u", &pshard
);
740 shard
= shard_id_t(pshard
);
748 char *spg_t::calc_name(char *buf
, const char *suffix_backwords
) const
750 while (*suffix_backwords
)
751 *--buf
= *suffix_backwords
++;
753 if (!is_no_shard()) {
754 buf
= ritoa
<uint8_t, 10>((uint8_t)shard
.id
, buf
);
758 return pgid
.calc_name(buf
, "");
761 ostream
& operator<<(ostream
& out
, const spg_t
&pg
)
763 char buf
[spg_t::calc_name_buf_size
];
764 buf
[spg_t::calc_name_buf_size
- 1] = '\0';
765 out
<< pg
.calc_name(buf
+ spg_t::calc_name_buf_size
- 1, "");
769 pg_t
pg_t::get_ancestor(unsigned old_pg_num
) const
771 int old_bits
= cbits(old_pg_num
);
772 int old_mask
= (1 << old_bits
) - 1;
774 ret
.m_seed
= ceph_stable_mod(m_seed
, old_pg_num
, old_mask
);
778 bool pg_t::is_split(unsigned old_pg_num
, unsigned new_pg_num
, set
<pg_t
> *children
) const
780 //ceph_assert(m_seed < old_pg_num);
781 if (m_seed
>= old_pg_num
) {
785 if (new_pg_num
<= old_pg_num
)
790 unsigned old_bits
= cbits(old_pg_num
);
791 unsigned old_mask
= (1 << old_bits
) - 1;
792 for (unsigned n
= 1; ; n
++) {
793 unsigned next_bit
= (n
<< (old_bits
-1));
794 unsigned s
= next_bit
| m_seed
;
796 if (s
< old_pg_num
|| s
== m_seed
)
800 if ((unsigned)ceph_stable_mod(s
, old_pg_num
, old_mask
) == m_seed
) {
803 children
->insert(pg_t(s
, m_pool
));
809 int old_bits
= cbits(old_pg_num
);
810 int old_mask
= (1 << old_bits
) - 1;
811 for (unsigned x
= old_pg_num
; x
< new_pg_num
; ++x
) {
812 unsigned o
= ceph_stable_mod(x
, old_pg_num
, old_mask
);
815 children
->insert(pg_t(x
, m_pool
));
822 unsigned pg_t::get_split_bits(unsigned pg_num
) const {
825 ceph_assert(pg_num
> 1);
827 // Find unique p such that pg_num \in [2^(p-1), 2^p)
828 unsigned p
= cbits(pg_num
);
829 ceph_assert(p
); // silence coverity #751330
831 if ((m_seed
% (1<<(p
-1))) < (pg_num
% (1<<(p
-1))))
837 bool pg_t::is_merge_source(
842 if (m_seed
< old_pg_num
&&
843 m_seed
>= new_pg_num
) {
846 while (t
.m_seed
>= new_pg_num
) {
856 pg_t
pg_t::get_parent() const
858 unsigned bits
= cbits(m_seed
);
861 retval
.m_seed
&= ~((~0)<<(bits
- 1));
865 hobject_t
pg_t::get_hobj_start() const
867 return hobject_t(object_t(), string(), 0, m_seed
, m_pool
,
871 hobject_t
pg_t::get_hobj_end(unsigned pg_num
) const
873 // note: this assumes a bitwise sort; with the legacy nibblewise
874 // sort a PG did not always cover a single contiguous range of the
875 // (bit-reversed) hash range.
876 unsigned bits
= get_split_bits(pg_num
);
877 uint64_t rev_start
= hobject_t::_reverse_bits(m_seed
);
878 uint64_t rev_end
= (rev_start
| (0xffffffff >> bits
)) + 1;
879 if (rev_end
>= 0x100000000) {
880 ceph_assert(rev_end
== 0x100000000);
881 return hobject_t::get_max();
883 return hobject_t(object_t(), string(), CEPH_NOSNAP
,
884 hobject_t::_reverse_bits(rev_end
), m_pool
,
889 void pg_t::dump(Formatter
*f
) const
891 f
->dump_unsigned("pool", m_pool
);
892 f
->dump_unsigned("seed", m_seed
);
895 void pg_t::generate_test_instances(list
<pg_t
*>& o
)
897 o
.push_back(new pg_t
);
898 o
.push_back(new pg_t(1, 2));
899 o
.push_back(new pg_t(13123, 3));
900 o
.push_back(new pg_t(131223, 4));
903 char *pg_t::calc_name(char *buf
, const char *suffix_backwords
) const
905 while (*suffix_backwords
)
906 *--buf
= *suffix_backwords
++;
908 buf
= ritoa
<uint32_t, 16>(m_seed
, buf
);
912 return ritoa
<uint64_t, 10>(m_pool
, buf
);
915 ostream
& operator<<(ostream
& out
, const pg_t
&pg
)
917 char buf
[pg_t::calc_name_buf_size
];
918 buf
[pg_t::calc_name_buf_size
- 1] = '\0';
919 out
<< pg
.calc_name(buf
+ pg_t::calc_name_buf_size
- 1, "");
926 void coll_t::calc_str()
930 strcpy(_str_buff
, "meta");
934 _str_buff
[spg_t::calc_name_buf_size
- 1] = '\0';
935 _str
= pgid
.calc_name(_str_buff
+ spg_t::calc_name_buf_size
- 1, "daeh_");
938 _str_buff
[spg_t::calc_name_buf_size
- 1] = '\0';
939 _str
= pgid
.calc_name(_str_buff
+ spg_t::calc_name_buf_size
- 1, "PMET_");
942 ceph_abort_msg("unknown collection type");
946 bool coll_t::parse(const std::string
& s
)
953 ceph_assert(s
== _str
);
956 if (s
.find("_head") == s
.length() - 5 &&
957 pgid
.parse(s
.substr(0, s
.length() - 5))) {
961 ceph_assert(s
== _str
);
964 if (s
.find("_TEMP") == s
.length() - 5 &&
965 pgid
.parse(s
.substr(0, s
.length() - 5))) {
969 ceph_assert(s
== _str
);
975 void coll_t::encode(ceph::buffer::list
& bl
) const
978 // when changing this, remember to update encoded_size() too.
980 // can't express this as v2...
982 encode(struct_v
, bl
);
983 encode(to_str(), bl
);
986 encode(struct_v
, bl
);
987 encode((__u8
)type
, bl
);
989 snapid_t snap
= CEPH_NOSNAP
;
994 size_t coll_t::encoded_size() const
996 size_t r
= sizeof(__u8
);
1008 // - encoding header
1009 r
+= sizeof(ceph_le32
) + 2 * sizeof(__u8
);
1011 r
+= sizeof(__u8
) + sizeof(uint64_t) + 2 * sizeof(uint32_t);
1013 r
+= sizeof(int8_t);
1015 r
+= sizeof(uint64_t);
1021 void coll_t::decode(ceph::buffer::list::const_iterator
& bl
)
1025 decode(struct_v
, bl
);
1034 if (pgid
== spg_t() && snap
== 0) {
1050 type
= (type_t
)_type
;
1059 bool ok
= parse(str
);
1061 throw std::domain_error(std::string("unable to parse pg ") + str
);
1067 CachedStackStringStream css
;
1068 *css
<< "coll_t::decode(): don't know how to decode version "
1070 throw std::domain_error(css
->str());
1075 void coll_t::dump(Formatter
*f
) const
1077 f
->dump_unsigned("type_id", (unsigned)type
);
1078 if (type
!= TYPE_META
)
1079 f
->dump_stream("pgid") << pgid
;
1080 f
->dump_string("name", to_str());
1083 void coll_t::generate_test_instances(list
<coll_t
*>& o
)
1085 o
.push_back(new coll_t());
1086 o
.push_back(new coll_t(spg_t(pg_t(1, 0), shard_id_t::NO_SHARD
)));
1087 o
.push_back(new coll_t(o
.back()->get_temp()));
1088 o
.push_back(new coll_t(spg_t(pg_t(3, 2), shard_id_t(12))));
1089 o
.push_back(new coll_t(o
.back()->get_temp()));
1090 o
.push_back(new coll_t());
1095 std::string
pg_vector_string(const vector
<int32_t> &a
)
1097 CachedStackStringStream css
;
1099 for (auto i
= a
.cbegin(); i
!= a
.cend(); ++i
) {
1102 if (*i
!= CRUSH_ITEM_NONE
)
1111 std::string
pg_state_string(uint64_t state
)
1113 CachedStackStringStream css
;
1114 if (state
& PG_STATE_STALE
)
1116 if (state
& PG_STATE_CREATING
)
1117 *css
<< "creating+";
1118 if (state
& PG_STATE_ACTIVE
)
1120 if (state
& PG_STATE_ACTIVATING
)
1121 *css
<< "activating+";
1122 if (state
& PG_STATE_CLEAN
)
1124 if (state
& PG_STATE_RECOVERY_WAIT
)
1125 *css
<< "recovery_wait+";
1126 if (state
& PG_STATE_RECOVERY_TOOFULL
)
1127 *css
<< "recovery_toofull+";
1128 if (state
& PG_STATE_RECOVERING
)
1129 *css
<< "recovering+";
1130 if (state
& PG_STATE_FORCED_RECOVERY
)
1131 *css
<< "forced_recovery+";
1132 if (state
& PG_STATE_DOWN
)
1134 if (state
& PG_STATE_RECOVERY_UNFOUND
)
1135 *css
<< "recovery_unfound+";
1136 if (state
& PG_STATE_BACKFILL_UNFOUND
)
1137 *css
<< "backfill_unfound+";
1138 if (state
& PG_STATE_UNDERSIZED
)
1139 *css
<< "undersized+";
1140 if (state
& PG_STATE_DEGRADED
)
1141 *css
<< "degraded+";
1142 if (state
& PG_STATE_REMAPPED
)
1143 *css
<< "remapped+";
1144 if (state
& PG_STATE_PREMERGE
)
1145 *css
<< "premerge+";
1146 if (state
& PG_STATE_SCRUBBING
)
1147 *css
<< "scrubbing+";
1148 if (state
& PG_STATE_DEEP_SCRUB
)
1150 if (state
& PG_STATE_INCONSISTENT
)
1151 *css
<< "inconsistent+";
1152 if (state
& PG_STATE_PEERING
)
1154 if (state
& PG_STATE_REPAIR
)
1156 if (state
& PG_STATE_BACKFILL_WAIT
)
1157 *css
<< "backfill_wait+";
1158 if (state
& PG_STATE_BACKFILLING
)
1159 *css
<< "backfilling+";
1160 if (state
& PG_STATE_FORCED_BACKFILL
)
1161 *css
<< "forced_backfill+";
1162 if (state
& PG_STATE_BACKFILL_TOOFULL
)
1163 *css
<< "backfill_toofull+";
1164 if (state
& PG_STATE_INCOMPLETE
)
1165 *css
<< "incomplete+";
1166 if (state
& PG_STATE_PEERED
)
1168 if (state
& PG_STATE_SNAPTRIM
)
1169 *css
<< "snaptrim+";
1170 if (state
& PG_STATE_SNAPTRIM_WAIT
)
1171 *css
<< "snaptrim_wait+";
1172 if (state
& PG_STATE_SNAPTRIM_ERROR
)
1173 *css
<< "snaptrim_error+";
1174 if (state
& PG_STATE_FAILED_REPAIR
)
1175 *css
<< "failed_repair+";
1176 if (state
& PG_STATE_LAGGY
)
1178 if (state
& PG_STATE_WAIT
)
1180 auto ret
= css
->str();
1181 if (ret
.length() > 0)
1182 ret
.resize(ret
.length() - 1);
1188 std::optional
<uint64_t> pg_string_state(const std::string
& state
)
1190 std::optional
<uint64_t> type
;
1191 if (state
== "active")
1192 type
= PG_STATE_ACTIVE
;
1193 else if (state
== "clean")
1194 type
= PG_STATE_CLEAN
;
1195 else if (state
== "down")
1196 type
= PG_STATE_DOWN
;
1197 else if (state
== "recovery_unfound")
1198 type
= PG_STATE_RECOVERY_UNFOUND
;
1199 else if (state
== "backfill_unfound")
1200 type
= PG_STATE_BACKFILL_UNFOUND
;
1201 else if (state
== "premerge")
1202 type
= PG_STATE_PREMERGE
;
1203 else if (state
== "scrubbing")
1204 type
= PG_STATE_SCRUBBING
;
1205 else if (state
== "degraded")
1206 type
= PG_STATE_DEGRADED
;
1207 else if (state
== "inconsistent")
1208 type
= PG_STATE_INCONSISTENT
;
1209 else if (state
== "peering")
1210 type
= PG_STATE_PEERING
;
1211 else if (state
== "repair")
1212 type
= PG_STATE_REPAIR
;
1213 else if (state
== "recovering")
1214 type
= PG_STATE_RECOVERING
;
1215 else if (state
== "forced_recovery")
1216 type
= PG_STATE_FORCED_RECOVERY
;
1217 else if (state
== "backfill_wait")
1218 type
= PG_STATE_BACKFILL_WAIT
;
1219 else if (state
== "incomplete")
1220 type
= PG_STATE_INCOMPLETE
;
1221 else if (state
== "stale")
1222 type
= PG_STATE_STALE
;
1223 else if (state
== "remapped")
1224 type
= PG_STATE_REMAPPED
;
1225 else if (state
== "deep")
1226 type
= PG_STATE_DEEP_SCRUB
;
1227 else if (state
== "backfilling")
1228 type
= PG_STATE_BACKFILLING
;
1229 else if (state
== "forced_backfill")
1230 type
= PG_STATE_FORCED_BACKFILL
;
1231 else if (state
== "backfill_toofull")
1232 type
= PG_STATE_BACKFILL_TOOFULL
;
1233 else if (state
== "recovery_wait")
1234 type
= PG_STATE_RECOVERY_WAIT
;
1235 else if (state
== "recovery_toofull")
1236 type
= PG_STATE_RECOVERY_TOOFULL
;
1237 else if (state
== "undersized")
1238 type
= PG_STATE_UNDERSIZED
;
1239 else if (state
== "activating")
1240 type
= PG_STATE_ACTIVATING
;
1241 else if (state
== "peered")
1242 type
= PG_STATE_PEERED
;
1243 else if (state
== "snaptrim")
1244 type
= PG_STATE_SNAPTRIM
;
1245 else if (state
== "snaptrim_wait")
1246 type
= PG_STATE_SNAPTRIM_WAIT
;
1247 else if (state
== "snaptrim_error")
1248 type
= PG_STATE_SNAPTRIM_ERROR
;
1249 else if (state
== "creating")
1250 type
= PG_STATE_CREATING
;
1251 else if (state
== "failed_repair")
1252 type
= PG_STATE_FAILED_REPAIR
;
1253 else if (state
== "laggy")
1254 type
= PG_STATE_LAGGY
;
1255 else if (state
== "wait")
1256 type
= PG_STATE_WAIT
;
1257 else if (state
== "unknown")
1260 type
= std::nullopt
;
1265 string
eversion_t::get_key_name() const
1267 std::string
key(32, ' ');
1268 get_key_name(&key
[0]);
1269 key
.resize(31); // remove the null terminator
1273 // -- pool_snap_info_t --
1274 void pool_snap_info_t::dump(Formatter
*f
) const
1276 f
->dump_unsigned("snapid", snapid
);
1277 f
->dump_stream("stamp") << stamp
;
1278 f
->dump_string("name", name
);
1281 void pool_snap_info_t::encode(ceph::buffer::list
& bl
, uint64_t features
) const
1284 if ((features
& CEPH_FEATURE_PGPOOL3
) == 0) {
1286 encode(struct_v
, bl
);
1292 ENCODE_START(2, 2, bl
);
1299 void pool_snap_info_t::decode(ceph::buffer::list::const_iterator
& bl
)
1301 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl
);
1308 void pool_snap_info_t::generate_test_instances(list
<pool_snap_info_t
*>& o
)
1310 o
.push_back(new pool_snap_info_t
);
1311 o
.push_back(new pool_snap_info_t
);
1312 o
.back()->snapid
= 1;
1313 o
.back()->stamp
= utime_t(1, 2);
1314 o
.back()->name
= "foo";
1317 // -- pool_opts_t --
1319 // The order of items in the list is important, therefore,
1320 // you should always add to the end of the list when adding new options.
1322 typedef std::map
<std::string
, pool_opts_t::opt_desc_t
> opt_mapping_t
;
1323 static opt_mapping_t opt_mapping
= boost::assign::map_list_of
1324 ("scrub_min_interval", pool_opts_t::opt_desc_t(
1325 pool_opts_t::SCRUB_MIN_INTERVAL
, pool_opts_t::DOUBLE
))
1326 ("scrub_max_interval", pool_opts_t::opt_desc_t(
1327 pool_opts_t::SCRUB_MAX_INTERVAL
, pool_opts_t::DOUBLE
))
1328 ("deep_scrub_interval", pool_opts_t::opt_desc_t(
1329 pool_opts_t::DEEP_SCRUB_INTERVAL
, pool_opts_t::DOUBLE
))
1330 ("recovery_priority", pool_opts_t::opt_desc_t(
1331 pool_opts_t::RECOVERY_PRIORITY
, pool_opts_t::INT
))
1332 ("recovery_op_priority", pool_opts_t::opt_desc_t(
1333 pool_opts_t::RECOVERY_OP_PRIORITY
, pool_opts_t::INT
))
1334 ("scrub_priority", pool_opts_t::opt_desc_t(
1335 pool_opts_t::SCRUB_PRIORITY
, pool_opts_t::INT
))
1336 ("compression_mode", pool_opts_t::opt_desc_t(
1337 pool_opts_t::COMPRESSION_MODE
, pool_opts_t::STR
))
1338 ("compression_algorithm", pool_opts_t::opt_desc_t(
1339 pool_opts_t::COMPRESSION_ALGORITHM
, pool_opts_t::STR
))
1340 ("compression_required_ratio", pool_opts_t::opt_desc_t(
1341 pool_opts_t::COMPRESSION_REQUIRED_RATIO
, pool_opts_t::DOUBLE
))
1342 ("compression_max_blob_size", pool_opts_t::opt_desc_t(
1343 pool_opts_t::COMPRESSION_MAX_BLOB_SIZE
, pool_opts_t::INT
))
1344 ("compression_min_blob_size", pool_opts_t::opt_desc_t(
1345 pool_opts_t::COMPRESSION_MIN_BLOB_SIZE
, pool_opts_t::INT
))
1346 ("csum_type", pool_opts_t::opt_desc_t(
1347 pool_opts_t::CSUM_TYPE
, pool_opts_t::INT
))
1348 ("csum_max_block", pool_opts_t::opt_desc_t(
1349 pool_opts_t::CSUM_MAX_BLOCK
, pool_opts_t::INT
))
1350 ("csum_min_block", pool_opts_t::opt_desc_t(
1351 pool_opts_t::CSUM_MIN_BLOCK
, pool_opts_t::INT
))
1352 ("fingerprint_algorithm", pool_opts_t::opt_desc_t(
1353 pool_opts_t::FINGERPRINT_ALGORITHM
, pool_opts_t::STR
))
1354 ("pg_num_min", pool_opts_t::opt_desc_t(
1355 pool_opts_t::PG_NUM_MIN
, pool_opts_t::INT
))
1356 ("target_size_bytes", pool_opts_t::opt_desc_t(
1357 pool_opts_t::TARGET_SIZE_BYTES
, pool_opts_t::INT
))
1358 ("target_size_ratio", pool_opts_t::opt_desc_t(
1359 pool_opts_t::TARGET_SIZE_RATIO
, pool_opts_t::DOUBLE
))
1360 ("pg_autoscale_bias", pool_opts_t::opt_desc_t(
1361 pool_opts_t::PG_AUTOSCALE_BIAS
, pool_opts_t::DOUBLE
))
1362 ("read_lease_interval", pool_opts_t::opt_desc_t(
1363 pool_opts_t::READ_LEASE_INTERVAL
, pool_opts_t::DOUBLE
))
1364 ("dedup_tier", pool_opts_t::opt_desc_t(
1365 pool_opts_t::DEDUP_TIER
, pool_opts_t::INT
))
1366 ("dedup_chunk_algorithm", pool_opts_t::opt_desc_t(
1367 pool_opts_t::DEDUP_CHUNK_ALGORITHM
, pool_opts_t::STR
))
1368 ("dedup_cdc_chunk_size", pool_opts_t::opt_desc_t(
1369 pool_opts_t::DEDUP_CDC_CHUNK_SIZE
, pool_opts_t::INT
))
1370 ("pg_num_max", pool_opts_t::opt_desc_t(
1371 pool_opts_t::PG_NUM_MAX
, pool_opts_t::INT
));
1373 bool pool_opts_t::is_opt_name(const std::string
& name
)
1375 return opt_mapping
.count(name
);
1378 pool_opts_t::opt_desc_t
pool_opts_t::get_opt_desc(const std::string
& name
)
1380 auto i
= opt_mapping
.find(name
);
1381 ceph_assert(i
!= opt_mapping
.end());
1385 bool pool_opts_t::is_set(pool_opts_t::key_t key
) const
1387 return opts
.count(key
);
1390 const pool_opts_t::value_t
& pool_opts_t::get(pool_opts_t::key_t key
) const
1392 auto i
= opts
.find(key
);
1393 ceph_assert(i
!= opts
.end());
1397 bool pool_opts_t::unset(pool_opts_t::key_t key
) {
1398 return opts
.erase(key
) > 0;
1401 class pool_opts_dumper_t
: public boost::static_visitor
<> {
1403 pool_opts_dumper_t(const std::string
& name_
, Formatter
* f_
) :
1404 name(name_
.c_str()), f(f_
) {}
1406 void operator()(std::string s
) const {
1407 f
->dump_string(name
, s
);
1409 void operator()(int64_t i
) const {
1410 f
->dump_int(name
, i
);
1412 void operator()(double d
) const {
1413 f
->dump_float(name
, d
);
1421 void pool_opts_t::dump(const std::string
& name
, Formatter
* f
) const
1423 const opt_desc_t
& desc
= get_opt_desc(name
);
1424 auto i
= opts
.find(desc
.key
);
1425 if (i
== opts
.end()) {
1428 boost::apply_visitor(pool_opts_dumper_t(name
, f
), i
->second
);
1431 void pool_opts_t::dump(Formatter
* f
) const
1433 for (auto i
= opt_mapping
.cbegin(); i
!= opt_mapping
.cend(); ++i
) {
1434 const std::string
& name
= i
->first
;
1435 const opt_desc_t
& desc
= i
->second
;
1436 auto j
= opts
.find(desc
.key
);
1437 if (j
== opts
.end()) {
1440 boost::apply_visitor(pool_opts_dumper_t(name
, f
), j
->second
);
1444 class pool_opts_encoder_t
: public boost::static_visitor
<> {
1446 explicit pool_opts_encoder_t(ceph::buffer::list
& bl_
, uint64_t features
)
1448 features(features
) {}
1450 void operator()(const std::string
&s
) const {
1451 encode(static_cast<int32_t>(pool_opts_t::STR
), bl
);
1454 void operator()(int64_t i
) const {
1455 encode(static_cast<int32_t>(pool_opts_t::INT
), bl
);
1456 if (HAVE_FEATURE(features
, SERVER_NAUTILUS
)) {
1459 encode(static_cast<int32_t>(i
), bl
);
1462 void operator()(double d
) const {
1463 encode(static_cast<int32_t>(pool_opts_t::DOUBLE
), bl
);
1468 ceph::buffer::list
& bl
;
1472 void pool_opts_t::encode(ceph::buffer::list
& bl
, uint64_t features
) const
1475 if (!HAVE_FEATURE(features
, SERVER_NAUTILUS
)) {
1478 ENCODE_START(v
, 1, bl
);
1479 uint32_t n
= static_cast<uint32_t>(opts
.size());
1481 for (auto i
= opts
.cbegin(); i
!= opts
.cend(); ++i
) {
1482 encode(static_cast<int32_t>(i
->first
), bl
);
1483 boost::apply_visitor(pool_opts_encoder_t(bl
, features
), i
->second
);
1488 void pool_opts_t::decode(ceph::buffer::list::const_iterator
& bl
)
1490 DECODE_START(1, bl
);
1501 opts
[static_cast<key_t
>(k
)] = s
;
1502 } else if (t
== INT
) {
1504 if (struct_v
>= 2) {
1511 opts
[static_cast<key_t
>(k
)] = i
;
1512 } else if (t
== DOUBLE
) {
1515 opts
[static_cast<key_t
>(k
)] = d
;
1517 ceph_assert(!"invalid type");
1523 ostream
& operator<<(ostream
& out
, const pool_opts_t
& opts
)
1525 for (auto i
= opt_mapping
.begin(); i
!= opt_mapping
.end(); ++i
) {
1526 const std::string
& name
= i
->first
;
1527 const pool_opts_t::opt_desc_t
& desc
= i
->second
;
1528 auto j
= opts
.opts
.find(desc
.key
);
1529 if (j
== opts
.opts
.end()) {
1532 out
<< " " << name
<< " " << j
->second
;
1539 const char *pg_pool_t::APPLICATION_NAME_CEPHFS("cephfs");
1540 const char *pg_pool_t::APPLICATION_NAME_RBD("rbd");
1541 const char *pg_pool_t::APPLICATION_NAME_RGW("rgw");
1543 void pg_pool_t::dump(Formatter
*f
) const
1545 f
->dump_stream("create_time") << get_create_time();
1546 f
->dump_unsigned("flags", get_flags());
1547 f
->dump_string("flags_names", get_flags_string());
1548 f
->dump_int("type", get_type());
1549 f
->dump_int("size", get_size());
1550 f
->dump_int("min_size", get_min_size());
1551 f
->dump_int("crush_rule", get_crush_rule());
1552 f
->dump_int("peering_crush_bucket_count", peering_crush_bucket_count
);
1553 f
->dump_int("peering_crush_bucket_target", peering_crush_bucket_target
);
1554 f
->dump_int("peering_crush_bucket_barrier", peering_crush_bucket_barrier
);
1555 f
->dump_int("peering_crush_bucket_mandatory_member", peering_crush_mandatory_member
);
1556 f
->dump_int("object_hash", get_object_hash());
1557 f
->dump_string("pg_autoscale_mode",
1558 get_pg_autoscale_mode_name(pg_autoscale_mode
));
1559 f
->dump_unsigned("pg_num", get_pg_num());
1560 f
->dump_unsigned("pg_placement_num", get_pgp_num());
1561 f
->dump_unsigned("pg_placement_num_target", get_pgp_num_target());
1562 f
->dump_unsigned("pg_num_target", get_pg_num_target());
1563 f
->dump_unsigned("pg_num_pending", get_pg_num_pending());
1564 f
->dump_object("last_pg_merge_meta", last_pg_merge_meta
);
1565 f
->dump_stream("last_change") << get_last_change();
1566 f
->dump_stream("last_force_op_resend") << get_last_force_op_resend();
1567 f
->dump_stream("last_force_op_resend_prenautilus")
1568 << get_last_force_op_resend_prenautilus();
1569 f
->dump_stream("last_force_op_resend_preluminous")
1570 << get_last_force_op_resend_preluminous();
1571 f
->dump_unsigned("auid", get_auid());
1572 f
->dump_string("snap_mode", is_pool_snaps_mode() ? "pool" : "selfmanaged");
1573 f
->dump_unsigned("snap_seq", get_snap_seq());
1574 f
->dump_unsigned("snap_epoch", get_snap_epoch());
1575 f
->open_array_section("pool_snaps");
1576 for (auto p
= snaps
.cbegin(); p
!= snaps
.cend(); ++p
) {
1577 f
->open_object_section("pool_snap_info");
1582 f
->dump_stream("removed_snaps") << removed_snaps
;
1583 f
->dump_unsigned("quota_max_bytes", quota_max_bytes
);
1584 f
->dump_unsigned("quota_max_objects", quota_max_objects
);
1585 f
->open_array_section("tiers");
1586 for (auto p
= tiers
.cbegin(); p
!= tiers
.cend(); ++p
)
1587 f
->dump_unsigned("pool_id", *p
);
1589 f
->dump_int("tier_of", tier_of
);
1590 f
->dump_int("read_tier", read_tier
);
1591 f
->dump_int("write_tier", write_tier
);
1592 f
->dump_string("cache_mode", get_cache_mode_name());
1593 f
->dump_unsigned("target_max_bytes", target_max_bytes
);
1594 f
->dump_unsigned("target_max_objects", target_max_objects
);
1595 f
->dump_unsigned("cache_target_dirty_ratio_micro",
1596 cache_target_dirty_ratio_micro
);
1597 f
->dump_unsigned("cache_target_dirty_high_ratio_micro",
1598 cache_target_dirty_high_ratio_micro
);
1599 f
->dump_unsigned("cache_target_full_ratio_micro",
1600 cache_target_full_ratio_micro
);
1601 f
->dump_unsigned("cache_min_flush_age", cache_min_flush_age
);
1602 f
->dump_unsigned("cache_min_evict_age", cache_min_evict_age
);
1603 f
->dump_string("erasure_code_profile", erasure_code_profile
);
1604 f
->open_object_section("hit_set_params");
1605 hit_set_params
.dump(f
);
1606 f
->close_section(); // hit_set_params
1607 f
->dump_unsigned("hit_set_period", hit_set_period
);
1608 f
->dump_unsigned("hit_set_count", hit_set_count
);
1609 f
->dump_bool("use_gmt_hitset", use_gmt_hitset
);
1610 f
->dump_unsigned("min_read_recency_for_promote", min_read_recency_for_promote
);
1611 f
->dump_unsigned("min_write_recency_for_promote", min_write_recency_for_promote
);
1612 f
->dump_unsigned("hit_set_grade_decay_rate", hit_set_grade_decay_rate
);
1613 f
->dump_unsigned("hit_set_search_last_n", hit_set_search_last_n
);
1614 f
->open_array_section("grade_table");
1615 for (unsigned i
= 0; i
< hit_set_count
; ++i
)
1616 f
->dump_unsigned("value", get_grade(i
));
1618 f
->dump_unsigned("stripe_width", get_stripe_width());
1619 f
->dump_unsigned("expected_num_objects", expected_num_objects
);
1620 f
->dump_bool("fast_read", fast_read
);
1621 f
->open_object_section("options");
1623 f
->close_section(); // options
1624 f
->open_object_section("application_metadata");
1625 for (auto &app_pair
: application_metadata
) {
1626 f
->open_object_section(app_pair
.first
.c_str());
1627 for (auto &kv_pair
: app_pair
.second
) {
1628 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
1630 f
->close_section(); // application
1632 f
->close_section(); // application_metadata
1635 void pg_pool_t::convert_to_pg_shards(const vector
<int> &from
, set
<pg_shard_t
>* to
) const {
1636 for (size_t i
= 0; i
< from
.size(); ++i
) {
1637 if (from
[i
] != CRUSH_ITEM_NONE
) {
1641 is_erasure() ? shard_id_t(i
) : shard_id_t::NO_SHARD
));
1646 void pg_pool_t::calc_pg_masks()
1648 pg_num_mask
= (1 << cbits(pg_num
-1)) - 1;
1649 pgp_num_mask
= (1 << cbits(pgp_num
-1)) - 1;
1652 unsigned pg_pool_t::get_pg_num_divisor(pg_t pgid
) const
1654 if (pg_num
== pg_num_mask
+ 1)
1655 return pg_num
; // power-of-2 split
1656 unsigned mask
= pg_num_mask
>> 1;
1657 if ((pgid
.ps() & mask
) < (pg_num
& mask
))
1658 return pg_num_mask
+ 1; // smaller bin size (already split)
1660 return (pg_num_mask
+ 1) >> 1; // bigger bin (not yet split)
1663 bool pg_pool_t::is_pending_merge(pg_t pgid
, bool *target
) const
1665 if (pg_num_pending
>= pg_num
) {
1668 if (pgid
.ps() >= pg_num_pending
&& pgid
.ps() < pg_num
) {
1674 for (unsigned ps
= pg_num_pending
; ps
< pg_num
; ++ps
) {
1675 if (pg_t(ps
, pgid
.pool()).get_parent() == pgid
) {
1686 * we have two snap modes:
1688 * - snap existence/non-existence defined by snaps[] and snap_seq
1689 * - user managed snaps
1690 * - existence tracked by librados user
1692 bool pg_pool_t::is_pool_snaps_mode() const
1694 return has_flag(FLAG_POOL_SNAPS
);
1697 bool pg_pool_t::is_unmanaged_snaps_mode() const
1699 return has_flag(FLAG_SELFMANAGED_SNAPS
);
1702 bool pg_pool_t::is_removed_snap(snapid_t s
) const
1704 if (is_pool_snaps_mode())
1705 return s
<= get_snap_seq() && snaps
.count(s
) == 0;
1707 return removed_snaps
.contains(s
);
1710 snapid_t
pg_pool_t::snap_exists(std::string_view s
) const
1712 for (auto p
= snaps
.cbegin(); p
!= snaps
.cend(); ++p
)
1713 if (p
->second
.name
== s
)
1714 return p
->second
.snapid
;
1718 void pg_pool_t::add_snap(const char *n
, utime_t stamp
)
1720 ceph_assert(!is_unmanaged_snaps_mode());
1721 flags
|= FLAG_POOL_SNAPS
;
1722 snapid_t s
= get_snap_seq() + 1;
1724 snaps
[s
].snapid
= s
;
1726 snaps
[s
].stamp
= stamp
;
1729 uint64_t pg_pool_t::add_unmanaged_snap(bool preoctopus_compat
)
1731 ceph_assert(!is_pool_snaps_mode());
1732 if (snap_seq
== 0) {
1733 if (preoctopus_compat
) {
1734 // kludge for pre-mimic tracking of pool vs selfmanaged snaps. after
1735 // mimic this field is not decoded but our flag is set; pre-mimic, we
1736 // have a non-empty removed_snaps to signifiy a non-pool-snaps pool.
1737 removed_snaps
.insert(snapid_t(1));
1741 flags
|= FLAG_SELFMANAGED_SNAPS
;
1742 snap_seq
= snap_seq
+ 1;
1746 void pg_pool_t::remove_snap(snapid_t s
)
1748 ceph_assert(snaps
.count(s
));
1750 snap_seq
= snap_seq
+ 1;
1753 void pg_pool_t::remove_unmanaged_snap(snapid_t s
, bool preoctopus_compat
)
1755 ceph_assert(is_unmanaged_snaps_mode());
1757 if (preoctopus_compat
) {
1758 removed_snaps
.insert(s
);
1759 // try to add in the new seq, just to try to keep the interval_set contiguous
1760 if (!removed_snaps
.contains(get_snap_seq())) {
1761 removed_snaps
.insert(get_snap_seq());
1766 SnapContext
pg_pool_t::get_snap_context() const
1768 vector
<snapid_t
> s(snaps
.size());
1770 for (auto p
= snaps
.crbegin(); p
!= snaps
.crend(); ++p
)
1772 return SnapContext(get_snap_seq(), s
);
1775 uint32_t pg_pool_t::hash_key(const string
& key
, const string
& ns
) const
1778 return ceph_str_hash(object_hash
, key
.data(), key
.length());
1779 int nsl
= ns
.length();
1780 int len
= key
.length() + nsl
+ 1;
1782 memcpy(&buf
[0], ns
.data(), nsl
);
1784 memcpy(&buf
[nsl
+1], key
.data(), key
.length());
1785 return ceph_str_hash(object_hash
, &buf
[0], len
);
1788 uint32_t pg_pool_t::raw_hash_to_pg(uint32_t v
) const
1790 return ceph_stable_mod(v
, pg_num
, pg_num_mask
);
1794 * map a raw pg (with full precision ps) into an actual pg, for storage
1796 pg_t
pg_pool_t::raw_pg_to_pg(pg_t pg
) const
1798 pg
.set_ps(ceph_stable_mod(pg
.ps(), pg_num
, pg_num_mask
));
1803 * map raw pg (full precision ps) into a placement seed. include
1804 * pool id in that value so that different pools don't use the same
1807 ps_t
pg_pool_t::raw_pg_to_pps(pg_t pg
) const
1809 if (flags
& FLAG_HASHPSPOOL
) {
1810 // Hash the pool id so that pool PGs do not overlap.
1812 crush_hash32_2(CRUSH_HASH_RJENKINS1
,
1813 ceph_stable_mod(pg
.ps(), pgp_num
, pgp_num_mask
),
1816 // Legacy behavior; add ps and pool together. This is not a great
1817 // idea because the PGs from each pool will essentially overlap on
1818 // top of each other: 0.5 == 1.4 == 2.3 == ...
1820 ceph_stable_mod(pg
.ps(), pgp_num
, pgp_num_mask
) +
1825 uint32_t pg_pool_t::get_random_pg_position(pg_t pg
, uint32_t seed
) const
1827 uint32_t r
= crush_hash32_2(CRUSH_HASH_RJENKINS1
, seed
, 123);
1828 if (pg_num
== pg_num_mask
+ 1) {
1831 unsigned smaller_mask
= pg_num_mask
>> 1;
1832 if ((pg
.ps() & smaller_mask
) < (pg_num
& smaller_mask
)) {
1842 void pg_pool_t::encode(ceph::buffer::list
& bl
, uint64_t features
) const
1845 if ((features
& CEPH_FEATURE_PGPOOL3
) == 0) {
1846 // this encoding matches the old struct ceph_pg_pool
1848 encode(struct_v
, bl
);
1851 encode(crush_rule
, bl
);
1852 encode(object_hash
, bl
);
1854 encode(pgp_num
, bl
);
1855 __u32 lpg_num
= 0, lpgp_num
= 0; // tell old code that there are no localized pgs.
1856 encode(lpg_num
, bl
);
1857 encode(lpgp_num
, bl
);
1858 encode(last_change
, bl
);
1859 encode(snap_seq
, bl
);
1860 encode(snap_epoch
, bl
);
1862 __u32 n
= snaps
.size();
1864 n
= removed_snaps
.num_intervals();
1869 encode_nohead(snaps
, bl
, features
);
1870 encode_nohead(removed_snaps
, bl
);
1874 if ((features
& CEPH_FEATURE_OSDENC
) == 0) {
1876 encode(struct_v
, bl
);
1879 encode(crush_rule
, bl
);
1880 encode(object_hash
, bl
);
1882 encode(pgp_num
, bl
);
1883 __u32 lpg_num
= 0, lpgp_num
= 0; // tell old code that there are no localized pgs.
1884 encode(lpg_num
, bl
);
1885 encode(lpgp_num
, bl
);
1886 encode(last_change
, bl
);
1887 encode(snap_seq
, bl
);
1888 encode(snap_epoch
, bl
);
1889 encode(snaps
, bl
, features
);
1890 encode(removed_snaps
, bl
);
1893 encode((uint32_t)0, bl
); // crash_replay_interval
1897 if ((features
& CEPH_FEATURE_OSD_POOLRESEND
) == 0) {
1898 // we simply added last_force_op_resend here, which is a fully
1899 // backward compatible change. however, encoding the same map
1900 // differently between monitors triggers scrub noise (even though
1901 // they are decodable without the feature), so let's be pendantic
1903 ENCODE_START(14, 5, bl
);
1906 encode(crush_rule
, bl
);
1907 encode(object_hash
, bl
);
1909 encode(pgp_num
, bl
);
1910 __u32 lpg_num
= 0, lpgp_num
= 0; // tell old code that there are no localized pgs.
1911 encode(lpg_num
, bl
);
1912 encode(lpgp_num
, bl
);
1913 encode(last_change
, bl
);
1914 encode(snap_seq
, bl
);
1915 encode(snap_epoch
, bl
);
1916 encode(snaps
, bl
, features
);
1917 encode(removed_snaps
, bl
);
1920 encode((uint32_t)0, bl
); // crash_replay_interval
1921 encode(min_size
, bl
);
1922 encode(quota_max_bytes
, bl
);
1923 encode(quota_max_objects
, bl
);
1925 encode(tier_of
, bl
);
1926 __u8 c
= cache_mode
;
1928 encode(read_tier
, bl
);
1929 encode(write_tier
, bl
);
1930 encode(properties
, bl
);
1931 encode(hit_set_params
, bl
);
1932 encode(hit_set_period
, bl
);
1933 encode(hit_set_count
, bl
);
1934 encode(stripe_width
, bl
);
1935 encode(target_max_bytes
, bl
);
1936 encode(target_max_objects
, bl
);
1937 encode(cache_target_dirty_ratio_micro
, bl
);
1938 encode(cache_target_full_ratio_micro
, bl
);
1939 encode(cache_min_flush_age
, bl
);
1940 encode(cache_min_evict_age
, bl
);
1941 encode(erasure_code_profile
, bl
);
1947 // NOTE: any new encoding dependencies must be reflected by
1948 // SIGNIFICANT_FEATURES
1949 if (!(features
& CEPH_FEATURE_NEW_OSDOP_ENCODING
)) {
1950 // this was the first post-hammer thing we added; if it's missing, encode
1953 } else if (!HAVE_FEATURE(features
, SERVER_LUMINOUS
)) {
1955 } else if (!HAVE_FEATURE(features
, SERVER_MIMIC
)) {
1957 } else if (!HAVE_FEATURE(features
, SERVER_NAUTILUS
)) {
1959 } else if (!is_stretch_pool()) {
1963 ENCODE_START(v
, 5, bl
);
1966 encode(crush_rule
, bl
);
1967 encode(object_hash
, bl
);
1969 encode(pgp_num
, bl
);
1970 __u32 lpg_num
= 0, lpgp_num
= 0; // tell old code that there are no localized pgs.
1971 encode(lpg_num
, bl
);
1972 encode(lpgp_num
, bl
);
1973 encode(last_change
, bl
);
1974 encode(snap_seq
, bl
);
1975 encode(snap_epoch
, bl
);
1976 encode(snaps
, bl
, features
);
1977 encode(removed_snaps
, bl
);
1983 tmp
&= ~(FLAG_SELFMANAGED_SNAPS
| FLAG_POOL_SNAPS
| FLAG_CREATING
);
1986 encode((uint32_t)0, bl
); // crash_replay_interval
1987 encode(min_size
, bl
);
1988 encode(quota_max_bytes
, bl
);
1989 encode(quota_max_objects
, bl
);
1991 encode(tier_of
, bl
);
1992 __u8 c
= cache_mode
;
1994 encode(read_tier
, bl
);
1995 encode(write_tier
, bl
);
1996 encode(properties
, bl
);
1997 encode(hit_set_params
, bl
);
1998 encode(hit_set_period
, bl
);
1999 encode(hit_set_count
, bl
);
2000 encode(stripe_width
, bl
);
2001 encode(target_max_bytes
, bl
);
2002 encode(target_max_objects
, bl
);
2003 encode(cache_target_dirty_ratio_micro
, bl
);
2004 encode(cache_target_full_ratio_micro
, bl
);
2005 encode(cache_min_flush_age
, bl
);
2006 encode(cache_min_evict_age
, bl
);
2007 encode(erasure_code_profile
, bl
);
2008 encode(last_force_op_resend_preluminous
, bl
);
2009 encode(min_read_recency_for_promote
, bl
);
2010 encode(expected_num_objects
, bl
);
2012 encode(cache_target_dirty_high_ratio_micro
, bl
);
2015 encode(min_write_recency_for_promote
, bl
);
2018 encode(use_gmt_hitset
, bl
);
2021 encode(fast_read
, bl
);
2024 encode(hit_set_grade_decay_rate
, bl
);
2025 encode(hit_set_search_last_n
, bl
);
2028 encode(opts
, bl
, features
);
2031 encode(last_force_op_resend_prenautilus
, bl
);
2034 encode(application_metadata
, bl
);
2037 encode(create_time
, bl
);
2040 encode(pg_num_target
, bl
);
2041 encode(pgp_num_target
, bl
);
2042 encode(pg_num_pending
, bl
);
2043 encode((epoch_t
)0, bl
); // pg_num_dec_last_epoch_started from 14.1.[01]
2044 encode((epoch_t
)0, bl
); // pg_num_dec_last_epoch_clean from 14.1.[01]
2045 encode(last_force_op_resend
, bl
);
2046 encode(pg_autoscale_mode
, bl
);
2049 encode(last_pg_merge_meta
, bl
);
2052 encode(peering_crush_bucket_count
, bl
);
2053 encode(peering_crush_bucket_target
, bl
);
2054 encode(peering_crush_bucket_barrier
, bl
);
2055 encode(peering_crush_mandatory_member
, bl
);
2060 void pg_pool_t::decode(ceph::buffer::list::const_iterator
& bl
)
2062 DECODE_START_LEGACY_COMPAT_LEN(30, 5, 5, bl
);
2065 decode(crush_rule
, bl
);
2066 decode(object_hash
, bl
);
2068 decode(pgp_num
, bl
);
2070 __u32 lpg_num
, lpgp_num
;
2071 decode(lpg_num
, bl
);
2072 decode(lpgp_num
, bl
);
2074 decode(last_change
, bl
);
2075 decode(snap_seq
, bl
);
2076 decode(snap_epoch
, bl
);
2078 if (struct_v
>= 3) {
2080 decode(removed_snaps
, bl
);
2087 decode_nohead(n
, snaps
, bl
);
2088 decode_nohead(m
, removed_snaps
, bl
);
2091 if (struct_v
>= 4) {
2093 uint32_t crash_replay_interval
;
2094 decode(crash_replay_interval
, bl
);
2098 // upgrade path for selfmanaged vs pool snaps
2099 if (snap_seq
> 0 && (flags
& (FLAG_SELFMANAGED_SNAPS
|FLAG_POOL_SNAPS
)) == 0) {
2100 if (!removed_snaps
.empty()) {
2101 flags
|= FLAG_SELFMANAGED_SNAPS
;
2103 flags
|= FLAG_POOL_SNAPS
;
2106 if (struct_v
>= 7) {
2107 decode(min_size
, bl
);
2109 min_size
= size
- size
/2;
2111 if (struct_v
>= 8) {
2112 decode(quota_max_bytes
, bl
);
2113 decode(quota_max_objects
, bl
);
2115 if (struct_v
>= 9) {
2117 decode(tier_of
, bl
);
2120 cache_mode
= (cache_mode_t
)v
;
2121 decode(read_tier
, bl
);
2122 decode(write_tier
, bl
);
2124 if (struct_v
>= 10) {
2125 decode(properties
, bl
);
2127 if (struct_v
>= 11) {
2128 decode(hit_set_params
, bl
);
2129 decode(hit_set_period
, bl
);
2130 decode(hit_set_count
, bl
);
2133 hit_set_period
= def
.hit_set_period
;
2134 hit_set_count
= def
.hit_set_count
;
2136 if (struct_v
>= 12) {
2137 decode(stripe_width
, bl
);
2139 set_stripe_width(0);
2141 if (struct_v
>= 13) {
2142 decode(target_max_bytes
, bl
);
2143 decode(target_max_objects
, bl
);
2144 decode(cache_target_dirty_ratio_micro
, bl
);
2145 decode(cache_target_full_ratio_micro
, bl
);
2146 decode(cache_min_flush_age
, bl
);
2147 decode(cache_min_evict_age
, bl
);
2149 target_max_bytes
= 0;
2150 target_max_objects
= 0;
2151 cache_target_dirty_ratio_micro
= 0;
2152 cache_target_full_ratio_micro
= 0;
2153 cache_min_flush_age
= 0;
2154 cache_min_evict_age
= 0;
2156 if (struct_v
>= 14) {
2157 decode(erasure_code_profile
, bl
);
2159 if (struct_v
>= 15) {
2160 decode(last_force_op_resend_preluminous
, bl
);
2162 last_force_op_resend_preluminous
= 0;
2164 if (struct_v
>= 16) {
2165 decode(min_read_recency_for_promote
, bl
);
2167 min_read_recency_for_promote
= 1;
2169 if (struct_v
>= 17) {
2170 decode(expected_num_objects
, bl
);
2172 expected_num_objects
= 0;
2174 if (struct_v
>= 19) {
2175 decode(cache_target_dirty_high_ratio_micro
, bl
);
2177 cache_target_dirty_high_ratio_micro
= cache_target_dirty_ratio_micro
;
2179 if (struct_v
>= 20) {
2180 decode(min_write_recency_for_promote
, bl
);
2182 min_write_recency_for_promote
= 1;
2184 if (struct_v
>= 21) {
2185 decode(use_gmt_hitset
, bl
);
2187 use_gmt_hitset
= false;
2189 if (struct_v
>= 22) {
2190 decode(fast_read
, bl
);
2194 if (struct_v
>= 23) {
2195 decode(hit_set_grade_decay_rate
, bl
);
2196 decode(hit_set_search_last_n
, bl
);
2198 hit_set_grade_decay_rate
= 0;
2199 hit_set_search_last_n
= 1;
2201 if (struct_v
>= 24) {
2204 if (struct_v
>= 25) {
2205 decode(last_force_op_resend_prenautilus
, bl
);
2207 last_force_op_resend_prenautilus
= last_force_op_resend_preluminous
;
2209 if (struct_v
>= 26) {
2210 decode(application_metadata
, bl
);
2212 if (struct_v
>= 27) {
2213 decode(create_time
, bl
);
2215 if (struct_v
>= 28) {
2216 decode(pg_num_target
, bl
);
2217 decode(pgp_num_target
, bl
);
2218 decode(pg_num_pending
, bl
);
2219 epoch_t old_merge_last_epoch_clean
, old_merge_last_epoch_started
;
2220 decode(old_merge_last_epoch_started
, bl
);
2221 decode(old_merge_last_epoch_clean
, bl
);
2222 decode(last_force_op_resend
, bl
);
2223 decode(pg_autoscale_mode
, bl
);
2224 if (struct_v
>= 29) {
2225 decode(last_pg_merge_meta
, bl
);
2227 last_pg_merge_meta
.last_epoch_clean
= old_merge_last_epoch_clean
;
2228 last_pg_merge_meta
.last_epoch_started
= old_merge_last_epoch_started
;
2231 pg_num_target
= pg_num
;
2232 pgp_num_target
= pgp_num
;
2233 pg_num_pending
= pg_num
;
2234 last_force_op_resend
= last_force_op_resend_prenautilus
;
2235 pg_autoscale_mode
= pg_autoscale_mode_t::WARN
; // default to warn on upgrade
2237 if (struct_v
>= 30) {
2238 decode(peering_crush_bucket_count
, bl
);
2239 decode(peering_crush_bucket_target
, bl
);
2240 decode(peering_crush_bucket_barrier
, bl
);
2241 decode(peering_crush_mandatory_member
, bl
);
2248 bool pg_pool_t::stretch_set_can_peer(const set
<int>& want
, const OSDMap
& osdmap
,
2249 std::ostream
* out
) const
2251 if (!is_stretch_pool()) return true;
2252 const uint32_t barrier_id
= peering_crush_bucket_barrier
;
2253 const uint32_t barrier_count
= peering_crush_bucket_count
;
2255 const shared_ptr
<CrushWrapper
>& crush
= osdmap
.crush
;
2256 for (int osdid
: want
) {
2257 int ancestor
= crush
->get_parent_of_type(osdid
, barrier_id
,
2259 ancestors
.insert(ancestor
);
2261 if (ancestors
.size() < barrier_count
) {
2263 *out
<< __func__
<< ": not enough crush buckets with OSDs in want set "
2267 } else if (peering_crush_mandatory_member
!= CRUSH_ITEM_NONE
&&
2268 !ancestors
.count(peering_crush_mandatory_member
)) {
2270 *out
<< __func__
<< ": missing mandatory crush bucket member "
2271 << peering_crush_mandatory_member
;
2278 void pg_pool_t::generate_test_instances(list
<pg_pool_t
*>& o
)
2281 o
.push_back(new pg_pool_t(a
));
2283 a
.create_time
= utime_t(4,5);
2284 a
.type
= TYPE_REPLICATED
;
2290 a
.pgp_num_target
= 4;
2291 a
.pg_num_target
= 5;
2292 a
.pg_num_pending
= 5;
2293 a
.last_pg_merge_meta
.last_epoch_started
= 2;
2294 a
.last_pg_merge_meta
.last_epoch_clean
= 2;
2296 a
.last_force_op_resend
= 123823;
2297 a
.last_force_op_resend_preluminous
= 123824;
2300 a
.flags
= FLAG_POOL_SNAPS
;
2302 a
.quota_max_bytes
= 473;
2303 a
.quota_max_objects
= 474;
2304 o
.push_back(new pg_pool_t(a
));
2306 a
.snaps
[3].name
= "asdf";
2307 a
.snaps
[3].snapid
= 3;
2308 a
.snaps
[3].stamp
= utime_t(123, 4);
2309 a
.snaps
[6].name
= "qwer";
2310 a
.snaps
[6].snapid
= 6;
2311 a
.snaps
[6].stamp
= utime_t(23423, 4);
2312 o
.push_back(new pg_pool_t(a
));
2314 a
.flags
= FLAG_SELFMANAGED_SNAPS
;
2316 a
.removed_snaps
.insert(2);
2317 a
.quota_max_bytes
= 2473;
2318 a
.quota_max_objects
= 4374;
2322 a
.cache_mode
= CACHEMODE_WRITEBACK
;
2325 a
.hit_set_params
= HitSet::Params(new BloomHitSet::Params
);
2326 a
.hit_set_period
= 3600;
2327 a
.hit_set_count
= 8;
2328 a
.min_read_recency_for_promote
= 1;
2329 a
.min_write_recency_for_promote
= 1;
2330 a
.hit_set_grade_decay_rate
= 50;
2331 a
.hit_set_search_last_n
= 1;
2332 a
.calc_grade_table();
2333 a
.set_stripe_width(12345);
2334 a
.target_max_bytes
= 1238132132;
2335 a
.target_max_objects
= 1232132;
2336 a
.cache_target_dirty_ratio_micro
= 187232;
2337 a
.cache_target_dirty_high_ratio_micro
= 309856;
2338 a
.cache_target_full_ratio_micro
= 987222;
2339 a
.cache_min_flush_age
= 231;
2340 a
.cache_min_evict_age
= 2321;
2341 a
.erasure_code_profile
= "profile in osdmap";
2342 a
.expected_num_objects
= 123456;
2343 a
.fast_read
= false;
2344 a
.application_metadata
= {{"rbd", {{"key", "value"}}}};
2345 o
.push_back(new pg_pool_t(a
));
2348 ostream
& operator<<(ostream
& out
, const pg_pool_t
& p
)
2350 out
<< p
.get_type_name();
2351 if (p
.get_type_name() == "erasure") {
2352 out
<< " profile " << p
.erasure_code_profile
;
2354 out
<< " size " << p
.get_size()
2355 << " min_size " << p
.get_min_size()
2356 << " crush_rule " << p
.get_crush_rule()
2357 << " object_hash " << p
.get_object_hash_name()
2358 << " pg_num " << p
.get_pg_num()
2359 << " pgp_num " << p
.get_pgp_num();
2360 if (p
.get_pg_num_target() != p
.get_pg_num()) {
2361 out
<< " pg_num_target " << p
.get_pg_num_target();
2363 if (p
.get_pgp_num_target() != p
.get_pgp_num()) {
2364 out
<< " pgp_num_target " << p
.get_pgp_num_target();
2366 if (p
.get_pg_num_pending() != p
.get_pg_num()) {
2367 out
<< " pg_num_pending " << p
.get_pg_num_pending();
2369 if (p
.pg_autoscale_mode
!= pg_pool_t::pg_autoscale_mode_t::UNKNOWN
) {
2370 out
<< " autoscale_mode " << p
.get_pg_autoscale_mode_name(p
.pg_autoscale_mode
);
2372 out
<< " last_change " << p
.get_last_change();
2373 if (p
.get_last_force_op_resend() ||
2374 p
.get_last_force_op_resend_prenautilus() ||
2375 p
.get_last_force_op_resend_preluminous())
2376 out
<< " lfor " << p
.get_last_force_op_resend() << "/"
2377 << p
.get_last_force_op_resend_prenautilus() << "/"
2378 << p
.get_last_force_op_resend_preluminous();
2380 out
<< " owner " << p
.get_auid();
2382 out
<< " flags " << p
.get_flags_string();
2383 if (p
.quota_max_bytes
)
2384 out
<< " max_bytes " << p
.quota_max_bytes
;
2385 if (p
.quota_max_objects
)
2386 out
<< " max_objects " << p
.quota_max_objects
;
2387 if (!p
.tiers
.empty())
2388 out
<< " tiers " << p
.tiers
;
2390 out
<< " tier_of " << p
.tier_of
;
2391 if (p
.has_read_tier())
2392 out
<< " read_tier " << p
.read_tier
;
2393 if (p
.has_write_tier())
2394 out
<< " write_tier " << p
.write_tier
;
2396 out
<< " cache_mode " << p
.get_cache_mode_name();
2397 if (p
.target_max_bytes
)
2398 out
<< " target_bytes " << p
.target_max_bytes
;
2399 if (p
.target_max_objects
)
2400 out
<< " target_objects " << p
.target_max_objects
;
2401 if (p
.hit_set_params
.get_type() != HitSet::TYPE_NONE
) {
2402 out
<< " hit_set " << p
.hit_set_params
2403 << " " << p
.hit_set_period
<< "s"
2404 << " x" << p
.hit_set_count
<< " decay_rate "
2405 << p
.hit_set_grade_decay_rate
2406 << " search_last_n " << p
.hit_set_search_last_n
;
2408 if (p
.min_read_recency_for_promote
)
2409 out
<< " min_read_recency_for_promote " << p
.min_read_recency_for_promote
;
2410 if (p
.min_write_recency_for_promote
)
2411 out
<< " min_write_recency_for_promote " << p
.min_write_recency_for_promote
;
2412 out
<< " stripe_width " << p
.get_stripe_width();
2413 if (p
.expected_num_objects
)
2414 out
<< " expected_num_objects " << p
.expected_num_objects
;
2416 out
<< " fast_read " << p
.fast_read
;
2418 if (!p
.application_metadata
.empty()) {
2419 out
<< " application ";
2420 for (auto it
= p
.application_metadata
.begin();
2421 it
!= p
.application_metadata
.end(); ++it
) {
2422 if (it
!= p
.application_metadata
.begin())
2431 // -- object_stat_sum_t --
2433 void object_stat_sum_t::dump(Formatter
*f
) const
2435 f
->dump_int("num_bytes", num_bytes
);
2436 f
->dump_int("num_objects", num_objects
);
2437 f
->dump_int("num_object_clones", num_object_clones
);
2438 f
->dump_int("num_object_copies", num_object_copies
);
2439 f
->dump_int("num_objects_missing_on_primary", num_objects_missing_on_primary
);
2440 f
->dump_int("num_objects_missing", num_objects_missing
);
2441 f
->dump_int("num_objects_degraded", num_objects_degraded
);
2442 f
->dump_int("num_objects_misplaced", num_objects_misplaced
);
2443 f
->dump_int("num_objects_unfound", num_objects_unfound
);
2444 f
->dump_int("num_objects_dirty", num_objects_dirty
);
2445 f
->dump_int("num_whiteouts", num_whiteouts
);
2446 f
->dump_int("num_read", num_rd
);
2447 f
->dump_int("num_read_kb", num_rd_kb
);
2448 f
->dump_int("num_write", num_wr
);
2449 f
->dump_int("num_write_kb", num_wr_kb
);
2450 f
->dump_int("num_scrub_errors", num_scrub_errors
);
2451 f
->dump_int("num_shallow_scrub_errors", num_shallow_scrub_errors
);
2452 f
->dump_int("num_deep_scrub_errors", num_deep_scrub_errors
);
2453 f
->dump_int("num_objects_recovered", num_objects_recovered
);
2454 f
->dump_int("num_bytes_recovered", num_bytes_recovered
);
2455 f
->dump_int("num_keys_recovered", num_keys_recovered
);
2456 f
->dump_int("num_objects_omap", num_objects_omap
);
2457 f
->dump_int("num_objects_hit_set_archive", num_objects_hit_set_archive
);
2458 f
->dump_int("num_bytes_hit_set_archive", num_bytes_hit_set_archive
);
2459 f
->dump_int("num_flush", num_flush
);
2460 f
->dump_int("num_flush_kb", num_flush_kb
);
2461 f
->dump_int("num_evict", num_evict
);
2462 f
->dump_int("num_evict_kb", num_evict_kb
);
2463 f
->dump_int("num_promote", num_promote
);
2464 f
->dump_int("num_flush_mode_high", num_flush_mode_high
);
2465 f
->dump_int("num_flush_mode_low", num_flush_mode_low
);
2466 f
->dump_int("num_evict_mode_some", num_evict_mode_some
);
2467 f
->dump_int("num_evict_mode_full", num_evict_mode_full
);
2468 f
->dump_int("num_objects_pinned", num_objects_pinned
);
2469 f
->dump_int("num_legacy_snapsets", num_legacy_snapsets
);
2470 f
->dump_int("num_large_omap_objects", num_large_omap_objects
);
2471 f
->dump_int("num_objects_manifest", num_objects_manifest
);
2472 f
->dump_int("num_omap_bytes", num_omap_bytes
);
2473 f
->dump_int("num_omap_keys", num_omap_keys
);
2474 f
->dump_int("num_objects_repaired", num_objects_repaired
);
2477 void object_stat_sum_t::encode(ceph::buffer::list
& bl
) const
2479 ENCODE_START(20, 14, bl
);
2480 #if defined(CEPH_LITTLE_ENDIAN)
2481 bl
.append((char *)(&num_bytes
), sizeof(object_stat_sum_t
));
2483 encode(num_bytes
, bl
);
2484 encode(num_objects
, bl
);
2485 encode(num_object_clones
, bl
);
2486 encode(num_object_copies
, bl
);
2487 encode(num_objects_missing_on_primary
, bl
);
2488 encode(num_objects_degraded
, bl
);
2489 encode(num_objects_unfound
, bl
);
2491 encode(num_rd_kb
, bl
);
2493 encode(num_wr_kb
, bl
);
2494 encode(num_scrub_errors
, bl
);
2495 encode(num_objects_recovered
, bl
);
2496 encode(num_bytes_recovered
, bl
);
2497 encode(num_keys_recovered
, bl
);
2498 encode(num_shallow_scrub_errors
, bl
);
2499 encode(num_deep_scrub_errors
, bl
);
2500 encode(num_objects_dirty
, bl
);
2501 encode(num_whiteouts
, bl
);
2502 encode(num_objects_omap
, bl
);
2503 encode(num_objects_hit_set_archive
, bl
);
2504 encode(num_objects_misplaced
, bl
);
2505 encode(num_bytes_hit_set_archive
, bl
);
2506 encode(num_flush
, bl
);
2507 encode(num_flush_kb
, bl
);
2508 encode(num_evict
, bl
);
2509 encode(num_evict_kb
, bl
);
2510 encode(num_promote
, bl
);
2511 encode(num_flush_mode_high
, bl
);
2512 encode(num_flush_mode_low
, bl
);
2513 encode(num_evict_mode_some
, bl
);
2514 encode(num_evict_mode_full
, bl
);
2515 encode(num_objects_pinned
, bl
);
2516 encode(num_objects_missing
, bl
);
2517 encode(num_legacy_snapsets
, bl
);
2518 encode(num_large_omap_objects
, bl
);
2519 encode(num_objects_manifest
, bl
);
2520 encode(num_omap_bytes
, bl
);
2521 encode(num_omap_keys
, bl
);
2522 encode(num_objects_repaired
, bl
);
2527 void object_stat_sum_t::decode(ceph::buffer::list::const_iterator
& bl
)
2529 bool decode_finish
= false;
2530 static const int STAT_SUM_DECODE_VERSION
= 20;
2531 DECODE_START(STAT_SUM_DECODE_VERSION
, bl
);
2532 #if defined(CEPH_LITTLE_ENDIAN)
2533 if (struct_v
== STAT_SUM_DECODE_VERSION
) {
2534 bl
.copy(sizeof(object_stat_sum_t
), (char*)(&num_bytes
));
2535 decode_finish
= true;
2538 if (!decode_finish
) {
2539 decode(num_bytes
, bl
);
2540 decode(num_objects
, bl
);
2541 decode(num_object_clones
, bl
);
2542 decode(num_object_copies
, bl
);
2543 decode(num_objects_missing_on_primary
, bl
);
2544 decode(num_objects_degraded
, bl
);
2545 decode(num_objects_unfound
, bl
);
2547 decode(num_rd_kb
, bl
);
2549 decode(num_wr_kb
, bl
);
2550 decode(num_scrub_errors
, bl
);
2551 decode(num_objects_recovered
, bl
);
2552 decode(num_bytes_recovered
, bl
);
2553 decode(num_keys_recovered
, bl
);
2554 decode(num_shallow_scrub_errors
, bl
);
2555 decode(num_deep_scrub_errors
, bl
);
2556 decode(num_objects_dirty
, bl
);
2557 decode(num_whiteouts
, bl
);
2558 decode(num_objects_omap
, bl
);
2559 decode(num_objects_hit_set_archive
, bl
);
2560 decode(num_objects_misplaced
, bl
);
2561 decode(num_bytes_hit_set_archive
, bl
);
2562 decode(num_flush
, bl
);
2563 decode(num_flush_kb
, bl
);
2564 decode(num_evict
, bl
);
2565 decode(num_evict_kb
, bl
);
2566 decode(num_promote
, bl
);
2567 decode(num_flush_mode_high
, bl
);
2568 decode(num_flush_mode_low
, bl
);
2569 decode(num_evict_mode_some
, bl
);
2570 decode(num_evict_mode_full
, bl
);
2571 decode(num_objects_pinned
, bl
);
2572 decode(num_objects_missing
, bl
);
2573 if (struct_v
>= 16) {
2574 decode(num_legacy_snapsets
, bl
);
2576 num_legacy_snapsets
= num_object_clones
; // upper bound
2578 if (struct_v
>= 17) {
2579 decode(num_large_omap_objects
, bl
);
2581 if (struct_v
>= 18) {
2582 decode(num_objects_manifest
, bl
);
2584 if (struct_v
>= 19) {
2585 decode(num_omap_bytes
, bl
);
2586 decode(num_omap_keys
, bl
);
2588 if (struct_v
>= 20) {
2589 decode(num_objects_repaired
, bl
);
2595 void object_stat_sum_t::generate_test_instances(list
<object_stat_sum_t
*>& o
)
2597 object_stat_sum_t a
;
2601 a
.num_object_clones
= 4;
2602 a
.num_object_copies
= 5;
2603 a
.num_objects_missing_on_primary
= 6;
2604 a
.num_objects_missing
= 123;
2605 a
.num_objects_degraded
= 7;
2606 a
.num_objects_unfound
= 8;
2607 a
.num_rd
= 9; a
.num_rd_kb
= 10;
2608 a
.num_wr
= 11; a
.num_wr_kb
= 12;
2609 a
.num_objects_recovered
= 14;
2610 a
.num_bytes_recovered
= 15;
2611 a
.num_keys_recovered
= 16;
2612 a
.num_deep_scrub_errors
= 17;
2613 a
.num_shallow_scrub_errors
= 18;
2614 a
.num_scrub_errors
= a
.num_deep_scrub_errors
+ a
.num_shallow_scrub_errors
;
2615 a
.num_objects_dirty
= 21;
2616 a
.num_whiteouts
= 22;
2617 a
.num_objects_misplaced
= 1232;
2618 a
.num_objects_hit_set_archive
= 2;
2619 a
.num_bytes_hit_set_archive
= 27;
2625 a
.num_flush_mode_high
= 0;
2626 a
.num_flush_mode_low
= 1;
2627 a
.num_evict_mode_some
= 1;
2628 a
.num_evict_mode_full
= 0;
2629 a
.num_objects_pinned
= 20;
2630 a
.num_large_omap_objects
= 5;
2631 a
.num_objects_manifest
= 2;
2632 a
.num_omap_bytes
= 20000;
2633 a
.num_omap_keys
= 200;
2634 a
.num_objects_repaired
= 300;
2635 o
.push_back(new object_stat_sum_t(a
));
2638 void object_stat_sum_t::add(const object_stat_sum_t
& o
)
2640 num_bytes
+= o
.num_bytes
;
2641 num_objects
+= o
.num_objects
;
2642 num_object_clones
+= o
.num_object_clones
;
2643 num_object_copies
+= o
.num_object_copies
;
2644 num_objects_missing_on_primary
+= o
.num_objects_missing_on_primary
;
2645 num_objects_missing
+= o
.num_objects_missing
;
2646 num_objects_degraded
+= o
.num_objects_degraded
;
2647 num_objects_misplaced
+= o
.num_objects_misplaced
;
2649 num_rd_kb
+= o
.num_rd_kb
;
2651 num_wr_kb
+= o
.num_wr_kb
;
2652 num_objects_unfound
+= o
.num_objects_unfound
;
2653 num_scrub_errors
+= o
.num_scrub_errors
;
2654 num_shallow_scrub_errors
+= o
.num_shallow_scrub_errors
;
2655 num_deep_scrub_errors
+= o
.num_deep_scrub_errors
;
2656 num_objects_recovered
+= o
.num_objects_recovered
;
2657 num_bytes_recovered
+= o
.num_bytes_recovered
;
2658 num_keys_recovered
+= o
.num_keys_recovered
;
2659 num_objects_dirty
+= o
.num_objects_dirty
;
2660 num_whiteouts
+= o
.num_whiteouts
;
2661 num_objects_omap
+= o
.num_objects_omap
;
2662 num_objects_hit_set_archive
+= o
.num_objects_hit_set_archive
;
2663 num_bytes_hit_set_archive
+= o
.num_bytes_hit_set_archive
;
2664 num_flush
+= o
.num_flush
;
2665 num_flush_kb
+= o
.num_flush_kb
;
2666 num_evict
+= o
.num_evict
;
2667 num_evict_kb
+= o
.num_evict_kb
;
2668 num_promote
+= o
.num_promote
;
2669 num_flush_mode_high
+= o
.num_flush_mode_high
;
2670 num_flush_mode_low
+= o
.num_flush_mode_low
;
2671 num_evict_mode_some
+= o
.num_evict_mode_some
;
2672 num_evict_mode_full
+= o
.num_evict_mode_full
;
2673 num_objects_pinned
+= o
.num_objects_pinned
;
2674 num_legacy_snapsets
+= o
.num_legacy_snapsets
;
2675 num_large_omap_objects
+= o
.num_large_omap_objects
;
2676 num_objects_manifest
+= o
.num_objects_manifest
;
2677 num_omap_bytes
+= o
.num_omap_bytes
;
2678 num_omap_keys
+= o
.num_omap_keys
;
2679 num_objects_repaired
+= o
.num_objects_repaired
;
2682 void object_stat_sum_t::sub(const object_stat_sum_t
& o
)
2684 num_bytes
-= o
.num_bytes
;
2685 num_objects
-= o
.num_objects
;
2686 num_object_clones
-= o
.num_object_clones
;
2687 num_object_copies
-= o
.num_object_copies
;
2688 num_objects_missing_on_primary
-= o
.num_objects_missing_on_primary
;
2689 num_objects_missing
-= o
.num_objects_missing
;
2690 num_objects_degraded
-= o
.num_objects_degraded
;
2691 num_objects_misplaced
-= o
.num_objects_misplaced
;
2693 num_rd_kb
-= o
.num_rd_kb
;
2695 num_wr_kb
-= o
.num_wr_kb
;
2696 num_objects_unfound
-= o
.num_objects_unfound
;
2697 num_scrub_errors
-= o
.num_scrub_errors
;
2698 num_shallow_scrub_errors
-= o
.num_shallow_scrub_errors
;
2699 num_deep_scrub_errors
-= o
.num_deep_scrub_errors
;
2700 num_objects_recovered
-= o
.num_objects_recovered
;
2701 num_bytes_recovered
-= o
.num_bytes_recovered
;
2702 num_keys_recovered
-= o
.num_keys_recovered
;
2703 num_objects_dirty
-= o
.num_objects_dirty
;
2704 num_whiteouts
-= o
.num_whiteouts
;
2705 num_objects_omap
-= o
.num_objects_omap
;
2706 num_objects_hit_set_archive
-= o
.num_objects_hit_set_archive
;
2707 num_bytes_hit_set_archive
-= o
.num_bytes_hit_set_archive
;
2708 num_flush
-= o
.num_flush
;
2709 num_flush_kb
-= o
.num_flush_kb
;
2710 num_evict
-= o
.num_evict
;
2711 num_evict_kb
-= o
.num_evict_kb
;
2712 num_promote
-= o
.num_promote
;
2713 num_flush_mode_high
-= o
.num_flush_mode_high
;
2714 num_flush_mode_low
-= o
.num_flush_mode_low
;
2715 num_evict_mode_some
-= o
.num_evict_mode_some
;
2716 num_evict_mode_full
-= o
.num_evict_mode_full
;
2717 num_objects_pinned
-= o
.num_objects_pinned
;
2718 num_legacy_snapsets
-= o
.num_legacy_snapsets
;
2719 num_large_omap_objects
-= o
.num_large_omap_objects
;
2720 num_objects_manifest
-= o
.num_objects_manifest
;
2721 num_omap_bytes
-= o
.num_omap_bytes
;
2722 num_omap_keys
-= o
.num_omap_keys
;
2723 num_objects_repaired
-= o
.num_objects_repaired
;
2726 bool operator==(const object_stat_sum_t
& l
, const object_stat_sum_t
& r
)
2729 l
.num_bytes
== r
.num_bytes
&&
2730 l
.num_objects
== r
.num_objects
&&
2731 l
.num_object_clones
== r
.num_object_clones
&&
2732 l
.num_object_copies
== r
.num_object_copies
&&
2733 l
.num_objects_missing_on_primary
== r
.num_objects_missing_on_primary
&&
2734 l
.num_objects_missing
== r
.num_objects_missing
&&
2735 l
.num_objects_degraded
== r
.num_objects_degraded
&&
2736 l
.num_objects_misplaced
== r
.num_objects_misplaced
&&
2737 l
.num_objects_unfound
== r
.num_objects_unfound
&&
2738 l
.num_rd
== r
.num_rd
&&
2739 l
.num_rd_kb
== r
.num_rd_kb
&&
2740 l
.num_wr
== r
.num_wr
&&
2741 l
.num_wr_kb
== r
.num_wr_kb
&&
2742 l
.num_scrub_errors
== r
.num_scrub_errors
&&
2743 l
.num_shallow_scrub_errors
== r
.num_shallow_scrub_errors
&&
2744 l
.num_deep_scrub_errors
== r
.num_deep_scrub_errors
&&
2745 l
.num_objects_recovered
== r
.num_objects_recovered
&&
2746 l
.num_bytes_recovered
== r
.num_bytes_recovered
&&
2747 l
.num_keys_recovered
== r
.num_keys_recovered
&&
2748 l
.num_objects_dirty
== r
.num_objects_dirty
&&
2749 l
.num_whiteouts
== r
.num_whiteouts
&&
2750 l
.num_objects_omap
== r
.num_objects_omap
&&
2751 l
.num_objects_hit_set_archive
== r
.num_objects_hit_set_archive
&&
2752 l
.num_bytes_hit_set_archive
== r
.num_bytes_hit_set_archive
&&
2753 l
.num_flush
== r
.num_flush
&&
2754 l
.num_flush_kb
== r
.num_flush_kb
&&
2755 l
.num_evict
== r
.num_evict
&&
2756 l
.num_evict_kb
== r
.num_evict_kb
&&
2757 l
.num_promote
== r
.num_promote
&&
2758 l
.num_flush_mode_high
== r
.num_flush_mode_high
&&
2759 l
.num_flush_mode_low
== r
.num_flush_mode_low
&&
2760 l
.num_evict_mode_some
== r
.num_evict_mode_some
&&
2761 l
.num_evict_mode_full
== r
.num_evict_mode_full
&&
2762 l
.num_objects_pinned
== r
.num_objects_pinned
&&
2763 l
.num_legacy_snapsets
== r
.num_legacy_snapsets
&&
2764 l
.num_large_omap_objects
== r
.num_large_omap_objects
&&
2765 l
.num_objects_manifest
== r
.num_objects_manifest
&&
2766 l
.num_omap_bytes
== r
.num_omap_bytes
&&
2767 l
.num_omap_keys
== r
.num_omap_keys
&&
2768 l
.num_objects_repaired
== r
.num_objects_repaired
;
2771 // -- object_stat_collection_t --
2773 void object_stat_collection_t::dump(Formatter
*f
) const
2775 f
->open_object_section("stat_sum");
2780 void object_stat_collection_t::encode(ceph::buffer::list
& bl
) const
2782 ENCODE_START(2, 2, bl
);
2784 encode((__u32
)0, bl
);
2788 void object_stat_collection_t::decode(ceph::buffer::list::const_iterator
& bl
)
2790 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl
);
2793 map
<string
,object_stat_sum_t
> cat_sum
;
2794 decode(cat_sum
, bl
);
2799 void object_stat_collection_t::generate_test_instances(list
<object_stat_collection_t
*>& o
)
2801 object_stat_collection_t a
;
2802 o
.push_back(new object_stat_collection_t(a
));
2803 list
<object_stat_sum_t
*> l
;
2804 object_stat_sum_t::generate_test_instances(l
);
2805 for (auto p
= l
.begin(); p
!= l
.end(); ++p
) {
2807 o
.push_back(new object_stat_collection_t(a
));
2814 bool pg_stat_t::is_acting_osd(int32_t osd
, bool primary
) const
2816 if (primary
&& osd
== acting_primary
) {
2818 } else if (!primary
) {
2819 for(auto it
= acting
.cbegin(); it
!= acting
.cend(); ++it
)
2828 void pg_stat_t::dump(Formatter
*f
) const
2830 f
->dump_stream("version") << version
;
2831 f
->dump_unsigned("reported_seq", reported_seq
);
2832 f
->dump_unsigned("reported_epoch", reported_epoch
);
2833 f
->dump_string("state", pg_state_string(state
));
2834 f
->dump_stream("last_fresh") << last_fresh
;
2835 f
->dump_stream("last_change") << last_change
;
2836 f
->dump_stream("last_active") << last_active
;
2837 f
->dump_stream("last_peered") << last_peered
;
2838 f
->dump_stream("last_clean") << last_clean
;
2839 f
->dump_stream("last_became_active") << last_became_active
;
2840 f
->dump_stream("last_became_peered") << last_became_peered
;
2841 f
->dump_stream("last_unstale") << last_unstale
;
2842 f
->dump_stream("last_undegraded") << last_undegraded
;
2843 f
->dump_stream("last_fullsized") << last_fullsized
;
2844 f
->dump_unsigned("mapping_epoch", mapping_epoch
);
2845 f
->dump_stream("log_start") << log_start
;
2846 f
->dump_stream("ondisk_log_start") << ondisk_log_start
;
2847 f
->dump_unsigned("created", created
);
2848 f
->dump_unsigned("last_epoch_clean", last_epoch_clean
);
2849 f
->dump_stream("parent") << parent
;
2850 f
->dump_unsigned("parent_split_bits", parent_split_bits
);
2851 f
->dump_stream("last_scrub") << last_scrub
;
2852 f
->dump_stream("last_scrub_stamp") << last_scrub_stamp
;
2853 f
->dump_stream("last_deep_scrub") << last_deep_scrub
;
2854 f
->dump_stream("last_deep_scrub_stamp") << last_deep_scrub_stamp
;
2855 f
->dump_stream("last_clean_scrub_stamp") << last_clean_scrub_stamp
;
2856 f
->dump_int("objects_scrubbed", objects_scrubbed
);
2857 f
->dump_int("log_size", log_size
);
2858 f
->dump_int("ondisk_log_size", ondisk_log_size
);
2859 f
->dump_bool("stats_invalid", stats_invalid
);
2860 f
->dump_bool("dirty_stats_invalid", dirty_stats_invalid
);
2861 f
->dump_bool("omap_stats_invalid", omap_stats_invalid
);
2862 f
->dump_bool("hitset_stats_invalid", hitset_stats_invalid
);
2863 f
->dump_bool("hitset_bytes_stats_invalid", hitset_bytes_stats_invalid
);
2864 f
->dump_bool("pin_stats_invalid", pin_stats_invalid
);
2865 f
->dump_bool("manifest_stats_invalid", manifest_stats_invalid
);
2866 f
->dump_unsigned("snaptrimq_len", snaptrimq_len
);
2867 f
->dump_int("last_scrub_duration", last_scrub_duration
);
2868 f
->dump_string("scrub_schedule", dump_scrub_schedule());
2869 f
->dump_float("scrub_duration", scrub_duration
);
2870 f
->dump_int("objects_trimmed", objects_trimmed
);
2871 f
->dump_float("snaptrim_duration", snaptrim_duration
);
2873 f
->open_array_section("up");
2874 for (auto p
= up
.cbegin(); p
!= up
.cend(); ++p
)
2875 f
->dump_int("osd", *p
);
2877 f
->open_array_section("acting");
2878 for (auto p
= acting
.cbegin(); p
!= acting
.cend(); ++p
)
2879 f
->dump_int("osd", *p
);
2881 f
->open_array_section("avail_no_missing");
2882 for (auto p
= avail_no_missing
.cbegin(); p
!= avail_no_missing
.cend(); ++p
)
2883 f
->dump_stream("shard") << *p
;
2885 f
->open_array_section("object_location_counts");
2886 for (auto p
= object_location_counts
.cbegin(); p
!= object_location_counts
.cend(); ++p
) {
2887 f
->open_object_section("entry");
2888 f
->dump_stream("shards") << p
->first
;
2889 f
->dump_int("objects", p
->second
);
2893 f
->open_array_section("blocked_by");
2894 for (auto p
= blocked_by
.cbegin(); p
!= blocked_by
.cend(); ++p
)
2895 f
->dump_int("osd", *p
);
2897 f
->dump_int("up_primary", up_primary
);
2898 f
->dump_int("acting_primary", acting_primary
);
2899 f
->open_array_section("purged_snaps");
2900 for (auto i
= purged_snaps
.begin(); i
!= purged_snaps
.end(); ++i
) {
2901 f
->open_object_section("interval");
2902 f
->dump_stream("start") << i
.get_start();
2903 f
->dump_stream("length") << i
.get_len();
2909 void pg_stat_t::dump_brief(Formatter
*f
) const
2911 f
->dump_string("state", pg_state_string(state
));
2912 f
->open_array_section("up");
2913 for (auto p
= up
.cbegin(); p
!= up
.cend(); ++p
)
2914 f
->dump_int("osd", *p
);
2916 f
->open_array_section("acting");
2917 for (auto p
= acting
.cbegin(); p
!= acting
.cend(); ++p
)
2918 f
->dump_int("osd", *p
);
2920 f
->dump_int("up_primary", up_primary
);
2921 f
->dump_int("acting_primary", acting_primary
);
2924 std::string
pg_stat_t::dump_scrub_schedule() const
2926 if (scrub_sched_status
.m_is_active
) {
2928 "{}scrubbing for {}s",
2929 ((scrub_sched_status
.m_is_deep
== scrub_level_t::deep
) ? "deep " : ""),
2930 scrub_sched_status
.m_duration_seconds
);
2932 switch (scrub_sched_status
.m_sched_status
) {
2933 case pg_scrub_sched_status_t::unknown
:
2934 // no reported scrub schedule yet
2936 case pg_scrub_sched_status_t::not_queued
:
2937 return "no scrub is scheduled"s
;
2938 case pg_scrub_sched_status_t::scheduled
:
2940 "{} {}scrub scheduled @ {}",
2941 (scrub_sched_status
.m_is_periodic
? "periodic" : "user requested"),
2942 ((scrub_sched_status
.m_is_deep
== scrub_level_t::deep
) ? "deep " : ""),
2943 scrub_sched_status
.m_scheduled_at
);
2944 case pg_scrub_sched_status_t::queued
:
2946 "queued for {}scrub",
2947 ((scrub_sched_status
.m_is_deep
== scrub_level_t::deep
) ? "deep " : ""));
2950 return "SCRUB STATE MISMATCH!"s
;
2954 bool operator==(const pg_scrubbing_status_t
& l
, const pg_scrubbing_status_t
& r
)
2957 l
.m_sched_status
== r
.m_sched_status
&&
2958 l
.m_scheduled_at
== r
.m_scheduled_at
&&
2959 l
.m_duration_seconds
== r
.m_duration_seconds
&&
2960 l
.m_is_active
== r
.m_is_active
&&
2961 l
.m_is_deep
== r
.m_is_deep
&&
2962 l
.m_is_periodic
== r
.m_is_periodic
;
2965 void pg_stat_t::encode(ceph::buffer::list
&bl
) const
2967 ENCODE_START(28, 22, bl
);
2968 encode(version
, bl
);
2969 encode(reported_seq
, bl
);
2970 encode(reported_epoch
, bl
);
2971 encode((__u32
)state
, bl
); // for older peers
2972 encode(log_start
, bl
);
2973 encode(ondisk_log_start
, bl
);
2974 encode(created
, bl
);
2975 encode(last_epoch_clean
, bl
);
2977 encode(parent_split_bits
, bl
);
2978 encode(last_scrub
, bl
);
2979 encode(last_scrub_stamp
, bl
);
2981 encode(log_size
, bl
);
2982 encode(ondisk_log_size
, bl
);
2985 encode(last_fresh
, bl
);
2986 encode(last_change
, bl
);
2987 encode(last_active
, bl
);
2988 encode(last_clean
, bl
);
2989 encode(last_unstale
, bl
);
2990 encode(mapping_epoch
, bl
);
2991 encode(last_deep_scrub
, bl
);
2992 encode(last_deep_scrub_stamp
, bl
);
2993 encode(stats_invalid
, bl
);
2994 encode(last_clean_scrub_stamp
, bl
);
2995 encode(last_became_active
, bl
);
2996 encode(dirty_stats_invalid
, bl
);
2997 encode(up_primary
, bl
);
2998 encode(acting_primary
, bl
);
2999 encode(omap_stats_invalid
, bl
);
3000 encode(hitset_stats_invalid
, bl
);
3001 encode(blocked_by
, bl
);
3002 encode(last_undegraded
, bl
);
3003 encode(last_fullsized
, bl
);
3004 encode(hitset_bytes_stats_invalid
, bl
);
3005 encode(last_peered
, bl
);
3006 encode(last_became_peered
, bl
);
3007 encode(pin_stats_invalid
, bl
);
3008 encode(snaptrimq_len
, bl
);
3009 __u32 top_state
= (state
>> 32);
3010 encode(top_state
, bl
);
3011 encode(purged_snaps
, bl
);
3012 encode(manifest_stats_invalid
, bl
);
3013 encode(avail_no_missing
, bl
);
3014 encode(object_location_counts
, bl
);
3015 encode(last_scrub_duration
, bl
);
3016 encode(scrub_sched_status
.m_scheduled_at
, bl
);
3017 encode(scrub_sched_status
.m_duration_seconds
, bl
);
3018 encode((__u16
)scrub_sched_status
.m_sched_status
, bl
);
3019 encode(scrub_sched_status
.m_is_active
, bl
);
3020 encode((scrub_sched_status
.m_is_deep
==scrub_level_t::deep
), bl
);
3021 encode(scrub_sched_status
.m_is_periodic
, bl
);
3022 encode(objects_scrubbed
, bl
);
3023 encode(scrub_duration
, bl
);
3024 encode(objects_trimmed
, bl
);
3025 encode(snaptrim_duration
, bl
);
3030 void pg_stat_t::decode(ceph::buffer::list::const_iterator
&bl
)
3034 DECODE_START(28, bl
);
3035 decode(version
, bl
);
3036 decode(reported_seq
, bl
);
3037 decode(reported_epoch
, bl
);
3038 decode(old_state
, bl
);
3039 decode(log_start
, bl
);
3040 decode(ondisk_log_start
, bl
);
3041 decode(created
, bl
);
3042 decode(last_epoch_clean
, bl
);
3044 decode(parent_split_bits
, bl
);
3045 decode(last_scrub
, bl
);
3046 decode(last_scrub_stamp
, bl
);
3048 decode(log_size
, bl
);
3049 decode(ondisk_log_size
, bl
);
3052 decode(last_fresh
, bl
);
3053 decode(last_change
, bl
);
3054 decode(last_active
, bl
);
3055 decode(last_clean
, bl
);
3056 decode(last_unstale
, bl
);
3057 decode(mapping_epoch
, bl
);
3058 decode(last_deep_scrub
, bl
);
3059 decode(last_deep_scrub_stamp
, bl
);
3061 stats_invalid
= tmp
;
3062 decode(last_clean_scrub_stamp
, bl
);
3063 decode(last_became_active
, bl
);
3065 dirty_stats_invalid
= tmp
;
3066 decode(up_primary
, bl
);
3067 decode(acting_primary
, bl
);
3069 omap_stats_invalid
= tmp
;
3071 hitset_stats_invalid
= tmp
;
3072 decode(blocked_by
, bl
);
3073 decode(last_undegraded
, bl
);
3074 decode(last_fullsized
, bl
);
3076 hitset_bytes_stats_invalid
= tmp
;
3077 decode(last_peered
, bl
);
3078 decode(last_became_peered
, bl
);
3080 pin_stats_invalid
= tmp
;
3081 if (struct_v
>= 23) {
3082 decode(snaptrimq_len
, bl
);
3083 if (struct_v
>= 24) {
3085 decode(top_state
, bl
);
3086 state
= (uint64_t)old_state
| ((uint64_t)top_state
<< 32);
3087 decode(purged_snaps
, bl
);
3091 if (struct_v
>= 25) {
3093 manifest_stats_invalid
= tmp
;
3095 manifest_stats_invalid
= true;
3097 if (struct_v
>= 26) {
3098 decode(avail_no_missing
, bl
);
3099 decode(object_location_counts
, bl
);
3101 if (struct_v
>= 27) {
3102 decode(last_scrub_duration
, bl
);
3103 decode(scrub_sched_status
.m_scheduled_at
, bl
);
3104 decode(scrub_sched_status
.m_duration_seconds
, bl
);
3105 __u16 scrub_sched_as_u16
;
3106 decode(scrub_sched_as_u16
, bl
);
3107 scrub_sched_status
.m_sched_status
= (pg_scrub_sched_status_t
)(scrub_sched_as_u16
);
3109 scrub_sched_status
.m_is_active
= tmp
;
3111 scrub_sched_status
.m_is_deep
= tmp
? scrub_level_t::deep
: scrub_level_t::shallow
;
3113 scrub_sched_status
.m_is_periodic
= tmp
;
3114 decode(objects_scrubbed
, bl
);
3116 if (struct_v
>= 28) {
3117 decode(scrub_duration
, bl
);
3118 decode(objects_trimmed
, bl
);
3119 decode(snaptrim_duration
, bl
);
3125 void pg_stat_t::generate_test_instances(list
<pg_stat_t
*>& o
)
3128 o
.push_back(new pg_stat_t(a
));
3130 a
.version
= eversion_t(1, 3);
3131 a
.reported_epoch
= 1;
3134 a
.mapping_epoch
= 998;
3135 a
.last_fresh
= utime_t(1002, 1);
3136 a
.last_change
= utime_t(1002, 2);
3137 a
.last_active
= utime_t(1002, 3);
3138 a
.last_clean
= utime_t(1002, 4);
3139 a
.last_unstale
= utime_t(1002, 5);
3140 a
.last_undegraded
= utime_t(1002, 7);
3141 a
.last_fullsized
= utime_t(1002, 8);
3142 a
.log_start
= eversion_t(1, 4);
3143 a
.ondisk_log_start
= eversion_t(1, 5);
3145 a
.last_epoch_clean
= 7;
3146 a
.parent
= pg_t(1, 2);
3147 a
.parent_split_bits
= 12;
3148 a
.last_scrub
= eversion_t(9, 10);
3149 a
.last_scrub_stamp
= utime_t(11, 12);
3150 a
.last_deep_scrub
= eversion_t(13, 14);
3151 a
.last_deep_scrub_stamp
= utime_t(15, 16);
3152 a
.last_clean_scrub_stamp
= utime_t(17, 18);
3153 a
.last_scrub_duration
= 3617;
3154 a
.scrub_duration
= 0.003;
3155 a
.snaptrimq_len
= 1048576;
3156 a
.objects_scrubbed
= 0;
3157 a
.objects_trimmed
= 0;
3158 a
.snaptrim_duration
= 0.123;
3159 list
<object_stat_collection_t
*> l
;
3160 object_stat_collection_t::generate_test_instances(l
);
3161 a
.stats
= *l
.back();
3163 a
.ondisk_log_size
= 88;
3164 a
.up
.push_back(123);
3166 a
.acting
.push_back(456);
3167 a
.avail_no_missing
.push_back(pg_shard_t(456, shard_id_t::NO_SHARD
));
3168 set
<pg_shard_t
> sset
= { pg_shard_t(0), pg_shard_t(1) };
3169 a
.object_location_counts
.insert(make_pair(sset
, 10));
3170 sset
.insert(pg_shard_t(2));
3171 a
.object_location_counts
.insert(make_pair(sset
, 5));
3172 a
.acting_primary
= 456;
3173 o
.push_back(new pg_stat_t(a
));
3175 a
.up
.push_back(124);
3177 a
.acting
.push_back(124);
3178 a
.acting_primary
= 124;
3179 a
.blocked_by
.push_back(155);
3180 a
.blocked_by
.push_back(156);
3181 o
.push_back(new pg_stat_t(a
));
3184 bool operator==(const pg_stat_t
& l
, const pg_stat_t
& r
)
3187 l
.version
== r
.version
&&
3188 l
.reported_seq
== r
.reported_seq
&&
3189 l
.reported_epoch
== r
.reported_epoch
&&
3190 l
.state
== r
.state
&&
3191 l
.last_fresh
== r
.last_fresh
&&
3192 l
.last_change
== r
.last_change
&&
3193 l
.last_active
== r
.last_active
&&
3194 l
.last_peered
== r
.last_peered
&&
3195 l
.last_clean
== r
.last_clean
&&
3196 l
.last_unstale
== r
.last_unstale
&&
3197 l
.last_undegraded
== r
.last_undegraded
&&
3198 l
.last_fullsized
== r
.last_fullsized
&&
3199 l
.log_start
== r
.log_start
&&
3200 l
.ondisk_log_start
== r
.ondisk_log_start
&&
3201 l
.created
== r
.created
&&
3202 l
.last_epoch_clean
== r
.last_epoch_clean
&&
3203 l
.parent
== r
.parent
&&
3204 l
.parent_split_bits
== r
.parent_split_bits
&&
3205 l
.last_scrub
== r
.last_scrub
&&
3206 l
.last_deep_scrub
== r
.last_deep_scrub
&&
3207 l
.last_scrub_stamp
== r
.last_scrub_stamp
&&
3208 l
.last_deep_scrub_stamp
== r
.last_deep_scrub_stamp
&&
3209 l
.last_clean_scrub_stamp
== r
.last_clean_scrub_stamp
&&
3210 l
.stats
== r
.stats
&&
3211 l
.stats_invalid
== r
.stats_invalid
&&
3212 l
.log_size
== r
.log_size
&&
3213 l
.ondisk_log_size
== r
.ondisk_log_size
&&
3215 l
.acting
== r
.acting
&&
3216 l
.avail_no_missing
== r
.avail_no_missing
&&
3217 l
.object_location_counts
== r
.object_location_counts
&&
3218 l
.mapping_epoch
== r
.mapping_epoch
&&
3219 l
.blocked_by
== r
.blocked_by
&&
3220 l
.last_became_active
== r
.last_became_active
&&
3221 l
.last_became_peered
== r
.last_became_peered
&&
3222 l
.dirty_stats_invalid
== r
.dirty_stats_invalid
&&
3223 l
.omap_stats_invalid
== r
.omap_stats_invalid
&&
3224 l
.hitset_stats_invalid
== r
.hitset_stats_invalid
&&
3225 l
.hitset_bytes_stats_invalid
== r
.hitset_bytes_stats_invalid
&&
3226 l
.up_primary
== r
.up_primary
&&
3227 l
.acting_primary
== r
.acting_primary
&&
3228 l
.pin_stats_invalid
== r
.pin_stats_invalid
&&
3229 l
.manifest_stats_invalid
== r
.manifest_stats_invalid
&&
3230 l
.purged_snaps
== r
.purged_snaps
&&
3231 l
.snaptrimq_len
== r
.snaptrimq_len
&&
3232 l
.last_scrub_duration
== r
.last_scrub_duration
&&
3233 l
.scrub_sched_status
== r
.scrub_sched_status
&&
3234 l
.objects_scrubbed
== r
.objects_scrubbed
&&
3235 l
.scrub_duration
== r
.scrub_duration
&&
3236 l
.objects_trimmed
== r
.objects_trimmed
&&
3237 l
.snaptrim_duration
== r
.snaptrim_duration
;
3240 // -- store_statfs_t --
3242 bool store_statfs_t::operator==(const store_statfs_t
& other
) const
3244 return total
== other
.total
3245 && available
== other
.available
3246 && allocated
== other
.allocated
3247 && internally_reserved
== other
.internally_reserved
3248 && data_stored
== other
.data_stored
3249 && data_compressed
== other
.data_compressed
3250 && data_compressed_allocated
== other
.data_compressed_allocated
3251 && data_compressed_original
== other
.data_compressed_original
3252 && omap_allocated
== other
.omap_allocated
3253 && internal_metadata
== other
.internal_metadata
;
3256 void store_statfs_t::dump(Formatter
*f
) const
3258 f
->dump_int("total", total
);
3259 f
->dump_int("available", available
);
3260 f
->dump_int("internally_reserved", internally_reserved
);
3261 f
->dump_int("allocated", allocated
);
3262 f
->dump_int("data_stored", data_stored
);
3263 f
->dump_int("data_compressed", data_compressed
);
3264 f
->dump_int("data_compressed_allocated", data_compressed_allocated
);
3265 f
->dump_int("data_compressed_original", data_compressed_original
);
3266 f
->dump_int("omap_allocated", omap_allocated
);
3267 f
->dump_int("internal_metadata", internal_metadata
);
3270 ostream
& operator<<(ostream
& out
, const store_statfs_t
&s
)
3273 << "store_statfs(0x" << s
.available
3274 << "/0x" << s
.internally_reserved
3276 << ", data 0x" << s
.data_stored
3277 << "/0x" << s
.allocated
3278 << ", compress 0x" << s
.data_compressed
3279 << "/0x" << s
.data_compressed_allocated
3280 << "/0x" << s
.data_compressed_original
3281 << ", omap 0x" << s
.omap_allocated
3282 << ", meta 0x" << s
.internal_metadata
3288 void store_statfs_t::generate_test_instances(list
<store_statfs_t
*>& o
)
3291 o
.push_back(new store_statfs_t(a
));
3294 a
.internally_reserved
= 33;
3297 a
.data_compressed
= 21;
3298 a
.data_compressed_allocated
= 12;
3299 a
.data_compressed_original
= 13;
3300 a
.omap_allocated
= 14;
3301 a
.internal_metadata
= 15;
3302 o
.push_back(new store_statfs_t(a
));
3305 // -- pool_stat_t --
3307 void pool_stat_t::dump(Formatter
*f
) const
3310 f
->open_object_section("store_stats");
3311 store_stats
.dump(f
);
3313 f
->dump_int("log_size", log_size
);
3314 f
->dump_int("ondisk_log_size", ondisk_log_size
);
3315 f
->dump_int("up", up
);
3316 f
->dump_int("acting", acting
);
3317 f
->dump_int("num_store_stats", num_store_stats
);
3320 void pool_stat_t::encode(ceph::buffer::list
&bl
, uint64_t features
) const
3323 if ((features
& CEPH_FEATURE_OSDENC
) == 0) {
3327 encode(log_size
, bl
);
3328 encode(ondisk_log_size
, bl
);
3332 ENCODE_START(7, 5, bl
);
3334 encode(log_size
, bl
);
3335 encode(ondisk_log_size
, bl
);
3338 encode(store_stats
, bl
);
3339 encode(num_store_stats
, bl
);
3343 void pool_stat_t::decode(ceph::buffer::list::const_iterator
&bl
)
3345 DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl
);
3346 if (struct_v
>= 4) {
3348 decode(log_size
, bl
);
3349 decode(ondisk_log_size
, bl
);
3350 if (struct_v
>= 6) {
3357 if (struct_v
>= 7) {
3358 decode(store_stats
, bl
);
3359 decode(num_store_stats
, bl
);
3361 store_stats
.reset();
3362 num_store_stats
= 0;
3366 decode(stats
.sum
.num_bytes
, bl
);
3369 decode(stats
.sum
.num_objects
, bl
);
3370 decode(stats
.sum
.num_object_clones
, bl
);
3371 decode(stats
.sum
.num_object_copies
, bl
);
3372 decode(stats
.sum
.num_objects_missing_on_primary
, bl
);
3373 decode(stats
.sum
.num_objects_degraded
, bl
);
3374 decode(log_size
, bl
);
3375 decode(ondisk_log_size
, bl
);
3376 if (struct_v
>= 2) {
3377 decode(stats
.sum
.num_rd
, bl
);
3378 decode(stats
.sum
.num_rd_kb
, bl
);
3379 decode(stats
.sum
.num_wr
, bl
);
3380 decode(stats
.sum
.num_wr_kb
, bl
);
3382 if (struct_v
>= 3) {
3383 decode(stats
.sum
.num_objects_unfound
, bl
);
3389 void pool_stat_t::generate_test_instances(list
<pool_stat_t
*>& o
)
3392 o
.push_back(new pool_stat_t(a
));
3394 list
<object_stat_collection_t
*> l
;
3395 object_stat_collection_t::generate_test_instances(l
);
3396 list
<store_statfs_t
*> ll
;
3397 store_statfs_t::generate_test_instances(ll
);
3398 a
.stats
= *l
.back();
3399 a
.store_stats
= *ll
.back();
3401 a
.ondisk_log_size
= 456;
3404 a
.num_store_stats
= 1;
3405 o
.push_back(new pool_stat_t(a
));
3409 // -- pg_history_t --
3411 void pg_history_t::encode(ceph::buffer::list
&bl
) const
3413 ENCODE_START(10, 4, bl
);
3414 encode(epoch_created
, bl
);
3415 encode(last_epoch_started
, bl
);
3416 encode(last_epoch_clean
, bl
);
3417 encode(last_epoch_split
, bl
);
3418 encode(same_interval_since
, bl
);
3419 encode(same_up_since
, bl
);
3420 encode(same_primary_since
, bl
);
3421 encode(last_scrub
, bl
);
3422 encode(last_scrub_stamp
, bl
);
3423 encode(last_deep_scrub
, bl
);
3424 encode(last_deep_scrub_stamp
, bl
);
3425 encode(last_clean_scrub_stamp
, bl
);
3426 encode(last_epoch_marked_full
, bl
);
3427 encode(last_interval_started
, bl
);
3428 encode(last_interval_clean
, bl
);
3429 encode(epoch_pool_created
, bl
);
3430 encode(prior_readable_until_ub
, bl
);
3434 void pg_history_t::decode(ceph::buffer::list::const_iterator
&bl
)
3436 DECODE_START_LEGACY_COMPAT_LEN(10, 4, 4, bl
);
3437 decode(epoch_created
, bl
);
3438 decode(last_epoch_started
, bl
);
3440 decode(last_epoch_clean
, bl
);
3442 last_epoch_clean
= last_epoch_started
; // careful, it's a lie!
3443 decode(last_epoch_split
, bl
);
3444 decode(same_interval_since
, bl
);
3445 decode(same_up_since
, bl
);
3446 decode(same_primary_since
, bl
);
3447 if (struct_v
>= 2) {
3448 decode(last_scrub
, bl
);
3449 decode(last_scrub_stamp
, bl
);
3451 if (struct_v
>= 5) {
3452 decode(last_deep_scrub
, bl
);
3453 decode(last_deep_scrub_stamp
, bl
);
3455 if (struct_v
>= 6) {
3456 decode(last_clean_scrub_stamp
, bl
);
3458 if (struct_v
>= 7) {
3459 decode(last_epoch_marked_full
, bl
);
3461 if (struct_v
>= 8) {
3462 decode(last_interval_started
, bl
);
3463 decode(last_interval_clean
, bl
);
3465 if (last_epoch_started
>= same_interval_since
) {
3466 last_interval_started
= same_interval_since
;
3468 last_interval_started
= last_epoch_started
; // best guess
3470 if (last_epoch_clean
>= same_interval_since
) {
3471 last_interval_clean
= same_interval_since
;
3473 last_interval_clean
= last_epoch_clean
; // best guess
3476 if (struct_v
>= 9) {
3477 decode(epoch_pool_created
, bl
);
3479 epoch_pool_created
= epoch_created
;
3481 if (struct_v
>= 10) {
3482 decode(prior_readable_until_ub
, bl
);
3487 void pg_history_t::dump(Formatter
*f
) const
3489 f
->dump_int("epoch_created", epoch_created
);
3490 f
->dump_int("epoch_pool_created", epoch_pool_created
);
3491 f
->dump_int("last_epoch_started", last_epoch_started
);
3492 f
->dump_int("last_interval_started", last_interval_started
);
3493 f
->dump_int("last_epoch_clean", last_epoch_clean
);
3494 f
->dump_int("last_interval_clean", last_interval_clean
);
3495 f
->dump_int("last_epoch_split", last_epoch_split
);
3496 f
->dump_int("last_epoch_marked_full", last_epoch_marked_full
);
3497 f
->dump_int("same_up_since", same_up_since
);
3498 f
->dump_int("same_interval_since", same_interval_since
);
3499 f
->dump_int("same_primary_since", same_primary_since
);
3500 f
->dump_stream("last_scrub") << last_scrub
;
3501 f
->dump_stream("last_scrub_stamp") << last_scrub_stamp
;
3502 f
->dump_stream("last_deep_scrub") << last_deep_scrub
;
3503 f
->dump_stream("last_deep_scrub_stamp") << last_deep_scrub_stamp
;
3504 f
->dump_stream("last_clean_scrub_stamp") << last_clean_scrub_stamp
;
3506 "prior_readable_until_ub",
3507 std::chrono::duration
<double>(prior_readable_until_ub
).count());
3510 void pg_history_t::generate_test_instances(list
<pg_history_t
*>& o
)
3512 o
.push_back(new pg_history_t
);
3513 o
.push_back(new pg_history_t
);
3514 o
.back()->epoch_created
= 1;
3515 o
.back()->epoch_pool_created
= 1;
3516 o
.back()->last_epoch_started
= 2;
3517 o
.back()->last_interval_started
= 2;
3518 o
.back()->last_epoch_clean
= 3;
3519 o
.back()->last_interval_clean
= 2;
3520 o
.back()->last_epoch_split
= 4;
3521 o
.back()->prior_readable_until_ub
= make_timespan(3.1415);
3522 o
.back()->same_up_since
= 5;
3523 o
.back()->same_interval_since
= 6;
3524 o
.back()->same_primary_since
= 7;
3525 o
.back()->last_scrub
= eversion_t(8, 9);
3526 o
.back()->last_scrub_stamp
= utime_t(10, 11);
3527 o
.back()->last_deep_scrub
= eversion_t(12, 13);
3528 o
.back()->last_deep_scrub_stamp
= utime_t(14, 15);
3529 o
.back()->last_clean_scrub_stamp
= utime_t(16, 17);
3530 o
.back()->last_epoch_marked_full
= 18;
3536 void pg_info_t::encode(ceph::buffer::list
&bl
) const
3538 ENCODE_START(32, 26, bl
);
3539 encode(pgid
.pgid
, bl
);
3540 encode(last_update
, bl
);
3541 encode(last_complete
, bl
);
3542 encode(log_tail
, bl
);
3543 encode(hobject_t(), bl
); // old (nibblewise) last_backfill
3546 encode(purged_snaps
, bl
);
3547 encode(last_epoch_started
, bl
);
3548 encode(last_user_version
, bl
);
3549 encode(hit_set
, bl
);
3550 encode(pgid
.shard
, bl
);
3551 encode(last_backfill
, bl
);
3552 encode(true, bl
); // was last_backfill_bitwise
3553 encode(last_interval_started
, bl
);
3557 void pg_info_t::decode(ceph::buffer::list::const_iterator
&bl
)
3559 DECODE_START(32, bl
);
3560 decode(pgid
.pgid
, bl
);
3561 decode(last_update
, bl
);
3562 decode(last_complete
, bl
);
3563 decode(log_tail
, bl
);
3565 hobject_t old_last_backfill
;
3566 decode(old_last_backfill
, bl
);
3570 decode(purged_snaps
, bl
);
3571 decode(last_epoch_started
, bl
);
3572 decode(last_user_version
, bl
);
3573 decode(hit_set
, bl
);
3574 decode(pgid
.shard
, bl
);
3575 decode(last_backfill
, bl
);
3577 bool last_backfill_bitwise
;
3578 decode(last_backfill_bitwise
, bl
);
3579 // note: we may see a false value here since the default value for
3580 // the member was false, so it often didn't get set to true until
3581 // peering progressed.
3583 if (struct_v
>= 32) {
3584 decode(last_interval_started
, bl
);
3586 last_interval_started
= last_epoch_started
;
3593 void pg_info_t::dump(Formatter
*f
) const
3595 f
->dump_stream("pgid") << pgid
;
3596 f
->dump_stream("last_update") << last_update
;
3597 f
->dump_stream("last_complete") << last_complete
;
3598 f
->dump_stream("log_tail") << log_tail
;
3599 f
->dump_int("last_user_version", last_user_version
);
3600 f
->dump_stream("last_backfill") << last_backfill
;
3601 f
->open_array_section("purged_snaps");
3602 for (interval_set
<snapid_t
>::const_iterator i
=purged_snaps
.begin();
3603 i
!= purged_snaps
.end();
3605 f
->open_object_section("purged_snap_interval");
3606 f
->dump_stream("start") << i
.get_start();
3607 f
->dump_stream("length") << i
.get_len();
3611 f
->open_object_section("history");
3614 f
->open_object_section("stats");
3618 f
->dump_int("empty", is_empty());
3619 f
->dump_int("dne", dne());
3620 f
->dump_int("incomplete", is_incomplete());
3621 f
->dump_int("last_epoch_started", last_epoch_started
);
3623 f
->open_object_section("hit_set_history");
3628 void pg_info_t::generate_test_instances(list
<pg_info_t
*>& o
)
3630 o
.push_back(new pg_info_t
);
3631 o
.push_back(new pg_info_t
);
3632 list
<pg_history_t
*> h
;
3633 pg_history_t::generate_test_instances(h
);
3634 o
.back()->history
= *h
.back();
3635 o
.back()->pgid
= spg_t(pg_t(1, 2), shard_id_t::NO_SHARD
);
3636 o
.back()->last_update
= eversion_t(3, 4);
3637 o
.back()->last_complete
= eversion_t(5, 6);
3638 o
.back()->last_user_version
= 2;
3639 o
.back()->log_tail
= eversion_t(7, 8);
3640 o
.back()->last_backfill
= hobject_t(object_t("objname"), "key", 123, 456, -1, "");
3643 pg_stat_t::generate_test_instances(s
);
3644 o
.back()->stats
= *s
.back();
3647 list
<pg_hit_set_history_t
*> s
;
3648 pg_hit_set_history_t::generate_test_instances(s
);
3649 o
.back()->hit_set
= *s
.back();
3653 // -- pg_notify_t --
3654 void pg_notify_t::encode(ceph::buffer::list
&bl
) const
3656 ENCODE_START(3, 2, bl
);
3657 encode(query_epoch
, bl
);
3658 encode(epoch_sent
, bl
);
3662 encode(past_intervals
, bl
);
3666 void pg_notify_t::decode(ceph::buffer::list::const_iterator
&bl
)
3668 DECODE_START(3, bl
);
3669 decode(query_epoch
, bl
);
3670 decode(epoch_sent
, bl
);
3674 if (struct_v
>= 3) {
3675 decode(past_intervals
, bl
);
3680 void pg_notify_t::dump(Formatter
*f
) const
3682 f
->dump_int("from", from
);
3683 f
->dump_int("to", to
);
3684 f
->dump_unsigned("query_epoch", query_epoch
);
3685 f
->dump_unsigned("epoch_sent", epoch_sent
);
3687 f
->open_object_section("info");
3691 f
->dump_object("past_intervals", past_intervals
);
3694 void pg_notify_t::generate_test_instances(list
<pg_notify_t
*>& o
)
3696 o
.push_back(new pg_notify_t(shard_id_t(3), shard_id_t::NO_SHARD
, 1, 1,
3697 pg_info_t(), PastIntervals()));
3698 o
.push_back(new pg_notify_t(shard_id_t(0), shard_id_t(0), 3, 10,
3699 pg_info_t(), PastIntervals()));
3702 ostream
&operator<<(ostream
&lhs
, const pg_notify_t
¬ify
)
3704 lhs
<< "(query:" << notify
.query_epoch
3705 << " sent:" << notify
.epoch_sent
3706 << " " << notify
.info
;
3707 if (notify
.from
!= shard_id_t::NO_SHARD
||
3708 notify
.to
!= shard_id_t::NO_SHARD
)
3709 lhs
<< " " << (unsigned)notify
.from
3710 << "->" << (unsigned)notify
.to
;
3711 lhs
<< " " << notify
.past_intervals
;
3715 // -- pg_interval_t --
3717 void PastIntervals::pg_interval_t::encode(ceph::buffer::list
& bl
) const
3719 ENCODE_START(4, 2, bl
);
3724 encode(maybe_went_rw
, bl
);
3725 encode(primary
, bl
);
3726 encode(up_primary
, bl
);
3730 void PastIntervals::pg_interval_t::decode(ceph::buffer::list::const_iterator
& bl
)
3732 DECODE_START_LEGACY_COMPAT_LEN(4, 2, 2, bl
);
3737 decode(maybe_went_rw
, bl
);
3738 if (struct_v
>= 3) {
3739 decode(primary
, bl
);
3742 primary
= acting
[0];
3744 if (struct_v
>= 4) {
3745 decode(up_primary
, bl
);
3753 void PastIntervals::pg_interval_t::dump(Formatter
*f
) const
3755 f
->dump_unsigned("first", first
);
3756 f
->dump_unsigned("last", last
);
3757 f
->dump_int("maybe_went_rw", maybe_went_rw
? 1 : 0);
3758 f
->open_array_section("up");
3759 for (auto p
= up
.cbegin(); p
!= up
.cend(); ++p
)
3760 f
->dump_int("osd", *p
);
3762 f
->open_array_section("acting");
3763 for (auto p
= acting
.cbegin(); p
!= acting
.cend(); ++p
)
3764 f
->dump_int("osd", *p
);
3766 f
->dump_int("primary", primary
);
3767 f
->dump_int("up_primary", up_primary
);
3770 void PastIntervals::pg_interval_t::generate_test_instances(list
<pg_interval_t
*>& o
)
3772 o
.push_back(new pg_interval_t
);
3773 o
.push_back(new pg_interval_t
);
3774 o
.back()->up
.push_back(1);
3775 o
.back()->acting
.push_back(2);
3776 o
.back()->acting
.push_back(3);
3777 o
.back()->first
= 4;
3779 o
.back()->maybe_went_rw
= true;
3782 WRITE_CLASS_ENCODER(PastIntervals::pg_interval_t
)
3788 * PastIntervals only needs to be able to answer two questions:
3789 * 1) Where should the primary look for unfound objects?
3790 * 2) List a set of subsets of the OSDs such that contacting at least
3791 * one from each subset guarantees we speak to at least one witness
3792 * of any completed write.
3794 * Crucially, 2) does not require keeping *all* past intervals. Certainly,
3795 * we don't need to keep any where maybe_went_rw would be false. We also
3796 * needn't keep two intervals where the actingset in one is a subset
3797 * of the other (only need to keep the smaller of the two sets). In order
3798 * to accurately trim the set of intervals as last_epoch_started changes
3799 * without rebuilding the set from scratch, we'll retain the larger set
3800 * if it in an older interval.
3802 struct compact_interval_t
{
3805 set
<pg_shard_t
> acting
;
3806 bool supersedes(const compact_interval_t
&other
) {
3807 for (auto &&i
: acting
) {
3808 if (!other
.acting
.count(i
))
3813 void dump(Formatter
*f
) const {
3814 f
->open_object_section("compact_interval_t");
3815 f
->dump_stream("first") << first
;
3816 f
->dump_stream("last") << last
;
3817 f
->dump_stream("acting") << acting
;
3820 void encode(ceph::buffer::list
&bl
) const {
3821 ENCODE_START(1, 1, bl
);
3827 void decode(ceph::buffer::list::const_iterator
&bl
) {
3828 DECODE_START(1, bl
);
3834 static void generate_test_instances(list
<compact_interval_t
*> & o
) {
3835 /* Not going to be used, we'll generate pi_compact_rep directly */
3838 ostream
&operator<<(ostream
&o
, const compact_interval_t
&rhs
)
3840 return o
<< "([" << rhs
.first
<< "," << rhs
.last
3841 << "] acting " << rhs
.acting
<< ")";
3843 WRITE_CLASS_ENCODER(compact_interval_t
)
3845 class pi_compact_rep
: public PastIntervals::interval_rep
{
3847 epoch_t last
= 0; // inclusive
3848 set
<pg_shard_t
> all_participants
;
3849 list
<compact_interval_t
> intervals
;
3852 std::list
<PastIntervals::pg_interval_t
> &&intervals
) {
3853 for (auto &&i
: intervals
)
3854 add_interval(ec_pool
, i
);
3857 pi_compact_rep() = default;
3858 pi_compact_rep(const pi_compact_rep
&) = default;
3859 pi_compact_rep(pi_compact_rep
&&) = default;
3860 pi_compact_rep
&operator=(const pi_compact_rep
&) = default;
3861 pi_compact_rep
&operator=(pi_compact_rep
&&) = default;
3863 size_t size() const override
{ return intervals
.size(); }
3864 bool empty() const override
{
3865 return first
> last
|| (first
== 0 && last
== 0);
3867 void clear() override
{
3868 *this = pi_compact_rep();
3870 pair
<epoch_t
, epoch_t
> get_bounds() const override
{
3871 return make_pair(first
, last
+ 1);
3873 void adjust_start_backwards(epoch_t last_epoch_clean
) override
{
3874 first
= last_epoch_clean
;
3877 set
<pg_shard_t
> get_all_participants(
3878 bool ec_pool
) const override
{
3879 return all_participants
;
3882 bool ec_pool
, const PastIntervals::pg_interval_t
&interval
) override
{
3884 first
= interval
.first
;
3885 ceph_assert(interval
.last
> last
);
3886 last
= interval
.last
;
3887 set
<pg_shard_t
> acting
;
3888 for (unsigned i
= 0; i
< interval
.acting
.size(); ++i
) {
3889 if (interval
.acting
[i
] == CRUSH_ITEM_NONE
)
3894 ec_pool
? shard_id_t(i
) : shard_id_t::NO_SHARD
));
3896 all_participants
.insert(acting
.begin(), acting
.end());
3897 if (!interval
.maybe_went_rw
)
3899 intervals
.push_back(
3900 compact_interval_t
{interval
.first
, interval
.last
, acting
});
3901 auto plast
= intervals
.end();
3903 for (auto cur
= intervals
.begin(); cur
!= plast
; ) {
3904 if (plast
->supersedes(*cur
)) {
3905 intervals
.erase(cur
++);
3911 unique_ptr
<PastIntervals::interval_rep
> clone() const override
{
3912 return unique_ptr
<PastIntervals::interval_rep
>(new pi_compact_rep(*this));
3914 ostream
&print(ostream
&out
) const override
{
3915 return out
<< "([" << first
<< "," << last
3916 << "] all_participants=" << all_participants
3917 << " intervals=" << intervals
<< ")";
3919 void encode(ceph::buffer::list
&bl
) const override
{
3920 ENCODE_START(1, 1, bl
);
3923 encode(all_participants
, bl
);
3924 encode(intervals
, bl
);
3927 void decode(ceph::buffer::list::const_iterator
&bl
) override
{
3928 DECODE_START(1, bl
);
3931 decode(all_participants
, bl
);
3932 decode(intervals
, bl
);
3935 void dump(Formatter
*f
) const override
{
3936 f
->open_object_section("PastIntervals::compact_rep");
3937 f
->dump_stream("first") << first
;
3938 f
->dump_stream("last") << last
;
3939 f
->open_array_section("all_participants");
3940 for (auto& i
: all_participants
) {
3941 f
->dump_object("pg_shard", i
);
3944 f
->open_array_section("intervals");
3945 for (auto &&i
: intervals
) {
3951 static void generate_test_instances(list
<pi_compact_rep
*> &o
) {
3952 using ival
= PastIntervals::pg_interval_t
;
3953 using ivallst
= std::list
<ival
>;
3957 { ival
{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
3958 , ival
{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1}
3959 , ival
{{ 2}, { 2}, 31, 35, false, 2, 2}
3960 , ival
{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
3965 { ival
{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
3966 , ival
{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1}
3967 , ival
{{ 2}, { 2}, 31, 35, false, 2, 2}
3968 , ival
{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
3973 { ival
{{2, 1, 0}, {2, 1, 0}, 10, 20, true, 1, 1}
3974 , ival
{{ 0, 2}, { 0, 2}, 21, 30, true, 0, 0}
3975 , ival
{{ 0, 2}, {2, 0}, 31, 35, true, 2, 2}
3976 , ival
{{ 0, 2}, { 0, 2}, 36, 50, true, 0, 0}
3979 void iterate_mayberw_back_to(
3981 std::function
<void(epoch_t
, const set
<pg_shard_t
> &)> &&f
) const override
{
3982 for (auto i
= intervals
.rbegin(); i
!= intervals
.rend(); ++i
) {
3985 f(i
->first
, i
->acting
);
3988 virtual ~pi_compact_rep() override
{}
3990 WRITE_CLASS_ENCODER(pi_compact_rep
)
3992 PastIntervals::PastIntervals()
3994 past_intervals
.reset(new pi_compact_rep
);
3997 PastIntervals::PastIntervals(const PastIntervals
&rhs
)
3998 : past_intervals(rhs
.past_intervals
?
3999 rhs
.past_intervals
->clone() :
4002 PastIntervals
&PastIntervals::operator=(const PastIntervals
&rhs
)
4004 PastIntervals
other(rhs
);
4009 ostream
& operator<<(ostream
& out
, const PastIntervals
&i
)
4011 if (i
.past_intervals
) {
4012 return i
.past_intervals
->print(out
);
4014 return out
<< "(empty)";
4018 ostream
& operator<<(ostream
& out
, const PastIntervals::PriorSet
&i
)
4020 return out
<< "PriorSet("
4021 << "ec_pool: " << i
.ec_pool
4022 << ", probe: " << i
.probe
4023 << ", down: " << i
.down
4024 << ", blocked_by: " << i
.blocked_by
4025 << ", pg_down: " << i
.pg_down
4029 void PastIntervals::decode(ceph::buffer::list::const_iterator
&bl
)
4031 DECODE_START(1, bl
);
4038 ceph_abort_msg("pi_simple_rep support removed post-luminous");
4041 past_intervals
.reset(new pi_compact_rep
);
4042 past_intervals
->decode(bl
);
4048 void PastIntervals::generate_test_instances(list
<PastIntervals
*> &o
)
4051 list
<pi_compact_rep
*> compact
;
4052 pi_compact_rep::generate_test_instances(compact
);
4053 for (auto &&i
: compact
) {
4054 // takes ownership of contents
4055 o
.push_back(new PastIntervals(i
));
4061 bool PastIntervals::is_new_interval(
4062 int old_acting_primary
,
4063 int new_acting_primary
,
4064 const vector
<int> &old_acting
,
4065 const vector
<int> &new_acting
,
4068 const vector
<int> &old_up
,
4069 const vector
<int> &new_up
,
4074 unsigned old_pg_num
,
4075 unsigned new_pg_num
,
4076 unsigned old_pg_num_pending
,
4077 unsigned new_pg_num_pending
,
4078 bool old_sort_bitwise
,
4079 bool new_sort_bitwise
,
4080 bool old_recovery_deletes
,
4081 bool new_recovery_deletes
,
4082 uint32_t old_crush_count
,
4083 uint32_t new_crush_count
,
4084 uint32_t old_crush_target
,
4085 uint32_t new_crush_target
,
4086 uint32_t old_crush_barrier
,
4087 uint32_t new_crush_barrier
,
4088 int32_t old_crush_member
,
4089 int32_t new_crush_member
,
4091 return old_acting_primary
!= new_acting_primary
||
4092 new_acting
!= old_acting
||
4093 old_up_primary
!= new_up_primary
||
4095 old_min_size
!= new_min_size
||
4096 old_size
!= new_size
||
4097 pgid
.is_split(old_pg_num
, new_pg_num
, 0) ||
4098 // (is or was) pre-merge source
4099 pgid
.is_merge_source(old_pg_num_pending
, new_pg_num_pending
, 0) ||
4100 pgid
.is_merge_source(new_pg_num_pending
, old_pg_num_pending
, 0) ||
4102 pgid
.is_merge_source(old_pg_num
, new_pg_num
, 0) ||
4103 // (is or was) pre-merge target
4104 pgid
.is_merge_target(old_pg_num_pending
, new_pg_num_pending
) ||
4105 pgid
.is_merge_target(new_pg_num_pending
, old_pg_num_pending
) ||
4107 pgid
.is_merge_target(old_pg_num
, new_pg_num
) ||
4108 old_sort_bitwise
!= new_sort_bitwise
||
4109 old_recovery_deletes
!= new_recovery_deletes
||
4110 old_crush_count
!= new_crush_count
||
4111 old_crush_target
!= new_crush_target
||
4112 old_crush_barrier
!= new_crush_barrier
||
4113 old_crush_member
!= new_crush_member
;
4116 bool PastIntervals::is_new_interval(
4117 int old_acting_primary
,
4118 int new_acting_primary
,
4119 const vector
<int> &old_acting
,
4120 const vector
<int> &new_acting
,
4123 const vector
<int> &old_up
,
4124 const vector
<int> &new_up
,
4125 const OSDMap
*osdmap
,
4126 const OSDMap
*lastmap
,
4129 const pg_pool_t
*plast
= lastmap
->get_pg_pool(pgid
.pool());
4131 return false; // after pool is deleted there are no more interval changes
4133 const pg_pool_t
*pi
= osdmap
->get_pg_pool(pgid
.pool());
4135 return true; // pool was deleted this epoch -> (final!) interval change
4138 is_new_interval(old_acting_primary
,
4150 plast
->get_pg_num(),
4152 plast
->get_pg_num_pending(),
4153 pi
->get_pg_num_pending(),
4154 lastmap
->test_flag(CEPH_OSDMAP_SORTBITWISE
),
4155 osdmap
->test_flag(CEPH_OSDMAP_SORTBITWISE
),
4156 lastmap
->test_flag(CEPH_OSDMAP_RECOVERY_DELETES
),
4157 osdmap
->test_flag(CEPH_OSDMAP_RECOVERY_DELETES
),
4158 plast
->peering_crush_bucket_count
, pi
->peering_crush_bucket_count
,
4159 plast
->peering_crush_bucket_target
, pi
->peering_crush_bucket_target
,
4160 plast
->peering_crush_bucket_barrier
, pi
->peering_crush_bucket_barrier
,
4161 plast
->peering_crush_mandatory_member
, pi
->peering_crush_mandatory_member
,
4165 bool PastIntervals::check_new_interval(
4166 int old_acting_primary
,
4167 int new_acting_primary
,
4168 const vector
<int> &old_acting
,
4169 const vector
<int> &new_acting
,
4172 const vector
<int> &old_up
,
4173 const vector
<int> &new_up
,
4174 epoch_t same_interval_since
,
4175 epoch_t last_epoch_clean
,
4176 const OSDMap
*osdmap
,
4177 const OSDMap
*lastmap
,
4179 const IsPGRecoverablePredicate
&could_have_gone_active
,
4180 PastIntervals
*past_intervals
,
4184 * We have to be careful to gracefully deal with situations like
4185 * so. Say we have a power outage or something that takes out both
4186 * OSDs, but the monitor doesn't mark them down in the same epoch.
4187 * The history may look like
4191 * 3: let's say B dies for good, too (say, from the power spike)
4194 * which makes it look like B may have applied updates to the PG
4195 * that we need in order to proceed. This sucks...
4197 * To minimize the risk of this happening, we CANNOT go active if
4198 * _any_ OSDs in the prior set are down until we send an MOSDAlive
4199 * to the monitor such that the OSDMap sets osd_up_thru to an epoch.
4200 * Then, we have something like
4207 * -> we can ignore B, bc it couldn't have gone active (up_thru still 0).
4217 * -> we must wait for B, bc it was alive through 2, and could have
4218 * written to the pg.
4220 * If B is really dead, then an administrator will need to manually
4221 * intervene by marking the OSD as "lost."
4224 // remember past interval
4225 // NOTE: a change in the up set primary triggers an interval
4226 // change, even though the interval members in the pg_interval_t
4228 ceph_assert(past_intervals
);
4229 ceph_assert(past_intervals
->past_intervals
);
4230 if (is_new_interval(
4243 i
.first
= same_interval_since
;
4244 i
.last
= osdmap
->get_epoch() - 1;
4245 ceph_assert(i
.first
<= i
.last
);
4246 i
.acting
= old_acting
;
4248 i
.primary
= old_acting_primary
;
4249 i
.up_primary
= old_up_primary
;
4251 unsigned num_acting
= 0;
4252 for (auto p
= i
.acting
.cbegin(); p
!= i
.acting
.cend(); ++p
)
4253 if (*p
!= CRUSH_ITEM_NONE
)
4256 ceph_assert(lastmap
->get_pools().count(pgid
.pool()));
4257 const pg_pool_t
& old_pg_pool
= lastmap
->get_pools().find(pgid
.pool())->second
;
4258 set
<pg_shard_t
> old_acting_shards
;
4259 old_pg_pool
.convert_to_pg_shards(old_acting
, &old_acting_shards
);
4263 num_acting
>= old_pg_pool
.min_size
&&
4264 (!old_pg_pool
.is_stretch_pool() ||
4265 old_pg_pool
.stretch_set_can_peer(old_acting
, *lastmap
, out
)) &&
4266 could_have_gone_active(old_acting_shards
)) {
4268 *out
<< __func__
<< " " << i
4269 << " up_thru " << lastmap
->get_up_thru(i
.primary
)
4270 << " up_from " << lastmap
->get_up_from(i
.primary
)
4271 << " last_epoch_clean " << last_epoch_clean
;
4272 if (lastmap
->get_up_thru(i
.primary
) >= i
.first
&&
4273 lastmap
->get_up_from(i
.primary
) <= i
.first
) {
4274 i
.maybe_went_rw
= true;
4277 << " : primary up " << lastmap
->get_up_from(i
.primary
)
4278 << "-" << lastmap
->get_up_thru(i
.primary
)
4279 << " includes interval"
4281 } else if (last_epoch_clean
>= i
.first
&&
4282 last_epoch_clean
<= i
.last
) {
4283 // If the last_epoch_clean is included in this interval, then
4284 // the pg must have been rw (for recovery to have completed).
4285 // This is important because we won't know the _real_
4286 // first_epoch because we stop at last_epoch_clean, and we
4287 // don't want the oldest interval to randomly have
4288 // maybe_went_rw false depending on the relative up_thru vs
4289 // last_epoch_clean timing.
4290 i
.maybe_went_rw
= true;
4293 << " : includes last_epoch_clean " << last_epoch_clean
4294 << " and presumed to have been rw"
4297 i
.maybe_went_rw
= false;
4300 << " : primary up " << lastmap
->get_up_from(i
.primary
)
4301 << "-" << lastmap
->get_up_thru(i
.primary
)
4302 << " does not include interval"
4306 i
.maybe_went_rw
= false;
4308 *out
<< __func__
<< " " << i
<< " : acting set is too small" << std::endl
;
4310 past_intervals
->past_intervals
->add_interval(old_pg_pool
.is_erasure(), i
);
4317 // true if the given map affects the prior set
4318 bool PastIntervals::PriorSet::affected_by_map(
4319 const OSDMap
&osdmap
,
4320 const DoutPrefixProvider
*dpp
) const
4322 for (auto p
= probe
.begin(); p
!= probe
.end(); ++p
) {
4325 // did someone in the prior set go down?
4326 if (osdmap
.is_down(o
) && down
.count(o
) == 0) {
4327 ldpp_dout(dpp
, 10) << "affected_by_map osd." << o
<< " now down" << dendl
;
4331 // did a down osd in cur get (re)marked as lost?
4332 auto r
= blocked_by
.find(o
);
4333 if (r
!= blocked_by
.end()) {
4334 if (!osdmap
.exists(o
)) {
4335 ldpp_dout(dpp
, 10) << "affected_by_map osd." << o
<< " no longer exists" << dendl
;
4338 if (osdmap
.get_info(o
).lost_at
!= r
->second
) {
4339 ldpp_dout(dpp
, 10) << "affected_by_map osd." << o
<< " (re)marked as lost" << dendl
;
4345 // did someone in the prior down set go up?
4346 for (auto p
= down
.cbegin(); p
!= down
.cend(); ++p
) {
4349 if (osdmap
.is_up(o
)) {
4350 ldpp_dout(dpp
, 10) << "affected_by_map osd." << o
<< " now up" << dendl
;
4354 // did someone in the prior set get lost or destroyed?
4355 if (!osdmap
.exists(o
)) {
4356 ldpp_dout(dpp
, 10) << "affected_by_map osd." << o
<< " no longer exists" << dendl
;
4359 // did a down osd in down get (re)marked as lost?
4360 auto r
= blocked_by
.find(o
);
4361 if (r
!= blocked_by
.end()) {
4362 if (osdmap
.get_info(o
).lost_at
!= r
->second
) {
4363 ldpp_dout(dpp
, 10) << "affected_by_map osd." << o
<< " (re)marked as lost" << dendl
;
4372 ostream
& operator<<(ostream
& out
, const PastIntervals::pg_interval_t
& i
)
4374 out
<< "interval(" << i
.first
<< "-" << i
.last
4375 << " up " << i
.up
<< "(" << i
.up_primary
<< ")"
4376 << " acting " << i
.acting
<< "(" << i
.primary
<< ")";
4377 if (i
.maybe_went_rw
)
4378 out
<< " maybe_went_rw";
4387 void pg_query_t::encode(ceph::buffer::list
&bl
, uint64_t features
) const {
4388 ENCODE_START(3, 3, bl
);
4392 encode(epoch_sent
, bl
);
4398 void pg_query_t::decode(ceph::buffer::list::const_iterator
&bl
) {
4399 DECODE_START(3, bl
);
4403 decode(epoch_sent
, bl
);
4409 void pg_query_t::dump(Formatter
*f
) const
4411 f
->dump_int("from", from
);
4412 f
->dump_int("to", to
);
4413 f
->dump_string("type", get_type_name());
4414 f
->dump_stream("since") << since
;
4415 f
->dump_stream("epoch_sent") << epoch_sent
;
4416 f
->open_object_section("history");
4420 void pg_query_t::generate_test_instances(list
<pg_query_t
*>& o
)
4422 o
.push_back(new pg_query_t());
4423 list
<pg_history_t
*> h
;
4424 pg_history_t::generate_test_instances(h
);
4425 o
.push_back(new pg_query_t(pg_query_t::INFO
, shard_id_t(1), shard_id_t(2), *h
.back(), 4));
4426 o
.push_back(new pg_query_t(pg_query_t::MISSING
, shard_id_t(2), shard_id_t(3), *h
.back(), 4));
4427 o
.push_back(new pg_query_t(pg_query_t::LOG
, shard_id_t(0), shard_id_t(0),
4428 eversion_t(4, 5), *h
.back(), 4));
4429 o
.push_back(new pg_query_t(pg_query_t::FULLLOG
,
4430 shard_id_t::NO_SHARD
, shard_id_t::NO_SHARD
,
4436 void pg_lease_t::encode(bufferlist
& bl
) const
4438 ENCODE_START(1, 1, bl
);
4439 encode(readable_until
, bl
);
4440 encode(readable_until_ub
, bl
);
4441 encode(interval
, bl
);
4445 void pg_lease_t::decode(bufferlist::const_iterator
& p
)
4448 decode(readable_until
, p
);
4449 decode(readable_until_ub
, p
);
4450 decode(interval
, p
);
4454 void pg_lease_t::dump(Formatter
*f
) const
4456 f
->dump_stream("readable_until") << readable_until
;
4457 f
->dump_stream("readable_until_ub") << readable_until_ub
;
4458 f
->dump_stream("interval") << interval
;
4461 void pg_lease_t::generate_test_instances(std::list
<pg_lease_t
*>& o
)
4463 o
.push_back(new pg_lease_t());
4464 o
.push_back(new pg_lease_t());
4465 o
.back()->readable_until
= make_timespan(1.5);
4466 o
.back()->readable_until_ub
= make_timespan(3.4);
4467 o
.back()->interval
= make_timespan(1.0);
4470 // -- pg_lease_ack_t --
4472 void pg_lease_ack_t::encode(bufferlist
& bl
) const
4474 ENCODE_START(1, 1, bl
);
4475 encode(readable_until_ub
, bl
);
4479 void pg_lease_ack_t::decode(bufferlist::const_iterator
& p
)
4482 decode(readable_until_ub
, p
);
4486 void pg_lease_ack_t::dump(Formatter
*f
) const
4488 f
->dump_stream("readable_until_ub") << readable_until_ub
;
4491 void pg_lease_ack_t::generate_test_instances(std::list
<pg_lease_ack_t
*>& o
)
4493 o
.push_back(new pg_lease_ack_t());
4494 o
.push_back(new pg_lease_ack_t());
4495 o
.back()->readable_until_ub
= make_timespan(3.4);
4499 // -- ObjectModDesc --
4500 void ObjectModDesc::visit(Visitor
*visitor
) const
4502 auto bp
= bl
.cbegin();
4505 DECODE_START(max_required_version
, bp
);
4512 visitor
->append(size
);
4516 map
<string
, std::optional
<ceph::buffer::list
> > attrs
;
4518 visitor
->setattrs(attrs
);
4522 version_t old_version
;
4523 decode(old_version
, bp
);
4524 visitor
->rmobject(old_version
);
4531 case UPDATE_SNAPS
: {
4532 set
<snapid_t
> snaps
;
4534 visitor
->update_snaps(snaps
);
4538 version_t old_version
;
4539 decode(old_version
, bp
);
4540 visitor
->try_rmobject(old_version
);
4543 case ROLLBACK_EXTENTS
: {
4544 vector
<pair
<uint64_t, uint64_t> > extents
;
4547 decode(extents
, bp
);
4548 visitor
->rollback_extents(gen
,extents
);
4552 ceph_abort_msg("Invalid rollback code");
4557 ceph_abort_msg("Invalid encoding");
4561 struct DumpVisitor
: public ObjectModDesc::Visitor
{
4563 explicit DumpVisitor(Formatter
*f
) : f(f
) {}
4564 void append(uint64_t old_size
) override
{
4565 f
->open_object_section("op");
4566 f
->dump_string("code", "APPEND");
4567 f
->dump_unsigned("old_size", old_size
);
4570 void setattrs(map
<string
, std::optional
<ceph::buffer::list
> > &attrs
) override
{
4571 f
->open_object_section("op");
4572 f
->dump_string("code", "SETATTRS");
4573 f
->open_array_section("attrs");
4574 for (auto i
= attrs
.begin(); i
!= attrs
.end(); ++i
) {
4575 f
->dump_string("attr_name", i
->first
);
4580 void rmobject(version_t old_version
) override
{
4581 f
->open_object_section("op");
4582 f
->dump_string("code", "RMOBJECT");
4583 f
->dump_unsigned("old_version", old_version
);
4586 void try_rmobject(version_t old_version
) override
{
4587 f
->open_object_section("op");
4588 f
->dump_string("code", "TRY_RMOBJECT");
4589 f
->dump_unsigned("old_version", old_version
);
4592 void create() override
{
4593 f
->open_object_section("op");
4594 f
->dump_string("code", "CREATE");
4597 void update_snaps(const set
<snapid_t
> &snaps
) override
{
4598 f
->open_object_section("op");
4599 f
->dump_string("code", "UPDATE_SNAPS");
4600 f
->dump_stream("snaps") << snaps
;
4603 void rollback_extents(
4605 const vector
<pair
<uint64_t, uint64_t> > &extents
) override
{
4606 f
->open_object_section("op");
4607 f
->dump_string("code", "ROLLBACK_EXTENTS");
4608 f
->dump_unsigned("gen", gen
);
4609 f
->dump_stream("snaps") << extents
;
4614 void ObjectModDesc::dump(Formatter
*f
) const
4616 f
->open_object_section("object_mod_desc");
4617 f
->dump_bool("can_local_rollback", can_local_rollback
);
4618 f
->dump_bool("rollback_info_completed", rollback_info_completed
);
4620 f
->open_array_section("ops");
4628 void ObjectModDesc::generate_test_instances(list
<ObjectModDesc
*>& o
)
4630 map
<string
, std::optional
<ceph::buffer::list
> > attrs
;
4634 o
.push_back(new ObjectModDesc());
4635 o
.back()->append(100);
4636 o
.back()->setattrs(attrs
);
4637 o
.push_back(new ObjectModDesc());
4638 o
.back()->rmobject(1001);
4639 o
.push_back(new ObjectModDesc());
4641 o
.back()->setattrs(attrs
);
4642 o
.push_back(new ObjectModDesc());
4644 o
.back()->setattrs(attrs
);
4645 o
.back()->mark_unrollbackable();
4646 o
.back()->append(1000);
4649 void ObjectModDesc::encode(ceph::buffer::list
&_bl
) const
4651 ENCODE_START(max_required_version
, max_required_version
, _bl
);
4652 encode(can_local_rollback
, _bl
);
4653 encode(rollback_info_completed
, _bl
);
4657 void ObjectModDesc::decode(ceph::buffer::list::const_iterator
&_bl
)
4659 DECODE_START(2, _bl
);
4660 max_required_version
= struct_v
;
4661 decode(can_local_rollback
, _bl
);
4662 decode(rollback_info_completed
, _bl
);
4664 // ensure bl does not pin a larger ceph::buffer in memory
4666 bl
.reassign_to_mempool(mempool::mempool_osd_pglog
);
4670 std::atomic
<uint32_t> ObjectCleanRegions::max_num_intervals
= {10};
4672 void ObjectCleanRegions::set_max_num_intervals(uint32_t num
)
4674 max_num_intervals
= num
;
// Keep the number of tracked clean intervals at or below
// max_num_intervals by repeatedly erasing the shortest clean
// interval. Dropping a clean run only makes the region look
// conservatively dirty, which is safe.
// NOTE(review): this excerpt appears to be missing the loop
// increment / early-exit lines; verify against the full source.
4677 void ObjectCleanRegions::trim()
4679 while(clean_offsets
.num_intervals() > max_num_intervals
) {
// Candidate: start with the first interval.
4680 typename interval_set
<uint64_t>::iterator shortest_interval
= clean_offsets
.begin();
4681 if (shortest_interval
== clean_offsets
.end())
// Linear scan for the interval with the smallest length.
4683 for (typename interval_set
<uint64_t>::iterator it
= clean_offsets
.begin();
4684 it
!= clean_offsets
.end();
4686 if (it
.get_len() < shortest_interval
.get_len())
4687 shortest_interval
= it
;
// Forget the shortest clean interval.
4689 clean_offsets
.erase(shortest_interval
);
// Combine with another ObjectCleanRegions: a byte range remains
// clean only if it is clean in BOTH inputs (set intersection), and
// the omap remains clean only if clean in both.
// NOTE(review): a trailing statement (e.g. a trim() call) may be
// missing from this excerpt; verify against the full source.
4693 void ObjectCleanRegions::merge(const ObjectCleanRegions
&other
)
4695 clean_offsets
.intersection_of(other
.clean_offsets
);
4696 clean_omap
= clean_omap
&& other
.clean_omap
;
// Record that [offset, offset+len) was written: build the complement
// interval set (everything except the written range) and intersect
// it with the current clean set, removing the dirtied bytes.
4700 void ObjectCleanRegions::mark_data_region_dirty(uint64_t offset
, uint64_t len
)
4702 interval_set
<uint64_t> clean_region
;
// Start from the full 64-bit offset space...
4703 clean_region
.insert(0, (uint64_t)-1);
// ...and cut out the bytes that were just written.
4704 clean_region
.erase(offset
, len
);
4705 clean_offsets
.intersection_of(clean_region
);
4709 bool ObjectCleanRegions::is_clean_region(uint64_t offset
, uint64_t len
) const
4711 return clean_offsets
.contains(offset
, len
);
4714 void ObjectCleanRegions::mark_omap_dirty()
4719 void ObjectCleanRegions::mark_object_new()
4724 void ObjectCleanRegions::mark_fully_dirty()
4726 mark_data_region_dirty(0, (uint64_t)-1);
4731 interval_set
<uint64_t> ObjectCleanRegions::get_dirty_regions() const
4733 interval_set
<uint64_t> dirty_region
;
4734 dirty_region
.insert(0, (uint64_t)-1);
4735 dirty_region
.subtract(clean_offsets
);
4736 return dirty_region
;
4739 bool ObjectCleanRegions::omap_is_dirty() const
4744 bool ObjectCleanRegions::object_is_exist() const
// Serialize: versioned envelope (version 1, compat 1) followed by
// the three tracked fields in a fixed order that decode() mirrors.
// NOTE(review): the ENCODE_FINISH line appears to be missing from
// this excerpt; verify against the full source.
4749 void ObjectCleanRegions::encode(bufferlist
&bl
) const
4751 ENCODE_START(1, 1, bl
);
4753 encode(clean_offsets
, bl
);
4754 encode(clean_omap
, bl
);
4755 encode(new_object
, bl
);
// Deserialize; field order mirrors encode() exactly.
// NOTE(review): the DECODE_FINISH line appears to be missing from
// this excerpt; verify against the full source.
4759 void ObjectCleanRegions::decode(bufferlist::const_iterator
&bl
)
4761 DECODE_START(1, bl
);
4763 decode(clean_offsets
, bl
);
4764 decode(clean_omap
, bl
);
4765 decode(new_object
, bl
);
// Emit the three tracked fields to the formatter under an
// "object_clean_regions" section.
// NOTE(review): the matching close_section() appears to be missing
// from this excerpt; verify against the full source.
4769 void ObjectCleanRegions::dump(Formatter
*f
) const
4771 f
->open_object_section("object_clean_regions");
4772 f
->dump_stream("clean_offsets") << clean_offsets
;
4773 f
->dump_bool("clean_omap", clean_omap
);
4774 f
->dump_bool("new_object", new_object
);
4778 void ObjectCleanRegions::generate_test_instances(list
<ObjectCleanRegions
*>& o
)
4780 o
.push_back(new ObjectCleanRegions());
4781 o
.push_back(new ObjectCleanRegions());
4782 o
.back()->mark_data_region_dirty(4096, 40960);
4783 o
.back()->mark_omap_dirty();
4784 o
.back()->mark_object_new();
4787 ostream
& operator<<(ostream
& out
, const ObjectCleanRegions
& ocr
)
4789 return out
<< "clean_offsets: " << ocr
.clean_offsets
4790 << ", clean_omap: " << ocr
.clean_omap
4791 << ", new_object: " << ocr
.new_object
;
4794 // -- pg_log_entry_t --
4796 string
pg_log_entry_t::get_key_name() const
4798 return version
.get_key_name();
4801 void pg_log_entry_t::encode_with_checksum(ceph::buffer::list
& bl
) const
4804 ceph::buffer::list
ebl(sizeof(*this)*2);
4806 __u32 crc
= ebl
.crc32c(0);
4811 void pg_log_entry_t::decode_with_checksum(ceph::buffer::list::const_iterator
& p
)
4814 ceph::buffer::list bl
;
4818 if (crc
!= bl
.crc32c(0))
4819 throw ceph::buffer::malformed_input("bad checksum on pg_log_entry_t");
4820 auto q
= bl
.cbegin();
4824 void pg_log_entry_t::encode(ceph::buffer::list
&bl
) const
4826 ENCODE_START(14, 4, bl
);
4829 encode(version
, bl
);
4832 * Added with reverting_to:
4833 * Previous code used prior_version to encode
4834 * what we now call reverting_to. This will
4835 * allow older code to decode reverting_to
4836 * into prior_version as expected.
4838 if (op
== LOST_REVERT
)
4839 encode(reverting_to
, bl
);
4841 encode(prior_version
, bl
);
4845 if (op
== LOST_REVERT
)
4846 encode(prior_version
, bl
);
4848 encode(user_version
, bl
);
4849 encode(mod_desc
, bl
);
4850 encode(extra_reqids
, bl
);
4852 encode(return_code
, bl
);
4853 if (!extra_reqids
.empty())
4854 encode(extra_reqid_return_codes
, bl
);
4855 encode(clean_regions
, bl
);
4857 encode(return_code
, bl
);
4858 encode(op_returns
, bl
);
4862 void pg_log_entry_t::decode(ceph::buffer::list::const_iterator
&bl
)
4864 DECODE_START_LEGACY_COMPAT_LEN(14, 4, 4, bl
);
4868 decode(old_soid
, bl
);
4869 soid
.oid
= old_soid
.oid
;
4870 soid
.snap
= old_soid
.snap
;
4871 invalid_hash
= true;
4876 invalid_hash
= true;
4877 decode(version
, bl
);
4879 if (struct_v
>= 6 && op
== LOST_REVERT
)
4880 decode(reverting_to
, bl
);
4882 decode(prior_version
, bl
);
4888 invalid_pool
= true;
4890 if (op
== LOST_REVERT
) {
4891 if (struct_v
>= 6) {
4892 decode(prior_version
, bl
);
4894 reverting_to
= prior_version
;
4897 if (struct_v
>= 7 || // for v >= 7, this is for all ops.
4898 op
== CLONE
) { // for v < 7, it's only present for CLONE.
4900 // ensure snaps does not pin a larger ceph::buffer in memory
4902 snaps
.reassign_to_mempool(mempool::mempool_osd_pglog
);
4906 decode(user_version
, bl
);
4908 user_version
= version
.version
;
4911 decode(mod_desc
, bl
);
4913 mod_desc
.mark_unrollbackable();
4915 decode(extra_reqids
, bl
);
4916 if (struct_v
>= 11 && op
== ERROR
)
4917 decode(return_code
, bl
);
4918 if (struct_v
>= 12 && !extra_reqids
.empty())
4919 decode(extra_reqid_return_codes
, bl
);
4921 decode(clean_regions
, bl
);
4923 clean_regions
.mark_fully_dirty();
4924 if (struct_v
>= 14) {
4926 decode(return_code
, bl
);
4928 decode(op_returns
, bl
);
4933 void pg_log_entry_t::dump(Formatter
*f
) const
4935 f
->dump_string("op", get_op_name());
4936 f
->dump_stream("object") << soid
;
4937 f
->dump_stream("version") << version
;
4938 f
->dump_stream("prior_version") << prior_version
;
4939 f
->dump_stream("reqid") << reqid
;
4940 f
->open_array_section("extra_reqids");
4942 for (auto p
= extra_reqids
.begin();
4943 p
!= extra_reqids
.end();
4945 f
->open_object_section("extra_reqid");
4946 f
->dump_stream("reqid") << p
->first
;
4947 f
->dump_stream("user_version") << p
->second
;
4948 auto it
= extra_reqid_return_codes
.find(idx
);
4949 if (it
!= extra_reqid_return_codes
.end()) {
4950 f
->dump_int("return_code", it
->second
);
4955 f
->dump_stream("mtime") << mtime
;
4956 f
->dump_int("return_code", return_code
);
4957 if (!op_returns
.empty()) {
4958 f
->open_array_section("op_returns");
4959 for (auto& i
: op_returns
) {
4960 f
->dump_object("op", i
);
4964 if (snaps
.length() > 0) {
4966 ceph::buffer::list c
= snaps
;
4967 auto p
= c
.cbegin();
4974 f
->open_object_section("snaps");
4975 for (auto p
= v
.begin(); p
!= v
.end(); ++p
)
4976 f
->dump_unsigned("snap", *p
);
4980 f
->open_object_section("mod_desc");
4985 f
->open_object_section("clean_regions");
4986 clean_regions
.dump(f
);
4991 void pg_log_entry_t::generate_test_instances(list
<pg_log_entry_t
*>& o
)
4993 o
.push_back(new pg_log_entry_t());
4994 hobject_t
oid(object_t("objname"), "key", 123, 456, 0, "");
4995 o
.push_back(new pg_log_entry_t(MODIFY
, oid
, eversion_t(1,2), eversion_t(3,4),
4996 1, osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
4998 o
.push_back(new pg_log_entry_t(ERROR
, oid
, eversion_t(1,2), eversion_t(3,4),
4999 1, osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
5000 utime_t(8,9), -ENOENT
));
5003 ostream
& operator<<(ostream
& out
, const pg_log_entry_t
& e
)
5005 out
<< e
.version
<< " (" << e
.prior_version
<< ") "
5006 << std::left
<< std::setw(8) << e
.get_op_name() << ' '
5007 << e
.soid
<< " by " << e
.reqid
<< " " << e
.mtime
5008 << " " << e
.return_code
;
5009 if (!e
.op_returns
.empty()) {
5010 out
<< " " << e
.op_returns
;
5012 if (e
.snaps
.length()) {
5013 vector
<snapid_t
> snaps
;
5014 ceph::buffer::list c
= e
.snaps
;
5015 auto p
= c
.cbegin();
5021 out
<< " snaps " << snaps
;
5023 out
<< " ObjectCleanRegions " << e
.clean_regions
;
5027 // -- pg_log_dup_t --
// Build the storage key for a dup entry: the literal prefix "dup_"
// followed by the version's key, rendered into a preallocated
// 36-byte string.
// NOTE(review): the final "return key;" appears to be missing from
// this excerpt; verify against the full source.
5029 std::string
pg_log_dup_t::get_key_name() const
5031 static const char prefix
[] = "dup_";
5032 std::string
key(36, ' ');
// Copy only the 4 prefix characters, not the trailing NUL.
5033 memcpy(&key
[0], prefix
, 4);
// Render the version key directly into the buffer after the prefix.
5034 version
.get_key_name(&key
[4]);
5035 key
.resize(35); // remove the null terminator
5039 void pg_log_dup_t::encode(ceph::buffer::list
&bl
) const
5041 ENCODE_START(2, 1, bl
);
5043 encode(version
, bl
);
5044 encode(user_version
, bl
);
5045 encode(return_code
, bl
);
5046 encode(op_returns
, bl
);
5050 void pg_log_dup_t::decode(ceph::buffer::list::const_iterator
&bl
)
5052 DECODE_START(2, bl
);
5054 decode(version
, bl
);
5055 decode(user_version
, bl
);
5056 decode(return_code
, bl
);
5057 if (struct_v
>= 2) {
5058 decode(op_returns
, bl
);
5063 void pg_log_dup_t::dump(Formatter
*f
) const
5065 f
->dump_stream("reqid") << reqid
;
5066 f
->dump_stream("version") << version
;
5067 f
->dump_stream("user_version") << user_version
;
5068 f
->dump_stream("return_code") << return_code
;
5069 if (!op_returns
.empty()) {
5070 f
->open_array_section("op_returns");
5071 for (auto& i
: op_returns
) {
5072 f
->dump_object("op", i
);
5078 void pg_log_dup_t::generate_test_instances(list
<pg_log_dup_t
*>& o
)
5080 o
.push_back(new pg_log_dup_t());
5081 o
.push_back(new pg_log_dup_t(eversion_t(1,2),
5083 osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
5085 o
.push_back(new pg_log_dup_t(eversion_t(1,2),
5087 osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
5092 std::ostream
& operator<<(std::ostream
& out
, const pg_log_dup_t
& e
) {
5093 out
<< "log_dup(reqid=" << e
.reqid
<<
5094 " v=" << e
.version
<< " uv=" << e
.user_version
<<
5095 " rc=" << e
.return_code
;
5096 if (!e
.op_returns
.empty()) {
5097 out
<< " " << e
.op_returns
;
5105 // out: pg_log_t that only has entries that apply to import_pgid using curmap
5106 // reject: Entries rejected from "in" are in the reject.log. Other fields not set.
5107 void pg_log_t::filter_log(spg_t import_pgid
, const OSDMap
&curmap
,
5108 const string
&hit_set_namespace
, const pg_log_t
&in
,
5109 pg_log_t
&out
, pg_log_t
&reject
)
5115 for (auto i
= in
.log
.cbegin(); i
!= in
.log
.cend(); ++i
) {
5117 // Reject pg log entries for temporary objects
5118 if (i
->soid
.is_temp()) {
5119 reject
.log
.push_back(*i
);
5123 if (i
->soid
.nspace
!= hit_set_namespace
) {
5124 object_t oid
= i
->soid
.oid
;
5125 object_locator_t
loc(i
->soid
);
5126 pg_t raw_pgid
= curmap
.object_locator_to_pg(oid
, loc
);
5127 pg_t pgid
= curmap
.raw_pg_to_pg(raw_pgid
);
5129 if (import_pgid
.pgid
== pgid
) {
5130 out
.log
.push_back(*i
);
5132 reject
.log
.push_back(*i
);
5135 out
.log
.push_back(*i
);
5140 void pg_log_t::encode(ceph::buffer::list
& bl
) const
5142 ENCODE_START(7, 3, bl
);
5146 encode(can_rollback_to
, bl
);
5147 encode(rollback_info_trimmed_to
, bl
);
5152 void pg_log_t::decode(ceph::buffer::list::const_iterator
&bl
, int64_t pool
)
5154 DECODE_START_LEGACY_COMPAT_LEN(7, 3, 3, bl
);
5159 decode(backlog
, bl
);
5163 decode(can_rollback_to
, bl
);
5166 decode(rollback_info_trimmed_to
, bl
);
5168 rollback_info_trimmed_to
= tail
;
5175 // handle hobject_t format change
5177 for (auto i
= log
.begin(); i
!= log
.end(); ++i
) {
5178 if (!i
->soid
.is_max() && i
->soid
.pool
== -1)
5179 i
->soid
.pool
= pool
;
5184 void pg_log_t::dump(Formatter
*f
) const
5186 f
->dump_stream("head") << head
;
5187 f
->dump_stream("tail") << tail
;
5188 f
->open_array_section("log");
5189 for (auto p
= log
.cbegin(); p
!= log
.cend(); ++p
) {
5190 f
->open_object_section("entry");
5195 f
->open_array_section("dups");
5196 for (const auto& entry
: dups
) {
5197 f
->open_object_section("entry");
5204 void pg_log_t::generate_test_instances(list
<pg_log_t
*>& o
)
5206 o
.push_back(new pg_log_t
);
5208 // this is nonsensical:
5209 o
.push_back(new pg_log_t
);
5210 o
.back()->head
= eversion_t(1,2);
5211 o
.back()->tail
= eversion_t(3,4);
5212 list
<pg_log_entry_t
*> e
;
5213 pg_log_entry_t::generate_test_instances(e
);
5214 for (auto p
= e
.begin(); p
!= e
.end(); ++p
)
5215 o
.back()->log
.push_back(**p
);
5218 static void _handle_dups(CephContext
* cct
, pg_log_t
&target
, const pg_log_t
&other
, unsigned maxdups
)
5220 auto earliest_dup_version
=
5221 target
.head
.version
< maxdups
? 0u : target
.head
.version
- maxdups
+ 1;
5222 lgeneric_subdout(cct
, osd
, 20) << "copy_up_to/copy_after earliest_dup_version " << earliest_dup_version
<< dendl
;
5224 for (auto d
= other
.dups
.cbegin(); d
!= other
.dups
.cend(); ++d
) {
5225 if (d
->version
.version
>= earliest_dup_version
) {
5226 lgeneric_subdout(cct
, osd
, 20)
5227 << "copy_up_to/copy_after copy dup version "
5228 << d
->version
<< dendl
;
5229 target
.dups
.push_back(pg_log_dup_t(*d
));
5233 for (auto i
= other
.log
.cbegin(); i
!= other
.log
.cend(); ++i
) {
5234 ceph_assert(i
->version
> other
.tail
);
5235 if (i
->version
> target
.tail
)
5237 if (i
->version
.version
>= earliest_dup_version
) {
5238 lgeneric_subdout(cct
, osd
, 20)
5239 << "copy_up_to/copy_after copy dup from log version "
5240 << i
->version
<< dendl
;
5241 target
.dups
.push_back(pg_log_dup_t(*i
));
5247 void pg_log_t::copy_after(CephContext
* cct
, const pg_log_t
&other
, eversion_t v
)
5249 can_rollback_to
= other
.can_rollback_to
;
5252 lgeneric_subdout(cct
, osd
, 20) << __func__
<< " v " << v
<< dendl
;
5253 for (auto i
= other
.log
.crbegin(); i
!= other
.log
.crend(); ++i
) {
5254 ceph_assert(i
->version
> other
.tail
);
5255 if (i
->version
<= v
) {
5256 // make tail accurate.
5260 lgeneric_subdout(cct
, osd
, 20) << __func__
<< " copy log version " << i
->version
<< dendl
;
5263 _handle_dups(cct
, *this, other
, cct
->_conf
->osd_pg_log_dups_tracked
);
5266 void pg_log_t::copy_up_to(CephContext
* cct
, const pg_log_t
&other
, int max
)
5268 can_rollback_to
= other
.can_rollback_to
;
5272 lgeneric_subdout(cct
, osd
, 20) << __func__
<< " max " << max
<< dendl
;
5273 for (auto i
= other
.log
.crbegin(); i
!= other
.log
.crend(); ++i
) {
5274 ceph_assert(i
->version
> other
.tail
);
5279 lgeneric_subdout(cct
, osd
, 20) << __func__
<< " copy log version " << i
->version
<< dendl
;
5282 _handle_dups(cct
, *this, other
, cct
->_conf
->osd_pg_log_dups_tracked
);
5285 ostream
& pg_log_t::print(ostream
& out
) const
5287 out
<< *this << std::endl
;
5288 for (auto p
= log
.cbegin(); p
!= log
.cend(); ++p
)
5289 out
<< *p
<< std::endl
;
5290 for (const auto& entry
: dups
) {
5291 out
<< " dup entry: " << entry
<< std::endl
;
5296 // -- pg_missing_t --
5298 ostream
& operator<<(ostream
& out
, const pg_missing_item
& i
)
5301 if (i
.have
!= eversion_t())
5302 out
<< "(" << i
.have
<< ")";
5303 out
<< " flags = " << i
.flag_str()
5304 << " " << i
.clean_regions
;
5308 // -- object_copy_cursor_t --
5310 void object_copy_cursor_t::encode(ceph::buffer::list
& bl
) const
5312 ENCODE_START(1, 1, bl
);
5313 encode(attr_complete
, bl
);
5314 encode(data_offset
, bl
);
5315 encode(data_complete
, bl
);
5316 encode(omap_offset
, bl
);
5317 encode(omap_complete
, bl
);
5321 void object_copy_cursor_t::decode(ceph::buffer::list::const_iterator
&bl
)
5323 DECODE_START(1, bl
);
5324 decode(attr_complete
, bl
);
5325 decode(data_offset
, bl
);
5326 decode(data_complete
, bl
);
5327 decode(omap_offset
, bl
);
5328 decode(omap_complete
, bl
);
5332 void object_copy_cursor_t::dump(Formatter
*f
) const
5334 f
->dump_unsigned("attr_complete", (int)attr_complete
);
5335 f
->dump_unsigned("data_offset", data_offset
);
5336 f
->dump_unsigned("data_complete", (int)data_complete
);
5337 f
->dump_string("omap_offset", omap_offset
);
5338 f
->dump_unsigned("omap_complete", (int)omap_complete
);
5341 void object_copy_cursor_t::generate_test_instances(list
<object_copy_cursor_t
*>& o
)
5343 o
.push_back(new object_copy_cursor_t
);
5344 o
.push_back(new object_copy_cursor_t
);
5345 o
.back()->attr_complete
= true;
5346 o
.back()->data_offset
= 123;
5347 o
.push_back(new object_copy_cursor_t
);
5348 o
.back()->attr_complete
= true;
5349 o
.back()->data_complete
= true;
5350 o
.back()->omap_offset
= "foo";
5351 o
.push_back(new object_copy_cursor_t
);
5352 o
.back()->attr_complete
= true;
5353 o
.back()->data_complete
= true;
5354 o
.back()->omap_complete
= true;
5357 // -- object_copy_data_t --
5359 void object_copy_data_t::encode(ceph::buffer::list
& bl
, uint64_t features
) const
5361 ENCODE_START(8, 5, bl
);
5366 encode(omap_data
, bl
);
5368 encode(omap_header
, bl
);
5370 encode(snap_seq
, bl
);
5372 encode(data_digest
, bl
);
5373 encode(omap_digest
, bl
);
5375 encode(truncate_seq
, bl
);
5376 encode(truncate_size
, bl
);
5377 encode(reqid_return_codes
, bl
);
5381 void object_copy_data_t::decode(ceph::buffer::list::const_iterator
& bl
)
5383 DECODE_START(8, bl
);
5390 decode(category
, bl
); // no longer used
5395 map
<string
,ceph::buffer::list
> omap
;
5398 if (!omap
.empty()) {
5400 encode(omap
, omap_data
);
5405 decode(omap_header
, bl
);
5406 if (struct_v
>= 3) {
5408 decode(snap_seq
, bl
);
5413 if (struct_v
>= 4) {
5415 decode(data_digest
, bl
);
5416 decode(omap_digest
, bl
);
5424 decode(omap_data
, bl
);
5426 decode(omap_header
, bl
);
5428 decode(snap_seq
, bl
);
5429 if (struct_v
>= 4) {
5431 decode(data_digest
, bl
);
5432 decode(omap_digest
, bl
);
5434 if (struct_v
>= 6) {
5437 if (struct_v
>= 7) {
5438 decode(truncate_seq
, bl
);
5439 decode(truncate_size
, bl
);
5441 if (struct_v
>= 8) {
5442 decode(reqid_return_codes
, bl
);
5448 void object_copy_data_t::generate_test_instances(list
<object_copy_data_t
*>& o
)
5450 o
.push_back(new object_copy_data_t());
5452 list
<object_copy_cursor_t
*> cursors
;
5453 object_copy_cursor_t::generate_test_instances(cursors
);
5454 auto ci
= cursors
.begin();
5455 o
.back()->cursor
= **(ci
++);
5457 o
.push_back(new object_copy_data_t());
5458 o
.back()->cursor
= **(ci
++);
5460 o
.push_back(new object_copy_data_t());
5461 o
.back()->size
= 1234;
5462 o
.back()->mtime
.set_from_double(1234);
5463 ceph::buffer::ptr
bp("there", 5);
5464 ceph::buffer::list bl
;
5466 o
.back()->attrs
["hello"] = bl
;
5467 ceph::buffer::ptr
bp2("not", 3);
5468 ceph::buffer::list bl2
;
5470 map
<string
,ceph::buffer::list
> omap
;
5473 encode(omap
, o
.back()->omap_data
);
5474 ceph::buffer::ptr
databp("iamsomedatatocontain", 20);
5475 o
.back()->data
.push_back(databp
);
5476 o
.back()->omap_header
.append("this is an omap header");
5477 o
.back()->snaps
.push_back(123);
5478 o
.back()->reqids
.push_back(make_pair(osd_reqid_t(), version_t()));
5481 void object_copy_data_t::dump(Formatter
*f
) const
5483 f
->open_object_section("cursor");
5485 f
->close_section(); // cursor
5486 f
->dump_int("size", size
);
5487 f
->dump_stream("mtime") << mtime
;
5488 /* we should really print out the attrs here, but ceph::buffer::list
5489 const-correctness prevents that */
5490 f
->dump_int("attrs_size", attrs
.size());
5491 f
->dump_int("flags", flags
);
5492 f
->dump_unsigned("data_digest", data_digest
);
5493 f
->dump_unsigned("omap_digest", omap_digest
);
5494 f
->dump_int("omap_data_length", omap_data
.length());
5495 f
->dump_int("omap_header_length", omap_header
.length());
5496 f
->dump_int("data_length", data
.length());
5497 f
->open_array_section("snaps");
5498 for (auto p
= snaps
.cbegin(); p
!= snaps
.cend(); ++p
)
5499 f
->dump_unsigned("snap", *p
);
5501 f
->open_array_section("reqids");
5503 for (auto p
= reqids
.begin();
5506 f
->open_object_section("extra_reqid");
5507 f
->dump_stream("reqid") << p
->first
;
5508 f
->dump_stream("user_version") << p
->second
;
5509 auto it
= reqid_return_codes
.find(idx
);
5510 if (it
!= reqid_return_codes
.end()) {
5511 f
->dump_int("return_code", it
->second
);
5518 // -- pg_create_t --
5520 void pg_create_t::encode(ceph::buffer::list
&bl
) const
5522 ENCODE_START(1, 1, bl
);
5523 encode(created
, bl
);
5525 encode(split_bits
, bl
);
5529 void pg_create_t::decode(ceph::buffer::list::const_iterator
&bl
)
5531 DECODE_START(1, bl
);
5532 decode(created
, bl
);
5534 decode(split_bits
, bl
);
5538 void pg_create_t::dump(Formatter
*f
) const
5540 f
->dump_unsigned("created", created
);
5541 f
->dump_stream("parent") << parent
;
5542 f
->dump_int("split_bits", split_bits
);
5545 void pg_create_t::generate_test_instances(list
<pg_create_t
*>& o
)
5547 o
.push_back(new pg_create_t
);
5548 o
.push_back(new pg_create_t(1, pg_t(3, 4), 2));
5552 // -- pg_hit_set_info_t --
5554 void pg_hit_set_info_t::encode(ceph::buffer::list
& bl
) const
5556 ENCODE_START(2, 1, bl
);
5559 encode(version
, bl
);
5560 encode(using_gmt
, bl
);
5564 void pg_hit_set_info_t::decode(ceph::buffer::list::const_iterator
& p
)
5570 if (struct_v
>= 2) {
5571 decode(using_gmt
, p
);
5578 void pg_hit_set_info_t::dump(Formatter
*f
) const
5580 f
->dump_stream("begin") << begin
;
5581 f
->dump_stream("end") << end
;
5582 f
->dump_stream("version") << version
;
5583 f
->dump_stream("using_gmt") << using_gmt
;
5586 void pg_hit_set_info_t::generate_test_instances(list
<pg_hit_set_info_t
*>& ls
)
5588 ls
.push_back(new pg_hit_set_info_t
);
5589 ls
.push_back(new pg_hit_set_info_t
);
5590 ls
.back()->begin
= utime_t(1, 2);
5591 ls
.back()->end
= utime_t(3, 4);
5595 // -- pg_hit_set_history_t --
5597 void pg_hit_set_history_t::encode(ceph::buffer::list
& bl
) const
5599 ENCODE_START(1, 1, bl
);
5600 encode(current_last_update
, bl
);
5602 utime_t dummy_stamp
;
5603 encode(dummy_stamp
, bl
);
5606 pg_hit_set_info_t dummy_info
;
5607 encode(dummy_info
, bl
);
5609 encode(history
, bl
);
5613 void pg_hit_set_history_t::decode(ceph::buffer::list::const_iterator
& p
)
5616 decode(current_last_update
, p
);
5618 utime_t dummy_stamp
;
5619 decode(dummy_stamp
, p
);
5622 pg_hit_set_info_t dummy_info
;
5623 decode(dummy_info
, p
);
5629 void pg_hit_set_history_t::dump(Formatter
*f
) const
5631 f
->dump_stream("current_last_update") << current_last_update
;
5632 f
->open_array_section("history");
5633 for (auto p
= history
.cbegin(); p
!= history
.cend(); ++p
) {
5634 f
->open_object_section("info");
5641 void pg_hit_set_history_t::generate_test_instances(list
<pg_hit_set_history_t
*>& ls
)
5643 ls
.push_back(new pg_hit_set_history_t
);
5644 ls
.push_back(new pg_hit_set_history_t
);
5645 ls
.back()->current_last_update
= eversion_t(1, 2);
5646 ls
.back()->history
.push_back(pg_hit_set_info_t());
5649 // -- OSDSuperblock --
5651 void OSDSuperblock::encode(ceph::buffer::list
&bl
) const
5653 ENCODE_START(9, 5, bl
);
5654 encode(cluster_fsid
, bl
);
5656 encode(current_epoch
, bl
);
5657 encode(oldest_map
, bl
);
5658 encode(newest_map
, bl
);
5660 compat_features
.encode(bl
);
5661 encode(clean_thru
, bl
);
5662 encode(mounted
, bl
);
5663 encode(osd_fsid
, bl
);
5664 encode((epoch_t
)0, bl
); // epoch_t last_epoch_marked_full
5665 encode((uint32_t)0, bl
); // map<int64_t,epoch_t> pool_last_epoch_marked_full
5666 encode(purged_snaps_last
, bl
);
5667 encode(last_purged_snaps_scrub
, bl
);
5671 void OSDSuperblock::decode(ceph::buffer::list::const_iterator
&bl
)
5673 DECODE_START_LEGACY_COMPAT_LEN(9, 5, 5, bl
);
5678 decode(cluster_fsid
, bl
);
5680 decode(current_epoch
, bl
);
5681 decode(oldest_map
, bl
);
5682 decode(newest_map
, bl
);
5684 if (struct_v
>= 2) {
5685 compat_features
.decode(bl
);
5686 } else { //upgrade it!
5687 compat_features
.incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE
);
5689 decode(clean_thru
, bl
);
5690 decode(mounted
, bl
);
5692 decode(osd_fsid
, bl
);
5693 if (struct_v
>= 6) {
5694 epoch_t last_map_marked_full
;
5695 decode(last_map_marked_full
, bl
);
5697 if (struct_v
>= 7) {
5698 map
<int64_t,epoch_t
> pool_last_map_marked_full
;
5699 decode(pool_last_map_marked_full
, bl
);
5701 if (struct_v
>= 9) {
5702 decode(purged_snaps_last
, bl
);
5703 decode(last_purged_snaps_scrub
, bl
);
5705 purged_snaps_last
= 0;
5710 void OSDSuperblock::dump(Formatter
*f
) const
5712 f
->dump_stream("cluster_fsid") << cluster_fsid
;
5713 f
->dump_stream("osd_fsid") << osd_fsid
;
5714 f
->dump_int("whoami", whoami
);
5715 f
->dump_int("current_epoch", current_epoch
);
5716 f
->dump_int("oldest_map", oldest_map
);
5717 f
->dump_int("newest_map", newest_map
);
5718 f
->dump_float("weight", weight
);
5719 f
->open_object_section("compat");
5720 compat_features
.dump(f
);
5722 f
->dump_int("clean_thru", clean_thru
);
5723 f
->dump_int("last_epoch_mounted", mounted
);
5724 f
->dump_unsigned("purged_snaps_last", purged_snaps_last
);
5725 f
->dump_stream("last_purged_snaps_scrub") << last_purged_snaps_scrub
;
5728 void OSDSuperblock::generate_test_instances(list
<OSDSuperblock
*>& o
)
5731 o
.push_back(new OSDSuperblock(z
));
5732 z
.cluster_fsid
.parse("01010101-0101-0101-0101-010101010101");
5733 z
.osd_fsid
.parse("02020202-0202-0202-0202-020202020202");
5735 z
.current_epoch
= 4;
5740 o
.push_back(new OSDSuperblock(z
));
5741 o
.push_back(new OSDSuperblock(z
));
5746 void SnapSet::encode(ceph::buffer::list
& bl
) const
5748 ENCODE_START(3, 2, bl
);
5750 encode(true, bl
); // head_exists
5753 encode(clone_overlap
, bl
);
5754 encode(clone_size
, bl
);
5755 encode(clone_snaps
, bl
);
5759 void SnapSet::decode(ceph::buffer::list::const_iterator
& bl
)
5761 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl
);
5763 bl
+= 1u; // skip legacy head_exists (always true)
5766 decode(clone_overlap
, bl
);
5767 decode(clone_size
, bl
);
5768 if (struct_v
>= 3) {
5769 decode(clone_snaps
, bl
);
5771 clone_snaps
.clear();
5776 void SnapSet::dump(Formatter
*f
) const
5778 f
->dump_unsigned("seq", seq
);
5779 f
->open_array_section("clones");
5780 for (auto p
= clones
.cbegin(); p
!= clones
.cend(); ++p
) {
5781 f
->open_object_section("clone");
5782 f
->dump_unsigned("snap", *p
);
5783 auto cs
= clone_size
.find(*p
);
5784 if (cs
!= clone_size
.end())
5785 f
->dump_unsigned("size", cs
->second
);
5787 f
->dump_string("size", "????");
5788 auto co
= clone_overlap
.find(*p
);
5789 if (co
!= clone_overlap
.end())
5790 f
->dump_stream("overlap") << co
->second
;
5792 f
->dump_stream("overlap") << "????";
5793 auto q
= clone_snaps
.find(*p
);
5794 if (q
!= clone_snaps
.end()) {
5795 f
->open_array_section("snaps");
5796 for (auto s
: q
->second
) {
5797 f
->dump_unsigned("snap", s
);
5806 void SnapSet::generate_test_instances(list
<SnapSet
*>& o
)
5808 o
.push_back(new SnapSet
);
5809 o
.push_back(new SnapSet
);
5810 o
.back()->seq
= 123;
5811 o
.back()->snaps
.push_back(123);
5812 o
.back()->snaps
.push_back(12);
5813 o
.push_back(new SnapSet
);
5814 o
.back()->seq
= 123;
5815 o
.back()->snaps
.push_back(123);
5816 o
.back()->snaps
.push_back(12);
5817 o
.back()->clones
.push_back(12);
5818 o
.back()->clone_size
[12] = 12345;
5819 o
.back()->clone_overlap
[12];
5820 o
.back()->clone_snaps
[12] = {12, 10, 8};
5823 ostream
& operator<<(ostream
& out
, const SnapSet
& cs
)
5825 return out
<< cs
.seq
<< "=" << cs
.snaps
<< ":"
5829 void SnapSet::from_snap_set(const librados::snap_set_t
& ss
, bool legacy
)
5831 // NOTE: our reconstruction of snaps (and the snapc) is not strictly
5832 // correct: it will not include snaps that still logically exist
5833 // but for which there was no clone that is defined. For all
5834 // practical purposes this doesn't matter, since we only use that
5835 // information to clone on the OSD, and we have already moved
5836 // forward past that part of the object history.
5839 set
<snapid_t
> _snaps
;
5840 set
<snapid_t
> _clones
;
5841 for (auto p
= ss
.clones
.cbegin(); p
!= ss
.clones
.cend(); ++p
) {
5842 if (p
->cloneid
!= librados::SNAP_HEAD
) {
5843 _clones
.insert(p
->cloneid
);
5844 _snaps
.insert(p
->snaps
.begin(), p
->snaps
.end());
5845 clone_size
[p
->cloneid
] = p
->size
;
5846 clone_overlap
[p
->cloneid
]; // the entry must exist, even if it's empty.
5847 for (auto q
= p
->overlap
.cbegin(); q
!= p
->overlap
.cend(); ++q
)
5848 clone_overlap
[p
->cloneid
].insert(q
->first
, q
->second
);
5850 // p->snaps is ascending; clone_snaps is descending
5851 vector
<snapid_t
>& v
= clone_snaps
[p
->cloneid
];
5852 for (auto q
= p
->snaps
.rbegin(); q
!= p
->snaps
.rend(); ++q
) {
5861 clones
.reserve(_clones
.size());
5862 for (auto p
= _clones
.begin(); p
!= _clones
.end(); ++p
)
5863 clones
.push_back(*p
);
5867 snaps
.reserve(_snaps
.size());
5868 for (auto p
= _snaps
.rbegin();
5869 p
!= _snaps
.rend(); ++p
)
5870 snaps
.push_back(*p
);
5873 uint64_t SnapSet::get_clone_bytes(snapid_t clone
) const
5875 ceph_assert(clone_size
.count(clone
));
5876 uint64_t size
= clone_size
.find(clone
)->second
;
5877 ceph_assert(clone_overlap
.count(clone
));
5878 const interval_set
<uint64_t> &overlap
= clone_overlap
.find(clone
)->second
;
5879 ceph_assert(size
>= (uint64_t)overlap
.size());
5880 return size
- overlap
.size();
5883 void SnapSet::filter(const pg_pool_t
&pinfo
)
5885 vector
<snapid_t
> oldsnaps
;
5886 oldsnaps
.swap(snaps
);
5887 for (auto i
= oldsnaps
.cbegin(); i
!= oldsnaps
.cend(); ++i
) {
5888 if (!pinfo
.is_removed_snap(*i
))
5889 snaps
.push_back(*i
);
5893 SnapSet
SnapSet::get_filtered(const pg_pool_t
&pinfo
) const
5900 // -- watch_info_t --
5902 void watch_info_t::encode(ceph::buffer::list
& bl
, uint64_t features
) const
5904 ENCODE_START(4, 3, bl
);
5906 encode(timeout_seconds
, bl
);
5907 encode(addr
, bl
, features
);
5911 void watch_info_t::decode(ceph::buffer::list::const_iterator
& bl
)
5913 DECODE_START_LEGACY_COMPAT_LEN(4, 3, 3, bl
);
5919 decode(timeout_seconds
, bl
);
5920 if (struct_v
>= 4) {
5926 void watch_info_t::dump(Formatter
*f
) const
5928 f
->dump_unsigned("cookie", cookie
);
5929 f
->dump_unsigned("timeout_seconds", timeout_seconds
);
5930 f
->open_object_section("addr");
5935 void watch_info_t::generate_test_instances(list
<watch_info_t
*>& o
)
5937 o
.push_back(new watch_info_t
);
5938 o
.push_back(new watch_info_t
);
5939 o
.back()->cookie
= 123;
5940 o
.back()->timeout_seconds
= 99;
5942 ea
.set_type(entity_addr_t::TYPE_LEGACY
);
5944 ea
.set_family(AF_INET
);
5945 ea
.set_in4_quad(0, 127);
5946 ea
.set_in4_quad(1, 0);
5947 ea
.set_in4_quad(2, 1);
5948 ea
.set_in4_quad(3, 2);
5950 o
.back()->addr
= ea
;
5953 // -- chunk_info_t --
5955 void chunk_info_t::encode(ceph::buffer::list
& bl
) const
5957 ENCODE_START(1, 1, bl
);
5961 __u32 _flags
= flags
;
5966 void chunk_info_t::decode(ceph::buffer::list::const_iterator
& bl
)
5968 DECODE_START(1, bl
);
5974 flags
= (cflag_t
)_flags
;
5978 void chunk_info_t::dump(Formatter
*f
) const
5980 f
->dump_unsigned("length", length
);
5981 f
->open_object_section("oid");
5984 f
->dump_unsigned("flags", flags
);
5988 bool chunk_info_t::operator==(const chunk_info_t
& cit
) const
5990 if (has_fingerprint()) {
5991 if (oid
.oid
.name
== cit
.oid
.oid
.name
) {
5995 if (offset
== cit
.offset
&& length
== cit
.length
&&
5996 oid
.oid
.name
== cit
.oid
.oid
.name
) {
6004 bool operator==(const std::pair
<const long unsigned int, chunk_info_t
> & l
,
6005 const std::pair
<const long unsigned int, chunk_info_t
> & r
)
6007 return l
.first
== r
.first
&&
6008 l
.second
== r
.second
;
6011 ostream
& operator<<(ostream
& out
, const chunk_info_t
& ci
)
6013 return out
<< "(len: " << ci
.length
<< " oid: " << ci
.oid
6014 << " offset: " << ci
.offset
6015 << " flags: " << ci
.get_flag_string(ci
.flags
) << ")";
6018 // -- object_manifest_t --
6020 std::ostream
& operator<<(std::ostream
& out
, const object_ref_delta_t
& ci
)
6022 return out
<< ci
.ref_delta
<< std::endl
;
6025 void object_manifest_t::calc_refs_to_inc_on_set(
6026 const object_manifest_t
* _g
,
6027 const object_manifest_t
* _l
,
6028 object_ref_delta_t
&refs
) const
6030 /* avoid to increment the same reference on adjacent clones */
6031 auto iter
= chunk_map
.begin();
6032 auto find_chunk
= [](decltype(iter
) &i
, const object_manifest_t
* cur
)
6035 auto c
= cur
->chunk_map
.find(i
->first
);
6036 if (c
!= cur
->chunk_map
.end() && c
->second
== i
->second
) {
6044 /* If at least a same chunk exists on either _g or _l, do not increment
6047 * head: [0, 2) ccc, [6, 2) bbb, [8, 2) ccc
6048 * 20: [0, 2) aaa, <- set_chunk
6049 * 30: [0, 2) abc, [6, 2) bbb, [8, 2) ccc
6050 * --> incremnt the reference
6052 * head: [0, 2) ccc, [6, 2) bbb, [8, 2) ccc
6053 * 20: [0, 2) ccc, <- set_chunk
6054 * 30: [0, 2) abc, [6, 2) bbb, [8, 2) ccc
6055 * --> do not need to increment
6057 * head: [0, 2) ccc, [6, 2) bbb, [8, 2) ccc
6058 * 20: [0, 2) ccc, <- set_chunk
6059 * 30: [0, 2) ccc, [6, 2) bbb, [8, 2) ccc
6060 * --> decrement the reference of ccc
6063 for (; iter
!= chunk_map
.end(); ++iter
) {
6064 auto found_g
= find_chunk(iter
, _g
);
6065 auto found_l
= find_chunk(iter
, _l
);
6066 if (!found_g
&& !found_l
) {
6067 refs
.inc_ref(iter
->second
.oid
);
6068 } else if (found_g
&& found_l
) {
6069 refs
.dec_ref(iter
->second
.oid
);
6074 void object_manifest_t::calc_refs_to_drop_on_modify(
6075 const object_manifest_t
* _l
,
6076 const ObjectCleanRegions
& clean_regions
,
6077 object_ref_delta_t
&refs
) const
6079 for (auto &p
: chunk_map
) {
6080 if (!clean_regions
.is_clean_region(p
.first
, p
.second
.length
)) {
6081 // has previous snapshot
6084 * Let's assume that there is a manifest snapshotted object which has three chunks
6085 * head: [0, 2) aaa, [6, 2) bbb, [8, 2) ccc
6086 * 20: [0, 2) aaa, [6, 2) bbb, [8, 2) ccc
6088 * If we modify [6, 2) at head, we shouldn't decrement bbb's refcount because
6089 * 20 has the reference for bbb. Therefore, we only drop the reference if two chunks
6090 * (head: [6, 2) and 20: [6, 2)) are different.
6093 auto c
= _l
->chunk_map
.find(p
.first
);
6094 if (c
!= _l
->chunk_map
.end()) {
6095 if (p
.second
== c
->second
) {
6099 refs
.dec_ref(p
.second
.oid
);
6101 // decrement the reference of the updated chunks if the manifest object has no snapshot
6102 refs
.dec_ref(p
.second
.oid
);
6108 void object_manifest_t::calc_refs_to_drop_on_removal(
6109 const object_manifest_t
* _g
,
6110 const object_manifest_t
* _l
,
6111 object_ref_delta_t
&refs
) const
6113 /* At a high level, the rule is that consecutive clones with the same reference
6114 * at the same offset share a reference. As such, removing *this may result
6115 * in removing references in two cases:
6116 * 1) *this has a reference which it shares with neither _g nor _l
6117 * 2) _g and _l have a reference which they share with each other but not
6120 * For a particular offset, both 1 and 2 can happen.
6122 * Notably, this means that to evaluate the reference change from removing
6123 * the object with *this, we only need to look at the two adjacent clones.
6126 // Paper over possibly missing _g or _l -- nullopt is semantically the same
6127 // as an empty chunk_map
6128 static const object_manifest_t empty
;
6129 const object_manifest_t
&g
= _g
? *_g
: empty
;
6130 const object_manifest_t
&l
= _l
? *_l
: empty
;
6132 auto giter
= g
.chunk_map
.begin();
6133 auto iter
= chunk_map
.begin();
6134 auto liter
= l
.chunk_map
.begin();
6136 // Translate iter, map pair to the current offset, end() -> max
6137 auto get_offset
= [](decltype(iter
) &i
, const object_manifest_t
&manifest
)
6139 return i
== manifest
.chunk_map
.end() ?
6140 std::numeric_limits
<uint64_t>::max() : i
->first
;
6143 /* If current matches the offset at iter, returns the chunk at *iter
6144 * and increments iter. Otherwise, returns nullptr.
6146 * current will always be derived from the min of *giter, *iter, and
6147 * *liter on each cycle, so the result will be that each loop iteration
6148 * will pick up all chunks at the offest being considered, each offset
6149 * will be considered once, and all offsets will be considered.
6151 auto get_chunk
= [](
6152 uint64_t current
, decltype(iter
) &i
, const object_manifest_t
&manifest
)
6153 -> const chunk_info_t
* {
6154 if (i
== manifest
.chunk_map
.end() || current
!= i
->first
) {
6157 return &(i
++)->second
;
6161 while (giter
!= g
.chunk_map
.end() ||
6162 iter
!= chunk_map
.end() ||
6163 liter
!= l
.chunk_map
.end()) {
6164 auto current
= std::min(
6165 std::min(get_offset(giter
, g
), get_offset(iter
, *this)),
6166 get_offset(liter
, l
));
6168 auto gchunk
= get_chunk(current
, giter
, g
);
6169 auto chunk
= get_chunk(current
, iter
, *this);
6170 auto lchunk
= get_chunk(current
, liter
, l
);
6172 if (gchunk
&& lchunk
&& *gchunk
== *lchunk
&&
6173 (!chunk
|| *gchunk
!= *chunk
)) {
6174 // case 1 from above: l and g match, chunk does not
6175 refs
.dec_ref(gchunk
->oid
);
6179 (!gchunk
|| chunk
->oid
!= gchunk
->oid
) &&
6180 (!lchunk
|| chunk
->oid
!= lchunk
->oid
)) {
6181 // case 2 from above: *this matches neither
6182 refs
.dec_ref(chunk
->oid
);
6187 void object_manifest_t::encode(ceph::buffer::list
& bl
) const
6189 ENCODE_START(1, 1, bl
);
6192 case TYPE_NONE
: break;
6194 encode(redirect_target
, bl
);
6197 encode(chunk_map
, bl
);
6205 void object_manifest_t::decode(ceph::buffer::list::const_iterator
& bl
)
6207 DECODE_START(1, bl
);
6210 case TYPE_NONE
: break;
6212 decode(redirect_target
, bl
);
6215 decode(chunk_map
, bl
);
6223 void object_manifest_t::dump(Formatter
*f
) const
6225 f
->dump_unsigned("type", type
);
6226 if (type
== TYPE_REDIRECT
) {
6227 f
->open_object_section("redirect_target");
6228 redirect_target
.dump(f
);
6230 } else if (type
== TYPE_CHUNKED
) {
6231 f
->open_array_section("chunk_map");
6232 for (auto& p
: chunk_map
) {
6233 f
->open_object_section("chunk");
6234 f
->dump_unsigned("offset", p
.first
);
6242 void object_manifest_t::generate_test_instances(list
<object_manifest_t
*>& o
)
6244 o
.push_back(new object_manifest_t());
6245 o
.back()->type
= TYPE_REDIRECT
;
6248 ostream
& operator<<(ostream
& out
, const object_manifest_t
& om
)
6250 out
<< "manifest(" << om
.get_type_name();
6251 if (om
.is_redirect()) {
6252 out
<< " " << om
.redirect_target
;
6253 } else if (om
.is_chunked()) {
6254 out
<< " " << om
.chunk_map
;
6260 // -- object_info_t --
6262 void object_info_t::copy_user_bits(const object_info_t
& other
)
6264 // these bits are copied from head->clone.
6266 mtime
= other
.mtime
;
6267 local_mtime
= other
.local_mtime
;
6268 last_reqid
= other
.last_reqid
;
6269 truncate_seq
= other
.truncate_seq
;
6270 truncate_size
= other
.truncate_size
;
6271 flags
= other
.flags
;
6272 user_version
= other
.user_version
;
6273 data_digest
= other
.data_digest
;
6274 omap_digest
= other
.omap_digest
;
6277 void object_info_t::encode(ceph::buffer::list
& bl
, uint64_t features
) const
6279 object_locator_t
myoloc(soid
);
6280 map
<entity_name_t
, watch_info_t
> old_watchers
;
6281 for (auto i
= watchers
.cbegin(); i
!= watchers
.cend(); ++i
) {
6282 old_watchers
.insert(make_pair(i
->first
.second
, i
->second
));
6284 ENCODE_START(17, 8, bl
);
6286 encode(myoloc
, bl
); //Retained for compatibility
6287 encode((__u32
)0, bl
); // was category, no longer used
6288 encode(version
, bl
);
6289 encode(prior_version
, bl
);
6290 encode(last_reqid
, bl
);
6293 if (soid
.snap
== CEPH_NOSNAP
)
6294 encode(osd_reqid_t(), bl
); // used to be wrlock_by
6296 encode((uint32_t)0, bl
); // was legacy_snaps
6297 encode(truncate_seq
, bl
);
6298 encode(truncate_size
, bl
);
6299 encode(is_lost(), bl
);
6300 encode(old_watchers
, bl
, features
);
6301 /* shenanigans to avoid breaking backwards compatibility in the disk format.
6302 * When we can, switch this out for simply putting the version_t on disk. */
6303 eversion_t
user_eversion(0, user_version
);
6304 encode(user_eversion
, bl
);
6305 encode(test_flag(FLAG_USES_TMAP
), bl
);
6306 encode(watchers
, bl
, features
);
6307 __u32 _flags
= flags
;
6309 encode(local_mtime
, bl
);
6310 encode(data_digest
, bl
);
6311 encode(omap_digest
, bl
);
6312 encode(expected_object_size
, bl
);
6313 encode(expected_write_size
, bl
);
6314 encode(alloc_hint_flags
, bl
);
6315 if (has_manifest()) {
6316 encode(manifest
, bl
);
6321 void object_info_t::decode(ceph::buffer::list::const_iterator
& bl
)
6323 object_locator_t myoloc
;
6324 DECODE_START_LEGACY_COMPAT_LEN(17, 8, 8, bl
);
6325 map
<entity_name_t
, watch_info_t
> old_watchers
;
6330 decode(category
, bl
); // no longer used
6332 decode(version
, bl
);
6333 decode(prior_version
, bl
);
6334 decode(last_reqid
, bl
);
6337 if (soid
.snap
== CEPH_NOSNAP
) {
6338 osd_reqid_t wrlock_by
;
6339 decode(wrlock_by
, bl
);
6341 vector
<snapid_t
> legacy_snaps
;
6342 decode(legacy_snaps
, bl
);
6344 decode(truncate_seq
, bl
);
6345 decode(truncate_size
, bl
);
6347 // if this is struct_v >= 13, we will overwrite this
6348 // below since this field is just here for backwards
6354 decode(old_watchers
, bl
);
6355 eversion_t user_eversion
;
6356 decode(user_eversion
, bl
);
6357 user_version
= user_eversion
.version
;
6359 if (struct_v
>= 9) {
6360 bool uses_tmap
= false;
6361 decode(uses_tmap
, bl
);
6363 set_flag(FLAG_USES_TMAP
);
6365 set_flag(FLAG_USES_TMAP
);
6368 soid
.pool
= myoloc
.pool
;
6369 if (struct_v
>= 11) {
6370 decode(watchers
, bl
);
6372 for (auto i
= old_watchers
.begin(); i
!= old_watchers
.end(); ++i
) {
6375 make_pair(i
->second
.cookie
, i
->first
), i
->second
));
6378 if (struct_v
>= 13) {
6381 flags
= (flag_t
)_flags
;
6383 if (struct_v
>= 14) {
6384 decode(local_mtime
, bl
);
6386 local_mtime
= utime_t();
6388 if (struct_v
>= 15) {
6389 decode(data_digest
, bl
);
6390 decode(omap_digest
, bl
);
6392 data_digest
= omap_digest
= -1;
6393 clear_flag(FLAG_DATA_DIGEST
);
6394 clear_flag(FLAG_OMAP_DIGEST
);
6396 if (struct_v
>= 16) {
6397 decode(expected_object_size
, bl
);
6398 decode(expected_write_size
, bl
);
6399 decode(alloc_hint_flags
, bl
);
6401 expected_object_size
= 0;
6402 expected_write_size
= 0;
6403 alloc_hint_flags
= 0;
6405 if (struct_v
>= 17) {
6406 if (has_manifest()) {
6407 decode(manifest
, bl
);
6413 void object_info_t::dump(Formatter
*f
) const
6415 f
->open_object_section("oid");
6418 f
->dump_stream("version") << version
;
6419 f
->dump_stream("prior_version") << prior_version
;
6420 f
->dump_stream("last_reqid") << last_reqid
;
6421 f
->dump_unsigned("user_version", user_version
);
6422 f
->dump_unsigned("size", size
);
6423 f
->dump_stream("mtime") << mtime
;
6424 f
->dump_stream("local_mtime") << local_mtime
;
6425 f
->dump_unsigned("lost", (int)is_lost());
6426 vector
<string
> sv
= get_flag_vector(flags
);
6427 f
->open_array_section("flags");
6428 for (const auto& str
: sv
) {
6429 f
->dump_string("flags", str
);
6432 f
->dump_unsigned("truncate_seq", truncate_seq
);
6433 f
->dump_unsigned("truncate_size", truncate_size
);
6434 f
->dump_format("data_digest", "0x%08x", data_digest
);
6435 f
->dump_format("omap_digest", "0x%08x", omap_digest
);
6436 f
->dump_unsigned("expected_object_size", expected_object_size
);
6437 f
->dump_unsigned("expected_write_size", expected_write_size
);
6438 f
->dump_unsigned("alloc_hint_flags", alloc_hint_flags
);
6439 f
->dump_object("manifest", manifest
);
6440 f
->open_object_section("watchers");
6441 for (auto p
= watchers
.cbegin(); p
!= watchers
.cend(); ++p
) {
6442 CachedStackStringStream css
;
6443 *css
<< p
->first
.second
;
6444 f
->open_object_section(css
->strv());
6451 void object_info_t::generate_test_instances(list
<object_info_t
*>& o
)
6453 o
.push_back(new object_info_t());
6459 ostream
& operator<<(ostream
& out
, const object_info_t
& oi
)
6461 out
<< oi
.soid
<< "(" << oi
.version
6462 << " " << oi
.last_reqid
;
6464 out
<< " " << oi
.get_flag_string();
6465 out
<< " s " << oi
.size
;
6466 out
<< " uv " << oi
.user_version
;
6467 if (oi
.is_data_digest())
6468 out
<< " dd " << std::hex
<< oi
.data_digest
<< std::dec
;
6469 if (oi
.is_omap_digest())
6470 out
<< " od " << std::hex
<< oi
.omap_digest
<< std::dec
;
6471 out
<< " alloc_hint [" << oi
.expected_object_size
6472 << " " << oi
.expected_write_size
6473 << " " << oi
.alloc_hint_flags
<< "]";
6474 if (oi
.has_manifest())
6475 out
<< " " << oi
.manifest
;
6480 // -- ObjectRecovery --
6481 void ObjectRecoveryProgress::encode(ceph::buffer::list
&bl
) const
6483 ENCODE_START(1, 1, bl
);
6485 encode(data_complete
, bl
);
6486 encode(data_recovered_to
, bl
);
6487 encode(omap_recovered_to
, bl
);
6488 encode(omap_complete
, bl
);
6492 void ObjectRecoveryProgress::decode(ceph::buffer::list::const_iterator
&bl
)
6494 DECODE_START(1, bl
);
6496 decode(data_complete
, bl
);
6497 decode(data_recovered_to
, bl
);
6498 decode(omap_recovered_to
, bl
);
6499 decode(omap_complete
, bl
);
6503 ostream
&operator<<(ostream
&out
, const ObjectRecoveryProgress
&prog
)
6505 return prog
.print(out
);
6508 void ObjectRecoveryProgress::generate_test_instances(
6509 list
<ObjectRecoveryProgress
*>& o
)
6511 o
.push_back(new ObjectRecoveryProgress
);
6512 o
.back()->first
= false;
6513 o
.back()->data_complete
= true;
6514 o
.back()->omap_complete
= true;
6515 o
.back()->data_recovered_to
= 100;
6517 o
.push_back(new ObjectRecoveryProgress
);
6518 o
.back()->first
= true;
6519 o
.back()->data_complete
= false;
6520 o
.back()->omap_complete
= false;
6521 o
.back()->data_recovered_to
= 0;
6524 ostream
&ObjectRecoveryProgress::print(ostream
&out
) const
6526 return out
<< "ObjectRecoveryProgress("
6527 << ( first
? "" : "!" ) << "first, "
6528 << "data_recovered_to:" << data_recovered_to
6529 << ", data_complete:" << ( data_complete
? "true" : "false" )
6530 << ", omap_recovered_to:" << omap_recovered_to
6531 << ", omap_complete:" << ( omap_complete
? "true" : "false" )
6532 << ", error:" << ( error
? "true" : "false" )
6536 void ObjectRecoveryProgress::dump(Formatter
*f
) const
6538 f
->dump_int("first?", first
);
6539 f
->dump_int("data_complete?", data_complete
);
6540 f
->dump_unsigned("data_recovered_to", data_recovered_to
);
6541 f
->dump_int("omap_complete?", omap_complete
);
6542 f
->dump_string("omap_recovered_to", omap_recovered_to
);
6545 void ObjectRecoveryInfo::encode(ceph::buffer::list
&bl
, uint64_t features
) const
6547 ENCODE_START(3, 1, bl
);
6549 encode(version
, bl
);
6551 encode(oi
, bl
, features
);
6553 encode(copy_subset
, bl
);
6554 encode(clone_subset
, bl
);
6555 encode(object_exist
, bl
);
6559 void ObjectRecoveryInfo::decode(ceph::buffer::list::const_iterator
&bl
,
6562 DECODE_START(3, bl
);
6564 decode(version
, bl
);
6568 decode(copy_subset
, bl
);
6569 decode(clone_subset
, bl
);
6571 decode(object_exist
, bl
);
6573 object_exist
= false;
6576 if (!soid
.is_max() && soid
.pool
== -1)
6578 map
<hobject_t
, interval_set
<uint64_t>> tmp
;
6579 tmp
.swap(clone_subset
);
6580 for (auto i
= tmp
.begin(); i
!= tmp
.end(); ++i
) {
6581 hobject_t
first(i
->first
);
6582 if (!first
.is_max() && first
.pool
== -1)
6584 clone_subset
[first
].swap(i
->second
);
6589 void ObjectRecoveryInfo::generate_test_instances(
6590 list
<ObjectRecoveryInfo
*>& o
)
6592 o
.push_back(new ObjectRecoveryInfo
);
6593 o
.back()->soid
= hobject_t(sobject_t("key", CEPH_NOSNAP
));
6594 o
.back()->version
= eversion_t(0,0);
6595 o
.back()->size
= 100;
6596 o
.back()->object_exist
= false;
6600 void ObjectRecoveryInfo::dump(Formatter
*f
) const
6602 f
->dump_stream("object") << soid
;
6603 f
->dump_stream("at_version") << version
;
6604 f
->dump_stream("size") << size
;
6606 f
->open_object_section("object_info");
6611 f
->open_object_section("snapset");
6615 f
->dump_stream("copy_subset") << copy_subset
;
6616 f
->dump_stream("clone_subset") << clone_subset
;
6617 f
->dump_stream("object_exist") << object_exist
;
6620 ostream
& operator<<(ostream
& out
, const ObjectRecoveryInfo
&inf
)
6622 return inf
.print(out
);
6625 ostream
&ObjectRecoveryInfo::print(ostream
&out
) const
6627 return out
<< "ObjectRecoveryInfo("
6628 << soid
<< "@" << version
6629 << ", size: " << size
6630 << ", copy_subset: " << copy_subset
6631 << ", clone_subset: " << clone_subset
6632 << ", snapset: " << ss
6633 << ", object_exist: " << object_exist
6637 // -- PushReplyOp --
6638 void PushReplyOp::generate_test_instances(list
<PushReplyOp
*> &o
)
6640 o
.push_back(new PushReplyOp
);
6641 o
.push_back(new PushReplyOp
);
6642 o
.back()->soid
= hobject_t(sobject_t("asdf", 2));
6643 o
.push_back(new PushReplyOp
);
6644 o
.back()->soid
= hobject_t(sobject_t("asdf", CEPH_NOSNAP
));
6647 void PushReplyOp::encode(ceph::buffer::list
&bl
) const
6649 ENCODE_START(1, 1, bl
);
6654 void PushReplyOp::decode(ceph::buffer::list::const_iterator
&bl
)
6656 DECODE_START(1, bl
);
6661 void PushReplyOp::dump(Formatter
*f
) const
6663 f
->dump_stream("soid") << soid
;
6666 ostream
&PushReplyOp::print(ostream
&out
) const
6669 << "PushReplyOp(" << soid
6673 ostream
& operator<<(ostream
& out
, const PushReplyOp
&op
)
6675 return op
.print(out
);
6678 uint64_t PushReplyOp::cost(CephContext
*cct
) const
6681 return cct
->_conf
->osd_push_per_object_cost
+
6682 cct
->_conf
->osd_recovery_max_chunk
;
6686 void PullOp::generate_test_instances(list
<PullOp
*> &o
)
6688 o
.push_back(new PullOp
);
6689 o
.push_back(new PullOp
);
6690 o
.back()->soid
= hobject_t(sobject_t("asdf", 2));
6691 o
.back()->recovery_info
.version
= eversion_t(3, 10);
6692 o
.push_back(new PullOp
);
6693 o
.back()->soid
= hobject_t(sobject_t("asdf", CEPH_NOSNAP
));
6694 o
.back()->recovery_info
.version
= eversion_t(0, 0);
6697 void PullOp::encode(ceph::buffer::list
&bl
, uint64_t features
) const
6699 ENCODE_START(1, 1, bl
);
6701 encode(recovery_info
, bl
, features
);
6702 encode(recovery_progress
, bl
);
6706 void PullOp::decode(ceph::buffer::list::const_iterator
&bl
)
6708 DECODE_START(1, bl
);
6710 decode(recovery_info
, bl
);
6711 decode(recovery_progress
, bl
);
6715 void PullOp::dump(Formatter
*f
) const
6717 f
->dump_stream("soid") << soid
;
6719 f
->open_object_section("recovery_info");
6720 recovery_info
.dump(f
);
6724 f
->open_object_section("recovery_progress");
6725 recovery_progress
.dump(f
);
6730 ostream
&PullOp::print(ostream
&out
) const
6733 << "PullOp(" << soid
6734 << ", recovery_info: " << recovery_info
6735 << ", recovery_progress: " << recovery_progress
6739 ostream
& operator<<(ostream
& out
, const PullOp
&op
)
6741 return op
.print(out
);
6744 uint64_t PullOp::cost(CephContext
*cct
) const
6746 return cct
->_conf
->osd_push_per_object_cost
+
6747 cct
->_conf
->osd_recovery_max_chunk
;
6751 void PushOp::generate_test_instances(list
<PushOp
*> &o
)
6753 o
.push_back(new PushOp
);
6754 o
.push_back(new PushOp
);
6755 o
.back()->soid
= hobject_t(sobject_t("asdf", 2));
6756 o
.back()->version
= eversion_t(3, 10);
6757 o
.push_back(new PushOp
);
6758 o
.back()->soid
= hobject_t(sobject_t("asdf", CEPH_NOSNAP
));
6759 o
.back()->version
= eversion_t(0, 0);
6762 void PushOp::encode(ceph::buffer::list
&bl
, uint64_t features
) const
6764 ENCODE_START(1, 1, bl
);
6766 encode(version
, bl
);
6768 encode(data_included
, bl
);
6769 encode(omap_header
, bl
);
6770 encode(omap_entries
, bl
);
6771 encode(attrset
, bl
);
6772 encode(recovery_info
, bl
, features
);
6773 encode(after_progress
, bl
);
6774 encode(before_progress
, bl
);
6778 void PushOp::decode(ceph::buffer::list::const_iterator
&bl
)
6780 DECODE_START(1, bl
);
6782 decode(version
, bl
);
6784 decode(data_included
, bl
);
6785 decode(omap_header
, bl
);
6786 decode(omap_entries
, bl
);
6787 decode(attrset
, bl
);
6788 decode(recovery_info
, bl
);
6789 decode(after_progress
, bl
);
6790 decode(before_progress
, bl
);
6794 void PushOp::dump(Formatter
*f
) const
6796 f
->dump_stream("soid") << soid
;
6797 f
->dump_stream("version") << version
;
6798 f
->dump_int("data_len", data
.length());
6799 f
->dump_stream("data_included") << data_included
;
6800 f
->dump_int("omap_header_len", omap_header
.length());
6801 f
->dump_int("omap_entries_len", omap_entries
.size());
6802 f
->dump_int("attrset_len", attrset
.size());
6804 f
->open_object_section("recovery_info");
6805 recovery_info
.dump(f
);
6809 f
->open_object_section("after_progress");
6810 after_progress
.dump(f
);
6814 f
->open_object_section("before_progress");
6815 before_progress
.dump(f
);
6820 ostream
&PushOp::print(ostream
&out
) const
6823 << "PushOp(" << soid
6824 << ", version: " << version
6825 << ", data_included: " << data_included
6826 << ", data_size: " << data
.length()
6827 << ", omap_header_size: " << omap_header
.length()
6828 << ", omap_entries_size: " << omap_entries
.size()
6829 << ", attrset_size: " << attrset
.size()
6830 << ", recovery_info: " << recovery_info
6831 << ", after_progress: " << after_progress
6832 << ", before_progress: " << before_progress
6836 ostream
& operator<<(ostream
& out
, const PushOp
&op
)
6838 return op
.print(out
);
6841 uint64_t PushOp::cost(CephContext
*cct
) const
6843 uint64_t cost
= data_included
.size();
6844 for (auto i
= omap_entries
.cbegin(); i
!= omap_entries
.cend(); ++i
) {
6845 cost
+= i
->second
.length();
6847 cost
+= cct
->_conf
->osd_push_per_object_cost
;
6853 void ScrubMap::merge_incr(const ScrubMap
&l
)
6855 ceph_assert(valid_through
== l
.incr_since
);
6856 valid_through
= l
.valid_through
;
6858 for (auto p
= l
.objects
.cbegin(); p
!= l
.objects
.cend(); ++p
){
6859 if (p
->second
.negative
) {
6860 auto q
= objects
.find(p
->first
);
6861 if (q
!= objects
.end()) {
6865 objects
[p
->first
] = p
->second
;
6870 void ScrubMap::encode(ceph::buffer::list
& bl
) const
6872 ENCODE_START(3, 2, bl
);
6873 encode(objects
, bl
);
6874 encode((__u32
)0, bl
); // used to be attrs; now deprecated
6875 ceph::buffer::list old_logbl
; // not used
6876 encode(old_logbl
, bl
);
6877 encode(valid_through
, bl
);
6878 encode(incr_since
, bl
);
6882 void ScrubMap::decode(ceph::buffer::list::const_iterator
& bl
, int64_t pool
)
6884 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl
);
6885 decode(objects
, bl
);
6887 map
<string
,string
> attrs
; // deprecated
6890 ceph::buffer::list old_logbl
; // not used
6891 decode(old_logbl
, bl
);
6892 decode(valid_through
, bl
);
6893 decode(incr_since
, bl
);
6896 // handle hobject_t upgrade
6898 map
<hobject_t
, object
> tmp
;
6900 for (auto i
= tmp
.begin(); i
!= tmp
.end(); ++i
) {
6901 hobject_t
first(i
->first
);
6902 if (!first
.is_max() && first
.pool
== -1)
6904 objects
[first
] = i
->second
;
6909 void ScrubMap::dump(Formatter
*f
) const
6911 f
->dump_stream("valid_through") << valid_through
;
6912 f
->dump_stream("incremental_since") << incr_since
;
6913 f
->open_array_section("objects");
6914 for (auto p
= objects
.cbegin(); p
!= objects
.cend(); ++p
) {
6915 f
->open_object_section("object");
6916 f
->dump_string("name", p
->first
.oid
.name
);
6917 f
->dump_unsigned("hash", p
->first
.get_hash());
6918 f
->dump_string("key", p
->first
.get_key());
6919 f
->dump_int("snapid", p
->first
.snap
);
6926 void ScrubMap::generate_test_instances(list
<ScrubMap
*>& o
)
6928 o
.push_back(new ScrubMap
);
6929 o
.push_back(new ScrubMap
);
6930 o
.back()->valid_through
= eversion_t(1, 2);
6931 o
.back()->incr_since
= eversion_t(3, 4);
6933 object::generate_test_instances(obj
);
6934 o
.back()->objects
[hobject_t(object_t("foo"), "fookey", 123, 456, 0, "")] = *obj
.back();
6936 o
.back()->objects
[hobject_t(object_t("bar"), string(), 123, 456, 0, "")] = *obj
.back();
6939 // -- ScrubMap::object --
6941 void ScrubMap::object::encode(ceph::buffer::list
& bl
) const
6943 bool compat_read_error
= read_error
|| ec_hash_mismatch
|| ec_size_mismatch
;
6944 ENCODE_START(10, 7, bl
);
6946 encode(negative
, bl
);
6949 encode(digest_present
, bl
);
6950 encode((uint32_t)0, bl
); // obsolete nlinks
6951 encode((uint32_t)0, bl
); // snapcolls
6952 encode(omap_digest
, bl
);
6953 encode(omap_digest_present
, bl
);
6954 encode(compat_read_error
, bl
);
6955 encode(stat_error
, bl
);
6956 encode(read_error
, bl
);
6957 encode(ec_hash_mismatch
, bl
);
6958 encode(ec_size_mismatch
, bl
);
6959 encode(large_omap_object_found
, bl
);
6960 encode(large_omap_object_key_count
, bl
);
6961 encode(large_omap_object_value_size
, bl
);
6962 encode(object_omap_bytes
, bl
);
6963 encode(object_omap_keys
, bl
);
6967 void ScrubMap::object::decode(ceph::buffer::list::const_iterator
& bl
)
6969 DECODE_START(10, bl
);
6971 bool tmp
, compat_read_error
= false;
6977 digest_present
= tmp
;
6981 set
<snapid_t
> snapcolls
;
6982 decode(snapcolls
, bl
);
6984 decode(omap_digest
, bl
);
6986 omap_digest_present
= tmp
;
6987 decode(compat_read_error
, bl
);
6990 if (struct_v
>= 8) {
6994 ec_hash_mismatch
= tmp
;
6996 ec_size_mismatch
= tmp
;
6998 // If older encoder found a read_error, set read_error
6999 if (compat_read_error
&& !read_error
&& !ec_hash_mismatch
&& !ec_size_mismatch
)
7001 if (struct_v
>= 9) {
7003 large_omap_object_found
= tmp
;
7004 decode(large_omap_object_key_count
, bl
);
7005 decode(large_omap_object_value_size
, bl
);
7007 if (struct_v
>= 10) {
7008 decode(object_omap_bytes
, bl
);
7009 decode(object_omap_keys
, bl
);
7014 void ScrubMap::object::dump(Formatter
*f
) const
7016 f
->dump_int("size", size
);
7017 f
->dump_int("negative", negative
);
7018 f
->open_array_section("attrs");
7019 for (auto p
= attrs
.cbegin(); p
!= attrs
.cend(); ++p
) {
7020 f
->open_object_section("attr");
7021 f
->dump_string("name", p
->first
);
7022 f
->dump_int("length", p
->second
.length());
7028 void ScrubMap::object::generate_test_instances(list
<object
*>& o
)
7030 o
.push_back(new object
);
7031 o
.push_back(new object
);
7032 o
.back()->negative
= true;
7033 o
.push_back(new object
);
7034 o
.back()->size
= 123;
7035 o
.back()->attrs
["foo"] = ceph::buffer::copy("foo", 3);
7036 o
.back()->attrs
["bar"] = ceph::buffer::copy("barval", 6);
7041 ostream
& operator<<(ostream
& out
, const OSDOp
& op
)
7043 out
<< ceph_osd_op_name(op
.op
.op
);
7044 if (ceph_osd_op_type_data(op
.op
.op
)) {
7047 case CEPH_OSD_OP_ASSERT_VER
:
7048 out
<< " v" << op
.op
.assert_ver
.ver
;
7050 case CEPH_OSD_OP_TRUNCATE
:
7051 out
<< " " << op
.op
.extent
.offset
;
7053 case CEPH_OSD_OP_MASKTRUNC
:
7054 case CEPH_OSD_OP_TRIMTRUNC
:
7055 out
<< " " << op
.op
.extent
.truncate_seq
<< "@"
7056 << (int64_t)op
.op
.extent
.truncate_size
;
7058 case CEPH_OSD_OP_ROLLBACK
:
7059 out
<< " " << snapid_t(op
.op
.snap
.snapid
);
7061 case CEPH_OSD_OP_WATCH
:
7062 out
<< " " << ceph_osd_watch_op_name(op
.op
.watch
.op
)
7063 << " cookie " << op
.op
.watch
.cookie
;
7064 if (op
.op
.watch
.gen
)
7065 out
<< " gen " << op
.op
.watch
.gen
;
7067 case CEPH_OSD_OP_NOTIFY
:
7068 out
<< " cookie " << op
.op
.notify
.cookie
;
7070 case CEPH_OSD_OP_COPY_GET
:
7071 out
<< " max " << op
.op
.copy_get
.max
;
7073 case CEPH_OSD_OP_COPY_FROM
:
7074 out
<< " ver " << op
.op
.copy_from
.src_version
;
7076 case CEPH_OSD_OP_SETALLOCHINT
:
7077 out
<< " object_size " << op
.op
.alloc_hint
.expected_object_size
7078 << " write_size " << op
.op
.alloc_hint
.expected_write_size
;
7080 case CEPH_OSD_OP_READ
:
7081 case CEPH_OSD_OP_SPARSE_READ
:
7082 case CEPH_OSD_OP_SYNC_READ
:
7083 case CEPH_OSD_OP_WRITE
:
7084 case CEPH_OSD_OP_WRITEFULL
:
7085 case CEPH_OSD_OP_ZERO
:
7086 case CEPH_OSD_OP_APPEND
:
7087 case CEPH_OSD_OP_MAPEXT
:
7088 case CEPH_OSD_OP_CMPEXT
:
7089 out
<< " " << op
.op
.extent
.offset
<< "~" << op
.op
.extent
.length
;
7090 if (op
.op
.extent
.truncate_seq
)
7091 out
<< " [" << op
.op
.extent
.truncate_seq
<< "@"
7092 << (int64_t)op
.op
.extent
.truncate_size
<< "]";
7094 out
<< " [" << ceph_osd_op_flag_string(op
.op
.flags
) << "]";
7096 // don't show any arg info
7099 } else if (ceph_osd_op_type_attr(op
.op
.op
)) {
7101 if (op
.op
.xattr
.name_len
&& op
.indata
.length()) {
7103 op
.indata
.write(0, op
.op
.xattr
.name_len
, out
);
7105 if (op
.op
.xattr
.value_len
)
7106 out
<< " (" << op
.op
.xattr
.value_len
<< ")";
7107 if (op
.op
.op
== CEPH_OSD_OP_CMPXATTR
)
7108 out
<< " op " << (int)op
.op
.xattr
.cmp_op
7109 << " mode " << (int)op
.op
.xattr
.cmp_mode
;
7110 } else if (ceph_osd_op_type_exec(op
.op
.op
)) {
7112 if (op
.op
.cls
.class_len
&& op
.indata
.length()) {
7114 op
.indata
.write(0, op
.op
.cls
.class_len
, out
);
7116 op
.indata
.write(op
.op
.cls
.class_len
, op
.op
.cls
.method_len
, out
);
7118 } else if (ceph_osd_op_type_pg(op
.op
.op
)) {
7120 case CEPH_OSD_OP_PGLS
:
7121 case CEPH_OSD_OP_PGLS_FILTER
:
7122 case CEPH_OSD_OP_PGNLS
:
7123 case CEPH_OSD_OP_PGNLS_FILTER
:
7124 out
<< " start_epoch " << op
.op
.pgls
.start_epoch
;
7126 case CEPH_OSD_OP_PG_HITSET_LS
:
7128 case CEPH_OSD_OP_PG_HITSET_GET
:
7129 out
<< " " << utime_t(op
.op
.hit_set_get
.stamp
);
7131 case CEPH_OSD_OP_SCRUBLS
:
7135 if (op
.indata
.length()) {
7136 out
<< " in=" << op
.indata
.length() << "b";
7138 if (op
.outdata
.length()) {
7139 out
<< " out=" << op
.outdata
.length() << "b";
7145 void OSDOp::split_osd_op_vector_out_data(vector
<OSDOp
>& ops
, ceph::buffer::list
& in
)
7147 auto datap
= in
.begin();
7148 for (unsigned i
= 0; i
< ops
.size(); i
++) {
7149 if (ops
[i
].op
.payload_len
) {
7150 datap
.copy(ops
[i
].op
.payload_len
, ops
[i
].outdata
);
7155 void OSDOp::merge_osd_op_vector_out_data(vector
<OSDOp
>& ops
, ceph::buffer::list
& out
)
7157 for (unsigned i
= 0; i
< ops
.size(); i
++) {
7158 ops
[i
].op
.payload_len
= ops
[i
].outdata
.length();
7159 if (ops
[i
].outdata
.length()) {
7160 out
.append(ops
[i
].outdata
);
7165 int prepare_info_keymap(
7167 map
<string
,bufferlist
> *km
,
7168 string
*key_to_remove
,
7171 pg_info_t
&last_written_info
,
7172 PastIntervals
&past_intervals
,
7173 bool dirty_big_info
,
7176 PerfCounters
*logger
,
7177 DoutPrefixProvider
*dpp
)
7180 encode(epoch
, (*km
)[string(epoch_key
)]);
7184 logger
->inc(l_osd_pg_info
);
7186 // try to do info efficiently?
7187 if (!dirty_big_info
&& try_fast_info
&&
7188 info
.last_update
> last_written_info
.last_update
) {
7189 pg_fast_info_t fast
;
7190 fast
.populate_from(info
);
7191 bool did
= fast
.try_apply_to(&last_written_info
);
7192 ceph_assert(did
); // we verified last_update increased above
7193 if (info
== last_written_info
) {
7194 encode(fast
, (*km
)[string(fastinfo_key
)]);
7196 logger
->inc(l_osd_pg_fastinfo
);
7200 ldpp_dout(dpp
, 30) << __func__
<< " fastinfo failed, info:\n";
7202 JSONFormatter
jf(true);
7203 jf
.dump_object("info", info
);
7207 *_dout
<< "\nlast_written_info:\n";
7208 JSONFormatter
jf(true);
7209 jf
.dump_object("last_written_info", last_written_info
);
7214 } else if (info
.last_update
<= last_written_info
.last_update
) {
7215 // clean up any potentially stale fastinfo key resulting from last_update
7216 // not moving forwards (e.g., a backwards jump during peering)
7217 *key_to_remove
= fastinfo_key
;
7220 last_written_info
= info
;
7222 // info. store purged_snaps separately.
7223 interval_set
<snapid_t
> purged_snaps
;
7224 purged_snaps
.swap(info
.purged_snaps
);
7225 encode(info
, (*km
)[string(info_key
)]);
7226 purged_snaps
.swap(info
.purged_snaps
);
7228 if (dirty_big_info
) {
7229 // potentially big stuff
7230 bufferlist
& bigbl
= (*km
)[string(biginfo_key
)];
7231 encode(past_intervals
, bigbl
);
7232 encode(info
.purged_snaps
, bigbl
);
7233 //dout(20) << "write_info bigbl " << bigbl.length() << dendl;
7235 logger
->inc(l_osd_pg_biginfo
);
7241 void create_pg_collection(
7242 ceph::os::Transaction
& t
, spg_t pgid
, int bits
)
7245 t
.create_collection(coll
, bits
);
7248 void init_pg_ondisk(
7249 ceph::os::Transaction
& t
,
7251 const pg_pool_t
*pool
)
7255 // Give a hint to the PG collection
7257 uint32_t pg_num
= pool
->get_pg_num();
7258 uint64_t expected_num_objects_pg
= pool
->expected_num_objects
/ pg_num
;
7259 encode(pg_num
, hint
);
7260 encode(expected_num_objects_pg
, hint
);
7261 uint32_t hint_type
= ceph::os::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS
;
7262 t
.collection_hint(coll
, hint_type
, hint
);
7265 ghobject_t
pgmeta_oid(pgid
.make_pgmeta_oid());
7266 t
.touch(coll
, pgmeta_oid
);
7267 map
<string
,bufferlist
> values
;
7268 __u8 struct_v
= pg_latest_struct_v
;
7269 encode(struct_v
, values
[string(infover_key
)]);
7270 t
.omap_setkeys(coll
, pgmeta_oid
, values
);
7273 PGLSFilter::PGLSFilter() : cct(nullptr)
7277 PGLSFilter::~PGLSFilter()
7281 int PGLSPlainFilter::init(ceph::bufferlist::const_iterator
¶ms
)
7284 decode(xattr
, params
);
7285 decode(val
, params
);
7286 } catch (ceph::buffer::error
&e
) {
7292 bool PGLSPlainFilter::filter(const hobject_t
& obj
,
7293 const ceph::bufferlist
& xattr_data
) const
7295 return xattr_data
.contents_equal(val
.c_str(), val
.size());