]> git.proxmox.com Git - ceph.git/blame - ceph/src/osd/osd_types.cc
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / osd / osd_types.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2011 New Dream Network
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
1e59de90 18#include <algorithm>
9f95a23c
TL
19#include <list>
20#include <map>
21#include <ostream>
22#include <sstream>
23#include <set>
24#include <string>
25#include <utility>
26#include <vector>
27
28
7c673cae
FG
29#include <boost/assign/list_of.hpp>
30
7c673cae 31#include "include/ceph_features.h"
9f95a23c 32#include "include/encoding.h"
11fdf7f2 33#include "include/stringify.h"
7c673cae
FG
34extern "C" {
35#include "crush/hash.h"
36}
9f95a23c
TL
37
38#include "common/Formatter.h"
f67539c2 39#include "common/StackStringStream.h"
20effc67 40#include "include/utime_fmt.h"
7c673cae 41#include "OSDMap.h"
9f95a23c 42#include "osd_types.h"
20effc67 43#include "osd_types_fmt.h"
9f95a23c
TL
44#include "os/Transaction.h"
45
46using std::list;
47using std::make_pair;
48using std::map;
49using std::ostream;
9f95a23c
TL
50using std::pair;
51using std::set;
20effc67 52using std::shared_ptr;
9f95a23c 53using std::string;
9f95a23c
TL
54using std::unique_ptr;
55using std::vector;
56
f67539c2 57using ceph::bufferlist;
9f95a23c
TL
58using ceph::decode;
59using ceph::decode_nohead;
60using ceph::encode;
61using ceph::encode_nohead;
62using ceph::Formatter;
f67539c2
TL
63using ceph::make_timespan;
64using ceph::JSONFormatter;
9f95a23c
TL
65
66using namespace std::literals;
7c673cae
FG
67
68const char *ceph_osd_flag_name(unsigned flag)
69{
70 switch (flag) {
71 case CEPH_OSD_FLAG_ACK: return "ack";
72 case CEPH_OSD_FLAG_ONNVRAM: return "onnvram";
73 case CEPH_OSD_FLAG_ONDISK: return "ondisk";
74 case CEPH_OSD_FLAG_RETRY: return "retry";
75 case CEPH_OSD_FLAG_READ: return "read";
76 case CEPH_OSD_FLAG_WRITE: return "write";
77 case CEPH_OSD_FLAG_ORDERSNAP: return "ordersnap";
78 case CEPH_OSD_FLAG_PEERSTAT_OLD: return "peerstat_old";
79 case CEPH_OSD_FLAG_BALANCE_READS: return "balance_reads";
80 case CEPH_OSD_FLAG_PARALLELEXEC: return "parallelexec";
81 case CEPH_OSD_FLAG_PGOP: return "pgop";
82 case CEPH_OSD_FLAG_EXEC: return "exec";
83 case CEPH_OSD_FLAG_EXEC_PUBLIC: return "exec_public";
84 case CEPH_OSD_FLAG_LOCALIZE_READS: return "localize_reads";
85 case CEPH_OSD_FLAG_RWORDERED: return "rwordered";
86 case CEPH_OSD_FLAG_IGNORE_CACHE: return "ignore_cache";
87 case CEPH_OSD_FLAG_SKIPRWLOCKS: return "skiprwlocks";
88 case CEPH_OSD_FLAG_IGNORE_OVERLAY: return "ignore_overlay";
89 case CEPH_OSD_FLAG_FLUSH: return "flush";
90 case CEPH_OSD_FLAG_MAP_SNAP_CLONE: return "map_snap_clone";
91 case CEPH_OSD_FLAG_ENFORCE_SNAPC: return "enforce_snapc";
92 case CEPH_OSD_FLAG_REDIRECTED: return "redirected";
93 case CEPH_OSD_FLAG_KNOWN_REDIR: return "known_if_redirected";
94 case CEPH_OSD_FLAG_FULL_TRY: return "full_try";
95 case CEPH_OSD_FLAG_FULL_FORCE: return "full_force";
224ce89b 96 case CEPH_OSD_FLAG_IGNORE_REDIRECT: return "ignore_redirect";
9f95a23c 97 case CEPH_OSD_FLAG_RETURNVEC: return "returnvec";
20effc67 98 case CEPH_OSD_FLAG_SUPPORTSPOOLEIO: return "supports_pool_eio";
7c673cae
FG
99 default: return "???";
100 }
101}
102
103string ceph_osd_flag_string(unsigned flags)
104{
105 string s;
106 for (unsigned i=0; i<32; ++i) {
107 if (flags & (1u<<i)) {
108 if (s.length())
109 s += "+";
110 s += ceph_osd_flag_name(1u << i);
111 }
112 }
113 if (s.length())
114 return s;
115 return string("-");
116}
117
118const char * ceph_osd_op_flag_name(unsigned flag)
119{
120 const char *name;
121
122 switch(flag) {
123 case CEPH_OSD_OP_FLAG_EXCL:
124 name = "excl";
125 break;
126 case CEPH_OSD_OP_FLAG_FAILOK:
127 name = "failok";
128 break;
129 case CEPH_OSD_OP_FLAG_FADVISE_RANDOM:
130 name = "fadvise_random";
131 break;
132 case CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL:
133 name = "fadvise_sequential";
134 break;
135 case CEPH_OSD_OP_FLAG_FADVISE_WILLNEED:
136 name = "favise_willneed";
137 break;
138 case CEPH_OSD_OP_FLAG_FADVISE_DONTNEED:
139 name = "fadvise_dontneed";
140 break;
141 case CEPH_OSD_OP_FLAG_FADVISE_NOCACHE:
142 name = "fadvise_nocache";
143 break;
11fdf7f2
TL
144 case CEPH_OSD_OP_FLAG_WITH_REFERENCE:
145 name = "with_reference";
146 break;
91327a77
AA
147 case CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE:
148 name = "bypass_clean_cache";
149 break;
7c673cae
FG
150 default:
151 name = "???";
152 };
153
154 return name;
155}
156
157string ceph_osd_op_flag_string(unsigned flags)
158{
159 string s;
160 for (unsigned i=0; i<32; ++i) {
161 if (flags & (1u<<i)) {
162 if (s.length())
163 s += "+";
164 s += ceph_osd_op_flag_name(1u << i);
165 }
166 }
167 if (s.length())
168 return s;
169 return string("-");
170}
171
172string ceph_osd_alloc_hint_flag_string(unsigned flags)
173{
174 string s;
175 for (unsigned i=0; i<32; ++i) {
176 if (flags & (1u<<i)) {
177 if (s.length())
178 s += "+";
179 s += ceph_osd_alloc_hint_flag_name(1u << i);
180 }
181 }
182 if (s.length())
183 return s;
184 return string("-");
185}
186
9f95a23c 187void pg_shard_t::encode(ceph::buffer::list &bl) const
7c673cae
FG
188{
189 ENCODE_START(1, 1, bl);
11fdf7f2
TL
190 encode(osd, bl);
191 encode(shard, bl);
7c673cae
FG
192 ENCODE_FINISH(bl);
193}
9f95a23c 194void pg_shard_t::decode(ceph::buffer::list::const_iterator &bl)
7c673cae
FG
195{
196 DECODE_START(1, bl);
11fdf7f2
TL
197 decode(osd, bl);
198 decode(shard, bl);
7c673cae
FG
199 DECODE_FINISH(bl);
200}
201
202ostream &operator<<(ostream &lhs, const pg_shard_t &rhs)
203{
204 if (rhs.is_undefined())
205 return lhs << "?";
206 if (rhs.shard == shard_id_t::NO_SHARD)
b32b8144
FG
207 return lhs << rhs.get_osd();
208 return lhs << rhs.get_osd() << '(' << (unsigned)(rhs.shard) << ')';
7c673cae
FG
209}
210
11fdf7f2
TL
211void dump(Formatter* f, const osd_alerts_t& alerts)
212{
213 for (auto& a : alerts) {
214 string s0 = " osd: ";
215 s0 += stringify(a.first);
216 string s;
217 for (auto& aa : a.second) {
218 s = s0;
219 s += " ";
220 s += aa.first;
221 s += ":";
222 s += aa.second;
223 f->dump_string("alert", s);
224 }
225 }
226}
227
7c673cae
FG
228// -- osd_reqid_t --
229void osd_reqid_t::dump(Formatter *f) const
230{
231 f->dump_stream("name") << name;
232 f->dump_int("inc", inc);
233 f->dump_unsigned("tid", tid);
234}
235
236void osd_reqid_t::generate_test_instances(list<osd_reqid_t*>& o)
237{
238 o.push_back(new osd_reqid_t);
239 o.push_back(new osd_reqid_t(entity_name_t::CLIENT(123), 1, 45678));
240}
241
242// -- object_locator_t --
243
9f95a23c 244void object_locator_t::encode(ceph::buffer::list& bl) const
7c673cae
FG
245{
246 // verify that nobody's corrupted the locator
11fdf7f2 247 ceph_assert(hash == -1 || key.empty());
7c673cae
FG
248 __u8 encode_compat = 3;
249 ENCODE_START(6, encode_compat, bl);
11fdf7f2 250 encode(pool, bl);
7c673cae 251 int32_t preferred = -1; // tell old code there is no preferred osd (-1).
11fdf7f2
TL
252 encode(preferred, bl);
253 encode(key, bl);
254 encode(nspace, bl);
255 encode(hash, bl);
7c673cae 256 if (hash != -1)
11fdf7f2 257 encode_compat = std::max<std::uint8_t>(encode_compat, 6); // need to interpret the hash
7c673cae
FG
258 ENCODE_FINISH_NEW_COMPAT(bl, encode_compat);
259}
260
9f95a23c 261void object_locator_t::decode(ceph::buffer::list::const_iterator& p)
7c673cae
FG
262{
263 DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, p);
264 if (struct_v < 2) {
265 int32_t op;
11fdf7f2 266 decode(op, p);
7c673cae
FG
267 pool = op;
268 int16_t pref;
11fdf7f2 269 decode(pref, p);
7c673cae 270 } else {
11fdf7f2 271 decode(pool, p);
7c673cae 272 int32_t preferred;
11fdf7f2 273 decode(preferred, p);
7c673cae 274 }
11fdf7f2 275 decode(key, p);
7c673cae 276 if (struct_v >= 5)
11fdf7f2 277 decode(nspace, p);
7c673cae 278 if (struct_v >= 6)
11fdf7f2 279 decode(hash, p);
7c673cae
FG
280 else
281 hash = -1;
282 DECODE_FINISH(p);
283 // verify that nobody's corrupted the locator
11fdf7f2 284 ceph_assert(hash == -1 || key.empty());
7c673cae
FG
285}
286
287void object_locator_t::dump(Formatter *f) const
288{
289 f->dump_int("pool", pool);
290 f->dump_string("key", key);
291 f->dump_string("namespace", nspace);
292 f->dump_int("hash", hash);
293}
294
295void object_locator_t::generate_test_instances(list<object_locator_t*>& o)
296{
297 o.push_back(new object_locator_t);
298 o.push_back(new object_locator_t(123));
299 o.push_back(new object_locator_t(123, 876));
300 o.push_back(new object_locator_t(1, "n2"));
301 o.push_back(new object_locator_t(1234, "", "key"));
302 o.push_back(new object_locator_t(12, "n1", "key2"));
303}
304
305// -- request_redirect_t --
9f95a23c 306void request_redirect_t::encode(ceph::buffer::list& bl) const
7c673cae
FG
307{
308 ENCODE_START(1, 1, bl);
11fdf7f2
TL
309 encode(redirect_locator, bl);
310 encode(redirect_object, bl);
311 // legacy of the removed osd_instructions member
312 encode((uint32_t)0, bl);
7c673cae
FG
313 ENCODE_FINISH(bl);
314}
315
9f95a23c 316void request_redirect_t::decode(ceph::buffer::list::const_iterator& bl)
7c673cae
FG
317{
318 DECODE_START(1, bl);
11fdf7f2
TL
319 uint32_t legacy_osd_instructions_len;
320 decode(redirect_locator, bl);
321 decode(redirect_object, bl);
322 decode(legacy_osd_instructions_len, bl);
323 if (legacy_osd_instructions_len) {
9f95a23c 324 bl += legacy_osd_instructions_len;
11fdf7f2 325 }
7c673cae
FG
326 DECODE_FINISH(bl);
327}
328
329void request_redirect_t::dump(Formatter *f) const
330{
331 f->dump_string("object", redirect_object);
332 f->open_object_section("locator");
333 redirect_locator.dump(f);
334 f->close_section(); // locator
335}
336
337void request_redirect_t::generate_test_instances(list<request_redirect_t*>& o)
338{
339 object_locator_t loc(1, "redir_obj");
340 o.push_back(new request_redirect_t());
341 o.push_back(new request_redirect_t(loc, 0));
342 o.push_back(new request_redirect_t(loc, "redir_obj"));
343 o.push_back(new request_redirect_t(loc));
344}
345
346void objectstore_perf_stat_t::dump(Formatter *f) const
347{
11fdf7f2
TL
348 // *_ms values just for compatibility.
349 f->dump_float("commit_latency_ms", os_commit_latency_ns / 1000000.0);
350 f->dump_float("apply_latency_ms", os_apply_latency_ns / 1000000.0);
351 f->dump_unsigned("commit_latency_ns", os_commit_latency_ns);
352 f->dump_unsigned("apply_latency_ns", os_apply_latency_ns);
7c673cae
FG
353}
354
9f95a23c 355void objectstore_perf_stat_t::encode(ceph::buffer::list &bl, uint64_t features) const
7c673cae 356{
11fdf7f2
TL
357 uint8_t target_v = 2;
358 if (!HAVE_FEATURE(features, OS_PERF_STAT_NS)) {
359 target_v = 1;
360 }
361 ENCODE_START(target_v, target_v, bl);
362 if (target_v >= 2) {
363 encode(os_commit_latency_ns, bl);
364 encode(os_apply_latency_ns, bl);
365 } else {
366 constexpr auto NS_PER_MS = std::chrono::nanoseconds(1ms).count();
367 uint32_t commit_latency_ms = os_commit_latency_ns / NS_PER_MS;
368 uint32_t apply_latency_ms = os_apply_latency_ns / NS_PER_MS;
369 encode(commit_latency_ms, bl); // for compatibility with older monitor.
370 encode(apply_latency_ms, bl); // for compatibility with older monitor.
371 }
7c673cae
FG
372 ENCODE_FINISH(bl);
373}
374
9f95a23c 375void objectstore_perf_stat_t::decode(ceph::buffer::list::const_iterator &bl)
7c673cae 376{
11fdf7f2
TL
377 DECODE_START(2, bl);
378 if (struct_v >= 2) {
379 decode(os_commit_latency_ns, bl);
380 decode(os_apply_latency_ns, bl);
381 } else {
382 uint32_t commit_latency_ms;
383 uint32_t apply_latency_ms;
384 decode(commit_latency_ms, bl);
385 decode(apply_latency_ms, bl);
386 constexpr auto NS_PER_MS = std::chrono::nanoseconds(1ms).count();
387 os_commit_latency_ns = commit_latency_ms * NS_PER_MS;
388 os_apply_latency_ns = apply_latency_ms * NS_PER_MS;
389 }
7c673cae
FG
390 DECODE_FINISH(bl);
391}
392
393void objectstore_perf_stat_t::generate_test_instances(std::list<objectstore_perf_stat_t*>& o)
394{
395 o.push_back(new objectstore_perf_stat_t());
396 o.push_back(new objectstore_perf_stat_t());
11fdf7f2
TL
397 o.back()->os_commit_latency_ns = 20000000;
398 o.back()->os_apply_latency_ns = 30000000;
7c673cae
FG
399}
400
401// -- osd_stat_t --
ded94939 402void osd_stat_t::dump(Formatter *f, bool with_net) const
7c673cae 403{
31f18b77
FG
404 f->dump_unsigned("up_from", up_from);
405 f->dump_unsigned("seq", seq);
35e4c445 406 f->dump_unsigned("num_pgs", num_pgs);
81eedcae
TL
407 f->dump_unsigned("num_osds", num_osds);
408 f->dump_unsigned("num_per_pool_osds", num_per_pool_osds);
9f95a23c 409 f->dump_unsigned("num_per_pool_omap_osds", num_per_pool_omap_osds);
11fdf7f2
TL
410
411 /// dump legacy stats fields to ensure backward compatibility.
412 f->dump_unsigned("kb", statfs.kb());
413 f->dump_unsigned("kb_used", statfs.kb_used_raw());
414 f->dump_unsigned("kb_used_data", statfs.kb_used_data());
415 f->dump_unsigned("kb_used_omap", statfs.kb_used_omap());
416 f->dump_unsigned("kb_used_meta", statfs.kb_used_internal_metadata());
417 f->dump_unsigned("kb_avail", statfs.kb_avail());
418 ////////////////////
419
420 f->open_object_section("statfs");
421 statfs.dump(f);
422 f->close_section();
7c673cae
FG
423 f->open_array_section("hb_peers");
424 for (auto p : hb_peers)
425 f->dump_int("osd", p);
426 f->close_section();
427 f->dump_int("snap_trim_queue_len", snap_trim_queue_len);
428 f->dump_int("num_snap_trimming", num_snap_trimming);
11fdf7f2 429 f->dump_int("num_shards_repaired", num_shards_repaired);
7c673cae
FG
430 f->open_object_section("op_queue_age_hist");
431 op_queue_age_hist.dump(f);
432 f->close_section();
433 f->open_object_section("perf_stat");
434 os_perf_stat.dump(f);
435 f->close_section();
11fdf7f2
TL
436 f->open_array_section("alerts");
437 ::dump(f, os_alerts);
438 f->close_section();
ded94939 439 if (with_net) {
9f95a23c
TL
440 dump_ping_time(f);
441 }
442}
443
444void osd_stat_t::dump_ping_time(Formatter *f) const
445{
eafe8130
TL
446 f->open_array_section("network_ping_times");
447 for (auto &i : hb_pingtime) {
448 f->open_object_section("entry");
449 f->dump_int("osd", i.first);
450 const time_t lu(i.second.last_update);
451 char buffer[26];
452 string lustr(ctime_r(&lu, buffer));
453 lustr.pop_back(); // Remove trailing \n
454 f->dump_string("last update", lustr);
455 f->open_array_section("interfaces");
456 f->open_object_section("interface");
457 f->dump_string("interface", "back");
458 f->open_object_section("average");
9f95a23c
TL
459 f->dump_float("1min", i.second.back_pingtime[0]/1000.0);
460 f->dump_float("5min", i.second.back_pingtime[1]/1000.0);
461 f->dump_float("15min", i.second.back_pingtime[2]/1000.0);
eafe8130
TL
462 f->close_section(); // average
463 f->open_object_section("min");
9f95a23c
TL
464 f->dump_float("1min", i.second.back_min[0]/1000.0);
465 f->dump_float("5min", i.second.back_min[1]/1000.0);
466 f->dump_float("15min", i.second.back_min[2]/1000.0);
eafe8130
TL
467 f->close_section(); // min
468 f->open_object_section("max");
9f95a23c
TL
469 f->dump_float("1min", i.second.back_max[0]/1000.0);
470 f->dump_float("5min", i.second.back_max[1]/1000.0);
471 f->dump_float("15min", i.second.back_max[2]/1000.0);
eafe8130 472 f->close_section(); // max
9f95a23c 473 f->dump_float("last", i.second.back_last/1000.0);
eafe8130
TL
474 f->close_section(); // interface
475
476 if (i.second.front_pingtime[0] != 0) {
477 f->open_object_section("interface");
478 f->dump_string("interface", "front");
479 f->open_object_section("average");
9f95a23c
TL
480 f->dump_float("1min", i.second.front_pingtime[0]/1000.0);
481 f->dump_float("5min", i.second.front_pingtime[1]/1000.0);
482 f->dump_float("15min", i.second.front_pingtime[2]/1000.0);
eafe8130
TL
483 f->close_section(); // average
484 f->open_object_section("min");
9f95a23c
TL
485 f->dump_float("1min", i.second.front_min[0]/1000.0);
486 f->dump_float("5min", i.second.front_min[1]/1000.0);
487 f->dump_float("15min", i.second.front_min[2]/1000.0);
eafe8130
TL
488 f->close_section(); // min
489 f->open_object_section("max");
9f95a23c
TL
490 f->dump_float("1min", i.second.front_max[0]/1000.0);
491 f->dump_float("5min", i.second.front_max[1]/1000.0);
492 f->dump_float("15min", i.second.front_max[2]/1000.0);
eafe8130 493 f->close_section(); // max
9f95a23c 494 f->dump_float("last", i.second.front_last/1000.0);
eafe8130
TL
495 f->close_section(); // interface
496 }
497 f->close_section(); // interfaces
498 f->close_section(); // entry
499 }
500 f->close_section(); // network_ping_time
7c673cae
FG
501}
502
9f95a23c 503void osd_stat_t::encode(ceph::buffer::list &bl, uint64_t features) const
11fdf7f2 504{
eafe8130 505 ENCODE_START(14, 2, bl);
11fdf7f2
TL
506
507 //////// for compatibility ////////
508 int64_t kb = statfs.kb();
509 int64_t kb_used = statfs.kb_used_raw();
510 int64_t kb_avail = statfs.kb_avail();
511 encode(kb, bl);
512 encode(kb_used, bl);
513 encode(kb_avail, bl);
514 ///////////////////////////////////
515
516 encode(snap_trim_queue_len, bl);
517 encode(num_snap_trimming, bl);
518 encode(hb_peers, bl);
519 encode((uint32_t)0, bl);
520 encode(op_queue_age_hist, bl);
521 encode(os_perf_stat, bl, features);
522 encode(up_from, bl);
523 encode(seq, bl);
524 encode(num_pgs, bl);
525
526 //////// for compatibility ////////
527 int64_t kb_used_data = statfs.kb_used_data();
528 int64_t kb_used_omap = statfs.kb_used_omap();
529 int64_t kb_used_meta = statfs.kb_used_internal_metadata();
530 encode(kb_used_data, bl);
531 encode(kb_used_omap, bl);
532 encode(kb_used_meta, bl);
533 encode(statfs, bl);
534 ///////////////////////////////////
535 encode(os_alerts, bl);
536 encode(num_shards_repaired, bl);
81eedcae
TL
537 encode(num_osds, bl);
538 encode(num_per_pool_osds, bl);
9f95a23c 539 encode(num_per_pool_omap_osds, bl);
eafe8130
TL
540
541 // hb_pingtime map
542 encode((int)hb_pingtime.size(), bl);
543 for (auto i : hb_pingtime) {
544 encode(i.first, bl); // osd
545 encode(i.second.last_update, bl);
546 encode(i.second.back_pingtime[0], bl);
547 encode(i.second.back_pingtime[1], bl);
548 encode(i.second.back_pingtime[2], bl);
549 encode(i.second.back_min[0], bl);
550 encode(i.second.back_min[1], bl);
551 encode(i.second.back_min[2], bl);
552 encode(i.second.back_max[0], bl);
553 encode(i.second.back_max[1], bl);
554 encode(i.second.back_max[2], bl);
555 encode(i.second.back_last, bl);
556 encode(i.second.front_pingtime[0], bl);
557 encode(i.second.front_pingtime[1], bl);
558 encode(i.second.front_pingtime[2], bl);
559 encode(i.second.front_min[0], bl);
560 encode(i.second.front_min[1], bl);
561 encode(i.second.front_min[2], bl);
562 encode(i.second.front_max[0], bl);
563 encode(i.second.front_max[1], bl);
564 encode(i.second.front_max[2], bl);
565 encode(i.second.front_last, bl);
566 }
7c673cae
FG
567 ENCODE_FINISH(bl);
568}
569
9f95a23c 570void osd_stat_t::decode(ceph::buffer::list::const_iterator &bl)
7c673cae 571{
11fdf7f2
TL
572 int64_t kb, kb_used,kb_avail;
573 int64_t kb_used_data, kb_used_omap, kb_used_meta;
eafe8130 574 DECODE_START_LEGACY_COMPAT_LEN(14, 2, 2, bl);
11fdf7f2
TL
575 decode(kb, bl);
576 decode(kb_used, bl);
577 decode(kb_avail, bl);
578 decode(snap_trim_queue_len, bl);
579 decode(num_snap_trimming, bl);
580 decode(hb_peers, bl);
7c673cae 581 vector<int> num_hb_out;
11fdf7f2 582 decode(num_hb_out, bl);
7c673cae 583 if (struct_v >= 3)
11fdf7f2 584 decode(op_queue_age_hist, bl);
7c673cae 585 if (struct_v >= 4)
11fdf7f2 586 decode(os_perf_stat, bl);
31f18b77 587 if (struct_v >= 6) {
11fdf7f2
TL
588 decode(up_from, bl);
589 decode(seq, bl);
31f18b77 590 }
35e4c445 591 if (struct_v >= 7) {
11fdf7f2
TL
592 decode(num_pgs, bl);
593 }
594 if (struct_v >= 8) {
595 decode(kb_used_data, bl);
596 decode(kb_used_omap, bl);
597 decode(kb_used_meta, bl);
598 } else {
599 kb_used_data = kb_used;
600 kb_used_omap = 0;
601 kb_used_meta = 0;
602 }
603 if (struct_v >= 9) {
604 decode(statfs, bl);
605 } else {
606 statfs.reset();
607 statfs.total = kb << 10;
608 statfs.available = kb_avail << 10;
609 // actually it's totally unexpected to have ststfs.total < statfs.available
610 // here but unfortunately legacy generate_test_instances produced such a
611 // case hence inserting some handling rather than assert
612 statfs.internally_reserved =
613 statfs.total > statfs.available ? statfs.total - statfs.available : 0;
614 kb_used <<= 10;
615 if ((int64_t)statfs.internally_reserved > kb_used) {
616 statfs.internally_reserved -= kb_used;
617 } else {
618 statfs.internally_reserved = 0;
619 }
620 statfs.allocated = kb_used_data << 10;
621 statfs.omap_allocated = kb_used_omap << 10;
622 statfs.internal_metadata = kb_used_meta << 10;
623 }
624 if (struct_v >= 10) {
625 decode(os_alerts, bl);
626 } else {
627 os_alerts.clear();
628 }
629 if (struct_v >= 11) {
630 decode(num_shards_repaired, bl);
631 } else {
632 num_shards_repaired = 0;
35e4c445 633 }
81eedcae
TL
634 if (struct_v >= 12) {
635 decode(num_osds, bl);
636 decode(num_per_pool_osds, bl);
637 } else {
638 num_osds = 0;
639 num_per_pool_osds = 0;
640 }
eafe8130 641 if (struct_v >= 13) {
9f95a23c
TL
642 decode(num_per_pool_omap_osds, bl);
643 } else {
644 num_per_pool_omap_osds = 0;
eafe8130
TL
645 }
646 hb_pingtime.clear();
647 if (struct_v >= 14) {
648 int count;
649 decode(count, bl);
650 for (int i = 0 ; i < count ; i++) {
651 int osd;
652 decode(osd, bl);
653 struct Interfaces ifs;
654 decode(ifs.last_update, bl);
655 decode(ifs.back_pingtime[0],bl);
656 decode(ifs.back_pingtime[1], bl);
657 decode(ifs.back_pingtime[2], bl);
658 decode(ifs.back_min[0],bl);
659 decode(ifs.back_min[1], bl);
660 decode(ifs.back_min[2], bl);
661 decode(ifs.back_max[0],bl);
662 decode(ifs.back_max[1], bl);
663 decode(ifs.back_max[2], bl);
664 decode(ifs.back_last, bl);
665 decode(ifs.front_pingtime[0], bl);
666 decode(ifs.front_pingtime[1], bl);
667 decode(ifs.front_pingtime[2], bl);
668 decode(ifs.front_min[0], bl);
669 decode(ifs.front_min[1], bl);
670 decode(ifs.front_min[2], bl);
671 decode(ifs.front_max[0], bl);
672 decode(ifs.front_max[1], bl);
673 decode(ifs.front_max[2], bl);
674 decode(ifs.front_last, bl);
675 hb_pingtime[osd] = ifs;
676 }
677 }
7c673cae
FG
678 DECODE_FINISH(bl);
679}
680
681void osd_stat_t::generate_test_instances(std::list<osd_stat_t*>& o)
682{
683 o.push_back(new osd_stat_t);
684
685 o.push_back(new osd_stat_t);
11fdf7f2
TL
686 list<store_statfs_t*> ll;
687 store_statfs_t::generate_test_instances(ll);
688 o.back()->statfs = *ll.back();
7c673cae
FG
689 o.back()->hb_peers.push_back(7);
690 o.back()->snap_trim_queue_len = 8;
691 o.back()->num_snap_trimming = 99;
11fdf7f2
TL
692 o.back()->num_shards_repaired = 101;
693 o.back()->os_alerts[0].emplace(
694 "some alert", "some alert details");
695 o.back()->os_alerts[1].emplace(
696 "some alert2", "some alert2 details");
eafe8130
TL
697 struct Interfaces gen_interfaces = {
698 123456789, { 1000, 900, 800 }, { 990, 890, 790 }, { 1010, 910, 810 }, 1001,
699 { 1100, 1000, 900 }, { 1090, 990, 890 }, { 1110, 1010, 910 }, 1101 };
700 o.back()->hb_pingtime[20] = gen_interfaces;
701 gen_interfaces = {
702 987654321, { 100, 200, 300 }, { 90, 190, 290 }, { 110, 210, 310 }, 101 };
703 o.back()->hb_pingtime[30] = gen_interfaces;
7c673cae
FG
704}
705
706// -- pg_t --
707
708int pg_t::print(char *o, int maxlen) const
709{
11fdf7f2 710 return snprintf(o, maxlen, "%llu.%x", (unsigned long long)pool(), ps());
7c673cae
FG
711}
712
713bool pg_t::parse(const char *s)
714{
715 uint64_t ppool;
716 uint32_t pseed;
11fdf7f2 717 int r = sscanf(s, "%llu.%x", (long long unsigned *)&ppool, &pseed);
7c673cae
FG
718 if (r < 2)
719 return false;
720 m_pool = ppool;
721 m_seed = pseed;
7c673cae
FG
722 return true;
723}
724
725bool spg_t::parse(const char *s)
726{
7c673cae
FG
727 shard = shard_id_t::NO_SHARD;
728 uint64_t ppool;
729 uint32_t pseed;
7c673cae
FG
730 uint32_t pshard;
731 int r = sscanf(s, "%llu.%x", (long long unsigned *)&ppool, &pseed);
732 if (r < 2)
733 return false;
734 pgid.set_pool(ppool);
735 pgid.set_ps(pseed);
736
11fdf7f2 737 const char *p = strchr(s, 's');
7c673cae 738 if (p) {
11fdf7f2 739 r = sscanf(p, "s%u", &pshard);
7c673cae
FG
740 if (r == 1) {
741 shard = shard_id_t(pshard);
742 } else {
743 return false;
744 }
745 }
746 return true;
747}
748
749char *spg_t::calc_name(char *buf, const char *suffix_backwords) const
750{
751 while (*suffix_backwords)
752 *--buf = *suffix_backwords++;
753
754 if (!is_no_shard()) {
755 buf = ritoa<uint8_t, 10>((uint8_t)shard.id, buf);
756 *--buf = 's';
757 }
758
759 return pgid.calc_name(buf, "");
760}
761
1e59de90
TL
762std::string spg_t::calc_name_sring() const
763{
764 char buf[spg_t::calc_name_buf_size];
765 buf[spg_t::calc_name_buf_size - 1] = '\0';
766 return string{calc_name(buf + spg_t::calc_name_buf_size - 1, "")};
767}
768
7c673cae
FG
769ostream& operator<<(ostream& out, const spg_t &pg)
770{
771 char buf[spg_t::calc_name_buf_size];
772 buf[spg_t::calc_name_buf_size - 1] = '\0';
773 out << pg.calc_name(buf + spg_t::calc_name_buf_size - 1, "");
774 return out;
775}
776
777pg_t pg_t::get_ancestor(unsigned old_pg_num) const
778{
779 int old_bits = cbits(old_pg_num);
780 int old_mask = (1 << old_bits) - 1;
781 pg_t ret = *this;
782 ret.m_seed = ceph_stable_mod(m_seed, old_pg_num, old_mask);
783 return ret;
784}
785
786bool pg_t::is_split(unsigned old_pg_num, unsigned new_pg_num, set<pg_t> *children) const
787{
11fdf7f2
TL
788 //ceph_assert(m_seed < old_pg_num);
789 if (m_seed >= old_pg_num) {
790 // degenerate case
791 return false;
792 }
7c673cae
FG
793 if (new_pg_num <= old_pg_num)
794 return false;
795
796 bool split = false;
797 if (true) {
798 unsigned old_bits = cbits(old_pg_num);
799 unsigned old_mask = (1 << old_bits) - 1;
800 for (unsigned n = 1; ; n++) {
801 unsigned next_bit = (n << (old_bits-1));
802 unsigned s = next_bit | m_seed;
803
804 if (s < old_pg_num || s == m_seed)
805 continue;
806 if (s >= new_pg_num)
807 break;
808 if ((unsigned)ceph_stable_mod(s, old_pg_num, old_mask) == m_seed) {
809 split = true;
810 if (children)
11fdf7f2 811 children->insert(pg_t(s, m_pool));
7c673cae
FG
812 }
813 }
814 }
815 if (false) {
816 // brute force
817 int old_bits = cbits(old_pg_num);
818 int old_mask = (1 << old_bits) - 1;
819 for (unsigned x = old_pg_num; x < new_pg_num; ++x) {
820 unsigned o = ceph_stable_mod(x, old_pg_num, old_mask);
821 if (o == m_seed) {
822 split = true;
11fdf7f2 823 children->insert(pg_t(x, m_pool));
7c673cae
FG
824 }
825 }
826 }
827 return split;
828}
829
830unsigned pg_t::get_split_bits(unsigned pg_num) const {
831 if (pg_num == 1)
832 return 0;
11fdf7f2 833 ceph_assert(pg_num > 1);
7c673cae
FG
834
835 // Find unique p such that pg_num \in [2^(p-1), 2^p)
836 unsigned p = cbits(pg_num);
11fdf7f2 837 ceph_assert(p); // silence coverity #751330
7c673cae
FG
838
839 if ((m_seed % (1<<(p-1))) < (pg_num % (1<<(p-1))))
840 return p;
841 else
842 return p - 1;
843}
844
11fdf7f2
TL
845bool pg_t::is_merge_source(
846 unsigned old_pg_num,
847 unsigned new_pg_num,
848 pg_t *parent) const
849{
850 if (m_seed < old_pg_num &&
851 m_seed >= new_pg_num) {
852 if (parent) {
853 pg_t t = *this;
854 while (t.m_seed >= new_pg_num) {
855 t = t.get_parent();
856 }
857 *parent = t;
858 }
859 return true;
860 }
861 return false;
862}
863
7c673cae
FG
864pg_t pg_t::get_parent() const
865{
866 unsigned bits = cbits(m_seed);
11fdf7f2 867 ceph_assert(bits);
7c673cae
FG
868 pg_t retval = *this;
869 retval.m_seed &= ~((~0)<<(bits - 1));
870 return retval;
871}
872
873hobject_t pg_t::get_hobj_start() const
874{
11fdf7f2 875 return hobject_t(object_t(), string(), 0, m_seed, m_pool,
7c673cae
FG
876 string());
877}
878
879hobject_t pg_t::get_hobj_end(unsigned pg_num) const
880{
881 // note: this assumes a bitwise sort; with the legacy nibblewise
882 // sort a PG did not always cover a single contiguous range of the
883 // (bit-reversed) hash range.
884 unsigned bits = get_split_bits(pg_num);
885 uint64_t rev_start = hobject_t::_reverse_bits(m_seed);
886 uint64_t rev_end = (rev_start | (0xffffffff >> bits)) + 1;
887 if (rev_end >= 0x100000000) {
11fdf7f2 888 ceph_assert(rev_end == 0x100000000);
7c673cae
FG
889 return hobject_t::get_max();
890 } else {
891 return hobject_t(object_t(), string(), CEPH_NOSNAP,
892 hobject_t::_reverse_bits(rev_end), m_pool,
893 string());
894 }
895}
896
897void pg_t::dump(Formatter *f) const
898{
899 f->dump_unsigned("pool", m_pool);
900 f->dump_unsigned("seed", m_seed);
7c673cae
FG
901}
902
903void pg_t::generate_test_instances(list<pg_t*>& o)
904{
905 o.push_back(new pg_t);
11fdf7f2
TL
906 o.push_back(new pg_t(1, 2));
907 o.push_back(new pg_t(13123, 3));
908 o.push_back(new pg_t(131223, 4));
7c673cae
FG
909}
910
911char *pg_t::calc_name(char *buf, const char *suffix_backwords) const
912{
913 while (*suffix_backwords)
914 *--buf = *suffix_backwords++;
915
7c673cae
FG
916 buf = ritoa<uint32_t, 16>(m_seed, buf);
917
918 *--buf = '.';
919
920 return ritoa<uint64_t, 10>(m_pool, buf);
921}
922
923ostream& operator<<(ostream& out, const pg_t &pg)
924{
925 char buf[pg_t::calc_name_buf_size];
926 buf[pg_t::calc_name_buf_size - 1] = '\0';
927 out << pg.calc_name(buf + pg_t::calc_name_buf_size - 1, "");
928 return out;
929}
930
931
932// -- coll_t --
933
934void coll_t::calc_str()
935{
936 switch (type) {
937 case TYPE_META:
938 strcpy(_str_buff, "meta");
939 _str = _str_buff;
940 break;
941 case TYPE_PG:
942 _str_buff[spg_t::calc_name_buf_size - 1] = '\0';
943 _str = pgid.calc_name(_str_buff + spg_t::calc_name_buf_size - 1, "daeh_");
944 break;
945 case TYPE_PG_TEMP:
946 _str_buff[spg_t::calc_name_buf_size - 1] = '\0';
947 _str = pgid.calc_name(_str_buff + spg_t::calc_name_buf_size - 1, "PMET_");
948 break;
949 default:
11fdf7f2 950 ceph_abort_msg("unknown collection type");
7c673cae
FG
951 }
952}
953
954bool coll_t::parse(const std::string& s)
955{
956 if (s == "meta") {
957 type = TYPE_META;
958 pgid = spg_t();
959 removal_seq = 0;
960 calc_str();
11fdf7f2 961 ceph_assert(s == _str);
7c673cae
FG
962 return true;
963 }
964 if (s.find("_head") == s.length() - 5 &&
965 pgid.parse(s.substr(0, s.length() - 5))) {
966 type = TYPE_PG;
967 removal_seq = 0;
968 calc_str();
11fdf7f2 969 ceph_assert(s == _str);
7c673cae
FG
970 return true;
971 }
972 if (s.find("_TEMP") == s.length() - 5 &&
973 pgid.parse(s.substr(0, s.length() - 5))) {
974 type = TYPE_PG_TEMP;
975 removal_seq = 0;
976 calc_str();
11fdf7f2 977 ceph_assert(s == _str);
7c673cae
FG
978 return true;
979 }
980 return false;
981}
982
9f95a23c 983void coll_t::encode(ceph::buffer::list& bl) const
7c673cae 984{
11fdf7f2 985 using ceph::encode;
7c673cae
FG
986 // when changing this, remember to update encoded_size() too.
987 if (is_temp()) {
988 // can't express this as v2...
989 __u8 struct_v = 3;
11fdf7f2
TL
990 encode(struct_v, bl);
991 encode(to_str(), bl);
7c673cae
FG
992 } else {
993 __u8 struct_v = 2;
11fdf7f2
TL
994 encode(struct_v, bl);
995 encode((__u8)type, bl);
996 encode(pgid, bl);
7c673cae 997 snapid_t snap = CEPH_NOSNAP;
11fdf7f2 998 encode(snap, bl);
7c673cae
FG
999 }
1000}
1001
1002size_t coll_t::encoded_size() const
1003{
1004 size_t r = sizeof(__u8);
1005 if (is_temp()) {
1006 // v3
1007 r += sizeof(__u32);
1008 if (_str) {
1009 r += strlen(_str);
1010 }
1011 } else {
1012 // v2
1013 // 1. type
1014 r += sizeof(__u8);
1015 // 2. pgid
1016 // - encoding header
1017 r += sizeof(ceph_le32) + 2 * sizeof(__u8);
1018 // - pg_t
1019 r += sizeof(__u8) + sizeof(uint64_t) + 2 * sizeof(uint32_t);
1020 // - shard_id_t
1021 r += sizeof(int8_t);
1022 // 3. snapid_t
1023 r += sizeof(uint64_t);
1024 }
1025
1026 return r;
1027}
1028
9f95a23c 1029void coll_t::decode(ceph::buffer::list::const_iterator& bl)
7c673cae 1030{
11fdf7f2 1031 using ceph::decode;
7c673cae 1032 __u8 struct_v;
11fdf7f2 1033 decode(struct_v, bl);
7c673cae
FG
1034 switch (struct_v) {
1035 case 1:
1036 {
1037 snapid_t snap;
11fdf7f2
TL
1038 decode(pgid, bl);
1039 decode(snap, bl);
7c673cae
FG
1040
1041 // infer the type
1042 if (pgid == spg_t() && snap == 0) {
1043 type = TYPE_META;
1044 } else {
1045 type = TYPE_PG;
1046 }
1047 removal_seq = 0;
1048 }
1049 break;
1050
1051 case 2:
1052 {
1053 __u8 _type;
1054 snapid_t snap;
11fdf7f2
TL
1055 decode(_type, bl);
1056 decode(pgid, bl);
1057 decode(snap, bl);
7c673cae
FG
1058 type = (type_t)_type;
1059 removal_seq = 0;
1060 }
1061 break;
1062
1063 case 3:
1064 {
1065 string str;
11fdf7f2 1066 decode(str, bl);
7c673cae
FG
1067 bool ok = parse(str);
1068 if (!ok)
1069 throw std::domain_error(std::string("unable to parse pg ") + str);
1070 }
1071 break;
1072
1073 default:
1074 {
f67539c2
TL
1075 CachedStackStringStream css;
1076 *css << "coll_t::decode(): don't know how to decode version "
1077 << struct_v;
1078 throw std::domain_error(css->str());
7c673cae
FG
1079 }
1080 }
1081}
1082
1083void coll_t::dump(Formatter *f) const
1084{
1085 f->dump_unsigned("type_id", (unsigned)type);
1086 if (type != TYPE_META)
1087 f->dump_stream("pgid") << pgid;
1088 f->dump_string("name", to_str());
1089}
1090
1091void coll_t::generate_test_instances(list<coll_t*>& o)
1092{
1093 o.push_back(new coll_t());
1094 o.push_back(new coll_t(spg_t(pg_t(1, 0), shard_id_t::NO_SHARD)));
1095 o.push_back(new coll_t(o.back()->get_temp()));
1096 o.push_back(new coll_t(spg_t(pg_t(3, 2), shard_id_t(12))));
1097 o.push_back(new coll_t(o.back()->get_temp()));
1098 o.push_back(new coll_t());
1099}
1100
1101// ---
1102
1103std::string pg_vector_string(const vector<int32_t> &a)
1104{
f67539c2
TL
1105 CachedStackStringStream css;
1106 *css << "[";
9f95a23c
TL
1107 for (auto i = a.cbegin(); i != a.cend(); ++i) {
1108 if (i != a.begin())
f67539c2 1109 *css << ",";
9f95a23c 1110 if (*i != CRUSH_ITEM_NONE)
f67539c2 1111 *css << *i;
9f95a23c 1112 else
f67539c2 1113 *css << "NONE";
7c673cae 1114 }
f67539c2
TL
1115 *css << "]";
1116 return css->str();
7c673cae
FG
1117}
1118
11fdf7f2 1119std::string pg_state_string(uint64_t state)
7c673cae 1120{
f67539c2 1121 CachedStackStringStream css;
7c673cae 1122 if (state & PG_STATE_STALE)
f67539c2 1123 *css << "stale+";
7c673cae 1124 if (state & PG_STATE_CREATING)
f67539c2 1125 *css << "creating+";
7c673cae 1126 if (state & PG_STATE_ACTIVE)
f67539c2 1127 *css << "active+";
7c673cae 1128 if (state & PG_STATE_ACTIVATING)
f67539c2 1129 *css << "activating+";
7c673cae 1130 if (state & PG_STATE_CLEAN)
f67539c2 1131 *css << "clean+";
7c673cae 1132 if (state & PG_STATE_RECOVERY_WAIT)
f67539c2 1133 *css << "recovery_wait+";
7c673cae 1134 if (state & PG_STATE_RECOVERY_TOOFULL)
f67539c2 1135 *css << "recovery_toofull+";
7c673cae 1136 if (state & PG_STATE_RECOVERING)
f67539c2 1137 *css << "recovering+";
c07f9fc5 1138 if (state & PG_STATE_FORCED_RECOVERY)
f67539c2 1139 *css << "forced_recovery+";
7c673cae 1140 if (state & PG_STATE_DOWN)
f67539c2 1141 *css << "down+";
b32b8144 1142 if (state & PG_STATE_RECOVERY_UNFOUND)
f67539c2 1143 *css << "recovery_unfound+";
b32b8144 1144 if (state & PG_STATE_BACKFILL_UNFOUND)
f67539c2 1145 *css << "backfill_unfound+";
7c673cae 1146 if (state & PG_STATE_UNDERSIZED)
f67539c2 1147 *css << "undersized+";
7c673cae 1148 if (state & PG_STATE_DEGRADED)
f67539c2 1149 *css << "degraded+";
7c673cae 1150 if (state & PG_STATE_REMAPPED)
f67539c2 1151 *css << "remapped+";
11fdf7f2 1152 if (state & PG_STATE_PREMERGE)
f67539c2 1153 *css << "premerge+";
7c673cae 1154 if (state & PG_STATE_SCRUBBING)
f67539c2 1155 *css << "scrubbing+";
7c673cae 1156 if (state & PG_STATE_DEEP_SCRUB)
f67539c2 1157 *css << "deep+";
7c673cae 1158 if (state & PG_STATE_INCONSISTENT)
f67539c2 1159 *css << "inconsistent+";
7c673cae 1160 if (state & PG_STATE_PEERING)
f67539c2 1161 *css << "peering+";
7c673cae 1162 if (state & PG_STATE_REPAIR)
f67539c2 1163 *css << "repair+";
3efd9988 1164 if (state & PG_STATE_BACKFILL_WAIT)
f67539c2 1165 *css << "backfill_wait+";
3efd9988 1166 if (state & PG_STATE_BACKFILLING)
f67539c2 1167 *css << "backfilling+";
c07f9fc5 1168 if (state & PG_STATE_FORCED_BACKFILL)
f67539c2 1169 *css << "forced_backfill+";
7c673cae 1170 if (state & PG_STATE_BACKFILL_TOOFULL)
f67539c2 1171 *css << "backfill_toofull+";
7c673cae 1172 if (state & PG_STATE_INCOMPLETE)
f67539c2 1173 *css << "incomplete+";
7c673cae 1174 if (state & PG_STATE_PEERED)
f67539c2 1175 *css << "peered+";
7c673cae 1176 if (state & PG_STATE_SNAPTRIM)
f67539c2 1177 *css << "snaptrim+";
7c673cae 1178 if (state & PG_STATE_SNAPTRIM_WAIT)
f67539c2 1179 *css << "snaptrim_wait+";
224ce89b 1180 if (state & PG_STATE_SNAPTRIM_ERROR)
f67539c2 1181 *css << "snaptrim_error+";
11fdf7f2 1182 if (state & PG_STATE_FAILED_REPAIR)
f67539c2 1183 *css << "failed_repair+";
9f95a23c 1184 if (state & PG_STATE_LAGGY)
f67539c2 1185 *css << "laggy+";
9f95a23c 1186 if (state & PG_STATE_WAIT)
f67539c2
TL
1187 *css << "wait+";
1188 auto ret = css->str();
7c673cae
FG
1189 if (ret.length() > 0)
1190 ret.resize(ret.length() - 1);
1191 else
31f18b77 1192 ret = "unknown";
7c673cae
FG
1193 return ret;
1194}
1195
9f95a23c 1196std::optional<uint64_t> pg_string_state(const std::string& state)
7c673cae 1197{
9f95a23c 1198 std::optional<uint64_t> type;
7c673cae
FG
1199 if (state == "active")
1200 type = PG_STATE_ACTIVE;
1201 else if (state == "clean")
1202 type = PG_STATE_CLEAN;
1203 else if (state == "down")
1204 type = PG_STATE_DOWN;
b32b8144
FG
1205 else if (state == "recovery_unfound")
1206 type = PG_STATE_RECOVERY_UNFOUND;
1207 else if (state == "backfill_unfound")
1208 type = PG_STATE_BACKFILL_UNFOUND;
11fdf7f2
TL
1209 else if (state == "premerge")
1210 type = PG_STATE_PREMERGE;
7c673cae
FG
1211 else if (state == "scrubbing")
1212 type = PG_STATE_SCRUBBING;
1213 else if (state == "degraded")
1214 type = PG_STATE_DEGRADED;
1215 else if (state == "inconsistent")
1216 type = PG_STATE_INCONSISTENT;
1217 else if (state == "peering")
1218 type = PG_STATE_PEERING;
1219 else if (state == "repair")
1220 type = PG_STATE_REPAIR;
1221 else if (state == "recovering")
1222 type = PG_STATE_RECOVERING;
c07f9fc5
FG
1223 else if (state == "forced_recovery")
1224 type = PG_STATE_FORCED_RECOVERY;
7c673cae
FG
1225 else if (state == "backfill_wait")
1226 type = PG_STATE_BACKFILL_WAIT;
1227 else if (state == "incomplete")
1228 type = PG_STATE_INCOMPLETE;
1229 else if (state == "stale")
1230 type = PG_STATE_STALE;
1231 else if (state == "remapped")
1232 type = PG_STATE_REMAPPED;
94b18763 1233 else if (state == "deep")
7c673cae 1234 type = PG_STATE_DEEP_SCRUB;
3efd9988
FG
1235 else if (state == "backfilling")
1236 type = PG_STATE_BACKFILLING;
c07f9fc5
FG
1237 else if (state == "forced_backfill")
1238 type = PG_STATE_FORCED_BACKFILL;
7c673cae
FG
1239 else if (state == "backfill_toofull")
1240 type = PG_STATE_BACKFILL_TOOFULL;
1241 else if (state == "recovery_wait")
1242 type = PG_STATE_RECOVERY_WAIT;
1243 else if (state == "recovery_toofull")
1244 type = PG_STATE_RECOVERY_TOOFULL;
1245 else if (state == "undersized")
1246 type = PG_STATE_UNDERSIZED;
1247 else if (state == "activating")
1248 type = PG_STATE_ACTIVATING;
1249 else if (state == "peered")
1250 type = PG_STATE_PEERED;
1251 else if (state == "snaptrim")
1252 type = PG_STATE_SNAPTRIM;
1253 else if (state == "snaptrim_wait")
1254 type = PG_STATE_SNAPTRIM_WAIT;
224ce89b
WB
1255 else if (state == "snaptrim_error")
1256 type = PG_STATE_SNAPTRIM_ERROR;
91327a77
AA
1257 else if (state == "creating")
1258 type = PG_STATE_CREATING;
11fdf7f2
TL
1259 else if (state == "failed_repair")
1260 type = PG_STATE_FAILED_REPAIR;
9f95a23c
TL
1261 else if (state == "laggy")
1262 type = PG_STATE_LAGGY;
1263 else if (state == "wait")
1264 type = PG_STATE_WAIT;
11fdf7f2
TL
1265 else if (state == "unknown")
1266 type = 0;
7c673cae 1267 else
9f95a23c 1268 type = std::nullopt;
7c673cae
FG
1269 return type;
1270}
1271
1272// -- eversion_t --
1273string eversion_t::get_key_name() const
1274{
11fdf7f2
TL
1275 std::string key(32, ' ');
1276 get_key_name(&key[0]);
1277 key.resize(31); // remove the null terminator
1278 return key;
7c673cae
FG
1279}
1280
7c673cae
FG
1281// -- pool_snap_info_t --
1282void pool_snap_info_t::dump(Formatter *f) const
1283{
1284 f->dump_unsigned("snapid", snapid);
1285 f->dump_stream("stamp") << stamp;
1286 f->dump_string("name", name);
1287}
1288
9f95a23c 1289void pool_snap_info_t::encode(ceph::buffer::list& bl, uint64_t features) const
7c673cae 1290{
11fdf7f2 1291 using ceph::encode;
7c673cae
FG
1292 if ((features & CEPH_FEATURE_PGPOOL3) == 0) {
1293 __u8 struct_v = 1;
11fdf7f2
TL
1294 encode(struct_v, bl);
1295 encode(snapid, bl);
1296 encode(stamp, bl);
1297 encode(name, bl);
7c673cae
FG
1298 return;
1299 }
1300 ENCODE_START(2, 2, bl);
11fdf7f2
TL
1301 encode(snapid, bl);
1302 encode(stamp, bl);
1303 encode(name, bl);
7c673cae
FG
1304 ENCODE_FINISH(bl);
1305}
1306
9f95a23c 1307void pool_snap_info_t::decode(ceph::buffer::list::const_iterator& bl)
7c673cae
FG
1308{
1309 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
11fdf7f2
TL
1310 decode(snapid, bl);
1311 decode(stamp, bl);
1312 decode(name, bl);
7c673cae
FG
1313 DECODE_FINISH(bl);
1314}
1315
1316void pool_snap_info_t::generate_test_instances(list<pool_snap_info_t*>& o)
1317{
1318 o.push_back(new pool_snap_info_t);
1319 o.push_back(new pool_snap_info_t);
1320 o.back()->snapid = 1;
1321 o.back()->stamp = utime_t(1, 2);
1322 o.back()->name = "foo";
1323}
1324
1325// -- pool_opts_t --
1326
1d09f67e
TL
1327// The order of items in the list is important, therefore,
1328// you should always add to the end of the list when adding new options.
1329
7c673cae
FG
1330typedef std::map<std::string, pool_opts_t::opt_desc_t> opt_mapping_t;
1331static opt_mapping_t opt_mapping = boost::assign::map_list_of
1332 ("scrub_min_interval", pool_opts_t::opt_desc_t(
1333 pool_opts_t::SCRUB_MIN_INTERVAL, pool_opts_t::DOUBLE))
1334 ("scrub_max_interval", pool_opts_t::opt_desc_t(
1335 pool_opts_t::SCRUB_MAX_INTERVAL, pool_opts_t::DOUBLE))
1336 ("deep_scrub_interval", pool_opts_t::opt_desc_t(
1337 pool_opts_t::DEEP_SCRUB_INTERVAL, pool_opts_t::DOUBLE))
1338 ("recovery_priority", pool_opts_t::opt_desc_t(
1339 pool_opts_t::RECOVERY_PRIORITY, pool_opts_t::INT))
1340 ("recovery_op_priority", pool_opts_t::opt_desc_t(
1341 pool_opts_t::RECOVERY_OP_PRIORITY, pool_opts_t::INT))
1342 ("scrub_priority", pool_opts_t::opt_desc_t(
1343 pool_opts_t::SCRUB_PRIORITY, pool_opts_t::INT))
1344 ("compression_mode", pool_opts_t::opt_desc_t(
1345 pool_opts_t::COMPRESSION_MODE, pool_opts_t::STR))
1346 ("compression_algorithm", pool_opts_t::opt_desc_t(
1347 pool_opts_t::COMPRESSION_ALGORITHM, pool_opts_t::STR))
1348 ("compression_required_ratio", pool_opts_t::opt_desc_t(
1349 pool_opts_t::COMPRESSION_REQUIRED_RATIO, pool_opts_t::DOUBLE))
1350 ("compression_max_blob_size", pool_opts_t::opt_desc_t(
1351 pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, pool_opts_t::INT))
1352 ("compression_min_blob_size", pool_opts_t::opt_desc_t(
1353 pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, pool_opts_t::INT))
1354 ("csum_type", pool_opts_t::opt_desc_t(
1355 pool_opts_t::CSUM_TYPE, pool_opts_t::INT))
1356 ("csum_max_block", pool_opts_t::opt_desc_t(
1357 pool_opts_t::CSUM_MAX_BLOCK, pool_opts_t::INT))
1358 ("csum_min_block", pool_opts_t::opt_desc_t(
11fdf7f2
TL
1359 pool_opts_t::CSUM_MIN_BLOCK, pool_opts_t::INT))
1360 ("fingerprint_algorithm", pool_opts_t::opt_desc_t(
1361 pool_opts_t::FINGERPRINT_ALGORITHM, pool_opts_t::STR))
1362 ("pg_num_min", pool_opts_t::opt_desc_t(
1363 pool_opts_t::PG_NUM_MIN, pool_opts_t::INT))
1364 ("target_size_bytes", pool_opts_t::opt_desc_t(
1365 pool_opts_t::TARGET_SIZE_BYTES, pool_opts_t::INT))
1366 ("target_size_ratio", pool_opts_t::opt_desc_t(
1367 pool_opts_t::TARGET_SIZE_RATIO, pool_opts_t::DOUBLE))
1368 ("pg_autoscale_bias", pool_opts_t::opt_desc_t(
9f95a23c
TL
1369 pool_opts_t::PG_AUTOSCALE_BIAS, pool_opts_t::DOUBLE))
1370 ("read_lease_interval", pool_opts_t::opt_desc_t(
f67539c2
TL
1371 pool_opts_t::READ_LEASE_INTERVAL, pool_opts_t::DOUBLE))
1372 ("dedup_tier", pool_opts_t::opt_desc_t(
1373 pool_opts_t::DEDUP_TIER, pool_opts_t::INT))
1374 ("dedup_chunk_algorithm", pool_opts_t::opt_desc_t(
1375 pool_opts_t::DEDUP_CHUNK_ALGORITHM, pool_opts_t::STR))
1376 ("dedup_cdc_chunk_size", pool_opts_t::opt_desc_t(
1d09f67e
TL
1377 pool_opts_t::DEDUP_CDC_CHUNK_SIZE, pool_opts_t::INT))
1378 ("pg_num_max", pool_opts_t::opt_desc_t(
1379 pool_opts_t::PG_NUM_MAX, pool_opts_t::INT));
7c673cae 1380
11fdf7f2
TL
1381bool pool_opts_t::is_opt_name(const std::string& name)
1382{
1383 return opt_mapping.count(name);
7c673cae
FG
1384}
1385
11fdf7f2
TL
1386pool_opts_t::opt_desc_t pool_opts_t::get_opt_desc(const std::string& name)
1387{
9f95a23c 1388 auto i = opt_mapping.find(name);
11fdf7f2
TL
1389 ceph_assert(i != opt_mapping.end());
1390 return i->second;
7c673cae
FG
1391}
1392
11fdf7f2
TL
1393bool pool_opts_t::is_set(pool_opts_t::key_t key) const
1394{
1395 return opts.count(key);
7c673cae
FG
1396}
1397
11fdf7f2
TL
1398const pool_opts_t::value_t& pool_opts_t::get(pool_opts_t::key_t key) const
1399{
9f95a23c 1400 auto i = opts.find(key);
11fdf7f2 1401 ceph_assert(i != opts.end());
7c673cae
FG
1402 return i->second;
1403}
1404
1405bool pool_opts_t::unset(pool_opts_t::key_t key) {
1406 return opts.erase(key) > 0;
1407}
1408
11fdf7f2 1409class pool_opts_dumper_t : public boost::static_visitor<> {
7c673cae
FG
1410public:
1411 pool_opts_dumper_t(const std::string& name_, Formatter* f_) :
1412 name(name_.c_str()), f(f_) {}
1413
1414 void operator()(std::string s) const {
1415 f->dump_string(name, s);
1416 }
11fdf7f2 1417 void operator()(int64_t i) const {
7c673cae
FG
1418 f->dump_int(name, i);
1419 }
1420 void operator()(double d) const {
1421 f->dump_float(name, d);
1422 }
1423
1424private:
1425 const char* name;
1426 Formatter* f;
1427};
1428
1429void pool_opts_t::dump(const std::string& name, Formatter* f) const
1430{
1431 const opt_desc_t& desc = get_opt_desc(name);
9f95a23c 1432 auto i = opts.find(desc.key);
7c673cae
FG
1433 if (i == opts.end()) {
1434 return;
1435 }
1436 boost::apply_visitor(pool_opts_dumper_t(name, f), i->second);
1437}
1438
1439void pool_opts_t::dump(Formatter* f) const
1440{
9f95a23c 1441 for (auto i = opt_mapping.cbegin(); i != opt_mapping.cend(); ++i) {
7c673cae
FG
1442 const std::string& name = i->first;
1443 const opt_desc_t& desc = i->second;
9f95a23c 1444 auto j = opts.find(desc.key);
7c673cae
FG
1445 if (j == opts.end()) {
1446 continue;
1447 }
1448 boost::apply_visitor(pool_opts_dumper_t(name, f), j->second);
1449 }
1450}
1451
11fdf7f2 1452class pool_opts_encoder_t : public boost::static_visitor<> {
7c673cae 1453public:
9f95a23c 1454 explicit pool_opts_encoder_t(ceph::buffer::list& bl_, uint64_t features)
11fdf7f2
TL
1455 : bl(bl_),
1456 features(features) {}
1457
1458 void operator()(const std::string &s) const {
1459 encode(static_cast<int32_t>(pool_opts_t::STR), bl);
1460 encode(s, bl);
1461 }
1462 void operator()(int64_t i) const {
1463 encode(static_cast<int32_t>(pool_opts_t::INT), bl);
1464 if (HAVE_FEATURE(features, SERVER_NAUTILUS)) {
1465 encode(i, bl);
1466 } else {
1467 encode(static_cast<int32_t>(i), bl);
1468 }
7c673cae
FG
1469 }
1470 void operator()(double d) const {
11fdf7f2
TL
1471 encode(static_cast<int32_t>(pool_opts_t::DOUBLE), bl);
1472 encode(d, bl);
7c673cae
FG
1473 }
1474
1475private:
9f95a23c 1476 ceph::buffer::list& bl;
11fdf7f2 1477 uint64_t features;
7c673cae
FG
1478};
1479
9f95a23c 1480void pool_opts_t::encode(ceph::buffer::list& bl, uint64_t features) const
11fdf7f2
TL
1481{
1482 unsigned v = 2;
1483 if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
1484 v = 1;
1485 }
1486 ENCODE_START(v, 1, bl);
7c673cae 1487 uint32_t n = static_cast<uint32_t>(opts.size());
11fdf7f2 1488 encode(n, bl);
9f95a23c 1489 for (auto i = opts.cbegin(); i != opts.cend(); ++i) {
11fdf7f2
TL
1490 encode(static_cast<int32_t>(i->first), bl);
1491 boost::apply_visitor(pool_opts_encoder_t(bl, features), i->second);
7c673cae
FG
1492 }
1493 ENCODE_FINISH(bl);
1494}
1495
9f95a23c 1496void pool_opts_t::decode(ceph::buffer::list::const_iterator& bl)
11fdf7f2 1497{
7c673cae
FG
1498 DECODE_START(1, bl);
1499 __u32 n;
11fdf7f2 1500 decode(n, bl);
7c673cae
FG
1501 opts.clear();
1502 while (n--) {
1503 int32_t k, t;
11fdf7f2
TL
1504 decode(k, bl);
1505 decode(t, bl);
7c673cae
FG
1506 if (t == STR) {
1507 std::string s;
11fdf7f2 1508 decode(s, bl);
7c673cae
FG
1509 opts[static_cast<key_t>(k)] = s;
1510 } else if (t == INT) {
11fdf7f2
TL
1511 int64_t i;
1512 if (struct_v >= 2) {
1513 decode(i, bl);
1514 } else {
1515 int ii;
1516 decode(ii, bl);
1517 i = ii;
1518 }
7c673cae
FG
1519 opts[static_cast<key_t>(k)] = i;
1520 } else if (t == DOUBLE) {
1521 double d;
11fdf7f2 1522 decode(d, bl);
7c673cae
FG
1523 opts[static_cast<key_t>(k)] = d;
1524 } else {
11fdf7f2 1525 ceph_assert(!"invalid type");
7c673cae
FG
1526 }
1527 }
1528 DECODE_FINISH(bl);
1529}
1530
1531ostream& operator<<(ostream& out, const pool_opts_t& opts)
1532{
9f95a23c 1533 for (auto i = opt_mapping.begin(); i != opt_mapping.end(); ++i) {
7c673cae
FG
1534 const std::string& name = i->first;
1535 const pool_opts_t::opt_desc_t& desc = i->second;
9f95a23c 1536 auto j = opts.opts.find(desc.key);
7c673cae
FG
1537 if (j == opts.opts.end()) {
1538 continue;
1539 }
1540 out << " " << name << " " << j->second;
1541 }
1542 return out;
1543}
1544
1545// -- pg_pool_t --
1546
c07f9fc5
FG
1547const char *pg_pool_t::APPLICATION_NAME_CEPHFS("cephfs");
1548const char *pg_pool_t::APPLICATION_NAME_RBD("rbd");
1549const char *pg_pool_t::APPLICATION_NAME_RGW("rgw");
1550
7c673cae
FG
1551void pg_pool_t::dump(Formatter *f) const
1552{
11fdf7f2 1553 f->dump_stream("create_time") << get_create_time();
7c673cae
FG
1554 f->dump_unsigned("flags", get_flags());
1555 f->dump_string("flags_names", get_flags_string());
1556 f->dump_int("type", get_type());
1557 f->dump_int("size", get_size());
1558 f->dump_int("min_size", get_min_size());
31f18b77 1559 f->dump_int("crush_rule", get_crush_rule());
f67539c2
TL
1560 f->dump_int("peering_crush_bucket_count", peering_crush_bucket_count);
1561 f->dump_int("peering_crush_bucket_target", peering_crush_bucket_target);
1562 f->dump_int("peering_crush_bucket_barrier", peering_crush_bucket_barrier);
1563 f->dump_int("peering_crush_bucket_mandatory_member", peering_crush_mandatory_member);
7c673cae 1564 f->dump_int("object_hash", get_object_hash());
11fdf7f2
TL
1565 f->dump_string("pg_autoscale_mode",
1566 get_pg_autoscale_mode_name(pg_autoscale_mode));
7c673cae
FG
1567 f->dump_unsigned("pg_num", get_pg_num());
1568 f->dump_unsigned("pg_placement_num", get_pgp_num());
11fdf7f2
TL
1569 f->dump_unsigned("pg_placement_num_target", get_pgp_num_target());
1570 f->dump_unsigned("pg_num_target", get_pg_num_target());
1571 f->dump_unsigned("pg_num_pending", get_pg_num_pending());
1572 f->dump_object("last_pg_merge_meta", last_pg_merge_meta);
7c673cae
FG
1573 f->dump_stream("last_change") << get_last_change();
1574 f->dump_stream("last_force_op_resend") << get_last_force_op_resend();
11fdf7f2
TL
1575 f->dump_stream("last_force_op_resend_prenautilus")
1576 << get_last_force_op_resend_prenautilus();
7c673cae
FG
1577 f->dump_stream("last_force_op_resend_preluminous")
1578 << get_last_force_op_resend_preluminous();
1579 f->dump_unsigned("auid", get_auid());
1580 f->dump_string("snap_mode", is_pool_snaps_mode() ? "pool" : "selfmanaged");
1581 f->dump_unsigned("snap_seq", get_snap_seq());
1582 f->dump_unsigned("snap_epoch", get_snap_epoch());
1583 f->open_array_section("pool_snaps");
9f95a23c 1584 for (auto p = snaps.cbegin(); p != snaps.cend(); ++p) {
7c673cae
FG
1585 f->open_object_section("pool_snap_info");
1586 p->second.dump(f);
1587 f->close_section();
1588 }
1589 f->close_section();
1590 f->dump_stream("removed_snaps") << removed_snaps;
1591 f->dump_unsigned("quota_max_bytes", quota_max_bytes);
1592 f->dump_unsigned("quota_max_objects", quota_max_objects);
1593 f->open_array_section("tiers");
9f95a23c 1594 for (auto p = tiers.cbegin(); p != tiers.cend(); ++p)
7c673cae
FG
1595 f->dump_unsigned("pool_id", *p);
1596 f->close_section();
1597 f->dump_int("tier_of", tier_of);
1598 f->dump_int("read_tier", read_tier);
1599 f->dump_int("write_tier", write_tier);
1600 f->dump_string("cache_mode", get_cache_mode_name());
1601 f->dump_unsigned("target_max_bytes", target_max_bytes);
1602 f->dump_unsigned("target_max_objects", target_max_objects);
1603 f->dump_unsigned("cache_target_dirty_ratio_micro",
1604 cache_target_dirty_ratio_micro);
1605 f->dump_unsigned("cache_target_dirty_high_ratio_micro",
1606 cache_target_dirty_high_ratio_micro);
1607 f->dump_unsigned("cache_target_full_ratio_micro",
1608 cache_target_full_ratio_micro);
1609 f->dump_unsigned("cache_min_flush_age", cache_min_flush_age);
1610 f->dump_unsigned("cache_min_evict_age", cache_min_evict_age);
1611 f->dump_string("erasure_code_profile", erasure_code_profile);
1612 f->open_object_section("hit_set_params");
1613 hit_set_params.dump(f);
1614 f->close_section(); // hit_set_params
1615 f->dump_unsigned("hit_set_period", hit_set_period);
1616 f->dump_unsigned("hit_set_count", hit_set_count);
1617 f->dump_bool("use_gmt_hitset", use_gmt_hitset);
1618 f->dump_unsigned("min_read_recency_for_promote", min_read_recency_for_promote);
1619 f->dump_unsigned("min_write_recency_for_promote", min_write_recency_for_promote);
1620 f->dump_unsigned("hit_set_grade_decay_rate", hit_set_grade_decay_rate);
1621 f->dump_unsigned("hit_set_search_last_n", hit_set_search_last_n);
1622 f->open_array_section("grade_table");
1623 for (unsigned i = 0; i < hit_set_count; ++i)
1624 f->dump_unsigned("value", get_grade(i));
1625 f->close_section();
1626 f->dump_unsigned("stripe_width", get_stripe_width());
1627 f->dump_unsigned("expected_num_objects", expected_num_objects);
1628 f->dump_bool("fast_read", fast_read);
1629 f->open_object_section("options");
1630 opts.dump(f);
1631 f->close_section(); // options
c07f9fc5
FG
1632 f->open_object_section("application_metadata");
1633 for (auto &app_pair : application_metadata) {
1634 f->open_object_section(app_pair.first.c_str());
1635 for (auto &kv_pair : app_pair.second) {
1636 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
1637 }
1638 f->close_section(); // application
1639 }
1640 f->close_section(); // application_metadata
7c673cae
FG
1641}
1642
1643void pg_pool_t::convert_to_pg_shards(const vector<int> &from, set<pg_shard_t>* to) const {
1644 for (size_t i = 0; i < from.size(); ++i) {
1645 if (from[i] != CRUSH_ITEM_NONE) {
1646 to->insert(
1647 pg_shard_t(
1648 from[i],
11fdf7f2 1649 is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
7c673cae
FG
1650 }
1651 }
1652}
1653
1654void pg_pool_t::calc_pg_masks()
1655{
1656 pg_num_mask = (1 << cbits(pg_num-1)) - 1;
1657 pgp_num_mask = (1 << cbits(pgp_num-1)) - 1;
1658}
1659
1660unsigned pg_pool_t::get_pg_num_divisor(pg_t pgid) const
1661{
1662 if (pg_num == pg_num_mask + 1)
1663 return pg_num; // power-of-2 split
1664 unsigned mask = pg_num_mask >> 1;
1665 if ((pgid.ps() & mask) < (pg_num & mask))
1666 return pg_num_mask + 1; // smaller bin size (already split)
1667 else
1668 return (pg_num_mask + 1) >> 1; // bigger bin (not yet split)
1669}
1670
11fdf7f2
TL
1671bool pg_pool_t::is_pending_merge(pg_t pgid, bool *target) const
1672{
1673 if (pg_num_pending >= pg_num) {
1674 return false;
1675 }
1676 if (pgid.ps() >= pg_num_pending && pgid.ps() < pg_num) {
1677 if (target) {
1678 *target = false;
1679 }
1680 return true;
1681 }
1682 for (unsigned ps = pg_num_pending; ps < pg_num; ++ps) {
1683 if (pg_t(ps, pgid.pool()).get_parent() == pgid) {
1684 if (target) {
1685 *target = true;
1686 }
1687 return true;
1688 }
1689 }
1690 return false;
1691}
1692
7c673cae
FG
1693/*
1694 * we have two snap modes:
11fdf7f2 1695 * - pool snaps
7c673cae
FG
1696 * - snap existence/non-existence defined by snaps[] and snap_seq
1697 * - user managed snaps
11fdf7f2 1698 * - existence tracked by librados user
7c673cae
FG
1699 */
1700bool pg_pool_t::is_pool_snaps_mode() const
1701{
11fdf7f2 1702 return has_flag(FLAG_POOL_SNAPS);
7c673cae
FG
1703}
1704
1705bool pg_pool_t::is_unmanaged_snaps_mode() const
1706{
11fdf7f2 1707 return has_flag(FLAG_SELFMANAGED_SNAPS);
7c673cae
FG
1708}
1709
1710bool pg_pool_t::is_removed_snap(snapid_t s) const
1711{
1712 if (is_pool_snaps_mode())
1713 return s <= get_snap_seq() && snaps.count(s) == 0;
1714 else
1715 return removed_snaps.contains(s);
1716}
1717
f67539c2 1718snapid_t pg_pool_t::snap_exists(std::string_view s) const
7c673cae 1719{
9f95a23c 1720 for (auto p = snaps.cbegin(); p != snaps.cend(); ++p)
7c673cae
FG
1721 if (p->second.name == s)
1722 return p->second.snapid;
1723 return 0;
1724}
1725
1726void pg_pool_t::add_snap(const char *n, utime_t stamp)
1727{
11fdf7f2
TL
1728 ceph_assert(!is_unmanaged_snaps_mode());
1729 flags |= FLAG_POOL_SNAPS;
7c673cae
FG
1730 snapid_t s = get_snap_seq() + 1;
1731 snap_seq = s;
1732 snaps[s].snapid = s;
1733 snaps[s].name = n;
1734 snaps[s].stamp = stamp;
1735}
1736
9f95a23c 1737uint64_t pg_pool_t::add_unmanaged_snap(bool preoctopus_compat)
7c673cae 1738{
11fdf7f2
TL
1739 ceph_assert(!is_pool_snaps_mode());
1740 if (snap_seq == 0) {
9f95a23c
TL
1741 if (preoctopus_compat) {
1742 // kludge for pre-mimic tracking of pool vs selfmanaged snaps. after
1743 // mimic this field is not decoded but our flag is set; pre-mimic, we
1744 // have a non-empty removed_snaps to signifiy a non-pool-snaps pool.
1745 removed_snaps.insert(snapid_t(1));
1746 }
7c673cae
FG
1747 snap_seq = 1;
1748 }
11fdf7f2 1749 flags |= FLAG_SELFMANAGED_SNAPS;
9f95a23c
TL
1750 snap_seq = snap_seq + 1;
1751 return snap_seq;
7c673cae
FG
1752}
1753
1754void pg_pool_t::remove_snap(snapid_t s)
1755{
11fdf7f2 1756 ceph_assert(snaps.count(s));
7c673cae
FG
1757 snaps.erase(s);
1758 snap_seq = snap_seq + 1;
1759}
1760
9f95a23c 1761void pg_pool_t::remove_unmanaged_snap(snapid_t s, bool preoctopus_compat)
7c673cae 1762{
11fdf7f2 1763 ceph_assert(is_unmanaged_snaps_mode());
9f95a23c
TL
1764 ++snap_seq;
1765 if (preoctopus_compat) {
1766 removed_snaps.insert(s);
1767 // try to add in the new seq, just to try to keep the interval_set contiguous
1768 if (!removed_snaps.contains(get_snap_seq())) {
1769 removed_snaps.insert(get_snap_seq());
1770 }
28e407b8 1771 }
7c673cae
FG
1772}
1773
1774SnapContext pg_pool_t::get_snap_context() const
1775{
1776 vector<snapid_t> s(snaps.size());
1777 unsigned i = 0;
9f95a23c 1778 for (auto p = snaps.crbegin(); p != snaps.crend(); ++p)
7c673cae
FG
1779 s[i++] = p->first;
1780 return SnapContext(get_snap_seq(), s);
1781}
1782
1783uint32_t pg_pool_t::hash_key(const string& key, const string& ns) const
1784{
1785 if (ns.empty())
1786 return ceph_str_hash(object_hash, key.data(), key.length());
1787 int nsl = ns.length();
1788 int len = key.length() + nsl + 1;
1789 char buf[len];
1790 memcpy(&buf[0], ns.data(), nsl);
1791 buf[nsl] = '\037';
1792 memcpy(&buf[nsl+1], key.data(), key.length());
1793 return ceph_str_hash(object_hash, &buf[0], len);
1794}
1795
1796uint32_t pg_pool_t::raw_hash_to_pg(uint32_t v) const
1797{
1798 return ceph_stable_mod(v, pg_num, pg_num_mask);
1799}
1800
1801/*
1802 * map a raw pg (with full precision ps) into an actual pg, for storage
1803 */
1804pg_t pg_pool_t::raw_pg_to_pg(pg_t pg) const
1805{
1806 pg.set_ps(ceph_stable_mod(pg.ps(), pg_num, pg_num_mask));
1807 return pg;
1808}
1809
1810/*
1811 * map raw pg (full precision ps) into a placement seed. include
1812 * pool id in that value so that different pools don't use the same
1813 * seeds.
1814 */
1815ps_t pg_pool_t::raw_pg_to_pps(pg_t pg) const
1816{
1817 if (flags & FLAG_HASHPSPOOL) {
1818 // Hash the pool id so that pool PGs do not overlap.
1819 return
1820 crush_hash32_2(CRUSH_HASH_RJENKINS1,
1821 ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask),
1822 pg.pool());
1823 } else {
1824 // Legacy behavior; add ps and pool together. This is not a great
1825 // idea because the PGs from each pool will essentially overlap on
1826 // top of each other: 0.5 == 1.4 == 2.3 == ...
1827 return
1828 ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask) +
1829 pg.pool();
1830 }
1831}
1832
1833uint32_t pg_pool_t::get_random_pg_position(pg_t pg, uint32_t seed) const
1834{
1835 uint32_t r = crush_hash32_2(CRUSH_HASH_RJENKINS1, seed, 123);
1836 if (pg_num == pg_num_mask + 1) {
1837 r &= ~pg_num_mask;
1838 } else {
1839 unsigned smaller_mask = pg_num_mask >> 1;
1840 if ((pg.ps() & smaller_mask) < (pg_num & smaller_mask)) {
1841 r &= ~pg_num_mask;
1842 } else {
1843 r &= ~smaller_mask;
1844 }
1845 }
1846 r |= pg.ps();
1847 return r;
1848}
1849
9f95a23c 1850void pg_pool_t::encode(ceph::buffer::list& bl, uint64_t features) const
7c673cae 1851{
11fdf7f2 1852 using ceph::encode;
7c673cae
FG
1853 if ((features & CEPH_FEATURE_PGPOOL3) == 0) {
1854 // this encoding matches the old struct ceph_pg_pool
1855 __u8 struct_v = 2;
11fdf7f2
TL
1856 encode(struct_v, bl);
1857 encode(type, bl);
1858 encode(size, bl);
1859 encode(crush_rule, bl);
1860 encode(object_hash, bl);
1861 encode(pg_num, bl);
1862 encode(pgp_num, bl);
7c673cae 1863 __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs.
11fdf7f2
TL
1864 encode(lpg_num, bl);
1865 encode(lpgp_num, bl);
1866 encode(last_change, bl);
1867 encode(snap_seq, bl);
1868 encode(snap_epoch, bl);
7c673cae
FG
1869
1870 __u32 n = snaps.size();
11fdf7f2 1871 encode(n, bl);
7c673cae 1872 n = removed_snaps.num_intervals();
11fdf7f2 1873 encode(n, bl);
7c673cae 1874
11fdf7f2 1875 encode(auid, bl);
7c673cae 1876
11fdf7f2
TL
1877 encode_nohead(snaps, bl, features);
1878 encode_nohead(removed_snaps, bl);
7c673cae
FG
1879 return;
1880 }
1881
1882 if ((features & CEPH_FEATURE_OSDENC) == 0) {
1883 __u8 struct_v = 4;
11fdf7f2
TL
1884 encode(struct_v, bl);
1885 encode(type, bl);
1886 encode(size, bl);
1887 encode(crush_rule, bl);
1888 encode(object_hash, bl);
1889 encode(pg_num, bl);
1890 encode(pgp_num, bl);
7c673cae 1891 __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs.
11fdf7f2
TL
1892 encode(lpg_num, bl);
1893 encode(lpgp_num, bl);
1894 encode(last_change, bl);
1895 encode(snap_seq, bl);
1896 encode(snap_epoch, bl);
1897 encode(snaps, bl, features);
1898 encode(removed_snaps, bl);
1899 encode(auid, bl);
1900 encode(flags, bl);
1901 encode((uint32_t)0, bl); // crash_replay_interval
7c673cae
FG
1902 return;
1903 }
1904
1905 if ((features & CEPH_FEATURE_OSD_POOLRESEND) == 0) {
1906 // we simply added last_force_op_resend here, which is a fully
1907 // backward compatible change. however, encoding the same map
1908 // differently between monitors triggers scrub noise (even though
1909 // they are decodable without the feature), so let's be pedantic
1910 // about it.
1911 ENCODE_START(14, 5, bl);
11fdf7f2
TL
1912 encode(type, bl);
1913 encode(size, bl);
1914 encode(crush_rule, bl);
1915 encode(object_hash, bl);
1916 encode(pg_num, bl);
1917 encode(pgp_num, bl);
7c673cae 1918 __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs.
11fdf7f2
TL
1919 encode(lpg_num, bl);
1920 encode(lpgp_num, bl);
1921 encode(last_change, bl);
1922 encode(snap_seq, bl);
1923 encode(snap_epoch, bl);
1924 encode(snaps, bl, features);
1925 encode(removed_snaps, bl);
1926 encode(auid, bl);
1927 encode(flags, bl);
1928 encode((uint32_t)0, bl); // crash_replay_interval
1929 encode(min_size, bl);
1930 encode(quota_max_bytes, bl);
1931 encode(quota_max_objects, bl);
1932 encode(tiers, bl);
1933 encode(tier_of, bl);
7c673cae 1934 __u8 c = cache_mode;
11fdf7f2
TL
1935 encode(c, bl);
1936 encode(read_tier, bl);
1937 encode(write_tier, bl);
1938 encode(properties, bl);
1939 encode(hit_set_params, bl);
1940 encode(hit_set_period, bl);
1941 encode(hit_set_count, bl);
1942 encode(stripe_width, bl);
1943 encode(target_max_bytes, bl);
1944 encode(target_max_objects, bl);
1945 encode(cache_target_dirty_ratio_micro, bl);
1946 encode(cache_target_full_ratio_micro, bl);
1947 encode(cache_min_flush_age, bl);
1948 encode(cache_min_evict_age, bl);
1949 encode(erasure_code_profile, bl);
7c673cae
FG
1950 ENCODE_FINISH(bl);
1951 return;
1952 }
1953
f67539c2 1954 uint8_t v = 30;
28e407b8
AA
1955 // NOTE: any new encoding dependencies must be reflected by
1956 // SIGNIFICANT_FEATURES
7c673cae
FG
1957 if (!(features & CEPH_FEATURE_NEW_OSDOP_ENCODING)) {
1958 // this was the first post-hammer thing we added; if it's missing, encode
1959 // like hammer.
1960 v = 21;
94b18763 1961 } else if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
7c673cae 1962 v = 24;
11fdf7f2
TL
1963 } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
1964 v = 26;
1965 } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
1966 v = 27;
f67539c2
TL
1967 } else if (!is_stretch_pool()) {
1968 v = 29;
7c673cae
FG
1969 }
1970
1971 ENCODE_START(v, 5, bl);
11fdf7f2
TL
1972 encode(type, bl);
1973 encode(size, bl);
1974 encode(crush_rule, bl);
1975 encode(object_hash, bl);
1976 encode(pg_num, bl);
1977 encode(pgp_num, bl);
7c673cae 1978 __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs.
11fdf7f2
TL
1979 encode(lpg_num, bl);
1980 encode(lpgp_num, bl);
1981 encode(last_change, bl);
1982 encode(snap_seq, bl);
1983 encode(snap_epoch, bl);
1984 encode(snaps, bl, features);
1985 encode(removed_snaps, bl);
1986 encode(auid, bl);
1987 if (v >= 27) {
1988 encode(flags, bl);
1989 } else {
1990 auto tmp = flags;
1991 tmp &= ~(FLAG_SELFMANAGED_SNAPS | FLAG_POOL_SNAPS | FLAG_CREATING);
1992 encode(tmp, bl);
1993 }
1994 encode((uint32_t)0, bl); // crash_replay_interval
1995 encode(min_size, bl);
1996 encode(quota_max_bytes, bl);
1997 encode(quota_max_objects, bl);
1998 encode(tiers, bl);
1999 encode(tier_of, bl);
7c673cae 2000 __u8 c = cache_mode;
11fdf7f2
TL
2001 encode(c, bl);
2002 encode(read_tier, bl);
2003 encode(write_tier, bl);
2004 encode(properties, bl);
2005 encode(hit_set_params, bl);
2006 encode(hit_set_period, bl);
2007 encode(hit_set_count, bl);
2008 encode(stripe_width, bl);
2009 encode(target_max_bytes, bl);
2010 encode(target_max_objects, bl);
2011 encode(cache_target_dirty_ratio_micro, bl);
2012 encode(cache_target_full_ratio_micro, bl);
2013 encode(cache_min_flush_age, bl);
2014 encode(cache_min_evict_age, bl);
2015 encode(erasure_code_profile, bl);
2016 encode(last_force_op_resend_preluminous, bl);
2017 encode(min_read_recency_for_promote, bl);
2018 encode(expected_num_objects, bl);
7c673cae 2019 if (v >= 19) {
11fdf7f2 2020 encode(cache_target_dirty_high_ratio_micro, bl);
7c673cae
FG
2021 }
2022 if (v >= 20) {
11fdf7f2 2023 encode(min_write_recency_for_promote, bl);
7c673cae
FG
2024 }
2025 if (v >= 21) {
11fdf7f2 2026 encode(use_gmt_hitset, bl);
7c673cae
FG
2027 }
2028 if (v >= 22) {
11fdf7f2 2029 encode(fast_read, bl);
7c673cae
FG
2030 }
2031 if (v >= 23) {
11fdf7f2
TL
2032 encode(hit_set_grade_decay_rate, bl);
2033 encode(hit_set_search_last_n, bl);
7c673cae
FG
2034 }
2035 if (v >= 24) {
11fdf7f2 2036 encode(opts, bl, features);
7c673cae
FG
2037 }
2038 if (v >= 25) {
11fdf7f2 2039 encode(last_force_op_resend_prenautilus, bl);
7c673cae 2040 }
c07f9fc5 2041 if (v >= 26) {
11fdf7f2
TL
2042 encode(application_metadata, bl);
2043 }
2044 if (v >= 27) {
2045 encode(create_time, bl);
2046 }
2047 if (v >= 28) {
2048 encode(pg_num_target, bl);
2049 encode(pgp_num_target, bl);
2050 encode(pg_num_pending, bl);
2051 encode((epoch_t)0, bl); // pg_num_dec_last_epoch_started from 14.1.[01]
2052 encode((epoch_t)0, bl); // pg_num_dec_last_epoch_clean from 14.1.[01]
2053 encode(last_force_op_resend, bl);
2054 encode(pg_autoscale_mode, bl);
2055 }
2056 if (v >= 29) {
2057 encode(last_pg_merge_meta, bl);
c07f9fc5 2058 }
f67539c2
TL
2059 if (v >= 30) {
2060 encode(peering_crush_bucket_count, bl);
2061 encode(peering_crush_bucket_target, bl);
2062 encode(peering_crush_bucket_barrier, bl);
2063 encode(peering_crush_mandatory_member, bl);
2064 }
7c673cae
FG
2065 ENCODE_FINISH(bl);
2066}
2067
9f95a23c 2068void pg_pool_t::decode(ceph::buffer::list::const_iterator& bl)
7c673cae 2069{
f67539c2 2070 DECODE_START_LEGACY_COMPAT_LEN(30, 5, 5, bl);
11fdf7f2
TL
2071 decode(type, bl);
2072 decode(size, bl);
2073 decode(crush_rule, bl);
2074 decode(object_hash, bl);
2075 decode(pg_num, bl);
2076 decode(pgp_num, bl);
7c673cae
FG
2077 {
2078 __u32 lpg_num, lpgp_num;
11fdf7f2
TL
2079 decode(lpg_num, bl);
2080 decode(lpgp_num, bl);
7c673cae 2081 }
11fdf7f2
TL
2082 decode(last_change, bl);
2083 decode(snap_seq, bl);
2084 decode(snap_epoch, bl);
7c673cae
FG
2085
2086 if (struct_v >= 3) {
11fdf7f2
TL
2087 decode(snaps, bl);
2088 decode(removed_snaps, bl);
2089 decode(auid, bl);
7c673cae
FG
2090 } else {
2091 __u32 n, m;
11fdf7f2
TL
2092 decode(n, bl);
2093 decode(m, bl);
2094 decode(auid, bl);
2095 decode_nohead(n, snaps, bl);
2096 decode_nohead(m, removed_snaps, bl);
7c673cae
FG
2097 }
2098
2099 if (struct_v >= 4) {
11fdf7f2
TL
2100 decode(flags, bl);
2101 uint32_t crash_replay_interval;
2102 decode(crash_replay_interval, bl);
7c673cae
FG
2103 } else {
2104 flags = 0;
11fdf7f2
TL
2105 }
2106 // upgrade path for selfmanaged vs pool snaps
2107 if (snap_seq > 0 && (flags & (FLAG_SELFMANAGED_SNAPS|FLAG_POOL_SNAPS)) == 0) {
2108 if (!removed_snaps.empty()) {
2109 flags |= FLAG_SELFMANAGED_SNAPS;
2110 } else {
2111 flags |= FLAG_POOL_SNAPS;
2112 }
7c673cae
FG
2113 }
2114 if (struct_v >= 7) {
11fdf7f2 2115 decode(min_size, bl);
7c673cae
FG
2116 } else {
2117 min_size = size - size/2;
2118 }
2119 if (struct_v >= 8) {
11fdf7f2
TL
2120 decode(quota_max_bytes, bl);
2121 decode(quota_max_objects, bl);
7c673cae
FG
2122 }
2123 if (struct_v >= 9) {
11fdf7f2
TL
2124 decode(tiers, bl);
2125 decode(tier_of, bl);
7c673cae 2126 __u8 v;
11fdf7f2 2127 decode(v, bl);
7c673cae 2128 cache_mode = (cache_mode_t)v;
11fdf7f2
TL
2129 decode(read_tier, bl);
2130 decode(write_tier, bl);
7c673cae
FG
2131 }
2132 if (struct_v >= 10) {
11fdf7f2 2133 decode(properties, bl);
7c673cae
FG
2134 }
2135 if (struct_v >= 11) {
11fdf7f2
TL
2136 decode(hit_set_params, bl);
2137 decode(hit_set_period, bl);
2138 decode(hit_set_count, bl);
7c673cae
FG
2139 } else {
2140 pg_pool_t def;
2141 hit_set_period = def.hit_set_period;
2142 hit_set_count = def.hit_set_count;
2143 }
2144 if (struct_v >= 12) {
11fdf7f2 2145 decode(stripe_width, bl);
7c673cae
FG
2146 } else {
2147 set_stripe_width(0);
2148 }
2149 if (struct_v >= 13) {
11fdf7f2
TL
2150 decode(target_max_bytes, bl);
2151 decode(target_max_objects, bl);
2152 decode(cache_target_dirty_ratio_micro, bl);
2153 decode(cache_target_full_ratio_micro, bl);
2154 decode(cache_min_flush_age, bl);
2155 decode(cache_min_evict_age, bl);
7c673cae
FG
2156 } else {
2157 target_max_bytes = 0;
2158 target_max_objects = 0;
2159 cache_target_dirty_ratio_micro = 0;
2160 cache_target_full_ratio_micro = 0;
2161 cache_min_flush_age = 0;
2162 cache_min_evict_age = 0;
2163 }
2164 if (struct_v >= 14) {
11fdf7f2 2165 decode(erasure_code_profile, bl);
7c673cae
FG
2166 }
2167 if (struct_v >= 15) {
11fdf7f2 2168 decode(last_force_op_resend_preluminous, bl);
7c673cae
FG
2169 } else {
2170 last_force_op_resend_preluminous = 0;
2171 }
2172 if (struct_v >= 16) {
11fdf7f2 2173 decode(min_read_recency_for_promote, bl);
7c673cae
FG
2174 } else {
2175 min_read_recency_for_promote = 1;
2176 }
2177 if (struct_v >= 17) {
11fdf7f2 2178 decode(expected_num_objects, bl);
7c673cae
FG
2179 } else {
2180 expected_num_objects = 0;
2181 }
2182 if (struct_v >= 19) {
11fdf7f2 2183 decode(cache_target_dirty_high_ratio_micro, bl);
7c673cae
FG
2184 } else {
2185 cache_target_dirty_high_ratio_micro = cache_target_dirty_ratio_micro;
2186 }
2187 if (struct_v >= 20) {
11fdf7f2 2188 decode(min_write_recency_for_promote, bl);
7c673cae
FG
2189 } else {
2190 min_write_recency_for_promote = 1;
2191 }
2192 if (struct_v >= 21) {
11fdf7f2 2193 decode(use_gmt_hitset, bl);
7c673cae
FG
2194 } else {
2195 use_gmt_hitset = false;
2196 }
2197 if (struct_v >= 22) {
11fdf7f2 2198 decode(fast_read, bl);
7c673cae
FG
2199 } else {
2200 fast_read = false;
2201 }
2202 if (struct_v >= 23) {
11fdf7f2
TL
2203 decode(hit_set_grade_decay_rate, bl);
2204 decode(hit_set_search_last_n, bl);
7c673cae
FG
2205 } else {
2206 hit_set_grade_decay_rate = 0;
2207 hit_set_search_last_n = 1;
2208 }
2209 if (struct_v >= 24) {
11fdf7f2 2210 decode(opts, bl);
7c673cae
FG
2211 }
2212 if (struct_v >= 25) {
11fdf7f2 2213 decode(last_force_op_resend_prenautilus, bl);
7c673cae 2214 } else {
11fdf7f2 2215 last_force_op_resend_prenautilus = last_force_op_resend_preluminous;
7c673cae 2216 }
c07f9fc5 2217 if (struct_v >= 26) {
11fdf7f2
TL
2218 decode(application_metadata, bl);
2219 }
2220 if (struct_v >= 27) {
2221 decode(create_time, bl);
2222 }
2223 if (struct_v >= 28) {
2224 decode(pg_num_target, bl);
2225 decode(pgp_num_target, bl);
2226 decode(pg_num_pending, bl);
2227 epoch_t old_merge_last_epoch_clean, old_merge_last_epoch_started;
2228 decode(old_merge_last_epoch_started, bl);
2229 decode(old_merge_last_epoch_clean, bl);
2230 decode(last_force_op_resend, bl);
2231 decode(pg_autoscale_mode, bl);
2232 if (struct_v >= 29) {
2233 decode(last_pg_merge_meta, bl);
2234 } else {
2235 last_pg_merge_meta.last_epoch_clean = old_merge_last_epoch_clean;
2236 last_pg_merge_meta.last_epoch_started = old_merge_last_epoch_started;
2237 }
2238 } else {
2239 pg_num_target = pg_num;
2240 pgp_num_target = pgp_num;
2241 pg_num_pending = pg_num;
2242 last_force_op_resend = last_force_op_resend_prenautilus;
9f95a23c 2243 pg_autoscale_mode = pg_autoscale_mode_t::WARN; // default to warn on upgrade
c07f9fc5 2244 }
f67539c2
TL
2245 if (struct_v >= 30) {
2246 decode(peering_crush_bucket_count, bl);
2247 decode(peering_crush_bucket_target, bl);
2248 decode(peering_crush_bucket_barrier, bl);
2249 decode(peering_crush_mandatory_member, bl);
2250 }
7c673cae
FG
2251 DECODE_FINISH(bl);
2252 calc_pg_masks();
2253 calc_grade_table();
2254}
2255
f67539c2
TL
2256bool pg_pool_t::stretch_set_can_peer(const set<int>& want, const OSDMap& osdmap,
2257 std::ostream * out) const
2258{
2259 if (!is_stretch_pool()) return true;
2260 const uint32_t barrier_id = peering_crush_bucket_barrier;
2261 const uint32_t barrier_count = peering_crush_bucket_count;
2262 set<int> ancestors;
2263 const shared_ptr<CrushWrapper>& crush = osdmap.crush;
2264 for (int osdid : want) {
2265 int ancestor = crush->get_parent_of_type(osdid, barrier_id,
2266 crush_rule);
2267 ancestors.insert(ancestor);
2268 }
2269 if (ancestors.size() < barrier_count) {
2270 if (out) {
2271 *out << __func__ << ": not enough crush buckets with OSDs in want set "
2272 << want;
2273 }
2274 return false;
2275 } else if (peering_crush_mandatory_member != CRUSH_ITEM_NONE &&
2276 !ancestors.count(peering_crush_mandatory_member)) {
2277 if (out) {
2278 *out << __func__ << ": missing mandatory crush bucket member "
2279 << peering_crush_mandatory_member;
2280 }
2281 return false;
2282 }
2283 return true;
2284}
2285
7c673cae
FG
2286void pg_pool_t::generate_test_instances(list<pg_pool_t*>& o)
2287{
2288 pg_pool_t a;
2289 o.push_back(new pg_pool_t(a));
2290
11fdf7f2 2291 a.create_time = utime_t(4,5);
7c673cae
FG
2292 a.type = TYPE_REPLICATED;
2293 a.size = 2;
31f18b77 2294 a.crush_rule = 3;
7c673cae
FG
2295 a.object_hash = 4;
2296 a.pg_num = 6;
11fdf7f2
TL
2297 a.pgp_num = 4;
2298 a.pgp_num_target = 4;
2299 a.pg_num_target = 5;
2300 a.pg_num_pending = 5;
2301 a.last_pg_merge_meta.last_epoch_started = 2;
2302 a.last_pg_merge_meta.last_epoch_clean = 2;
7c673cae
FG
2303 a.last_change = 9;
2304 a.last_force_op_resend = 123823;
2305 a.last_force_op_resend_preluminous = 123824;
2306 a.snap_seq = 10;
2307 a.snap_epoch = 11;
11fdf7f2 2308 a.flags = FLAG_POOL_SNAPS;
7c673cae 2309 a.auid = 12;
7c673cae
FG
2310 a.quota_max_bytes = 473;
2311 a.quota_max_objects = 474;
2312 o.push_back(new pg_pool_t(a));
2313
2314 a.snaps[3].name = "asdf";
2315 a.snaps[3].snapid = 3;
2316 a.snaps[3].stamp = utime_t(123, 4);
2317 a.snaps[6].name = "qwer";
2318 a.snaps[6].snapid = 6;
2319 a.snaps[6].stamp = utime_t(23423, 4);
2320 o.push_back(new pg_pool_t(a));
2321
11fdf7f2
TL
2322 a.flags = FLAG_SELFMANAGED_SNAPS;
2323 a.snaps.clear();
2324 a.removed_snaps.insert(2);
7c673cae
FG
2325 a.quota_max_bytes = 2473;
2326 a.quota_max_objects = 4374;
2327 a.tiers.insert(0);
2328 a.tiers.insert(1);
2329 a.tier_of = 2;
2330 a.cache_mode = CACHEMODE_WRITEBACK;
2331 a.read_tier = 1;
2332 a.write_tier = 1;
2333 a.hit_set_params = HitSet::Params(new BloomHitSet::Params);
2334 a.hit_set_period = 3600;
2335 a.hit_set_count = 8;
2336 a.min_read_recency_for_promote = 1;
2337 a.min_write_recency_for_promote = 1;
2338 a.hit_set_grade_decay_rate = 50;
2339 a.hit_set_search_last_n = 1;
2340 a.calc_grade_table();
2341 a.set_stripe_width(12345);
2342 a.target_max_bytes = 1238132132;
2343 a.target_max_objects = 1232132;
2344 a.cache_target_dirty_ratio_micro = 187232;
2345 a.cache_target_dirty_high_ratio_micro = 309856;
2346 a.cache_target_full_ratio_micro = 987222;
2347 a.cache_min_flush_age = 231;
2348 a.cache_min_evict_age = 2321;
2349 a.erasure_code_profile = "profile in osdmap";
2350 a.expected_num_objects = 123456;
2351 a.fast_read = false;
c07f9fc5 2352 a.application_metadata = {{"rbd", {{"key", "value"}}}};
7c673cae
FG
2353 o.push_back(new pg_pool_t(a));
2354}
2355
2356ostream& operator<<(ostream& out, const pg_pool_t& p)
2357{
9f95a23c
TL
2358 out << p.get_type_name();
2359 if (p.get_type_name() == "erasure") {
2360 out << " profile " << p.erasure_code_profile;
2361 }
2362 out << " size " << p.get_size()
7c673cae 2363 << " min_size " << p.get_min_size()
31f18b77 2364 << " crush_rule " << p.get_crush_rule()
7c673cae
FG
2365 << " object_hash " << p.get_object_hash_name()
2366 << " pg_num " << p.get_pg_num()
11fdf7f2
TL
2367 << " pgp_num " << p.get_pgp_num();
2368 if (p.get_pg_num_target() != p.get_pg_num()) {
2369 out << " pg_num_target " << p.get_pg_num_target();
2370 }
2371 if (p.get_pgp_num_target() != p.get_pgp_num()) {
2372 out << " pgp_num_target " << p.get_pgp_num_target();
2373 }
2374 if (p.get_pg_num_pending() != p.get_pg_num()) {
2375 out << " pg_num_pending " << p.get_pg_num_pending();
2376 }
9f95a23c 2377 if (p.pg_autoscale_mode != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
11fdf7f2
TL
2378 out << " autoscale_mode " << p.get_pg_autoscale_mode_name(p.pg_autoscale_mode);
2379 }
2380 out << " last_change " << p.get_last_change();
7c673cae 2381 if (p.get_last_force_op_resend() ||
11fdf7f2 2382 p.get_last_force_op_resend_prenautilus() ||
7c673cae
FG
2383 p.get_last_force_op_resend_preluminous())
2384 out << " lfor " << p.get_last_force_op_resend() << "/"
11fdf7f2 2385 << p.get_last_force_op_resend_prenautilus() << "/"
7c673cae
FG
2386 << p.get_last_force_op_resend_preluminous();
2387 if (p.get_auid())
2388 out << " owner " << p.get_auid();
2389 if (p.flags)
2390 out << " flags " << p.get_flags_string();
7c673cae
FG
2391 if (p.quota_max_bytes)
2392 out << " max_bytes " << p.quota_max_bytes;
2393 if (p.quota_max_objects)
2394 out << " max_objects " << p.quota_max_objects;
2395 if (!p.tiers.empty())
2396 out << " tiers " << p.tiers;
2397 if (p.is_tier())
2398 out << " tier_of " << p.tier_of;
2399 if (p.has_read_tier())
2400 out << " read_tier " << p.read_tier;
2401 if (p.has_write_tier())
2402 out << " write_tier " << p.write_tier;
2403 if (p.cache_mode)
2404 out << " cache_mode " << p.get_cache_mode_name();
2405 if (p.target_max_bytes)
2406 out << " target_bytes " << p.target_max_bytes;
2407 if (p.target_max_objects)
2408 out << " target_objects " << p.target_max_objects;
2409 if (p.hit_set_params.get_type() != HitSet::TYPE_NONE) {
2410 out << " hit_set " << p.hit_set_params
2411 << " " << p.hit_set_period << "s"
2412 << " x" << p.hit_set_count << " decay_rate "
2413 << p.hit_set_grade_decay_rate
2414 << " search_last_n " << p.hit_set_search_last_n;
2415 }
2416 if (p.min_read_recency_for_promote)
2417 out << " min_read_recency_for_promote " << p.min_read_recency_for_promote;
2418 if (p.min_write_recency_for_promote)
2419 out << " min_write_recency_for_promote " << p.min_write_recency_for_promote;
2420 out << " stripe_width " << p.get_stripe_width();
2421 if (p.expected_num_objects)
2422 out << " expected_num_objects " << p.expected_num_objects;
2423 if (p.fast_read)
2424 out << " fast_read " << p.fast_read;
2425 out << p.opts;
c07f9fc5
FG
2426 if (!p.application_metadata.empty()) {
2427 out << " application ";
2428 for (auto it = p.application_metadata.begin();
2429 it != p.application_metadata.end(); ++it) {
2430 if (it != p.application_metadata.begin())
2431 out << ",";
2432 out << it->first;
2433 }
2434 }
7c673cae
FG
2435 return out;
2436}
2437
2438
2439// -- object_stat_sum_t --
2440
2441void object_stat_sum_t::dump(Formatter *f) const
2442{
2443 f->dump_int("num_bytes", num_bytes);
2444 f->dump_int("num_objects", num_objects);
2445 f->dump_int("num_object_clones", num_object_clones);
2446 f->dump_int("num_object_copies", num_object_copies);
2447 f->dump_int("num_objects_missing_on_primary", num_objects_missing_on_primary);
2448 f->dump_int("num_objects_missing", num_objects_missing);
2449 f->dump_int("num_objects_degraded", num_objects_degraded);
2450 f->dump_int("num_objects_misplaced", num_objects_misplaced);
2451 f->dump_int("num_objects_unfound", num_objects_unfound);
2452 f->dump_int("num_objects_dirty", num_objects_dirty);
2453 f->dump_int("num_whiteouts", num_whiteouts);
2454 f->dump_int("num_read", num_rd);
2455 f->dump_int("num_read_kb", num_rd_kb);
2456 f->dump_int("num_write", num_wr);
2457 f->dump_int("num_write_kb", num_wr_kb);
2458 f->dump_int("num_scrub_errors", num_scrub_errors);
2459 f->dump_int("num_shallow_scrub_errors", num_shallow_scrub_errors);
2460 f->dump_int("num_deep_scrub_errors", num_deep_scrub_errors);
2461 f->dump_int("num_objects_recovered", num_objects_recovered);
2462 f->dump_int("num_bytes_recovered", num_bytes_recovered);
2463 f->dump_int("num_keys_recovered", num_keys_recovered);
2464 f->dump_int("num_objects_omap", num_objects_omap);
2465 f->dump_int("num_objects_hit_set_archive", num_objects_hit_set_archive);
2466 f->dump_int("num_bytes_hit_set_archive", num_bytes_hit_set_archive);
2467 f->dump_int("num_flush", num_flush);
2468 f->dump_int("num_flush_kb", num_flush_kb);
2469 f->dump_int("num_evict", num_evict);
2470 f->dump_int("num_evict_kb", num_evict_kb);
2471 f->dump_int("num_promote", num_promote);
2472 f->dump_int("num_flush_mode_high", num_flush_mode_high);
2473 f->dump_int("num_flush_mode_low", num_flush_mode_low);
2474 f->dump_int("num_evict_mode_some", num_evict_mode_some);
2475 f->dump_int("num_evict_mode_full", num_evict_mode_full);
2476 f->dump_int("num_objects_pinned", num_objects_pinned);
2477 f->dump_int("num_legacy_snapsets", num_legacy_snapsets);
28e407b8 2478 f->dump_int("num_large_omap_objects", num_large_omap_objects);
11fdf7f2
TL
2479 f->dump_int("num_objects_manifest", num_objects_manifest);
2480 f->dump_int("num_omap_bytes", num_omap_bytes);
2481 f->dump_int("num_omap_keys", num_omap_keys);
2482 f->dump_int("num_objects_repaired", num_objects_repaired);
7c673cae
FG
2483}
2484
9f95a23c 2485void object_stat_sum_t::encode(ceph::buffer::list& bl) const
7c673cae 2486{
11fdf7f2 2487 ENCODE_START(20, 14, bl);
7c673cae
FG
2488#if defined(CEPH_LITTLE_ENDIAN)
2489 bl.append((char *)(&num_bytes), sizeof(object_stat_sum_t));
2490#else
11fdf7f2
TL
2491 encode(num_bytes, bl);
2492 encode(num_objects, bl);
2493 encode(num_object_clones, bl);
2494 encode(num_object_copies, bl);
2495 encode(num_objects_missing_on_primary, bl);
2496 encode(num_objects_degraded, bl);
2497 encode(num_objects_unfound, bl);
2498 encode(num_rd, bl);
2499 encode(num_rd_kb, bl);
2500 encode(num_wr, bl);
2501 encode(num_wr_kb, bl);
2502 encode(num_scrub_errors, bl);
2503 encode(num_objects_recovered, bl);
2504 encode(num_bytes_recovered, bl);
2505 encode(num_keys_recovered, bl);
2506 encode(num_shallow_scrub_errors, bl);
2507 encode(num_deep_scrub_errors, bl);
2508 encode(num_objects_dirty, bl);
2509 encode(num_whiteouts, bl);
2510 encode(num_objects_omap, bl);
2511 encode(num_objects_hit_set_archive, bl);
2512 encode(num_objects_misplaced, bl);
2513 encode(num_bytes_hit_set_archive, bl);
2514 encode(num_flush, bl);
2515 encode(num_flush_kb, bl);
2516 encode(num_evict, bl);
2517 encode(num_evict_kb, bl);
2518 encode(num_promote, bl);
2519 encode(num_flush_mode_high, bl);
2520 encode(num_flush_mode_low, bl);
2521 encode(num_evict_mode_some, bl);
2522 encode(num_evict_mode_full, bl);
2523 encode(num_objects_pinned, bl);
2524 encode(num_objects_missing, bl);
2525 encode(num_legacy_snapsets, bl);
2526 encode(num_large_omap_objects, bl);
2527 encode(num_objects_manifest, bl);
2528 encode(num_omap_bytes, bl);
2529 encode(num_omap_keys, bl);
2530 encode(num_objects_repaired, bl);
7c673cae
FG
2531#endif
2532 ENCODE_FINISH(bl);
2533}
2534
9f95a23c 2535void object_stat_sum_t::decode(ceph::buffer::list::const_iterator& bl)
7c673cae
FG
2536{
2537 bool decode_finish = false;
11fdf7f2
TL
2538 static const int STAT_SUM_DECODE_VERSION = 20;
2539 DECODE_START(STAT_SUM_DECODE_VERSION, bl);
7c673cae 2540#if defined(CEPH_LITTLE_ENDIAN)
11fdf7f2 2541 if (struct_v == STAT_SUM_DECODE_VERSION) {
7c673cae
FG
2542 bl.copy(sizeof(object_stat_sum_t), (char*)(&num_bytes));
2543 decode_finish = true;
2544 }
2545#endif
2546 if (!decode_finish) {
11fdf7f2
TL
2547 decode(num_bytes, bl);
2548 decode(num_objects, bl);
2549 decode(num_object_clones, bl);
2550 decode(num_object_copies, bl);
2551 decode(num_objects_missing_on_primary, bl);
2552 decode(num_objects_degraded, bl);
2553 decode(num_objects_unfound, bl);
2554 decode(num_rd, bl);
2555 decode(num_rd_kb, bl);
2556 decode(num_wr, bl);
2557 decode(num_wr_kb, bl);
2558 decode(num_scrub_errors, bl);
2559 decode(num_objects_recovered, bl);
2560 decode(num_bytes_recovered, bl);
2561 decode(num_keys_recovered, bl);
2562 decode(num_shallow_scrub_errors, bl);
2563 decode(num_deep_scrub_errors, bl);
2564 decode(num_objects_dirty, bl);
2565 decode(num_whiteouts, bl);
2566 decode(num_objects_omap, bl);
2567 decode(num_objects_hit_set_archive, bl);
2568 decode(num_objects_misplaced, bl);
2569 decode(num_bytes_hit_set_archive, bl);
2570 decode(num_flush, bl);
2571 decode(num_flush_kb, bl);
2572 decode(num_evict, bl);
2573 decode(num_evict_kb, bl);
2574 decode(num_promote, bl);
2575 decode(num_flush_mode_high, bl);
2576 decode(num_flush_mode_low, bl);
2577 decode(num_evict_mode_some, bl);
2578 decode(num_evict_mode_full, bl);
2579 decode(num_objects_pinned, bl);
2580 decode(num_objects_missing, bl);
7c673cae 2581 if (struct_v >= 16) {
11fdf7f2 2582 decode(num_legacy_snapsets, bl);
7c673cae
FG
2583 } else {
2584 num_legacy_snapsets = num_object_clones; // upper bound
2585 }
28e407b8 2586 if (struct_v >= 17) {
11fdf7f2
TL
2587 decode(num_large_omap_objects, bl);
2588 }
2589 if (struct_v >= 18) {
2590 decode(num_objects_manifest, bl);
2591 }
2592 if (struct_v >= 19) {
2593 decode(num_omap_bytes, bl);
2594 decode(num_omap_keys, bl);
2595 }
2596 if (struct_v >= 20) {
2597 decode(num_objects_repaired, bl);
28e407b8 2598 }
7c673cae
FG
2599 }
2600 DECODE_FINISH(bl);
2601}
2602
2603void object_stat_sum_t::generate_test_instances(list<object_stat_sum_t*>& o)
2604{
2605 object_stat_sum_t a;
2606
2607 a.num_bytes = 1;
2608 a.num_objects = 3;
2609 a.num_object_clones = 4;
2610 a.num_object_copies = 5;
2611 a.num_objects_missing_on_primary = 6;
2612 a.num_objects_missing = 123;
2613 a.num_objects_degraded = 7;
2614 a.num_objects_unfound = 8;
2615 a.num_rd = 9; a.num_rd_kb = 10;
2616 a.num_wr = 11; a.num_wr_kb = 12;
2617 a.num_objects_recovered = 14;
2618 a.num_bytes_recovered = 15;
2619 a.num_keys_recovered = 16;
2620 a.num_deep_scrub_errors = 17;
2621 a.num_shallow_scrub_errors = 18;
2622 a.num_scrub_errors = a.num_deep_scrub_errors + a.num_shallow_scrub_errors;
2623 a.num_objects_dirty = 21;
2624 a.num_whiteouts = 22;
2625 a.num_objects_misplaced = 1232;
2626 a.num_objects_hit_set_archive = 2;
2627 a.num_bytes_hit_set_archive = 27;
2628 a.num_flush = 5;
2629 a.num_flush_kb = 6;
2630 a.num_evict = 7;
2631 a.num_evict_kb = 8;
2632 a.num_promote = 9;
2633 a.num_flush_mode_high = 0;
2634 a.num_flush_mode_low = 1;
2635 a.num_evict_mode_some = 1;
2636 a.num_evict_mode_full = 0;
2637 a.num_objects_pinned = 20;
28e407b8 2638 a.num_large_omap_objects = 5;
11fdf7f2
TL
2639 a.num_objects_manifest = 2;
2640 a.num_omap_bytes = 20000;
2641 a.num_omap_keys = 200;
2642 a.num_objects_repaired = 300;
7c673cae
FG
2643 o.push_back(new object_stat_sum_t(a));
2644}
2645
2646void object_stat_sum_t::add(const object_stat_sum_t& o)
2647{
2648 num_bytes += o.num_bytes;
2649 num_objects += o.num_objects;
2650 num_object_clones += o.num_object_clones;
2651 num_object_copies += o.num_object_copies;
2652 num_objects_missing_on_primary += o.num_objects_missing_on_primary;
2653 num_objects_missing += o.num_objects_missing;
2654 num_objects_degraded += o.num_objects_degraded;
2655 num_objects_misplaced += o.num_objects_misplaced;
2656 num_rd += o.num_rd;
2657 num_rd_kb += o.num_rd_kb;
2658 num_wr += o.num_wr;
2659 num_wr_kb += o.num_wr_kb;
2660 num_objects_unfound += o.num_objects_unfound;
2661 num_scrub_errors += o.num_scrub_errors;
2662 num_shallow_scrub_errors += o.num_shallow_scrub_errors;
2663 num_deep_scrub_errors += o.num_deep_scrub_errors;
2664 num_objects_recovered += o.num_objects_recovered;
2665 num_bytes_recovered += o.num_bytes_recovered;
2666 num_keys_recovered += o.num_keys_recovered;
2667 num_objects_dirty += o.num_objects_dirty;
2668 num_whiteouts += o.num_whiteouts;
2669 num_objects_omap += o.num_objects_omap;
2670 num_objects_hit_set_archive += o.num_objects_hit_set_archive;
2671 num_bytes_hit_set_archive += o.num_bytes_hit_set_archive;
2672 num_flush += o.num_flush;
2673 num_flush_kb += o.num_flush_kb;
2674 num_evict += o.num_evict;
2675 num_evict_kb += o.num_evict_kb;
2676 num_promote += o.num_promote;
2677 num_flush_mode_high += o.num_flush_mode_high;
2678 num_flush_mode_low += o.num_flush_mode_low;
2679 num_evict_mode_some += o.num_evict_mode_some;
2680 num_evict_mode_full += o.num_evict_mode_full;
2681 num_objects_pinned += o.num_objects_pinned;
2682 num_legacy_snapsets += o.num_legacy_snapsets;
28e407b8 2683 num_large_omap_objects += o.num_large_omap_objects;
11fdf7f2
TL
2684 num_objects_manifest += o.num_objects_manifest;
2685 num_omap_bytes += o.num_omap_bytes;
2686 num_omap_keys += o.num_omap_keys;
2687 num_objects_repaired += o.num_objects_repaired;
7c673cae
FG
2688}
2689
2690void object_stat_sum_t::sub(const object_stat_sum_t& o)
2691{
2692 num_bytes -= o.num_bytes;
2693 num_objects -= o.num_objects;
2694 num_object_clones -= o.num_object_clones;
2695 num_object_copies -= o.num_object_copies;
2696 num_objects_missing_on_primary -= o.num_objects_missing_on_primary;
2697 num_objects_missing -= o.num_objects_missing;
2698 num_objects_degraded -= o.num_objects_degraded;
2699 num_objects_misplaced -= o.num_objects_misplaced;
2700 num_rd -= o.num_rd;
2701 num_rd_kb -= o.num_rd_kb;
2702 num_wr -= o.num_wr;
2703 num_wr_kb -= o.num_wr_kb;
2704 num_objects_unfound -= o.num_objects_unfound;
2705 num_scrub_errors -= o.num_scrub_errors;
2706 num_shallow_scrub_errors -= o.num_shallow_scrub_errors;
2707 num_deep_scrub_errors -= o.num_deep_scrub_errors;
2708 num_objects_recovered -= o.num_objects_recovered;
2709 num_bytes_recovered -= o.num_bytes_recovered;
2710 num_keys_recovered -= o.num_keys_recovered;
2711 num_objects_dirty -= o.num_objects_dirty;
2712 num_whiteouts -= o.num_whiteouts;
2713 num_objects_omap -= o.num_objects_omap;
2714 num_objects_hit_set_archive -= o.num_objects_hit_set_archive;
2715 num_bytes_hit_set_archive -= o.num_bytes_hit_set_archive;
2716 num_flush -= o.num_flush;
2717 num_flush_kb -= o.num_flush_kb;
2718 num_evict -= o.num_evict;
2719 num_evict_kb -= o.num_evict_kb;
2720 num_promote -= o.num_promote;
2721 num_flush_mode_high -= o.num_flush_mode_high;
2722 num_flush_mode_low -= o.num_flush_mode_low;
2723 num_evict_mode_some -= o.num_evict_mode_some;
2724 num_evict_mode_full -= o.num_evict_mode_full;
2725 num_objects_pinned -= o.num_objects_pinned;
2726 num_legacy_snapsets -= o.num_legacy_snapsets;
28e407b8 2727 num_large_omap_objects -= o.num_large_omap_objects;
11fdf7f2
TL
2728 num_objects_manifest -= o.num_objects_manifest;
2729 num_omap_bytes -= o.num_omap_bytes;
2730 num_omap_keys -= o.num_omap_keys;
2731 num_objects_repaired -= o.num_objects_repaired;
7c673cae
FG
2732}
2733
2734bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r)
2735{
2736 return
2737 l.num_bytes == r.num_bytes &&
2738 l.num_objects == r.num_objects &&
2739 l.num_object_clones == r.num_object_clones &&
2740 l.num_object_copies == r.num_object_copies &&
2741 l.num_objects_missing_on_primary == r.num_objects_missing_on_primary &&
2742 l.num_objects_missing == r.num_objects_missing &&
2743 l.num_objects_degraded == r.num_objects_degraded &&
2744 l.num_objects_misplaced == r.num_objects_misplaced &&
2745 l.num_objects_unfound == r.num_objects_unfound &&
2746 l.num_rd == r.num_rd &&
2747 l.num_rd_kb == r.num_rd_kb &&
2748 l.num_wr == r.num_wr &&
2749 l.num_wr_kb == r.num_wr_kb &&
2750 l.num_scrub_errors == r.num_scrub_errors &&
2751 l.num_shallow_scrub_errors == r.num_shallow_scrub_errors &&
2752 l.num_deep_scrub_errors == r.num_deep_scrub_errors &&
2753 l.num_objects_recovered == r.num_objects_recovered &&
2754 l.num_bytes_recovered == r.num_bytes_recovered &&
2755 l.num_keys_recovered == r.num_keys_recovered &&
2756 l.num_objects_dirty == r.num_objects_dirty &&
2757 l.num_whiteouts == r.num_whiteouts &&
2758 l.num_objects_omap == r.num_objects_omap &&
2759 l.num_objects_hit_set_archive == r.num_objects_hit_set_archive &&
2760 l.num_bytes_hit_set_archive == r.num_bytes_hit_set_archive &&
2761 l.num_flush == r.num_flush &&
2762 l.num_flush_kb == r.num_flush_kb &&
2763 l.num_evict == r.num_evict &&
2764 l.num_evict_kb == r.num_evict_kb &&
2765 l.num_promote == r.num_promote &&
2766 l.num_flush_mode_high == r.num_flush_mode_high &&
2767 l.num_flush_mode_low == r.num_flush_mode_low &&
2768 l.num_evict_mode_some == r.num_evict_mode_some &&
2769 l.num_evict_mode_full == r.num_evict_mode_full &&
2770 l.num_objects_pinned == r.num_objects_pinned &&
28e407b8 2771 l.num_legacy_snapsets == r.num_legacy_snapsets &&
11fdf7f2
TL
2772 l.num_large_omap_objects == r.num_large_omap_objects &&
2773 l.num_objects_manifest == r.num_objects_manifest &&
2774 l.num_omap_bytes == r.num_omap_bytes &&
2775 l.num_omap_keys == r.num_omap_keys &&
2776 l.num_objects_repaired == r.num_objects_repaired;
7c673cae
FG
2777}
2778
2779// -- object_stat_collection_t --
2780
2781void object_stat_collection_t::dump(Formatter *f) const
2782{
2783 f->open_object_section("stat_sum");
2784 sum.dump(f);
2785 f->close_section();
2786}
2787
9f95a23c 2788void object_stat_collection_t::encode(ceph::buffer::list& bl) const
7c673cae
FG
2789{
2790 ENCODE_START(2, 2, bl);
11fdf7f2
TL
2791 encode(sum, bl);
2792 encode((__u32)0, bl);
7c673cae
FG
2793 ENCODE_FINISH(bl);
2794}
2795
9f95a23c 2796void object_stat_collection_t::decode(ceph::buffer::list::const_iterator& bl)
7c673cae
FG
2797{
2798 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
11fdf7f2 2799 decode(sum, bl);
7c673cae
FG
2800 {
2801 map<string,object_stat_sum_t> cat_sum;
11fdf7f2 2802 decode(cat_sum, bl);
7c673cae
FG
2803 }
2804 DECODE_FINISH(bl);
2805}
2806
2807void object_stat_collection_t::generate_test_instances(list<object_stat_collection_t*>& o)
2808{
2809 object_stat_collection_t a;
2810 o.push_back(new object_stat_collection_t(a));
2811 list<object_stat_sum_t*> l;
2812 object_stat_sum_t::generate_test_instances(l);
9f95a23c 2813 for (auto p = l.begin(); p != l.end(); ++p) {
7c673cae
FG
2814 a.add(**p);
2815 o.push_back(new object_stat_collection_t(a));
2816 }
2817}
2818
2819
2820// -- pg_stat_t --
2821
2822bool pg_stat_t::is_acting_osd(int32_t osd, bool primary) const
2823{
2824 if (primary && osd == acting_primary) {
2825 return true;
2826 } else if (!primary) {
9f95a23c 2827 for(auto it = acting.cbegin(); it != acting.cend(); ++it)
7c673cae
FG
2828 {
2829 if (*it == osd)
2830 return true;
2831 }
2832 }
2833 return false;
2834}
2835
2836void pg_stat_t::dump(Formatter *f) const
2837{
2838 f->dump_stream("version") << version;
b3b6e05e
TL
2839 f->dump_unsigned("reported_seq", reported_seq);
2840 f->dump_unsigned("reported_epoch", reported_epoch);
7c673cae
FG
2841 f->dump_string("state", pg_state_string(state));
2842 f->dump_stream("last_fresh") << last_fresh;
2843 f->dump_stream("last_change") << last_change;
2844 f->dump_stream("last_active") << last_active;
2845 f->dump_stream("last_peered") << last_peered;
2846 f->dump_stream("last_clean") << last_clean;
2847 f->dump_stream("last_became_active") << last_became_active;
2848 f->dump_stream("last_became_peered") << last_became_peered;
2849 f->dump_stream("last_unstale") << last_unstale;
2850 f->dump_stream("last_undegraded") << last_undegraded;
2851 f->dump_stream("last_fullsized") << last_fullsized;
2852 f->dump_unsigned("mapping_epoch", mapping_epoch);
2853 f->dump_stream("log_start") << log_start;
2854 f->dump_stream("ondisk_log_start") << ondisk_log_start;
2855 f->dump_unsigned("created", created);
2856 f->dump_unsigned("last_epoch_clean", last_epoch_clean);
2857 f->dump_stream("parent") << parent;
2858 f->dump_unsigned("parent_split_bits", parent_split_bits);
2859 f->dump_stream("last_scrub") << last_scrub;
2860 f->dump_stream("last_scrub_stamp") << last_scrub_stamp;
2861 f->dump_stream("last_deep_scrub") << last_deep_scrub;
2862 f->dump_stream("last_deep_scrub_stamp") << last_deep_scrub_stamp;
2863 f->dump_stream("last_clean_scrub_stamp") << last_clean_scrub_stamp;
20effc67 2864 f->dump_int("objects_scrubbed", objects_scrubbed);
7c673cae 2865 f->dump_int("log_size", log_size);
1e59de90 2866 f->dump_int("log_dups_size", log_dups_size);
7c673cae
FG
2867 f->dump_int("ondisk_log_size", ondisk_log_size);
2868 f->dump_bool("stats_invalid", stats_invalid);
2869 f->dump_bool("dirty_stats_invalid", dirty_stats_invalid);
2870 f->dump_bool("omap_stats_invalid", omap_stats_invalid);
2871 f->dump_bool("hitset_stats_invalid", hitset_stats_invalid);
2872 f->dump_bool("hitset_bytes_stats_invalid", hitset_bytes_stats_invalid);
2873 f->dump_bool("pin_stats_invalid", pin_stats_invalid);
11fdf7f2 2874 f->dump_bool("manifest_stats_invalid", manifest_stats_invalid);
b32b8144 2875 f->dump_unsigned("snaptrimq_len", snaptrimq_len);
20effc67
TL
2876 f->dump_int("last_scrub_duration", last_scrub_duration);
2877 f->dump_string("scrub_schedule", dump_scrub_schedule());
1d09f67e
TL
2878 f->dump_float("scrub_duration", scrub_duration);
2879 f->dump_int("objects_trimmed", objects_trimmed);
2880 f->dump_float("snaptrim_duration", snaptrim_duration);
7c673cae
FG
2881 stats.dump(f);
2882 f->open_array_section("up");
9f95a23c 2883 for (auto p = up.cbegin(); p != up.cend(); ++p)
7c673cae
FG
2884 f->dump_int("osd", *p);
2885 f->close_section();
2886 f->open_array_section("acting");
9f95a23c 2887 for (auto p = acting.cbegin(); p != acting.cend(); ++p)
7c673cae
FG
2888 f->dump_int("osd", *p);
2889 f->close_section();
81eedcae
TL
2890 f->open_array_section("avail_no_missing");
2891 for (auto p = avail_no_missing.cbegin(); p != avail_no_missing.cend(); ++p)
2892 f->dump_stream("shard") << *p;
2893 f->close_section();
2894 f->open_array_section("object_location_counts");
2895 for (auto p = object_location_counts.cbegin(); p != object_location_counts.cend(); ++p) {
2896 f->open_object_section("entry");
2897 f->dump_stream("shards") << p->first;
2898 f->dump_int("objects", p->second);
2899 f->close_section();
2900 }
2901 f->close_section();
7c673cae 2902 f->open_array_section("blocked_by");
9f95a23c 2903 for (auto p = blocked_by.cbegin(); p != blocked_by.cend(); ++p)
7c673cae
FG
2904 f->dump_int("osd", *p);
2905 f->close_section();
2906 f->dump_int("up_primary", up_primary);
2907 f->dump_int("acting_primary", acting_primary);
11fdf7f2 2908 f->open_array_section("purged_snaps");
9f95a23c 2909 for (auto i = purged_snaps.begin(); i != purged_snaps.end(); ++i) {
11fdf7f2
TL
2910 f->open_object_section("interval");
2911 f->dump_stream("start") << i.get_start();
2912 f->dump_stream("length") << i.get_len();
2913 f->close_section();
2914 }
2915 f->close_section();
7c673cae
FG
2916}
2917
2918void pg_stat_t::dump_brief(Formatter *f) const
2919{
2920 f->dump_string("state", pg_state_string(state));
2921 f->open_array_section("up");
9f95a23c 2922 for (auto p = up.cbegin(); p != up.cend(); ++p)
7c673cae
FG
2923 f->dump_int("osd", *p);
2924 f->close_section();
2925 f->open_array_section("acting");
9f95a23c 2926 for (auto p = acting.cbegin(); p != acting.cend(); ++p)
7c673cae
FG
2927 f->dump_int("osd", *p);
2928 f->close_section();
2929 f->dump_int("up_primary", up_primary);
2930 f->dump_int("acting_primary", acting_primary);
2931}
2932
20effc67
TL
2933std::string pg_stat_t::dump_scrub_schedule() const
2934{
2935 if (scrub_sched_status.m_is_active) {
1e59de90
TL
2936 // are we blocked (in fact, stuck) on some locked object?
2937 if (scrub_sched_status.m_sched_status == pg_scrub_sched_status_t::blocked) {
2938 return fmt::format(
2939 "Blocked! locked objects (for {}s)",
2940 scrub_sched_status.m_duration_seconds);
2941 } else {
2942 return fmt::format(
2943 "{}scrubbing for {}s",
2944 ((scrub_sched_status.m_is_deep == scrub_level_t::deep) ? "deep " : ""),
2945 scrub_sched_status.m_duration_seconds);
2946 }
20effc67
TL
2947 }
2948 switch (scrub_sched_status.m_sched_status) {
2949 case pg_scrub_sched_status_t::unknown:
2950 // no reported scrub schedule yet
2951 return "--"s;
2952 case pg_scrub_sched_status_t::not_queued:
2953 return "no scrub is scheduled"s;
2954 case pg_scrub_sched_status_t::scheduled:
2955 return fmt::format(
2956 "{} {}scrub scheduled @ {}",
2957 (scrub_sched_status.m_is_periodic ? "periodic" : "user requested"),
2958 ((scrub_sched_status.m_is_deep == scrub_level_t::deep) ? "deep " : ""),
2959 scrub_sched_status.m_scheduled_at);
2960 case pg_scrub_sched_status_t::queued:
2961 return fmt::format(
2962 "queued for {}scrub",
2963 ((scrub_sched_status.m_is_deep == scrub_level_t::deep) ? "deep " : ""));
2964 default:
2965 // a bug!
2966 return "SCRUB STATE MISMATCH!"s;
2967 }
2968}
2969
2970bool operator==(const pg_scrubbing_status_t& l, const pg_scrubbing_status_t& r)
2971{
2972 return
2973 l.m_sched_status == r.m_sched_status &&
2974 l.m_scheduled_at == r.m_scheduled_at &&
2975 l.m_duration_seconds == r.m_duration_seconds &&
2976 l.m_is_active == r.m_is_active &&
2977 l.m_is_deep == r.m_is_deep &&
2978 l.m_is_periodic == r.m_is_periodic;
2979}
2980
9f95a23c 2981void pg_stat_t::encode(ceph::buffer::list &bl) const
7c673cae 2982{
1e59de90 2983 ENCODE_START(29, 22, bl);
11fdf7f2
TL
2984 encode(version, bl);
2985 encode(reported_seq, bl);
2986 encode(reported_epoch, bl);
2987 encode((__u32)state, bl); // for older peers
2988 encode(log_start, bl);
2989 encode(ondisk_log_start, bl);
2990 encode(created, bl);
2991 encode(last_epoch_clean, bl);
2992 encode(parent, bl);
2993 encode(parent_split_bits, bl);
2994 encode(last_scrub, bl);
2995 encode(last_scrub_stamp, bl);
2996 encode(stats, bl);
2997 encode(log_size, bl);
2998 encode(ondisk_log_size, bl);
2999 encode(up, bl);
3000 encode(acting, bl);
3001 encode(last_fresh, bl);
3002 encode(last_change, bl);
3003 encode(last_active, bl);
3004 encode(last_clean, bl);
3005 encode(last_unstale, bl);
3006 encode(mapping_epoch, bl);
3007 encode(last_deep_scrub, bl);
3008 encode(last_deep_scrub_stamp, bl);
3009 encode(stats_invalid, bl);
3010 encode(last_clean_scrub_stamp, bl);
3011 encode(last_became_active, bl);
3012 encode(dirty_stats_invalid, bl);
3013 encode(up_primary, bl);
3014 encode(acting_primary, bl);
3015 encode(omap_stats_invalid, bl);
3016 encode(hitset_stats_invalid, bl);
3017 encode(blocked_by, bl);
3018 encode(last_undegraded, bl);
3019 encode(last_fullsized, bl);
3020 encode(hitset_bytes_stats_invalid, bl);
3021 encode(last_peered, bl);
3022 encode(last_became_peered, bl);
3023 encode(pin_stats_invalid, bl);
3024 encode(snaptrimq_len, bl);
3025 __u32 top_state = (state >> 32);
3026 encode(top_state, bl);
3027 encode(purged_snaps, bl);
3028 encode(manifest_stats_invalid, bl);
81eedcae
TL
3029 encode(avail_no_missing, bl);
3030 encode(object_location_counts, bl);
20effc67
TL
3031 encode(last_scrub_duration, bl);
3032 encode(scrub_sched_status.m_scheduled_at, bl);
3033 encode(scrub_sched_status.m_duration_seconds, bl);
3034 encode((__u16)scrub_sched_status.m_sched_status, bl);
3035 encode(scrub_sched_status.m_is_active, bl);
3036 encode((scrub_sched_status.m_is_deep==scrub_level_t::deep), bl);
3037 encode(scrub_sched_status.m_is_periodic, bl);
3038 encode(objects_scrubbed, bl);
1d09f67e
TL
3039 encode(scrub_duration, bl);
3040 encode(objects_trimmed, bl);
3041 encode(snaptrim_duration, bl);
1e59de90 3042 encode(log_dups_size, bl);
20effc67 3043
7c673cae
FG
3044 ENCODE_FINISH(bl);
3045}
3046
9f95a23c 3047void pg_stat_t::decode(ceph::buffer::list::const_iterator &bl)
7c673cae
FG
3048{
3049 bool tmp;
11fdf7f2 3050 uint32_t old_state;
1e59de90 3051 DECODE_START(29, bl);
11fdf7f2
TL
3052 decode(version, bl);
3053 decode(reported_seq, bl);
3054 decode(reported_epoch, bl);
3055 decode(old_state, bl);
3056 decode(log_start, bl);
3057 decode(ondisk_log_start, bl);
3058 decode(created, bl);
3059 decode(last_epoch_clean, bl);
3060 decode(parent, bl);
3061 decode(parent_split_bits, bl);
3062 decode(last_scrub, bl);
3063 decode(last_scrub_stamp, bl);
3064 decode(stats, bl);
3065 decode(log_size, bl);
3066 decode(ondisk_log_size, bl);
3067 decode(up, bl);
3068 decode(acting, bl);
3069 decode(last_fresh, bl);
3070 decode(last_change, bl);
3071 decode(last_active, bl);
3072 decode(last_clean, bl);
3073 decode(last_unstale, bl);
3074 decode(mapping_epoch, bl);
3075 decode(last_deep_scrub, bl);
3076 decode(last_deep_scrub_stamp, bl);
3077 decode(tmp, bl);
7c673cae 3078 stats_invalid = tmp;
11fdf7f2
TL
3079 decode(last_clean_scrub_stamp, bl);
3080 decode(last_became_active, bl);
3081 decode(tmp, bl);
7c673cae 3082 dirty_stats_invalid = tmp;
11fdf7f2
TL
3083 decode(up_primary, bl);
3084 decode(acting_primary, bl);
3085 decode(tmp, bl);
7c673cae 3086 omap_stats_invalid = tmp;
11fdf7f2 3087 decode(tmp, bl);
7c673cae 3088 hitset_stats_invalid = tmp;
11fdf7f2
TL
3089 decode(blocked_by, bl);
3090 decode(last_undegraded, bl);
3091 decode(last_fullsized, bl);
3092 decode(tmp, bl);
7c673cae 3093 hitset_bytes_stats_invalid = tmp;
11fdf7f2
TL
3094 decode(last_peered, bl);
3095 decode(last_became_peered, bl);
3096 decode(tmp, bl);
7c673cae 3097 pin_stats_invalid = tmp;
b32b8144 3098 if (struct_v >= 23) {
11fdf7f2
TL
3099 decode(snaptrimq_len, bl);
3100 if (struct_v >= 24) {
3101 __u32 top_state;
3102 decode(top_state, bl);
3103 state = (uint64_t)old_state | ((uint64_t)top_state << 32);
3104 decode(purged_snaps, bl);
3105 } else {
3106 state = old_state;
3107 }
3108 if (struct_v >= 25) {
3109 decode(tmp, bl);
3110 manifest_stats_invalid = tmp;
3111 } else {
3112 manifest_stats_invalid = true;
3113 }
81eedcae
TL
3114 if (struct_v >= 26) {
3115 decode(avail_no_missing, bl);
3116 decode(object_location_counts, bl);
3117 }
20effc67
TL
3118 if (struct_v >= 27) {
3119 decode(last_scrub_duration, bl);
3120 decode(scrub_sched_status.m_scheduled_at, bl);
3121 decode(scrub_sched_status.m_duration_seconds, bl);
3122 __u16 scrub_sched_as_u16;
3123 decode(scrub_sched_as_u16, bl);
3124 scrub_sched_status.m_sched_status = (pg_scrub_sched_status_t)(scrub_sched_as_u16);
3125 decode(tmp, bl);
3126 scrub_sched_status.m_is_active = tmp;
3127 decode(tmp, bl);
3128 scrub_sched_status.m_is_deep = tmp ? scrub_level_t::deep : scrub_level_t::shallow;
3129 decode(tmp, bl);
3130 scrub_sched_status.m_is_periodic = tmp;
3131 decode(objects_scrubbed, bl);
3132 }
1d09f67e
TL
3133 if (struct_v >= 28) {
3134 decode(scrub_duration, bl);
3135 decode(objects_trimmed, bl);
3136 decode(snaptrim_duration, bl);
3137 }
1e59de90
TL
3138 if (struct_v >= 29) {
3139 decode(log_dups_size, bl);
3140 }
b32b8144 3141 }
7c673cae
FG
3142 DECODE_FINISH(bl);
3143}
3144
3145void pg_stat_t::generate_test_instances(list<pg_stat_t*>& o)
3146{
3147 pg_stat_t a;
3148 o.push_back(new pg_stat_t(a));
3149
3150 a.version = eversion_t(1, 3);
3151 a.reported_epoch = 1;
3152 a.reported_seq = 2;
3153 a.state = 123;
3154 a.mapping_epoch = 998;
3155 a.last_fresh = utime_t(1002, 1);
3156 a.last_change = utime_t(1002, 2);
3157 a.last_active = utime_t(1002, 3);
3158 a.last_clean = utime_t(1002, 4);
3159 a.last_unstale = utime_t(1002, 5);
3160 a.last_undegraded = utime_t(1002, 7);
3161 a.last_fullsized = utime_t(1002, 8);
3162 a.log_start = eversion_t(1, 4);
3163 a.ondisk_log_start = eversion_t(1, 5);
3164 a.created = 6;
3165 a.last_epoch_clean = 7;
11fdf7f2 3166 a.parent = pg_t(1, 2);
7c673cae
FG
3167 a.parent_split_bits = 12;
3168 a.last_scrub = eversion_t(9, 10);
3169 a.last_scrub_stamp = utime_t(11, 12);
3170 a.last_deep_scrub = eversion_t(13, 14);
3171 a.last_deep_scrub_stamp = utime_t(15, 16);
3172 a.last_clean_scrub_stamp = utime_t(17, 18);
20effc67 3173 a.last_scrub_duration = 3617;
1d09f67e 3174 a.scrub_duration = 0.003;
b32b8144 3175 a.snaptrimq_len = 1048576;
20effc67 3176 a.objects_scrubbed = 0;
1d09f67e
TL
3177 a.objects_trimmed = 0;
3178 a.snaptrim_duration = 0.123;
7c673cae
FG
3179 list<object_stat_collection_t*> l;
3180 object_stat_collection_t::generate_test_instances(l);
3181 a.stats = *l.back();
3182 a.log_size = 99;
3183 a.ondisk_log_size = 88;
3184 a.up.push_back(123);
3185 a.up_primary = 123;
3186 a.acting.push_back(456);
81eedcae
TL
3187 a.avail_no_missing.push_back(pg_shard_t(456, shard_id_t::NO_SHARD));
3188 set<pg_shard_t> sset = { pg_shard_t(0), pg_shard_t(1) };
3189 a.object_location_counts.insert(make_pair(sset, 10));
3190 sset.insert(pg_shard_t(2));
3191 a.object_location_counts.insert(make_pair(sset, 5));
7c673cae
FG
3192 a.acting_primary = 456;
3193 o.push_back(new pg_stat_t(a));
3194
3195 a.up.push_back(124);
3196 a.up_primary = 124;
3197 a.acting.push_back(124);
3198 a.acting_primary = 124;
3199 a.blocked_by.push_back(155);
3200 a.blocked_by.push_back(156);
3201 o.push_back(new pg_stat_t(a));
3202}
3203
3204bool operator==(const pg_stat_t& l, const pg_stat_t& r)
3205{
3206 return
3207 l.version == r.version &&
3208 l.reported_seq == r.reported_seq &&
3209 l.reported_epoch == r.reported_epoch &&
3210 l.state == r.state &&
3211 l.last_fresh == r.last_fresh &&
3212 l.last_change == r.last_change &&
3213 l.last_active == r.last_active &&
3214 l.last_peered == r.last_peered &&
3215 l.last_clean == r.last_clean &&
3216 l.last_unstale == r.last_unstale &&
3217 l.last_undegraded == r.last_undegraded &&
3218 l.last_fullsized == r.last_fullsized &&
3219 l.log_start == r.log_start &&
3220 l.ondisk_log_start == r.ondisk_log_start &&
3221 l.created == r.created &&
3222 l.last_epoch_clean == r.last_epoch_clean &&
3223 l.parent == r.parent &&
3224 l.parent_split_bits == r.parent_split_bits &&
3225 l.last_scrub == r.last_scrub &&
3226 l.last_deep_scrub == r.last_deep_scrub &&
3227 l.last_scrub_stamp == r.last_scrub_stamp &&
3228 l.last_deep_scrub_stamp == r.last_deep_scrub_stamp &&
3229 l.last_clean_scrub_stamp == r.last_clean_scrub_stamp &&
3230 l.stats == r.stats &&
3231 l.stats_invalid == r.stats_invalid &&
3232 l.log_size == r.log_size &&
1e59de90 3233 l.log_dups_size == r.log_dups_size &&
7c673cae
FG
3234 l.ondisk_log_size == r.ondisk_log_size &&
3235 l.up == r.up &&
3236 l.acting == r.acting &&
81eedcae
TL
3237 l.avail_no_missing == r.avail_no_missing &&
3238 l.object_location_counts == r.object_location_counts &&
7c673cae
FG
3239 l.mapping_epoch == r.mapping_epoch &&
3240 l.blocked_by == r.blocked_by &&
3241 l.last_became_active == r.last_became_active &&
3242 l.last_became_peered == r.last_became_peered &&
3243 l.dirty_stats_invalid == r.dirty_stats_invalid &&
3244 l.omap_stats_invalid == r.omap_stats_invalid &&
3245 l.hitset_stats_invalid == r.hitset_stats_invalid &&
3246 l.hitset_bytes_stats_invalid == r.hitset_bytes_stats_invalid &&
3247 l.up_primary == r.up_primary &&
3248 l.acting_primary == r.acting_primary &&
b32b8144 3249 l.pin_stats_invalid == r.pin_stats_invalid &&
11fdf7f2
TL
3250 l.manifest_stats_invalid == r.manifest_stats_invalid &&
3251 l.purged_snaps == r.purged_snaps &&
20effc67
TL
3252 l.snaptrimq_len == r.snaptrimq_len &&
3253 l.last_scrub_duration == r.last_scrub_duration &&
3254 l.scrub_sched_status == r.scrub_sched_status &&
1d09f67e
TL
3255 l.objects_scrubbed == r.objects_scrubbed &&
3256 l.scrub_duration == r.scrub_duration &&
3257 l.objects_trimmed == r.objects_trimmed &&
3258 l.snaptrim_duration == r.snaptrim_duration;
7c673cae
FG
3259}
3260
11fdf7f2
TL
3261// -- store_statfs_t --
3262
3263bool store_statfs_t::operator==(const store_statfs_t& other) const
3264{
3265 return total == other.total
3266 && available == other.available
3267 && allocated == other.allocated
3268 && internally_reserved == other.internally_reserved
3269 && data_stored == other.data_stored
3270 && data_compressed == other.data_compressed
3271 && data_compressed_allocated == other.data_compressed_allocated
3272 && data_compressed_original == other.data_compressed_original
3273 && omap_allocated == other.omap_allocated
3274 && internal_metadata == other.internal_metadata;
3275}
3276
3277void store_statfs_t::dump(Formatter *f) const
3278{
3279 f->dump_int("total", total);
3280 f->dump_int("available", available);
3281 f->dump_int("internally_reserved", internally_reserved);
3282 f->dump_int("allocated", allocated);
3283 f->dump_int("data_stored", data_stored);
3284 f->dump_int("data_compressed", data_compressed);
3285 f->dump_int("data_compressed_allocated", data_compressed_allocated);
3286 f->dump_int("data_compressed_original", data_compressed_original);
3287 f->dump_int("omap_allocated", omap_allocated);
3288 f->dump_int("internal_metadata", internal_metadata);
3289}
3290
3291ostream& operator<<(ostream& out, const store_statfs_t &s)
3292{
3293 out << std::hex
3294 << "store_statfs(0x" << s.available
3295 << "/0x" << s.internally_reserved
3296 << "/0x" << s.total
3297 << ", data 0x" << s.data_stored
3298 << "/0x" << s.allocated
3299 << ", compress 0x" << s.data_compressed
3300 << "/0x" << s.data_compressed_allocated
3301 << "/0x" << s.data_compressed_original
3302 << ", omap 0x" << s.omap_allocated
3303 << ", meta 0x" << s.internal_metadata
3304 << std::dec
3305 << ")";
3306 return out;
3307}
3308
3309void store_statfs_t::generate_test_instances(list<store_statfs_t*>& o)
3310{
3311 store_statfs_t a;
3312 o.push_back(new store_statfs_t(a));
3313 a.total = 234;
3314 a.available = 123;
3315 a.internally_reserved = 33;
3316 a.allocated = 32;
3317 a.data_stored = 44;
3318 a.data_compressed = 21;
3319 a.data_compressed_allocated = 12;
3320 a.data_compressed_original = 13;
3321 a.omap_allocated = 14;
3322 a.internal_metadata = 15;
3323 o.push_back(new store_statfs_t(a));
3324}
3325
7c673cae
FG
3326// -- pool_stat_t --
3327
3328void pool_stat_t::dump(Formatter *f) const
3329{
3330 stats.dump(f);
11fdf7f2
TL
3331 f->open_object_section("store_stats");
3332 store_stats.dump(f);
3333 f->close_section();
7c673cae
FG
3334 f->dump_int("log_size", log_size);
3335 f->dump_int("ondisk_log_size", ondisk_log_size);
3336 f->dump_int("up", up);
3337 f->dump_int("acting", acting);
eafe8130 3338 f->dump_int("num_store_stats", num_store_stats);
7c673cae
FG
3339}
3340
9f95a23c 3341void pool_stat_t::encode(ceph::buffer::list &bl, uint64_t features) const
7c673cae 3342{
11fdf7f2 3343 using ceph::encode;
7c673cae
FG
3344 if ((features & CEPH_FEATURE_OSDENC) == 0) {
3345 __u8 v = 4;
11fdf7f2
TL
3346 encode(v, bl);
3347 encode(stats, bl);
3348 encode(log_size, bl);
3349 encode(ondisk_log_size, bl);
7c673cae
FG
3350 return;
3351 }
3352
11fdf7f2
TL
3353 ENCODE_START(7, 5, bl);
3354 encode(stats, bl);
3355 encode(log_size, bl);
3356 encode(ondisk_log_size, bl);
3357 encode(up, bl);
3358 encode(acting, bl);
3359 encode(store_stats, bl);
3360 encode(num_store_stats, bl);
7c673cae
FG
3361 ENCODE_FINISH(bl);
3362}
3363
9f95a23c 3364void pool_stat_t::decode(ceph::buffer::list::const_iterator &bl)
7c673cae 3365{
11fdf7f2 3366 DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl);
7c673cae 3367 if (struct_v >= 4) {
11fdf7f2
TL
3368 decode(stats, bl);
3369 decode(log_size, bl);
3370 decode(ondisk_log_size, bl);
7c673cae 3371 if (struct_v >= 6) {
11fdf7f2
TL
3372 decode(up, bl);
3373 decode(acting, bl);
7c673cae
FG
3374 } else {
3375 up = 0;
3376 acting = 0;
3377 }
11fdf7f2
TL
3378 if (struct_v >= 7) {
3379 decode(store_stats, bl);
3380 decode(num_store_stats, bl);
3381 } else {
3382 store_stats.reset();
3383 num_store_stats = 0;
3384 }
3385
7c673cae 3386 } else {
11fdf7f2 3387 decode(stats.sum.num_bytes, bl);
7c673cae 3388 uint64_t num_kb;
11fdf7f2
TL
3389 decode(num_kb, bl);
3390 decode(stats.sum.num_objects, bl);
3391 decode(stats.sum.num_object_clones, bl);
3392 decode(stats.sum.num_object_copies, bl);
3393 decode(stats.sum.num_objects_missing_on_primary, bl);
3394 decode(stats.sum.num_objects_degraded, bl);
3395 decode(log_size, bl);
3396 decode(ondisk_log_size, bl);
7c673cae 3397 if (struct_v >= 2) {
11fdf7f2
TL
3398 decode(stats.sum.num_rd, bl);
3399 decode(stats.sum.num_rd_kb, bl);
3400 decode(stats.sum.num_wr, bl);
3401 decode(stats.sum.num_wr_kb, bl);
7c673cae
FG
3402 }
3403 if (struct_v >= 3) {
11fdf7f2 3404 decode(stats.sum.num_objects_unfound, bl);
7c673cae
FG
3405 }
3406 }
3407 DECODE_FINISH(bl);
3408}
3409
3410void pool_stat_t::generate_test_instances(list<pool_stat_t*>& o)
3411{
3412 pool_stat_t a;
3413 o.push_back(new pool_stat_t(a));
3414
3415 list<object_stat_collection_t*> l;
3416 object_stat_collection_t::generate_test_instances(l);
11fdf7f2
TL
3417 list<store_statfs_t*> ll;
3418 store_statfs_t::generate_test_instances(ll);
7c673cae 3419 a.stats = *l.back();
11fdf7f2 3420 a.store_stats = *ll.back();
7c673cae
FG
3421 a.log_size = 123;
3422 a.ondisk_log_size = 456;
3423 a.acting = 3;
3424 a.up = 4;
11fdf7f2 3425 a.num_store_stats = 1;
7c673cae
FG
3426 o.push_back(new pool_stat_t(a));
3427}
3428
3429
3430// -- pg_history_t --
3431
9f95a23c 3432void pg_history_t::encode(ceph::buffer::list &bl) const
7c673cae 3433{
9f95a23c 3434 ENCODE_START(10, 4, bl);
11fdf7f2
TL
3435 encode(epoch_created, bl);
3436 encode(last_epoch_started, bl);
3437 encode(last_epoch_clean, bl);
3438 encode(last_epoch_split, bl);
3439 encode(same_interval_since, bl);
3440 encode(same_up_since, bl);
3441 encode(same_primary_since, bl);
3442 encode(last_scrub, bl);
3443 encode(last_scrub_stamp, bl);
3444 encode(last_deep_scrub, bl);
3445 encode(last_deep_scrub_stamp, bl);
3446 encode(last_clean_scrub_stamp, bl);
3447 encode(last_epoch_marked_full, bl);
3448 encode(last_interval_started, bl);
3449 encode(last_interval_clean, bl);
3450 encode(epoch_pool_created, bl);
9f95a23c 3451 encode(prior_readable_until_ub, bl);
7c673cae
FG
3452 ENCODE_FINISH(bl);
3453}
3454
9f95a23c 3455void pg_history_t::decode(ceph::buffer::list::const_iterator &bl)
7c673cae 3456{
9f95a23c 3457 DECODE_START_LEGACY_COMPAT_LEN(10, 4, 4, bl);
11fdf7f2
TL
3458 decode(epoch_created, bl);
3459 decode(last_epoch_started, bl);
7c673cae 3460 if (struct_v >= 3)
11fdf7f2 3461 decode(last_epoch_clean, bl);
7c673cae
FG
3462 else
3463 last_epoch_clean = last_epoch_started; // careful, it's a lie!
11fdf7f2
TL
3464 decode(last_epoch_split, bl);
3465 decode(same_interval_since, bl);
3466 decode(same_up_since, bl);
3467 decode(same_primary_since, bl);
7c673cae 3468 if (struct_v >= 2) {
11fdf7f2
TL
3469 decode(last_scrub, bl);
3470 decode(last_scrub_stamp, bl);
7c673cae
FG
3471 }
3472 if (struct_v >= 5) {
11fdf7f2
TL
3473 decode(last_deep_scrub, bl);
3474 decode(last_deep_scrub_stamp, bl);
7c673cae
FG
3475 }
3476 if (struct_v >= 6) {
11fdf7f2 3477 decode(last_clean_scrub_stamp, bl);
7c673cae
FG
3478 }
3479 if (struct_v >= 7) {
11fdf7f2 3480 decode(last_epoch_marked_full, bl);
7c673cae
FG
3481 }
3482 if (struct_v >= 8) {
11fdf7f2
TL
3483 decode(last_interval_started, bl);
3484 decode(last_interval_clean, bl);
7c673cae
FG
3485 } else {
3486 if (last_epoch_started >= same_interval_since) {
3487 last_interval_started = same_interval_since;
3488 } else {
3489 last_interval_started = last_epoch_started; // best guess
3490 }
3491 if (last_epoch_clean >= same_interval_since) {
3492 last_interval_clean = same_interval_since;
3493 } else {
3494 last_interval_clean = last_epoch_clean; // best guess
3495 }
3496 }
31f18b77 3497 if (struct_v >= 9) {
11fdf7f2 3498 decode(epoch_pool_created, bl);
31f18b77
FG
3499 } else {
3500 epoch_pool_created = epoch_created;
3501 }
9f95a23c
TL
3502 if (struct_v >= 10) {
3503 decode(prior_readable_until_ub, bl);
3504 }
7c673cae
FG
3505 DECODE_FINISH(bl);
3506}
3507
3508void pg_history_t::dump(Formatter *f) const
3509{
3510 f->dump_int("epoch_created", epoch_created);
31f18b77 3511 f->dump_int("epoch_pool_created", epoch_pool_created);
7c673cae
FG
3512 f->dump_int("last_epoch_started", last_epoch_started);
3513 f->dump_int("last_interval_started", last_interval_started);
3514 f->dump_int("last_epoch_clean", last_epoch_clean);
3515 f->dump_int("last_interval_clean", last_interval_clean);
3516 f->dump_int("last_epoch_split", last_epoch_split);
3517 f->dump_int("last_epoch_marked_full", last_epoch_marked_full);
3518 f->dump_int("same_up_since", same_up_since);
3519 f->dump_int("same_interval_since", same_interval_since);
3520 f->dump_int("same_primary_since", same_primary_since);
3521 f->dump_stream("last_scrub") << last_scrub;
3522 f->dump_stream("last_scrub_stamp") << last_scrub_stamp;
3523 f->dump_stream("last_deep_scrub") << last_deep_scrub;
3524 f->dump_stream("last_deep_scrub_stamp") << last_deep_scrub_stamp;
3525 f->dump_stream("last_clean_scrub_stamp") << last_clean_scrub_stamp;
9f95a23c
TL
3526 f->dump_float(
3527 "prior_readable_until_ub",
3528 std::chrono::duration<double>(prior_readable_until_ub).count());
7c673cae
FG
3529}
3530
3531void pg_history_t::generate_test_instances(list<pg_history_t*>& o)
3532{
3533 o.push_back(new pg_history_t);
3534 o.push_back(new pg_history_t);
3535 o.back()->epoch_created = 1;
31f18b77 3536 o.back()->epoch_pool_created = 1;
7c673cae
FG
3537 o.back()->last_epoch_started = 2;
3538 o.back()->last_interval_started = 2;
3539 o.back()->last_epoch_clean = 3;
3540 o.back()->last_interval_clean = 2;
3541 o.back()->last_epoch_split = 4;
9f95a23c 3542 o.back()->prior_readable_until_ub = make_timespan(3.1415);
7c673cae
FG
3543 o.back()->same_up_since = 5;
3544 o.back()->same_interval_since = 6;
3545 o.back()->same_primary_since = 7;
3546 o.back()->last_scrub = eversion_t(8, 9);
3547 o.back()->last_scrub_stamp = utime_t(10, 11);
3548 o.back()->last_deep_scrub = eversion_t(12, 13);
3549 o.back()->last_deep_scrub_stamp = utime_t(14, 15);
3550 o.back()->last_clean_scrub_stamp = utime_t(16, 17);
3551 o.back()->last_epoch_marked_full = 18;
3552}
3553
3554
3555// -- pg_info_t --
3556
9f95a23c 3557void pg_info_t::encode(ceph::buffer::list &bl) const
7c673cae
FG
3558{
3559 ENCODE_START(32, 26, bl);
11fdf7f2
TL
3560 encode(pgid.pgid, bl);
3561 encode(last_update, bl);
3562 encode(last_complete, bl);
3563 encode(log_tail, bl);
9f95a23c 3564 encode(hobject_t(), bl); // old (nibblewise) last_backfill
11fdf7f2 3565 encode(stats, bl);
7c673cae 3566 history.encode(bl);
11fdf7f2
TL
3567 encode(purged_snaps, bl);
3568 encode(last_epoch_started, bl);
3569 encode(last_user_version, bl);
3570 encode(hit_set, bl);
3571 encode(pgid.shard, bl);
3572 encode(last_backfill, bl);
9f95a23c 3573 encode(true, bl); // was last_backfill_bitwise
11fdf7f2 3574 encode(last_interval_started, bl);
7c673cae
FG
3575 ENCODE_FINISH(bl);
3576}
3577
9f95a23c 3578void pg_info_t::decode(ceph::buffer::list::const_iterator &bl)
7c673cae
FG
3579{
3580 DECODE_START(32, bl);
11fdf7f2
TL
3581 decode(pgid.pgid, bl);
3582 decode(last_update, bl);
3583 decode(last_complete, bl);
3584 decode(log_tail, bl);
7c673cae
FG
3585 {
3586 hobject_t old_last_backfill;
11fdf7f2 3587 decode(old_last_backfill, bl);
7c673cae 3588 }
11fdf7f2 3589 decode(stats, bl);
7c673cae 3590 history.decode(bl);
11fdf7f2
TL
3591 decode(purged_snaps, bl);
3592 decode(last_epoch_started, bl);
3593 decode(last_user_version, bl);
3594 decode(hit_set, bl);
3595 decode(pgid.shard, bl);
3596 decode(last_backfill, bl);
9f95a23c
TL
3597 {
3598 bool last_backfill_bitwise;
3599 decode(last_backfill_bitwise, bl);
3600 // note: we may see a false value here since the default value for
3601 // the member was false, so it often didn't get set to true until
3602 // peering progressed.
3603 }
7c673cae 3604 if (struct_v >= 32) {
11fdf7f2 3605 decode(last_interval_started, bl);
7c673cae
FG
3606 } else {
3607 last_interval_started = last_epoch_started;
3608 }
3609 DECODE_FINISH(bl);
3610}
3611
3612// -- pg_info_t --
3613
3614void pg_info_t::dump(Formatter *f) const
3615{
3616 f->dump_stream("pgid") << pgid;
3617 f->dump_stream("last_update") << last_update;
3618 f->dump_stream("last_complete") << last_complete;
3619 f->dump_stream("log_tail") << log_tail;
3620 f->dump_int("last_user_version", last_user_version);
3621 f->dump_stream("last_backfill") << last_backfill;
7c673cae
FG
3622 f->open_array_section("purged_snaps");
3623 for (interval_set<snapid_t>::const_iterator i=purged_snaps.begin();
3624 i != purged_snaps.end();
3625 ++i) {
3626 f->open_object_section("purged_snap_interval");
3627 f->dump_stream("start") << i.get_start();
3628 f->dump_stream("length") << i.get_len();
3629 f->close_section();
3630 }
3631 f->close_section();
3632 f->open_object_section("history");
3633 history.dump(f);
3634 f->close_section();
3635 f->open_object_section("stats");
3636 stats.dump(f);
3637 f->close_section();
3638
3639 f->dump_int("empty", is_empty());
3640 f->dump_int("dne", dne());
3641 f->dump_int("incomplete", is_incomplete());
3642 f->dump_int("last_epoch_started", last_epoch_started);
3643
3644 f->open_object_section("hit_set_history");
3645 hit_set.dump(f);
3646 f->close_section();
3647}
3648
3649void pg_info_t::generate_test_instances(list<pg_info_t*>& o)
3650{
3651 o.push_back(new pg_info_t);
3652 o.push_back(new pg_info_t);
3653 list<pg_history_t*> h;
3654 pg_history_t::generate_test_instances(h);
3655 o.back()->history = *h.back();
11fdf7f2 3656 o.back()->pgid = spg_t(pg_t(1, 2), shard_id_t::NO_SHARD);
7c673cae
FG
3657 o.back()->last_update = eversion_t(3, 4);
3658 o.back()->last_complete = eversion_t(5, 6);
3659 o.back()->last_user_version = 2;
3660 o.back()->log_tail = eversion_t(7, 8);
3661 o.back()->last_backfill = hobject_t(object_t("objname"), "key", 123, 456, -1, "");
7c673cae
FG
3662 {
3663 list<pg_stat_t*> s;
3664 pg_stat_t::generate_test_instances(s);
3665 o.back()->stats = *s.back();
3666 }
3667 {
3668 list<pg_hit_set_history_t*> s;
3669 pg_hit_set_history_t::generate_test_instances(s);
3670 o.back()->hit_set = *s.back();
3671 }
3672}
3673
3674// -- pg_notify_t --
9f95a23c 3675void pg_notify_t::encode(ceph::buffer::list &bl) const
7c673cae 3676{
9f95a23c 3677 ENCODE_START(3, 2, bl);
11fdf7f2
TL
3678 encode(query_epoch, bl);
3679 encode(epoch_sent, bl);
3680 encode(info, bl);
3681 encode(to, bl);
3682 encode(from, bl);
9f95a23c 3683 encode(past_intervals, bl);
7c673cae
FG
3684 ENCODE_FINISH(bl);
3685}
3686
9f95a23c 3687void pg_notify_t::decode(ceph::buffer::list::const_iterator &bl)
7c673cae 3688{
9f95a23c 3689 DECODE_START(3, bl);
11fdf7f2
TL
3690 decode(query_epoch, bl);
3691 decode(epoch_sent, bl);
3692 decode(info, bl);
3693 decode(to, bl);
3694 decode(from, bl);
9f95a23c
TL
3695 if (struct_v >= 3) {
3696 decode(past_intervals, bl);
3697 }
7c673cae
FG
3698 DECODE_FINISH(bl);
3699}
3700
3701void pg_notify_t::dump(Formatter *f) const
3702{
3703 f->dump_int("from", from);
3704 f->dump_int("to", to);
3705 f->dump_unsigned("query_epoch", query_epoch);
3706 f->dump_unsigned("epoch_sent", epoch_sent);
3707 {
3708 f->open_object_section("info");
3709 info.dump(f);
3710 f->close_section();
3711 }
9f95a23c 3712 f->dump_object("past_intervals", past_intervals);
7c673cae
FG
3713}
3714
3715void pg_notify_t::generate_test_instances(list<pg_notify_t*>& o)
3716{
9f95a23c
TL
3717 o.push_back(new pg_notify_t(shard_id_t(3), shard_id_t::NO_SHARD, 1, 1,
3718 pg_info_t(), PastIntervals()));
3719 o.push_back(new pg_notify_t(shard_id_t(0), shard_id_t(0), 3, 10,
3720 pg_info_t(), PastIntervals()));
7c673cae
FG
3721}
3722
3723ostream &operator<<(ostream &lhs, const pg_notify_t &notify)
3724{
3725 lhs << "(query:" << notify.query_epoch
3726 << " sent:" << notify.epoch_sent
3727 << " " << notify.info;
3728 if (notify.from != shard_id_t::NO_SHARD ||
3729 notify.to != shard_id_t::NO_SHARD)
3730 lhs << " " << (unsigned)notify.from
3731 << "->" << (unsigned)notify.to;
9f95a23c 3732 lhs << " " << notify.past_intervals;
7c673cae
FG
3733 return lhs << ")";
3734}
3735
3736// -- pg_interval_t --
3737
9f95a23c 3738void PastIntervals::pg_interval_t::encode(ceph::buffer::list& bl) const
7c673cae
FG
3739{
3740 ENCODE_START(4, 2, bl);
11fdf7f2
TL
3741 encode(first, bl);
3742 encode(last, bl);
3743 encode(up, bl);
3744 encode(acting, bl);
3745 encode(maybe_went_rw, bl);
3746 encode(primary, bl);
3747 encode(up_primary, bl);
7c673cae
FG
3748 ENCODE_FINISH(bl);
3749}
3750
9f95a23c 3751void PastIntervals::pg_interval_t::decode(ceph::buffer::list::const_iterator& bl)
7c673cae
FG
3752{
3753 DECODE_START_LEGACY_COMPAT_LEN(4, 2, 2, bl);
11fdf7f2
TL
3754 decode(first, bl);
3755 decode(last, bl);
3756 decode(up, bl);
3757 decode(acting, bl);
3758 decode(maybe_went_rw, bl);
7c673cae 3759 if (struct_v >= 3) {
11fdf7f2 3760 decode(primary, bl);
7c673cae
FG
3761 } else {
3762 if (acting.size())
3763 primary = acting[0];
3764 }
3765 if (struct_v >= 4) {
11fdf7f2 3766 decode(up_primary, bl);
7c673cae
FG
3767 } else {
3768 if (up.size())
3769 up_primary = up[0];
3770 }
3771 DECODE_FINISH(bl);
3772}
3773
3774void PastIntervals::pg_interval_t::dump(Formatter *f) const
3775{
3776 f->dump_unsigned("first", first);
3777 f->dump_unsigned("last", last);
3778 f->dump_int("maybe_went_rw", maybe_went_rw ? 1 : 0);
3779 f->open_array_section("up");
9f95a23c 3780 for (auto p = up.cbegin(); p != up.cend(); ++p)
7c673cae
FG
3781 f->dump_int("osd", *p);
3782 f->close_section();
3783 f->open_array_section("acting");
9f95a23c 3784 for (auto p = acting.cbegin(); p != acting.cend(); ++p)
7c673cae
FG
3785 f->dump_int("osd", *p);
3786 f->close_section();
3787 f->dump_int("primary", primary);
3788 f->dump_int("up_primary", up_primary);
3789}
3790
3791void PastIntervals::pg_interval_t::generate_test_instances(list<pg_interval_t*>& o)
3792{
3793 o.push_back(new pg_interval_t);
3794 o.push_back(new pg_interval_t);
3795 o.back()->up.push_back(1);
3796 o.back()->acting.push_back(2);
3797 o.back()->acting.push_back(3);
3798 o.back()->first = 4;
3799 o.back()->last = 5;
3800 o.back()->maybe_went_rw = true;
3801}
3802
3803WRITE_CLASS_ENCODER(PastIntervals::pg_interval_t)
3804
7c673cae
FG
3805
3806/**
3807 * pi_compact_rep
3808 *
3809 * PastIntervals only needs to be able to answer two questions:
3810 * 1) Where should the primary look for unfound objects?
3811 * 2) List a set of subsets of the OSDs such that contacting at least
11fdf7f2 3812 * one from each subset guarantees we speak to at least one witness
7c673cae
FG
3813 * of any completed write.
3814 *
3815 * Crucially, 2) does not require keeping *all* past intervals. Certainly,
3816 * we don't need to keep any where maybe_went_rw would be false. We also
3817 * needn't keep two intervals where the actingset in one is a subset
3818 * of the other (only need to keep the smaller of the two sets). In order
3819 * to accurately trim the set of intervals as last_epoch_started changes
3820 * without rebuilding the set from scratch, we'll retain the larger set
3821 * if it is in an older interval.
3822 */
3823struct compact_interval_t {
3824 epoch_t first;
3825 epoch_t last;
3826 set<pg_shard_t> acting;
3827 bool supersedes(const compact_interval_t &other) {
3828 for (auto &&i: acting) {
3829 if (!other.acting.count(i))
3830 return false;
3831 }
3832 return true;
3833 }
3834 void dump(Formatter *f) const {
3835 f->open_object_section("compact_interval_t");
3836 f->dump_stream("first") << first;
3837 f->dump_stream("last") << last;
3838 f->dump_stream("acting") << acting;
3839 f->close_section();
3840 }
9f95a23c 3841 void encode(ceph::buffer::list &bl) const {
7c673cae 3842 ENCODE_START(1, 1, bl);
11fdf7f2
TL
3843 encode(first, bl);
3844 encode(last, bl);
3845 encode(acting, bl);
7c673cae
FG
3846 ENCODE_FINISH(bl);
3847 }
9f95a23c 3848 void decode(ceph::buffer::list::const_iterator &bl) {
7c673cae 3849 DECODE_START(1, bl);
11fdf7f2
TL
3850 decode(first, bl);
3851 decode(last, bl);
3852 decode(acting, bl);
7c673cae
FG
3853 DECODE_FINISH(bl);
3854 }
3855 static void generate_test_instances(list<compact_interval_t*> & o) {
3856 /* Not going to be used, we'll generate pi_compact_rep directly */
3857 }
3858};
3859ostream &operator<<(ostream &o, const compact_interval_t &rhs)
3860{
3861 return o << "([" << rhs.first << "," << rhs.last
3862 << "] acting " << rhs.acting << ")";
3863}
3864WRITE_CLASS_ENCODER(compact_interval_t)
3865
3866class pi_compact_rep : public PastIntervals::interval_rep {
3867 epoch_t first = 0;
3868 epoch_t last = 0; // inclusive
3869 set<pg_shard_t> all_participants;
3870 list<compact_interval_t> intervals;
3871 pi_compact_rep(
3872 bool ec_pool,
3873 std::list<PastIntervals::pg_interval_t> &&intervals) {
3874 for (auto &&i: intervals)
3875 add_interval(ec_pool, i);
3876 }
3877public:
3878 pi_compact_rep() = default;
3879 pi_compact_rep(const pi_compact_rep &) = default;
3880 pi_compact_rep(pi_compact_rep &&) = default;
3881 pi_compact_rep &operator=(const pi_compact_rep &) = default;
3882 pi_compact_rep &operator=(pi_compact_rep &&) = default;
3883
3884 size_t size() const override { return intervals.size(); }
3885 bool empty() const override {
3886 return first > last || (first == 0 && last == 0);
3887 }
3888 void clear() override {
3889 *this = pi_compact_rep();
3890 }
3891 pair<epoch_t, epoch_t> get_bounds() const override {
3892 return make_pair(first, last + 1);
3893 }
f67539c2 3894 void adjust_start_backwards(epoch_t last_epoch_clean) override {
11fdf7f2
TL
3895 first = last_epoch_clean;
3896 }
3897
7c673cae
FG
3898 set<pg_shard_t> get_all_participants(
3899 bool ec_pool) const override {
3900 return all_participants;
3901 }
3902 void add_interval(
3903 bool ec_pool, const PastIntervals::pg_interval_t &interval) override {
3904 if (first == 0)
3905 first = interval.first;
11fdf7f2 3906 ceph_assert(interval.last > last);
7c673cae
FG
3907 last = interval.last;
3908 set<pg_shard_t> acting;
3909 for (unsigned i = 0; i < interval.acting.size(); ++i) {
3910 if (interval.acting[i] == CRUSH_ITEM_NONE)
3911 continue;
3912 acting.insert(
3913 pg_shard_t(
3914 interval.acting[i],
3915 ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
3916 }
3917 all_participants.insert(acting.begin(), acting.end());
3918 if (!interval.maybe_went_rw)
3919 return;
3920 intervals.push_back(
3921 compact_interval_t{interval.first, interval.last, acting});
3922 auto plast = intervals.end();
3923 --plast;
3924 for (auto cur = intervals.begin(); cur != plast; ) {
3925 if (plast->supersedes(*cur)) {
3926 intervals.erase(cur++);
3927 } else {
3928 ++cur;
3929 }
3930 }
3931 }
3932 unique_ptr<PastIntervals::interval_rep> clone() const override {
3933 return unique_ptr<PastIntervals::interval_rep>(new pi_compact_rep(*this));
3934 }
3935 ostream &print(ostream &out) const override {
3936 return out << "([" << first << "," << last
9f95a23c
TL
3937 << "] all_participants=" << all_participants
3938 << " intervals=" << intervals << ")";
7c673cae 3939 }
9f95a23c 3940 void encode(ceph::buffer::list &bl) const override {
7c673cae 3941 ENCODE_START(1, 1, bl);
11fdf7f2
TL
3942 encode(first, bl);
3943 encode(last, bl);
3944 encode(all_participants, bl);
3945 encode(intervals, bl);
7c673cae
FG
3946 ENCODE_FINISH(bl);
3947 }
9f95a23c 3948 void decode(ceph::buffer::list::const_iterator &bl) override {
7c673cae 3949 DECODE_START(1, bl);
11fdf7f2
TL
3950 decode(first, bl);
3951 decode(last, bl);
3952 decode(all_participants, bl);
3953 decode(intervals, bl);
7c673cae
FG
3954 DECODE_FINISH(bl);
3955 }
3956 void dump(Formatter *f) const override {
3957 f->open_object_section("PastIntervals::compact_rep");
3958 f->dump_stream("first") << first;
3959 f->dump_stream("last") << last;
3960 f->open_array_section("all_participants");
3961 for (auto& i : all_participants) {
3962 f->dump_object("pg_shard", i);
3963 }
3964 f->close_section();
3965 f->open_array_section("intervals");
3966 for (auto &&i: intervals) {
3967 i.dump(f);
3968 }
3969 f->close_section();
3970 f->close_section();
3971 }
7c673cae
FG
3972 static void generate_test_instances(list<pi_compact_rep*> &o) {
3973 using ival = PastIntervals::pg_interval_t;
3974 using ivallst = std::list<ival>;
3975 o.push_back(
3976 new pi_compact_rep(
3977 true, ivallst
3978 { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
3979 , ival{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1}
3980 , ival{{ 2}, { 2}, 31, 35, false, 2, 2}
3981 , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
3982 }));
3983 o.push_back(
3984 new pi_compact_rep(
3985 false, ivallst
3986 { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
3987 , ival{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1}
3988 , ival{{ 2}, { 2}, 31, 35, false, 2, 2}
3989 , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
3990 }));
3991 o.push_back(
3992 new pi_compact_rep(
3993 true, ivallst
3994 { ival{{2, 1, 0}, {2, 1, 0}, 10, 20, true, 1, 1}
3995 , ival{{ 0, 2}, { 0, 2}, 21, 30, true, 0, 0}
3996 , ival{{ 0, 2}, {2, 0}, 31, 35, true, 2, 2}
3997 , ival{{ 0, 2}, { 0, 2}, 36, 50, true, 0, 0}
3998 }));
3999 }
4000 void iterate_mayberw_back_to(
7c673cae
FG
4001 epoch_t les,
4002 std::function<void(epoch_t, const set<pg_shard_t> &)> &&f) const override {
4003 for (auto i = intervals.rbegin(); i != intervals.rend(); ++i) {
4004 if (i->last < les)
4005 break;
4006 f(i->first, i->acting);
4007 }
4008 }
4009 virtual ~pi_compact_rep() override {}
4010};
4011WRITE_CLASS_ENCODER(pi_compact_rep)
4012
11fdf7f2
TL
4013PastIntervals::PastIntervals()
4014{
4015 past_intervals.reset(new pi_compact_rep);
4016}
4017
7c673cae
FG
4018PastIntervals::PastIntervals(const PastIntervals &rhs)
4019 : past_intervals(rhs.past_intervals ?
4020 rhs.past_intervals->clone() :
4021 nullptr) {}
4022
4023PastIntervals &PastIntervals::operator=(const PastIntervals &rhs)
4024{
4025 PastIntervals other(rhs);
31f18b77 4026 swap(other);
7c673cae
FG
4027 return *this;
4028}
4029
4030ostream& operator<<(ostream& out, const PastIntervals &i)
4031{
4032 if (i.past_intervals) {
4033 return i.past_intervals->print(out);
4034 } else {
4035 return out << "(empty)";
4036 }
4037}
4038
4039ostream& operator<<(ostream& out, const PastIntervals::PriorSet &i)
4040{
4041 return out << "PriorSet("
4042 << "ec_pool: " << i.ec_pool
4043 << ", probe: " << i.probe
4044 << ", down: " << i.down
4045 << ", blocked_by: " << i.blocked_by
4046 << ", pg_down: " << i.pg_down
4047 << ")";
4048}
4049
9f95a23c 4050void PastIntervals::decode(ceph::buffer::list::const_iterator &bl)
7c673cae
FG
4051{
4052 DECODE_START(1, bl);
4053 __u8 type = 0;
11fdf7f2 4054 decode(type, bl);
7c673cae
FG
4055 switch (type) {
4056 case 0:
4057 break;
4058 case 1:
11fdf7f2 4059 ceph_abort_msg("pi_simple_rep support removed post-luminous");
7c673cae
FG
4060 break;
4061 case 2:
4062 past_intervals.reset(new pi_compact_rep);
4063 past_intervals->decode(bl);
4064 break;
4065 }
4066 DECODE_FINISH(bl);
4067}
4068
7c673cae
FG
4069void PastIntervals::generate_test_instances(list<PastIntervals*> &o)
4070{
7c673cae
FG
4071 {
4072 list<pi_compact_rep *> compact;
4073 pi_compact_rep::generate_test_instances(compact);
4074 for (auto &&i: compact) {
4075 // takes ownership of contents
4076 o.push_back(new PastIntervals(i));
4077 }
4078 }
4079 return;
4080}
4081
7c673cae
FG
4082bool PastIntervals::is_new_interval(
4083 int old_acting_primary,
4084 int new_acting_primary,
4085 const vector<int> &old_acting,
4086 const vector<int> &new_acting,
4087 int old_up_primary,
4088 int new_up_primary,
4089 const vector<int> &old_up,
4090 const vector<int> &new_up,
4091 int old_size,
4092 int new_size,
4093 int old_min_size,
4094 int new_min_size,
4095 unsigned old_pg_num,
4096 unsigned new_pg_num,
11fdf7f2
TL
4097 unsigned old_pg_num_pending,
4098 unsigned new_pg_num_pending,
7c673cae
FG
4099 bool old_sort_bitwise,
4100 bool new_sort_bitwise,
c07f9fc5
FG
4101 bool old_recovery_deletes,
4102 bool new_recovery_deletes,
f67539c2
TL
4103 uint32_t old_crush_count,
4104 uint32_t new_crush_count,
4105 uint32_t old_crush_target,
4106 uint32_t new_crush_target,
4107 uint32_t old_crush_barrier,
4108 uint32_t new_crush_barrier,
4109 int32_t old_crush_member,
4110 int32_t new_crush_member,
7c673cae
FG
4111 pg_t pgid) {
4112 return old_acting_primary != new_acting_primary ||
4113 new_acting != old_acting ||
4114 old_up_primary != new_up_primary ||
4115 new_up != old_up ||
4116 old_min_size != new_min_size ||
4117 old_size != new_size ||
4118 pgid.is_split(old_pg_num, new_pg_num, 0) ||
11fdf7f2
TL
4119 // (is or was) pre-merge source
4120 pgid.is_merge_source(old_pg_num_pending, new_pg_num_pending, 0) ||
4121 pgid.is_merge_source(new_pg_num_pending, old_pg_num_pending, 0) ||
4122 // merge source
4123 pgid.is_merge_source(old_pg_num, new_pg_num, 0) ||
4124 // (is or was) pre-merge target
4125 pgid.is_merge_target(old_pg_num_pending, new_pg_num_pending) ||
4126 pgid.is_merge_target(new_pg_num_pending, old_pg_num_pending) ||
4127 // merge target
4128 pgid.is_merge_target(old_pg_num, new_pg_num) ||
c07f9fc5 4129 old_sort_bitwise != new_sort_bitwise ||
f67539c2
TL
4130 old_recovery_deletes != new_recovery_deletes ||
4131 old_crush_count != new_crush_count ||
4132 old_crush_target != new_crush_target ||
4133 old_crush_barrier != new_crush_barrier ||
4134 old_crush_member != new_crush_member;
7c673cae
FG
4135}
4136
4137bool PastIntervals::is_new_interval(
4138 int old_acting_primary,
4139 int new_acting_primary,
4140 const vector<int> &old_acting,
4141 const vector<int> &new_acting,
4142 int old_up_primary,
4143 int new_up_primary,
4144 const vector<int> &old_up,
4145 const vector<int> &new_up,
9f95a23c
TL
4146 const OSDMap *osdmap,
4147 const OSDMap *lastmap,
11fdf7f2
TL
4148 pg_t pgid)
4149{
4150 const pg_pool_t *plast = lastmap->get_pg_pool(pgid.pool());
4151 if (!plast) {
4152 return false; // after pool is deleted there are no more interval changes
4153 }
4154 const pg_pool_t *pi = osdmap->get_pg_pool(pgid.pool());
4155 if (!pi) {
4156 return true; // pool was deleted this epoch -> (final!) interval change
4157 }
4158 return
7c673cae
FG
4159 is_new_interval(old_acting_primary,
4160 new_acting_primary,
4161 old_acting,
4162 new_acting,
4163 old_up_primary,
4164 new_up_primary,
4165 old_up,
4166 new_up,
11fdf7f2
TL
4167 plast->size,
4168 pi->size,
4169 plast->min_size,
4170 pi->min_size,
4171 plast->get_pg_num(),
4172 pi->get_pg_num(),
4173 plast->get_pg_num_pending(),
4174 pi->get_pg_num_pending(),
7c673cae
FG
4175 lastmap->test_flag(CEPH_OSDMAP_SORTBITWISE),
4176 osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE),
c07f9fc5
FG
4177 lastmap->test_flag(CEPH_OSDMAP_RECOVERY_DELETES),
4178 osdmap->test_flag(CEPH_OSDMAP_RECOVERY_DELETES),
f67539c2
TL
4179 plast->peering_crush_bucket_count, pi->peering_crush_bucket_count,
4180 plast->peering_crush_bucket_target, pi->peering_crush_bucket_target,
4181 plast->peering_crush_bucket_barrier, pi->peering_crush_bucket_barrier,
4182 plast->peering_crush_mandatory_member, pi->peering_crush_mandatory_member,
7c673cae
FG
4183 pgid);
4184}
4185
4186bool PastIntervals::check_new_interval(
4187 int old_acting_primary,
4188 int new_acting_primary,
4189 const vector<int> &old_acting,
4190 const vector<int> &new_acting,
4191 int old_up_primary,
4192 int new_up_primary,
4193 const vector<int> &old_up,
4194 const vector<int> &new_up,
4195 epoch_t same_interval_since,
4196 epoch_t last_epoch_clean,
9f95a23c
TL
4197 const OSDMap *osdmap,
4198 const OSDMap *lastmap,
7c673cae 4199 pg_t pgid,
9f95a23c 4200 const IsPGRecoverablePredicate &could_have_gone_active,
7c673cae
FG
4201 PastIntervals *past_intervals,
4202 std::ostream *out)
4203{
4204 /*
4205 * We have to be careful to gracefully deal with situations like
4206 * so. Say we have a power outage or something that takes out both
4207 * OSDs, but the monitor doesn't mark them down in the same epoch.
4208 * The history may look like
4209 *
4210 * 1: A B
4211 * 2: B
4212 * 3: let's say B dies for good, too (say, from the power spike)
4213 * 4: A
4214 *
4215 * which makes it look like B may have applied updates to the PG
4216 * that we need in order to proceed. This sucks...
4217 *
4218 * To minimize the risk of this happening, we CANNOT go active if
4219 * _any_ OSDs in the prior set are down until we send an MOSDAlive
4220 * to the monitor such that the OSDMap sets osd_up_thru to an epoch.
4221 * Then, we have something like
4222 *
4223 * 1: A B
4224 * 2: B up_thru[B]=0
4225 * 3:
4226 * 4: A
4227 *
4228 * -> we can ignore B, bc it couldn't have gone active (up_thru still 0).
4229 *
4230 * or,
4231 *
4232 * 1: A B
4233 * 2: B up_thru[B]=0
4234 * 3: B up_thru[B]=2
4235 * 4:
4236 * 5: A
4237 *
4238 * -> we must wait for B, bc it was alive through 2, and could have
4239 * written to the pg.
4240 *
4241 * If B is really dead, then an administrator will need to manually
4242 * intervene by marking the OSD as "lost."
4243 */
4244
4245 // remember past interval
4246 // NOTE: a change in the up set primary triggers an interval
4247 // change, even though the interval members in the pg_interval_t
4248 // do not change.
11fdf7f2
TL
4249 ceph_assert(past_intervals);
4250 ceph_assert(past_intervals->past_intervals);
7c673cae
FG
4251 if (is_new_interval(
4252 old_acting_primary,
4253 new_acting_primary,
4254 old_acting,
4255 new_acting,
4256 old_up_primary,
4257 new_up_primary,
4258 old_up,
4259 new_up,
4260 osdmap,
4261 lastmap,
4262 pgid)) {
4263 pg_interval_t i;
4264 i.first = same_interval_since;
4265 i.last = osdmap->get_epoch() - 1;
11fdf7f2 4266 ceph_assert(i.first <= i.last);
7c673cae
FG
4267 i.acting = old_acting;
4268 i.up = old_up;
4269 i.primary = old_acting_primary;
4270 i.up_primary = old_up_primary;
4271
4272 unsigned num_acting = 0;
9f95a23c 4273 for (auto p = i.acting.cbegin(); p != i.acting.cend(); ++p)
7c673cae
FG
4274 if (*p != CRUSH_ITEM_NONE)
4275 ++num_acting;
4276
11fdf7f2 4277 ceph_assert(lastmap->get_pools().count(pgid.pool()));
7c673cae
FG
4278 const pg_pool_t& old_pg_pool = lastmap->get_pools().find(pgid.pool())->second;
4279 set<pg_shard_t> old_acting_shards;
4280 old_pg_pool.convert_to_pg_shards(old_acting, &old_acting_shards);
4281
4282 if (num_acting &&
4283 i.primary != -1 &&
4284 num_acting >= old_pg_pool.min_size &&
f67539c2
TL
4285 (!old_pg_pool.is_stretch_pool() ||
4286 old_pg_pool.stretch_set_can_peer(old_acting, *lastmap, out)) &&
9f95a23c 4287 could_have_gone_active(old_acting_shards)) {
7c673cae
FG
4288 if (out)
4289 *out << __func__ << " " << i
7c673cae
FG
4290 << " up_thru " << lastmap->get_up_thru(i.primary)
4291 << " up_from " << lastmap->get_up_from(i.primary)
11fdf7f2 4292 << " last_epoch_clean " << last_epoch_clean;
7c673cae
FG
4293 if (lastmap->get_up_thru(i.primary) >= i.first &&
4294 lastmap->get_up_from(i.primary) <= i.first) {
4295 i.maybe_went_rw = true;
4296 if (out)
11fdf7f2 4297 *out << " " << i
7c673cae
FG
4298 << " : primary up " << lastmap->get_up_from(i.primary)
4299 << "-" << lastmap->get_up_thru(i.primary)
4300 << " includes interval"
11fdf7f2 4301 << std::endl;
7c673cae
FG
4302 } else if (last_epoch_clean >= i.first &&
4303 last_epoch_clean <= i.last) {
4304 // If the last_epoch_clean is included in this interval, then
4305 // the pg must have been rw (for recovery to have completed).
4306 // This is important because we won't know the _real_
4307 // first_epoch because we stop at last_epoch_clean, and we
4308 // don't want the oldest interval to randomly have
4309 // maybe_went_rw false depending on the relative up_thru vs
4310 // last_epoch_clean timing.
4311 i.maybe_went_rw = true;
4312 if (out)
11fdf7f2 4313 *out << " " << i
7c673cae
FG
4314 << " : includes last_epoch_clean " << last_epoch_clean
4315 << " and presumed to have been rw"
4316 << std::endl;
4317 } else {
4318 i.maybe_went_rw = false;
4319 if (out)
11fdf7f2 4320 *out << " " << i
7c673cae
FG
4321 << " : primary up " << lastmap->get_up_from(i.primary)
4322 << "-" << lastmap->get_up_thru(i.primary)
4323 << " does not include interval"
11fdf7f2 4324 << std::endl;
7c673cae
FG
4325 }
4326 } else {
4327 i.maybe_went_rw = false;
4328 if (out)
4329 *out << __func__ << " " << i << " : acting set is too small" << std::endl;
4330 }
11fdf7f2 4331 past_intervals->past_intervals->add_interval(old_pg_pool.is_erasure(), i);
7c673cae
FG
4332 return true;
4333 } else {
4334 return false;
4335 }
4336}
4337
7c673cae
FG
4338// true if the given map affects the prior set
4339bool PastIntervals::PriorSet::affected_by_map(
4340 const OSDMap &osdmap,
4341 const DoutPrefixProvider *dpp) const
4342{
9f95a23c 4343 for (auto p = probe.begin(); p != probe.end(); ++p) {
7c673cae
FG
4344 int o = p->osd;
4345
4346 // did someone in the prior set go down?
4347 if (osdmap.is_down(o) && down.count(o) == 0) {
4348 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " now down" << dendl;
4349 return true;
4350 }
4351
4352 // did a down osd in cur get (re)marked as lost?
9f95a23c 4353 auto r = blocked_by.find(o);
7c673cae
FG
4354 if (r != blocked_by.end()) {
4355 if (!osdmap.exists(o)) {
4356 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " no longer exists" << dendl;
4357 return true;
4358 }
4359 if (osdmap.get_info(o).lost_at != r->second) {
4360 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " (re)marked as lost" << dendl;
4361 return true;
4362 }
4363 }
4364 }
4365
4366 // did someone in the prior down set go up?
9f95a23c 4367 for (auto p = down.cbegin(); p != down.cend(); ++p) {
7c673cae
FG
4368 int o = *p;
4369
4370 if (osdmap.is_up(o)) {
4371 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " now up" << dendl;
4372 return true;
4373 }
4374
4375 // did someone in the prior set get lost or destroyed?
4376 if (!osdmap.exists(o)) {
4377 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " no longer exists" << dendl;
4378 return true;
4379 }
4380 // did a down osd in down get (re)marked as lost?
9f95a23c 4381 auto r = blocked_by.find(o);
7c673cae
FG
4382 if (r != blocked_by.end()) {
4383 if (osdmap.get_info(o).lost_at != r->second) {
4384 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " (re)marked as lost" << dendl;
4385 return true;
4386 }
4387 }
4388 }
4389
4390 return false;
4391}
4392
4393ostream& operator<<(ostream& out, const PastIntervals::pg_interval_t& i)
4394{
4395 out << "interval(" << i.first << "-" << i.last
4396 << " up " << i.up << "(" << i.up_primary << ")"
4397 << " acting " << i.acting << "(" << i.primary << ")";
4398 if (i.maybe_went_rw)
4399 out << " maybe_went_rw";
4400 out << ")";
4401 return out;
4402}
4403
4404
4405
4406// -- pg_query_t --
4407
9f95a23c 4408void pg_query_t::encode(ceph::buffer::list &bl, uint64_t features) const {
7c673cae 4409 ENCODE_START(3, 3, bl);
11fdf7f2
TL
4410 encode(type, bl);
4411 encode(since, bl);
7c673cae 4412 history.encode(bl);
11fdf7f2
TL
4413 encode(epoch_sent, bl);
4414 encode(to, bl);
4415 encode(from, bl);
7c673cae
FG
4416 ENCODE_FINISH(bl);
4417}
4418
9f95a23c 4419void pg_query_t::decode(ceph::buffer::list::const_iterator &bl) {
7c673cae 4420 DECODE_START(3, bl);
11fdf7f2
TL
4421 decode(type, bl);
4422 decode(since, bl);
7c673cae 4423 history.decode(bl);
11fdf7f2
TL
4424 decode(epoch_sent, bl);
4425 decode(to, bl);
4426 decode(from, bl);
7c673cae
FG
4427 DECODE_FINISH(bl);
4428}
4429
4430void pg_query_t::dump(Formatter *f) const
4431{
4432 f->dump_int("from", from);
4433 f->dump_int("to", to);
4434 f->dump_string("type", get_type_name());
4435 f->dump_stream("since") << since;
4436 f->dump_stream("epoch_sent") << epoch_sent;
4437 f->open_object_section("history");
4438 history.dump(f);
4439 f->close_section();
4440}
4441void pg_query_t::generate_test_instances(list<pg_query_t*>& o)
4442{
4443 o.push_back(new pg_query_t());
4444 list<pg_history_t*> h;
4445 pg_history_t::generate_test_instances(h);
4446 o.push_back(new pg_query_t(pg_query_t::INFO, shard_id_t(1), shard_id_t(2), *h.back(), 4));
4447 o.push_back(new pg_query_t(pg_query_t::MISSING, shard_id_t(2), shard_id_t(3), *h.back(), 4));
4448 o.push_back(new pg_query_t(pg_query_t::LOG, shard_id_t(0), shard_id_t(0),
4449 eversion_t(4, 5), *h.back(), 4));
4450 o.push_back(new pg_query_t(pg_query_t::FULLLOG,
4451 shard_id_t::NO_SHARD, shard_id_t::NO_SHARD,
4452 *h.back(), 5));
4453}
4454
9f95a23c
TL
4455// -- pg_lease_t --
4456
4457void pg_lease_t::encode(bufferlist& bl) const
4458{
4459 ENCODE_START(1, 1, bl);
4460 encode(readable_until, bl);
4461 encode(readable_until_ub, bl);
4462 encode(interval, bl);
4463 ENCODE_FINISH(bl);
4464}
4465
4466void pg_lease_t::decode(bufferlist::const_iterator& p)
4467{
4468 DECODE_START(1, p);
4469 decode(readable_until, p);
4470 decode(readable_until_ub, p);
4471 decode(interval, p);
4472 DECODE_FINISH(p);
4473}
4474
4475void pg_lease_t::dump(Formatter *f) const
4476{
4477 f->dump_stream("readable_until") << readable_until;
4478 f->dump_stream("readable_until_ub") << readable_until_ub;
4479 f->dump_stream("interval") << interval;
4480}
4481
4482void pg_lease_t::generate_test_instances(std::list<pg_lease_t*>& o)
4483{
4484 o.push_back(new pg_lease_t());
4485 o.push_back(new pg_lease_t());
4486 o.back()->readable_until = make_timespan(1.5);
4487 o.back()->readable_until_ub = make_timespan(3.4);
4488 o.back()->interval = make_timespan(1.0);
4489}
4490
4491// -- pg_lease_ack_t --
4492
4493void pg_lease_ack_t::encode(bufferlist& bl) const
4494{
4495 ENCODE_START(1, 1, bl);
4496 encode(readable_until_ub, bl);
4497 ENCODE_FINISH(bl);
4498}
4499
4500void pg_lease_ack_t::decode(bufferlist::const_iterator& p)
4501{
4502 DECODE_START(1, p);
4503 decode(readable_until_ub, p);
4504 DECODE_FINISH(p);
4505}
4506
4507void pg_lease_ack_t::dump(Formatter *f) const
4508{
4509 f->dump_stream("readable_until_ub") << readable_until_ub;
4510}
4511
4512void pg_lease_ack_t::generate_test_instances(std::list<pg_lease_ack_t*>& o)
4513{
4514 o.push_back(new pg_lease_ack_t());
4515 o.push_back(new pg_lease_ack_t());
4516 o.back()->readable_until_ub = make_timespan(3.4);
4517}
4518
4519
7c673cae
FG
4520// -- ObjectModDesc --
4521void ObjectModDesc::visit(Visitor *visitor) const
4522{
11fdf7f2 4523 auto bp = bl.cbegin();
7c673cae
FG
4524 try {
4525 while (!bp.end()) {
4526 DECODE_START(max_required_version, bp);
4527 uint8_t code;
11fdf7f2 4528 decode(code, bp);
7c673cae
FG
4529 switch (code) {
4530 case APPEND: {
4531 uint64_t size;
11fdf7f2 4532 decode(size, bp);
7c673cae
FG
4533 visitor->append(size);
4534 break;
4535 }
4536 case SETATTRS: {
9f95a23c 4537 map<string, std::optional<ceph::buffer::list> > attrs;
11fdf7f2 4538 decode(attrs, bp);
7c673cae
FG
4539 visitor->setattrs(attrs);
4540 break;
4541 }
4542 case DELETE: {
4543 version_t old_version;
11fdf7f2 4544 decode(old_version, bp);
7c673cae
FG
4545 visitor->rmobject(old_version);
4546 break;
4547 }
4548 case CREATE: {
4549 visitor->create();
4550 break;
4551 }
4552 case UPDATE_SNAPS: {
4553 set<snapid_t> snaps;
11fdf7f2 4554 decode(snaps, bp);
7c673cae
FG
4555 visitor->update_snaps(snaps);
4556 break;
4557 }
4558 case TRY_DELETE: {
4559 version_t old_version;
11fdf7f2 4560 decode(old_version, bp);
7c673cae
FG
4561 visitor->try_rmobject(old_version);
4562 break;
4563 }
4564 case ROLLBACK_EXTENTS: {
4565 vector<pair<uint64_t, uint64_t> > extents;
4566 version_t gen;
11fdf7f2
TL
4567 decode(gen, bp);
4568 decode(extents, bp);
7c673cae
FG
4569 visitor->rollback_extents(gen,extents);
4570 break;
4571 }
4572 default:
11fdf7f2 4573 ceph_abort_msg("Invalid rollback code");
7c673cae
FG
4574 }
4575 DECODE_FINISH(bp);
4576 }
4577 } catch (...) {
11fdf7f2 4578 ceph_abort_msg("Invalid encoding");
7c673cae
FG
4579 }
4580}
4581
4582struct DumpVisitor : public ObjectModDesc::Visitor {
4583 Formatter *f;
4584 explicit DumpVisitor(Formatter *f) : f(f) {}
4585 void append(uint64_t old_size) override {
4586 f->open_object_section("op");
4587 f->dump_string("code", "APPEND");
4588 f->dump_unsigned("old_size", old_size);
4589 f->close_section();
4590 }
9f95a23c 4591 void setattrs(map<string, std::optional<ceph::buffer::list> > &attrs) override {
7c673cae
FG
4592 f->open_object_section("op");
4593 f->dump_string("code", "SETATTRS");
4594 f->open_array_section("attrs");
9f95a23c 4595 for (auto i = attrs.begin(); i != attrs.end(); ++i) {
7c673cae
FG
4596 f->dump_string("attr_name", i->first);
4597 }
4598 f->close_section();
4599 f->close_section();
4600 }
4601 void rmobject(version_t old_version) override {
4602 f->open_object_section("op");
4603 f->dump_string("code", "RMOBJECT");
4604 f->dump_unsigned("old_version", old_version);
4605 f->close_section();
4606 }
4607 void try_rmobject(version_t old_version) override {
4608 f->open_object_section("op");
4609 f->dump_string("code", "TRY_RMOBJECT");
4610 f->dump_unsigned("old_version", old_version);
4611 f->close_section();
4612 }
4613 void create() override {
4614 f->open_object_section("op");
4615 f->dump_string("code", "CREATE");
4616 f->close_section();
4617 }
4618 void update_snaps(const set<snapid_t> &snaps) override {
4619 f->open_object_section("op");
4620 f->dump_string("code", "UPDATE_SNAPS");
4621 f->dump_stream("snaps") << snaps;
4622 f->close_section();
4623 }
4624 void rollback_extents(
4625 version_t gen,
4626 const vector<pair<uint64_t, uint64_t> > &extents) override {
4627 f->open_object_section("op");
4628 f->dump_string("code", "ROLLBACK_EXTENTS");
4629 f->dump_unsigned("gen", gen);
4630 f->dump_stream("snaps") << extents;
4631 f->close_section();
4632 }
4633};
4634
4635void ObjectModDesc::dump(Formatter *f) const
4636{
4637 f->open_object_section("object_mod_desc");
4638 f->dump_bool("can_local_rollback", can_local_rollback);
4639 f->dump_bool("rollback_info_completed", rollback_info_completed);
4640 {
4641 f->open_array_section("ops");
4642 DumpVisitor vis(f);
4643 visit(&vis);
4644 f->close_section();
4645 }
4646 f->close_section();
4647}
4648
4649void ObjectModDesc::generate_test_instances(list<ObjectModDesc*>& o)
4650{
9f95a23c 4651 map<string, std::optional<ceph::buffer::list> > attrs;
7c673cae
FG
4652 attrs[OI_ATTR];
4653 attrs[SS_ATTR];
4654 attrs["asdf"];
4655 o.push_back(new ObjectModDesc());
4656 o.back()->append(100);
4657 o.back()->setattrs(attrs);
4658 o.push_back(new ObjectModDesc());
4659 o.back()->rmobject(1001);
4660 o.push_back(new ObjectModDesc());
4661 o.back()->create();
4662 o.back()->setattrs(attrs);
4663 o.push_back(new ObjectModDesc());
4664 o.back()->create();
4665 o.back()->setattrs(attrs);
4666 o.back()->mark_unrollbackable();
4667 o.back()->append(1000);
4668}
4669
9f95a23c 4670void ObjectModDesc::encode(ceph::buffer::list &_bl) const
7c673cae
FG
4671{
4672 ENCODE_START(max_required_version, max_required_version, _bl);
11fdf7f2
TL
4673 encode(can_local_rollback, _bl);
4674 encode(rollback_info_completed, _bl);
4675 encode(bl, _bl);
7c673cae
FG
4676 ENCODE_FINISH(_bl);
4677}
9f95a23c 4678void ObjectModDesc::decode(ceph::buffer::list::const_iterator &_bl)
7c673cae
FG
4679{
4680 DECODE_START(2, _bl);
4681 max_required_version = struct_v;
11fdf7f2
TL
4682 decode(can_local_rollback, _bl);
4683 decode(rollback_info_completed, _bl);
4684 decode(bl, _bl);
9f95a23c 4685 // ensure bl does not pin a larger ceph::buffer in memory
7c673cae 4686 bl.rebuild();
31f18b77 4687 bl.reassign_to_mempool(mempool::mempool_osd_pglog);
7c673cae
FG
4688 DECODE_FINISH(_bl);
4689}
4690
9f95a23c
TL
4691std::atomic<uint32_t> ObjectCleanRegions::max_num_intervals = {10};
4692
4693void ObjectCleanRegions::set_max_num_intervals(uint32_t num)
4694{
4695 max_num_intervals = num;
4696}
4697
4698void ObjectCleanRegions::trim()
4699{
4700 while(clean_offsets.num_intervals() > max_num_intervals) {
4701 typename interval_set<uint64_t>::iterator shortest_interval = clean_offsets.begin();
4702 if (shortest_interval == clean_offsets.end())
4703 break;
4704 for (typename interval_set<uint64_t>::iterator it = clean_offsets.begin();
4705 it != clean_offsets.end();
4706 ++it) {
4707 if (it.get_len() < shortest_interval.get_len())
4708 shortest_interval = it;
4709 }
4710 clean_offsets.erase(shortest_interval);
4711 }
4712}
4713
4714void ObjectCleanRegions::merge(const ObjectCleanRegions &other)
4715{
4716 clean_offsets.intersection_of(other.clean_offsets);
4717 clean_omap = clean_omap && other.clean_omap;
4718 trim();
4719}
4720
4721void ObjectCleanRegions::mark_data_region_dirty(uint64_t offset, uint64_t len)
4722{
4723 interval_set<uint64_t> clean_region;
4724 clean_region.insert(0, (uint64_t)-1);
4725 clean_region.erase(offset, len);
4726 clean_offsets.intersection_of(clean_region);
4727 trim();
4728}
4729
f67539c2
TL
4730bool ObjectCleanRegions::is_clean_region(uint64_t offset, uint64_t len) const
4731{
4732 return clean_offsets.contains(offset, len);
4733}
4734
9f95a23c
TL
4735void ObjectCleanRegions::mark_omap_dirty()
4736{
4737 clean_omap = false;
4738}
4739
4740void ObjectCleanRegions::mark_object_new()
4741{
4742 new_object = true;
4743}
4744
4745void ObjectCleanRegions::mark_fully_dirty()
4746{
4747 mark_data_region_dirty(0, (uint64_t)-1);
4748 mark_omap_dirty();
4749 mark_object_new();
4750}
4751
4752interval_set<uint64_t> ObjectCleanRegions::get_dirty_regions() const
4753{
4754 interval_set<uint64_t> dirty_region;
4755 dirty_region.insert(0, (uint64_t)-1);
4756 dirty_region.subtract(clean_offsets);
4757 return dirty_region;
4758}
4759
4760bool ObjectCleanRegions::omap_is_dirty() const
4761{
4762 return !clean_omap;
4763}
4764
4765bool ObjectCleanRegions::object_is_exist() const
4766{
4767 return !new_object;
4768}
4769
4770void ObjectCleanRegions::encode(bufferlist &bl) const
4771{
4772 ENCODE_START(1, 1, bl);
4773 using ceph::encode;
4774 encode(clean_offsets, bl);
4775 encode(clean_omap, bl);
4776 encode(new_object, bl);
4777 ENCODE_FINISH(bl);
4778}
4779
4780void ObjectCleanRegions::decode(bufferlist::const_iterator &bl)
4781{
4782 DECODE_START(1, bl);
4783 using ceph::decode;
4784 decode(clean_offsets, bl);
4785 decode(clean_omap, bl);
4786 decode(new_object, bl);
4787 DECODE_FINISH(bl);
4788}
4789
4790void ObjectCleanRegions::dump(Formatter *f) const
4791{
4792 f->open_object_section("object_clean_regions");
4793 f->dump_stream("clean_offsets") << clean_offsets;
4794 f->dump_bool("clean_omap", clean_omap);
4795 f->dump_bool("new_object", new_object);
4796 f->close_section();
4797}
4798
4799void ObjectCleanRegions::generate_test_instances(list<ObjectCleanRegions*>& o)
4800{
4801 o.push_back(new ObjectCleanRegions());
4802 o.push_back(new ObjectCleanRegions());
4803 o.back()->mark_data_region_dirty(4096, 40960);
4804 o.back()->mark_omap_dirty();
4805 o.back()->mark_object_new();
4806}
4807
4808ostream& operator<<(ostream& out, const ObjectCleanRegions& ocr)
4809{
4810 return out << "clean_offsets: " << ocr.clean_offsets
4811 << ", clean_omap: " << ocr.clean_omap
4812 << ", new_object: " << ocr.new_object;
4813}
4814
7c673cae
FG
4815// -- pg_log_entry_t --
4816
4817string pg_log_entry_t::get_key_name() const
4818{
4819 return version.get_key_name();
4820}
4821
9f95a23c 4822void pg_log_entry_t::encode_with_checksum(ceph::buffer::list& bl) const
7c673cae 4823{
11fdf7f2 4824 using ceph::encode;
9f95a23c 4825 ceph::buffer::list ebl(sizeof(*this)*2);
11fdf7f2 4826 this->encode(ebl);
7c673cae 4827 __u32 crc = ebl.crc32c(0);
11fdf7f2
TL
4828 encode(ebl, bl);
4829 encode(crc, bl);
7c673cae
FG
4830}
4831
9f95a23c 4832void pg_log_entry_t::decode_with_checksum(ceph::buffer::list::const_iterator& p)
7c673cae 4833{
11fdf7f2 4834 using ceph::decode;
9f95a23c 4835 ceph::buffer::list bl;
11fdf7f2 4836 decode(bl, p);
7c673cae 4837 __u32 crc;
11fdf7f2 4838 decode(crc, p);
7c673cae 4839 if (crc != bl.crc32c(0))
9f95a23c 4840 throw ceph::buffer::malformed_input("bad checksum on pg_log_entry_t");
11fdf7f2
TL
4841 auto q = bl.cbegin();
4842 this->decode(q);
7c673cae
FG
4843}
4844
9f95a23c 4845void pg_log_entry_t::encode(ceph::buffer::list &bl) const
7c673cae 4846{
9f95a23c 4847 ENCODE_START(14, 4, bl);
11fdf7f2
TL
4848 encode(op, bl);
4849 encode(soid, bl);
4850 encode(version, bl);
7c673cae
FG
4851
4852 /**
4853 * Added with reverting_to:
4854 * Previous code used prior_version to encode
4855 * what we now call reverting_to. This will
4856 * allow older code to decode reverting_to
4857 * into prior_version as expected.
4858 */
4859 if (op == LOST_REVERT)
11fdf7f2 4860 encode(reverting_to, bl);
7c673cae 4861 else
11fdf7f2 4862 encode(prior_version, bl);
7c673cae 4863
11fdf7f2
TL
4864 encode(reqid, bl);
4865 encode(mtime, bl);
7c673cae 4866 if (op == LOST_REVERT)
11fdf7f2
TL
4867 encode(prior_version, bl);
4868 encode(snaps, bl);
4869 encode(user_version, bl);
4870 encode(mod_desc, bl);
4871 encode(extra_reqids, bl);
7c673cae 4872 if (op == ERROR)
11fdf7f2
TL
4873 encode(return_code, bl);
4874 if (!extra_reqids.empty())
4875 encode(extra_reqid_return_codes, bl);
9f95a23c
TL
4876 encode(clean_regions, bl);
4877 if (op != ERROR)
4878 encode(return_code, bl);
4879 encode(op_returns, bl);
7c673cae
FG
4880 ENCODE_FINISH(bl);
4881}
4882
9f95a23c 4883void pg_log_entry_t::decode(ceph::buffer::list::const_iterator &bl)
7c673cae 4884{
9f95a23c 4885 DECODE_START_LEGACY_COMPAT_LEN(14, 4, 4, bl);
11fdf7f2 4886 decode(op, bl);
7c673cae
FG
4887 if (struct_v < 2) {
4888 sobject_t old_soid;
11fdf7f2 4889 decode(old_soid, bl);
7c673cae
FG
4890 soid.oid = old_soid.oid;
4891 soid.snap = old_soid.snap;
4892 invalid_hash = true;
4893 } else {
11fdf7f2 4894 decode(soid, bl);
7c673cae
FG
4895 }
4896 if (struct_v < 3)
4897 invalid_hash = true;
11fdf7f2 4898 decode(version, bl);
7c673cae
FG
4899
4900 if (struct_v >= 6 && op == LOST_REVERT)
11fdf7f2 4901 decode(reverting_to, bl);
7c673cae 4902 else
11fdf7f2 4903 decode(prior_version, bl);
7c673cae 4904
11fdf7f2 4905 decode(reqid, bl);
7c673cae 4906
11fdf7f2 4907 decode(mtime, bl);
7c673cae
FG
4908 if (struct_v < 5)
4909 invalid_pool = true;
4910
4911 if (op == LOST_REVERT) {
4912 if (struct_v >= 6) {
11fdf7f2 4913 decode(prior_version, bl);
7c673cae
FG
4914 } else {
4915 reverting_to = prior_version;
4916 }
4917 }
4918 if (struct_v >= 7 || // for v >= 7, this is for all ops.
4919 op == CLONE) { // for v < 7, it's only present for CLONE.
11fdf7f2 4920 decode(snaps, bl);
9f95a23c 4921 // ensure snaps does not pin a larger ceph::buffer in memory
7c673cae 4922 snaps.rebuild();
31f18b77 4923 snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
7c673cae
FG
4924 }
4925
4926 if (struct_v >= 8)
11fdf7f2 4927 decode(user_version, bl);
7c673cae
FG
4928 else
4929 user_version = version.version;
4930
4931 if (struct_v >= 9)
11fdf7f2 4932 decode(mod_desc, bl);
7c673cae
FG
4933 else
4934 mod_desc.mark_unrollbackable();
4935 if (struct_v >= 10)
11fdf7f2 4936 decode(extra_reqids, bl);
7c673cae 4937 if (struct_v >= 11 && op == ERROR)
11fdf7f2
TL
4938 decode(return_code, bl);
4939 if (struct_v >= 12 && !extra_reqids.empty())
4940 decode(extra_reqid_return_codes, bl);
9f95a23c
TL
4941 if (struct_v >= 13)
4942 decode(clean_regions, bl);
4943 else
4944 clean_regions.mark_fully_dirty();
4945 if (struct_v >= 14) {
4946 if (op != ERROR) {
4947 decode(return_code, bl);
4948 }
4949 decode(op_returns, bl);
4950 }
7c673cae
FG
4951 DECODE_FINISH(bl);
4952}
4953
4954void pg_log_entry_t::dump(Formatter *f) const
4955{
4956 f->dump_string("op", get_op_name());
4957 f->dump_stream("object") << soid;
4958 f->dump_stream("version") << version;
4959 f->dump_stream("prior_version") << prior_version;
4960 f->dump_stream("reqid") << reqid;
4961 f->open_array_section("extra_reqids");
11fdf7f2 4962 uint32_t idx = 0;
31f18b77 4963 for (auto p = extra_reqids.begin();
7c673cae 4964 p != extra_reqids.end();
11fdf7f2 4965 ++idx, ++p) {
7c673cae
FG
4966 f->open_object_section("extra_reqid");
4967 f->dump_stream("reqid") << p->first;
4968 f->dump_stream("user_version") << p->second;
11fdf7f2
TL
4969 auto it = extra_reqid_return_codes.find(idx);
4970 if (it != extra_reqid_return_codes.end()) {
4971 f->dump_int("return_code", it->second);
4972 }
7c673cae
FG
4973 f->close_section();
4974 }
4975 f->close_section();
4976 f->dump_stream("mtime") << mtime;
4977 f->dump_int("return_code", return_code);
9f95a23c
TL
4978 if (!op_returns.empty()) {
4979 f->open_array_section("op_returns");
4980 for (auto& i : op_returns) {
4981 f->dump_object("op", i);
4982 }
4983 f->close_section();
4984 }
7c673cae
FG
4985 if (snaps.length() > 0) {
4986 vector<snapid_t> v;
9f95a23c 4987 ceph::buffer::list c = snaps;
11fdf7f2 4988 auto p = c.cbegin();
7c673cae 4989 try {
11fdf7f2
TL
4990 using ceph::decode;
4991 decode(v, p);
7c673cae
FG
4992 } catch (...) {
4993 v.clear();
4994 }
4995 f->open_object_section("snaps");
9f95a23c 4996 for (auto p = v.begin(); p != v.end(); ++p)
7c673cae
FG
4997 f->dump_unsigned("snap", *p);
4998 f->close_section();
4999 }
5000 {
5001 f->open_object_section("mod_desc");
5002 mod_desc.dump(f);
5003 f->close_section();
5004 }
9f95a23c
TL
5005 {
5006 f->open_object_section("clean_regions");
5007 clean_regions.dump(f);
5008 f->close_section();
5009 }
7c673cae
FG
5010}
5011
5012void pg_log_entry_t::generate_test_instances(list<pg_log_entry_t*>& o)
5013{
5014 o.push_back(new pg_log_entry_t());
5015 hobject_t oid(object_t("objname"), "key", 123, 456, 0, "");
5016 o.push_back(new pg_log_entry_t(MODIFY, oid, eversion_t(1,2), eversion_t(3,4),
5017 1, osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
5018 utime_t(8,9), 0));
5019 o.push_back(new pg_log_entry_t(ERROR, oid, eversion_t(1,2), eversion_t(3,4),
5020 1, osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
5021 utime_t(8,9), -ENOENT));
5022}
5023
5024ostream& operator<<(ostream& out, const pg_log_entry_t& e)
5025{
5026 out << e.version << " (" << e.prior_version << ") "
5027 << std::left << std::setw(8) << e.get_op_name() << ' '
5028 << e.soid << " by " << e.reqid << " " << e.mtime
5029 << " " << e.return_code;
9f95a23c
TL
5030 if (!e.op_returns.empty()) {
5031 out << " " << e.op_returns;
5032 }
7c673cae
FG
5033 if (e.snaps.length()) {
5034 vector<snapid_t> snaps;
9f95a23c 5035 ceph::buffer::list c = e.snaps;
11fdf7f2 5036 auto p = c.cbegin();
7c673cae 5037 try {
11fdf7f2 5038 decode(snaps, p);
7c673cae
FG
5039 } catch (...) {
5040 snaps.clear();
5041 }
5042 out << " snaps " << snaps;
5043 }
9f95a23c 5044 out << " ObjectCleanRegions " << e.clean_regions;
7c673cae
FG
5045 return out;
5046}
5047
c07f9fc5
FG
5048// -- pg_log_dup_t --
5049
11fdf7f2 5050std::string pg_log_dup_t::get_key_name() const
c07f9fc5 5051{
11fdf7f2
TL
5052 static const char prefix[] = "dup_";
5053 std::string key(36, ' ');
5054 memcpy(&key[0], prefix, 4);
5055 version.get_key_name(&key[4]);
5056 key.resize(35); // remove the null terminator
5057 return key;
c07f9fc5
FG
5058}
5059
9f95a23c 5060void pg_log_dup_t::encode(ceph::buffer::list &bl) const
c07f9fc5 5061{
9f95a23c 5062 ENCODE_START(2, 1, bl);
11fdf7f2
TL
5063 encode(reqid, bl);
5064 encode(version, bl);
5065 encode(user_version, bl);
5066 encode(return_code, bl);
9f95a23c 5067 encode(op_returns, bl);
c07f9fc5
FG
5068 ENCODE_FINISH(bl);
5069}
5070
9f95a23c 5071void pg_log_dup_t::decode(ceph::buffer::list::const_iterator &bl)
c07f9fc5 5072{
9f95a23c 5073 DECODE_START(2, bl);
11fdf7f2
TL
5074 decode(reqid, bl);
5075 decode(version, bl);
5076 decode(user_version, bl);
5077 decode(return_code, bl);
9f95a23c
TL
5078 if (struct_v >= 2) {
5079 decode(op_returns, bl);
5080 }
c07f9fc5
FG
5081 DECODE_FINISH(bl);
5082}
5083
5084void pg_log_dup_t::dump(Formatter *f) const
5085{
5086 f->dump_stream("reqid") << reqid;
5087 f->dump_stream("version") << version;
5088 f->dump_stream("user_version") << user_version;
5089 f->dump_stream("return_code") << return_code;
9f95a23c
TL
5090 if (!op_returns.empty()) {
5091 f->open_array_section("op_returns");
5092 for (auto& i : op_returns) {
5093 f->dump_object("op", i);
5094 }
5095 f->close_section();
5096 }
c07f9fc5
FG
5097}
5098
5099void pg_log_dup_t::generate_test_instances(list<pg_log_dup_t*>& o)
5100{
5101 o.push_back(new pg_log_dup_t());
5102 o.push_back(new pg_log_dup_t(eversion_t(1,2),
5103 1,
5104 osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
5105 0));
5106 o.push_back(new pg_log_dup_t(eversion_t(1,2),
5107 2,
5108 osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
5109 -ENOENT));
5110}
5111
5112
5113std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e) {
9f95a23c 5114 out << "log_dup(reqid=" << e.reqid <<
c07f9fc5 5115 " v=" << e.version << " uv=" << e.user_version <<
9f95a23c
TL
5116 " rc=" << e.return_code;
5117 if (!e.op_returns.empty()) {
5118 out << " " << e.op_returns;
5119 }
5120 return out << ")";
c07f9fc5
FG
5121}
5122
7c673cae
FG
5123
5124// -- pg_log_t --
5125
5126// out: pg_log_t that only has entries that apply to import_pgid using curmap
5127// reject: Entries rejected from "in" are in the reject.log. Other fields not set.
5128void pg_log_t::filter_log(spg_t import_pgid, const OSDMap &curmap,
5129 const string &hit_set_namespace, const pg_log_t &in,
5130 pg_log_t &out, pg_log_t &reject)
5131{
5132 out = in;
5133 out.log.clear();
5134 reject.log.clear();
5135
9f95a23c 5136 for (auto i = in.log.cbegin(); i != in.log.cend(); ++i) {
7c673cae
FG
5137
5138 // Reject pg log entries for temporary objects
5139 if (i->soid.is_temp()) {
5140 reject.log.push_back(*i);
5141 continue;
5142 }
5143
5144 if (i->soid.nspace != hit_set_namespace) {
5145 object_t oid = i->soid.oid;
5146 object_locator_t loc(i->soid);
5147 pg_t raw_pgid = curmap.object_locator_to_pg(oid, loc);
5148 pg_t pgid = curmap.raw_pg_to_pg(raw_pgid);
5149
5150 if (import_pgid.pgid == pgid) {
5151 out.log.push_back(*i);
5152 } else {
5153 reject.log.push_back(*i);
5154 }
5155 } else {
5156 out.log.push_back(*i);
5157 }
5158 }
5159}
5160
9f95a23c 5161void pg_log_t::encode(ceph::buffer::list& bl) const
7c673cae 5162{
c07f9fc5 5163 ENCODE_START(7, 3, bl);
11fdf7f2
TL
5164 encode(head, bl);
5165 encode(tail, bl);
5166 encode(log, bl);
5167 encode(can_rollback_to, bl);
5168 encode(rollback_info_trimmed_to, bl);
5169 encode(dups, bl);
7c673cae
FG
5170 ENCODE_FINISH(bl);
5171}
5172
9f95a23c 5173void pg_log_t::decode(ceph::buffer::list::const_iterator &bl, int64_t pool)
7c673cae 5174{
c07f9fc5 5175 DECODE_START_LEGACY_COMPAT_LEN(7, 3, 3, bl);
11fdf7f2
TL
5176 decode(head, bl);
5177 decode(tail, bl);
7c673cae
FG
5178 if (struct_v < 2) {
5179 bool backlog;
11fdf7f2 5180 decode(backlog, bl);
7c673cae 5181 }
11fdf7f2 5182 decode(log, bl);
7c673cae 5183 if (struct_v >= 5)
11fdf7f2 5184 decode(can_rollback_to, bl);
7c673cae
FG
5185
5186 if (struct_v >= 6)
11fdf7f2 5187 decode(rollback_info_trimmed_to, bl);
7c673cae
FG
5188 else
5189 rollback_info_trimmed_to = tail;
c07f9fc5
FG
5190
5191 if (struct_v >= 7)
11fdf7f2 5192 decode(dups, bl);
c07f9fc5 5193
7c673cae
FG
5194 DECODE_FINISH(bl);
5195
5196 // handle hobject_t format change
5197 if (struct_v < 4) {
9f95a23c 5198 for (auto i = log.begin(); i != log.end(); ++i) {
7c673cae
FG
5199 if (!i->soid.is_max() && i->soid.pool == -1)
5200 i->soid.pool = pool;
5201 }
5202 }
5203}
5204
5205void pg_log_t::dump(Formatter *f) const
5206{
5207 f->dump_stream("head") << head;
5208 f->dump_stream("tail") << tail;
5209 f->open_array_section("log");
9f95a23c 5210 for (auto p = log.cbegin(); p != log.cend(); ++p) {
7c673cae
FG
5211 f->open_object_section("entry");
5212 p->dump(f);
5213 f->close_section();
5214 }
5215 f->close_section();
c07f9fc5
FG
5216 f->open_array_section("dups");
5217 for (const auto& entry : dups) {
5218 f->open_object_section("entry");
5219 entry.dump(f);
5220 f->close_section();
5221 }
5222 f->close_section();
7c673cae
FG
5223}
5224
5225void pg_log_t::generate_test_instances(list<pg_log_t*>& o)
5226{
5227 o.push_back(new pg_log_t);
5228
5229 // this is nonsensical:
5230 o.push_back(new pg_log_t);
5231 o.back()->head = eversion_t(1,2);
5232 o.back()->tail = eversion_t(3,4);
5233 list<pg_log_entry_t*> e;
5234 pg_log_entry_t::generate_test_instances(e);
9f95a23c 5235 for (auto p = e.begin(); p != e.end(); ++p)
7c673cae
FG
5236 o.back()->log.push_back(**p);
5237}
5238
81eedcae
TL
5239static void _handle_dups(CephContext* cct, pg_log_t &target, const pg_log_t &other, unsigned maxdups)
5240{
5241 auto earliest_dup_version =
5242 target.head.version < maxdups ? 0u : target.head.version - maxdups + 1;
2a845540
TL
5243 lgeneric_subdout(cct, osd, 20) << __func__ << " earliest_dup_version "
5244 << earliest_dup_version << dendl;
81eedcae
TL
5245
5246 for (auto d = other.dups.cbegin(); d != other.dups.cend(); ++d) {
5247 if (d->version.version >= earliest_dup_version) {
5248 lgeneric_subdout(cct, osd, 20)
5249 << "copy_up_to/copy_after copy dup version "
5250 << d->version << dendl;
5251 target.dups.push_back(pg_log_dup_t(*d));
5252 }
5253 }
5254
5255 for (auto i = other.log.cbegin(); i != other.log.cend(); ++i) {
5256 ceph_assert(i->version > other.tail);
5257 if (i->version > target.tail)
5258 break;
5259 if (i->version.version >= earliest_dup_version) {
5260 lgeneric_subdout(cct, osd, 20)
5261 << "copy_up_to/copy_after copy dup from log version "
5262 << i->version << dendl;
5263 target.dups.push_back(pg_log_dup_t(*i));
5264 }
5265 }
5266}
5267
5268
5269void pg_log_t::copy_after(CephContext* cct, const pg_log_t &other, eversion_t v)
7c673cae
FG
5270{
5271 can_rollback_to = other.can_rollback_to;
5272 head = other.head;
5273 tail = other.tail;
2a845540
TL
5274 lgeneric_subdout(cct, osd, 20) << __func__ << " v " << v
5275 << " dups.size()=" << dups.size()
5276 << " other.dups.size()=" << other.dups.size() << dendl;
9f95a23c 5277 for (auto i = other.log.crbegin(); i != other.log.crend(); ++i) {
11fdf7f2 5278 ceph_assert(i->version > other.tail);
7c673cae
FG
5279 if (i->version <= v) {
5280 // make tail accurate.
5281 tail = i->version;
5282 break;
5283 }
81eedcae 5284 lgeneric_subdout(cct, osd, 20) << __func__ << " copy log version " << i->version << dendl;
7c673cae
FG
5285 log.push_front(*i);
5286 }
81eedcae 5287 _handle_dups(cct, *this, other, cct->_conf->osd_pg_log_dups_tracked);
2a845540
TL
5288 lgeneric_subdout(cct, osd, 20) << __func__ << " END v " << v
5289 << " dups.size()=" << dups.size()
5290 << " other.dups.size()=" << other.dups.size() << dendl;
7c673cae
FG
5291}
5292
81eedcae 5293void pg_log_t::copy_up_to(CephContext* cct, const pg_log_t &other, int max)
7c673cae
FG
5294{
5295 can_rollback_to = other.can_rollback_to;
5296 int n = 0;
5297 head = other.head;
5298 tail = other.tail;
2a845540
TL
5299 lgeneric_subdout(cct, osd, 20) << __func__ << " max " << max
5300 << " dups.size()=" << dups.size()
5301 << " other.dups.size()=" << other.dups.size() << dendl;
9f95a23c 5302 for (auto i = other.log.crbegin(); i != other.log.crend(); ++i) {
81eedcae 5303 ceph_assert(i->version > other.tail);
7c673cae
FG
5304 if (n++ >= max) {
5305 tail = i->version;
5306 break;
5307 }
81eedcae 5308 lgeneric_subdout(cct, osd, 20) << __func__ << " copy log version " << i->version << dendl;
7c673cae
FG
5309 log.push_front(*i);
5310 }
81eedcae 5311 _handle_dups(cct, *this, other, cct->_conf->osd_pg_log_dups_tracked);
2a845540
TL
5312 lgeneric_subdout(cct, osd, 20) << __func__ << " END max " << max
5313 << " dups.size()=" << dups.size()
5314 << " other.dups.size()=" << other.dups.size() << dendl;
7c673cae
FG
5315}
5316
c07f9fc5 5317ostream& pg_log_t::print(ostream& out) const
7c673cae
FG
5318{
5319 out << *this << std::endl;
9f95a23c 5320 for (auto p = log.cbegin(); p != log.cend(); ++p)
7c673cae 5321 out << *p << std::endl;
c07f9fc5
FG
5322 for (const auto& entry : dups) {
5323 out << " dup entry: " << entry << std::endl;
5324 }
7c673cae
FG
5325 return out;
5326}
5327
5328// -- pg_missing_t --
5329
5330ostream& operator<<(ostream& out, const pg_missing_item& i)
5331{
5332 out << i.need;
5333 if (i.have != eversion_t())
5334 out << "(" << i.have << ")";
9f95a23c
TL
5335 out << " flags = " << i.flag_str()
5336 << " " << i.clean_regions;
7c673cae
FG
5337 return out;
5338}
5339
5340// -- object_copy_cursor_t --
5341
9f95a23c 5342void object_copy_cursor_t::encode(ceph::buffer::list& bl) const
7c673cae
FG
5343{
5344 ENCODE_START(1, 1, bl);
11fdf7f2
TL
5345 encode(attr_complete, bl);
5346 encode(data_offset, bl);
5347 encode(data_complete, bl);
5348 encode(omap_offset, bl);
5349 encode(omap_complete, bl);
7c673cae
FG
5350 ENCODE_FINISH(bl);
5351}
5352
9f95a23c 5353void object_copy_cursor_t::decode(ceph::buffer::list::const_iterator &bl)
7c673cae
FG
5354{
5355 DECODE_START(1, bl);
11fdf7f2
TL
5356 decode(attr_complete, bl);
5357 decode(data_offset, bl);
5358 decode(data_complete, bl);
5359 decode(omap_offset, bl);
5360 decode(omap_complete, bl);
7c673cae
FG
5361 DECODE_FINISH(bl);
5362}
5363
5364void object_copy_cursor_t::dump(Formatter *f) const
5365{
5366 f->dump_unsigned("attr_complete", (int)attr_complete);
5367 f->dump_unsigned("data_offset", data_offset);
5368 f->dump_unsigned("data_complete", (int)data_complete);
5369 f->dump_string("omap_offset", omap_offset);
5370 f->dump_unsigned("omap_complete", (int)omap_complete);
5371}
5372
5373void object_copy_cursor_t::generate_test_instances(list<object_copy_cursor_t*>& o)
5374{
5375 o.push_back(new object_copy_cursor_t);
5376 o.push_back(new object_copy_cursor_t);
5377 o.back()->attr_complete = true;
5378 o.back()->data_offset = 123;
5379 o.push_back(new object_copy_cursor_t);
5380 o.back()->attr_complete = true;
5381 o.back()->data_complete = true;
5382 o.back()->omap_offset = "foo";
5383 o.push_back(new object_copy_cursor_t);
5384 o.back()->attr_complete = true;
5385 o.back()->data_complete = true;
5386 o.back()->omap_complete = true;
5387}
5388
5389// -- object_copy_data_t --
5390
9f95a23c 5391void object_copy_data_t::encode(ceph::buffer::list& bl, uint64_t features) const
7c673cae 5392{
11fdf7f2
TL
5393 ENCODE_START(8, 5, bl);
5394 encode(size, bl);
5395 encode(mtime, bl);
5396 encode(attrs, bl);
5397 encode(data, bl);
5398 encode(omap_data, bl);
5399 encode(cursor, bl);
5400 encode(omap_header, bl);
5401 encode(snaps, bl);
5402 encode(snap_seq, bl);
5403 encode(flags, bl);
5404 encode(data_digest, bl);
5405 encode(omap_digest, bl);
5406 encode(reqids, bl);
5407 encode(truncate_seq, bl);
5408 encode(truncate_size, bl);
5409 encode(reqid_return_codes, bl);
7c673cae
FG
5410 ENCODE_FINISH(bl);
5411}
5412
9f95a23c 5413void object_copy_data_t::decode(ceph::buffer::list::const_iterator& bl)
7c673cae 5414{
9f95a23c 5415 DECODE_START(8, bl);
7c673cae
FG
5416 if (struct_v < 5) {
5417 // old
11fdf7f2
TL
5418 decode(size, bl);
5419 decode(mtime, bl);
7c673cae
FG
5420 {
5421 string category;
11fdf7f2 5422 decode(category, bl); // no longer used
7c673cae 5423 }
11fdf7f2
TL
5424 decode(attrs, bl);
5425 decode(data, bl);
7c673cae 5426 {
9f95a23c 5427 map<string,ceph::buffer::list> omap;
11fdf7f2 5428 decode(omap, bl);
7c673cae 5429 omap_data.clear();
11fdf7f2
TL
5430 if (!omap.empty()) {
5431 using ceph::encode;
5432 encode(omap, omap_data);
5433 }
7c673cae 5434 }
11fdf7f2 5435 decode(cursor, bl);
7c673cae 5436 if (struct_v >= 2)
11fdf7f2 5437 decode(omap_header, bl);
7c673cae 5438 if (struct_v >= 3) {
11fdf7f2
TL
5439 decode(snaps, bl);
5440 decode(snap_seq, bl);
7c673cae
FG
5441 } else {
5442 snaps.clear();
5443 snap_seq = 0;
5444 }
5445 if (struct_v >= 4) {
11fdf7f2
TL
5446 decode(flags, bl);
5447 decode(data_digest, bl);
5448 decode(omap_digest, bl);
7c673cae
FG
5449 }
5450 } else {
5451 // current
11fdf7f2
TL
5452 decode(size, bl);
5453 decode(mtime, bl);
5454 decode(attrs, bl);
5455 decode(data, bl);
5456 decode(omap_data, bl);
5457 decode(cursor, bl);
5458 decode(omap_header, bl);
5459 decode(snaps, bl);
5460 decode(snap_seq, bl);
7c673cae 5461 if (struct_v >= 4) {
11fdf7f2
TL
5462 decode(flags, bl);
5463 decode(data_digest, bl);
5464 decode(omap_digest, bl);
7c673cae
FG
5465 }
5466 if (struct_v >= 6) {
11fdf7f2 5467 decode(reqids, bl);
7c673cae
FG
5468 }
5469 if (struct_v >= 7) {
11fdf7f2
TL
5470 decode(truncate_seq, bl);
5471 decode(truncate_size, bl);
5472 }
5473 if (struct_v >= 8) {
5474 decode(reqid_return_codes, bl);
7c673cae
FG
5475 }
5476 }
5477 DECODE_FINISH(bl);
5478}
5479
5480void object_copy_data_t::generate_test_instances(list<object_copy_data_t*>& o)
5481{
5482 o.push_back(new object_copy_data_t());
5483
5484 list<object_copy_cursor_t*> cursors;
5485 object_copy_cursor_t::generate_test_instances(cursors);
9f95a23c 5486 auto ci = cursors.begin();
7c673cae
FG
5487 o.back()->cursor = **(ci++);
5488
5489 o.push_back(new object_copy_data_t());
5490 o.back()->cursor = **(ci++);
5491
5492 o.push_back(new object_copy_data_t());
5493 o.back()->size = 1234;
5494 o.back()->mtime.set_from_double(1234);
9f95a23c
TL
5495 ceph::buffer::ptr bp("there", 5);
5496 ceph::buffer::list bl;
7c673cae
FG
5497 bl.push_back(bp);
5498 o.back()->attrs["hello"] = bl;
9f95a23c
TL
5499 ceph::buffer::ptr bp2("not", 3);
5500 ceph::buffer::list bl2;
7c673cae 5501 bl2.push_back(bp2);
9f95a23c 5502 map<string,ceph::buffer::list> omap;
7c673cae 5503 omap["why"] = bl2;
11fdf7f2
TL
5504 using ceph::encode;
5505 encode(omap, o.back()->omap_data);
9f95a23c 5506 ceph::buffer::ptr databp("iamsomedatatocontain", 20);
7c673cae
FG
5507 o.back()->data.push_back(databp);
5508 o.back()->omap_header.append("this is an omap header");
5509 o.back()->snaps.push_back(123);
5510 o.back()->reqids.push_back(make_pair(osd_reqid_t(), version_t()));
5511}
5512
5513void object_copy_data_t::dump(Formatter *f) const
5514{
5515 f->open_object_section("cursor");
5516 cursor.dump(f);
5517 f->close_section(); // cursor
5518 f->dump_int("size", size);
5519 f->dump_stream("mtime") << mtime;
9f95a23c 5520 /* we should really print out the attrs here, but ceph::buffer::list
7c673cae
FG
5521 const-correctness prevents that */
5522 f->dump_int("attrs_size", attrs.size());
5523 f->dump_int("flags", flags);
5524 f->dump_unsigned("data_digest", data_digest);
5525 f->dump_unsigned("omap_digest", omap_digest);
5526 f->dump_int("omap_data_length", omap_data.length());
5527 f->dump_int("omap_header_length", omap_header.length());
5528 f->dump_int("data_length", data.length());
5529 f->open_array_section("snaps");
9f95a23c 5530 for (auto p = snaps.cbegin(); p != snaps.cend(); ++p)
7c673cae
FG
5531 f->dump_unsigned("snap", *p);
5532 f->close_section();
5533 f->open_array_section("reqids");
11fdf7f2 5534 uint32_t idx = 0;
31f18b77 5535 for (auto p = reqids.begin();
7c673cae 5536 p != reqids.end();
11fdf7f2 5537 ++idx, ++p) {
7c673cae
FG
5538 f->open_object_section("extra_reqid");
5539 f->dump_stream("reqid") << p->first;
5540 f->dump_stream("user_version") << p->second;
11fdf7f2
TL
5541 auto it = reqid_return_codes.find(idx);
5542 if (it != reqid_return_codes.end()) {
5543 f->dump_int("return_code", it->second);
5544 }
7c673cae
FG
5545 f->close_section();
5546 }
5547 f->close_section();
5548}
5549
5550// -- pg_create_t --
5551
9f95a23c 5552void pg_create_t::encode(ceph::buffer::list &bl) const
7c673cae
FG
5553{
5554 ENCODE_START(1, 1, bl);
11fdf7f2
TL
5555 encode(created, bl);
5556 encode(parent, bl);
5557 encode(split_bits, bl);
7c673cae
FG
5558 ENCODE_FINISH(bl);
5559}
5560
9f95a23c 5561void pg_create_t::decode(ceph::buffer::list::const_iterator &bl)
7c673cae
FG
5562{
5563 DECODE_START(1, bl);
11fdf7f2
TL
5564 decode(created, bl);
5565 decode(parent, bl);
5566 decode(split_bits, bl);
7c673cae
FG
5567 DECODE_FINISH(bl);
5568}
5569
5570void pg_create_t::dump(Formatter *f) const
5571{
5572 f->dump_unsigned("created", created);
5573 f->dump_stream("parent") << parent;
5574 f->dump_int("split_bits", split_bits);
5575}
5576
5577void pg_create_t::generate_test_instances(list<pg_create_t*>& o)
5578{
5579 o.push_back(new pg_create_t);
11fdf7f2 5580 o.push_back(new pg_create_t(1, pg_t(3, 4), 2));
7c673cae
FG
5581}
5582
5583
5584// -- pg_hit_set_info_t --
5585
9f95a23c 5586void pg_hit_set_info_t::encode(ceph::buffer::list& bl) const
7c673cae
FG
5587{
5588 ENCODE_START(2, 1, bl);
11fdf7f2
TL
5589 encode(begin, bl);
5590 encode(end, bl);
5591 encode(version, bl);
5592 encode(using_gmt, bl);
7c673cae
FG
5593 ENCODE_FINISH(bl);
5594}
5595
9f95a23c 5596void pg_hit_set_info_t::decode(ceph::buffer::list::const_iterator& p)
7c673cae
FG
5597{
5598 DECODE_START(2, p);
11fdf7f2
TL
5599 decode(begin, p);
5600 decode(end, p);
5601 decode(version, p);
7c673cae 5602 if (struct_v >= 2) {
11fdf7f2 5603 decode(using_gmt, p);
7c673cae
FG
5604 } else {
5605 using_gmt = false;
5606 }
5607 DECODE_FINISH(p);
5608}
5609
5610void pg_hit_set_info_t::dump(Formatter *f) const
5611{
5612 f->dump_stream("begin") << begin;
5613 f->dump_stream("end") << end;
5614 f->dump_stream("version") << version;
5615 f->dump_stream("using_gmt") << using_gmt;
5616}
5617
5618void pg_hit_set_info_t::generate_test_instances(list<pg_hit_set_info_t*>& ls)
5619{
5620 ls.push_back(new pg_hit_set_info_t);
5621 ls.push_back(new pg_hit_set_info_t);
5622 ls.back()->begin = utime_t(1, 2);
5623 ls.back()->end = utime_t(3, 4);
5624}
5625
5626
5627// -- pg_hit_set_history_t --
5628
9f95a23c 5629void pg_hit_set_history_t::encode(ceph::buffer::list& bl) const
7c673cae
FG
5630{
5631 ENCODE_START(1, 1, bl);
11fdf7f2 5632 encode(current_last_update, bl);
7c673cae
FG
5633 {
5634 utime_t dummy_stamp;
11fdf7f2 5635 encode(dummy_stamp, bl);
7c673cae
FG
5636 }
5637 {
5638 pg_hit_set_info_t dummy_info;
11fdf7f2 5639 encode(dummy_info, bl);
7c673cae 5640 }
11fdf7f2 5641 encode(history, bl);
7c673cae
FG
5642 ENCODE_FINISH(bl);
5643}
5644
9f95a23c 5645void pg_hit_set_history_t::decode(ceph::buffer::list::const_iterator& p)
7c673cae
FG
5646{
5647 DECODE_START(1, p);
11fdf7f2 5648 decode(current_last_update, p);
7c673cae
FG
5649 {
5650 utime_t dummy_stamp;
11fdf7f2 5651 decode(dummy_stamp, p);
7c673cae
FG
5652 }
5653 {
5654 pg_hit_set_info_t dummy_info;
11fdf7f2 5655 decode(dummy_info, p);
7c673cae 5656 }
11fdf7f2 5657 decode(history, p);
7c673cae
FG
5658 DECODE_FINISH(p);
5659}
5660
5661void pg_hit_set_history_t::dump(Formatter *f) const
5662{
5663 f->dump_stream("current_last_update") << current_last_update;
5664 f->open_array_section("history");
9f95a23c 5665 for (auto p = history.cbegin(); p != history.cend(); ++p) {
7c673cae
FG
5666 f->open_object_section("info");
5667 p->dump(f);
5668 f->close_section();
5669 }
5670 f->close_section();
5671}
5672
5673void pg_hit_set_history_t::generate_test_instances(list<pg_hit_set_history_t*>& ls)
5674{
5675 ls.push_back(new pg_hit_set_history_t);
5676 ls.push_back(new pg_hit_set_history_t);
5677 ls.back()->current_last_update = eversion_t(1, 2);
5678 ls.back()->history.push_back(pg_hit_set_info_t());
5679}
5680
7c673cae
FG
5681// -- OSDSuperblock --
5682
9f95a23c 5683void OSDSuperblock::encode(ceph::buffer::list &bl) const
7c673cae 5684{
1e59de90 5685 ENCODE_START(10, 5, bl);
11fdf7f2
TL
5686 encode(cluster_fsid, bl);
5687 encode(whoami, bl);
5688 encode(current_epoch, bl);
5689 encode(oldest_map, bl);
5690 encode(newest_map, bl);
5691 encode(weight, bl);
7c673cae 5692 compat_features.encode(bl);
11fdf7f2
TL
5693 encode(clean_thru, bl);
5694 encode(mounted, bl);
5695 encode(osd_fsid, bl);
5696 encode((epoch_t)0, bl); // epoch_t last_epoch_marked_full
5697 encode((uint32_t)0, bl); // map<int64_t,epoch_t> pool_last_epoch_marked_full
9f95a23c
TL
5698 encode(purged_snaps_last, bl);
5699 encode(last_purged_snaps_scrub, bl);
1e59de90 5700 encode(cluster_osdmap_trim_lower_bound, bl);
7c673cae
FG
5701 ENCODE_FINISH(bl);
5702}
5703
9f95a23c 5704void OSDSuperblock::decode(ceph::buffer::list::const_iterator &bl)
7c673cae 5705{
1e59de90 5706 DECODE_START_LEGACY_COMPAT_LEN(10, 5, 5, bl);
7c673cae
FG
5707 if (struct_v < 3) {
5708 string magic;
11fdf7f2
TL
5709 decode(magic, bl);
5710 }
5711 decode(cluster_fsid, bl);
5712 decode(whoami, bl);
5713 decode(current_epoch, bl);
5714 decode(oldest_map, bl);
5715 decode(newest_map, bl);
5716 decode(weight, bl);
7c673cae
FG
5717 if (struct_v >= 2) {
5718 compat_features.decode(bl);
5719 } else { //upgrade it!
5720 compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
5721 }
11fdf7f2
TL
5722 decode(clean_thru, bl);
5723 decode(mounted, bl);
7c673cae 5724 if (struct_v >= 4)
11fdf7f2 5725 decode(osd_fsid, bl);
7c673cae
FG
5726 if (struct_v >= 6) {
5727 epoch_t last_map_marked_full;
11fdf7f2 5728 decode(last_map_marked_full, bl);
7c673cae
FG
5729 }
5730 if (struct_v >= 7) {
5731 map<int64_t,epoch_t> pool_last_map_marked_full;
11fdf7f2 5732 decode(pool_last_map_marked_full, bl);
7c673cae 5733 }
9f95a23c
TL
5734 if (struct_v >= 9) {
5735 decode(purged_snaps_last, bl);
5736 decode(last_purged_snaps_scrub, bl);
5737 } else {
5738 purged_snaps_last = 0;
5739 }
1e59de90
TL
5740 if (struct_v >= 10) {
5741 decode(cluster_osdmap_trim_lower_bound, bl);
5742 } else {
5743 cluster_osdmap_trim_lower_bound = 0;
5744 }
7c673cae
FG
5745 DECODE_FINISH(bl);
5746}
5747
5748void OSDSuperblock::dump(Formatter *f) const
5749{
5750 f->dump_stream("cluster_fsid") << cluster_fsid;
5751 f->dump_stream("osd_fsid") << osd_fsid;
5752 f->dump_int("whoami", whoami);
5753 f->dump_int("current_epoch", current_epoch);
5754 f->dump_int("oldest_map", oldest_map);
5755 f->dump_int("newest_map", newest_map);
5756 f->dump_float("weight", weight);
5757 f->open_object_section("compat");
5758 compat_features.dump(f);
5759 f->close_section();
5760 f->dump_int("clean_thru", clean_thru);
5761 f->dump_int("last_epoch_mounted", mounted);
9f95a23c
TL
5762 f->dump_unsigned("purged_snaps_last", purged_snaps_last);
5763 f->dump_stream("last_purged_snaps_scrub") << last_purged_snaps_scrub;
1e59de90
TL
5764 f->dump_int("cluster_osdmap_trim_lower_bound",
5765 cluster_osdmap_trim_lower_bound);
7c673cae
FG
5766}
5767
5768void OSDSuperblock::generate_test_instances(list<OSDSuperblock*>& o)
5769{
5770 OSDSuperblock z;
5771 o.push_back(new OSDSuperblock(z));
11fdf7f2
TL
5772 z.cluster_fsid.parse("01010101-0101-0101-0101-010101010101");
5773 z.osd_fsid.parse("02020202-0202-0202-0202-020202020202");
7c673cae
FG
5774 z.whoami = 3;
5775 z.current_epoch = 4;
5776 z.oldest_map = 5;
5777 z.newest_map = 9;
5778 z.mounted = 8;
5779 z.clean_thru = 7;
5780 o.push_back(new OSDSuperblock(z));
5781 o.push_back(new OSDSuperblock(z));
5782}
5783
5784// -- SnapSet --
5785
9f95a23c 5786void SnapSet::encode(ceph::buffer::list& bl) const
7c673cae
FG
5787{
5788 ENCODE_START(3, 2, bl);
11fdf7f2
TL
5789 encode(seq, bl);
5790 encode(true, bl); // head_exists
5791 encode(snaps, bl);
5792 encode(clones, bl);
5793 encode(clone_overlap, bl);
5794 encode(clone_size, bl);
5795 encode(clone_snaps, bl);
7c673cae
FG
5796 ENCODE_FINISH(bl);
5797}
5798
9f95a23c 5799void SnapSet::decode(ceph::buffer::list::const_iterator& bl)
7c673cae
FG
5800{
5801 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
11fdf7f2 5802 decode(seq, bl);
9f95a23c 5803 bl += 1u; // skip legacy head_exists (always true)
11fdf7f2
TL
5804 decode(snaps, bl);
5805 decode(clones, bl);
5806 decode(clone_overlap, bl);
5807 decode(clone_size, bl);
7c673cae 5808 if (struct_v >= 3) {
11fdf7f2 5809 decode(clone_snaps, bl);
7c673cae
FG
5810 } else {
5811 clone_snaps.clear();
5812 }
5813 DECODE_FINISH(bl);
5814}
5815
5816void SnapSet::dump(Formatter *f) const
5817{
9f95a23c 5818 f->dump_unsigned("seq", seq);
7c673cae 5819 f->open_array_section("clones");
9f95a23c 5820 for (auto p = clones.cbegin(); p != clones.cend(); ++p) {
7c673cae
FG
5821 f->open_object_section("clone");
5822 f->dump_unsigned("snap", *p);
94b18763
FG
5823 auto cs = clone_size.find(*p);
5824 if (cs != clone_size.end())
5825 f->dump_unsigned("size", cs->second);
5826 else
5827 f->dump_string("size", "????");
5828 auto co = clone_overlap.find(*p);
5829 if (co != clone_overlap.end())
5830 f->dump_stream("overlap") << co->second;
5831 else
5832 f->dump_stream("overlap") << "????";
7c673cae
FG
5833 auto q = clone_snaps.find(*p);
5834 if (q != clone_snaps.end()) {
5835 f->open_array_section("snaps");
5836 for (auto s : q->second) {
5837 f->dump_unsigned("snap", s);
5838 }
5839 f->close_section();
5840 }
5841 f->close_section();
5842 }
5843 f->close_section();
5844}
5845
5846void SnapSet::generate_test_instances(list<SnapSet*>& o)
5847{
5848 o.push_back(new SnapSet);
5849 o.push_back(new SnapSet);
7c673cae
FG
5850 o.back()->seq = 123;
5851 o.back()->snaps.push_back(123);
5852 o.back()->snaps.push_back(12);
5853 o.push_back(new SnapSet);
7c673cae
FG
5854 o.back()->seq = 123;
5855 o.back()->snaps.push_back(123);
5856 o.back()->snaps.push_back(12);
5857 o.back()->clones.push_back(12);
5858 o.back()->clone_size[12] = 12345;
5859 o.back()->clone_overlap[12];
5860 o.back()->clone_snaps[12] = {12, 10, 8};
5861}
5862
5863ostream& operator<<(ostream& out, const SnapSet& cs)
5864{
11fdf7f2
TL
5865 return out << cs.seq << "=" << cs.snaps << ":"
5866 << cs.clone_snaps;
7c673cae
FG
5867}
5868
5869void SnapSet::from_snap_set(const librados::snap_set_t& ss, bool legacy)
5870{
5871 // NOTE: our reconstruction of snaps (and the snapc) is not strictly
5872 // correct: it will not include snaps that still logically exist
5873 // but for which there was no clone that is defined. For all
5874 // practical purposes this doesn't matter, since we only use that
5875 // information to clone on the OSD, and we have already moved
5876 // forward past that part of the object history.
5877
5878 seq = ss.seq;
5879 set<snapid_t> _snaps;
5880 set<snapid_t> _clones;
9f95a23c 5881 for (auto p = ss.clones.cbegin(); p != ss.clones.cend(); ++p) {
11fdf7f2 5882 if (p->cloneid != librados::SNAP_HEAD) {
7c673cae
FG
5883 _clones.insert(p->cloneid);
5884 _snaps.insert(p->snaps.begin(), p->snaps.end());
5885 clone_size[p->cloneid] = p->size;
5886 clone_overlap[p->cloneid]; // the entry must exist, even if it's empty.
9f95a23c 5887 for (auto q = p->overlap.cbegin(); q != p->overlap.cend(); ++q)
7c673cae
FG
5888 clone_overlap[p->cloneid].insert(q->first, q->second);
5889 if (!legacy) {
5890 // p->snaps is ascending; clone_snaps is descending
5891 vector<snapid_t>& v = clone_snaps[p->cloneid];
5892 for (auto q = p->snaps.rbegin(); q != p->snaps.rend(); ++q) {
5893 v.push_back(*q);
5894 }
5895 }
5896 }
5897 }
5898
5899 // ascending
5900 clones.clear();
5901 clones.reserve(_clones.size());
9f95a23c 5902 for (auto p = _clones.begin(); p != _clones.end(); ++p)
7c673cae
FG
5903 clones.push_back(*p);
5904
5905 // descending
5906 snaps.clear();
5907 snaps.reserve(_snaps.size());
9f95a23c 5908 for (auto p = _snaps.rbegin();
7c673cae
FG
5909 p != _snaps.rend(); ++p)
5910 snaps.push_back(*p);
5911}
5912
5913uint64_t SnapSet::get_clone_bytes(snapid_t clone) const
5914{
11fdf7f2 5915 ceph_assert(clone_size.count(clone));
7c673cae 5916 uint64_t size = clone_size.find(clone)->second;
11fdf7f2 5917 ceph_assert(clone_overlap.count(clone));
7c673cae 5918 const interval_set<uint64_t> &overlap = clone_overlap.find(clone)->second;
11fdf7f2
TL
5919 ceph_assert(size >= (uint64_t)overlap.size());
5920 return size - overlap.size();
7c673cae
FG
5921}
5922
5923void SnapSet::filter(const pg_pool_t &pinfo)
5924{
5925 vector<snapid_t> oldsnaps;
5926 oldsnaps.swap(snaps);
9f95a23c 5927 for (auto i = oldsnaps.cbegin(); i != oldsnaps.cend(); ++i) {
7c673cae
FG
5928 if (!pinfo.is_removed_snap(*i))
5929 snaps.push_back(*i);
5930 }
5931}
5932
5933SnapSet SnapSet::get_filtered(const pg_pool_t &pinfo) const
5934{
5935 SnapSet ss = *this;
5936 ss.filter(pinfo);
5937 return ss;
5938}
5939
5940// -- watch_info_t --
5941
9f95a23c 5942void watch_info_t::encode(ceph::buffer::list& bl, uint64_t features) const
7c673cae
FG
5943{
5944 ENCODE_START(4, 3, bl);
11fdf7f2
TL
5945 encode(cookie, bl);
5946 encode(timeout_seconds, bl);
5947 encode(addr, bl, features);
7c673cae
FG
5948 ENCODE_FINISH(bl);
5949}
5950
9f95a23c 5951void watch_info_t::decode(ceph::buffer::list::const_iterator& bl)
7c673cae
FG
5952{
5953 DECODE_START_LEGACY_COMPAT_LEN(4, 3, 3, bl);
11fdf7f2 5954 decode(cookie, bl);
7c673cae
FG
5955 if (struct_v < 2) {
5956 uint64_t ver;
11fdf7f2 5957 decode(ver, bl);
7c673cae 5958 }
11fdf7f2 5959 decode(timeout_seconds, bl);
7c673cae 5960 if (struct_v >= 4) {
11fdf7f2 5961 decode(addr, bl);
7c673cae
FG
5962 }
5963 DECODE_FINISH(bl);
5964}
5965
5966void watch_info_t::dump(Formatter *f) const
5967{
5968 f->dump_unsigned("cookie", cookie);
5969 f->dump_unsigned("timeout_seconds", timeout_seconds);
5970 f->open_object_section("addr");
5971 addr.dump(f);
5972 f->close_section();
5973}
5974
5975void watch_info_t::generate_test_instances(list<watch_info_t*>& o)
5976{
5977 o.push_back(new watch_info_t);
5978 o.push_back(new watch_info_t);
5979 o.back()->cookie = 123;
5980 o.back()->timeout_seconds = 99;
5981 entity_addr_t ea;
5982 ea.set_type(entity_addr_t::TYPE_LEGACY);
5983 ea.set_nonce(1);
5984 ea.set_family(AF_INET);
5985 ea.set_in4_quad(0, 127);
5986 ea.set_in4_quad(1, 0);
5987 ea.set_in4_quad(2, 1);
5988 ea.set_in4_quad(3, 2);
5989 ea.set_port(2);
5990 o.back()->addr = ea;
5991}
5992
11fdf7f2
TL
5993// -- chunk_info_t --
5994
9f95a23c 5995void chunk_info_t::encode(ceph::buffer::list& bl) const
11fdf7f2
TL
5996{
5997 ENCODE_START(1, 1, bl);
5998 encode(offset, bl);
5999 encode(length, bl);
6000 encode(oid, bl);
6001 __u32 _flags = flags;
6002 encode(_flags, bl);
6003 ENCODE_FINISH(bl);
6004}
6005
9f95a23c 6006void chunk_info_t::decode(ceph::buffer::list::const_iterator& bl)
11fdf7f2
TL
6007{
6008 DECODE_START(1, bl);
6009 decode(offset, bl);
6010 decode(length, bl);
6011 decode(oid, bl);
6012 __u32 _flags;
6013 decode(_flags, bl);
6014 flags = (cflag_t)_flags;
6015 DECODE_FINISH(bl);
6016}
6017
6018void chunk_info_t::dump(Formatter *f) const
6019{
6020 f->dump_unsigned("length", length);
6021 f->open_object_section("oid");
6022 oid.dump(f);
6023 f->close_section();
6024 f->dump_unsigned("flags", flags);
6025}
6026
f67539c2
TL
6027
6028bool chunk_info_t::operator==(const chunk_info_t& cit) const
6029{
6030 if (has_fingerprint()) {
6031 if (oid.oid.name == cit.oid.oid.name) {
6032 return true;
6033 }
6034 } else {
6035 if (offset == cit.offset && length == cit.length &&
6036 oid.oid.name == cit.oid.oid.name) {
6037 return true;
6038 }
6039
6040 }
6041 return false;
6042}
6043
6044bool operator==(const std::pair<const long unsigned int, chunk_info_t> & l,
6045 const std::pair<const long unsigned int, chunk_info_t> & r)
6046{
6047 return l.first == r.first &&
6048 l.second == r.second;
6049}
6050
11fdf7f2
TL
6051ostream& operator<<(ostream& out, const chunk_info_t& ci)
6052{
6053 return out << "(len: " << ci.length << " oid: " << ci.oid
6054 << " offset: " << ci.offset
6055 << " flags: " << ci.get_flag_string(ci.flags) << ")";
6056}
6057
31f18b77
FG
6058// -- object_manifest_t --
6059
f67539c2
TL
6060std::ostream& operator<<(std::ostream& out, const object_ref_delta_t & ci)
6061{
6062 return out << ci.ref_delta << std::endl;
6063}
6064
6065void object_manifest_t::calc_refs_to_inc_on_set(
6066 const object_manifest_t* _g,
6067 const object_manifest_t* _l,
6068 object_ref_delta_t &refs) const
6069{
6070 /* avoid to increment the same reference on adjacent clones */
6071 auto iter = chunk_map.begin();
6072 auto find_chunk = [](decltype(iter) &i, const object_manifest_t* cur)
6073 -> bool {
6074 if (cur) {
6075 auto c = cur->chunk_map.find(i->first);
6076 if (c != cur->chunk_map.end() && c->second == i->second) {
6077 return true;
6078
6079 }
6080 }
6081 return false;
6082 };
6083
6084 /* If at least a same chunk exists on either _g or _l, do not increment
6085 * the reference
6086 *
6087 * head: [0, 2) ccc, [6, 2) bbb, [8, 2) ccc
6088 * 20: [0, 2) aaa, <- set_chunk
6089 * 30: [0, 2) abc, [6, 2) bbb, [8, 2) ccc
6090 * --> incremnt the reference
6091 *
6092 * head: [0, 2) ccc, [6, 2) bbb, [8, 2) ccc
6093 * 20: [0, 2) ccc, <- set_chunk
6094 * 30: [0, 2) abc, [6, 2) bbb, [8, 2) ccc
6095 * --> do not need to increment
6096 *
6097 * head: [0, 2) ccc, [6, 2) bbb, [8, 2) ccc
6098 * 20: [0, 2) ccc, <- set_chunk
6099 * 30: [0, 2) ccc, [6, 2) bbb, [8, 2) ccc
6100 * --> decrement the reference of ccc
6101 *
6102 */
6103 for (; iter != chunk_map.end(); ++iter) {
6104 auto found_g = find_chunk(iter, _g);
6105 auto found_l = find_chunk(iter, _l);
6106 if (!found_g && !found_l) {
6107 refs.inc_ref(iter->second.oid);
6108 } else if (found_g && found_l) {
6109 refs.dec_ref(iter->second.oid);
6110 }
6111 }
6112}
6113
6114void object_manifest_t::calc_refs_to_drop_on_modify(
6115 const object_manifest_t* _l,
6116 const ObjectCleanRegions& clean_regions,
6117 object_ref_delta_t &refs) const
6118{
6119 for (auto &p : chunk_map) {
6120 if (!clean_regions.is_clean_region(p.first, p.second.length)) {
6121 // has previous snapshot
6122 if (_l) {
6123 /*
6124 * Let's assume that there is a manifest snapshotted object which has three chunks
6125 * head: [0, 2) aaa, [6, 2) bbb, [8, 2) ccc
6126 * 20: [0, 2) aaa, [6, 2) bbb, [8, 2) ccc
6127 *
6128 * If we modify [6, 2) at head, we shouldn't decrement bbb's refcount because
6129 * 20 has the reference for bbb. Therefore, we only drop the reference if two chunks
6130 * (head: [6, 2) and 20: [6, 2)) are different.
6131 *
6132 */
6133 auto c = _l->chunk_map.find(p.first);
6134 if (c != _l->chunk_map.end()) {
6135 if (p.second == c->second) {
6136 continue;
6137 }
6138 }
6139 refs.dec_ref(p.second.oid);
6140 } else {
6141 // decrement the reference of the updated chunks if the manifest object has no snapshot
6142 refs.dec_ref(p.second.oid);
6143 }
6144 }
6145 }
6146}
6147
6148void object_manifest_t::calc_refs_to_drop_on_removal(
6149 const object_manifest_t* _g,
6150 const object_manifest_t* _l,
6151 object_ref_delta_t &refs) const
6152{
6153 /* At a high level, the rule is that consecutive clones with the same reference
6154 * at the same offset share a reference. As such, removing *this may result
6155 * in removing references in two cases:
6156 * 1) *this has a reference which it shares with neither _g nor _l
6157 * 2) _g and _l have a reference which they share with each other but not
6158 * *this.
6159 *
6160 * For a particular offset, both 1 and 2 can happen.
6161 *
6162 * Notably, this means that to evaluate the reference change from removing
6163 * the object with *this, we only need to look at the two adjacent clones.
6164 */
6165
6166 // Paper over possibly missing _g or _l -- nullopt is semantically the same
6167 // as an empty chunk_map
6168 static const object_manifest_t empty;
6169 const object_manifest_t &g = _g ? *_g : empty;
6170 const object_manifest_t &l = _l ? *_l : empty;
6171
6172 auto giter = g.chunk_map.begin();
6173 auto iter = chunk_map.begin();
6174 auto liter = l.chunk_map.begin();
6175
6176 // Translate iter, map pair to the current offset, end() -> max
6177 auto get_offset = [](decltype(iter) &i, const object_manifest_t &manifest)
6178 -> uint64_t {
6179 return i == manifest.chunk_map.end() ?
6180 std::numeric_limits<uint64_t>::max() : i->first;
6181 };
6182
6183 /* If current matches the offset at iter, returns the chunk at *iter
6184 * and increments iter. Otherwise, returns nullptr.
6185 *
6186 * current will always be derived from the min of *giter, *iter, and
6187 * *liter on each cycle, so the result will be that each loop iteration
6188 * will pick up all chunks at the offest being considered, each offset
6189 * will be considered once, and all offsets will be considered.
6190 */
6191 auto get_chunk = [](
6192 uint64_t current, decltype(iter) &i, const object_manifest_t &manifest)
6193 -> const chunk_info_t * {
6194 if (i == manifest.chunk_map.end() || current != i->first) {
6195 return nullptr;
6196 } else {
6197 return &(i++)->second;
6198 }
6199 };
6200
6201 while (giter != g.chunk_map.end() ||
6202 iter != chunk_map.end() ||
6203 liter != l.chunk_map.end()) {
6204 auto current = std::min(
6205 std::min(get_offset(giter, g), get_offset(iter, *this)),
6206 get_offset(liter, l));
6207
6208 auto gchunk = get_chunk(current, giter, g);
6209 auto chunk = get_chunk(current, iter, *this);
6210 auto lchunk = get_chunk(current, liter, l);
6211
6212 if (gchunk && lchunk && *gchunk == *lchunk &&
6213 (!chunk || *gchunk != *chunk)) {
6214 // case 1 from above: l and g match, chunk does not
6215 refs.dec_ref(gchunk->oid);
6216 }
6217
6218 if (chunk &&
6219 (!gchunk || chunk->oid != gchunk->oid) &&
6220 (!lchunk || chunk->oid != lchunk->oid)) {
6221 // case 2 from above: *this matches neither
6222 refs.dec_ref(chunk->oid);
6223 }
6224 }
6225}
6226
9f95a23c 6227void object_manifest_t::encode(ceph::buffer::list& bl) const
31f18b77
FG
6228{
6229 ENCODE_START(1, 1, bl);
11fdf7f2 6230 encode(type, bl);
31f18b77
FG
6231 switch (type) {
6232 case TYPE_NONE: break;
6233 case TYPE_REDIRECT:
11fdf7f2
TL
6234 encode(redirect_target, bl);
6235 break;
6236 case TYPE_CHUNKED:
9f95a23c 6237 encode(chunk_map, bl);
31f18b77
FG
6238 break;
6239 default:
6240 ceph_abort();
6241 }
6242 ENCODE_FINISH(bl);
6243}
6244
9f95a23c 6245void object_manifest_t::decode(ceph::buffer::list::const_iterator& bl)
31f18b77
FG
6246{
6247 DECODE_START(1, bl);
11fdf7f2 6248 decode(type, bl);
31f18b77
FG
6249 switch (type) {
6250 case TYPE_NONE: break;
6251 case TYPE_REDIRECT:
11fdf7f2
TL
6252 decode(redirect_target, bl);
6253 break;
6254 case TYPE_CHUNKED:
6255 decode(chunk_map, bl);
31f18b77
FG
6256 break;
6257 default:
6258 ceph_abort();
6259 }
6260 DECODE_FINISH(bl);
6261}
6262
6263void object_manifest_t::dump(Formatter *f) const
6264{
6265 f->dump_unsigned("type", type);
11fdf7f2
TL
6266 if (type == TYPE_REDIRECT) {
6267 f->open_object_section("redirect_target");
6268 redirect_target.dump(f);
6269 f->close_section();
6270 } else if (type == TYPE_CHUNKED) {
6271 f->open_array_section("chunk_map");
6272 for (auto& p : chunk_map) {
6273 f->open_object_section("chunk");
6274 f->dump_unsigned("offset", p.first);
6275 p.second.dump(f);
6276 f->close_section();
6277 }
6278 f->close_section();
6279 }
31f18b77
FG
6280}
6281
6282void object_manifest_t::generate_test_instances(list<object_manifest_t*>& o)
6283{
6284 o.push_back(new object_manifest_t());
6285 o.back()->type = TYPE_REDIRECT;
6286}
6287
6288ostream& operator<<(ostream& out, const object_manifest_t& om)
6289{
11fdf7f2
TL
6290 out << "manifest(" << om.get_type_name();
6291 if (om.is_redirect()) {
6292 out << " " << om.redirect_target;
6293 } else if (om.is_chunked()) {
6294 out << " " << om.chunk_map;
6295 }
6296 out << ")";
6297 return out;
31f18b77 6298}
7c673cae
FG
6299
6300// -- object_info_t --
6301
6302void object_info_t::copy_user_bits(const object_info_t& other)
6303{
6304 // these bits are copied from head->clone.
6305 size = other.size;
6306 mtime = other.mtime;
6307 local_mtime = other.local_mtime;
6308 last_reqid = other.last_reqid;
6309 truncate_seq = other.truncate_seq;
6310 truncate_size = other.truncate_size;
6311 flags = other.flags;
6312 user_version = other.user_version;
6313 data_digest = other.data_digest;
6314 omap_digest = other.omap_digest;
6315}
6316
9f95a23c 6317void object_info_t::encode(ceph::buffer::list& bl, uint64_t features) const
7c673cae
FG
6318{
6319 object_locator_t myoloc(soid);
6320 map<entity_name_t, watch_info_t> old_watchers;
9f95a23c 6321 for (auto i = watchers.cbegin(); i != watchers.cend(); ++i) {
7c673cae
FG
6322 old_watchers.insert(make_pair(i->first.second, i->second));
6323 }
31f18b77 6324 ENCODE_START(17, 8, bl);
11fdf7f2
TL
6325 encode(soid, bl);
6326 encode(myoloc, bl); //Retained for compatibility
6327 encode((__u32)0, bl); // was category, no longer used
6328 encode(version, bl);
6329 encode(prior_version, bl);
6330 encode(last_reqid, bl);
6331 encode(size, bl);
6332 encode(mtime, bl);
7c673cae 6333 if (soid.snap == CEPH_NOSNAP)
11fdf7f2 6334 encode(osd_reqid_t(), bl); // used to be wrlock_by
7c673cae 6335 else
11fdf7f2
TL
6336 encode((uint32_t)0, bl); // was legacy_snaps
6337 encode(truncate_seq, bl);
6338 encode(truncate_size, bl);
6339 encode(is_lost(), bl);
6340 encode(old_watchers, bl, features);
7c673cae
FG
6341 /* shenanigans to avoid breaking backwards compatibility in the disk format.
6342 * When we can, switch this out for simply putting the version_t on disk. */
6343 eversion_t user_eversion(0, user_version);
11fdf7f2
TL
6344 encode(user_eversion, bl);
6345 encode(test_flag(FLAG_USES_TMAP), bl);
6346 encode(watchers, bl, features);
7c673cae 6347 __u32 _flags = flags;
11fdf7f2
TL
6348 encode(_flags, bl);
6349 encode(local_mtime, bl);
6350 encode(data_digest, bl);
6351 encode(omap_digest, bl);
6352 encode(expected_object_size, bl);
6353 encode(expected_write_size, bl);
6354 encode(alloc_hint_flags, bl);
31f18b77 6355 if (has_manifest()) {
11fdf7f2 6356 encode(manifest, bl);
31f18b77 6357 }
7c673cae
FG
6358 ENCODE_FINISH(bl);
6359}
6360
9f95a23c 6361void object_info_t::decode(ceph::buffer::list::const_iterator& bl)
7c673cae
FG
6362{
6363 object_locator_t myoloc;
31f18b77 6364 DECODE_START_LEGACY_COMPAT_LEN(17, 8, 8, bl);
7c673cae 6365 map<entity_name_t, watch_info_t> old_watchers;
11fdf7f2
TL
6366 decode(soid, bl);
6367 decode(myoloc, bl);
7c673cae
FG
6368 {
6369 string category;
11fdf7f2 6370 decode(category, bl); // no longer used
7c673cae 6371 }
11fdf7f2
TL
6372 decode(version, bl);
6373 decode(prior_version, bl);
6374 decode(last_reqid, bl);
6375 decode(size, bl);
6376 decode(mtime, bl);
7c673cae
FG
6377 if (soid.snap == CEPH_NOSNAP) {
6378 osd_reqid_t wrlock_by;
11fdf7f2 6379 decode(wrlock_by, bl);
7c673cae 6380 } else {
11fdf7f2
TL
6381 vector<snapid_t> legacy_snaps;
6382 decode(legacy_snaps, bl);
7c673cae 6383 }
11fdf7f2
TL
6384 decode(truncate_seq, bl);
6385 decode(truncate_size, bl);
7c673cae
FG
6386
6387 // if this is struct_v >= 13, we will overwrite this
6388 // below since this field is just here for backwards
6389 // compatibility
6390 __u8 lo;
11fdf7f2 6391 decode(lo, bl);
7c673cae
FG
6392 flags = (flag_t)lo;
6393
11fdf7f2 6394 decode(old_watchers, bl);
7c673cae 6395 eversion_t user_eversion;
11fdf7f2 6396 decode(user_eversion, bl);
7c673cae
FG
6397 user_version = user_eversion.version;
6398
6399 if (struct_v >= 9) {
6400 bool uses_tmap = false;
11fdf7f2 6401 decode(uses_tmap, bl);
7c673cae
FG
6402 if (uses_tmap)
6403 set_flag(FLAG_USES_TMAP);
6404 } else {
6405 set_flag(FLAG_USES_TMAP);
6406 }
6407 if (struct_v < 10)
6408 soid.pool = myoloc.pool;
6409 if (struct_v >= 11) {
11fdf7f2 6410 decode(watchers, bl);
7c673cae 6411 } else {
9f95a23c 6412 for (auto i = old_watchers.begin(); i != old_watchers.end(); ++i) {
7c673cae
FG
6413 watchers.insert(
6414 make_pair(
6415 make_pair(i->second.cookie, i->first), i->second));
6416 }
6417 }
6418 if (struct_v >= 13) {
6419 __u32 _flags;
11fdf7f2 6420 decode(_flags, bl);
7c673cae
FG
6421 flags = (flag_t)_flags;
6422 }
6423 if (struct_v >= 14) {
11fdf7f2 6424 decode(local_mtime, bl);
7c673cae
FG
6425 } else {
6426 local_mtime = utime_t();
6427 }
6428 if (struct_v >= 15) {
11fdf7f2
TL
6429 decode(data_digest, bl);
6430 decode(omap_digest, bl);
7c673cae
FG
6431 } else {
6432 data_digest = omap_digest = -1;
6433 clear_flag(FLAG_DATA_DIGEST);
6434 clear_flag(FLAG_OMAP_DIGEST);
6435 }
6436 if (struct_v >= 16) {
11fdf7f2
TL
6437 decode(expected_object_size, bl);
6438 decode(expected_write_size, bl);
6439 decode(alloc_hint_flags, bl);
7c673cae
FG
6440 } else {
6441 expected_object_size = 0;
6442 expected_write_size = 0;
6443 alloc_hint_flags = 0;
6444 }
31f18b77
FG
6445 if (struct_v >= 17) {
6446 if (has_manifest()) {
11fdf7f2 6447 decode(manifest, bl);
31f18b77
FG
6448 }
6449 }
7c673cae
FG
6450 DECODE_FINISH(bl);
6451}
6452
6453void object_info_t::dump(Formatter *f) const
6454{
6455 f->open_object_section("oid");
6456 soid.dump(f);
6457 f->close_section();
6458 f->dump_stream("version") << version;
6459 f->dump_stream("prior_version") << prior_version;
6460 f->dump_stream("last_reqid") << last_reqid;
6461 f->dump_unsigned("user_version", user_version);
6462 f->dump_unsigned("size", size);
6463 f->dump_stream("mtime") << mtime;
6464 f->dump_stream("local_mtime") << local_mtime;
6465 f->dump_unsigned("lost", (int)is_lost());
94b18763
FG
6466 vector<string> sv = get_flag_vector(flags);
6467 f->open_array_section("flags");
20effc67 6468 for (const auto& str: sv) {
94b18763 6469 f->dump_string("flags", str);
20effc67 6470 }
94b18763 6471 f->close_section();
7c673cae
FG
6472 f->dump_unsigned("truncate_seq", truncate_seq);
6473 f->dump_unsigned("truncate_size", truncate_size);
94b18763
FG
6474 f->dump_format("data_digest", "0x%08x", data_digest);
6475 f->dump_format("omap_digest", "0x%08x", omap_digest);
7c673cae
FG
6476 f->dump_unsigned("expected_object_size", expected_object_size);
6477 f->dump_unsigned("expected_write_size", expected_write_size);
6478 f->dump_unsigned("alloc_hint_flags", alloc_hint_flags);
31f18b77 6479 f->dump_object("manifest", manifest);
7c673cae 6480 f->open_object_section("watchers");
9f95a23c 6481 for (auto p = watchers.cbegin(); p != watchers.cend(); ++p) {
f67539c2
TL
6482 CachedStackStringStream css;
6483 *css << p->first.second;
6484 f->open_object_section(css->strv());
7c673cae
FG
6485 p->second.dump(f);
6486 f->close_section();
6487 }
6488 f->close_section();
6489}
6490
6491void object_info_t::generate_test_instances(list<object_info_t*>& o)
6492{
6493 o.push_back(new object_info_t());
6494
6495 // fixme
6496}
6497
6498
6499ostream& operator<<(ostream& out, const object_info_t& oi)
6500{
6501 out << oi.soid << "(" << oi.version
6502 << " " << oi.last_reqid;
7c673cae
FG
6503 if (oi.flags)
6504 out << " " << oi.get_flag_string();
6505 out << " s " << oi.size;
6506 out << " uv " << oi.user_version;
6507 if (oi.is_data_digest())
6508 out << " dd " << std::hex << oi.data_digest << std::dec;
6509 if (oi.is_omap_digest())
6510 out << " od " << std::hex << oi.omap_digest << std::dec;
6511 out << " alloc_hint [" << oi.expected_object_size
6512 << " " << oi.expected_write_size
6513 << " " << oi.alloc_hint_flags << "]";
31f18b77
FG
6514 if (oi.has_manifest())
6515 out << " " << oi.manifest;
7c673cae
FG
6516 out << ")";
6517 return out;
6518}
6519
6520// -- ObjectRecovery --
9f95a23c 6521void ObjectRecoveryProgress::encode(ceph::buffer::list &bl) const
7c673cae
FG
6522{
6523 ENCODE_START(1, 1, bl);
11fdf7f2
TL
6524 encode(first, bl);
6525 encode(data_complete, bl);
6526 encode(data_recovered_to, bl);
6527 encode(omap_recovered_to, bl);
6528 encode(omap_complete, bl);
7c673cae
FG
6529 ENCODE_FINISH(bl);
6530}
6531
9f95a23c 6532void ObjectRecoveryProgress::decode(ceph::buffer::list::const_iterator &bl)
7c673cae
FG
6533{
6534 DECODE_START(1, bl);
11fdf7f2
TL
6535 decode(first, bl);
6536 decode(data_complete, bl);
6537 decode(data_recovered_to, bl);
6538 decode(omap_recovered_to, bl);
6539 decode(omap_complete, bl);
7c673cae
FG
6540 DECODE_FINISH(bl);
6541}
6542
6543ostream &operator<<(ostream &out, const ObjectRecoveryProgress &prog)
6544{
6545 return prog.print(out);
6546}
6547
6548void ObjectRecoveryProgress::generate_test_instances(
6549 list<ObjectRecoveryProgress*>& o)
6550{
6551 o.push_back(new ObjectRecoveryProgress);
6552 o.back()->first = false;
6553 o.back()->data_complete = true;
6554 o.back()->omap_complete = true;
6555 o.back()->data_recovered_to = 100;
6556
6557 o.push_back(new ObjectRecoveryProgress);
6558 o.back()->first = true;
6559 o.back()->data_complete = false;
6560 o.back()->omap_complete = false;
6561 o.back()->data_recovered_to = 0;
6562}
6563
6564ostream &ObjectRecoveryProgress::print(ostream &out) const
6565{
6566 return out << "ObjectRecoveryProgress("
6567 << ( first ? "" : "!" ) << "first, "
6568 << "data_recovered_to:" << data_recovered_to
6569 << ", data_complete:" << ( data_complete ? "true" : "false" )
6570 << ", omap_recovered_to:" << omap_recovered_to
6571 << ", omap_complete:" << ( omap_complete ? "true" : "false" )
224ce89b 6572 << ", error:" << ( error ? "true" : "false" )
7c673cae
FG
6573 << ")";
6574}
6575
6576void ObjectRecoveryProgress::dump(Formatter *f) const
6577{
6578 f->dump_int("first?", first);
6579 f->dump_int("data_complete?", data_complete);
6580 f->dump_unsigned("data_recovered_to", data_recovered_to);
6581 f->dump_int("omap_complete?", omap_complete);
6582 f->dump_string("omap_recovered_to", omap_recovered_to);
6583}
6584
9f95a23c 6585void ObjectRecoveryInfo::encode(ceph::buffer::list &bl, uint64_t features) const
7c673cae 6586{
9f95a23c 6587 ENCODE_START(3, 1, bl);
11fdf7f2
TL
6588 encode(soid, bl);
6589 encode(version, bl);
6590 encode(size, bl);
6591 encode(oi, bl, features);
6592 encode(ss, bl);
6593 encode(copy_subset, bl);
6594 encode(clone_subset, bl);
9f95a23c 6595 encode(object_exist, bl);
7c673cae
FG
6596 ENCODE_FINISH(bl);
6597}
6598
9f95a23c 6599void ObjectRecoveryInfo::decode(ceph::buffer::list::const_iterator &bl,
7c673cae
FG
6600 int64_t pool)
6601{
9f95a23c 6602 DECODE_START(3, bl);
11fdf7f2
TL
6603 decode(soid, bl);
6604 decode(version, bl);
6605 decode(size, bl);
6606 decode(oi, bl);
6607 decode(ss, bl);
6608 decode(copy_subset, bl);
6609 decode(clone_subset, bl);
9f95a23c
TL
6610 if (struct_v > 2)
6611 decode(object_exist, bl);
6612 else
6613 object_exist = false;
7c673cae 6614 DECODE_FINISH(bl);
7c673cae
FG
6615 if (struct_v < 2) {
6616 if (!soid.is_max() && soid.pool == -1)
6617 soid.pool = pool;
6618 map<hobject_t, interval_set<uint64_t>> tmp;
6619 tmp.swap(clone_subset);
9f95a23c 6620 for (auto i = tmp.begin(); i != tmp.end(); ++i) {
7c673cae
FG
6621 hobject_t first(i->first);
6622 if (!first.is_max() && first.pool == -1)
6623 first.pool = pool;
6624 clone_subset[first].swap(i->second);
6625 }
6626 }
6627}
6628
6629void ObjectRecoveryInfo::generate_test_instances(
6630 list<ObjectRecoveryInfo*>& o)
6631{
6632 o.push_back(new ObjectRecoveryInfo);
6633 o.back()->soid = hobject_t(sobject_t("key", CEPH_NOSNAP));
6634 o.back()->version = eversion_t(0,0);
6635 o.back()->size = 100;
9f95a23c 6636 o.back()->object_exist = false;
7c673cae
FG
6637}
6638
6639
6640void ObjectRecoveryInfo::dump(Formatter *f) const
6641{
6642 f->dump_stream("object") << soid;
6643 f->dump_stream("at_version") << version;
6644 f->dump_stream("size") << size;
6645 {
6646 f->open_object_section("object_info");
6647 oi.dump(f);
6648 f->close_section();
6649 }
6650 {
6651 f->open_object_section("snapset");
6652 ss.dump(f);
6653 f->close_section();
6654 }
6655 f->dump_stream("copy_subset") << copy_subset;
6656 f->dump_stream("clone_subset") << clone_subset;
9f95a23c 6657 f->dump_stream("object_exist") << object_exist;
7c673cae
FG
6658}
6659
6660ostream& operator<<(ostream& out, const ObjectRecoveryInfo &inf)
6661{
6662 return inf.print(out);
6663}
6664
6665ostream &ObjectRecoveryInfo::print(ostream &out) const
6666{
6667 return out << "ObjectRecoveryInfo("
6668 << soid << "@" << version
6669 << ", size: " << size
6670 << ", copy_subset: " << copy_subset
6671 << ", clone_subset: " << clone_subset
6672 << ", snapset: " << ss
9f95a23c 6673 << ", object_exist: " << object_exist
7c673cae
FG
6674 << ")";
6675}
6676
6677// -- PushReplyOp --
6678void PushReplyOp::generate_test_instances(list<PushReplyOp*> &o)
6679{
6680 o.push_back(new PushReplyOp);
6681 o.push_back(new PushReplyOp);
6682 o.back()->soid = hobject_t(sobject_t("asdf", 2));
6683 o.push_back(new PushReplyOp);
6684 o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP));
6685}
6686
9f95a23c 6687void PushReplyOp::encode(ceph::buffer::list &bl) const
7c673cae
FG
6688{
6689 ENCODE_START(1, 1, bl);
11fdf7f2 6690 encode(soid, bl);
7c673cae
FG
6691 ENCODE_FINISH(bl);
6692}
6693
9f95a23c 6694void PushReplyOp::decode(ceph::buffer::list::const_iterator &bl)
7c673cae
FG
6695{
6696 DECODE_START(1, bl);
11fdf7f2 6697 decode(soid, bl);
7c673cae
FG
6698 DECODE_FINISH(bl);
6699}
6700
6701void PushReplyOp::dump(Formatter *f) const
6702{
6703 f->dump_stream("soid") << soid;
6704}
6705
6706ostream &PushReplyOp::print(ostream &out) const
6707{
6708 return out
6709 << "PushReplyOp(" << soid
6710 << ")";
6711}
6712
6713ostream& operator<<(ostream& out, const PushReplyOp &op)
6714{
6715 return op.print(out);
6716}
6717
6718uint64_t PushReplyOp::cost(CephContext *cct) const
6719{
1e59de90
TL
6720 if (cct->_conf->osd_op_queue == "mclock_scheduler") {
6721 /* In general, we really never want to throttle PushReplyOp messages.
6722 * As long as the object is smaller than osd_recovery_max_chunk (8M at
6723 * time of writing this comment, so this is basically always true),
6724 * processing the PushReplyOp does not cost any further IO and simply
6725 * permits the object once more to be written to.
6726 *
6727 * In the unlikely event that the object is larger than
6728 * osd_recovery_max_chunk (again, 8M at the moment, so never for common
6729 * configurations of rbd and virtually never for cephfs and rgw),
6730 * we *still* want to push out the next portion immediately so that we can
6731 * release the object for IO.
6732 *
6733 * The throttling for this operation on the primary occurs at the point
6734 * where we queue the PGRecoveryContext which calls into recover_missing
6735 * and recover_backfill to initiate pushes.
6736 * See OSD::queue_recovery_context.
6737 */
6738 return 1;
6739 } else {
6740 /* We retain this legacy behavior for WeightedPriorityQueue. It seems to
6741 * require very large costs for several messages in order to do any
6742 * meaningful amount of throttling. This branch should be removed after
6743 * Reef.
6744 */
6745 return cct->_conf->osd_push_per_object_cost +
6746 cct->_conf->osd_recovery_max_chunk;
6747 }
7c673cae
FG
6748}
6749
6750// -- PullOp --
6751void PullOp::generate_test_instances(list<PullOp*> &o)
6752{
6753 o.push_back(new PullOp);
6754 o.push_back(new PullOp);
6755 o.back()->soid = hobject_t(sobject_t("asdf", 2));
6756 o.back()->recovery_info.version = eversion_t(3, 10);
6757 o.push_back(new PullOp);
6758 o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP));
6759 o.back()->recovery_info.version = eversion_t(0, 0);
6760}
6761
9f95a23c 6762void PullOp::encode(ceph::buffer::list &bl, uint64_t features) const
7c673cae
FG
6763{
6764 ENCODE_START(1, 1, bl);
11fdf7f2
TL
6765 encode(soid, bl);
6766 encode(recovery_info, bl, features);
6767 encode(recovery_progress, bl);
7c673cae
FG
6768 ENCODE_FINISH(bl);
6769}
6770
9f95a23c 6771void PullOp::decode(ceph::buffer::list::const_iterator &bl)
7c673cae
FG
6772{
6773 DECODE_START(1, bl);
11fdf7f2
TL
6774 decode(soid, bl);
6775 decode(recovery_info, bl);
6776 decode(recovery_progress, bl);
7c673cae
FG
6777 DECODE_FINISH(bl);
6778}
6779
6780void PullOp::dump(Formatter *f) const
6781{
6782 f->dump_stream("soid") << soid;
6783 {
6784 f->open_object_section("recovery_info");
6785 recovery_info.dump(f);
6786 f->close_section();
6787 }
6788 {
6789 f->open_object_section("recovery_progress");
6790 recovery_progress.dump(f);
6791 f->close_section();
6792 }
6793}
6794
6795ostream &PullOp::print(ostream &out) const
6796{
6797 return out
6798 << "PullOp(" << soid
6799 << ", recovery_info: " << recovery_info
6800 << ", recovery_progress: " << recovery_progress
6801 << ")";
6802}
6803
6804ostream& operator<<(ostream& out, const PullOp &op)
6805{
6806 return op.print(out);
6807}
6808
6809uint64_t PullOp::cost(CephContext *cct) const
6810{
1e59de90
TL
6811 if (cct->_conf->osd_op_queue == "mclock_scheduler") {
6812 return std::clamp<uint64_t>(
6813 recovery_progress.estimate_remaining_data_to_recover(recovery_info),
6814 1,
6815 cct->_conf->osd_recovery_max_chunk);
6816 } else {
6817 /* We retain this legacy behavior for WeightedPriorityQueue. It seems to
6818 * require very large costs for several messages in order to do any
6819 * meaningful amount of throttling. This branch should be removed after
6820 * Reef.
6821 */
6822 return cct->_conf->osd_push_per_object_cost +
6823 cct->_conf->osd_recovery_max_chunk;
6824 }
7c673cae
FG
6825}
6826
6827// -- PushOp --
6828void PushOp::generate_test_instances(list<PushOp*> &o)
6829{
6830 o.push_back(new PushOp);
6831 o.push_back(new PushOp);
6832 o.back()->soid = hobject_t(sobject_t("asdf", 2));
6833 o.back()->version = eversion_t(3, 10);
6834 o.push_back(new PushOp);
6835 o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP));
6836 o.back()->version = eversion_t(0, 0);
6837}
6838
9f95a23c 6839void PushOp::encode(ceph::buffer::list &bl, uint64_t features) const
7c673cae
FG
6840{
6841 ENCODE_START(1, 1, bl);
11fdf7f2
TL
6842 encode(soid, bl);
6843 encode(version, bl);
6844 encode(data, bl);
6845 encode(data_included, bl);
6846 encode(omap_header, bl);
6847 encode(omap_entries, bl);
6848 encode(attrset, bl);
6849 encode(recovery_info, bl, features);
6850 encode(after_progress, bl);
6851 encode(before_progress, bl);
7c673cae
FG
6852 ENCODE_FINISH(bl);
6853}
6854
9f95a23c 6855void PushOp::decode(ceph::buffer::list::const_iterator &bl)
7c673cae
FG
6856{
6857 DECODE_START(1, bl);
11fdf7f2
TL
6858 decode(soid, bl);
6859 decode(version, bl);
6860 decode(data, bl);
6861 decode(data_included, bl);
6862 decode(omap_header, bl);
6863 decode(omap_entries, bl);
6864 decode(attrset, bl);
6865 decode(recovery_info, bl);
6866 decode(after_progress, bl);
6867 decode(before_progress, bl);
7c673cae
FG
6868 DECODE_FINISH(bl);
6869}
6870
6871void PushOp::dump(Formatter *f) const
6872{
6873 f->dump_stream("soid") << soid;
6874 f->dump_stream("version") << version;
6875 f->dump_int("data_len", data.length());
6876 f->dump_stream("data_included") << data_included;
6877 f->dump_int("omap_header_len", omap_header.length());
6878 f->dump_int("omap_entries_len", omap_entries.size());
6879 f->dump_int("attrset_len", attrset.size());
6880 {
6881 f->open_object_section("recovery_info");
6882 recovery_info.dump(f);
6883 f->close_section();
6884 }
6885 {
6886 f->open_object_section("after_progress");
6887 after_progress.dump(f);
6888 f->close_section();
6889 }
6890 {
6891 f->open_object_section("before_progress");
6892 before_progress.dump(f);
6893 f->close_section();
6894 }
6895}
6896
6897ostream &PushOp::print(ostream &out) const
6898{
6899 return out
6900 << "PushOp(" << soid
6901 << ", version: " << version
6902 << ", data_included: " << data_included
6903 << ", data_size: " << data.length()
6904 << ", omap_header_size: " << omap_header.length()
6905 << ", omap_entries_size: " << omap_entries.size()
6906 << ", attrset_size: " << attrset.size()
6907 << ", recovery_info: " << recovery_info
6908 << ", after_progress: " << after_progress
6909 << ", before_progress: " << before_progress
6910 << ")";
6911}
6912
6913ostream& operator<<(ostream& out, const PushOp &op)
6914{
6915 return op.print(out);
6916}
6917
6918uint64_t PushOp::cost(CephContext *cct) const
6919{
6920 uint64_t cost = data_included.size();
9f95a23c 6921 for (auto i = omap_entries.cbegin(); i != omap_entries.cend(); ++i) {
7c673cae
FG
6922 cost += i->second.length();
6923 }
6924 cost += cct->_conf->osd_push_per_object_cost;
6925 return cost;
6926}
6927
6928// -- ScrubMap --
6929
6930void ScrubMap::merge_incr(const ScrubMap &l)
6931{
11fdf7f2 6932 ceph_assert(valid_through == l.incr_since);
7c673cae
FG
6933 valid_through = l.valid_through;
6934
9f95a23c 6935 for (auto p = l.objects.cbegin(); p != l.objects.cend(); ++p){
7c673cae 6936 if (p->second.negative) {
9f95a23c 6937 auto q = objects.find(p->first);
7c673cae
FG
6938 if (q != objects.end()) {
6939 objects.erase(q);
6940 }
6941 } else {
6942 objects[p->first] = p->second;
6943 }
6944 }
6945}
6946
9f95a23c 6947void ScrubMap::encode(ceph::buffer::list& bl) const
7c673cae
FG
6948{
6949 ENCODE_START(3, 2, bl);
11fdf7f2
TL
6950 encode(objects, bl);
6951 encode((__u32)0, bl); // used to be attrs; now deprecated
9f95a23c 6952 ceph::buffer::list old_logbl; // not used
11fdf7f2
TL
6953 encode(old_logbl, bl);
6954 encode(valid_through, bl);
6955 encode(incr_since, bl);
7c673cae
FG
6956 ENCODE_FINISH(bl);
6957}
6958
9f95a23c 6959void ScrubMap::decode(ceph::buffer::list::const_iterator& bl, int64_t pool)
7c673cae
FG
6960{
6961 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
11fdf7f2 6962 decode(objects, bl);
7c673cae
FG
6963 {
6964 map<string,string> attrs; // deprecated
11fdf7f2 6965 decode(attrs, bl);
7c673cae 6966 }
9f95a23c 6967 ceph::buffer::list old_logbl; // not used
11fdf7f2
TL
6968 decode(old_logbl, bl);
6969 decode(valid_through, bl);
6970 decode(incr_since, bl);
7c673cae
FG
6971 DECODE_FINISH(bl);
6972
6973 // handle hobject_t upgrade
6974 if (struct_v < 3) {
6975 map<hobject_t, object> tmp;
6976 tmp.swap(objects);
9f95a23c 6977 for (auto i = tmp.begin(); i != tmp.end(); ++i) {
7c673cae
FG
6978 hobject_t first(i->first);
6979 if (!first.is_max() && first.pool == -1)
6980 first.pool = pool;
6981 objects[first] = i->second;
6982 }
6983 }
6984}
6985
6986void ScrubMap::dump(Formatter *f) const
6987{
6988 f->dump_stream("valid_through") << valid_through;
6989 f->dump_stream("incremental_since") << incr_since;
6990 f->open_array_section("objects");
9f95a23c 6991 for (auto p = objects.cbegin(); p != objects.cend(); ++p) {
7c673cae
FG
6992 f->open_object_section("object");
6993 f->dump_string("name", p->first.oid.name);
6994 f->dump_unsigned("hash", p->first.get_hash());
6995 f->dump_string("key", p->first.get_key());
6996 f->dump_int("snapid", p->first.snap);
6997 p->second.dump(f);
6998 f->close_section();
6999 }
7000 f->close_section();
7001}
7002
7003void ScrubMap::generate_test_instances(list<ScrubMap*>& o)
7004{
7005 o.push_back(new ScrubMap);
7006 o.push_back(new ScrubMap);
7007 o.back()->valid_through = eversion_t(1, 2);
7008 o.back()->incr_since = eversion_t(3, 4);
7009 list<object*> obj;
7010 object::generate_test_instances(obj);
7011 o.back()->objects[hobject_t(object_t("foo"), "fookey", 123, 456, 0, "")] = *obj.back();
7012 obj.pop_back();
7013 o.back()->objects[hobject_t(object_t("bar"), string(), 123, 456, 0, "")] = *obj.back();
7014}
7015
7016// -- ScrubMap::object --
7017
9f95a23c 7018void ScrubMap::object::encode(ceph::buffer::list& bl) const
7c673cae
FG
7019{
7020 bool compat_read_error = read_error || ec_hash_mismatch || ec_size_mismatch;
11fdf7f2
TL
7021 ENCODE_START(10, 7, bl);
7022 encode(size, bl);
7023 encode(negative, bl);
7024 encode(attrs, bl);
7025 encode(digest, bl);
7026 encode(digest_present, bl);
7027 encode((uint32_t)0, bl); // obsolete nlinks
7028 encode((uint32_t)0, bl); // snapcolls
7029 encode(omap_digest, bl);
7030 encode(omap_digest_present, bl);
7031 encode(compat_read_error, bl);
7032 encode(stat_error, bl);
7033 encode(read_error, bl);
7034 encode(ec_hash_mismatch, bl);
7035 encode(ec_size_mismatch, bl);
7036 encode(large_omap_object_found, bl);
7037 encode(large_omap_object_key_count, bl);
7038 encode(large_omap_object_value_size, bl);
7039 encode(object_omap_bytes, bl);
7040 encode(object_omap_keys, bl);
7c673cae
FG
7041 ENCODE_FINISH(bl);
7042}
7043
9f95a23c 7044void ScrubMap::object::decode(ceph::buffer::list::const_iterator& bl)
7c673cae 7045{
11fdf7f2
TL
7046 DECODE_START(10, bl);
7047 decode(size, bl);
7c673cae 7048 bool tmp, compat_read_error = false;
11fdf7f2 7049 decode(tmp, bl);
7c673cae 7050 negative = tmp;
11fdf7f2
TL
7051 decode(attrs, bl);
7052 decode(digest, bl);
7053 decode(tmp, bl);
7c673cae
FG
7054 digest_present = tmp;
7055 {
7056 uint32_t nlinks;
11fdf7f2 7057 decode(nlinks, bl);
7c673cae 7058 set<snapid_t> snapcolls;
11fdf7f2 7059 decode(snapcolls, bl);
7c673cae 7060 }
11fdf7f2
TL
7061 decode(omap_digest, bl);
7062 decode(tmp, bl);
7c673cae 7063 omap_digest_present = tmp;
11fdf7f2
TL
7064 decode(compat_read_error, bl);
7065 decode(tmp, bl);
7c673cae
FG
7066 stat_error = tmp;
7067 if (struct_v >= 8) {
11fdf7f2 7068 decode(tmp, bl);
7c673cae 7069 read_error = tmp;
11fdf7f2 7070 decode(tmp, bl);
7c673cae 7071 ec_hash_mismatch = tmp;
11fdf7f2 7072 decode(tmp, bl);
7c673cae
FG
7073 ec_size_mismatch = tmp;
7074 }
7075 // If older encoder found a read_error, set read_error
7076 if (compat_read_error && !read_error && !ec_hash_mismatch && !ec_size_mismatch)
7077 read_error = true;
28e407b8 7078 if (struct_v >= 9) {
11fdf7f2 7079 decode(tmp, bl);
28e407b8 7080 large_omap_object_found = tmp;
11fdf7f2
TL
7081 decode(large_omap_object_key_count, bl);
7082 decode(large_omap_object_value_size, bl);
7083 }
7084 if (struct_v >= 10) {
7085 decode(object_omap_bytes, bl);
7086 decode(object_omap_keys, bl);
28e407b8 7087 }
7c673cae
FG
7088 DECODE_FINISH(bl);
7089}
7090
7091void ScrubMap::object::dump(Formatter *f) const
7092{
7093 f->dump_int("size", size);
7094 f->dump_int("negative", negative);
7095 f->open_array_section("attrs");
9f95a23c 7096 for (auto p = attrs.cbegin(); p != attrs.cend(); ++p) {
7c673cae
FG
7097 f->open_object_section("attr");
7098 f->dump_string("name", p->first);
7099 f->dump_int("length", p->second.length());
7100 f->close_section();
7101 }
7102 f->close_section();
7103}
7104
7105void ScrubMap::object::generate_test_instances(list<object*>& o)
7106{
7107 o.push_back(new object);
7108 o.push_back(new object);
7109 o.back()->negative = true;
7110 o.push_back(new object);
7111 o.back()->size = 123;
9f95a23c
TL
7112 o.back()->attrs["foo"] = ceph::buffer::copy("foo", 3);
7113 o.back()->attrs["bar"] = ceph::buffer::copy("barval", 6);
7c673cae
FG
7114}
7115
7116// -- OSDOp --
7117
7118ostream& operator<<(ostream& out, const OSDOp& op)
7119{
7120 out << ceph_osd_op_name(op.op.op);
7121 if (ceph_osd_op_type_data(op.op.op)) {
7122 // data extent
7123 switch (op.op.op) {
7124 case CEPH_OSD_OP_ASSERT_VER:
7125 out << " v" << op.op.assert_ver.ver;
7126 break;
7127 case CEPH_OSD_OP_TRUNCATE:
7128 out << " " << op.op.extent.offset;
7129 break;
7130 case CEPH_OSD_OP_MASKTRUNC:
7131 case CEPH_OSD_OP_TRIMTRUNC:
7132 out << " " << op.op.extent.truncate_seq << "@"
7133 << (int64_t)op.op.extent.truncate_size;
7134 break;
7135 case CEPH_OSD_OP_ROLLBACK:
7136 out << " " << snapid_t(op.op.snap.snapid);
7137 break;
7138 case CEPH_OSD_OP_WATCH:
7139 out << " " << ceph_osd_watch_op_name(op.op.watch.op)
7140 << " cookie " << op.op.watch.cookie;
7141 if (op.op.watch.gen)
7142 out << " gen " << op.op.watch.gen;
7143 break;
7144 case CEPH_OSD_OP_NOTIFY:
7c673cae
FG
7145 out << " cookie " << op.op.notify.cookie;
7146 break;
7147 case CEPH_OSD_OP_COPY_GET:
7148 out << " max " << op.op.copy_get.max;
7149 break;
7150 case CEPH_OSD_OP_COPY_FROM:
7151 out << " ver " << op.op.copy_from.src_version;
7152 break;
7153 case CEPH_OSD_OP_SETALLOCHINT:
7154 out << " object_size " << op.op.alloc_hint.expected_object_size
7155 << " write_size " << op.op.alloc_hint.expected_write_size;
7156 break;
7157 case CEPH_OSD_OP_READ:
7158 case CEPH_OSD_OP_SPARSE_READ:
7159 case CEPH_OSD_OP_SYNC_READ:
7160 case CEPH_OSD_OP_WRITE:
7161 case CEPH_OSD_OP_WRITEFULL:
7162 case CEPH_OSD_OP_ZERO:
7163 case CEPH_OSD_OP_APPEND:
7164 case CEPH_OSD_OP_MAPEXT:
11fdf7f2 7165 case CEPH_OSD_OP_CMPEXT:
7c673cae
FG
7166 out << " " << op.op.extent.offset << "~" << op.op.extent.length;
7167 if (op.op.extent.truncate_seq)
7168 out << " [" << op.op.extent.truncate_seq << "@"
7169 << (int64_t)op.op.extent.truncate_size << "]";
7170 if (op.op.flags)
7171 out << " [" << ceph_osd_op_flag_string(op.op.flags) << "]";
7172 default:
7173 // don't show any arg info
7174 break;
7175 }
7176 } else if (ceph_osd_op_type_attr(op.op.op)) {
7177 // xattr name
7178 if (op.op.xattr.name_len && op.indata.length()) {
7179 out << " ";
7180 op.indata.write(0, op.op.xattr.name_len, out);
7181 }
7182 if (op.op.xattr.value_len)
7183 out << " (" << op.op.xattr.value_len << ")";
7184 if (op.op.op == CEPH_OSD_OP_CMPXATTR)
7185 out << " op " << (int)op.op.xattr.cmp_op
7186 << " mode " << (int)op.op.xattr.cmp_mode;
7187 } else if (ceph_osd_op_type_exec(op.op.op)) {
7188 // class.method
7189 if (op.op.cls.class_len && op.indata.length()) {
7190 out << " ";
7191 op.indata.write(0, op.op.cls.class_len, out);
7192 out << ".";
7193 op.indata.write(op.op.cls.class_len, op.op.cls.method_len, out);
7194 }
7195 } else if (ceph_osd_op_type_pg(op.op.op)) {
7196 switch (op.op.op) {
7197 case CEPH_OSD_OP_PGLS:
7198 case CEPH_OSD_OP_PGLS_FILTER:
7199 case CEPH_OSD_OP_PGNLS:
7200 case CEPH_OSD_OP_PGNLS_FILTER:
7201 out << " start_epoch " << op.op.pgls.start_epoch;
7202 break;
7203 case CEPH_OSD_OP_PG_HITSET_LS:
7204 break;
7205 case CEPH_OSD_OP_PG_HITSET_GET:
7206 out << " " << utime_t(op.op.hit_set_get.stamp);
7207 break;
7208 case CEPH_OSD_OP_SCRUBLS:
7209 break;
7210 }
7211 }
9f95a23c
TL
7212 if (op.indata.length()) {
7213 out << " in=" << op.indata.length() << "b";
7214 }
7215 if (op.outdata.length()) {
7216 out << " out=" << op.outdata.length() << "b";
7217 }
7c673cae
FG
7218 return out;
7219}
7220
7221
9f95a23c 7222void OSDOp::split_osd_op_vector_out_data(vector<OSDOp>& ops, ceph::buffer::list& in)
7c673cae 7223{
9f95a23c 7224 auto datap = in.begin();
7c673cae
FG
7225 for (unsigned i = 0; i < ops.size(); i++) {
7226 if (ops[i].op.payload_len) {
7227 datap.copy(ops[i].op.payload_len, ops[i].outdata);
7228 }
7229 }
7230}
7231
9f95a23c 7232void OSDOp::merge_osd_op_vector_out_data(vector<OSDOp>& ops, ceph::buffer::list& out)
7c673cae
FG
7233{
7234 for (unsigned i = 0; i < ops.size(); i++) {
9f95a23c 7235 ops[i].op.payload_len = ops[i].outdata.length();
7c673cae 7236 if (ops[i].outdata.length()) {
7c673cae
FG
7237 out.append(ops[i].outdata);
7238 }
7239 }
7240}
7241
9f95a23c
TL
7242int prepare_info_keymap(
7243 CephContext* cct,
7244 map<string,bufferlist> *km,
7245 string *key_to_remove,
7246 epoch_t epoch,
7247 pg_info_t &info,
7248 pg_info_t &last_written_info,
7249 PastIntervals &past_intervals,
7250 bool dirty_big_info,
7251 bool dirty_epoch,
7252 bool try_fast_info,
7253 PerfCounters *logger,
7254 DoutPrefixProvider *dpp)
7255{
7256 if (dirty_epoch) {
7257 encode(epoch, (*km)[string(epoch_key)]);
7258 }
7259
7260 if (logger)
7261 logger->inc(l_osd_pg_info);
7262
7263 // try to do info efficiently?
7264 if (!dirty_big_info && try_fast_info &&
7265 info.last_update > last_written_info.last_update) {
7266 pg_fast_info_t fast;
7267 fast.populate_from(info);
7268 bool did = fast.try_apply_to(&last_written_info);
7269 ceph_assert(did); // we verified last_update increased above
7270 if (info == last_written_info) {
7271 encode(fast, (*km)[string(fastinfo_key)]);
7272 if (logger)
7273 logger->inc(l_osd_pg_fastinfo);
7274 return 0;
7275 }
7276 if (dpp) {
7277 ldpp_dout(dpp, 30) << __func__ << " fastinfo failed, info:\n";
7278 {
7279 JSONFormatter jf(true);
7280 jf.dump_object("info", info);
7281 jf.flush(*_dout);
7282 }
7283 {
7284 *_dout << "\nlast_written_info:\n";
7285 JSONFormatter jf(true);
7286 jf.dump_object("last_written_info", last_written_info);
7287 jf.flush(*_dout);
7288 }
7289 *_dout << dendl;
7290 }
7291 } else if (info.last_update <= last_written_info.last_update) {
7292 // clean up any potentially stale fastinfo key resulting from last_update
7293 // not moving forwards (e.g., a backwards jump during peering)
7294 *key_to_remove = fastinfo_key;
7295 }
7296
7297 last_written_info = info;
7298
7299 // info. store purged_snaps separately.
7300 interval_set<snapid_t> purged_snaps;
7301 purged_snaps.swap(info.purged_snaps);
7302 encode(info, (*km)[string(info_key)]);
7303 purged_snaps.swap(info.purged_snaps);
7304
7305 if (dirty_big_info) {
7306 // potentially big stuff
7307 bufferlist& bigbl = (*km)[string(biginfo_key)];
7308 encode(past_intervals, bigbl);
7309 encode(info.purged_snaps, bigbl);
7310 //dout(20) << "write_info bigbl " << bigbl.length() << dendl;
7311 if (logger)
7312 logger->inc(l_osd_pg_biginfo);
7313 }
7314
7315 return 0;
7316}
7317
7318void create_pg_collection(
7319 ceph::os::Transaction& t, spg_t pgid, int bits)
7320{
7321 coll_t coll(pgid);
7322 t.create_collection(coll, bits);
7323}
7324
7325void init_pg_ondisk(
7326 ceph::os::Transaction& t,
7327 spg_t pgid,
7328 const pg_pool_t *pool)
7329{
7330 coll_t coll(pgid);
7331 if (pool) {
7332 // Give a hint to the PG collection
7333 bufferlist hint;
7334 uint32_t pg_num = pool->get_pg_num();
7335 uint64_t expected_num_objects_pg = pool->expected_num_objects / pg_num;
7336 encode(pg_num, hint);
7337 encode(expected_num_objects_pg, hint);
7338 uint32_t hint_type = ceph::os::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS;
7339 t.collection_hint(coll, hint_type, hint);
7340 }
7341
7342 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
7343 t.touch(coll, pgmeta_oid);
7344 map<string,bufferlist> values;
7345 __u8 struct_v = pg_latest_struct_v;
7346 encode(struct_v, values[string(infover_key)]);
7347 t.omap_setkeys(coll, pgmeta_oid, values);
7348}
7349
7350PGLSFilter::PGLSFilter() : cct(nullptr)
7351{
7352}
7353
7354PGLSFilter::~PGLSFilter()
7355{
7356}
7357
7358int PGLSPlainFilter::init(ceph::bufferlist::const_iterator &params)
7359{
7360 try {
7361 decode(xattr, params);
7362 decode(val, params);
f67539c2 7363 } catch (ceph::buffer::error &e) {
9f95a23c
TL
7364 return -EINVAL;
7365 }
7366 return 0;
7367}
7368
7369bool PGLSPlainFilter::filter(const hobject_t& obj,
7370 const ceph::bufferlist& xattr_data) const
7371{
7372 return xattr_data.contents_equal(val.c_str(), val.size());
7373}