ceph/src/osd/osd_types.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2011 New Dream Network
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
18 #include <list>
19 #include <map>
20 #include <ostream>
21 #include <sstream>
22 #include <set>
23 #include <string>
24 #include <utility>
25 #include <vector>
26
27
28 #include <boost/assign/list_of.hpp>
29
30 #include "include/ceph_features.h"
31 #include "include/encoding.h"
32 #include "include/stringify.h"
33 extern "C" {
34 #include "crush/hash.h"
35 }
36
37 #include "common/Formatter.h"
38 #include "common/StackStringStream.h"
39 #include "include/utime_fmt.h"
40 #include "OSDMap.h"
41 #include "osd_types.h"
42 #include "osd_types_fmt.h"
43 #include "os/Transaction.h"
44
45 using std::list;
46 using std::make_pair;
47 using std::map;
48 using std::ostream;
49 using std::pair;
50 using std::set;
51 using std::shared_ptr;
52 using std::string;
53 using std::unique_ptr;
54 using std::vector;
55
56 using ceph::bufferlist;
57 using ceph::decode;
58 using ceph::decode_nohead;
59 using ceph::encode;
60 using ceph::encode_nohead;
61 using ceph::Formatter;
62 using ceph::make_timespan;
63 using ceph::JSONFormatter;
64
65 using namespace std::literals;
66
67 const char *ceph_osd_flag_name(unsigned flag)
68 {
69 switch (flag) {
70 case CEPH_OSD_FLAG_ACK: return "ack";
71 case CEPH_OSD_FLAG_ONNVRAM: return "onnvram";
72 case CEPH_OSD_FLAG_ONDISK: return "ondisk";
73 case CEPH_OSD_FLAG_RETRY: return "retry";
74 case CEPH_OSD_FLAG_READ: return "read";
75 case CEPH_OSD_FLAG_WRITE: return "write";
76 case CEPH_OSD_FLAG_ORDERSNAP: return "ordersnap";
77 case CEPH_OSD_FLAG_PEERSTAT_OLD: return "peerstat_old";
78 case CEPH_OSD_FLAG_BALANCE_READS: return "balance_reads";
79 case CEPH_OSD_FLAG_PARALLELEXEC: return "parallelexec";
80 case CEPH_OSD_FLAG_PGOP: return "pgop";
81 case CEPH_OSD_FLAG_EXEC: return "exec";
82 case CEPH_OSD_FLAG_EXEC_PUBLIC: return "exec_public";
83 case CEPH_OSD_FLAG_LOCALIZE_READS: return "localize_reads";
84 case CEPH_OSD_FLAG_RWORDERED: return "rwordered";
85 case CEPH_OSD_FLAG_IGNORE_CACHE: return "ignore_cache";
86 case CEPH_OSD_FLAG_SKIPRWLOCKS: return "skiprwlocks";
87 case CEPH_OSD_FLAG_IGNORE_OVERLAY: return "ignore_overlay";
88 case CEPH_OSD_FLAG_FLUSH: return "flush";
89 case CEPH_OSD_FLAG_MAP_SNAP_CLONE: return "map_snap_clone";
90 case CEPH_OSD_FLAG_ENFORCE_SNAPC: return "enforce_snapc";
91 case CEPH_OSD_FLAG_REDIRECTED: return "redirected";
92 case CEPH_OSD_FLAG_KNOWN_REDIR: return "known_if_redirected";
93 case CEPH_OSD_FLAG_FULL_TRY: return "full_try";
94 case CEPH_OSD_FLAG_FULL_FORCE: return "full_force";
95 case CEPH_OSD_FLAG_IGNORE_REDIRECT: return "ignore_redirect";
96 case CEPH_OSD_FLAG_RETURNVEC: return "returnvec";
97 case CEPH_OSD_FLAG_SUPPORTSPOOLEIO: return "supports_pool_eio";
98 default: return "???";
99 }
100 }
101
102 string ceph_osd_flag_string(unsigned flags)
103 {
104 string s;
105 for (unsigned i=0; i<32; ++i) {
106 if (flags & (1u<<i)) {
107 if (s.length())
108 s += "+";
109 s += ceph_osd_flag_name(1u << i);
110 }
111 }
112 if (s.length())
113 return s;
114 return string("-");
115 }
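// Example (editor's note, assuming the standard rados flag values where
// CEPH_OSD_FLAG_ACK is a lower bit than CEPH_OSD_FLAG_ONDISK): bits render in
// ascending order joined with '+', so a flags word with both ACK and ONDISK
// set yields "ack+ondisk", and a zero flags word yields "-".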
116
117 const char * ceph_osd_op_flag_name(unsigned flag)
118 {
119 const char *name;
120
121 switch(flag) {
122 case CEPH_OSD_OP_FLAG_EXCL:
123 name = "excl";
124 break;
125 case CEPH_OSD_OP_FLAG_FAILOK:
126 name = "failok";
127 break;
128 case CEPH_OSD_OP_FLAG_FADVISE_RANDOM:
129 name = "fadvise_random";
130 break;
131 case CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL:
132 name = "fadvise_sequential";
133 break;
134 case CEPH_OSD_OP_FLAG_FADVISE_WILLNEED:
135 name = "favise_willneed";
136 break;
137 case CEPH_OSD_OP_FLAG_FADVISE_DONTNEED:
138 name = "fadvise_dontneed";
139 break;
140 case CEPH_OSD_OP_FLAG_FADVISE_NOCACHE:
141 name = "fadvise_nocache";
142 break;
143 case CEPH_OSD_OP_FLAG_WITH_REFERENCE:
144 name = "with_reference";
145 break;
146 case CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE:
147 name = "bypass_clean_cache";
148 break;
149 default:
150 name = "???";
151 }
152
153 return name;
154 }
155
156 string ceph_osd_op_flag_string(unsigned flags)
157 {
158 string s;
159 for (unsigned i=0; i<32; ++i) {
160 if (flags & (1u<<i)) {
161 if (s.length())
162 s += "+";
163 s += ceph_osd_op_flag_name(1u << i);
164 }
165 }
166 if (s.length())
167 return s;
168 return string("-");
169 }
170
171 string ceph_osd_alloc_hint_flag_string(unsigned flags)
172 {
173 string s;
174 for (unsigned i=0; i<32; ++i) {
175 if (flags & (1u<<i)) {
176 if (s.length())
177 s += "+";
178 s += ceph_osd_alloc_hint_flag_name(1u << i);
179 }
180 }
181 if (s.length())
182 return s;
183 return string("-");
184 }
185
186 void pg_shard_t::encode(ceph::buffer::list &bl) const
187 {
188 ENCODE_START(1, 1, bl);
189 encode(osd, bl);
190 encode(shard, bl);
191 ENCODE_FINISH(bl);
192 }
193 void pg_shard_t::decode(ceph::buffer::list::const_iterator &bl)
194 {
195 DECODE_START(1, bl);
196 decode(osd, bl);
197 decode(shard, bl);
198 DECODE_FINISH(bl);
199 }
200
201 ostream &operator<<(ostream &lhs, const pg_shard_t &rhs)
202 {
203 if (rhs.is_undefined())
204 return lhs << "?";
205 if (rhs.shard == shard_id_t::NO_SHARD)
206 return lhs << rhs.get_osd();
207 return lhs << rhs.get_osd() << '(' << (unsigned)(rhs.shard) << ')';
208 }
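// Example (editor's note, assuming the usual pg_shard_t(osd, shard)
// constructor): an EC shard such as pg_shard_t(3, shard_id_t(1)) prints as
// "3(1)", a replicated shard pg_shard_t(3, shard_id_t::NO_SHARD) prints as
// "3", and an undefined shard prints as "?".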
209
210 void dump(Formatter* f, const osd_alerts_t& alerts)
211 {
212 for (auto& a : alerts) {
213 string s0 = " osd: ";
214 s0 += stringify(a.first);
215 string s;
216 for (auto& aa : a.second) {
217 s = s0;
218 s += " ";
219 s += aa.first;
220 s += ":";
221 s += aa.second;
222 f->dump_string("alert", s);
223 }
224 }
225 }
226
227 // -- osd_reqid_t --
228 void osd_reqid_t::dump(Formatter *f) const
229 {
230 f->dump_stream("name") << name;
231 f->dump_int("inc", inc);
232 f->dump_unsigned("tid", tid);
233 }
234
235 void osd_reqid_t::generate_test_instances(list<osd_reqid_t*>& o)
236 {
237 o.push_back(new osd_reqid_t);
238 o.push_back(new osd_reqid_t(entity_name_t::CLIENT(123), 1, 45678));
239 }
240
241 // -- object_locator_t --
242
243 void object_locator_t::encode(ceph::buffer::list& bl) const
244 {
245 // verify that nobody's corrupted the locator
246 ceph_assert(hash == -1 || key.empty());
247 __u8 encode_compat = 3;
248 ENCODE_START(6, encode_compat, bl);
249 encode(pool, bl);
250 int32_t preferred = -1; // tell old code there is no preferred osd (-1).
251 encode(preferred, bl);
252 encode(key, bl);
253 encode(nspace, bl);
254 encode(hash, bl);
255 if (hash != -1)
256 encode_compat = std::max<std::uint8_t>(encode_compat, 6); // need to interpret the hash
257 ENCODE_FINISH_NEW_COMPAT(bl, encode_compat);
258 }
259
260 void object_locator_t::decode(ceph::buffer::list::const_iterator& p)
261 {
262 DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, p);
263 if (struct_v < 2) {
264 int32_t op;
265 decode(op, p);
266 pool = op;
267 int16_t pref;
268 decode(pref, p);
269 } else {
270 decode(pool, p);
271 int32_t preferred;
272 decode(preferred, p);
273 }
274 decode(key, p);
275 if (struct_v >= 5)
276 decode(nspace, p);
277 if (struct_v >= 6)
278 decode(hash, p);
279 else
280 hash = -1;
281 DECODE_FINISH(p);
282 // verify that nobody's corrupted the locator
283 ceph_assert(hash == -1 || key.empty());
284 }
285
286 void object_locator_t::dump(Formatter *f) const
287 {
288 f->dump_int("pool", pool);
289 f->dump_string("key", key);
290 f->dump_string("namespace", nspace);
291 f->dump_int("hash", hash);
292 }
293
294 void object_locator_t::generate_test_instances(list<object_locator_t*>& o)
295 {
296 o.push_back(new object_locator_t);
297 o.push_back(new object_locator_t(123));
298 o.push_back(new object_locator_t(123, 876));
299 o.push_back(new object_locator_t(1, "n2"));
300 o.push_back(new object_locator_t(1234, "", "key"));
301 o.push_back(new object_locator_t(12, "n1", "key2"));
302 }
303
304 // -- request_redirect_t --
305 void request_redirect_t::encode(ceph::buffer::list& bl) const
306 {
307 ENCODE_START(1, 1, bl);
308 encode(redirect_locator, bl);
309 encode(redirect_object, bl);
310 // legacy of the removed osd_instructions member
311 encode((uint32_t)0, bl);
312 ENCODE_FINISH(bl);
313 }
314
315 void request_redirect_t::decode(ceph::buffer::list::const_iterator& bl)
316 {
317 DECODE_START(1, bl);
318 uint32_t legacy_osd_instructions_len;
319 decode(redirect_locator, bl);
320 decode(redirect_object, bl);
321 decode(legacy_osd_instructions_len, bl);
322 if (legacy_osd_instructions_len) {
323 bl += legacy_osd_instructions_len;
324 }
325 DECODE_FINISH(bl);
326 }
327
328 void request_redirect_t::dump(Formatter *f) const
329 {
330 f->dump_string("object", redirect_object);
331 f->open_object_section("locator");
332 redirect_locator.dump(f);
333 f->close_section(); // locator
334 }
335
336 void request_redirect_t::generate_test_instances(list<request_redirect_t*>& o)
337 {
338 object_locator_t loc(1, "redir_obj");
339 o.push_back(new request_redirect_t());
340 o.push_back(new request_redirect_t(loc, 0));
341 o.push_back(new request_redirect_t(loc, "redir_obj"));
342 o.push_back(new request_redirect_t(loc));
343 }
344
345 void objectstore_perf_stat_t::dump(Formatter *f) const
346 {
347 // *_ms values just for compatibility.
348 f->dump_float("commit_latency_ms", os_commit_latency_ns / 1000000.0);
349 f->dump_float("apply_latency_ms", os_apply_latency_ns / 1000000.0);
350 f->dump_unsigned("commit_latency_ns", os_commit_latency_ns);
351 f->dump_unsigned("apply_latency_ns", os_apply_latency_ns);
352 }
353
354 void objectstore_perf_stat_t::encode(ceph::buffer::list &bl, uint64_t features) const
355 {
356 uint8_t target_v = 2;
357 if (!HAVE_FEATURE(features, OS_PERF_STAT_NS)) {
358 target_v = 1;
359 }
360 ENCODE_START(target_v, target_v, bl);
361 if (target_v >= 2) {
362 encode(os_commit_latency_ns, bl);
363 encode(os_apply_latency_ns, bl);
364 } else {
365 constexpr auto NS_PER_MS = std::chrono::nanoseconds(1ms).count();
366 uint32_t commit_latency_ms = os_commit_latency_ns / NS_PER_MS;
367 uint32_t apply_latency_ms = os_apply_latency_ns / NS_PER_MS;
368 encode(commit_latency_ms, bl); // for compatibility with older monitor.
369 encode(apply_latency_ms, bl); // for compatibility with older monitor.
370 }
371 ENCODE_FINISH(bl);
372 }
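// Example (editor's note, derived from the branches above): for a peer
// without OS_PERF_STAT_NS, a commit latency of 20000000 ns is encoded as the
// legacy 20 ms value; decode() below multiplies it back to 20000000 ns, so
// sub-millisecond precision is lost on that compatibility path.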
373
374 void objectstore_perf_stat_t::decode(ceph::buffer::list::const_iterator &bl)
375 {
376 DECODE_START(2, bl);
377 if (struct_v >= 2) {
378 decode(os_commit_latency_ns, bl);
379 decode(os_apply_latency_ns, bl);
380 } else {
381 uint32_t commit_latency_ms;
382 uint32_t apply_latency_ms;
383 decode(commit_latency_ms, bl);
384 decode(apply_latency_ms, bl);
385 constexpr auto NS_PER_MS = std::chrono::nanoseconds(1ms).count();
386 os_commit_latency_ns = commit_latency_ms * NS_PER_MS;
387 os_apply_latency_ns = apply_latency_ms * NS_PER_MS;
388 }
389 DECODE_FINISH(bl);
390 }
391
392 void objectstore_perf_stat_t::generate_test_instances(std::list<objectstore_perf_stat_t*>& o)
393 {
394 o.push_back(new objectstore_perf_stat_t());
395 o.push_back(new objectstore_perf_stat_t());
396 o.back()->os_commit_latency_ns = 20000000;
397 o.back()->os_apply_latency_ns = 30000000;
398 }
399
400 // -- osd_stat_t --
401 void osd_stat_t::dump(Formatter *f, bool with_net) const
402 {
403 f->dump_unsigned("up_from", up_from);
404 f->dump_unsigned("seq", seq);
405 f->dump_unsigned("num_pgs", num_pgs);
406 f->dump_unsigned("num_osds", num_osds);
407 f->dump_unsigned("num_per_pool_osds", num_per_pool_osds);
408 f->dump_unsigned("num_per_pool_omap_osds", num_per_pool_omap_osds);
409
410 /// dump legacy stats fields to ensure backward compatibility.
411 f->dump_unsigned("kb", statfs.kb());
412 f->dump_unsigned("kb_used", statfs.kb_used_raw());
413 f->dump_unsigned("kb_used_data", statfs.kb_used_data());
414 f->dump_unsigned("kb_used_omap", statfs.kb_used_omap());
415 f->dump_unsigned("kb_used_meta", statfs.kb_used_internal_metadata());
416 f->dump_unsigned("kb_avail", statfs.kb_avail());
417 ////////////////////
418
419 f->open_object_section("statfs");
420 statfs.dump(f);
421 f->close_section();
422 f->open_array_section("hb_peers");
423 for (auto p : hb_peers)
424 f->dump_int("osd", p);
425 f->close_section();
426 f->dump_int("snap_trim_queue_len", snap_trim_queue_len);
427 f->dump_int("num_snap_trimming", num_snap_trimming);
428 f->dump_int("num_shards_repaired", num_shards_repaired);
429 f->open_object_section("op_queue_age_hist");
430 op_queue_age_hist.dump(f);
431 f->close_section();
432 f->open_object_section("perf_stat");
433 os_perf_stat.dump(f);
434 f->close_section();
435 f->open_array_section("alerts");
436 ::dump(f, os_alerts);
437 f->close_section();
438 if (with_net) {
439 dump_ping_time(f);
440 }
441 }
442
443 void osd_stat_t::dump_ping_time(Formatter *f) const
444 {
445 f->open_array_section("network_ping_times");
446 for (auto &i : hb_pingtime) {
447 f->open_object_section("entry");
448 f->dump_int("osd", i.first);
449 const time_t lu(i.second.last_update);
450 char buffer[26];
451 string lustr(ctime_r(&lu, buffer));
452 lustr.pop_back(); // Remove trailing \n
453 f->dump_string("last update", lustr);
454 f->open_array_section("interfaces");
455 f->open_object_section("interface");
456 f->dump_string("interface", "back");
457 f->open_object_section("average");
458 f->dump_float("1min", i.second.back_pingtime[0]/1000.0);
459 f->dump_float("5min", i.second.back_pingtime[1]/1000.0);
460 f->dump_float("15min", i.second.back_pingtime[2]/1000.0);
461 f->close_section(); // average
462 f->open_object_section("min");
463 f->dump_float("1min", i.second.back_min[0]/1000.0);
464 f->dump_float("5min", i.second.back_min[1]/1000.0);
465 f->dump_float("15min", i.second.back_min[2]/1000.0);
466 f->close_section(); // min
467 f->open_object_section("max");
468 f->dump_float("1min", i.second.back_max[0]/1000.0);
469 f->dump_float("5min", i.second.back_max[1]/1000.0);
470 f->dump_float("15min", i.second.back_max[2]/1000.0);
471 f->close_section(); // max
472 f->dump_float("last", i.second.back_last/1000.0);
473 f->close_section(); // interface
474
475 if (i.second.front_pingtime[0] != 0) {
476 f->open_object_section("interface");
477 f->dump_string("interface", "front");
478 f->open_object_section("average");
479 f->dump_float("1min", i.second.front_pingtime[0]/1000.0);
480 f->dump_float("5min", i.second.front_pingtime[1]/1000.0);
481 f->dump_float("15min", i.second.front_pingtime[2]/1000.0);
482 f->close_section(); // average
483 f->open_object_section("min");
484 f->dump_float("1min", i.second.front_min[0]/1000.0);
485 f->dump_float("5min", i.second.front_min[1]/1000.0);
486 f->dump_float("15min", i.second.front_min[2]/1000.0);
487 f->close_section(); // min
488 f->open_object_section("max");
489 f->dump_float("1min", i.second.front_max[0]/1000.0);
490 f->dump_float("5min", i.second.front_max[1]/1000.0);
491 f->dump_float("15min", i.second.front_max[2]/1000.0);
492 f->close_section(); // max
493 f->dump_float("last", i.second.front_last/1000.0);
494 f->close_section(); // interface
495 }
496 f->close_section(); // interfaces
497 f->close_section(); // entry
498 }
499 f->close_section(); // network_ping_times
500 }
501
502 void osd_stat_t::encode(ceph::buffer::list &bl, uint64_t features) const
503 {
504 ENCODE_START(14, 2, bl);
505
506 //////// for compatibility ////////
507 int64_t kb = statfs.kb();
508 int64_t kb_used = statfs.kb_used_raw();
509 int64_t kb_avail = statfs.kb_avail();
510 encode(kb, bl);
511 encode(kb_used, bl);
512 encode(kb_avail, bl);
513 ///////////////////////////////////
514
515 encode(snap_trim_queue_len, bl);
516 encode(num_snap_trimming, bl);
517 encode(hb_peers, bl);
518 encode((uint32_t)0, bl);
519 encode(op_queue_age_hist, bl);
520 encode(os_perf_stat, bl, features);
521 encode(up_from, bl);
522 encode(seq, bl);
523 encode(num_pgs, bl);
524
525 //////// for compatibility ////////
526 int64_t kb_used_data = statfs.kb_used_data();
527 int64_t kb_used_omap = statfs.kb_used_omap();
528 int64_t kb_used_meta = statfs.kb_used_internal_metadata();
529 encode(kb_used_data, bl);
530 encode(kb_used_omap, bl);
531 encode(kb_used_meta, bl);
532 encode(statfs, bl);
533 ///////////////////////////////////
534 encode(os_alerts, bl);
535 encode(num_shards_repaired, bl);
536 encode(num_osds, bl);
537 encode(num_per_pool_osds, bl);
538 encode(num_per_pool_omap_osds, bl);
539
540 // hb_pingtime map
541 encode((int)hb_pingtime.size(), bl);
542 for (auto i : hb_pingtime) {
543 encode(i.first, bl); // osd
544 encode(i.second.last_update, bl);
545 encode(i.second.back_pingtime[0], bl);
546 encode(i.second.back_pingtime[1], bl);
547 encode(i.second.back_pingtime[2], bl);
548 encode(i.second.back_min[0], bl);
549 encode(i.second.back_min[1], bl);
550 encode(i.second.back_min[2], bl);
551 encode(i.second.back_max[0], bl);
552 encode(i.second.back_max[1], bl);
553 encode(i.second.back_max[2], bl);
554 encode(i.second.back_last, bl);
555 encode(i.second.front_pingtime[0], bl);
556 encode(i.second.front_pingtime[1], bl);
557 encode(i.second.front_pingtime[2], bl);
558 encode(i.second.front_min[0], bl);
559 encode(i.second.front_min[1], bl);
560 encode(i.second.front_min[2], bl);
561 encode(i.second.front_max[0], bl);
562 encode(i.second.front_max[1], bl);
563 encode(i.second.front_max[2], bl);
564 encode(i.second.front_last, bl);
565 }
566 ENCODE_FINISH(bl);
567 }
568
569 void osd_stat_t::decode(ceph::buffer::list::const_iterator &bl)
570 {
571 int64_t kb, kb_used,kb_avail;
572 int64_t kb_used_data, kb_used_omap, kb_used_meta;
573 DECODE_START_LEGACY_COMPAT_LEN(14, 2, 2, bl);
574 decode(kb, bl);
575 decode(kb_used, bl);
576 decode(kb_avail, bl);
577 decode(snap_trim_queue_len, bl);
578 decode(num_snap_trimming, bl);
579 decode(hb_peers, bl);
580 vector<int> num_hb_out;
581 decode(num_hb_out, bl);
582 if (struct_v >= 3)
583 decode(op_queue_age_hist, bl);
584 if (struct_v >= 4)
585 decode(os_perf_stat, bl);
586 if (struct_v >= 6) {
587 decode(up_from, bl);
588 decode(seq, bl);
589 }
590 if (struct_v >= 7) {
591 decode(num_pgs, bl);
592 }
593 if (struct_v >= 8) {
594 decode(kb_used_data, bl);
595 decode(kb_used_omap, bl);
596 decode(kb_used_meta, bl);
597 } else {
598 kb_used_data = kb_used;
599 kb_used_omap = 0;
600 kb_used_meta = 0;
601 }
602 if (struct_v >= 9) {
603 decode(statfs, bl);
604 } else {
605 statfs.reset();
606 statfs.total = kb << 10;
607 statfs.available = kb_avail << 10;
608 // it's totally unexpected to have statfs.total < statfs.available here,
609 // but unfortunately legacy generate_test_instances produced such a case,
610 // hence some handling here rather than an assert
611 statfs.internally_reserved =
612 statfs.total > statfs.available ? statfs.total - statfs.available : 0;
613 kb_used <<= 10;
614 if ((int64_t)statfs.internally_reserved > kb_used) {
615 statfs.internally_reserved -= kb_used;
616 } else {
617 statfs.internally_reserved = 0;
618 }
619 statfs.allocated = kb_used_data << 10;
620 statfs.omap_allocated = kb_used_omap << 10;
621 statfs.internal_metadata = kb_used_meta << 10;
622 }
623 if (struct_v >= 10) {
624 decode(os_alerts, bl);
625 } else {
626 os_alerts.clear();
627 }
628 if (struct_v >= 11) {
629 decode(num_shards_repaired, bl);
630 } else {
631 num_shards_repaired = 0;
632 }
633 if (struct_v >= 12) {
634 decode(num_osds, bl);
635 decode(num_per_pool_osds, bl);
636 } else {
637 num_osds = 0;
638 num_per_pool_osds = 0;
639 }
640 if (struct_v >= 13) {
641 decode(num_per_pool_omap_osds, bl);
642 } else {
643 num_per_pool_omap_osds = 0;
644 }
645 hb_pingtime.clear();
646 if (struct_v >= 14) {
647 int count;
648 decode(count, bl);
649 for (int i = 0 ; i < count ; i++) {
650 int osd;
651 decode(osd, bl);
652 struct Interfaces ifs;
653 decode(ifs.last_update, bl);
654 decode(ifs.back_pingtime[0],bl);
655 decode(ifs.back_pingtime[1], bl);
656 decode(ifs.back_pingtime[2], bl);
657 decode(ifs.back_min[0],bl);
658 decode(ifs.back_min[1], bl);
659 decode(ifs.back_min[2], bl);
660 decode(ifs.back_max[0],bl);
661 decode(ifs.back_max[1], bl);
662 decode(ifs.back_max[2], bl);
663 decode(ifs.back_last, bl);
664 decode(ifs.front_pingtime[0], bl);
665 decode(ifs.front_pingtime[1], bl);
666 decode(ifs.front_pingtime[2], bl);
667 decode(ifs.front_min[0], bl);
668 decode(ifs.front_min[1], bl);
669 decode(ifs.front_min[2], bl);
670 decode(ifs.front_max[0], bl);
671 decode(ifs.front_max[1], bl);
672 decode(ifs.front_max[2], bl);
673 decode(ifs.front_last, bl);
674 hb_pingtime[osd] = ifs;
675 }
676 }
677 DECODE_FINISH(bl);
678 }
679
680 void osd_stat_t::generate_test_instances(std::list<osd_stat_t*>& o)
681 {
682 o.push_back(new osd_stat_t);
683
684 o.push_back(new osd_stat_t);
685 list<store_statfs_t*> ll;
686 store_statfs_t::generate_test_instances(ll);
687 o.back()->statfs = *ll.back();
688 o.back()->hb_peers.push_back(7);
689 o.back()->snap_trim_queue_len = 8;
690 o.back()->num_snap_trimming = 99;
691 o.back()->num_shards_repaired = 101;
692 o.back()->os_alerts[0].emplace(
693 "some alert", "some alert details");
694 o.back()->os_alerts[1].emplace(
695 "some alert2", "some alert2 details");
696 struct Interfaces gen_interfaces = {
697 123456789, { 1000, 900, 800 }, { 990, 890, 790 }, { 1010, 910, 810 }, 1001,
698 { 1100, 1000, 900 }, { 1090, 990, 890 }, { 1110, 1010, 910 }, 1101 };
699 o.back()->hb_pingtime[20] = gen_interfaces;
700 gen_interfaces = {
701 987654321, { 100, 200, 300 }, { 90, 190, 290 }, { 110, 210, 310 }, 101 };
702 o.back()->hb_pingtime[30] = gen_interfaces;
703 }
704
705 // -- pg_t --
706
707 int pg_t::print(char *o, int maxlen) const
708 {
709 return snprintf(o, maxlen, "%llu.%x", (unsigned long long)pool(), ps());
710 }
711
712 bool pg_t::parse(const char *s)
713 {
714 uint64_t ppool;
715 uint32_t pseed;
716 int r = sscanf(s, "%llu.%x", (long long unsigned *)&ppool, &pseed);
717 if (r < 2)
718 return false;
719 m_pool = ppool;
720 m_seed = pseed;
721 return true;
722 }
723
724 bool spg_t::parse(const char *s)
725 {
726 shard = shard_id_t::NO_SHARD;
727 uint64_t ppool;
728 uint32_t pseed;
729 uint32_t pshard;
730 int r = sscanf(s, "%llu.%x", (long long unsigned *)&ppool, &pseed);
731 if (r < 2)
732 return false;
733 pgid.set_pool(ppool);
734 pgid.set_ps(pseed);
735
736 const char *p = strchr(s, 's');
737 if (p) {
738 r = sscanf(p, "s%u", &pshard);
739 if (r == 1) {
740 shard = shard_id_t(pshard);
741 } else {
742 return false;
743 }
744 }
745 return true;
746 }
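// Example (editor's note, matching the print/parse formats above): pg_t uses
// "<pool>.<hex seed>", e.g. "1.7f"; spg_t appends the shard when it is not
// NO_SHARD (typically erasure-coded pgs), e.g. "1.7fs2", while replicated pgs
// keep the plain "1.7f" form.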
747
748 char *spg_t::calc_name(char *buf, const char *suffix_backwords) const
749 {
750 while (*suffix_backwords)
751 *--buf = *suffix_backwords++;
752
753 if (!is_no_shard()) {
754 buf = ritoa<uint8_t, 10>((uint8_t)shard.id, buf);
755 *--buf = 's';
756 }
757
758 return pgid.calc_name(buf, "");
759 }
760
761 ostream& operator<<(ostream& out, const spg_t &pg)
762 {
763 char buf[spg_t::calc_name_buf_size];
764 buf[spg_t::calc_name_buf_size - 1] = '\0';
765 out << pg.calc_name(buf + spg_t::calc_name_buf_size - 1, "");
766 return out;
767 }
768
769 pg_t pg_t::get_ancestor(unsigned old_pg_num) const
770 {
771 int old_bits = cbits(old_pg_num);
772 int old_mask = (1 << old_bits) - 1;
773 pg_t ret = *this;
774 ret.m_seed = ceph_stable_mod(m_seed, old_pg_num, old_mask);
775 return ret;
776 }
777
778 bool pg_t::is_split(unsigned old_pg_num, unsigned new_pg_num, set<pg_t> *children) const
779 {
780 //ceph_assert(m_seed < old_pg_num);
781 if (m_seed >= old_pg_num) {
782 // degenerate case
783 return false;
784 }
785 if (new_pg_num <= old_pg_num)
786 return false;
787
788 bool split = false;
789 if (true) {
790 unsigned old_bits = cbits(old_pg_num);
791 unsigned old_mask = (1 << old_bits) - 1;
792 for (unsigned n = 1; ; n++) {
793 unsigned next_bit = (n << (old_bits-1));
794 unsigned s = next_bit | m_seed;
795
796 if (s < old_pg_num || s == m_seed)
797 continue;
798 if (s >= new_pg_num)
799 break;
800 if ((unsigned)ceph_stable_mod(s, old_pg_num, old_mask) == m_seed) {
801 split = true;
802 if (children)
803 children->insert(pg_t(s, m_pool));
804 }
805 }
806 }
807 if (false) {
808 // brute force
809 int old_bits = cbits(old_pg_num);
810 int old_mask = (1 << old_bits) - 1;
811 for (unsigned x = old_pg_num; x < new_pg_num; ++x) {
812 unsigned o = ceph_stable_mod(x, old_pg_num, old_mask);
813 if (o == m_seed) {
814 split = true;
815 children->insert(pg_t(x, m_pool));
816 }
817 }
818 }
819 return split;
820 }
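// Worked example (editor's note): with old_pg_num=4, pg x.0 is split when
// new_pg_num=8 into the single child x.4; growing further to new_pg_num=16
// adds children x.8 and x.c -- each child keeps the parent's low seed bits
// and differs only in the newly significant high bits.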
821
822 unsigned pg_t::get_split_bits(unsigned pg_num) const {
823 if (pg_num == 1)
824 return 0;
825 ceph_assert(pg_num > 1);
826
827 // Find unique p such that pg_num \in [2^(p-1), 2^p)
828 unsigned p = cbits(pg_num);
829 ceph_assert(p); // silence coverity #751330
830
831 if ((m_seed % (1<<(p-1))) < (pg_num % (1<<(p-1))))
832 return p;
833 else
834 return p - 1;
835 }
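// Worked example (editor's note): for pg_num=12, p=4; seeds whose low three
// bits are 0..3 have already split into the upper half and therefore use 4
// bits (a 1/16 slice of the hash space), while seeds 4..7 still use 3 bits
// (a 1/8 slice).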
836
837 bool pg_t::is_merge_source(
838 unsigned old_pg_num,
839 unsigned new_pg_num,
840 pg_t *parent) const
841 {
842 if (m_seed < old_pg_num &&
843 m_seed >= new_pg_num) {
844 if (parent) {
845 pg_t t = *this;
846 while (t.m_seed >= new_pg_num) {
847 t = t.get_parent();
848 }
849 *parent = t;
850 }
851 return true;
852 }
853 return false;
854 }
855
856 pg_t pg_t::get_parent() const
857 {
858 unsigned bits = cbits(m_seed);
859 ceph_assert(bits);
860 pg_t retval = *this;
861 retval.m_seed &= ~((~0)<<(bits - 1));
862 return retval;
863 }
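// Worked example (editor's note): the parent of pg x.c (seed 0b1100) is x.4,
// obtained by clearing the topmost set bit; so when pg_num shrinks from 16 to
// 8, is_merge_source() reports x.c as a source merging into target x.4.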
864
865 hobject_t pg_t::get_hobj_start() const
866 {
867 return hobject_t(object_t(), string(), 0, m_seed, m_pool,
868 string());
869 }
870
871 hobject_t pg_t::get_hobj_end(unsigned pg_num) const
872 {
873 // note: this assumes a bitwise sort; with the legacy nibblewise
874 // sort, a PG did not always cover a single contiguous range of the
875 // (bit-reversed) hash space.
876 unsigned bits = get_split_bits(pg_num);
877 uint64_t rev_start = hobject_t::_reverse_bits(m_seed);
878 uint64_t rev_end = (rev_start | (0xffffffff >> bits)) + 1;
879 if (rev_end >= 0x100000000) {
880 ceph_assert(rev_end == 0x100000000);
881 return hobject_t::get_max();
882 } else {
883 return hobject_t(object_t(), string(), CEPH_NOSNAP,
884 hobject_t::_reverse_bits(rev_end), m_pool,
885 string());
886 }
887 }
888
889 void pg_t::dump(Formatter *f) const
890 {
891 f->dump_unsigned("pool", m_pool);
892 f->dump_unsigned("seed", m_seed);
893 }
894
895 void pg_t::generate_test_instances(list<pg_t*>& o)
896 {
897 o.push_back(new pg_t);
898 o.push_back(new pg_t(1, 2));
899 o.push_back(new pg_t(13123, 3));
900 o.push_back(new pg_t(131223, 4));
901 }
902
903 char *pg_t::calc_name(char *buf, const char *suffix_backwords) const
904 {
905 while (*suffix_backwords)
906 *--buf = *suffix_backwords++;
907
908 buf = ritoa<uint32_t, 16>(m_seed, buf);
909
910 *--buf = '.';
911
912 return ritoa<uint64_t, 10>(m_pool, buf);
913 }
914
915 ostream& operator<<(ostream& out, const pg_t &pg)
916 {
917 char buf[pg_t::calc_name_buf_size];
918 buf[pg_t::calc_name_buf_size - 1] = '\0';
919 out << pg.calc_name(buf + pg_t::calc_name_buf_size - 1, "");
920 return out;
921 }
922
923
924 // -- coll_t --
925
926 void coll_t::calc_str()
927 {
928 switch (type) {
929 case TYPE_META:
930 strcpy(_str_buff, "meta");
931 _str = _str_buff;
932 break;
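// Editor's note: the suffixes below are spelled backwards ("daeh_", "PMET_")
// because spg_t::calc_name() fills its buffer from the end toward the front,
// so they come out as "_head" and "_TEMP" in the final collection name.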
933 case TYPE_PG:
934 _str_buff[spg_t::calc_name_buf_size - 1] = '\0';
935 _str = pgid.calc_name(_str_buff + spg_t::calc_name_buf_size - 1, "daeh_");
936 break;
937 case TYPE_PG_TEMP:
938 _str_buff[spg_t::calc_name_buf_size - 1] = '\0';
939 _str = pgid.calc_name(_str_buff + spg_t::calc_name_buf_size - 1, "PMET_");
940 break;
941 default:
942 ceph_abort_msg("unknown collection type");
943 }
944 }
945
946 bool coll_t::parse(const std::string& s)
947 {
948 if (s == "meta") {
949 type = TYPE_META;
950 pgid = spg_t();
951 removal_seq = 0;
952 calc_str();
953 ceph_assert(s == _str);
954 return true;
955 }
956 if (s.find("_head") == s.length() - 5 &&
957 pgid.parse(s.substr(0, s.length() - 5))) {
958 type = TYPE_PG;
959 removal_seq = 0;
960 calc_str();
961 ceph_assert(s == _str);
962 return true;
963 }
964 if (s.find("_TEMP") == s.length() - 5 &&
965 pgid.parse(s.substr(0, s.length() - 5))) {
966 type = TYPE_PG_TEMP;
967 removal_seq = 0;
968 calc_str();
969 ceph_assert(s == _str);
970 return true;
971 }
972 return false;
973 }
974
975 void coll_t::encode(ceph::buffer::list& bl) const
976 {
977 using ceph::encode;
978 // when changing this, remember to update encoded_size() too.
979 if (is_temp()) {
980 // can't express this as v2...
981 __u8 struct_v = 3;
982 encode(struct_v, bl);
983 encode(to_str(), bl);
984 } else {
985 __u8 struct_v = 2;
986 encode(struct_v, bl);
987 encode((__u8)type, bl);
988 encode(pgid, bl);
989 snapid_t snap = CEPH_NOSNAP;
990 encode(snap, bl);
991 }
992 }
993
994 size_t coll_t::encoded_size() const
995 {
996 size_t r = sizeof(__u8);
997 if (is_temp()) {
998 // v3
999 r += sizeof(__u32);
1000 if (_str) {
1001 r += strlen(_str);
1002 }
1003 } else {
1004 // v2
1005 // 1. type
1006 r += sizeof(__u8);
1007 // 2. pgid
1008 // - encoding header
1009 r += sizeof(ceph_le32) + 2 * sizeof(__u8);
1010 // - pg_t
1011 r += sizeof(__u8) + sizeof(uint64_t) + 2 * sizeof(uint32_t);
1012 // - shard_id_t
1013 r += sizeof(int8_t);
1014 // 3. snapid_t
1015 r += sizeof(uint64_t);
1016 }
1017
1018 return r;
1019 }
1020
1021 void coll_t::decode(ceph::buffer::list::const_iterator& bl)
1022 {
1023 using ceph::decode;
1024 __u8 struct_v;
1025 decode(struct_v, bl);
1026 switch (struct_v) {
1027 case 1:
1028 {
1029 snapid_t snap;
1030 decode(pgid, bl);
1031 decode(snap, bl);
1032
1033 // infer the type
1034 if (pgid == spg_t() && snap == 0) {
1035 type = TYPE_META;
1036 } else {
1037 type = TYPE_PG;
1038 }
1039 removal_seq = 0;
1040 }
1041 break;
1042
1043 case 2:
1044 {
1045 __u8 _type;
1046 snapid_t snap;
1047 decode(_type, bl);
1048 decode(pgid, bl);
1049 decode(snap, bl);
1050 type = (type_t)_type;
1051 removal_seq = 0;
1052 }
1053 break;
1054
1055 case 3:
1056 {
1057 string str;
1058 decode(str, bl);
1059 bool ok = parse(str);
1060 if (!ok)
1061 throw std::domain_error(std::string("unable to parse pg ") + str);
1062 }
1063 break;
1064
1065 default:
1066 {
1067 CachedStackStringStream css;
1068 *css << "coll_t::decode(): don't know how to decode version "
1069 << struct_v;
1070 throw std::domain_error(css->str());
1071 }
1072 }
1073 }
1074
1075 void coll_t::dump(Formatter *f) const
1076 {
1077 f->dump_unsigned("type_id", (unsigned)type);
1078 if (type != TYPE_META)
1079 f->dump_stream("pgid") << pgid;
1080 f->dump_string("name", to_str());
1081 }
1082
1083 void coll_t::generate_test_instances(list<coll_t*>& o)
1084 {
1085 o.push_back(new coll_t());
1086 o.push_back(new coll_t(spg_t(pg_t(1, 0), shard_id_t::NO_SHARD)));
1087 o.push_back(new coll_t(o.back()->get_temp()));
1088 o.push_back(new coll_t(spg_t(pg_t(3, 2), shard_id_t(12))));
1089 o.push_back(new coll_t(o.back()->get_temp()));
1090 o.push_back(new coll_t());
1091 }
1092
1093 // ---
1094
1095 std::string pg_vector_string(const vector<int32_t> &a)
1096 {
1097 CachedStackStringStream css;
1098 *css << "[";
1099 for (auto i = a.cbegin(); i != a.cend(); ++i) {
1100 if (i != a.begin())
1101 *css << ",";
1102 if (*i != CRUSH_ITEM_NONE)
1103 *css << *i;
1104 else
1105 *css << "NONE";
1106 }
1107 *css << "]";
1108 return css->str();
1109 }
1110
1111 std::string pg_state_string(uint64_t state)
1112 {
1113 CachedStackStringStream css;
1114 if (state & PG_STATE_STALE)
1115 *css << "stale+";
1116 if (state & PG_STATE_CREATING)
1117 *css << "creating+";
1118 if (state & PG_STATE_ACTIVE)
1119 *css << "active+";
1120 if (state & PG_STATE_ACTIVATING)
1121 *css << "activating+";
1122 if (state & PG_STATE_CLEAN)
1123 *css << "clean+";
1124 if (state & PG_STATE_RECOVERY_WAIT)
1125 *css << "recovery_wait+";
1126 if (state & PG_STATE_RECOVERY_TOOFULL)
1127 *css << "recovery_toofull+";
1128 if (state & PG_STATE_RECOVERING)
1129 *css << "recovering+";
1130 if (state & PG_STATE_FORCED_RECOVERY)
1131 *css << "forced_recovery+";
1132 if (state & PG_STATE_DOWN)
1133 *css << "down+";
1134 if (state & PG_STATE_RECOVERY_UNFOUND)
1135 *css << "recovery_unfound+";
1136 if (state & PG_STATE_BACKFILL_UNFOUND)
1137 *css << "backfill_unfound+";
1138 if (state & PG_STATE_UNDERSIZED)
1139 *css << "undersized+";
1140 if (state & PG_STATE_DEGRADED)
1141 *css << "degraded+";
1142 if (state & PG_STATE_REMAPPED)
1143 *css << "remapped+";
1144 if (state & PG_STATE_PREMERGE)
1145 *css << "premerge+";
1146 if (state & PG_STATE_SCRUBBING)
1147 *css << "scrubbing+";
1148 if (state & PG_STATE_DEEP_SCRUB)
1149 *css << "deep+";
1150 if (state & PG_STATE_INCONSISTENT)
1151 *css << "inconsistent+";
1152 if (state & PG_STATE_PEERING)
1153 *css << "peering+";
1154 if (state & PG_STATE_REPAIR)
1155 *css << "repair+";
1156 if (state & PG_STATE_BACKFILL_WAIT)
1157 *css << "backfill_wait+";
1158 if (state & PG_STATE_BACKFILLING)
1159 *css << "backfilling+";
1160 if (state & PG_STATE_FORCED_BACKFILL)
1161 *css << "forced_backfill+";
1162 if (state & PG_STATE_BACKFILL_TOOFULL)
1163 *css << "backfill_toofull+";
1164 if (state & PG_STATE_INCOMPLETE)
1165 *css << "incomplete+";
1166 if (state & PG_STATE_PEERED)
1167 *css << "peered+";
1168 if (state & PG_STATE_SNAPTRIM)
1169 *css << "snaptrim+";
1170 if (state & PG_STATE_SNAPTRIM_WAIT)
1171 *css << "snaptrim_wait+";
1172 if (state & PG_STATE_SNAPTRIM_ERROR)
1173 *css << "snaptrim_error+";
1174 if (state & PG_STATE_FAILED_REPAIR)
1175 *css << "failed_repair+";
1176 if (state & PG_STATE_LAGGY)
1177 *css << "laggy+";
1178 if (state & PG_STATE_WAIT)
1179 *css << "wait+";
1180 auto ret = css->str();
1181 if (ret.length() > 0)
1182 ret.resize(ret.length() - 1);
1183 else
1184 ret = "unknown";
1185 return ret;
1186 }
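// Example (editor's note): pg_state_string(PG_STATE_ACTIVE | PG_STATE_CLEAN)
// returns "active+clean" (the trailing '+' is trimmed), and a state of 0
// returns "unknown".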
1187
1188 std::optional<uint64_t> pg_string_state(const std::string& state)
1189 {
1190 std::optional<uint64_t> type;
1191 if (state == "active")
1192 type = PG_STATE_ACTIVE;
1193 else if (state == "clean")
1194 type = PG_STATE_CLEAN;
1195 else if (state == "down")
1196 type = PG_STATE_DOWN;
1197 else if (state == "recovery_unfound")
1198 type = PG_STATE_RECOVERY_UNFOUND;
1199 else if (state == "backfill_unfound")
1200 type = PG_STATE_BACKFILL_UNFOUND;
1201 else if (state == "premerge")
1202 type = PG_STATE_PREMERGE;
1203 else if (state == "scrubbing")
1204 type = PG_STATE_SCRUBBING;
1205 else if (state == "degraded")
1206 type = PG_STATE_DEGRADED;
1207 else if (state == "inconsistent")
1208 type = PG_STATE_INCONSISTENT;
1209 else if (state == "peering")
1210 type = PG_STATE_PEERING;
1211 else if (state == "repair")
1212 type = PG_STATE_REPAIR;
1213 else if (state == "recovering")
1214 type = PG_STATE_RECOVERING;
1215 else if (state == "forced_recovery")
1216 type = PG_STATE_FORCED_RECOVERY;
1217 else if (state == "backfill_wait")
1218 type = PG_STATE_BACKFILL_WAIT;
1219 else if (state == "incomplete")
1220 type = PG_STATE_INCOMPLETE;
1221 else if (state == "stale")
1222 type = PG_STATE_STALE;
1223 else if (state == "remapped")
1224 type = PG_STATE_REMAPPED;
1225 else if (state == "deep")
1226 type = PG_STATE_DEEP_SCRUB;
1227 else if (state == "backfilling")
1228 type = PG_STATE_BACKFILLING;
1229 else if (state == "forced_backfill")
1230 type = PG_STATE_FORCED_BACKFILL;
1231 else if (state == "backfill_toofull")
1232 type = PG_STATE_BACKFILL_TOOFULL;
1233 else if (state == "recovery_wait")
1234 type = PG_STATE_RECOVERY_WAIT;
1235 else if (state == "recovery_toofull")
1236 type = PG_STATE_RECOVERY_TOOFULL;
1237 else if (state == "undersized")
1238 type = PG_STATE_UNDERSIZED;
1239 else if (state == "activating")
1240 type = PG_STATE_ACTIVATING;
1241 else if (state == "peered")
1242 type = PG_STATE_PEERED;
1243 else if (state == "snaptrim")
1244 type = PG_STATE_SNAPTRIM;
1245 else if (state == "snaptrim_wait")
1246 type = PG_STATE_SNAPTRIM_WAIT;
1247 else if (state == "snaptrim_error")
1248 type = PG_STATE_SNAPTRIM_ERROR;
1249 else if (state == "creating")
1250 type = PG_STATE_CREATING;
1251 else if (state == "failed_repair")
1252 type = PG_STATE_FAILED_REPAIR;
1253 else if (state == "laggy")
1254 type = PG_STATE_LAGGY;
1255 else if (state == "wait")
1256 type = PG_STATE_WAIT;
1257 else if (state == "unknown")
1258 type = 0;
1259 else
1260 type = std::nullopt;
1261 return type;
1262 }
1263
1264 // -- eversion_t --
1265 string eversion_t::get_key_name() const
1266 {
1267 std::string key(32, ' ');
1268 get_key_name(&key[0]);
1269 key.resize(31); // remove the null terminator
1270 return key;
1271 }
1272
1273 // -- pool_snap_info_t --
1274 void pool_snap_info_t::dump(Formatter *f) const
1275 {
1276 f->dump_unsigned("snapid", snapid);
1277 f->dump_stream("stamp") << stamp;
1278 f->dump_string("name", name);
1279 }
1280
1281 void pool_snap_info_t::encode(ceph::buffer::list& bl, uint64_t features) const
1282 {
1283 using ceph::encode;
1284 if ((features & CEPH_FEATURE_PGPOOL3) == 0) {
1285 __u8 struct_v = 1;
1286 encode(struct_v, bl);
1287 encode(snapid, bl);
1288 encode(stamp, bl);
1289 encode(name, bl);
1290 return;
1291 }
1292 ENCODE_START(2, 2, bl);
1293 encode(snapid, bl);
1294 encode(stamp, bl);
1295 encode(name, bl);
1296 ENCODE_FINISH(bl);
1297 }
1298
1299 void pool_snap_info_t::decode(ceph::buffer::list::const_iterator& bl)
1300 {
1301 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
1302 decode(snapid, bl);
1303 decode(stamp, bl);
1304 decode(name, bl);
1305 DECODE_FINISH(bl);
1306 }
1307
1308 void pool_snap_info_t::generate_test_instances(list<pool_snap_info_t*>& o)
1309 {
1310 o.push_back(new pool_snap_info_t);
1311 o.push_back(new pool_snap_info_t);
1312 o.back()->snapid = 1;
1313 o.back()->stamp = utime_t(1, 2);
1314 o.back()->name = "foo";
1315 }
1316
1317 // -- pool_opts_t --
1318
1319 // The order of items in the list is important; therefore, always add
1320 // new options to the end of the list.
1321
1322 typedef std::map<std::string, pool_opts_t::opt_desc_t> opt_mapping_t;
1323 static opt_mapping_t opt_mapping = boost::assign::map_list_of
1324 ("scrub_min_interval", pool_opts_t::opt_desc_t(
1325 pool_opts_t::SCRUB_MIN_INTERVAL, pool_opts_t::DOUBLE))
1326 ("scrub_max_interval", pool_opts_t::opt_desc_t(
1327 pool_opts_t::SCRUB_MAX_INTERVAL, pool_opts_t::DOUBLE))
1328 ("deep_scrub_interval", pool_opts_t::opt_desc_t(
1329 pool_opts_t::DEEP_SCRUB_INTERVAL, pool_opts_t::DOUBLE))
1330 ("recovery_priority", pool_opts_t::opt_desc_t(
1331 pool_opts_t::RECOVERY_PRIORITY, pool_opts_t::INT))
1332 ("recovery_op_priority", pool_opts_t::opt_desc_t(
1333 pool_opts_t::RECOVERY_OP_PRIORITY, pool_opts_t::INT))
1334 ("scrub_priority", pool_opts_t::opt_desc_t(
1335 pool_opts_t::SCRUB_PRIORITY, pool_opts_t::INT))
1336 ("compression_mode", pool_opts_t::opt_desc_t(
1337 pool_opts_t::COMPRESSION_MODE, pool_opts_t::STR))
1338 ("compression_algorithm", pool_opts_t::opt_desc_t(
1339 pool_opts_t::COMPRESSION_ALGORITHM, pool_opts_t::STR))
1340 ("compression_required_ratio", pool_opts_t::opt_desc_t(
1341 pool_opts_t::COMPRESSION_REQUIRED_RATIO, pool_opts_t::DOUBLE))
1342 ("compression_max_blob_size", pool_opts_t::opt_desc_t(
1343 pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, pool_opts_t::INT))
1344 ("compression_min_blob_size", pool_opts_t::opt_desc_t(
1345 pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, pool_opts_t::INT))
1346 ("csum_type", pool_opts_t::opt_desc_t(
1347 pool_opts_t::CSUM_TYPE, pool_opts_t::INT))
1348 ("csum_max_block", pool_opts_t::opt_desc_t(
1349 pool_opts_t::CSUM_MAX_BLOCK, pool_opts_t::INT))
1350 ("csum_min_block", pool_opts_t::opt_desc_t(
1351 pool_opts_t::CSUM_MIN_BLOCK, pool_opts_t::INT))
1352 ("fingerprint_algorithm", pool_opts_t::opt_desc_t(
1353 pool_opts_t::FINGERPRINT_ALGORITHM, pool_opts_t::STR))
1354 ("pg_num_min", pool_opts_t::opt_desc_t(
1355 pool_opts_t::PG_NUM_MIN, pool_opts_t::INT))
1356 ("target_size_bytes", pool_opts_t::opt_desc_t(
1357 pool_opts_t::TARGET_SIZE_BYTES, pool_opts_t::INT))
1358 ("target_size_ratio", pool_opts_t::opt_desc_t(
1359 pool_opts_t::TARGET_SIZE_RATIO, pool_opts_t::DOUBLE))
1360 ("pg_autoscale_bias", pool_opts_t::opt_desc_t(
1361 pool_opts_t::PG_AUTOSCALE_BIAS, pool_opts_t::DOUBLE))
1362 ("read_lease_interval", pool_opts_t::opt_desc_t(
1363 pool_opts_t::READ_LEASE_INTERVAL, pool_opts_t::DOUBLE))
1364 ("dedup_tier", pool_opts_t::opt_desc_t(
1365 pool_opts_t::DEDUP_TIER, pool_opts_t::INT))
1366 ("dedup_chunk_algorithm", pool_opts_t::opt_desc_t(
1367 pool_opts_t::DEDUP_CHUNK_ALGORITHM, pool_opts_t::STR))
1368 ("dedup_cdc_chunk_size", pool_opts_t::opt_desc_t(
1369 pool_opts_t::DEDUP_CDC_CHUNK_SIZE, pool_opts_t::INT))
1370 ("pg_num_max", pool_opts_t::opt_desc_t(
1371 pool_opts_t::PG_NUM_MAX, pool_opts_t::INT));
1372
1373 bool pool_opts_t::is_opt_name(const std::string& name)
1374 {
1375 return opt_mapping.count(name);
1376 }
1377
1378 pool_opts_t::opt_desc_t pool_opts_t::get_opt_desc(const std::string& name)
1379 {
1380 auto i = opt_mapping.find(name);
1381 ceph_assert(i != opt_mapping.end());
1382 return i->second;
1383 }
1384
1385 bool pool_opts_t::is_set(pool_opts_t::key_t key) const
1386 {
1387 return opts.count(key);
1388 }
1389
1390 const pool_opts_t::value_t& pool_opts_t::get(pool_opts_t::key_t key) const
1391 {
1392 auto i = opts.find(key);
1393 ceph_assert(i != opts.end());
1394 return i->second;
1395 }
1396
1397 bool pool_opts_t::unset(pool_opts_t::key_t key) {
1398 return opts.erase(key) > 0;
1399 }
1400
1401 class pool_opts_dumper_t : public boost::static_visitor<> {
1402 public:
1403 pool_opts_dumper_t(const std::string& name_, Formatter* f_) :
1404 name(name_.c_str()), f(f_) {}
1405
1406 void operator()(std::string s) const {
1407 f->dump_string(name, s);
1408 }
1409 void operator()(int64_t i) const {
1410 f->dump_int(name, i);
1411 }
1412 void operator()(double d) const {
1413 f->dump_float(name, d);
1414 }
1415
1416 private:
1417 const char* name;
1418 Formatter* f;
1419 };
1420
1421 void pool_opts_t::dump(const std::string& name, Formatter* f) const
1422 {
1423 const opt_desc_t& desc = get_opt_desc(name);
1424 auto i = opts.find(desc.key);
1425 if (i == opts.end()) {
1426 return;
1427 }
1428 boost::apply_visitor(pool_opts_dumper_t(name, f), i->second);
1429 }
1430
1431 void pool_opts_t::dump(Formatter* f) const
1432 {
1433 for (auto i = opt_mapping.cbegin(); i != opt_mapping.cend(); ++i) {
1434 const std::string& name = i->first;
1435 const opt_desc_t& desc = i->second;
1436 auto j = opts.find(desc.key);
1437 if (j == opts.end()) {
1438 continue;
1439 }
1440 boost::apply_visitor(pool_opts_dumper_t(name, f), j->second);
1441 }
1442 }
1443
1444 class pool_opts_encoder_t : public boost::static_visitor<> {
1445 public:
1446 explicit pool_opts_encoder_t(ceph::buffer::list& bl_, uint64_t features)
1447 : bl(bl_),
1448 features(features) {}
1449
1450 void operator()(const std::string &s) const {
1451 encode(static_cast<int32_t>(pool_opts_t::STR), bl);
1452 encode(s, bl);
1453 }
1454 void operator()(int64_t i) const {
1455 encode(static_cast<int32_t>(pool_opts_t::INT), bl);
1456 if (HAVE_FEATURE(features, SERVER_NAUTILUS)) {
1457 encode(i, bl);
1458 } else {
1459 encode(static_cast<int32_t>(i), bl);
1460 }
1461 }
1462 void operator()(double d) const {
1463 encode(static_cast<int32_t>(pool_opts_t::DOUBLE), bl);
1464 encode(d, bl);
1465 }
1466
1467 private:
1468 ceph::buffer::list& bl;
1469 uint64_t features;
1470 };
1471
1472 void pool_opts_t::encode(ceph::buffer::list& bl, uint64_t features) const
1473 {
1474 unsigned v = 2;
1475 if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
1476 v = 1;
1477 }
1478 ENCODE_START(v, 1, bl);
1479 uint32_t n = static_cast<uint32_t>(opts.size());
1480 encode(n, bl);
1481 for (auto i = opts.cbegin(); i != opts.cend(); ++i) {
1482 encode(static_cast<int32_t>(i->first), bl);
1483 boost::apply_visitor(pool_opts_encoder_t(bl, features), i->second);
1484 }
1485 ENCODE_FINISH(bl);
1486 }
1487
1488 void pool_opts_t::decode(ceph::buffer::list::const_iterator& bl)
1489 {
1490 DECODE_START(1, bl);
1491 __u32 n;
1492 decode(n, bl);
1493 opts.clear();
1494 while (n--) {
1495 int32_t k, t;
1496 decode(k, bl);
1497 decode(t, bl);
1498 if (t == STR) {
1499 std::string s;
1500 decode(s, bl);
1501 opts[static_cast<key_t>(k)] = s;
1502 } else if (t == INT) {
1503 int64_t i;
1504 if (struct_v >= 2) {
1505 decode(i, bl);
1506 } else {
1507 int ii;
1508 decode(ii, bl);
1509 i = ii;
1510 }
1511 opts[static_cast<key_t>(k)] = i;
1512 } else if (t == DOUBLE) {
1513 double d;
1514 decode(d, bl);
1515 opts[static_cast<key_t>(k)] = d;
1516 } else {
1517 ceph_assert(!"invalid type");
1518 }
1519 }
1520 DECODE_FINISH(bl);
1521 }
1522
1523 ostream& operator<<(ostream& out, const pool_opts_t& opts)
1524 {
1525 for (auto i = opt_mapping.begin(); i != opt_mapping.end(); ++i) {
1526 const std::string& name = i->first;
1527 const pool_opts_t::opt_desc_t& desc = i->second;
1528 auto j = opts.opts.find(desc.key);
1529 if (j == opts.opts.end()) {
1530 continue;
1531 }
1532 out << " " << name << " " << j->second;
1533 }
1534 return out;
1535 }
1536
1537 // -- pg_pool_t --
1538
1539 const char *pg_pool_t::APPLICATION_NAME_CEPHFS("cephfs");
1540 const char *pg_pool_t::APPLICATION_NAME_RBD("rbd");
1541 const char *pg_pool_t::APPLICATION_NAME_RGW("rgw");
1542
1543 void pg_pool_t::dump(Formatter *f) const
1544 {
1545 f->dump_stream("create_time") << get_create_time();
1546 f->dump_unsigned("flags", get_flags());
1547 f->dump_string("flags_names", get_flags_string());
1548 f->dump_int("type", get_type());
1549 f->dump_int("size", get_size());
1550 f->dump_int("min_size", get_min_size());
1551 f->dump_int("crush_rule", get_crush_rule());
1552 f->dump_int("peering_crush_bucket_count", peering_crush_bucket_count);
1553 f->dump_int("peering_crush_bucket_target", peering_crush_bucket_target);
1554 f->dump_int("peering_crush_bucket_barrier", peering_crush_bucket_barrier);
1555 f->dump_int("peering_crush_bucket_mandatory_member", peering_crush_mandatory_member);
1556 f->dump_int("object_hash", get_object_hash());
1557 f->dump_string("pg_autoscale_mode",
1558 get_pg_autoscale_mode_name(pg_autoscale_mode));
1559 f->dump_unsigned("pg_num", get_pg_num());
1560 f->dump_unsigned("pg_placement_num", get_pgp_num());
1561 f->dump_unsigned("pg_placement_num_target", get_pgp_num_target());
1562 f->dump_unsigned("pg_num_target", get_pg_num_target());
1563 f->dump_unsigned("pg_num_pending", get_pg_num_pending());
1564 f->dump_object("last_pg_merge_meta", last_pg_merge_meta);
1565 f->dump_stream("last_change") << get_last_change();
1566 f->dump_stream("last_force_op_resend") << get_last_force_op_resend();
1567 f->dump_stream("last_force_op_resend_prenautilus")
1568 << get_last_force_op_resend_prenautilus();
1569 f->dump_stream("last_force_op_resend_preluminous")
1570 << get_last_force_op_resend_preluminous();
1571 f->dump_unsigned("auid", get_auid());
1572 f->dump_string("snap_mode", is_pool_snaps_mode() ? "pool" : "selfmanaged");
1573 f->dump_unsigned("snap_seq", get_snap_seq());
1574 f->dump_unsigned("snap_epoch", get_snap_epoch());
1575 f->open_array_section("pool_snaps");
1576 for (auto p = snaps.cbegin(); p != snaps.cend(); ++p) {
1577 f->open_object_section("pool_snap_info");
1578 p->second.dump(f);
1579 f->close_section();
1580 }
1581 f->close_section();
1582 f->dump_stream("removed_snaps") << removed_snaps;
1583 f->dump_unsigned("quota_max_bytes", quota_max_bytes);
1584 f->dump_unsigned("quota_max_objects", quota_max_objects);
1585 f->open_array_section("tiers");
1586 for (auto p = tiers.cbegin(); p != tiers.cend(); ++p)
1587 f->dump_unsigned("pool_id", *p);
1588 f->close_section();
1589 f->dump_int("tier_of", tier_of);
1590 f->dump_int("read_tier", read_tier);
1591 f->dump_int("write_tier", write_tier);
1592 f->dump_string("cache_mode", get_cache_mode_name());
1593 f->dump_unsigned("target_max_bytes", target_max_bytes);
1594 f->dump_unsigned("target_max_objects", target_max_objects);
1595 f->dump_unsigned("cache_target_dirty_ratio_micro",
1596 cache_target_dirty_ratio_micro);
1597 f->dump_unsigned("cache_target_dirty_high_ratio_micro",
1598 cache_target_dirty_high_ratio_micro);
1599 f->dump_unsigned("cache_target_full_ratio_micro",
1600 cache_target_full_ratio_micro);
1601 f->dump_unsigned("cache_min_flush_age", cache_min_flush_age);
1602 f->dump_unsigned("cache_min_evict_age", cache_min_evict_age);
1603 f->dump_string("erasure_code_profile", erasure_code_profile);
1604 f->open_object_section("hit_set_params");
1605 hit_set_params.dump(f);
1606 f->close_section(); // hit_set_params
1607 f->dump_unsigned("hit_set_period", hit_set_period);
1608 f->dump_unsigned("hit_set_count", hit_set_count);
1609 f->dump_bool("use_gmt_hitset", use_gmt_hitset);
1610 f->dump_unsigned("min_read_recency_for_promote", min_read_recency_for_promote);
1611 f->dump_unsigned("min_write_recency_for_promote", min_write_recency_for_promote);
1612 f->dump_unsigned("hit_set_grade_decay_rate", hit_set_grade_decay_rate);
1613 f->dump_unsigned("hit_set_search_last_n", hit_set_search_last_n);
1614 f->open_array_section("grade_table");
1615 for (unsigned i = 0; i < hit_set_count; ++i)
1616 f->dump_unsigned("value", get_grade(i));
1617 f->close_section();
1618 f->dump_unsigned("stripe_width", get_stripe_width());
1619 f->dump_unsigned("expected_num_objects", expected_num_objects);
1620 f->dump_bool("fast_read", fast_read);
1621 f->open_object_section("options");
1622 opts.dump(f);
1623 f->close_section(); // options
1624 f->open_object_section("application_metadata");
1625 for (auto &app_pair : application_metadata) {
1626 f->open_object_section(app_pair.first.c_str());
1627 for (auto &kv_pair : app_pair.second) {
1628 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
1629 }
1630 f->close_section(); // application
1631 }
1632 f->close_section(); // application_metadata
1633 }
1634
1635 void pg_pool_t::convert_to_pg_shards(const vector<int> &from, set<pg_shard_t>* to) const {
1636 for (size_t i = 0; i < from.size(); ++i) {
1637 if (from[i] != CRUSH_ITEM_NONE) {
1638 to->insert(
1639 pg_shard_t(
1640 from[i],
1641 is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
1642 }
1643 }
1644 }
1645
1646 void pg_pool_t::calc_pg_masks()
1647 {
1648 pg_num_mask = (1 << cbits(pg_num-1)) - 1;
1649 pgp_num_mask = (1 << cbits(pgp_num-1)) - 1;
1650 }
1651
1652 unsigned pg_pool_t::get_pg_num_divisor(pg_t pgid) const
1653 {
1654 if (pg_num == pg_num_mask + 1)
1655 return pg_num; // power-of-2 split
1656 unsigned mask = pg_num_mask >> 1;
1657 if ((pgid.ps() & mask) < (pg_num & mask))
1658 return pg_num_mask + 1; // smaller bin size (already split)
1659 else
1660 return (pg_num_mask + 1) >> 1; // bigger bin (not yet split)
1661 }
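// Worked example (editor's note): with pg_num=12 (pg_num_mask=15), pgs whose
// ps()&7 is 0..3 have already split and get divisor 16 (each owns 1/16 of the
// hash space); pgs with ps()&7 in 4..7 get divisor 8 (a 1/8 slice).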
1662
1663 bool pg_pool_t::is_pending_merge(pg_t pgid, bool *target) const
1664 {
1665 if (pg_num_pending >= pg_num) {
1666 return false;
1667 }
1668 if (pgid.ps() >= pg_num_pending && pgid.ps() < pg_num) {
1669 if (target) {
1670 *target = false;
1671 }
1672 return true;
1673 }
1674 for (unsigned ps = pg_num_pending; ps < pg_num; ++ps) {
1675 if (pg_t(ps, pgid.pool()).get_parent() == pgid) {
1676 if (target) {
1677 *target = true;
1678 }
1679 return true;
1680 }
1681 }
1682 return false;
1683 }
1684
1685 /*
1686 * we have two snap modes:
1687 * - pool snaps
1688 * - snap existence/non-existence defined by snaps[] and snap_seq
1689 * - user managed snaps
1690 * - existence tracked by librados user
1691 */
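// Editor's note (context not taken from this file): pool snaps are typically
// created with e.g. "ceph osd pool mksnap", while self-managed snaps are
// driven by clients such as RBD/CephFS through the librados selfmanaged-snap
// calls; add_snap() and add_unmanaged_snap() below assert that a pool only
// ever uses one of the two modes.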
1692 bool pg_pool_t::is_pool_snaps_mode() const
1693 {
1694 return has_flag(FLAG_POOL_SNAPS);
1695 }
1696
1697 bool pg_pool_t::is_unmanaged_snaps_mode() const
1698 {
1699 return has_flag(FLAG_SELFMANAGED_SNAPS);
1700 }
1701
1702 bool pg_pool_t::is_removed_snap(snapid_t s) const
1703 {
1704 if (is_pool_snaps_mode())
1705 return s <= get_snap_seq() && snaps.count(s) == 0;
1706 else
1707 return removed_snaps.contains(s);
1708 }
1709
1710 snapid_t pg_pool_t::snap_exists(std::string_view s) const
1711 {
1712 for (auto p = snaps.cbegin(); p != snaps.cend(); ++p)
1713 if (p->second.name == s)
1714 return p->second.snapid;
1715 return 0;
1716 }
1717
1718 void pg_pool_t::add_snap(const char *n, utime_t stamp)
1719 {
1720 ceph_assert(!is_unmanaged_snaps_mode());
1721 flags |= FLAG_POOL_SNAPS;
1722 snapid_t s = get_snap_seq() + 1;
1723 snap_seq = s;
1724 snaps[s].snapid = s;
1725 snaps[s].name = n;
1726 snaps[s].stamp = stamp;
1727 }
1728
1729 uint64_t pg_pool_t::add_unmanaged_snap(bool preoctopus_compat)
1730 {
1731 ceph_assert(!is_pool_snaps_mode());
1732 if (snap_seq == 0) {
1733 if (preoctopus_compat) {
1734 // kludge for pre-mimic tracking of pool vs selfmanaged snaps. after
1735 // mimic this field is not decoded but our flag is set; pre-mimic, we
1736 // have a non-empty removed_snaps to signify a non-pool-snaps pool.
1737 removed_snaps.insert(snapid_t(1));
1738 }
1739 snap_seq = 1;
1740 }
1741 flags |= FLAG_SELFMANAGED_SNAPS;
1742 snap_seq = snap_seq + 1;
1743 return snap_seq;
1744 }
1745
1746 void pg_pool_t::remove_snap(snapid_t s)
1747 {
1748 ceph_assert(snaps.count(s));
1749 snaps.erase(s);
1750 snap_seq = snap_seq + 1;
1751 }
1752
1753 void pg_pool_t::remove_unmanaged_snap(snapid_t s, bool preoctopus_compat)
1754 {
1755 ceph_assert(is_unmanaged_snaps_mode());
1756 ++snap_seq;
1757 if (preoctopus_compat) {
1758 removed_snaps.insert(s);
1759 // also insert the new seq, to try to keep the interval_set contiguous
1760 if (!removed_snaps.contains(get_snap_seq())) {
1761 removed_snaps.insert(get_snap_seq());
1762 }
1763 }
1764 }
1765
1766 SnapContext pg_pool_t::get_snap_context() const
1767 {
1768 vector<snapid_t> s(snaps.size());
1769 unsigned i = 0;
1770 for (auto p = snaps.crbegin(); p != snaps.crend(); ++p)
1771 s[i++] = p->first;
1772 return SnapContext(get_snap_seq(), s);
1773 }
1774
1775 uint32_t pg_pool_t::hash_key(const string& key, const string& ns) const
1776 {
1777 if (ns.empty())
1778 return ceph_str_hash(object_hash, key.data(), key.length());
1779 int nsl = ns.length();
1780 int len = key.length() + nsl + 1;
1781 char buf[len];
1782 memcpy(&buf[0], ns.data(), nsl);
1783 buf[nsl] = '\037';
1784 memcpy(&buf[nsl+1], key.data(), key.length());
1785 return ceph_str_hash(object_hash, &buf[0], len);
1786 }
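// Example (editor's note): hashing key "foo" in namespace "n1" hashes the
// 7-byte buffer "n1\037foo" (namespace, 0x1f separator, key), so the same key
// in different namespaces generally maps to a different placement seed.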
1787
1788 uint32_t pg_pool_t::raw_hash_to_pg(uint32_t v) const
1789 {
1790 return ceph_stable_mod(v, pg_num, pg_num_mask);
1791 }
1792
1793 /*
1794 * map a raw pg (with full precision ps) into an actual pg, for storage
1795 */
1796 pg_t pg_pool_t::raw_pg_to_pg(pg_t pg) const
1797 {
1798 pg.set_ps(ceph_stable_mod(pg.ps(), pg_num, pg_num_mask));
1799 return pg;
1800 }
1801
1802 /*
1803 * map raw pg (full precision ps) into a placement seed. include
1804 * pool id in that value so that different pools don't use the same
1805 * seeds.
1806 */
1807 ps_t pg_pool_t::raw_pg_to_pps(pg_t pg) const
1808 {
1809 if (flags & FLAG_HASHPSPOOL) {
1810 // Hash the pool id so that pool PGs do not overlap.
1811 return
1812 crush_hash32_2(CRUSH_HASH_RJENKINS1,
1813 ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask),
1814 pg.pool());
1815 } else {
1816 // Legacy behavior; add ps and pool together. This is not a great
1817 // idea because the PGs from each pool will essentially overlap on
1818 // top of each other: 0.5 == 1.4 == 2.3 == ...
1819 return
1820 ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask) +
1821 pg.pool();
1822 }
1823 }
1824
1825 uint32_t pg_pool_t::get_random_pg_position(pg_t pg, uint32_t seed) const
1826 {
1827 uint32_t r = crush_hash32_2(CRUSH_HASH_RJENKINS1, seed, 123);
1828 if (pg_num == pg_num_mask + 1) {
1829 r &= ~pg_num_mask;
1830 } else {
1831 unsigned smaller_mask = pg_num_mask >> 1;
1832 if ((pg.ps() & smaller_mask) < (pg_num & smaller_mask)) {
1833 r &= ~pg_num_mask;
1834 } else {
1835 r &= ~smaller_mask;
1836 }
1837 }
1838 r |= pg.ps();
1839 return r;
1840 }
1841
1842 void pg_pool_t::encode(ceph::buffer::list& bl, uint64_t features) const
1843 {
1844 using ceph::encode;
1845 if ((features & CEPH_FEATURE_PGPOOL3) == 0) {
1846 // this encoding matches the old struct ceph_pg_pool
1847 __u8 struct_v = 2;
1848 encode(struct_v, bl);
1849 encode(type, bl);
1850 encode(size, bl);
1851 encode(crush_rule, bl);
1852 encode(object_hash, bl);
1853 encode(pg_num, bl);
1854 encode(pgp_num, bl);
1855 __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs.
1856 encode(lpg_num, bl);
1857 encode(lpgp_num, bl);
1858 encode(last_change, bl);
1859 encode(snap_seq, bl);
1860 encode(snap_epoch, bl);
1861
1862 __u32 n = snaps.size();
1863 encode(n, bl);
1864 n = removed_snaps.num_intervals();
1865 encode(n, bl);
1866
1867 encode(auid, bl);
1868
1869 encode_nohead(snaps, bl, features);
1870 encode_nohead(removed_snaps, bl);
1871 return;
1872 }
1873
1874 if ((features & CEPH_FEATURE_OSDENC) == 0) {
1875 __u8 struct_v = 4;
1876 encode(struct_v, bl);
1877 encode(type, bl);
1878 encode(size, bl);
1879 encode(crush_rule, bl);
1880 encode(object_hash, bl);
1881 encode(pg_num, bl);
1882 encode(pgp_num, bl);
1883 __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs.
1884 encode(lpg_num, bl);
1885 encode(lpgp_num, bl);
1886 encode(last_change, bl);
1887 encode(snap_seq, bl);
1888 encode(snap_epoch, bl);
1889 encode(snaps, bl, features);
1890 encode(removed_snaps, bl);
1891 encode(auid, bl);
1892 encode(flags, bl);
1893 encode((uint32_t)0, bl); // crash_replay_interval
1894 return;
1895 }
1896
1897 if ((features & CEPH_FEATURE_OSD_POOLRESEND) == 0) {
1898 // we simply added last_force_op_resend here, which is a fully
1899 // backward compatible change. however, encoding the same map
1900 // differently between monitors triggers scrub noise (even though
1901 // they are decodable without the feature), so let's be pedantic
1902 // about it.
1903 ENCODE_START(14, 5, bl);
1904 encode(type, bl);
1905 encode(size, bl);
1906 encode(crush_rule, bl);
1907 encode(object_hash, bl);
1908 encode(pg_num, bl);
1909 encode(pgp_num, bl);
1910 __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs.
1911 encode(lpg_num, bl);
1912 encode(lpgp_num, bl);
1913 encode(last_change, bl);
1914 encode(snap_seq, bl);
1915 encode(snap_epoch, bl);
1916 encode(snaps, bl, features);
1917 encode(removed_snaps, bl);
1918 encode(auid, bl);
1919 encode(flags, bl);
1920 encode((uint32_t)0, bl); // crash_replay_interval
1921 encode(min_size, bl);
1922 encode(quota_max_bytes, bl);
1923 encode(quota_max_objects, bl);
1924 encode(tiers, bl);
1925 encode(tier_of, bl);
1926 __u8 c = cache_mode;
1927 encode(c, bl);
1928 encode(read_tier, bl);
1929 encode(write_tier, bl);
1930 encode(properties, bl);
1931 encode(hit_set_params, bl);
1932 encode(hit_set_period, bl);
1933 encode(hit_set_count, bl);
1934 encode(stripe_width, bl);
1935 encode(target_max_bytes, bl);
1936 encode(target_max_objects, bl);
1937 encode(cache_target_dirty_ratio_micro, bl);
1938 encode(cache_target_full_ratio_micro, bl);
1939 encode(cache_min_flush_age, bl);
1940 encode(cache_min_evict_age, bl);
1941 encode(erasure_code_profile, bl);
1942 ENCODE_FINISH(bl);
1943 return;
1944 }
1945
1946 uint8_t v = 30;
1947 // NOTE: any new encoding dependencies must be reflected by
1948 // SIGNIFICANT_FEATURES
1949 if (!(features & CEPH_FEATURE_NEW_OSDOP_ENCODING)) {
1950 // this was the first post-hammer thing we added; if it's missing, encode
1951 // like hammer.
1952 v = 21;
1953 } else if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
1954 v = 24;
1955 } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
1956 v = 26;
1957 } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
1958 v = 27;
1959 } else if (!is_stretch_pool()) {
1960 v = 29;
1961 }
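// Resulting encoding versions, for reference:
//   missing NEW_OSDOP_ENCODING   -> v = 21 (hammer-compatible)
//   missing SERVER_LUMINOUS      -> v = 24
//   missing SERVER_MIMIC         -> v = 26
//   missing SERVER_NAUTILUS      -> v = 27
//   all features, non-stretch    -> v = 29
//   stretch pool                 -> v = 30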
1962
1963 ENCODE_START(v, 5, bl);
1964 encode(type, bl);
1965 encode(size, bl);
1966 encode(crush_rule, bl);
1967 encode(object_hash, bl);
1968 encode(pg_num, bl);
1969 encode(pgp_num, bl);
1970 __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs.
1971 encode(lpg_num, bl);
1972 encode(lpgp_num, bl);
1973 encode(last_change, bl);
1974 encode(snap_seq, bl);
1975 encode(snap_epoch, bl);
1976 encode(snaps, bl, features);
1977 encode(removed_snaps, bl);
1978 encode(auid, bl);
1979 if (v >= 27) {
1980 encode(flags, bl);
1981 } else {
1982 auto tmp = flags;
1983 tmp &= ~(FLAG_SELFMANAGED_SNAPS | FLAG_POOL_SNAPS | FLAG_CREATING);
1984 encode(tmp, bl);
1985 }
1986 encode((uint32_t)0, bl); // crash_replay_interval
1987 encode(min_size, bl);
1988 encode(quota_max_bytes, bl);
1989 encode(quota_max_objects, bl);
1990 encode(tiers, bl);
1991 encode(tier_of, bl);
1992 __u8 c = cache_mode;
1993 encode(c, bl);
1994 encode(read_tier, bl);
1995 encode(write_tier, bl);
1996 encode(properties, bl);
1997 encode(hit_set_params, bl);
1998 encode(hit_set_period, bl);
1999 encode(hit_set_count, bl);
2000 encode(stripe_width, bl);
2001 encode(target_max_bytes, bl);
2002 encode(target_max_objects, bl);
2003 encode(cache_target_dirty_ratio_micro, bl);
2004 encode(cache_target_full_ratio_micro, bl);
2005 encode(cache_min_flush_age, bl);
2006 encode(cache_min_evict_age, bl);
2007 encode(erasure_code_profile, bl);
2008 encode(last_force_op_resend_preluminous, bl);
2009 encode(min_read_recency_for_promote, bl);
2010 encode(expected_num_objects, bl);
2011 if (v >= 19) {
2012 encode(cache_target_dirty_high_ratio_micro, bl);
2013 }
2014 if (v >= 20) {
2015 encode(min_write_recency_for_promote, bl);
2016 }
2017 if (v >= 21) {
2018 encode(use_gmt_hitset, bl);
2019 }
2020 if (v >= 22) {
2021 encode(fast_read, bl);
2022 }
2023 if (v >= 23) {
2024 encode(hit_set_grade_decay_rate, bl);
2025 encode(hit_set_search_last_n, bl);
2026 }
2027 if (v >= 24) {
2028 encode(opts, bl, features);
2029 }
2030 if (v >= 25) {
2031 encode(last_force_op_resend_prenautilus, bl);
2032 }
2033 if (v >= 26) {
2034 encode(application_metadata, bl);
2035 }
2036 if (v >= 27) {
2037 encode(create_time, bl);
2038 }
2039 if (v >= 28) {
2040 encode(pg_num_target, bl);
2041 encode(pgp_num_target, bl);
2042 encode(pg_num_pending, bl);
2043 encode((epoch_t)0, bl); // pg_num_dec_last_epoch_started from 14.1.[01]
2044 encode((epoch_t)0, bl); // pg_num_dec_last_epoch_clean from 14.1.[01]
2045 encode(last_force_op_resend, bl);
2046 encode(pg_autoscale_mode, bl);
2047 }
2048 if (v >= 29) {
2049 encode(last_pg_merge_meta, bl);
2050 }
2051 if (v >= 30) {
2052 encode(peering_crush_bucket_count, bl);
2053 encode(peering_crush_bucket_target, bl);
2054 encode(peering_crush_bucket_barrier, bl);
2055 encode(peering_crush_mandatory_member, bl);
2056 }
2057 ENCODE_FINISH(bl);
2058 }
2059
2060 void pg_pool_t::decode(ceph::buffer::list::const_iterator& bl)
2061 {
2062 DECODE_START_LEGACY_COMPAT_LEN(30, 5, 5, bl);
2063 decode(type, bl);
2064 decode(size, bl);
2065 decode(crush_rule, bl);
2066 decode(object_hash, bl);
2067 decode(pg_num, bl);
2068 decode(pgp_num, bl);
2069 {
2070 __u32 lpg_num, lpgp_num;
2071 decode(lpg_num, bl);
2072 decode(lpgp_num, bl);
2073 }
2074 decode(last_change, bl);
2075 decode(snap_seq, bl);
2076 decode(snap_epoch, bl);
2077
2078 if (struct_v >= 3) {
2079 decode(snaps, bl);
2080 decode(removed_snaps, bl);
2081 decode(auid, bl);
2082 } else {
2083 __u32 n, m;
2084 decode(n, bl);
2085 decode(m, bl);
2086 decode(auid, bl);
2087 decode_nohead(n, snaps, bl);
2088 decode_nohead(m, removed_snaps, bl);
2089 }
2090
2091 if (struct_v >= 4) {
2092 decode(flags, bl);
2093 uint32_t crash_replay_interval;
2094 decode(crash_replay_interval, bl);
2095 } else {
2096 flags = 0;
2097 }
2098 // upgrade path for selfmanaged vs pool snaps
2099 if (snap_seq > 0 && (flags & (FLAG_SELFMANAGED_SNAPS|FLAG_POOL_SNAPS)) == 0) {
2100 if (!removed_snaps.empty()) {
2101 flags |= FLAG_SELFMANAGED_SNAPS;
2102 } else {
2103 flags |= FLAG_POOL_SNAPS;
2104 }
2105 }
2106 if (struct_v >= 7) {
2107 decode(min_size, bl);
2108 } else {
2109 min_size = size - size/2;
2110 }
2111 if (struct_v >= 8) {
2112 decode(quota_max_bytes, bl);
2113 decode(quota_max_objects, bl);
2114 }
2115 if (struct_v >= 9) {
2116 decode(tiers, bl);
2117 decode(tier_of, bl);
2118 __u8 v;
2119 decode(v, bl);
2120 cache_mode = (cache_mode_t)v;
2121 decode(read_tier, bl);
2122 decode(write_tier, bl);
2123 }
2124 if (struct_v >= 10) {
2125 decode(properties, bl);
2126 }
2127 if (struct_v >= 11) {
2128 decode(hit_set_params, bl);
2129 decode(hit_set_period, bl);
2130 decode(hit_set_count, bl);
2131 } else {
2132 pg_pool_t def;
2133 hit_set_period = def.hit_set_period;
2134 hit_set_count = def.hit_set_count;
2135 }
2136 if (struct_v >= 12) {
2137 decode(stripe_width, bl);
2138 } else {
2139 set_stripe_width(0);
2140 }
2141 if (struct_v >= 13) {
2142 decode(target_max_bytes, bl);
2143 decode(target_max_objects, bl);
2144 decode(cache_target_dirty_ratio_micro, bl);
2145 decode(cache_target_full_ratio_micro, bl);
2146 decode(cache_min_flush_age, bl);
2147 decode(cache_min_evict_age, bl);
2148 } else {
2149 target_max_bytes = 0;
2150 target_max_objects = 0;
2151 cache_target_dirty_ratio_micro = 0;
2152 cache_target_full_ratio_micro = 0;
2153 cache_min_flush_age = 0;
2154 cache_min_evict_age = 0;
2155 }
2156 if (struct_v >= 14) {
2157 decode(erasure_code_profile, bl);
2158 }
2159 if (struct_v >= 15) {
2160 decode(last_force_op_resend_preluminous, bl);
2161 } else {
2162 last_force_op_resend_preluminous = 0;
2163 }
2164 if (struct_v >= 16) {
2165 decode(min_read_recency_for_promote, bl);
2166 } else {
2167 min_read_recency_for_promote = 1;
2168 }
2169 if (struct_v >= 17) {
2170 decode(expected_num_objects, bl);
2171 } else {
2172 expected_num_objects = 0;
2173 }
2174 if (struct_v >= 19) {
2175 decode(cache_target_dirty_high_ratio_micro, bl);
2176 } else {
2177 cache_target_dirty_high_ratio_micro = cache_target_dirty_ratio_micro;
2178 }
2179 if (struct_v >= 20) {
2180 decode(min_write_recency_for_promote, bl);
2181 } else {
2182 min_write_recency_for_promote = 1;
2183 }
2184 if (struct_v >= 21) {
2185 decode(use_gmt_hitset, bl);
2186 } else {
2187 use_gmt_hitset = false;
2188 }
2189 if (struct_v >= 22) {
2190 decode(fast_read, bl);
2191 } else {
2192 fast_read = false;
2193 }
2194 if (struct_v >= 23) {
2195 decode(hit_set_grade_decay_rate, bl);
2196 decode(hit_set_search_last_n, bl);
2197 } else {
2198 hit_set_grade_decay_rate = 0;
2199 hit_set_search_last_n = 1;
2200 }
2201 if (struct_v >= 24) {
2202 decode(opts, bl);
2203 }
2204 if (struct_v >= 25) {
2205 decode(last_force_op_resend_prenautilus, bl);
2206 } else {
2207 last_force_op_resend_prenautilus = last_force_op_resend_preluminous;
2208 }
2209 if (struct_v >= 26) {
2210 decode(application_metadata, bl);
2211 }
2212 if (struct_v >= 27) {
2213 decode(create_time, bl);
2214 }
2215 if (struct_v >= 28) {
2216 decode(pg_num_target, bl);
2217 decode(pgp_num_target, bl);
2218 decode(pg_num_pending, bl);
2219 epoch_t old_merge_last_epoch_clean, old_merge_last_epoch_started;
2220 decode(old_merge_last_epoch_started, bl);
2221 decode(old_merge_last_epoch_clean, bl);
2222 decode(last_force_op_resend, bl);
2223 decode(pg_autoscale_mode, bl);
2224 if (struct_v >= 29) {
2225 decode(last_pg_merge_meta, bl);
2226 } else {
2227 last_pg_merge_meta.last_epoch_clean = old_merge_last_epoch_clean;
2228 last_pg_merge_meta.last_epoch_started = old_merge_last_epoch_started;
2229 }
2230 } else {
2231 pg_num_target = pg_num;
2232 pgp_num_target = pgp_num;
2233 pg_num_pending = pg_num;
2234 last_force_op_resend = last_force_op_resend_prenautilus;
2235 pg_autoscale_mode = pg_autoscale_mode_t::WARN; // default to warn on upgrade
2236 }
2237 if (struct_v >= 30) {
2238 decode(peering_crush_bucket_count, bl);
2239 decode(peering_crush_bucket_target, bl);
2240 decode(peering_crush_bucket_barrier, bl);
2241 decode(peering_crush_mandatory_member, bl);
2242 }
2243 DECODE_FINISH(bl);
2244 calc_pg_masks();
2245 calc_grade_table();
2246 }
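// Minimal usage sketch for the versioned decode above (hypothetical caller;
// 'bl' holds a pool previously written by pg_pool_t::encode()):
//
//   pg_pool_t p;
//   auto it = bl.cbegin();
//   p.decode(it);   // handles older encodings back to struct_v 5
//
// Fields newer than the sender's struct_v keep the defaults assigned in the
// else branches above.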
2247
2248 bool pg_pool_t::stretch_set_can_peer(const set<int>& want, const OSDMap& osdmap,
2249 std::ostream * out) const
2250 {
2251 if (!is_stretch_pool()) return true;
2252 const uint32_t barrier_id = peering_crush_bucket_barrier;
2253 const uint32_t barrier_count = peering_crush_bucket_count;
2254 set<int> ancestors;
2255 const shared_ptr<CrushWrapper>& crush = osdmap.crush;
2256 for (int osdid : want) {
2257 int ancestor = crush->get_parent_of_type(osdid, barrier_id,
2258 crush_rule);
2259 ancestors.insert(ancestor);
2260 }
2261 if (ancestors.size() < barrier_count) {
2262 if (out) {
2263 *out << __func__ << ": not enough crush buckets with OSDs in want set "
2264 << want;
2265 }
2266 return false;
2267 } else if (peering_crush_mandatory_member != CRUSH_ITEM_NONE &&
2268 !ancestors.count(peering_crush_mandatory_member)) {
2269 if (out) {
2270 *out << __func__ << ": missing mandatory crush bucket member "
2271 << peering_crush_mandatory_member;
2272 }
2273 return false;
2274 }
2275 return true;
2276 }
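// In words: for a stretch pool, the OSDs in 'want' must span at least
// peering_crush_bucket_count distinct CRUSH buckets of the
// peering_crush_bucket_barrier type (e.g. at least two datacenters), and if a
// mandatory member is configured, it must be one of them. Hypothetical use:
//
//   std::ostringstream ss;
//   if (!pool.stretch_set_can_peer({0, 3, 7}, osdmap, &ss)) {
//     // refuse to go active; ss.str() holds the reason
//   }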
2277
2278 void pg_pool_t::generate_test_instances(list<pg_pool_t*>& o)
2279 {
2280 pg_pool_t a;
2281 o.push_back(new pg_pool_t(a));
2282
2283 a.create_time = utime_t(4,5);
2284 a.type = TYPE_REPLICATED;
2285 a.size = 2;
2286 a.crush_rule = 3;
2287 a.object_hash = 4;
2288 a.pg_num = 6;
2289 a.pgp_num = 4;
2290 a.pgp_num_target = 4;
2291 a.pg_num_target = 5;
2292 a.pg_num_pending = 5;
2293 a.last_pg_merge_meta.last_epoch_started = 2;
2294 a.last_pg_merge_meta.last_epoch_clean = 2;
2295 a.last_change = 9;
2296 a.last_force_op_resend = 123823;
2297 a.last_force_op_resend_preluminous = 123824;
2298 a.snap_seq = 10;
2299 a.snap_epoch = 11;
2300 a.flags = FLAG_POOL_SNAPS;
2301 a.auid = 12;
2302 a.quota_max_bytes = 473;
2303 a.quota_max_objects = 474;
2304 o.push_back(new pg_pool_t(a));
2305
2306 a.snaps[3].name = "asdf";
2307 a.snaps[3].snapid = 3;
2308 a.snaps[3].stamp = utime_t(123, 4);
2309 a.snaps[6].name = "qwer";
2310 a.snaps[6].snapid = 6;
2311 a.snaps[6].stamp = utime_t(23423, 4);
2312 o.push_back(new pg_pool_t(a));
2313
2314 a.flags = FLAG_SELFMANAGED_SNAPS;
2315 a.snaps.clear();
2316 a.removed_snaps.insert(2);
2317 a.quota_max_bytes = 2473;
2318 a.quota_max_objects = 4374;
2319 a.tiers.insert(0);
2320 a.tiers.insert(1);
2321 a.tier_of = 2;
2322 a.cache_mode = CACHEMODE_WRITEBACK;
2323 a.read_tier = 1;
2324 a.write_tier = 1;
2325 a.hit_set_params = HitSet::Params(new BloomHitSet::Params);
2326 a.hit_set_period = 3600;
2327 a.hit_set_count = 8;
2328 a.min_read_recency_for_promote = 1;
2329 a.min_write_recency_for_promote = 1;
2330 a.hit_set_grade_decay_rate = 50;
2331 a.hit_set_search_last_n = 1;
2332 a.calc_grade_table();
2333 a.set_stripe_width(12345);
2334 a.target_max_bytes = 1238132132;
2335 a.target_max_objects = 1232132;
2336 a.cache_target_dirty_ratio_micro = 187232;
2337 a.cache_target_dirty_high_ratio_micro = 309856;
2338 a.cache_target_full_ratio_micro = 987222;
2339 a.cache_min_flush_age = 231;
2340 a.cache_min_evict_age = 2321;
2341 a.erasure_code_profile = "profile in osdmap";
2342 a.expected_num_objects = 123456;
2343 a.fast_read = false;
2344 a.application_metadata = {{"rbd", {{"key", "value"}}}};
2345 o.push_back(new pg_pool_t(a));
2346 }
2347
2348 ostream& operator<<(ostream& out, const pg_pool_t& p)
2349 {
2350 out << p.get_type_name();
2351 if (p.get_type_name() == "erasure") {
2352 out << " profile " << p.erasure_code_profile;
2353 }
2354 out << " size " << p.get_size()
2355 << " min_size " << p.get_min_size()
2356 << " crush_rule " << p.get_crush_rule()
2357 << " object_hash " << p.get_object_hash_name()
2358 << " pg_num " << p.get_pg_num()
2359 << " pgp_num " << p.get_pgp_num();
2360 if (p.get_pg_num_target() != p.get_pg_num()) {
2361 out << " pg_num_target " << p.get_pg_num_target();
2362 }
2363 if (p.get_pgp_num_target() != p.get_pgp_num()) {
2364 out << " pgp_num_target " << p.get_pgp_num_target();
2365 }
2366 if (p.get_pg_num_pending() != p.get_pg_num()) {
2367 out << " pg_num_pending " << p.get_pg_num_pending();
2368 }
2369 if (p.pg_autoscale_mode != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
2370 out << " autoscale_mode " << p.get_pg_autoscale_mode_name(p.pg_autoscale_mode);
2371 }
2372 out << " last_change " << p.get_last_change();
2373 if (p.get_last_force_op_resend() ||
2374 p.get_last_force_op_resend_prenautilus() ||
2375 p.get_last_force_op_resend_preluminous())
2376 out << " lfor " << p.get_last_force_op_resend() << "/"
2377 << p.get_last_force_op_resend_prenautilus() << "/"
2378 << p.get_last_force_op_resend_preluminous();
2379 if (p.get_auid())
2380 out << " owner " << p.get_auid();
2381 if (p.flags)
2382 out << " flags " << p.get_flags_string();
2383 if (p.quota_max_bytes)
2384 out << " max_bytes " << p.quota_max_bytes;
2385 if (p.quota_max_objects)
2386 out << " max_objects " << p.quota_max_objects;
2387 if (!p.tiers.empty())
2388 out << " tiers " << p.tiers;
2389 if (p.is_tier())
2390 out << " tier_of " << p.tier_of;
2391 if (p.has_read_tier())
2392 out << " read_tier " << p.read_tier;
2393 if (p.has_write_tier())
2394 out << " write_tier " << p.write_tier;
2395 if (p.cache_mode)
2396 out << " cache_mode " << p.get_cache_mode_name();
2397 if (p.target_max_bytes)
2398 out << " target_bytes " << p.target_max_bytes;
2399 if (p.target_max_objects)
2400 out << " target_objects " << p.target_max_objects;
2401 if (p.hit_set_params.get_type() != HitSet::TYPE_NONE) {
2402 out << " hit_set " << p.hit_set_params
2403 << " " << p.hit_set_period << "s"
2404 << " x" << p.hit_set_count << " decay_rate "
2405 << p.hit_set_grade_decay_rate
2406 << " search_last_n " << p.hit_set_search_last_n;
2407 }
2408 if (p.min_read_recency_for_promote)
2409 out << " min_read_recency_for_promote " << p.min_read_recency_for_promote;
2410 if (p.min_write_recency_for_promote)
2411 out << " min_write_recency_for_promote " << p.min_write_recency_for_promote;
2412 out << " stripe_width " << p.get_stripe_width();
2413 if (p.expected_num_objects)
2414 out << " expected_num_objects " << p.expected_num_objects;
2415 if (p.fast_read)
2416 out << " fast_read " << p.fast_read;
2417 out << p.opts;
2418 if (!p.application_metadata.empty()) {
2419 out << " application ";
2420 for (auto it = p.application_metadata.begin();
2421 it != p.application_metadata.end(); ++it) {
2422 if (it != p.application_metadata.begin())
2423 out << ",";
2424 out << it->first;
2425 }
2426 }
2427 return out;
2428 }
2429
2430
2431 // -- object_stat_sum_t --
2432
2433 void object_stat_sum_t::dump(Formatter *f) const
2434 {
2435 f->dump_int("num_bytes", num_bytes);
2436 f->dump_int("num_objects", num_objects);
2437 f->dump_int("num_object_clones", num_object_clones);
2438 f->dump_int("num_object_copies", num_object_copies);
2439 f->dump_int("num_objects_missing_on_primary", num_objects_missing_on_primary);
2440 f->dump_int("num_objects_missing", num_objects_missing);
2441 f->dump_int("num_objects_degraded", num_objects_degraded);
2442 f->dump_int("num_objects_misplaced", num_objects_misplaced);
2443 f->dump_int("num_objects_unfound", num_objects_unfound);
2444 f->dump_int("num_objects_dirty", num_objects_dirty);
2445 f->dump_int("num_whiteouts", num_whiteouts);
2446 f->dump_int("num_read", num_rd);
2447 f->dump_int("num_read_kb", num_rd_kb);
2448 f->dump_int("num_write", num_wr);
2449 f->dump_int("num_write_kb", num_wr_kb);
2450 f->dump_int("num_scrub_errors", num_scrub_errors);
2451 f->dump_int("num_shallow_scrub_errors", num_shallow_scrub_errors);
2452 f->dump_int("num_deep_scrub_errors", num_deep_scrub_errors);
2453 f->dump_int("num_objects_recovered", num_objects_recovered);
2454 f->dump_int("num_bytes_recovered", num_bytes_recovered);
2455 f->dump_int("num_keys_recovered", num_keys_recovered);
2456 f->dump_int("num_objects_omap", num_objects_omap);
2457 f->dump_int("num_objects_hit_set_archive", num_objects_hit_set_archive);
2458 f->dump_int("num_bytes_hit_set_archive", num_bytes_hit_set_archive);
2459 f->dump_int("num_flush", num_flush);
2460 f->dump_int("num_flush_kb", num_flush_kb);
2461 f->dump_int("num_evict", num_evict);
2462 f->dump_int("num_evict_kb", num_evict_kb);
2463 f->dump_int("num_promote", num_promote);
2464 f->dump_int("num_flush_mode_high", num_flush_mode_high);
2465 f->dump_int("num_flush_mode_low", num_flush_mode_low);
2466 f->dump_int("num_evict_mode_some", num_evict_mode_some);
2467 f->dump_int("num_evict_mode_full", num_evict_mode_full);
2468 f->dump_int("num_objects_pinned", num_objects_pinned);
2469 f->dump_int("num_legacy_snapsets", num_legacy_snapsets);
2470 f->dump_int("num_large_omap_objects", num_large_omap_objects);
2471 f->dump_int("num_objects_manifest", num_objects_manifest);
2472 f->dump_int("num_omap_bytes", num_omap_bytes);
2473 f->dump_int("num_omap_keys", num_omap_keys);
2474 f->dump_int("num_objects_repaired", num_objects_repaired);
2475 }
2476
2477 void object_stat_sum_t::encode(ceph::buffer::list& bl) const
2478 {
2479 ENCODE_START(20, 14, bl);
2480 #if defined(CEPH_LITTLE_ENDIAN)
2481 bl.append((char *)(&num_bytes), sizeof(object_stat_sum_t));
2482 #else
2483 encode(num_bytes, bl);
2484 encode(num_objects, bl);
2485 encode(num_object_clones, bl);
2486 encode(num_object_copies, bl);
2487 encode(num_objects_missing_on_primary, bl);
2488 encode(num_objects_degraded, bl);
2489 encode(num_objects_unfound, bl);
2490 encode(num_rd, bl);
2491 encode(num_rd_kb, bl);
2492 encode(num_wr, bl);
2493 encode(num_wr_kb, bl);
2494 encode(num_scrub_errors, bl);
2495 encode(num_objects_recovered, bl);
2496 encode(num_bytes_recovered, bl);
2497 encode(num_keys_recovered, bl);
2498 encode(num_shallow_scrub_errors, bl);
2499 encode(num_deep_scrub_errors, bl);
2500 encode(num_objects_dirty, bl);
2501 encode(num_whiteouts, bl);
2502 encode(num_objects_omap, bl);
2503 encode(num_objects_hit_set_archive, bl);
2504 encode(num_objects_misplaced, bl);
2505 encode(num_bytes_hit_set_archive, bl);
2506 encode(num_flush, bl);
2507 encode(num_flush_kb, bl);
2508 encode(num_evict, bl);
2509 encode(num_evict_kb, bl);
2510 encode(num_promote, bl);
2511 encode(num_flush_mode_high, bl);
2512 encode(num_flush_mode_low, bl);
2513 encode(num_evict_mode_some, bl);
2514 encode(num_evict_mode_full, bl);
2515 encode(num_objects_pinned, bl);
2516 encode(num_objects_missing, bl);
2517 encode(num_legacy_snapsets, bl);
2518 encode(num_large_omap_objects, bl);
2519 encode(num_objects_manifest, bl);
2520 encode(num_omap_bytes, bl);
2521 encode(num_omap_keys, bl);
2522 encode(num_objects_repaired, bl);
2523 #endif
2524 ENCODE_FINISH(bl);
2525 }
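// Note on the CEPH_LITTLE_ENDIAN fast path above: it memcpy's the whole
// struct starting at num_bytes, which relies on object_stat_sum_t being a
// plain struct of fixed-width counters laid out to match the field-by-field
// branch. Any new counter therefore has to be appended to the struct and to
// both branches consistently.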
2526
2527 void object_stat_sum_t::decode(ceph::buffer::list::const_iterator& bl)
2528 {
2529 bool decode_finish = false;
2530 static const int STAT_SUM_DECODE_VERSION = 20;
2531 DECODE_START(STAT_SUM_DECODE_VERSION, bl);
2532 #if defined(CEPH_LITTLE_ENDIAN)
2533 if (struct_v == STAT_SUM_DECODE_VERSION) {
2534 bl.copy(sizeof(object_stat_sum_t), (char*)(&num_bytes));
2535 decode_finish = true;
2536 }
2537 #endif
2538 if (!decode_finish) {
2539 decode(num_bytes, bl);
2540 decode(num_objects, bl);
2541 decode(num_object_clones, bl);
2542 decode(num_object_copies, bl);
2543 decode(num_objects_missing_on_primary, bl);
2544 decode(num_objects_degraded, bl);
2545 decode(num_objects_unfound, bl);
2546 decode(num_rd, bl);
2547 decode(num_rd_kb, bl);
2548 decode(num_wr, bl);
2549 decode(num_wr_kb, bl);
2550 decode(num_scrub_errors, bl);
2551 decode(num_objects_recovered, bl);
2552 decode(num_bytes_recovered, bl);
2553 decode(num_keys_recovered, bl);
2554 decode(num_shallow_scrub_errors, bl);
2555 decode(num_deep_scrub_errors, bl);
2556 decode(num_objects_dirty, bl);
2557 decode(num_whiteouts, bl);
2558 decode(num_objects_omap, bl);
2559 decode(num_objects_hit_set_archive, bl);
2560 decode(num_objects_misplaced, bl);
2561 decode(num_bytes_hit_set_archive, bl);
2562 decode(num_flush, bl);
2563 decode(num_flush_kb, bl);
2564 decode(num_evict, bl);
2565 decode(num_evict_kb, bl);
2566 decode(num_promote, bl);
2567 decode(num_flush_mode_high, bl);
2568 decode(num_flush_mode_low, bl);
2569 decode(num_evict_mode_some, bl);
2570 decode(num_evict_mode_full, bl);
2571 decode(num_objects_pinned, bl);
2572 decode(num_objects_missing, bl);
2573 if (struct_v >= 16) {
2574 decode(num_legacy_snapsets, bl);
2575 } else {
2576 num_legacy_snapsets = num_object_clones; // upper bound
2577 }
2578 if (struct_v >= 17) {
2579 decode(num_large_omap_objects, bl);
2580 }
2581 if (struct_v >= 18) {
2582 decode(num_objects_manifest, bl);
2583 }
2584 if (struct_v >= 19) {
2585 decode(num_omap_bytes, bl);
2586 decode(num_omap_keys, bl);
2587 }
2588 if (struct_v >= 20) {
2589 decode(num_objects_repaired, bl);
2590 }
2591 }
2592 DECODE_FINISH(bl);
2593 }
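// The memcpy shortcut is taken only when struct_v equals the current
// STAT_SUM_DECODE_VERSION (20); older encodings fall through to the per-field
// path so that missing trailing counters get their upgrade defaults (e.g.
// num_legacy_snapsets approximated by num_object_clones for struct_v < 16).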
2594
2595 void object_stat_sum_t::generate_test_instances(list<object_stat_sum_t*>& o)
2596 {
2597 object_stat_sum_t a;
2598
2599 a.num_bytes = 1;
2600 a.num_objects = 3;
2601 a.num_object_clones = 4;
2602 a.num_object_copies = 5;
2603 a.num_objects_missing_on_primary = 6;
2604 a.num_objects_missing = 123;
2605 a.num_objects_degraded = 7;
2606 a.num_objects_unfound = 8;
2607 a.num_rd = 9; a.num_rd_kb = 10;
2608 a.num_wr = 11; a.num_wr_kb = 12;
2609 a.num_objects_recovered = 14;
2610 a.num_bytes_recovered = 15;
2611 a.num_keys_recovered = 16;
2612 a.num_deep_scrub_errors = 17;
2613 a.num_shallow_scrub_errors = 18;
2614 a.num_scrub_errors = a.num_deep_scrub_errors + a.num_shallow_scrub_errors;
2615 a.num_objects_dirty = 21;
2616 a.num_whiteouts = 22;
2617 a.num_objects_misplaced = 1232;
2618 a.num_objects_hit_set_archive = 2;
2619 a.num_bytes_hit_set_archive = 27;
2620 a.num_flush = 5;
2621 a.num_flush_kb = 6;
2622 a.num_evict = 7;
2623 a.num_evict_kb = 8;
2624 a.num_promote = 9;
2625 a.num_flush_mode_high = 0;
2626 a.num_flush_mode_low = 1;
2627 a.num_evict_mode_some = 1;
2628 a.num_evict_mode_full = 0;
2629 a.num_objects_pinned = 20;
2630 a.num_large_omap_objects = 5;
2631 a.num_objects_manifest = 2;
2632 a.num_omap_bytes = 20000;
2633 a.num_omap_keys = 200;
2634 a.num_objects_repaired = 300;
2635 o.push_back(new object_stat_sum_t(a));
2636 }
2637
2638 void object_stat_sum_t::add(const object_stat_sum_t& o)
2639 {
2640 num_bytes += o.num_bytes;
2641 num_objects += o.num_objects;
2642 num_object_clones += o.num_object_clones;
2643 num_object_copies += o.num_object_copies;
2644 num_objects_missing_on_primary += o.num_objects_missing_on_primary;
2645 num_objects_missing += o.num_objects_missing;
2646 num_objects_degraded += o.num_objects_degraded;
2647 num_objects_misplaced += o.num_objects_misplaced;
2648 num_rd += o.num_rd;
2649 num_rd_kb += o.num_rd_kb;
2650 num_wr += o.num_wr;
2651 num_wr_kb += o.num_wr_kb;
2652 num_objects_unfound += o.num_objects_unfound;
2653 num_scrub_errors += o.num_scrub_errors;
2654 num_shallow_scrub_errors += o.num_shallow_scrub_errors;
2655 num_deep_scrub_errors += o.num_deep_scrub_errors;
2656 num_objects_recovered += o.num_objects_recovered;
2657 num_bytes_recovered += o.num_bytes_recovered;
2658 num_keys_recovered += o.num_keys_recovered;
2659 num_objects_dirty += o.num_objects_dirty;
2660 num_whiteouts += o.num_whiteouts;
2661 num_objects_omap += o.num_objects_omap;
2662 num_objects_hit_set_archive += o.num_objects_hit_set_archive;
2663 num_bytes_hit_set_archive += o.num_bytes_hit_set_archive;
2664 num_flush += o.num_flush;
2665 num_flush_kb += o.num_flush_kb;
2666 num_evict += o.num_evict;
2667 num_evict_kb += o.num_evict_kb;
2668 num_promote += o.num_promote;
2669 num_flush_mode_high += o.num_flush_mode_high;
2670 num_flush_mode_low += o.num_flush_mode_low;
2671 num_evict_mode_some += o.num_evict_mode_some;
2672 num_evict_mode_full += o.num_evict_mode_full;
2673 num_objects_pinned += o.num_objects_pinned;
2674 num_legacy_snapsets += o.num_legacy_snapsets;
2675 num_large_omap_objects += o.num_large_omap_objects;
2676 num_objects_manifest += o.num_objects_manifest;
2677 num_omap_bytes += o.num_omap_bytes;
2678 num_omap_keys += o.num_omap_keys;
2679 num_objects_repaired += o.num_objects_repaired;
2680 }
2681
2682 void object_stat_sum_t::sub(const object_stat_sum_t& o)
2683 {
2684 num_bytes -= o.num_bytes;
2685 num_objects -= o.num_objects;
2686 num_object_clones -= o.num_object_clones;
2687 num_object_copies -= o.num_object_copies;
2688 num_objects_missing_on_primary -= o.num_objects_missing_on_primary;
2689 num_objects_missing -= o.num_objects_missing;
2690 num_objects_degraded -= o.num_objects_degraded;
2691 num_objects_misplaced -= o.num_objects_misplaced;
2692 num_rd -= o.num_rd;
2693 num_rd_kb -= o.num_rd_kb;
2694 num_wr -= o.num_wr;
2695 num_wr_kb -= o.num_wr_kb;
2696 num_objects_unfound -= o.num_objects_unfound;
2697 num_scrub_errors -= o.num_scrub_errors;
2698 num_shallow_scrub_errors -= o.num_shallow_scrub_errors;
2699 num_deep_scrub_errors -= o.num_deep_scrub_errors;
2700 num_objects_recovered -= o.num_objects_recovered;
2701 num_bytes_recovered -= o.num_bytes_recovered;
2702 num_keys_recovered -= o.num_keys_recovered;
2703 num_objects_dirty -= o.num_objects_dirty;
2704 num_whiteouts -= o.num_whiteouts;
2705 num_objects_omap -= o.num_objects_omap;
2706 num_objects_hit_set_archive -= o.num_objects_hit_set_archive;
2707 num_bytes_hit_set_archive -= o.num_bytes_hit_set_archive;
2708 num_flush -= o.num_flush;
2709 num_flush_kb -= o.num_flush_kb;
2710 num_evict -= o.num_evict;
2711 num_evict_kb -= o.num_evict_kb;
2712 num_promote -= o.num_promote;
2713 num_flush_mode_high -= o.num_flush_mode_high;
2714 num_flush_mode_low -= o.num_flush_mode_low;
2715 num_evict_mode_some -= o.num_evict_mode_some;
2716 num_evict_mode_full -= o.num_evict_mode_full;
2717 num_objects_pinned -= o.num_objects_pinned;
2718 num_legacy_snapsets -= o.num_legacy_snapsets;
2719 num_large_omap_objects -= o.num_large_omap_objects;
2720 num_objects_manifest -= o.num_objects_manifest;
2721 num_omap_bytes -= o.num_omap_bytes;
2722 num_omap_keys -= o.num_omap_keys;
2723 num_objects_repaired -= o.num_objects_repaired;
2724 }
2725
2726 bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r)
2727 {
2728 return
2729 l.num_bytes == r.num_bytes &&
2730 l.num_objects == r.num_objects &&
2731 l.num_object_clones == r.num_object_clones &&
2732 l.num_object_copies == r.num_object_copies &&
2733 l.num_objects_missing_on_primary == r.num_objects_missing_on_primary &&
2734 l.num_objects_missing == r.num_objects_missing &&
2735 l.num_objects_degraded == r.num_objects_degraded &&
2736 l.num_objects_misplaced == r.num_objects_misplaced &&
2737 l.num_objects_unfound == r.num_objects_unfound &&
2738 l.num_rd == r.num_rd &&
2739 l.num_rd_kb == r.num_rd_kb &&
2740 l.num_wr == r.num_wr &&
2741 l.num_wr_kb == r.num_wr_kb &&
2742 l.num_scrub_errors == r.num_scrub_errors &&
2743 l.num_shallow_scrub_errors == r.num_shallow_scrub_errors &&
2744 l.num_deep_scrub_errors == r.num_deep_scrub_errors &&
2745 l.num_objects_recovered == r.num_objects_recovered &&
2746 l.num_bytes_recovered == r.num_bytes_recovered &&
2747 l.num_keys_recovered == r.num_keys_recovered &&
2748 l.num_objects_dirty == r.num_objects_dirty &&
2749 l.num_whiteouts == r.num_whiteouts &&
2750 l.num_objects_omap == r.num_objects_omap &&
2751 l.num_objects_hit_set_archive == r.num_objects_hit_set_archive &&
2752 l.num_bytes_hit_set_archive == r.num_bytes_hit_set_archive &&
2753 l.num_flush == r.num_flush &&
2754 l.num_flush_kb == r.num_flush_kb &&
2755 l.num_evict == r.num_evict &&
2756 l.num_evict_kb == r.num_evict_kb &&
2757 l.num_promote == r.num_promote &&
2758 l.num_flush_mode_high == r.num_flush_mode_high &&
2759 l.num_flush_mode_low == r.num_flush_mode_low &&
2760 l.num_evict_mode_some == r.num_evict_mode_some &&
2761 l.num_evict_mode_full == r.num_evict_mode_full &&
2762 l.num_objects_pinned == r.num_objects_pinned &&
2763 l.num_legacy_snapsets == r.num_legacy_snapsets &&
2764 l.num_large_omap_objects == r.num_large_omap_objects &&
2765 l.num_objects_manifest == r.num_objects_manifest &&
2766 l.num_omap_bytes == r.num_omap_bytes &&
2767 l.num_omap_keys == r.num_omap_keys &&
2768 l.num_objects_repaired == r.num_objects_repaired;
2769 }
2770
2771 // -- object_stat_collection_t --
2772
2773 void object_stat_collection_t::dump(Formatter *f) const
2774 {
2775 f->open_object_section("stat_sum");
2776 sum.dump(f);
2777 f->close_section();
2778 }
2779
2780 void object_stat_collection_t::encode(ceph::buffer::list& bl) const
2781 {
2782 ENCODE_START(2, 2, bl);
2783 encode(sum, bl);
2784 encode((__u32)0, bl);
2785 ENCODE_FINISH(bl);
2786 }
2787
2788 void object_stat_collection_t::decode(ceph::buffer::list::const_iterator& bl)
2789 {
2790 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
2791 decode(sum, bl);
2792 {
2793 map<string,object_stat_sum_t> cat_sum;
2794 decode(cat_sum, bl);
2795 }
2796 DECODE_FINISH(bl);
2797 }
2798
2799 void object_stat_collection_t::generate_test_instances(list<object_stat_collection_t*>& o)
2800 {
2801 object_stat_collection_t a;
2802 o.push_back(new object_stat_collection_t(a));
2803 list<object_stat_sum_t*> l;
2804 object_stat_sum_t::generate_test_instances(l);
2805 for (auto p = l.begin(); p != l.end(); ++p) {
2806 a.add(**p);
2807 o.push_back(new object_stat_collection_t(a));
2808 }
2809 }
2810
2811
2812 // -- pg_stat_t --
2813
2814 bool pg_stat_t::is_acting_osd(int32_t osd, bool primary) const
2815 {
2816 if (primary && osd == acting_primary) {
2817 return true;
2818 } else if (!primary) {
2819 for(auto it = acting.cbegin(); it != acting.cend(); ++it)
2820 {
2821 if (*it == osd)
2822 return true;
2823 }
2824 }
2825 return false;
2826 }
2827
2828 void pg_stat_t::dump(Formatter *f) const
2829 {
2830 f->dump_stream("version") << version;
2831 f->dump_unsigned("reported_seq", reported_seq);
2832 f->dump_unsigned("reported_epoch", reported_epoch);
2833 f->dump_string("state", pg_state_string(state));
2834 f->dump_stream("last_fresh") << last_fresh;
2835 f->dump_stream("last_change") << last_change;
2836 f->dump_stream("last_active") << last_active;
2837 f->dump_stream("last_peered") << last_peered;
2838 f->dump_stream("last_clean") << last_clean;
2839 f->dump_stream("last_became_active") << last_became_active;
2840 f->dump_stream("last_became_peered") << last_became_peered;
2841 f->dump_stream("last_unstale") << last_unstale;
2842 f->dump_stream("last_undegraded") << last_undegraded;
2843 f->dump_stream("last_fullsized") << last_fullsized;
2844 f->dump_unsigned("mapping_epoch", mapping_epoch);
2845 f->dump_stream("log_start") << log_start;
2846 f->dump_stream("ondisk_log_start") << ondisk_log_start;
2847 f->dump_unsigned("created", created);
2848 f->dump_unsigned("last_epoch_clean", last_epoch_clean);
2849 f->dump_stream("parent") << parent;
2850 f->dump_unsigned("parent_split_bits", parent_split_bits);
2851 f->dump_stream("last_scrub") << last_scrub;
2852 f->dump_stream("last_scrub_stamp") << last_scrub_stamp;
2853 f->dump_stream("last_deep_scrub") << last_deep_scrub;
2854 f->dump_stream("last_deep_scrub_stamp") << last_deep_scrub_stamp;
2855 f->dump_stream("last_clean_scrub_stamp") << last_clean_scrub_stamp;
2856 f->dump_int("objects_scrubbed", objects_scrubbed);
2857 f->dump_int("log_size", log_size);
2858 f->dump_int("ondisk_log_size", ondisk_log_size);
2859 f->dump_bool("stats_invalid", stats_invalid);
2860 f->dump_bool("dirty_stats_invalid", dirty_stats_invalid);
2861 f->dump_bool("omap_stats_invalid", omap_stats_invalid);
2862 f->dump_bool("hitset_stats_invalid", hitset_stats_invalid);
2863 f->dump_bool("hitset_bytes_stats_invalid", hitset_bytes_stats_invalid);
2864 f->dump_bool("pin_stats_invalid", pin_stats_invalid);
2865 f->dump_bool("manifest_stats_invalid", manifest_stats_invalid);
2866 f->dump_unsigned("snaptrimq_len", snaptrimq_len);
2867 f->dump_int("last_scrub_duration", last_scrub_duration);
2868 f->dump_string("scrub_schedule", dump_scrub_schedule());
2869 f->dump_float("scrub_duration", scrub_duration);
2870 f->dump_int("objects_trimmed", objects_trimmed);
2871 f->dump_float("snaptrim_duration", snaptrim_duration);
2872 stats.dump(f);
2873 f->open_array_section("up");
2874 for (auto p = up.cbegin(); p != up.cend(); ++p)
2875 f->dump_int("osd", *p);
2876 f->close_section();
2877 f->open_array_section("acting");
2878 for (auto p = acting.cbegin(); p != acting.cend(); ++p)
2879 f->dump_int("osd", *p);
2880 f->close_section();
2881 f->open_array_section("avail_no_missing");
2882 for (auto p = avail_no_missing.cbegin(); p != avail_no_missing.cend(); ++p)
2883 f->dump_stream("shard") << *p;
2884 f->close_section();
2885 f->open_array_section("object_location_counts");
2886 for (auto p = object_location_counts.cbegin(); p != object_location_counts.cend(); ++p) {
2887 f->open_object_section("entry");
2888 f->dump_stream("shards") << p->first;
2889 f->dump_int("objects", p->second);
2890 f->close_section();
2891 }
2892 f->close_section();
2893 f->open_array_section("blocked_by");
2894 for (auto p = blocked_by.cbegin(); p != blocked_by.cend(); ++p)
2895 f->dump_int("osd", *p);
2896 f->close_section();
2897 f->dump_int("up_primary", up_primary);
2898 f->dump_int("acting_primary", acting_primary);
2899 f->open_array_section("purged_snaps");
2900 for (auto i = purged_snaps.begin(); i != purged_snaps.end(); ++i) {
2901 f->open_object_section("interval");
2902 f->dump_stream("start") << i.get_start();
2903 f->dump_stream("length") << i.get_len();
2904 f->close_section();
2905 }
2906 f->close_section();
2907 }
2908
2909 void pg_stat_t::dump_brief(Formatter *f) const
2910 {
2911 f->dump_string("state", pg_state_string(state));
2912 f->open_array_section("up");
2913 for (auto p = up.cbegin(); p != up.cend(); ++p)
2914 f->dump_int("osd", *p);
2915 f->close_section();
2916 f->open_array_section("acting");
2917 for (auto p = acting.cbegin(); p != acting.cend(); ++p)
2918 f->dump_int("osd", *p);
2919 f->close_section();
2920 f->dump_int("up_primary", up_primary);
2921 f->dump_int("acting_primary", acting_primary);
2922 }
2923
2924 std::string pg_stat_t::dump_scrub_schedule() const
2925 {
2926 if (scrub_sched_status.m_is_active) {
2927 return fmt::format(
2928 "{}scrubbing for {}s",
2929 ((scrub_sched_status.m_is_deep == scrub_level_t::deep) ? "deep " : ""),
2930 scrub_sched_status.m_duration_seconds);
2931 }
2932 switch (scrub_sched_status.m_sched_status) {
2933 case pg_scrub_sched_status_t::unknown:
2934 // no reported scrub schedule yet
2935 return "--"s;
2936 case pg_scrub_sched_status_t::not_queued:
2937 return "no scrub is scheduled"s;
2938 case pg_scrub_sched_status_t::scheduled:
2939 return fmt::format(
2940 "{} {}scrub scheduled @ {}",
2941 (scrub_sched_status.m_is_periodic ? "periodic" : "user requested"),
2942 ((scrub_sched_status.m_is_deep == scrub_level_t::deep) ? "deep " : ""),
2943 scrub_sched_status.m_scheduled_at);
2944 case pg_scrub_sched_status_t::queued:
2945 return fmt::format(
2946 "queued for {}scrub",
2947 ((scrub_sched_status.m_is_deep == scrub_level_t::deep) ? "deep " : ""));
2948 default:
2949 // a bug!
2950 return "SCRUB STATE MISMATCH!"s;
2951 }
2952 }
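// Example strings produced above (illustrative, hypothetical values; the
// scheduled-at timestamp uses the normal utime_t formatting):
//
//   "deep scrubbing for 12s"
//   "periodic scrub scheduled @ <time>"
//   "queued for deep scrub"
//   "no scrub is scheduled"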
2953
2954 bool operator==(const pg_scrubbing_status_t& l, const pg_scrubbing_status_t& r)
2955 {
2956 return
2957 l.m_sched_status == r.m_sched_status &&
2958 l.m_scheduled_at == r.m_scheduled_at &&
2959 l.m_duration_seconds == r.m_duration_seconds &&
2960 l.m_is_active == r.m_is_active &&
2961 l.m_is_deep == r.m_is_deep &&
2962 l.m_is_periodic == r.m_is_periodic;
2963 }
2964
2965 void pg_stat_t::encode(ceph::buffer::list &bl) const
2966 {
2967 ENCODE_START(28, 22, bl);
2968 encode(version, bl);
2969 encode(reported_seq, bl);
2970 encode(reported_epoch, bl);
2971 encode((__u32)state, bl); // for older peers
2972 encode(log_start, bl);
2973 encode(ondisk_log_start, bl);
2974 encode(created, bl);
2975 encode(last_epoch_clean, bl);
2976 encode(parent, bl);
2977 encode(parent_split_bits, bl);
2978 encode(last_scrub, bl);
2979 encode(last_scrub_stamp, bl);
2980 encode(stats, bl);
2981 encode(log_size, bl);
2982 encode(ondisk_log_size, bl);
2983 encode(up, bl);
2984 encode(acting, bl);
2985 encode(last_fresh, bl);
2986 encode(last_change, bl);
2987 encode(last_active, bl);
2988 encode(last_clean, bl);
2989 encode(last_unstale, bl);
2990 encode(mapping_epoch, bl);
2991 encode(last_deep_scrub, bl);
2992 encode(last_deep_scrub_stamp, bl);
2993 encode(stats_invalid, bl);
2994 encode(last_clean_scrub_stamp, bl);
2995 encode(last_became_active, bl);
2996 encode(dirty_stats_invalid, bl);
2997 encode(up_primary, bl);
2998 encode(acting_primary, bl);
2999 encode(omap_stats_invalid, bl);
3000 encode(hitset_stats_invalid, bl);
3001 encode(blocked_by, bl);
3002 encode(last_undegraded, bl);
3003 encode(last_fullsized, bl);
3004 encode(hitset_bytes_stats_invalid, bl);
3005 encode(last_peered, bl);
3006 encode(last_became_peered, bl);
3007 encode(pin_stats_invalid, bl);
3008 encode(snaptrimq_len, bl);
3009 __u32 top_state = (state >> 32);
3010 encode(top_state, bl);
3011 encode(purged_snaps, bl);
3012 encode(manifest_stats_invalid, bl);
3013 encode(avail_no_missing, bl);
3014 encode(object_location_counts, bl);
3015 encode(last_scrub_duration, bl);
3016 encode(scrub_sched_status.m_scheduled_at, bl);
3017 encode(scrub_sched_status.m_duration_seconds, bl);
3018 encode((__u16)scrub_sched_status.m_sched_status, bl);
3019 encode(scrub_sched_status.m_is_active, bl);
3020 encode((scrub_sched_status.m_is_deep==scrub_level_t::deep), bl);
3021 encode(scrub_sched_status.m_is_periodic, bl);
3022 encode(objects_scrubbed, bl);
3023 encode(scrub_duration, bl);
3024 encode(objects_trimmed, bl);
3025 encode(snaptrim_duration, bl);
3026
3027 ENCODE_FINISH(bl);
3028 }
3029
3030 void pg_stat_t::decode(ceph::buffer::list::const_iterator &bl)
3031 {
3032 bool tmp;
3033 uint32_t old_state;
3034 DECODE_START(28, bl);
3035 decode(version, bl);
3036 decode(reported_seq, bl);
3037 decode(reported_epoch, bl);
3038 decode(old_state, bl);
3039 decode(log_start, bl);
3040 decode(ondisk_log_start, bl);
3041 decode(created, bl);
3042 decode(last_epoch_clean, bl);
3043 decode(parent, bl);
3044 decode(parent_split_bits, bl);
3045 decode(last_scrub, bl);
3046 decode(last_scrub_stamp, bl);
3047 decode(stats, bl);
3048 decode(log_size, bl);
3049 decode(ondisk_log_size, bl);
3050 decode(up, bl);
3051 decode(acting, bl);
3052 decode(last_fresh, bl);
3053 decode(last_change, bl);
3054 decode(last_active, bl);
3055 decode(last_clean, bl);
3056 decode(last_unstale, bl);
3057 decode(mapping_epoch, bl);
3058 decode(last_deep_scrub, bl);
3059 decode(last_deep_scrub_stamp, bl);
3060 decode(tmp, bl);
3061 stats_invalid = tmp;
3062 decode(last_clean_scrub_stamp, bl);
3063 decode(last_became_active, bl);
3064 decode(tmp, bl);
3065 dirty_stats_invalid = tmp;
3066 decode(up_primary, bl);
3067 decode(acting_primary, bl);
3068 decode(tmp, bl);
3069 omap_stats_invalid = tmp;
3070 decode(tmp, bl);
3071 hitset_stats_invalid = tmp;
3072 decode(blocked_by, bl);
3073 decode(last_undegraded, bl);
3074 decode(last_fullsized, bl);
3075 decode(tmp, bl);
3076 hitset_bytes_stats_invalid = tmp;
3077 decode(last_peered, bl);
3078 decode(last_became_peered, bl);
3079 decode(tmp, bl);
3080 pin_stats_invalid = tmp;
3081 if (struct_v >= 23) {
3082 decode(snaptrimq_len, bl);
3083 if (struct_v >= 24) {
3084 __u32 top_state;
3085 decode(top_state, bl);
3086 state = (uint64_t)old_state | ((uint64_t)top_state << 32);
3087 decode(purged_snaps, bl);
3088 } else {
3089 state = old_state;
3090 }
3091 if (struct_v >= 25) {
3092 decode(tmp, bl);
3093 manifest_stats_invalid = tmp;
3094 } else {
3095 manifest_stats_invalid = true;
3096 }
3097 if (struct_v >= 26) {
3098 decode(avail_no_missing, bl);
3099 decode(object_location_counts, bl);
3100 }
3101 if (struct_v >= 27) {
3102 decode(last_scrub_duration, bl);
3103 decode(scrub_sched_status.m_scheduled_at, bl);
3104 decode(scrub_sched_status.m_duration_seconds, bl);
3105 __u16 scrub_sched_as_u16;
3106 decode(scrub_sched_as_u16, bl);
3107 scrub_sched_status.m_sched_status = (pg_scrub_sched_status_t)(scrub_sched_as_u16);
3108 decode(tmp, bl);
3109 scrub_sched_status.m_is_active = tmp;
3110 decode(tmp, bl);
3111 scrub_sched_status.m_is_deep = tmp ? scrub_level_t::deep : scrub_level_t::shallow;
3112 decode(tmp, bl);
3113 scrub_sched_status.m_is_periodic = tmp;
3114 decode(objects_scrubbed, bl);
3115 }
3116 if (struct_v >= 28) {
3117 decode(scrub_duration, bl);
3118 decode(objects_trimmed, bl);
3119 decode(snaptrim_duration, bl);
3120 }
3121 }
3122 DECODE_FINISH(bl);
3123 }
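// The PG state mask is 64 bits wide, but the wire format keeps the original
// 32-bit field for older peers: the low half is encoded early as a __u32 and
// the high half is appended near the end, to be reassembled here when
// struct_v >= 24:
//
//   state = (uint64_t)old_state | ((uint64_t)top_state << 32);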
3124
3125 void pg_stat_t::generate_test_instances(list<pg_stat_t*>& o)
3126 {
3127 pg_stat_t a;
3128 o.push_back(new pg_stat_t(a));
3129
3130 a.version = eversion_t(1, 3);
3131 a.reported_epoch = 1;
3132 a.reported_seq = 2;
3133 a.state = 123;
3134 a.mapping_epoch = 998;
3135 a.last_fresh = utime_t(1002, 1);
3136 a.last_change = utime_t(1002, 2);
3137 a.last_active = utime_t(1002, 3);
3138 a.last_clean = utime_t(1002, 4);
3139 a.last_unstale = utime_t(1002, 5);
3140 a.last_undegraded = utime_t(1002, 7);
3141 a.last_fullsized = utime_t(1002, 8);
3142 a.log_start = eversion_t(1, 4);
3143 a.ondisk_log_start = eversion_t(1, 5);
3144 a.created = 6;
3145 a.last_epoch_clean = 7;
3146 a.parent = pg_t(1, 2);
3147 a.parent_split_bits = 12;
3148 a.last_scrub = eversion_t(9, 10);
3149 a.last_scrub_stamp = utime_t(11, 12);
3150 a.last_deep_scrub = eversion_t(13, 14);
3151 a.last_deep_scrub_stamp = utime_t(15, 16);
3152 a.last_clean_scrub_stamp = utime_t(17, 18);
3153 a.last_scrub_duration = 3617;
3154 a.scrub_duration = 0.003;
3155 a.snaptrimq_len = 1048576;
3156 a.objects_scrubbed = 0;
3157 a.objects_trimmed = 0;
3158 a.snaptrim_duration = 0.123;
3159 list<object_stat_collection_t*> l;
3160 object_stat_collection_t::generate_test_instances(l);
3161 a.stats = *l.back();
3162 a.log_size = 99;
3163 a.ondisk_log_size = 88;
3164 a.up.push_back(123);
3165 a.up_primary = 123;
3166 a.acting.push_back(456);
3167 a.avail_no_missing.push_back(pg_shard_t(456, shard_id_t::NO_SHARD));
3168 set<pg_shard_t> sset = { pg_shard_t(0), pg_shard_t(1) };
3169 a.object_location_counts.insert(make_pair(sset, 10));
3170 sset.insert(pg_shard_t(2));
3171 a.object_location_counts.insert(make_pair(sset, 5));
3172 a.acting_primary = 456;
3173 o.push_back(new pg_stat_t(a));
3174
3175 a.up.push_back(124);
3176 a.up_primary = 124;
3177 a.acting.push_back(124);
3178 a.acting_primary = 124;
3179 a.blocked_by.push_back(155);
3180 a.blocked_by.push_back(156);
3181 o.push_back(new pg_stat_t(a));
3182 }
3183
3184 bool operator==(const pg_stat_t& l, const pg_stat_t& r)
3185 {
3186 return
3187 l.version == r.version &&
3188 l.reported_seq == r.reported_seq &&
3189 l.reported_epoch == r.reported_epoch &&
3190 l.state == r.state &&
3191 l.last_fresh == r.last_fresh &&
3192 l.last_change == r.last_change &&
3193 l.last_active == r.last_active &&
3194 l.last_peered == r.last_peered &&
3195 l.last_clean == r.last_clean &&
3196 l.last_unstale == r.last_unstale &&
3197 l.last_undegraded == r.last_undegraded &&
3198 l.last_fullsized == r.last_fullsized &&
3199 l.log_start == r.log_start &&
3200 l.ondisk_log_start == r.ondisk_log_start &&
3201 l.created == r.created &&
3202 l.last_epoch_clean == r.last_epoch_clean &&
3203 l.parent == r.parent &&
3204 l.parent_split_bits == r.parent_split_bits &&
3205 l.last_scrub == r.last_scrub &&
3206 l.last_deep_scrub == r.last_deep_scrub &&
3207 l.last_scrub_stamp == r.last_scrub_stamp &&
3208 l.last_deep_scrub_stamp == r.last_deep_scrub_stamp &&
3209 l.last_clean_scrub_stamp == r.last_clean_scrub_stamp &&
3210 l.stats == r.stats &&
3211 l.stats_invalid == r.stats_invalid &&
3212 l.log_size == r.log_size &&
3213 l.ondisk_log_size == r.ondisk_log_size &&
3214 l.up == r.up &&
3215 l.acting == r.acting &&
3216 l.avail_no_missing == r.avail_no_missing &&
3217 l.object_location_counts == r.object_location_counts &&
3218 l.mapping_epoch == r.mapping_epoch &&
3219 l.blocked_by == r.blocked_by &&
3220 l.last_became_active == r.last_became_active &&
3221 l.last_became_peered == r.last_became_peered &&
3222 l.dirty_stats_invalid == r.dirty_stats_invalid &&
3223 l.omap_stats_invalid == r.omap_stats_invalid &&
3224 l.hitset_stats_invalid == r.hitset_stats_invalid &&
3225 l.hitset_bytes_stats_invalid == r.hitset_bytes_stats_invalid &&
3226 l.up_primary == r.up_primary &&
3227 l.acting_primary == r.acting_primary &&
3228 l.pin_stats_invalid == r.pin_stats_invalid &&
3229 l.manifest_stats_invalid == r.manifest_stats_invalid &&
3230 l.purged_snaps == r.purged_snaps &&
3231 l.snaptrimq_len == r.snaptrimq_len &&
3232 l.last_scrub_duration == r.last_scrub_duration &&
3233 l.scrub_sched_status == r.scrub_sched_status &&
3234 l.objects_scrubbed == r.objects_scrubbed &&
3235 l.scrub_duration == r.scrub_duration &&
3236 l.objects_trimmed == r.objects_trimmed &&
3237 l.snaptrim_duration == r.snaptrim_duration;
3238 }
3239
3240 // -- store_statfs_t --
3241
3242 bool store_statfs_t::operator==(const store_statfs_t& other) const
3243 {
3244 return total == other.total
3245 && available == other.available
3246 && allocated == other.allocated
3247 && internally_reserved == other.internally_reserved
3248 && data_stored == other.data_stored
3249 && data_compressed == other.data_compressed
3250 && data_compressed_allocated == other.data_compressed_allocated
3251 && data_compressed_original == other.data_compressed_original
3252 && omap_allocated == other.omap_allocated
3253 && internal_metadata == other.internal_metadata;
3254 }
3255
3256 void store_statfs_t::dump(Formatter *f) const
3257 {
3258 f->dump_int("total", total);
3259 f->dump_int("available", available);
3260 f->dump_int("internally_reserved", internally_reserved);
3261 f->dump_int("allocated", allocated);
3262 f->dump_int("data_stored", data_stored);
3263 f->dump_int("data_compressed", data_compressed);
3264 f->dump_int("data_compressed_allocated", data_compressed_allocated);
3265 f->dump_int("data_compressed_original", data_compressed_original);
3266 f->dump_int("omap_allocated", omap_allocated);
3267 f->dump_int("internal_metadata", internal_metadata);
3268 }
3269
3270 ostream& operator<<(ostream& out, const store_statfs_t &s)
3271 {
3272 out << std::hex
3273 << "store_statfs(0x" << s.available
3274 << "/0x" << s.internally_reserved
3275 << "/0x" << s.total
3276 << ", data 0x" << s.data_stored
3277 << "/0x" << s.allocated
3278 << ", compress 0x" << s.data_compressed
3279 << "/0x" << s.data_compressed_allocated
3280 << "/0x" << s.data_compressed_original
3281 << ", omap 0x" << s.omap_allocated
3282 << ", meta 0x" << s.internal_metadata
3283 << std::dec
3284 << ")";
3285 return out;
3286 }
3287
3288 void store_statfs_t::generate_test_instances(list<store_statfs_t*>& o)
3289 {
3290 store_statfs_t a;
3291 o.push_back(new store_statfs_t(a));
3292 a.total = 234;
3293 a.available = 123;
3294 a.internally_reserved = 33;
3295 a.allocated = 32;
3296 a.data_stored = 44;
3297 a.data_compressed = 21;
3298 a.data_compressed_allocated = 12;
3299 a.data_compressed_original = 13;
3300 a.omap_allocated = 14;
3301 a.internal_metadata = 15;
3302 o.push_back(new store_statfs_t(a));
3303 }
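// For the populated test instance above, the operator<< defined earlier would
// print (values in hex):
//
//   store_statfs(0x7b/0x21/0xea, data 0x2c/0x20, compress 0x15/0xc/0xd, omap 0xe, meta 0xf)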
3304
3305 // -- pool_stat_t --
3306
3307 void pool_stat_t::dump(Formatter *f) const
3308 {
3309 stats.dump(f);
3310 f->open_object_section("store_stats");
3311 store_stats.dump(f);
3312 f->close_section();
3313 f->dump_int("log_size", log_size);
3314 f->dump_int("ondisk_log_size", ondisk_log_size);
3315 f->dump_int("up", up);
3316 f->dump_int("acting", acting);
3317 f->dump_int("num_store_stats", num_store_stats);
3318 }
3319
3320 void pool_stat_t::encode(ceph::buffer::list &bl, uint64_t features) const
3321 {
3322 using ceph::encode;
3323 if ((features & CEPH_FEATURE_OSDENC) == 0) {
3324 __u8 v = 4;
3325 encode(v, bl);
3326 encode(stats, bl);
3327 encode(log_size, bl);
3328 encode(ondisk_log_size, bl);
3329 return;
3330 }
3331
3332 ENCODE_START(7, 5, bl);
3333 encode(stats, bl);
3334 encode(log_size, bl);
3335 encode(ondisk_log_size, bl);
3336 encode(up, bl);
3337 encode(acting, bl);
3338 encode(store_stats, bl);
3339 encode(num_store_stats, bl);
3340 ENCODE_FINISH(bl);
3341 }
3342
3343 void pool_stat_t::decode(ceph::buffer::list::const_iterator &bl)
3344 {
3345 DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl);
3346 if (struct_v >= 4) {
3347 decode(stats, bl);
3348 decode(log_size, bl);
3349 decode(ondisk_log_size, bl);
3350 if (struct_v >= 6) {
3351 decode(up, bl);
3352 decode(acting, bl);
3353 } else {
3354 up = 0;
3355 acting = 0;
3356 }
3357 if (struct_v >= 7) {
3358 decode(store_stats, bl);
3359 decode(num_store_stats, bl);
3360 } else {
3361 store_stats.reset();
3362 num_store_stats = 0;
3363 }
3364
3365 } else {
3366 decode(stats.sum.num_bytes, bl);
3367 uint64_t num_kb;
3368 decode(num_kb, bl);
3369 decode(stats.sum.num_objects, bl);
3370 decode(stats.sum.num_object_clones, bl);
3371 decode(stats.sum.num_object_copies, bl);
3372 decode(stats.sum.num_objects_missing_on_primary, bl);
3373 decode(stats.sum.num_objects_degraded, bl);
3374 decode(log_size, bl);
3375 decode(ondisk_log_size, bl);
3376 if (struct_v >= 2) {
3377 decode(stats.sum.num_rd, bl);
3378 decode(stats.sum.num_rd_kb, bl);
3379 decode(stats.sum.num_wr, bl);
3380 decode(stats.sum.num_wr_kb, bl);
3381 }
3382 if (struct_v >= 3) {
3383 decode(stats.sum.num_objects_unfound, bl);
3384 }
3385 }
3386 DECODE_FINISH(bl);
3387 }
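// Pre-v4 encodings serialized the stats as bare counters rather than an
// object_stat_collection_t; the branch above reads them individually into
// stats.sum and discards the obsolete num_kb value. Fields that did not exist
// yet (up/acting, store_stats) are simply left untouched by that branch.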
3388
3389 void pool_stat_t::generate_test_instances(list<pool_stat_t*>& o)
3390 {
3391 pool_stat_t a;
3392 o.push_back(new pool_stat_t(a));
3393
3394 list<object_stat_collection_t*> l;
3395 object_stat_collection_t::generate_test_instances(l);
3396 list<store_statfs_t*> ll;
3397 store_statfs_t::generate_test_instances(ll);
3398 a.stats = *l.back();
3399 a.store_stats = *ll.back();
3400 a.log_size = 123;
3401 a.ondisk_log_size = 456;
3402 a.acting = 3;
3403 a.up = 4;
3404 a.num_store_stats = 1;
3405 o.push_back(new pool_stat_t(a));
3406 }
3407
3408
3409 // -- pg_history_t --
3410
3411 void pg_history_t::encode(ceph::buffer::list &bl) const
3412 {
3413 ENCODE_START(10, 4, bl);
3414 encode(epoch_created, bl);
3415 encode(last_epoch_started, bl);
3416 encode(last_epoch_clean, bl);
3417 encode(last_epoch_split, bl);
3418 encode(same_interval_since, bl);
3419 encode(same_up_since, bl);
3420 encode(same_primary_since, bl);
3421 encode(last_scrub, bl);
3422 encode(last_scrub_stamp, bl);
3423 encode(last_deep_scrub, bl);
3424 encode(last_deep_scrub_stamp, bl);
3425 encode(last_clean_scrub_stamp, bl);
3426 encode(last_epoch_marked_full, bl);
3427 encode(last_interval_started, bl);
3428 encode(last_interval_clean, bl);
3429 encode(epoch_pool_created, bl);
3430 encode(prior_readable_until_ub, bl);
3431 ENCODE_FINISH(bl);
3432 }
3433
3434 void pg_history_t::decode(ceph::buffer::list::const_iterator &bl)
3435 {
3436 DECODE_START_LEGACY_COMPAT_LEN(10, 4, 4, bl);
3437 decode(epoch_created, bl);
3438 decode(last_epoch_started, bl);
3439 if (struct_v >= 3)
3440 decode(last_epoch_clean, bl);
3441 else
3442 last_epoch_clean = last_epoch_started; // careful, it's a lie!
3443 decode(last_epoch_split, bl);
3444 decode(same_interval_since, bl);
3445 decode(same_up_since, bl);
3446 decode(same_primary_since, bl);
3447 if (struct_v >= 2) {
3448 decode(last_scrub, bl);
3449 decode(last_scrub_stamp, bl);
3450 }
3451 if (struct_v >= 5) {
3452 decode(last_deep_scrub, bl);
3453 decode(last_deep_scrub_stamp, bl);
3454 }
3455 if (struct_v >= 6) {
3456 decode(last_clean_scrub_stamp, bl);
3457 }
3458 if (struct_v >= 7) {
3459 decode(last_epoch_marked_full, bl);
3460 }
3461 if (struct_v >= 8) {
3462 decode(last_interval_started, bl);
3463 decode(last_interval_clean, bl);
3464 } else {
3465 if (last_epoch_started >= same_interval_since) {
3466 last_interval_started = same_interval_since;
3467 } else {
3468 last_interval_started = last_epoch_started; // best guess
3469 }
3470 if (last_epoch_clean >= same_interval_since) {
3471 last_interval_clean = same_interval_since;
3472 } else {
3473 last_interval_clean = last_epoch_clean; // best guess
3474 }
3475 }
3476 if (struct_v >= 9) {
3477 decode(epoch_pool_created, bl);
3478 } else {
3479 epoch_pool_created = epoch_created;
3480 }
3481 if (struct_v >= 10) {
3482 decode(prior_readable_until_ub, bl);
3483 }
3484 DECODE_FINISH(bl);
3485 }
3486
3487 void pg_history_t::dump(Formatter *f) const
3488 {
3489 f->dump_int("epoch_created", epoch_created);
3490 f->dump_int("epoch_pool_created", epoch_pool_created);
3491 f->dump_int("last_epoch_started", last_epoch_started);
3492 f->dump_int("last_interval_started", last_interval_started);
3493 f->dump_int("last_epoch_clean", last_epoch_clean);
3494 f->dump_int("last_interval_clean", last_interval_clean);
3495 f->dump_int("last_epoch_split", last_epoch_split);
3496 f->dump_int("last_epoch_marked_full", last_epoch_marked_full);
3497 f->dump_int("same_up_since", same_up_since);
3498 f->dump_int("same_interval_since", same_interval_since);
3499 f->dump_int("same_primary_since", same_primary_since);
3500 f->dump_stream("last_scrub") << last_scrub;
3501 f->dump_stream("last_scrub_stamp") << last_scrub_stamp;
3502 f->dump_stream("last_deep_scrub") << last_deep_scrub;
3503 f->dump_stream("last_deep_scrub_stamp") << last_deep_scrub_stamp;
3504 f->dump_stream("last_clean_scrub_stamp") << last_clean_scrub_stamp;
3505 f->dump_float(
3506 "prior_readable_until_ub",
3507 std::chrono::duration<double>(prior_readable_until_ub).count());
3508 }
3509
3510 void pg_history_t::generate_test_instances(list<pg_history_t*>& o)
3511 {
3512 o.push_back(new pg_history_t);
3513 o.push_back(new pg_history_t);
3514 o.back()->epoch_created = 1;
3515 o.back()->epoch_pool_created = 1;
3516 o.back()->last_epoch_started = 2;
3517 o.back()->last_interval_started = 2;
3518 o.back()->last_epoch_clean = 3;
3519 o.back()->last_interval_clean = 2;
3520 o.back()->last_epoch_split = 4;
3521 o.back()->prior_readable_until_ub = make_timespan(3.1415);
3522 o.back()->same_up_since = 5;
3523 o.back()->same_interval_since = 6;
3524 o.back()->same_primary_since = 7;
3525 o.back()->last_scrub = eversion_t(8, 9);
3526 o.back()->last_scrub_stamp = utime_t(10, 11);
3527 o.back()->last_deep_scrub = eversion_t(12, 13);
3528 o.back()->last_deep_scrub_stamp = utime_t(14, 15);
3529 o.back()->last_clean_scrub_stamp = utime_t(16, 17);
3530 o.back()->last_epoch_marked_full = 18;
3531 }
3532
3533
3534 // -- pg_info_t --
3535
3536 void pg_info_t::encode(ceph::buffer::list &bl) const
3537 {
3538 ENCODE_START(32, 26, bl);
3539 encode(pgid.pgid, bl);
3540 encode(last_update, bl);
3541 encode(last_complete, bl);
3542 encode(log_tail, bl);
3543 encode(hobject_t(), bl); // old (nibblewise) last_backfill
3544 encode(stats, bl);
3545 history.encode(bl);
3546 encode(purged_snaps, bl);
3547 encode(last_epoch_started, bl);
3548 encode(last_user_version, bl);
3549 encode(hit_set, bl);
3550 encode(pgid.shard, bl);
3551 encode(last_backfill, bl);
3552 encode(true, bl); // was last_backfill_bitwise
3553 encode(last_interval_started, bl);
3554 ENCODE_FINISH(bl);
3555 }
3556
3557 void pg_info_t::decode(ceph::buffer::list::const_iterator &bl)
3558 {
3559 DECODE_START(32, bl);
3560 decode(pgid.pgid, bl);
3561 decode(last_update, bl);
3562 decode(last_complete, bl);
3563 decode(log_tail, bl);
3564 {
3565 hobject_t old_last_backfill;
3566 decode(old_last_backfill, bl);
3567 }
3568 decode(stats, bl);
3569 history.decode(bl);
3570 decode(purged_snaps, bl);
3571 decode(last_epoch_started, bl);
3572 decode(last_user_version, bl);
3573 decode(hit_set, bl);
3574 decode(pgid.shard, bl);
3575 decode(last_backfill, bl);
3576 {
3577 bool last_backfill_bitwise;
3578 decode(last_backfill_bitwise, bl);
3579 // note: we may see a false value here since the default value for
3580 // the member was false, so it often didn't get set to true until
3581 // peering progressed.
3582 }
3583 if (struct_v >= 32) {
3584 decode(last_interval_started, bl);
3585 } else {
3586 last_interval_started = last_epoch_started;
3587 }
3588 DECODE_FINISH(bl);
3589 }
3590
3591 // -- pg_info_t --
3592
3593 void pg_info_t::dump(Formatter *f) const
3594 {
3595 f->dump_stream("pgid") << pgid;
3596 f->dump_stream("last_update") << last_update;
3597 f->dump_stream("last_complete") << last_complete;
3598 f->dump_stream("log_tail") << log_tail;
3599 f->dump_int("last_user_version", last_user_version);
3600 f->dump_stream("last_backfill") << last_backfill;
3601 f->open_array_section("purged_snaps");
3602 for (interval_set<snapid_t>::const_iterator i=purged_snaps.begin();
3603 i != purged_snaps.end();
3604 ++i) {
3605 f->open_object_section("purged_snap_interval");
3606 f->dump_stream("start") << i.get_start();
3607 f->dump_stream("length") << i.get_len();
3608 f->close_section();
3609 }
3610 f->close_section();
3611 f->open_object_section("history");
3612 history.dump(f);
3613 f->close_section();
3614 f->open_object_section("stats");
3615 stats.dump(f);
3616 f->close_section();
3617
3618 f->dump_int("empty", is_empty());
3619 f->dump_int("dne", dne());
3620 f->dump_int("incomplete", is_incomplete());
3621 f->dump_int("last_epoch_started", last_epoch_started);
3622
3623 f->open_object_section("hit_set_history");
3624 hit_set.dump(f);
3625 f->close_section();
3626 }
3627
3628 void pg_info_t::generate_test_instances(list<pg_info_t*>& o)
3629 {
3630 o.push_back(new pg_info_t);
3631 o.push_back(new pg_info_t);
3632 list<pg_history_t*> h;
3633 pg_history_t::generate_test_instances(h);
3634 o.back()->history = *h.back();
3635 o.back()->pgid = spg_t(pg_t(1, 2), shard_id_t::NO_SHARD);
3636 o.back()->last_update = eversion_t(3, 4);
3637 o.back()->last_complete = eversion_t(5, 6);
3638 o.back()->last_user_version = 2;
3639 o.back()->log_tail = eversion_t(7, 8);
3640 o.back()->last_backfill = hobject_t(object_t("objname"), "key", 123, 456, -1, "");
3641 {
3642 list<pg_stat_t*> s;
3643 pg_stat_t::generate_test_instances(s);
3644 o.back()->stats = *s.back();
3645 }
3646 {
3647 list<pg_hit_set_history_t*> s;
3648 pg_hit_set_history_t::generate_test_instances(s);
3649 o.back()->hit_set = *s.back();
3650 }
3651 }
3652
3653 // -- pg_notify_t --
3654 void pg_notify_t::encode(ceph::buffer::list &bl) const
3655 {
3656 ENCODE_START(3, 2, bl);
3657 encode(query_epoch, bl);
3658 encode(epoch_sent, bl);
3659 encode(info, bl);
3660 encode(to, bl);
3661 encode(from, bl);
3662 encode(past_intervals, bl);
3663 ENCODE_FINISH(bl);
3664 }
3665
3666 void pg_notify_t::decode(ceph::buffer::list::const_iterator &bl)
3667 {
3668 DECODE_START(3, bl);
3669 decode(query_epoch, bl);
3670 decode(epoch_sent, bl);
3671 decode(info, bl);
3672 decode(to, bl);
3673 decode(from, bl);
3674 if (struct_v >= 3) {
3675 decode(past_intervals, bl);
3676 }
3677 DECODE_FINISH(bl);
3678 }
3679
3680 void pg_notify_t::dump(Formatter *f) const
3681 {
3682 f->dump_int("from", from);
3683 f->dump_int("to", to);
3684 f->dump_unsigned("query_epoch", query_epoch);
3685 f->dump_unsigned("epoch_sent", epoch_sent);
3686 {
3687 f->open_object_section("info");
3688 info.dump(f);
3689 f->close_section();
3690 }
3691 f->dump_object("past_intervals", past_intervals);
3692 }
3693
3694 void pg_notify_t::generate_test_instances(list<pg_notify_t*>& o)
3695 {
3696 o.push_back(new pg_notify_t(shard_id_t(3), shard_id_t::NO_SHARD, 1, 1,
3697 pg_info_t(), PastIntervals()));
3698 o.push_back(new pg_notify_t(shard_id_t(0), shard_id_t(0), 3, 10,
3699 pg_info_t(), PastIntervals()));
3700 }
3701
3702 ostream &operator<<(ostream &lhs, const pg_notify_t &notify)
3703 {
3704 lhs << "(query:" << notify.query_epoch
3705 << " sent:" << notify.epoch_sent
3706 << " " << notify.info;
3707 if (notify.from != shard_id_t::NO_SHARD ||
3708 notify.to != shard_id_t::NO_SHARD)
3709 lhs << " " << (unsigned)notify.from
3710 << "->" << (unsigned)notify.to;
3711 lhs << " " << notify.past_intervals;
3712 return lhs << ")";
3713 }
3714
3715 // -- pg_interval_t --
3716
3717 void PastIntervals::pg_interval_t::encode(ceph::buffer::list& bl) const
3718 {
3719 ENCODE_START(4, 2, bl);
3720 encode(first, bl);
3721 encode(last, bl);
3722 encode(up, bl);
3723 encode(acting, bl);
3724 encode(maybe_went_rw, bl);
3725 encode(primary, bl);
3726 encode(up_primary, bl);
3727 ENCODE_FINISH(bl);
3728 }
3729
3730 void PastIntervals::pg_interval_t::decode(ceph::buffer::list::const_iterator& bl)
3731 {
3732 DECODE_START_LEGACY_COMPAT_LEN(4, 2, 2, bl);
3733 decode(first, bl);
3734 decode(last, bl);
3735 decode(up, bl);
3736 decode(acting, bl);
3737 decode(maybe_went_rw, bl);
3738 if (struct_v >= 3) {
3739 decode(primary, bl);
3740 } else {
3741 if (acting.size())
3742 primary = acting[0];
3743 }
3744 if (struct_v >= 4) {
3745 decode(up_primary, bl);
3746 } else {
3747 if (up.size())
3748 up_primary = up[0];
3749 }
3750 DECODE_FINISH(bl);
3751 }
3752
3753 void PastIntervals::pg_interval_t::dump(Formatter *f) const
3754 {
3755 f->dump_unsigned("first", first);
3756 f->dump_unsigned("last", last);
3757 f->dump_int("maybe_went_rw", maybe_went_rw ? 1 : 0);
3758 f->open_array_section("up");
3759 for (auto p = up.cbegin(); p != up.cend(); ++p)
3760 f->dump_int("osd", *p);
3761 f->close_section();
3762 f->open_array_section("acting");
3763 for (auto p = acting.cbegin(); p != acting.cend(); ++p)
3764 f->dump_int("osd", *p);
3765 f->close_section();
3766 f->dump_int("primary", primary);
3767 f->dump_int("up_primary", up_primary);
3768 }
3769
3770 void PastIntervals::pg_interval_t::generate_test_instances(list<pg_interval_t*>& o)
3771 {
3772 o.push_back(new pg_interval_t);
3773 o.push_back(new pg_interval_t);
3774 o.back()->up.push_back(1);
3775 o.back()->acting.push_back(2);
3776 o.back()->acting.push_back(3);
3777 o.back()->first = 4;
3778 o.back()->last = 5;
3779 o.back()->maybe_went_rw = true;
3780 }
3781
3782 WRITE_CLASS_ENCODER(PastIntervals::pg_interval_t)
3783
3784
3785 /**
3786 * pi_compact_rep
3787 *
3788 * PastIntervals only needs to be able to answer two questions:
3789 * 1) Where should the primary look for unfound objects?
3790 * 2) List a set of subsets of the OSDs such that contacting at least
3791 * one from each subset guarantees we speak to at least one witness
3792 * of any completed write.
3793 *
3794 * Crucially, 2) does not require keeping *all* past intervals. Certainly,
3795 * we don't need to keep any where maybe_went_rw would be false. We also
3796 * needn't keep two intervals where the acting set in one is a subset
3797 * of the other (only need to keep the smaller of the two sets). In order
3798 * to accurately trim the set of intervals as last_epoch_started changes
3799 * without rebuilding the set from scratch, we'll retain the larger set
3800 * if it is in an older interval.
3801 */
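// Illustrative sketch (not part of the original source): how the trimming
// described above plays out.  Suppose maybe_went_rw intervals are added in
// order with acting sets {0,1,2} (epochs 10-20), {1,2} (21-30) and
// {0,2} (36-50) -- the same shape as the test instances further below.
// When {1,2} arrives it is a subset of {0,1,2}, so the older, larger
// interval is erased; {0,2} is not a subset of {1,2}, so both survive.
// The compact representation therefore keeps only {1,2} and {0,2}:
// contacting one OSD from each of those sets still reaches a witness of
// any completed write.  compact_interval_t::supersedes() and
// pi_compact_rep::add_interval() below implement exactly this rule.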
3802 struct compact_interval_t {
3803 epoch_t first;
3804 epoch_t last;
3805 set<pg_shard_t> acting;
3806 bool supersedes(const compact_interval_t &other) {
3807 for (auto &&i: acting) {
3808 if (!other.acting.count(i))
3809 return false;
3810 }
3811 return true;
3812 }
3813 void dump(Formatter *f) const {
3814 f->open_object_section("compact_interval_t");
3815 f->dump_stream("first") << first;
3816 f->dump_stream("last") << last;
3817 f->dump_stream("acting") << acting;
3818 f->close_section();
3819 }
3820 void encode(ceph::buffer::list &bl) const {
3821 ENCODE_START(1, 1, bl);
3822 encode(first, bl);
3823 encode(last, bl);
3824 encode(acting, bl);
3825 ENCODE_FINISH(bl);
3826 }
3827 void decode(ceph::buffer::list::const_iterator &bl) {
3828 DECODE_START(1, bl);
3829 decode(first, bl);
3830 decode(last, bl);
3831 decode(acting, bl);
3832 DECODE_FINISH(bl);
3833 }
3834 static void generate_test_instances(list<compact_interval_t*> & o) {
3835 /* Not going to be used, we'll generate pi_compact_rep directly */
3836 }
3837 };
3838 ostream &operator<<(ostream &o, const compact_interval_t &rhs)
3839 {
3840 return o << "([" << rhs.first << "," << rhs.last
3841 << "] acting " << rhs.acting << ")";
3842 }
3843 WRITE_CLASS_ENCODER(compact_interval_t)
3844
3845 class pi_compact_rep : public PastIntervals::interval_rep {
3846 epoch_t first = 0;
3847 epoch_t last = 0; // inclusive
3848 set<pg_shard_t> all_participants;
3849 list<compact_interval_t> intervals;
3850 pi_compact_rep(
3851 bool ec_pool,
3852 std::list<PastIntervals::pg_interval_t> &&intervals) {
3853 for (auto &&i: intervals)
3854 add_interval(ec_pool, i);
3855 }
3856 public:
3857 pi_compact_rep() = default;
3858 pi_compact_rep(const pi_compact_rep &) = default;
3859 pi_compact_rep(pi_compact_rep &&) = default;
3860 pi_compact_rep &operator=(const pi_compact_rep &) = default;
3861 pi_compact_rep &operator=(pi_compact_rep &&) = default;
3862
3863 size_t size() const override { return intervals.size(); }
3864 bool empty() const override {
3865 return first > last || (first == 0 && last == 0);
3866 }
3867 void clear() override {
3868 *this = pi_compact_rep();
3869 }
3870 pair<epoch_t, epoch_t> get_bounds() const override {
3871 return make_pair(first, last + 1);
3872 }
3873 void adjust_start_backwards(epoch_t last_epoch_clean) override {
3874 first = last_epoch_clean;
3875 }
3876
3877 set<pg_shard_t> get_all_participants(
3878 bool ec_pool) const override {
3879 return all_participants;
3880 }
3881 void add_interval(
3882 bool ec_pool, const PastIntervals::pg_interval_t &interval) override {
3883 if (first == 0)
3884 first = interval.first;
3885 ceph_assert(interval.last > last);
3886 last = interval.last;
3887 set<pg_shard_t> acting;
3888 for (unsigned i = 0; i < interval.acting.size(); ++i) {
3889 if (interval.acting[i] == CRUSH_ITEM_NONE)
3890 continue;
3891 acting.insert(
3892 pg_shard_t(
3893 interval.acting[i],
3894 ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
3895 }
3896 all_participants.insert(acting.begin(), acting.end());
3897 if (!interval.maybe_went_rw)
3898 return;
3899 intervals.push_back(
3900 compact_interval_t{interval.first, interval.last, acting});
3901 auto plast = intervals.end();
3902 --plast;
3903 for (auto cur = intervals.begin(); cur != plast; ) {
3904 if (plast->supersedes(*cur)) {
3905 intervals.erase(cur++);
3906 } else {
3907 ++cur;
3908 }
3909 }
3910 }
3911 unique_ptr<PastIntervals::interval_rep> clone() const override {
3912 return unique_ptr<PastIntervals::interval_rep>(new pi_compact_rep(*this));
3913 }
3914 ostream &print(ostream &out) const override {
3915 return out << "([" << first << "," << last
3916 << "] all_participants=" << all_participants
3917 << " intervals=" << intervals << ")";
3918 }
3919 void encode(ceph::buffer::list &bl) const override {
3920 ENCODE_START(1, 1, bl);
3921 encode(first, bl);
3922 encode(last, bl);
3923 encode(all_participants, bl);
3924 encode(intervals, bl);
3925 ENCODE_FINISH(bl);
3926 }
3927 void decode(ceph::buffer::list::const_iterator &bl) override {
3928 DECODE_START(1, bl);
3929 decode(first, bl);
3930 decode(last, bl);
3931 decode(all_participants, bl);
3932 decode(intervals, bl);
3933 DECODE_FINISH(bl);
3934 }
3935 void dump(Formatter *f) const override {
3936 f->open_object_section("PastIntervals::compact_rep");
3937 f->dump_stream("first") << first;
3938 f->dump_stream("last") << last;
3939 f->open_array_section("all_participants");
3940 for (auto& i : all_participants) {
3941 f->dump_object("pg_shard", i);
3942 }
3943 f->close_section();
3944 f->open_array_section("intervals");
3945 for (auto &&i: intervals) {
3946 i.dump(f);
3947 }
3948 f->close_section();
3949 f->close_section();
3950 }
3951 static void generate_test_instances(list<pi_compact_rep*> &o) {
3952 using ival = PastIntervals::pg_interval_t;
3953 using ivallst = std::list<ival>;
3954 o.push_back(
3955 new pi_compact_rep(
3956 true, ivallst
3957 { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
3958 , ival{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1}
3959 , ival{{ 2}, { 2}, 31, 35, false, 2, 2}
3960 , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
3961 }));
3962 o.push_back(
3963 new pi_compact_rep(
3964 false, ivallst
3965 { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
3966 , ival{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1}
3967 , ival{{ 2}, { 2}, 31, 35, false, 2, 2}
3968 , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
3969 }));
3970 o.push_back(
3971 new pi_compact_rep(
3972 true, ivallst
3973 { ival{{2, 1, 0}, {2, 1, 0}, 10, 20, true, 1, 1}
3974 , ival{{ 0, 2}, { 0, 2}, 21, 30, true, 0, 0}
3975 , ival{{ 0, 2}, {2, 0}, 31, 35, true, 2, 2}
3976 , ival{{ 0, 2}, { 0, 2}, 36, 50, true, 0, 0}
3977 }));
3978 }
3979 void iterate_mayberw_back_to(
3980 epoch_t les,
3981 std::function<void(epoch_t, const set<pg_shard_t> &)> &&f) const override {
3982 for (auto i = intervals.rbegin(); i != intervals.rend(); ++i) {
3983 if (i->last < les)
3984 break;
3985 f(i->first, i->acting);
3986 }
3987 }
3988 virtual ~pi_compact_rep() override {}
3989 };
3990 WRITE_CLASS_ENCODER(pi_compact_rep)
3991
3992 PastIntervals::PastIntervals()
3993 {
3994 past_intervals.reset(new pi_compact_rep);
3995 }
3996
3997 PastIntervals::PastIntervals(const PastIntervals &rhs)
3998 : past_intervals(rhs.past_intervals ?
3999 rhs.past_intervals->clone() :
4000 nullptr) {}
4001
4002 PastIntervals &PastIntervals::operator=(const PastIntervals &rhs)
4003 {
4004 PastIntervals other(rhs);
4005 swap(other);
4006 return *this;
4007 }
4008
4009 ostream& operator<<(ostream& out, const PastIntervals &i)
4010 {
4011 if (i.past_intervals) {
4012 return i.past_intervals->print(out);
4013 } else {
4014 return out << "(empty)";
4015 }
4016 }
4017
4018 ostream& operator<<(ostream& out, const PastIntervals::PriorSet &i)
4019 {
4020 return out << "PriorSet("
4021 << "ec_pool: " << i.ec_pool
4022 << ", probe: " << i.probe
4023 << ", down: " << i.down
4024 << ", blocked_by: " << i.blocked_by
4025 << ", pg_down: " << i.pg_down
4026 << ")";
4027 }
4028
4029 void PastIntervals::decode(ceph::buffer::list::const_iterator &bl)
4030 {
4031 DECODE_START(1, bl);
4032 __u8 type = 0;
4033 decode(type, bl);
4034 switch (type) {
4035 case 0:
4036 break;
4037 case 1:
4038 ceph_abort_msg("pi_simple_rep support removed post-luminous");
4039 break;
4040 case 2:
4041 past_intervals.reset(new pi_compact_rep);
4042 past_intervals->decode(bl);
4043 break;
4044 }
4045 DECODE_FINISH(bl);
4046 }
4047
4048 void PastIntervals::generate_test_instances(list<PastIntervals*> &o)
4049 {
4050 {
4051 list<pi_compact_rep *> compact;
4052 pi_compact_rep::generate_test_instances(compact);
4053 for (auto &&i: compact) {
4054 // takes ownership of contents
4055 o.push_back(new PastIntervals(i));
4056 }
4057 }
4058 return;
4059 }
4060
4061 bool PastIntervals::is_new_interval(
4062 int old_acting_primary,
4063 int new_acting_primary,
4064 const vector<int> &old_acting,
4065 const vector<int> &new_acting,
4066 int old_up_primary,
4067 int new_up_primary,
4068 const vector<int> &old_up,
4069 const vector<int> &new_up,
4070 int old_size,
4071 int new_size,
4072 int old_min_size,
4073 int new_min_size,
4074 unsigned old_pg_num,
4075 unsigned new_pg_num,
4076 unsigned old_pg_num_pending,
4077 unsigned new_pg_num_pending,
4078 bool old_sort_bitwise,
4079 bool new_sort_bitwise,
4080 bool old_recovery_deletes,
4081 bool new_recovery_deletes,
4082 uint32_t old_crush_count,
4083 uint32_t new_crush_count,
4084 uint32_t old_crush_target,
4085 uint32_t new_crush_target,
4086 uint32_t old_crush_barrier,
4087 uint32_t new_crush_barrier,
4088 int32_t old_crush_member,
4089 int32_t new_crush_member,
4090 pg_t pgid) {
4091 return old_acting_primary != new_acting_primary ||
4092 new_acting != old_acting ||
4093 old_up_primary != new_up_primary ||
4094 new_up != old_up ||
4095 old_min_size != new_min_size ||
4096 old_size != new_size ||
4097 pgid.is_split(old_pg_num, new_pg_num, 0) ||
4098 // (is or was) pre-merge source
4099 pgid.is_merge_source(old_pg_num_pending, new_pg_num_pending, 0) ||
4100 pgid.is_merge_source(new_pg_num_pending, old_pg_num_pending, 0) ||
4101 // merge source
4102 pgid.is_merge_source(old_pg_num, new_pg_num, 0) ||
4103 // (is or was) pre-merge target
4104 pgid.is_merge_target(old_pg_num_pending, new_pg_num_pending) ||
4105 pgid.is_merge_target(new_pg_num_pending, old_pg_num_pending) ||
4106 // merge target
4107 pgid.is_merge_target(old_pg_num, new_pg_num) ||
4108 old_sort_bitwise != new_sort_bitwise ||
4109 old_recovery_deletes != new_recovery_deletes ||
4110 old_crush_count != new_crush_count ||
4111 old_crush_target != new_crush_target ||
4112 old_crush_barrier != new_crush_barrier ||
4113 old_crush_member != new_crush_member;
4114 }
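// Worked example (illustrative, not part of the original source): replacing
// any member of the acting or up sets (say old_acting = {0,1,2},
// new_acting = {0,1,3}), changing the pool's size or min_size, or growing
// pg_num so that this pg splits (e.g. 8 -> 16) all make the function above
// return true; if none of the compared fields change it returns false and
// the current interval simply continues.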
4115
4116 bool PastIntervals::is_new_interval(
4117 int old_acting_primary,
4118 int new_acting_primary,
4119 const vector<int> &old_acting,
4120 const vector<int> &new_acting,
4121 int old_up_primary,
4122 int new_up_primary,
4123 const vector<int> &old_up,
4124 const vector<int> &new_up,
4125 const OSDMap *osdmap,
4126 const OSDMap *lastmap,
4127 pg_t pgid)
4128 {
4129 const pg_pool_t *plast = lastmap->get_pg_pool(pgid.pool());
4130 if (!plast) {
4131 return false; // after pool is deleted there are no more interval changes
4132 }
4133 const pg_pool_t *pi = osdmap->get_pg_pool(pgid.pool());
4134 if (!pi) {
4135 return true; // pool was deleted this epoch -> (final!) interval change
4136 }
4137 return
4138 is_new_interval(old_acting_primary,
4139 new_acting_primary,
4140 old_acting,
4141 new_acting,
4142 old_up_primary,
4143 new_up_primary,
4144 old_up,
4145 new_up,
4146 plast->size,
4147 pi->size,
4148 plast->min_size,
4149 pi->min_size,
4150 plast->get_pg_num(),
4151 pi->get_pg_num(),
4152 plast->get_pg_num_pending(),
4153 pi->get_pg_num_pending(),
4154 lastmap->test_flag(CEPH_OSDMAP_SORTBITWISE),
4155 osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE),
4156 lastmap->test_flag(CEPH_OSDMAP_RECOVERY_DELETES),
4157 osdmap->test_flag(CEPH_OSDMAP_RECOVERY_DELETES),
4158 plast->peering_crush_bucket_count, pi->peering_crush_bucket_count,
4159 plast->peering_crush_bucket_target, pi->peering_crush_bucket_target,
4160 plast->peering_crush_bucket_barrier, pi->peering_crush_bucket_barrier,
4161 plast->peering_crush_mandatory_member, pi->peering_crush_mandatory_member,
4162 pgid);
4163 }
4164
4165 bool PastIntervals::check_new_interval(
4166 int old_acting_primary,
4167 int new_acting_primary,
4168 const vector<int> &old_acting,
4169 const vector<int> &new_acting,
4170 int old_up_primary,
4171 int new_up_primary,
4172 const vector<int> &old_up,
4173 const vector<int> &new_up,
4174 epoch_t same_interval_since,
4175 epoch_t last_epoch_clean,
4176 const OSDMap *osdmap,
4177 const OSDMap *lastmap,
4178 pg_t pgid,
4179 const IsPGRecoverablePredicate &could_have_gone_active,
4180 PastIntervals *past_intervals,
4181 std::ostream *out)
4182 {
4183 /*
4184 * We have to be careful to gracefully deal with situations like
4185 * so. Say we have a power outage or something that takes out both
4186 * OSDs, but the monitor doesn't mark them down in the same epoch.
4187 * The history may look like
4188 *
4189 * 1: A B
4190 * 2: B
4191 * 3: let's say B dies for good, too (say, from the power spike)
4192 * 4: A
4193 *
4194 * which makes it look like B may have applied updates to the PG
4195 * that we need in order to proceed. This sucks...
4196 *
4197 * To minimize the risk of this happening, we CANNOT go active if
4198 * _any_ OSDs in the prior set are down until we send an MOSDAlive
4199 * to the monitor such that the OSDMap sets osd_up_thru to an epoch.
4200 * Then, we have something like
4201 *
4202 * 1: A B
4203 * 2: B up_thru[B]=0
4204 * 3:
4205 * 4: A
4206 *
4207 * -> we can ignore B, bc it couldn't have gone active (up_thru still 0).
4208 *
4209 * or,
4210 *
4211 * 1: A B
4212 * 2: B up_thru[B]=0
4213 * 3: B up_thru[B]=2
4214 * 4:
4215 * 5: A
4216 *
4217 * -> we must wait for B, bc it was alive through 2, and could have
4218 * written to the pg.
4219 *
4220 * If B is really dead, then an administrator will need to manually
4221 * intervene by marking the OSD as "lost."
4222 */
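// In code-sketch form (an informal restatement, not part of the original
// source), the decision applied below to an interval [first, last] whose
// acting set was large enough to have gone active is roughly:
//
//   maybe_went_rw =
//        (lastmap->get_up_from(primary) <= first &&
//         lastmap->get_up_thru(primary) >= first)   // primary provably alive
//                                                   // through the interval start
//     || (first <= last_epoch_clean &&
//         last_epoch_clean <= last);                // recovery finished here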
4223
4224 // remember past interval
4225 // NOTE: a change in the up set primary triggers an interval
4226 // change, even though the interval members in the pg_interval_t
4227 // do not change.
4228 ceph_assert(past_intervals);
4229 ceph_assert(past_intervals->past_intervals);
4230 if (is_new_interval(
4231 old_acting_primary,
4232 new_acting_primary,
4233 old_acting,
4234 new_acting,
4235 old_up_primary,
4236 new_up_primary,
4237 old_up,
4238 new_up,
4239 osdmap,
4240 lastmap,
4241 pgid)) {
4242 pg_interval_t i;
4243 i.first = same_interval_since;
4244 i.last = osdmap->get_epoch() - 1;
4245 ceph_assert(i.first <= i.last);
4246 i.acting = old_acting;
4247 i.up = old_up;
4248 i.primary = old_acting_primary;
4249 i.up_primary = old_up_primary;
4250
4251 unsigned num_acting = 0;
4252 for (auto p = i.acting.cbegin(); p != i.acting.cend(); ++p)
4253 if (*p != CRUSH_ITEM_NONE)
4254 ++num_acting;
4255
4256 ceph_assert(lastmap->get_pools().count(pgid.pool()));
4257 const pg_pool_t& old_pg_pool = lastmap->get_pools().find(pgid.pool())->second;
4258 set<pg_shard_t> old_acting_shards;
4259 old_pg_pool.convert_to_pg_shards(old_acting, &old_acting_shards);
4260
4261 if (num_acting &&
4262 i.primary != -1 &&
4263 num_acting >= old_pg_pool.min_size &&
4264 (!old_pg_pool.is_stretch_pool() ||
4265 old_pg_pool.stretch_set_can_peer(old_acting, *lastmap, out)) &&
4266 could_have_gone_active(old_acting_shards)) {
4267 if (out)
4268 *out << __func__ << " " << i
4269 << " up_thru " << lastmap->get_up_thru(i.primary)
4270 << " up_from " << lastmap->get_up_from(i.primary)
4271 << " last_epoch_clean " << last_epoch_clean;
4272 if (lastmap->get_up_thru(i.primary) >= i.first &&
4273 lastmap->get_up_from(i.primary) <= i.first) {
4274 i.maybe_went_rw = true;
4275 if (out)
4276 *out << " " << i
4277 << " : primary up " << lastmap->get_up_from(i.primary)
4278 << "-" << lastmap->get_up_thru(i.primary)
4279 << " includes interval"
4280 << std::endl;
4281 } else if (last_epoch_clean >= i.first &&
4282 last_epoch_clean <= i.last) {
4283 // If the last_epoch_clean is included in this interval, then
4284 // the pg must have been rw (for recovery to have completed).
4285 // This is important because we won't know the _real_
4286 // first_epoch because we stop at last_epoch_clean, and we
4287 // don't want the oldest interval to randomly have
4288 // maybe_went_rw false depending on the relative up_thru vs
4289 // last_epoch_clean timing.
4290 i.maybe_went_rw = true;
4291 if (out)
4292 *out << " " << i
4293 << " : includes last_epoch_clean " << last_epoch_clean
4294 << " and presumed to have been rw"
4295 << std::endl;
4296 } else {
4297 i.maybe_went_rw = false;
4298 if (out)
4299 *out << " " << i
4300 << " : primary up " << lastmap->get_up_from(i.primary)
4301 << "-" << lastmap->get_up_thru(i.primary)
4302 << " does not include interval"
4303 << std::endl;
4304 }
4305 } else {
4306 i.maybe_went_rw = false;
4307 if (out)
4308 *out << __func__ << " " << i << " : acting set is too small" << std::endl;
4309 }
4310 past_intervals->past_intervals->add_interval(old_pg_pool.is_erasure(), i);
4311 return true;
4312 } else {
4313 return false;
4314 }
4315 }
4316
4317 // true if the given map affects the prior set
4318 bool PastIntervals::PriorSet::affected_by_map(
4319 const OSDMap &osdmap,
4320 const DoutPrefixProvider *dpp) const
4321 {
4322 for (auto p = probe.begin(); p != probe.end(); ++p) {
4323 int o = p->osd;
4324
4325 // did someone in the prior set go down?
4326 if (osdmap.is_down(o) && down.count(o) == 0) {
4327 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " now down" << dendl;
4328 return true;
4329 }
4330
4331 // did a down osd in cur get (re)marked as lost?
4332 auto r = blocked_by.find(o);
4333 if (r != blocked_by.end()) {
4334 if (!osdmap.exists(o)) {
4335 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " no longer exists" << dendl;
4336 return true;
4337 }
4338 if (osdmap.get_info(o).lost_at != r->second) {
4339 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " (re)marked as lost" << dendl;
4340 return true;
4341 }
4342 }
4343 }
4344
4345 // did someone in the prior down set go up?
4346 for (auto p = down.cbegin(); p != down.cend(); ++p) {
4347 int o = *p;
4348
4349 if (osdmap.is_up(o)) {
4350 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " now up" << dendl;
4351 return true;
4352 }
4353
4354 // did someone in the prior set get lost or destroyed?
4355 if (!osdmap.exists(o)) {
4356 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " no longer exists" << dendl;
4357 return true;
4358 }
4359 // did a down osd in down get (re)marked as lost?
4360 auto r = blocked_by.find(o);
4361 if (r != blocked_by.end()) {
4362 if (osdmap.get_info(o).lost_at != r->second) {
4363 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " (re)marked as lost" << dendl;
4364 return true;
4365 }
4366 }
4367 }
4368
4369 return false;
4370 }
4371
4372 ostream& operator<<(ostream& out, const PastIntervals::pg_interval_t& i)
4373 {
4374 out << "interval(" << i.first << "-" << i.last
4375 << " up " << i.up << "(" << i.up_primary << ")"
4376 << " acting " << i.acting << "(" << i.primary << ")";
4377 if (i.maybe_went_rw)
4378 out << " maybe_went_rw";
4379 out << ")";
4380 return out;
4381 }
4382
4383
4384
4385 // -- pg_query_t --
4386
4387 void pg_query_t::encode(ceph::buffer::list &bl, uint64_t features) const {
4388 ENCODE_START(3, 3, bl);
4389 encode(type, bl);
4390 encode(since, bl);
4391 history.encode(bl);
4392 encode(epoch_sent, bl);
4393 encode(to, bl);
4394 encode(from, bl);
4395 ENCODE_FINISH(bl);
4396 }
4397
4398 void pg_query_t::decode(ceph::buffer::list::const_iterator &bl) {
4399 DECODE_START(3, bl);
4400 decode(type, bl);
4401 decode(since, bl);
4402 history.decode(bl);
4403 decode(epoch_sent, bl);
4404 decode(to, bl);
4405 decode(from, bl);
4406 DECODE_FINISH(bl);
4407 }
4408
4409 void pg_query_t::dump(Formatter *f) const
4410 {
4411 f->dump_int("from", from);
4412 f->dump_int("to", to);
4413 f->dump_string("type", get_type_name());
4414 f->dump_stream("since") << since;
4415 f->dump_stream("epoch_sent") << epoch_sent;
4416 f->open_object_section("history");
4417 history.dump(f);
4418 f->close_section();
4419 }
4420 void pg_query_t::generate_test_instances(list<pg_query_t*>& o)
4421 {
4422 o.push_back(new pg_query_t());
4423 list<pg_history_t*> h;
4424 pg_history_t::generate_test_instances(h);
4425 o.push_back(new pg_query_t(pg_query_t::INFO, shard_id_t(1), shard_id_t(2), *h.back(), 4));
4426 o.push_back(new pg_query_t(pg_query_t::MISSING, shard_id_t(2), shard_id_t(3), *h.back(), 4));
4427 o.push_back(new pg_query_t(pg_query_t::LOG, shard_id_t(0), shard_id_t(0),
4428 eversion_t(4, 5), *h.back(), 4));
4429 o.push_back(new pg_query_t(pg_query_t::FULLLOG,
4430 shard_id_t::NO_SHARD, shard_id_t::NO_SHARD,
4431 *h.back(), 5));
4432 }
4433
4434 // -- pg_lease_t --
4435
4436 void pg_lease_t::encode(bufferlist& bl) const
4437 {
4438 ENCODE_START(1, 1, bl);
4439 encode(readable_until, bl);
4440 encode(readable_until_ub, bl);
4441 encode(interval, bl);
4442 ENCODE_FINISH(bl);
4443 }
4444
4445 void pg_lease_t::decode(bufferlist::const_iterator& p)
4446 {
4447 DECODE_START(1, p);
4448 decode(readable_until, p);
4449 decode(readable_until_ub, p);
4450 decode(interval, p);
4451 DECODE_FINISH(p);
4452 }
4453
4454 void pg_lease_t::dump(Formatter *f) const
4455 {
4456 f->dump_stream("readable_until") << readable_until;
4457 f->dump_stream("readable_until_ub") << readable_until_ub;
4458 f->dump_stream("interval") << interval;
4459 }
4460
4461 void pg_lease_t::generate_test_instances(std::list<pg_lease_t*>& o)
4462 {
4463 o.push_back(new pg_lease_t());
4464 o.push_back(new pg_lease_t());
4465 o.back()->readable_until = make_timespan(1.5);
4466 o.back()->readable_until_ub = make_timespan(3.4);
4467 o.back()->interval = make_timespan(1.0);
4468 }
4469
4470 // -- pg_lease_ack_t --
4471
4472 void pg_lease_ack_t::encode(bufferlist& bl) const
4473 {
4474 ENCODE_START(1, 1, bl);
4475 encode(readable_until_ub, bl);
4476 ENCODE_FINISH(bl);
4477 }
4478
4479 void pg_lease_ack_t::decode(bufferlist::const_iterator& p)
4480 {
4481 DECODE_START(1, p);
4482 decode(readable_until_ub, p);
4483 DECODE_FINISH(p);
4484 }
4485
4486 void pg_lease_ack_t::dump(Formatter *f) const
4487 {
4488 f->dump_stream("readable_until_ub") << readable_until_ub;
4489 }
4490
4491 void pg_lease_ack_t::generate_test_instances(std::list<pg_lease_ack_t*>& o)
4492 {
4493 o.push_back(new pg_lease_ack_t());
4494 o.push_back(new pg_lease_ack_t());
4495 o.back()->readable_until_ub = make_timespan(3.4);
4496 }
4497
4498
4499 // -- ObjectModDesc --
4500 void ObjectModDesc::visit(Visitor *visitor) const
4501 {
4502 auto bp = bl.cbegin();
4503 try {
4504 while (!bp.end()) {
4505 DECODE_START(max_required_version, bp);
4506 uint8_t code;
4507 decode(code, bp);
4508 switch (code) {
4509 case APPEND: {
4510 uint64_t size;
4511 decode(size, bp);
4512 visitor->append(size);
4513 break;
4514 }
4515 case SETATTRS: {
4516 map<string, std::optional<ceph::buffer::list> > attrs;
4517 decode(attrs, bp);
4518 visitor->setattrs(attrs);
4519 break;
4520 }
4521 case DELETE: {
4522 version_t old_version;
4523 decode(old_version, bp);
4524 visitor->rmobject(old_version);
4525 break;
4526 }
4527 case CREATE: {
4528 visitor->create();
4529 break;
4530 }
4531 case UPDATE_SNAPS: {
4532 set<snapid_t> snaps;
4533 decode(snaps, bp);
4534 visitor->update_snaps(snaps);
4535 break;
4536 }
4537 case TRY_DELETE: {
4538 version_t old_version;
4539 decode(old_version, bp);
4540 visitor->try_rmobject(old_version);
4541 break;
4542 }
4543 case ROLLBACK_EXTENTS: {
4544 vector<pair<uint64_t, uint64_t> > extents;
4545 version_t gen;
4546 decode(gen, bp);
4547 decode(extents, bp);
4548 visitor->rollback_extents(gen,extents);
4549 break;
4550 }
4551 default:
4552 ceph_abort_msg("Invalid rollback code");
4553 }
4554 DECODE_FINISH(bp);
4555 }
4556 } catch (...) {
4557 ceph_abort_msg("Invalid encoding");
4558 }
4559 }
4560
4561 struct DumpVisitor : public ObjectModDesc::Visitor {
4562 Formatter *f;
4563 explicit DumpVisitor(Formatter *f) : f(f) {}
4564 void append(uint64_t old_size) override {
4565 f->open_object_section("op");
4566 f->dump_string("code", "APPEND");
4567 f->dump_unsigned("old_size", old_size);
4568 f->close_section();
4569 }
4570 void setattrs(map<string, std::optional<ceph::buffer::list> > &attrs) override {
4571 f->open_object_section("op");
4572 f->dump_string("code", "SETATTRS");
4573 f->open_array_section("attrs");
4574 for (auto i = attrs.begin(); i != attrs.end(); ++i) {
4575 f->dump_string("attr_name", i->first);
4576 }
4577 f->close_section();
4578 f->close_section();
4579 }
4580 void rmobject(version_t old_version) override {
4581 f->open_object_section("op");
4582 f->dump_string("code", "RMOBJECT");
4583 f->dump_unsigned("old_version", old_version);
4584 f->close_section();
4585 }
4586 void try_rmobject(version_t old_version) override {
4587 f->open_object_section("op");
4588 f->dump_string("code", "TRY_RMOBJECT");
4589 f->dump_unsigned("old_version", old_version);
4590 f->close_section();
4591 }
4592 void create() override {
4593 f->open_object_section("op");
4594 f->dump_string("code", "CREATE");
4595 f->close_section();
4596 }
4597 void update_snaps(const set<snapid_t> &snaps) override {
4598 f->open_object_section("op");
4599 f->dump_string("code", "UPDATE_SNAPS");
4600 f->dump_stream("snaps") << snaps;
4601 f->close_section();
4602 }
4603 void rollback_extents(
4604 version_t gen,
4605 const vector<pair<uint64_t, uint64_t> > &extents) override {
4606 f->open_object_section("op");
4607 f->dump_string("code", "ROLLBACK_EXTENTS");
4608 f->dump_unsigned("gen", gen);
4609 f->dump_stream("extents") << extents;
4610 f->close_section();
4611 }
4612 };
4613
4614 void ObjectModDesc::dump(Formatter *f) const
4615 {
4616 f->open_object_section("object_mod_desc");
4617 f->dump_bool("can_local_rollback", can_local_rollback);
4618 f->dump_bool("rollback_info_completed", rollback_info_completed);
4619 {
4620 f->open_array_section("ops");
4621 DumpVisitor vis(f);
4622 visit(&vis);
4623 f->close_section();
4624 }
4625 f->close_section();
4626 }
4627
4628 void ObjectModDesc::generate_test_instances(list<ObjectModDesc*>& o)
4629 {
4630 map<string, std::optional<ceph::buffer::list> > attrs;
4631 attrs[OI_ATTR];
4632 attrs[SS_ATTR];
4633 attrs["asdf"];
4634 o.push_back(new ObjectModDesc());
4635 o.back()->append(100);
4636 o.back()->setattrs(attrs);
4637 o.push_back(new ObjectModDesc());
4638 o.back()->rmobject(1001);
4639 o.push_back(new ObjectModDesc());
4640 o.back()->create();
4641 o.back()->setattrs(attrs);
4642 o.push_back(new ObjectModDesc());
4643 o.back()->create();
4644 o.back()->setattrs(attrs);
4645 o.back()->mark_unrollbackable();
4646 o.back()->append(1000);
4647 }
4648
4649 void ObjectModDesc::encode(ceph::buffer::list &_bl) const
4650 {
4651 ENCODE_START(max_required_version, max_required_version, _bl);
4652 encode(can_local_rollback, _bl);
4653 encode(rollback_info_completed, _bl);
4654 encode(bl, _bl);
4655 ENCODE_FINISH(_bl);
4656 }
4657 void ObjectModDesc::decode(ceph::buffer::list::const_iterator &_bl)
4658 {
4659 DECODE_START(2, _bl);
4660 max_required_version = struct_v;
4661 decode(can_local_rollback, _bl);
4662 decode(rollback_info_completed, _bl);
4663 decode(bl, _bl);
4664 // ensure bl does not pin a larger ceph::buffer in memory
4665 bl.rebuild();
4666 bl.reassign_to_mempool(mempool::mempool_osd_pglog);
4667 DECODE_FINISH(_bl);
4668 }
4669
4670 std::atomic<uint32_t> ObjectCleanRegions::max_num_intervals = {10};
4671
4672 void ObjectCleanRegions::set_max_num_intervals(uint32_t num)
4673 {
4674 max_num_intervals = num;
4675 }
4676
4677 void ObjectCleanRegions::trim()
4678 {
4679 while(clean_offsets.num_intervals() > max_num_intervals) {
4680 typename interval_set<uint64_t>::iterator shortest_interval = clean_offsets.begin();
4681 if (shortest_interval == clean_offsets.end())
4682 break;
4683 for (typename interval_set<uint64_t>::iterator it = clean_offsets.begin();
4684 it != clean_offsets.end();
4685 ++it) {
4686 if (it.get_len() < shortest_interval.get_len())
4687 shortest_interval = it;
4688 }
4689 clean_offsets.erase(shortest_interval);
4690 }
4691 }
4692
4693 void ObjectCleanRegions::merge(const ObjectCleanRegions &other)
4694 {
4695 clean_offsets.intersection_of(other.clean_offsets);
4696 clean_omap = clean_omap && other.clean_omap;
4697 trim();
4698 }
4699
4700 void ObjectCleanRegions::mark_data_region_dirty(uint64_t offset, uint64_t len)
4701 {
4702 interval_set<uint64_t> clean_region;
4703 clean_region.insert(0, (uint64_t)-1);
4704 clean_region.erase(offset, len);
4705 clean_offsets.intersection_of(clean_region);
4706 trim();
4707 }
4708
4709 bool ObjectCleanRegions::is_clean_region(uint64_t offset, uint64_t len) const
4710 {
4711 return clean_offsets.contains(offset, len);
4712 }
4713
4714 void ObjectCleanRegions::mark_omap_dirty()
4715 {
4716 clean_omap = false;
4717 }
4718
4719 void ObjectCleanRegions::mark_object_new()
4720 {
4721 new_object = true;
4722 }
4723
4724 void ObjectCleanRegions::mark_fully_dirty()
4725 {
4726 mark_data_region_dirty(0, (uint64_t)-1);
4727 mark_omap_dirty();
4728 mark_object_new();
4729 }
4730
4731 interval_set<uint64_t> ObjectCleanRegions::get_dirty_regions() const
4732 {
4733 interval_set<uint64_t> dirty_region;
4734 dirty_region.insert(0, (uint64_t)-1);
4735 dirty_region.subtract(clean_offsets);
4736 return dirty_region;
4737 }
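// Usage sketch (illustrative only, not part of the original source).  The
// structure tracks *clean* extents, so dirtying an object removes ranges
// from clean_offsets and the dirty set is simply the complement, as
// get_dirty_regions() above computes.  Assuming a freshly constructed
// instance starts with the whole range clean (as the test instances below
// rely on):
//
//   ObjectCleanRegions ocr;
//   ocr.mark_data_region_dirty(4096, 8192);   // bytes [4096, 12288) dirty
//   bool head_clean = ocr.is_clean_region(0, 4096);          // true
//   interval_set<uint64_t> dirty = ocr.get_dirty_regions();  // [4096, 12288)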
4738
4739 bool ObjectCleanRegions::omap_is_dirty() const
4740 {
4741 return !clean_omap;
4742 }
4743
4744 bool ObjectCleanRegions::object_is_exist() const
4745 {
4746 return !new_object;
4747 }
4748
4749 void ObjectCleanRegions::encode(bufferlist &bl) const
4750 {
4751 ENCODE_START(1, 1, bl);
4752 using ceph::encode;
4753 encode(clean_offsets, bl);
4754 encode(clean_omap, bl);
4755 encode(new_object, bl);
4756 ENCODE_FINISH(bl);
4757 }
4758
4759 void ObjectCleanRegions::decode(bufferlist::const_iterator &bl)
4760 {
4761 DECODE_START(1, bl);
4762 using ceph::decode;
4763 decode(clean_offsets, bl);
4764 decode(clean_omap, bl);
4765 decode(new_object, bl);
4766 DECODE_FINISH(bl);
4767 }
4768
4769 void ObjectCleanRegions::dump(Formatter *f) const
4770 {
4771 f->open_object_section("object_clean_regions");
4772 f->dump_stream("clean_offsets") << clean_offsets;
4773 f->dump_bool("clean_omap", clean_omap);
4774 f->dump_bool("new_object", new_object);
4775 f->close_section();
4776 }
4777
4778 void ObjectCleanRegions::generate_test_instances(list<ObjectCleanRegions*>& o)
4779 {
4780 o.push_back(new ObjectCleanRegions());
4781 o.push_back(new ObjectCleanRegions());
4782 o.back()->mark_data_region_dirty(4096, 40960);
4783 o.back()->mark_omap_dirty();
4784 o.back()->mark_object_new();
4785 }
4786
4787 ostream& operator<<(ostream& out, const ObjectCleanRegions& ocr)
4788 {
4789 return out << "clean_offsets: " << ocr.clean_offsets
4790 << ", clean_omap: " << ocr.clean_omap
4791 << ", new_object: " << ocr.new_object;
4792 }
4793
4794 // -- pg_log_entry_t --
4795
4796 string pg_log_entry_t::get_key_name() const
4797 {
4798 return version.get_key_name();
4799 }
4800
4801 void pg_log_entry_t::encode_with_checksum(ceph::buffer::list& bl) const
4802 {
4803 using ceph::encode;
4804 ceph::buffer::list ebl(sizeof(*this)*2);
4805 this->encode(ebl);
4806 __u32 crc = ebl.crc32c(0);
4807 encode(ebl, bl);
4808 encode(crc, bl);
4809 }
4810
4811 void pg_log_entry_t::decode_with_checksum(ceph::buffer::list::const_iterator& p)
4812 {
4813 using ceph::decode;
4814 ceph::buffer::list bl;
4815 decode(bl, p);
4816 __u32 crc;
4817 decode(crc, p);
4818 if (crc != bl.crc32c(0))
4819 throw ceph::buffer::malformed_input("bad checksum on pg_log_entry_t");
4820 auto q = bl.cbegin();
4821 this->decode(q);
4822 }
4823
4824 void pg_log_entry_t::encode(ceph::buffer::list &bl) const
4825 {
4826 ENCODE_START(14, 4, bl);
4827 encode(op, bl);
4828 encode(soid, bl);
4829 encode(version, bl);
4830
4831 /**
4832 * Added with reverting_to:
4833 * Previous code used prior_version to encode
4834 * what we now call reverting_to. This will
4835 * allow older code to decode reverting_to
4836 * into prior_version as expected.
4837 */
4838 if (op == LOST_REVERT)
4839 encode(reverting_to, bl);
4840 else
4841 encode(prior_version, bl);
4842
4843 encode(reqid, bl);
4844 encode(mtime, bl);
4845 if (op == LOST_REVERT)
4846 encode(prior_version, bl);
4847 encode(snaps, bl);
4848 encode(user_version, bl);
4849 encode(mod_desc, bl);
4850 encode(extra_reqids, bl);
4851 if (op == ERROR)
4852 encode(return_code, bl);
4853 if (!extra_reqids.empty())
4854 encode(extra_reqid_return_codes, bl);
4855 encode(clean_regions, bl);
4856 if (op != ERROR)
4857 encode(return_code, bl);
4858 encode(op_returns, bl);
4859 ENCODE_FINISH(bl);
4860 }
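// Wire-layout note (informal, derived from the encode above and the decode
// below, not part of the original source): a LOST_REVERT entry is written as
//   op, soid, version, reverting_to, reqid, mtime, prior_version, snaps, ...
// while every other op is written as
//   op, soid, version, prior_version, reqid, mtime, snaps, ...
// so a decoder that predates reverting_to simply reads the reverting_to
// value into its prior_version field, which is what the comment in encode()
// intends.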
4861
4862 void pg_log_entry_t::decode(ceph::buffer::list::const_iterator &bl)
4863 {
4864 DECODE_START_LEGACY_COMPAT_LEN(14, 4, 4, bl);
4865 decode(op, bl);
4866 if (struct_v < 2) {
4867 sobject_t old_soid;
4868 decode(old_soid, bl);
4869 soid.oid = old_soid.oid;
4870 soid.snap = old_soid.snap;
4871 invalid_hash = true;
4872 } else {
4873 decode(soid, bl);
4874 }
4875 if (struct_v < 3)
4876 invalid_hash = true;
4877 decode(version, bl);
4878
4879 if (struct_v >= 6 && op == LOST_REVERT)
4880 decode(reverting_to, bl);
4881 else
4882 decode(prior_version, bl);
4883
4884 decode(reqid, bl);
4885
4886 decode(mtime, bl);
4887 if (struct_v < 5)
4888 invalid_pool = true;
4889
4890 if (op == LOST_REVERT) {
4891 if (struct_v >= 6) {
4892 decode(prior_version, bl);
4893 } else {
4894 reverting_to = prior_version;
4895 }
4896 }
4897 if (struct_v >= 7 || // for v >= 7, this is for all ops.
4898 op == CLONE) { // for v < 7, it's only present for CLONE.
4899 decode(snaps, bl);
4900 // ensure snaps does not pin a larger ceph::buffer in memory
4901 snaps.rebuild();
4902 snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
4903 }
4904
4905 if (struct_v >= 8)
4906 decode(user_version, bl);
4907 else
4908 user_version = version.version;
4909
4910 if (struct_v >= 9)
4911 decode(mod_desc, bl);
4912 else
4913 mod_desc.mark_unrollbackable();
4914 if (struct_v >= 10)
4915 decode(extra_reqids, bl);
4916 if (struct_v >= 11 && op == ERROR)
4917 decode(return_code, bl);
4918 if (struct_v >= 12 && !extra_reqids.empty())
4919 decode(extra_reqid_return_codes, bl);
4920 if (struct_v >= 13)
4921 decode(clean_regions, bl);
4922 else
4923 clean_regions.mark_fully_dirty();
4924 if (struct_v >= 14) {
4925 if (op != ERROR) {
4926 decode(return_code, bl);
4927 }
4928 decode(op_returns, bl);
4929 }
4930 DECODE_FINISH(bl);
4931 }
4932
4933 void pg_log_entry_t::dump(Formatter *f) const
4934 {
4935 f->dump_string("op", get_op_name());
4936 f->dump_stream("object") << soid;
4937 f->dump_stream("version") << version;
4938 f->dump_stream("prior_version") << prior_version;
4939 f->dump_stream("reqid") << reqid;
4940 f->open_array_section("extra_reqids");
4941 uint32_t idx = 0;
4942 for (auto p = extra_reqids.begin();
4943 p != extra_reqids.end();
4944 ++idx, ++p) {
4945 f->open_object_section("extra_reqid");
4946 f->dump_stream("reqid") << p->first;
4947 f->dump_stream("user_version") << p->second;
4948 auto it = extra_reqid_return_codes.find(idx);
4949 if (it != extra_reqid_return_codes.end()) {
4950 f->dump_int("return_code", it->second);
4951 }
4952 f->close_section();
4953 }
4954 f->close_section();
4955 f->dump_stream("mtime") << mtime;
4956 f->dump_int("return_code", return_code);
4957 if (!op_returns.empty()) {
4958 f->open_array_section("op_returns");
4959 for (auto& i : op_returns) {
4960 f->dump_object("op", i);
4961 }
4962 f->close_section();
4963 }
4964 if (snaps.length() > 0) {
4965 vector<snapid_t> v;
4966 ceph::buffer::list c = snaps;
4967 auto p = c.cbegin();
4968 try {
4969 using ceph::decode;
4970 decode(v, p);
4971 } catch (...) {
4972 v.clear();
4973 }
4974 f->open_object_section("snaps");
4975 for (auto p = v.begin(); p != v.end(); ++p)
4976 f->dump_unsigned("snap", *p);
4977 f->close_section();
4978 }
4979 {
4980 f->open_object_section("mod_desc");
4981 mod_desc.dump(f);
4982 f->close_section();
4983 }
4984 {
4985 f->open_object_section("clean_regions");
4986 clean_regions.dump(f);
4987 f->close_section();
4988 }
4989 }
4990
4991 void pg_log_entry_t::generate_test_instances(list<pg_log_entry_t*>& o)
4992 {
4993 o.push_back(new pg_log_entry_t());
4994 hobject_t oid(object_t("objname"), "key", 123, 456, 0, "");
4995 o.push_back(new pg_log_entry_t(MODIFY, oid, eversion_t(1,2), eversion_t(3,4),
4996 1, osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
4997 utime_t(8,9), 0));
4998 o.push_back(new pg_log_entry_t(ERROR, oid, eversion_t(1,2), eversion_t(3,4),
4999 1, osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
5000 utime_t(8,9), -ENOENT));
5001 }
5002
5003 ostream& operator<<(ostream& out, const pg_log_entry_t& e)
5004 {
5005 out << e.version << " (" << e.prior_version << ") "
5006 << std::left << std::setw(8) << e.get_op_name() << ' '
5007 << e.soid << " by " << e.reqid << " " << e.mtime
5008 << " " << e.return_code;
5009 if (!e.op_returns.empty()) {
5010 out << " " << e.op_returns;
5011 }
5012 if (e.snaps.length()) {
5013 vector<snapid_t> snaps;
5014 ceph::buffer::list c = e.snaps;
5015 auto p = c.cbegin();
5016 try {
5017 decode(snaps, p);
5018 } catch (...) {
5019 snaps.clear();
5020 }
5021 out << " snaps " << snaps;
5022 }
5023 out << " ObjectCleanRegions " << e.clean_regions;
5024 return out;
5025 }
5026
5027 // -- pg_log_dup_t --
5028
5029 std::string pg_log_dup_t::get_key_name() const
5030 {
5031 static const char prefix[] = "dup_";
5032 std::string key(36, ' ');
5033 memcpy(&key[0], prefix, 4);
5034 version.get_key_name(&key[4]);
5035 key.resize(35); // remove the null terminator
5036 return key;
5037 }
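// Example (illustrative; assumes eversion_t::get_key_name() emits its usual
// zero-padded "epoch.version" text): eversion_t(1, 2) would yield a key like
// "dup_0000000001.00000000000000000002" -- the 4-byte "dup_" prefix plus a
// 31-character version key, which is why the string is trimmed to 35 bytes
// above.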
5038
5039 void pg_log_dup_t::encode(ceph::buffer::list &bl) const
5040 {
5041 ENCODE_START(2, 1, bl);
5042 encode(reqid, bl);
5043 encode(version, bl);
5044 encode(user_version, bl);
5045 encode(return_code, bl);
5046 encode(op_returns, bl);
5047 ENCODE_FINISH(bl);
5048 }
5049
5050 void pg_log_dup_t::decode(ceph::buffer::list::const_iterator &bl)
5051 {
5052 DECODE_START(2, bl);
5053 decode(reqid, bl);
5054 decode(version, bl);
5055 decode(user_version, bl);
5056 decode(return_code, bl);
5057 if (struct_v >= 2) {
5058 decode(op_returns, bl);
5059 }
5060 DECODE_FINISH(bl);
5061 }
5062
5063 void pg_log_dup_t::dump(Formatter *f) const
5064 {
5065 f->dump_stream("reqid") << reqid;
5066 f->dump_stream("version") << version;
5067 f->dump_stream("user_version") << user_version;
5068 f->dump_stream("return_code") << return_code;
5069 if (!op_returns.empty()) {
5070 f->open_array_section("op_returns");
5071 for (auto& i : op_returns) {
5072 f->dump_object("op", i);
5073 }
5074 f->close_section();
5075 }
5076 }
5077
5078 void pg_log_dup_t::generate_test_instances(list<pg_log_dup_t*>& o)
5079 {
5080 o.push_back(new pg_log_dup_t());
5081 o.push_back(new pg_log_dup_t(eversion_t(1,2),
5082 1,
5083 osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
5084 0));
5085 o.push_back(new pg_log_dup_t(eversion_t(1,2),
5086 2,
5087 osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
5088 -ENOENT));
5089 }
5090
5091
5092 std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e) {
5093 out << "log_dup(reqid=" << e.reqid <<
5094 " v=" << e.version << " uv=" << e.user_version <<
5095 " rc=" << e.return_code;
5096 if (!e.op_returns.empty()) {
5097 out << " " << e.op_returns;
5098 }
5099 return out << ")";
5100 }
5101
5102
5103 // -- pg_log_t --
5104
5105 // out: pg_log_t that only has entries that apply to import_pgid using curmap
5106 // reject: entries rejected from "in" are placed in reject.log; other fields are not set.
5107 void pg_log_t::filter_log(spg_t import_pgid, const OSDMap &curmap,
5108 const string &hit_set_namespace, const pg_log_t &in,
5109 pg_log_t &out, pg_log_t &reject)
5110 {
5111 out = in;
5112 out.log.clear();
5113 reject.log.clear();
5114
5115 for (auto i = in.log.cbegin(); i != in.log.cend(); ++i) {
5116
5117 // Reject pg log entries for temporary objects
5118 if (i->soid.is_temp()) {
5119 reject.log.push_back(*i);
5120 continue;
5121 }
5122
5123 if (i->soid.nspace != hit_set_namespace) {
5124 object_t oid = i->soid.oid;
5125 object_locator_t loc(i->soid);
5126 pg_t raw_pgid = curmap.object_locator_to_pg(oid, loc);
5127 pg_t pgid = curmap.raw_pg_to_pg(raw_pgid);
5128
5129 if (import_pgid.pgid == pgid) {
5130 out.log.push_back(*i);
5131 } else {
5132 reject.log.push_back(*i);
5133 }
5134 } else {
5135 out.log.push_back(*i);
5136 }
5137 }
5138 }
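// Behaviour sketch (illustrative, not part of the original source): when
// importing into pg 1.0, an entry whose object currently maps to pg 1.0
// under curmap is copied to out.log, an entry whose object now maps to a
// different pg (say 1.1) lands in reject.log, entries for temporary objects
// always land in reject.log, and entries in the hit-set namespace are passed
// straight through to out.log without remapping.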
5139
5140 void pg_log_t::encode(ceph::buffer::list& bl) const
5141 {
5142 ENCODE_START(7, 3, bl);
5143 encode(head, bl);
5144 encode(tail, bl);
5145 encode(log, bl);
5146 encode(can_rollback_to, bl);
5147 encode(rollback_info_trimmed_to, bl);
5148 encode(dups, bl);
5149 ENCODE_FINISH(bl);
5150 }
5151
5152 void pg_log_t::decode(ceph::buffer::list::const_iterator &bl, int64_t pool)
5153 {
5154 DECODE_START_LEGACY_COMPAT_LEN(7, 3, 3, bl);
5155 decode(head, bl);
5156 decode(tail, bl);
5157 if (struct_v < 2) {
5158 bool backlog;
5159 decode(backlog, bl);
5160 }
5161 decode(log, bl);
5162 if (struct_v >= 5)
5163 decode(can_rollback_to, bl);
5164
5165 if (struct_v >= 6)
5166 decode(rollback_info_trimmed_to, bl);
5167 else
5168 rollback_info_trimmed_to = tail;
5169
5170 if (struct_v >= 7)
5171 decode(dups, bl);
5172
5173 DECODE_FINISH(bl);
5174
5175 // handle hobject_t format change
5176 if (struct_v < 4) {
5177 for (auto i = log.begin(); i != log.end(); ++i) {
5178 if (!i->soid.is_max() && i->soid.pool == -1)
5179 i->soid.pool = pool;
5180 }
5181 }
5182 }
5183
5184 void pg_log_t::dump(Formatter *f) const
5185 {
5186 f->dump_stream("head") << head;
5187 f->dump_stream("tail") << tail;
5188 f->open_array_section("log");
5189 for (auto p = log.cbegin(); p != log.cend(); ++p) {
5190 f->open_object_section("entry");
5191 p->dump(f);
5192 f->close_section();
5193 }
5194 f->close_section();
5195 f->open_array_section("dups");
5196 for (const auto& entry : dups) {
5197 f->open_object_section("entry");
5198 entry.dump(f);
5199 f->close_section();
5200 }
5201 f->close_section();
5202 }
5203
5204 void pg_log_t::generate_test_instances(list<pg_log_t*>& o)
5205 {
5206 o.push_back(new pg_log_t);
5207
5208 // this is nonsensical:
5209 o.push_back(new pg_log_t);
5210 o.back()->head = eversion_t(1,2);
5211 o.back()->tail = eversion_t(3,4);
5212 list<pg_log_entry_t*> e;
5213 pg_log_entry_t::generate_test_instances(e);
5214 for (auto p = e.begin(); p != e.end(); ++p)
5215 o.back()->log.push_back(**p);
5216 }
5217
5218 static void _handle_dups(CephContext* cct, pg_log_t &target, const pg_log_t &other, unsigned maxdups)
5219 {
5220 auto earliest_dup_version =
5221 target.head.version < maxdups ? 0u : target.head.version - maxdups + 1;
5222 lgeneric_subdout(cct, osd, 20) << "copy_up_to/copy_after earliest_dup_version " << earliest_dup_version << dendl;
5223
5224 for (auto d = other.dups.cbegin(); d != other.dups.cend(); ++d) {
5225 if (d->version.version >= earliest_dup_version) {
5226 lgeneric_subdout(cct, osd, 20)
5227 << "copy_up_to/copy_after copy dup version "
5228 << d->version << dendl;
5229 target.dups.push_back(pg_log_dup_t(*d));
5230 }
5231 }
5232
5233 for (auto i = other.log.cbegin(); i != other.log.cend(); ++i) {
5234 ceph_assert(i->version > other.tail);
5235 if (i->version > target.tail)
5236 break;
5237 if (i->version.version >= earliest_dup_version) {
5238 lgeneric_subdout(cct, osd, 20)
5239 << "copy_up_to/copy_after copy dup from log version "
5240 << i->version << dendl;
5241 target.dups.push_back(pg_log_dup_t(*i));
5242 }
5243 }
5244 }
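// Worked example (illustrative): with target.head.version = 100 and
// maxdups = 10, earliest_dup_version is 91, so dup entries -- and entries of
// other.log that fall at or below target.tail -- with version >= 91 are
// appended to target.dups, while anything older is dropped.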
5245
5246
5247 void pg_log_t::copy_after(CephContext* cct, const pg_log_t &other, eversion_t v)
5248 {
5249 can_rollback_to = other.can_rollback_to;
5250 head = other.head;
5251 tail = other.tail;
5252 lgeneric_subdout(cct, osd, 20) << __func__ << " v " << v << dendl;
5253 for (auto i = other.log.crbegin(); i != other.log.crend(); ++i) {
5254 ceph_assert(i->version > other.tail);
5255 if (i->version <= v) {
5256 // make tail accurate.
5257 tail = i->version;
5258 break;
5259 }
5260 lgeneric_subdout(cct, osd, 20) << __func__ << " copy log version " << i->version << dendl;
5261 log.push_front(*i);
5262 }
5263 _handle_dups(cct, *this, other, cct->_conf->osd_pg_log_dups_tracked);
5264 }
5265
5266 void pg_log_t::copy_up_to(CephContext* cct, const pg_log_t &other, int max)
5267 {
5268 can_rollback_to = other.can_rollback_to;
5269 int n = 0;
5270 head = other.head;
5271 tail = other.tail;
5272 lgeneric_subdout(cct, osd, 20) << __func__ << " max " << max << dendl;
5273 for (auto i = other.log.crbegin(); i != other.log.crend(); ++i) {
5274 ceph_assert(i->version > other.tail);
5275 if (n++ >= max) {
5276 tail = i->version;
5277 break;
5278 }
5279 lgeneric_subdout(cct, osd, 20) << __func__ << " copy log version " << i->version << dendl;
5280 log.push_front(*i);
5281 }
5282 _handle_dups(cct, *this, other, cct->_conf->osd_pg_log_dups_tracked);
5283 }
5284
5285 ostream& pg_log_t::print(ostream& out) const
5286 {
5287 out << *this << std::endl;
5288 for (auto p = log.cbegin(); p != log.cend(); ++p)
5289 out << *p << std::endl;
5290 for (const auto& entry : dups) {
5291 out << " dup entry: " << entry << std::endl;
5292 }
5293 return out;
5294 }
5295
5296 // -- pg_missing_t --
5297
5298 ostream& operator<<(ostream& out, const pg_missing_item& i)
5299 {
5300 out << i.need;
5301 if (i.have != eversion_t())
5302 out << "(" << i.have << ")";
5303 out << " flags = " << i.flag_str()
5304 << " " << i.clean_regions;
5305 return out;
5306 }
5307
5308 // -- object_copy_cursor_t --
5309
5310 void object_copy_cursor_t::encode(ceph::buffer::list& bl) const
5311 {
5312 ENCODE_START(1, 1, bl);
5313 encode(attr_complete, bl);
5314 encode(data_offset, bl);
5315 encode(data_complete, bl);
5316 encode(omap_offset, bl);
5317 encode(omap_complete, bl);
5318 ENCODE_FINISH(bl);
5319 }
5320
5321 void object_copy_cursor_t::decode(ceph::buffer::list::const_iterator &bl)
5322 {
5323 DECODE_START(1, bl);
5324 decode(attr_complete, bl);
5325 decode(data_offset, bl);
5326 decode(data_complete, bl);
5327 decode(omap_offset, bl);
5328 decode(omap_complete, bl);
5329 DECODE_FINISH(bl);
5330 }
5331
5332 void object_copy_cursor_t::dump(Formatter *f) const
5333 {
5334 f->dump_unsigned("attr_complete", (int)attr_complete);
5335 f->dump_unsigned("data_offset", data_offset);
5336 f->dump_unsigned("data_complete", (int)data_complete);
5337 f->dump_string("omap_offset", omap_offset);
5338 f->dump_unsigned("omap_complete", (int)omap_complete);
5339 }
5340
5341 void object_copy_cursor_t::generate_test_instances(list<object_copy_cursor_t*>& o)
5342 {
5343 o.push_back(new object_copy_cursor_t);
5344 o.push_back(new object_copy_cursor_t);
5345 o.back()->attr_complete = true;
5346 o.back()->data_offset = 123;
5347 o.push_back(new object_copy_cursor_t);
5348 o.back()->attr_complete = true;
5349 o.back()->data_complete = true;
5350 o.back()->omap_offset = "foo";
5351 o.push_back(new object_copy_cursor_t);
5352 o.back()->attr_complete = true;
5353 o.back()->data_complete = true;
5354 o.back()->omap_complete = true;
5355 }
5356
5357 // -- object_copy_data_t --
5358
5359 void object_copy_data_t::encode(ceph::buffer::list& bl, uint64_t features) const
5360 {
5361 ENCODE_START(8, 5, bl);
5362 encode(size, bl);
5363 encode(mtime, bl);
5364 encode(attrs, bl);
5365 encode(data, bl);
5366 encode(omap_data, bl);
5367 encode(cursor, bl);
5368 encode(omap_header, bl);
5369 encode(snaps, bl);
5370 encode(snap_seq, bl);
5371 encode(flags, bl);
5372 encode(data_digest, bl);
5373 encode(omap_digest, bl);
5374 encode(reqids, bl);
5375 encode(truncate_seq, bl);
5376 encode(truncate_size, bl);
5377 encode(reqid_return_codes, bl);
5378 ENCODE_FINISH(bl);
5379 }
5380
5381 void object_copy_data_t::decode(ceph::buffer::list::const_iterator& bl)
5382 {
5383 DECODE_START(8, bl);
5384 if (struct_v < 5) {
5385 // old
5386 decode(size, bl);
5387 decode(mtime, bl);
5388 {
5389 string category;
5390 decode(category, bl); // no longer used
5391 }
5392 decode(attrs, bl);
5393 decode(data, bl);
5394 {
5395 map<string,ceph::buffer::list> omap;
5396 decode(omap, bl);
5397 omap_data.clear();
5398 if (!omap.empty()) {
5399 using ceph::encode;
5400 encode(omap, omap_data);
5401 }
5402 }
5403 decode(cursor, bl);
5404 if (struct_v >= 2)
5405 decode(omap_header, bl);
5406 if (struct_v >= 3) {
5407 decode(snaps, bl);
5408 decode(snap_seq, bl);
5409 } else {
5410 snaps.clear();
5411 snap_seq = 0;
5412 }
5413 if (struct_v >= 4) {
5414 decode(flags, bl);
5415 decode(data_digest, bl);
5416 decode(omap_digest, bl);
5417 }
5418 } else {
5419 // current
5420 decode(size, bl);
5421 decode(mtime, bl);
5422 decode(attrs, bl);
5423 decode(data, bl);
5424 decode(omap_data, bl);
5425 decode(cursor, bl);
5426 decode(omap_header, bl);
5427 decode(snaps, bl);
5428 decode(snap_seq, bl);
5429 if (struct_v >= 4) {
5430 decode(flags, bl);
5431 decode(data_digest, bl);
5432 decode(omap_digest, bl);
5433 }
5434 if (struct_v >= 6) {
5435 decode(reqids, bl);
5436 }
5437 if (struct_v >= 7) {
5438 decode(truncate_seq, bl);
5439 decode(truncate_size, bl);
5440 }
5441 if (struct_v >= 8) {
5442 decode(reqid_return_codes, bl);
5443 }
5444 }
5445 DECODE_FINISH(bl);
5446 }
5447
5448 void object_copy_data_t::generate_test_instances(list<object_copy_data_t*>& o)
5449 {
5450 o.push_back(new object_copy_data_t());
5451
5452 list<object_copy_cursor_t*> cursors;
5453 object_copy_cursor_t::generate_test_instances(cursors);
5454 auto ci = cursors.begin();
5455 o.back()->cursor = **(ci++);
5456
5457 o.push_back(new object_copy_data_t());
5458 o.back()->cursor = **(ci++);
5459
5460 o.push_back(new object_copy_data_t());
5461 o.back()->size = 1234;
5462 o.back()->mtime.set_from_double(1234);
5463 ceph::buffer::ptr bp("there", 5);
5464 ceph::buffer::list bl;
5465 bl.push_back(bp);
5466 o.back()->attrs["hello"] = bl;
5467 ceph::buffer::ptr bp2("not", 3);
5468 ceph::buffer::list bl2;
5469 bl2.push_back(bp2);
5470 map<string,ceph::buffer::list> omap;
5471 omap["why"] = bl2;
5472 using ceph::encode;
5473 encode(omap, o.back()->omap_data);
5474 ceph::buffer::ptr databp("iamsomedatatocontain", 20);
5475 o.back()->data.push_back(databp);
5476 o.back()->omap_header.append("this is an omap header");
5477 o.back()->snaps.push_back(123);
5478 o.back()->reqids.push_back(make_pair(osd_reqid_t(), version_t()));
5479 }
5480
5481 void object_copy_data_t::dump(Formatter *f) const
5482 {
5483 f->open_object_section("cursor");
5484 cursor.dump(f);
5485 f->close_section(); // cursor
5486 f->dump_int("size", size);
5487 f->dump_stream("mtime") << mtime;
5488 /* we should really print out the attrs here, but ceph::buffer::list
5489 const-correctness prevents that */
5490 f->dump_int("attrs_size", attrs.size());
5491 f->dump_int("flags", flags);
5492 f->dump_unsigned("data_digest", data_digest);
5493 f->dump_unsigned("omap_digest", omap_digest);
5494 f->dump_int("omap_data_length", omap_data.length());
5495 f->dump_int("omap_header_length", omap_header.length());
5496 f->dump_int("data_length", data.length());
5497 f->open_array_section("snaps");
5498 for (auto p = snaps.cbegin(); p != snaps.cend(); ++p)
5499 f->dump_unsigned("snap", *p);
5500 f->close_section();
5501 f->open_array_section("reqids");
5502 uint32_t idx = 0;
5503 for (auto p = reqids.begin();
5504 p != reqids.end();
5505 ++idx, ++p) {
5506 f->open_object_section("extra_reqid");
5507 f->dump_stream("reqid") << p->first;
5508 f->dump_stream("user_version") << p->second;
5509 auto it = reqid_return_codes.find(idx);
5510 if (it != reqid_return_codes.end()) {
5511 f->dump_int("return_code", it->second);
5512 }
5513 f->close_section();
5514 }
5515 f->close_section();
5516 }
5517
5518 // -- pg_create_t --
5519
5520 void pg_create_t::encode(ceph::buffer::list &bl) const
5521 {
5522 ENCODE_START(1, 1, bl);
5523 encode(created, bl);
5524 encode(parent, bl);
5525 encode(split_bits, bl);
5526 ENCODE_FINISH(bl);
5527 }
5528
5529 void pg_create_t::decode(ceph::buffer::list::const_iterator &bl)
5530 {
5531 DECODE_START(1, bl);
5532 decode(created, bl);
5533 decode(parent, bl);
5534 decode(split_bits, bl);
5535 DECODE_FINISH(bl);
5536 }
5537
5538 void pg_create_t::dump(Formatter *f) const
5539 {
5540 f->dump_unsigned("created", created);
5541 f->dump_stream("parent") << parent;
5542 f->dump_int("split_bits", split_bits);
5543 }
5544
5545 void pg_create_t::generate_test_instances(list<pg_create_t*>& o)
5546 {
5547 o.push_back(new pg_create_t);
5548 o.push_back(new pg_create_t(1, pg_t(3, 4), 2));
5549 }
5550
5551
5552 // -- pg_hit_set_info_t --
5553
5554 void pg_hit_set_info_t::encode(ceph::buffer::list& bl) const
5555 {
5556 ENCODE_START(2, 1, bl);
5557 encode(begin, bl);
5558 encode(end, bl);
5559 encode(version, bl);
5560 encode(using_gmt, bl);
5561 ENCODE_FINISH(bl);
5562 }
5563
5564 void pg_hit_set_info_t::decode(ceph::buffer::list::const_iterator& p)
5565 {
5566 DECODE_START(2, p);
5567 decode(begin, p);
5568 decode(end, p);
5569 decode(version, p);
5570 if (struct_v >= 2) {
5571 decode(using_gmt, p);
5572 } else {
5573 using_gmt = false;
5574 }
5575 DECODE_FINISH(p);
5576 }
5577
5578 void pg_hit_set_info_t::dump(Formatter *f) const
5579 {
5580 f->dump_stream("begin") << begin;
5581 f->dump_stream("end") << end;
5582 f->dump_stream("version") << version;
5583 f->dump_stream("using_gmt") << using_gmt;
5584 }
5585
5586 void pg_hit_set_info_t::generate_test_instances(list<pg_hit_set_info_t*>& ls)
5587 {
5588 ls.push_back(new pg_hit_set_info_t);
5589 ls.push_back(new pg_hit_set_info_t);
5590 ls.back()->begin = utime_t(1, 2);
5591 ls.back()->end = utime_t(3, 4);
5592 }
5593
5594
5595 // -- pg_hit_set_history_t --
5596
5597 void pg_hit_set_history_t::encode(ceph::buffer::list& bl) const
5598 {
5599 ENCODE_START(1, 1, bl);
5600 encode(current_last_update, bl);
5601 {
5602 utime_t dummy_stamp;
5603 encode(dummy_stamp, bl);
5604 }
5605 {
5606 pg_hit_set_info_t dummy_info;
5607 encode(dummy_info, bl);
5608 }
5609 encode(history, bl);
5610 ENCODE_FINISH(bl);
5611 }
5612
5613 void pg_hit_set_history_t::decode(ceph::buffer::list::const_iterator& p)
5614 {
5615 DECODE_START(1, p);
5616 decode(current_last_update, p);
5617 {
5618 utime_t dummy_stamp;
5619 decode(dummy_stamp, p);
5620 }
5621 {
5622 pg_hit_set_info_t dummy_info;
5623 decode(dummy_info, p);
5624 }
5625 decode(history, p);
5626 DECODE_FINISH(p);
5627 }
5628
5629 void pg_hit_set_history_t::dump(Formatter *f) const
5630 {
5631 f->dump_stream("current_last_update") << current_last_update;
5632 f->open_array_section("history");
5633 for (auto p = history.cbegin(); p != history.cend(); ++p) {
5634 f->open_object_section("info");
5635 p->dump(f);
5636 f->close_section();
5637 }
5638 f->close_section();
5639 }
5640
5641 void pg_hit_set_history_t::generate_test_instances(list<pg_hit_set_history_t*>& ls)
5642 {
5643 ls.push_back(new pg_hit_set_history_t);
5644 ls.push_back(new pg_hit_set_history_t);
5645 ls.back()->current_last_update = eversion_t(1, 2);
5646 ls.back()->history.push_back(pg_hit_set_info_t());
5647 }
5648
5649 // -- OSDSuperblock --
5650
5651 void OSDSuperblock::encode(ceph::buffer::list &bl) const
5652 {
5653 ENCODE_START(9, 5, bl);
5654 encode(cluster_fsid, bl);
5655 encode(whoami, bl);
5656 encode(current_epoch, bl);
5657 encode(oldest_map, bl);
5658 encode(newest_map, bl);
5659 encode(weight, bl);
5660 compat_features.encode(bl);
5661 encode(clean_thru, bl);
5662 encode(mounted, bl);
5663 encode(osd_fsid, bl);
5664 encode((epoch_t)0, bl); // epoch_t last_epoch_marked_full
5665 encode((uint32_t)0, bl); // map<int64_t,epoch_t> pool_last_epoch_marked_full
5666 encode(purged_snaps_last, bl);
5667 encode(last_purged_snaps_scrub, bl);
5668 ENCODE_FINISH(bl);
5669 }
5670
5671 void OSDSuperblock::decode(ceph::buffer::list::const_iterator &bl)
5672 {
5673 DECODE_START_LEGACY_COMPAT_LEN(9, 5, 5, bl);
5674 if (struct_v < 3) {
5675 string magic;
5676 decode(magic, bl);
5677 }
5678 decode(cluster_fsid, bl);
5679 decode(whoami, bl);
5680 decode(current_epoch, bl);
5681 decode(oldest_map, bl);
5682 decode(newest_map, bl);
5683 decode(weight, bl);
5684 if (struct_v >= 2) {
5685 compat_features.decode(bl);
5686 } else { //upgrade it!
5687 compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
5688 }
5689 decode(clean_thru, bl);
5690 decode(mounted, bl);
5691 if (struct_v >= 4)
5692 decode(osd_fsid, bl);
5693 if (struct_v >= 6) {
5694 epoch_t last_map_marked_full;
5695 decode(last_map_marked_full, bl);
5696 }
5697 if (struct_v >= 7) {
5698 map<int64_t,epoch_t> pool_last_map_marked_full;
5699 decode(pool_last_map_marked_full, bl);
5700 }
5701 if (struct_v >= 9) {
5702 decode(purged_snaps_last, bl);
5703 decode(last_purged_snaps_scrub, bl);
5704 } else {
5705 purged_snaps_last = 0;
5706 }
5707 DECODE_FINISH(bl);
5708 }
5709
5710 void OSDSuperblock::dump(Formatter *f) const
5711 {
5712 f->dump_stream("cluster_fsid") << cluster_fsid;
5713 f->dump_stream("osd_fsid") << osd_fsid;
5714 f->dump_int("whoami", whoami);
5715 f->dump_int("current_epoch", current_epoch);
5716 f->dump_int("oldest_map", oldest_map);
5717 f->dump_int("newest_map", newest_map);
5718 f->dump_float("weight", weight);
5719 f->open_object_section("compat");
5720 compat_features.dump(f);
5721 f->close_section();
5722 f->dump_int("clean_thru", clean_thru);
5723 f->dump_int("last_epoch_mounted", mounted);
5724 f->dump_unsigned("purged_snaps_last", purged_snaps_last);
5725 f->dump_stream("last_purged_snaps_scrub") << last_purged_snaps_scrub;
5726 }
5727
5728 void OSDSuperblock::generate_test_instances(list<OSDSuperblock*>& o)
5729 {
5730 OSDSuperblock z;
5731 o.push_back(new OSDSuperblock(z));
5732 z.cluster_fsid.parse("01010101-0101-0101-0101-010101010101");
5733 z.osd_fsid.parse("02020202-0202-0202-0202-020202020202");
5734 z.whoami = 3;
5735 z.current_epoch = 4;
5736 z.oldest_map = 5;
5737 z.newest_map = 9;
5738 z.mounted = 8;
5739 z.clean_thru = 7;
5740 o.push_back(new OSDSuperblock(z));
5741 o.push_back(new OSDSuperblock(z));
5742 }
5743
5744 // -- SnapSet --
5745
5746 void SnapSet::encode(ceph::buffer::list& bl) const
5747 {
5748 ENCODE_START(3, 2, bl);
5749 encode(seq, bl);
5750 encode(true, bl); // head_exists
5751 encode(snaps, bl);
5752 encode(clones, bl);
5753 encode(clone_overlap, bl);
5754 encode(clone_size, bl);
5755 encode(clone_snaps, bl);
5756 ENCODE_FINISH(bl);
5757 }
5758
5759 void SnapSet::decode(ceph::buffer::list::const_iterator& bl)
5760 {
5761 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
5762 decode(seq, bl);
5763 bl += 1u; // skip legacy head_exists (always true)
5764 decode(snaps, bl);
5765 decode(clones, bl);
5766 decode(clone_overlap, bl);
5767 decode(clone_size, bl);
5768 if (struct_v >= 3) {
5769 decode(clone_snaps, bl);
5770 } else {
5771 clone_snaps.clear();
5772 }
5773 DECODE_FINISH(bl);
5774 }
5775
5776 void SnapSet::dump(Formatter *f) const
5777 {
5778 f->dump_unsigned("seq", seq);
5779 f->open_array_section("clones");
5780 for (auto p = clones.cbegin(); p != clones.cend(); ++p) {
5781 f->open_object_section("clone");
5782 f->dump_unsigned("snap", *p);
5783 auto cs = clone_size.find(*p);
5784 if (cs != clone_size.end())
5785 f->dump_unsigned("size", cs->second);
5786 else
5787 f->dump_string("size", "????");
5788 auto co = clone_overlap.find(*p);
5789 if (co != clone_overlap.end())
5790 f->dump_stream("overlap") << co->second;
5791 else
5792 f->dump_stream("overlap") << "????";
5793 auto q = clone_snaps.find(*p);
5794 if (q != clone_snaps.end()) {
5795 f->open_array_section("snaps");
5796 for (auto s : q->second) {
5797 f->dump_unsigned("snap", s);
5798 }
5799 f->close_section();
5800 }
5801 f->close_section();
5802 }
5803 f->close_section();
5804 }
5805
5806 void SnapSet::generate_test_instances(list<SnapSet*>& o)
5807 {
5808 o.push_back(new SnapSet);
5809 o.push_back(new SnapSet);
5810 o.back()->seq = 123;
5811 o.back()->snaps.push_back(123);
5812 o.back()->snaps.push_back(12);
5813 o.push_back(new SnapSet);
5814 o.back()->seq = 123;
5815 o.back()->snaps.push_back(123);
5816 o.back()->snaps.push_back(12);
5817 o.back()->clones.push_back(12);
5818 o.back()->clone_size[12] = 12345;
5819 o.back()->clone_overlap[12];
5820 o.back()->clone_snaps[12] = {12, 10, 8};
5821 }
5822
5823 ostream& operator<<(ostream& out, const SnapSet& cs)
5824 {
5825 return out << cs.seq << "=" << cs.snaps << ":"
5826 << cs.clone_snaps;
5827 }
5828
5829 void SnapSet::from_snap_set(const librados::snap_set_t& ss, bool legacy)
5830 {
5831 // NOTE: our reconstruction of snaps (and the snapc) is not strictly
5832 // correct: it will not include snaps that still logically exist
5833 // but for which no clone is defined. For all
5834 // practical purposes this doesn't matter, since we only use that
5835 // information to clone on the OSD, and we have already moved
5836 // forward past that part of the object history.
5837
5838 seq = ss.seq;
5839 set<snapid_t> _snaps;
5840 set<snapid_t> _clones;
5841 for (auto p = ss.clones.cbegin(); p != ss.clones.cend(); ++p) {
5842 if (p->cloneid != librados::SNAP_HEAD) {
5843 _clones.insert(p->cloneid);
5844 _snaps.insert(p->snaps.begin(), p->snaps.end());
5845 clone_size[p->cloneid] = p->size;
5846 clone_overlap[p->cloneid]; // the entry must exist, even if it's empty.
5847 for (auto q = p->overlap.cbegin(); q != p->overlap.cend(); ++q)
5848 clone_overlap[p->cloneid].insert(q->first, q->second);
5849 if (!legacy) {
5850 // p->snaps is ascending; clone_snaps is descending
5851 vector<snapid_t>& v = clone_snaps[p->cloneid];
5852 for (auto q = p->snaps.rbegin(); q != p->snaps.rend(); ++q) {
5853 v.push_back(*q);
5854 }
5855 }
5856 }
5857 }
5858
5859 // ascending
5860 clones.clear();
5861 clones.reserve(_clones.size());
5862 for (auto p = _clones.begin(); p != _clones.end(); ++p)
5863 clones.push_back(*p);
5864
5865 // descending
5866 snaps.clear();
5867 snaps.reserve(_snaps.size());
5868 for (auto p = _snaps.rbegin();
5869 p != _snaps.rend(); ++p)
5870 snaps.push_back(*p);
5871 }
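/*
 * Illustrative example of the ordering conventions above (hypothetical
 * values): given a librados snap_set_t with seq 8 and clone entries
 *   {cloneid 2, snaps [1,2]}, {cloneid 5, snaps [4,5]}, {cloneid HEAD},
 * the HEAD entry is skipped and from_snap_set() yields
 *   clones      = [2, 5]                  // ascending
 *   snaps       = [5, 4, 2, 1]            // descending
 *   clone_snaps = {2: [2,1], 5: [5,4]}    // per clone, descending
 * with clone_size and clone_overlap filled from each clone entry.
 */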
5872
5873 uint64_t SnapSet::get_clone_bytes(snapid_t clone) const
5874 {
5875 ceph_assert(clone_size.count(clone));
5876 uint64_t size = clone_size.find(clone)->second;
5877 ceph_assert(clone_overlap.count(clone));
5878 const interval_set<uint64_t> &overlap = clone_overlap.find(clone)->second;
5879 ceph_assert(size >= (uint64_t)overlap.size());
5880 return size - overlap.size();
5881 }
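/*
 * Worked example with hypothetical numbers: if clone_size[4] == 4194304 and
 * clone_overlap[4] covers 3145728 bytes still shared with the next newer
 * clone (or head), get_clone_bytes(4) returns 4194304 - 3145728 = 1048576,
 * i.e. the bytes uniquely attributable to that clone for space accounting.
 */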
5882
5883 void SnapSet::filter(const pg_pool_t &pinfo)
5884 {
5885 vector<snapid_t> oldsnaps;
5886 oldsnaps.swap(snaps);
5887 for (auto i = oldsnaps.cbegin(); i != oldsnaps.cend(); ++i) {
5888 if (!pinfo.is_removed_snap(*i))
5889 snaps.push_back(*i);
5890 }
5891 }
5892
5893 SnapSet SnapSet::get_filtered(const pg_pool_t &pinfo) const
5894 {
5895 SnapSet ss = *this;
5896 ss.filter(pinfo);
5897 return ss;
5898 }
5899
5900 // -- watch_info_t --
5901
5902 void watch_info_t::encode(ceph::buffer::list& bl, uint64_t features) const
5903 {
5904 ENCODE_START(4, 3, bl);
5905 encode(cookie, bl);
5906 encode(timeout_seconds, bl);
5907 encode(addr, bl, features);
5908 ENCODE_FINISH(bl);
5909 }
5910
5911 void watch_info_t::decode(ceph::buffer::list::const_iterator& bl)
5912 {
5913 DECODE_START_LEGACY_COMPAT_LEN(4, 3, 3, bl);
5914 decode(cookie, bl);
5915 if (struct_v < 2) {
5916 uint64_t ver;
5917 decode(ver, bl);
5918 }
5919 decode(timeout_seconds, bl);
5920 if (struct_v >= 4) {
5921 decode(addr, bl);
5922 }
5923 DECODE_FINISH(bl);
5924 }
5925
5926 void watch_info_t::dump(Formatter *f) const
5927 {
5928 f->dump_unsigned("cookie", cookie);
5929 f->dump_unsigned("timeout_seconds", timeout_seconds);
5930 f->open_object_section("addr");
5931 addr.dump(f);
5932 f->close_section();
5933 }
5934
5935 void watch_info_t::generate_test_instances(list<watch_info_t*>& o)
5936 {
5937 o.push_back(new watch_info_t);
5938 o.push_back(new watch_info_t);
5939 o.back()->cookie = 123;
5940 o.back()->timeout_seconds = 99;
5941 entity_addr_t ea;
5942 ea.set_type(entity_addr_t::TYPE_LEGACY);
5943 ea.set_nonce(1);
5944 ea.set_family(AF_INET);
5945 ea.set_in4_quad(0, 127);
5946 ea.set_in4_quad(1, 0);
5947 ea.set_in4_quad(2, 1);
5948 ea.set_in4_quad(3, 2);
5949 ea.set_port(2);
5950 o.back()->addr = ea;
5951 }
5952
5953 // -- chunk_info_t --
5954
5955 void chunk_info_t::encode(ceph::buffer::list& bl) const
5956 {
5957 ENCODE_START(1, 1, bl);
5958 encode(offset, bl);
5959 encode(length, bl);
5960 encode(oid, bl);
5961 __u32 _flags = flags;
5962 encode(_flags, bl);
5963 ENCODE_FINISH(bl);
5964 }
5965
5966 void chunk_info_t::decode(ceph::buffer::list::const_iterator& bl)
5967 {
5968 DECODE_START(1, bl);
5969 decode(offset, bl);
5970 decode(length, bl);
5971 decode(oid, bl);
5972 __u32 _flags;
5973 decode(_flags, bl);
5974 flags = (cflag_t)_flags;
5975 DECODE_FINISH(bl);
5976 }
5977
5978 void chunk_info_t::dump(Formatter *f) const
5979 {
5980 f->dump_unsigned("length", length);
5981 f->open_object_section("oid");
5982 oid.dump(f);
5983 f->close_section();
5984 f->dump_unsigned("flags", flags);
5985 }
5986
5987
5988 bool chunk_info_t::operator==(const chunk_info_t& cit) const
5989 {
5990 if (has_fingerprint()) {
5991 if (oid.oid.name == cit.oid.oid.name) {
5992 return true;
5993 }
5994 } else {
5995 if (offset == cit.offset && length == cit.length &&
5996 oid.oid.name == cit.oid.oid.name) {
5997 return true;
5998 }
5999
6000 }
6001 return false;
6002 }
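/*
 * Note on the equality semantics above: when the left-hand chunk carries a
 * fingerprint, its identity is the fingerprint oid alone, so two chunks that
 * deduplicate to the same fingerprint object compare equal even at different
 * offsets.  A minimal sketch with hypothetical values (assuming the
 * fingerprint flag is set on `a`):
 *
 *   chunk_info_t a, b;
 *   a.offset = 0;     a.length = 4096;  a.oid.oid.name = "sha256-abcd";
 *   b.offset = 8192;  b.length = 4096;  b.oid.oid.name = "sha256-abcd";
 *   // a == b: only oid.oid.name is compared
 *   // without a fingerprint, a == b would be false (offset/length differ)
 */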
6003
6004 bool operator==(const std::pair<const long unsigned int, chunk_info_t> & l,
6005 const std::pair<const long unsigned int, chunk_info_t> & r)
6006 {
6007 return l.first == r.first &&
6008 l.second == r.second;
6009 }
6010
6011 ostream& operator<<(ostream& out, const chunk_info_t& ci)
6012 {
6013 return out << "(len: " << ci.length << " oid: " << ci.oid
6014 << " offset: " << ci.offset
6015 << " flags: " << ci.get_flag_string(ci.flags) << ")";
6016 }
6017
6018 // -- object_manifest_t --
6019
6020 std::ostream& operator<<(std::ostream& out, const object_ref_delta_t & ci)
6021 {
6022 return out << ci.ref_delta << std::endl;
6023 }
6024
6025 void object_manifest_t::calc_refs_to_inc_on_set(
6026 const object_manifest_t* _g,
6027 const object_manifest_t* _l,
6028 object_ref_delta_t &refs) const
6029 {
6030 /* avoid incrementing the same reference on adjacent clones */
6031 auto iter = chunk_map.begin();
6032 auto find_chunk = [](decltype(iter) &i, const object_manifest_t* cur)
6033 -> bool {
6034 if (cur) {
6035 auto c = cur->chunk_map.find(i->first);
6036 if (c != cur->chunk_map.end() && c->second == i->second) {
6037 return true;
6038
6039 }
6040 }
6041 return false;
6042 };
6043
6044 /* If an identical chunk already exists on either _g or _l, do not increment
6045 * the reference
6046 *
6047 * head: [0, 2) ccc, [6, 2) bbb, [8, 2) ccc
6048 * 20: [0, 2) aaa, <- set_chunk
6049 * 30: [0, 2) abc, [6, 2) bbb, [8, 2) ccc
6050 * --> increment the reference
6051 *
6052 * head: [0, 2) ccc, [6, 2) bbb, [8, 2) ccc
6053 * 20: [0, 2) ccc, <- set_chunk
6054 * 30: [0, 2) abc, [6, 2) bbb, [8, 2) ccc
6055 * --> do not need to increment
6056 *
6057 * head: [0, 2) ccc, [6, 2) bbb, [8, 2) ccc
6058 * 20: [0, 2) ccc, <- set_chunk
6059 * 30: [0, 2) ccc, [6, 2) bbb, [8, 2) ccc
6060 * --> decrement the reference of ccc
6061 *
6062 */
6063 for (; iter != chunk_map.end(); ++iter) {
6064 auto found_g = find_chunk(iter, _g);
6065 auto found_l = find_chunk(iter, _l);
6066 if (!found_g && !found_l) {
6067 refs.inc_ref(iter->second.oid);
6068 } else if (found_g && found_l) {
6069 refs.dec_ref(iter->second.oid);
6070 }
6071 }
6072 }
6073
6074 void object_manifest_t::calc_refs_to_drop_on_modify(
6075 const object_manifest_t* _l,
6076 const ObjectCleanRegions& clean_regions,
6077 object_ref_delta_t &refs) const
6078 {
6079 for (auto &p : chunk_map) {
6080 if (!clean_regions.is_clean_region(p.first, p.second.length)) {
6081 // has previous snapshot
6082 if (_l) {
6083 /*
6084 * Let's assume there is a snapshotted manifest object with three chunks
6085 * head: [0, 2) aaa, [6, 2) bbb, [8, 2) ccc
6086 * 20: [0, 2) aaa, [6, 2) bbb, [8, 2) ccc
6087 *
6088 * If we modify [6, 2) at head, we shouldn't decrement bbb's refcount because
6089 * 20 has the reference for bbb. Therefore, we only drop the reference if two chunks
6090 * (head: [6, 2) and 20: [6, 2)) are different.
6091 *
6092 */
6093 auto c = _l->chunk_map.find(p.first);
6094 if (c != _l->chunk_map.end()) {
6095 if (p.second == c->second) {
6096 continue;
6097 }
6098 }
6099 refs.dec_ref(p.second.oid);
6100 } else {
6101 // decrement the reference of the updated chunks if the manifest object has no snapshot
6102 refs.dec_ref(p.second.oid);
6103 }
6104 }
6105 }
6106 }
6107
6108 void object_manifest_t::calc_refs_to_drop_on_removal(
6109 const object_manifest_t* _g,
6110 const object_manifest_t* _l,
6111 object_ref_delta_t &refs) const
6112 {
6113 /* At a high level, the rule is that consecutive clones with the same reference
6114 * at the same offset share a reference. As such, removing *this may result
6115 * in removing references in two cases:
6116 * 1) *this has a reference which it shares with neither _g nor _l
6117 * 2) _g and _l have a reference which they share with each other but not
6118 * *this.
6119 *
6120 * For a particular offset, both 1 and 2 can happen.
6121 *
6122 * Notably, this means that to evaluate the reference change from removing
6123 * the object with *this, we only need to look at the two adjacent clones.
6124 */
6125
6126 // Paper over possibly missing _g or _l -- a null pointer is semantically the same
6127 // as an empty chunk_map
6128 static const object_manifest_t empty;
6129 const object_manifest_t &g = _g ? *_g : empty;
6130 const object_manifest_t &l = _l ? *_l : empty;
6131
6132 auto giter = g.chunk_map.begin();
6133 auto iter = chunk_map.begin();
6134 auto liter = l.chunk_map.begin();
6135
6136 // Translate iter, map pair to the current offset, end() -> max
6137 auto get_offset = [](decltype(iter) &i, const object_manifest_t &manifest)
6138 -> uint64_t {
6139 return i == manifest.chunk_map.end() ?
6140 std::numeric_limits<uint64_t>::max() : i->first;
6141 };
6142
6143 /* If current matches the offset at iter, returns the chunk at *iter
6144 * and increments iter. Otherwise, returns nullptr.
6145 *
6146 * current will always be derived from the min of *giter, *iter, and
6147 * *liter on each cycle, so the result will be that each loop iteration
6148 * will pick up all chunks at the offset being considered, each offset
6149 * will be considered once, and all offsets will be considered.
6150 */
6151 auto get_chunk = [](
6152 uint64_t current, decltype(iter) &i, const object_manifest_t &manifest)
6153 -> const chunk_info_t * {
6154 if (i == manifest.chunk_map.end() || current != i->first) {
6155 return nullptr;
6156 } else {
6157 return &(i++)->second;
6158 }
6159 };
6160
6161 while (giter != g.chunk_map.end() ||
6162 iter != chunk_map.end() ||
6163 liter != l.chunk_map.end()) {
6164 auto current = std::min(
6165 std::min(get_offset(giter, g), get_offset(iter, *this)),
6166 get_offset(liter, l));
6167
6168 auto gchunk = get_chunk(current, giter, g);
6169 auto chunk = get_chunk(current, iter, *this);
6170 auto lchunk = get_chunk(current, liter, l);
6171
6172 if (gchunk && lchunk && *gchunk == *lchunk &&
6173 (!chunk || *gchunk != *chunk)) {
6174 // case 2 from above: l and g match, chunk does not
6175 refs.dec_ref(gchunk->oid);
6176 }
6177
6178 if (chunk &&
6179 (!gchunk || chunk->oid != gchunk->oid) &&
6180 (!lchunk || chunk->oid != lchunk->oid)) {
6181 // case 1 from above: *this matches neither
6182 refs.dec_ref(chunk->oid);
6183 }
6184 }
6185 }
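/*
 * Worked example for the two cases above (hypothetical chunk layout), with
 * clone 10 (*this) being removed between clones 8 (_g) and 12 (_l):
 *
 *   offset 0:  _g: aaa   *this: bbb   _l: aaa
 *     -> case 2: _g and _l match but *this does not; once clone 10 is gone
 *        they become adjacent and share a single reference, so one reference
 *        on aaa is dropped.
 *     -> case 1: bbb matches neither neighbour, so its reference is dropped.
 *
 *   offset 6:  _g: ccc   *this: ccc   _l: ddd
 *     -> nothing to drop: *this shared its ccc reference with _g, and that
 *        reference survives with _g after the removal.
 */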
6186
6187 void object_manifest_t::encode(ceph::buffer::list& bl) const
6188 {
6189 ENCODE_START(1, 1, bl);
6190 encode(type, bl);
6191 switch (type) {
6192 case TYPE_NONE: break;
6193 case TYPE_REDIRECT:
6194 encode(redirect_target, bl);
6195 break;
6196 case TYPE_CHUNKED:
6197 encode(chunk_map, bl);
6198 break;
6199 default:
6200 ceph_abort();
6201 }
6202 ENCODE_FINISH(bl);
6203 }
6204
6205 void object_manifest_t::decode(ceph::buffer::list::const_iterator& bl)
6206 {
6207 DECODE_START(1, bl);
6208 decode(type, bl);
6209 switch (type) {
6210 case TYPE_NONE: break;
6211 case TYPE_REDIRECT:
6212 decode(redirect_target, bl);
6213 break;
6214 case TYPE_CHUNKED:
6215 decode(chunk_map, bl);
6216 break;
6217 default:
6218 ceph_abort();
6219 }
6220 DECODE_FINISH(bl);
6221 }
6222
6223 void object_manifest_t::dump(Formatter *f) const
6224 {
6225 f->dump_unsigned("type", type);
6226 if (type == TYPE_REDIRECT) {
6227 f->open_object_section("redirect_target");
6228 redirect_target.dump(f);
6229 f->close_section();
6230 } else if (type == TYPE_CHUNKED) {
6231 f->open_array_section("chunk_map");
6232 for (auto& p : chunk_map) {
6233 f->open_object_section("chunk");
6234 f->dump_unsigned("offset", p.first);
6235 p.second.dump(f);
6236 f->close_section();
6237 }
6238 f->close_section();
6239 }
6240 }
6241
6242 void object_manifest_t::generate_test_instances(list<object_manifest_t*>& o)
6243 {
6244 o.push_back(new object_manifest_t());
6245 o.back()->type = TYPE_REDIRECT;
6246 }
6247
6248 ostream& operator<<(ostream& out, const object_manifest_t& om)
6249 {
6250 out << "manifest(" << om.get_type_name();
6251 if (om.is_redirect()) {
6252 out << " " << om.redirect_target;
6253 } else if (om.is_chunked()) {
6254 out << " " << om.chunk_map;
6255 }
6256 out << ")";
6257 return out;
6258 }
6259
6260 // -- object_info_t --
6261
6262 void object_info_t::copy_user_bits(const object_info_t& other)
6263 {
6264 // these bits are copied from head->clone.
6265 size = other.size;
6266 mtime = other.mtime;
6267 local_mtime = other.local_mtime;
6268 last_reqid = other.last_reqid;
6269 truncate_seq = other.truncate_seq;
6270 truncate_size = other.truncate_size;
6271 flags = other.flags;
6272 user_version = other.user_version;
6273 data_digest = other.data_digest;
6274 omap_digest = other.omap_digest;
6275 }
6276
6277 void object_info_t::encode(ceph::buffer::list& bl, uint64_t features) const
6278 {
6279 object_locator_t myoloc(soid);
6280 map<entity_name_t, watch_info_t> old_watchers;
6281 for (auto i = watchers.cbegin(); i != watchers.cend(); ++i) {
6282 old_watchers.insert(make_pair(i->first.second, i->second));
6283 }
6284 ENCODE_START(17, 8, bl);
6285 encode(soid, bl);
6286 encode(myoloc, bl); //Retained for compatibility
6287 encode((__u32)0, bl); // was category, no longer used
6288 encode(version, bl);
6289 encode(prior_version, bl);
6290 encode(last_reqid, bl);
6291 encode(size, bl);
6292 encode(mtime, bl);
6293 if (soid.snap == CEPH_NOSNAP)
6294 encode(osd_reqid_t(), bl); // used to be wrlock_by
6295 else
6296 encode((uint32_t)0, bl); // was legacy_snaps
6297 encode(truncate_seq, bl);
6298 encode(truncate_size, bl);
6299 encode(is_lost(), bl);
6300 encode(old_watchers, bl, features);
6301 /* shenanigans to avoid breaking backwards compatibility in the disk format.
6302 * When we can, switch this out for simply putting the version_t on disk. */
6303 eversion_t user_eversion(0, user_version);
6304 encode(user_eversion, bl);
6305 encode(test_flag(FLAG_USES_TMAP), bl);
6306 encode(watchers, bl, features);
6307 __u32 _flags = flags;
6308 encode(_flags, bl);
6309 encode(local_mtime, bl);
6310 encode(data_digest, bl);
6311 encode(omap_digest, bl);
6312 encode(expected_object_size, bl);
6313 encode(expected_write_size, bl);
6314 encode(alloc_hint_flags, bl);
6315 if (has_manifest()) {
6316 encode(manifest, bl);
6317 }
6318 ENCODE_FINISH(bl);
6319 }
6320
6321 void object_info_t::decode(ceph::buffer::list::const_iterator& bl)
6322 {
6323 object_locator_t myoloc;
6324 DECODE_START_LEGACY_COMPAT_LEN(17, 8, 8, bl);
6325 map<entity_name_t, watch_info_t> old_watchers;
6326 decode(soid, bl);
6327 decode(myoloc, bl);
6328 {
6329 string category;
6330 decode(category, bl); // no longer used
6331 }
6332 decode(version, bl);
6333 decode(prior_version, bl);
6334 decode(last_reqid, bl);
6335 decode(size, bl);
6336 decode(mtime, bl);
6337 if (soid.snap == CEPH_NOSNAP) {
6338 osd_reqid_t wrlock_by;
6339 decode(wrlock_by, bl);
6340 } else {
6341 vector<snapid_t> legacy_snaps;
6342 decode(legacy_snaps, bl);
6343 }
6344 decode(truncate_seq, bl);
6345 decode(truncate_size, bl);
6346
6347 // if this is struct_v >= 13, we will overwrite this
6348 // below since this field is just here for backwards
6349 // compatibility
6350 __u8 lo;
6351 decode(lo, bl);
6352 flags = (flag_t)lo;
6353
6354 decode(old_watchers, bl);
6355 eversion_t user_eversion;
6356 decode(user_eversion, bl);
6357 user_version = user_eversion.version;
6358
6359 if (struct_v >= 9) {
6360 bool uses_tmap = false;
6361 decode(uses_tmap, bl);
6362 if (uses_tmap)
6363 set_flag(FLAG_USES_TMAP);
6364 } else {
6365 set_flag(FLAG_USES_TMAP);
6366 }
6367 if (struct_v < 10)
6368 soid.pool = myoloc.pool;
6369 if (struct_v >= 11) {
6370 decode(watchers, bl);
6371 } else {
6372 for (auto i = old_watchers.begin(); i != old_watchers.end(); ++i) {
6373 watchers.insert(
6374 make_pair(
6375 make_pair(i->second.cookie, i->first), i->second));
6376 }
6377 }
6378 if (struct_v >= 13) {
6379 __u32 _flags;
6380 decode(_flags, bl);
6381 flags = (flag_t)_flags;
6382 }
6383 if (struct_v >= 14) {
6384 decode(local_mtime, bl);
6385 } else {
6386 local_mtime = utime_t();
6387 }
6388 if (struct_v >= 15) {
6389 decode(data_digest, bl);
6390 decode(omap_digest, bl);
6391 } else {
6392 data_digest = omap_digest = -1;
6393 clear_flag(FLAG_DATA_DIGEST);
6394 clear_flag(FLAG_OMAP_DIGEST);
6395 }
6396 if (struct_v >= 16) {
6397 decode(expected_object_size, bl);
6398 decode(expected_write_size, bl);
6399 decode(alloc_hint_flags, bl);
6400 } else {
6401 expected_object_size = 0;
6402 expected_write_size = 0;
6403 alloc_hint_flags = 0;
6404 }
6405 if (struct_v >= 17) {
6406 if (has_manifest()) {
6407 decode(manifest, bl);
6408 }
6409 }
6410 DECODE_FINISH(bl);
6411 }
6412
6413 void object_info_t::dump(Formatter *f) const
6414 {
6415 f->open_object_section("oid");
6416 soid.dump(f);
6417 f->close_section();
6418 f->dump_stream("version") << version;
6419 f->dump_stream("prior_version") << prior_version;
6420 f->dump_stream("last_reqid") << last_reqid;
6421 f->dump_unsigned("user_version", user_version);
6422 f->dump_unsigned("size", size);
6423 f->dump_stream("mtime") << mtime;
6424 f->dump_stream("local_mtime") << local_mtime;
6425 f->dump_unsigned("lost", (int)is_lost());
6426 vector<string> sv = get_flag_vector(flags);
6427 f->open_array_section("flags");
6428 for (const auto& str: sv) {
6429 f->dump_string("flags", str);
6430 }
6431 f->close_section();
6432 f->dump_unsigned("truncate_seq", truncate_seq);
6433 f->dump_unsigned("truncate_size", truncate_size);
6434 f->dump_format("data_digest", "0x%08x", data_digest);
6435 f->dump_format("omap_digest", "0x%08x", omap_digest);
6436 f->dump_unsigned("expected_object_size", expected_object_size);
6437 f->dump_unsigned("expected_write_size", expected_write_size);
6438 f->dump_unsigned("alloc_hint_flags", alloc_hint_flags);
6439 f->dump_object("manifest", manifest);
6440 f->open_object_section("watchers");
6441 for (auto p = watchers.cbegin(); p != watchers.cend(); ++p) {
6442 CachedStackStringStream css;
6443 *css << p->first.second;
6444 f->open_object_section(css->strv());
6445 p->second.dump(f);
6446 f->close_section();
6447 }
6448 f->close_section();
6449 }
6450
6451 void object_info_t::generate_test_instances(list<object_info_t*>& o)
6452 {
6453 o.push_back(new object_info_t());
6454
6455 // fixme
6456 }
6457
6458
6459 ostream& operator<<(ostream& out, const object_info_t& oi)
6460 {
6461 out << oi.soid << "(" << oi.version
6462 << " " << oi.last_reqid;
6463 if (oi.flags)
6464 out << " " << oi.get_flag_string();
6465 out << " s " << oi.size;
6466 out << " uv " << oi.user_version;
6467 if (oi.is_data_digest())
6468 out << " dd " << std::hex << oi.data_digest << std::dec;
6469 if (oi.is_omap_digest())
6470 out << " od " << std::hex << oi.omap_digest << std::dec;
6471 out << " alloc_hint [" << oi.expected_object_size
6472 << " " << oi.expected_write_size
6473 << " " << oi.alloc_hint_flags << "]";
6474 if (oi.has_manifest())
6475 out << " " << oi.manifest;
6476 out << ")";
6477 return out;
6478 }
6479
6480 // -- ObjectRecovery --
6481 void ObjectRecoveryProgress::encode(ceph::buffer::list &bl) const
6482 {
6483 ENCODE_START(1, 1, bl);
6484 encode(first, bl);
6485 encode(data_complete, bl);
6486 encode(data_recovered_to, bl);
6487 encode(omap_recovered_to, bl);
6488 encode(omap_complete, bl);
6489 ENCODE_FINISH(bl);
6490 }
6491
6492 void ObjectRecoveryProgress::decode(ceph::buffer::list::const_iterator &bl)
6493 {
6494 DECODE_START(1, bl);
6495 decode(first, bl);
6496 decode(data_complete, bl);
6497 decode(data_recovered_to, bl);
6498 decode(omap_recovered_to, bl);
6499 decode(omap_complete, bl);
6500 DECODE_FINISH(bl);
6501 }
6502
6503 ostream &operator<<(ostream &out, const ObjectRecoveryProgress &prog)
6504 {
6505 return prog.print(out);
6506 }
6507
6508 void ObjectRecoveryProgress::generate_test_instances(
6509 list<ObjectRecoveryProgress*>& o)
6510 {
6511 o.push_back(new ObjectRecoveryProgress);
6512 o.back()->first = false;
6513 o.back()->data_complete = true;
6514 o.back()->omap_complete = true;
6515 o.back()->data_recovered_to = 100;
6516
6517 o.push_back(new ObjectRecoveryProgress);
6518 o.back()->first = true;
6519 o.back()->data_complete = false;
6520 o.back()->omap_complete = false;
6521 o.back()->data_recovered_to = 0;
6522 }
6523
6524 ostream &ObjectRecoveryProgress::print(ostream &out) const
6525 {
6526 return out << "ObjectRecoveryProgress("
6527 << ( first ? "" : "!" ) << "first, "
6528 << "data_recovered_to:" << data_recovered_to
6529 << ", data_complete:" << ( data_complete ? "true" : "false" )
6530 << ", omap_recovered_to:" << omap_recovered_to
6531 << ", omap_complete:" << ( omap_complete ? "true" : "false" )
6532 << ", error:" << ( error ? "true" : "false" )
6533 << ")";
6534 }
6535
6536 void ObjectRecoveryProgress::dump(Formatter *f) const
6537 {
6538 f->dump_int("first?", first);
6539 f->dump_int("data_complete?", data_complete);
6540 f->dump_unsigned("data_recovered_to", data_recovered_to);
6541 f->dump_int("omap_complete?", omap_complete);
6542 f->dump_string("omap_recovered_to", omap_recovered_to);
6543 }
6544
6545 void ObjectRecoveryInfo::encode(ceph::buffer::list &bl, uint64_t features) const
6546 {
6547 ENCODE_START(3, 1, bl);
6548 encode(soid, bl);
6549 encode(version, bl);
6550 encode(size, bl);
6551 encode(oi, bl, features);
6552 encode(ss, bl);
6553 encode(copy_subset, bl);
6554 encode(clone_subset, bl);
6555 encode(object_exist, bl);
6556 ENCODE_FINISH(bl);
6557 }
6558
6559 void ObjectRecoveryInfo::decode(ceph::buffer::list::const_iterator &bl,
6560 int64_t pool)
6561 {
6562 DECODE_START(3, bl);
6563 decode(soid, bl);
6564 decode(version, bl);
6565 decode(size, bl);
6566 decode(oi, bl);
6567 decode(ss, bl);
6568 decode(copy_subset, bl);
6569 decode(clone_subset, bl);
6570 if (struct_v > 2)
6571 decode(object_exist, bl);
6572 else
6573 object_exist = false;
6574 DECODE_FINISH(bl);
6575 if (struct_v < 2) {
6576 if (!soid.is_max() && soid.pool == -1)
6577 soid.pool = pool;
6578 map<hobject_t, interval_set<uint64_t>> tmp;
6579 tmp.swap(clone_subset);
6580 for (auto i = tmp.begin(); i != tmp.end(); ++i) {
6581 hobject_t first(i->first);
6582 if (!first.is_max() && first.pool == -1)
6583 first.pool = pool;
6584 clone_subset[first].swap(i->second);
6585 }
6586 }
6587 }
6588
6589 void ObjectRecoveryInfo::generate_test_instances(
6590 list<ObjectRecoveryInfo*>& o)
6591 {
6592 o.push_back(new ObjectRecoveryInfo);
6593 o.back()->soid = hobject_t(sobject_t("key", CEPH_NOSNAP));
6594 o.back()->version = eversion_t(0,0);
6595 o.back()->size = 100;
6596 o.back()->object_exist = false;
6597 }
6598
6599
6600 void ObjectRecoveryInfo::dump(Formatter *f) const
6601 {
6602 f->dump_stream("object") << soid;
6603 f->dump_stream("at_version") << version;
6604 f->dump_stream("size") << size;
6605 {
6606 f->open_object_section("object_info");
6607 oi.dump(f);
6608 f->close_section();
6609 }
6610 {
6611 f->open_object_section("snapset");
6612 ss.dump(f);
6613 f->close_section();
6614 }
6615 f->dump_stream("copy_subset") << copy_subset;
6616 f->dump_stream("clone_subset") << clone_subset;
6617 f->dump_stream("object_exist") << object_exist;
6618 }
6619
6620 ostream& operator<<(ostream& out, const ObjectRecoveryInfo &inf)
6621 {
6622 return inf.print(out);
6623 }
6624
6625 ostream &ObjectRecoveryInfo::print(ostream &out) const
6626 {
6627 return out << "ObjectRecoveryInfo("
6628 << soid << "@" << version
6629 << ", size: " << size
6630 << ", copy_subset: " << copy_subset
6631 << ", clone_subset: " << clone_subset
6632 << ", snapset: " << ss
6633 << ", object_exist: " << object_exist
6634 << ")";
6635 }
6636
6637 // -- PushReplyOp --
6638 void PushReplyOp::generate_test_instances(list<PushReplyOp*> &o)
6639 {
6640 o.push_back(new PushReplyOp);
6641 o.push_back(new PushReplyOp);
6642 o.back()->soid = hobject_t(sobject_t("asdf", 2));
6643 o.push_back(new PushReplyOp);
6644 o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP));
6645 }
6646
6647 void PushReplyOp::encode(ceph::buffer::list &bl) const
6648 {
6649 ENCODE_START(1, 1, bl);
6650 encode(soid, bl);
6651 ENCODE_FINISH(bl);
6652 }
6653
6654 void PushReplyOp::decode(ceph::buffer::list::const_iterator &bl)
6655 {
6656 DECODE_START(1, bl);
6657 decode(soid, bl);
6658 DECODE_FINISH(bl);
6659 }
6660
6661 void PushReplyOp::dump(Formatter *f) const
6662 {
6663 f->dump_stream("soid") << soid;
6664 }
6665
6666 ostream &PushReplyOp::print(ostream &out) const
6667 {
6668 return out
6669 << "PushReplyOp(" << soid
6670 << ")";
6671 }
6672
6673 ostream& operator<<(ostream& out, const PushReplyOp &op)
6674 {
6675 return op.print(out);
6676 }
6677
6678 uint64_t PushReplyOp::cost(CephContext *cct) const
6679 {
6680
6681 return cct->_conf->osd_push_per_object_cost +
6682 cct->_conf->osd_recovery_max_chunk;
6683 }
6684
6685 // -- PullOp --
6686 void PullOp::generate_test_instances(list<PullOp*> &o)
6687 {
6688 o.push_back(new PullOp);
6689 o.push_back(new PullOp);
6690 o.back()->soid = hobject_t(sobject_t("asdf", 2));
6691 o.back()->recovery_info.version = eversion_t(3, 10);
6692 o.push_back(new PullOp);
6693 o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP));
6694 o.back()->recovery_info.version = eversion_t(0, 0);
6695 }
6696
6697 void PullOp::encode(ceph::buffer::list &bl, uint64_t features) const
6698 {
6699 ENCODE_START(1, 1, bl);
6700 encode(soid, bl);
6701 encode(recovery_info, bl, features);
6702 encode(recovery_progress, bl);
6703 ENCODE_FINISH(bl);
6704 }
6705
6706 void PullOp::decode(ceph::buffer::list::const_iterator &bl)
6707 {
6708 DECODE_START(1, bl);
6709 decode(soid, bl);
6710 decode(recovery_info, bl);
6711 decode(recovery_progress, bl);
6712 DECODE_FINISH(bl);
6713 }
6714
6715 void PullOp::dump(Formatter *f) const
6716 {
6717 f->dump_stream("soid") << soid;
6718 {
6719 f->open_object_section("recovery_info");
6720 recovery_info.dump(f);
6721 f->close_section();
6722 }
6723 {
6724 f->open_object_section("recovery_progress");
6725 recovery_progress.dump(f);
6726 f->close_section();
6727 }
6728 }
6729
6730 ostream &PullOp::print(ostream &out) const
6731 {
6732 return out
6733 << "PullOp(" << soid
6734 << ", recovery_info: " << recovery_info
6735 << ", recovery_progress: " << recovery_progress
6736 << ")";
6737 }
6738
6739 ostream& operator<<(ostream& out, const PullOp &op)
6740 {
6741 return op.print(out);
6742 }
6743
6744 uint64_t PullOp::cost(CephContext *cct) const
6745 {
6746 return cct->_conf->osd_push_per_object_cost +
6747 cct->_conf->osd_recovery_max_chunk;
6748 }
6749
6750 // -- PushOp --
6751 void PushOp::generate_test_instances(list<PushOp*> &o)
6752 {
6753 o.push_back(new PushOp);
6754 o.push_back(new PushOp);
6755 o.back()->soid = hobject_t(sobject_t("asdf", 2));
6756 o.back()->version = eversion_t(3, 10);
6757 o.push_back(new PushOp);
6758 o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP));
6759 o.back()->version = eversion_t(0, 0);
6760 }
6761
6762 void PushOp::encode(ceph::buffer::list &bl, uint64_t features) const
6763 {
6764 ENCODE_START(1, 1, bl);
6765 encode(soid, bl);
6766 encode(version, bl);
6767 encode(data, bl);
6768 encode(data_included, bl);
6769 encode(omap_header, bl);
6770 encode(omap_entries, bl);
6771 encode(attrset, bl);
6772 encode(recovery_info, bl, features);
6773 encode(after_progress, bl);
6774 encode(before_progress, bl);
6775 ENCODE_FINISH(bl);
6776 }
6777
6778 void PushOp::decode(ceph::buffer::list::const_iterator &bl)
6779 {
6780 DECODE_START(1, bl);
6781 decode(soid, bl);
6782 decode(version, bl);
6783 decode(data, bl);
6784 decode(data_included, bl);
6785 decode(omap_header, bl);
6786 decode(omap_entries, bl);
6787 decode(attrset, bl);
6788 decode(recovery_info, bl);
6789 decode(after_progress, bl);
6790 decode(before_progress, bl);
6791 DECODE_FINISH(bl);
6792 }
6793
6794 void PushOp::dump(Formatter *f) const
6795 {
6796 f->dump_stream("soid") << soid;
6797 f->dump_stream("version") << version;
6798 f->dump_int("data_len", data.length());
6799 f->dump_stream("data_included") << data_included;
6800 f->dump_int("omap_header_len", omap_header.length());
6801 f->dump_int("omap_entries_len", omap_entries.size());
6802 f->dump_int("attrset_len", attrset.size());
6803 {
6804 f->open_object_section("recovery_info");
6805 recovery_info.dump(f);
6806 f->close_section();
6807 }
6808 {
6809 f->open_object_section("after_progress");
6810 after_progress.dump(f);
6811 f->close_section();
6812 }
6813 {
6814 f->open_object_section("before_progress");
6815 before_progress.dump(f);
6816 f->close_section();
6817 }
6818 }
6819
6820 ostream &PushOp::print(ostream &out) const
6821 {
6822 return out
6823 << "PushOp(" << soid
6824 << ", version: " << version
6825 << ", data_included: " << data_included
6826 << ", data_size: " << data.length()
6827 << ", omap_header_size: " << omap_header.length()
6828 << ", omap_entries_size: " << omap_entries.size()
6829 << ", attrset_size: " << attrset.size()
6830 << ", recovery_info: " << recovery_info
6831 << ", after_progress: " << after_progress
6832 << ", before_progress: " << before_progress
6833 << ")";
6834 }
6835
6836 ostream& operator<<(ostream& out, const PushOp &op)
6837 {
6838 return op.print(out);
6839 }
6840
6841 uint64_t PushOp::cost(CephContext *cct) const
6842 {
6843 uint64_t cost = data_included.size();
6844 for (auto i = omap_entries.cbegin(); i != omap_entries.cend(); ++i) {
6845 cost += i->second.length();
6846 }
6847 cost += cct->_conf->osd_push_per_object_cost;
6848 return cost;
6849 }
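/*
 * Worked example with hypothetical values: a push whose data_included
 * interval_set covers 65536 bytes, carrying two omap entries of 100 and 200
 * bytes, with osd_push_per_object_cost = 1000, is costed at
 * 65536 + 100 + 200 + 1000 = 66836.  Note that interval_set::size() counts
 * bytes covered, not the number of intervals.
 */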
6850
6851 // -- ScrubMap --
6852
6853 void ScrubMap::merge_incr(const ScrubMap &l)
6854 {
6855 ceph_assert(valid_through == l.incr_since);
6856 valid_through = l.valid_through;
6857
6858 for (auto p = l.objects.cbegin(); p != l.objects.cend(); ++p){
6859 if (p->second.negative) {
6860 auto q = objects.find(p->first);
6861 if (q != objects.end()) {
6862 objects.erase(q);
6863 }
6864 } else {
6865 objects[p->first] = p->second;
6866 }
6867 }
6868 }
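/*
 * Illustrative example of the incremental merge above (hypothetical
 * versions): a base map with valid_through = (1,10) can absorb an
 * incremental map whose incr_since = (1,10) and valid_through = (1,20);
 * the incremental entries overwrite the base entries, entries flagged
 * negative delete them, and the merged valid_through becomes (1,20).
 */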
6869
6870 void ScrubMap::encode(ceph::buffer::list& bl) const
6871 {
6872 ENCODE_START(3, 2, bl);
6873 encode(objects, bl);
6874 encode((__u32)0, bl); // used to be attrs; now deprecated
6875 ceph::buffer::list old_logbl; // not used
6876 encode(old_logbl, bl);
6877 encode(valid_through, bl);
6878 encode(incr_since, bl);
6879 ENCODE_FINISH(bl);
6880 }
6881
6882 void ScrubMap::decode(ceph::buffer::list::const_iterator& bl, int64_t pool)
6883 {
6884 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
6885 decode(objects, bl);
6886 {
6887 map<string,string> attrs; // deprecated
6888 decode(attrs, bl);
6889 }
6890 ceph::buffer::list old_logbl; // not used
6891 decode(old_logbl, bl);
6892 decode(valid_through, bl);
6893 decode(incr_since, bl);
6894 DECODE_FINISH(bl);
6895
6896 // handle hobject_t upgrade
6897 if (struct_v < 3) {
6898 map<hobject_t, object> tmp;
6899 tmp.swap(objects);
6900 for (auto i = tmp.begin(); i != tmp.end(); ++i) {
6901 hobject_t first(i->first);
6902 if (!first.is_max() && first.pool == -1)
6903 first.pool = pool;
6904 objects[first] = i->second;
6905 }
6906 }
6907 }
6908
6909 void ScrubMap::dump(Formatter *f) const
6910 {
6911 f->dump_stream("valid_through") << valid_through;
6912 f->dump_stream("incremental_since") << incr_since;
6913 f->open_array_section("objects");
6914 for (auto p = objects.cbegin(); p != objects.cend(); ++p) {
6915 f->open_object_section("object");
6916 f->dump_string("name", p->first.oid.name);
6917 f->dump_unsigned("hash", p->first.get_hash());
6918 f->dump_string("key", p->first.get_key());
6919 f->dump_int("snapid", p->first.snap);
6920 p->second.dump(f);
6921 f->close_section();
6922 }
6923 f->close_section();
6924 }
6925
6926 void ScrubMap::generate_test_instances(list<ScrubMap*>& o)
6927 {
6928 o.push_back(new ScrubMap);
6929 o.push_back(new ScrubMap);
6930 o.back()->valid_through = eversion_t(1, 2);
6931 o.back()->incr_since = eversion_t(3, 4);
6932 list<object*> obj;
6933 object::generate_test_instances(obj);
6934 o.back()->objects[hobject_t(object_t("foo"), "fookey", 123, 456, 0, "")] = *obj.back();
6935 obj.pop_back();
6936 o.back()->objects[hobject_t(object_t("bar"), string(), 123, 456, 0, "")] = *obj.back();
6937 }
6938
6939 // -- ScrubMap::object --
6940
6941 void ScrubMap::object::encode(ceph::buffer::list& bl) const
6942 {
6943 bool compat_read_error = read_error || ec_hash_mismatch || ec_size_mismatch;
6944 ENCODE_START(10, 7, bl);
6945 encode(size, bl);
6946 encode(negative, bl);
6947 encode(attrs, bl);
6948 encode(digest, bl);
6949 encode(digest_present, bl);
6950 encode((uint32_t)0, bl); // obsolete nlinks
6951 encode((uint32_t)0, bl); // snapcolls
6952 encode(omap_digest, bl);
6953 encode(omap_digest_present, bl);
6954 encode(compat_read_error, bl);
6955 encode(stat_error, bl);
6956 encode(read_error, bl);
6957 encode(ec_hash_mismatch, bl);
6958 encode(ec_size_mismatch, bl);
6959 encode(large_omap_object_found, bl);
6960 encode(large_omap_object_key_count, bl);
6961 encode(large_omap_object_value_size, bl);
6962 encode(object_omap_bytes, bl);
6963 encode(object_omap_keys, bl);
6964 ENCODE_FINISH(bl);
6965 }
6966
6967 void ScrubMap::object::decode(ceph::buffer::list::const_iterator& bl)
6968 {
6969 DECODE_START(10, bl);
6970 decode(size, bl);
6971 bool tmp, compat_read_error = false;
6972 decode(tmp, bl);
6973 negative = tmp;
6974 decode(attrs, bl);
6975 decode(digest, bl);
6976 decode(tmp, bl);
6977 digest_present = tmp;
6978 {
6979 uint32_t nlinks;
6980 decode(nlinks, bl);
6981 set<snapid_t> snapcolls;
6982 decode(snapcolls, bl);
6983 }
6984 decode(omap_digest, bl);
6985 decode(tmp, bl);
6986 omap_digest_present = tmp;
6987 decode(compat_read_error, bl);
6988 decode(tmp, bl);
6989 stat_error = tmp;
6990 if (struct_v >= 8) {
6991 decode(tmp, bl);
6992 read_error = tmp;
6993 decode(tmp, bl);
6994 ec_hash_mismatch = tmp;
6995 decode(tmp, bl);
6996 ec_size_mismatch = tmp;
6997 }
6998 // If older encoder found a read_error, set read_error
6999 if (compat_read_error && !read_error && !ec_hash_mismatch && !ec_size_mismatch)
7000 read_error = true;
7001 if (struct_v >= 9) {
7002 decode(tmp, bl);
7003 large_omap_object_found = tmp;
7004 decode(large_omap_object_key_count, bl);
7005 decode(large_omap_object_value_size, bl);
7006 }
7007 if (struct_v >= 10) {
7008 decode(object_omap_bytes, bl);
7009 decode(object_omap_keys, bl);
7010 }
7011 DECODE_FINISH(bl);
7012 }
7013
7014 void ScrubMap::object::dump(Formatter *f) const
7015 {
7016 f->dump_int("size", size);
7017 f->dump_int("negative", negative);
7018 f->open_array_section("attrs");
7019 for (auto p = attrs.cbegin(); p != attrs.cend(); ++p) {
7020 f->open_object_section("attr");
7021 f->dump_string("name", p->first);
7022 f->dump_int("length", p->second.length());
7023 f->close_section();
7024 }
7025 f->close_section();
7026 }
7027
7028 void ScrubMap::object::generate_test_instances(list<object*>& o)
7029 {
7030 o.push_back(new object);
7031 o.push_back(new object);
7032 o.back()->negative = true;
7033 o.push_back(new object);
7034 o.back()->size = 123;
7035 o.back()->attrs["foo"] = ceph::buffer::copy("foo", 3);
7036 o.back()->attrs["bar"] = ceph::buffer::copy("barval", 6);
7037 }
7038
7039 // -- OSDOp --
7040
7041 ostream& operator<<(ostream& out, const OSDOp& op)
7042 {
7043 out << ceph_osd_op_name(op.op.op);
7044 if (ceph_osd_op_type_data(op.op.op)) {
7045 // data extent
7046 switch (op.op.op) {
7047 case CEPH_OSD_OP_ASSERT_VER:
7048 out << " v" << op.op.assert_ver.ver;
7049 break;
7050 case CEPH_OSD_OP_TRUNCATE:
7051 out << " " << op.op.extent.offset;
7052 break;
7053 case CEPH_OSD_OP_MASKTRUNC:
7054 case CEPH_OSD_OP_TRIMTRUNC:
7055 out << " " << op.op.extent.truncate_seq << "@"
7056 << (int64_t)op.op.extent.truncate_size;
7057 break;
7058 case CEPH_OSD_OP_ROLLBACK:
7059 out << " " << snapid_t(op.op.snap.snapid);
7060 break;
7061 case CEPH_OSD_OP_WATCH:
7062 out << " " << ceph_osd_watch_op_name(op.op.watch.op)
7063 << " cookie " << op.op.watch.cookie;
7064 if (op.op.watch.gen)
7065 out << " gen " << op.op.watch.gen;
7066 break;
7067 case CEPH_OSD_OP_NOTIFY:
7068 out << " cookie " << op.op.notify.cookie;
7069 break;
7070 case CEPH_OSD_OP_COPY_GET:
7071 out << " max " << op.op.copy_get.max;
7072 break;
7073 case CEPH_OSD_OP_COPY_FROM:
7074 out << " ver " << op.op.copy_from.src_version;
7075 break;
7076 case CEPH_OSD_OP_SETALLOCHINT:
7077 out << " object_size " << op.op.alloc_hint.expected_object_size
7078 << " write_size " << op.op.alloc_hint.expected_write_size;
7079 break;
7080 case CEPH_OSD_OP_READ:
7081 case CEPH_OSD_OP_SPARSE_READ:
7082 case CEPH_OSD_OP_SYNC_READ:
7083 case CEPH_OSD_OP_WRITE:
7084 case CEPH_OSD_OP_WRITEFULL:
7085 case CEPH_OSD_OP_ZERO:
7086 case CEPH_OSD_OP_APPEND:
7087 case CEPH_OSD_OP_MAPEXT:
7088 case CEPH_OSD_OP_CMPEXT:
7089 out << " " << op.op.extent.offset << "~" << op.op.extent.length;
7090 if (op.op.extent.truncate_seq)
7091 out << " [" << op.op.extent.truncate_seq << "@"
7092 << (int64_t)op.op.extent.truncate_size << "]";
7093 if (op.op.flags)
7094 out << " [" << ceph_osd_op_flag_string(op.op.flags) << "]";
7095 default:
7096 // don't show any arg info
7097 break;
7098 }
7099 } else if (ceph_osd_op_type_attr(op.op.op)) {
7100 // xattr name
7101 if (op.op.xattr.name_len && op.indata.length()) {
7102 out << " ";
7103 op.indata.write(0, op.op.xattr.name_len, out);
7104 }
7105 if (op.op.xattr.value_len)
7106 out << " (" << op.op.xattr.value_len << ")";
7107 if (op.op.op == CEPH_OSD_OP_CMPXATTR)
7108 out << " op " << (int)op.op.xattr.cmp_op
7109 << " mode " << (int)op.op.xattr.cmp_mode;
7110 } else if (ceph_osd_op_type_exec(op.op.op)) {
7111 // class.method
7112 if (op.op.cls.class_len && op.indata.length()) {
7113 out << " ";
7114 op.indata.write(0, op.op.cls.class_len, out);
7115 out << ".";
7116 op.indata.write(op.op.cls.class_len, op.op.cls.method_len, out);
7117 }
7118 } else if (ceph_osd_op_type_pg(op.op.op)) {
7119 switch (op.op.op) {
7120 case CEPH_OSD_OP_PGLS:
7121 case CEPH_OSD_OP_PGLS_FILTER:
7122 case CEPH_OSD_OP_PGNLS:
7123 case CEPH_OSD_OP_PGNLS_FILTER:
7124 out << " start_epoch " << op.op.pgls.start_epoch;
7125 break;
7126 case CEPH_OSD_OP_PG_HITSET_LS:
7127 break;
7128 case CEPH_OSD_OP_PG_HITSET_GET:
7129 out << " " << utime_t(op.op.hit_set_get.stamp);
7130 break;
7131 case CEPH_OSD_OP_SCRUBLS:
7132 break;
7133 }
7134 }
7135 if (op.indata.length()) {
7136 out << " in=" << op.indata.length() << "b";
7137 }
7138 if (op.outdata.length()) {
7139 out << " out=" << op.outdata.length() << "b";
7140 }
7141 return out;
7142 }
7143
7144
7145 void OSDOp::split_osd_op_vector_out_data(vector<OSDOp>& ops, ceph::buffer::list& in)
7146 {
7147 auto datap = in.begin();
7148 for (unsigned i = 0; i < ops.size(); i++) {
7149 if (ops[i].op.payload_len) {
7150 datap.copy(ops[i].op.payload_len, ops[i].outdata);
7151 }
7152 }
7153 }
7154
7155 void OSDOp::merge_osd_op_vector_out_data(vector<OSDOp>& ops, ceph::buffer::list& out)
7156 {
7157 for (unsigned i = 0; i < ops.size(); i++) {
7158 ops[i].op.payload_len = ops[i].outdata.length();
7159 if (ops[i].outdata.length()) {
7160 out.append(ops[i].outdata);
7161 }
7162 }
7163 }
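/*
 * The two helpers above are inverses: merge_osd_op_vector_out_data()
 * concatenates each op's outdata into one buffer and records the per-op
 * length in the op's payload_len, while split_osd_op_vector_out_data()
 * uses those payload_len values to carve the buffer back into per-op
 * outdata.  A minimal round-trip sketch (hypothetical contents):
 *
 *   std::vector<OSDOp> ops(2);
 *   ops[0].outdata.append("abc");
 *   ops[1].outdata.append("defgh");
 *   ceph::buffer::list flat;
 *   OSDOp::merge_osd_op_vector_out_data(ops, flat);  // flat = "abcdefgh",
 *                                                    // payload_len 3 and 5
 *   // the peer, holding the decoded op vector (payload_len set, outdata
 *   // still empty), restores the per-op replies with:
 *   //   OSDOp::split_osd_op_vector_out_data(decoded_ops, flat);
 */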
7164
7165 int prepare_info_keymap(
7166 CephContext* cct,
7167 map<string,bufferlist> *km,
7168 string *key_to_remove,
7169 epoch_t epoch,
7170 pg_info_t &info,
7171 pg_info_t &last_written_info,
7172 PastIntervals &past_intervals,
7173 bool dirty_big_info,
7174 bool dirty_epoch,
7175 bool try_fast_info,
7176 PerfCounters *logger,
7177 DoutPrefixProvider *dpp)
7178 {
7179 if (dirty_epoch) {
7180 encode(epoch, (*km)[string(epoch_key)]);
7181 }
7182
7183 if (logger)
7184 logger->inc(l_osd_pg_info);
7185
7186 // try to do info efficiently?
7187 if (!dirty_big_info && try_fast_info &&
7188 info.last_update > last_written_info.last_update) {
7189 pg_fast_info_t fast;
7190 fast.populate_from(info);
7191 bool did = fast.try_apply_to(&last_written_info);
7192 ceph_assert(did); // we verified last_update increased above
7193 if (info == last_written_info) {
7194 encode(fast, (*km)[string(fastinfo_key)]);
7195 if (logger)
7196 logger->inc(l_osd_pg_fastinfo);
7197 return 0;
7198 }
7199 if (dpp) {
7200 ldpp_dout(dpp, 30) << __func__ << " fastinfo failed, info:\n";
7201 {
7202 JSONFormatter jf(true);
7203 jf.dump_object("info", info);
7204 jf.flush(*_dout);
7205 }
7206 {
7207 *_dout << "\nlast_written_info:\n";
7208 JSONFormatter jf(true);
7209 jf.dump_object("last_written_info", last_written_info);
7210 jf.flush(*_dout);
7211 }
7212 *_dout << dendl;
7213 }
7214 } else if (info.last_update <= last_written_info.last_update) {
7215 // clean up any potentially stale fastinfo key resulting from last_update
7216 // not moving forwards (e.g., a backwards jump during peering)
7217 *key_to_remove = fastinfo_key;
7218 }
7219
7220 last_written_info = info;
7221
7222 // info. store purged_snaps separately.
7223 interval_set<snapid_t> purged_snaps;
7224 purged_snaps.swap(info.purged_snaps);
7225 encode(info, (*km)[string(info_key)]);
7226 purged_snaps.swap(info.purged_snaps);
7227
7228 if (dirty_big_info) {
7229 // potentially big stuff
7230 bufferlist& bigbl = (*km)[string(biginfo_key)];
7231 encode(past_intervals, bigbl);
7232 encode(info.purged_snaps, bigbl);
7233 //dout(20) << "write_info bigbl " << bigbl.length() << dendl;
7234 if (logger)
7235 logger->inc(l_osd_pg_biginfo);
7236 }
7237
7238 return 0;
7239 }
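/*
 * Usage sketch (the real call sites live in the PG code, not in this file;
 * variable names below are illustrative): the caller passes an empty key map
 * and applies the result to the pgmeta object in a single transaction,
 * roughly:
 *
 *   std::map<std::string, ceph::buffer::list> km;
 *   std::string key_to_remove;
 *   prepare_info_keymap(cct, &km, &key_to_remove, epoch, info,
 *                       last_written_info, past_intervals,
 *                       dirty_big_info, dirty_epoch, true, logger, dpp);
 *   t.omap_setkeys(coll, pgmeta_oid, km);
 *   if (!key_to_remove.empty())
 *     t.omap_rmkey(coll, pgmeta_oid, key_to_remove);
 */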
7240
7241 void create_pg_collection(
7242 ceph::os::Transaction& t, spg_t pgid, int bits)
7243 {
7244 coll_t coll(pgid);
7245 t.create_collection(coll, bits);
7246 }
7247
7248 void init_pg_ondisk(
7249 ceph::os::Transaction& t,
7250 spg_t pgid,
7251 const pg_pool_t *pool)
7252 {
7253 coll_t coll(pgid);
7254 if (pool) {
7255 // Give a hint to the PG collection
7256 bufferlist hint;
7257 uint32_t pg_num = pool->get_pg_num();
7258 uint64_t expected_num_objects_pg = pool->expected_num_objects / pg_num;
7259 encode(pg_num, hint);
7260 encode(expected_num_objects_pg, hint);
7261 uint32_t hint_type = ceph::os::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS;
7262 t.collection_hint(coll, hint_type, hint);
7263 }
7264
7265 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
7266 t.touch(coll, pgmeta_oid);
7267 map<string,bufferlist> values;
7268 __u8 struct_v = pg_latest_struct_v;
7269 encode(struct_v, values[string(infover_key)]);
7270 t.omap_setkeys(coll, pgmeta_oid, values);
7271 }
7272
7273 PGLSFilter::PGLSFilter() : cct(nullptr)
7274 {
7275 }
7276
7277 PGLSFilter::~PGLSFilter()
7278 {
7279 }
7280
7281 int PGLSPlainFilter::init(ceph::bufferlist::const_iterator &params)
7282 {
7283 try {
7284 decode(xattr, params);
7285 decode(val, params);
7286 } catch (ceph::buffer::error &e) {
7287 return -EINVAL;
7288 }
7289 return 0;
7290 }
7291
7292 bool PGLSPlainFilter::filter(const hobject_t& obj,
7293 const ceph::bufferlist& xattr_data) const
7294 {
7295 return xattr_data.contents_equal(val.c_str(), val.size());
7296 }
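/*
 * The filter's parameters are the xattr name followed by the exact value to
 * match, mirroring the decode order in init() above.  Minimal sketch with
 * hypothetical names:
 *
 *   ceph::buffer::list params;
 *   encode(std::string("_tag"), params);     // xattr read from each object
 *   encode(std::string("backup"), params);   // value it must equal exactly
 *   // filter() then returns true only when the object's xattr contents
 *   // match the value byte-for-byte (contents_equal above).
 */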