]> git.proxmox.com Git - ceph.git/blob - ceph/src/osd/osd_types.cc
9a0927d16a78de26301081c6b14466b42ba20efd
[ceph.git] / ceph / src / osd / osd_types.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2011 New Dream Network
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
18 #include <list>
19 #include <map>
20 #include <ostream>
21 #include <sstream>
22 #include <set>
23 #include <string>
24 #include <utility>
25 #include <vector>
26
27
28 #include <boost/assign/list_of.hpp>
29
30 #include "include/ceph_features.h"
31 #include "include/encoding.h"
32 #include "include/stringify.h"
33 extern "C" {
34 #include "crush/hash.h"
35 }
36
37 #include "common/Formatter.h"
38 #include "common/StackStringStream.h"
39 #include "include/utime_fmt.h"
40 #include "OSDMap.h"
41 #include "osd_types.h"
42 #include "osd_types_fmt.h"
43 #include "os/Transaction.h"
44
45 using std::list;
46 using std::make_pair;
47 using std::map;
48 using std::ostream;
49 using std::pair;
50 using std::set;
51 using std::shared_ptr;
52 using std::string;
53 using std::unique_ptr;
54 using std::vector;
55
56 using ceph::bufferlist;
57 using ceph::decode;
58 using ceph::decode_nohead;
59 using ceph::encode;
60 using ceph::encode_nohead;
61 using ceph::Formatter;
62 using ceph::make_timespan;
63 using ceph::JSONFormatter;
64
65 using namespace std::literals;
66
67 const char *ceph_osd_flag_name(unsigned flag)
68 {
69 switch (flag) {
70 case CEPH_OSD_FLAG_ACK: return "ack";
71 case CEPH_OSD_FLAG_ONNVRAM: return "onnvram";
72 case CEPH_OSD_FLAG_ONDISK: return "ondisk";
73 case CEPH_OSD_FLAG_RETRY: return "retry";
74 case CEPH_OSD_FLAG_READ: return "read";
75 case CEPH_OSD_FLAG_WRITE: return "write";
76 case CEPH_OSD_FLAG_ORDERSNAP: return "ordersnap";
77 case CEPH_OSD_FLAG_PEERSTAT_OLD: return "peerstat_old";
78 case CEPH_OSD_FLAG_BALANCE_READS: return "balance_reads";
79 case CEPH_OSD_FLAG_PARALLELEXEC: return "parallelexec";
80 case CEPH_OSD_FLAG_PGOP: return "pgop";
81 case CEPH_OSD_FLAG_EXEC: return "exec";
82 case CEPH_OSD_FLAG_EXEC_PUBLIC: return "exec_public";
83 case CEPH_OSD_FLAG_LOCALIZE_READS: return "localize_reads";
84 case CEPH_OSD_FLAG_RWORDERED: return "rwordered";
85 case CEPH_OSD_FLAG_IGNORE_CACHE: return "ignore_cache";
86 case CEPH_OSD_FLAG_SKIPRWLOCKS: return "skiprwlocks";
87 case CEPH_OSD_FLAG_IGNORE_OVERLAY: return "ignore_overlay";
88 case CEPH_OSD_FLAG_FLUSH: return "flush";
89 case CEPH_OSD_FLAG_MAP_SNAP_CLONE: return "map_snap_clone";
90 case CEPH_OSD_FLAG_ENFORCE_SNAPC: return "enforce_snapc";
91 case CEPH_OSD_FLAG_REDIRECTED: return "redirected";
92 case CEPH_OSD_FLAG_KNOWN_REDIR: return "known_if_redirected";
93 case CEPH_OSD_FLAG_FULL_TRY: return "full_try";
94 case CEPH_OSD_FLAG_FULL_FORCE: return "full_force";
95 case CEPH_OSD_FLAG_IGNORE_REDIRECT: return "ignore_redirect";
96 case CEPH_OSD_FLAG_RETURNVEC: return "returnvec";
97 case CEPH_OSD_FLAG_SUPPORTSPOOLEIO: return "supports_pool_eio";
98 default: return "???";
99 }
100 }
101
102 string ceph_osd_flag_string(unsigned flags)
103 {
104 string s;
105 for (unsigned i=0; i<32; ++i) {
106 if (flags & (1u<<i)) {
107 if (s.length())
108 s += "+";
109 s += ceph_osd_flag_name(1u << i);
110 }
111 }
112 if (s.length())
113 return s;
114 return string("-");
115 }
116
117 const char * ceph_osd_op_flag_name(unsigned flag)
118 {
119 const char *name;
120
121 switch(flag) {
122 case CEPH_OSD_OP_FLAG_EXCL:
123 name = "excl";
124 break;
125 case CEPH_OSD_OP_FLAG_FAILOK:
126 name = "failok";
127 break;
128 case CEPH_OSD_OP_FLAG_FADVISE_RANDOM:
129 name = "fadvise_random";
130 break;
131 case CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL:
132 name = "fadvise_sequential";
133 break;
134 case CEPH_OSD_OP_FLAG_FADVISE_WILLNEED:
135 name = "favise_willneed";
136 break;
137 case CEPH_OSD_OP_FLAG_FADVISE_DONTNEED:
138 name = "fadvise_dontneed";
139 break;
140 case CEPH_OSD_OP_FLAG_FADVISE_NOCACHE:
141 name = "fadvise_nocache";
142 break;
143 case CEPH_OSD_OP_FLAG_WITH_REFERENCE:
144 name = "with_reference";
145 break;
146 case CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE:
147 name = "bypass_clean_cache";
148 break;
149 default:
150 name = "???";
151 };
152
153 return name;
154 }
155
156 string ceph_osd_op_flag_string(unsigned flags)
157 {
158 string s;
159 for (unsigned i=0; i<32; ++i) {
160 if (flags & (1u<<i)) {
161 if (s.length())
162 s += "+";
163 s += ceph_osd_op_flag_name(1u << i);
164 }
165 }
166 if (s.length())
167 return s;
168 return string("-");
169 }
170
171 string ceph_osd_alloc_hint_flag_string(unsigned flags)
172 {
173 string s;
174 for (unsigned i=0; i<32; ++i) {
175 if (flags & (1u<<i)) {
176 if (s.length())
177 s += "+";
178 s += ceph_osd_alloc_hint_flag_name(1u << i);
179 }
180 }
181 if (s.length())
182 return s;
183 return string("-");
184 }
185
186 void pg_shard_t::encode(ceph::buffer::list &bl) const
187 {
188 ENCODE_START(1, 1, bl);
189 encode(osd, bl);
190 encode(shard, bl);
191 ENCODE_FINISH(bl);
192 }
193 void pg_shard_t::decode(ceph::buffer::list::const_iterator &bl)
194 {
195 DECODE_START(1, bl);
196 decode(osd, bl);
197 decode(shard, bl);
198 DECODE_FINISH(bl);
199 }
200
201 ostream &operator<<(ostream &lhs, const pg_shard_t &rhs)
202 {
203 if (rhs.is_undefined())
204 return lhs << "?";
205 if (rhs.shard == shard_id_t::NO_SHARD)
206 return lhs << rhs.get_osd();
207 return lhs << rhs.get_osd() << '(' << (unsigned)(rhs.shard) << ')';
208 }
209
210 void dump(Formatter* f, const osd_alerts_t& alerts)
211 {
212 for (auto& a : alerts) {
213 string s0 = " osd: ";
214 s0 += stringify(a.first);
215 string s;
216 for (auto& aa : a.second) {
217 s = s0;
218 s += " ";
219 s += aa.first;
220 s += ":";
221 s += aa.second;
222 f->dump_string("alert", s);
223 }
224 }
225 }
226
227 // -- osd_reqid_t --
228 void osd_reqid_t::dump(Formatter *f) const
229 {
230 f->dump_stream("name") << name;
231 f->dump_int("inc", inc);
232 f->dump_unsigned("tid", tid);
233 }
234
235 void osd_reqid_t::generate_test_instances(list<osd_reqid_t*>& o)
236 {
237 o.push_back(new osd_reqid_t);
238 o.push_back(new osd_reqid_t(entity_name_t::CLIENT(123), 1, 45678));
239 }
240
241 // -- object_locator_t --
242
243 void object_locator_t::encode(ceph::buffer::list& bl) const
244 {
245 // verify that nobody's corrupted the locator
246 ceph_assert(hash == -1 || key.empty());
247 __u8 encode_compat = 3;
248 ENCODE_START(6, encode_compat, bl);
249 encode(pool, bl);
250 int32_t preferred = -1; // tell old code there is no preferred osd (-1).
251 encode(preferred, bl);
252 encode(key, bl);
253 encode(nspace, bl);
254 encode(hash, bl);
255 if (hash != -1)
256 encode_compat = std::max<std::uint8_t>(encode_compat, 6); // need to interpret the hash
257 ENCODE_FINISH_NEW_COMPAT(bl, encode_compat);
258 }
259
260 void object_locator_t::decode(ceph::buffer::list::const_iterator& p)
261 {
262 DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, p);
263 if (struct_v < 2) {
264 int32_t op;
265 decode(op, p);
266 pool = op;
267 int16_t pref;
268 decode(pref, p);
269 } else {
270 decode(pool, p);
271 int32_t preferred;
272 decode(preferred, p);
273 }
274 decode(key, p);
275 if (struct_v >= 5)
276 decode(nspace, p);
277 if (struct_v >= 6)
278 decode(hash, p);
279 else
280 hash = -1;
281 DECODE_FINISH(p);
282 // verify that nobody's corrupted the locator
283 ceph_assert(hash == -1 || key.empty());
284 }
285
286 void object_locator_t::dump(Formatter *f) const
287 {
288 f->dump_int("pool", pool);
289 f->dump_string("key", key);
290 f->dump_string("namespace", nspace);
291 f->dump_int("hash", hash);
292 }
293
294 void object_locator_t::generate_test_instances(list<object_locator_t*>& o)
295 {
296 o.push_back(new object_locator_t);
297 o.push_back(new object_locator_t(123));
298 o.push_back(new object_locator_t(123, 876));
299 o.push_back(new object_locator_t(1, "n2"));
300 o.push_back(new object_locator_t(1234, "", "key"));
301 o.push_back(new object_locator_t(12, "n1", "key2"));
302 }
303
304 // -- request_redirect_t --
305 void request_redirect_t::encode(ceph::buffer::list& bl) const
306 {
307 ENCODE_START(1, 1, bl);
308 encode(redirect_locator, bl);
309 encode(redirect_object, bl);
310 // legacy of the removed osd_instructions member
311 encode((uint32_t)0, bl);
312 ENCODE_FINISH(bl);
313 }
314
315 void request_redirect_t::decode(ceph::buffer::list::const_iterator& bl)
316 {
317 DECODE_START(1, bl);
318 uint32_t legacy_osd_instructions_len;
319 decode(redirect_locator, bl);
320 decode(redirect_object, bl);
321 decode(legacy_osd_instructions_len, bl);
322 if (legacy_osd_instructions_len) {
323 bl += legacy_osd_instructions_len;
324 }
325 DECODE_FINISH(bl);
326 }
327
328 void request_redirect_t::dump(Formatter *f) const
329 {
330 f->dump_string("object", redirect_object);
331 f->open_object_section("locator");
332 redirect_locator.dump(f);
333 f->close_section(); // locator
334 }
335
336 void request_redirect_t::generate_test_instances(list<request_redirect_t*>& o)
337 {
338 object_locator_t loc(1, "redir_obj");
339 o.push_back(new request_redirect_t());
340 o.push_back(new request_redirect_t(loc, 0));
341 o.push_back(new request_redirect_t(loc, "redir_obj"));
342 o.push_back(new request_redirect_t(loc));
343 }
344
345 void objectstore_perf_stat_t::dump(Formatter *f) const
346 {
347 // *_ms values just for compatibility.
348 f->dump_float("commit_latency_ms", os_commit_latency_ns / 1000000.0);
349 f->dump_float("apply_latency_ms", os_apply_latency_ns / 1000000.0);
350 f->dump_unsigned("commit_latency_ns", os_commit_latency_ns);
351 f->dump_unsigned("apply_latency_ns", os_apply_latency_ns);
352 }
353
354 void objectstore_perf_stat_t::encode(ceph::buffer::list &bl, uint64_t features) const
355 {
356 uint8_t target_v = 2;
357 if (!HAVE_FEATURE(features, OS_PERF_STAT_NS)) {
358 target_v = 1;
359 }
360 ENCODE_START(target_v, target_v, bl);
361 if (target_v >= 2) {
362 encode(os_commit_latency_ns, bl);
363 encode(os_apply_latency_ns, bl);
364 } else {
365 constexpr auto NS_PER_MS = std::chrono::nanoseconds(1ms).count();
366 uint32_t commit_latency_ms = os_commit_latency_ns / NS_PER_MS;
367 uint32_t apply_latency_ms = os_apply_latency_ns / NS_PER_MS;
368 encode(commit_latency_ms, bl); // for compatibility with older monitor.
369 encode(apply_latency_ms, bl); // for compatibility with older monitor.
370 }
371 ENCODE_FINISH(bl);
372 }
373
374 void objectstore_perf_stat_t::decode(ceph::buffer::list::const_iterator &bl)
375 {
376 DECODE_START(2, bl);
377 if (struct_v >= 2) {
378 decode(os_commit_latency_ns, bl);
379 decode(os_apply_latency_ns, bl);
380 } else {
381 uint32_t commit_latency_ms;
382 uint32_t apply_latency_ms;
383 decode(commit_latency_ms, bl);
384 decode(apply_latency_ms, bl);
385 constexpr auto NS_PER_MS = std::chrono::nanoseconds(1ms).count();
386 os_commit_latency_ns = commit_latency_ms * NS_PER_MS;
387 os_apply_latency_ns = apply_latency_ms * NS_PER_MS;
388 }
389 DECODE_FINISH(bl);
390 }
391
392 void objectstore_perf_stat_t::generate_test_instances(std::list<objectstore_perf_stat_t*>& o)
393 {
394 o.push_back(new objectstore_perf_stat_t());
395 o.push_back(new objectstore_perf_stat_t());
396 o.back()->os_commit_latency_ns = 20000000;
397 o.back()->os_apply_latency_ns = 30000000;
398 }
399
400 // -- osd_stat_t --
401 void osd_stat_t::dump(Formatter *f, bool with_net) const
402 {
403 f->dump_unsigned("up_from", up_from);
404 f->dump_unsigned("seq", seq);
405 f->dump_unsigned("num_pgs", num_pgs);
406 f->dump_unsigned("num_osds", num_osds);
407 f->dump_unsigned("num_per_pool_osds", num_per_pool_osds);
408 f->dump_unsigned("num_per_pool_omap_osds", num_per_pool_omap_osds);
409
410 /// dump legacy stats fields to ensure backward compatibility.
411 f->dump_unsigned("kb", statfs.kb());
412 f->dump_unsigned("kb_used", statfs.kb_used_raw());
413 f->dump_unsigned("kb_used_data", statfs.kb_used_data());
414 f->dump_unsigned("kb_used_omap", statfs.kb_used_omap());
415 f->dump_unsigned("kb_used_meta", statfs.kb_used_internal_metadata());
416 f->dump_unsigned("kb_avail", statfs.kb_avail());
417 ////////////////////
418
419 f->open_object_section("statfs");
420 statfs.dump(f);
421 f->close_section();
422 f->open_array_section("hb_peers");
423 for (auto p : hb_peers)
424 f->dump_int("osd", p);
425 f->close_section();
426 f->dump_int("snap_trim_queue_len", snap_trim_queue_len);
427 f->dump_int("num_snap_trimming", num_snap_trimming);
428 f->dump_int("num_shards_repaired", num_shards_repaired);
429 f->open_object_section("op_queue_age_hist");
430 op_queue_age_hist.dump(f);
431 f->close_section();
432 f->open_object_section("perf_stat");
433 os_perf_stat.dump(f);
434 f->close_section();
435 f->open_array_section("alerts");
436 ::dump(f, os_alerts);
437 f->close_section();
438 if (with_net) {
439 dump_ping_time(f);
440 }
441 }
442
443 void osd_stat_t::dump_ping_time(Formatter *f) const
444 {
445 f->open_array_section("network_ping_times");
446 for (auto &i : hb_pingtime) {
447 f->open_object_section("entry");
448 f->dump_int("osd", i.first);
449 const time_t lu(i.second.last_update);
450 char buffer[26];
451 string lustr(ctime_r(&lu, buffer));
452 lustr.pop_back(); // Remove trailing \n
453 f->dump_string("last update", lustr);
454 f->open_array_section("interfaces");
455 f->open_object_section("interface");
456 f->dump_string("interface", "back");
457 f->open_object_section("average");
458 f->dump_float("1min", i.second.back_pingtime[0]/1000.0);
459 f->dump_float("5min", i.second.back_pingtime[1]/1000.0);
460 f->dump_float("15min", i.second.back_pingtime[2]/1000.0);
461 f->close_section(); // average
462 f->open_object_section("min");
463 f->dump_float("1min", i.second.back_min[0]/1000.0);
464 f->dump_float("5min", i.second.back_min[1]/1000.0);
465 f->dump_float("15min", i.second.back_min[2]/1000.0);
466 f->close_section(); // min
467 f->open_object_section("max");
468 f->dump_float("1min", i.second.back_max[0]/1000.0);
469 f->dump_float("5min", i.second.back_max[1]/1000.0);
470 f->dump_float("15min", i.second.back_max[2]/1000.0);
471 f->close_section(); // max
472 f->dump_float("last", i.second.back_last/1000.0);
473 f->close_section(); // interface
474
475 if (i.second.front_pingtime[0] != 0) {
476 f->open_object_section("interface");
477 f->dump_string("interface", "front");
478 f->open_object_section("average");
479 f->dump_float("1min", i.second.front_pingtime[0]/1000.0);
480 f->dump_float("5min", i.second.front_pingtime[1]/1000.0);
481 f->dump_float("15min", i.second.front_pingtime[2]/1000.0);
482 f->close_section(); // average
483 f->open_object_section("min");
484 f->dump_float("1min", i.second.front_min[0]/1000.0);
485 f->dump_float("5min", i.second.front_min[1]/1000.0);
486 f->dump_float("15min", i.second.front_min[2]/1000.0);
487 f->close_section(); // min
488 f->open_object_section("max");
489 f->dump_float("1min", i.second.front_max[0]/1000.0);
490 f->dump_float("5min", i.second.front_max[1]/1000.0);
491 f->dump_float("15min", i.second.front_max[2]/1000.0);
492 f->close_section(); // max
493 f->dump_float("last", i.second.front_last/1000.0);
494 f->close_section(); // interface
495 }
496 f->close_section(); // interfaces
497 f->close_section(); // entry
498 }
499 f->close_section(); // network_ping_time
500 }
501
502 void osd_stat_t::encode(ceph::buffer::list &bl, uint64_t features) const
503 {
504 ENCODE_START(14, 2, bl);
505
506 //////// for compatibility ////////
507 int64_t kb = statfs.kb();
508 int64_t kb_used = statfs.kb_used_raw();
509 int64_t kb_avail = statfs.kb_avail();
510 encode(kb, bl);
511 encode(kb_used, bl);
512 encode(kb_avail, bl);
513 ///////////////////////////////////
514
515 encode(snap_trim_queue_len, bl);
516 encode(num_snap_trimming, bl);
517 encode(hb_peers, bl);
518 encode((uint32_t)0, bl);
519 encode(op_queue_age_hist, bl);
520 encode(os_perf_stat, bl, features);
521 encode(up_from, bl);
522 encode(seq, bl);
523 encode(num_pgs, bl);
524
525 //////// for compatibility ////////
526 int64_t kb_used_data = statfs.kb_used_data();
527 int64_t kb_used_omap = statfs.kb_used_omap();
528 int64_t kb_used_meta = statfs.kb_used_internal_metadata();
529 encode(kb_used_data, bl);
530 encode(kb_used_omap, bl);
531 encode(kb_used_meta, bl);
532 encode(statfs, bl);
533 ///////////////////////////////////
534 encode(os_alerts, bl);
535 encode(num_shards_repaired, bl);
536 encode(num_osds, bl);
537 encode(num_per_pool_osds, bl);
538 encode(num_per_pool_omap_osds, bl);
539
540 // hb_pingtime map
541 encode((int)hb_pingtime.size(), bl);
542 for (auto i : hb_pingtime) {
543 encode(i.first, bl); // osd
544 encode(i.second.last_update, bl);
545 encode(i.second.back_pingtime[0], bl);
546 encode(i.second.back_pingtime[1], bl);
547 encode(i.second.back_pingtime[2], bl);
548 encode(i.second.back_min[0], bl);
549 encode(i.second.back_min[1], bl);
550 encode(i.second.back_min[2], bl);
551 encode(i.second.back_max[0], bl);
552 encode(i.second.back_max[1], bl);
553 encode(i.second.back_max[2], bl);
554 encode(i.second.back_last, bl);
555 encode(i.second.front_pingtime[0], bl);
556 encode(i.second.front_pingtime[1], bl);
557 encode(i.second.front_pingtime[2], bl);
558 encode(i.second.front_min[0], bl);
559 encode(i.second.front_min[1], bl);
560 encode(i.second.front_min[2], bl);
561 encode(i.second.front_max[0], bl);
562 encode(i.second.front_max[1], bl);
563 encode(i.second.front_max[2], bl);
564 encode(i.second.front_last, bl);
565 }
566 ENCODE_FINISH(bl);
567 }
568
569 void osd_stat_t::decode(ceph::buffer::list::const_iterator &bl)
570 {
571 int64_t kb, kb_used,kb_avail;
572 int64_t kb_used_data, kb_used_omap, kb_used_meta;
573 DECODE_START_LEGACY_COMPAT_LEN(14, 2, 2, bl);
574 decode(kb, bl);
575 decode(kb_used, bl);
576 decode(kb_avail, bl);
577 decode(snap_trim_queue_len, bl);
578 decode(num_snap_trimming, bl);
579 decode(hb_peers, bl);
580 vector<int> num_hb_out;
581 decode(num_hb_out, bl);
582 if (struct_v >= 3)
583 decode(op_queue_age_hist, bl);
584 if (struct_v >= 4)
585 decode(os_perf_stat, bl);
586 if (struct_v >= 6) {
587 decode(up_from, bl);
588 decode(seq, bl);
589 }
590 if (struct_v >= 7) {
591 decode(num_pgs, bl);
592 }
593 if (struct_v >= 8) {
594 decode(kb_used_data, bl);
595 decode(kb_used_omap, bl);
596 decode(kb_used_meta, bl);
597 } else {
598 kb_used_data = kb_used;
599 kb_used_omap = 0;
600 kb_used_meta = 0;
601 }
602 if (struct_v >= 9) {
603 decode(statfs, bl);
604 } else {
605 statfs.reset();
606 statfs.total = kb << 10;
607 statfs.available = kb_avail << 10;
608 // actually it's totally unexpected to have ststfs.total < statfs.available
609 // here but unfortunately legacy generate_test_instances produced such a
610 // case hence inserting some handling rather than assert
611 statfs.internally_reserved =
612 statfs.total > statfs.available ? statfs.total - statfs.available : 0;
613 kb_used <<= 10;
614 if ((int64_t)statfs.internally_reserved > kb_used) {
615 statfs.internally_reserved -= kb_used;
616 } else {
617 statfs.internally_reserved = 0;
618 }
619 statfs.allocated = kb_used_data << 10;
620 statfs.omap_allocated = kb_used_omap << 10;
621 statfs.internal_metadata = kb_used_meta << 10;
622 }
623 if (struct_v >= 10) {
624 decode(os_alerts, bl);
625 } else {
626 os_alerts.clear();
627 }
628 if (struct_v >= 11) {
629 decode(num_shards_repaired, bl);
630 } else {
631 num_shards_repaired = 0;
632 }
633 if (struct_v >= 12) {
634 decode(num_osds, bl);
635 decode(num_per_pool_osds, bl);
636 } else {
637 num_osds = 0;
638 num_per_pool_osds = 0;
639 }
640 if (struct_v >= 13) {
641 decode(num_per_pool_omap_osds, bl);
642 } else {
643 num_per_pool_omap_osds = 0;
644 }
645 hb_pingtime.clear();
646 if (struct_v >= 14) {
647 int count;
648 decode(count, bl);
649 for (int i = 0 ; i < count ; i++) {
650 int osd;
651 decode(osd, bl);
652 struct Interfaces ifs;
653 decode(ifs.last_update, bl);
654 decode(ifs.back_pingtime[0],bl);
655 decode(ifs.back_pingtime[1], bl);
656 decode(ifs.back_pingtime[2], bl);
657 decode(ifs.back_min[0],bl);
658 decode(ifs.back_min[1], bl);
659 decode(ifs.back_min[2], bl);
660 decode(ifs.back_max[0],bl);
661 decode(ifs.back_max[1], bl);
662 decode(ifs.back_max[2], bl);
663 decode(ifs.back_last, bl);
664 decode(ifs.front_pingtime[0], bl);
665 decode(ifs.front_pingtime[1], bl);
666 decode(ifs.front_pingtime[2], bl);
667 decode(ifs.front_min[0], bl);
668 decode(ifs.front_min[1], bl);
669 decode(ifs.front_min[2], bl);
670 decode(ifs.front_max[0], bl);
671 decode(ifs.front_max[1], bl);
672 decode(ifs.front_max[2], bl);
673 decode(ifs.front_last, bl);
674 hb_pingtime[osd] = ifs;
675 }
676 }
677 DECODE_FINISH(bl);
678 }
679
680 void osd_stat_t::generate_test_instances(std::list<osd_stat_t*>& o)
681 {
682 o.push_back(new osd_stat_t);
683
684 o.push_back(new osd_stat_t);
685 list<store_statfs_t*> ll;
686 store_statfs_t::generate_test_instances(ll);
687 o.back()->statfs = *ll.back();
688 o.back()->hb_peers.push_back(7);
689 o.back()->snap_trim_queue_len = 8;
690 o.back()->num_snap_trimming = 99;
691 o.back()->num_shards_repaired = 101;
692 o.back()->os_alerts[0].emplace(
693 "some alert", "some alert details");
694 o.back()->os_alerts[1].emplace(
695 "some alert2", "some alert2 details");
696 struct Interfaces gen_interfaces = {
697 123456789, { 1000, 900, 800 }, { 990, 890, 790 }, { 1010, 910, 810 }, 1001,
698 { 1100, 1000, 900 }, { 1090, 990, 890 }, { 1110, 1010, 910 }, 1101 };
699 o.back()->hb_pingtime[20] = gen_interfaces;
700 gen_interfaces = {
701 987654321, { 100, 200, 300 }, { 90, 190, 290 }, { 110, 210, 310 }, 101 };
702 o.back()->hb_pingtime[30] = gen_interfaces;
703 }
704
705 // -- pg_t --
706
707 int pg_t::print(char *o, int maxlen) const
708 {
709 return snprintf(o, maxlen, "%llu.%x", (unsigned long long)pool(), ps());
710 }
711
712 bool pg_t::parse(const char *s)
713 {
714 uint64_t ppool;
715 uint32_t pseed;
716 int r = sscanf(s, "%llu.%x", (long long unsigned *)&ppool, &pseed);
717 if (r < 2)
718 return false;
719 m_pool = ppool;
720 m_seed = pseed;
721 return true;
722 }
723
724 bool spg_t::parse(const char *s)
725 {
726 shard = shard_id_t::NO_SHARD;
727 uint64_t ppool;
728 uint32_t pseed;
729 uint32_t pshard;
730 int r = sscanf(s, "%llu.%x", (long long unsigned *)&ppool, &pseed);
731 if (r < 2)
732 return false;
733 pgid.set_pool(ppool);
734 pgid.set_ps(pseed);
735
736 const char *p = strchr(s, 's');
737 if (p) {
738 r = sscanf(p, "s%u", &pshard);
739 if (r == 1) {
740 shard = shard_id_t(pshard);
741 } else {
742 return false;
743 }
744 }
745 return true;
746 }
747
748 char *spg_t::calc_name(char *buf, const char *suffix_backwords) const
749 {
750 while (*suffix_backwords)
751 *--buf = *suffix_backwords++;
752
753 if (!is_no_shard()) {
754 buf = ritoa<uint8_t, 10>((uint8_t)shard.id, buf);
755 *--buf = 's';
756 }
757
758 return pgid.calc_name(buf, "");
759 }
760
761 ostream& operator<<(ostream& out, const spg_t &pg)
762 {
763 char buf[spg_t::calc_name_buf_size];
764 buf[spg_t::calc_name_buf_size - 1] = '\0';
765 out << pg.calc_name(buf + spg_t::calc_name_buf_size - 1, "");
766 return out;
767 }
768
769 pg_t pg_t::get_ancestor(unsigned old_pg_num) const
770 {
771 int old_bits = cbits(old_pg_num);
772 int old_mask = (1 << old_bits) - 1;
773 pg_t ret = *this;
774 ret.m_seed = ceph_stable_mod(m_seed, old_pg_num, old_mask);
775 return ret;
776 }
777
778 bool pg_t::is_split(unsigned old_pg_num, unsigned new_pg_num, set<pg_t> *children) const
779 {
780 //ceph_assert(m_seed < old_pg_num);
781 if (m_seed >= old_pg_num) {
782 // degenerate case
783 return false;
784 }
785 if (new_pg_num <= old_pg_num)
786 return false;
787
788 bool split = false;
789 if (true) {
790 unsigned old_bits = cbits(old_pg_num);
791 unsigned old_mask = (1 << old_bits) - 1;
792 for (unsigned n = 1; ; n++) {
793 unsigned next_bit = (n << (old_bits-1));
794 unsigned s = next_bit | m_seed;
795
796 if (s < old_pg_num || s == m_seed)
797 continue;
798 if (s >= new_pg_num)
799 break;
800 if ((unsigned)ceph_stable_mod(s, old_pg_num, old_mask) == m_seed) {
801 split = true;
802 if (children)
803 children->insert(pg_t(s, m_pool));
804 }
805 }
806 }
807 if (false) {
808 // brute force
809 int old_bits = cbits(old_pg_num);
810 int old_mask = (1 << old_bits) - 1;
811 for (unsigned x = old_pg_num; x < new_pg_num; ++x) {
812 unsigned o = ceph_stable_mod(x, old_pg_num, old_mask);
813 if (o == m_seed) {
814 split = true;
815 children->insert(pg_t(x, m_pool));
816 }
817 }
818 }
819 return split;
820 }
821
822 unsigned pg_t::get_split_bits(unsigned pg_num) const {
823 if (pg_num == 1)
824 return 0;
825 ceph_assert(pg_num > 1);
826
827 // Find unique p such that pg_num \in [2^(p-1), 2^p)
828 unsigned p = cbits(pg_num);
829 ceph_assert(p); // silence coverity #751330
830
831 if ((m_seed % (1<<(p-1))) < (pg_num % (1<<(p-1))))
832 return p;
833 else
834 return p - 1;
835 }
836
837 bool pg_t::is_merge_source(
838 unsigned old_pg_num,
839 unsigned new_pg_num,
840 pg_t *parent) const
841 {
842 if (m_seed < old_pg_num &&
843 m_seed >= new_pg_num) {
844 if (parent) {
845 pg_t t = *this;
846 while (t.m_seed >= new_pg_num) {
847 t = t.get_parent();
848 }
849 *parent = t;
850 }
851 return true;
852 }
853 return false;
854 }
855
856 pg_t pg_t::get_parent() const
857 {
858 unsigned bits = cbits(m_seed);
859 ceph_assert(bits);
860 pg_t retval = *this;
861 retval.m_seed &= ~((~0)<<(bits - 1));
862 return retval;
863 }
864
865 hobject_t pg_t::get_hobj_start() const
866 {
867 return hobject_t(object_t(), string(), 0, m_seed, m_pool,
868 string());
869 }
870
871 hobject_t pg_t::get_hobj_end(unsigned pg_num) const
872 {
873 // note: this assumes a bitwise sort; with the legacy nibblewise
874 // sort a PG did not always cover a single contiguous range of the
875 // (bit-reversed) hash range.
876 unsigned bits = get_split_bits(pg_num);
877 uint64_t rev_start = hobject_t::_reverse_bits(m_seed);
878 uint64_t rev_end = (rev_start | (0xffffffff >> bits)) + 1;
879 if (rev_end >= 0x100000000) {
880 ceph_assert(rev_end == 0x100000000);
881 return hobject_t::get_max();
882 } else {
883 return hobject_t(object_t(), string(), CEPH_NOSNAP,
884 hobject_t::_reverse_bits(rev_end), m_pool,
885 string());
886 }
887 }
888
889 void pg_t::dump(Formatter *f) const
890 {
891 f->dump_unsigned("pool", m_pool);
892 f->dump_unsigned("seed", m_seed);
893 }
894
895 void pg_t::generate_test_instances(list<pg_t*>& o)
896 {
897 o.push_back(new pg_t);
898 o.push_back(new pg_t(1, 2));
899 o.push_back(new pg_t(13123, 3));
900 o.push_back(new pg_t(131223, 4));
901 }
902
903 char *pg_t::calc_name(char *buf, const char *suffix_backwords) const
904 {
905 while (*suffix_backwords)
906 *--buf = *suffix_backwords++;
907
908 buf = ritoa<uint32_t, 16>(m_seed, buf);
909
910 *--buf = '.';
911
912 return ritoa<uint64_t, 10>(m_pool, buf);
913 }
914
915 ostream& operator<<(ostream& out, const pg_t &pg)
916 {
917 char buf[pg_t::calc_name_buf_size];
918 buf[pg_t::calc_name_buf_size - 1] = '\0';
919 out << pg.calc_name(buf + pg_t::calc_name_buf_size - 1, "");
920 return out;
921 }
922
923
924 // -- coll_t --
925
926 void coll_t::calc_str()
927 {
928 switch (type) {
929 case TYPE_META:
930 strcpy(_str_buff, "meta");
931 _str = _str_buff;
932 break;
933 case TYPE_PG:
934 _str_buff[spg_t::calc_name_buf_size - 1] = '\0';
935 _str = pgid.calc_name(_str_buff + spg_t::calc_name_buf_size - 1, "daeh_");
936 break;
937 case TYPE_PG_TEMP:
938 _str_buff[spg_t::calc_name_buf_size - 1] = '\0';
939 _str = pgid.calc_name(_str_buff + spg_t::calc_name_buf_size - 1, "PMET_");
940 break;
941 default:
942 ceph_abort_msg("unknown collection type");
943 }
944 }
945
946 bool coll_t::parse(const std::string& s)
947 {
948 if (s == "meta") {
949 type = TYPE_META;
950 pgid = spg_t();
951 removal_seq = 0;
952 calc_str();
953 ceph_assert(s == _str);
954 return true;
955 }
956 if (s.find("_head") == s.length() - 5 &&
957 pgid.parse(s.substr(0, s.length() - 5))) {
958 type = TYPE_PG;
959 removal_seq = 0;
960 calc_str();
961 ceph_assert(s == _str);
962 return true;
963 }
964 if (s.find("_TEMP") == s.length() - 5 &&
965 pgid.parse(s.substr(0, s.length() - 5))) {
966 type = TYPE_PG_TEMP;
967 removal_seq = 0;
968 calc_str();
969 ceph_assert(s == _str);
970 return true;
971 }
972 return false;
973 }
974
975 void coll_t::encode(ceph::buffer::list& bl) const
976 {
977 using ceph::encode;
978 // when changing this, remember to update encoded_size() too.
979 if (is_temp()) {
980 // can't express this as v2...
981 __u8 struct_v = 3;
982 encode(struct_v, bl);
983 encode(to_str(), bl);
984 } else {
985 __u8 struct_v = 2;
986 encode(struct_v, bl);
987 encode((__u8)type, bl);
988 encode(pgid, bl);
989 snapid_t snap = CEPH_NOSNAP;
990 encode(snap, bl);
991 }
992 }
993
994 size_t coll_t::encoded_size() const
995 {
996 size_t r = sizeof(__u8);
997 if (is_temp()) {
998 // v3
999 r += sizeof(__u32);
1000 if (_str) {
1001 r += strlen(_str);
1002 }
1003 } else {
1004 // v2
1005 // 1. type
1006 r += sizeof(__u8);
1007 // 2. pgid
1008 // - encoding header
1009 r += sizeof(ceph_le32) + 2 * sizeof(__u8);
1010 // - pg_t
1011 r += sizeof(__u8) + sizeof(uint64_t) + 2 * sizeof(uint32_t);
1012 // - shard_id_t
1013 r += sizeof(int8_t);
1014 // 3. snapid_t
1015 r += sizeof(uint64_t);
1016 }
1017
1018 return r;
1019 }
1020
1021 void coll_t::decode(ceph::buffer::list::const_iterator& bl)
1022 {
1023 using ceph::decode;
1024 __u8 struct_v;
1025 decode(struct_v, bl);
1026 switch (struct_v) {
1027 case 1:
1028 {
1029 snapid_t snap;
1030 decode(pgid, bl);
1031 decode(snap, bl);
1032
1033 // infer the type
1034 if (pgid == spg_t() && snap == 0) {
1035 type = TYPE_META;
1036 } else {
1037 type = TYPE_PG;
1038 }
1039 removal_seq = 0;
1040 }
1041 break;
1042
1043 case 2:
1044 {
1045 __u8 _type;
1046 snapid_t snap;
1047 decode(_type, bl);
1048 decode(pgid, bl);
1049 decode(snap, bl);
1050 type = (type_t)_type;
1051 removal_seq = 0;
1052 }
1053 break;
1054
1055 case 3:
1056 {
1057 string str;
1058 decode(str, bl);
1059 bool ok = parse(str);
1060 if (!ok)
1061 throw std::domain_error(std::string("unable to parse pg ") + str);
1062 }
1063 break;
1064
1065 default:
1066 {
1067 CachedStackStringStream css;
1068 *css << "coll_t::decode(): don't know how to decode version "
1069 << struct_v;
1070 throw std::domain_error(css->str());
1071 }
1072 }
1073 }
1074
1075 void coll_t::dump(Formatter *f) const
1076 {
1077 f->dump_unsigned("type_id", (unsigned)type);
1078 if (type != TYPE_META)
1079 f->dump_stream("pgid") << pgid;
1080 f->dump_string("name", to_str());
1081 }
1082
1083 void coll_t::generate_test_instances(list<coll_t*>& o)
1084 {
1085 o.push_back(new coll_t());
1086 o.push_back(new coll_t(spg_t(pg_t(1, 0), shard_id_t::NO_SHARD)));
1087 o.push_back(new coll_t(o.back()->get_temp()));
1088 o.push_back(new coll_t(spg_t(pg_t(3, 2), shard_id_t(12))));
1089 o.push_back(new coll_t(o.back()->get_temp()));
1090 o.push_back(new coll_t());
1091 }
1092
1093 // ---
1094
1095 std::string pg_vector_string(const vector<int32_t> &a)
1096 {
1097 CachedStackStringStream css;
1098 *css << "[";
1099 for (auto i = a.cbegin(); i != a.cend(); ++i) {
1100 if (i != a.begin())
1101 *css << ",";
1102 if (*i != CRUSH_ITEM_NONE)
1103 *css << *i;
1104 else
1105 *css << "NONE";
1106 }
1107 *css << "]";
1108 return css->str();
1109 }
1110
1111 std::string pg_state_string(uint64_t state)
1112 {
1113 CachedStackStringStream css;
1114 if (state & PG_STATE_STALE)
1115 *css << "stale+";
1116 if (state & PG_STATE_CREATING)
1117 *css << "creating+";
1118 if (state & PG_STATE_ACTIVE)
1119 *css << "active+";
1120 if (state & PG_STATE_ACTIVATING)
1121 *css << "activating+";
1122 if (state & PG_STATE_CLEAN)
1123 *css << "clean+";
1124 if (state & PG_STATE_RECOVERY_WAIT)
1125 *css << "recovery_wait+";
1126 if (state & PG_STATE_RECOVERY_TOOFULL)
1127 *css << "recovery_toofull+";
1128 if (state & PG_STATE_RECOVERING)
1129 *css << "recovering+";
1130 if (state & PG_STATE_FORCED_RECOVERY)
1131 *css << "forced_recovery+";
1132 if (state & PG_STATE_DOWN)
1133 *css << "down+";
1134 if (state & PG_STATE_RECOVERY_UNFOUND)
1135 *css << "recovery_unfound+";
1136 if (state & PG_STATE_BACKFILL_UNFOUND)
1137 *css << "backfill_unfound+";
1138 if (state & PG_STATE_UNDERSIZED)
1139 *css << "undersized+";
1140 if (state & PG_STATE_DEGRADED)
1141 *css << "degraded+";
1142 if (state & PG_STATE_REMAPPED)
1143 *css << "remapped+";
1144 if (state & PG_STATE_PREMERGE)
1145 *css << "premerge+";
1146 if (state & PG_STATE_SCRUBBING)
1147 *css << "scrubbing+";
1148 if (state & PG_STATE_DEEP_SCRUB)
1149 *css << "deep+";
1150 if (state & PG_STATE_INCONSISTENT)
1151 *css << "inconsistent+";
1152 if (state & PG_STATE_PEERING)
1153 *css << "peering+";
1154 if (state & PG_STATE_REPAIR)
1155 *css << "repair+";
1156 if (state & PG_STATE_BACKFILL_WAIT)
1157 *css << "backfill_wait+";
1158 if (state & PG_STATE_BACKFILLING)
1159 *css << "backfilling+";
1160 if (state & PG_STATE_FORCED_BACKFILL)
1161 *css << "forced_backfill+";
1162 if (state & PG_STATE_BACKFILL_TOOFULL)
1163 *css << "backfill_toofull+";
1164 if (state & PG_STATE_INCOMPLETE)
1165 *css << "incomplete+";
1166 if (state & PG_STATE_PEERED)
1167 *css << "peered+";
1168 if (state & PG_STATE_SNAPTRIM)
1169 *css << "snaptrim+";
1170 if (state & PG_STATE_SNAPTRIM_WAIT)
1171 *css << "snaptrim_wait+";
1172 if (state & PG_STATE_SNAPTRIM_ERROR)
1173 *css << "snaptrim_error+";
1174 if (state & PG_STATE_FAILED_REPAIR)
1175 *css << "failed_repair+";
1176 if (state & PG_STATE_LAGGY)
1177 *css << "laggy+";
1178 if (state & PG_STATE_WAIT)
1179 *css << "wait+";
1180 auto ret = css->str();
1181 if (ret.length() > 0)
1182 ret.resize(ret.length() - 1);
1183 else
1184 ret = "unknown";
1185 return ret;
1186 }
1187
1188 std::optional<uint64_t> pg_string_state(const std::string& state)
1189 {
1190 std::optional<uint64_t> type;
1191 if (state == "active")
1192 type = PG_STATE_ACTIVE;
1193 else if (state == "clean")
1194 type = PG_STATE_CLEAN;
1195 else if (state == "down")
1196 type = PG_STATE_DOWN;
1197 else if (state == "recovery_unfound")
1198 type = PG_STATE_RECOVERY_UNFOUND;
1199 else if (state == "backfill_unfound")
1200 type = PG_STATE_BACKFILL_UNFOUND;
1201 else if (state == "premerge")
1202 type = PG_STATE_PREMERGE;
1203 else if (state == "scrubbing")
1204 type = PG_STATE_SCRUBBING;
1205 else if (state == "degraded")
1206 type = PG_STATE_DEGRADED;
1207 else if (state == "inconsistent")
1208 type = PG_STATE_INCONSISTENT;
1209 else if (state == "peering")
1210 type = PG_STATE_PEERING;
1211 else if (state == "repair")
1212 type = PG_STATE_REPAIR;
1213 else if (state == "recovering")
1214 type = PG_STATE_RECOVERING;
1215 else if (state == "forced_recovery")
1216 type = PG_STATE_FORCED_RECOVERY;
1217 else if (state == "backfill_wait")
1218 type = PG_STATE_BACKFILL_WAIT;
1219 else if (state == "incomplete")
1220 type = PG_STATE_INCOMPLETE;
1221 else if (state == "stale")
1222 type = PG_STATE_STALE;
1223 else if (state == "remapped")
1224 type = PG_STATE_REMAPPED;
1225 else if (state == "deep")
1226 type = PG_STATE_DEEP_SCRUB;
1227 else if (state == "backfilling")
1228 type = PG_STATE_BACKFILLING;
1229 else if (state == "forced_backfill")
1230 type = PG_STATE_FORCED_BACKFILL;
1231 else if (state == "backfill_toofull")
1232 type = PG_STATE_BACKFILL_TOOFULL;
1233 else if (state == "recovery_wait")
1234 type = PG_STATE_RECOVERY_WAIT;
1235 else if (state == "recovery_toofull")
1236 type = PG_STATE_RECOVERY_TOOFULL;
1237 else if (state == "undersized")
1238 type = PG_STATE_UNDERSIZED;
1239 else if (state == "activating")
1240 type = PG_STATE_ACTIVATING;
1241 else if (state == "peered")
1242 type = PG_STATE_PEERED;
1243 else if (state == "snaptrim")
1244 type = PG_STATE_SNAPTRIM;
1245 else if (state == "snaptrim_wait")
1246 type = PG_STATE_SNAPTRIM_WAIT;
1247 else if (state == "snaptrim_error")
1248 type = PG_STATE_SNAPTRIM_ERROR;
1249 else if (state == "creating")
1250 type = PG_STATE_CREATING;
1251 else if (state == "failed_repair")
1252 type = PG_STATE_FAILED_REPAIR;
1253 else if (state == "laggy")
1254 type = PG_STATE_LAGGY;
1255 else if (state == "wait")
1256 type = PG_STATE_WAIT;
1257 else if (state == "unknown")
1258 type = 0;
1259 else
1260 type = std::nullopt;
1261 return type;
1262 }
1263
1264 // -- eversion_t --
1265 string eversion_t::get_key_name() const
1266 {
1267 std::string key(32, ' ');
1268 get_key_name(&key[0]);
1269 key.resize(31); // remove the null terminator
1270 return key;
1271 }
1272
1273 // -- pool_snap_info_t --
1274 void pool_snap_info_t::dump(Formatter *f) const
1275 {
1276 f->dump_unsigned("snapid", snapid);
1277 f->dump_stream("stamp") << stamp;
1278 f->dump_string("name", name);
1279 }
1280
1281 void pool_snap_info_t::encode(ceph::buffer::list& bl, uint64_t features) const
1282 {
1283 using ceph::encode;
1284 if ((features & CEPH_FEATURE_PGPOOL3) == 0) {
1285 __u8 struct_v = 1;
1286 encode(struct_v, bl);
1287 encode(snapid, bl);
1288 encode(stamp, bl);
1289 encode(name, bl);
1290 return;
1291 }
1292 ENCODE_START(2, 2, bl);
1293 encode(snapid, bl);
1294 encode(stamp, bl);
1295 encode(name, bl);
1296 ENCODE_FINISH(bl);
1297 }
1298
1299 void pool_snap_info_t::decode(ceph::buffer::list::const_iterator& bl)
1300 {
1301 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
1302 decode(snapid, bl);
1303 decode(stamp, bl);
1304 decode(name, bl);
1305 DECODE_FINISH(bl);
1306 }
1307
1308 void pool_snap_info_t::generate_test_instances(list<pool_snap_info_t*>& o)
1309 {
1310 o.push_back(new pool_snap_info_t);
1311 o.push_back(new pool_snap_info_t);
1312 o.back()->snapid = 1;
1313 o.back()->stamp = utime_t(1, 2);
1314 o.back()->name = "foo";
1315 }
1316
1317 // -- pool_opts_t --
1318
1319 typedef std::map<std::string, pool_opts_t::opt_desc_t> opt_mapping_t;
1320 static opt_mapping_t opt_mapping = boost::assign::map_list_of
1321 ("scrub_min_interval", pool_opts_t::opt_desc_t(
1322 pool_opts_t::SCRUB_MIN_INTERVAL, pool_opts_t::DOUBLE))
1323 ("scrub_max_interval", pool_opts_t::opt_desc_t(
1324 pool_opts_t::SCRUB_MAX_INTERVAL, pool_opts_t::DOUBLE))
1325 ("deep_scrub_interval", pool_opts_t::opt_desc_t(
1326 pool_opts_t::DEEP_SCRUB_INTERVAL, pool_opts_t::DOUBLE))
1327 ("recovery_priority", pool_opts_t::opt_desc_t(
1328 pool_opts_t::RECOVERY_PRIORITY, pool_opts_t::INT))
1329 ("recovery_op_priority", pool_opts_t::opt_desc_t(
1330 pool_opts_t::RECOVERY_OP_PRIORITY, pool_opts_t::INT))
1331 ("scrub_priority", pool_opts_t::opt_desc_t(
1332 pool_opts_t::SCRUB_PRIORITY, pool_opts_t::INT))
1333 ("compression_mode", pool_opts_t::opt_desc_t(
1334 pool_opts_t::COMPRESSION_MODE, pool_opts_t::STR))
1335 ("compression_algorithm", pool_opts_t::opt_desc_t(
1336 pool_opts_t::COMPRESSION_ALGORITHM, pool_opts_t::STR))
1337 ("compression_required_ratio", pool_opts_t::opt_desc_t(
1338 pool_opts_t::COMPRESSION_REQUIRED_RATIO, pool_opts_t::DOUBLE))
1339 ("compression_max_blob_size", pool_opts_t::opt_desc_t(
1340 pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, pool_opts_t::INT))
1341 ("compression_min_blob_size", pool_opts_t::opt_desc_t(
1342 pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, pool_opts_t::INT))
1343 ("csum_type", pool_opts_t::opt_desc_t(
1344 pool_opts_t::CSUM_TYPE, pool_opts_t::INT))
1345 ("csum_max_block", pool_opts_t::opt_desc_t(
1346 pool_opts_t::CSUM_MAX_BLOCK, pool_opts_t::INT))
1347 ("csum_min_block", pool_opts_t::opt_desc_t(
1348 pool_opts_t::CSUM_MIN_BLOCK, pool_opts_t::INT))
1349 ("fingerprint_algorithm", pool_opts_t::opt_desc_t(
1350 pool_opts_t::FINGERPRINT_ALGORITHM, pool_opts_t::STR))
1351 ("pg_num_min", pool_opts_t::opt_desc_t(
1352 pool_opts_t::PG_NUM_MIN, pool_opts_t::INT))
1353 ("pg_num_max", pool_opts_t::opt_desc_t(
1354 pool_opts_t::PG_NUM_MAX, pool_opts_t::INT))
1355 ("target_size_bytes", pool_opts_t::opt_desc_t(
1356 pool_opts_t::TARGET_SIZE_BYTES, pool_opts_t::INT))
1357 ("target_size_ratio", pool_opts_t::opt_desc_t(
1358 pool_opts_t::TARGET_SIZE_RATIO, pool_opts_t::DOUBLE))
1359 ("pg_autoscale_bias", pool_opts_t::opt_desc_t(
1360 pool_opts_t::PG_AUTOSCALE_BIAS, pool_opts_t::DOUBLE))
1361 ("read_lease_interval", pool_opts_t::opt_desc_t(
1362 pool_opts_t::READ_LEASE_INTERVAL, pool_opts_t::DOUBLE))
1363 ("dedup_tier", pool_opts_t::opt_desc_t(
1364 pool_opts_t::DEDUP_TIER, pool_opts_t::INT))
1365 ("dedup_chunk_algorithm", pool_opts_t::opt_desc_t(
1366 pool_opts_t::DEDUP_CHUNK_ALGORITHM, pool_opts_t::STR))
1367 ("dedup_cdc_chunk_size", pool_opts_t::opt_desc_t(
1368 pool_opts_t::DEDUP_CDC_CHUNK_SIZE, pool_opts_t::INT));
1369
1370 bool pool_opts_t::is_opt_name(const std::string& name)
1371 {
1372 return opt_mapping.count(name);
1373 }
1374
1375 pool_opts_t::opt_desc_t pool_opts_t::get_opt_desc(const std::string& name)
1376 {
1377 auto i = opt_mapping.find(name);
1378 ceph_assert(i != opt_mapping.end());
1379 return i->second;
1380 }
1381
1382 bool pool_opts_t::is_set(pool_opts_t::key_t key) const
1383 {
1384 return opts.count(key);
1385 }
1386
1387 const pool_opts_t::value_t& pool_opts_t::get(pool_opts_t::key_t key) const
1388 {
1389 auto i = opts.find(key);
1390 ceph_assert(i != opts.end());
1391 return i->second;
1392 }
1393
1394 bool pool_opts_t::unset(pool_opts_t::key_t key) {
1395 return opts.erase(key) > 0;
1396 }
1397
1398 class pool_opts_dumper_t : public boost::static_visitor<> {
1399 public:
1400 pool_opts_dumper_t(const std::string& name_, Formatter* f_) :
1401 name(name_.c_str()), f(f_) {}
1402
1403 void operator()(std::string s) const {
1404 f->dump_string(name, s);
1405 }
1406 void operator()(int64_t i) const {
1407 f->dump_int(name, i);
1408 }
1409 void operator()(double d) const {
1410 f->dump_float(name, d);
1411 }
1412
1413 private:
1414 const char* name;
1415 Formatter* f;
1416 };
1417
1418 void pool_opts_t::dump(const std::string& name, Formatter* f) const
1419 {
1420 const opt_desc_t& desc = get_opt_desc(name);
1421 auto i = opts.find(desc.key);
1422 if (i == opts.end()) {
1423 return;
1424 }
1425 boost::apply_visitor(pool_opts_dumper_t(name, f), i->second);
1426 }
1427
1428 void pool_opts_t::dump(Formatter* f) const
1429 {
1430 for (auto i = opt_mapping.cbegin(); i != opt_mapping.cend(); ++i) {
1431 const std::string& name = i->first;
1432 const opt_desc_t& desc = i->second;
1433 auto j = opts.find(desc.key);
1434 if (j == opts.end()) {
1435 continue;
1436 }
1437 boost::apply_visitor(pool_opts_dumper_t(name, f), j->second);
1438 }
1439 }
1440
1441 class pool_opts_encoder_t : public boost::static_visitor<> {
1442 public:
1443 explicit pool_opts_encoder_t(ceph::buffer::list& bl_, uint64_t features)
1444 : bl(bl_),
1445 features(features) {}
1446
1447 void operator()(const std::string &s) const {
1448 encode(static_cast<int32_t>(pool_opts_t::STR), bl);
1449 encode(s, bl);
1450 }
1451 void operator()(int64_t i) const {
1452 encode(static_cast<int32_t>(pool_opts_t::INT), bl);
1453 if (HAVE_FEATURE(features, SERVER_NAUTILUS)) {
1454 encode(i, bl);
1455 } else {
1456 encode(static_cast<int32_t>(i), bl);
1457 }
1458 }
1459 void operator()(double d) const {
1460 encode(static_cast<int32_t>(pool_opts_t::DOUBLE), bl);
1461 encode(d, bl);
1462 }
1463
1464 private:
1465 ceph::buffer::list& bl;
1466 uint64_t features;
1467 };
1468
1469 void pool_opts_t::encode(ceph::buffer::list& bl, uint64_t features) const
1470 {
1471 unsigned v = 2;
1472 if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
1473 v = 1;
1474 }
1475 ENCODE_START(v, 1, bl);
1476 uint32_t n = static_cast<uint32_t>(opts.size());
1477 encode(n, bl);
1478 for (auto i = opts.cbegin(); i != opts.cend(); ++i) {
1479 encode(static_cast<int32_t>(i->first), bl);
1480 boost::apply_visitor(pool_opts_encoder_t(bl, features), i->second);
1481 }
1482 ENCODE_FINISH(bl);
1483 }
1484
1485 void pool_opts_t::decode(ceph::buffer::list::const_iterator& bl)
1486 {
1487 DECODE_START(1, bl);
1488 __u32 n;
1489 decode(n, bl);
1490 opts.clear();
1491 while (n--) {
1492 int32_t k, t;
1493 decode(k, bl);
1494 decode(t, bl);
1495 if (t == STR) {
1496 std::string s;
1497 decode(s, bl);
1498 opts[static_cast<key_t>(k)] = s;
1499 } else if (t == INT) {
1500 int64_t i;
1501 if (struct_v >= 2) {
1502 decode(i, bl);
1503 } else {
1504 int ii;
1505 decode(ii, bl);
1506 i = ii;
1507 }
1508 opts[static_cast<key_t>(k)] = i;
1509 } else if (t == DOUBLE) {
1510 double d;
1511 decode(d, bl);
1512 opts[static_cast<key_t>(k)] = d;
1513 } else {
1514 ceph_assert(!"invalid type");
1515 }
1516 }
1517 DECODE_FINISH(bl);
1518 }
1519
1520 ostream& operator<<(ostream& out, const pool_opts_t& opts)
1521 {
1522 for (auto i = opt_mapping.begin(); i != opt_mapping.end(); ++i) {
1523 const std::string& name = i->first;
1524 const pool_opts_t::opt_desc_t& desc = i->second;
1525 auto j = opts.opts.find(desc.key);
1526 if (j == opts.opts.end()) {
1527 continue;
1528 }
1529 out << " " << name << " " << j->second;
1530 }
1531 return out;
1532 }
1533
1534 // -- pg_pool_t --
1535
1536 const char *pg_pool_t::APPLICATION_NAME_CEPHFS("cephfs");
1537 const char *pg_pool_t::APPLICATION_NAME_RBD("rbd");
1538 const char *pg_pool_t::APPLICATION_NAME_RGW("rgw");
1539
1540 void pg_pool_t::dump(Formatter *f) const
1541 {
1542 f->dump_stream("create_time") << get_create_time();
1543 f->dump_unsigned("flags", get_flags());
1544 f->dump_string("flags_names", get_flags_string());
1545 f->dump_int("type", get_type());
1546 f->dump_int("size", get_size());
1547 f->dump_int("min_size", get_min_size());
1548 f->dump_int("crush_rule", get_crush_rule());
1549 f->dump_int("peering_crush_bucket_count", peering_crush_bucket_count);
1550 f->dump_int("peering_crush_bucket_target", peering_crush_bucket_target);
1551 f->dump_int("peering_crush_bucket_barrier", peering_crush_bucket_barrier);
1552 f->dump_int("peering_crush_bucket_mandatory_member", peering_crush_mandatory_member);
1553 f->dump_int("object_hash", get_object_hash());
1554 f->dump_string("pg_autoscale_mode",
1555 get_pg_autoscale_mode_name(pg_autoscale_mode));
1556 f->dump_unsigned("pg_num", get_pg_num());
1557 f->dump_unsigned("pg_placement_num", get_pgp_num());
1558 f->dump_unsigned("pg_placement_num_target", get_pgp_num_target());
1559 f->dump_unsigned("pg_num_target", get_pg_num_target());
1560 f->dump_unsigned("pg_num_pending", get_pg_num_pending());
1561 f->dump_object("last_pg_merge_meta", last_pg_merge_meta);
1562 f->dump_stream("last_change") << get_last_change();
1563 f->dump_stream("last_force_op_resend") << get_last_force_op_resend();
1564 f->dump_stream("last_force_op_resend_prenautilus")
1565 << get_last_force_op_resend_prenautilus();
1566 f->dump_stream("last_force_op_resend_preluminous")
1567 << get_last_force_op_resend_preluminous();
1568 f->dump_unsigned("auid", get_auid());
1569 f->dump_string("snap_mode", is_pool_snaps_mode() ? "pool" : "selfmanaged");
1570 f->dump_unsigned("snap_seq", get_snap_seq());
1571 f->dump_unsigned("snap_epoch", get_snap_epoch());
1572 f->open_array_section("pool_snaps");
1573 for (auto p = snaps.cbegin(); p != snaps.cend(); ++p) {
1574 f->open_object_section("pool_snap_info");
1575 p->second.dump(f);
1576 f->close_section();
1577 }
1578 f->close_section();
1579 f->dump_stream("removed_snaps") << removed_snaps;
1580 f->dump_unsigned("quota_max_bytes", quota_max_bytes);
1581 f->dump_unsigned("quota_max_objects", quota_max_objects);
1582 f->open_array_section("tiers");
1583 for (auto p = tiers.cbegin(); p != tiers.cend(); ++p)
1584 f->dump_unsigned("pool_id", *p);
1585 f->close_section();
1586 f->dump_int("tier_of", tier_of);
1587 f->dump_int("read_tier", read_tier);
1588 f->dump_int("write_tier", write_tier);
1589 f->dump_string("cache_mode", get_cache_mode_name());
1590 f->dump_unsigned("target_max_bytes", target_max_bytes);
1591 f->dump_unsigned("target_max_objects", target_max_objects);
1592 f->dump_unsigned("cache_target_dirty_ratio_micro",
1593 cache_target_dirty_ratio_micro);
1594 f->dump_unsigned("cache_target_dirty_high_ratio_micro",
1595 cache_target_dirty_high_ratio_micro);
1596 f->dump_unsigned("cache_target_full_ratio_micro",
1597 cache_target_full_ratio_micro);
1598 f->dump_unsigned("cache_min_flush_age", cache_min_flush_age);
1599 f->dump_unsigned("cache_min_evict_age", cache_min_evict_age);
1600 f->dump_string("erasure_code_profile", erasure_code_profile);
1601 f->open_object_section("hit_set_params");
1602 hit_set_params.dump(f);
1603 f->close_section(); // hit_set_params
1604 f->dump_unsigned("hit_set_period", hit_set_period);
1605 f->dump_unsigned("hit_set_count", hit_set_count);
1606 f->dump_bool("use_gmt_hitset", use_gmt_hitset);
1607 f->dump_unsigned("min_read_recency_for_promote", min_read_recency_for_promote);
1608 f->dump_unsigned("min_write_recency_for_promote", min_write_recency_for_promote);
1609 f->dump_unsigned("hit_set_grade_decay_rate", hit_set_grade_decay_rate);
1610 f->dump_unsigned("hit_set_search_last_n", hit_set_search_last_n);
1611 f->open_array_section("grade_table");
1612 for (unsigned i = 0; i < hit_set_count; ++i)
1613 f->dump_unsigned("value", get_grade(i));
1614 f->close_section();
1615 f->dump_unsigned("stripe_width", get_stripe_width());
1616 f->dump_unsigned("expected_num_objects", expected_num_objects);
1617 f->dump_bool("fast_read", fast_read);
1618 f->open_object_section("options");
1619 opts.dump(f);
1620 f->close_section(); // options
1621 f->open_object_section("application_metadata");
1622 for (auto &app_pair : application_metadata) {
1623 f->open_object_section(app_pair.first.c_str());
1624 for (auto &kv_pair : app_pair.second) {
1625 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
1626 }
1627 f->close_section(); // application
1628 }
1629 f->close_section(); // application_metadata
1630 }
1631
1632 void pg_pool_t::convert_to_pg_shards(const vector<int> &from, set<pg_shard_t>* to) const {
1633 for (size_t i = 0; i < from.size(); ++i) {
1634 if (from[i] != CRUSH_ITEM_NONE) {
1635 to->insert(
1636 pg_shard_t(
1637 from[i],
1638 is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
1639 }
1640 }
1641 }
1642
1643 void pg_pool_t::calc_pg_masks()
1644 {
1645 pg_num_mask = (1 << cbits(pg_num-1)) - 1;
1646 pgp_num_mask = (1 << cbits(pgp_num-1)) - 1;
1647 }
1648
1649 unsigned pg_pool_t::get_pg_num_divisor(pg_t pgid) const
1650 {
1651 if (pg_num == pg_num_mask + 1)
1652 return pg_num; // power-of-2 split
1653 unsigned mask = pg_num_mask >> 1;
1654 if ((pgid.ps() & mask) < (pg_num & mask))
1655 return pg_num_mask + 1; // smaller bin size (already split)
1656 else
1657 return (pg_num_mask + 1) >> 1; // bigger bin (not yet split)
1658 }
1659
1660 bool pg_pool_t::is_pending_merge(pg_t pgid, bool *target) const
1661 {
1662 if (pg_num_pending >= pg_num) {
1663 return false;
1664 }
1665 if (pgid.ps() >= pg_num_pending && pgid.ps() < pg_num) {
1666 if (target) {
1667 *target = false;
1668 }
1669 return true;
1670 }
1671 for (unsigned ps = pg_num_pending; ps < pg_num; ++ps) {
1672 if (pg_t(ps, pgid.pool()).get_parent() == pgid) {
1673 if (target) {
1674 *target = true;
1675 }
1676 return true;
1677 }
1678 }
1679 return false;
1680 }
1681
1682 /*
1683 * we have two snap modes:
1684 * - pool snaps
1685 * - snap existence/non-existence defined by snaps[] and snap_seq
1686 * - user managed snaps
1687 * - existence tracked by librados user
1688 */
1689 bool pg_pool_t::is_pool_snaps_mode() const
1690 {
1691 return has_flag(FLAG_POOL_SNAPS);
1692 }
1693
1694 bool pg_pool_t::is_unmanaged_snaps_mode() const
1695 {
1696 return has_flag(FLAG_SELFMANAGED_SNAPS);
1697 }
1698
1699 bool pg_pool_t::is_removed_snap(snapid_t s) const
1700 {
1701 if (is_pool_snaps_mode())
1702 return s <= get_snap_seq() && snaps.count(s) == 0;
1703 else
1704 return removed_snaps.contains(s);
1705 }
1706
1707 snapid_t pg_pool_t::snap_exists(std::string_view s) const
1708 {
1709 for (auto p = snaps.cbegin(); p != snaps.cend(); ++p)
1710 if (p->second.name == s)
1711 return p->second.snapid;
1712 return 0;
1713 }
1714
1715 void pg_pool_t::add_snap(const char *n, utime_t stamp)
1716 {
1717 ceph_assert(!is_unmanaged_snaps_mode());
1718 flags |= FLAG_POOL_SNAPS;
1719 snapid_t s = get_snap_seq() + 1;
1720 snap_seq = s;
1721 snaps[s].snapid = s;
1722 snaps[s].name = n;
1723 snaps[s].stamp = stamp;
1724 }
1725
1726 uint64_t pg_pool_t::add_unmanaged_snap(bool preoctopus_compat)
1727 {
1728 ceph_assert(!is_pool_snaps_mode());
1729 if (snap_seq == 0) {
1730 if (preoctopus_compat) {
1731 // kludge for pre-mimic tracking of pool vs selfmanaged snaps. after
1732 // mimic this field is not decoded but our flag is set; pre-mimic, we
1733 // have a non-empty removed_snaps to signifiy a non-pool-snaps pool.
1734 removed_snaps.insert(snapid_t(1));
1735 }
1736 snap_seq = 1;
1737 }
1738 flags |= FLAG_SELFMANAGED_SNAPS;
1739 snap_seq = snap_seq + 1;
1740 return snap_seq;
1741 }
1742
1743 void pg_pool_t::remove_snap(snapid_t s)
1744 {
1745 ceph_assert(snaps.count(s));
1746 snaps.erase(s);
1747 snap_seq = snap_seq + 1;
1748 }
1749
1750 void pg_pool_t::remove_unmanaged_snap(snapid_t s, bool preoctopus_compat)
1751 {
1752 ceph_assert(is_unmanaged_snaps_mode());
1753 ++snap_seq;
1754 if (preoctopus_compat) {
1755 removed_snaps.insert(s);
1756 // try to add in the new seq, just to try to keep the interval_set contiguous
1757 if (!removed_snaps.contains(get_snap_seq())) {
1758 removed_snaps.insert(get_snap_seq());
1759 }
1760 }
1761 }
1762
1763 SnapContext pg_pool_t::get_snap_context() const
1764 {
1765 vector<snapid_t> s(snaps.size());
1766 unsigned i = 0;
1767 for (auto p = snaps.crbegin(); p != snaps.crend(); ++p)
1768 s[i++] = p->first;
1769 return SnapContext(get_snap_seq(), s);
1770 }
1771
1772 uint32_t pg_pool_t::hash_key(const string& key, const string& ns) const
1773 {
1774 if (ns.empty())
1775 return ceph_str_hash(object_hash, key.data(), key.length());
1776 int nsl = ns.length();
1777 int len = key.length() + nsl + 1;
1778 char buf[len];
1779 memcpy(&buf[0], ns.data(), nsl);
1780 buf[nsl] = '\037';
1781 memcpy(&buf[nsl+1], key.data(), key.length());
1782 return ceph_str_hash(object_hash, &buf[0], len);
1783 }
1784
1785 uint32_t pg_pool_t::raw_hash_to_pg(uint32_t v) const
1786 {
1787 return ceph_stable_mod(v, pg_num, pg_num_mask);
1788 }
1789
1790 /*
1791 * map a raw pg (with full precision ps) into an actual pg, for storage
1792 */
1793 pg_t pg_pool_t::raw_pg_to_pg(pg_t pg) const
1794 {
1795 pg.set_ps(ceph_stable_mod(pg.ps(), pg_num, pg_num_mask));
1796 return pg;
1797 }
1798
1799 /*
1800 * map raw pg (full precision ps) into a placement seed. include
1801 * pool id in that value so that different pools don't use the same
1802 * seeds.
1803 */
1804 ps_t pg_pool_t::raw_pg_to_pps(pg_t pg) const
1805 {
1806 if (flags & FLAG_HASHPSPOOL) {
1807 // Hash the pool id so that pool PGs do not overlap.
1808 return
1809 crush_hash32_2(CRUSH_HASH_RJENKINS1,
1810 ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask),
1811 pg.pool());
1812 } else {
1813 // Legacy behavior; add ps and pool together. This is not a great
1814 // idea because the PGs from each pool will essentially overlap on
1815 // top of each other: 0.5 == 1.4 == 2.3 == ...
1816 return
1817 ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask) +
1818 pg.pool();
1819 }
1820 }
1821
1822 uint32_t pg_pool_t::get_random_pg_position(pg_t pg, uint32_t seed) const
1823 {
1824 uint32_t r = crush_hash32_2(CRUSH_HASH_RJENKINS1, seed, 123);
1825 if (pg_num == pg_num_mask + 1) {
1826 r &= ~pg_num_mask;
1827 } else {
1828 unsigned smaller_mask = pg_num_mask >> 1;
1829 if ((pg.ps() & smaller_mask) < (pg_num & smaller_mask)) {
1830 r &= ~pg_num_mask;
1831 } else {
1832 r &= ~smaller_mask;
1833 }
1834 }
1835 r |= pg.ps();
1836 return r;
1837 }
1838
/**
 * Encode this pool into @p bl using the newest layout that the given
 * feature set allows.
 *
 * Four layouts are produced, tried in this order:
 *  - no CEPH_FEATURE_PGPOOL3: bare version byte 2, counts written
 *    separately from the snaps/removed_snaps payload
 *  - no CEPH_FEATURE_OSDENC: bare version byte 4
 *  - no CEPH_FEATURE_OSD_POOLRESEND: ENCODE_START(14, 5)
 *  - otherwise: ENCODE_START(v, 5) with v chosen between 21 and 30 from
 *    the feature bits and is_stretch_pool(); trailing fields are
 *    gated on v
 *
 * Field order within each layout is part of the wire format.
 */
void pg_pool_t::encode(ceph::buffer::list& bl, uint64_t features) const
{
  using ceph::encode;
  if ((features & CEPH_FEATURE_PGPOOL3) == 0) {
    // this encoding matches the old struct ceph_pg_pool
    __u8 struct_v = 2;
    encode(struct_v, bl);
    encode(type, bl);
    encode(size, bl);
    encode(crush_rule, bl);
    encode(object_hash, bl);
    encode(pg_num, bl);
    encode(pgp_num, bl);
    __u32 lpg_num = 0, lpgp_num = 0;  // tell old code that there are no localized pgs.
    encode(lpg_num, bl);
    encode(lpgp_num, bl);
    encode(last_change, bl);
    encode(snap_seq, bl);
    encode(snap_epoch, bl);

    // element counts are written up front here; the matching payloads
    // follow auid via encode_nohead()
    __u32 n = snaps.size();
    encode(n, bl);
    n = removed_snaps.num_intervals();
    encode(n, bl);

    encode(auid, bl);

    encode_nohead(snaps, bl, features);
    encode_nohead(removed_snaps, bl);
    return;
  }

  if ((features & CEPH_FEATURE_OSDENC) == 0) {
    // version 4: still a bare version byte (no ENCODE_START wrapper)
    __u8 struct_v = 4;
    encode(struct_v, bl);
    encode(type, bl);
    encode(size, bl);
    encode(crush_rule, bl);
    encode(object_hash, bl);
    encode(pg_num, bl);
    encode(pgp_num, bl);
    __u32 lpg_num = 0, lpgp_num = 0;  // tell old code that there are no localized pgs.
    encode(lpg_num, bl);
    encode(lpgp_num, bl);
    encode(last_change, bl);
    encode(snap_seq, bl);
    encode(snap_epoch, bl);
    encode(snaps, bl, features);
    encode(removed_snaps, bl);
    encode(auid, bl);
    encode(flags, bl);
    encode((uint32_t)0, bl); // crash_replay_interval
    return;
  }

  if ((features & CEPH_FEATURE_OSD_POOLRESEND) == 0) {
    // we simply added last_force_op_resend here, which is a fully
    // backward compatible change.  however, encoding the same map
    // differently between monitors triggers scrub noise (even though
    // they are decodable without the feature), so let's be pedantic
    // about it.
    ENCODE_START(14, 5, bl);
    encode(type, bl);
    encode(size, bl);
    encode(crush_rule, bl);
    encode(object_hash, bl);
    encode(pg_num, bl);
    encode(pgp_num, bl);
    __u32 lpg_num = 0, lpgp_num = 0;  // tell old code that there are no localized pgs.
    encode(lpg_num, bl);
    encode(lpgp_num, bl);
    encode(last_change, bl);
    encode(snap_seq, bl);
    encode(snap_epoch, bl);
    encode(snaps, bl, features);
    encode(removed_snaps, bl);
    encode(auid, bl);
    encode(flags, bl);
    encode((uint32_t)0, bl); // crash_replay_interval
    encode(min_size, bl);
    encode(quota_max_bytes, bl);
    encode(quota_max_objects, bl);
    encode(tiers, bl);
    encode(tier_of, bl);
    __u8 c = cache_mode;  // cache_mode goes on the wire as a single byte
    encode(c, bl);
    encode(read_tier, bl);
    encode(write_tier, bl);
    encode(properties, bl);
    encode(hit_set_params, bl);
    encode(hit_set_period, bl);
    encode(hit_set_count, bl);
    encode(stripe_width, bl);
    encode(target_max_bytes, bl);
    encode(target_max_objects, bl);
    encode(cache_target_dirty_ratio_micro, bl);
    encode(cache_target_full_ratio_micro, bl);
    encode(cache_min_flush_age, bl);
    encode(cache_min_evict_age, bl);
    encode(erasure_code_profile, bl);
    ENCODE_FINISH(bl);
    return;
  }

  // Pick the struct version: start from the newest (30) and step down
  // for the first missing feature.
  uint8_t v = 30;
  // NOTE: any new encoding dependencies must be reflected by
  // SIGNIFICANT_FEATURES
  if (!(features & CEPH_FEATURE_NEW_OSDOP_ENCODING)) {
    // this was the first post-hammer thing we added; if it's missing, encode
    // like hammer.
    v = 21;
  } else if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
    v = 24;
  } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
    v = 26;
  } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
    v = 27;
  } else if (!is_stretch_pool()) {
    // the v30 fields are only written for stretch pools
    v = 29;
  }

  ENCODE_START(v, 5, bl);
  encode(type, bl);
  encode(size, bl);
  encode(crush_rule, bl);
  encode(object_hash, bl);
  encode(pg_num, bl);
  encode(pgp_num, bl);
  __u32 lpg_num = 0, lpgp_num = 0;  // tell old code that there are no localized pgs.
  encode(lpg_num, bl);
  encode(lpgp_num, bl);
  encode(last_change, bl);
  encode(snap_seq, bl);
  encode(snap_epoch, bl);
  encode(snaps, bl, features);
  encode(removed_snaps, bl);
  encode(auid, bl);
  if (v >= 27) {
    encode(flags, bl);
  } else {
    // below v27 the snap-mode and creating flags are masked out of the
    // encoded value
    auto tmp = flags;
    tmp &= ~(FLAG_SELFMANAGED_SNAPS | FLAG_POOL_SNAPS | FLAG_CREATING);
    encode(tmp, bl);
  }
  encode((uint32_t)0, bl); // crash_replay_interval
  encode(min_size, bl);
  encode(quota_max_bytes, bl);
  encode(quota_max_objects, bl);
  encode(tiers, bl);
  encode(tier_of, bl);
  __u8 c = cache_mode;  // cache_mode goes on the wire as a single byte
  encode(c, bl);
  encode(read_tier, bl);
  encode(write_tier, bl);
  encode(properties, bl);
  encode(hit_set_params, bl);
  encode(hit_set_period, bl);
  encode(hit_set_count, bl);
  encode(stripe_width, bl);
  encode(target_max_bytes, bl);
  encode(target_max_objects, bl);
  encode(cache_target_dirty_ratio_micro, bl);
  encode(cache_target_full_ratio_micro, bl);
  encode(cache_min_flush_age, bl);
  encode(cache_min_evict_age, bl);
  encode(erasure_code_profile, bl);
  encode(last_force_op_resend_preluminous, bl);
  encode(min_read_recency_for_promote, bl);
  encode(expected_num_objects, bl);
  // everything from here on is appended only when the chosen version
  // is new enough to carry it
  if (v >= 19) {
    encode(cache_target_dirty_high_ratio_micro, bl);
  }
  if (v >= 20) {
    encode(min_write_recency_for_promote, bl);
  }
  if (v >= 21) {
    encode(use_gmt_hitset, bl);
  }
  if (v >= 22) {
    encode(fast_read, bl);
  }
  if (v >= 23) {
    encode(hit_set_grade_decay_rate, bl);
    encode(hit_set_search_last_n, bl);
  }
  if (v >= 24) {
    encode(opts, bl, features);
  }
  if (v >= 25) {
    encode(last_force_op_resend_prenautilus, bl);
  }
  if (v >= 26) {
    encode(application_metadata, bl);
  }
  if (v >= 27) {
    encode(create_time, bl);
  }
  if (v >= 28) {
    encode(pg_num_target, bl);
    encode(pgp_num_target, bl);
    encode(pg_num_pending, bl);
    encode((epoch_t)0, bl);  // pg_num_dec_last_epoch_started from 14.1.[01]
    encode((epoch_t)0, bl);  // pg_num_dec_last_epoch_clean from 14.1.[01]
    encode(last_force_op_resend, bl);
    encode(pg_autoscale_mode, bl);
  }
  if (v >= 29) {
    encode(last_pg_merge_meta, bl);
  }
  if (v >= 30) {
    encode(peering_crush_bucket_count, bl);
    encode(peering_crush_bucket_target, bl);
    encode(peering_crush_bucket_barrier, bl);
    encode(peering_crush_mandatory_member, bl);
  }
  ENCODE_FINISH(bl);
}
2056
2057 void pg_pool_t::decode(ceph::buffer::list::const_iterator& bl)
2058 {
2059 DECODE_START_LEGACY_COMPAT_LEN(30, 5, 5, bl);
2060 decode(type, bl);
2061 decode(size, bl);
2062 decode(crush_rule, bl);
2063 decode(object_hash, bl);
2064 decode(pg_num, bl);
2065 decode(pgp_num, bl);
2066 {
2067 __u32 lpg_num, lpgp_num;
2068 decode(lpg_num, bl);
2069 decode(lpgp_num, bl);
2070 }
2071 decode(last_change, bl);
2072 decode(snap_seq, bl);
2073 decode(snap_epoch, bl);
2074
2075 if (struct_v >= 3) {
2076 decode(snaps, bl);
2077 decode(removed_snaps, bl);
2078 decode(auid, bl);
2079 } else {
2080 __u32 n, m;
2081 decode(n, bl);
2082 decode(m, bl);
2083 decode(auid, bl);
2084 decode_nohead(n, snaps, bl);
2085 decode_nohead(m, removed_snaps, bl);
2086 }
2087
2088 if (struct_v >= 4) {
2089 decode(flags, bl);
2090 uint32_t crash_replay_interval;
2091 decode(crash_replay_interval, bl);
2092 } else {
2093 flags = 0;
2094 }
2095 // upgrade path for selfmanaged vs pool snaps
2096 if (snap_seq > 0 && (flags & (FLAG_SELFMANAGED_SNAPS|FLAG_POOL_SNAPS)) == 0) {
2097 if (!removed_snaps.empty()) {
2098 flags |= FLAG_SELFMANAGED_SNAPS;
2099 } else {
2100 flags |= FLAG_POOL_SNAPS;
2101 }
2102 }
2103 if (struct_v >= 7) {
2104 decode(min_size, bl);
2105 } else {
2106 min_size = size - size/2;
2107 }
2108 if (struct_v >= 8) {
2109 decode(quota_max_bytes, bl);
2110 decode(quota_max_objects, bl);
2111 }
2112 if (struct_v >= 9) {
2113 decode(tiers, bl);
2114 decode(tier_of, bl);
2115 __u8 v;
2116 decode(v, bl);
2117 cache_mode = (cache_mode_t)v;
2118 decode(read_tier, bl);
2119 decode(write_tier, bl);
2120 }
2121 if (struct_v >= 10) {
2122 decode(properties, bl);
2123 }
2124 if (struct_v >= 11) {
2125 decode(hit_set_params, bl);
2126 decode(hit_set_period, bl);
2127 decode(hit_set_count, bl);
2128 } else {
2129 pg_pool_t def;
2130 hit_set_period = def.hit_set_period;
2131 hit_set_count = def.hit_set_count;
2132 }
2133 if (struct_v >= 12) {
2134 decode(stripe_width, bl);
2135 } else {
2136 set_stripe_width(0);
2137 }
2138 if (struct_v >= 13) {
2139 decode(target_max_bytes, bl);
2140 decode(target_max_objects, bl);
2141 decode(cache_target_dirty_ratio_micro, bl);
2142 decode(cache_target_full_ratio_micro, bl);
2143 decode(cache_min_flush_age, bl);
2144 decode(cache_min_evict_age, bl);
2145 } else {
2146 target_max_bytes = 0;
2147 target_max_objects = 0;
2148 cache_target_dirty_ratio_micro = 0;
2149 cache_target_full_ratio_micro = 0;
2150 cache_min_flush_age = 0;
2151 cache_min_evict_age = 0;
2152 }
2153 if (struct_v >= 14) {
2154 decode(erasure_code_profile, bl);
2155 }
2156 if (struct_v >= 15) {
2157 decode(last_force_op_resend_preluminous, bl);
2158 } else {
2159 last_force_op_resend_preluminous = 0;
2160 }
2161 if (struct_v >= 16) {
2162 decode(min_read_recency_for_promote, bl);
2163 } else {
2164 min_read_recency_for_promote = 1;
2165 }
2166 if (struct_v >= 17) {
2167 decode(expected_num_objects, bl);
2168 } else {
2169 expected_num_objects = 0;
2170 }
2171 if (struct_v >= 19) {
2172 decode(cache_target_dirty_high_ratio_micro, bl);
2173 } else {
2174 cache_target_dirty_high_ratio_micro = cache_target_dirty_ratio_micro;
2175 }
2176 if (struct_v >= 20) {
2177 decode(min_write_recency_for_promote, bl);
2178 } else {
2179 min_write_recency_for_promote = 1;
2180 }
2181 if (struct_v >= 21) {
2182 decode(use_gmt_hitset, bl);
2183 } else {
2184 use_gmt_hitset = false;
2185 }
2186 if (struct_v >= 22) {
2187 decode(fast_read, bl);
2188 } else {
2189 fast_read = false;
2190 }
2191 if (struct_v >= 23) {
2192 decode(hit_set_grade_decay_rate, bl);
2193 decode(hit_set_search_last_n, bl);
2194 } else {
2195 hit_set_grade_decay_rate = 0;
2196 hit_set_search_last_n = 1;
2197 }
2198 if (struct_v >= 24) {
2199 decode(opts, bl);
2200 }
2201 if (struct_v >= 25) {
2202 decode(last_force_op_resend_prenautilus, bl);
2203 } else {
2204 last_force_op_resend_prenautilus = last_force_op_resend_preluminous;
2205 }
2206 if (struct_v >= 26) {
2207 decode(application_metadata, bl);
2208 }
2209 if (struct_v >= 27) {
2210 decode(create_time, bl);
2211 }
2212 if (struct_v >= 28) {
2213 decode(pg_num_target, bl);
2214 decode(pgp_num_target, bl);
2215 decode(pg_num_pending, bl);
2216 epoch_t old_merge_last_epoch_clean, old_merge_last_epoch_started;
2217 decode(old_merge_last_epoch_started, bl);
2218 decode(old_merge_last_epoch_clean, bl);
2219 decode(last_force_op_resend, bl);
2220 decode(pg_autoscale_mode, bl);
2221 if (struct_v >= 29) {
2222 decode(last_pg_merge_meta, bl);
2223 } else {
2224 last_pg_merge_meta.last_epoch_clean = old_merge_last_epoch_clean;
2225 last_pg_merge_meta.last_epoch_started = old_merge_last_epoch_started;
2226 }
2227 } else {
2228 pg_num_target = pg_num;
2229 pgp_num_target = pgp_num;
2230 pg_num_pending = pg_num;
2231 last_force_op_resend = last_force_op_resend_prenautilus;
2232 pg_autoscale_mode = pg_autoscale_mode_t::WARN; // default to warn on upgrade
2233 }
2234 if (struct_v >= 30) {
2235 decode(peering_crush_bucket_count, bl);
2236 decode(peering_crush_bucket_target, bl);
2237 decode(peering_crush_bucket_barrier, bl);
2238 decode(peering_crush_mandatory_member, bl);
2239 }
2240 DECODE_FINISH(bl);
2241 calc_pg_masks();
2242 calc_grade_table();
2243 }
2244
2245 bool pg_pool_t::stretch_set_can_peer(const set<int>& want, const OSDMap& osdmap,
2246 std::ostream * out) const
2247 {
2248 if (!is_stretch_pool()) return true;
2249 const uint32_t barrier_id = peering_crush_bucket_barrier;
2250 const uint32_t barrier_count = peering_crush_bucket_count;
2251 set<int> ancestors;
2252 const shared_ptr<CrushWrapper>& crush = osdmap.crush;
2253 for (int osdid : want) {
2254 int ancestor = crush->get_parent_of_type(osdid, barrier_id,
2255 crush_rule);
2256 ancestors.insert(ancestor);
2257 }
2258 if (ancestors.size() < barrier_count) {
2259 if (out) {
2260 *out << __func__ << ": not enough crush buckets with OSDs in want set "
2261 << want;
2262 }
2263 return false;
2264 } else if (peering_crush_mandatory_member != CRUSH_ITEM_NONE &&
2265 !ancestors.count(peering_crush_mandatory_member)) {
2266 if (out) {
2267 *out << __func__ << ": missing mandatory crush bucket member "
2268 << peering_crush_mandatory_member;
2269 }
2270 return false;
2271 }
2272 return true;
2273 }
2274
2275 void pg_pool_t::generate_test_instances(list<pg_pool_t*>& o)
2276 {
2277 pg_pool_t a;
2278 o.push_back(new pg_pool_t(a));
2279
2280 a.create_time = utime_t(4,5);
2281 a.type = TYPE_REPLICATED;
2282 a.size = 2;
2283 a.crush_rule = 3;
2284 a.object_hash = 4;
2285 a.pg_num = 6;
2286 a.pgp_num = 4;
2287 a.pgp_num_target = 4;
2288 a.pg_num_target = 5;
2289 a.pg_num_pending = 5;
2290 a.last_pg_merge_meta.last_epoch_started = 2;
2291 a.last_pg_merge_meta.last_epoch_clean = 2;
2292 a.last_change = 9;
2293 a.last_force_op_resend = 123823;
2294 a.last_force_op_resend_preluminous = 123824;
2295 a.snap_seq = 10;
2296 a.snap_epoch = 11;
2297 a.flags = FLAG_POOL_SNAPS;
2298 a.auid = 12;
2299 a.quota_max_bytes = 473;
2300 a.quota_max_objects = 474;
2301 o.push_back(new pg_pool_t(a));
2302
2303 a.snaps[3].name = "asdf";
2304 a.snaps[3].snapid = 3;
2305 a.snaps[3].stamp = utime_t(123, 4);
2306 a.snaps[6].name = "qwer";
2307 a.snaps[6].snapid = 6;
2308 a.snaps[6].stamp = utime_t(23423, 4);
2309 o.push_back(new pg_pool_t(a));
2310
2311 a.flags = FLAG_SELFMANAGED_SNAPS;
2312 a.snaps.clear();
2313 a.removed_snaps.insert(2);
2314 a.quota_max_bytes = 2473;
2315 a.quota_max_objects = 4374;
2316 a.tiers.insert(0);
2317 a.tiers.insert(1);
2318 a.tier_of = 2;
2319 a.cache_mode = CACHEMODE_WRITEBACK;
2320 a.read_tier = 1;
2321 a.write_tier = 1;
2322 a.hit_set_params = HitSet::Params(new BloomHitSet::Params);
2323 a.hit_set_period = 3600;
2324 a.hit_set_count = 8;
2325 a.min_read_recency_for_promote = 1;
2326 a.min_write_recency_for_promote = 1;
2327 a.hit_set_grade_decay_rate = 50;
2328 a.hit_set_search_last_n = 1;
2329 a.calc_grade_table();
2330 a.set_stripe_width(12345);
2331 a.target_max_bytes = 1238132132;
2332 a.target_max_objects = 1232132;
2333 a.cache_target_dirty_ratio_micro = 187232;
2334 a.cache_target_dirty_high_ratio_micro = 309856;
2335 a.cache_target_full_ratio_micro = 987222;
2336 a.cache_min_flush_age = 231;
2337 a.cache_min_evict_age = 2321;
2338 a.erasure_code_profile = "profile in osdmap";
2339 a.expected_num_objects = 123456;
2340 a.fast_read = false;
2341 a.application_metadata = {{"rbd", {{"key", "value"}}}};
2342 o.push_back(new pg_pool_t(a));
2343 }
2344
2345 ostream& operator<<(ostream& out, const pg_pool_t& p)
2346 {
2347 out << p.get_type_name();
2348 if (p.get_type_name() == "erasure") {
2349 out << " profile " << p.erasure_code_profile;
2350 }
2351 out << " size " << p.get_size()
2352 << " min_size " << p.get_min_size()
2353 << " crush_rule " << p.get_crush_rule()
2354 << " object_hash " << p.get_object_hash_name()
2355 << " pg_num " << p.get_pg_num()
2356 << " pgp_num " << p.get_pgp_num();
2357 if (p.get_pg_num_target() != p.get_pg_num()) {
2358 out << " pg_num_target " << p.get_pg_num_target();
2359 }
2360 if (p.get_pgp_num_target() != p.get_pgp_num()) {
2361 out << " pgp_num_target " << p.get_pgp_num_target();
2362 }
2363 if (p.get_pg_num_pending() != p.get_pg_num()) {
2364 out << " pg_num_pending " << p.get_pg_num_pending();
2365 }
2366 if (p.pg_autoscale_mode != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
2367 out << " autoscale_mode " << p.get_pg_autoscale_mode_name(p.pg_autoscale_mode);
2368 }
2369 out << " last_change " << p.get_last_change();
2370 if (p.get_last_force_op_resend() ||
2371 p.get_last_force_op_resend_prenautilus() ||
2372 p.get_last_force_op_resend_preluminous())
2373 out << " lfor " << p.get_last_force_op_resend() << "/"
2374 << p.get_last_force_op_resend_prenautilus() << "/"
2375 << p.get_last_force_op_resend_preluminous();
2376 if (p.get_auid())
2377 out << " owner " << p.get_auid();
2378 if (p.flags)
2379 out << " flags " << p.get_flags_string();
2380 if (p.quota_max_bytes)
2381 out << " max_bytes " << p.quota_max_bytes;
2382 if (p.quota_max_objects)
2383 out << " max_objects " << p.quota_max_objects;
2384 if (!p.tiers.empty())
2385 out << " tiers " << p.tiers;
2386 if (p.is_tier())
2387 out << " tier_of " << p.tier_of;
2388 if (p.has_read_tier())
2389 out << " read_tier " << p.read_tier;
2390 if (p.has_write_tier())
2391 out << " write_tier " << p.write_tier;
2392 if (p.cache_mode)
2393 out << " cache_mode " << p.get_cache_mode_name();
2394 if (p.target_max_bytes)
2395 out << " target_bytes " << p.target_max_bytes;
2396 if (p.target_max_objects)
2397 out << " target_objects " << p.target_max_objects;
2398 if (p.hit_set_params.get_type() != HitSet::TYPE_NONE) {
2399 out << " hit_set " << p.hit_set_params
2400 << " " << p.hit_set_period << "s"
2401 << " x" << p.hit_set_count << " decay_rate "
2402 << p.hit_set_grade_decay_rate
2403 << " search_last_n " << p.hit_set_search_last_n;
2404 }
2405 if (p.min_read_recency_for_promote)
2406 out << " min_read_recency_for_promote " << p.min_read_recency_for_promote;
2407 if (p.min_write_recency_for_promote)
2408 out << " min_write_recency_for_promote " << p.min_write_recency_for_promote;
2409 out << " stripe_width " << p.get_stripe_width();
2410 if (p.expected_num_objects)
2411 out << " expected_num_objects " << p.expected_num_objects;
2412 if (p.fast_read)
2413 out << " fast_read " << p.fast_read;
2414 out << p.opts;
2415 if (!p.application_metadata.empty()) {
2416 out << " application ";
2417 for (auto it = p.application_metadata.begin();
2418 it != p.application_metadata.end(); ++it) {
2419 if (it != p.application_metadata.begin())
2420 out << ",";
2421 out << it->first;
2422 }
2423 }
2424 return out;
2425 }
2426
2427
2428 // -- object_stat_sum_t --
2429
2430 void object_stat_sum_t::dump(Formatter *f) const
2431 {
2432 f->dump_int("num_bytes", num_bytes);
2433 f->dump_int("num_objects", num_objects);
2434 f->dump_int("num_object_clones", num_object_clones);
2435 f->dump_int("num_object_copies", num_object_copies);
2436 f->dump_int("num_objects_missing_on_primary", num_objects_missing_on_primary);
2437 f->dump_int("num_objects_missing", num_objects_missing);
2438 f->dump_int("num_objects_degraded", num_objects_degraded);
2439 f->dump_int("num_objects_misplaced", num_objects_misplaced);
2440 f->dump_int("num_objects_unfound", num_objects_unfound);
2441 f->dump_int("num_objects_dirty", num_objects_dirty);
2442 f->dump_int("num_whiteouts", num_whiteouts);
2443 f->dump_int("num_read", num_rd);
2444 f->dump_int("num_read_kb", num_rd_kb);
2445 f->dump_int("num_write", num_wr);
2446 f->dump_int("num_write_kb", num_wr_kb);
2447 f->dump_int("num_scrub_errors", num_scrub_errors);
2448 f->dump_int("num_shallow_scrub_errors", num_shallow_scrub_errors);
2449 f->dump_int("num_deep_scrub_errors", num_deep_scrub_errors);
2450 f->dump_int("num_objects_recovered", num_objects_recovered);
2451 f->dump_int("num_bytes_recovered", num_bytes_recovered);
2452 f->dump_int("num_keys_recovered", num_keys_recovered);
2453 f->dump_int("num_objects_omap", num_objects_omap);
2454 f->dump_int("num_objects_hit_set_archive", num_objects_hit_set_archive);
2455 f->dump_int("num_bytes_hit_set_archive", num_bytes_hit_set_archive);
2456 f->dump_int("num_flush", num_flush);
2457 f->dump_int("num_flush_kb", num_flush_kb);
2458 f->dump_int("num_evict", num_evict);
2459 f->dump_int("num_evict_kb", num_evict_kb);
2460 f->dump_int("num_promote", num_promote);
2461 f->dump_int("num_flush_mode_high", num_flush_mode_high);
2462 f->dump_int("num_flush_mode_low", num_flush_mode_low);
2463 f->dump_int("num_evict_mode_some", num_evict_mode_some);
2464 f->dump_int("num_evict_mode_full", num_evict_mode_full);
2465 f->dump_int("num_objects_pinned", num_objects_pinned);
2466 f->dump_int("num_legacy_snapsets", num_legacy_snapsets);
2467 f->dump_int("num_large_omap_objects", num_large_omap_objects);
2468 f->dump_int("num_objects_manifest", num_objects_manifest);
2469 f->dump_int("num_omap_bytes", num_omap_bytes);
2470 f->dump_int("num_omap_keys", num_omap_keys);
2471 f->dump_int("num_objects_repaired", num_objects_repaired);
2472 }
2473
2474 void object_stat_sum_t::encode(ceph::buffer::list& bl) const
2475 {
2476 ENCODE_START(20, 14, bl);
2477 #if defined(CEPH_LITTLE_ENDIAN)
2478 bl.append((char *)(&num_bytes), sizeof(object_stat_sum_t));
2479 #else
2480 encode(num_bytes, bl);
2481 encode(num_objects, bl);
2482 encode(num_object_clones, bl);
2483 encode(num_object_copies, bl);
2484 encode(num_objects_missing_on_primary, bl);
2485 encode(num_objects_degraded, bl);
2486 encode(num_objects_unfound, bl);
2487 encode(num_rd, bl);
2488 encode(num_rd_kb, bl);
2489 encode(num_wr, bl);
2490 encode(num_wr_kb, bl);
2491 encode(num_scrub_errors, bl);
2492 encode(num_objects_recovered, bl);
2493 encode(num_bytes_recovered, bl);
2494 encode(num_keys_recovered, bl);
2495 encode(num_shallow_scrub_errors, bl);
2496 encode(num_deep_scrub_errors, bl);
2497 encode(num_objects_dirty, bl);
2498 encode(num_whiteouts, bl);
2499 encode(num_objects_omap, bl);
2500 encode(num_objects_hit_set_archive, bl);
2501 encode(num_objects_misplaced, bl);
2502 encode(num_bytes_hit_set_archive, bl);
2503 encode(num_flush, bl);
2504 encode(num_flush_kb, bl);
2505 encode(num_evict, bl);
2506 encode(num_evict_kb, bl);
2507 encode(num_promote, bl);
2508 encode(num_flush_mode_high, bl);
2509 encode(num_flush_mode_low, bl);
2510 encode(num_evict_mode_some, bl);
2511 encode(num_evict_mode_full, bl);
2512 encode(num_objects_pinned, bl);
2513 encode(num_objects_missing, bl);
2514 encode(num_legacy_snapsets, bl);
2515 encode(num_large_omap_objects, bl);
2516 encode(num_objects_manifest, bl);
2517 encode(num_omap_bytes, bl);
2518 encode(num_omap_keys, bl);
2519 encode(num_objects_repaired, bl);
2520 #endif
2521 ENCODE_FINISH(bl);
2522 }
2523
2524 void object_stat_sum_t::decode(ceph::buffer::list::const_iterator& bl)
2525 {
2526 bool decode_finish = false;
2527 static const int STAT_SUM_DECODE_VERSION = 20;
2528 DECODE_START(STAT_SUM_DECODE_VERSION, bl);
2529 #if defined(CEPH_LITTLE_ENDIAN)
2530 if (struct_v == STAT_SUM_DECODE_VERSION) {
2531 bl.copy(sizeof(object_stat_sum_t), (char*)(&num_bytes));
2532 decode_finish = true;
2533 }
2534 #endif
2535 if (!decode_finish) {
2536 decode(num_bytes, bl);
2537 decode(num_objects, bl);
2538 decode(num_object_clones, bl);
2539 decode(num_object_copies, bl);
2540 decode(num_objects_missing_on_primary, bl);
2541 decode(num_objects_degraded, bl);
2542 decode(num_objects_unfound, bl);
2543 decode(num_rd, bl);
2544 decode(num_rd_kb, bl);
2545 decode(num_wr, bl);
2546 decode(num_wr_kb, bl);
2547 decode(num_scrub_errors, bl);
2548 decode(num_objects_recovered, bl);
2549 decode(num_bytes_recovered, bl);
2550 decode(num_keys_recovered, bl);
2551 decode(num_shallow_scrub_errors, bl);
2552 decode(num_deep_scrub_errors, bl);
2553 decode(num_objects_dirty, bl);
2554 decode(num_whiteouts, bl);
2555 decode(num_objects_omap, bl);
2556 decode(num_objects_hit_set_archive, bl);
2557 decode(num_objects_misplaced, bl);
2558 decode(num_bytes_hit_set_archive, bl);
2559 decode(num_flush, bl);
2560 decode(num_flush_kb, bl);
2561 decode(num_evict, bl);
2562 decode(num_evict_kb, bl);
2563 decode(num_promote, bl);
2564 decode(num_flush_mode_high, bl);
2565 decode(num_flush_mode_low, bl);
2566 decode(num_evict_mode_some, bl);
2567 decode(num_evict_mode_full, bl);
2568 decode(num_objects_pinned, bl);
2569 decode(num_objects_missing, bl);
2570 if (struct_v >= 16) {
2571 decode(num_legacy_snapsets, bl);
2572 } else {
2573 num_legacy_snapsets = num_object_clones; // upper bound
2574 }
2575 if (struct_v >= 17) {
2576 decode(num_large_omap_objects, bl);
2577 }
2578 if (struct_v >= 18) {
2579 decode(num_objects_manifest, bl);
2580 }
2581 if (struct_v >= 19) {
2582 decode(num_omap_bytes, bl);
2583 decode(num_omap_keys, bl);
2584 }
2585 if (struct_v >= 20) {
2586 decode(num_objects_repaired, bl);
2587 }
2588 }
2589 DECODE_FINISH(bl);
2590 }
2591
2592 void object_stat_sum_t::generate_test_instances(list<object_stat_sum_t*>& o)
2593 {
2594 object_stat_sum_t a;
2595
2596 a.num_bytes = 1;
2597 a.num_objects = 3;
2598 a.num_object_clones = 4;
2599 a.num_object_copies = 5;
2600 a.num_objects_missing_on_primary = 6;
2601 a.num_objects_missing = 123;
2602 a.num_objects_degraded = 7;
2603 a.num_objects_unfound = 8;
2604 a.num_rd = 9; a.num_rd_kb = 10;
2605 a.num_wr = 11; a.num_wr_kb = 12;
2606 a.num_objects_recovered = 14;
2607 a.num_bytes_recovered = 15;
2608 a.num_keys_recovered = 16;
2609 a.num_deep_scrub_errors = 17;
2610 a.num_shallow_scrub_errors = 18;
2611 a.num_scrub_errors = a.num_deep_scrub_errors + a.num_shallow_scrub_errors;
2612 a.num_objects_dirty = 21;
2613 a.num_whiteouts = 22;
2614 a.num_objects_misplaced = 1232;
2615 a.num_objects_hit_set_archive = 2;
2616 a.num_bytes_hit_set_archive = 27;
2617 a.num_flush = 5;
2618 a.num_flush_kb = 6;
2619 a.num_evict = 7;
2620 a.num_evict_kb = 8;
2621 a.num_promote = 9;
2622 a.num_flush_mode_high = 0;
2623 a.num_flush_mode_low = 1;
2624 a.num_evict_mode_some = 1;
2625 a.num_evict_mode_full = 0;
2626 a.num_objects_pinned = 20;
2627 a.num_large_omap_objects = 5;
2628 a.num_objects_manifest = 2;
2629 a.num_omap_bytes = 20000;
2630 a.num_omap_keys = 200;
2631 a.num_objects_repaired = 300;
2632 o.push_back(new object_stat_sum_t(a));
2633 }
2634
2635 void object_stat_sum_t::add(const object_stat_sum_t& o)
2636 {
2637 num_bytes += o.num_bytes;
2638 num_objects += o.num_objects;
2639 num_object_clones += o.num_object_clones;
2640 num_object_copies += o.num_object_copies;
2641 num_objects_missing_on_primary += o.num_objects_missing_on_primary;
2642 num_objects_missing += o.num_objects_missing;
2643 num_objects_degraded += o.num_objects_degraded;
2644 num_objects_misplaced += o.num_objects_misplaced;
2645 num_rd += o.num_rd;
2646 num_rd_kb += o.num_rd_kb;
2647 num_wr += o.num_wr;
2648 num_wr_kb += o.num_wr_kb;
2649 num_objects_unfound += o.num_objects_unfound;
2650 num_scrub_errors += o.num_scrub_errors;
2651 num_shallow_scrub_errors += o.num_shallow_scrub_errors;
2652 num_deep_scrub_errors += o.num_deep_scrub_errors;
2653 num_objects_recovered += o.num_objects_recovered;
2654 num_bytes_recovered += o.num_bytes_recovered;
2655 num_keys_recovered += o.num_keys_recovered;
2656 num_objects_dirty += o.num_objects_dirty;
2657 num_whiteouts += o.num_whiteouts;
2658 num_objects_omap += o.num_objects_omap;
2659 num_objects_hit_set_archive += o.num_objects_hit_set_archive;
2660 num_bytes_hit_set_archive += o.num_bytes_hit_set_archive;
2661 num_flush += o.num_flush;
2662 num_flush_kb += o.num_flush_kb;
2663 num_evict += o.num_evict;
2664 num_evict_kb += o.num_evict_kb;
2665 num_promote += o.num_promote;
2666 num_flush_mode_high += o.num_flush_mode_high;
2667 num_flush_mode_low += o.num_flush_mode_low;
2668 num_evict_mode_some += o.num_evict_mode_some;
2669 num_evict_mode_full += o.num_evict_mode_full;
2670 num_objects_pinned += o.num_objects_pinned;
2671 num_legacy_snapsets += o.num_legacy_snapsets;
2672 num_large_omap_objects += o.num_large_omap_objects;
2673 num_objects_manifest += o.num_objects_manifest;
2674 num_omap_bytes += o.num_omap_bytes;
2675 num_omap_keys += o.num_omap_keys;
2676 num_objects_repaired += o.num_objects_repaired;
2677 }
2678
2679 void object_stat_sum_t::sub(const object_stat_sum_t& o)
2680 {
2681 num_bytes -= o.num_bytes;
2682 num_objects -= o.num_objects;
2683 num_object_clones -= o.num_object_clones;
2684 num_object_copies -= o.num_object_copies;
2685 num_objects_missing_on_primary -= o.num_objects_missing_on_primary;
2686 num_objects_missing -= o.num_objects_missing;
2687 num_objects_degraded -= o.num_objects_degraded;
2688 num_objects_misplaced -= o.num_objects_misplaced;
2689 num_rd -= o.num_rd;
2690 num_rd_kb -= o.num_rd_kb;
2691 num_wr -= o.num_wr;
2692 num_wr_kb -= o.num_wr_kb;
2693 num_objects_unfound -= o.num_objects_unfound;
2694 num_scrub_errors -= o.num_scrub_errors;
2695 num_shallow_scrub_errors -= o.num_shallow_scrub_errors;
2696 num_deep_scrub_errors -= o.num_deep_scrub_errors;
2697 num_objects_recovered -= o.num_objects_recovered;
2698 num_bytes_recovered -= o.num_bytes_recovered;
2699 num_keys_recovered -= o.num_keys_recovered;
2700 num_objects_dirty -= o.num_objects_dirty;
2701 num_whiteouts -= o.num_whiteouts;
2702 num_objects_omap -= o.num_objects_omap;
2703 num_objects_hit_set_archive -= o.num_objects_hit_set_archive;
2704 num_bytes_hit_set_archive -= o.num_bytes_hit_set_archive;
2705 num_flush -= o.num_flush;
2706 num_flush_kb -= o.num_flush_kb;
2707 num_evict -= o.num_evict;
2708 num_evict_kb -= o.num_evict_kb;
2709 num_promote -= o.num_promote;
2710 num_flush_mode_high -= o.num_flush_mode_high;
2711 num_flush_mode_low -= o.num_flush_mode_low;
2712 num_evict_mode_some -= o.num_evict_mode_some;
2713 num_evict_mode_full -= o.num_evict_mode_full;
2714 num_objects_pinned -= o.num_objects_pinned;
2715 num_legacy_snapsets -= o.num_legacy_snapsets;
2716 num_large_omap_objects -= o.num_large_omap_objects;
2717 num_objects_manifest -= o.num_objects_manifest;
2718 num_omap_bytes -= o.num_omap_bytes;
2719 num_omap_keys -= o.num_omap_keys;
2720 num_objects_repaired -= o.num_objects_repaired;
2721 }
2722
2723 bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r)
2724 {
2725 return
2726 l.num_bytes == r.num_bytes &&
2727 l.num_objects == r.num_objects &&
2728 l.num_object_clones == r.num_object_clones &&
2729 l.num_object_copies == r.num_object_copies &&
2730 l.num_objects_missing_on_primary == r.num_objects_missing_on_primary &&
2731 l.num_objects_missing == r.num_objects_missing &&
2732 l.num_objects_degraded == r.num_objects_degraded &&
2733 l.num_objects_misplaced == r.num_objects_misplaced &&
2734 l.num_objects_unfound == r.num_objects_unfound &&
2735 l.num_rd == r.num_rd &&
2736 l.num_rd_kb == r.num_rd_kb &&
2737 l.num_wr == r.num_wr &&
2738 l.num_wr_kb == r.num_wr_kb &&
2739 l.num_scrub_errors == r.num_scrub_errors &&
2740 l.num_shallow_scrub_errors == r.num_shallow_scrub_errors &&
2741 l.num_deep_scrub_errors == r.num_deep_scrub_errors &&
2742 l.num_objects_recovered == r.num_objects_recovered &&
2743 l.num_bytes_recovered == r.num_bytes_recovered &&
2744 l.num_keys_recovered == r.num_keys_recovered &&
2745 l.num_objects_dirty == r.num_objects_dirty &&
2746 l.num_whiteouts == r.num_whiteouts &&
2747 l.num_objects_omap == r.num_objects_omap &&
2748 l.num_objects_hit_set_archive == r.num_objects_hit_set_archive &&
2749 l.num_bytes_hit_set_archive == r.num_bytes_hit_set_archive &&
2750 l.num_flush == r.num_flush &&
2751 l.num_flush_kb == r.num_flush_kb &&
2752 l.num_evict == r.num_evict &&
2753 l.num_evict_kb == r.num_evict_kb &&
2754 l.num_promote == r.num_promote &&
2755 l.num_flush_mode_high == r.num_flush_mode_high &&
2756 l.num_flush_mode_low == r.num_flush_mode_low &&
2757 l.num_evict_mode_some == r.num_evict_mode_some &&
2758 l.num_evict_mode_full == r.num_evict_mode_full &&
2759 l.num_objects_pinned == r.num_objects_pinned &&
2760 l.num_legacy_snapsets == r.num_legacy_snapsets &&
2761 l.num_large_omap_objects == r.num_large_omap_objects &&
2762 l.num_objects_manifest == r.num_objects_manifest &&
2763 l.num_omap_bytes == r.num_omap_bytes &&
2764 l.num_omap_keys == r.num_omap_keys &&
2765 l.num_objects_repaired == r.num_objects_repaired;
2766 }
2767
2768 // -- object_stat_collection_t --
2769
2770 void object_stat_collection_t::dump(Formatter *f) const
2771 {
2772 f->open_object_section("stat_sum");
2773 sum.dump(f);
2774 f->close_section();
2775 }
2776
2777 void object_stat_collection_t::encode(ceph::buffer::list& bl) const
2778 {
2779 ENCODE_START(2, 2, bl);
2780 encode(sum, bl);
2781 encode((__u32)0, bl);
2782 ENCODE_FINISH(bl);
2783 }
2784
2785 void object_stat_collection_t::decode(ceph::buffer::list::const_iterator& bl)
2786 {
2787 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
2788 decode(sum, bl);
2789 {
2790 map<string,object_stat_sum_t> cat_sum;
2791 decode(cat_sum, bl);
2792 }
2793 DECODE_FINISH(bl);
2794 }
2795
2796 void object_stat_collection_t::generate_test_instances(list<object_stat_collection_t*>& o)
2797 {
2798 object_stat_collection_t a;
2799 o.push_back(new object_stat_collection_t(a));
2800 list<object_stat_sum_t*> l;
2801 object_stat_sum_t::generate_test_instances(l);
2802 for (auto p = l.begin(); p != l.end(); ++p) {
2803 a.add(**p);
2804 o.push_back(new object_stat_collection_t(a));
2805 }
2806 }
2807
2808
2809 // -- pg_stat_t --
2810
2811 bool pg_stat_t::is_acting_osd(int32_t osd, bool primary) const
2812 {
2813 if (primary && osd == acting_primary) {
2814 return true;
2815 } else if (!primary) {
2816 for(auto it = acting.cbegin(); it != acting.cend(); ++it)
2817 {
2818 if (*it == osd)
2819 return true;
2820 }
2821 }
2822 return false;
2823 }
2824
2825 void pg_stat_t::dump(Formatter *f) const
2826 {
2827 f->dump_stream("version") << version;
2828 f->dump_unsigned("reported_seq", reported_seq);
2829 f->dump_unsigned("reported_epoch", reported_epoch);
2830 f->dump_string("state", pg_state_string(state));
2831 f->dump_stream("last_fresh") << last_fresh;
2832 f->dump_stream("last_change") << last_change;
2833 f->dump_stream("last_active") << last_active;
2834 f->dump_stream("last_peered") << last_peered;
2835 f->dump_stream("last_clean") << last_clean;
2836 f->dump_stream("last_became_active") << last_became_active;
2837 f->dump_stream("last_became_peered") << last_became_peered;
2838 f->dump_stream("last_unstale") << last_unstale;
2839 f->dump_stream("last_undegraded") << last_undegraded;
2840 f->dump_stream("last_fullsized") << last_fullsized;
2841 f->dump_unsigned("mapping_epoch", mapping_epoch);
2842 f->dump_stream("log_start") << log_start;
2843 f->dump_stream("ondisk_log_start") << ondisk_log_start;
2844 f->dump_unsigned("created", created);
2845 f->dump_unsigned("last_epoch_clean", last_epoch_clean);
2846 f->dump_stream("parent") << parent;
2847 f->dump_unsigned("parent_split_bits", parent_split_bits);
2848 f->dump_stream("last_scrub") << last_scrub;
2849 f->dump_stream("last_scrub_stamp") << last_scrub_stamp;
2850 f->dump_stream("last_deep_scrub") << last_deep_scrub;
2851 f->dump_stream("last_deep_scrub_stamp") << last_deep_scrub_stamp;
2852 f->dump_stream("last_clean_scrub_stamp") << last_clean_scrub_stamp;
2853 f->dump_int("objects_scrubbed", objects_scrubbed);
2854 f->dump_int("log_size", log_size);
2855 f->dump_int("ondisk_log_size", ondisk_log_size);
2856 f->dump_bool("stats_invalid", stats_invalid);
2857 f->dump_bool("dirty_stats_invalid", dirty_stats_invalid);
2858 f->dump_bool("omap_stats_invalid", omap_stats_invalid);
2859 f->dump_bool("hitset_stats_invalid", hitset_stats_invalid);
2860 f->dump_bool("hitset_bytes_stats_invalid", hitset_bytes_stats_invalid);
2861 f->dump_bool("pin_stats_invalid", pin_stats_invalid);
2862 f->dump_bool("manifest_stats_invalid", manifest_stats_invalid);
2863 f->dump_unsigned("snaptrimq_len", snaptrimq_len);
2864 f->dump_int("last_scrub_duration", last_scrub_duration);
2865 f->dump_string("scrub_schedule", dump_scrub_schedule());
2866 stats.dump(f);
2867 f->open_array_section("up");
2868 for (auto p = up.cbegin(); p != up.cend(); ++p)
2869 f->dump_int("osd", *p);
2870 f->close_section();
2871 f->open_array_section("acting");
2872 for (auto p = acting.cbegin(); p != acting.cend(); ++p)
2873 f->dump_int("osd", *p);
2874 f->close_section();
2875 f->open_array_section("avail_no_missing");
2876 for (auto p = avail_no_missing.cbegin(); p != avail_no_missing.cend(); ++p)
2877 f->dump_stream("shard") << *p;
2878 f->close_section();
2879 f->open_array_section("object_location_counts");
2880 for (auto p = object_location_counts.cbegin(); p != object_location_counts.cend(); ++p) {
2881 f->open_object_section("entry");
2882 f->dump_stream("shards") << p->first;
2883 f->dump_int("objects", p->second);
2884 f->close_section();
2885 }
2886 f->close_section();
2887 f->open_array_section("blocked_by");
2888 for (auto p = blocked_by.cbegin(); p != blocked_by.cend(); ++p)
2889 f->dump_int("osd", *p);
2890 f->close_section();
2891 f->dump_int("up_primary", up_primary);
2892 f->dump_int("acting_primary", acting_primary);
2893 f->open_array_section("purged_snaps");
2894 for (auto i = purged_snaps.begin(); i != purged_snaps.end(); ++i) {
2895 f->open_object_section("interval");
2896 f->dump_stream("start") << i.get_start();
2897 f->dump_stream("length") << i.get_len();
2898 f->close_section();
2899 }
2900 f->close_section();
2901 }
2902
2903 void pg_stat_t::dump_brief(Formatter *f) const
2904 {
2905 f->dump_string("state", pg_state_string(state));
2906 f->open_array_section("up");
2907 for (auto p = up.cbegin(); p != up.cend(); ++p)
2908 f->dump_int("osd", *p);
2909 f->close_section();
2910 f->open_array_section("acting");
2911 for (auto p = acting.cbegin(); p != acting.cend(); ++p)
2912 f->dump_int("osd", *p);
2913 f->close_section();
2914 f->dump_int("up_primary", up_primary);
2915 f->dump_int("acting_primary", acting_primary);
2916 }
2917
2918 std::string pg_stat_t::dump_scrub_schedule() const
2919 {
2920 if (scrub_sched_status.m_is_active) {
2921 return fmt::format(
2922 "{}scrubbing for {}s",
2923 ((scrub_sched_status.m_is_deep == scrub_level_t::deep) ? "deep " : ""),
2924 scrub_sched_status.m_duration_seconds);
2925 }
2926 switch (scrub_sched_status.m_sched_status) {
2927 case pg_scrub_sched_status_t::unknown:
2928 // no reported scrub schedule yet
2929 return "--"s;
2930 case pg_scrub_sched_status_t::not_queued:
2931 return "no scrub is scheduled"s;
2932 case pg_scrub_sched_status_t::scheduled:
2933 return fmt::format(
2934 "{} {}scrub scheduled @ {}",
2935 (scrub_sched_status.m_is_periodic ? "periodic" : "user requested"),
2936 ((scrub_sched_status.m_is_deep == scrub_level_t::deep) ? "deep " : ""),
2937 scrub_sched_status.m_scheduled_at);
2938 case pg_scrub_sched_status_t::queued:
2939 return fmt::format(
2940 "queued for {}scrub",
2941 ((scrub_sched_status.m_is_deep == scrub_level_t::deep) ? "deep " : ""));
2942 default:
2943 // a bug!
2944 return "SCRUB STATE MISMATCH!"s;
2945 }
2946 }
2947
2948 bool operator==(const pg_scrubbing_status_t& l, const pg_scrubbing_status_t& r)
2949 {
2950 return
2951 l.m_sched_status == r.m_sched_status &&
2952 l.m_scheduled_at == r.m_scheduled_at &&
2953 l.m_duration_seconds == r.m_duration_seconds &&
2954 l.m_is_active == r.m_is_active &&
2955 l.m_is_deep == r.m_is_deep &&
2956 l.m_is_periodic == r.m_is_periodic;
2957 }
2958
2959 void pg_stat_t::encode(ceph::buffer::list &bl) const
2960 {
2961 ENCODE_START(27, 22, bl);
2962 encode(version, bl);
2963 encode(reported_seq, bl);
2964 encode(reported_epoch, bl);
2965 encode((__u32)state, bl); // for older peers
2966 encode(log_start, bl);
2967 encode(ondisk_log_start, bl);
2968 encode(created, bl);
2969 encode(last_epoch_clean, bl);
2970 encode(parent, bl);
2971 encode(parent_split_bits, bl);
2972 encode(last_scrub, bl);
2973 encode(last_scrub_stamp, bl);
2974 encode(stats, bl);
2975 encode(log_size, bl);
2976 encode(ondisk_log_size, bl);
2977 encode(up, bl);
2978 encode(acting, bl);
2979 encode(last_fresh, bl);
2980 encode(last_change, bl);
2981 encode(last_active, bl);
2982 encode(last_clean, bl);
2983 encode(last_unstale, bl);
2984 encode(mapping_epoch, bl);
2985 encode(last_deep_scrub, bl);
2986 encode(last_deep_scrub_stamp, bl);
2987 encode(stats_invalid, bl);
2988 encode(last_clean_scrub_stamp, bl);
2989 encode(last_became_active, bl);
2990 encode(dirty_stats_invalid, bl);
2991 encode(up_primary, bl);
2992 encode(acting_primary, bl);
2993 encode(omap_stats_invalid, bl);
2994 encode(hitset_stats_invalid, bl);
2995 encode(blocked_by, bl);
2996 encode(last_undegraded, bl);
2997 encode(last_fullsized, bl);
2998 encode(hitset_bytes_stats_invalid, bl);
2999 encode(last_peered, bl);
3000 encode(last_became_peered, bl);
3001 encode(pin_stats_invalid, bl);
3002 encode(snaptrimq_len, bl);
3003 __u32 top_state = (state >> 32);
3004 encode(top_state, bl);
3005 encode(purged_snaps, bl);
3006 encode(manifest_stats_invalid, bl);
3007 encode(avail_no_missing, bl);
3008 encode(object_location_counts, bl);
3009 encode(last_scrub_duration, bl);
3010 encode(scrub_sched_status.m_scheduled_at, bl);
3011 encode(scrub_sched_status.m_duration_seconds, bl);
3012 encode((__u16)scrub_sched_status.m_sched_status, bl);
3013 encode(scrub_sched_status.m_is_active, bl);
3014 encode((scrub_sched_status.m_is_deep==scrub_level_t::deep), bl);
3015 encode(scrub_sched_status.m_is_periodic, bl);
3016 encode(objects_scrubbed, bl);
3017
3018 ENCODE_FINISH(bl);
3019 }
3020
3021 void pg_stat_t::decode(ceph::buffer::list::const_iterator &bl)
3022 {
3023 bool tmp;
3024 uint32_t old_state;
3025 DECODE_START(27, bl);
3026 decode(version, bl);
3027 decode(reported_seq, bl);
3028 decode(reported_epoch, bl);
3029 decode(old_state, bl);
3030 decode(log_start, bl);
3031 decode(ondisk_log_start, bl);
3032 decode(created, bl);
3033 decode(last_epoch_clean, bl);
3034 decode(parent, bl);
3035 decode(parent_split_bits, bl);
3036 decode(last_scrub, bl);
3037 decode(last_scrub_stamp, bl);
3038 decode(stats, bl);
3039 decode(log_size, bl);
3040 decode(ondisk_log_size, bl);
3041 decode(up, bl);
3042 decode(acting, bl);
3043 decode(last_fresh, bl);
3044 decode(last_change, bl);
3045 decode(last_active, bl);
3046 decode(last_clean, bl);
3047 decode(last_unstale, bl);
3048 decode(mapping_epoch, bl);
3049 decode(last_deep_scrub, bl);
3050 decode(last_deep_scrub_stamp, bl);
3051 decode(tmp, bl);
3052 stats_invalid = tmp;
3053 decode(last_clean_scrub_stamp, bl);
3054 decode(last_became_active, bl);
3055 decode(tmp, bl);
3056 dirty_stats_invalid = tmp;
3057 decode(up_primary, bl);
3058 decode(acting_primary, bl);
3059 decode(tmp, bl);
3060 omap_stats_invalid = tmp;
3061 decode(tmp, bl);
3062 hitset_stats_invalid = tmp;
3063 decode(blocked_by, bl);
3064 decode(last_undegraded, bl);
3065 decode(last_fullsized, bl);
3066 decode(tmp, bl);
3067 hitset_bytes_stats_invalid = tmp;
3068 decode(last_peered, bl);
3069 decode(last_became_peered, bl);
3070 decode(tmp, bl);
3071 pin_stats_invalid = tmp;
3072 if (struct_v >= 23) {
3073 decode(snaptrimq_len, bl);
3074 if (struct_v >= 24) {
3075 __u32 top_state;
3076 decode(top_state, bl);
3077 state = (uint64_t)old_state | ((uint64_t)top_state << 32);
3078 decode(purged_snaps, bl);
3079 } else {
3080 state = old_state;
3081 }
3082 if (struct_v >= 25) {
3083 decode(tmp, bl);
3084 manifest_stats_invalid = tmp;
3085 } else {
3086 manifest_stats_invalid = true;
3087 }
3088 if (struct_v >= 26) {
3089 decode(avail_no_missing, bl);
3090 decode(object_location_counts, bl);
3091 }
3092 if (struct_v >= 27) {
3093 decode(last_scrub_duration, bl);
3094 decode(scrub_sched_status.m_scheduled_at, bl);
3095 decode(scrub_sched_status.m_duration_seconds, bl);
3096 __u16 scrub_sched_as_u16;
3097 decode(scrub_sched_as_u16, bl);
3098 scrub_sched_status.m_sched_status = (pg_scrub_sched_status_t)(scrub_sched_as_u16);
3099 decode(tmp, bl);
3100 scrub_sched_status.m_is_active = tmp;
3101 decode(tmp, bl);
3102 scrub_sched_status.m_is_deep = tmp ? scrub_level_t::deep : scrub_level_t::shallow;
3103 decode(tmp, bl);
3104 scrub_sched_status.m_is_periodic = tmp;
3105 decode(objects_scrubbed, bl);
3106 }
3107 }
3108 DECODE_FINISH(bl);
3109 }
3110
3111 void pg_stat_t::generate_test_instances(list<pg_stat_t*>& o)
3112 {
3113 pg_stat_t a;
3114 o.push_back(new pg_stat_t(a));
3115
3116 a.version = eversion_t(1, 3);
3117 a.reported_epoch = 1;
3118 a.reported_seq = 2;
3119 a.state = 123;
3120 a.mapping_epoch = 998;
3121 a.last_fresh = utime_t(1002, 1);
3122 a.last_change = utime_t(1002, 2);
3123 a.last_active = utime_t(1002, 3);
3124 a.last_clean = utime_t(1002, 4);
3125 a.last_unstale = utime_t(1002, 5);
3126 a.last_undegraded = utime_t(1002, 7);
3127 a.last_fullsized = utime_t(1002, 8);
3128 a.log_start = eversion_t(1, 4);
3129 a.ondisk_log_start = eversion_t(1, 5);
3130 a.created = 6;
3131 a.last_epoch_clean = 7;
3132 a.parent = pg_t(1, 2);
3133 a.parent_split_bits = 12;
3134 a.last_scrub = eversion_t(9, 10);
3135 a.last_scrub_stamp = utime_t(11, 12);
3136 a.last_deep_scrub = eversion_t(13, 14);
3137 a.last_deep_scrub_stamp = utime_t(15, 16);
3138 a.last_clean_scrub_stamp = utime_t(17, 18);
3139 a.last_scrub_duration = 3617;
3140 a.snaptrimq_len = 1048576;
3141 a.objects_scrubbed = 0;
3142 list<object_stat_collection_t*> l;
3143 object_stat_collection_t::generate_test_instances(l);
3144 a.stats = *l.back();
3145 a.log_size = 99;
3146 a.ondisk_log_size = 88;
3147 a.up.push_back(123);
3148 a.up_primary = 123;
3149 a.acting.push_back(456);
3150 a.avail_no_missing.push_back(pg_shard_t(456, shard_id_t::NO_SHARD));
3151 set<pg_shard_t> sset = { pg_shard_t(0), pg_shard_t(1) };
3152 a.object_location_counts.insert(make_pair(sset, 10));
3153 sset.insert(pg_shard_t(2));
3154 a.object_location_counts.insert(make_pair(sset, 5));
3155 a.acting_primary = 456;
3156 o.push_back(new pg_stat_t(a));
3157
3158 a.up.push_back(124);
3159 a.up_primary = 124;
3160 a.acting.push_back(124);
3161 a.acting_primary = 124;
3162 a.blocked_by.push_back(155);
3163 a.blocked_by.push_back(156);
3164 o.push_back(new pg_stat_t(a));
3165 }
3166
3167 bool operator==(const pg_stat_t& l, const pg_stat_t& r)
3168 {
3169 return
3170 l.version == r.version &&
3171 l.reported_seq == r.reported_seq &&
3172 l.reported_epoch == r.reported_epoch &&
3173 l.state == r.state &&
3174 l.last_fresh == r.last_fresh &&
3175 l.last_change == r.last_change &&
3176 l.last_active == r.last_active &&
3177 l.last_peered == r.last_peered &&
3178 l.last_clean == r.last_clean &&
3179 l.last_unstale == r.last_unstale &&
3180 l.last_undegraded == r.last_undegraded &&
3181 l.last_fullsized == r.last_fullsized &&
3182 l.log_start == r.log_start &&
3183 l.ondisk_log_start == r.ondisk_log_start &&
3184 l.created == r.created &&
3185 l.last_epoch_clean == r.last_epoch_clean &&
3186 l.parent == r.parent &&
3187 l.parent_split_bits == r.parent_split_bits &&
3188 l.last_scrub == r.last_scrub &&
3189 l.last_deep_scrub == r.last_deep_scrub &&
3190 l.last_scrub_stamp == r.last_scrub_stamp &&
3191 l.last_deep_scrub_stamp == r.last_deep_scrub_stamp &&
3192 l.last_clean_scrub_stamp == r.last_clean_scrub_stamp &&
3193 l.stats == r.stats &&
3194 l.stats_invalid == r.stats_invalid &&
3195 l.log_size == r.log_size &&
3196 l.ondisk_log_size == r.ondisk_log_size &&
3197 l.up == r.up &&
3198 l.acting == r.acting &&
3199 l.avail_no_missing == r.avail_no_missing &&
3200 l.object_location_counts == r.object_location_counts &&
3201 l.mapping_epoch == r.mapping_epoch &&
3202 l.blocked_by == r.blocked_by &&
3203 l.last_became_active == r.last_became_active &&
3204 l.last_became_peered == r.last_became_peered &&
3205 l.dirty_stats_invalid == r.dirty_stats_invalid &&
3206 l.omap_stats_invalid == r.omap_stats_invalid &&
3207 l.hitset_stats_invalid == r.hitset_stats_invalid &&
3208 l.hitset_bytes_stats_invalid == r.hitset_bytes_stats_invalid &&
3209 l.up_primary == r.up_primary &&
3210 l.acting_primary == r.acting_primary &&
3211 l.pin_stats_invalid == r.pin_stats_invalid &&
3212 l.manifest_stats_invalid == r.manifest_stats_invalid &&
3213 l.purged_snaps == r.purged_snaps &&
3214 l.snaptrimq_len == r.snaptrimq_len &&
3215 l.last_scrub_duration == r.last_scrub_duration &&
3216 l.scrub_sched_status == r.scrub_sched_status &&
3217 l.objects_scrubbed == r.objects_scrubbed;
3218 }
3219
3220 // -- store_statfs_t --
3221
3222 bool store_statfs_t::operator==(const store_statfs_t& other) const
3223 {
3224 return total == other.total
3225 && available == other.available
3226 && allocated == other.allocated
3227 && internally_reserved == other.internally_reserved
3228 && data_stored == other.data_stored
3229 && data_compressed == other.data_compressed
3230 && data_compressed_allocated == other.data_compressed_allocated
3231 && data_compressed_original == other.data_compressed_original
3232 && omap_allocated == other.omap_allocated
3233 && internal_metadata == other.internal_metadata;
3234 }
3235
3236 void store_statfs_t::dump(Formatter *f) const
3237 {
3238 f->dump_int("total", total);
3239 f->dump_int("available", available);
3240 f->dump_int("internally_reserved", internally_reserved);
3241 f->dump_int("allocated", allocated);
3242 f->dump_int("data_stored", data_stored);
3243 f->dump_int("data_compressed", data_compressed);
3244 f->dump_int("data_compressed_allocated", data_compressed_allocated);
3245 f->dump_int("data_compressed_original", data_compressed_original);
3246 f->dump_int("omap_allocated", omap_allocated);
3247 f->dump_int("internal_metadata", internal_metadata);
3248 }
3249
3250 ostream& operator<<(ostream& out, const store_statfs_t &s)
3251 {
3252 out << std::hex
3253 << "store_statfs(0x" << s.available
3254 << "/0x" << s.internally_reserved
3255 << "/0x" << s.total
3256 << ", data 0x" << s.data_stored
3257 << "/0x" << s.allocated
3258 << ", compress 0x" << s.data_compressed
3259 << "/0x" << s.data_compressed_allocated
3260 << "/0x" << s.data_compressed_original
3261 << ", omap 0x" << s.omap_allocated
3262 << ", meta 0x" << s.internal_metadata
3263 << std::dec
3264 << ")";
3265 return out;
3266 }
3267
3268 void store_statfs_t::generate_test_instances(list<store_statfs_t*>& o)
3269 {
3270 store_statfs_t a;
3271 o.push_back(new store_statfs_t(a));
3272 a.total = 234;
3273 a.available = 123;
3274 a.internally_reserved = 33;
3275 a.allocated = 32;
3276 a.data_stored = 44;
3277 a.data_compressed = 21;
3278 a.data_compressed_allocated = 12;
3279 a.data_compressed_original = 13;
3280 a.omap_allocated = 14;
3281 a.internal_metadata = 15;
3282 o.push_back(new store_statfs_t(a));
3283 }
3284
3285 // -- pool_stat_t --
3286
3287 void pool_stat_t::dump(Formatter *f) const
3288 {
3289 stats.dump(f);
3290 f->open_object_section("store_stats");
3291 store_stats.dump(f);
3292 f->close_section();
3293 f->dump_int("log_size", log_size);
3294 f->dump_int("ondisk_log_size", ondisk_log_size);
3295 f->dump_int("up", up);
3296 f->dump_int("acting", acting);
3297 f->dump_int("num_store_stats", num_store_stats);
3298 }
3299
3300 void pool_stat_t::encode(ceph::buffer::list &bl, uint64_t features) const
3301 {
3302 using ceph::encode;
3303 if ((features & CEPH_FEATURE_OSDENC) == 0) {
3304 __u8 v = 4;
3305 encode(v, bl);
3306 encode(stats, bl);
3307 encode(log_size, bl);
3308 encode(ondisk_log_size, bl);
3309 return;
3310 }
3311
3312 ENCODE_START(7, 5, bl);
3313 encode(stats, bl);
3314 encode(log_size, bl);
3315 encode(ondisk_log_size, bl);
3316 encode(up, bl);
3317 encode(acting, bl);
3318 encode(store_stats, bl);
3319 encode(num_store_stats, bl);
3320 ENCODE_FINISH(bl);
3321 }
3322
3323 void pool_stat_t::decode(ceph::buffer::list::const_iterator &bl)
3324 {
3325 DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl);
3326 if (struct_v >= 4) {
3327 decode(stats, bl);
3328 decode(log_size, bl);
3329 decode(ondisk_log_size, bl);
3330 if (struct_v >= 6) {
3331 decode(up, bl);
3332 decode(acting, bl);
3333 } else {
3334 up = 0;
3335 acting = 0;
3336 }
3337 if (struct_v >= 7) {
3338 decode(store_stats, bl);
3339 decode(num_store_stats, bl);
3340 } else {
3341 store_stats.reset();
3342 num_store_stats = 0;
3343 }
3344
3345 } else {
3346 decode(stats.sum.num_bytes, bl);
3347 uint64_t num_kb;
3348 decode(num_kb, bl);
3349 decode(stats.sum.num_objects, bl);
3350 decode(stats.sum.num_object_clones, bl);
3351 decode(stats.sum.num_object_copies, bl);
3352 decode(stats.sum.num_objects_missing_on_primary, bl);
3353 decode(stats.sum.num_objects_degraded, bl);
3354 decode(log_size, bl);
3355 decode(ondisk_log_size, bl);
3356 if (struct_v >= 2) {
3357 decode(stats.sum.num_rd, bl);
3358 decode(stats.sum.num_rd_kb, bl);
3359 decode(stats.sum.num_wr, bl);
3360 decode(stats.sum.num_wr_kb, bl);
3361 }
3362 if (struct_v >= 3) {
3363 decode(stats.sum.num_objects_unfound, bl);
3364 }
3365 }
3366 DECODE_FINISH(bl);
3367 }
3368
3369 void pool_stat_t::generate_test_instances(list<pool_stat_t*>& o)
3370 {
3371 pool_stat_t a;
3372 o.push_back(new pool_stat_t(a));
3373
3374 list<object_stat_collection_t*> l;
3375 object_stat_collection_t::generate_test_instances(l);
3376 list<store_statfs_t*> ll;
3377 store_statfs_t::generate_test_instances(ll);
3378 a.stats = *l.back();
3379 a.store_stats = *ll.back();
3380 a.log_size = 123;
3381 a.ondisk_log_size = 456;
3382 a.acting = 3;
3383 a.up = 4;
3384 a.num_store_stats = 1;
3385 o.push_back(new pool_stat_t(a));
3386 }
3387
3388
3389 // -- pg_history_t --
3390
3391 void pg_history_t::encode(ceph::buffer::list &bl) const
3392 {
3393 ENCODE_START(10, 4, bl);
3394 encode(epoch_created, bl);
3395 encode(last_epoch_started, bl);
3396 encode(last_epoch_clean, bl);
3397 encode(last_epoch_split, bl);
3398 encode(same_interval_since, bl);
3399 encode(same_up_since, bl);
3400 encode(same_primary_since, bl);
3401 encode(last_scrub, bl);
3402 encode(last_scrub_stamp, bl);
3403 encode(last_deep_scrub, bl);
3404 encode(last_deep_scrub_stamp, bl);
3405 encode(last_clean_scrub_stamp, bl);
3406 encode(last_epoch_marked_full, bl);
3407 encode(last_interval_started, bl);
3408 encode(last_interval_clean, bl);
3409 encode(epoch_pool_created, bl);
3410 encode(prior_readable_until_ub, bl);
3411 ENCODE_FINISH(bl);
3412 }
3413
3414 void pg_history_t::decode(ceph::buffer::list::const_iterator &bl)
3415 {
3416 DECODE_START_LEGACY_COMPAT_LEN(10, 4, 4, bl);
3417 decode(epoch_created, bl);
3418 decode(last_epoch_started, bl);
3419 if (struct_v >= 3)
3420 decode(last_epoch_clean, bl);
3421 else
3422 last_epoch_clean = last_epoch_started; // careful, it's a lie!
3423 decode(last_epoch_split, bl);
3424 decode(same_interval_since, bl);
3425 decode(same_up_since, bl);
3426 decode(same_primary_since, bl);
3427 if (struct_v >= 2) {
3428 decode(last_scrub, bl);
3429 decode(last_scrub_stamp, bl);
3430 }
3431 if (struct_v >= 5) {
3432 decode(last_deep_scrub, bl);
3433 decode(last_deep_scrub_stamp, bl);
3434 }
3435 if (struct_v >= 6) {
3436 decode(last_clean_scrub_stamp, bl);
3437 }
3438 if (struct_v >= 7) {
3439 decode(last_epoch_marked_full, bl);
3440 }
3441 if (struct_v >= 8) {
3442 decode(last_interval_started, bl);
3443 decode(last_interval_clean, bl);
3444 } else {
3445 if (last_epoch_started >= same_interval_since) {
3446 last_interval_started = same_interval_since;
3447 } else {
3448 last_interval_started = last_epoch_started; // best guess
3449 }
3450 if (last_epoch_clean >= same_interval_since) {
3451 last_interval_clean = same_interval_since;
3452 } else {
3453 last_interval_clean = last_epoch_clean; // best guess
3454 }
3455 }
3456 if (struct_v >= 9) {
3457 decode(epoch_pool_created, bl);
3458 } else {
3459 epoch_pool_created = epoch_created;
3460 }
3461 if (struct_v >= 10) {
3462 decode(prior_readable_until_ub, bl);
3463 }
3464 DECODE_FINISH(bl);
3465 }
3466
3467 void pg_history_t::dump(Formatter *f) const
3468 {
3469 f->dump_int("epoch_created", epoch_created);
3470 f->dump_int("epoch_pool_created", epoch_pool_created);
3471 f->dump_int("last_epoch_started", last_epoch_started);
3472 f->dump_int("last_interval_started", last_interval_started);
3473 f->dump_int("last_epoch_clean", last_epoch_clean);
3474 f->dump_int("last_interval_clean", last_interval_clean);
3475 f->dump_int("last_epoch_split", last_epoch_split);
3476 f->dump_int("last_epoch_marked_full", last_epoch_marked_full);
3477 f->dump_int("same_up_since", same_up_since);
3478 f->dump_int("same_interval_since", same_interval_since);
3479 f->dump_int("same_primary_since", same_primary_since);
3480 f->dump_stream("last_scrub") << last_scrub;
3481 f->dump_stream("last_scrub_stamp") << last_scrub_stamp;
3482 f->dump_stream("last_deep_scrub") << last_deep_scrub;
3483 f->dump_stream("last_deep_scrub_stamp") << last_deep_scrub_stamp;
3484 f->dump_stream("last_clean_scrub_stamp") << last_clean_scrub_stamp;
3485 f->dump_float(
3486 "prior_readable_until_ub",
3487 std::chrono::duration<double>(prior_readable_until_ub).count());
3488 }
3489
3490 void pg_history_t::generate_test_instances(list<pg_history_t*>& o)
3491 {
3492 o.push_back(new pg_history_t);
3493 o.push_back(new pg_history_t);
3494 o.back()->epoch_created = 1;
3495 o.back()->epoch_pool_created = 1;
3496 o.back()->last_epoch_started = 2;
3497 o.back()->last_interval_started = 2;
3498 o.back()->last_epoch_clean = 3;
3499 o.back()->last_interval_clean = 2;
3500 o.back()->last_epoch_split = 4;
3501 o.back()->prior_readable_until_ub = make_timespan(3.1415);
3502 o.back()->same_up_since = 5;
3503 o.back()->same_interval_since = 6;
3504 o.back()->same_primary_since = 7;
3505 o.back()->last_scrub = eversion_t(8, 9);
3506 o.back()->last_scrub_stamp = utime_t(10, 11);
3507 o.back()->last_deep_scrub = eversion_t(12, 13);
3508 o.back()->last_deep_scrub_stamp = utime_t(14, 15);
3509 o.back()->last_clean_scrub_stamp = utime_t(16, 17);
3510 o.back()->last_epoch_marked_full = 18;
3511 }
3512
3513
3514 // -- pg_info_t --
3515
3516 void pg_info_t::encode(ceph::buffer::list &bl) const
3517 {
3518 ENCODE_START(32, 26, bl);
3519 encode(pgid.pgid, bl);
3520 encode(last_update, bl);
3521 encode(last_complete, bl);
3522 encode(log_tail, bl);
3523 encode(hobject_t(), bl); // old (nibblewise) last_backfill
3524 encode(stats, bl);
3525 history.encode(bl);
3526 encode(purged_snaps, bl);
3527 encode(last_epoch_started, bl);
3528 encode(last_user_version, bl);
3529 encode(hit_set, bl);
3530 encode(pgid.shard, bl);
3531 encode(last_backfill, bl);
3532 encode(true, bl); // was last_backfill_bitwise
3533 encode(last_interval_started, bl);
3534 ENCODE_FINISH(bl);
3535 }
3536
3537 void pg_info_t::decode(ceph::buffer::list::const_iterator &bl)
3538 {
3539 DECODE_START(32, bl);
3540 decode(pgid.pgid, bl);
3541 decode(last_update, bl);
3542 decode(last_complete, bl);
3543 decode(log_tail, bl);
3544 {
3545 hobject_t old_last_backfill;
3546 decode(old_last_backfill, bl);
3547 }
3548 decode(stats, bl);
3549 history.decode(bl);
3550 decode(purged_snaps, bl);
3551 decode(last_epoch_started, bl);
3552 decode(last_user_version, bl);
3553 decode(hit_set, bl);
3554 decode(pgid.shard, bl);
3555 decode(last_backfill, bl);
3556 {
3557 bool last_backfill_bitwise;
3558 decode(last_backfill_bitwise, bl);
3559 // note: we may see a false value here since the default value for
3560 // the member was false, so it often didn't get set to true until
3561 // peering progressed.
3562 }
3563 if (struct_v >= 32) {
3564 decode(last_interval_started, bl);
3565 } else {
3566 last_interval_started = last_epoch_started;
3567 }
3568 DECODE_FINISH(bl);
3569 }
3570
3571 // -- pg_info_t --
3572
3573 void pg_info_t::dump(Formatter *f) const
3574 {
3575 f->dump_stream("pgid") << pgid;
3576 f->dump_stream("last_update") << last_update;
3577 f->dump_stream("last_complete") << last_complete;
3578 f->dump_stream("log_tail") << log_tail;
3579 f->dump_int("last_user_version", last_user_version);
3580 f->dump_stream("last_backfill") << last_backfill;
3581 f->open_array_section("purged_snaps");
3582 for (interval_set<snapid_t>::const_iterator i=purged_snaps.begin();
3583 i != purged_snaps.end();
3584 ++i) {
3585 f->open_object_section("purged_snap_interval");
3586 f->dump_stream("start") << i.get_start();
3587 f->dump_stream("length") << i.get_len();
3588 f->close_section();
3589 }
3590 f->close_section();
3591 f->open_object_section("history");
3592 history.dump(f);
3593 f->close_section();
3594 f->open_object_section("stats");
3595 stats.dump(f);
3596 f->close_section();
3597
3598 f->dump_int("empty", is_empty());
3599 f->dump_int("dne", dne());
3600 f->dump_int("incomplete", is_incomplete());
3601 f->dump_int("last_epoch_started", last_epoch_started);
3602
3603 f->open_object_section("hit_set_history");
3604 hit_set.dump(f);
3605 f->close_section();
3606 }
3607
3608 void pg_info_t::generate_test_instances(list<pg_info_t*>& o)
3609 {
3610 o.push_back(new pg_info_t);
3611 o.push_back(new pg_info_t);
3612 list<pg_history_t*> h;
3613 pg_history_t::generate_test_instances(h);
3614 o.back()->history = *h.back();
3615 o.back()->pgid = spg_t(pg_t(1, 2), shard_id_t::NO_SHARD);
3616 o.back()->last_update = eversion_t(3, 4);
3617 o.back()->last_complete = eversion_t(5, 6);
3618 o.back()->last_user_version = 2;
3619 o.back()->log_tail = eversion_t(7, 8);
3620 o.back()->last_backfill = hobject_t(object_t("objname"), "key", 123, 456, -1, "");
3621 {
3622 list<pg_stat_t*> s;
3623 pg_stat_t::generate_test_instances(s);
3624 o.back()->stats = *s.back();
3625 }
3626 {
3627 list<pg_hit_set_history_t*> s;
3628 pg_hit_set_history_t::generate_test_instances(s);
3629 o.back()->hit_set = *s.back();
3630 }
3631 }
3632
3633 // -- pg_notify_t --
3634 void pg_notify_t::encode(ceph::buffer::list &bl) const
3635 {
3636 ENCODE_START(3, 2, bl);
3637 encode(query_epoch, bl);
3638 encode(epoch_sent, bl);
3639 encode(info, bl);
3640 encode(to, bl);
3641 encode(from, bl);
3642 encode(past_intervals, bl);
3643 ENCODE_FINISH(bl);
3644 }
3645
3646 void pg_notify_t::decode(ceph::buffer::list::const_iterator &bl)
3647 {
3648 DECODE_START(3, bl);
3649 decode(query_epoch, bl);
3650 decode(epoch_sent, bl);
3651 decode(info, bl);
3652 decode(to, bl);
3653 decode(from, bl);
3654 if (struct_v >= 3) {
3655 decode(past_intervals, bl);
3656 }
3657 DECODE_FINISH(bl);
3658 }
3659
3660 void pg_notify_t::dump(Formatter *f) const
3661 {
3662 f->dump_int("from", from);
3663 f->dump_int("to", to);
3664 f->dump_unsigned("query_epoch", query_epoch);
3665 f->dump_unsigned("epoch_sent", epoch_sent);
3666 {
3667 f->open_object_section("info");
3668 info.dump(f);
3669 f->close_section();
3670 }
3671 f->dump_object("past_intervals", past_intervals);
3672 }
3673
3674 void pg_notify_t::generate_test_instances(list<pg_notify_t*>& o)
3675 {
3676 o.push_back(new pg_notify_t(shard_id_t(3), shard_id_t::NO_SHARD, 1, 1,
3677 pg_info_t(), PastIntervals()));
3678 o.push_back(new pg_notify_t(shard_id_t(0), shard_id_t(0), 3, 10,
3679 pg_info_t(), PastIntervals()));
3680 }
3681
3682 ostream &operator<<(ostream &lhs, const pg_notify_t &notify)
3683 {
3684 lhs << "(query:" << notify.query_epoch
3685 << " sent:" << notify.epoch_sent
3686 << " " << notify.info;
3687 if (notify.from != shard_id_t::NO_SHARD ||
3688 notify.to != shard_id_t::NO_SHARD)
3689 lhs << " " << (unsigned)notify.from
3690 << "->" << (unsigned)notify.to;
3691 lhs << " " << notify.past_intervals;
3692 return lhs << ")";
3693 }
3694
3695 // -- pg_interval_t --
3696
3697 void PastIntervals::pg_interval_t::encode(ceph::buffer::list& bl) const
3698 {
3699 ENCODE_START(4, 2, bl);
3700 encode(first, bl);
3701 encode(last, bl);
3702 encode(up, bl);
3703 encode(acting, bl);
3704 encode(maybe_went_rw, bl);
3705 encode(primary, bl);
3706 encode(up_primary, bl);
3707 ENCODE_FINISH(bl);
3708 }
3709
3710 void PastIntervals::pg_interval_t::decode(ceph::buffer::list::const_iterator& bl)
3711 {
3712 DECODE_START_LEGACY_COMPAT_LEN(4, 2, 2, bl);
3713 decode(first, bl);
3714 decode(last, bl);
3715 decode(up, bl);
3716 decode(acting, bl);
3717 decode(maybe_went_rw, bl);
3718 if (struct_v >= 3) {
3719 decode(primary, bl);
3720 } else {
3721 if (acting.size())
3722 primary = acting[0];
3723 }
3724 if (struct_v >= 4) {
3725 decode(up_primary, bl);
3726 } else {
3727 if (up.size())
3728 up_primary = up[0];
3729 }
3730 DECODE_FINISH(bl);
3731 }
3732
3733 void PastIntervals::pg_interval_t::dump(Formatter *f) const
3734 {
3735 f->dump_unsigned("first", first);
3736 f->dump_unsigned("last", last);
3737 f->dump_int("maybe_went_rw", maybe_went_rw ? 1 : 0);
3738 f->open_array_section("up");
3739 for (auto p = up.cbegin(); p != up.cend(); ++p)
3740 f->dump_int("osd", *p);
3741 f->close_section();
3742 f->open_array_section("acting");
3743 for (auto p = acting.cbegin(); p != acting.cend(); ++p)
3744 f->dump_int("osd", *p);
3745 f->close_section();
3746 f->dump_int("primary", primary);
3747 f->dump_int("up_primary", up_primary);
3748 }
3749
3750 void PastIntervals::pg_interval_t::generate_test_instances(list<pg_interval_t*>& o)
3751 {
3752 o.push_back(new pg_interval_t);
3753 o.push_back(new pg_interval_t);
3754 o.back()->up.push_back(1);
3755 o.back()->acting.push_back(2);
3756 o.back()->acting.push_back(3);
3757 o.back()->first = 4;
3758 o.back()->last = 5;
3759 o.back()->maybe_went_rw = true;
3760 }
3761
3762 WRITE_CLASS_ENCODER(PastIntervals::pg_interval_t)
3763
3764
3765 /**
3766 * pi_compact_rep
3767 *
3768 * PastIntervals only needs to be able to answer two questions:
3769 * 1) Where should the primary look for unfound objects?
3770 * 2) List a set of subsets of the OSDs such that contacting at least
3771 * one from each subset guarantees we speak to at least one witness
3772 * of any completed write.
3773 *
3774 * Crucially, 2) does not require keeping *all* past intervals. Certainly,
3775 * we don't need to keep any where maybe_went_rw would be false. We also
3776 * needn't keep two intervals where the actingset in one is a subset
3777 * of the other (only need to keep the smaller of the two sets). In order
3778 * to accurately trim the set of intervals as last_epoch_started changes
3779 * without rebuilding the set from scratch, we'll retain the larger set
3780 * if it in an older interval.
3781 */
3782 struct compact_interval_t {
3783 epoch_t first;
3784 epoch_t last;
3785 set<pg_shard_t> acting;
3786 bool supersedes(const compact_interval_t &other) {
3787 for (auto &&i: acting) {
3788 if (!other.acting.count(i))
3789 return false;
3790 }
3791 return true;
3792 }
3793 void dump(Formatter *f) const {
3794 f->open_object_section("compact_interval_t");
3795 f->dump_stream("first") << first;
3796 f->dump_stream("last") << last;
3797 f->dump_stream("acting") << acting;
3798 f->close_section();
3799 }
3800 void encode(ceph::buffer::list &bl) const {
3801 ENCODE_START(1, 1, bl);
3802 encode(first, bl);
3803 encode(last, bl);
3804 encode(acting, bl);
3805 ENCODE_FINISH(bl);
3806 }
3807 void decode(ceph::buffer::list::const_iterator &bl) {
3808 DECODE_START(1, bl);
3809 decode(first, bl);
3810 decode(last, bl);
3811 decode(acting, bl);
3812 DECODE_FINISH(bl);
3813 }
3814 static void generate_test_instances(list<compact_interval_t*> & o) {
3815 /* Not going to be used, we'll generate pi_compact_rep directly */
3816 }
3817 };
3818 ostream &operator<<(ostream &o, const compact_interval_t &rhs)
3819 {
3820 return o << "([" << rhs.first << "," << rhs.last
3821 << "] acting " << rhs.acting << ")";
3822 }
3823 WRITE_CLASS_ENCODER(compact_interval_t)
3824
3825 class pi_compact_rep : public PastIntervals::interval_rep {
3826 epoch_t first = 0;
3827 epoch_t last = 0; // inclusive
3828 set<pg_shard_t> all_participants;
3829 list<compact_interval_t> intervals;
3830 pi_compact_rep(
3831 bool ec_pool,
3832 std::list<PastIntervals::pg_interval_t> &&intervals) {
3833 for (auto &&i: intervals)
3834 add_interval(ec_pool, i);
3835 }
3836 public:
3837 pi_compact_rep() = default;
3838 pi_compact_rep(const pi_compact_rep &) = default;
3839 pi_compact_rep(pi_compact_rep &&) = default;
3840 pi_compact_rep &operator=(const pi_compact_rep &) = default;
3841 pi_compact_rep &operator=(pi_compact_rep &&) = default;
3842
3843 size_t size() const override { return intervals.size(); }
3844 bool empty() const override {
3845 return first > last || (first == 0 && last == 0);
3846 }
3847 void clear() override {
3848 *this = pi_compact_rep();
3849 }
3850 pair<epoch_t, epoch_t> get_bounds() const override {
3851 return make_pair(first, last + 1);
3852 }
3853 void adjust_start_backwards(epoch_t last_epoch_clean) override {
3854 first = last_epoch_clean;
3855 }
3856
3857 set<pg_shard_t> get_all_participants(
3858 bool ec_pool) const override {
3859 return all_participants;
3860 }
3861 void add_interval(
3862 bool ec_pool, const PastIntervals::pg_interval_t &interval) override {
3863 if (first == 0)
3864 first = interval.first;
3865 ceph_assert(interval.last > last);
3866 last = interval.last;
3867 set<pg_shard_t> acting;
3868 for (unsigned i = 0; i < interval.acting.size(); ++i) {
3869 if (interval.acting[i] == CRUSH_ITEM_NONE)
3870 continue;
3871 acting.insert(
3872 pg_shard_t(
3873 interval.acting[i],
3874 ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
3875 }
3876 all_participants.insert(acting.begin(), acting.end());
3877 if (!interval.maybe_went_rw)
3878 return;
3879 intervals.push_back(
3880 compact_interval_t{interval.first, interval.last, acting});
3881 auto plast = intervals.end();
3882 --plast;
3883 for (auto cur = intervals.begin(); cur != plast; ) {
3884 if (plast->supersedes(*cur)) {
3885 intervals.erase(cur++);
3886 } else {
3887 ++cur;
3888 }
3889 }
3890 }
3891 unique_ptr<PastIntervals::interval_rep> clone() const override {
3892 return unique_ptr<PastIntervals::interval_rep>(new pi_compact_rep(*this));
3893 }
3894 ostream &print(ostream &out) const override {
3895 return out << "([" << first << "," << last
3896 << "] all_participants=" << all_participants
3897 << " intervals=" << intervals << ")";
3898 }
3899 void encode(ceph::buffer::list &bl) const override {
3900 ENCODE_START(1, 1, bl);
3901 encode(first, bl);
3902 encode(last, bl);
3903 encode(all_participants, bl);
3904 encode(intervals, bl);
3905 ENCODE_FINISH(bl);
3906 }
3907 void decode(ceph::buffer::list::const_iterator &bl) override {
3908 DECODE_START(1, bl);
3909 decode(first, bl);
3910 decode(last, bl);
3911 decode(all_participants, bl);
3912 decode(intervals, bl);
3913 DECODE_FINISH(bl);
3914 }
3915 void dump(Formatter *f) const override {
3916 f->open_object_section("PastIntervals::compact_rep");
3917 f->dump_stream("first") << first;
3918 f->dump_stream("last") << last;
3919 f->open_array_section("all_participants");
3920 for (auto& i : all_participants) {
3921 f->dump_object("pg_shard", i);
3922 }
3923 f->close_section();
3924 f->open_array_section("intervals");
3925 for (auto &&i: intervals) {
3926 i.dump(f);
3927 }
3928 f->close_section();
3929 f->close_section();
3930 }
3931 static void generate_test_instances(list<pi_compact_rep*> &o) {
3932 using ival = PastIntervals::pg_interval_t;
3933 using ivallst = std::list<ival>;
3934 o.push_back(
3935 new pi_compact_rep(
3936 true, ivallst
3937 { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
3938 , ival{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1}
3939 , ival{{ 2}, { 2}, 31, 35, false, 2, 2}
3940 , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
3941 }));
3942 o.push_back(
3943 new pi_compact_rep(
3944 false, ivallst
3945 { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
3946 , ival{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1}
3947 , ival{{ 2}, { 2}, 31, 35, false, 2, 2}
3948 , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
3949 }));
3950 o.push_back(
3951 new pi_compact_rep(
3952 true, ivallst
3953 { ival{{2, 1, 0}, {2, 1, 0}, 10, 20, true, 1, 1}
3954 , ival{{ 0, 2}, { 0, 2}, 21, 30, true, 0, 0}
3955 , ival{{ 0, 2}, {2, 0}, 31, 35, true, 2, 2}
3956 , ival{{ 0, 2}, { 0, 2}, 36, 50, true, 0, 0}
3957 }));
3958 }
3959 void iterate_mayberw_back_to(
3960 epoch_t les,
3961 std::function<void(epoch_t, const set<pg_shard_t> &)> &&f) const override {
3962 for (auto i = intervals.rbegin(); i != intervals.rend(); ++i) {
3963 if (i->last < les)
3964 break;
3965 f(i->first, i->acting);
3966 }
3967 }
3968 virtual ~pi_compact_rep() override {}
3969 };
3970 WRITE_CLASS_ENCODER(pi_compact_rep)
3971
3972 PastIntervals::PastIntervals()
3973 {
3974 past_intervals.reset(new pi_compact_rep);
3975 }
3976
3977 PastIntervals::PastIntervals(const PastIntervals &rhs)
3978 : past_intervals(rhs.past_intervals ?
3979 rhs.past_intervals->clone() :
3980 nullptr) {}
3981
3982 PastIntervals &PastIntervals::operator=(const PastIntervals &rhs)
3983 {
3984 PastIntervals other(rhs);
3985 swap(other);
3986 return *this;
3987 }
3988
3989 ostream& operator<<(ostream& out, const PastIntervals &i)
3990 {
3991 if (i.past_intervals) {
3992 return i.past_intervals->print(out);
3993 } else {
3994 return out << "(empty)";
3995 }
3996 }
3997
3998 ostream& operator<<(ostream& out, const PastIntervals::PriorSet &i)
3999 {
4000 return out << "PriorSet("
4001 << "ec_pool: " << i.ec_pool
4002 << ", probe: " << i.probe
4003 << ", down: " << i.down
4004 << ", blocked_by: " << i.blocked_by
4005 << ", pg_down: " << i.pg_down
4006 << ")";
4007 }
4008
4009 void PastIntervals::decode(ceph::buffer::list::const_iterator &bl)
4010 {
4011 DECODE_START(1, bl);
4012 __u8 type = 0;
4013 decode(type, bl);
4014 switch (type) {
4015 case 0:
4016 break;
4017 case 1:
4018 ceph_abort_msg("pi_simple_rep support removed post-luminous");
4019 break;
4020 case 2:
4021 past_intervals.reset(new pi_compact_rep);
4022 past_intervals->decode(bl);
4023 break;
4024 }
4025 DECODE_FINISH(bl);
4026 }
4027
4028 void PastIntervals::generate_test_instances(list<PastIntervals*> &o)
4029 {
4030 {
4031 list<pi_compact_rep *> compact;
4032 pi_compact_rep::generate_test_instances(compact);
4033 for (auto &&i: compact) {
4034 // takes ownership of contents
4035 o.push_back(new PastIntervals(i));
4036 }
4037 }
4038 return;
4039 }
4040
4041 bool PastIntervals::is_new_interval(
4042 int old_acting_primary,
4043 int new_acting_primary,
4044 const vector<int> &old_acting,
4045 const vector<int> &new_acting,
4046 int old_up_primary,
4047 int new_up_primary,
4048 const vector<int> &old_up,
4049 const vector<int> &new_up,
4050 int old_size,
4051 int new_size,
4052 int old_min_size,
4053 int new_min_size,
4054 unsigned old_pg_num,
4055 unsigned new_pg_num,
4056 unsigned old_pg_num_pending,
4057 unsigned new_pg_num_pending,
4058 bool old_sort_bitwise,
4059 bool new_sort_bitwise,
4060 bool old_recovery_deletes,
4061 bool new_recovery_deletes,
4062 uint32_t old_crush_count,
4063 uint32_t new_crush_count,
4064 uint32_t old_crush_target,
4065 uint32_t new_crush_target,
4066 uint32_t old_crush_barrier,
4067 uint32_t new_crush_barrier,
4068 int32_t old_crush_member,
4069 int32_t new_crush_member,
4070 pg_t pgid) {
4071 return old_acting_primary != new_acting_primary ||
4072 new_acting != old_acting ||
4073 old_up_primary != new_up_primary ||
4074 new_up != old_up ||
4075 old_min_size != new_min_size ||
4076 old_size != new_size ||
4077 pgid.is_split(old_pg_num, new_pg_num, 0) ||
4078 // (is or was) pre-merge source
4079 pgid.is_merge_source(old_pg_num_pending, new_pg_num_pending, 0) ||
4080 pgid.is_merge_source(new_pg_num_pending, old_pg_num_pending, 0) ||
4081 // merge source
4082 pgid.is_merge_source(old_pg_num, new_pg_num, 0) ||
4083 // (is or was) pre-merge target
4084 pgid.is_merge_target(old_pg_num_pending, new_pg_num_pending) ||
4085 pgid.is_merge_target(new_pg_num_pending, old_pg_num_pending) ||
4086 // merge target
4087 pgid.is_merge_target(old_pg_num, new_pg_num) ||
4088 old_sort_bitwise != new_sort_bitwise ||
4089 old_recovery_deletes != new_recovery_deletes ||
4090 old_crush_count != new_crush_count ||
4091 old_crush_target != new_crush_target ||
4092 old_crush_barrier != new_crush_barrier ||
4093 old_crush_member != new_crush_member;
4094 }
4095
4096 bool PastIntervals::is_new_interval(
4097 int old_acting_primary,
4098 int new_acting_primary,
4099 const vector<int> &old_acting,
4100 const vector<int> &new_acting,
4101 int old_up_primary,
4102 int new_up_primary,
4103 const vector<int> &old_up,
4104 const vector<int> &new_up,
4105 const OSDMap *osdmap,
4106 const OSDMap *lastmap,
4107 pg_t pgid)
4108 {
4109 const pg_pool_t *plast = lastmap->get_pg_pool(pgid.pool());
4110 if (!plast) {
4111 return false; // after pool is deleted there are no more interval changes
4112 }
4113 const pg_pool_t *pi = osdmap->get_pg_pool(pgid.pool());
4114 if (!pi) {
4115 return true; // pool was deleted this epoch -> (final!) interval change
4116 }
4117 return
4118 is_new_interval(old_acting_primary,
4119 new_acting_primary,
4120 old_acting,
4121 new_acting,
4122 old_up_primary,
4123 new_up_primary,
4124 old_up,
4125 new_up,
4126 plast->size,
4127 pi->size,
4128 plast->min_size,
4129 pi->min_size,
4130 plast->get_pg_num(),
4131 pi->get_pg_num(),
4132 plast->get_pg_num_pending(),
4133 pi->get_pg_num_pending(),
4134 lastmap->test_flag(CEPH_OSDMAP_SORTBITWISE),
4135 osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE),
4136 lastmap->test_flag(CEPH_OSDMAP_RECOVERY_DELETES),
4137 osdmap->test_flag(CEPH_OSDMAP_RECOVERY_DELETES),
4138 plast->peering_crush_bucket_count, pi->peering_crush_bucket_count,
4139 plast->peering_crush_bucket_target, pi->peering_crush_bucket_target,
4140 plast->peering_crush_bucket_barrier, pi->peering_crush_bucket_barrier,
4141 plast->peering_crush_mandatory_member, pi->peering_crush_mandatory_member,
4142 pgid);
4143 }
4144
4145 bool PastIntervals::check_new_interval(
4146 int old_acting_primary,
4147 int new_acting_primary,
4148 const vector<int> &old_acting,
4149 const vector<int> &new_acting,
4150 int old_up_primary,
4151 int new_up_primary,
4152 const vector<int> &old_up,
4153 const vector<int> &new_up,
4154 epoch_t same_interval_since,
4155 epoch_t last_epoch_clean,
4156 const OSDMap *osdmap,
4157 const OSDMap *lastmap,
4158 pg_t pgid,
4159 const IsPGRecoverablePredicate &could_have_gone_active,
4160 PastIntervals *past_intervals,
4161 std::ostream *out)
4162 {
4163 /*
4164 * We have to be careful to gracefully deal with situations like
4165 * so. Say we have a power outage or something that takes out both
4166 * OSDs, but the monitor doesn't mark them down in the same epoch.
4167 * The history may look like
4168 *
4169 * 1: A B
4170 * 2: B
4171 * 3: let's say B dies for good, too (say, from the power spike)
4172 * 4: A
4173 *
4174 * which makes it look like B may have applied updates to the PG
4175 * that we need in order to proceed. This sucks...
4176 *
4177 * To minimize the risk of this happening, we CANNOT go active if
4178 * _any_ OSDs in the prior set are down until we send an MOSDAlive
4179 * to the monitor such that the OSDMap sets osd_up_thru to an epoch.
4180 * Then, we have something like
4181 *
4182 * 1: A B
4183 * 2: B up_thru[B]=0
4184 * 3:
4185 * 4: A
4186 *
4187 * -> we can ignore B, bc it couldn't have gone active (up_thru still 0).
4188 *
4189 * or,
4190 *
4191 * 1: A B
4192 * 2: B up_thru[B]=0
4193 * 3: B up_thru[B]=2
4194 * 4:
4195 * 5: A
4196 *
4197 * -> we must wait for B, bc it was alive through 2, and could have
4198 * written to the pg.
4199 *
4200 * If B is really dead, then an administrator will need to manually
4201 * intervene by marking the OSD as "lost."
4202 */
4203
4204 // remember past interval
4205 // NOTE: a change in the up set primary triggers an interval
4206 // change, even though the interval members in the pg_interval_t
4207 // do not change.
4208 ceph_assert(past_intervals);
4209 ceph_assert(past_intervals->past_intervals);
4210 if (is_new_interval(
4211 old_acting_primary,
4212 new_acting_primary,
4213 old_acting,
4214 new_acting,
4215 old_up_primary,
4216 new_up_primary,
4217 old_up,
4218 new_up,
4219 osdmap,
4220 lastmap,
4221 pgid)) {
4222 pg_interval_t i;
4223 i.first = same_interval_since;
4224 i.last = osdmap->get_epoch() - 1;
4225 ceph_assert(i.first <= i.last);
4226 i.acting = old_acting;
4227 i.up = old_up;
4228 i.primary = old_acting_primary;
4229 i.up_primary = old_up_primary;
4230
4231 unsigned num_acting = 0;
4232 for (auto p = i.acting.cbegin(); p != i.acting.cend(); ++p)
4233 if (*p != CRUSH_ITEM_NONE)
4234 ++num_acting;
4235
4236 ceph_assert(lastmap->get_pools().count(pgid.pool()));
4237 const pg_pool_t& old_pg_pool = lastmap->get_pools().find(pgid.pool())->second;
4238 set<pg_shard_t> old_acting_shards;
4239 old_pg_pool.convert_to_pg_shards(old_acting, &old_acting_shards);
4240
4241 if (num_acting &&
4242 i.primary != -1 &&
4243 num_acting >= old_pg_pool.min_size &&
4244 (!old_pg_pool.is_stretch_pool() ||
4245 old_pg_pool.stretch_set_can_peer(old_acting, *lastmap, out)) &&
4246 could_have_gone_active(old_acting_shards)) {
4247 if (out)
4248 *out << __func__ << " " << i
4249 << " up_thru " << lastmap->get_up_thru(i.primary)
4250 << " up_from " << lastmap->get_up_from(i.primary)
4251 << " last_epoch_clean " << last_epoch_clean;
4252 if (lastmap->get_up_thru(i.primary) >= i.first &&
4253 lastmap->get_up_from(i.primary) <= i.first) {
4254 i.maybe_went_rw = true;
4255 if (out)
4256 *out << " " << i
4257 << " : primary up " << lastmap->get_up_from(i.primary)
4258 << "-" << lastmap->get_up_thru(i.primary)
4259 << " includes interval"
4260 << std::endl;
4261 } else if (last_epoch_clean >= i.first &&
4262 last_epoch_clean <= i.last) {
4263 // If the last_epoch_clean is included in this interval, then
4264 // the pg must have been rw (for recovery to have completed).
4265 // This is important because we won't know the _real_
4266 // first_epoch because we stop at last_epoch_clean, and we
4267 // don't want the oldest interval to randomly have
4268 // maybe_went_rw false depending on the relative up_thru vs
4269 // last_epoch_clean timing.
4270 i.maybe_went_rw = true;
4271 if (out)
4272 *out << " " << i
4273 << " : includes last_epoch_clean " << last_epoch_clean
4274 << " and presumed to have been rw"
4275 << std::endl;
4276 } else {
4277 i.maybe_went_rw = false;
4278 if (out)
4279 *out << " " << i
4280 << " : primary up " << lastmap->get_up_from(i.primary)
4281 << "-" << lastmap->get_up_thru(i.primary)
4282 << " does not include interval"
4283 << std::endl;
4284 }
4285 } else {
4286 i.maybe_went_rw = false;
4287 if (out)
4288 *out << __func__ << " " << i << " : acting set is too small" << std::endl;
4289 }
4290 past_intervals->past_intervals->add_interval(old_pg_pool.is_erasure(), i);
4291 return true;
4292 } else {
4293 return false;
4294 }
4295 }
4296
4297 // true if the given map affects the prior set
4298 bool PastIntervals::PriorSet::affected_by_map(
4299 const OSDMap &osdmap,
4300 const DoutPrefixProvider *dpp) const
4301 {
4302 for (auto p = probe.begin(); p != probe.end(); ++p) {
4303 int o = p->osd;
4304
4305 // did someone in the prior set go down?
4306 if (osdmap.is_down(o) && down.count(o) == 0) {
4307 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " now down" << dendl;
4308 return true;
4309 }
4310
4311 // did a down osd in cur get (re)marked as lost?
4312 auto r = blocked_by.find(o);
4313 if (r != blocked_by.end()) {
4314 if (!osdmap.exists(o)) {
4315 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " no longer exists" << dendl;
4316 return true;
4317 }
4318 if (osdmap.get_info(o).lost_at != r->second) {
4319 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " (re)marked as lost" << dendl;
4320 return true;
4321 }
4322 }
4323 }
4324
4325 // did someone in the prior down set go up?
4326 for (auto p = down.cbegin(); p != down.cend(); ++p) {
4327 int o = *p;
4328
4329 if (osdmap.is_up(o)) {
4330 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " now up" << dendl;
4331 return true;
4332 }
4333
4334 // did someone in the prior set get lost or destroyed?
4335 if (!osdmap.exists(o)) {
4336 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " no longer exists" << dendl;
4337 return true;
4338 }
4339 // did a down osd in down get (re)marked as lost?
4340 auto r = blocked_by.find(o);
4341 if (r != blocked_by.end()) {
4342 if (osdmap.get_info(o).lost_at != r->second) {
4343 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " (re)marked as lost" << dendl;
4344 return true;
4345 }
4346 }
4347 }
4348
4349 return false;
4350 }
4351
4352 ostream& operator<<(ostream& out, const PastIntervals::pg_interval_t& i)
4353 {
4354 out << "interval(" << i.first << "-" << i.last
4355 << " up " << i.up << "(" << i.up_primary << ")"
4356 << " acting " << i.acting << "(" << i.primary << ")";
4357 if (i.maybe_went_rw)
4358 out << " maybe_went_rw";
4359 out << ")";
4360 return out;
4361 }
4362
4363
4364
4365 // -- pg_query_t --
4366
4367 void pg_query_t::encode(ceph::buffer::list &bl, uint64_t features) const {
4368 ENCODE_START(3, 3, bl);
4369 encode(type, bl);
4370 encode(since, bl);
4371 history.encode(bl);
4372 encode(epoch_sent, bl);
4373 encode(to, bl);
4374 encode(from, bl);
4375 ENCODE_FINISH(bl);
4376 }
4377
4378 void pg_query_t::decode(ceph::buffer::list::const_iterator &bl) {
4379 DECODE_START(3, bl);
4380 decode(type, bl);
4381 decode(since, bl);
4382 history.decode(bl);
4383 decode(epoch_sent, bl);
4384 decode(to, bl);
4385 decode(from, bl);
4386 DECODE_FINISH(bl);
4387 }
4388
4389 void pg_query_t::dump(Formatter *f) const
4390 {
4391 f->dump_int("from", from);
4392 f->dump_int("to", to);
4393 f->dump_string("type", get_type_name());
4394 f->dump_stream("since") << since;
4395 f->dump_stream("epoch_sent") << epoch_sent;
4396 f->open_object_section("history");
4397 history.dump(f);
4398 f->close_section();
4399 }
4400 void pg_query_t::generate_test_instances(list<pg_query_t*>& o)
4401 {
4402 o.push_back(new pg_query_t());
4403 list<pg_history_t*> h;
4404 pg_history_t::generate_test_instances(h);
4405 o.push_back(new pg_query_t(pg_query_t::INFO, shard_id_t(1), shard_id_t(2), *h.back(), 4));
4406 o.push_back(new pg_query_t(pg_query_t::MISSING, shard_id_t(2), shard_id_t(3), *h.back(), 4));
4407 o.push_back(new pg_query_t(pg_query_t::LOG, shard_id_t(0), shard_id_t(0),
4408 eversion_t(4, 5), *h.back(), 4));
4409 o.push_back(new pg_query_t(pg_query_t::FULLLOG,
4410 shard_id_t::NO_SHARD, shard_id_t::NO_SHARD,
4411 *h.back(), 5));
4412 }
4413
4414 // -- pg_lease_t --
4415
4416 void pg_lease_t::encode(bufferlist& bl) const
4417 {
4418 ENCODE_START(1, 1, bl);
4419 encode(readable_until, bl);
4420 encode(readable_until_ub, bl);
4421 encode(interval, bl);
4422 ENCODE_FINISH(bl);
4423 }
4424
4425 void pg_lease_t::decode(bufferlist::const_iterator& p)
4426 {
4427 DECODE_START(1, p);
4428 decode(readable_until, p);
4429 decode(readable_until_ub, p);
4430 decode(interval, p);
4431 DECODE_FINISH(p);
4432 }
4433
4434 void pg_lease_t::dump(Formatter *f) const
4435 {
4436 f->dump_stream("readable_until") << readable_until;
4437 f->dump_stream("readable_until_ub") << readable_until_ub;
4438 f->dump_stream("interval") << interval;
4439 }
4440
4441 void pg_lease_t::generate_test_instances(std::list<pg_lease_t*>& o)
4442 {
4443 o.push_back(new pg_lease_t());
4444 o.push_back(new pg_lease_t());
4445 o.back()->readable_until = make_timespan(1.5);
4446 o.back()->readable_until_ub = make_timespan(3.4);
4447 o.back()->interval = make_timespan(1.0);
4448 }
4449
4450 // -- pg_lease_ack_t --
4451
4452 void pg_lease_ack_t::encode(bufferlist& bl) const
4453 {
4454 ENCODE_START(1, 1, bl);
4455 encode(readable_until_ub, bl);
4456 ENCODE_FINISH(bl);
4457 }
4458
4459 void pg_lease_ack_t::decode(bufferlist::const_iterator& p)
4460 {
4461 DECODE_START(1, p);
4462 decode(readable_until_ub, p);
4463 DECODE_FINISH(p);
4464 }
4465
4466 void pg_lease_ack_t::dump(Formatter *f) const
4467 {
4468 f->dump_stream("readable_until_ub") << readable_until_ub;
4469 }
4470
4471 void pg_lease_ack_t::generate_test_instances(std::list<pg_lease_ack_t*>& o)
4472 {
4473 o.push_back(new pg_lease_ack_t());
4474 o.push_back(new pg_lease_ack_t());
4475 o.back()->readable_until_ub = make_timespan(3.4);
4476 }
4477
4478
4479 // -- ObjectModDesc --
4480 void ObjectModDesc::visit(Visitor *visitor) const
4481 {
4482 auto bp = bl.cbegin();
4483 try {
4484 while (!bp.end()) {
4485 DECODE_START(max_required_version, bp);
4486 uint8_t code;
4487 decode(code, bp);
4488 switch (code) {
4489 case APPEND: {
4490 uint64_t size;
4491 decode(size, bp);
4492 visitor->append(size);
4493 break;
4494 }
4495 case SETATTRS: {
4496 map<string, std::optional<ceph::buffer::list> > attrs;
4497 decode(attrs, bp);
4498 visitor->setattrs(attrs);
4499 break;
4500 }
4501 case DELETE: {
4502 version_t old_version;
4503 decode(old_version, bp);
4504 visitor->rmobject(old_version);
4505 break;
4506 }
4507 case CREATE: {
4508 visitor->create();
4509 break;
4510 }
4511 case UPDATE_SNAPS: {
4512 set<snapid_t> snaps;
4513 decode(snaps, bp);
4514 visitor->update_snaps(snaps);
4515 break;
4516 }
4517 case TRY_DELETE: {
4518 version_t old_version;
4519 decode(old_version, bp);
4520 visitor->try_rmobject(old_version);
4521 break;
4522 }
4523 case ROLLBACK_EXTENTS: {
4524 vector<pair<uint64_t, uint64_t> > extents;
4525 version_t gen;
4526 decode(gen, bp);
4527 decode(extents, bp);
4528 visitor->rollback_extents(gen,extents);
4529 break;
4530 }
4531 default:
4532 ceph_abort_msg("Invalid rollback code");
4533 }
4534 DECODE_FINISH(bp);
4535 }
4536 } catch (...) {
4537 ceph_abort_msg("Invalid encoding");
4538 }
4539 }
4540
4541 struct DumpVisitor : public ObjectModDesc::Visitor {
4542 Formatter *f;
4543 explicit DumpVisitor(Formatter *f) : f(f) {}
4544 void append(uint64_t old_size) override {
4545 f->open_object_section("op");
4546 f->dump_string("code", "APPEND");
4547 f->dump_unsigned("old_size", old_size);
4548 f->close_section();
4549 }
4550 void setattrs(map<string, std::optional<ceph::buffer::list> > &attrs) override {
4551 f->open_object_section("op");
4552 f->dump_string("code", "SETATTRS");
4553 f->open_array_section("attrs");
4554 for (auto i = attrs.begin(); i != attrs.end(); ++i) {
4555 f->dump_string("attr_name", i->first);
4556 }
4557 f->close_section();
4558 f->close_section();
4559 }
4560 void rmobject(version_t old_version) override {
4561 f->open_object_section("op");
4562 f->dump_string("code", "RMOBJECT");
4563 f->dump_unsigned("old_version", old_version);
4564 f->close_section();
4565 }
4566 void try_rmobject(version_t old_version) override {
4567 f->open_object_section("op");
4568 f->dump_string("code", "TRY_RMOBJECT");
4569 f->dump_unsigned("old_version", old_version);
4570 f->close_section();
4571 }
4572 void create() override {
4573 f->open_object_section("op");
4574 f->dump_string("code", "CREATE");
4575 f->close_section();
4576 }
4577 void update_snaps(const set<snapid_t> &snaps) override {
4578 f->open_object_section("op");
4579 f->dump_string("code", "UPDATE_SNAPS");
4580 f->dump_stream("snaps") << snaps;
4581 f->close_section();
4582 }
4583 void rollback_extents(
4584 version_t gen,
4585 const vector<pair<uint64_t, uint64_t> > &extents) override {
4586 f->open_object_section("op");
4587 f->dump_string("code", "ROLLBACK_EXTENTS");
4588 f->dump_unsigned("gen", gen);
4589 f->dump_stream("snaps") << extents;
4590 f->close_section();
4591 }
4592 };
4593
4594 void ObjectModDesc::dump(Formatter *f) const
4595 {
4596 f->open_object_section("object_mod_desc");
4597 f->dump_bool("can_local_rollback", can_local_rollback);
4598 f->dump_bool("rollback_info_completed", rollback_info_completed);
4599 {
4600 f->open_array_section("ops");
4601 DumpVisitor vis(f);
4602 visit(&vis);
4603 f->close_section();
4604 }
4605 f->close_section();
4606 }
4607
4608 void ObjectModDesc::generate_test_instances(list<ObjectModDesc*>& o)
4609 {
4610 map<string, std::optional<ceph::buffer::list> > attrs;
4611 attrs[OI_ATTR];
4612 attrs[SS_ATTR];
4613 attrs["asdf"];
4614 o.push_back(new ObjectModDesc());
4615 o.back()->append(100);
4616 o.back()->setattrs(attrs);
4617 o.push_back(new ObjectModDesc());
4618 o.back()->rmobject(1001);
4619 o.push_back(new ObjectModDesc());
4620 o.back()->create();
4621 o.back()->setattrs(attrs);
4622 o.push_back(new ObjectModDesc());
4623 o.back()->create();
4624 o.back()->setattrs(attrs);
4625 o.back()->mark_unrollbackable();
4626 o.back()->append(1000);
4627 }
4628
4629 void ObjectModDesc::encode(ceph::buffer::list &_bl) const
4630 {
4631 ENCODE_START(max_required_version, max_required_version, _bl);
4632 encode(can_local_rollback, _bl);
4633 encode(rollback_info_completed, _bl);
4634 encode(bl, _bl);
4635 ENCODE_FINISH(_bl);
4636 }
4637 void ObjectModDesc::decode(ceph::buffer::list::const_iterator &_bl)
4638 {
4639 DECODE_START(2, _bl);
4640 max_required_version = struct_v;
4641 decode(can_local_rollback, _bl);
4642 decode(rollback_info_completed, _bl);
4643 decode(bl, _bl);
4644 // ensure bl does not pin a larger ceph::buffer in memory
4645 bl.rebuild();
4646 bl.reassign_to_mempool(mempool::mempool_osd_pglog);
4647 DECODE_FINISH(_bl);
4648 }
4649
4650 std::atomic<uint32_t> ObjectCleanRegions::max_num_intervals = {10};
4651
4652 void ObjectCleanRegions::set_max_num_intervals(uint32_t num)
4653 {
4654 max_num_intervals = num;
4655 }
4656
4657 void ObjectCleanRegions::trim()
4658 {
4659 while(clean_offsets.num_intervals() > max_num_intervals) {
4660 typename interval_set<uint64_t>::iterator shortest_interval = clean_offsets.begin();
4661 if (shortest_interval == clean_offsets.end())
4662 break;
4663 for (typename interval_set<uint64_t>::iterator it = clean_offsets.begin();
4664 it != clean_offsets.end();
4665 ++it) {
4666 if (it.get_len() < shortest_interval.get_len())
4667 shortest_interval = it;
4668 }
4669 clean_offsets.erase(shortest_interval);
4670 }
4671 }
4672
4673 void ObjectCleanRegions::merge(const ObjectCleanRegions &other)
4674 {
4675 clean_offsets.intersection_of(other.clean_offsets);
4676 clean_omap = clean_omap && other.clean_omap;
4677 trim();
4678 }
4679
4680 void ObjectCleanRegions::mark_data_region_dirty(uint64_t offset, uint64_t len)
4681 {
4682 interval_set<uint64_t> clean_region;
4683 clean_region.insert(0, (uint64_t)-1);
4684 clean_region.erase(offset, len);
4685 clean_offsets.intersection_of(clean_region);
4686 trim();
4687 }
4688
4689 bool ObjectCleanRegions::is_clean_region(uint64_t offset, uint64_t len) const
4690 {
4691 return clean_offsets.contains(offset, len);
4692 }
4693
4694 void ObjectCleanRegions::mark_omap_dirty()
4695 {
4696 clean_omap = false;
4697 }
4698
4699 void ObjectCleanRegions::mark_object_new()
4700 {
4701 new_object = true;
4702 }
4703
4704 void ObjectCleanRegions::mark_fully_dirty()
4705 {
4706 mark_data_region_dirty(0, (uint64_t)-1);
4707 mark_omap_dirty();
4708 mark_object_new();
4709 }
4710
4711 interval_set<uint64_t> ObjectCleanRegions::get_dirty_regions() const
4712 {
4713 interval_set<uint64_t> dirty_region;
4714 dirty_region.insert(0, (uint64_t)-1);
4715 dirty_region.subtract(clean_offsets);
4716 return dirty_region;
4717 }
4718
4719 bool ObjectCleanRegions::omap_is_dirty() const
4720 {
4721 return !clean_omap;
4722 }
4723
4724 bool ObjectCleanRegions::object_is_exist() const
4725 {
4726 return !new_object;
4727 }
4728
4729 void ObjectCleanRegions::encode(bufferlist &bl) const
4730 {
4731 ENCODE_START(1, 1, bl);
4732 using ceph::encode;
4733 encode(clean_offsets, bl);
4734 encode(clean_omap, bl);
4735 encode(new_object, bl);
4736 ENCODE_FINISH(bl);
4737 }
4738
4739 void ObjectCleanRegions::decode(bufferlist::const_iterator &bl)
4740 {
4741 DECODE_START(1, bl);
4742 using ceph::decode;
4743 decode(clean_offsets, bl);
4744 decode(clean_omap, bl);
4745 decode(new_object, bl);
4746 DECODE_FINISH(bl);
4747 }
4748
4749 void ObjectCleanRegions::dump(Formatter *f) const
4750 {
4751 f->open_object_section("object_clean_regions");
4752 f->dump_stream("clean_offsets") << clean_offsets;
4753 f->dump_bool("clean_omap", clean_omap);
4754 f->dump_bool("new_object", new_object);
4755 f->close_section();
4756 }
4757
4758 void ObjectCleanRegions::generate_test_instances(list<ObjectCleanRegions*>& o)
4759 {
4760 o.push_back(new ObjectCleanRegions());
4761 o.push_back(new ObjectCleanRegions());
4762 o.back()->mark_data_region_dirty(4096, 40960);
4763 o.back()->mark_omap_dirty();
4764 o.back()->mark_object_new();
4765 }
4766
4767 ostream& operator<<(ostream& out, const ObjectCleanRegions& ocr)
4768 {
4769 return out << "clean_offsets: " << ocr.clean_offsets
4770 << ", clean_omap: " << ocr.clean_omap
4771 << ", new_object: " << ocr.new_object;
4772 }
4773
4774 // -- pg_log_entry_t --
4775
4776 string pg_log_entry_t::get_key_name() const
4777 {
4778 return version.get_key_name();
4779 }
4780
4781 void pg_log_entry_t::encode_with_checksum(ceph::buffer::list& bl) const
4782 {
4783 using ceph::encode;
4784 ceph::buffer::list ebl(sizeof(*this)*2);
4785 this->encode(ebl);
4786 __u32 crc = ebl.crc32c(0);
4787 encode(ebl, bl);
4788 encode(crc, bl);
4789 }
4790
4791 void pg_log_entry_t::decode_with_checksum(ceph::buffer::list::const_iterator& p)
4792 {
4793 using ceph::decode;
4794 ceph::buffer::list bl;
4795 decode(bl, p);
4796 __u32 crc;
4797 decode(crc, p);
4798 if (crc != bl.crc32c(0))
4799 throw ceph::buffer::malformed_input("bad checksum on pg_log_entry_t");
4800 auto q = bl.cbegin();
4801 this->decode(q);
4802 }
4803
4804 void pg_log_entry_t::encode(ceph::buffer::list &bl) const
4805 {
4806 ENCODE_START(14, 4, bl);
4807 encode(op, bl);
4808 encode(soid, bl);
4809 encode(version, bl);
4810
4811 /**
4812 * Added with reverting_to:
4813 * Previous code used prior_version to encode
4814 * what we now call reverting_to. This will
4815 * allow older code to decode reverting_to
4816 * into prior_version as expected.
4817 */
4818 if (op == LOST_REVERT)
4819 encode(reverting_to, bl);
4820 else
4821 encode(prior_version, bl);
4822
4823 encode(reqid, bl);
4824 encode(mtime, bl);
4825 if (op == LOST_REVERT)
4826 encode(prior_version, bl);
4827 encode(snaps, bl);
4828 encode(user_version, bl);
4829 encode(mod_desc, bl);
4830 encode(extra_reqids, bl);
4831 if (op == ERROR)
4832 encode(return_code, bl);
4833 if (!extra_reqids.empty())
4834 encode(extra_reqid_return_codes, bl);
4835 encode(clean_regions, bl);
4836 if (op != ERROR)
4837 encode(return_code, bl);
4838 encode(op_returns, bl);
4839 ENCODE_FINISH(bl);
4840 }
4841
4842 void pg_log_entry_t::decode(ceph::buffer::list::const_iterator &bl)
4843 {
4844 DECODE_START_LEGACY_COMPAT_LEN(14, 4, 4, bl);
4845 decode(op, bl);
4846 if (struct_v < 2) {
4847 sobject_t old_soid;
4848 decode(old_soid, bl);
4849 soid.oid = old_soid.oid;
4850 soid.snap = old_soid.snap;
4851 invalid_hash = true;
4852 } else {
4853 decode(soid, bl);
4854 }
4855 if (struct_v < 3)
4856 invalid_hash = true;
4857 decode(version, bl);
4858
4859 if (struct_v >= 6 && op == LOST_REVERT)
4860 decode(reverting_to, bl);
4861 else
4862 decode(prior_version, bl);
4863
4864 decode(reqid, bl);
4865
4866 decode(mtime, bl);
4867 if (struct_v < 5)
4868 invalid_pool = true;
4869
4870 if (op == LOST_REVERT) {
4871 if (struct_v >= 6) {
4872 decode(prior_version, bl);
4873 } else {
4874 reverting_to = prior_version;
4875 }
4876 }
4877 if (struct_v >= 7 || // for v >= 7, this is for all ops.
4878 op == CLONE) { // for v < 7, it's only present for CLONE.
4879 decode(snaps, bl);
4880 // ensure snaps does not pin a larger ceph::buffer in memory
4881 snaps.rebuild();
4882 snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
4883 }
4884
4885 if (struct_v >= 8)
4886 decode(user_version, bl);
4887 else
4888 user_version = version.version;
4889
4890 if (struct_v >= 9)
4891 decode(mod_desc, bl);
4892 else
4893 mod_desc.mark_unrollbackable();
4894 if (struct_v >= 10)
4895 decode(extra_reqids, bl);
4896 if (struct_v >= 11 && op == ERROR)
4897 decode(return_code, bl);
4898 if (struct_v >= 12 && !extra_reqids.empty())
4899 decode(extra_reqid_return_codes, bl);
4900 if (struct_v >= 13)
4901 decode(clean_regions, bl);
4902 else
4903 clean_regions.mark_fully_dirty();
4904 if (struct_v >= 14) {
4905 if (op != ERROR) {
4906 decode(return_code, bl);
4907 }
4908 decode(op_returns, bl);
4909 }
4910 DECODE_FINISH(bl);
4911 }
4912
4913 void pg_log_entry_t::dump(Formatter *f) const
4914 {
4915 f->dump_string("op", get_op_name());
4916 f->dump_stream("object") << soid;
4917 f->dump_stream("version") << version;
4918 f->dump_stream("prior_version") << prior_version;
4919 f->dump_stream("reqid") << reqid;
4920 f->open_array_section("extra_reqids");
4921 uint32_t idx = 0;
4922 for (auto p = extra_reqids.begin();
4923 p != extra_reqids.end();
4924 ++idx, ++p) {
4925 f->open_object_section("extra_reqid");
4926 f->dump_stream("reqid") << p->first;
4927 f->dump_stream("user_version") << p->second;
4928 auto it = extra_reqid_return_codes.find(idx);
4929 if (it != extra_reqid_return_codes.end()) {
4930 f->dump_int("return_code", it->second);
4931 }
4932 f->close_section();
4933 }
4934 f->close_section();
4935 f->dump_stream("mtime") << mtime;
4936 f->dump_int("return_code", return_code);
4937 if (!op_returns.empty()) {
4938 f->open_array_section("op_returns");
4939 for (auto& i : op_returns) {
4940 f->dump_object("op", i);
4941 }
4942 f->close_section();
4943 }
4944 if (snaps.length() > 0) {
4945 vector<snapid_t> v;
4946 ceph::buffer::list c = snaps;
4947 auto p = c.cbegin();
4948 try {
4949 using ceph::decode;
4950 decode(v, p);
4951 } catch (...) {
4952 v.clear();
4953 }
4954 f->open_object_section("snaps");
4955 for (auto p = v.begin(); p != v.end(); ++p)
4956 f->dump_unsigned("snap", *p);
4957 f->close_section();
4958 }
4959 {
4960 f->open_object_section("mod_desc");
4961 mod_desc.dump(f);
4962 f->close_section();
4963 }
4964 {
4965 f->open_object_section("clean_regions");
4966 clean_regions.dump(f);
4967 f->close_section();
4968 }
4969 }
4970
4971 void pg_log_entry_t::generate_test_instances(list<pg_log_entry_t*>& o)
4972 {
4973 o.push_back(new pg_log_entry_t());
4974 hobject_t oid(object_t("objname"), "key", 123, 456, 0, "");
4975 o.push_back(new pg_log_entry_t(MODIFY, oid, eversion_t(1,2), eversion_t(3,4),
4976 1, osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
4977 utime_t(8,9), 0));
4978 o.push_back(new pg_log_entry_t(ERROR, oid, eversion_t(1,2), eversion_t(3,4),
4979 1, osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
4980 utime_t(8,9), -ENOENT));
4981 }
4982
4983 ostream& operator<<(ostream& out, const pg_log_entry_t& e)
4984 {
4985 out << e.version << " (" << e.prior_version << ") "
4986 << std::left << std::setw(8) << e.get_op_name() << ' '
4987 << e.soid << " by " << e.reqid << " " << e.mtime
4988 << " " << e.return_code;
4989 if (!e.op_returns.empty()) {
4990 out << " " << e.op_returns;
4991 }
4992 if (e.snaps.length()) {
4993 vector<snapid_t> snaps;
4994 ceph::buffer::list c = e.snaps;
4995 auto p = c.cbegin();
4996 try {
4997 decode(snaps, p);
4998 } catch (...) {
4999 snaps.clear();
5000 }
5001 out << " snaps " << snaps;
5002 }
5003 out << " ObjectCleanRegions " << e.clean_regions;
5004 return out;
5005 }
5006
5007 // -- pg_log_dup_t --
5008
5009 std::string pg_log_dup_t::get_key_name() const
5010 {
5011 static const char prefix[] = "dup_";
5012 std::string key(36, ' ');
5013 memcpy(&key[0], prefix, 4);
5014 version.get_key_name(&key[4]);
5015 key.resize(35); // remove the null terminator
5016 return key;
5017 }
5018
5019 void pg_log_dup_t::encode(ceph::buffer::list &bl) const
5020 {
5021 ENCODE_START(2, 1, bl);
5022 encode(reqid, bl);
5023 encode(version, bl);
5024 encode(user_version, bl);
5025 encode(return_code, bl);
5026 encode(op_returns, bl);
5027 ENCODE_FINISH(bl);
5028 }
5029
5030 void pg_log_dup_t::decode(ceph::buffer::list::const_iterator &bl)
5031 {
5032 DECODE_START(2, bl);
5033 decode(reqid, bl);
5034 decode(version, bl);
5035 decode(user_version, bl);
5036 decode(return_code, bl);
5037 if (struct_v >= 2) {
5038 decode(op_returns, bl);
5039 }
5040 DECODE_FINISH(bl);
5041 }
5042
5043 void pg_log_dup_t::dump(Formatter *f) const
5044 {
5045 f->dump_stream("reqid") << reqid;
5046 f->dump_stream("version") << version;
5047 f->dump_stream("user_version") << user_version;
5048 f->dump_stream("return_code") << return_code;
5049 if (!op_returns.empty()) {
5050 f->open_array_section("op_returns");
5051 for (auto& i : op_returns) {
5052 f->dump_object("op", i);
5053 }
5054 f->close_section();
5055 }
5056 }
5057
5058 void pg_log_dup_t::generate_test_instances(list<pg_log_dup_t*>& o)
5059 {
5060 o.push_back(new pg_log_dup_t());
5061 o.push_back(new pg_log_dup_t(eversion_t(1,2),
5062 1,
5063 osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
5064 0));
5065 o.push_back(new pg_log_dup_t(eversion_t(1,2),
5066 2,
5067 osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
5068 -ENOENT));
5069 }
5070
5071
5072 std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e) {
5073 out << "log_dup(reqid=" << e.reqid <<
5074 " v=" << e.version << " uv=" << e.user_version <<
5075 " rc=" << e.return_code;
5076 if (!e.op_returns.empty()) {
5077 out << " " << e.op_returns;
5078 }
5079 return out << ")";
5080 }
5081
5082
5083 // -- pg_log_t --
5084
5085 // out: pg_log_t that only has entries that apply to import_pgid using curmap
5086 // reject: Entries rejected from "in" are in the reject.log. Other fields not set.
5087 void pg_log_t::filter_log(spg_t import_pgid, const OSDMap &curmap,
5088 const string &hit_set_namespace, const pg_log_t &in,
5089 pg_log_t &out, pg_log_t &reject)
5090 {
5091 out = in;
5092 out.log.clear();
5093 reject.log.clear();
5094
5095 for (auto i = in.log.cbegin(); i != in.log.cend(); ++i) {
5096
5097 // Reject pg log entries for temporary objects
5098 if (i->soid.is_temp()) {
5099 reject.log.push_back(*i);
5100 continue;
5101 }
5102
5103 if (i->soid.nspace != hit_set_namespace) {
5104 object_t oid = i->soid.oid;
5105 object_locator_t loc(i->soid);
5106 pg_t raw_pgid = curmap.object_locator_to_pg(oid, loc);
5107 pg_t pgid = curmap.raw_pg_to_pg(raw_pgid);
5108
5109 if (import_pgid.pgid == pgid) {
5110 out.log.push_back(*i);
5111 } else {
5112 reject.log.push_back(*i);
5113 }
5114 } else {
5115 out.log.push_back(*i);
5116 }
5117 }
5118 }
5119
5120 void pg_log_t::encode(ceph::buffer::list& bl) const
5121 {
5122 ENCODE_START(7, 3, bl);
5123 encode(head, bl);
5124 encode(tail, bl);
5125 encode(log, bl);
5126 encode(can_rollback_to, bl);
5127 encode(rollback_info_trimmed_to, bl);
5128 encode(dups, bl);
5129 ENCODE_FINISH(bl);
5130 }
5131
5132 void pg_log_t::decode(ceph::buffer::list::const_iterator &bl, int64_t pool)
5133 {
5134 DECODE_START_LEGACY_COMPAT_LEN(7, 3, 3, bl);
5135 decode(head, bl);
5136 decode(tail, bl);
5137 if (struct_v < 2) {
5138 bool backlog;
5139 decode(backlog, bl);
5140 }
5141 decode(log, bl);
5142 if (struct_v >= 5)
5143 decode(can_rollback_to, bl);
5144
5145 if (struct_v >= 6)
5146 decode(rollback_info_trimmed_to, bl);
5147 else
5148 rollback_info_trimmed_to = tail;
5149
5150 if (struct_v >= 7)
5151 decode(dups, bl);
5152
5153 DECODE_FINISH(bl);
5154
5155 // handle hobject_t format change
5156 if (struct_v < 4) {
5157 for (auto i = log.begin(); i != log.end(); ++i) {
5158 if (!i->soid.is_max() && i->soid.pool == -1)
5159 i->soid.pool = pool;
5160 }
5161 }
5162 }
5163
5164 void pg_log_t::dump(Formatter *f) const
5165 {
5166 f->dump_stream("head") << head;
5167 f->dump_stream("tail") << tail;
5168 f->open_array_section("log");
5169 for (auto p = log.cbegin(); p != log.cend(); ++p) {
5170 f->open_object_section("entry");
5171 p->dump(f);
5172 f->close_section();
5173 }
5174 f->close_section();
5175 f->open_array_section("dups");
5176 for (const auto& entry : dups) {
5177 f->open_object_section("entry");
5178 entry.dump(f);
5179 f->close_section();
5180 }
5181 f->close_section();
5182 }
5183
5184 void pg_log_t::generate_test_instances(list<pg_log_t*>& o)
5185 {
5186 o.push_back(new pg_log_t);
5187
5188 // this is nonsensical:
5189 o.push_back(new pg_log_t);
5190 o.back()->head = eversion_t(1,2);
5191 o.back()->tail = eversion_t(3,4);
5192 list<pg_log_entry_t*> e;
5193 pg_log_entry_t::generate_test_instances(e);
5194 for (auto p = e.begin(); p != e.end(); ++p)
5195 o.back()->log.push_back(**p);
5196 }
5197
5198 static void _handle_dups(CephContext* cct, pg_log_t &target, const pg_log_t &other, unsigned maxdups)
5199 {
5200 auto earliest_dup_version =
5201 target.head.version < maxdups ? 0u : target.head.version - maxdups + 1;
5202 lgeneric_subdout(cct, osd, 20) << "copy_up_to/copy_after earliest_dup_version " << earliest_dup_version << dendl;
5203
5204 for (auto d = other.dups.cbegin(); d != other.dups.cend(); ++d) {
5205 if (d->version.version >= earliest_dup_version) {
5206 lgeneric_subdout(cct, osd, 20)
5207 << "copy_up_to/copy_after copy dup version "
5208 << d->version << dendl;
5209 target.dups.push_back(pg_log_dup_t(*d));
5210 }
5211 }
5212
5213 for (auto i = other.log.cbegin(); i != other.log.cend(); ++i) {
5214 ceph_assert(i->version > other.tail);
5215 if (i->version > target.tail)
5216 break;
5217 if (i->version.version >= earliest_dup_version) {
5218 lgeneric_subdout(cct, osd, 20)
5219 << "copy_up_to/copy_after copy dup from log version "
5220 << i->version << dendl;
5221 target.dups.push_back(pg_log_dup_t(*i));
5222 }
5223 }
5224 }
5225
5226
5227 void pg_log_t::copy_after(CephContext* cct, const pg_log_t &other, eversion_t v)
5228 {
5229 can_rollback_to = other.can_rollback_to;
5230 head = other.head;
5231 tail = other.tail;
5232 lgeneric_subdout(cct, osd, 20) << __func__ << " v " << v << dendl;
5233 for (auto i = other.log.crbegin(); i != other.log.crend(); ++i) {
5234 ceph_assert(i->version > other.tail);
5235 if (i->version <= v) {
5236 // make tail accurate.
5237 tail = i->version;
5238 break;
5239 }
5240 lgeneric_subdout(cct, osd, 20) << __func__ << " copy log version " << i->version << dendl;
5241 log.push_front(*i);
5242 }
5243 _handle_dups(cct, *this, other, cct->_conf->osd_pg_log_dups_tracked);
5244 }
5245
5246 void pg_log_t::copy_up_to(CephContext* cct, const pg_log_t &other, int max)
5247 {
5248 can_rollback_to = other.can_rollback_to;
5249 int n = 0;
5250 head = other.head;
5251 tail = other.tail;
5252 lgeneric_subdout(cct, osd, 20) << __func__ << " max " << max << dendl;
5253 for (auto i = other.log.crbegin(); i != other.log.crend(); ++i) {
5254 ceph_assert(i->version > other.tail);
5255 if (n++ >= max) {
5256 tail = i->version;
5257 break;
5258 }
5259 lgeneric_subdout(cct, osd, 20) << __func__ << " copy log version " << i->version << dendl;
5260 log.push_front(*i);
5261 }
5262 _handle_dups(cct, *this, other, cct->_conf->osd_pg_log_dups_tracked);
5263 }
5264
5265 ostream& pg_log_t::print(ostream& out) const
5266 {
5267 out << *this << std::endl;
5268 for (auto p = log.cbegin(); p != log.cend(); ++p)
5269 out << *p << std::endl;
5270 for (const auto& entry : dups) {
5271 out << " dup entry: " << entry << std::endl;
5272 }
5273 return out;
5274 }
5275
5276 // -- pg_missing_t --
5277
5278 ostream& operator<<(ostream& out, const pg_missing_item& i)
5279 {
5280 out << i.need;
5281 if (i.have != eversion_t())
5282 out << "(" << i.have << ")";
5283 out << " flags = " << i.flag_str()
5284 << " " << i.clean_regions;
5285 return out;
5286 }
5287
5288 // -- object_copy_cursor_t --
5289
5290 void object_copy_cursor_t::encode(ceph::buffer::list& bl) const
5291 {
5292 ENCODE_START(1, 1, bl);
5293 encode(attr_complete, bl);
5294 encode(data_offset, bl);
5295 encode(data_complete, bl);
5296 encode(omap_offset, bl);
5297 encode(omap_complete, bl);
5298 ENCODE_FINISH(bl);
5299 }
5300
5301 void object_copy_cursor_t::decode(ceph::buffer::list::const_iterator &bl)
5302 {
5303 DECODE_START(1, bl);
5304 decode(attr_complete, bl);
5305 decode(data_offset, bl);
5306 decode(data_complete, bl);
5307 decode(omap_offset, bl);
5308 decode(omap_complete, bl);
5309 DECODE_FINISH(bl);
5310 }
5311
5312 void object_copy_cursor_t::dump(Formatter *f) const
5313 {
5314 f->dump_unsigned("attr_complete", (int)attr_complete);
5315 f->dump_unsigned("data_offset", data_offset);
5316 f->dump_unsigned("data_complete", (int)data_complete);
5317 f->dump_string("omap_offset", omap_offset);
5318 f->dump_unsigned("omap_complete", (int)omap_complete);
5319 }
5320
5321 void object_copy_cursor_t::generate_test_instances(list<object_copy_cursor_t*>& o)
5322 {
5323 o.push_back(new object_copy_cursor_t);
5324 o.push_back(new object_copy_cursor_t);
5325 o.back()->attr_complete = true;
5326 o.back()->data_offset = 123;
5327 o.push_back(new object_copy_cursor_t);
5328 o.back()->attr_complete = true;
5329 o.back()->data_complete = true;
5330 o.back()->omap_offset = "foo";
5331 o.push_back(new object_copy_cursor_t);
5332 o.back()->attr_complete = true;
5333 o.back()->data_complete = true;
5334 o.back()->omap_complete = true;
5335 }
5336
5337 // -- object_copy_data_t --
5338
5339 void object_copy_data_t::encode(ceph::buffer::list& bl, uint64_t features) const
5340 {
5341 ENCODE_START(8, 5, bl);
5342 encode(size, bl);
5343 encode(mtime, bl);
5344 encode(attrs, bl);
5345 encode(data, bl);
5346 encode(omap_data, bl);
5347 encode(cursor, bl);
5348 encode(omap_header, bl);
5349 encode(snaps, bl);
5350 encode(snap_seq, bl);
5351 encode(flags, bl);
5352 encode(data_digest, bl);
5353 encode(omap_digest, bl);
5354 encode(reqids, bl);
5355 encode(truncate_seq, bl);
5356 encode(truncate_size, bl);
5357 encode(reqid_return_codes, bl);
5358 ENCODE_FINISH(bl);
5359 }
5360
5361 void object_copy_data_t::decode(ceph::buffer::list::const_iterator& bl)
5362 {
5363 DECODE_START(8, bl);
5364 if (struct_v < 5) {
5365 // old
5366 decode(size, bl);
5367 decode(mtime, bl);
5368 {
5369 string category;
5370 decode(category, bl); // no longer used
5371 }
5372 decode(attrs, bl);
5373 decode(data, bl);
5374 {
5375 map<string,ceph::buffer::list> omap;
5376 decode(omap, bl);
5377 omap_data.clear();
5378 if (!omap.empty()) {
5379 using ceph::encode;
5380 encode(omap, omap_data);
5381 }
5382 }
5383 decode(cursor, bl);
5384 if (struct_v >= 2)
5385 decode(omap_header, bl);
5386 if (struct_v >= 3) {
5387 decode(snaps, bl);
5388 decode(snap_seq, bl);
5389 } else {
5390 snaps.clear();
5391 snap_seq = 0;
5392 }
5393 if (struct_v >= 4) {
5394 decode(flags, bl);
5395 decode(data_digest, bl);
5396 decode(omap_digest, bl);
5397 }
5398 } else {
5399 // current
5400 decode(size, bl);
5401 decode(mtime, bl);
5402 decode(attrs, bl);
5403 decode(data, bl);
5404 decode(omap_data, bl);
5405 decode(cursor, bl);
5406 decode(omap_header, bl);
5407 decode(snaps, bl);
5408 decode(snap_seq, bl);
5409 if (struct_v >= 4) {
5410 decode(flags, bl);
5411 decode(data_digest, bl);
5412 decode(omap_digest, bl);
5413 }
5414 if (struct_v >= 6) {
5415 decode(reqids, bl);
5416 }
5417 if (struct_v >= 7) {
5418 decode(truncate_seq, bl);
5419 decode(truncate_size, bl);
5420 }
5421 if (struct_v >= 8) {
5422 decode(reqid_return_codes, bl);
5423 }
5424 }
5425 DECODE_FINISH(bl);
5426 }
5427
5428 void object_copy_data_t::generate_test_instances(list<object_copy_data_t*>& o)
5429 {
5430 o.push_back(new object_copy_data_t());
5431
5432 list<object_copy_cursor_t*> cursors;
5433 object_copy_cursor_t::generate_test_instances(cursors);
5434 auto ci = cursors.begin();
5435 o.back()->cursor = **(ci++);
5436
5437 o.push_back(new object_copy_data_t());
5438 o.back()->cursor = **(ci++);
5439
5440 o.push_back(new object_copy_data_t());
5441 o.back()->size = 1234;
5442 o.back()->mtime.set_from_double(1234);
5443 ceph::buffer::ptr bp("there", 5);
5444 ceph::buffer::list bl;
5445 bl.push_back(bp);
5446 o.back()->attrs["hello"] = bl;
5447 ceph::buffer::ptr bp2("not", 3);
5448 ceph::buffer::list bl2;
5449 bl2.push_back(bp2);
5450 map<string,ceph::buffer::list> omap;
5451 omap["why"] = bl2;
5452 using ceph::encode;
5453 encode(omap, o.back()->omap_data);
5454 ceph::buffer::ptr databp("iamsomedatatocontain", 20);
5455 o.back()->data.push_back(databp);
5456 o.back()->omap_header.append("this is an omap header");
5457 o.back()->snaps.push_back(123);
5458 o.back()->reqids.push_back(make_pair(osd_reqid_t(), version_t()));
5459 }
5460
5461 void object_copy_data_t::dump(Formatter *f) const
5462 {
5463 f->open_object_section("cursor");
5464 cursor.dump(f);
5465 f->close_section(); // cursor
5466 f->dump_int("size", size);
5467 f->dump_stream("mtime") << mtime;
5468 /* we should really print out the attrs here, but ceph::buffer::list
5469 const-correctness prevents that */
5470 f->dump_int("attrs_size", attrs.size());
5471 f->dump_int("flags", flags);
5472 f->dump_unsigned("data_digest", data_digest);
5473 f->dump_unsigned("omap_digest", omap_digest);
5474 f->dump_int("omap_data_length", omap_data.length());
5475 f->dump_int("omap_header_length", omap_header.length());
5476 f->dump_int("data_length", data.length());
5477 f->open_array_section("snaps");
5478 for (auto p = snaps.cbegin(); p != snaps.cend(); ++p)
5479 f->dump_unsigned("snap", *p);
5480 f->close_section();
5481 f->open_array_section("reqids");
5482 uint32_t idx = 0;
5483 for (auto p = reqids.begin();
5484 p != reqids.end();
5485 ++idx, ++p) {
5486 f->open_object_section("extra_reqid");
5487 f->dump_stream("reqid") << p->first;
5488 f->dump_stream("user_version") << p->second;
5489 auto it = reqid_return_codes.find(idx);
5490 if (it != reqid_return_codes.end()) {
5491 f->dump_int("return_code", it->second);
5492 }
5493 f->close_section();
5494 }
5495 f->close_section();
5496 }
5497
5498 // -- pg_create_t --
5499
5500 void pg_create_t::encode(ceph::buffer::list &bl) const
5501 {
5502 ENCODE_START(1, 1, bl);
5503 encode(created, bl);
5504 encode(parent, bl);
5505 encode(split_bits, bl);
5506 ENCODE_FINISH(bl);
5507 }
5508
5509 void pg_create_t::decode(ceph::buffer::list::const_iterator &bl)
5510 {
5511 DECODE_START(1, bl);
5512 decode(created, bl);
5513 decode(parent, bl);
5514 decode(split_bits, bl);
5515 DECODE_FINISH(bl);
5516 }
5517
5518 void pg_create_t::dump(Formatter *f) const
5519 {
5520 f->dump_unsigned("created", created);
5521 f->dump_stream("parent") << parent;
5522 f->dump_int("split_bits", split_bits);
5523 }
5524
5525 void pg_create_t::generate_test_instances(list<pg_create_t*>& o)
5526 {
5527 o.push_back(new pg_create_t);
5528 o.push_back(new pg_create_t(1, pg_t(3, 4), 2));
5529 }
5530
5531
5532 // -- pg_hit_set_info_t --
5533
5534 void pg_hit_set_info_t::encode(ceph::buffer::list& bl) const
5535 {
5536 ENCODE_START(2, 1, bl);
5537 encode(begin, bl);
5538 encode(end, bl);
5539 encode(version, bl);
5540 encode(using_gmt, bl);
5541 ENCODE_FINISH(bl);
5542 }
5543
5544 void pg_hit_set_info_t::decode(ceph::buffer::list::const_iterator& p)
5545 {
5546 DECODE_START(2, p);
5547 decode(begin, p);
5548 decode(end, p);
5549 decode(version, p);
5550 if (struct_v >= 2) {
5551 decode(using_gmt, p);
5552 } else {
5553 using_gmt = false;
5554 }
5555 DECODE_FINISH(p);
5556 }
5557
5558 void pg_hit_set_info_t::dump(Formatter *f) const
5559 {
5560 f->dump_stream("begin") << begin;
5561 f->dump_stream("end") << end;
5562 f->dump_stream("version") << version;
5563 f->dump_stream("using_gmt") << using_gmt;
5564 }
5565
5566 void pg_hit_set_info_t::generate_test_instances(list<pg_hit_set_info_t*>& ls)
5567 {
5568 ls.push_back(new pg_hit_set_info_t);
5569 ls.push_back(new pg_hit_set_info_t);
5570 ls.back()->begin = utime_t(1, 2);
5571 ls.back()->end = utime_t(3, 4);
5572 }
5573
5574
5575 // -- pg_hit_set_history_t --
5576
5577 void pg_hit_set_history_t::encode(ceph::buffer::list& bl) const
5578 {
5579 ENCODE_START(1, 1, bl);
5580 encode(current_last_update, bl);
5581 {
5582 utime_t dummy_stamp;
5583 encode(dummy_stamp, bl);
5584 }
5585 {
5586 pg_hit_set_info_t dummy_info;
5587 encode(dummy_info, bl);
5588 }
5589 encode(history, bl);
5590 ENCODE_FINISH(bl);
5591 }
5592
5593 void pg_hit_set_history_t::decode(ceph::buffer::list::const_iterator& p)
5594 {
5595 DECODE_START(1, p);
5596 decode(current_last_update, p);
5597 {
5598 utime_t dummy_stamp;
5599 decode(dummy_stamp, p);
5600 }
5601 {
5602 pg_hit_set_info_t dummy_info;
5603 decode(dummy_info, p);
5604 }
5605 decode(history, p);
5606 DECODE_FINISH(p);
5607 }
5608
5609 void pg_hit_set_history_t::dump(Formatter *f) const
5610 {
5611 f->dump_stream("current_last_update") << current_last_update;
5612 f->open_array_section("history");
5613 for (auto p = history.cbegin(); p != history.cend(); ++p) {
5614 f->open_object_section("info");
5615 p->dump(f);
5616 f->close_section();
5617 }
5618 f->close_section();
5619 }
5620
5621 void pg_hit_set_history_t::generate_test_instances(list<pg_hit_set_history_t*>& ls)
5622 {
5623 ls.push_back(new pg_hit_set_history_t);
5624 ls.push_back(new pg_hit_set_history_t);
5625 ls.back()->current_last_update = eversion_t(1, 2);
5626 ls.back()->history.push_back(pg_hit_set_info_t());
5627 }
5628
5629 // -- OSDSuperblock --
5630
5631 void OSDSuperblock::encode(ceph::buffer::list &bl) const
5632 {
5633 ENCODE_START(9, 5, bl);
5634 encode(cluster_fsid, bl);
5635 encode(whoami, bl);
5636 encode(current_epoch, bl);
5637 encode(oldest_map, bl);
5638 encode(newest_map, bl);
5639 encode(weight, bl);
5640 compat_features.encode(bl);
5641 encode(clean_thru, bl);
5642 encode(mounted, bl);
5643 encode(osd_fsid, bl);
5644 encode((epoch_t)0, bl); // epoch_t last_epoch_marked_full
5645 encode((uint32_t)0, bl); // map<int64_t,epoch_t> pool_last_epoch_marked_full
5646 encode(purged_snaps_last, bl);
5647 encode(last_purged_snaps_scrub, bl);
5648 ENCODE_FINISH(bl);
5649 }
5650
5651 void OSDSuperblock::decode(ceph::buffer::list::const_iterator &bl)
5652 {
5653 DECODE_START_LEGACY_COMPAT_LEN(9, 5, 5, bl);
5654 if (struct_v < 3) {
5655 string magic;
5656 decode(magic, bl);
5657 }
5658 decode(cluster_fsid, bl);
5659 decode(whoami, bl);
5660 decode(current_epoch, bl);
5661 decode(oldest_map, bl);
5662 decode(newest_map, bl);
5663 decode(weight, bl);
5664 if (struct_v >= 2) {
5665 compat_features.decode(bl);
5666 } else { //upgrade it!
5667 compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
5668 }
5669 decode(clean_thru, bl);
5670 decode(mounted, bl);
5671 if (struct_v >= 4)
5672 decode(osd_fsid, bl);
5673 if (struct_v >= 6) {
5674 epoch_t last_map_marked_full;
5675 decode(last_map_marked_full, bl);
5676 }
5677 if (struct_v >= 7) {
5678 map<int64_t,epoch_t> pool_last_map_marked_full;
5679 decode(pool_last_map_marked_full, bl);
5680 }
5681 if (struct_v >= 9) {
5682 decode(purged_snaps_last, bl);
5683 decode(last_purged_snaps_scrub, bl);
5684 } else {
5685 purged_snaps_last = 0;
5686 }
5687 DECODE_FINISH(bl);
5688 }
5689
5690 void OSDSuperblock::dump(Formatter *f) const
5691 {
5692 f->dump_stream("cluster_fsid") << cluster_fsid;
5693 f->dump_stream("osd_fsid") << osd_fsid;
5694 f->dump_int("whoami", whoami);
5695 f->dump_int("current_epoch", current_epoch);
5696 f->dump_int("oldest_map", oldest_map);
5697 f->dump_int("newest_map", newest_map);
5698 f->dump_float("weight", weight);
5699 f->open_object_section("compat");
5700 compat_features.dump(f);
5701 f->close_section();
5702 f->dump_int("clean_thru", clean_thru);
5703 f->dump_int("last_epoch_mounted", mounted);
5704 f->dump_unsigned("purged_snaps_last", purged_snaps_last);
5705 f->dump_stream("last_purged_snaps_scrub") << last_purged_snaps_scrub;
5706 }
5707
5708 void OSDSuperblock::generate_test_instances(list<OSDSuperblock*>& o)
5709 {
5710 OSDSuperblock z;
5711 o.push_back(new OSDSuperblock(z));
5712 z.cluster_fsid.parse("01010101-0101-0101-0101-010101010101");
5713 z.osd_fsid.parse("02020202-0202-0202-0202-020202020202");
5714 z.whoami = 3;
5715 z.current_epoch = 4;
5716 z.oldest_map = 5;
5717 z.newest_map = 9;
5718 z.mounted = 8;
5719 z.clean_thru = 7;
5720 o.push_back(new OSDSuperblock(z));
5721 o.push_back(new OSDSuperblock(z));
5722 }
5723
5724 // -- SnapSet --
5725
5726 void SnapSet::encode(ceph::buffer::list& bl) const
5727 {
5728 ENCODE_START(3, 2, bl);
5729 encode(seq, bl);
5730 encode(true, bl); // head_exists
5731 encode(snaps, bl);
5732 encode(clones, bl);
5733 encode(clone_overlap, bl);
5734 encode(clone_size, bl);
5735 encode(clone_snaps, bl);
5736 ENCODE_FINISH(bl);
5737 }
5738
5739 void SnapSet::decode(ceph::buffer::list::const_iterator& bl)
5740 {
5741 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
5742 decode(seq, bl);
5743 bl += 1u; // skip legacy head_exists (always true)
5744 decode(snaps, bl);
5745 decode(clones, bl);
5746 decode(clone_overlap, bl);
5747 decode(clone_size, bl);
5748 if (struct_v >= 3) {
5749 decode(clone_snaps, bl);
5750 } else {
5751 clone_snaps.clear();
5752 }
5753 DECODE_FINISH(bl);
5754 }
5755
5756 void SnapSet::dump(Formatter *f) const
5757 {
5758 f->dump_unsigned("seq", seq);
5759 f->open_array_section("clones");
5760 for (auto p = clones.cbegin(); p != clones.cend(); ++p) {
5761 f->open_object_section("clone");
5762 f->dump_unsigned("snap", *p);
5763 auto cs = clone_size.find(*p);
5764 if (cs != clone_size.end())
5765 f->dump_unsigned("size", cs->second);
5766 else
5767 f->dump_string("size", "????");
5768 auto co = clone_overlap.find(*p);
5769 if (co != clone_overlap.end())
5770 f->dump_stream("overlap") << co->second;
5771 else
5772 f->dump_stream("overlap") << "????";
5773 auto q = clone_snaps.find(*p);
5774 if (q != clone_snaps.end()) {
5775 f->open_array_section("snaps");
5776 for (auto s : q->second) {
5777 f->dump_unsigned("snap", s);
5778 }
5779 f->close_section();
5780 }
5781 f->close_section();
5782 }
5783 f->close_section();
5784 }
5785
5786 void SnapSet::generate_test_instances(list<SnapSet*>& o)
5787 {
5788 o.push_back(new SnapSet);
5789 o.push_back(new SnapSet);
5790 o.back()->seq = 123;
5791 o.back()->snaps.push_back(123);
5792 o.back()->snaps.push_back(12);
5793 o.push_back(new SnapSet);
5794 o.back()->seq = 123;
5795 o.back()->snaps.push_back(123);
5796 o.back()->snaps.push_back(12);
5797 o.back()->clones.push_back(12);
5798 o.back()->clone_size[12] = 12345;
5799 o.back()->clone_overlap[12];
5800 o.back()->clone_snaps[12] = {12, 10, 8};
5801 }
5802
5803 ostream& operator<<(ostream& out, const SnapSet& cs)
5804 {
5805 return out << cs.seq << "=" << cs.snaps << ":"
5806 << cs.clone_snaps;
5807 }
5808
5809 void SnapSet::from_snap_set(const librados::snap_set_t& ss, bool legacy)
5810 {
5811 // NOTE: our reconstruction of snaps (and the snapc) is not strictly
5812 // correct: it will not include snaps that still logically exist
5813 // but for which there was no clone that is defined. For all
5814 // practical purposes this doesn't matter, since we only use that
5815 // information to clone on the OSD, and we have already moved
5816 // forward past that part of the object history.
5817
5818 seq = ss.seq;
5819 set<snapid_t> _snaps;
5820 set<snapid_t> _clones;
5821 for (auto p = ss.clones.cbegin(); p != ss.clones.cend(); ++p) {
5822 if (p->cloneid != librados::SNAP_HEAD) {
5823 _clones.insert(p->cloneid);
5824 _snaps.insert(p->snaps.begin(), p->snaps.end());
5825 clone_size[p->cloneid] = p->size;
5826 clone_overlap[p->cloneid]; // the entry must exist, even if it's empty.
5827 for (auto q = p->overlap.cbegin(); q != p->overlap.cend(); ++q)
5828 clone_overlap[p->cloneid].insert(q->first, q->second);
5829 if (!legacy) {
5830 // p->snaps is ascending; clone_snaps is descending
5831 vector<snapid_t>& v = clone_snaps[p->cloneid];
5832 for (auto q = p->snaps.rbegin(); q != p->snaps.rend(); ++q) {
5833 v.push_back(*q);
5834 }
5835 }
5836 }
5837 }
5838
5839 // ascending
5840 clones.clear();
5841 clones.reserve(_clones.size());
5842 for (auto p = _clones.begin(); p != _clones.end(); ++p)
5843 clones.push_back(*p);
5844
5845 // descending
5846 snaps.clear();
5847 snaps.reserve(_snaps.size());
5848 for (auto p = _snaps.rbegin();
5849 p != _snaps.rend(); ++p)
5850 snaps.push_back(*p);
5851 }
5852
5853 uint64_t SnapSet::get_clone_bytes(snapid_t clone) const
5854 {
5855 ceph_assert(clone_size.count(clone));
5856 uint64_t size = clone_size.find(clone)->second;
5857 ceph_assert(clone_overlap.count(clone));
5858 const interval_set<uint64_t> &overlap = clone_overlap.find(clone)->second;
5859 ceph_assert(size >= (uint64_t)overlap.size());
5860 return size - overlap.size();
5861 }
5862
5863 void SnapSet::filter(const pg_pool_t &pinfo)
5864 {
5865 vector<snapid_t> oldsnaps;
5866 oldsnaps.swap(snaps);
5867 for (auto i = oldsnaps.cbegin(); i != oldsnaps.cend(); ++i) {
5868 if (!pinfo.is_removed_snap(*i))
5869 snaps.push_back(*i);
5870 }
5871 }
5872
5873 SnapSet SnapSet::get_filtered(const pg_pool_t &pinfo) const
5874 {
5875 SnapSet ss = *this;
5876 ss.filter(pinfo);
5877 return ss;
5878 }
5879
5880 // -- watch_info_t --
5881
5882 void watch_info_t::encode(ceph::buffer::list& bl, uint64_t features) const
5883 {
5884 ENCODE_START(4, 3, bl);
5885 encode(cookie, bl);
5886 encode(timeout_seconds, bl);
5887 encode(addr, bl, features);
5888 ENCODE_FINISH(bl);
5889 }
5890
5891 void watch_info_t::decode(ceph::buffer::list::const_iterator& bl)
5892 {
5893 DECODE_START_LEGACY_COMPAT_LEN(4, 3, 3, bl);
5894 decode(cookie, bl);
5895 if (struct_v < 2) {
5896 uint64_t ver;
5897 decode(ver, bl);
5898 }
5899 decode(timeout_seconds, bl);
5900 if (struct_v >= 4) {
5901 decode(addr, bl);
5902 }
5903 DECODE_FINISH(bl);
5904 }
5905
5906 void watch_info_t::dump(Formatter *f) const
5907 {
5908 f->dump_unsigned("cookie", cookie);
5909 f->dump_unsigned("timeout_seconds", timeout_seconds);
5910 f->open_object_section("addr");
5911 addr.dump(f);
5912 f->close_section();
5913 }
5914
5915 void watch_info_t::generate_test_instances(list<watch_info_t*>& o)
5916 {
5917 o.push_back(new watch_info_t);
5918 o.push_back(new watch_info_t);
5919 o.back()->cookie = 123;
5920 o.back()->timeout_seconds = 99;
5921 entity_addr_t ea;
5922 ea.set_type(entity_addr_t::TYPE_LEGACY);
5923 ea.set_nonce(1);
5924 ea.set_family(AF_INET);
5925 ea.set_in4_quad(0, 127);
5926 ea.set_in4_quad(1, 0);
5927 ea.set_in4_quad(2, 1);
5928 ea.set_in4_quad(3, 2);
5929 ea.set_port(2);
5930 o.back()->addr = ea;
5931 }
5932
5933 // -- chunk_info_t --
5934
5935 void chunk_info_t::encode(ceph::buffer::list& bl) const
5936 {
5937 ENCODE_START(1, 1, bl);
5938 encode(offset, bl);
5939 encode(length, bl);
5940 encode(oid, bl);
5941 __u32 _flags = flags;
5942 encode(_flags, bl);
5943 ENCODE_FINISH(bl);
5944 }
5945
5946 void chunk_info_t::decode(ceph::buffer::list::const_iterator& bl)
5947 {
5948 DECODE_START(1, bl);
5949 decode(offset, bl);
5950 decode(length, bl);
5951 decode(oid, bl);
5952 __u32 _flags;
5953 decode(_flags, bl);
5954 flags = (cflag_t)_flags;
5955 DECODE_FINISH(bl);
5956 }
5957
5958 void chunk_info_t::dump(Formatter *f) const
5959 {
5960 f->dump_unsigned("length", length);
5961 f->open_object_section("oid");
5962 oid.dump(f);
5963 f->close_section();
5964 f->dump_unsigned("flags", flags);
5965 }
5966
5967
5968 bool chunk_info_t::operator==(const chunk_info_t& cit) const
5969 {
5970 if (has_fingerprint()) {
5971 if (oid.oid.name == cit.oid.oid.name) {
5972 return true;
5973 }
5974 } else {
5975 if (offset == cit.offset && length == cit.length &&
5976 oid.oid.name == cit.oid.oid.name) {
5977 return true;
5978 }
5979
5980 }
5981 return false;
5982 }
5983
5984 bool operator==(const std::pair<const long unsigned int, chunk_info_t> & l,
5985 const std::pair<const long unsigned int, chunk_info_t> & r)
5986 {
5987 return l.first == r.first &&
5988 l.second == r.second;
5989 }
5990
5991 ostream& operator<<(ostream& out, const chunk_info_t& ci)
5992 {
5993 return out << "(len: " << ci.length << " oid: " << ci.oid
5994 << " offset: " << ci.offset
5995 << " flags: " << ci.get_flag_string(ci.flags) << ")";
5996 }
5997
5998 // -- object_manifest_t --
5999
6000 std::ostream& operator<<(std::ostream& out, const object_ref_delta_t & ci)
6001 {
6002 return out << ci.ref_delta << std::endl;
6003 }
6004
6005 void object_manifest_t::calc_refs_to_inc_on_set(
6006 const object_manifest_t* _g,
6007 const object_manifest_t* _l,
6008 object_ref_delta_t &refs) const
6009 {
6010 /* avoid to increment the same reference on adjacent clones */
6011 auto iter = chunk_map.begin();
6012 auto find_chunk = [](decltype(iter) &i, const object_manifest_t* cur)
6013 -> bool {
6014 if (cur) {
6015 auto c = cur->chunk_map.find(i->first);
6016 if (c != cur->chunk_map.end() && c->second == i->second) {
6017 return true;
6018
6019 }
6020 }
6021 return false;
6022 };
6023
6024 /* If at least a same chunk exists on either _g or _l, do not increment
6025 * the reference
6026 *
6027 * head: [0, 2) ccc, [6, 2) bbb, [8, 2) ccc
6028 * 20: [0, 2) aaa, <- set_chunk
6029 * 30: [0, 2) abc, [6, 2) bbb, [8, 2) ccc
6030 * --> incremnt the reference
6031 *
6032 * head: [0, 2) ccc, [6, 2) bbb, [8, 2) ccc
6033 * 20: [0, 2) ccc, <- set_chunk
6034 * 30: [0, 2) abc, [6, 2) bbb, [8, 2) ccc
6035 * --> do not need to increment
6036 *
6037 * head: [0, 2) ccc, [6, 2) bbb, [8, 2) ccc
6038 * 20: [0, 2) ccc, <- set_chunk
6039 * 30: [0, 2) ccc, [6, 2) bbb, [8, 2) ccc
6040 * --> decrement the reference of ccc
6041 *
6042 */
6043 for (; iter != chunk_map.end(); ++iter) {
6044 auto found_g = find_chunk(iter, _g);
6045 auto found_l = find_chunk(iter, _l);
6046 if (!found_g && !found_l) {
6047 refs.inc_ref(iter->second.oid);
6048 } else if (found_g && found_l) {
6049 refs.dec_ref(iter->second.oid);
6050 }
6051 }
6052 }
6053
6054 void object_manifest_t::calc_refs_to_drop_on_modify(
6055 const object_manifest_t* _l,
6056 const ObjectCleanRegions& clean_regions,
6057 object_ref_delta_t &refs) const
6058 {
6059 for (auto &p : chunk_map) {
6060 if (!clean_regions.is_clean_region(p.first, p.second.length)) {
6061 // has previous snapshot
6062 if (_l) {
6063 /*
6064 * Let's assume that there is a manifest snapshotted object which has three chunks
6065 * head: [0, 2) aaa, [6, 2) bbb, [8, 2) ccc
6066 * 20: [0, 2) aaa, [6, 2) bbb, [8, 2) ccc
6067 *
6068 * If we modify [6, 2) at head, we shouldn't decrement bbb's refcount because
6069 * 20 has the reference for bbb. Therefore, we only drop the reference if two chunks
6070 * (head: [6, 2) and 20: [6, 2)) are different.
6071 *
6072 */
6073 auto c = _l->chunk_map.find(p.first);
6074 if (c != _l->chunk_map.end()) {
6075 if (p.second == c->second) {
6076 continue;
6077 }
6078 }
6079 refs.dec_ref(p.second.oid);
6080 } else {
6081 // decrement the reference of the updated chunks if the manifest object has no snapshot
6082 refs.dec_ref(p.second.oid);
6083 }
6084 }
6085 }
6086 }
6087
6088 void object_manifest_t::calc_refs_to_drop_on_removal(
6089 const object_manifest_t* _g,
6090 const object_manifest_t* _l,
6091 object_ref_delta_t &refs) const
6092 {
6093 /* At a high level, the rule is that consecutive clones with the same reference
6094 * at the same offset share a reference. As such, removing *this may result
6095 * in removing references in two cases:
6096 * 1) *this has a reference which it shares with neither _g nor _l
6097 * 2) _g and _l have a reference which they share with each other but not
6098 * *this.
6099 *
6100 * For a particular offset, both 1 and 2 can happen.
6101 *
6102 * Notably, this means that to evaluate the reference change from removing
6103 * the object with *this, we only need to look at the two adjacent clones.
6104 */
6105
6106 // Paper over possibly missing _g or _l -- nullopt is semantically the same
6107 // as an empty chunk_map
6108 static const object_manifest_t empty;
6109 const object_manifest_t &g = _g ? *_g : empty;
6110 const object_manifest_t &l = _l ? *_l : empty;
6111
6112 auto giter = g.chunk_map.begin();
6113 auto iter = chunk_map.begin();
6114 auto liter = l.chunk_map.begin();
6115
6116 // Translate iter, map pair to the current offset, end() -> max
6117 auto get_offset = [](decltype(iter) &i, const object_manifest_t &manifest)
6118 -> uint64_t {
6119 return i == manifest.chunk_map.end() ?
6120 std::numeric_limits<uint64_t>::max() : i->first;
6121 };
6122
6123 /* If current matches the offset at iter, returns the chunk at *iter
6124 * and increments iter. Otherwise, returns nullptr.
6125 *
6126 * current will always be derived from the min of *giter, *iter, and
6127 * *liter on each cycle, so the result will be that each loop iteration
6128 * will pick up all chunks at the offest being considered, each offset
6129 * will be considered once, and all offsets will be considered.
6130 */
6131 auto get_chunk = [](
6132 uint64_t current, decltype(iter) &i, const object_manifest_t &manifest)
6133 -> const chunk_info_t * {
6134 if (i == manifest.chunk_map.end() || current != i->first) {
6135 return nullptr;
6136 } else {
6137 return &(i++)->second;
6138 }
6139 };
6140
6141 while (giter != g.chunk_map.end() ||
6142 iter != chunk_map.end() ||
6143 liter != l.chunk_map.end()) {
6144 auto current = std::min(
6145 std::min(get_offset(giter, g), get_offset(iter, *this)),
6146 get_offset(liter, l));
6147
6148 auto gchunk = get_chunk(current, giter, g);
6149 auto chunk = get_chunk(current, iter, *this);
6150 auto lchunk = get_chunk(current, liter, l);
6151
6152 if (gchunk && lchunk && *gchunk == *lchunk &&
6153 (!chunk || *gchunk != *chunk)) {
6154 // case 1 from above: l and g match, chunk does not
6155 refs.dec_ref(gchunk->oid);
6156 }
6157
6158 if (chunk &&
6159 (!gchunk || chunk->oid != gchunk->oid) &&
6160 (!lchunk || chunk->oid != lchunk->oid)) {
6161 // case 2 from above: *this matches neither
6162 refs.dec_ref(chunk->oid);
6163 }
6164 }
6165 }
6166
6167 void object_manifest_t::encode(ceph::buffer::list& bl) const
6168 {
6169 ENCODE_START(1, 1, bl);
6170 encode(type, bl);
6171 switch (type) {
6172 case TYPE_NONE: break;
6173 case TYPE_REDIRECT:
6174 encode(redirect_target, bl);
6175 break;
6176 case TYPE_CHUNKED:
6177 encode(chunk_map, bl);
6178 break;
6179 default:
6180 ceph_abort();
6181 }
6182 ENCODE_FINISH(bl);
6183 }
6184
6185 void object_manifest_t::decode(ceph::buffer::list::const_iterator& bl)
6186 {
6187 DECODE_START(1, bl);
6188 decode(type, bl);
6189 switch (type) {
6190 case TYPE_NONE: break;
6191 case TYPE_REDIRECT:
6192 decode(redirect_target, bl);
6193 break;
6194 case TYPE_CHUNKED:
6195 decode(chunk_map, bl);
6196 break;
6197 default:
6198 ceph_abort();
6199 }
6200 DECODE_FINISH(bl);
6201 }
6202
6203 void object_manifest_t::dump(Formatter *f) const
6204 {
6205 f->dump_unsigned("type", type);
6206 if (type == TYPE_REDIRECT) {
6207 f->open_object_section("redirect_target");
6208 redirect_target.dump(f);
6209 f->close_section();
6210 } else if (type == TYPE_CHUNKED) {
6211 f->open_array_section("chunk_map");
6212 for (auto& p : chunk_map) {
6213 f->open_object_section("chunk");
6214 f->dump_unsigned("offset", p.first);
6215 p.second.dump(f);
6216 f->close_section();
6217 }
6218 f->close_section();
6219 }
6220 }
6221
6222 void object_manifest_t::generate_test_instances(list<object_manifest_t*>& o)
6223 {
6224 o.push_back(new object_manifest_t());
6225 o.back()->type = TYPE_REDIRECT;
6226 }
6227
6228 ostream& operator<<(ostream& out, const object_manifest_t& om)
6229 {
6230 out << "manifest(" << om.get_type_name();
6231 if (om.is_redirect()) {
6232 out << " " << om.redirect_target;
6233 } else if (om.is_chunked()) {
6234 out << " " << om.chunk_map;
6235 }
6236 out << ")";
6237 return out;
6238 }
6239
6240 // -- object_info_t --
6241
6242 void object_info_t::copy_user_bits(const object_info_t& other)
6243 {
6244 // these bits are copied from head->clone.
6245 size = other.size;
6246 mtime = other.mtime;
6247 local_mtime = other.local_mtime;
6248 last_reqid = other.last_reqid;
6249 truncate_seq = other.truncate_seq;
6250 truncate_size = other.truncate_size;
6251 flags = other.flags;
6252 user_version = other.user_version;
6253 data_digest = other.data_digest;
6254 omap_digest = other.omap_digest;
6255 }
6256
/*
 * Encode this object_info_t (struct version 17, compat version 8).
 *
 * Several slots that are no longer meaningful are still written as
 * placeholders so the byte layout is unchanged; see the inline comments.
 * The field order here must stay in sync with object_info_t::decode().
 *
 * @param bl        buffer to append the encoding to
 * @param features  feature bits, forwarded to the watcher map encoders
 */
void object_info_t::encode(ceph::buffer::list& bl, uint64_t features) const
{
  object_locator_t myoloc(soid);
  // legacy watcher map, keyed only by the second half of each watchers key
  map<entity_name_t, watch_info_t> old_watchers;
  for (auto i = watchers.cbegin(); i != watchers.cend(); ++i) {
    old_watchers.insert(make_pair(i->first.second, i->second));
  }
  ENCODE_START(17, 8, bl);
  encode(soid, bl);
  encode(myoloc, bl);	//Retained for compatibility
  encode((__u32)0, bl); // was category, no longer used
  encode(version, bl);
  encode(prior_version, bl);
  encode(last_reqid, bl);
  encode(size, bl);
  encode(mtime, bl);
  // placeholder whose type depends on whether this is a head or a snap object
  if (soid.snap == CEPH_NOSNAP)
    encode(osd_reqid_t(), bl);  // used to be wrlock_by
  else
    encode((uint32_t)0, bl);    // was legacy_snaps
  encode(truncate_seq, bl);
  encode(truncate_size, bl);
  // decode() reads this slot back as a one-byte legacy flags value
  encode(is_lost(), bl);
  encode(old_watchers, bl, features);
  /* shenanigans to avoid breaking backwards compatibility in the disk format.
   * When we can, switch this out for simply putting the version_t on disk. */
  eversion_t user_eversion(0, user_version);
  encode(user_eversion, bl);
  encode(test_flag(FLAG_USES_TMAP), bl);
  encode(watchers, bl, features);
  // the full flag word is written as 32 bits
  __u32 _flags = flags;
  encode(_flags, bl);
  encode(local_mtime, bl);
  encode(data_digest, bl);
  encode(omap_digest, bl);
  encode(expected_object_size, bl);
  encode(expected_write_size, bl);
  encode(alloc_hint_flags, bl);
  // the manifest is only present on the wire when has_manifest() is true
  if (has_manifest()) {
    encode(manifest, bl);
  }
  ENCODE_FINISH(bl);
}
6300
/*
 * Decode an object_info_t, accepting encodings up to struct version 17.
 *
 * Fields introduced by later struct versions are decoded only when struct_v
 * is high enough; for older encodings they are reset to the defaults shown
 * in the corresponding else branches.
 */
void object_info_t::decode(ceph::buffer::list::const_iterator& bl)
{
  object_locator_t myoloc;
  DECODE_START_LEGACY_COMPAT_LEN(17, 8, 8, bl);
  map<entity_name_t, watch_info_t> old_watchers;
  decode(soid, bl);
  decode(myoloc, bl);
  {
    string category;
    decode(category, bl);  // no longer used
  }
  decode(version, bl);
  decode(prior_version, bl);
  decode(last_reqid, bl);
  decode(size, bl);
  decode(mtime, bl);
  // consume and discard the placeholder; its type depends on soid.snap
  if (soid.snap == CEPH_NOSNAP) {
    osd_reqid_t wrlock_by;
    decode(wrlock_by, bl);
  } else {
    vector<snapid_t> legacy_snaps;
    decode(legacy_snaps, bl);
  }
  decode(truncate_seq, bl);
  decode(truncate_size, bl);

  // if this is struct_v >= 13, we will overwrite this
  // below since this field is just here for backwards
  // compatibility
  __u8 lo;
  decode(lo, bl);
  flags = (flag_t)lo;

  decode(old_watchers, bl);
  // only the version half of the encoded eversion_t is kept
  eversion_t user_eversion;
  decode(user_eversion, bl);
  user_version = user_eversion.version;

  if (struct_v >= 9) {
    bool uses_tmap = false;
    decode(uses_tmap, bl);
    if (uses_tmap)
      set_flag(FLAG_USES_TMAP);
  } else {
    // encodings older than v9 have no uses_tmap field; the flag is always set
    set_flag(FLAG_USES_TMAP);
  }
  // before v10 the pool is taken from the separately encoded locator
  if (struct_v < 10)
    soid.pool = myoloc.pool;
  if (struct_v >= 11) {
    decode(watchers, bl);
  } else {
    // rebuild the (cookie, entity name)-keyed map from the legacy map
    for (auto i = old_watchers.begin(); i != old_watchers.end(); ++i) {
      watchers.insert(
	make_pair(
	  make_pair(i->second.cookie, i->first), i->second));
    }
  }
  if (struct_v >= 13) {
    // full 32-bit flag word; replaces the one-byte value decoded above
    __u32 _flags;
    decode(_flags, bl);
    flags = (flag_t)_flags;
  }
  if (struct_v >= 14) {
    decode(local_mtime, bl);
  } else {
    local_mtime = utime_t();
  }
  if (struct_v >= 15) {
    decode(data_digest, bl);
    decode(omap_digest, bl);
  } else {
    // no digests in this encoding: reset them and clear the matching flags
    data_digest = omap_digest = -1;
    clear_flag(FLAG_DATA_DIGEST);
    clear_flag(FLAG_OMAP_DIGEST);
  }
  if (struct_v >= 16) {
    decode(expected_object_size, bl);
    decode(expected_write_size, bl);
    decode(alloc_hint_flags, bl);
  } else {
    expected_object_size = 0;
    expected_write_size = 0;
    alloc_hint_flags = 0;
  }
  if (struct_v >= 17) {
    // mirrors encode(): the manifest is only present when has_manifest()
    if (has_manifest()) {
      decode(manifest, bl);
    }
  }
  DECODE_FINISH(bl);
}
6392
6393 void object_info_t::dump(Formatter *f) const
6394 {
6395 f->open_object_section("oid");
6396 soid.dump(f);
6397 f->close_section();
6398 f->dump_stream("version") << version;
6399 f->dump_stream("prior_version") << prior_version;
6400 f->dump_stream("last_reqid") << last_reqid;
6401 f->dump_unsigned("user_version", user_version);
6402 f->dump_unsigned("size", size);
6403 f->dump_stream("mtime") << mtime;
6404 f->dump_stream("local_mtime") << local_mtime;
6405 f->dump_unsigned("lost", (int)is_lost());
6406 vector<string> sv = get_flag_vector(flags);
6407 f->open_array_section("flags");
6408 for (const auto& str: sv) {
6409 f->dump_string("flags", str);
6410 }
6411 f->close_section();
6412 f->dump_unsigned("truncate_seq", truncate_seq);
6413 f->dump_unsigned("truncate_size", truncate_size);
6414 f->dump_format("data_digest", "0x%08x", data_digest);
6415 f->dump_format("omap_digest", "0x%08x", omap_digest);
6416 f->dump_unsigned("expected_object_size", expected_object_size);
6417 f->dump_unsigned("expected_write_size", expected_write_size);
6418 f->dump_unsigned("alloc_hint_flags", alloc_hint_flags);
6419 f->dump_object("manifest", manifest);
6420 f->open_object_section("watchers");
6421 for (auto p = watchers.cbegin(); p != watchers.cend(); ++p) {
6422 CachedStackStringStream css;
6423 *css << p->first.second;
6424 f->open_object_section(css->strv());
6425 p->second.dump(f);
6426 f->close_section();
6427 }
6428 f->close_section();
6429 }
6430
6431 void object_info_t::generate_test_instances(list<object_info_t*>& o)
6432 {
6433 o.push_back(new object_info_t());
6434
6435 // fixme
6436 }
6437
6438
6439 ostream& operator<<(ostream& out, const object_info_t& oi)
6440 {
6441 out << oi.soid << "(" << oi.version
6442 << " " << oi.last_reqid;
6443 if (oi.flags)
6444 out << " " << oi.get_flag_string();
6445 out << " s " << oi.size;
6446 out << " uv " << oi.user_version;
6447 if (oi.is_data_digest())
6448 out << " dd " << std::hex << oi.data_digest << std::dec;
6449 if (oi.is_omap_digest())
6450 out << " od " << std::hex << oi.omap_digest << std::dec;
6451 out << " alloc_hint [" << oi.expected_object_size
6452 << " " << oi.expected_write_size
6453 << " " << oi.alloc_hint_flags << "]";
6454 if (oi.has_manifest())
6455 out << " " << oi.manifest;
6456 out << ")";
6457 return out;
6458 }
6459
6460 // -- ObjectRecovery --
// Encode the recovery progress (struct version 1). The field order must
// match ObjectRecoveryProgress::decode().
void ObjectRecoveryProgress::encode(ceph::buffer::list &bl) const
{
  ENCODE_START(1, 1, bl);
  encode(first, bl);
  encode(data_complete, bl);
  encode(data_recovered_to, bl);
  encode(omap_recovered_to, bl);
  encode(omap_complete, bl);
  ENCODE_FINISH(bl);
}
6471
// Decode the recovery progress; fields are read in the order written by
// ObjectRecoveryProgress::encode().
void ObjectRecoveryProgress::decode(ceph::buffer::list::const_iterator &bl)
{
  DECODE_START(1, bl);
  decode(first, bl);
  decode(data_complete, bl);
  decode(data_recovered_to, bl);
  decode(omap_recovered_to, bl);
  decode(omap_complete, bl);
  DECODE_FINISH(bl);
}
6482
6483 ostream &operator<<(ostream &out, const ObjectRecoveryProgress &prog)
6484 {
6485 return prog.print(out);
6486 }
6487
6488 void ObjectRecoveryProgress::generate_test_instances(
6489 list<ObjectRecoveryProgress*>& o)
6490 {
6491 o.push_back(new ObjectRecoveryProgress);
6492 o.back()->first = false;
6493 o.back()->data_complete = true;
6494 o.back()->omap_complete = true;
6495 o.back()->data_recovered_to = 100;
6496
6497 o.push_back(new ObjectRecoveryProgress);
6498 o.back()->first = true;
6499 o.back()->data_complete = false;
6500 o.back()->omap_complete = false;
6501 o.back()->data_recovered_to = 0;
6502 }
6503
6504 ostream &ObjectRecoveryProgress::print(ostream &out) const
6505 {
6506 return out << "ObjectRecoveryProgress("
6507 << ( first ? "" : "!" ) << "first, "
6508 << "data_recovered_to:" << data_recovered_to
6509 << ", data_complete:" << ( data_complete ? "true" : "false" )
6510 << ", omap_recovered_to:" << omap_recovered_to
6511 << ", omap_complete:" << ( omap_complete ? "true" : "false" )
6512 << ", error:" << ( error ? "true" : "false" )
6513 << ")";
6514 }
6515
// Dump the progress fields to a Formatter. The boolean members are emitted
// as integers under keys ending in '?'.
void ObjectRecoveryProgress::dump(Formatter *f) const
{
  f->dump_int("first?", first);
  f->dump_int("data_complete?", data_complete);
  f->dump_unsigned("data_recovered_to", data_recovered_to);
  f->dump_int("omap_complete?", omap_complete);
  f->dump_string("omap_recovered_to", omap_recovered_to);
}
6524
/*
 * Encode the recovery info (struct version 3, compat version 1). The field
 * order must match ObjectRecoveryInfo::decode().
 *
 * @param bl        buffer to append the encoding to
 * @param features  feature bits, forwarded to the object_info_t encoder
 */
void ObjectRecoveryInfo::encode(ceph::buffer::list &bl, uint64_t features) const
{
  ENCODE_START(3, 1, bl);
  encode(soid, bl);
  encode(version, bl);
  encode(size, bl);
  encode(oi, bl, features);
  encode(ss, bl);
  encode(copy_subset, bl);
  encode(clone_subset, bl);
  encode(object_exist, bl);
  ENCODE_FINISH(bl);
}
6538
6539 void ObjectRecoveryInfo::decode(ceph::buffer::list::const_iterator &bl,
6540 int64_t pool)
6541 {
6542 DECODE_START(3, bl);
6543 decode(soid, bl);
6544 decode(version, bl);
6545 decode(size, bl);
6546 decode(oi, bl);
6547 decode(ss, bl);
6548 decode(copy_subset, bl);
6549 decode(clone_subset, bl);
6550 if (struct_v > 2)
6551 decode(object_exist, bl);
6552 else
6553 object_exist = false;
6554 DECODE_FINISH(bl);
6555 if (struct_v < 2) {
6556 if (!soid.is_max() && soid.pool == -1)
6557 soid.pool = pool;
6558 map<hobject_t, interval_set<uint64_t>> tmp;
6559 tmp.swap(clone_subset);
6560 for (auto i = tmp.begin(); i != tmp.end(); ++i) {
6561 hobject_t first(i->first);
6562 if (!first.is_max() && first.pool == -1)
6563 first.pool = pool;
6564 clone_subset[first].swap(i->second);
6565 }
6566 }
6567 }
6568
6569 void ObjectRecoveryInfo::generate_test_instances(
6570 list<ObjectRecoveryInfo*>& o)
6571 {
6572 o.push_back(new ObjectRecoveryInfo);
6573 o.back()->soid = hobject_t(sobject_t("key", CEPH_NOSNAP));
6574 o.back()->version = eversion_t(0,0);
6575 o.back()->size = 100;
6576 o.back()->object_exist = false;
6577 }
6578
6579
6580 void ObjectRecoveryInfo::dump(Formatter *f) const
6581 {
6582 f->dump_stream("object") << soid;
6583 f->dump_stream("at_version") << version;
6584 f->dump_stream("size") << size;
6585 {
6586 f->open_object_section("object_info");
6587 oi.dump(f);
6588 f->close_section();
6589 }
6590 {
6591 f->open_object_section("snapset");
6592 ss.dump(f);
6593 f->close_section();
6594 }
6595 f->dump_stream("copy_subset") << copy_subset;
6596 f->dump_stream("clone_subset") << clone_subset;
6597 f->dump_stream("object_exist") << object_exist;
6598 }
6599
6600 ostream& operator<<(ostream& out, const ObjectRecoveryInfo &inf)
6601 {
6602 return inf.print(out);
6603 }
6604
6605 ostream &ObjectRecoveryInfo::print(ostream &out) const
6606 {
6607 return out << "ObjectRecoveryInfo("
6608 << soid << "@" << version
6609 << ", size: " << size
6610 << ", copy_subset: " << copy_subset
6611 << ", clone_subset: " << clone_subset
6612 << ", snapset: " << ss
6613 << ", object_exist: " << object_exist
6614 << ")";
6615 }
6616
6617 // -- PushReplyOp --
6618 void PushReplyOp::generate_test_instances(list<PushReplyOp*> &o)
6619 {
6620 o.push_back(new PushReplyOp);
6621 o.push_back(new PushReplyOp);
6622 o.back()->soid = hobject_t(sobject_t("asdf", 2));
6623 o.push_back(new PushReplyOp);
6624 o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP));
6625 }
6626
// Encode the push reply (struct version 1): only the object id is sent.
void PushReplyOp::encode(ceph::buffer::list &bl) const
{
  ENCODE_START(1, 1, bl);
  encode(soid, bl);
  ENCODE_FINISH(bl);
}
6633
// Decode a push reply written by PushReplyOp::encode().
void PushReplyOp::decode(ceph::buffer::list::const_iterator &bl)
{
  DECODE_START(1, bl);
  decode(soid, bl);
  DECODE_FINISH(bl);
}
6640
// Dump the push reply to a Formatter (just the object id).
void PushReplyOp::dump(Formatter *f) const
{
  f->dump_stream("soid") << soid;
}
6645
6646 ostream &PushReplyOp::print(ostream &out) const
6647 {
6648 return out
6649 << "PushReplyOp(" << soid
6650 << ")";
6651 }
6652
6653 ostream& operator<<(ostream& out, const PushReplyOp &op)
6654 {
6655 return op.print(out);
6656 }
6657
6658 uint64_t PushReplyOp::cost(CephContext *cct) const
6659 {
6660
6661 return cct->_conf->osd_push_per_object_cost +
6662 cct->_conf->osd_recovery_max_chunk;
6663 }
6664
6665 // -- PullOp --
6666 void PullOp::generate_test_instances(list<PullOp*> &o)
6667 {
6668 o.push_back(new PullOp);
6669 o.push_back(new PullOp);
6670 o.back()->soid = hobject_t(sobject_t("asdf", 2));
6671 o.back()->recovery_info.version = eversion_t(3, 10);
6672 o.push_back(new PullOp);
6673 o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP));
6674 o.back()->recovery_info.version = eversion_t(0, 0);
6675 }
6676
/*
 * Encode the pull op (struct version 1). The field order must match
 * PullOp::decode().
 *
 * @param bl        buffer to append the encoding to
 * @param features  feature bits, forwarded to the recovery_info encoder
 */
void PullOp::encode(ceph::buffer::list &bl, uint64_t features) const
{
  ENCODE_START(1, 1, bl);
  encode(soid, bl);
  encode(recovery_info, bl, features);
  encode(recovery_progress, bl);
  ENCODE_FINISH(bl);
}
6685
// Decode a pull op; fields are read in the order written by PullOp::encode().
void PullOp::decode(ceph::buffer::list::const_iterator &bl)
{
  DECODE_START(1, bl);
  decode(soid, bl);
  decode(recovery_info, bl);
  decode(recovery_progress, bl);
  DECODE_FINISH(bl);
}
6694
6695 void PullOp::dump(Formatter *f) const
6696 {
6697 f->dump_stream("soid") << soid;
6698 {
6699 f->open_object_section("recovery_info");
6700 recovery_info.dump(f);
6701 f->close_section();
6702 }
6703 {
6704 f->open_object_section("recovery_progress");
6705 recovery_progress.dump(f);
6706 f->close_section();
6707 }
6708 }
6709
6710 ostream &PullOp::print(ostream &out) const
6711 {
6712 return out
6713 << "PullOp(" << soid
6714 << ", recovery_info: " << recovery_info
6715 << ", recovery_progress: " << recovery_progress
6716 << ")";
6717 }
6718
6719 ostream& operator<<(ostream& out, const PullOp &op)
6720 {
6721 return op.print(out);
6722 }
6723
6724 uint64_t PullOp::cost(CephContext *cct) const
6725 {
6726 return cct->_conf->osd_push_per_object_cost +
6727 cct->_conf->osd_recovery_max_chunk;
6728 }
6729
6730 // -- PushOp --
6731 void PushOp::generate_test_instances(list<PushOp*> &o)
6732 {
6733 o.push_back(new PushOp);
6734 o.push_back(new PushOp);
6735 o.back()->soid = hobject_t(sobject_t("asdf", 2));
6736 o.back()->version = eversion_t(3, 10);
6737 o.push_back(new PushOp);
6738 o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP));
6739 o.back()->version = eversion_t(0, 0);
6740 }
6741
/*
 * Encode the push op (struct version 1). The field order must match
 * PushOp::decode().
 *
 * @param bl        buffer to append the encoding to
 * @param features  feature bits, forwarded to the recovery_info encoder
 */
void PushOp::encode(ceph::buffer::list &bl, uint64_t features) const
{
  ENCODE_START(1, 1, bl);
  encode(soid, bl);
  encode(version, bl);
  encode(data, bl);
  encode(data_included, bl);
  encode(omap_header, bl);
  encode(omap_entries, bl);
  encode(attrset, bl);
  encode(recovery_info, bl, features);
  encode(after_progress, bl);
  encode(before_progress, bl);
  ENCODE_FINISH(bl);
}
6757
// Decode a push op; fields are read in the order written by PushOp::encode().
void PushOp::decode(ceph::buffer::list::const_iterator &bl)
{
  DECODE_START(1, bl);
  decode(soid, bl);
  decode(version, bl);
  decode(data, bl);
  decode(data_included, bl);
  decode(omap_header, bl);
  decode(omap_entries, bl);
  decode(attrset, bl);
  decode(recovery_info, bl);
  decode(after_progress, bl);
  decode(before_progress, bl);
  DECODE_FINISH(bl);
}
6773
6774 void PushOp::dump(Formatter *f) const
6775 {
6776 f->dump_stream("soid") << soid;
6777 f->dump_stream("version") << version;
6778 f->dump_int("data_len", data.length());
6779 f->dump_stream("data_included") << data_included;
6780 f->dump_int("omap_header_len", omap_header.length());
6781 f->dump_int("omap_entries_len", omap_entries.size());
6782 f->dump_int("attrset_len", attrset.size());
6783 {
6784 f->open_object_section("recovery_info");
6785 recovery_info.dump(f);
6786 f->close_section();
6787 }
6788 {
6789 f->open_object_section("after_progress");
6790 after_progress.dump(f);
6791 f->close_section();
6792 }
6793 {
6794 f->open_object_section("before_progress");
6795 before_progress.dump(f);
6796 f->close_section();
6797 }
6798 }
6799
6800 ostream &PushOp::print(ostream &out) const
6801 {
6802 return out
6803 << "PushOp(" << soid
6804 << ", version: " << version
6805 << ", data_included: " << data_included
6806 << ", data_size: " << data.length()
6807 << ", omap_header_size: " << omap_header.length()
6808 << ", omap_entries_size: " << omap_entries.size()
6809 << ", attrset_size: " << attrset.size()
6810 << ", recovery_info: " << recovery_info
6811 << ", after_progress: " << after_progress
6812 << ", before_progress: " << before_progress
6813 << ")";
6814 }
6815
6816 ostream& operator<<(ostream& out, const PushOp &op)
6817 {
6818 return op.print(out);
6819 }
6820
6821 uint64_t PushOp::cost(CephContext *cct) const
6822 {
6823 uint64_t cost = data_included.size();
6824 for (auto i = omap_entries.cbegin(); i != omap_entries.cend(); ++i) {
6825 cost += i->second.length();
6826 }
6827 cost += cct->_conf->osd_push_per_object_cost;
6828 return cost;
6829 }
6830
6831 // -- ScrubMap --
6832
6833 void ScrubMap::merge_incr(const ScrubMap &l)
6834 {
6835 ceph_assert(valid_through == l.incr_since);
6836 valid_through = l.valid_through;
6837
6838 for (auto p = l.objects.cbegin(); p != l.objects.cend(); ++p){
6839 if (p->second.negative) {
6840 auto q = objects.find(p->first);
6841 if (q != objects.end()) {
6842 objects.erase(q);
6843 }
6844 } else {
6845 objects[p->first] = p->second;
6846 }
6847 }
6848 }
6849
// Encode the scrub map (struct version 3, compat version 2). Two slots that
// are no longer used are still written as empty placeholders.
void ScrubMap::encode(ceph::buffer::list& bl) const
{
  ENCODE_START(3, 2, bl);
  encode(objects, bl);
  encode((__u32)0, bl); // used to be attrs; now deprecated
  ceph::buffer::list old_logbl;  // not used
  encode(old_logbl, bl);
  encode(valid_through, bl);
  encode(incr_since, bl);
  ENCODE_FINISH(bl);
}
6861
6862 void ScrubMap::decode(ceph::buffer::list::const_iterator& bl, int64_t pool)
6863 {
6864 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
6865 decode(objects, bl);
6866 {
6867 map<string,string> attrs; // deprecated
6868 decode(attrs, bl);
6869 }
6870 ceph::buffer::list old_logbl; // not used
6871 decode(old_logbl, bl);
6872 decode(valid_through, bl);
6873 decode(incr_since, bl);
6874 DECODE_FINISH(bl);
6875
6876 // handle hobject_t upgrade
6877 if (struct_v < 3) {
6878 map<hobject_t, object> tmp;
6879 tmp.swap(objects);
6880 for (auto i = tmp.begin(); i != tmp.end(); ++i) {
6881 hobject_t first(i->first);
6882 if (!first.is_max() && first.pool == -1)
6883 first.pool = pool;
6884 objects[first] = i->second;
6885 }
6886 }
6887 }
6888
6889 void ScrubMap::dump(Formatter *f) const
6890 {
6891 f->dump_stream("valid_through") << valid_through;
6892 f->dump_stream("incremental_since") << incr_since;
6893 f->open_array_section("objects");
6894 for (auto p = objects.cbegin(); p != objects.cend(); ++p) {
6895 f->open_object_section("object");
6896 f->dump_string("name", p->first.oid.name);
6897 f->dump_unsigned("hash", p->first.get_hash());
6898 f->dump_string("key", p->first.get_key());
6899 f->dump_int("snapid", p->first.snap);
6900 p->second.dump(f);
6901 f->close_section();
6902 }
6903 f->close_section();
6904 }
6905
6906 void ScrubMap::generate_test_instances(list<ScrubMap*>& o)
6907 {
6908 o.push_back(new ScrubMap);
6909 o.push_back(new ScrubMap);
6910 o.back()->valid_through = eversion_t(1, 2);
6911 o.back()->incr_since = eversion_t(3, 4);
6912 list<object*> obj;
6913 object::generate_test_instances(obj);
6914 o.back()->objects[hobject_t(object_t("foo"), "fookey", 123, 456, 0, "")] = *obj.back();
6915 obj.pop_back();
6916 o.back()->objects[hobject_t(object_t("bar"), string(), 123, 456, 0, "")] = *obj.back();
6917 }
6918
6919 // -- ScrubMap::object --
6920
/*
 * Encode a scrub map object entry (struct version 10, compat version 7).
 *
 * compat_read_error is the OR of read_error, ec_hash_mismatch and
 * ec_size_mismatch, written in addition to the three individual flags.
 * Two obsolete slots (nlinks, snapcolls) are still written as zeros.
 */
void ScrubMap::object::encode(ceph::buffer::list& bl) const
{
  bool compat_read_error = read_error || ec_hash_mismatch || ec_size_mismatch;
  ENCODE_START(10, 7, bl);
  encode(size, bl);
  encode(negative, bl);
  encode(attrs, bl);
  encode(digest, bl);
  encode(digest_present, bl);
  encode((uint32_t)0, bl);  // obsolete nlinks
  encode((uint32_t)0, bl);  // snapcolls
  encode(omap_digest, bl);
  encode(omap_digest_present, bl);
  encode(compat_read_error, bl);
  encode(stat_error, bl);
  encode(read_error, bl);
  encode(ec_hash_mismatch, bl);
  encode(ec_size_mismatch, bl);
  encode(large_omap_object_found, bl);
  encode(large_omap_object_key_count, bl);
  encode(large_omap_object_value_size, bl);
  encode(object_omap_bytes, bl);
  encode(object_omap_keys, bl);
  ENCODE_FINISH(bl);
}
6946
/*
 * Decode a scrub map object entry, accepting encodings up to struct
 * version 10.
 *
 * Boolean members are decoded through the temporary `tmp` and then assigned.
 * Fields added in struct versions 8, 9 and 10 are only read when struct_v is
 * high enough.
 */
void ScrubMap::object::decode(ceph::buffer::list::const_iterator& bl)
{
  DECODE_START(10, bl);
  decode(size, bl);
  bool tmp, compat_read_error = false;
  decode(tmp, bl);
  negative = tmp;
  decode(attrs, bl);
  decode(digest, bl);
  decode(tmp, bl);
  digest_present = tmp;
  {
    // obsolete slots: decoded and discarded
    uint32_t nlinks;
    decode(nlinks, bl);
    set<snapid_t> snapcolls;
    decode(snapcolls, bl);
  }
  decode(omap_digest, bl);
  decode(tmp, bl);
  omap_digest_present = tmp;
  decode(compat_read_error, bl);
  decode(tmp, bl);
  stat_error = tmp;
  if (struct_v >= 8) {
    // individual error flags
    decode(tmp, bl);
    read_error = tmp;
    decode(tmp, bl);
    ec_hash_mismatch = tmp;
    decode(tmp, bl);
    ec_size_mismatch = tmp;
  }
  // If older encoder found a read_error, set read_error
  if (compat_read_error && !read_error && !ec_hash_mismatch && !ec_size_mismatch)
    read_error = true;
  if (struct_v >= 9) {
    decode(tmp, bl);
    large_omap_object_found = tmp;
    decode(large_omap_object_key_count, bl);
    decode(large_omap_object_value_size, bl);
  }
  if (struct_v >= 10) {
    decode(object_omap_bytes, bl);
    decode(object_omap_keys, bl);
  }
  DECODE_FINISH(bl);
}
6993
6994 void ScrubMap::object::dump(Formatter *f) const
6995 {
6996 f->dump_int("size", size);
6997 f->dump_int("negative", negative);
6998 f->open_array_section("attrs");
6999 for (auto p = attrs.cbegin(); p != attrs.cend(); ++p) {
7000 f->open_object_section("attr");
7001 f->dump_string("name", p->first);
7002 f->dump_int("length", p->second.length());
7003 f->close_section();
7004 }
7005 f->close_section();
7006 }
7007
7008 void ScrubMap::object::generate_test_instances(list<object*>& o)
7009 {
7010 o.push_back(new object);
7011 o.push_back(new object);
7012 o.back()->negative = true;
7013 o.push_back(new object);
7014 o.back()->size = 123;
7015 o.back()->attrs["foo"] = ceph::buffer::copy("foo", 3);
7016 o.back()->attrs["bar"] = ceph::buffer::copy("barval", 6);
7017 }
7018
7019 // -- OSDOp --
7020
7021 ostream& operator<<(ostream& out, const OSDOp& op)
7022 {
7023 out << ceph_osd_op_name(op.op.op);
7024 if (ceph_osd_op_type_data(op.op.op)) {
7025 // data extent
7026 switch (op.op.op) {
7027 case CEPH_OSD_OP_ASSERT_VER:
7028 out << " v" << op.op.assert_ver.ver;
7029 break;
7030 case CEPH_OSD_OP_TRUNCATE:
7031 out << " " << op.op.extent.offset;
7032 break;
7033 case CEPH_OSD_OP_MASKTRUNC:
7034 case CEPH_OSD_OP_TRIMTRUNC:
7035 out << " " << op.op.extent.truncate_seq << "@"
7036 << (int64_t)op.op.extent.truncate_size;
7037 break;
7038 case CEPH_OSD_OP_ROLLBACK:
7039 out << " " << snapid_t(op.op.snap.snapid);
7040 break;
7041 case CEPH_OSD_OP_WATCH:
7042 out << " " << ceph_osd_watch_op_name(op.op.watch.op)
7043 << " cookie " << op.op.watch.cookie;
7044 if (op.op.watch.gen)
7045 out << " gen " << op.op.watch.gen;
7046 break;
7047 case CEPH_OSD_OP_NOTIFY:
7048 out << " cookie " << op.op.notify.cookie;
7049 break;
7050 case CEPH_OSD_OP_COPY_GET:
7051 out << " max " << op.op.copy_get.max;
7052 break;
7053 case CEPH_OSD_OP_COPY_FROM:
7054 out << " ver " << op.op.copy_from.src_version;
7055 break;
7056 case CEPH_OSD_OP_SETALLOCHINT:
7057 out << " object_size " << op.op.alloc_hint.expected_object_size
7058 << " write_size " << op.op.alloc_hint.expected_write_size;
7059 break;
7060 case CEPH_OSD_OP_READ:
7061 case CEPH_OSD_OP_SPARSE_READ:
7062 case CEPH_OSD_OP_SYNC_READ:
7063 case CEPH_OSD_OP_WRITE:
7064 case CEPH_OSD_OP_WRITEFULL:
7065 case CEPH_OSD_OP_ZERO:
7066 case CEPH_OSD_OP_APPEND:
7067 case CEPH_OSD_OP_MAPEXT:
7068 case CEPH_OSD_OP_CMPEXT:
7069 out << " " << op.op.extent.offset << "~" << op.op.extent.length;
7070 if (op.op.extent.truncate_seq)
7071 out << " [" << op.op.extent.truncate_seq << "@"
7072 << (int64_t)op.op.extent.truncate_size << "]";
7073 if (op.op.flags)
7074 out << " [" << ceph_osd_op_flag_string(op.op.flags) << "]";
7075 default:
7076 // don't show any arg info
7077 break;
7078 }
7079 } else if (ceph_osd_op_type_attr(op.op.op)) {
7080 // xattr name
7081 if (op.op.xattr.name_len && op.indata.length()) {
7082 out << " ";
7083 op.indata.write(0, op.op.xattr.name_len, out);
7084 }
7085 if (op.op.xattr.value_len)
7086 out << " (" << op.op.xattr.value_len << ")";
7087 if (op.op.op == CEPH_OSD_OP_CMPXATTR)
7088 out << " op " << (int)op.op.xattr.cmp_op
7089 << " mode " << (int)op.op.xattr.cmp_mode;
7090 } else if (ceph_osd_op_type_exec(op.op.op)) {
7091 // class.method
7092 if (op.op.cls.class_len && op.indata.length()) {
7093 out << " ";
7094 op.indata.write(0, op.op.cls.class_len, out);
7095 out << ".";
7096 op.indata.write(op.op.cls.class_len, op.op.cls.method_len, out);
7097 }
7098 } else if (ceph_osd_op_type_pg(op.op.op)) {
7099 switch (op.op.op) {
7100 case CEPH_OSD_OP_PGLS:
7101 case CEPH_OSD_OP_PGLS_FILTER:
7102 case CEPH_OSD_OP_PGNLS:
7103 case CEPH_OSD_OP_PGNLS_FILTER:
7104 out << " start_epoch " << op.op.pgls.start_epoch;
7105 break;
7106 case CEPH_OSD_OP_PG_HITSET_LS:
7107 break;
7108 case CEPH_OSD_OP_PG_HITSET_GET:
7109 out << " " << utime_t(op.op.hit_set_get.stamp);
7110 break;
7111 case CEPH_OSD_OP_SCRUBLS:
7112 break;
7113 }
7114 }
7115 if (op.indata.length()) {
7116 out << " in=" << op.indata.length() << "b";
7117 }
7118 if (op.outdata.length()) {
7119 out << " out=" << op.outdata.length() << "b";
7120 }
7121 return out;
7122 }
7123
7124
7125 void OSDOp::split_osd_op_vector_out_data(vector<OSDOp>& ops, ceph::buffer::list& in)
7126 {
7127 auto datap = in.begin();
7128 for (unsigned i = 0; i < ops.size(); i++) {
7129 if (ops[i].op.payload_len) {
7130 datap.copy(ops[i].op.payload_len, ops[i].outdata);
7131 }
7132 }
7133 }
7134
7135 void OSDOp::merge_osd_op_vector_out_data(vector<OSDOp>& ops, ceph::buffer::list& out)
7136 {
7137 for (unsigned i = 0; i < ops.size(); i++) {
7138 ops[i].op.payload_len = ops[i].outdata.length();
7139 if (ops[i].outdata.length()) {
7140 out.append(ops[i].outdata);
7141 }
7142 }
7143 }
7144
/*
 * Fill *km with the key/value pairs needed to persist a PG's info.
 *
 * Depending on the arguments this writes the epoch key, and then either only
 * the fastinfo key (fast path) or the full info key, plus the biginfo key
 * when dirty_big_info is set. last_written_info is updated to reflect what
 * was encoded.
 *
 * @param km                 output map of keys to encoded values
 * @param key_to_remove      set to the fastinfo key when last_update did not
 *                           advance and the fast path was not eligible
 * @param epoch              encoded under the epoch key when dirty_epoch
 * @param info               the info to persist
 * @param last_written_info  the previously persisted info; updated in place
 * @param past_intervals     encoded into the biginfo value
 * @param dirty_big_info     forces a full write including biginfo
 * @param dirty_epoch        whether to write the epoch key
 * @param try_fast_info      whether the fastinfo path may be attempted
 * @param logger             optional perf counters (may be null)
 * @param dpp                optional log prefix provider (may be null)
 * @return always 0
 */
int prepare_info_keymap(
  CephContext* cct,
  map<string,bufferlist> *km,
  string *key_to_remove,
  epoch_t epoch,
  pg_info_t &info,
  pg_info_t &last_written_info,
  PastIntervals &past_intervals,
  bool dirty_big_info,
  bool dirty_epoch,
  bool try_fast_info,
  PerfCounters *logger,
  DoutPrefixProvider *dpp)
{
  if (dirty_epoch) {
    encode(epoch, (*km)[string(epoch_key)]);
  }

  if (logger)
    logger->inc(l_osd_pg_info);

  // try to do info efficiently?
  if (!dirty_big_info && try_fast_info &&
      info.last_update > last_written_info.last_update) {
    pg_fast_info_t fast;
    fast.populate_from(info);
    bool did = fast.try_apply_to(&last_written_info);
    ceph_assert(did);  // we verified last_update increased above
    // the fast path is only valid if applying the fast info reproduces info
    if (info == last_written_info) {
      encode(fast, (*km)[string(fastinfo_key)]);
      if (logger)
	logger->inc(l_osd_pg_fastinfo);
      return 0;
    }
    // otherwise log both versions (level 30) and fall through to a full write
    if (dpp) {
      ldpp_dout(dpp, 30) << __func__ << " fastinfo failed, info:\n";
      {
	JSONFormatter jf(true);
	jf.dump_object("info", info);
	jf.flush(*_dout);
      }
      {
	*_dout << "\nlast_written_info:\n";
	JSONFormatter jf(true);
	jf.dump_object("last_written_info", last_written_info);
	jf.flush(*_dout);
      }
      *_dout << dendl;
    }
  } else if (info.last_update <= last_written_info.last_update) {
    // clean up any potentially stale fastinfo key resulting from last_update
    // not moving forwards (e.g., a backwards jump during peering)
    *key_to_remove = fastinfo_key;
  }

  last_written_info = info;

  // info.  store purged_snaps separately.
  // (purged_snaps is swapped out while info is encoded, then swapped back)
  interval_set<snapid_t> purged_snaps;
  purged_snaps.swap(info.purged_snaps);
  encode(info, (*km)[string(info_key)]);
  purged_snaps.swap(info.purged_snaps);

  if (dirty_big_info) {
    // potentially big stuff
    bufferlist& bigbl = (*km)[string(biginfo_key)];
    encode(past_intervals, bigbl);
    encode(info.purged_snaps, bigbl);
    //dout(20) << "write_info bigbl " << bigbl.length() << dendl;
    if (logger)
      logger->inc(l_osd_pg_biginfo);
  }

  return 0;
}
7220
7221 void create_pg_collection(
7222 ceph::os::Transaction& t, spg_t pgid, int bits)
7223 {
7224 coll_t coll(pgid);
7225 t.create_collection(coll, bits);
7226 }
7227
7228 void init_pg_ondisk(
7229 ceph::os::Transaction& t,
7230 spg_t pgid,
7231 const pg_pool_t *pool)
7232 {
7233 coll_t coll(pgid);
7234 if (pool) {
7235 // Give a hint to the PG collection
7236 bufferlist hint;
7237 uint32_t pg_num = pool->get_pg_num();
7238 uint64_t expected_num_objects_pg = pool->expected_num_objects / pg_num;
7239 encode(pg_num, hint);
7240 encode(expected_num_objects_pg, hint);
7241 uint32_t hint_type = ceph::os::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS;
7242 t.collection_hint(coll, hint_type, hint);
7243 }
7244
7245 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
7246 t.touch(coll, pgmeta_oid);
7247 map<string,bufferlist> values;
7248 __u8 struct_v = pg_latest_struct_v;
7249 encode(struct_v, values[string(infover_key)]);
7250 t.omap_setkeys(coll, pgmeta_oid, values);
7251 }
7252
7253 PGLSFilter::PGLSFilter() : cct(nullptr)
7254 {
7255 }
7256
7257 PGLSFilter::~PGLSFilter()
7258 {
7259 }
7260
7261 int PGLSPlainFilter::init(ceph::bufferlist::const_iterator &params)
7262 {
7263 try {
7264 decode(xattr, params);
7265 decode(val, params);
7266 } catch (ceph::buffer::error &e) {
7267 return -EINVAL;
7268 }
7269 return 0;
7270 }
7271
7272 bool PGLSPlainFilter::filter(const hobject_t& obj,
7273 const ceph::bufferlist& xattr_data) const
7274 {
7275 return xattr_data.contents_equal(val.c_str(), val.size());
7276 }