]> git.proxmox.com Git - ceph.git/blob - ceph/src/osd/osd_types.cc
import ceph 14.2.5
[ceph.git] / ceph / src / osd / osd_types.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2011 New Dream Network
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
18 #include <boost/assign/list_of.hpp>
19
20 #include "osd_types.h"
21 #include "include/ceph_features.h"
22 #include "include/stringify.h"
23 extern "C" {
24 #include "crush/hash.h"
25 }
26 #include "OSDMap.h"
27
28 const char *ceph_osd_flag_name(unsigned flag)
29 {
30 switch (flag) {
31 case CEPH_OSD_FLAG_ACK: return "ack";
32 case CEPH_OSD_FLAG_ONNVRAM: return "onnvram";
33 case CEPH_OSD_FLAG_ONDISK: return "ondisk";
34 case CEPH_OSD_FLAG_RETRY: return "retry";
35 case CEPH_OSD_FLAG_READ: return "read";
36 case CEPH_OSD_FLAG_WRITE: return "write";
37 case CEPH_OSD_FLAG_ORDERSNAP: return "ordersnap";
38 case CEPH_OSD_FLAG_PEERSTAT_OLD: return "peerstat_old";
39 case CEPH_OSD_FLAG_BALANCE_READS: return "balance_reads";
40 case CEPH_OSD_FLAG_PARALLELEXEC: return "parallelexec";
41 case CEPH_OSD_FLAG_PGOP: return "pgop";
42 case CEPH_OSD_FLAG_EXEC: return "exec";
43 case CEPH_OSD_FLAG_EXEC_PUBLIC: return "exec_public";
44 case CEPH_OSD_FLAG_LOCALIZE_READS: return "localize_reads";
45 case CEPH_OSD_FLAG_RWORDERED: return "rwordered";
46 case CEPH_OSD_FLAG_IGNORE_CACHE: return "ignore_cache";
47 case CEPH_OSD_FLAG_SKIPRWLOCKS: return "skiprwlocks";
48 case CEPH_OSD_FLAG_IGNORE_OVERLAY: return "ignore_overlay";
49 case CEPH_OSD_FLAG_FLUSH: return "flush";
50 case CEPH_OSD_FLAG_MAP_SNAP_CLONE: return "map_snap_clone";
51 case CEPH_OSD_FLAG_ENFORCE_SNAPC: return "enforce_snapc";
52 case CEPH_OSD_FLAG_REDIRECTED: return "redirected";
53 case CEPH_OSD_FLAG_KNOWN_REDIR: return "known_if_redirected";
54 case CEPH_OSD_FLAG_FULL_TRY: return "full_try";
55 case CEPH_OSD_FLAG_FULL_FORCE: return "full_force";
56 case CEPH_OSD_FLAG_IGNORE_REDIRECT: return "ignore_redirect";
57 default: return "???";
58 }
59 }
60
61 string ceph_osd_flag_string(unsigned flags)
62 {
63 string s;
64 for (unsigned i=0; i<32; ++i) {
65 if (flags & (1u<<i)) {
66 if (s.length())
67 s += "+";
68 s += ceph_osd_flag_name(1u << i);
69 }
70 }
71 if (s.length())
72 return s;
73 return string("-");
74 }
75
76 const char * ceph_osd_op_flag_name(unsigned flag)
77 {
78 const char *name;
79
80 switch(flag) {
81 case CEPH_OSD_OP_FLAG_EXCL:
82 name = "excl";
83 break;
84 case CEPH_OSD_OP_FLAG_FAILOK:
85 name = "failok";
86 break;
87 case CEPH_OSD_OP_FLAG_FADVISE_RANDOM:
88 name = "fadvise_random";
89 break;
90 case CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL:
91 name = "fadvise_sequential";
92 break;
93 case CEPH_OSD_OP_FLAG_FADVISE_WILLNEED:
94 name = "favise_willneed";
95 break;
96 case CEPH_OSD_OP_FLAG_FADVISE_DONTNEED:
97 name = "fadvise_dontneed";
98 break;
99 case CEPH_OSD_OP_FLAG_FADVISE_NOCACHE:
100 name = "fadvise_nocache";
101 break;
102 case CEPH_OSD_OP_FLAG_WITH_REFERENCE:
103 name = "with_reference";
104 break;
105 case CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE:
106 name = "bypass_clean_cache";
107 break;
108 default:
109 name = "???";
110 };
111
112 return name;
113 }
114
115 string ceph_osd_op_flag_string(unsigned flags)
116 {
117 string s;
118 for (unsigned i=0; i<32; ++i) {
119 if (flags & (1u<<i)) {
120 if (s.length())
121 s += "+";
122 s += ceph_osd_op_flag_name(1u << i);
123 }
124 }
125 if (s.length())
126 return s;
127 return string("-");
128 }
129
130 string ceph_osd_alloc_hint_flag_string(unsigned flags)
131 {
132 string s;
133 for (unsigned i=0; i<32; ++i) {
134 if (flags & (1u<<i)) {
135 if (s.length())
136 s += "+";
137 s += ceph_osd_alloc_hint_flag_name(1u << i);
138 }
139 }
140 if (s.length())
141 return s;
142 return string("-");
143 }
144
145 void pg_shard_t::encode(bufferlist &bl) const
146 {
147 ENCODE_START(1, 1, bl);
148 encode(osd, bl);
149 encode(shard, bl);
150 ENCODE_FINISH(bl);
151 }
152 void pg_shard_t::decode(bufferlist::const_iterator &bl)
153 {
154 DECODE_START(1, bl);
155 decode(osd, bl);
156 decode(shard, bl);
157 DECODE_FINISH(bl);
158 }
159
160 ostream &operator<<(ostream &lhs, const pg_shard_t &rhs)
161 {
162 if (rhs.is_undefined())
163 return lhs << "?";
164 if (rhs.shard == shard_id_t::NO_SHARD)
165 return lhs << rhs.get_osd();
166 return lhs << rhs.get_osd() << '(' << (unsigned)(rhs.shard) << ')';
167 }
168
169 void dump(Formatter* f, const osd_alerts_t& alerts)
170 {
171 for (auto& a : alerts) {
172 string s0 = " osd: ";
173 s0 += stringify(a.first);
174 string s;
175 for (auto& aa : a.second) {
176 s = s0;
177 s += " ";
178 s += aa.first;
179 s += ":";
180 s += aa.second;
181 f->dump_string("alert", s);
182 }
183 }
184 }
185
186 // -- osd_reqid_t --
187 void osd_reqid_t::dump(Formatter *f) const
188 {
189 f->dump_stream("name") << name;
190 f->dump_int("inc", inc);
191 f->dump_unsigned("tid", tid);
192 }
193
194 void osd_reqid_t::generate_test_instances(list<osd_reqid_t*>& o)
195 {
196 o.push_back(new osd_reqid_t);
197 o.push_back(new osd_reqid_t(entity_name_t::CLIENT(123), 1, 45678));
198 }
199
200 // -- object_locator_t --
201
202 void object_locator_t::encode(bufferlist& bl) const
203 {
204 // verify that nobody's corrupted the locator
205 ceph_assert(hash == -1 || key.empty());
206 __u8 encode_compat = 3;
207 ENCODE_START(6, encode_compat, bl);
208 encode(pool, bl);
209 int32_t preferred = -1; // tell old code there is no preferred osd (-1).
210 encode(preferred, bl);
211 encode(key, bl);
212 encode(nspace, bl);
213 encode(hash, bl);
214 if (hash != -1)
215 encode_compat = std::max<std::uint8_t>(encode_compat, 6); // need to interpret the hash
216 ENCODE_FINISH_NEW_COMPAT(bl, encode_compat);
217 }
218
219 void object_locator_t::decode(bufferlist::const_iterator& p)
220 {
221 DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, p);
222 if (struct_v < 2) {
223 int32_t op;
224 decode(op, p);
225 pool = op;
226 int16_t pref;
227 decode(pref, p);
228 } else {
229 decode(pool, p);
230 int32_t preferred;
231 decode(preferred, p);
232 }
233 decode(key, p);
234 if (struct_v >= 5)
235 decode(nspace, p);
236 if (struct_v >= 6)
237 decode(hash, p);
238 else
239 hash = -1;
240 DECODE_FINISH(p);
241 // verify that nobody's corrupted the locator
242 ceph_assert(hash == -1 || key.empty());
243 }
244
245 void object_locator_t::dump(Formatter *f) const
246 {
247 f->dump_int("pool", pool);
248 f->dump_string("key", key);
249 f->dump_string("namespace", nspace);
250 f->dump_int("hash", hash);
251 }
252
253 void object_locator_t::generate_test_instances(list<object_locator_t*>& o)
254 {
255 o.push_back(new object_locator_t);
256 o.push_back(new object_locator_t(123));
257 o.push_back(new object_locator_t(123, 876));
258 o.push_back(new object_locator_t(1, "n2"));
259 o.push_back(new object_locator_t(1234, "", "key"));
260 o.push_back(new object_locator_t(12, "n1", "key2"));
261 }
262
263 // -- request_redirect_t --
264 void request_redirect_t::encode(bufferlist& bl) const
265 {
266 ENCODE_START(1, 1, bl);
267 encode(redirect_locator, bl);
268 encode(redirect_object, bl);
269 // legacy of the removed osd_instructions member
270 encode((uint32_t)0, bl);
271 ENCODE_FINISH(bl);
272 }
273
274 void request_redirect_t::decode(bufferlist::const_iterator& bl)
275 {
276 DECODE_START(1, bl);
277 uint32_t legacy_osd_instructions_len;
278 decode(redirect_locator, bl);
279 decode(redirect_object, bl);
280 decode(legacy_osd_instructions_len, bl);
281 if (legacy_osd_instructions_len) {
282 bl.advance(legacy_osd_instructions_len);
283 }
284 DECODE_FINISH(bl);
285 }
286
287 void request_redirect_t::dump(Formatter *f) const
288 {
289 f->dump_string("object", redirect_object);
290 f->open_object_section("locator");
291 redirect_locator.dump(f);
292 f->close_section(); // locator
293 }
294
295 void request_redirect_t::generate_test_instances(list<request_redirect_t*>& o)
296 {
297 object_locator_t loc(1, "redir_obj");
298 o.push_back(new request_redirect_t());
299 o.push_back(new request_redirect_t(loc, 0));
300 o.push_back(new request_redirect_t(loc, "redir_obj"));
301 o.push_back(new request_redirect_t(loc));
302 }
303
304 void objectstore_perf_stat_t::dump(Formatter *f) const
305 {
306 // *_ms values just for compatibility.
307 f->dump_float("commit_latency_ms", os_commit_latency_ns / 1000000.0);
308 f->dump_float("apply_latency_ms", os_apply_latency_ns / 1000000.0);
309 f->dump_unsigned("commit_latency_ns", os_commit_latency_ns);
310 f->dump_unsigned("apply_latency_ns", os_apply_latency_ns);
311 }
312
313 void objectstore_perf_stat_t::encode(bufferlist &bl, uint64_t features) const
314 {
315 uint8_t target_v = 2;
316 if (!HAVE_FEATURE(features, OS_PERF_STAT_NS)) {
317 target_v = 1;
318 }
319 ENCODE_START(target_v, target_v, bl);
320 if (target_v >= 2) {
321 encode(os_commit_latency_ns, bl);
322 encode(os_apply_latency_ns, bl);
323 } else {
324 constexpr auto NS_PER_MS = std::chrono::nanoseconds(1ms).count();
325 uint32_t commit_latency_ms = os_commit_latency_ns / NS_PER_MS;
326 uint32_t apply_latency_ms = os_apply_latency_ns / NS_PER_MS;
327 encode(commit_latency_ms, bl); // for compatibility with older monitor.
328 encode(apply_latency_ms, bl); // for compatibility with older monitor.
329 }
330 ENCODE_FINISH(bl);
331 }
332
333 void objectstore_perf_stat_t::decode(bufferlist::const_iterator &bl)
334 {
335 DECODE_START(2, bl);
336 if (struct_v >= 2) {
337 decode(os_commit_latency_ns, bl);
338 decode(os_apply_latency_ns, bl);
339 } else {
340 uint32_t commit_latency_ms;
341 uint32_t apply_latency_ms;
342 decode(commit_latency_ms, bl);
343 decode(apply_latency_ms, bl);
344 constexpr auto NS_PER_MS = std::chrono::nanoseconds(1ms).count();
345 os_commit_latency_ns = commit_latency_ms * NS_PER_MS;
346 os_apply_latency_ns = apply_latency_ms * NS_PER_MS;
347 }
348 DECODE_FINISH(bl);
349 }
350
351 void objectstore_perf_stat_t::generate_test_instances(std::list<objectstore_perf_stat_t*>& o)
352 {
353 o.push_back(new objectstore_perf_stat_t());
354 o.push_back(new objectstore_perf_stat_t());
355 o.back()->os_commit_latency_ns = 20000000;
356 o.back()->os_apply_latency_ns = 30000000;
357 }
358
359 // -- osd_stat_t --
360 void osd_stat_t::dump(Formatter *f) const
361 {
362 f->dump_unsigned("up_from", up_from);
363 f->dump_unsigned("seq", seq);
364 f->dump_unsigned("num_pgs", num_pgs);
365 f->dump_unsigned("num_osds", num_osds);
366 f->dump_unsigned("num_per_pool_osds", num_per_pool_osds);
367
368 /// dump legacy stats fields to ensure backward compatibility.
369 f->dump_unsigned("kb", statfs.kb());
370 f->dump_unsigned("kb_used", statfs.kb_used_raw());
371 f->dump_unsigned("kb_used_data", statfs.kb_used_data());
372 f->dump_unsigned("kb_used_omap", statfs.kb_used_omap());
373 f->dump_unsigned("kb_used_meta", statfs.kb_used_internal_metadata());
374 f->dump_unsigned("kb_avail", statfs.kb_avail());
375 ////////////////////
376
377 f->open_object_section("statfs");
378 statfs.dump(f);
379 f->close_section();
380 f->open_array_section("hb_peers");
381 for (auto p : hb_peers)
382 f->dump_int("osd", p);
383 f->close_section();
384 f->dump_int("snap_trim_queue_len", snap_trim_queue_len);
385 f->dump_int("num_snap_trimming", num_snap_trimming);
386 f->dump_int("num_shards_repaired", num_shards_repaired);
387 f->open_object_section("op_queue_age_hist");
388 op_queue_age_hist.dump(f);
389 f->close_section();
390 f->open_object_section("perf_stat");
391 os_perf_stat.dump(f);
392 f->close_section();
393 f->open_array_section("alerts");
394 ::dump(f, os_alerts);
395 f->close_section();
396 f->open_array_section("network_ping_times");
397 for (auto &i : hb_pingtime) {
398 f->open_object_section("entry");
399 f->dump_int("osd", i.first);
400 const time_t lu(i.second.last_update);
401 char buffer[26];
402 string lustr(ctime_r(&lu, buffer));
403 lustr.pop_back(); // Remove trailing \n
404 f->dump_string("last update", lustr);
405 f->open_array_section("interfaces");
406 f->open_object_section("interface");
407 f->dump_string("interface", "back");
408 f->open_object_section("average");
409 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(i.second.back_pingtime[0],3).c_str());
410 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(i.second.back_pingtime[1],3).c_str());
411 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(i.second.back_pingtime[2],3).c_str());
412 f->close_section(); // average
413 f->open_object_section("min");
414 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(i.second.back_min[0],3).c_str());
415 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(i.second.back_min[1],3).c_str());
416 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(i.second.back_min[2],3).c_str());
417 f->close_section(); // min
418 f->open_object_section("max");
419 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(i.second.back_max[0],3).c_str());
420 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(i.second.back_max[1],3).c_str());
421 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(i.second.back_max[2],3).c_str());
422 f->close_section(); // max
423 f->dump_format_unquoted("last", "%s", fixed_u_to_string(i.second.back_last,3).c_str());
424 f->close_section(); // interface
425
426 if (i.second.front_pingtime[0] != 0) {
427 f->open_object_section("interface");
428 f->dump_string("interface", "front");
429 f->open_object_section("average");
430 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(i.second.front_pingtime[0],3).c_str());
431 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(i.second.front_pingtime[1],3).c_str());
432 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(i.second.front_pingtime[2],3).c_str());
433 f->close_section(); // average
434 f->open_object_section("min");
435 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(i.second.front_min[0],3).c_str());
436 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(i.second.front_min[1],3).c_str());
437 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(i.second.front_min[2],3).c_str());
438 f->close_section(); // min
439 f->open_object_section("max");
440 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(i.second.front_max[0],3).c_str());
441 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(i.second.front_max[1],3).c_str());
442 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(i.second.front_max[2],3).c_str());
443 f->close_section(); // max
444 f->dump_format_unquoted("last", "%s", fixed_u_to_string(i.second.front_last,3).c_str());
445 f->close_section(); // interface
446 }
447 f->close_section(); // interfaces
448 f->close_section(); // entry
449 }
450 f->close_section(); // network_ping_time
451 }
452
453 void osd_stat_t::encode(bufferlist &bl, uint64_t features) const
454 {
455 ENCODE_START(14, 2, bl);
456
457 //////// for compatibility ////////
458 int64_t kb = statfs.kb();
459 int64_t kb_used = statfs.kb_used_raw();
460 int64_t kb_avail = statfs.kb_avail();
461 encode(kb, bl);
462 encode(kb_used, bl);
463 encode(kb_avail, bl);
464 ///////////////////////////////////
465
466 encode(snap_trim_queue_len, bl);
467 encode(num_snap_trimming, bl);
468 encode(hb_peers, bl);
469 encode((uint32_t)0, bl);
470 encode(op_queue_age_hist, bl);
471 encode(os_perf_stat, bl, features);
472 encode(up_from, bl);
473 encode(seq, bl);
474 encode(num_pgs, bl);
475
476 //////// for compatibility ////////
477 int64_t kb_used_data = statfs.kb_used_data();
478 int64_t kb_used_omap = statfs.kb_used_omap();
479 int64_t kb_used_meta = statfs.kb_used_internal_metadata();
480 encode(kb_used_data, bl);
481 encode(kb_used_omap, bl);
482 encode(kb_used_meta, bl);
483 encode(statfs, bl);
484 ///////////////////////////////////
485 encode(os_alerts, bl);
486 encode(num_shards_repaired, bl);
487 encode(num_osds, bl);
488 encode(num_per_pool_osds, bl);
489
490 encode((uint32_t)0, bl); // compatibility
491
492 // hb_pingtime map
493 encode((int)hb_pingtime.size(), bl);
494 for (auto i : hb_pingtime) {
495 encode(i.first, bl); // osd
496 encode(i.second.last_update, bl);
497 encode(i.second.back_pingtime[0], bl);
498 encode(i.second.back_pingtime[1], bl);
499 encode(i.second.back_pingtime[2], bl);
500 encode(i.second.back_min[0], bl);
501 encode(i.second.back_min[1], bl);
502 encode(i.second.back_min[2], bl);
503 encode(i.second.back_max[0], bl);
504 encode(i.second.back_max[1], bl);
505 encode(i.second.back_max[2], bl);
506 encode(i.second.back_last, bl);
507 encode(i.second.front_pingtime[0], bl);
508 encode(i.second.front_pingtime[1], bl);
509 encode(i.second.front_pingtime[2], bl);
510 encode(i.second.front_min[0], bl);
511 encode(i.second.front_min[1], bl);
512 encode(i.second.front_min[2], bl);
513 encode(i.second.front_max[0], bl);
514 encode(i.second.front_max[1], bl);
515 encode(i.second.front_max[2], bl);
516 encode(i.second.front_last, bl);
517 }
518 ENCODE_FINISH(bl);
519 }
520
521 void osd_stat_t::decode(bufferlist::const_iterator &bl)
522 {
523 int64_t kb, kb_used,kb_avail;
524 int64_t kb_used_data, kb_used_omap, kb_used_meta;
525 DECODE_START_LEGACY_COMPAT_LEN(14, 2, 2, bl);
526 decode(kb, bl);
527 decode(kb_used, bl);
528 decode(kb_avail, bl);
529 decode(snap_trim_queue_len, bl);
530 decode(num_snap_trimming, bl);
531 decode(hb_peers, bl);
532 vector<int> num_hb_out;
533 decode(num_hb_out, bl);
534 if (struct_v >= 3)
535 decode(op_queue_age_hist, bl);
536 if (struct_v >= 4)
537 decode(os_perf_stat, bl);
538 if (struct_v >= 6) {
539 decode(up_from, bl);
540 decode(seq, bl);
541 }
542 if (struct_v >= 7) {
543 decode(num_pgs, bl);
544 }
545 if (struct_v >= 8) {
546 decode(kb_used_data, bl);
547 decode(kb_used_omap, bl);
548 decode(kb_used_meta, bl);
549 } else {
550 kb_used_data = kb_used;
551 kb_used_omap = 0;
552 kb_used_meta = 0;
553 }
554 if (struct_v >= 9) {
555 decode(statfs, bl);
556 } else {
557 statfs.reset();
558 statfs.total = kb << 10;
559 statfs.available = kb_avail << 10;
560 // actually it's totally unexpected to have ststfs.total < statfs.available
561 // here but unfortunately legacy generate_test_instances produced such a
562 // case hence inserting some handling rather than assert
563 statfs.internally_reserved =
564 statfs.total > statfs.available ? statfs.total - statfs.available : 0;
565 kb_used <<= 10;
566 if ((int64_t)statfs.internally_reserved > kb_used) {
567 statfs.internally_reserved -= kb_used;
568 } else {
569 statfs.internally_reserved = 0;
570 }
571 statfs.allocated = kb_used_data << 10;
572 statfs.omap_allocated = kb_used_omap << 10;
573 statfs.internal_metadata = kb_used_meta << 10;
574 }
575 if (struct_v >= 10) {
576 decode(os_alerts, bl);
577 } else {
578 os_alerts.clear();
579 }
580 if (struct_v >= 11) {
581 decode(num_shards_repaired, bl);
582 } else {
583 num_shards_repaired = 0;
584 }
585 if (struct_v >= 12) {
586 decode(num_osds, bl);
587 decode(num_per_pool_osds, bl);
588 } else {
589 num_osds = 0;
590 num_per_pool_osds = 0;
591 }
592 // Compatibility num_per_pool_omap_osds
593 if (struct_v >= 13) {
594 uint32_t dummy;
595 decode(dummy, bl);
596 }
597 hb_pingtime.clear();
598 if (struct_v >= 14) {
599 int count;
600 decode(count, bl);
601 for (int i = 0 ; i < count ; i++) {
602 int osd;
603 decode(osd, bl);
604 struct Interfaces ifs;
605 decode(ifs.last_update, bl);
606 decode(ifs.back_pingtime[0],bl);
607 decode(ifs.back_pingtime[1], bl);
608 decode(ifs.back_pingtime[2], bl);
609 decode(ifs.back_min[0],bl);
610 decode(ifs.back_min[1], bl);
611 decode(ifs.back_min[2], bl);
612 decode(ifs.back_max[0],bl);
613 decode(ifs.back_max[1], bl);
614 decode(ifs.back_max[2], bl);
615 decode(ifs.back_last, bl);
616 decode(ifs.front_pingtime[0], bl);
617 decode(ifs.front_pingtime[1], bl);
618 decode(ifs.front_pingtime[2], bl);
619 decode(ifs.front_min[0], bl);
620 decode(ifs.front_min[1], bl);
621 decode(ifs.front_min[2], bl);
622 decode(ifs.front_max[0], bl);
623 decode(ifs.front_max[1], bl);
624 decode(ifs.front_max[2], bl);
625 decode(ifs.front_last, bl);
626 hb_pingtime[osd] = ifs;
627 }
628 }
629 DECODE_FINISH(bl);
630 }
631
632 void osd_stat_t::generate_test_instances(std::list<osd_stat_t*>& o)
633 {
634 o.push_back(new osd_stat_t);
635
636 o.push_back(new osd_stat_t);
637 list<store_statfs_t*> ll;
638 store_statfs_t::generate_test_instances(ll);
639 o.back()->statfs = *ll.back();
640 o.back()->hb_peers.push_back(7);
641 o.back()->snap_trim_queue_len = 8;
642 o.back()->num_snap_trimming = 99;
643 o.back()->num_shards_repaired = 101;
644 o.back()->os_alerts[0].emplace(
645 "some alert", "some alert details");
646 o.back()->os_alerts[1].emplace(
647 "some alert2", "some alert2 details");
648 struct Interfaces gen_interfaces = {
649 123456789, { 1000, 900, 800 }, { 990, 890, 790 }, { 1010, 910, 810 }, 1001,
650 { 1100, 1000, 900 }, { 1090, 990, 890 }, { 1110, 1010, 910 }, 1101 };
651 o.back()->hb_pingtime[20] = gen_interfaces;
652 gen_interfaces = {
653 987654321, { 100, 200, 300 }, { 90, 190, 290 }, { 110, 210, 310 }, 101 };
654 o.back()->hb_pingtime[30] = gen_interfaces;
655 }
656
657 // -- pg_t --
658
659 int pg_t::print(char *o, int maxlen) const
660 {
661 return snprintf(o, maxlen, "%llu.%x", (unsigned long long)pool(), ps());
662 }
663
664 bool pg_t::parse(const char *s)
665 {
666 uint64_t ppool;
667 uint32_t pseed;
668 int r = sscanf(s, "%llu.%x", (long long unsigned *)&ppool, &pseed);
669 if (r < 2)
670 return false;
671 m_pool = ppool;
672 m_seed = pseed;
673 return true;
674 }
675
676 bool spg_t::parse(const char *s)
677 {
678 shard = shard_id_t::NO_SHARD;
679 uint64_t ppool;
680 uint32_t pseed;
681 uint32_t pshard;
682 int r = sscanf(s, "%llu.%x", (long long unsigned *)&ppool, &pseed);
683 if (r < 2)
684 return false;
685 pgid.set_pool(ppool);
686 pgid.set_ps(pseed);
687
688 const char *p = strchr(s, 's');
689 if (p) {
690 r = sscanf(p, "s%u", &pshard);
691 if (r == 1) {
692 shard = shard_id_t(pshard);
693 } else {
694 return false;
695 }
696 }
697 return true;
698 }
699
700 char *spg_t::calc_name(char *buf, const char *suffix_backwords) const
701 {
702 while (*suffix_backwords)
703 *--buf = *suffix_backwords++;
704
705 if (!is_no_shard()) {
706 buf = ritoa<uint8_t, 10>((uint8_t)shard.id, buf);
707 *--buf = 's';
708 }
709
710 return pgid.calc_name(buf, "");
711 }
712
713 ostream& operator<<(ostream& out, const spg_t &pg)
714 {
715 char buf[spg_t::calc_name_buf_size];
716 buf[spg_t::calc_name_buf_size - 1] = '\0';
717 out << pg.calc_name(buf + spg_t::calc_name_buf_size - 1, "");
718 return out;
719 }
720
721 pg_t pg_t::get_ancestor(unsigned old_pg_num) const
722 {
723 int old_bits = cbits(old_pg_num);
724 int old_mask = (1 << old_bits) - 1;
725 pg_t ret = *this;
726 ret.m_seed = ceph_stable_mod(m_seed, old_pg_num, old_mask);
727 return ret;
728 }
729
730 bool pg_t::is_split(unsigned old_pg_num, unsigned new_pg_num, set<pg_t> *children) const
731 {
732 //ceph_assert(m_seed < old_pg_num);
733 if (m_seed >= old_pg_num) {
734 // degenerate case
735 return false;
736 }
737 if (new_pg_num <= old_pg_num)
738 return false;
739
740 bool split = false;
741 if (true) {
742 unsigned old_bits = cbits(old_pg_num);
743 unsigned old_mask = (1 << old_bits) - 1;
744 for (unsigned n = 1; ; n++) {
745 unsigned next_bit = (n << (old_bits-1));
746 unsigned s = next_bit | m_seed;
747
748 if (s < old_pg_num || s == m_seed)
749 continue;
750 if (s >= new_pg_num)
751 break;
752 if ((unsigned)ceph_stable_mod(s, old_pg_num, old_mask) == m_seed) {
753 split = true;
754 if (children)
755 children->insert(pg_t(s, m_pool));
756 }
757 }
758 }
759 if (false) {
760 // brute force
761 int old_bits = cbits(old_pg_num);
762 int old_mask = (1 << old_bits) - 1;
763 for (unsigned x = old_pg_num; x < new_pg_num; ++x) {
764 unsigned o = ceph_stable_mod(x, old_pg_num, old_mask);
765 if (o == m_seed) {
766 split = true;
767 children->insert(pg_t(x, m_pool));
768 }
769 }
770 }
771 return split;
772 }
773
774 unsigned pg_t::get_split_bits(unsigned pg_num) const {
775 if (pg_num == 1)
776 return 0;
777 ceph_assert(pg_num > 1);
778
779 // Find unique p such that pg_num \in [2^(p-1), 2^p)
780 unsigned p = cbits(pg_num);
781 ceph_assert(p); // silence coverity #751330
782
783 if ((m_seed % (1<<(p-1))) < (pg_num % (1<<(p-1))))
784 return p;
785 else
786 return p - 1;
787 }
788
789 bool pg_t::is_merge_source(
790 unsigned old_pg_num,
791 unsigned new_pg_num,
792 pg_t *parent) const
793 {
794 if (m_seed < old_pg_num &&
795 m_seed >= new_pg_num) {
796 if (parent) {
797 pg_t t = *this;
798 while (t.m_seed >= new_pg_num) {
799 t = t.get_parent();
800 }
801 *parent = t;
802 }
803 return true;
804 }
805 return false;
806 }
807
808 pg_t pg_t::get_parent() const
809 {
810 unsigned bits = cbits(m_seed);
811 ceph_assert(bits);
812 pg_t retval = *this;
813 retval.m_seed &= ~((~0)<<(bits - 1));
814 return retval;
815 }
816
817 hobject_t pg_t::get_hobj_start() const
818 {
819 return hobject_t(object_t(), string(), 0, m_seed, m_pool,
820 string());
821 }
822
823 hobject_t pg_t::get_hobj_end(unsigned pg_num) const
824 {
825 // note: this assumes a bitwise sort; with the legacy nibblewise
826 // sort a PG did not always cover a single contiguous range of the
827 // (bit-reversed) hash range.
828 unsigned bits = get_split_bits(pg_num);
829 uint64_t rev_start = hobject_t::_reverse_bits(m_seed);
830 uint64_t rev_end = (rev_start | (0xffffffff >> bits)) + 1;
831 if (rev_end >= 0x100000000) {
832 ceph_assert(rev_end == 0x100000000);
833 return hobject_t::get_max();
834 } else {
835 return hobject_t(object_t(), string(), CEPH_NOSNAP,
836 hobject_t::_reverse_bits(rev_end), m_pool,
837 string());
838 }
839 }
840
841 void pg_t::dump(Formatter *f) const
842 {
843 f->dump_unsigned("pool", m_pool);
844 f->dump_unsigned("seed", m_seed);
845 }
846
847 void pg_t::generate_test_instances(list<pg_t*>& o)
848 {
849 o.push_back(new pg_t);
850 o.push_back(new pg_t(1, 2));
851 o.push_back(new pg_t(13123, 3));
852 o.push_back(new pg_t(131223, 4));
853 }
854
855 char *pg_t::calc_name(char *buf, const char *suffix_backwords) const
856 {
857 while (*suffix_backwords)
858 *--buf = *suffix_backwords++;
859
860 buf = ritoa<uint32_t, 16>(m_seed, buf);
861
862 *--buf = '.';
863
864 return ritoa<uint64_t, 10>(m_pool, buf);
865 }
866
867 ostream& operator<<(ostream& out, const pg_t &pg)
868 {
869 char buf[pg_t::calc_name_buf_size];
870 buf[pg_t::calc_name_buf_size - 1] = '\0';
871 out << pg.calc_name(buf + pg_t::calc_name_buf_size - 1, "");
872 return out;
873 }
874
875
876 // -- coll_t --
877
878 void coll_t::calc_str()
879 {
880 switch (type) {
881 case TYPE_META:
882 strcpy(_str_buff, "meta");
883 _str = _str_buff;
884 break;
885 case TYPE_PG:
886 _str_buff[spg_t::calc_name_buf_size - 1] = '\0';
887 _str = pgid.calc_name(_str_buff + spg_t::calc_name_buf_size - 1, "daeh_");
888 break;
889 case TYPE_PG_TEMP:
890 _str_buff[spg_t::calc_name_buf_size - 1] = '\0';
891 _str = pgid.calc_name(_str_buff + spg_t::calc_name_buf_size - 1, "PMET_");
892 break;
893 default:
894 ceph_abort_msg("unknown collection type");
895 }
896 }
897
898 bool coll_t::parse(const std::string& s)
899 {
900 if (s == "meta") {
901 type = TYPE_META;
902 pgid = spg_t();
903 removal_seq = 0;
904 calc_str();
905 ceph_assert(s == _str);
906 return true;
907 }
908 if (s.find("_head") == s.length() - 5 &&
909 pgid.parse(s.substr(0, s.length() - 5))) {
910 type = TYPE_PG;
911 removal_seq = 0;
912 calc_str();
913 ceph_assert(s == _str);
914 return true;
915 }
916 if (s.find("_TEMP") == s.length() - 5 &&
917 pgid.parse(s.substr(0, s.length() - 5))) {
918 type = TYPE_PG_TEMP;
919 removal_seq = 0;
920 calc_str();
921 ceph_assert(s == _str);
922 return true;
923 }
924 return false;
925 }
926
927 void coll_t::encode(bufferlist& bl) const
928 {
929 using ceph::encode;
930 // when changing this, remember to update encoded_size() too.
931 if (is_temp()) {
932 // can't express this as v2...
933 __u8 struct_v = 3;
934 encode(struct_v, bl);
935 encode(to_str(), bl);
936 } else {
937 __u8 struct_v = 2;
938 encode(struct_v, bl);
939 encode((__u8)type, bl);
940 encode(pgid, bl);
941 snapid_t snap = CEPH_NOSNAP;
942 encode(snap, bl);
943 }
944 }
945
946 size_t coll_t::encoded_size() const
947 {
948 size_t r = sizeof(__u8);
949 if (is_temp()) {
950 // v3
951 r += sizeof(__u32);
952 if (_str) {
953 r += strlen(_str);
954 }
955 } else {
956 // v2
957 // 1. type
958 r += sizeof(__u8);
959 // 2. pgid
960 // - encoding header
961 r += sizeof(ceph_le32) + 2 * sizeof(__u8);
962 // - pg_t
963 r += sizeof(__u8) + sizeof(uint64_t) + 2 * sizeof(uint32_t);
964 // - shard_id_t
965 r += sizeof(int8_t);
966 // 3. snapid_t
967 r += sizeof(uint64_t);
968 }
969
970 return r;
971 }
972
973 void coll_t::decode(bufferlist::const_iterator& bl)
974 {
975 using ceph::decode;
976 __u8 struct_v;
977 decode(struct_v, bl);
978 switch (struct_v) {
979 case 1:
980 {
981 snapid_t snap;
982 decode(pgid, bl);
983 decode(snap, bl);
984
985 // infer the type
986 if (pgid == spg_t() && snap == 0) {
987 type = TYPE_META;
988 } else {
989 type = TYPE_PG;
990 }
991 removal_seq = 0;
992 }
993 break;
994
995 case 2:
996 {
997 __u8 _type;
998 snapid_t snap;
999 decode(_type, bl);
1000 decode(pgid, bl);
1001 decode(snap, bl);
1002 type = (type_t)_type;
1003 removal_seq = 0;
1004 }
1005 break;
1006
1007 case 3:
1008 {
1009 string str;
1010 decode(str, bl);
1011 bool ok = parse(str);
1012 if (!ok)
1013 throw std::domain_error(std::string("unable to parse pg ") + str);
1014 }
1015 break;
1016
1017 default:
1018 {
1019 ostringstream oss;
1020 oss << "coll_t::decode(): don't know how to decode version "
1021 << struct_v;
1022 throw std::domain_error(oss.str());
1023 }
1024 }
1025 }
1026
1027 void coll_t::dump(Formatter *f) const
1028 {
1029 f->dump_unsigned("type_id", (unsigned)type);
1030 if (type != TYPE_META)
1031 f->dump_stream("pgid") << pgid;
1032 f->dump_string("name", to_str());
1033 }
1034
1035 void coll_t::generate_test_instances(list<coll_t*>& o)
1036 {
1037 o.push_back(new coll_t());
1038 o.push_back(new coll_t(spg_t(pg_t(1, 0), shard_id_t::NO_SHARD)));
1039 o.push_back(new coll_t(o.back()->get_temp()));
1040 o.push_back(new coll_t(spg_t(pg_t(3, 2), shard_id_t(12))));
1041 o.push_back(new coll_t(o.back()->get_temp()));
1042 o.push_back(new coll_t());
1043 }
1044
1045 // ---
1046
1047 std::string pg_vector_string(const vector<int32_t> &a)
1048 {
1049 ostringstream oss;
1050 oss << "[";
1051 for (vector<int32_t>::const_iterator i = a.begin(); i != a.end(); ++i) {
1052 if (i != a.begin())
1053 oss << ",";
1054 if (*i != CRUSH_ITEM_NONE)
1055 oss << *i;
1056 else
1057 oss << "NONE";
1058 }
1059 oss << "]";
1060 return oss.str();
1061 }
1062
1063 std::string pg_state_string(uint64_t state)
1064 {
1065 ostringstream oss;
1066 if (state & PG_STATE_STALE)
1067 oss << "stale+";
1068 if (state & PG_STATE_CREATING)
1069 oss << "creating+";
1070 if (state & PG_STATE_ACTIVE)
1071 oss << "active+";
1072 if (state & PG_STATE_ACTIVATING)
1073 oss << "activating+";
1074 if (state & PG_STATE_CLEAN)
1075 oss << "clean+";
1076 if (state & PG_STATE_RECOVERY_WAIT)
1077 oss << "recovery_wait+";
1078 if (state & PG_STATE_RECOVERY_TOOFULL)
1079 oss << "recovery_toofull+";
1080 if (state & PG_STATE_RECOVERING)
1081 oss << "recovering+";
1082 if (state & PG_STATE_FORCED_RECOVERY)
1083 oss << "forced_recovery+";
1084 if (state & PG_STATE_DOWN)
1085 oss << "down+";
1086 if (state & PG_STATE_RECOVERY_UNFOUND)
1087 oss << "recovery_unfound+";
1088 if (state & PG_STATE_BACKFILL_UNFOUND)
1089 oss << "backfill_unfound+";
1090 if (state & PG_STATE_UNDERSIZED)
1091 oss << "undersized+";
1092 if (state & PG_STATE_DEGRADED)
1093 oss << "degraded+";
1094 if (state & PG_STATE_REMAPPED)
1095 oss << "remapped+";
1096 if (state & PG_STATE_PREMERGE)
1097 oss << "premerge+";
1098 if (state & PG_STATE_SCRUBBING)
1099 oss << "scrubbing+";
1100 if (state & PG_STATE_DEEP_SCRUB)
1101 oss << "deep+";
1102 if (state & PG_STATE_INCONSISTENT)
1103 oss << "inconsistent+";
1104 if (state & PG_STATE_PEERING)
1105 oss << "peering+";
1106 if (state & PG_STATE_REPAIR)
1107 oss << "repair+";
1108 if (state & PG_STATE_BACKFILL_WAIT)
1109 oss << "backfill_wait+";
1110 if (state & PG_STATE_BACKFILLING)
1111 oss << "backfilling+";
1112 if (state & PG_STATE_FORCED_BACKFILL)
1113 oss << "forced_backfill+";
1114 if (state & PG_STATE_BACKFILL_TOOFULL)
1115 oss << "backfill_toofull+";
1116 if (state & PG_STATE_INCOMPLETE)
1117 oss << "incomplete+";
1118 if (state & PG_STATE_PEERED)
1119 oss << "peered+";
1120 if (state & PG_STATE_SNAPTRIM)
1121 oss << "snaptrim+";
1122 if (state & PG_STATE_SNAPTRIM_WAIT)
1123 oss << "snaptrim_wait+";
1124 if (state & PG_STATE_SNAPTRIM_ERROR)
1125 oss << "snaptrim_error+";
1126 if (state & PG_STATE_FAILED_REPAIR)
1127 oss << "failed_repair+";
1128 string ret(oss.str());
1129 if (ret.length() > 0)
1130 ret.resize(ret.length() - 1);
1131 else
1132 ret = "unknown";
1133 return ret;
1134 }
1135
1136 boost::optional<uint64_t> pg_string_state(const std::string& state)
1137 {
1138 boost::optional<uint64_t> type;
1139 if (state == "active")
1140 type = PG_STATE_ACTIVE;
1141 else if (state == "clean")
1142 type = PG_STATE_CLEAN;
1143 else if (state == "down")
1144 type = PG_STATE_DOWN;
1145 else if (state == "recovery_unfound")
1146 type = PG_STATE_RECOVERY_UNFOUND;
1147 else if (state == "backfill_unfound")
1148 type = PG_STATE_BACKFILL_UNFOUND;
1149 else if (state == "premerge")
1150 type = PG_STATE_PREMERGE;
1151 else if (state == "scrubbing")
1152 type = PG_STATE_SCRUBBING;
1153 else if (state == "degraded")
1154 type = PG_STATE_DEGRADED;
1155 else if (state == "inconsistent")
1156 type = PG_STATE_INCONSISTENT;
1157 else if (state == "peering")
1158 type = PG_STATE_PEERING;
1159 else if (state == "repair")
1160 type = PG_STATE_REPAIR;
1161 else if (state == "recovering")
1162 type = PG_STATE_RECOVERING;
1163 else if (state == "forced_recovery")
1164 type = PG_STATE_FORCED_RECOVERY;
1165 else if (state == "backfill_wait")
1166 type = PG_STATE_BACKFILL_WAIT;
1167 else if (state == "incomplete")
1168 type = PG_STATE_INCOMPLETE;
1169 else if (state == "stale")
1170 type = PG_STATE_STALE;
1171 else if (state == "remapped")
1172 type = PG_STATE_REMAPPED;
1173 else if (state == "deep")
1174 type = PG_STATE_DEEP_SCRUB;
1175 else if (state == "backfilling")
1176 type = PG_STATE_BACKFILLING;
1177 else if (state == "forced_backfill")
1178 type = PG_STATE_FORCED_BACKFILL;
1179 else if (state == "backfill_toofull")
1180 type = PG_STATE_BACKFILL_TOOFULL;
1181 else if (state == "recovery_wait")
1182 type = PG_STATE_RECOVERY_WAIT;
1183 else if (state == "recovery_toofull")
1184 type = PG_STATE_RECOVERY_TOOFULL;
1185 else if (state == "undersized")
1186 type = PG_STATE_UNDERSIZED;
1187 else if (state == "activating")
1188 type = PG_STATE_ACTIVATING;
1189 else if (state == "peered")
1190 type = PG_STATE_PEERED;
1191 else if (state == "snaptrim")
1192 type = PG_STATE_SNAPTRIM;
1193 else if (state == "snaptrim_wait")
1194 type = PG_STATE_SNAPTRIM_WAIT;
1195 else if (state == "snaptrim_error")
1196 type = PG_STATE_SNAPTRIM_ERROR;
1197 else if (state == "creating")
1198 type = PG_STATE_CREATING;
1199 else if (state == "failed_repair")
1200 type = PG_STATE_FAILED_REPAIR;
1201 else if (state == "unknown")
1202 type = 0;
1203 else
1204 type = boost::none;
1205 return type;
1206 }
1207
1208 // -- eversion_t --
1209 string eversion_t::get_key_name() const
1210 {
1211 std::string key(32, ' ');
1212 get_key_name(&key[0]);
1213 key.resize(31); // remove the null terminator
1214 return key;
1215 }
1216
1217 // -- pool_snap_info_t --
1218 void pool_snap_info_t::dump(Formatter *f) const
1219 {
1220 f->dump_unsigned("snapid", snapid);
1221 f->dump_stream("stamp") << stamp;
1222 f->dump_string("name", name);
1223 }
1224
1225 void pool_snap_info_t::encode(bufferlist& bl, uint64_t features) const
1226 {
1227 using ceph::encode;
1228 if ((features & CEPH_FEATURE_PGPOOL3) == 0) {
1229 __u8 struct_v = 1;
1230 encode(struct_v, bl);
1231 encode(snapid, bl);
1232 encode(stamp, bl);
1233 encode(name, bl);
1234 return;
1235 }
1236 ENCODE_START(2, 2, bl);
1237 encode(snapid, bl);
1238 encode(stamp, bl);
1239 encode(name, bl);
1240 ENCODE_FINISH(bl);
1241 }
1242
// Decode snapid, stamp and name (in that order) from `bl`, the same field
// order written by both branches of pool_snap_info_t::encode().
// NOTE(review): DECODE_START_LEGACY_COMPAT_LEN presumably accepts both the
// bare-version-byte legacy layout and the length-prefixed one -- confirm
// against the macro definition.
void pool_snap_info_t::decode(bufferlist::const_iterator& bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
  decode(snapid, bl);
  decode(stamp, bl);
  decode(name, bl);
  DECODE_FINISH(bl);
}
1251
1252 void pool_snap_info_t::generate_test_instances(list<pool_snap_info_t*>& o)
1253 {
1254 o.push_back(new pool_snap_info_t);
1255 o.push_back(new pool_snap_info_t);
1256 o.back()->snapid = 1;
1257 o.back()->stamp = utime_t(1, 2);
1258 o.back()->name = "foo";
1259 }
1260
1261 // -- pool_opts_t --
1262
1263 typedef std::map<std::string, pool_opts_t::opt_desc_t> opt_mapping_t;
1264 static opt_mapping_t opt_mapping = boost::assign::map_list_of
1265 ("scrub_min_interval", pool_opts_t::opt_desc_t(
1266 pool_opts_t::SCRUB_MIN_INTERVAL, pool_opts_t::DOUBLE))
1267 ("scrub_max_interval", pool_opts_t::opt_desc_t(
1268 pool_opts_t::SCRUB_MAX_INTERVAL, pool_opts_t::DOUBLE))
1269 ("deep_scrub_interval", pool_opts_t::opt_desc_t(
1270 pool_opts_t::DEEP_SCRUB_INTERVAL, pool_opts_t::DOUBLE))
1271 ("recovery_priority", pool_opts_t::opt_desc_t(
1272 pool_opts_t::RECOVERY_PRIORITY, pool_opts_t::INT))
1273 ("recovery_op_priority", pool_opts_t::opt_desc_t(
1274 pool_opts_t::RECOVERY_OP_PRIORITY, pool_opts_t::INT))
1275 ("scrub_priority", pool_opts_t::opt_desc_t(
1276 pool_opts_t::SCRUB_PRIORITY, pool_opts_t::INT))
1277 ("compression_mode", pool_opts_t::opt_desc_t(
1278 pool_opts_t::COMPRESSION_MODE, pool_opts_t::STR))
1279 ("compression_algorithm", pool_opts_t::opt_desc_t(
1280 pool_opts_t::COMPRESSION_ALGORITHM, pool_opts_t::STR))
1281 ("compression_required_ratio", pool_opts_t::opt_desc_t(
1282 pool_opts_t::COMPRESSION_REQUIRED_RATIO, pool_opts_t::DOUBLE))
1283 ("compression_max_blob_size", pool_opts_t::opt_desc_t(
1284 pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, pool_opts_t::INT))
1285 ("compression_min_blob_size", pool_opts_t::opt_desc_t(
1286 pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, pool_opts_t::INT))
1287 ("csum_type", pool_opts_t::opt_desc_t(
1288 pool_opts_t::CSUM_TYPE, pool_opts_t::INT))
1289 ("csum_max_block", pool_opts_t::opt_desc_t(
1290 pool_opts_t::CSUM_MAX_BLOCK, pool_opts_t::INT))
1291 ("csum_min_block", pool_opts_t::opt_desc_t(
1292 pool_opts_t::CSUM_MIN_BLOCK, pool_opts_t::INT))
1293 ("fingerprint_algorithm", pool_opts_t::opt_desc_t(
1294 pool_opts_t::FINGERPRINT_ALGORITHM, pool_opts_t::STR))
1295 ("pg_num_min", pool_opts_t::opt_desc_t(
1296 pool_opts_t::PG_NUM_MIN, pool_opts_t::INT))
1297 ("target_size_bytes", pool_opts_t::opt_desc_t(
1298 pool_opts_t::TARGET_SIZE_BYTES, pool_opts_t::INT))
1299 ("target_size_ratio", pool_opts_t::opt_desc_t(
1300 pool_opts_t::TARGET_SIZE_RATIO, pool_opts_t::DOUBLE))
1301 ("pg_autoscale_bias", pool_opts_t::opt_desc_t(
1302 pool_opts_t::PG_AUTOSCALE_BIAS, pool_opts_t::DOUBLE));
1303
1304 bool pool_opts_t::is_opt_name(const std::string& name)
1305 {
1306 return opt_mapping.count(name);
1307 }
1308
1309 pool_opts_t::opt_desc_t pool_opts_t::get_opt_desc(const std::string& name)
1310 {
1311 opt_mapping_t::iterator i = opt_mapping.find(name);
1312 ceph_assert(i != opt_mapping.end());
1313 return i->second;
1314 }
1315
1316 bool pool_opts_t::is_set(pool_opts_t::key_t key) const
1317 {
1318 return opts.count(key);
1319 }
1320
1321 const pool_opts_t::value_t& pool_opts_t::get(pool_opts_t::key_t key) const
1322 {
1323 opts_t::const_iterator i = opts.find(key);
1324 ceph_assert(i != opts.end());
1325 return i->second;
1326 }
1327
1328 bool pool_opts_t::unset(pool_opts_t::key_t key) {
1329 return opts.erase(key) > 0;
1330 }
1331
1332 class pool_opts_dumper_t : public boost::static_visitor<> {
1333 public:
1334 pool_opts_dumper_t(const std::string& name_, Formatter* f_) :
1335 name(name_.c_str()), f(f_) {}
1336
1337 void operator()(std::string s) const {
1338 f->dump_string(name, s);
1339 }
1340 void operator()(int64_t i) const {
1341 f->dump_int(name, i);
1342 }
1343 void operator()(double d) const {
1344 f->dump_float(name, d);
1345 }
1346
1347 private:
1348 const char* name;
1349 Formatter* f;
1350 };
1351
1352 void pool_opts_t::dump(const std::string& name, Formatter* f) const
1353 {
1354 const opt_desc_t& desc = get_opt_desc(name);
1355 opts_t::const_iterator i = opts.find(desc.key);
1356 if (i == opts.end()) {
1357 return;
1358 }
1359 boost::apply_visitor(pool_opts_dumper_t(name, f), i->second);
1360 }
1361
1362 void pool_opts_t::dump(Formatter* f) const
1363 {
1364 for (opt_mapping_t::iterator i = opt_mapping.begin(); i != opt_mapping.end();
1365 ++i) {
1366 const std::string& name = i->first;
1367 const opt_desc_t& desc = i->second;
1368 opts_t::const_iterator j = opts.find(desc.key);
1369 if (j == opts.end()) {
1370 continue;
1371 }
1372 boost::apply_visitor(pool_opts_dumper_t(name, f), j->second);
1373 }
1374 }
1375
1376 class pool_opts_encoder_t : public boost::static_visitor<> {
1377 public:
1378 explicit pool_opts_encoder_t(bufferlist& bl_, uint64_t features)
1379 : bl(bl_),
1380 features(features) {}
1381
1382 void operator()(const std::string &s) const {
1383 encode(static_cast<int32_t>(pool_opts_t::STR), bl);
1384 encode(s, bl);
1385 }
1386 void operator()(int64_t i) const {
1387 encode(static_cast<int32_t>(pool_opts_t::INT), bl);
1388 if (HAVE_FEATURE(features, SERVER_NAUTILUS)) {
1389 encode(i, bl);
1390 } else {
1391 encode(static_cast<int32_t>(i), bl);
1392 }
1393 }
1394 void operator()(double d) const {
1395 encode(static_cast<int32_t>(pool_opts_t::DOUBLE), bl);
1396 encode(d, bl);
1397 }
1398
1399 private:
1400 bufferlist& bl;
1401 uint64_t features;
1402 };
1403
1404 void pool_opts_t::encode(bufferlist& bl, uint64_t features) const
1405 {
1406 unsigned v = 2;
1407 if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
1408 v = 1;
1409 }
1410 ENCODE_START(v, 1, bl);
1411 uint32_t n = static_cast<uint32_t>(opts.size());
1412 encode(n, bl);
1413 for (opts_t::const_iterator i = opts.begin(); i != opts.end(); ++i) {
1414 encode(static_cast<int32_t>(i->first), bl);
1415 boost::apply_visitor(pool_opts_encoder_t(bl, features), i->second);
1416 }
1417 ENCODE_FINISH(bl);
1418 }
1419
1420 void pool_opts_t::decode(bufferlist::const_iterator& bl)
1421 {
1422 DECODE_START(1, bl);
1423 __u32 n;
1424 decode(n, bl);
1425 opts.clear();
1426 while (n--) {
1427 int32_t k, t;
1428 decode(k, bl);
1429 decode(t, bl);
1430 if (t == STR) {
1431 std::string s;
1432 decode(s, bl);
1433 opts[static_cast<key_t>(k)] = s;
1434 } else if (t == INT) {
1435 int64_t i;
1436 if (struct_v >= 2) {
1437 decode(i, bl);
1438 } else {
1439 int ii;
1440 decode(ii, bl);
1441 i = ii;
1442 }
1443 opts[static_cast<key_t>(k)] = i;
1444 } else if (t == DOUBLE) {
1445 double d;
1446 decode(d, bl);
1447 opts[static_cast<key_t>(k)] = d;
1448 } else {
1449 ceph_assert(!"invalid type");
1450 }
1451 }
1452 DECODE_FINISH(bl);
1453 }
1454
1455 ostream& operator<<(ostream& out, const pool_opts_t& opts)
1456 {
1457 for (opt_mapping_t::iterator i = opt_mapping.begin(); i != opt_mapping.end();
1458 ++i) {
1459 const std::string& name = i->first;
1460 const pool_opts_t::opt_desc_t& desc = i->second;
1461 pool_opts_t::opts_t::const_iterator j = opts.opts.find(desc.key);
1462 if (j == opts.opts.end()) {
1463 continue;
1464 }
1465 out << " " << name << " " << j->second;
1466 }
1467 return out;
1468 }
1469
1470 // -- pg_pool_t --
1471
1472 const char *pg_pool_t::APPLICATION_NAME_CEPHFS("cephfs");
1473 const char *pg_pool_t::APPLICATION_NAME_RBD("rbd");
1474 const char *pg_pool_t::APPLICATION_NAME_RGW("rgw");
1475
1476 void pg_pool_t::dump(Formatter *f) const
1477 {
1478 f->dump_stream("create_time") << get_create_time();
1479 f->dump_unsigned("flags", get_flags());
1480 f->dump_string("flags_names", get_flags_string());
1481 f->dump_int("type", get_type());
1482 f->dump_int("size", get_size());
1483 f->dump_int("min_size", get_min_size());
1484 f->dump_int("crush_rule", get_crush_rule());
1485 f->dump_int("object_hash", get_object_hash());
1486 f->dump_string("pg_autoscale_mode",
1487 get_pg_autoscale_mode_name(pg_autoscale_mode));
1488 f->dump_unsigned("pg_num", get_pg_num());
1489 f->dump_unsigned("pg_placement_num", get_pgp_num());
1490 f->dump_unsigned("pg_placement_num_target", get_pgp_num_target());
1491 f->dump_unsigned("pg_num_target", get_pg_num_target());
1492 f->dump_unsigned("pg_num_pending", get_pg_num_pending());
1493 f->dump_object("last_pg_merge_meta", last_pg_merge_meta);
1494 f->dump_stream("last_change") << get_last_change();
1495 f->dump_stream("last_force_op_resend") << get_last_force_op_resend();
1496 f->dump_stream("last_force_op_resend_prenautilus")
1497 << get_last_force_op_resend_prenautilus();
1498 f->dump_stream("last_force_op_resend_preluminous")
1499 << get_last_force_op_resend_preluminous();
1500 f->dump_unsigned("auid", get_auid());
1501 f->dump_string("snap_mode", is_pool_snaps_mode() ? "pool" : "selfmanaged");
1502 f->dump_unsigned("snap_seq", get_snap_seq());
1503 f->dump_unsigned("snap_epoch", get_snap_epoch());
1504 f->open_array_section("pool_snaps");
1505 for (map<snapid_t, pool_snap_info_t>::const_iterator p = snaps.begin(); p != snaps.end(); ++p) {
1506 f->open_object_section("pool_snap_info");
1507 p->second.dump(f);
1508 f->close_section();
1509 }
1510 f->close_section();
1511 f->dump_stream("removed_snaps") << removed_snaps;
1512 f->dump_unsigned("quota_max_bytes", quota_max_bytes);
1513 f->dump_unsigned("quota_max_objects", quota_max_objects);
1514 f->open_array_section("tiers");
1515 for (set<uint64_t>::const_iterator p = tiers.begin(); p != tiers.end(); ++p)
1516 f->dump_unsigned("pool_id", *p);
1517 f->close_section();
1518 f->dump_int("tier_of", tier_of);
1519 f->dump_int("read_tier", read_tier);
1520 f->dump_int("write_tier", write_tier);
1521 f->dump_string("cache_mode", get_cache_mode_name());
1522 f->dump_unsigned("target_max_bytes", target_max_bytes);
1523 f->dump_unsigned("target_max_objects", target_max_objects);
1524 f->dump_unsigned("cache_target_dirty_ratio_micro",
1525 cache_target_dirty_ratio_micro);
1526 f->dump_unsigned("cache_target_dirty_high_ratio_micro",
1527 cache_target_dirty_high_ratio_micro);
1528 f->dump_unsigned("cache_target_full_ratio_micro",
1529 cache_target_full_ratio_micro);
1530 f->dump_unsigned("cache_min_flush_age", cache_min_flush_age);
1531 f->dump_unsigned("cache_min_evict_age", cache_min_evict_age);
1532 f->dump_string("erasure_code_profile", erasure_code_profile);
1533 f->open_object_section("hit_set_params");
1534 hit_set_params.dump(f);
1535 f->close_section(); // hit_set_params
1536 f->dump_unsigned("hit_set_period", hit_set_period);
1537 f->dump_unsigned("hit_set_count", hit_set_count);
1538 f->dump_bool("use_gmt_hitset", use_gmt_hitset);
1539 f->dump_unsigned("min_read_recency_for_promote", min_read_recency_for_promote);
1540 f->dump_unsigned("min_write_recency_for_promote", min_write_recency_for_promote);
1541 f->dump_unsigned("hit_set_grade_decay_rate", hit_set_grade_decay_rate);
1542 f->dump_unsigned("hit_set_search_last_n", hit_set_search_last_n);
1543 f->open_array_section("grade_table");
1544 for (unsigned i = 0; i < hit_set_count; ++i)
1545 f->dump_unsigned("value", get_grade(i));
1546 f->close_section();
1547 f->dump_unsigned("stripe_width", get_stripe_width());
1548 f->dump_unsigned("expected_num_objects", expected_num_objects);
1549 f->dump_bool("fast_read", fast_read);
1550 f->open_object_section("options");
1551 opts.dump(f);
1552 f->close_section(); // options
1553 f->open_object_section("application_metadata");
1554 for (auto &app_pair : application_metadata) {
1555 f->open_object_section(app_pair.first.c_str());
1556 for (auto &kv_pair : app_pair.second) {
1557 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
1558 }
1559 f->close_section(); // application
1560 }
1561 f->close_section(); // application_metadata
1562 }
1563
1564 void pg_pool_t::convert_to_pg_shards(const vector<int> &from, set<pg_shard_t>* to) const {
1565 for (size_t i = 0; i < from.size(); ++i) {
1566 if (from[i] != CRUSH_ITEM_NONE) {
1567 to->insert(
1568 pg_shard_t(
1569 from[i],
1570 is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
1571 }
1572 }
1573 }
1574
1575 void pg_pool_t::calc_pg_masks()
1576 {
1577 pg_num_mask = (1 << cbits(pg_num-1)) - 1;
1578 pgp_num_mask = (1 << cbits(pgp_num-1)) - 1;
1579 }
1580
1581 unsigned pg_pool_t::get_pg_num_divisor(pg_t pgid) const
1582 {
1583 if (pg_num == pg_num_mask + 1)
1584 return pg_num; // power-of-2 split
1585 unsigned mask = pg_num_mask >> 1;
1586 if ((pgid.ps() & mask) < (pg_num & mask))
1587 return pg_num_mask + 1; // smaller bin size (already split)
1588 else
1589 return (pg_num_mask + 1) >> 1; // bigger bin (not yet split)
1590 }
1591
1592 bool pg_pool_t::is_pending_merge(pg_t pgid, bool *target) const
1593 {
1594 if (pg_num_pending >= pg_num) {
1595 return false;
1596 }
1597 if (pgid.ps() >= pg_num_pending && pgid.ps() < pg_num) {
1598 if (target) {
1599 *target = false;
1600 }
1601 return true;
1602 }
1603 for (unsigned ps = pg_num_pending; ps < pg_num; ++ps) {
1604 if (pg_t(ps, pgid.pool()).get_parent() == pgid) {
1605 if (target) {
1606 *target = true;
1607 }
1608 return true;
1609 }
1610 }
1611 return false;
1612 }
1613
1614 /*
1615 * we have two snap modes:
1616 * - pool snaps
1617 * - snap existence/non-existence defined by snaps[] and snap_seq
1618 * - user managed snaps
1619 * - existence tracked by librados user
1620 */
1621 bool pg_pool_t::is_pool_snaps_mode() const
1622 {
1623 return has_flag(FLAG_POOL_SNAPS);
1624 }
1625
1626 bool pg_pool_t::is_unmanaged_snaps_mode() const
1627 {
1628 return has_flag(FLAG_SELFMANAGED_SNAPS);
1629 }
1630
1631 bool pg_pool_t::is_removed_snap(snapid_t s) const
1632 {
1633 if (is_pool_snaps_mode())
1634 return s <= get_snap_seq() && snaps.count(s) == 0;
1635 else
1636 return removed_snaps.contains(s);
1637 }
1638
1639 /*
1640 * build set of known-removed sets from either pool snaps or
1641 * explicit removed_snaps set.
1642 */
1643 void pg_pool_t::build_removed_snaps(interval_set<snapid_t>& rs) const
1644 {
1645 if (is_pool_snaps_mode()) {
1646 rs.clear();
1647 for (snapid_t s = 1; s <= get_snap_seq(); s = s + 1)
1648 if (snaps.count(s) == 0)
1649 rs.insert(s);
1650 } else {
1651 rs = removed_snaps;
1652 }
1653 }
1654
1655 bool pg_pool_t::maybe_updated_removed_snaps(const interval_set<snapid_t>& cached) const
1656 {
1657 if (is_unmanaged_snaps_mode()) { // remove_unmanaged_snap increments range_end
1658 if (removed_snaps.empty() || cached.empty()) // range_end is undefined
1659 return removed_snaps.empty() != cached.empty();
1660 return removed_snaps.range_end() != cached.range_end();
1661 }
1662 return true;
1663 }
1664
1665 snapid_t pg_pool_t::snap_exists(const char *s) const
1666 {
1667 for (map<snapid_t,pool_snap_info_t>::const_iterator p = snaps.begin();
1668 p != snaps.end();
1669 ++p)
1670 if (p->second.name == s)
1671 return p->second.snapid;
1672 return 0;
1673 }
1674
1675 void pg_pool_t::add_snap(const char *n, utime_t stamp)
1676 {
1677 ceph_assert(!is_unmanaged_snaps_mode());
1678 flags |= FLAG_POOL_SNAPS;
1679 snapid_t s = get_snap_seq() + 1;
1680 snap_seq = s;
1681 snaps[s].snapid = s;
1682 snaps[s].name = n;
1683 snaps[s].stamp = stamp;
1684 }
1685
1686 void pg_pool_t::add_unmanaged_snap(uint64_t& snapid)
1687 {
1688 ceph_assert(!is_pool_snaps_mode());
1689 if (snap_seq == 0) {
1690 // kludge for pre-mimic tracking of pool vs selfmanaged snaps. after
1691 // mimic this field is not decoded but our flag is set; pre-mimic, we
1692 // have a non-empty removed_snaps to signifiy a non-pool-snaps pool.
1693 removed_snaps.insert(snapid_t(1));
1694 snap_seq = 1;
1695 }
1696 flags |= FLAG_SELFMANAGED_SNAPS;
1697 snapid = snap_seq = snap_seq + 1;
1698 }
1699
1700 void pg_pool_t::remove_snap(snapid_t s)
1701 {
1702 ceph_assert(snaps.count(s));
1703 snaps.erase(s);
1704 snap_seq = snap_seq + 1;
1705 }
1706
1707 void pg_pool_t::remove_unmanaged_snap(snapid_t s)
1708 {
1709 ceph_assert(is_unmanaged_snaps_mode());
1710 removed_snaps.insert(s);
1711 snap_seq = snap_seq + 1;
1712 // try to add in the new seq, just to try to keep the interval_set contiguous
1713 if (!removed_snaps.contains(get_snap_seq())) {
1714 removed_snaps.insert(get_snap_seq());
1715 }
1716 }
1717
1718 SnapContext pg_pool_t::get_snap_context() const
1719 {
1720 vector<snapid_t> s(snaps.size());
1721 unsigned i = 0;
1722 for (map<snapid_t, pool_snap_info_t>::const_reverse_iterator p = snaps.rbegin();
1723 p != snaps.rend();
1724 ++p)
1725 s[i++] = p->first;
1726 return SnapContext(get_snap_seq(), s);
1727 }
1728
1729 uint32_t pg_pool_t::hash_key(const string& key, const string& ns) const
1730 {
1731 if (ns.empty())
1732 return ceph_str_hash(object_hash, key.data(), key.length());
1733 int nsl = ns.length();
1734 int len = key.length() + nsl + 1;
1735 char buf[len];
1736 memcpy(&buf[0], ns.data(), nsl);
1737 buf[nsl] = '\037';
1738 memcpy(&buf[nsl+1], key.data(), key.length());
1739 return ceph_str_hash(object_hash, &buf[0], len);
1740 }
1741
1742 uint32_t pg_pool_t::raw_hash_to_pg(uint32_t v) const
1743 {
1744 return ceph_stable_mod(v, pg_num, pg_num_mask);
1745 }
1746
1747 /*
1748 * map a raw pg (with full precision ps) into an actual pg, for storage
1749 */
1750 pg_t pg_pool_t::raw_pg_to_pg(pg_t pg) const
1751 {
1752 pg.set_ps(ceph_stable_mod(pg.ps(), pg_num, pg_num_mask));
1753 return pg;
1754 }
1755
1756 /*
1757 * map raw pg (full precision ps) into a placement seed. include
1758 * pool id in that value so that different pools don't use the same
1759 * seeds.
1760 */
1761 ps_t pg_pool_t::raw_pg_to_pps(pg_t pg) const
1762 {
1763 if (flags & FLAG_HASHPSPOOL) {
1764 // Hash the pool id so that pool PGs do not overlap.
1765 return
1766 crush_hash32_2(CRUSH_HASH_RJENKINS1,
1767 ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask),
1768 pg.pool());
1769 } else {
1770 // Legacy behavior; add ps and pool together. This is not a great
1771 // idea because the PGs from each pool will essentially overlap on
1772 // top of each other: 0.5 == 1.4 == 2.3 == ...
1773 return
1774 ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask) +
1775 pg.pool();
1776 }
1777 }
1778
1779 uint32_t pg_pool_t::get_random_pg_position(pg_t pg, uint32_t seed) const
1780 {
1781 uint32_t r = crush_hash32_2(CRUSH_HASH_RJENKINS1, seed, 123);
1782 if (pg_num == pg_num_mask + 1) {
1783 r &= ~pg_num_mask;
1784 } else {
1785 unsigned smaller_mask = pg_num_mask >> 1;
1786 if ((pg.ps() & smaller_mask) < (pg_num & smaller_mask)) {
1787 r &= ~pg_num_mask;
1788 } else {
1789 r &= ~smaller_mask;
1790 }
1791 }
1792 r |= pg.ps();
1793 return r;
1794 }
1795
1796 void pg_pool_t::encode(bufferlist& bl, uint64_t features) const
1797 {
1798 using ceph::encode;
1799 if ((features & CEPH_FEATURE_PGPOOL3) == 0) {
1800 // this encoding matches the old struct ceph_pg_pool
1801 __u8 struct_v = 2;
1802 encode(struct_v, bl);
1803 encode(type, bl);
1804 encode(size, bl);
1805 encode(crush_rule, bl);
1806 encode(object_hash, bl);
1807 encode(pg_num, bl);
1808 encode(pgp_num, bl);
1809 __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs.
1810 encode(lpg_num, bl);
1811 encode(lpgp_num, bl);
1812 encode(last_change, bl);
1813 encode(snap_seq, bl);
1814 encode(snap_epoch, bl);
1815
1816 __u32 n = snaps.size();
1817 encode(n, bl);
1818 n = removed_snaps.num_intervals();
1819 encode(n, bl);
1820
1821 encode(auid, bl);
1822
1823 encode_nohead(snaps, bl, features);
1824 encode_nohead(removed_snaps, bl);
1825 return;
1826 }
1827
1828 if ((features & CEPH_FEATURE_OSDENC) == 0) {
1829 __u8 struct_v = 4;
1830 encode(struct_v, bl);
1831 encode(type, bl);
1832 encode(size, bl);
1833 encode(crush_rule, bl);
1834 encode(object_hash, bl);
1835 encode(pg_num, bl);
1836 encode(pgp_num, bl);
1837 __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs.
1838 encode(lpg_num, bl);
1839 encode(lpgp_num, bl);
1840 encode(last_change, bl);
1841 encode(snap_seq, bl);
1842 encode(snap_epoch, bl);
1843 encode(snaps, bl, features);
1844 encode(removed_snaps, bl);
1845 encode(auid, bl);
1846 encode(flags, bl);
1847 encode((uint32_t)0, bl); // crash_replay_interval
1848 return;
1849 }
1850
1851 if ((features & CEPH_FEATURE_OSD_POOLRESEND) == 0) {
1852 // we simply added last_force_op_resend here, which is a fully
1853 // backward compatible change. however, encoding the same map
1854 // differently between monitors triggers scrub noise (even though
1855 // they are decodable without the feature), so let's be pendantic
1856 // about it.
1857 ENCODE_START(14, 5, bl);
1858 encode(type, bl);
1859 encode(size, bl);
1860 encode(crush_rule, bl);
1861 encode(object_hash, bl);
1862 encode(pg_num, bl);
1863 encode(pgp_num, bl);
1864 __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs.
1865 encode(lpg_num, bl);
1866 encode(lpgp_num, bl);
1867 encode(last_change, bl);
1868 encode(snap_seq, bl);
1869 encode(snap_epoch, bl);
1870 encode(snaps, bl, features);
1871 encode(removed_snaps, bl);
1872 encode(auid, bl);
1873 encode(flags, bl);
1874 encode((uint32_t)0, bl); // crash_replay_interval
1875 encode(min_size, bl);
1876 encode(quota_max_bytes, bl);
1877 encode(quota_max_objects, bl);
1878 encode(tiers, bl);
1879 encode(tier_of, bl);
1880 __u8 c = cache_mode;
1881 encode(c, bl);
1882 encode(read_tier, bl);
1883 encode(write_tier, bl);
1884 encode(properties, bl);
1885 encode(hit_set_params, bl);
1886 encode(hit_set_period, bl);
1887 encode(hit_set_count, bl);
1888 encode(stripe_width, bl);
1889 encode(target_max_bytes, bl);
1890 encode(target_max_objects, bl);
1891 encode(cache_target_dirty_ratio_micro, bl);
1892 encode(cache_target_full_ratio_micro, bl);
1893 encode(cache_min_flush_age, bl);
1894 encode(cache_min_evict_age, bl);
1895 encode(erasure_code_profile, bl);
1896 ENCODE_FINISH(bl);
1897 return;
1898 }
1899
1900 uint8_t v = 29;
1901 // NOTE: any new encoding dependencies must be reflected by
1902 // SIGNIFICANT_FEATURES
1903 if (!(features & CEPH_FEATURE_NEW_OSDOP_ENCODING)) {
1904 // this was the first post-hammer thing we added; if it's missing, encode
1905 // like hammer.
1906 v = 21;
1907 } else if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
1908 v = 24;
1909 } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
1910 v = 26;
1911 } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
1912 v = 27;
1913 }
1914
1915 ENCODE_START(v, 5, bl);
1916 encode(type, bl);
1917 encode(size, bl);
1918 encode(crush_rule, bl);
1919 encode(object_hash, bl);
1920 encode(pg_num, bl);
1921 encode(pgp_num, bl);
1922 __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs.
1923 encode(lpg_num, bl);
1924 encode(lpgp_num, bl);
1925 encode(last_change, bl);
1926 encode(snap_seq, bl);
1927 encode(snap_epoch, bl);
1928 encode(snaps, bl, features);
1929 encode(removed_snaps, bl);
1930 encode(auid, bl);
1931 if (v >= 27) {
1932 encode(flags, bl);
1933 } else {
1934 auto tmp = flags;
1935 tmp &= ~(FLAG_SELFMANAGED_SNAPS | FLAG_POOL_SNAPS | FLAG_CREATING);
1936 encode(tmp, bl);
1937 }
1938 encode((uint32_t)0, bl); // crash_replay_interval
1939 encode(min_size, bl);
1940 encode(quota_max_bytes, bl);
1941 encode(quota_max_objects, bl);
1942 encode(tiers, bl);
1943 encode(tier_of, bl);
1944 __u8 c = cache_mode;
1945 encode(c, bl);
1946 encode(read_tier, bl);
1947 encode(write_tier, bl);
1948 encode(properties, bl);
1949 encode(hit_set_params, bl);
1950 encode(hit_set_period, bl);
1951 encode(hit_set_count, bl);
1952 encode(stripe_width, bl);
1953 encode(target_max_bytes, bl);
1954 encode(target_max_objects, bl);
1955 encode(cache_target_dirty_ratio_micro, bl);
1956 encode(cache_target_full_ratio_micro, bl);
1957 encode(cache_min_flush_age, bl);
1958 encode(cache_min_evict_age, bl);
1959 encode(erasure_code_profile, bl);
1960 encode(last_force_op_resend_preluminous, bl);
1961 encode(min_read_recency_for_promote, bl);
1962 encode(expected_num_objects, bl);
1963 if (v >= 19) {
1964 encode(cache_target_dirty_high_ratio_micro, bl);
1965 }
1966 if (v >= 20) {
1967 encode(min_write_recency_for_promote, bl);
1968 }
1969 if (v >= 21) {
1970 encode(use_gmt_hitset, bl);
1971 }
1972 if (v >= 22) {
1973 encode(fast_read, bl);
1974 }
1975 if (v >= 23) {
1976 encode(hit_set_grade_decay_rate, bl);
1977 encode(hit_set_search_last_n, bl);
1978 }
1979 if (v >= 24) {
1980 encode(opts, bl, features);
1981 }
1982 if (v >= 25) {
1983 encode(last_force_op_resend_prenautilus, bl);
1984 }
1985 if (v >= 26) {
1986 encode(application_metadata, bl);
1987 }
1988 if (v >= 27) {
1989 encode(create_time, bl);
1990 }
1991 if (v >= 28) {
1992 encode(pg_num_target, bl);
1993 encode(pgp_num_target, bl);
1994 encode(pg_num_pending, bl);
1995 encode((epoch_t)0, bl); // pg_num_dec_last_epoch_started from 14.1.[01]
1996 encode((epoch_t)0, bl); // pg_num_dec_last_epoch_clean from 14.1.[01]
1997 encode(last_force_op_resend, bl);
1998 encode(pg_autoscale_mode, bl);
1999 }
2000 if (v >= 29) {
2001 encode(last_pg_merge_meta, bl);
2002 }
2003 ENCODE_FINISH(bl);
2004 }
2005
2006 void pg_pool_t::decode(bufferlist::const_iterator& bl)
2007 {
2008 DECODE_START_LEGACY_COMPAT_LEN(29, 5, 5, bl);
2009 decode(type, bl);
2010 decode(size, bl);
2011 decode(crush_rule, bl);
2012 decode(object_hash, bl);
2013 decode(pg_num, bl);
2014 decode(pgp_num, bl);
2015 {
2016 __u32 lpg_num, lpgp_num;
2017 decode(lpg_num, bl);
2018 decode(lpgp_num, bl);
2019 }
2020 decode(last_change, bl);
2021 decode(snap_seq, bl);
2022 decode(snap_epoch, bl);
2023
2024 if (struct_v >= 3) {
2025 decode(snaps, bl);
2026 decode(removed_snaps, bl);
2027 decode(auid, bl);
2028 } else {
2029 __u32 n, m;
2030 decode(n, bl);
2031 decode(m, bl);
2032 decode(auid, bl);
2033 decode_nohead(n, snaps, bl);
2034 decode_nohead(m, removed_snaps, bl);
2035 }
2036
2037 if (struct_v >= 4) {
2038 decode(flags, bl);
2039 uint32_t crash_replay_interval;
2040 decode(crash_replay_interval, bl);
2041 } else {
2042 flags = 0;
2043 }
2044 // upgrade path for selfmanaged vs pool snaps
2045 if (snap_seq > 0 && (flags & (FLAG_SELFMANAGED_SNAPS|FLAG_POOL_SNAPS)) == 0) {
2046 if (!removed_snaps.empty()) {
2047 flags |= FLAG_SELFMANAGED_SNAPS;
2048 } else {
2049 flags |= FLAG_POOL_SNAPS;
2050 }
2051 }
2052 if (struct_v >= 7) {
2053 decode(min_size, bl);
2054 } else {
2055 min_size = size - size/2;
2056 }
2057 if (struct_v >= 8) {
2058 decode(quota_max_bytes, bl);
2059 decode(quota_max_objects, bl);
2060 }
2061 if (struct_v >= 9) {
2062 decode(tiers, bl);
2063 decode(tier_of, bl);
2064 __u8 v;
2065 decode(v, bl);
2066 cache_mode = (cache_mode_t)v;
2067 decode(read_tier, bl);
2068 decode(write_tier, bl);
2069 }
2070 if (struct_v >= 10) {
2071 decode(properties, bl);
2072 }
2073 if (struct_v >= 11) {
2074 decode(hit_set_params, bl);
2075 decode(hit_set_period, bl);
2076 decode(hit_set_count, bl);
2077 } else {
2078 pg_pool_t def;
2079 hit_set_period = def.hit_set_period;
2080 hit_set_count = def.hit_set_count;
2081 }
2082 if (struct_v >= 12) {
2083 decode(stripe_width, bl);
2084 } else {
2085 set_stripe_width(0);
2086 }
2087 if (struct_v >= 13) {
2088 decode(target_max_bytes, bl);
2089 decode(target_max_objects, bl);
2090 decode(cache_target_dirty_ratio_micro, bl);
2091 decode(cache_target_full_ratio_micro, bl);
2092 decode(cache_min_flush_age, bl);
2093 decode(cache_min_evict_age, bl);
2094 } else {
2095 target_max_bytes = 0;
2096 target_max_objects = 0;
2097 cache_target_dirty_ratio_micro = 0;
2098 cache_target_full_ratio_micro = 0;
2099 cache_min_flush_age = 0;
2100 cache_min_evict_age = 0;
2101 }
2102 if (struct_v >= 14) {
2103 decode(erasure_code_profile, bl);
2104 }
2105 if (struct_v >= 15) {
2106 decode(last_force_op_resend_preluminous, bl);
2107 } else {
2108 last_force_op_resend_preluminous = 0;
2109 }
2110 if (struct_v >= 16) {
2111 decode(min_read_recency_for_promote, bl);
2112 } else {
2113 min_read_recency_for_promote = 1;
2114 }
2115 if (struct_v >= 17) {
2116 decode(expected_num_objects, bl);
2117 } else {
2118 expected_num_objects = 0;
2119 }
2120 if (struct_v >= 19) {
2121 decode(cache_target_dirty_high_ratio_micro, bl);
2122 } else {
2123 cache_target_dirty_high_ratio_micro = cache_target_dirty_ratio_micro;
2124 }
2125 if (struct_v >= 20) {
2126 decode(min_write_recency_for_promote, bl);
2127 } else {
2128 min_write_recency_for_promote = 1;
2129 }
2130 if (struct_v >= 21) {
2131 decode(use_gmt_hitset, bl);
2132 } else {
2133 use_gmt_hitset = false;
2134 }
2135 if (struct_v >= 22) {
2136 decode(fast_read, bl);
2137 } else {
2138 fast_read = false;
2139 }
2140 if (struct_v >= 23) {
2141 decode(hit_set_grade_decay_rate, bl);
2142 decode(hit_set_search_last_n, bl);
2143 } else {
2144 hit_set_grade_decay_rate = 0;
2145 hit_set_search_last_n = 1;
2146 }
2147 if (struct_v >= 24) {
2148 decode(opts, bl);
2149 }
2150 if (struct_v >= 25) {
2151 decode(last_force_op_resend_prenautilus, bl);
2152 } else {
2153 last_force_op_resend_prenautilus = last_force_op_resend_preluminous;
2154 }
2155 if (struct_v >= 26) {
2156 decode(application_metadata, bl);
2157 }
2158 if (struct_v >= 27) {
2159 decode(create_time, bl);
2160 }
2161 if (struct_v >= 28) {
2162 decode(pg_num_target, bl);
2163 decode(pgp_num_target, bl);
2164 decode(pg_num_pending, bl);
2165 epoch_t old_merge_last_epoch_clean, old_merge_last_epoch_started;
2166 decode(old_merge_last_epoch_started, bl);
2167 decode(old_merge_last_epoch_clean, bl);
2168 decode(last_force_op_resend, bl);
2169 decode(pg_autoscale_mode, bl);
2170 if (struct_v >= 29) {
2171 decode(last_pg_merge_meta, bl);
2172 } else {
2173 last_pg_merge_meta.last_epoch_clean = old_merge_last_epoch_clean;
2174 last_pg_merge_meta.last_epoch_started = old_merge_last_epoch_started;
2175 }
2176 } else {
2177 pg_num_target = pg_num;
2178 pgp_num_target = pgp_num;
2179 pg_num_pending = pg_num;
2180 last_force_op_resend = last_force_op_resend_prenautilus;
2181 pg_autoscale_mode = PG_AUTOSCALE_MODE_WARN; // default to warn on upgrade
2182 }
2183 DECODE_FINISH(bl);
2184 calc_pg_masks();
2185 calc_grade_table();
2186 }
2187
2188 void pg_pool_t::generate_test_instances(list<pg_pool_t*>& o)
2189 {
2190 pg_pool_t a;
2191 o.push_back(new pg_pool_t(a));
2192
2193 a.create_time = utime_t(4,5);
2194 a.type = TYPE_REPLICATED;
2195 a.size = 2;
2196 a.crush_rule = 3;
2197 a.object_hash = 4;
2198 a.pg_num = 6;
2199 a.pgp_num = 4;
2200 a.pgp_num_target = 4;
2201 a.pg_num_target = 5;
2202 a.pg_num_pending = 5;
2203 a.last_pg_merge_meta.last_epoch_started = 2;
2204 a.last_pg_merge_meta.last_epoch_clean = 2;
2205 a.last_change = 9;
2206 a.last_force_op_resend = 123823;
2207 a.last_force_op_resend_preluminous = 123824;
2208 a.snap_seq = 10;
2209 a.snap_epoch = 11;
2210 a.flags = FLAG_POOL_SNAPS;
2211 a.auid = 12;
2212 a.quota_max_bytes = 473;
2213 a.quota_max_objects = 474;
2214 o.push_back(new pg_pool_t(a));
2215
2216 a.snaps[3].name = "asdf";
2217 a.snaps[3].snapid = 3;
2218 a.snaps[3].stamp = utime_t(123, 4);
2219 a.snaps[6].name = "qwer";
2220 a.snaps[6].snapid = 6;
2221 a.snaps[6].stamp = utime_t(23423, 4);
2222 o.push_back(new pg_pool_t(a));
2223
2224 a.flags = FLAG_SELFMANAGED_SNAPS;
2225 a.snaps.clear();
2226 a.removed_snaps.insert(2);
2227 a.quota_max_bytes = 2473;
2228 a.quota_max_objects = 4374;
2229 a.tiers.insert(0);
2230 a.tiers.insert(1);
2231 a.tier_of = 2;
2232 a.cache_mode = CACHEMODE_WRITEBACK;
2233 a.read_tier = 1;
2234 a.write_tier = 1;
2235 a.hit_set_params = HitSet::Params(new BloomHitSet::Params);
2236 a.hit_set_period = 3600;
2237 a.hit_set_count = 8;
2238 a.min_read_recency_for_promote = 1;
2239 a.min_write_recency_for_promote = 1;
2240 a.hit_set_grade_decay_rate = 50;
2241 a.hit_set_search_last_n = 1;
2242 a.calc_grade_table();
2243 a.set_stripe_width(12345);
2244 a.target_max_bytes = 1238132132;
2245 a.target_max_objects = 1232132;
2246 a.cache_target_dirty_ratio_micro = 187232;
2247 a.cache_target_dirty_high_ratio_micro = 309856;
2248 a.cache_target_full_ratio_micro = 987222;
2249 a.cache_min_flush_age = 231;
2250 a.cache_min_evict_age = 2321;
2251 a.erasure_code_profile = "profile in osdmap";
2252 a.expected_num_objects = 123456;
2253 a.fast_read = false;
2254 a.application_metadata = {{"rbd", {{"key", "value"}}}};
2255 o.push_back(new pg_pool_t(a));
2256 }
2257
2258 ostream& operator<<(ostream& out, const pg_pool_t& p)
2259 {
2260 out << p.get_type_name()
2261 << " size " << p.get_size()
2262 << " min_size " << p.get_min_size()
2263 << " crush_rule " << p.get_crush_rule()
2264 << " object_hash " << p.get_object_hash_name()
2265 << " pg_num " << p.get_pg_num()
2266 << " pgp_num " << p.get_pgp_num();
2267 if (p.get_pg_num_target() != p.get_pg_num()) {
2268 out << " pg_num_target " << p.get_pg_num_target();
2269 }
2270 if (p.get_pgp_num_target() != p.get_pgp_num()) {
2271 out << " pgp_num_target " << p.get_pgp_num_target();
2272 }
2273 if (p.get_pg_num_pending() != p.get_pg_num()) {
2274 out << " pg_num_pending " << p.get_pg_num_pending();
2275 }
2276 if (p.pg_autoscale_mode) {
2277 out << " autoscale_mode " << p.get_pg_autoscale_mode_name(p.pg_autoscale_mode);
2278 }
2279 out << " last_change " << p.get_last_change();
2280 if (p.get_last_force_op_resend() ||
2281 p.get_last_force_op_resend_prenautilus() ||
2282 p.get_last_force_op_resend_preluminous())
2283 out << " lfor " << p.get_last_force_op_resend() << "/"
2284 << p.get_last_force_op_resend_prenautilus() << "/"
2285 << p.get_last_force_op_resend_preluminous();
2286 if (p.get_auid())
2287 out << " owner " << p.get_auid();
2288 if (p.flags)
2289 out << " flags " << p.get_flags_string();
2290 if (p.quota_max_bytes)
2291 out << " max_bytes " << p.quota_max_bytes;
2292 if (p.quota_max_objects)
2293 out << " max_objects " << p.quota_max_objects;
2294 if (!p.tiers.empty())
2295 out << " tiers " << p.tiers;
2296 if (p.is_tier())
2297 out << " tier_of " << p.tier_of;
2298 if (p.has_read_tier())
2299 out << " read_tier " << p.read_tier;
2300 if (p.has_write_tier())
2301 out << " write_tier " << p.write_tier;
2302 if (p.cache_mode)
2303 out << " cache_mode " << p.get_cache_mode_name();
2304 if (p.target_max_bytes)
2305 out << " target_bytes " << p.target_max_bytes;
2306 if (p.target_max_objects)
2307 out << " target_objects " << p.target_max_objects;
2308 if (p.hit_set_params.get_type() != HitSet::TYPE_NONE) {
2309 out << " hit_set " << p.hit_set_params
2310 << " " << p.hit_set_period << "s"
2311 << " x" << p.hit_set_count << " decay_rate "
2312 << p.hit_set_grade_decay_rate
2313 << " search_last_n " << p.hit_set_search_last_n;
2314 }
2315 if (p.min_read_recency_for_promote)
2316 out << " min_read_recency_for_promote " << p.min_read_recency_for_promote;
2317 if (p.min_write_recency_for_promote)
2318 out << " min_write_recency_for_promote " << p.min_write_recency_for_promote;
2319 out << " stripe_width " << p.get_stripe_width();
2320 if (p.expected_num_objects)
2321 out << " expected_num_objects " << p.expected_num_objects;
2322 if (p.fast_read)
2323 out << " fast_read " << p.fast_read;
2324 out << p.opts;
2325 if (!p.application_metadata.empty()) {
2326 out << " application ";
2327 for (auto it = p.application_metadata.begin();
2328 it != p.application_metadata.end(); ++it) {
2329 if (it != p.application_metadata.begin())
2330 out << ",";
2331 out << it->first;
2332 }
2333 }
2334 return out;
2335 }
2336
2337
2338 // -- object_stat_sum_t --
2339
2340 void object_stat_sum_t::dump(Formatter *f) const
2341 {
2342 f->dump_int("num_bytes", num_bytes);
2343 f->dump_int("num_objects", num_objects);
2344 f->dump_int("num_object_clones", num_object_clones);
2345 f->dump_int("num_object_copies", num_object_copies);
2346 f->dump_int("num_objects_missing_on_primary", num_objects_missing_on_primary);
2347 f->dump_int("num_objects_missing", num_objects_missing);
2348 f->dump_int("num_objects_degraded", num_objects_degraded);
2349 f->dump_int("num_objects_misplaced", num_objects_misplaced);
2350 f->dump_int("num_objects_unfound", num_objects_unfound);
2351 f->dump_int("num_objects_dirty", num_objects_dirty);
2352 f->dump_int("num_whiteouts", num_whiteouts);
2353 f->dump_int("num_read", num_rd);
2354 f->dump_int("num_read_kb", num_rd_kb);
2355 f->dump_int("num_write", num_wr);
2356 f->dump_int("num_write_kb", num_wr_kb);
2357 f->dump_int("num_scrub_errors", num_scrub_errors);
2358 f->dump_int("num_shallow_scrub_errors", num_shallow_scrub_errors);
2359 f->dump_int("num_deep_scrub_errors", num_deep_scrub_errors);
2360 f->dump_int("num_objects_recovered", num_objects_recovered);
2361 f->dump_int("num_bytes_recovered", num_bytes_recovered);
2362 f->dump_int("num_keys_recovered", num_keys_recovered);
2363 f->dump_int("num_objects_omap", num_objects_omap);
2364 f->dump_int("num_objects_hit_set_archive", num_objects_hit_set_archive);
2365 f->dump_int("num_bytes_hit_set_archive", num_bytes_hit_set_archive);
2366 f->dump_int("num_flush", num_flush);
2367 f->dump_int("num_flush_kb", num_flush_kb);
2368 f->dump_int("num_evict", num_evict);
2369 f->dump_int("num_evict_kb", num_evict_kb);
2370 f->dump_int("num_promote", num_promote);
2371 f->dump_int("num_flush_mode_high", num_flush_mode_high);
2372 f->dump_int("num_flush_mode_low", num_flush_mode_low);
2373 f->dump_int("num_evict_mode_some", num_evict_mode_some);
2374 f->dump_int("num_evict_mode_full", num_evict_mode_full);
2375 f->dump_int("num_objects_pinned", num_objects_pinned);
2376 f->dump_int("num_legacy_snapsets", num_legacy_snapsets);
2377 f->dump_int("num_large_omap_objects", num_large_omap_objects);
2378 f->dump_int("num_objects_manifest", num_objects_manifest);
2379 f->dump_int("num_omap_bytes", num_omap_bytes);
2380 f->dump_int("num_omap_keys", num_omap_keys);
2381 f->dump_int("num_objects_repaired", num_objects_repaired);
2382 }
2383
2384 void object_stat_sum_t::encode(bufferlist& bl) const
2385 {
2386 ENCODE_START(20, 14, bl);
2387 #if defined(CEPH_LITTLE_ENDIAN)
2388 bl.append((char *)(&num_bytes), sizeof(object_stat_sum_t));
2389 #else
2390 encode(num_bytes, bl);
2391 encode(num_objects, bl);
2392 encode(num_object_clones, bl);
2393 encode(num_object_copies, bl);
2394 encode(num_objects_missing_on_primary, bl);
2395 encode(num_objects_degraded, bl);
2396 encode(num_objects_unfound, bl);
2397 encode(num_rd, bl);
2398 encode(num_rd_kb, bl);
2399 encode(num_wr, bl);
2400 encode(num_wr_kb, bl);
2401 encode(num_scrub_errors, bl);
2402 encode(num_objects_recovered, bl);
2403 encode(num_bytes_recovered, bl);
2404 encode(num_keys_recovered, bl);
2405 encode(num_shallow_scrub_errors, bl);
2406 encode(num_deep_scrub_errors, bl);
2407 encode(num_objects_dirty, bl);
2408 encode(num_whiteouts, bl);
2409 encode(num_objects_omap, bl);
2410 encode(num_objects_hit_set_archive, bl);
2411 encode(num_objects_misplaced, bl);
2412 encode(num_bytes_hit_set_archive, bl);
2413 encode(num_flush, bl);
2414 encode(num_flush_kb, bl);
2415 encode(num_evict, bl);
2416 encode(num_evict_kb, bl);
2417 encode(num_promote, bl);
2418 encode(num_flush_mode_high, bl);
2419 encode(num_flush_mode_low, bl);
2420 encode(num_evict_mode_some, bl);
2421 encode(num_evict_mode_full, bl);
2422 encode(num_objects_pinned, bl);
2423 encode(num_objects_missing, bl);
2424 encode(num_legacy_snapsets, bl);
2425 encode(num_large_omap_objects, bl);
2426 encode(num_objects_manifest, bl);
2427 encode(num_omap_bytes, bl);
2428 encode(num_omap_keys, bl);
2429 encode(num_objects_repaired, bl);
2430 #endif
2431 ENCODE_FINISH(bl);
2432 }
2433
2434 void object_stat_sum_t::decode(bufferlist::const_iterator& bl)
2435 {
2436 bool decode_finish = false;
2437 static const int STAT_SUM_DECODE_VERSION = 20;
2438 DECODE_START(STAT_SUM_DECODE_VERSION, bl);
2439 #if defined(CEPH_LITTLE_ENDIAN)
2440 if (struct_v == STAT_SUM_DECODE_VERSION) {
2441 bl.copy(sizeof(object_stat_sum_t), (char*)(&num_bytes));
2442 decode_finish = true;
2443 }
2444 #endif
2445 if (!decode_finish) {
2446 decode(num_bytes, bl);
2447 decode(num_objects, bl);
2448 decode(num_object_clones, bl);
2449 decode(num_object_copies, bl);
2450 decode(num_objects_missing_on_primary, bl);
2451 decode(num_objects_degraded, bl);
2452 decode(num_objects_unfound, bl);
2453 decode(num_rd, bl);
2454 decode(num_rd_kb, bl);
2455 decode(num_wr, bl);
2456 decode(num_wr_kb, bl);
2457 decode(num_scrub_errors, bl);
2458 decode(num_objects_recovered, bl);
2459 decode(num_bytes_recovered, bl);
2460 decode(num_keys_recovered, bl);
2461 decode(num_shallow_scrub_errors, bl);
2462 decode(num_deep_scrub_errors, bl);
2463 decode(num_objects_dirty, bl);
2464 decode(num_whiteouts, bl);
2465 decode(num_objects_omap, bl);
2466 decode(num_objects_hit_set_archive, bl);
2467 decode(num_objects_misplaced, bl);
2468 decode(num_bytes_hit_set_archive, bl);
2469 decode(num_flush, bl);
2470 decode(num_flush_kb, bl);
2471 decode(num_evict, bl);
2472 decode(num_evict_kb, bl);
2473 decode(num_promote, bl);
2474 decode(num_flush_mode_high, bl);
2475 decode(num_flush_mode_low, bl);
2476 decode(num_evict_mode_some, bl);
2477 decode(num_evict_mode_full, bl);
2478 decode(num_objects_pinned, bl);
2479 decode(num_objects_missing, bl);
2480 if (struct_v >= 16) {
2481 decode(num_legacy_snapsets, bl);
2482 } else {
2483 num_legacy_snapsets = num_object_clones; // upper bound
2484 }
2485 if (struct_v >= 17) {
2486 decode(num_large_omap_objects, bl);
2487 }
2488 if (struct_v >= 18) {
2489 decode(num_objects_manifest, bl);
2490 }
2491 if (struct_v >= 19) {
2492 decode(num_omap_bytes, bl);
2493 decode(num_omap_keys, bl);
2494 }
2495 if (struct_v >= 20) {
2496 decode(num_objects_repaired, bl);
2497 }
2498 }
2499 DECODE_FINISH(bl);
2500 }
2501
2502 void object_stat_sum_t::generate_test_instances(list<object_stat_sum_t*>& o)
2503 {
2504 object_stat_sum_t a;
2505
2506 a.num_bytes = 1;
2507 a.num_objects = 3;
2508 a.num_object_clones = 4;
2509 a.num_object_copies = 5;
2510 a.num_objects_missing_on_primary = 6;
2511 a.num_objects_missing = 123;
2512 a.num_objects_degraded = 7;
2513 a.num_objects_unfound = 8;
2514 a.num_rd = 9; a.num_rd_kb = 10;
2515 a.num_wr = 11; a.num_wr_kb = 12;
2516 a.num_objects_recovered = 14;
2517 a.num_bytes_recovered = 15;
2518 a.num_keys_recovered = 16;
2519 a.num_deep_scrub_errors = 17;
2520 a.num_shallow_scrub_errors = 18;
2521 a.num_scrub_errors = a.num_deep_scrub_errors + a.num_shallow_scrub_errors;
2522 a.num_objects_dirty = 21;
2523 a.num_whiteouts = 22;
2524 a.num_objects_misplaced = 1232;
2525 a.num_objects_hit_set_archive = 2;
2526 a.num_bytes_hit_set_archive = 27;
2527 a.num_flush = 5;
2528 a.num_flush_kb = 6;
2529 a.num_evict = 7;
2530 a.num_evict_kb = 8;
2531 a.num_promote = 9;
2532 a.num_flush_mode_high = 0;
2533 a.num_flush_mode_low = 1;
2534 a.num_evict_mode_some = 1;
2535 a.num_evict_mode_full = 0;
2536 a.num_objects_pinned = 20;
2537 a.num_large_omap_objects = 5;
2538 a.num_objects_manifest = 2;
2539 a.num_omap_bytes = 20000;
2540 a.num_omap_keys = 200;
2541 a.num_objects_repaired = 300;
2542 o.push_back(new object_stat_sum_t(a));
2543 }
2544
2545 void object_stat_sum_t::add(const object_stat_sum_t& o)
2546 {
2547 num_bytes += o.num_bytes;
2548 num_objects += o.num_objects;
2549 num_object_clones += o.num_object_clones;
2550 num_object_copies += o.num_object_copies;
2551 num_objects_missing_on_primary += o.num_objects_missing_on_primary;
2552 num_objects_missing += o.num_objects_missing;
2553 num_objects_degraded += o.num_objects_degraded;
2554 num_objects_misplaced += o.num_objects_misplaced;
2555 num_rd += o.num_rd;
2556 num_rd_kb += o.num_rd_kb;
2557 num_wr += o.num_wr;
2558 num_wr_kb += o.num_wr_kb;
2559 num_objects_unfound += o.num_objects_unfound;
2560 num_scrub_errors += o.num_scrub_errors;
2561 num_shallow_scrub_errors += o.num_shallow_scrub_errors;
2562 num_deep_scrub_errors += o.num_deep_scrub_errors;
2563 num_objects_recovered += o.num_objects_recovered;
2564 num_bytes_recovered += o.num_bytes_recovered;
2565 num_keys_recovered += o.num_keys_recovered;
2566 num_objects_dirty += o.num_objects_dirty;
2567 num_whiteouts += o.num_whiteouts;
2568 num_objects_omap += o.num_objects_omap;
2569 num_objects_hit_set_archive += o.num_objects_hit_set_archive;
2570 num_bytes_hit_set_archive += o.num_bytes_hit_set_archive;
2571 num_flush += o.num_flush;
2572 num_flush_kb += o.num_flush_kb;
2573 num_evict += o.num_evict;
2574 num_evict_kb += o.num_evict_kb;
2575 num_promote += o.num_promote;
2576 num_flush_mode_high += o.num_flush_mode_high;
2577 num_flush_mode_low += o.num_flush_mode_low;
2578 num_evict_mode_some += o.num_evict_mode_some;
2579 num_evict_mode_full += o.num_evict_mode_full;
2580 num_objects_pinned += o.num_objects_pinned;
2581 num_legacy_snapsets += o.num_legacy_snapsets;
2582 num_large_omap_objects += o.num_large_omap_objects;
2583 num_objects_manifest += o.num_objects_manifest;
2584 num_omap_bytes += o.num_omap_bytes;
2585 num_omap_keys += o.num_omap_keys;
2586 num_objects_repaired += o.num_objects_repaired;
2587 }
2588
2589 void object_stat_sum_t::sub(const object_stat_sum_t& o)
2590 {
2591 num_bytes -= o.num_bytes;
2592 num_objects -= o.num_objects;
2593 num_object_clones -= o.num_object_clones;
2594 num_object_copies -= o.num_object_copies;
2595 num_objects_missing_on_primary -= o.num_objects_missing_on_primary;
2596 num_objects_missing -= o.num_objects_missing;
2597 num_objects_degraded -= o.num_objects_degraded;
2598 num_objects_misplaced -= o.num_objects_misplaced;
2599 num_rd -= o.num_rd;
2600 num_rd_kb -= o.num_rd_kb;
2601 num_wr -= o.num_wr;
2602 num_wr_kb -= o.num_wr_kb;
2603 num_objects_unfound -= o.num_objects_unfound;
2604 num_scrub_errors -= o.num_scrub_errors;
2605 num_shallow_scrub_errors -= o.num_shallow_scrub_errors;
2606 num_deep_scrub_errors -= o.num_deep_scrub_errors;
2607 num_objects_recovered -= o.num_objects_recovered;
2608 num_bytes_recovered -= o.num_bytes_recovered;
2609 num_keys_recovered -= o.num_keys_recovered;
2610 num_objects_dirty -= o.num_objects_dirty;
2611 num_whiteouts -= o.num_whiteouts;
2612 num_objects_omap -= o.num_objects_omap;
2613 num_objects_hit_set_archive -= o.num_objects_hit_set_archive;
2614 num_bytes_hit_set_archive -= o.num_bytes_hit_set_archive;
2615 num_flush -= o.num_flush;
2616 num_flush_kb -= o.num_flush_kb;
2617 num_evict -= o.num_evict;
2618 num_evict_kb -= o.num_evict_kb;
2619 num_promote -= o.num_promote;
2620 num_flush_mode_high -= o.num_flush_mode_high;
2621 num_flush_mode_low -= o.num_flush_mode_low;
2622 num_evict_mode_some -= o.num_evict_mode_some;
2623 num_evict_mode_full -= o.num_evict_mode_full;
2624 num_objects_pinned -= o.num_objects_pinned;
2625 num_legacy_snapsets -= o.num_legacy_snapsets;
2626 num_large_omap_objects -= o.num_large_omap_objects;
2627 num_objects_manifest -= o.num_objects_manifest;
2628 num_omap_bytes -= o.num_omap_bytes;
2629 num_omap_keys -= o.num_omap_keys;
2630 num_objects_repaired -= o.num_objects_repaired;
2631 }
2632
2633 bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r)
2634 {
2635 return
2636 l.num_bytes == r.num_bytes &&
2637 l.num_objects == r.num_objects &&
2638 l.num_object_clones == r.num_object_clones &&
2639 l.num_object_copies == r.num_object_copies &&
2640 l.num_objects_missing_on_primary == r.num_objects_missing_on_primary &&
2641 l.num_objects_missing == r.num_objects_missing &&
2642 l.num_objects_degraded == r.num_objects_degraded &&
2643 l.num_objects_misplaced == r.num_objects_misplaced &&
2644 l.num_objects_unfound == r.num_objects_unfound &&
2645 l.num_rd == r.num_rd &&
2646 l.num_rd_kb == r.num_rd_kb &&
2647 l.num_wr == r.num_wr &&
2648 l.num_wr_kb == r.num_wr_kb &&
2649 l.num_scrub_errors == r.num_scrub_errors &&
2650 l.num_shallow_scrub_errors == r.num_shallow_scrub_errors &&
2651 l.num_deep_scrub_errors == r.num_deep_scrub_errors &&
2652 l.num_objects_recovered == r.num_objects_recovered &&
2653 l.num_bytes_recovered == r.num_bytes_recovered &&
2654 l.num_keys_recovered == r.num_keys_recovered &&
2655 l.num_objects_dirty == r.num_objects_dirty &&
2656 l.num_whiteouts == r.num_whiteouts &&
2657 l.num_objects_omap == r.num_objects_omap &&
2658 l.num_objects_hit_set_archive == r.num_objects_hit_set_archive &&
2659 l.num_bytes_hit_set_archive == r.num_bytes_hit_set_archive &&
2660 l.num_flush == r.num_flush &&
2661 l.num_flush_kb == r.num_flush_kb &&
2662 l.num_evict == r.num_evict &&
2663 l.num_evict_kb == r.num_evict_kb &&
2664 l.num_promote == r.num_promote &&
2665 l.num_flush_mode_high == r.num_flush_mode_high &&
2666 l.num_flush_mode_low == r.num_flush_mode_low &&
2667 l.num_evict_mode_some == r.num_evict_mode_some &&
2668 l.num_evict_mode_full == r.num_evict_mode_full &&
2669 l.num_objects_pinned == r.num_objects_pinned &&
2670 l.num_legacy_snapsets == r.num_legacy_snapsets &&
2671 l.num_large_omap_objects == r.num_large_omap_objects &&
2672 l.num_objects_manifest == r.num_objects_manifest &&
2673 l.num_omap_bytes == r.num_omap_bytes &&
2674 l.num_omap_keys == r.num_omap_keys &&
2675 l.num_objects_repaired == r.num_objects_repaired;
2676 }
2677
2678 // -- object_stat_collection_t --
2679
2680 void object_stat_collection_t::dump(Formatter *f) const
2681 {
2682 f->open_object_section("stat_sum");
2683 sum.dump(f);
2684 f->close_section();
2685 }
2686
2687 void object_stat_collection_t::encode(bufferlist& bl) const
2688 {
2689 ENCODE_START(2, 2, bl);
2690 encode(sum, bl);
2691 encode((__u32)0, bl);
2692 ENCODE_FINISH(bl);
2693 }
2694
2695 void object_stat_collection_t::decode(bufferlist::const_iterator& bl)
2696 {
2697 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
2698 decode(sum, bl);
2699 {
2700 map<string,object_stat_sum_t> cat_sum;
2701 decode(cat_sum, bl);
2702 }
2703 DECODE_FINISH(bl);
2704 }
2705
2706 void object_stat_collection_t::generate_test_instances(list<object_stat_collection_t*>& o)
2707 {
2708 object_stat_collection_t a;
2709 o.push_back(new object_stat_collection_t(a));
2710 list<object_stat_sum_t*> l;
2711 object_stat_sum_t::generate_test_instances(l);
2712 for (list<object_stat_sum_t*>::iterator p = l.begin(); p != l.end(); ++p) {
2713 a.add(**p);
2714 o.push_back(new object_stat_collection_t(a));
2715 }
2716 }
2717
2718
2719 // -- pg_stat_t --
2720
2721 bool pg_stat_t::is_acting_osd(int32_t osd, bool primary) const
2722 {
2723 if (primary && osd == acting_primary) {
2724 return true;
2725 } else if (!primary) {
2726 for(vector<int32_t>::const_iterator it = acting.begin();
2727 it != acting.end(); ++it)
2728 {
2729 if (*it == osd)
2730 return true;
2731 }
2732 }
2733 return false;
2734 }
2735
2736 void pg_stat_t::dump(Formatter *f) const
2737 {
2738 f->dump_stream("version") << version;
2739 f->dump_stream("reported_seq") << reported_seq;
2740 f->dump_stream("reported_epoch") << reported_epoch;
2741 f->dump_string("state", pg_state_string(state));
2742 f->dump_stream("last_fresh") << last_fresh;
2743 f->dump_stream("last_change") << last_change;
2744 f->dump_stream("last_active") << last_active;
2745 f->dump_stream("last_peered") << last_peered;
2746 f->dump_stream("last_clean") << last_clean;
2747 f->dump_stream("last_became_active") << last_became_active;
2748 f->dump_stream("last_became_peered") << last_became_peered;
2749 f->dump_stream("last_unstale") << last_unstale;
2750 f->dump_stream("last_undegraded") << last_undegraded;
2751 f->dump_stream("last_fullsized") << last_fullsized;
2752 f->dump_unsigned("mapping_epoch", mapping_epoch);
2753 f->dump_stream("log_start") << log_start;
2754 f->dump_stream("ondisk_log_start") << ondisk_log_start;
2755 f->dump_unsigned("created", created);
2756 f->dump_unsigned("last_epoch_clean", last_epoch_clean);
2757 f->dump_stream("parent") << parent;
2758 f->dump_unsigned("parent_split_bits", parent_split_bits);
2759 f->dump_stream("last_scrub") << last_scrub;
2760 f->dump_stream("last_scrub_stamp") << last_scrub_stamp;
2761 f->dump_stream("last_deep_scrub") << last_deep_scrub;
2762 f->dump_stream("last_deep_scrub_stamp") << last_deep_scrub_stamp;
2763 f->dump_stream("last_clean_scrub_stamp") << last_clean_scrub_stamp;
2764 f->dump_int("log_size", log_size);
2765 f->dump_int("ondisk_log_size", ondisk_log_size);
2766 f->dump_bool("stats_invalid", stats_invalid);
2767 f->dump_bool("dirty_stats_invalid", dirty_stats_invalid);
2768 f->dump_bool("omap_stats_invalid", omap_stats_invalid);
2769 f->dump_bool("hitset_stats_invalid", hitset_stats_invalid);
2770 f->dump_bool("hitset_bytes_stats_invalid", hitset_bytes_stats_invalid);
2771 f->dump_bool("pin_stats_invalid", pin_stats_invalid);
2772 f->dump_bool("manifest_stats_invalid", manifest_stats_invalid);
2773 f->dump_unsigned("snaptrimq_len", snaptrimq_len);
2774 stats.dump(f);
2775 f->open_array_section("up");
2776 for (vector<int32_t>::const_iterator p = up.begin(); p != up.end(); ++p)
2777 f->dump_int("osd", *p);
2778 f->close_section();
2779 f->open_array_section("acting");
2780 for (vector<int32_t>::const_iterator p = acting.begin(); p != acting.end(); ++p)
2781 f->dump_int("osd", *p);
2782 f->close_section();
2783 f->open_array_section("avail_no_missing");
2784 for (auto p = avail_no_missing.cbegin(); p != avail_no_missing.cend(); ++p)
2785 f->dump_stream("shard") << *p;
2786 f->close_section();
2787 f->open_array_section("object_location_counts");
2788 for (auto p = object_location_counts.cbegin(); p != object_location_counts.cend(); ++p) {
2789 f->open_object_section("entry");
2790 f->dump_stream("shards") << p->first;
2791 f->dump_int("objects", p->second);
2792 f->close_section();
2793 }
2794 f->close_section();
2795 f->open_array_section("blocked_by");
2796 for (vector<int32_t>::const_iterator p = blocked_by.begin();
2797 p != blocked_by.end(); ++p)
2798 f->dump_int("osd", *p);
2799 f->close_section();
2800 f->dump_int("up_primary", up_primary);
2801 f->dump_int("acting_primary", acting_primary);
2802 f->open_array_section("purged_snaps");
2803 for (interval_set<snapid_t>::const_iterator i = purged_snaps.begin();
2804 i != purged_snaps.end();
2805 ++i) {
2806 f->open_object_section("interval");
2807 f->dump_stream("start") << i.get_start();
2808 f->dump_stream("length") << i.get_len();
2809 f->close_section();
2810 }
2811 f->close_section();
2812 }
2813
2814 void pg_stat_t::dump_brief(Formatter *f) const
2815 {
2816 f->dump_string("state", pg_state_string(state));
2817 f->open_array_section("up");
2818 for (vector<int32_t>::const_iterator p = up.begin(); p != up.end(); ++p)
2819 f->dump_int("osd", *p);
2820 f->close_section();
2821 f->open_array_section("acting");
2822 for (vector<int32_t>::const_iterator p = acting.begin(); p != acting.end(); ++p)
2823 f->dump_int("osd", *p);
2824 f->close_section();
2825 f->dump_int("up_primary", up_primary);
2826 f->dump_int("acting_primary", acting_primary);
2827 }
2828
2829 void pg_stat_t::encode(bufferlist &bl) const
2830 {
2831 ENCODE_START(26, 22, bl);
2832 encode(version, bl);
2833 encode(reported_seq, bl);
2834 encode(reported_epoch, bl);
2835 encode((__u32)state, bl); // for older peers
2836 encode(log_start, bl);
2837 encode(ondisk_log_start, bl);
2838 encode(created, bl);
2839 encode(last_epoch_clean, bl);
2840 encode(parent, bl);
2841 encode(parent_split_bits, bl);
2842 encode(last_scrub, bl);
2843 encode(last_scrub_stamp, bl);
2844 encode(stats, bl);
2845 encode(log_size, bl);
2846 encode(ondisk_log_size, bl);
2847 encode(up, bl);
2848 encode(acting, bl);
2849 encode(last_fresh, bl);
2850 encode(last_change, bl);
2851 encode(last_active, bl);
2852 encode(last_clean, bl);
2853 encode(last_unstale, bl);
2854 encode(mapping_epoch, bl);
2855 encode(last_deep_scrub, bl);
2856 encode(last_deep_scrub_stamp, bl);
2857 encode(stats_invalid, bl);
2858 encode(last_clean_scrub_stamp, bl);
2859 encode(last_became_active, bl);
2860 encode(dirty_stats_invalid, bl);
2861 encode(up_primary, bl);
2862 encode(acting_primary, bl);
2863 encode(omap_stats_invalid, bl);
2864 encode(hitset_stats_invalid, bl);
2865 encode(blocked_by, bl);
2866 encode(last_undegraded, bl);
2867 encode(last_fullsized, bl);
2868 encode(hitset_bytes_stats_invalid, bl);
2869 encode(last_peered, bl);
2870 encode(last_became_peered, bl);
2871 encode(pin_stats_invalid, bl);
2872 encode(snaptrimq_len, bl);
2873 __u32 top_state = (state >> 32);
2874 encode(top_state, bl);
2875 encode(purged_snaps, bl);
2876 encode(manifest_stats_invalid, bl);
2877 encode(avail_no_missing, bl);
2878 encode(object_location_counts, bl);
2879 ENCODE_FINISH(bl);
2880 }
2881
2882 void pg_stat_t::decode(bufferlist::const_iterator &bl)
2883 {
2884 bool tmp;
2885 uint32_t old_state;
2886 DECODE_START(26, bl);
2887 decode(version, bl);
2888 decode(reported_seq, bl);
2889 decode(reported_epoch, bl);
2890 decode(old_state, bl);
2891 decode(log_start, bl);
2892 decode(ondisk_log_start, bl);
2893 decode(created, bl);
2894 decode(last_epoch_clean, bl);
2895 decode(parent, bl);
2896 decode(parent_split_bits, bl);
2897 decode(last_scrub, bl);
2898 decode(last_scrub_stamp, bl);
2899 decode(stats, bl);
2900 decode(log_size, bl);
2901 decode(ondisk_log_size, bl);
2902 decode(up, bl);
2903 decode(acting, bl);
2904 decode(last_fresh, bl);
2905 decode(last_change, bl);
2906 decode(last_active, bl);
2907 decode(last_clean, bl);
2908 decode(last_unstale, bl);
2909 decode(mapping_epoch, bl);
2910 decode(last_deep_scrub, bl);
2911 decode(last_deep_scrub_stamp, bl);
2912 decode(tmp, bl);
2913 stats_invalid = tmp;
2914 decode(last_clean_scrub_stamp, bl);
2915 decode(last_became_active, bl);
2916 decode(tmp, bl);
2917 dirty_stats_invalid = tmp;
2918 decode(up_primary, bl);
2919 decode(acting_primary, bl);
2920 decode(tmp, bl);
2921 omap_stats_invalid = tmp;
2922 decode(tmp, bl);
2923 hitset_stats_invalid = tmp;
2924 decode(blocked_by, bl);
2925 decode(last_undegraded, bl);
2926 decode(last_fullsized, bl);
2927 decode(tmp, bl);
2928 hitset_bytes_stats_invalid = tmp;
2929 decode(last_peered, bl);
2930 decode(last_became_peered, bl);
2931 decode(tmp, bl);
2932 pin_stats_invalid = tmp;
2933 if (struct_v >= 23) {
2934 decode(snaptrimq_len, bl);
2935 if (struct_v >= 24) {
2936 __u32 top_state;
2937 decode(top_state, bl);
2938 state = (uint64_t)old_state | ((uint64_t)top_state << 32);
2939 decode(purged_snaps, bl);
2940 } else {
2941 state = old_state;
2942 }
2943 if (struct_v >= 25) {
2944 decode(tmp, bl);
2945 manifest_stats_invalid = tmp;
2946 } else {
2947 manifest_stats_invalid = true;
2948 }
2949 if (struct_v >= 26) {
2950 decode(avail_no_missing, bl);
2951 decode(object_location_counts, bl);
2952 }
2953 }
2954 DECODE_FINISH(bl);
2955 }
2956
2957 void pg_stat_t::generate_test_instances(list<pg_stat_t*>& o)
2958 {
2959 pg_stat_t a;
2960 o.push_back(new pg_stat_t(a));
2961
2962 a.version = eversion_t(1, 3);
2963 a.reported_epoch = 1;
2964 a.reported_seq = 2;
2965 a.state = 123;
2966 a.mapping_epoch = 998;
2967 a.last_fresh = utime_t(1002, 1);
2968 a.last_change = utime_t(1002, 2);
2969 a.last_active = utime_t(1002, 3);
2970 a.last_clean = utime_t(1002, 4);
2971 a.last_unstale = utime_t(1002, 5);
2972 a.last_undegraded = utime_t(1002, 7);
2973 a.last_fullsized = utime_t(1002, 8);
2974 a.log_start = eversion_t(1, 4);
2975 a.ondisk_log_start = eversion_t(1, 5);
2976 a.created = 6;
2977 a.last_epoch_clean = 7;
2978 a.parent = pg_t(1, 2);
2979 a.parent_split_bits = 12;
2980 a.last_scrub = eversion_t(9, 10);
2981 a.last_scrub_stamp = utime_t(11, 12);
2982 a.last_deep_scrub = eversion_t(13, 14);
2983 a.last_deep_scrub_stamp = utime_t(15, 16);
2984 a.last_clean_scrub_stamp = utime_t(17, 18);
2985 a.snaptrimq_len = 1048576;
2986 list<object_stat_collection_t*> l;
2987 object_stat_collection_t::generate_test_instances(l);
2988 a.stats = *l.back();
2989 a.log_size = 99;
2990 a.ondisk_log_size = 88;
2991 a.up.push_back(123);
2992 a.up_primary = 123;
2993 a.acting.push_back(456);
2994 a.avail_no_missing.push_back(pg_shard_t(456, shard_id_t::NO_SHARD));
2995 set<pg_shard_t> sset = { pg_shard_t(0), pg_shard_t(1) };
2996 a.object_location_counts.insert(make_pair(sset, 10));
2997 sset.insert(pg_shard_t(2));
2998 a.object_location_counts.insert(make_pair(sset, 5));
2999 a.acting_primary = 456;
3000 o.push_back(new pg_stat_t(a));
3001
3002 a.up.push_back(124);
3003 a.up_primary = 124;
3004 a.acting.push_back(124);
3005 a.acting_primary = 124;
3006 a.blocked_by.push_back(155);
3007 a.blocked_by.push_back(156);
3008 o.push_back(new pg_stat_t(a));
3009 }
3010
3011 bool operator==(const pg_stat_t& l, const pg_stat_t& r)
3012 {
3013 return
3014 l.version == r.version &&
3015 l.reported_seq == r.reported_seq &&
3016 l.reported_epoch == r.reported_epoch &&
3017 l.state == r.state &&
3018 l.last_fresh == r.last_fresh &&
3019 l.last_change == r.last_change &&
3020 l.last_active == r.last_active &&
3021 l.last_peered == r.last_peered &&
3022 l.last_clean == r.last_clean &&
3023 l.last_unstale == r.last_unstale &&
3024 l.last_undegraded == r.last_undegraded &&
3025 l.last_fullsized == r.last_fullsized &&
3026 l.log_start == r.log_start &&
3027 l.ondisk_log_start == r.ondisk_log_start &&
3028 l.created == r.created &&
3029 l.last_epoch_clean == r.last_epoch_clean &&
3030 l.parent == r.parent &&
3031 l.parent_split_bits == r.parent_split_bits &&
3032 l.last_scrub == r.last_scrub &&
3033 l.last_deep_scrub == r.last_deep_scrub &&
3034 l.last_scrub_stamp == r.last_scrub_stamp &&
3035 l.last_deep_scrub_stamp == r.last_deep_scrub_stamp &&
3036 l.last_clean_scrub_stamp == r.last_clean_scrub_stamp &&
3037 l.stats == r.stats &&
3038 l.stats_invalid == r.stats_invalid &&
3039 l.log_size == r.log_size &&
3040 l.ondisk_log_size == r.ondisk_log_size &&
3041 l.up == r.up &&
3042 l.acting == r.acting &&
3043 l.avail_no_missing == r.avail_no_missing &&
3044 l.object_location_counts == r.object_location_counts &&
3045 l.mapping_epoch == r.mapping_epoch &&
3046 l.blocked_by == r.blocked_by &&
3047 l.last_became_active == r.last_became_active &&
3048 l.last_became_peered == r.last_became_peered &&
3049 l.dirty_stats_invalid == r.dirty_stats_invalid &&
3050 l.omap_stats_invalid == r.omap_stats_invalid &&
3051 l.hitset_stats_invalid == r.hitset_stats_invalid &&
3052 l.hitset_bytes_stats_invalid == r.hitset_bytes_stats_invalid &&
3053 l.up_primary == r.up_primary &&
3054 l.acting_primary == r.acting_primary &&
3055 l.pin_stats_invalid == r.pin_stats_invalid &&
3056 l.manifest_stats_invalid == r.manifest_stats_invalid &&
3057 l.purged_snaps == r.purged_snaps &&
3058 l.snaptrimq_len == r.snaptrimq_len;
3059 }
3060
3061 // -- store_statfs_t --
3062
3063 bool store_statfs_t::operator==(const store_statfs_t& other) const
3064 {
3065 return total == other.total
3066 && available == other.available
3067 && allocated == other.allocated
3068 && internally_reserved == other.internally_reserved
3069 && data_stored == other.data_stored
3070 && data_compressed == other.data_compressed
3071 && data_compressed_allocated == other.data_compressed_allocated
3072 && data_compressed_original == other.data_compressed_original
3073 && omap_allocated == other.omap_allocated
3074 && internal_metadata == other.internal_metadata;
3075 }
3076
3077 void store_statfs_t::dump(Formatter *f) const
3078 {
3079 f->dump_int("total", total);
3080 f->dump_int("available", available);
3081 f->dump_int("internally_reserved", internally_reserved);
3082 f->dump_int("allocated", allocated);
3083 f->dump_int("data_stored", data_stored);
3084 f->dump_int("data_compressed", data_compressed);
3085 f->dump_int("data_compressed_allocated", data_compressed_allocated);
3086 f->dump_int("data_compressed_original", data_compressed_original);
3087 f->dump_int("omap_allocated", omap_allocated);
3088 f->dump_int("internal_metadata", internal_metadata);
3089 }
3090
3091 ostream& operator<<(ostream& out, const store_statfs_t &s)
3092 {
3093 out << std::hex
3094 << "store_statfs(0x" << s.available
3095 << "/0x" << s.internally_reserved
3096 << "/0x" << s.total
3097 << ", data 0x" << s.data_stored
3098 << "/0x" << s.allocated
3099 << ", compress 0x" << s.data_compressed
3100 << "/0x" << s.data_compressed_allocated
3101 << "/0x" << s.data_compressed_original
3102 << ", omap 0x" << s.omap_allocated
3103 << ", meta 0x" << s.internal_metadata
3104 << std::dec
3105 << ")";
3106 return out;
3107 }
3108
3109 void store_statfs_t::generate_test_instances(list<store_statfs_t*>& o)
3110 {
3111 store_statfs_t a;
3112 o.push_back(new store_statfs_t(a));
3113 a.total = 234;
3114 a.available = 123;
3115 a.internally_reserved = 33;
3116 a.allocated = 32;
3117 a.data_stored = 44;
3118 a.data_compressed = 21;
3119 a.data_compressed_allocated = 12;
3120 a.data_compressed_original = 13;
3121 a.omap_allocated = 14;
3122 a.internal_metadata = 15;
3123 o.push_back(new store_statfs_t(a));
3124 }
3125
3126 // -- pool_stat_t --
3127
3128 void pool_stat_t::dump(Formatter *f) const
3129 {
3130 stats.dump(f);
3131 f->open_object_section("store_stats");
3132 store_stats.dump(f);
3133 f->close_section();
3134 f->dump_int("log_size", log_size);
3135 f->dump_int("ondisk_log_size", ondisk_log_size);
3136 f->dump_int("up", up);
3137 f->dump_int("acting", acting);
3138 f->dump_int("num_store_stats", num_store_stats);
3139 }
3140
3141 void pool_stat_t::encode(bufferlist &bl, uint64_t features) const
3142 {
3143 using ceph::encode;
3144 if ((features & CEPH_FEATURE_OSDENC) == 0) {
3145 __u8 v = 4;
3146 encode(v, bl);
3147 encode(stats, bl);
3148 encode(log_size, bl);
3149 encode(ondisk_log_size, bl);
3150 return;
3151 }
3152
3153 ENCODE_START(7, 5, bl);
3154 encode(stats, bl);
3155 encode(log_size, bl);
3156 encode(ondisk_log_size, bl);
3157 encode(up, bl);
3158 encode(acting, bl);
3159 encode(store_stats, bl);
3160 encode(num_store_stats, bl);
3161 ENCODE_FINISH(bl);
3162 }
3163
3164 void pool_stat_t::decode(bufferlist::const_iterator &bl)
3165 {
3166 DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl);
3167 if (struct_v >= 4) {
3168 decode(stats, bl);
3169 decode(log_size, bl);
3170 decode(ondisk_log_size, bl);
3171 if (struct_v >= 6) {
3172 decode(up, bl);
3173 decode(acting, bl);
3174 } else {
3175 up = 0;
3176 acting = 0;
3177 }
3178 if (struct_v >= 7) {
3179 decode(store_stats, bl);
3180 decode(num_store_stats, bl);
3181 } else {
3182 store_stats.reset();
3183 num_store_stats = 0;
3184 }
3185
3186 } else {
3187 decode(stats.sum.num_bytes, bl);
3188 uint64_t num_kb;
3189 decode(num_kb, bl);
3190 decode(stats.sum.num_objects, bl);
3191 decode(stats.sum.num_object_clones, bl);
3192 decode(stats.sum.num_object_copies, bl);
3193 decode(stats.sum.num_objects_missing_on_primary, bl);
3194 decode(stats.sum.num_objects_degraded, bl);
3195 decode(log_size, bl);
3196 decode(ondisk_log_size, bl);
3197 if (struct_v >= 2) {
3198 decode(stats.sum.num_rd, bl);
3199 decode(stats.sum.num_rd_kb, bl);
3200 decode(stats.sum.num_wr, bl);
3201 decode(stats.sum.num_wr_kb, bl);
3202 }
3203 if (struct_v >= 3) {
3204 decode(stats.sum.num_objects_unfound, bl);
3205 }
3206 }
3207 DECODE_FINISH(bl);
3208 }
3209
3210 void pool_stat_t::generate_test_instances(list<pool_stat_t*>& o)
3211 {
3212 pool_stat_t a;
3213 o.push_back(new pool_stat_t(a));
3214
3215 list<object_stat_collection_t*> l;
3216 object_stat_collection_t::generate_test_instances(l);
3217 list<store_statfs_t*> ll;
3218 store_statfs_t::generate_test_instances(ll);
3219 a.stats = *l.back();
3220 a.store_stats = *ll.back();
3221 a.log_size = 123;
3222 a.ondisk_log_size = 456;
3223 a.acting = 3;
3224 a.up = 4;
3225 a.num_store_stats = 1;
3226 o.push_back(new pool_stat_t(a));
3227 }
3228
3229
3230 // -- pg_history_t --
3231
3232 void pg_history_t::encode(bufferlist &bl) const
3233 {
3234 ENCODE_START(9, 4, bl);
3235 encode(epoch_created, bl);
3236 encode(last_epoch_started, bl);
3237 encode(last_epoch_clean, bl);
3238 encode(last_epoch_split, bl);
3239 encode(same_interval_since, bl);
3240 encode(same_up_since, bl);
3241 encode(same_primary_since, bl);
3242 encode(last_scrub, bl);
3243 encode(last_scrub_stamp, bl);
3244 encode(last_deep_scrub, bl);
3245 encode(last_deep_scrub_stamp, bl);
3246 encode(last_clean_scrub_stamp, bl);
3247 encode(last_epoch_marked_full, bl);
3248 encode(last_interval_started, bl);
3249 encode(last_interval_clean, bl);
3250 encode(epoch_pool_created, bl);
3251 ENCODE_FINISH(bl);
3252 }
3253
3254 void pg_history_t::decode(bufferlist::const_iterator &bl)
3255 {
3256 DECODE_START_LEGACY_COMPAT_LEN(9, 4, 4, bl);
3257 decode(epoch_created, bl);
3258 decode(last_epoch_started, bl);
3259 if (struct_v >= 3)
3260 decode(last_epoch_clean, bl);
3261 else
3262 last_epoch_clean = last_epoch_started; // careful, it's a lie!
3263 decode(last_epoch_split, bl);
3264 decode(same_interval_since, bl);
3265 decode(same_up_since, bl);
3266 decode(same_primary_since, bl);
3267 if (struct_v >= 2) {
3268 decode(last_scrub, bl);
3269 decode(last_scrub_stamp, bl);
3270 }
3271 if (struct_v >= 5) {
3272 decode(last_deep_scrub, bl);
3273 decode(last_deep_scrub_stamp, bl);
3274 }
3275 if (struct_v >= 6) {
3276 decode(last_clean_scrub_stamp, bl);
3277 }
3278 if (struct_v >= 7) {
3279 decode(last_epoch_marked_full, bl);
3280 }
3281 if (struct_v >= 8) {
3282 decode(last_interval_started, bl);
3283 decode(last_interval_clean, bl);
3284 } else {
3285 if (last_epoch_started >= same_interval_since) {
3286 last_interval_started = same_interval_since;
3287 } else {
3288 last_interval_started = last_epoch_started; // best guess
3289 }
3290 if (last_epoch_clean >= same_interval_since) {
3291 last_interval_clean = same_interval_since;
3292 } else {
3293 last_interval_clean = last_epoch_clean; // best guess
3294 }
3295 }
3296 if (struct_v >= 9) {
3297 decode(epoch_pool_created, bl);
3298 } else {
3299 epoch_pool_created = epoch_created;
3300 }
3301 DECODE_FINISH(bl);
3302 }
3303
3304 void pg_history_t::dump(Formatter *f) const
3305 {
3306 f->dump_int("epoch_created", epoch_created);
3307 f->dump_int("epoch_pool_created", epoch_pool_created);
3308 f->dump_int("last_epoch_started", last_epoch_started);
3309 f->dump_int("last_interval_started", last_interval_started);
3310 f->dump_int("last_epoch_clean", last_epoch_clean);
3311 f->dump_int("last_interval_clean", last_interval_clean);
3312 f->dump_int("last_epoch_split", last_epoch_split);
3313 f->dump_int("last_epoch_marked_full", last_epoch_marked_full);
3314 f->dump_int("same_up_since", same_up_since);
3315 f->dump_int("same_interval_since", same_interval_since);
3316 f->dump_int("same_primary_since", same_primary_since);
3317 f->dump_stream("last_scrub") << last_scrub;
3318 f->dump_stream("last_scrub_stamp") << last_scrub_stamp;
3319 f->dump_stream("last_deep_scrub") << last_deep_scrub;
3320 f->dump_stream("last_deep_scrub_stamp") << last_deep_scrub_stamp;
3321 f->dump_stream("last_clean_scrub_stamp") << last_clean_scrub_stamp;
3322 }
3323
3324 void pg_history_t::generate_test_instances(list<pg_history_t*>& o)
3325 {
3326 o.push_back(new pg_history_t);
3327 o.push_back(new pg_history_t);
3328 o.back()->epoch_created = 1;
3329 o.back()->epoch_pool_created = 1;
3330 o.back()->last_epoch_started = 2;
3331 o.back()->last_interval_started = 2;
3332 o.back()->last_epoch_clean = 3;
3333 o.back()->last_interval_clean = 2;
3334 o.back()->last_epoch_split = 4;
3335 o.back()->same_up_since = 5;
3336 o.back()->same_interval_since = 6;
3337 o.back()->same_primary_since = 7;
3338 o.back()->last_scrub = eversion_t(8, 9);
3339 o.back()->last_scrub_stamp = utime_t(10, 11);
3340 o.back()->last_deep_scrub = eversion_t(12, 13);
3341 o.back()->last_deep_scrub_stamp = utime_t(14, 15);
3342 o.back()->last_clean_scrub_stamp = utime_t(16, 17);
3343 o.back()->last_epoch_marked_full = 18;
3344 }
3345
3346
3347 // -- pg_info_t --
3348
3349 void pg_info_t::encode(bufferlist &bl) const
3350 {
3351 ENCODE_START(32, 26, bl);
3352 encode(pgid.pgid, bl);
3353 encode(last_update, bl);
3354 encode(last_complete, bl);
3355 encode(log_tail, bl);
3356 if (last_backfill_bitwise && !last_backfill.is_max()) {
3357 encode(hobject_t(), bl);
3358 } else {
3359 encode(last_backfill, bl);
3360 }
3361 encode(stats, bl);
3362 history.encode(bl);
3363 encode(purged_snaps, bl);
3364 encode(last_epoch_started, bl);
3365 encode(last_user_version, bl);
3366 encode(hit_set, bl);
3367 encode(pgid.shard, bl);
3368 encode(last_backfill, bl);
3369 encode(last_backfill_bitwise, bl);
3370 encode(last_interval_started, bl);
3371 ENCODE_FINISH(bl);
3372 }
3373
3374 void pg_info_t::decode(bufferlist::const_iterator &bl)
3375 {
3376 DECODE_START(32, bl);
3377 decode(pgid.pgid, bl);
3378 decode(last_update, bl);
3379 decode(last_complete, bl);
3380 decode(log_tail, bl);
3381 {
3382 hobject_t old_last_backfill;
3383 decode(old_last_backfill, bl);
3384 }
3385 decode(stats, bl);
3386 history.decode(bl);
3387 decode(purged_snaps, bl);
3388 decode(last_epoch_started, bl);
3389 decode(last_user_version, bl);
3390 decode(hit_set, bl);
3391 decode(pgid.shard, bl);
3392 decode(last_backfill, bl);
3393 decode(last_backfill_bitwise, bl);
3394 if (struct_v >= 32) {
3395 decode(last_interval_started, bl);
3396 } else {
3397 last_interval_started = last_epoch_started;
3398 }
3399 DECODE_FINISH(bl);
3400 }
3401
3402 // -- pg_info_t --
3403
3404 void pg_info_t::dump(Formatter *f) const
3405 {
3406 f->dump_stream("pgid") << pgid;
3407 f->dump_stream("last_update") << last_update;
3408 f->dump_stream("last_complete") << last_complete;
3409 f->dump_stream("log_tail") << log_tail;
3410 f->dump_int("last_user_version", last_user_version);
3411 f->dump_stream("last_backfill") << last_backfill;
3412 f->dump_int("last_backfill_bitwise", (int)last_backfill_bitwise);
3413 f->open_array_section("purged_snaps");
3414 for (interval_set<snapid_t>::const_iterator i=purged_snaps.begin();
3415 i != purged_snaps.end();
3416 ++i) {
3417 f->open_object_section("purged_snap_interval");
3418 f->dump_stream("start") << i.get_start();
3419 f->dump_stream("length") << i.get_len();
3420 f->close_section();
3421 }
3422 f->close_section();
3423 f->open_object_section("history");
3424 history.dump(f);
3425 f->close_section();
3426 f->open_object_section("stats");
3427 stats.dump(f);
3428 f->close_section();
3429
3430 f->dump_int("empty", is_empty());
3431 f->dump_int("dne", dne());
3432 f->dump_int("incomplete", is_incomplete());
3433 f->dump_int("last_epoch_started", last_epoch_started);
3434
3435 f->open_object_section("hit_set_history");
3436 hit_set.dump(f);
3437 f->close_section();
3438 }
3439
3440 void pg_info_t::generate_test_instances(list<pg_info_t*>& o)
3441 {
3442 o.push_back(new pg_info_t);
3443 o.push_back(new pg_info_t);
3444 list<pg_history_t*> h;
3445 pg_history_t::generate_test_instances(h);
3446 o.back()->history = *h.back();
3447 o.back()->pgid = spg_t(pg_t(1, 2), shard_id_t::NO_SHARD);
3448 o.back()->last_update = eversion_t(3, 4);
3449 o.back()->last_complete = eversion_t(5, 6);
3450 o.back()->last_user_version = 2;
3451 o.back()->log_tail = eversion_t(7, 8);
3452 o.back()->last_backfill = hobject_t(object_t("objname"), "key", 123, 456, -1, "");
3453 o.back()->last_backfill_bitwise = true;
3454 {
3455 list<pg_stat_t*> s;
3456 pg_stat_t::generate_test_instances(s);
3457 o.back()->stats = *s.back();
3458 }
3459 {
3460 list<pg_hit_set_history_t*> s;
3461 pg_hit_set_history_t::generate_test_instances(s);
3462 o.back()->hit_set = *s.back();
3463 }
3464 }
3465
3466 // -- pg_notify_t --
3467 void pg_notify_t::encode(bufferlist &bl) const
3468 {
3469 ENCODE_START(2, 2, bl);
3470 encode(query_epoch, bl);
3471 encode(epoch_sent, bl);
3472 encode(info, bl);
3473 encode(to, bl);
3474 encode(from, bl);
3475 ENCODE_FINISH(bl);
3476 }
3477
3478 void pg_notify_t::decode(bufferlist::const_iterator &bl)
3479 {
3480 DECODE_START(2, bl);
3481 decode(query_epoch, bl);
3482 decode(epoch_sent, bl);
3483 decode(info, bl);
3484 decode(to, bl);
3485 decode(from, bl);
3486 DECODE_FINISH(bl);
3487 }
3488
3489 void pg_notify_t::dump(Formatter *f) const
3490 {
3491 f->dump_int("from", from);
3492 f->dump_int("to", to);
3493 f->dump_unsigned("query_epoch", query_epoch);
3494 f->dump_unsigned("epoch_sent", epoch_sent);
3495 {
3496 f->open_object_section("info");
3497 info.dump(f);
3498 f->close_section();
3499 }
3500 }
3501
3502 void pg_notify_t::generate_test_instances(list<pg_notify_t*>& o)
3503 {
3504 o.push_back(new pg_notify_t(shard_id_t(3), shard_id_t::NO_SHARD, 1, 1, pg_info_t()));
3505 o.push_back(new pg_notify_t(shard_id_t(0), shard_id_t(0), 3, 10, pg_info_t()));
3506 }
3507
3508 ostream &operator<<(ostream &lhs, const pg_notify_t &notify)
3509 {
3510 lhs << "(query:" << notify.query_epoch
3511 << " sent:" << notify.epoch_sent
3512 << " " << notify.info;
3513 if (notify.from != shard_id_t::NO_SHARD ||
3514 notify.to != shard_id_t::NO_SHARD)
3515 lhs << " " << (unsigned)notify.from
3516 << "->" << (unsigned)notify.to;
3517 return lhs << ")";
3518 }
3519
3520 // -- pg_interval_t --
3521
3522 void PastIntervals::pg_interval_t::encode(bufferlist& bl) const
3523 {
3524 ENCODE_START(4, 2, bl);
3525 encode(first, bl);
3526 encode(last, bl);
3527 encode(up, bl);
3528 encode(acting, bl);
3529 encode(maybe_went_rw, bl);
3530 encode(primary, bl);
3531 encode(up_primary, bl);
3532 ENCODE_FINISH(bl);
3533 }
3534
3535 void PastIntervals::pg_interval_t::decode(bufferlist::const_iterator& bl)
3536 {
3537 DECODE_START_LEGACY_COMPAT_LEN(4, 2, 2, bl);
3538 decode(first, bl);
3539 decode(last, bl);
3540 decode(up, bl);
3541 decode(acting, bl);
3542 decode(maybe_went_rw, bl);
3543 if (struct_v >= 3) {
3544 decode(primary, bl);
3545 } else {
3546 if (acting.size())
3547 primary = acting[0];
3548 }
3549 if (struct_v >= 4) {
3550 decode(up_primary, bl);
3551 } else {
3552 if (up.size())
3553 up_primary = up[0];
3554 }
3555 DECODE_FINISH(bl);
3556 }
3557
3558 void PastIntervals::pg_interval_t::dump(Formatter *f) const
3559 {
3560 f->dump_unsigned("first", first);
3561 f->dump_unsigned("last", last);
3562 f->dump_int("maybe_went_rw", maybe_went_rw ? 1 : 0);
3563 f->open_array_section("up");
3564 for (vector<int>::const_iterator p = up.begin(); p != up.end(); ++p)
3565 f->dump_int("osd", *p);
3566 f->close_section();
3567 f->open_array_section("acting");
3568 for (vector<int>::const_iterator p = acting.begin(); p != acting.end(); ++p)
3569 f->dump_int("osd", *p);
3570 f->close_section();
3571 f->dump_int("primary", primary);
3572 f->dump_int("up_primary", up_primary);
3573 }
3574
3575 void PastIntervals::pg_interval_t::generate_test_instances(list<pg_interval_t*>& o)
3576 {
3577 o.push_back(new pg_interval_t);
3578 o.push_back(new pg_interval_t);
3579 o.back()->up.push_back(1);
3580 o.back()->acting.push_back(2);
3581 o.back()->acting.push_back(3);
3582 o.back()->first = 4;
3583 o.back()->last = 5;
3584 o.back()->maybe_went_rw = true;
3585 }
3586
3587 WRITE_CLASS_ENCODER(PastIntervals::pg_interval_t)
3588
3589
3590 /**
3591 * pi_compact_rep
3592 *
3593 * PastIntervals only needs to be able to answer two questions:
3594 * 1) Where should the primary look for unfound objects?
3595 * 2) List a set of subsets of the OSDs such that contacting at least
3596 * one from each subset guarantees we speak to at least one witness
3597 * of any completed write.
3598 *
3599 * Crucially, 2) does not require keeping *all* past intervals. Certainly,
3600 * we don't need to keep any where maybe_went_rw would be false. We also
3601 * needn't keep two intervals where the acting set in one is a subset
3602 * of the other (only need to keep the smaller of the two sets). In order
3603 * to accurately trim the set of intervals as last_epoch_started changes
3604 * without rebuilding the set from scratch, we'll retain the larger set
3605 * if it is in an older interval.
3606 */
3607 struct compact_interval_t {
3608 epoch_t first;
3609 epoch_t last;
3610 set<pg_shard_t> acting;
3611 bool supersedes(const compact_interval_t &other) {
3612 for (auto &&i: acting) {
3613 if (!other.acting.count(i))
3614 return false;
3615 }
3616 return true;
3617 }
3618 void dump(Formatter *f) const {
3619 f->open_object_section("compact_interval_t");
3620 f->dump_stream("first") << first;
3621 f->dump_stream("last") << last;
3622 f->dump_stream("acting") << acting;
3623 f->close_section();
3624 }
3625 void encode(bufferlist &bl) const {
3626 ENCODE_START(1, 1, bl);
3627 encode(first, bl);
3628 encode(last, bl);
3629 encode(acting, bl);
3630 ENCODE_FINISH(bl);
3631 }
3632 void decode(bufferlist::const_iterator &bl) {
3633 DECODE_START(1, bl);
3634 decode(first, bl);
3635 decode(last, bl);
3636 decode(acting, bl);
3637 DECODE_FINISH(bl);
3638 }
3639 static void generate_test_instances(list<compact_interval_t*> & o) {
3640 /* Not going to be used, we'll generate pi_compact_rep directly */
3641 }
3642 };
3643 ostream &operator<<(ostream &o, const compact_interval_t &rhs)
3644 {
3645 return o << "([" << rhs.first << "," << rhs.last
3646 << "] acting " << rhs.acting << ")";
3647 }
3648 WRITE_CLASS_ENCODER(compact_interval_t)
3649
3650 class pi_compact_rep : public PastIntervals::interval_rep {
3651 epoch_t first = 0;
3652 epoch_t last = 0; // inclusive
3653 set<pg_shard_t> all_participants;
3654 list<compact_interval_t> intervals;
3655 pi_compact_rep(
3656 bool ec_pool,
3657 std::list<PastIntervals::pg_interval_t> &&intervals) {
3658 for (auto &&i: intervals)
3659 add_interval(ec_pool, i);
3660 }
3661 public:
3662 pi_compact_rep() = default;
3663 pi_compact_rep(const pi_compact_rep &) = default;
3664 pi_compact_rep(pi_compact_rep &&) = default;
3665 pi_compact_rep &operator=(const pi_compact_rep &) = default;
3666 pi_compact_rep &operator=(pi_compact_rep &&) = default;
3667
3668 size_t size() const override { return intervals.size(); }
3669 bool empty() const override {
3670 return first > last || (first == 0 && last == 0);
3671 }
3672 void clear() override {
3673 *this = pi_compact_rep();
3674 }
3675 pair<epoch_t, epoch_t> get_bounds() const override {
3676 return make_pair(first, last + 1);
3677 }
3678 void adjust_start_backwards(epoch_t last_epoch_clean) {
3679 first = last_epoch_clean;
3680 }
3681
3682 set<pg_shard_t> get_all_participants(
3683 bool ec_pool) const override {
3684 return all_participants;
3685 }
3686 void add_interval(
3687 bool ec_pool, const PastIntervals::pg_interval_t &interval) override {
3688 if (first == 0)
3689 first = interval.first;
3690 ceph_assert(interval.last > last);
3691 last = interval.last;
3692 set<pg_shard_t> acting;
3693 for (unsigned i = 0; i < interval.acting.size(); ++i) {
3694 if (interval.acting[i] == CRUSH_ITEM_NONE)
3695 continue;
3696 acting.insert(
3697 pg_shard_t(
3698 interval.acting[i],
3699 ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
3700 }
3701 all_participants.insert(acting.begin(), acting.end());
3702 if (!interval.maybe_went_rw)
3703 return;
3704 intervals.push_back(
3705 compact_interval_t{interval.first, interval.last, acting});
3706 auto plast = intervals.end();
3707 --plast;
3708 for (auto cur = intervals.begin(); cur != plast; ) {
3709 if (plast->supersedes(*cur)) {
3710 intervals.erase(cur++);
3711 } else {
3712 ++cur;
3713 }
3714 }
3715 }
3716 unique_ptr<PastIntervals::interval_rep> clone() const override {
3717 return unique_ptr<PastIntervals::interval_rep>(new pi_compact_rep(*this));
3718 }
3719 ostream &print(ostream &out) const override {
3720 return out << "([" << first << "," << last
3721 << "] intervals=" << intervals << ")";
3722 }
3723 void encode(bufferlist &bl) const override {
3724 ENCODE_START(1, 1, bl);
3725 encode(first, bl);
3726 encode(last, bl);
3727 encode(all_participants, bl);
3728 encode(intervals, bl);
3729 ENCODE_FINISH(bl);
3730 }
3731 void decode(bufferlist::const_iterator &bl) override {
3732 DECODE_START(1, bl);
3733 decode(first, bl);
3734 decode(last, bl);
3735 decode(all_participants, bl);
3736 decode(intervals, bl);
3737 DECODE_FINISH(bl);
3738 }
3739 void dump(Formatter *f) const override {
3740 f->open_object_section("PastIntervals::compact_rep");
3741 f->dump_stream("first") << first;
3742 f->dump_stream("last") << last;
3743 f->open_array_section("all_participants");
3744 for (auto& i : all_participants) {
3745 f->dump_object("pg_shard", i);
3746 }
3747 f->close_section();
3748 f->open_array_section("intervals");
3749 for (auto &&i: intervals) {
3750 i.dump(f);
3751 }
3752 f->close_section();
3753 f->close_section();
3754 }
3755 static void generate_test_instances(list<pi_compact_rep*> &o) {
3756 using ival = PastIntervals::pg_interval_t;
3757 using ivallst = std::list<ival>;
3758 o.push_back(
3759 new pi_compact_rep(
3760 true, ivallst
3761 { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
3762 , ival{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1}
3763 , ival{{ 2}, { 2}, 31, 35, false, 2, 2}
3764 , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
3765 }));
3766 o.push_back(
3767 new pi_compact_rep(
3768 false, ivallst
3769 { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
3770 , ival{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1}
3771 , ival{{ 2}, { 2}, 31, 35, false, 2, 2}
3772 , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
3773 }));
3774 o.push_back(
3775 new pi_compact_rep(
3776 true, ivallst
3777 { ival{{2, 1, 0}, {2, 1, 0}, 10, 20, true, 1, 1}
3778 , ival{{ 0, 2}, { 0, 2}, 21, 30, true, 0, 0}
3779 , ival{{ 0, 2}, {2, 0}, 31, 35, true, 2, 2}
3780 , ival{{ 0, 2}, { 0, 2}, 36, 50, true, 0, 0}
3781 }));
3782 }
3783 void iterate_mayberw_back_to(
3784 epoch_t les,
3785 std::function<void(epoch_t, const set<pg_shard_t> &)> &&f) const override {
3786 for (auto i = intervals.rbegin(); i != intervals.rend(); ++i) {
3787 if (i->last < les)
3788 break;
3789 f(i->first, i->acting);
3790 }
3791 }
3792 virtual ~pi_compact_rep() override {}
3793 };
3794 WRITE_CLASS_ENCODER(pi_compact_rep)
3795
3796 PastIntervals::PastIntervals()
3797 {
3798 past_intervals.reset(new pi_compact_rep);
3799 }
3800
3801 PastIntervals::PastIntervals(const PastIntervals &rhs)
3802 : past_intervals(rhs.past_intervals ?
3803 rhs.past_intervals->clone() :
3804 nullptr) {}
3805
3806 PastIntervals &PastIntervals::operator=(const PastIntervals &rhs)
3807 {
3808 PastIntervals other(rhs);
3809 swap(other);
3810 return *this;
3811 }
3812
3813 ostream& operator<<(ostream& out, const PastIntervals &i)
3814 {
3815 if (i.past_intervals) {
3816 return i.past_intervals->print(out);
3817 } else {
3818 return out << "(empty)";
3819 }
3820 }
3821
3822 ostream& operator<<(ostream& out, const PastIntervals::PriorSet &i)
3823 {
3824 return out << "PriorSet("
3825 << "ec_pool: " << i.ec_pool
3826 << ", probe: " << i.probe
3827 << ", down: " << i.down
3828 << ", blocked_by: " << i.blocked_by
3829 << ", pg_down: " << i.pg_down
3830 << ")";
3831 }
3832
3833 void PastIntervals::decode(bufferlist::const_iterator &bl)
3834 {
3835 DECODE_START(1, bl);
3836 __u8 type = 0;
3837 decode(type, bl);
3838 switch (type) {
3839 case 0:
3840 break;
3841 case 1:
3842 ceph_abort_msg("pi_simple_rep support removed post-luminous");
3843 break;
3844 case 2:
3845 past_intervals.reset(new pi_compact_rep);
3846 past_intervals->decode(bl);
3847 break;
3848 }
3849 DECODE_FINISH(bl);
3850 }
3851
3852 void PastIntervals::generate_test_instances(list<PastIntervals*> &o)
3853 {
3854 {
3855 list<pi_compact_rep *> compact;
3856 pi_compact_rep::generate_test_instances(compact);
3857 for (auto &&i: compact) {
3858 // takes ownership of contents
3859 o.push_back(new PastIntervals(i));
3860 }
3861 }
3862 return;
3863 }
3864
3865 bool PastIntervals::is_new_interval(
3866 int old_acting_primary,
3867 int new_acting_primary,
3868 const vector<int> &old_acting,
3869 const vector<int> &new_acting,
3870 int old_up_primary,
3871 int new_up_primary,
3872 const vector<int> &old_up,
3873 const vector<int> &new_up,
3874 int old_size,
3875 int new_size,
3876 int old_min_size,
3877 int new_min_size,
3878 unsigned old_pg_num,
3879 unsigned new_pg_num,
3880 unsigned old_pg_num_pending,
3881 unsigned new_pg_num_pending,
3882 bool old_sort_bitwise,
3883 bool new_sort_bitwise,
3884 bool old_recovery_deletes,
3885 bool new_recovery_deletes,
3886 pg_t pgid) {
3887 return old_acting_primary != new_acting_primary ||
3888 new_acting != old_acting ||
3889 old_up_primary != new_up_primary ||
3890 new_up != old_up ||
3891 old_min_size != new_min_size ||
3892 old_size != new_size ||
3893 pgid.is_split(old_pg_num, new_pg_num, 0) ||
3894 // (is or was) pre-merge source
3895 pgid.is_merge_source(old_pg_num_pending, new_pg_num_pending, 0) ||
3896 pgid.is_merge_source(new_pg_num_pending, old_pg_num_pending, 0) ||
3897 // merge source
3898 pgid.is_merge_source(old_pg_num, new_pg_num, 0) ||
3899 // (is or was) pre-merge target
3900 pgid.is_merge_target(old_pg_num_pending, new_pg_num_pending) ||
3901 pgid.is_merge_target(new_pg_num_pending, old_pg_num_pending) ||
3902 // merge target
3903 pgid.is_merge_target(old_pg_num, new_pg_num) ||
3904 old_sort_bitwise != new_sort_bitwise ||
3905 old_recovery_deletes != new_recovery_deletes;
3906 }
3907
3908 bool PastIntervals::is_new_interval(
3909 int old_acting_primary,
3910 int new_acting_primary,
3911 const vector<int> &old_acting,
3912 const vector<int> &new_acting,
3913 int old_up_primary,
3914 int new_up_primary,
3915 const vector<int> &old_up,
3916 const vector<int> &new_up,
3917 OSDMapRef osdmap,
3918 OSDMapRef lastmap,
3919 pg_t pgid)
3920 {
3921 const pg_pool_t *plast = lastmap->get_pg_pool(pgid.pool());
3922 if (!plast) {
3923 return false; // after pool is deleted there are no more interval changes
3924 }
3925 const pg_pool_t *pi = osdmap->get_pg_pool(pgid.pool());
3926 if (!pi) {
3927 return true; // pool was deleted this epoch -> (final!) interval change
3928 }
3929 return
3930 is_new_interval(old_acting_primary,
3931 new_acting_primary,
3932 old_acting,
3933 new_acting,
3934 old_up_primary,
3935 new_up_primary,
3936 old_up,
3937 new_up,
3938 plast->size,
3939 pi->size,
3940 plast->min_size,
3941 pi->min_size,
3942 plast->get_pg_num(),
3943 pi->get_pg_num(),
3944 plast->get_pg_num_pending(),
3945 pi->get_pg_num_pending(),
3946 lastmap->test_flag(CEPH_OSDMAP_SORTBITWISE),
3947 osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE),
3948 lastmap->test_flag(CEPH_OSDMAP_RECOVERY_DELETES),
3949 osdmap->test_flag(CEPH_OSDMAP_RECOVERY_DELETES),
3950 pgid);
3951 }
3952
3953 bool PastIntervals::check_new_interval(
3954 int old_acting_primary,
3955 int new_acting_primary,
3956 const vector<int> &old_acting,
3957 const vector<int> &new_acting,
3958 int old_up_primary,
3959 int new_up_primary,
3960 const vector<int> &old_up,
3961 const vector<int> &new_up,
3962 epoch_t same_interval_since,
3963 epoch_t last_epoch_clean,
3964 OSDMapRef osdmap,
3965 OSDMapRef lastmap,
3966 pg_t pgid,
3967 IsPGRecoverablePredicate *could_have_gone_active,
3968 PastIntervals *past_intervals,
3969 std::ostream *out)
3970 {
3971 /*
3972 * We have to be careful to gracefully deal with situations like
3973 * so. Say we have a power outage or something that takes out both
3974 * OSDs, but the monitor doesn't mark them down in the same epoch.
3975 * The history may look like
3976 *
3977 * 1: A B
3978 * 2: B
3979 * 3: let's say B dies for good, too (say, from the power spike)
3980 * 4: A
3981 *
3982 * which makes it look like B may have applied updates to the PG
3983 * that we need in order to proceed. This sucks...
3984 *
3985 * To minimize the risk of this happening, we CANNOT go active if
3986 * _any_ OSDs in the prior set are down until we send an MOSDAlive
3987 * to the monitor such that the OSDMap sets osd_up_thru to an epoch.
3988 * Then, we have something like
3989 *
3990 * 1: A B
3991 * 2: B up_thru[B]=0
3992 * 3:
3993 * 4: A
3994 *
3995 * -> we can ignore B, bc it couldn't have gone active (up_thru still 0).
3996 *
3997 * or,
3998 *
3999 * 1: A B
4000 * 2: B up_thru[B]=0
4001 * 3: B up_thru[B]=2
4002 * 4:
4003 * 5: A
4004 *
4005 * -> we must wait for B, bc it was alive through 2, and could have
4006 * written to the pg.
4007 *
4008 * If B is really dead, then an administrator will need to manually
4009 * intervene by marking the OSD as "lost."
4010 */
4011
4012 // remember past interval
4013 // NOTE: a change in the up set primary triggers an interval
4014 // change, even though the interval members in the pg_interval_t
4015 // do not change.
4016 ceph_assert(past_intervals);
4017 ceph_assert(past_intervals->past_intervals);
4018 if (is_new_interval(
4019 old_acting_primary,
4020 new_acting_primary,
4021 old_acting,
4022 new_acting,
4023 old_up_primary,
4024 new_up_primary,
4025 old_up,
4026 new_up,
4027 osdmap,
4028 lastmap,
4029 pgid)) {
4030 pg_interval_t i;
4031 i.first = same_interval_since;
4032 i.last = osdmap->get_epoch() - 1;
4033 ceph_assert(i.first <= i.last);
4034 i.acting = old_acting;
4035 i.up = old_up;
4036 i.primary = old_acting_primary;
4037 i.up_primary = old_up_primary;
4038
4039 unsigned num_acting = 0;
4040 for (vector<int>::const_iterator p = i.acting.begin(); p != i.acting.end();
4041 ++p)
4042 if (*p != CRUSH_ITEM_NONE)
4043 ++num_acting;
4044
4045 ceph_assert(lastmap->get_pools().count(pgid.pool()));
4046 const pg_pool_t& old_pg_pool = lastmap->get_pools().find(pgid.pool())->second;
4047 set<pg_shard_t> old_acting_shards;
4048 old_pg_pool.convert_to_pg_shards(old_acting, &old_acting_shards);
4049
4050 if (num_acting &&
4051 i.primary != -1 &&
4052 num_acting >= old_pg_pool.min_size &&
4053 (*could_have_gone_active)(old_acting_shards)) {
4054 if (out)
4055 *out << __func__ << " " << i
4056 << " up_thru " << lastmap->get_up_thru(i.primary)
4057 << " up_from " << lastmap->get_up_from(i.primary)
4058 << " last_epoch_clean " << last_epoch_clean;
4059 if (lastmap->get_up_thru(i.primary) >= i.first &&
4060 lastmap->get_up_from(i.primary) <= i.first) {
4061 i.maybe_went_rw = true;
4062 if (out)
4063 *out << " " << i
4064 << " : primary up " << lastmap->get_up_from(i.primary)
4065 << "-" << lastmap->get_up_thru(i.primary)
4066 << " includes interval"
4067 << std::endl;
4068 } else if (last_epoch_clean >= i.first &&
4069 last_epoch_clean <= i.last) {
4070 // If the last_epoch_clean is included in this interval, then
4071 // the pg must have been rw (for recovery to have completed).
4072 // This is important because we won't know the _real_
4073 // first_epoch because we stop at last_epoch_clean, and we
4074 // don't want the oldest interval to randomly have
4075 // maybe_went_rw false depending on the relative up_thru vs
4076 // last_epoch_clean timing.
4077 i.maybe_went_rw = true;
4078 if (out)
4079 *out << " " << i
4080 << " : includes last_epoch_clean " << last_epoch_clean
4081 << " and presumed to have been rw"
4082 << std::endl;
4083 } else {
4084 i.maybe_went_rw = false;
4085 if (out)
4086 *out << " " << i
4087 << " : primary up " << lastmap->get_up_from(i.primary)
4088 << "-" << lastmap->get_up_thru(i.primary)
4089 << " does not include interval"
4090 << std::endl;
4091 }
4092 } else {
4093 i.maybe_went_rw = false;
4094 if (out)
4095 *out << __func__ << " " << i << " : acting set is too small" << std::endl;
4096 }
4097 past_intervals->past_intervals->add_interval(old_pg_pool.is_erasure(), i);
4098 return true;
4099 } else {
4100 return false;
4101 }
4102 }
4103
4104
4105 // true if the given map affects the prior set
4106 bool PastIntervals::PriorSet::affected_by_map(
4107 const OSDMap &osdmap,
4108 const DoutPrefixProvider *dpp) const
4109 {
4110 for (set<pg_shard_t>::iterator p = probe.begin();
4111 p != probe.end();
4112 ++p) {
4113 int o = p->osd;
4114
4115 // did someone in the prior set go down?
4116 if (osdmap.is_down(o) && down.count(o) == 0) {
4117 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " now down" << dendl;
4118 return true;
4119 }
4120
4121 // did a down osd in cur get (re)marked as lost?
4122 map<int, epoch_t>::const_iterator r = blocked_by.find(o);
4123 if (r != blocked_by.end()) {
4124 if (!osdmap.exists(o)) {
4125 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " no longer exists" << dendl;
4126 return true;
4127 }
4128 if (osdmap.get_info(o).lost_at != r->second) {
4129 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " (re)marked as lost" << dendl;
4130 return true;
4131 }
4132 }
4133 }
4134
4135 // did someone in the prior down set go up?
4136 for (set<int>::const_iterator p = down.begin();
4137 p != down.end();
4138 ++p) {
4139 int o = *p;
4140
4141 if (osdmap.is_up(o)) {
4142 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " now up" << dendl;
4143 return true;
4144 }
4145
4146 // did someone in the prior set get lost or destroyed?
4147 if (!osdmap.exists(o)) {
4148 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " no longer exists" << dendl;
4149 return true;
4150 }
4151 // did a down osd in down get (re)marked as lost?
4152 map<int, epoch_t>::const_iterator r = blocked_by.find(o);
4153 if (r != blocked_by.end()) {
4154 if (osdmap.get_info(o).lost_at != r->second) {
4155 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " (re)marked as lost" << dendl;
4156 return true;
4157 }
4158 }
4159 }
4160
4161 return false;
4162 }
4163
4164 ostream& operator<<(ostream& out, const PastIntervals::pg_interval_t& i)
4165 {
4166 out << "interval(" << i.first << "-" << i.last
4167 << " up " << i.up << "(" << i.up_primary << ")"
4168 << " acting " << i.acting << "(" << i.primary << ")";
4169 if (i.maybe_went_rw)
4170 out << " maybe_went_rw";
4171 out << ")";
4172 return out;
4173 }
4174
4175
4176
4177 // -- pg_query_t --
4178
// Serialize this query into bl. Fields are written in the same order that
// pg_query_t::decode() reads them; the `features` argument is not consulted.
void pg_query_t::encode(bufferlist &bl, uint64_t features) const {
  ENCODE_START(3, 3, bl);
  encode(type, bl);
  encode(since, bl);
  history.encode(bl);
  encode(epoch_sent, bl);
  encode(to, bl);
  encode(from, bl);
  ENCODE_FINISH(bl);
}
4189
// Populate this query from bl, reading fields in the order written by
// pg_query_t::encode().
void pg_query_t::decode(bufferlist::const_iterator &bl) {
  DECODE_START(3, bl);
  decode(type, bl);
  decode(since, bl);
  history.decode(bl);
  decode(epoch_sent, bl);
  decode(to, bl);
  decode(from, bl);
  DECODE_FINISH(bl);
}
4200
// Emit the query's fields to the formatter; the history is nested in its
// own "history" object section.
void pg_query_t::dump(Formatter *f) const
{
  f->dump_int("from", from);
  f->dump_int("to", to);
  f->dump_string("type", get_type_name());
  f->dump_stream("since") << since;
  f->dump_stream("epoch_sent") << epoch_sent;
  f->open_object_section("history");
  history.dump(f);
  f->close_section();
}
4212 void pg_query_t::generate_test_instances(list<pg_query_t*>& o)
4213 {
4214 o.push_back(new pg_query_t());
4215 list<pg_history_t*> h;
4216 pg_history_t::generate_test_instances(h);
4217 o.push_back(new pg_query_t(pg_query_t::INFO, shard_id_t(1), shard_id_t(2), *h.back(), 4));
4218 o.push_back(new pg_query_t(pg_query_t::MISSING, shard_id_t(2), shard_id_t(3), *h.back(), 4));
4219 o.push_back(new pg_query_t(pg_query_t::LOG, shard_id_t(0), shard_id_t(0),
4220 eversion_t(4, 5), *h.back(), 4));
4221 o.push_back(new pg_query_t(pg_query_t::FULLLOG,
4222 shard_id_t::NO_SHARD, shard_id_t::NO_SHARD,
4223 *h.back(), 5));
4224 }
4225
4226 // -- ObjectModDesc --
4227 void ObjectModDesc::visit(Visitor *visitor) const
4228 {
4229 auto bp = bl.cbegin();
4230 try {
4231 while (!bp.end()) {
4232 DECODE_START(max_required_version, bp);
4233 uint8_t code;
4234 decode(code, bp);
4235 switch (code) {
4236 case APPEND: {
4237 uint64_t size;
4238 decode(size, bp);
4239 visitor->append(size);
4240 break;
4241 }
4242 case SETATTRS: {
4243 map<string, boost::optional<bufferlist> > attrs;
4244 decode(attrs, bp);
4245 visitor->setattrs(attrs);
4246 break;
4247 }
4248 case DELETE: {
4249 version_t old_version;
4250 decode(old_version, bp);
4251 visitor->rmobject(old_version);
4252 break;
4253 }
4254 case CREATE: {
4255 visitor->create();
4256 break;
4257 }
4258 case UPDATE_SNAPS: {
4259 set<snapid_t> snaps;
4260 decode(snaps, bp);
4261 visitor->update_snaps(snaps);
4262 break;
4263 }
4264 case TRY_DELETE: {
4265 version_t old_version;
4266 decode(old_version, bp);
4267 visitor->try_rmobject(old_version);
4268 break;
4269 }
4270 case ROLLBACK_EXTENTS: {
4271 vector<pair<uint64_t, uint64_t> > extents;
4272 version_t gen;
4273 decode(gen, bp);
4274 decode(extents, bp);
4275 visitor->rollback_extents(gen,extents);
4276 break;
4277 }
4278 default:
4279 ceph_abort_msg("Invalid rollback code");
4280 }
4281 DECODE_FINISH(bp);
4282 }
4283 } catch (...) {
4284 ceph_abort_msg("Invalid encoding");
4285 }
4286 }
4287
4288 struct DumpVisitor : public ObjectModDesc::Visitor {
4289 Formatter *f;
4290 explicit DumpVisitor(Formatter *f) : f(f) {}
4291 void append(uint64_t old_size) override {
4292 f->open_object_section("op");
4293 f->dump_string("code", "APPEND");
4294 f->dump_unsigned("old_size", old_size);
4295 f->close_section();
4296 }
4297 void setattrs(map<string, boost::optional<bufferlist> > &attrs) override {
4298 f->open_object_section("op");
4299 f->dump_string("code", "SETATTRS");
4300 f->open_array_section("attrs");
4301 for (map<string, boost::optional<bufferlist> >::iterator i = attrs.begin();
4302 i != attrs.end();
4303 ++i) {
4304 f->dump_string("attr_name", i->first);
4305 }
4306 f->close_section();
4307 f->close_section();
4308 }
4309 void rmobject(version_t old_version) override {
4310 f->open_object_section("op");
4311 f->dump_string("code", "RMOBJECT");
4312 f->dump_unsigned("old_version", old_version);
4313 f->close_section();
4314 }
4315 void try_rmobject(version_t old_version) override {
4316 f->open_object_section("op");
4317 f->dump_string("code", "TRY_RMOBJECT");
4318 f->dump_unsigned("old_version", old_version);
4319 f->close_section();
4320 }
4321 void create() override {
4322 f->open_object_section("op");
4323 f->dump_string("code", "CREATE");
4324 f->close_section();
4325 }
4326 void update_snaps(const set<snapid_t> &snaps) override {
4327 f->open_object_section("op");
4328 f->dump_string("code", "UPDATE_SNAPS");
4329 f->dump_stream("snaps") << snaps;
4330 f->close_section();
4331 }
4332 void rollback_extents(
4333 version_t gen,
4334 const vector<pair<uint64_t, uint64_t> > &extents) override {
4335 f->open_object_section("op");
4336 f->dump_string("code", "ROLLBACK_EXTENTS");
4337 f->dump_unsigned("gen", gen);
4338 f->dump_stream("snaps") << extents;
4339 f->close_section();
4340 }
4341 };
4342
4343 void ObjectModDesc::dump(Formatter *f) const
4344 {
4345 f->open_object_section("object_mod_desc");
4346 f->dump_bool("can_local_rollback", can_local_rollback);
4347 f->dump_bool("rollback_info_completed", rollback_info_completed);
4348 {
4349 f->open_array_section("ops");
4350 DumpVisitor vis(f);
4351 visit(&vis);
4352 f->close_section();
4353 }
4354 f->close_section();
4355 }
4356
4357 void ObjectModDesc::generate_test_instances(list<ObjectModDesc*>& o)
4358 {
4359 map<string, boost::optional<bufferlist> > attrs;
4360 attrs[OI_ATTR];
4361 attrs[SS_ATTR];
4362 attrs["asdf"];
4363 o.push_back(new ObjectModDesc());
4364 o.back()->append(100);
4365 o.back()->setattrs(attrs);
4366 o.push_back(new ObjectModDesc());
4367 o.back()->rmobject(1001);
4368 o.push_back(new ObjectModDesc());
4369 o.back()->create();
4370 o.back()->setattrs(attrs);
4371 o.push_back(new ObjectModDesc());
4372 o.back()->create();
4373 o.back()->setattrs(attrs);
4374 o.back()->mark_unrollbackable();
4375 o.back()->append(1000);
4376 }
4377
// Serialize the descriptor into _bl: the two rollback flags followed by the
// already-encoded op buffer (bl). Both the struct version and the compat
// version are taken from max_required_version.
void ObjectModDesc::encode(bufferlist &_bl) const
{
  ENCODE_START(max_required_version, max_required_version, _bl);
  encode(can_local_rollback, _bl);
  encode(rollback_info_completed, _bl);
  encode(bl, _bl);
  ENCODE_FINISH(_bl);
}
// Populate the descriptor from _bl. The decoded struct version is remembered
// in max_required_version, which is what encode() later uses as its version.
void ObjectModDesc::decode(bufferlist::const_iterator &_bl)
{
  DECODE_START(2, _bl);
  max_required_version = struct_v;
  decode(can_local_rollback, _bl);
  decode(rollback_info_completed, _bl);
  decode(bl, _bl);
  // ensure bl does not pin a larger buffer in memory
  bl.rebuild();
  // account the op buffer against the pglog mempool
  bl.reassign_to_mempool(mempool::mempool_osd_pglog);
  DECODE_FINISH(_bl);
}
4398
4399 // -- pg_log_entry_t --
4400
// The key for a log entry is the key name of its version.
string pg_log_entry_t::get_key_name() const
{
  return version.get_key_name();
}
4405
4406 void pg_log_entry_t::encode_with_checksum(bufferlist& bl) const
4407 {
4408 using ceph::encode;
4409 bufferlist ebl(sizeof(*this)*2);
4410 this->encode(ebl);
4411 __u32 crc = ebl.crc32c(0);
4412 encode(ebl, bl);
4413 encode(crc, bl);
4414 }
4415
4416 void pg_log_entry_t::decode_with_checksum(bufferlist::const_iterator& p)
4417 {
4418 using ceph::decode;
4419 bufferlist bl;
4420 decode(bl, p);
4421 __u32 crc;
4422 decode(crc, p);
4423 if (crc != bl.crc32c(0))
4424 throw buffer::malformed_input("bad checksum on pg_log_entry_t");
4425 auto q = bl.cbegin();
4426 this->decode(q);
4427 }
4428
4429 void pg_log_entry_t::encode(bufferlist &bl) const
4430 {
4431 ENCODE_START(12, 4, bl);
4432 encode(op, bl);
4433 encode(soid, bl);
4434 encode(version, bl);
4435
4436 /**
4437 * Added with reverting_to:
4438 * Previous code used prior_version to encode
4439 * what we now call reverting_to. This will
4440 * allow older code to decode reverting_to
4441 * into prior_version as expected.
4442 */
4443 if (op == LOST_REVERT)
4444 encode(reverting_to, bl);
4445 else
4446 encode(prior_version, bl);
4447
4448 encode(reqid, bl);
4449 encode(mtime, bl);
4450 if (op == LOST_REVERT)
4451 encode(prior_version, bl);
4452 encode(snaps, bl);
4453 encode(user_version, bl);
4454 encode(mod_desc, bl);
4455 encode(extra_reqids, bl);
4456 if (op == ERROR)
4457 encode(return_code, bl);
4458 if (!extra_reqids.empty())
4459 encode(extra_reqid_return_codes, bl);
4460 ENCODE_FINISH(bl);
4461 }
4462
4463 void pg_log_entry_t::decode(bufferlist::const_iterator &bl)
4464 {
4465 DECODE_START_LEGACY_COMPAT_LEN(12, 4, 4, bl);
4466 decode(op, bl);
4467 if (struct_v < 2) {
4468 sobject_t old_soid;
4469 decode(old_soid, bl);
4470 soid.oid = old_soid.oid;
4471 soid.snap = old_soid.snap;
4472 invalid_hash = true;
4473 } else {
4474 decode(soid, bl);
4475 }
4476 if (struct_v < 3)
4477 invalid_hash = true;
4478 decode(version, bl);
4479
4480 if (struct_v >= 6 && op == LOST_REVERT)
4481 decode(reverting_to, bl);
4482 else
4483 decode(prior_version, bl);
4484
4485 decode(reqid, bl);
4486
4487 decode(mtime, bl);
4488 if (struct_v < 5)
4489 invalid_pool = true;
4490
4491 if (op == LOST_REVERT) {
4492 if (struct_v >= 6) {
4493 decode(prior_version, bl);
4494 } else {
4495 reverting_to = prior_version;
4496 }
4497 }
4498 if (struct_v >= 7 || // for v >= 7, this is for all ops.
4499 op == CLONE) { // for v < 7, it's only present for CLONE.
4500 decode(snaps, bl);
4501 // ensure snaps does not pin a larger buffer in memory
4502 snaps.rebuild();
4503 snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
4504 }
4505
4506 if (struct_v >= 8)
4507 decode(user_version, bl);
4508 else
4509 user_version = version.version;
4510
4511 if (struct_v >= 9)
4512 decode(mod_desc, bl);
4513 else
4514 mod_desc.mark_unrollbackable();
4515 if (struct_v >= 10)
4516 decode(extra_reqids, bl);
4517 if (struct_v >= 11 && op == ERROR)
4518 decode(return_code, bl);
4519 if (struct_v >= 12 && !extra_reqids.empty())
4520 decode(extra_reqid_return_codes, bl);
4521 DECODE_FINISH(bl);
4522 }
4523
4524 void pg_log_entry_t::dump(Formatter *f) const
4525 {
4526 f->dump_string("op", get_op_name());
4527 f->dump_stream("object") << soid;
4528 f->dump_stream("version") << version;
4529 f->dump_stream("prior_version") << prior_version;
4530 f->dump_stream("reqid") << reqid;
4531 f->open_array_section("extra_reqids");
4532 uint32_t idx = 0;
4533 for (auto p = extra_reqids.begin();
4534 p != extra_reqids.end();
4535 ++idx, ++p) {
4536 f->open_object_section("extra_reqid");
4537 f->dump_stream("reqid") << p->first;
4538 f->dump_stream("user_version") << p->second;
4539 auto it = extra_reqid_return_codes.find(idx);
4540 if (it != extra_reqid_return_codes.end()) {
4541 f->dump_int("return_code", it->second);
4542 }
4543 f->close_section();
4544 }
4545 f->close_section();
4546 f->dump_stream("mtime") << mtime;
4547 f->dump_int("return_code", return_code);
4548 if (snaps.length() > 0) {
4549 vector<snapid_t> v;
4550 bufferlist c = snaps;
4551 auto p = c.cbegin();
4552 try {
4553 using ceph::decode;
4554 decode(v, p);
4555 } catch (...) {
4556 v.clear();
4557 }
4558 f->open_object_section("snaps");
4559 for (vector<snapid_t>::iterator p = v.begin(); p != v.end(); ++p)
4560 f->dump_unsigned("snap", *p);
4561 f->close_section();
4562 }
4563 {
4564 f->open_object_section("mod_desc");
4565 mod_desc.dump(f);
4566 f->close_section();
4567 }
4568 }
4569
4570 void pg_log_entry_t::generate_test_instances(list<pg_log_entry_t*>& o)
4571 {
4572 o.push_back(new pg_log_entry_t());
4573 hobject_t oid(object_t("objname"), "key", 123, 456, 0, "");
4574 o.push_back(new pg_log_entry_t(MODIFY, oid, eversion_t(1,2), eversion_t(3,4),
4575 1, osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
4576 utime_t(8,9), 0));
4577 o.push_back(new pg_log_entry_t(ERROR, oid, eversion_t(1,2), eversion_t(3,4),
4578 1, osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
4579 utime_t(8,9), -ENOENT));
4580 }
4581
4582 ostream& operator<<(ostream& out, const pg_log_entry_t& e)
4583 {
4584 out << e.version << " (" << e.prior_version << ") "
4585 << std::left << std::setw(8) << e.get_op_name() << ' '
4586 << e.soid << " by " << e.reqid << " " << e.mtime
4587 << " " << e.return_code;
4588 if (e.snaps.length()) {
4589 vector<snapid_t> snaps;
4590 bufferlist c = e.snaps;
4591 auto p = c.cbegin();
4592 try {
4593 decode(snaps, p);
4594 } catch (...) {
4595 snaps.clear();
4596 }
4597 out << " snaps " << snaps;
4598 }
4599 return out;
4600 }
4601
4602 // -- pg_log_dup_t --
4603
4604 std::string pg_log_dup_t::get_key_name() const
4605 {
4606 static const char prefix[] = "dup_";
4607 std::string key(36, ' ');
4608 memcpy(&key[0], prefix, 4);
4609 version.get_key_name(&key[4]);
4610 key.resize(35); // remove the null terminator
4611 return key;
4612 }
4613
// Serialize this dup entry into bl; the field order matches
// pg_log_dup_t::decode().
void pg_log_dup_t::encode(bufferlist &bl) const
{
  ENCODE_START(1, 1, bl);
  encode(reqid, bl);
  encode(version, bl);
  encode(user_version, bl);
  encode(return_code, bl);
  ENCODE_FINISH(bl);
}
4623
// Populate this dup entry from bl, reading fields in the order written by
// pg_log_dup_t::encode().
void pg_log_dup_t::decode(bufferlist::const_iterator &bl)
{
  DECODE_START(1, bl);
  decode(reqid, bl);
  decode(version, bl);
  decode(user_version, bl);
  decode(return_code, bl);
  DECODE_FINISH(bl);
}
4633
// Dump all four fields to the formatter; every field, including
// return_code, goes through dump_stream.
void pg_log_dup_t::dump(Formatter *f) const
{
  f->dump_stream("reqid") << reqid;
  f->dump_stream("version") << version;
  f->dump_stream("user_version") << user_version;
  f->dump_stream("return_code") << return_code;
}
4641
4642 void pg_log_dup_t::generate_test_instances(list<pg_log_dup_t*>& o)
4643 {
4644 o.push_back(new pg_log_dup_t());
4645 o.push_back(new pg_log_dup_t(eversion_t(1,2),
4646 1,
4647 osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
4648 0));
4649 o.push_back(new pg_log_dup_t(eversion_t(1,2),
4650 2,
4651 osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
4652 -ENOENT));
4653 }
4654
4655
4656 std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e) {
4657 return out << "log_dup(reqid=" << e.reqid <<
4658 " v=" << e.version << " uv=" << e.user_version <<
4659 " rc=" << e.return_code << ")";
4660 }
4661
4662
4663 // -- pg_log_t --
4664
4665 // out: pg_log_t that only has entries that apply to import_pgid using curmap
4666 // reject: Entries rejected from "in" are in the reject.log. Other fields not set.
4667 void pg_log_t::filter_log(spg_t import_pgid, const OSDMap &curmap,
4668 const string &hit_set_namespace, const pg_log_t &in,
4669 pg_log_t &out, pg_log_t &reject)
4670 {
4671 out = in;
4672 out.log.clear();
4673 reject.log.clear();
4674
4675 for (list<pg_log_entry_t>::const_iterator i = in.log.begin();
4676 i != in.log.end(); ++i) {
4677
4678 // Reject pg log entries for temporary objects
4679 if (i->soid.is_temp()) {
4680 reject.log.push_back(*i);
4681 continue;
4682 }
4683
4684 if (i->soid.nspace != hit_set_namespace) {
4685 object_t oid = i->soid.oid;
4686 object_locator_t loc(i->soid);
4687 pg_t raw_pgid = curmap.object_locator_to_pg(oid, loc);
4688 pg_t pgid = curmap.raw_pg_to_pg(raw_pgid);
4689
4690 if (import_pgid.pgid == pgid) {
4691 out.log.push_back(*i);
4692 } else {
4693 reject.log.push_back(*i);
4694 }
4695 } else {
4696 out.log.push_back(*i);
4697 }
4698 }
4699 }
4700
// Serialize the log into bl (struct version 7, compat 3): head, tail, the
// entries, the two rollback markers, then the dup entries.
void pg_log_t::encode(bufferlist& bl) const
{
  ENCODE_START(7, 3, bl);
  encode(head, bl);
  encode(tail, bl);
  encode(log, bl);
  encode(can_rollback_to, bl);
  encode(rollback_info_trimmed_to, bl);
  encode(dups, bl);
  ENCODE_FINISH(bl);
}
4712
4713 void pg_log_t::decode(bufferlist::const_iterator &bl, int64_t pool)
4714 {
4715 DECODE_START_LEGACY_COMPAT_LEN(7, 3, 3, bl);
4716 decode(head, bl);
4717 decode(tail, bl);
4718 if (struct_v < 2) {
4719 bool backlog;
4720 decode(backlog, bl);
4721 }
4722 decode(log, bl);
4723 if (struct_v >= 5)
4724 decode(can_rollback_to, bl);
4725
4726 if (struct_v >= 6)
4727 decode(rollback_info_trimmed_to, bl);
4728 else
4729 rollback_info_trimmed_to = tail;
4730
4731 if (struct_v >= 7)
4732 decode(dups, bl);
4733
4734 DECODE_FINISH(bl);
4735
4736 // handle hobject_t format change
4737 if (struct_v < 4) {
4738 for (list<pg_log_entry_t>::iterator i = log.begin();
4739 i != log.end();
4740 ++i) {
4741 if (!i->soid.is_max() && i->soid.pool == -1)
4742 i->soid.pool = pool;
4743 }
4744 }
4745 }
4746
4747 void pg_log_t::dump(Formatter *f) const
4748 {
4749 f->dump_stream("head") << head;
4750 f->dump_stream("tail") << tail;
4751 f->open_array_section("log");
4752 for (list<pg_log_entry_t>::const_iterator p = log.begin(); p != log.end(); ++p) {
4753 f->open_object_section("entry");
4754 p->dump(f);
4755 f->close_section();
4756 }
4757 f->close_section();
4758 f->open_array_section("dups");
4759 for (const auto& entry : dups) {
4760 f->open_object_section("entry");
4761 entry.dump(f);
4762 f->close_section();
4763 }
4764 f->close_section();
4765 }
4766
4767 void pg_log_t::generate_test_instances(list<pg_log_t*>& o)
4768 {
4769 o.push_back(new pg_log_t);
4770
4771 // this is nonsensical:
4772 o.push_back(new pg_log_t);
4773 o.back()->head = eversion_t(1,2);
4774 o.back()->tail = eversion_t(3,4);
4775 list<pg_log_entry_t*> e;
4776 pg_log_entry_t::generate_test_instances(e);
4777 for (list<pg_log_entry_t*>::iterator p = e.begin(); p != e.end(); ++p)
4778 o.back()->log.push_back(**p);
4779 }
4780
4781 static void _handle_dups(CephContext* cct, pg_log_t &target, const pg_log_t &other, unsigned maxdups)
4782 {
4783 auto earliest_dup_version =
4784 target.head.version < maxdups ? 0u : target.head.version - maxdups + 1;
4785 lgeneric_subdout(cct, osd, 20) << "copy_up_to/copy_after earliest_dup_version " << earliest_dup_version << dendl;
4786
4787 for (auto d = other.dups.cbegin(); d != other.dups.cend(); ++d) {
4788 if (d->version.version >= earliest_dup_version) {
4789 lgeneric_subdout(cct, osd, 20)
4790 << "copy_up_to/copy_after copy dup version "
4791 << d->version << dendl;
4792 target.dups.push_back(pg_log_dup_t(*d));
4793 }
4794 }
4795
4796 for (auto i = other.log.cbegin(); i != other.log.cend(); ++i) {
4797 ceph_assert(i->version > other.tail);
4798 if (i->version > target.tail)
4799 break;
4800 if (i->version.version >= earliest_dup_version) {
4801 lgeneric_subdout(cct, osd, 20)
4802 << "copy_up_to/copy_after copy dup from log version "
4803 << i->version << dendl;
4804 target.dups.push_back(pg_log_dup_t(*i));
4805 }
4806 }
4807 }
4808
4809
4810 void pg_log_t::copy_after(CephContext* cct, const pg_log_t &other, eversion_t v)
4811 {
4812 can_rollback_to = other.can_rollback_to;
4813 head = other.head;
4814 tail = other.tail;
4815 lgeneric_subdout(cct, osd, 20) << __func__ << " v " << v << dendl;
4816 for (list<pg_log_entry_t>::const_reverse_iterator i = other.log.rbegin();
4817 i != other.log.rend();
4818 ++i) {
4819 ceph_assert(i->version > other.tail);
4820 if (i->version <= v) {
4821 // make tail accurate.
4822 tail = i->version;
4823 break;
4824 }
4825 lgeneric_subdout(cct, osd, 20) << __func__ << " copy log version " << i->version << dendl;
4826 log.push_front(*i);
4827 }
4828 _handle_dups(cct, *this, other, cct->_conf->osd_pg_log_dups_tracked);
4829 }
4830
4831 void pg_log_t::copy_up_to(CephContext* cct, const pg_log_t &other, int max)
4832 {
4833 can_rollback_to = other.can_rollback_to;
4834 int n = 0;
4835 head = other.head;
4836 tail = other.tail;
4837 lgeneric_subdout(cct, osd, 20) << __func__ << " max " << max << dendl;
4838 for (list<pg_log_entry_t>::const_reverse_iterator i = other.log.rbegin();
4839 i != other.log.rend();
4840 ++i) {
4841 ceph_assert(i->version > other.tail);
4842 if (n++ >= max) {
4843 tail = i->version;
4844 break;
4845 }
4846 lgeneric_subdout(cct, osd, 20) << __func__ << " copy log version " << i->version << dendl;
4847 log.push_front(*i);
4848 }
4849 _handle_dups(cct, *this, other, cct->_conf->osd_pg_log_dups_tracked);
4850 }
4851
4852 ostream& pg_log_t::print(ostream& out) const
4853 {
4854 out << *this << std::endl;
4855 for (list<pg_log_entry_t>::const_iterator p = log.begin();
4856 p != log.end();
4857 ++p)
4858 out << *p << std::endl;
4859 for (const auto& entry : dups) {
4860 out << " dup entry: " << entry << std::endl;
4861 }
4862 return out;
4863 }
4864
4865 // -- pg_missing_t --
4866
4867 ostream& operator<<(ostream& out, const pg_missing_item& i)
4868 {
4869 out << i.need;
4870 if (i.have != eversion_t())
4871 out << "(" << i.have << ")";
4872 out << " flags = " << i.flag_str();
4873 return out;
4874 }
4875
4876 // -- object_copy_cursor_t --
4877
// Serialize the copy cursor into bl; the field order matches
// object_copy_cursor_t::decode().
void object_copy_cursor_t::encode(bufferlist& bl) const
{
  ENCODE_START(1, 1, bl);
  encode(attr_complete, bl);
  encode(data_offset, bl);
  encode(data_complete, bl);
  encode(omap_offset, bl);
  encode(omap_complete, bl);
  ENCODE_FINISH(bl);
}
4888
// Populate the copy cursor from bl, reading fields in the order written by
// object_copy_cursor_t::encode().
void object_copy_cursor_t::decode(bufferlist::const_iterator &bl)
{
  DECODE_START(1, bl);
  decode(attr_complete, bl);
  decode(data_offset, bl);
  decode(data_complete, bl);
  decode(omap_offset, bl);
  decode(omap_complete, bl);
  DECODE_FINISH(bl);
}
4899
4900 void object_copy_cursor_t::dump(Formatter *f) const
4901 {
4902 f->dump_unsigned("attr_complete", (int)attr_complete);
4903 f->dump_unsigned("data_offset", data_offset);
4904 f->dump_unsigned("data_complete", (int)data_complete);
4905 f->dump_string("omap_offset", omap_offset);
4906 f->dump_unsigned("omap_complete", (int)omap_complete);
4907 }
4908
4909 void object_copy_cursor_t::generate_test_instances(list<object_copy_cursor_t*>& o)
4910 {
4911 o.push_back(new object_copy_cursor_t);
4912 o.push_back(new object_copy_cursor_t);
4913 o.back()->attr_complete = true;
4914 o.back()->data_offset = 123;
4915 o.push_back(new object_copy_cursor_t);
4916 o.back()->attr_complete = true;
4917 o.back()->data_complete = true;
4918 o.back()->omap_offset = "foo";
4919 o.push_back(new object_copy_cursor_t);
4920 o.back()->attr_complete = true;
4921 o.back()->data_complete = true;
4922 o.back()->omap_complete = true;
4923 }
4924
4925 // -- object_copy_data_t --
4926
// Serialize the copy payload into bl (struct version 8, compat 5). The
// field order is the wire format read by the "current" branch of
// object_copy_data_t::decode(); the `features` argument is not consulted.
void object_copy_data_t::encode(bufferlist& bl, uint64_t features) const
{
  ENCODE_START(8, 5, bl);
  encode(size, bl);
  encode(mtime, bl);
  encode(attrs, bl);
  encode(data, bl);
  encode(omap_data, bl);
  encode(cursor, bl);
  encode(omap_header, bl);
  encode(snaps, bl);
  encode(snap_seq, bl);
  encode(flags, bl);
  encode(data_digest, bl);
  encode(omap_digest, bl);
  encode(reqids, bl);
  encode(truncate_seq, bl);
  encode(truncate_size, bl);
  encode(reqid_return_codes, bl);
  ENCODE_FINISH(bl);
}
4948
4949 void object_copy_data_t::decode(bufferlist::const_iterator& bl)
4950 {
4951 DECODE_START(7, bl);
4952 if (struct_v < 5) {
4953 // old
4954 decode(size, bl);
4955 decode(mtime, bl);
4956 {
4957 string category;
4958 decode(category, bl); // no longer used
4959 }
4960 decode(attrs, bl);
4961 decode(data, bl);
4962 {
4963 map<string,bufferlist> omap;
4964 decode(omap, bl);
4965 omap_data.clear();
4966 if (!omap.empty()) {
4967 using ceph::encode;
4968 encode(omap, omap_data);
4969 }
4970 }
4971 decode(cursor, bl);
4972 if (struct_v >= 2)
4973 decode(omap_header, bl);
4974 if (struct_v >= 3) {
4975 decode(snaps, bl);
4976 decode(snap_seq, bl);
4977 } else {
4978 snaps.clear();
4979 snap_seq = 0;
4980 }
4981 if (struct_v >= 4) {
4982 decode(flags, bl);
4983 decode(data_digest, bl);
4984 decode(omap_digest, bl);
4985 }
4986 } else {
4987 // current
4988 decode(size, bl);
4989 decode(mtime, bl);
4990 decode(attrs, bl);
4991 decode(data, bl);
4992 decode(omap_data, bl);
4993 decode(cursor, bl);
4994 decode(omap_header, bl);
4995 decode(snaps, bl);
4996 decode(snap_seq, bl);
4997 if (struct_v >= 4) {
4998 decode(flags, bl);
4999 decode(data_digest, bl);
5000 decode(omap_digest, bl);
5001 }
5002 if (struct_v >= 6) {
5003 decode(reqids, bl);
5004 }
5005 if (struct_v >= 7) {
5006 decode(truncate_seq, bl);
5007 decode(truncate_size, bl);
5008 }
5009 if (struct_v >= 8) {
5010 decode(reqid_return_codes, bl);
5011 }
5012 }
5013 DECODE_FINISH(bl);
5014 }
5015
5016 void object_copy_data_t::generate_test_instances(list<object_copy_data_t*>& o)
5017 {
5018 o.push_back(new object_copy_data_t());
5019
5020 list<object_copy_cursor_t*> cursors;
5021 object_copy_cursor_t::generate_test_instances(cursors);
5022 list<object_copy_cursor_t*>::iterator ci = cursors.begin();
5023 o.back()->cursor = **(ci++);
5024
5025 o.push_back(new object_copy_data_t());
5026 o.back()->cursor = **(ci++);
5027
5028 o.push_back(new object_copy_data_t());
5029 o.back()->size = 1234;
5030 o.back()->mtime.set_from_double(1234);
5031 bufferptr bp("there", 5);
5032 bufferlist bl;
5033 bl.push_back(bp);
5034 o.back()->attrs["hello"] = bl;
5035 bufferptr bp2("not", 3);
5036 bufferlist bl2;
5037 bl2.push_back(bp2);
5038 map<string,bufferlist> omap;
5039 omap["why"] = bl2;
5040 using ceph::encode;
5041 encode(omap, o.back()->omap_data);
5042 bufferptr databp("iamsomedatatocontain", 20);
5043 o.back()->data.push_back(databp);
5044 o.back()->omap_header.append("this is an omap header");
5045 o.back()->snaps.push_back(123);
5046 o.back()->reqids.push_back(make_pair(osd_reqid_t(), version_t()));
5047 }
5048
5049 void object_copy_data_t::dump(Formatter *f) const
5050 {
5051 f->open_object_section("cursor");
5052 cursor.dump(f);
5053 f->close_section(); // cursor
5054 f->dump_int("size", size);
5055 f->dump_stream("mtime") << mtime;
5056 /* we should really print out the attrs here, but bufferlist
5057 const-correctness prevents that */
5058 f->dump_int("attrs_size", attrs.size());
5059 f->dump_int("flags", flags);
5060 f->dump_unsigned("data_digest", data_digest);
5061 f->dump_unsigned("omap_digest", omap_digest);
5062 f->dump_int("omap_data_length", omap_data.length());
5063 f->dump_int("omap_header_length", omap_header.length());
5064 f->dump_int("data_length", data.length());
5065 f->open_array_section("snaps");
5066 for (vector<snapid_t>::const_iterator p = snaps.begin();
5067 p != snaps.end(); ++p)
5068 f->dump_unsigned("snap", *p);
5069 f->close_section();
5070 f->open_array_section("reqids");
5071 uint32_t idx = 0;
5072 for (auto p = reqids.begin();
5073 p != reqids.end();
5074 ++idx, ++p) {
5075 f->open_object_section("extra_reqid");
5076 f->dump_stream("reqid") << p->first;
5077 f->dump_stream("user_version") << p->second;
5078 auto it = reqid_return_codes.find(idx);
5079 if (it != reqid_return_codes.end()) {
5080 f->dump_int("return_code", it->second);
5081 }
5082 f->close_section();
5083 }
5084 f->close_section();
5085 }
5086
5087 // -- pg_create_t --
5088
// Serialize the create record into bl; the field order matches
// pg_create_t::decode().
void pg_create_t::encode(bufferlist &bl) const
{
  ENCODE_START(1, 1, bl);
  encode(created, bl);
  encode(parent, bl);
  encode(split_bits, bl);
  ENCODE_FINISH(bl);
}
5097
// Populate the create record from bl, reading fields in the order written
// by pg_create_t::encode().
void pg_create_t::decode(bufferlist::const_iterator &bl)
{
  DECODE_START(1, bl);
  decode(created, bl);
  decode(parent, bl);
  decode(split_bits, bl);
  DECODE_FINISH(bl);
}
5106
// Dump the three fields of the create record to the formatter.
void pg_create_t::dump(Formatter *f) const
{
  f->dump_unsigned("created", created);
  f->dump_stream("parent") << parent;
  f->dump_int("split_bits", split_bits);
}
5113
5114 void pg_create_t::generate_test_instances(list<pg_create_t*>& o)
5115 {
5116 o.push_back(new pg_create_t);
5117 o.push_back(new pg_create_t(1, pg_t(3, 4), 2));
5118 }
5119
5120
5121 // -- pg_hit_set_info_t --
5122
5123 void pg_hit_set_info_t::encode(bufferlist& bl) const
5124 {
5125 ENCODE_START(2, 1, bl);
5126 encode(begin, bl);
5127 encode(end, bl);
5128 encode(version, bl);
5129 encode(using_gmt, bl);
5130 ENCODE_FINISH(bl);
5131 }
5132
5133 void pg_hit_set_info_t::decode(bufferlist::const_iterator& p)
5134 {
5135 DECODE_START(2, p);
5136 decode(begin, p);
5137 decode(end, p);
5138 decode(version, p);
5139 if (struct_v >= 2) {
5140 decode(using_gmt, p);
5141 } else {
5142 using_gmt = false;
5143 }
5144 DECODE_FINISH(p);
5145 }
5146
5147 void pg_hit_set_info_t::dump(Formatter *f) const
5148 {
5149 f->dump_stream("begin") << begin;
5150 f->dump_stream("end") << end;
5151 f->dump_stream("version") << version;
5152 f->dump_stream("using_gmt") << using_gmt;
5153 }
5154
5155 void pg_hit_set_info_t::generate_test_instances(list<pg_hit_set_info_t*>& ls)
5156 {
5157 ls.push_back(new pg_hit_set_info_t);
5158 ls.push_back(new pg_hit_set_info_t);
5159 ls.back()->begin = utime_t(1, 2);
5160 ls.back()->end = utime_t(3, 4);
5161 }
5162
5163
5164 // -- pg_hit_set_history_t --
5165
5166 void pg_hit_set_history_t::encode(bufferlist& bl) const
5167 {
5168 ENCODE_START(1, 1, bl);
5169 encode(current_last_update, bl);
5170 {
5171 utime_t dummy_stamp;
5172 encode(dummy_stamp, bl);
5173 }
5174 {
5175 pg_hit_set_info_t dummy_info;
5176 encode(dummy_info, bl);
5177 }
5178 encode(history, bl);
5179 ENCODE_FINISH(bl);
5180 }
5181
5182 void pg_hit_set_history_t::decode(bufferlist::const_iterator& p)
5183 {
5184 DECODE_START(1, p);
5185 decode(current_last_update, p);
5186 {
5187 utime_t dummy_stamp;
5188 decode(dummy_stamp, p);
5189 }
5190 {
5191 pg_hit_set_info_t dummy_info;
5192 decode(dummy_info, p);
5193 }
5194 decode(history, p);
5195 DECODE_FINISH(p);
5196 }
5197
5198 void pg_hit_set_history_t::dump(Formatter *f) const
5199 {
5200 f->dump_stream("current_last_update") << current_last_update;
5201 f->open_array_section("history");
5202 for (list<pg_hit_set_info_t>::const_iterator p = history.begin();
5203 p != history.end(); ++p) {
5204 f->open_object_section("info");
5205 p->dump(f);
5206 f->close_section();
5207 }
5208 f->close_section();
5209 }
5210
5211 void pg_hit_set_history_t::generate_test_instances(list<pg_hit_set_history_t*>& ls)
5212 {
5213 ls.push_back(new pg_hit_set_history_t);
5214 ls.push_back(new pg_hit_set_history_t);
5215 ls.back()->current_last_update = eversion_t(1, 2);
5216 ls.back()->history.push_back(pg_hit_set_info_t());
5217 }
5218
5219 // -- OSDSuperblock --
5220
5221 void OSDSuperblock::encode(bufferlist &bl) const
5222 {
5223 ENCODE_START(8, 5, bl);
5224 encode(cluster_fsid, bl);
5225 encode(whoami, bl);
5226 encode(current_epoch, bl);
5227 encode(oldest_map, bl);
5228 encode(newest_map, bl);
5229 encode(weight, bl);
5230 compat_features.encode(bl);
5231 encode(clean_thru, bl);
5232 encode(mounted, bl);
5233 encode(osd_fsid, bl);
5234 encode((epoch_t)0, bl); // epoch_t last_epoch_marked_full
5235 encode((uint32_t)0, bl); // map<int64_t,epoch_t> pool_last_epoch_marked_full
5236 ENCODE_FINISH(bl);
5237 }
5238
5239 void OSDSuperblock::decode(bufferlist::const_iterator &bl)
5240 {
5241 DECODE_START_LEGACY_COMPAT_LEN(8, 5, 5, bl);
5242 if (struct_v < 3) {
5243 string magic;
5244 decode(magic, bl);
5245 }
5246 decode(cluster_fsid, bl);
5247 decode(whoami, bl);
5248 decode(current_epoch, bl);
5249 decode(oldest_map, bl);
5250 decode(newest_map, bl);
5251 decode(weight, bl);
5252 if (struct_v >= 2) {
5253 compat_features.decode(bl);
5254 } else { //upgrade it!
5255 compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
5256 }
5257 decode(clean_thru, bl);
5258 decode(mounted, bl);
5259 if (struct_v >= 4)
5260 decode(osd_fsid, bl);
5261 if (struct_v >= 6) {
5262 epoch_t last_map_marked_full;
5263 decode(last_map_marked_full, bl);
5264 }
5265 if (struct_v >= 7) {
5266 map<int64_t,epoch_t> pool_last_map_marked_full;
5267 decode(pool_last_map_marked_full, bl);
5268 }
5269 DECODE_FINISH(bl);
5270 }
5271
5272 void OSDSuperblock::dump(Formatter *f) const
5273 {
5274 f->dump_stream("cluster_fsid") << cluster_fsid;
5275 f->dump_stream("osd_fsid") << osd_fsid;
5276 f->dump_int("whoami", whoami);
5277 f->dump_int("current_epoch", current_epoch);
5278 f->dump_int("oldest_map", oldest_map);
5279 f->dump_int("newest_map", newest_map);
5280 f->dump_float("weight", weight);
5281 f->open_object_section("compat");
5282 compat_features.dump(f);
5283 f->close_section();
5284 f->dump_int("clean_thru", clean_thru);
5285 f->dump_int("last_epoch_mounted", mounted);
5286 }
5287
5288 void OSDSuperblock::generate_test_instances(list<OSDSuperblock*>& o)
5289 {
5290 OSDSuperblock z;
5291 o.push_back(new OSDSuperblock(z));
5292 z.cluster_fsid.parse("01010101-0101-0101-0101-010101010101");
5293 z.osd_fsid.parse("02020202-0202-0202-0202-020202020202");
5294 z.whoami = 3;
5295 z.current_epoch = 4;
5296 z.oldest_map = 5;
5297 z.newest_map = 9;
5298 z.mounted = 8;
5299 z.clean_thru = 7;
5300 o.push_back(new OSDSuperblock(z));
5301 o.push_back(new OSDSuperblock(z));
5302 }
5303
5304 // -- SnapSet --
5305
5306 void SnapSet::encode(bufferlist& bl) const
5307 {
5308 ENCODE_START(3, 2, bl);
5309 encode(seq, bl);
5310 encode(true, bl); // head_exists
5311 encode(snaps, bl);
5312 encode(clones, bl);
5313 encode(clone_overlap, bl);
5314 encode(clone_size, bl);
5315 encode(clone_snaps, bl);
5316 ENCODE_FINISH(bl);
5317 }
5318
5319 void SnapSet::decode(bufferlist::const_iterator& bl)
5320 {
5321 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
5322 decode(seq, bl);
5323 bl.advance(1u); // skip legacy head_exists (always true)
5324 decode(snaps, bl);
5325 decode(clones, bl);
5326 decode(clone_overlap, bl);
5327 decode(clone_size, bl);
5328 if (struct_v >= 3) {
5329 decode(clone_snaps, bl);
5330 } else {
5331 clone_snaps.clear();
5332 }
5333 DECODE_FINISH(bl);
5334 }
5335
5336 void SnapSet::dump(Formatter *f) const
5337 {
5338 SnapContext sc(seq, snaps);
5339 f->open_object_section("snap_context");
5340 sc.dump(f);
5341 f->close_section();
5342 f->open_array_section("clones");
5343 for (vector<snapid_t>::const_iterator p = clones.begin(); p != clones.end(); ++p) {
5344 f->open_object_section("clone");
5345 f->dump_unsigned("snap", *p);
5346 auto cs = clone_size.find(*p);
5347 if (cs != clone_size.end())
5348 f->dump_unsigned("size", cs->second);
5349 else
5350 f->dump_string("size", "????");
5351 auto co = clone_overlap.find(*p);
5352 if (co != clone_overlap.end())
5353 f->dump_stream("overlap") << co->second;
5354 else
5355 f->dump_stream("overlap") << "????";
5356 auto q = clone_snaps.find(*p);
5357 if (q != clone_snaps.end()) {
5358 f->open_array_section("snaps");
5359 for (auto s : q->second) {
5360 f->dump_unsigned("snap", s);
5361 }
5362 f->close_section();
5363 }
5364 f->close_section();
5365 }
5366 f->close_section();
5367 }
5368
5369 void SnapSet::generate_test_instances(list<SnapSet*>& o)
5370 {
5371 o.push_back(new SnapSet);
5372 o.push_back(new SnapSet);
5373 o.back()->seq = 123;
5374 o.back()->snaps.push_back(123);
5375 o.back()->snaps.push_back(12);
5376 o.push_back(new SnapSet);
5377 o.back()->seq = 123;
5378 o.back()->snaps.push_back(123);
5379 o.back()->snaps.push_back(12);
5380 o.back()->clones.push_back(12);
5381 o.back()->clone_size[12] = 12345;
5382 o.back()->clone_overlap[12];
5383 o.back()->clone_snaps[12] = {12, 10, 8};
5384 }
5385
5386 ostream& operator<<(ostream& out, const SnapSet& cs)
5387 {
5388 return out << cs.seq << "=" << cs.snaps << ":"
5389 << cs.clone_snaps;
5390 }
5391
5392 void SnapSet::from_snap_set(const librados::snap_set_t& ss, bool legacy)
5393 {
5394 // NOTE: our reconstruction of snaps (and the snapc) is not strictly
5395 // correct: it will not include snaps that still logically exist
5396 // but for which there was no clone that is defined. For all
5397 // practical purposes this doesn't matter, since we only use that
5398 // information to clone on the OSD, and we have already moved
5399 // forward past that part of the object history.
5400
5401 seq = ss.seq;
5402 set<snapid_t> _snaps;
5403 set<snapid_t> _clones;
5404 for (vector<librados::clone_info_t>::const_iterator p = ss.clones.begin();
5405 p != ss.clones.end();
5406 ++p) {
5407 if (p->cloneid != librados::SNAP_HEAD) {
5408 _clones.insert(p->cloneid);
5409 _snaps.insert(p->snaps.begin(), p->snaps.end());
5410 clone_size[p->cloneid] = p->size;
5411 clone_overlap[p->cloneid]; // the entry must exist, even if it's empty.
5412 for (vector<pair<uint64_t, uint64_t> >::const_iterator q =
5413 p->overlap.begin(); q != p->overlap.end(); ++q)
5414 clone_overlap[p->cloneid].insert(q->first, q->second);
5415 if (!legacy) {
5416 // p->snaps is ascending; clone_snaps is descending
5417 vector<snapid_t>& v = clone_snaps[p->cloneid];
5418 for (auto q = p->snaps.rbegin(); q != p->snaps.rend(); ++q) {
5419 v.push_back(*q);
5420 }
5421 }
5422 }
5423 }
5424
5425 // ascending
5426 clones.clear();
5427 clones.reserve(_clones.size());
5428 for (set<snapid_t>::iterator p = _clones.begin(); p != _clones.end(); ++p)
5429 clones.push_back(*p);
5430
5431 // descending
5432 snaps.clear();
5433 snaps.reserve(_snaps.size());
5434 for (set<snapid_t>::reverse_iterator p = _snaps.rbegin();
5435 p != _snaps.rend(); ++p)
5436 snaps.push_back(*p);
5437 }
5438
5439 uint64_t SnapSet::get_clone_bytes(snapid_t clone) const
5440 {
5441 ceph_assert(clone_size.count(clone));
5442 uint64_t size = clone_size.find(clone)->second;
5443 ceph_assert(clone_overlap.count(clone));
5444 const interval_set<uint64_t> &overlap = clone_overlap.find(clone)->second;
5445 ceph_assert(size >= (uint64_t)overlap.size());
5446 return size - overlap.size();
5447 }
5448
5449 void SnapSet::filter(const pg_pool_t &pinfo)
5450 {
5451 vector<snapid_t> oldsnaps;
5452 oldsnaps.swap(snaps);
5453 for (vector<snapid_t>::const_iterator i = oldsnaps.begin();
5454 i != oldsnaps.end();
5455 ++i) {
5456 if (!pinfo.is_removed_snap(*i))
5457 snaps.push_back(*i);
5458 }
5459 }
5460
5461 SnapSet SnapSet::get_filtered(const pg_pool_t &pinfo) const
5462 {
5463 SnapSet ss = *this;
5464 ss.filter(pinfo);
5465 return ss;
5466 }
5467
5468 // -- watch_info_t --
5469
5470 void watch_info_t::encode(bufferlist& bl, uint64_t features) const
5471 {
5472 ENCODE_START(4, 3, bl);
5473 encode(cookie, bl);
5474 encode(timeout_seconds, bl);
5475 encode(addr, bl, features);
5476 ENCODE_FINISH(bl);
5477 }
5478
5479 void watch_info_t::decode(bufferlist::const_iterator& bl)
5480 {
5481 DECODE_START_LEGACY_COMPAT_LEN(4, 3, 3, bl);
5482 decode(cookie, bl);
5483 if (struct_v < 2) {
5484 uint64_t ver;
5485 decode(ver, bl);
5486 }
5487 decode(timeout_seconds, bl);
5488 if (struct_v >= 4) {
5489 decode(addr, bl);
5490 }
5491 DECODE_FINISH(bl);
5492 }
5493
5494 void watch_info_t::dump(Formatter *f) const
5495 {
5496 f->dump_unsigned("cookie", cookie);
5497 f->dump_unsigned("timeout_seconds", timeout_seconds);
5498 f->open_object_section("addr");
5499 addr.dump(f);
5500 f->close_section();
5501 }
5502
5503 void watch_info_t::generate_test_instances(list<watch_info_t*>& o)
5504 {
5505 o.push_back(new watch_info_t);
5506 o.push_back(new watch_info_t);
5507 o.back()->cookie = 123;
5508 o.back()->timeout_seconds = 99;
5509 entity_addr_t ea;
5510 ea.set_type(entity_addr_t::TYPE_LEGACY);
5511 ea.set_nonce(1);
5512 ea.set_family(AF_INET);
5513 ea.set_in4_quad(0, 127);
5514 ea.set_in4_quad(1, 0);
5515 ea.set_in4_quad(2, 1);
5516 ea.set_in4_quad(3, 2);
5517 ea.set_port(2);
5518 o.back()->addr = ea;
5519 }
5520
5521 // -- chunk_info_t --
5522
5523 void chunk_info_t::encode(bufferlist& bl) const
5524 {
5525 ENCODE_START(1, 1, bl);
5526 encode(offset, bl);
5527 encode(length, bl);
5528 encode(oid, bl);
5529 __u32 _flags = flags;
5530 encode(_flags, bl);
5531 ENCODE_FINISH(bl);
5532 }
5533
5534 void chunk_info_t::decode(bufferlist::const_iterator& bl)
5535 {
5536 DECODE_START(1, bl);
5537 decode(offset, bl);
5538 decode(length, bl);
5539 decode(oid, bl);
5540 __u32 _flags;
5541 decode(_flags, bl);
5542 flags = (cflag_t)_flags;
5543 DECODE_FINISH(bl);
5544 }
5545
5546 void chunk_info_t::dump(Formatter *f) const
5547 {
5548 f->dump_unsigned("length", length);
5549 f->open_object_section("oid");
5550 oid.dump(f);
5551 f->close_section();
5552 f->dump_unsigned("flags", flags);
5553 }
5554
5555 ostream& operator<<(ostream& out, const chunk_info_t& ci)
5556 {
5557 return out << "(len: " << ci.length << " oid: " << ci.oid
5558 << " offset: " << ci.offset
5559 << " flags: " << ci.get_flag_string(ci.flags) << ")";
5560 }
5561
5562 // -- object_manifest_t --
5563
5564 void object_manifest_t::encode(bufferlist& bl) const
5565 {
5566 ENCODE_START(1, 1, bl);
5567 encode(type, bl);
5568 switch (type) {
5569 case TYPE_NONE: break;
5570 case TYPE_REDIRECT:
5571 encode(redirect_target, bl);
5572 break;
5573 case TYPE_CHUNKED:
5574 encode(chunk_map, bl);
5575 break;
5576 default:
5577 ceph_abort();
5578 }
5579 ENCODE_FINISH(bl);
5580 }
5581
5582 void object_manifest_t::decode(bufferlist::const_iterator& bl)
5583 {
5584 DECODE_START(1, bl);
5585 decode(type, bl);
5586 switch (type) {
5587 case TYPE_NONE: break;
5588 case TYPE_REDIRECT:
5589 decode(redirect_target, bl);
5590 break;
5591 case TYPE_CHUNKED:
5592 decode(chunk_map, bl);
5593 break;
5594 default:
5595 ceph_abort();
5596 }
5597 DECODE_FINISH(bl);
5598 }
5599
5600 void object_manifest_t::dump(Formatter *f) const
5601 {
5602 f->dump_unsigned("type", type);
5603 if (type == TYPE_REDIRECT) {
5604 f->open_object_section("redirect_target");
5605 redirect_target.dump(f);
5606 f->close_section();
5607 } else if (type == TYPE_CHUNKED) {
5608 f->open_array_section("chunk_map");
5609 for (auto& p : chunk_map) {
5610 f->open_object_section("chunk");
5611 f->dump_unsigned("offset", p.first);
5612 p.second.dump(f);
5613 f->close_section();
5614 }
5615 f->close_section();
5616 }
5617 }
5618
5619 void object_manifest_t::generate_test_instances(list<object_manifest_t*>& o)
5620 {
5621 o.push_back(new object_manifest_t());
5622 o.back()->type = TYPE_REDIRECT;
5623 }
5624
5625 ostream& operator<<(ostream& out, const object_manifest_t& om)
5626 {
5627 out << "manifest(" << om.get_type_name();
5628 if (om.is_redirect()) {
5629 out << " " << om.redirect_target;
5630 } else if (om.is_chunked()) {
5631 out << " " << om.chunk_map;
5632 }
5633 out << ")";
5634 return out;
5635 }
5636
5637 // -- object_info_t --
5638
5639 void object_info_t::copy_user_bits(const object_info_t& other)
5640 {
5641 // these bits are copied from head->clone.
5642 size = other.size;
5643 mtime = other.mtime;
5644 local_mtime = other.local_mtime;
5645 last_reqid = other.last_reqid;
5646 truncate_seq = other.truncate_seq;
5647 truncate_size = other.truncate_size;
5648 flags = other.flags;
5649 user_version = other.user_version;
5650 data_digest = other.data_digest;
5651 omap_digest = other.omap_digest;
5652 }
5653
5654 void object_info_t::encode(bufferlist& bl, uint64_t features) const
5655 {
5656 object_locator_t myoloc(soid);
5657 map<entity_name_t, watch_info_t> old_watchers;
5658 for (map<pair<uint64_t, entity_name_t>, watch_info_t>::const_iterator i =
5659 watchers.begin();
5660 i != watchers.end();
5661 ++i) {
5662 old_watchers.insert(make_pair(i->first.second, i->second));
5663 }
5664 ENCODE_START(17, 8, bl);
5665 encode(soid, bl);
5666 encode(myoloc, bl); //Retained for compatibility
5667 encode((__u32)0, bl); // was category, no longer used
5668 encode(version, bl);
5669 encode(prior_version, bl);
5670 encode(last_reqid, bl);
5671 encode(size, bl);
5672 encode(mtime, bl);
5673 if (soid.snap == CEPH_NOSNAP)
5674 encode(osd_reqid_t(), bl); // used to be wrlock_by
5675 else
5676 encode((uint32_t)0, bl); // was legacy_snaps
5677 encode(truncate_seq, bl);
5678 encode(truncate_size, bl);
5679 encode(is_lost(), bl);
5680 encode(old_watchers, bl, features);
5681 /* shenanigans to avoid breaking backwards compatibility in the disk format.
5682 * When we can, switch this out for simply putting the version_t on disk. */
5683 eversion_t user_eversion(0, user_version);
5684 encode(user_eversion, bl);
5685 encode(test_flag(FLAG_USES_TMAP), bl);
5686 encode(watchers, bl, features);
5687 __u32 _flags = flags;
5688 encode(_flags, bl);
5689 encode(local_mtime, bl);
5690 encode(data_digest, bl);
5691 encode(omap_digest, bl);
5692 encode(expected_object_size, bl);
5693 encode(expected_write_size, bl);
5694 encode(alloc_hint_flags, bl);
5695 if (has_manifest()) {
5696 encode(manifest, bl);
5697 }
5698 ENCODE_FINISH(bl);
5699 }
5700
5701 void object_info_t::decode(bufferlist::const_iterator& bl)
5702 {
5703 object_locator_t myoloc;
5704 DECODE_START_LEGACY_COMPAT_LEN(17, 8, 8, bl);
5705 map<entity_name_t, watch_info_t> old_watchers;
5706 decode(soid, bl);
5707 decode(myoloc, bl);
5708 {
5709 string category;
5710 decode(category, bl); // no longer used
5711 }
5712 decode(version, bl);
5713 decode(prior_version, bl);
5714 decode(last_reqid, bl);
5715 decode(size, bl);
5716 decode(mtime, bl);
5717 if (soid.snap == CEPH_NOSNAP) {
5718 osd_reqid_t wrlock_by;
5719 decode(wrlock_by, bl);
5720 } else {
5721 vector<snapid_t> legacy_snaps;
5722 decode(legacy_snaps, bl);
5723 }
5724 decode(truncate_seq, bl);
5725 decode(truncate_size, bl);
5726
5727 // if this is struct_v >= 13, we will overwrite this
5728 // below since this field is just here for backwards
5729 // compatibility
5730 __u8 lo;
5731 decode(lo, bl);
5732 flags = (flag_t)lo;
5733
5734 decode(old_watchers, bl);
5735 eversion_t user_eversion;
5736 decode(user_eversion, bl);
5737 user_version = user_eversion.version;
5738
5739 if (struct_v >= 9) {
5740 bool uses_tmap = false;
5741 decode(uses_tmap, bl);
5742 if (uses_tmap)
5743 set_flag(FLAG_USES_TMAP);
5744 } else {
5745 set_flag(FLAG_USES_TMAP);
5746 }
5747 if (struct_v < 10)
5748 soid.pool = myoloc.pool;
5749 if (struct_v >= 11) {
5750 decode(watchers, bl);
5751 } else {
5752 for (map<entity_name_t, watch_info_t>::iterator i = old_watchers.begin();
5753 i != old_watchers.end();
5754 ++i) {
5755 watchers.insert(
5756 make_pair(
5757 make_pair(i->second.cookie, i->first), i->second));
5758 }
5759 }
5760 if (struct_v >= 13) {
5761 __u32 _flags;
5762 decode(_flags, bl);
5763 flags = (flag_t)_flags;
5764 }
5765 if (struct_v >= 14) {
5766 decode(local_mtime, bl);
5767 } else {
5768 local_mtime = utime_t();
5769 }
5770 if (struct_v >= 15) {
5771 decode(data_digest, bl);
5772 decode(omap_digest, bl);
5773 } else {
5774 data_digest = omap_digest = -1;
5775 clear_flag(FLAG_DATA_DIGEST);
5776 clear_flag(FLAG_OMAP_DIGEST);
5777 }
5778 if (struct_v >= 16) {
5779 decode(expected_object_size, bl);
5780 decode(expected_write_size, bl);
5781 decode(alloc_hint_flags, bl);
5782 } else {
5783 expected_object_size = 0;
5784 expected_write_size = 0;
5785 alloc_hint_flags = 0;
5786 }
5787 if (struct_v >= 17) {
5788 if (has_manifest()) {
5789 decode(manifest, bl);
5790 }
5791 }
5792 DECODE_FINISH(bl);
5793 }
5794
5795 void object_info_t::dump(Formatter *f) const
5796 {
5797 f->open_object_section("oid");
5798 soid.dump(f);
5799 f->close_section();
5800 f->dump_stream("version") << version;
5801 f->dump_stream("prior_version") << prior_version;
5802 f->dump_stream("last_reqid") << last_reqid;
5803 f->dump_unsigned("user_version", user_version);
5804 f->dump_unsigned("size", size);
5805 f->dump_stream("mtime") << mtime;
5806 f->dump_stream("local_mtime") << local_mtime;
5807 f->dump_unsigned("lost", (int)is_lost());
5808 vector<string> sv = get_flag_vector(flags);
5809 f->open_array_section("flags");
5810 for (auto str: sv)
5811 f->dump_string("flags", str);
5812 f->close_section();
5813 f->dump_unsigned("truncate_seq", truncate_seq);
5814 f->dump_unsigned("truncate_size", truncate_size);
5815 f->dump_format("data_digest", "0x%08x", data_digest);
5816 f->dump_format("omap_digest", "0x%08x", omap_digest);
5817 f->dump_unsigned("expected_object_size", expected_object_size);
5818 f->dump_unsigned("expected_write_size", expected_write_size);
5819 f->dump_unsigned("alloc_hint_flags", alloc_hint_flags);
5820 f->dump_object("manifest", manifest);
5821 f->open_object_section("watchers");
5822 for (map<pair<uint64_t, entity_name_t>,watch_info_t>::const_iterator p =
5823 watchers.begin(); p != watchers.end(); ++p) {
5824 stringstream ss;
5825 ss << p->first.second;
5826 f->open_object_section(ss.str().c_str());
5827 p->second.dump(f);
5828 f->close_section();
5829 }
5830 f->close_section();
5831 }
5832
5833 void object_info_t::generate_test_instances(list<object_info_t*>& o)
5834 {
5835 o.push_back(new object_info_t());
5836
5837 // fixme
5838 }
5839
5840
5841 ostream& operator<<(ostream& out, const object_info_t& oi)
5842 {
5843 out << oi.soid << "(" << oi.version
5844 << " " << oi.last_reqid;
5845 if (oi.flags)
5846 out << " " << oi.get_flag_string();
5847 out << " s " << oi.size;
5848 out << " uv " << oi.user_version;
5849 if (oi.is_data_digest())
5850 out << " dd " << std::hex << oi.data_digest << std::dec;
5851 if (oi.is_omap_digest())
5852 out << " od " << std::hex << oi.omap_digest << std::dec;
5853 out << " alloc_hint [" << oi.expected_object_size
5854 << " " << oi.expected_write_size
5855 << " " << oi.alloc_hint_flags << "]";
5856 if (oi.has_manifest())
5857 out << " " << oi.manifest;
5858 out << ")";
5859 return out;
5860 }
5861
5862 // -- ObjectRecovery --
5863 void ObjectRecoveryProgress::encode(bufferlist &bl) const
5864 {
5865 ENCODE_START(1, 1, bl);
5866 encode(first, bl);
5867 encode(data_complete, bl);
5868 encode(data_recovered_to, bl);
5869 encode(omap_recovered_to, bl);
5870 encode(omap_complete, bl);
5871 ENCODE_FINISH(bl);
5872 }
5873
5874 void ObjectRecoveryProgress::decode(bufferlist::const_iterator &bl)
5875 {
5876 DECODE_START(1, bl);
5877 decode(first, bl);
5878 decode(data_complete, bl);
5879 decode(data_recovered_to, bl);
5880 decode(omap_recovered_to, bl);
5881 decode(omap_complete, bl);
5882 DECODE_FINISH(bl);
5883 }
5884
5885 ostream &operator<<(ostream &out, const ObjectRecoveryProgress &prog)
5886 {
5887 return prog.print(out);
5888 }
5889
5890 void ObjectRecoveryProgress::generate_test_instances(
5891 list<ObjectRecoveryProgress*>& o)
5892 {
5893 o.push_back(new ObjectRecoveryProgress);
5894 o.back()->first = false;
5895 o.back()->data_complete = true;
5896 o.back()->omap_complete = true;
5897 o.back()->data_recovered_to = 100;
5898
5899 o.push_back(new ObjectRecoveryProgress);
5900 o.back()->first = true;
5901 o.back()->data_complete = false;
5902 o.back()->omap_complete = false;
5903 o.back()->data_recovered_to = 0;
5904 }
5905
5906 ostream &ObjectRecoveryProgress::print(ostream &out) const
5907 {
5908 return out << "ObjectRecoveryProgress("
5909 << ( first ? "" : "!" ) << "first, "
5910 << "data_recovered_to:" << data_recovered_to
5911 << ", data_complete:" << ( data_complete ? "true" : "false" )
5912 << ", omap_recovered_to:" << omap_recovered_to
5913 << ", omap_complete:" << ( omap_complete ? "true" : "false" )
5914 << ", error:" << ( error ? "true" : "false" )
5915 << ")";
5916 }
5917
5918 void ObjectRecoveryProgress::dump(Formatter *f) const
5919 {
5920 f->dump_int("first?", first);
5921 f->dump_int("data_complete?", data_complete);
5922 f->dump_unsigned("data_recovered_to", data_recovered_to);
5923 f->dump_int("omap_complete?", omap_complete);
5924 f->dump_string("omap_recovered_to", omap_recovered_to);
5925 }
5926
5927 void ObjectRecoveryInfo::encode(bufferlist &bl, uint64_t features) const
5928 {
5929 ENCODE_START(2, 1, bl);
5930 encode(soid, bl);
5931 encode(version, bl);
5932 encode(size, bl);
5933 encode(oi, bl, features);
5934 encode(ss, bl);
5935 encode(copy_subset, bl);
5936 encode(clone_subset, bl);
5937 ENCODE_FINISH(bl);
5938 }
5939
5940 void ObjectRecoveryInfo::decode(bufferlist::const_iterator &bl,
5941 int64_t pool)
5942 {
5943 DECODE_START(2, bl);
5944 decode(soid, bl);
5945 decode(version, bl);
5946 decode(size, bl);
5947 decode(oi, bl);
5948 decode(ss, bl);
5949 decode(copy_subset, bl);
5950 decode(clone_subset, bl);
5951 DECODE_FINISH(bl);
5952
5953 if (struct_v < 2) {
5954 if (!soid.is_max() && soid.pool == -1)
5955 soid.pool = pool;
5956 map<hobject_t, interval_set<uint64_t>> tmp;
5957 tmp.swap(clone_subset);
5958 for (map<hobject_t, interval_set<uint64_t>>::iterator i = tmp.begin();
5959 i != tmp.end();
5960 ++i) {
5961 hobject_t first(i->first);
5962 if (!first.is_max() && first.pool == -1)
5963 first.pool = pool;
5964 clone_subset[first].swap(i->second);
5965 }
5966 }
5967 }
5968
5969 void ObjectRecoveryInfo::generate_test_instances(
5970 list<ObjectRecoveryInfo*>& o)
5971 {
5972 o.push_back(new ObjectRecoveryInfo);
5973 o.back()->soid = hobject_t(sobject_t("key", CEPH_NOSNAP));
5974 o.back()->version = eversion_t(0,0);
5975 o.back()->size = 100;
5976 }
5977
5978
5979 void ObjectRecoveryInfo::dump(Formatter *f) const
5980 {
5981 f->dump_stream("object") << soid;
5982 f->dump_stream("at_version") << version;
5983 f->dump_stream("size") << size;
5984 {
5985 f->open_object_section("object_info");
5986 oi.dump(f);
5987 f->close_section();
5988 }
5989 {
5990 f->open_object_section("snapset");
5991 ss.dump(f);
5992 f->close_section();
5993 }
5994 f->dump_stream("copy_subset") << copy_subset;
5995 f->dump_stream("clone_subset") << clone_subset;
5996 }
5997
5998 ostream& operator<<(ostream& out, const ObjectRecoveryInfo &inf)
5999 {
6000 return inf.print(out);
6001 }
6002
6003 ostream &ObjectRecoveryInfo::print(ostream &out) const
6004 {
6005 return out << "ObjectRecoveryInfo("
6006 << soid << "@" << version
6007 << ", size: " << size
6008 << ", copy_subset: " << copy_subset
6009 << ", clone_subset: " << clone_subset
6010 << ", snapset: " << ss
6011 << ")";
6012 }
6013
6014 // -- PushReplyOp --
6015 void PushReplyOp::generate_test_instances(list<PushReplyOp*> &o)
6016 {
6017 o.push_back(new PushReplyOp);
6018 o.push_back(new PushReplyOp);
6019 o.back()->soid = hobject_t(sobject_t("asdf", 2));
6020 o.push_back(new PushReplyOp);
6021 o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP));
6022 }
6023
6024 void PushReplyOp::encode(bufferlist &bl) const
6025 {
6026 ENCODE_START(1, 1, bl);
6027 encode(soid, bl);
6028 ENCODE_FINISH(bl);
6029 }
6030
6031 void PushReplyOp::decode(bufferlist::const_iterator &bl)
6032 {
6033 DECODE_START(1, bl);
6034 decode(soid, bl);
6035 DECODE_FINISH(bl);
6036 }
6037
6038 void PushReplyOp::dump(Formatter *f) const
6039 {
6040 f->dump_stream("soid") << soid;
6041 }
6042
6043 ostream &PushReplyOp::print(ostream &out) const
6044 {
6045 return out
6046 << "PushReplyOp(" << soid
6047 << ")";
6048 }
6049
6050 ostream& operator<<(ostream& out, const PushReplyOp &op)
6051 {
6052 return op.print(out);
6053 }
6054
6055 uint64_t PushReplyOp::cost(CephContext *cct) const
6056 {
6057
6058 return cct->_conf->osd_push_per_object_cost +
6059 cct->_conf->osd_recovery_max_chunk;
6060 }
6061
6062 // -- PullOp --
6063 void PullOp::generate_test_instances(list<PullOp*> &o)
6064 {
6065 o.push_back(new PullOp);
6066 o.push_back(new PullOp);
6067 o.back()->soid = hobject_t(sobject_t("asdf", 2));
6068 o.back()->recovery_info.version = eversion_t(3, 10);
6069 o.push_back(new PullOp);
6070 o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP));
6071 o.back()->recovery_info.version = eversion_t(0, 0);
6072 }
6073
6074 void PullOp::encode(bufferlist &bl, uint64_t features) const
6075 {
6076 ENCODE_START(1, 1, bl);
6077 encode(soid, bl);
6078 encode(recovery_info, bl, features);
6079 encode(recovery_progress, bl);
6080 ENCODE_FINISH(bl);
6081 }
6082
6083 void PullOp::decode(bufferlist::const_iterator &bl)
6084 {
6085 DECODE_START(1, bl);
6086 decode(soid, bl);
6087 decode(recovery_info, bl);
6088 decode(recovery_progress, bl);
6089 DECODE_FINISH(bl);
6090 }
6091
6092 void PullOp::dump(Formatter *f) const
6093 {
6094 f->dump_stream("soid") << soid;
6095 {
6096 f->open_object_section("recovery_info");
6097 recovery_info.dump(f);
6098 f->close_section();
6099 }
6100 {
6101 f->open_object_section("recovery_progress");
6102 recovery_progress.dump(f);
6103 f->close_section();
6104 }
6105 }
6106
6107 ostream &PullOp::print(ostream &out) const
6108 {
6109 return out
6110 << "PullOp(" << soid
6111 << ", recovery_info: " << recovery_info
6112 << ", recovery_progress: " << recovery_progress
6113 << ")";
6114 }
6115
6116 ostream& operator<<(ostream& out, const PullOp &op)
6117 {
6118 return op.print(out);
6119 }
6120
6121 uint64_t PullOp::cost(CephContext *cct) const
6122 {
6123 return cct->_conf->osd_push_per_object_cost +
6124 cct->_conf->osd_recovery_max_chunk;
6125 }
6126
6127 // -- PushOp --
6128 void PushOp::generate_test_instances(list<PushOp*> &o)
6129 {
6130 o.push_back(new PushOp);
6131 o.push_back(new PushOp);
6132 o.back()->soid = hobject_t(sobject_t("asdf", 2));
6133 o.back()->version = eversion_t(3, 10);
6134 o.push_back(new PushOp);
6135 o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP));
6136 o.back()->version = eversion_t(0, 0);
6137 }
6138
6139 void PushOp::encode(bufferlist &bl, uint64_t features) const
6140 {
6141 ENCODE_START(1, 1, bl);
6142 encode(soid, bl);
6143 encode(version, bl);
6144 encode(data, bl);
6145 encode(data_included, bl);
6146 encode(omap_header, bl);
6147 encode(omap_entries, bl);
6148 encode(attrset, bl);
6149 encode(recovery_info, bl, features);
6150 encode(after_progress, bl);
6151 encode(before_progress, bl);
6152 ENCODE_FINISH(bl);
6153 }
6154
6155 void PushOp::decode(bufferlist::const_iterator &bl)
6156 {
6157 DECODE_START(1, bl);
6158 decode(soid, bl);
6159 decode(version, bl);
6160 decode(data, bl);
6161 decode(data_included, bl);
6162 decode(omap_header, bl);
6163 decode(omap_entries, bl);
6164 decode(attrset, bl);
6165 decode(recovery_info, bl);
6166 decode(after_progress, bl);
6167 decode(before_progress, bl);
6168 DECODE_FINISH(bl);
6169 }
6170
6171 void PushOp::dump(Formatter *f) const
6172 {
6173 f->dump_stream("soid") << soid;
6174 f->dump_stream("version") << version;
6175 f->dump_int("data_len", data.length());
6176 f->dump_stream("data_included") << data_included;
6177 f->dump_int("omap_header_len", omap_header.length());
6178 f->dump_int("omap_entries_len", omap_entries.size());
6179 f->dump_int("attrset_len", attrset.size());
6180 {
6181 f->open_object_section("recovery_info");
6182 recovery_info.dump(f);
6183 f->close_section();
6184 }
6185 {
6186 f->open_object_section("after_progress");
6187 after_progress.dump(f);
6188 f->close_section();
6189 }
6190 {
6191 f->open_object_section("before_progress");
6192 before_progress.dump(f);
6193 f->close_section();
6194 }
6195 }
6196
6197 ostream &PushOp::print(ostream &out) const
6198 {
6199 return out
6200 << "PushOp(" << soid
6201 << ", version: " << version
6202 << ", data_included: " << data_included
6203 << ", data_size: " << data.length()
6204 << ", omap_header_size: " << omap_header.length()
6205 << ", omap_entries_size: " << omap_entries.size()
6206 << ", attrset_size: " << attrset.size()
6207 << ", recovery_info: " << recovery_info
6208 << ", after_progress: " << after_progress
6209 << ", before_progress: " << before_progress
6210 << ")";
6211 }
6212
6213 ostream& operator<<(ostream& out, const PushOp &op)
6214 {
6215 return op.print(out);
6216 }
6217
6218 uint64_t PushOp::cost(CephContext *cct) const
6219 {
6220 uint64_t cost = data_included.size();
6221 for (map<string, bufferlist>::const_iterator i =
6222 omap_entries.begin();
6223 i != omap_entries.end();
6224 ++i) {
6225 cost += i->second.length();
6226 }
6227 cost += cct->_conf->osd_push_per_object_cost;
6228 return cost;
6229 }
6230
6231 // -- ScrubMap --
6232
6233 void ScrubMap::merge_incr(const ScrubMap &l)
6234 {
6235 ceph_assert(valid_through == l.incr_since);
6236 valid_through = l.valid_through;
6237
6238 for (map<hobject_t,object>::const_iterator p = l.objects.begin();
6239 p != l.objects.end();
6240 ++p){
6241 if (p->second.negative) {
6242 map<hobject_t,object>::iterator q = objects.find(p->first);
6243 if (q != objects.end()) {
6244 objects.erase(q);
6245 }
6246 } else {
6247 objects[p->first] = p->second;
6248 }
6249 }
6250 }
6251
6252 void ScrubMap::encode(bufferlist& bl) const
6253 {
6254 ENCODE_START(3, 2, bl);
6255 encode(objects, bl);
6256 encode((__u32)0, bl); // used to be attrs; now deprecated
6257 bufferlist old_logbl; // not used
6258 encode(old_logbl, bl);
6259 encode(valid_through, bl);
6260 encode(incr_since, bl);
6261 ENCODE_FINISH(bl);
6262 }
6263
6264 void ScrubMap::decode(bufferlist::const_iterator& bl, int64_t pool)
6265 {
6266 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
6267 decode(objects, bl);
6268 {
6269 map<string,string> attrs; // deprecated
6270 decode(attrs, bl);
6271 }
6272 bufferlist old_logbl; // not used
6273 decode(old_logbl, bl);
6274 decode(valid_through, bl);
6275 decode(incr_since, bl);
6276 DECODE_FINISH(bl);
6277
6278 // handle hobject_t upgrade
6279 if (struct_v < 3) {
6280 map<hobject_t, object> tmp;
6281 tmp.swap(objects);
6282 for (map<hobject_t, object>::iterator i = tmp.begin();
6283 i != tmp.end();
6284 ++i) {
6285 hobject_t first(i->first);
6286 if (!first.is_max() && first.pool == -1)
6287 first.pool = pool;
6288 objects[first] = i->second;
6289 }
6290 }
6291 }
6292
6293 void ScrubMap::dump(Formatter *f) const
6294 {
6295 f->dump_stream("valid_through") << valid_through;
6296 f->dump_stream("incremental_since") << incr_since;
6297 f->open_array_section("objects");
6298 for (map<hobject_t,object>::const_iterator p = objects.begin(); p != objects.end(); ++p) {
6299 f->open_object_section("object");
6300 f->dump_string("name", p->first.oid.name);
6301 f->dump_unsigned("hash", p->first.get_hash());
6302 f->dump_string("key", p->first.get_key());
6303 f->dump_int("snapid", p->first.snap);
6304 p->second.dump(f);
6305 f->close_section();
6306 }
6307 f->close_section();
6308 }
6309
6310 void ScrubMap::generate_test_instances(list<ScrubMap*>& o)
6311 {
6312 o.push_back(new ScrubMap);
6313 o.push_back(new ScrubMap);
6314 o.back()->valid_through = eversion_t(1, 2);
6315 o.back()->incr_since = eversion_t(3, 4);
6316 list<object*> obj;
6317 object::generate_test_instances(obj);
6318 o.back()->objects[hobject_t(object_t("foo"), "fookey", 123, 456, 0, "")] = *obj.back();
6319 obj.pop_back();
6320 o.back()->objects[hobject_t(object_t("bar"), string(), 123, 456, 0, "")] = *obj.back();
6321 }
6322
6323 // -- ScrubMap::object --
6324
6325 void ScrubMap::object::encode(bufferlist& bl) const
6326 {
6327 bool compat_read_error = read_error || ec_hash_mismatch || ec_size_mismatch;
6328 ENCODE_START(10, 7, bl);
6329 encode(size, bl);
6330 encode(negative, bl);
6331 encode(attrs, bl);
6332 encode(digest, bl);
6333 encode(digest_present, bl);
6334 encode((uint32_t)0, bl); // obsolete nlinks
6335 encode((uint32_t)0, bl); // snapcolls
6336 encode(omap_digest, bl);
6337 encode(omap_digest_present, bl);
6338 encode(compat_read_error, bl);
6339 encode(stat_error, bl);
6340 encode(read_error, bl);
6341 encode(ec_hash_mismatch, bl);
6342 encode(ec_size_mismatch, bl);
6343 encode(large_omap_object_found, bl);
6344 encode(large_omap_object_key_count, bl);
6345 encode(large_omap_object_value_size, bl);
6346 encode(object_omap_bytes, bl);
6347 encode(object_omap_keys, bl);
6348 ENCODE_FINISH(bl);
6349 }
6350
6351 void ScrubMap::object::decode(bufferlist::const_iterator& bl)
6352 {
6353 DECODE_START(10, bl);
6354 decode(size, bl);
6355 bool tmp, compat_read_error = false;
6356 decode(tmp, bl);
6357 negative = tmp;
6358 decode(attrs, bl);
6359 decode(digest, bl);
6360 decode(tmp, bl);
6361 digest_present = tmp;
6362 {
6363 uint32_t nlinks;
6364 decode(nlinks, bl);
6365 set<snapid_t> snapcolls;
6366 decode(snapcolls, bl);
6367 }
6368 decode(omap_digest, bl);
6369 decode(tmp, bl);
6370 omap_digest_present = tmp;
6371 decode(compat_read_error, bl);
6372 decode(tmp, bl);
6373 stat_error = tmp;
6374 if (struct_v >= 8) {
6375 decode(tmp, bl);
6376 read_error = tmp;
6377 decode(tmp, bl);
6378 ec_hash_mismatch = tmp;
6379 decode(tmp, bl);
6380 ec_size_mismatch = tmp;
6381 }
6382 // If older encoder found a read_error, set read_error
6383 if (compat_read_error && !read_error && !ec_hash_mismatch && !ec_size_mismatch)
6384 read_error = true;
6385 if (struct_v >= 9) {
6386 decode(tmp, bl);
6387 large_omap_object_found = tmp;
6388 decode(large_omap_object_key_count, bl);
6389 decode(large_omap_object_value_size, bl);
6390 }
6391 if (struct_v >= 10) {
6392 decode(object_omap_bytes, bl);
6393 decode(object_omap_keys, bl);
6394 }
6395 DECODE_FINISH(bl);
6396 }
6397
6398 void ScrubMap::object::dump(Formatter *f) const
6399 {
6400 f->dump_int("size", size);
6401 f->dump_int("negative", negative);
6402 f->open_array_section("attrs");
6403 for (map<string,bufferptr>::const_iterator p = attrs.begin(); p != attrs.end(); ++p) {
6404 f->open_object_section("attr");
6405 f->dump_string("name", p->first);
6406 f->dump_int("length", p->second.length());
6407 f->close_section();
6408 }
6409 f->close_section();
6410 }
6411
6412 void ScrubMap::object::generate_test_instances(list<object*>& o)
6413 {
6414 o.push_back(new object);
6415 o.push_back(new object);
6416 o.back()->negative = true;
6417 o.push_back(new object);
6418 o.back()->size = 123;
6419 o.back()->attrs["foo"] = buffer::copy("foo", 3);
6420 o.back()->attrs["bar"] = buffer::copy("barval", 6);
6421 }
6422
6423 // -- OSDOp --
6424
6425 ostream& operator<<(ostream& out, const OSDOp& op)
6426 {
6427 out << ceph_osd_op_name(op.op.op);
6428 if (ceph_osd_op_type_data(op.op.op)) {
6429 // data extent
6430 switch (op.op.op) {
6431 case CEPH_OSD_OP_ASSERT_VER:
6432 out << " v" << op.op.assert_ver.ver;
6433 break;
6434 case CEPH_OSD_OP_TRUNCATE:
6435 out << " " << op.op.extent.offset;
6436 break;
6437 case CEPH_OSD_OP_MASKTRUNC:
6438 case CEPH_OSD_OP_TRIMTRUNC:
6439 out << " " << op.op.extent.truncate_seq << "@"
6440 << (int64_t)op.op.extent.truncate_size;
6441 break;
6442 case CEPH_OSD_OP_ROLLBACK:
6443 out << " " << snapid_t(op.op.snap.snapid);
6444 break;
6445 case CEPH_OSD_OP_WATCH:
6446 out << " " << ceph_osd_watch_op_name(op.op.watch.op)
6447 << " cookie " << op.op.watch.cookie;
6448 if (op.op.watch.gen)
6449 out << " gen " << op.op.watch.gen;
6450 break;
6451 case CEPH_OSD_OP_NOTIFY:
6452 out << " cookie " << op.op.notify.cookie;
6453 break;
6454 case CEPH_OSD_OP_COPY_GET:
6455 out << " max " << op.op.copy_get.max;
6456 break;
6457 case CEPH_OSD_OP_COPY_FROM:
6458 out << " ver " << op.op.copy_from.src_version;
6459 break;
6460 case CEPH_OSD_OP_SETALLOCHINT:
6461 out << " object_size " << op.op.alloc_hint.expected_object_size
6462 << " write_size " << op.op.alloc_hint.expected_write_size;
6463 break;
6464 case CEPH_OSD_OP_READ:
6465 case CEPH_OSD_OP_SPARSE_READ:
6466 case CEPH_OSD_OP_SYNC_READ:
6467 case CEPH_OSD_OP_WRITE:
6468 case CEPH_OSD_OP_WRITEFULL:
6469 case CEPH_OSD_OP_ZERO:
6470 case CEPH_OSD_OP_APPEND:
6471 case CEPH_OSD_OP_MAPEXT:
6472 case CEPH_OSD_OP_CMPEXT:
6473 out << " " << op.op.extent.offset << "~" << op.op.extent.length;
6474 if (op.op.extent.truncate_seq)
6475 out << " [" << op.op.extent.truncate_seq << "@"
6476 << (int64_t)op.op.extent.truncate_size << "]";
6477 if (op.op.flags)
6478 out << " [" << ceph_osd_op_flag_string(op.op.flags) << "]";
6479 default:
6480 // don't show any arg info
6481 break;
6482 }
6483 } else if (ceph_osd_op_type_attr(op.op.op)) {
6484 // xattr name
6485 if (op.op.xattr.name_len && op.indata.length()) {
6486 out << " ";
6487 op.indata.write(0, op.op.xattr.name_len, out);
6488 }
6489 if (op.op.xattr.value_len)
6490 out << " (" << op.op.xattr.value_len << ")";
6491 if (op.op.op == CEPH_OSD_OP_CMPXATTR)
6492 out << " op " << (int)op.op.xattr.cmp_op
6493 << " mode " << (int)op.op.xattr.cmp_mode;
6494 } else if (ceph_osd_op_type_exec(op.op.op)) {
6495 // class.method
6496 if (op.op.cls.class_len && op.indata.length()) {
6497 out << " ";
6498 op.indata.write(0, op.op.cls.class_len, out);
6499 out << ".";
6500 op.indata.write(op.op.cls.class_len, op.op.cls.method_len, out);
6501 }
6502 } else if (ceph_osd_op_type_pg(op.op.op)) {
6503 switch (op.op.op) {
6504 case CEPH_OSD_OP_PGLS:
6505 case CEPH_OSD_OP_PGLS_FILTER:
6506 case CEPH_OSD_OP_PGNLS:
6507 case CEPH_OSD_OP_PGNLS_FILTER:
6508 out << " start_epoch " << op.op.pgls.start_epoch;
6509 break;
6510 case CEPH_OSD_OP_PG_HITSET_LS:
6511 break;
6512 case CEPH_OSD_OP_PG_HITSET_GET:
6513 out << " " << utime_t(op.op.hit_set_get.stamp);
6514 break;
6515 case CEPH_OSD_OP_SCRUBLS:
6516 break;
6517 }
6518 }
6519 return out;
6520 }
6521
6522
6523 void OSDOp::split_osd_op_vector_in_data(vector<OSDOp>& ops, bufferlist& in)
6524 {
6525 bufferlist::iterator datap = in.begin();
6526 for (unsigned i = 0; i < ops.size(); i++) {
6527 if (ops[i].op.payload_len) {
6528 datap.copy(ops[i].op.payload_len, ops[i].indata);
6529 }
6530 }
6531 }
6532
6533 void OSDOp::merge_osd_op_vector_in_data(vector<OSDOp>& ops, bufferlist& out)
6534 {
6535 for (unsigned i = 0; i < ops.size(); i++) {
6536 if (ops[i].indata.length()) {
6537 ops[i].op.payload_len = ops[i].indata.length();
6538 out.append(ops[i].indata);
6539 }
6540 }
6541 }
6542
6543 void OSDOp::split_osd_op_vector_out_data(vector<OSDOp>& ops, bufferlist& in)
6544 {
6545 bufferlist::iterator datap = in.begin();
6546 for (unsigned i = 0; i < ops.size(); i++) {
6547 if (ops[i].op.payload_len) {
6548 datap.copy(ops[i].op.payload_len, ops[i].outdata);
6549 }
6550 }
6551 }
6552
6553 void OSDOp::merge_osd_op_vector_out_data(vector<OSDOp>& ops, bufferlist& out)
6554 {
6555 for (unsigned i = 0; i < ops.size(); i++) {
6556 if (ops[i].outdata.length()) {
6557 ops[i].op.payload_len = ops[i].outdata.length();
6558 out.append(ops[i].outdata);
6559 }
6560 }
6561 }
6562
6563 void OSDOp::clear_data(vector<OSDOp>& ops)
6564 {
6565 for (unsigned i = 0; i < ops.size(); i++) {
6566 OSDOp& op = ops[i];
6567 op.outdata.clear();
6568 if (ceph_osd_op_type_attr(op.op.op) &&
6569 op.op.xattr.name_len &&
6570 op.indata.length() >= op.op.xattr.name_len) {
6571 bufferptr bp(op.op.xattr.name_len);
6572 bufferlist bl;
6573 bl.append(bp);
6574 bl.copy_in(0, op.op.xattr.name_len, op.indata);
6575 op.indata.claim(bl);
6576 } else if (ceph_osd_op_type_exec(op.op.op) &&
6577 op.op.cls.class_len &&
6578 op.indata.length() >
6579 (op.op.cls.class_len + op.op.cls.method_len)) {
6580 __u8 len = op.op.cls.class_len + op.op.cls.method_len;
6581 bufferptr bp(len);
6582 bufferlist bl;
6583 bl.append(bp);
6584 bl.copy_in(0, len, op.indata);
6585 op.indata.claim(bl);
6586 } else {
6587 op.indata.clear();
6588 }
6589 }
6590 }