git.proxmox.com Git - ceph.git/blob - ceph/src/osd/osd_types.cc
update sources to 12.2.7
[ceph.git] / ceph / src / osd / osd_types.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2011 New Dream Network
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
18 #include <boost/assign/list_of.hpp>
19
20 #include "osd_types.h"
21 #include "include/ceph_features.h"
22 extern "C" {
23 #include "crush/hash.h"
24 }
25 #include "PG.h"
26 #include "OSDMap.h"
27 #include "PGBackend.h"
28
29 const char *ceph_osd_flag_name(unsigned flag)
30 {
31 switch (flag) {
32 case CEPH_OSD_FLAG_ACK: return "ack";
33 case CEPH_OSD_FLAG_ONNVRAM: return "onnvram";
34 case CEPH_OSD_FLAG_ONDISK: return "ondisk";
35 case CEPH_OSD_FLAG_RETRY: return "retry";
36 case CEPH_OSD_FLAG_READ: return "read";
37 case CEPH_OSD_FLAG_WRITE: return "write";
38 case CEPH_OSD_FLAG_ORDERSNAP: return "ordersnap";
39 case CEPH_OSD_FLAG_PEERSTAT_OLD: return "peerstat_old";
40 case CEPH_OSD_FLAG_BALANCE_READS: return "balance_reads";
41 case CEPH_OSD_FLAG_PARALLELEXEC: return "parallelexec";
42 case CEPH_OSD_FLAG_PGOP: return "pgop";
43 case CEPH_OSD_FLAG_EXEC: return "exec";
44 case CEPH_OSD_FLAG_EXEC_PUBLIC: return "exec_public";
45 case CEPH_OSD_FLAG_LOCALIZE_READS: return "localize_reads";
46 case CEPH_OSD_FLAG_RWORDERED: return "rwordered";
47 case CEPH_OSD_FLAG_IGNORE_CACHE: return "ignore_cache";
48 case CEPH_OSD_FLAG_SKIPRWLOCKS: return "skiprwlocks";
49 case CEPH_OSD_FLAG_IGNORE_OVERLAY: return "ignore_overlay";
50 case CEPH_OSD_FLAG_FLUSH: return "flush";
51 case CEPH_OSD_FLAG_MAP_SNAP_CLONE: return "map_snap_clone";
52 case CEPH_OSD_FLAG_ENFORCE_SNAPC: return "enforce_snapc";
53 case CEPH_OSD_FLAG_REDIRECTED: return "redirected";
54 case CEPH_OSD_FLAG_KNOWN_REDIR: return "known_if_redirected";
55 case CEPH_OSD_FLAG_FULL_TRY: return "full_try";
56 case CEPH_OSD_FLAG_FULL_FORCE: return "full_force";
57 case CEPH_OSD_FLAG_IGNORE_REDIRECT: return "ignore_redirect";
58 default: return "???";
59 }
60 }
61
62 string ceph_osd_flag_string(unsigned flags)
63 {
64 string s;
65 for (unsigned i=0; i<32; ++i) {
66 if (flags & (1u<<i)) {
67 if (s.length())
68 s += "+";
69 s += ceph_osd_flag_name(1u << i);
70 }
71 }
72 if (s.length())
73 return s;
74 return string("-");
75 }
76
77 const char * ceph_osd_op_flag_name(unsigned flag)
78 {
79 const char *name;
80
81 switch(flag) {
82 case CEPH_OSD_OP_FLAG_EXCL:
83 name = "excl";
84 break;
85 case CEPH_OSD_OP_FLAG_FAILOK:
86 name = "failok";
87 break;
88 case CEPH_OSD_OP_FLAG_FADVISE_RANDOM:
89 name = "fadvise_random";
90 break;
91 case CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL:
92 name = "fadvise_sequential";
93 break;
94 case CEPH_OSD_OP_FLAG_FADVISE_WILLNEED:
95 name = "favise_willneed";
96 break;
97 case CEPH_OSD_OP_FLAG_FADVISE_DONTNEED:
98 name = "fadvise_dontneed";
99 break;
100 case CEPH_OSD_OP_FLAG_FADVISE_NOCACHE:
101 name = "fadvise_nocache";
102 break;
103 default:
104 name = "???";
105 };
106
107 return name;
108 }
109
110 string ceph_osd_op_flag_string(unsigned flags)
111 {
112 string s;
113 for (unsigned i=0; i<32; ++i) {
114 if (flags & (1u<<i)) {
115 if (s.length())
116 s += "+";
117 s += ceph_osd_op_flag_name(1u << i);
118 }
119 }
120 if (s.length())
121 return s;
122 return string("-");
123 }
124
125 string ceph_osd_alloc_hint_flag_string(unsigned flags)
126 {
127 string s;
128 for (unsigned i=0; i<32; ++i) {
129 if (flags & (1u<<i)) {
130 if (s.length())
131 s += "+";
132 s += ceph_osd_alloc_hint_flag_name(1u << i);
133 }
134 }
135 if (s.length())
136 return s;
137 return string("-");
138 }
139
140 void pg_shard_t::encode(bufferlist &bl) const
141 {
142 ENCODE_START(1, 1, bl);
143 ::encode(osd, bl);
144 ::encode(shard, bl);
145 ENCODE_FINISH(bl);
146 }
147 void pg_shard_t::decode(bufferlist::iterator &bl)
148 {
149 DECODE_START(1, bl);
150 ::decode(osd, bl);
151 ::decode(shard, bl);
152 DECODE_FINISH(bl);
153 }
154
155 ostream &operator<<(ostream &lhs, const pg_shard_t &rhs)
156 {
157 if (rhs.is_undefined())
158 return lhs << "?";
159 if (rhs.shard == shard_id_t::NO_SHARD)
160 return lhs << rhs.get_osd();
161 return lhs << rhs.get_osd() << '(' << (unsigned)(rhs.shard) << ')';
162 }
163
164 // -- osd_reqid_t --
165 void osd_reqid_t::dump(Formatter *f) const
166 {
167 f->dump_stream("name") << name;
168 f->dump_int("inc", inc);
169 f->dump_unsigned("tid", tid);
170 }
171
172 void osd_reqid_t::generate_test_instances(list<osd_reqid_t*>& o)
173 {
174 o.push_back(new osd_reqid_t);
175 o.push_back(new osd_reqid_t(entity_name_t::CLIENT(123), 1, 45678));
176 }
177
178 // -- object_locator_t --
179
180 void object_locator_t::encode(bufferlist& bl) const
181 {
182 // verify that nobody's corrupted the locator
183 assert(hash == -1 || key.empty());
184 __u8 encode_compat = 3;
185 ENCODE_START(6, encode_compat, bl);
186 ::encode(pool, bl);
187 int32_t preferred = -1; // tell old code there is no preferred osd (-1).
188 ::encode(preferred, bl);
189 ::encode(key, bl);
190 ::encode(nspace, bl);
191 ::encode(hash, bl);
192 if (hash != -1)
193 encode_compat = MAX(encode_compat, 6); // need to interpret the hash
194 ENCODE_FINISH_NEW_COMPAT(bl, encode_compat);
195 }
196
197 void object_locator_t::decode(bufferlist::iterator& p)
198 {
199 DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, p);
200 if (struct_v < 2) {
201 int32_t op;
202 ::decode(op, p);
203 pool = op;
204 int16_t pref;
205 ::decode(pref, p);
206 } else {
207 ::decode(pool, p);
208 int32_t preferred;
209 ::decode(preferred, p);
210 }
211 ::decode(key, p);
212 if (struct_v >= 5)
213 ::decode(nspace, p);
214 if (struct_v >= 6)
215 ::decode(hash, p);
216 else
217 hash = -1;
218 DECODE_FINISH(p);
219 // verify that nobody's corrupted the locator
220 assert(hash == -1 || key.empty());
221 }
222
223 void object_locator_t::dump(Formatter *f) const
224 {
225 f->dump_int("pool", pool);
226 f->dump_string("key", key);
227 f->dump_string("namespace", nspace);
228 f->dump_int("hash", hash);
229 }
230
231 void object_locator_t::generate_test_instances(list<object_locator_t*>& o)
232 {
233 o.push_back(new object_locator_t);
234 o.push_back(new object_locator_t(123));
235 o.push_back(new object_locator_t(123, 876));
236 o.push_back(new object_locator_t(1, "n2"));
237 o.push_back(new object_locator_t(1234, "", "key"));
238 o.push_back(new object_locator_t(12, "n1", "key2"));
239 }
240
241 // -- request_redirect_t --
242 void request_redirect_t::encode(bufferlist& bl) const
243 {
244 ENCODE_START(1, 1, bl);
245 ::encode(redirect_locator, bl);
246 ::encode(redirect_object, bl);
247 ::encode(osd_instructions, bl);
248 ENCODE_FINISH(bl);
249 }
250
251 void request_redirect_t::decode(bufferlist::iterator& bl)
252 {
253 DECODE_START(1, bl);
254 ::decode(redirect_locator, bl);
255 ::decode(redirect_object, bl);
256 ::decode(osd_instructions, bl);
257 DECODE_FINISH(bl);
258 }
259
260 void request_redirect_t::dump(Formatter *f) const
261 {
262 f->dump_string("object", redirect_object);
263 f->open_object_section("locator");
264 redirect_locator.dump(f);
265 f->close_section(); // locator
266 }
267
268 void request_redirect_t::generate_test_instances(list<request_redirect_t*>& o)
269 {
270 object_locator_t loc(1, "redir_obj");
271 o.push_back(new request_redirect_t());
272 o.push_back(new request_redirect_t(loc, 0));
273 o.push_back(new request_redirect_t(loc, "redir_obj"));
274 o.push_back(new request_redirect_t(loc));
275 }
276
277 void objectstore_perf_stat_t::dump(Formatter *f) const
278 {
279 f->dump_unsigned("commit_latency_ms", os_commit_latency);
280 f->dump_unsigned("apply_latency_ms", os_apply_latency);
281 }
282
283 void objectstore_perf_stat_t::encode(bufferlist &bl) const
284 {
285 ENCODE_START(1, 1, bl);
286 ::encode(os_commit_latency, bl);
287 ::encode(os_apply_latency, bl);
288 ENCODE_FINISH(bl);
289 }
290
291 void objectstore_perf_stat_t::decode(bufferlist::iterator &bl)
292 {
293 DECODE_START(1, bl);
294 ::decode(os_commit_latency, bl);
295 ::decode(os_apply_latency, bl);
296 DECODE_FINISH(bl);
297 }
298
299 void objectstore_perf_stat_t::generate_test_instances(std::list<objectstore_perf_stat_t*>& o)
300 {
301 o.push_back(new objectstore_perf_stat_t());
302 o.push_back(new objectstore_perf_stat_t());
303 o.back()->os_commit_latency = 20;
304 o.back()->os_apply_latency = 30;
305 }
306
307 // -- osd_stat_t --
308 void osd_stat_t::dump(Formatter *f) const
309 {
310 f->dump_unsigned("up_from", up_from);
311 f->dump_unsigned("seq", seq);
312 f->dump_unsigned("num_pgs", num_pgs);
313 f->dump_unsigned("kb", kb);
314 f->dump_unsigned("kb_used", kb_used);
315 f->dump_unsigned("kb_avail", kb_avail);
316 f->open_array_section("hb_peers");
317 for (auto p : hb_peers)
318 f->dump_int("osd", p);
319 f->close_section();
320 f->dump_int("snap_trim_queue_len", snap_trim_queue_len);
321 f->dump_int("num_snap_trimming", num_snap_trimming);
322 f->open_object_section("op_queue_age_hist");
323 op_queue_age_hist.dump(f);
324 f->close_section();
325 f->open_object_section("perf_stat");
326 os_perf_stat.dump(f);
327 f->close_section();
328 }
329
330 void osd_stat_t::encode(bufferlist &bl) const
331 {
332 ENCODE_START(7, 2, bl);
333 ::encode(kb, bl);
334 ::encode(kb_used, bl);
335 ::encode(kb_avail, bl);
336 ::encode(snap_trim_queue_len, bl);
337 ::encode(num_snap_trimming, bl);
338 ::encode(hb_peers, bl);
339 ::encode((uint32_t)0, bl);
340 ::encode(op_queue_age_hist, bl);
341 ::encode(os_perf_stat, bl);
342 ::encode(up_from, bl);
343 ::encode(seq, bl);
344 ::encode(num_pgs, bl);
345 ENCODE_FINISH(bl);
346 }
347
348 void osd_stat_t::decode(bufferlist::iterator &bl)
349 {
350 DECODE_START_LEGACY_COMPAT_LEN(6, 2, 2, bl);
351 ::decode(kb, bl);
352 ::decode(kb_used, bl);
353 ::decode(kb_avail, bl);
354 ::decode(snap_trim_queue_len, bl);
355 ::decode(num_snap_trimming, bl);
356 ::decode(hb_peers, bl);
357 vector<int> num_hb_out;
358 ::decode(num_hb_out, bl);
359 if (struct_v >= 3)
360 ::decode(op_queue_age_hist, bl);
361 if (struct_v >= 4)
362 ::decode(os_perf_stat, bl);
363 if (struct_v >= 6) {
364 ::decode(up_from, bl);
365 ::decode(seq, bl);
366 }
367 if (struct_v >= 7) {
368 ::decode(num_pgs, bl);
369 }
370 DECODE_FINISH(bl);
371 }
372
373 void osd_stat_t::generate_test_instances(std::list<osd_stat_t*>& o)
374 {
375 o.push_back(new osd_stat_t);
376
377 o.push_back(new osd_stat_t);
378 o.back()->kb = 1;
379 o.back()->kb_used = 2;
380 o.back()->kb_avail = 3;
381 o.back()->hb_peers.push_back(7);
382 o.back()->snap_trim_queue_len = 8;
383 o.back()->num_snap_trimming = 99;
384 }
385
386 // -- pg_t --
387
388 int pg_t::print(char *o, int maxlen) const
389 {
390 if (preferred() >= 0)
391 return snprintf(o, maxlen, "%llu.%xp%d", (unsigned long long)pool(), ps(), preferred());
392 else
393 return snprintf(o, maxlen, "%llu.%x", (unsigned long long)pool(), ps());
394 }
395
396 bool pg_t::parse(const char *s)
397 {
398 uint64_t ppool;
399 uint32_t pseed;
400 int32_t pref;
401 int r = sscanf(s, "%llu.%xp%d", (long long unsigned *)&ppool, &pseed, &pref);
402 if (r < 2)
403 return false;
404 m_pool = ppool;
405 m_seed = pseed;
406 if (r == 3)
407 m_preferred = pref;
408 else
409 m_preferred = -1;
410 return true;
411 }
412
413 bool spg_t::parse(const char *s)
414 {
415 pgid.set_preferred(-1);
416 shard = shard_id_t::NO_SHARD;
417 uint64_t ppool;
418 uint32_t pseed;
419 int32_t pref;
420 uint32_t pshard;
421 int r = sscanf(s, "%llu.%x", (long long unsigned *)&ppool, &pseed);
422 if (r < 2)
423 return false;
424 pgid.set_pool(ppool);
425 pgid.set_ps(pseed);
426
427 const char *p = strchr(s, 'p');
428 if (p) {
429 r = sscanf(p, "p%d", &pref);
430 if (r == 1) {
431 pgid.set_preferred(pref);
432 } else {
433 return false;
434 }
435 }
436
437 p = strchr(s, 's');
438 if (p) {
439 r = sscanf(p, "s%d", &pshard);
440 if (r == 1) {
441 shard = shard_id_t(pshard);
442 } else {
443 return false;
444 }
445 }
446 return true;
447 }
448
449 char *spg_t::calc_name(char *buf, const char *suffix_backwords) const
450 {
451 while (*suffix_backwords)
452 *--buf = *suffix_backwords++;
453
454 if (!is_no_shard()) {
455 buf = ritoa<uint8_t, 10>((uint8_t)shard.id, buf);
456 *--buf = 's';
457 }
458
459 return pgid.calc_name(buf, "");
460 }
461
462 ostream& operator<<(ostream& out, const spg_t &pg)
463 {
464 char buf[spg_t::calc_name_buf_size];
465 buf[spg_t::calc_name_buf_size - 1] = '\0';
466 out << pg.calc_name(buf + spg_t::calc_name_buf_size - 1, "");
467 return out;
468 }
469
470 pg_t pg_t::get_ancestor(unsigned old_pg_num) const
471 {
472 int old_bits = cbits(old_pg_num);
473 int old_mask = (1 << old_bits) - 1;
474 pg_t ret = *this;
475 ret.m_seed = ceph_stable_mod(m_seed, old_pg_num, old_mask);
476 return ret;
477 }
478
479 bool pg_t::is_split(unsigned old_pg_num, unsigned new_pg_num, set<pg_t> *children) const
480 {
481 assert(m_seed < old_pg_num);
482 if (new_pg_num <= old_pg_num)
483 return false;
484
485 bool split = false;
486 if (true) {
487 unsigned old_bits = cbits(old_pg_num);
488 unsigned old_mask = (1 << old_bits) - 1;
489 for (unsigned n = 1; ; n++) {
490 unsigned next_bit = (n << (old_bits-1));
491 unsigned s = next_bit | m_seed;
492
493 if (s < old_pg_num || s == m_seed)
494 continue;
495 if (s >= new_pg_num)
496 break;
497 if ((unsigned)ceph_stable_mod(s, old_pg_num, old_mask) == m_seed) {
498 split = true;
499 if (children)
500 children->insert(pg_t(s, m_pool, m_preferred));
501 }
502 }
503 }
504 if (false) {
505 // brute force
506 int old_bits = cbits(old_pg_num);
507 int old_mask = (1 << old_bits) - 1;
508 for (unsigned x = old_pg_num; x < new_pg_num; ++x) {
509 unsigned o = ceph_stable_mod(x, old_pg_num, old_mask);
510 if (o == m_seed) {
511 split = true;
512 children->insert(pg_t(x, m_pool, m_preferred));
513 }
514 }
515 }
516 return split;
517 }
518
519 unsigned pg_t::get_split_bits(unsigned pg_num) const {
520 if (pg_num == 1)
521 return 0;
522 assert(pg_num > 1);
523
524 // Find unique p such that pg_num \in [2^(p-1), 2^p)
525 unsigned p = cbits(pg_num);
526 assert(p); // silence coverity #751330
527
528 if ((m_seed % (1<<(p-1))) < (pg_num % (1<<(p-1))))
529 return p;
530 else
531 return p - 1;
532 }
533
534 pg_t pg_t::get_parent() const
535 {
536 unsigned bits = cbits(m_seed);
537 assert(bits);
538 pg_t retval = *this;
539 retval.m_seed &= ~((~0)<<(bits - 1));
540 return retval;
541 }
542
543 hobject_t pg_t::get_hobj_start() const
544 {
545 return hobject_t(object_t(), string(), CEPH_NOSNAP, m_seed, m_pool,
546 string());
547 }
548
549 hobject_t pg_t::get_hobj_end(unsigned pg_num) const
550 {
551 // note: this assumes a bitwise sort; with the legacy nibblewise
552 // sort a PG did not always cover a single contiguous range of the
553 // (bit-reversed) hash range.
554 unsigned bits = get_split_bits(pg_num);
555 uint64_t rev_start = hobject_t::_reverse_bits(m_seed);
556 uint64_t rev_end = (rev_start | (0xffffffff >> bits)) + 1;
557 if (rev_end >= 0x100000000) {
558 assert(rev_end == 0x100000000);
559 return hobject_t::get_max();
560 } else {
561 return hobject_t(object_t(), string(), CEPH_NOSNAP,
562 hobject_t::_reverse_bits(rev_end), m_pool,
563 string());
564 }
565 }
566
567 void pg_t::dump(Formatter *f) const
568 {
569 f->dump_unsigned("pool", m_pool);
570 f->dump_unsigned("seed", m_seed);
571 f->dump_int("preferred_osd", m_preferred);
572 }
573
574 void pg_t::generate_test_instances(list<pg_t*>& o)
575 {
576 o.push_back(new pg_t);
577 o.push_back(new pg_t(1, 2, -1));
578 o.push_back(new pg_t(13123, 3, -1));
579 o.push_back(new pg_t(131223, 4, 23));
580 }
581
582 char *pg_t::calc_name(char *buf, const char *suffix_backwords) const
583 {
584 while (*suffix_backwords)
585 *--buf = *suffix_backwords++;
586
587 if (m_preferred >= 0)
588 *--buf ='p';
589
590 buf = ritoa<uint32_t, 16>(m_seed, buf);
591
592 *--buf = '.';
593
594 return ritoa<uint64_t, 10>(m_pool, buf);
595 }
596
597 ostream& operator<<(ostream& out, const pg_t &pg)
598 {
599 char buf[pg_t::calc_name_buf_size];
600 buf[pg_t::calc_name_buf_size - 1] = '\0';
601 out << pg.calc_name(buf + pg_t::calc_name_buf_size - 1, "");
602 return out;
603 }
604
605
606 // -- coll_t --
607
608 void coll_t::calc_str()
609 {
610 switch (type) {
611 case TYPE_META:
612 strcpy(_str_buff, "meta");
613 _str = _str_buff;
614 break;
615 case TYPE_PG:
616 _str_buff[spg_t::calc_name_buf_size - 1] = '\0';
617 _str = pgid.calc_name(_str_buff + spg_t::calc_name_buf_size - 1, "daeh_");
618 break;
619 case TYPE_PG_TEMP:
620 _str_buff[spg_t::calc_name_buf_size - 1] = '\0';
621 _str = pgid.calc_name(_str_buff + spg_t::calc_name_buf_size - 1, "PMET_");
622 break;
623 default:
624 assert(0 == "unknown collection type");
625 }
626 }
627
628 bool coll_t::parse(const std::string& s)
629 {
630 if (s == "meta") {
631 type = TYPE_META;
632 pgid = spg_t();
633 removal_seq = 0;
634 calc_str();
635 assert(s == _str);
636 return true;
637 }
638 if (s.find("_head") == s.length() - 5 &&
639 pgid.parse(s.substr(0, s.length() - 5))) {
640 type = TYPE_PG;
641 removal_seq = 0;
642 calc_str();
643 assert(s == _str);
644 return true;
645 }
646 if (s.find("_TEMP") == s.length() - 5 &&
647 pgid.parse(s.substr(0, s.length() - 5))) {
648 type = TYPE_PG_TEMP;
649 removal_seq = 0;
650 calc_str();
651 assert(s == _str);
652 return true;
653 }
654 return false;
655 }
656
657 void coll_t::encode(bufferlist& bl) const
658 {
659 // when changing this, remember to update encoded_size() too.
660 if (is_temp()) {
661 // can't express this as v2...
662 __u8 struct_v = 3;
663 ::encode(struct_v, bl);
664 ::encode(to_str(), bl);
665 } else {
666 __u8 struct_v = 2;
667 ::encode(struct_v, bl);
668 ::encode((__u8)type, bl);
669 ::encode(pgid, bl);
670 snapid_t snap = CEPH_NOSNAP;
671 ::encode(snap, bl);
672 }
673 }
674
675 size_t coll_t::encoded_size() const
676 {
677 size_t r = sizeof(__u8);
678 if (is_temp()) {
679 // v3
680 r += sizeof(__u32);
681 if (_str) {
682 r += strlen(_str);
683 }
684 } else {
685 // v2
686 // 1. type
687 r += sizeof(__u8);
688 // 2. pgid
689 // - encoding header
690 r += sizeof(ceph_le32) + 2 * sizeof(__u8);
691 // - pg_t
692 r += sizeof(__u8) + sizeof(uint64_t) + 2 * sizeof(uint32_t);
693 // - shard_id_t
694 r += sizeof(int8_t);
695 // 3. snapid_t
696 r += sizeof(uint64_t);
697 }
698
699 return r;
700 }
701
702 void coll_t::decode(bufferlist::iterator& bl)
703 {
704 __u8 struct_v;
705 ::decode(struct_v, bl);
706 switch (struct_v) {
707 case 1:
708 {
709 snapid_t snap;
710 ::decode(pgid, bl);
711 ::decode(snap, bl);
712
713 // infer the type
714 if (pgid == spg_t() && snap == 0) {
715 type = TYPE_META;
716 } else {
717 type = TYPE_PG;
718 }
719 removal_seq = 0;
720 }
721 break;
722
723 case 2:
724 {
725 __u8 _type;
726 snapid_t snap;
727 ::decode(_type, bl);
728 ::decode(pgid, bl);
729 ::decode(snap, bl);
730 type = (type_t)_type;
731 removal_seq = 0;
732 }
733 break;
734
735 case 3:
736 {
737 string str;
738 ::decode(str, bl);
739 bool ok = parse(str);
740 if (!ok)
741 throw std::domain_error(std::string("unable to parse pg ") + str);
742 }
743 break;
744
745 default:
746 {
747 ostringstream oss;
748 oss << "coll_t::decode(): don't know how to decode version "
749 << struct_v;
750 throw std::domain_error(oss.str());
751 }
752 }
753 }
754
755 void coll_t::dump(Formatter *f) const
756 {
757 f->dump_unsigned("type_id", (unsigned)type);
758 if (type != TYPE_META)
759 f->dump_stream("pgid") << pgid;
760 f->dump_string("name", to_str());
761 }
762
763 void coll_t::generate_test_instances(list<coll_t*>& o)
764 {
765 o.push_back(new coll_t());
766 o.push_back(new coll_t(spg_t(pg_t(1, 0), shard_id_t::NO_SHARD)));
767 o.push_back(new coll_t(o.back()->get_temp()));
768 o.push_back(new coll_t(spg_t(pg_t(3, 2), shard_id_t(12))));
769 o.push_back(new coll_t(o.back()->get_temp()));
770 o.push_back(new coll_t());
771 }
772
773 // ---
774
775 std::string pg_vector_string(const vector<int32_t> &a)
776 {
777 ostringstream oss;
778 oss << "[";
779 for (vector<int32_t>::const_iterator i = a.begin(); i != a.end(); ++i) {
780 if (i != a.begin())
781 oss << ",";
782 if (*i != CRUSH_ITEM_NONE)
783 oss << *i;
784 else
785 oss << "NONE";
786 }
787 oss << "]";
788 return oss.str();
789 }
790
791 std::string pg_state_string(int state)
792 {
793 ostringstream oss;
794 if (state & PG_STATE_STALE)
795 oss << "stale+";
796 if (state & PG_STATE_CREATING)
797 oss << "creating+";
798 if (state & PG_STATE_ACTIVE)
799 oss << "active+";
800 if (state & PG_STATE_ACTIVATING)
801 oss << "activating+";
802 if (state & PG_STATE_CLEAN)
803 oss << "clean+";
804 if (state & PG_STATE_RECOVERY_WAIT)
805 oss << "recovery_wait+";
806 if (state & PG_STATE_RECOVERY_TOOFULL)
807 oss << "recovery_toofull+";
808 if (state & PG_STATE_RECOVERING)
809 oss << "recovering+";
810 if (state & PG_STATE_FORCED_RECOVERY)
811 oss << "forced_recovery+";
812 if (state & PG_STATE_DOWN)
813 oss << "down+";
814 if (state & PG_STATE_RECOVERY_UNFOUND)
815 oss << "recovery_unfound+";
816 if (state & PG_STATE_BACKFILL_UNFOUND)
817 oss << "backfill_unfound+";
818 if (state & PG_STATE_UNDERSIZED)
819 oss << "undersized+";
820 if (state & PG_STATE_DEGRADED)
821 oss << "degraded+";
822 if (state & PG_STATE_REMAPPED)
823 oss << "remapped+";
824 if (state & PG_STATE_SCRUBBING)
825 oss << "scrubbing+";
826 if (state & PG_STATE_DEEP_SCRUB)
827 oss << "deep+";
828 if (state & PG_STATE_INCONSISTENT)
829 oss << "inconsistent+";
830 if (state & PG_STATE_PEERING)
831 oss << "peering+";
832 if (state & PG_STATE_REPAIR)
833 oss << "repair+";
834 if (state & PG_STATE_BACKFILL_WAIT)
835 oss << "backfill_wait+";
836 if (state & PG_STATE_BACKFILLING)
837 oss << "backfilling+";
838 if (state & PG_STATE_FORCED_BACKFILL)
839 oss << "forced_backfill+";
840 if (state & PG_STATE_BACKFILL_TOOFULL)
841 oss << "backfill_toofull+";
842 if (state & PG_STATE_INCOMPLETE)
843 oss << "incomplete+";
844 if (state & PG_STATE_PEERED)
845 oss << "peered+";
846 if (state & PG_STATE_SNAPTRIM)
847 oss << "snaptrim+";
848 if (state & PG_STATE_SNAPTRIM_WAIT)
849 oss << "snaptrim_wait+";
850 if (state & PG_STATE_SNAPTRIM_ERROR)
851 oss << "snaptrim_error+";
852 string ret(oss.str());
853 if (ret.length() > 0)
854 ret.resize(ret.length() - 1);
855 else
856 ret = "unknown";
857 return ret;
858 }
859
860 boost::optional<uint64_t> pg_string_state(const std::string& state)
861 {
862 boost::optional<uint64_t> type;
863 if (state == "active")
864 type = PG_STATE_ACTIVE;
865 else if (state == "clean")
866 type = PG_STATE_CLEAN;
867 else if (state == "down")
868 type = PG_STATE_DOWN;
869 else if (state == "recovery_unfound")
870 type = PG_STATE_RECOVERY_UNFOUND;
871 else if (state == "backfill_unfound")
872 type = PG_STATE_BACKFILL_UNFOUND;
873 else if (state == "scrubbing")
874 type = PG_STATE_SCRUBBING;
875 else if (state == "degraded")
876 type = PG_STATE_DEGRADED;
877 else if (state == "inconsistent")
878 type = PG_STATE_INCONSISTENT;
879 else if (state == "peering")
880 type = PG_STATE_PEERING;
881 else if (state == "repair")
882 type = PG_STATE_REPAIR;
883 else if (state == "recovering")
884 type = PG_STATE_RECOVERING;
885 else if (state == "forced_recovery")
886 type = PG_STATE_FORCED_RECOVERY;
887 else if (state == "backfill_wait")
888 type = PG_STATE_BACKFILL_WAIT;
889 else if (state == "incomplete")
890 type = PG_STATE_INCOMPLETE;
891 else if (state == "stale")
892 type = PG_STATE_STALE;
893 else if (state == "remapped")
894 type = PG_STATE_REMAPPED;
895 else if (state == "deep")
896 type = PG_STATE_DEEP_SCRUB;
897 else if (state == "backfilling")
898 type = PG_STATE_BACKFILLING;
899 else if (state == "forced_backfill")
900 type = PG_STATE_FORCED_BACKFILL;
901 else if (state == "backfill_toofull")
902 type = PG_STATE_BACKFILL_TOOFULL;
903 else if (state == "recovery_wait")
904 type = PG_STATE_RECOVERY_WAIT;
905 else if (state == "recovery_toofull")
906 type = PG_STATE_RECOVERY_TOOFULL;
907 else if (state == "undersized")
908 type = PG_STATE_UNDERSIZED;
909 else if (state == "activating")
910 type = PG_STATE_ACTIVATING;
911 else if (state == "peered")
912 type = PG_STATE_PEERED;
913 else if (state == "snaptrim")
914 type = PG_STATE_SNAPTRIM;
915 else if (state == "snaptrim_wait")
916 type = PG_STATE_SNAPTRIM_WAIT;
917 else if (state == "snaptrim_error")
918 type = PG_STATE_SNAPTRIM_ERROR;
919 else
920 type = boost::none;
921 return type;
922 }
923
924 // -- eversion_t --
925 string eversion_t::get_key_name() const
926 {
927 char key[32];
928 // Below is equivalent of sprintf("%010u.%020llu");
929 key[31] = 0;
930 ritoa<uint64_t, 10, 20>(version, key + 31);
931 key[10] = '.';
932 ritoa<uint32_t, 10, 10>(epoch, key + 10);
933 return string(key);
934 }
935
936
937 // -- pool_snap_info_t --
938 void pool_snap_info_t::dump(Formatter *f) const
939 {
940 f->dump_unsigned("snapid", snapid);
941 f->dump_stream("stamp") << stamp;
942 f->dump_string("name", name);
943 }
944
945 void pool_snap_info_t::encode(bufferlist& bl, uint64_t features) const
946 {
947 if ((features & CEPH_FEATURE_PGPOOL3) == 0) {
948 __u8 struct_v = 1;
949 ::encode(struct_v, bl);
950 ::encode(snapid, bl);
951 ::encode(stamp, bl);
952 ::encode(name, bl);
953 return;
954 }
955 ENCODE_START(2, 2, bl);
956 ::encode(snapid, bl);
957 ::encode(stamp, bl);
958 ::encode(name, bl);
959 ENCODE_FINISH(bl);
960 }
961
962 void pool_snap_info_t::decode(bufferlist::iterator& bl)
963 {
964 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
965 ::decode(snapid, bl);
966 ::decode(stamp, bl);
967 ::decode(name, bl);
968 DECODE_FINISH(bl);
969 }
970
971 void pool_snap_info_t::generate_test_instances(list<pool_snap_info_t*>& o)
972 {
973 o.push_back(new pool_snap_info_t);
974 o.push_back(new pool_snap_info_t);
975 o.back()->snapid = 1;
976 o.back()->stamp = utime_t(1, 2);
977 o.back()->name = "foo";
978 }
979
980 // -- pool_opts_t --
981
982 typedef std::map<std::string, pool_opts_t::opt_desc_t> opt_mapping_t;
983 static opt_mapping_t opt_mapping = boost::assign::map_list_of
984 ("scrub_min_interval", pool_opts_t::opt_desc_t(
985 pool_opts_t::SCRUB_MIN_INTERVAL, pool_opts_t::DOUBLE))
986 ("scrub_max_interval", pool_opts_t::opt_desc_t(
987 pool_opts_t::SCRUB_MAX_INTERVAL, pool_opts_t::DOUBLE))
988 ("deep_scrub_interval", pool_opts_t::opt_desc_t(
989 pool_opts_t::DEEP_SCRUB_INTERVAL, pool_opts_t::DOUBLE))
990 ("recovery_priority", pool_opts_t::opt_desc_t(
991 pool_opts_t::RECOVERY_PRIORITY, pool_opts_t::INT))
992 ("recovery_op_priority", pool_opts_t::opt_desc_t(
993 pool_opts_t::RECOVERY_OP_PRIORITY, pool_opts_t::INT))
994 ("scrub_priority", pool_opts_t::opt_desc_t(
995 pool_opts_t::SCRUB_PRIORITY, pool_opts_t::INT))
996 ("compression_mode", pool_opts_t::opt_desc_t(
997 pool_opts_t::COMPRESSION_MODE, pool_opts_t::STR))
998 ("compression_algorithm", pool_opts_t::opt_desc_t(
999 pool_opts_t::COMPRESSION_ALGORITHM, pool_opts_t::STR))
1000 ("compression_required_ratio", pool_opts_t::opt_desc_t(
1001 pool_opts_t::COMPRESSION_REQUIRED_RATIO, pool_opts_t::DOUBLE))
1002 ("compression_max_blob_size", pool_opts_t::opt_desc_t(
1003 pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, pool_opts_t::INT))
1004 ("compression_min_blob_size", pool_opts_t::opt_desc_t(
1005 pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, pool_opts_t::INT))
1006 ("csum_type", pool_opts_t::opt_desc_t(
1007 pool_opts_t::CSUM_TYPE, pool_opts_t::INT))
1008 ("csum_max_block", pool_opts_t::opt_desc_t(
1009 pool_opts_t::CSUM_MAX_BLOCK, pool_opts_t::INT))
1010 ("csum_min_block", pool_opts_t::opt_desc_t(
1011 pool_opts_t::CSUM_MIN_BLOCK, pool_opts_t::INT));
1012
1013 bool pool_opts_t::is_opt_name(const std::string& name) {
1014 return opt_mapping.count(name);
1015 }
1016
1017 pool_opts_t::opt_desc_t pool_opts_t::get_opt_desc(const std::string& name) {
1018 opt_mapping_t::iterator i = opt_mapping.find(name);
1019 assert(i != opt_mapping.end());
1020 return i->second;
1021 }
1022
1023 bool pool_opts_t::is_set(pool_opts_t::key_t key) const {
1024 return opts.count(key);
1025 }
1026
1027 const pool_opts_t::value_t& pool_opts_t::get(pool_opts_t::key_t key) const {
1028 opts_t::const_iterator i = opts.find(key);
1029 assert(i != opts.end());
1030 return i->second;
1031 }
1032
1033 bool pool_opts_t::unset(pool_opts_t::key_t key) {
1034 return opts.erase(key) > 0;
1035 }
1036
1037 class pool_opts_dumper_t : public boost::static_visitor<>
1038 {
1039 public:
1040 pool_opts_dumper_t(const std::string& name_, Formatter* f_) :
1041 name(name_.c_str()), f(f_) {}
1042
1043 void operator()(std::string s) const {
1044 f->dump_string(name, s);
1045 }
1046 void operator()(int i) const {
1047 f->dump_int(name, i);
1048 }
1049 void operator()(double d) const {
1050 f->dump_float(name, d);
1051 }
1052
1053 private:
1054 const char* name;
1055 Formatter* f;
1056 };
1057
1058 void pool_opts_t::dump(const std::string& name, Formatter* f) const
1059 {
1060 const opt_desc_t& desc = get_opt_desc(name);
1061 opts_t::const_iterator i = opts.find(desc.key);
1062 if (i == opts.end()) {
1063 return;
1064 }
1065 boost::apply_visitor(pool_opts_dumper_t(name, f), i->second);
1066 }
1067
1068 void pool_opts_t::dump(Formatter* f) const
1069 {
1070 for (opt_mapping_t::iterator i = opt_mapping.begin(); i != opt_mapping.end();
1071 ++i) {
1072 const std::string& name = i->first;
1073 const opt_desc_t& desc = i->second;
1074 opts_t::const_iterator j = opts.find(desc.key);
1075 if (j == opts.end()) {
1076 continue;
1077 }
1078 boost::apply_visitor(pool_opts_dumper_t(name, f), j->second);
1079 }
1080 }
1081
1082 class pool_opts_encoder_t : public boost::static_visitor<>
1083 {
1084 public:
1085 explicit pool_opts_encoder_t(bufferlist& bl_) : bl(bl_) {}
1086
1087 void operator()(std::string s) const {
1088 ::encode(static_cast<int32_t>(pool_opts_t::STR), bl);
1089 ::encode(s, bl);
1090 }
1091 void operator()(int i) const {
1092 ::encode(static_cast<int32_t>(pool_opts_t::INT), bl);
1093 ::encode(i, bl);
1094 }
1095 void operator()(double d) const {
1096 ::encode(static_cast<int32_t>(pool_opts_t::DOUBLE), bl);
1097 ::encode(d, bl);
1098 }
1099
1100 private:
1101 bufferlist& bl;
1102 };
1103
1104 void pool_opts_t::encode(bufferlist& bl) const {
1105 ENCODE_START(1, 1, bl);
1106 uint32_t n = static_cast<uint32_t>(opts.size());
1107 ::encode(n, bl);
1108 for (opts_t::const_iterator i = opts.begin(); i != opts.end(); ++i) {
1109 ::encode(static_cast<int32_t>(i->first), bl);
1110 boost::apply_visitor(pool_opts_encoder_t(bl), i->second);
1111 }
1112 ENCODE_FINISH(bl);
1113 }
1114
1115 void pool_opts_t::decode(bufferlist::iterator& bl) {
1116 DECODE_START(1, bl);
1117 __u32 n;
1118 ::decode(n, bl);
1119 opts.clear();
1120 while (n--) {
1121 int32_t k, t;
1122 ::decode(k, bl);
1123 ::decode(t, bl);
1124 if (t == STR) {
1125 std::string s;
1126 ::decode(s, bl);
1127 opts[static_cast<key_t>(k)] = s;
1128 } else if (t == INT) {
1129 int i;
1130 ::decode(i, bl);
1131 opts[static_cast<key_t>(k)] = i;
1132 } else if (t == DOUBLE) {
1133 double d;
1134 ::decode(d, bl);
1135 opts[static_cast<key_t>(k)] = d;
1136 } else {
1137 assert(!"invalid type");
1138 }
1139 }
1140 DECODE_FINISH(bl);
1141 }
1142
1143 ostream& operator<<(ostream& out, const pool_opts_t& opts)
1144 {
1145 for (opt_mapping_t::iterator i = opt_mapping.begin(); i != opt_mapping.end();
1146 ++i) {
1147 const std::string& name = i->first;
1148 const pool_opts_t::opt_desc_t& desc = i->second;
1149 pool_opts_t::opts_t::const_iterator j = opts.opts.find(desc.key);
1150 if (j == opts.opts.end()) {
1151 continue;
1152 }
1153 out << " " << name << " " << j->second;
1154 }
1155 return out;
1156 }
1157
1158 // -- pg_pool_t --
1159
1160 const char *pg_pool_t::APPLICATION_NAME_CEPHFS("cephfs");
1161 const char *pg_pool_t::APPLICATION_NAME_RBD("rbd");
1162 const char *pg_pool_t::APPLICATION_NAME_RGW("rgw");
1163
1164 void pg_pool_t::dump(Formatter *f) const
1165 {
1166 f->dump_unsigned("flags", get_flags());
1167 f->dump_string("flags_names", get_flags_string());
1168 f->dump_int("type", get_type());
1169 f->dump_int("size", get_size());
1170 f->dump_int("min_size", get_min_size());
1171 f->dump_int("crush_rule", get_crush_rule());
1172 f->dump_int("object_hash", get_object_hash());
1173 f->dump_unsigned("pg_num", get_pg_num());
1174 f->dump_unsigned("pg_placement_num", get_pgp_num());
1175 f->dump_unsigned("crash_replay_interval", get_crash_replay_interval());
1176 f->dump_stream("last_change") << get_last_change();
1177 f->dump_stream("last_force_op_resend") << get_last_force_op_resend();
1178 f->dump_stream("last_force_op_resend_preluminous")
1179 << get_last_force_op_resend_preluminous();
1180 f->dump_unsigned("auid", get_auid());
1181 f->dump_string("snap_mode", is_pool_snaps_mode() ? "pool" : "selfmanaged");
1182 f->dump_unsigned("snap_seq", get_snap_seq());
1183 f->dump_unsigned("snap_epoch", get_snap_epoch());
1184 f->open_array_section("pool_snaps");
1185 for (map<snapid_t, pool_snap_info_t>::const_iterator p = snaps.begin(); p != snaps.end(); ++p) {
1186 f->open_object_section("pool_snap_info");
1187 p->second.dump(f);
1188 f->close_section();
1189 }
1190 f->close_section();
1191 f->dump_stream("removed_snaps") << removed_snaps;
1192 f->dump_unsigned("quota_max_bytes", quota_max_bytes);
1193 f->dump_unsigned("quota_max_objects", quota_max_objects);
1194 f->open_array_section("tiers");
1195 for (set<uint64_t>::const_iterator p = tiers.begin(); p != tiers.end(); ++p)
1196 f->dump_unsigned("pool_id", *p);
1197 f->close_section();
1198 f->dump_int("tier_of", tier_of);
1199 f->dump_int("read_tier", read_tier);
1200 f->dump_int("write_tier", write_tier);
1201 f->dump_string("cache_mode", get_cache_mode_name());
1202 f->dump_unsigned("target_max_bytes", target_max_bytes);
1203 f->dump_unsigned("target_max_objects", target_max_objects);
1204 f->dump_unsigned("cache_target_dirty_ratio_micro",
1205 cache_target_dirty_ratio_micro);
1206 f->dump_unsigned("cache_target_dirty_high_ratio_micro",
1207 cache_target_dirty_high_ratio_micro);
1208 f->dump_unsigned("cache_target_full_ratio_micro",
1209 cache_target_full_ratio_micro);
1210 f->dump_unsigned("cache_min_flush_age", cache_min_flush_age);
1211 f->dump_unsigned("cache_min_evict_age", cache_min_evict_age);
1212 f->dump_string("erasure_code_profile", erasure_code_profile);
1213 f->open_object_section("hit_set_params");
1214 hit_set_params.dump(f);
1215 f->close_section(); // hit_set_params
1216 f->dump_unsigned("hit_set_period", hit_set_period);
1217 f->dump_unsigned("hit_set_count", hit_set_count);
1218 f->dump_bool("use_gmt_hitset", use_gmt_hitset);
1219 f->dump_unsigned("min_read_recency_for_promote", min_read_recency_for_promote);
1220 f->dump_unsigned("min_write_recency_for_promote", min_write_recency_for_promote);
1221 f->dump_unsigned("hit_set_grade_decay_rate", hit_set_grade_decay_rate);
1222 f->dump_unsigned("hit_set_search_last_n", hit_set_search_last_n);
1223 f->open_array_section("grade_table");
1224 for (unsigned i = 0; i < hit_set_count; ++i)
1225 f->dump_unsigned("value", get_grade(i));
1226 f->close_section();
1227 f->dump_unsigned("stripe_width", get_stripe_width());
1228 f->dump_unsigned("expected_num_objects", expected_num_objects);
1229 f->dump_bool("fast_read", fast_read);
1230 f->open_object_section("options");
1231 opts.dump(f);
1232 f->close_section(); // options
1233 f->open_object_section("application_metadata");
1234 for (auto &app_pair : application_metadata) {
1235 f->open_object_section(app_pair.first.c_str());
1236 for (auto &kv_pair : app_pair.second) {
1237 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
1238 }
1239 f->close_section(); // application
1240 }
1241 f->close_section(); // application_metadata
1242 }
1243
1244 void pg_pool_t::convert_to_pg_shards(const vector<int> &from, set<pg_shard_t>* to) const {
1245 for (size_t i = 0; i < from.size(); ++i) {
1246 if (from[i] != CRUSH_ITEM_NONE) {
1247 to->insert(
1248 pg_shard_t(
1249 from[i],
1250 ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
1251 }
1252 }
1253 }
1254
1255 void pg_pool_t::calc_pg_masks()
1256 {
1257 pg_num_mask = (1 << cbits(pg_num-1)) - 1;
1258 pgp_num_mask = (1 << cbits(pgp_num-1)) - 1;
1259 }
1260
1261 unsigned pg_pool_t::get_pg_num_divisor(pg_t pgid) const
1262 {
1263 if (pg_num == pg_num_mask + 1)
1264 return pg_num; // power-of-2 split
1265 unsigned mask = pg_num_mask >> 1;
1266 if ((pgid.ps() & mask) < (pg_num & mask))
1267 return pg_num_mask + 1; // smaller bin size (already split)
1268 else
1269 return (pg_num_mask + 1) >> 1; // bigger bin (not yet split)
1270 }
1271
1272 /*
1273 * we have two snap modes:
1274 * - pool global snaps
1275 * - snap existence/non-existence defined by snaps[] and snap_seq
1276 * - user managed snaps
1277 * - removal governed by removed_snaps
1278 *
1279 * we know which mode we're using based on whether removed_snaps is empty.
1280 */
1281 bool pg_pool_t::is_pool_snaps_mode() const
1282 {
1283 return removed_snaps.empty() && get_snap_seq() > 0;
1284 }
1285
1286 bool pg_pool_t::is_unmanaged_snaps_mode() const
1287 {
1288 return removed_snaps.size() && get_snap_seq() > 0;
1289 }
1290
1291 bool pg_pool_t::is_removed_snap(snapid_t s) const
1292 {
1293 if (is_pool_snaps_mode())
1294 return s <= get_snap_seq() && snaps.count(s) == 0;
1295 else
1296 return removed_snaps.contains(s);
1297 }
1298
1299 /*
1300 * build set of known-removed sets from either pool snaps or
1301 * explicit removed_snaps set.
1302 */
1303 void pg_pool_t::build_removed_snaps(interval_set<snapid_t>& rs) const
1304 {
1305 if (is_pool_snaps_mode()) {
1306 rs.clear();
1307 for (snapid_t s = 1; s <= get_snap_seq(); s = s + 1)
1308 if (snaps.count(s) == 0)
1309 rs.insert(s);
1310 } else {
1311 rs = removed_snaps;
1312 }
1313 }
1314
1315 snapid_t pg_pool_t::snap_exists(const char *s) const
1316 {
1317 for (map<snapid_t,pool_snap_info_t>::const_iterator p = snaps.begin();
1318 p != snaps.end();
1319 ++p)
1320 if (p->second.name == s)
1321 return p->second.snapid;
1322 return 0;
1323 }
1324
1325 void pg_pool_t::add_snap(const char *n, utime_t stamp)
1326 {
1327 assert(!is_unmanaged_snaps_mode());
1328 snapid_t s = get_snap_seq() + 1;
1329 snap_seq = s;
1330 snaps[s].snapid = s;
1331 snaps[s].name = n;
1332 snaps[s].stamp = stamp;
1333 }
1334
1335 void pg_pool_t::add_unmanaged_snap(uint64_t& snapid)
1336 {
1337 if (removed_snaps.empty()) {
1338 assert(!is_pool_snaps_mode());
1339 removed_snaps.insert(snapid_t(1));
1340 snap_seq = 1;
1341 }
1342 snapid = snap_seq = snap_seq + 1;
1343 }
1344
1345 void pg_pool_t::remove_snap(snapid_t s)
1346 {
1347 assert(snaps.count(s));
1348 snaps.erase(s);
1349 snap_seq = snap_seq + 1;
1350 }
1351
1352 void pg_pool_t::remove_unmanaged_snap(snapid_t s)
1353 {
1354 assert(is_unmanaged_snaps_mode());
1355 removed_snaps.insert(s);
1356 snap_seq = snap_seq + 1;
1357 // try to add in the new seq, just to try to keep the interval_set contiguous
1358 if (!removed_snaps.contains(get_snap_seq())) {
1359 removed_snaps.insert(get_snap_seq());
1360 }
1361 }
1362
1363 SnapContext pg_pool_t::get_snap_context() const
1364 {
1365 vector<snapid_t> s(snaps.size());
1366 unsigned i = 0;
1367 for (map<snapid_t, pool_snap_info_t>::const_reverse_iterator p = snaps.rbegin();
1368 p != snaps.rend();
1369 ++p)
1370 s[i++] = p->first;
1371 return SnapContext(get_snap_seq(), s);
1372 }
1373
1374 uint32_t pg_pool_t::hash_key(const string& key, const string& ns) const
1375 {
1376 if (ns.empty())
1377 return ceph_str_hash(object_hash, key.data(), key.length());
1378 int nsl = ns.length();
1379 int len = key.length() + nsl + 1;
1380 char buf[len];
1381 memcpy(&buf[0], ns.data(), nsl);
1382 buf[nsl] = '\037';
1383 memcpy(&buf[nsl+1], key.data(), key.length());
1384 return ceph_str_hash(object_hash, &buf[0], len);
1385 }
1386
1387 uint32_t pg_pool_t::raw_hash_to_pg(uint32_t v) const
1388 {
1389 return ceph_stable_mod(v, pg_num, pg_num_mask);
1390 }
1391
1392 /*
1393 * map a raw pg (with full precision ps) into an actual pg, for storage
1394 */
1395 pg_t pg_pool_t::raw_pg_to_pg(pg_t pg) const
1396 {
1397 pg.set_ps(ceph_stable_mod(pg.ps(), pg_num, pg_num_mask));
1398 return pg;
1399 }
1400
1401 /*
1402 * map raw pg (full precision ps) into a placement seed. include
1403 * pool id in that value so that different pools don't use the same
1404 * seeds.
1405 */
1406 ps_t pg_pool_t::raw_pg_to_pps(pg_t pg) const
1407 {
1408 if (flags & FLAG_HASHPSPOOL) {
1409 // Hash the pool id so that pool PGs do not overlap.
1410 return
1411 crush_hash32_2(CRUSH_HASH_RJENKINS1,
1412 ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask),
1413 pg.pool());
1414 } else {
1415 // Legacy behavior; add ps and pool together. This is not a great
1416 // idea because the PGs from each pool will essentially overlap on
1417 // top of each other: 0.5 == 1.4 == 2.3 == ...
1418 return
1419 ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask) +
1420 pg.pool();
1421 }
1422 }
1423
1424 uint32_t pg_pool_t::get_random_pg_position(pg_t pg, uint32_t seed) const
1425 {
1426 uint32_t r = crush_hash32_2(CRUSH_HASH_RJENKINS1, seed, 123);
1427 if (pg_num == pg_num_mask + 1) {
1428 r &= ~pg_num_mask;
1429 } else {
1430 unsigned smaller_mask = pg_num_mask >> 1;
1431 if ((pg.ps() & smaller_mask) < (pg_num & smaller_mask)) {
1432 r &= ~pg_num_mask;
1433 } else {
1434 r &= ~smaller_mask;
1435 }
1436 }
1437 r |= pg.ps();
1438 return r;
1439 }
1440
// Encode this pool into bl, choosing the layout from the feature bits.
// Four layouts are produced, tested from oldest to newest:
//   - CEPH_FEATURE_PGPOOL3 missing:        struct_v 2 written by hand, no
//                                          ENCODE_START/ENCODE_FINISH framing
//   - CEPH_FEATURE_OSDENC missing:         struct_v 4 written by hand, no
//                                          ENCODE_START/ENCODE_FINISH framing
//   - CEPH_FEATURE_OSD_POOLRESEND missing: ENCODE_START(14, 5, ...)
//   - otherwise:                           ENCODE_START(v, 5, ...) with
//                                          v = 21, 24 or 26
// Every layout starts with the same fields (type .. snap_epoch) in the same
// order; the statement order within each branch is the wire order.
1441 void pg_pool_t::encode(bufferlist& bl, uint64_t features) const
1442 {
1443 if ((features & CEPH_FEATURE_PGPOOL3) == 0) {
1444 // this encoding matches the old struct ceph_pg_pool
1445 __u8 struct_v = 2;
1446 ::encode(struct_v, bl);
1447 ::encode(type, bl);
1448 ::encode(size, bl);
1449 ::encode(crush_rule, bl);
1450 ::encode(object_hash, bl);
1451 ::encode(pg_num, bl);
1452 ::encode(pgp_num, bl);
1453 __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs.
1454 ::encode(lpg_num, bl);
1455 ::encode(lpgp_num, bl);
1456 ::encode(last_change, bl);
1457 ::encode(snap_seq, bl);
1458 ::encode(snap_epoch, bl);
1459
// In this layout the element counts of snaps and removed_snaps are written
// up front, then auid, and only then the two containers without their own
// length headers (encode_nohead).
1460 __u32 n = snaps.size();
1461 ::encode(n, bl);
1462 n = removed_snaps.num_intervals();
1463 ::encode(n, bl);
1464
1465 ::encode(auid, bl);
1466
1467 ::encode_nohead(snaps, bl, features);
1468 ::encode_nohead(removed_snaps, bl);
1469 return;
1470 }
1471
// struct_v 4: same leading fields, but snaps and removed_snaps are encoded
// with their normal headers, followed by auid, flags and
// crash_replay_interval.
1472 if ((features & CEPH_FEATURE_OSDENC) == 0) {
1473 __u8 struct_v = 4;
1474 ::encode(struct_v, bl);
1475 ::encode(type, bl);
1476 ::encode(size, bl);
1477 ::encode(crush_rule, bl);
1478 ::encode(object_hash, bl);
1479 ::encode(pg_num, bl);
1480 ::encode(pgp_num, bl);
1481 __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs.
1482 ::encode(lpg_num, bl);
1483 ::encode(lpgp_num, bl);
1484 ::encode(last_change, bl);
1485 ::encode(snap_seq, bl);
1486 ::encode(snap_epoch, bl);
1487 ::encode(snaps, bl, features);
1488 ::encode(removed_snaps, bl);
1489 ::encode(auid, bl);
1490 ::encode(flags, bl);
1491 ::encode(crash_replay_interval, bl);
1492 return;
1493 }
1494
1495 if ((features & CEPH_FEATURE_OSD_POOLRESEND) == 0) {
1496 // we simply added last_force_op_resend here, which is a fully
1497 // backward compatible change. however, encoding the same map
1498 // differently between monitors triggers scrub noise (even though
1499 // they are decodable without the feature), so let's be pedantic
1500 // about it.
1501 ENCODE_START(14, 5, bl);
1502 ::encode(type, bl);
1503 ::encode(size, bl);
1504 ::encode(crush_rule, bl);
1505 ::encode(object_hash, bl);
1506 ::encode(pg_num, bl);
1507 ::encode(pgp_num, bl);
1508 __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs.
1509 ::encode(lpg_num, bl);
1510 ::encode(lpgp_num, bl);
1511 ::encode(last_change, bl);
1512 ::encode(snap_seq, bl);
1513 ::encode(snap_epoch, bl);
1514 ::encode(snaps, bl, features);
1515 ::encode(removed_snaps, bl);
1516 ::encode(auid, bl);
1517 ::encode(flags, bl);
1518 ::encode(crash_replay_interval, bl);
1519 ::encode(min_size, bl);
1520 ::encode(quota_max_bytes, bl);
1521 ::encode(quota_max_objects, bl);
1522 ::encode(tiers, bl);
1523 ::encode(tier_of, bl);
// cache_mode is written as a single byte
1524 __u8 c = cache_mode;
1525 ::encode(c, bl);
1526 ::encode(read_tier, bl);
1527 ::encode(write_tier, bl);
1528 ::encode(properties, bl);
1529 ::encode(hit_set_params, bl);
1530 ::encode(hit_set_period, bl);
1531 ::encode(hit_set_count, bl);
1532 ::encode(stripe_width, bl);
1533 ::encode(target_max_bytes, bl);
1534 ::encode(target_max_objects, bl);
1535 ::encode(cache_target_dirty_ratio_micro, bl);
1536 ::encode(cache_target_full_ratio_micro, bl);
1537 ::encode(cache_min_flush_age, bl);
1538 ::encode(cache_min_evict_age, bl);
1539 ::encode(erasure_code_profile, bl);
1540 ENCODE_FINISH(bl);
1541 return;
1542 }
1543
// Current layout.  The version actually written is lowered for peers that
// lack newer features; the "if (v >= N)" guards further down then drop the
// fields that version does not carry.
1544 uint8_t v = 26;
1545 // NOTE: any new encoding dependencies must be reflected by
1546 // SIGNIFICANT_FEATURES
1547 if (!(features & CEPH_FEATURE_NEW_OSDOP_ENCODING)) {
1548 // this was the first post-hammer thing we added; if it's missing, encode
1549 // like hammer.
1550 v = 21;
1551 } else if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
1552 v = 24;
1553 }
1554
1555 ENCODE_START(v, 5, bl);
1556 ::encode(type, bl);
1557 ::encode(size, bl);
1558 ::encode(crush_rule, bl);
1559 ::encode(object_hash, bl);
1560 ::encode(pg_num, bl);
1561 ::encode(pgp_num, bl);
1562 __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs.
1563 ::encode(lpg_num, bl);
1564 ::encode(lpgp_num, bl);
1565 ::encode(last_change, bl);
1566 ::encode(snap_seq, bl);
1567 ::encode(snap_epoch, bl);
1568 ::encode(snaps, bl, features);
1569 ::encode(removed_snaps, bl);
1570 ::encode(auid, bl);
1571 ::encode(flags, bl);
1572 ::encode(crash_replay_interval, bl);
1573 ::encode(min_size, bl);
1574 ::encode(quota_max_bytes, bl);
1575 ::encode(quota_max_objects, bl);
1576 ::encode(tiers, bl);
1577 ::encode(tier_of, bl);
// cache_mode is written as a single byte
1578 __u8 c = cache_mode;
1579 ::encode(c, bl);
1580 ::encode(read_tier, bl);
1581 ::encode(write_tier, bl);
1582 ::encode(properties, bl);
1583 ::encode(hit_set_params, bl);
1584 ::encode(hit_set_period, bl);
1585 ::encode(hit_set_count, bl);
1586 ::encode(stripe_width, bl);
1587 ::encode(target_max_bytes, bl);
1588 ::encode(target_max_objects, bl);
1589 ::encode(cache_target_dirty_ratio_micro, bl);
1590 ::encode(cache_target_full_ratio_micro, bl);
1591 ::encode(cache_min_flush_age, bl);
1592 ::encode(cache_min_evict_age, bl);
1593 ::encode(erasure_code_profile, bl);
// The three fields below are written unconditionally: v is never below 21
// on this path.
1594 ::encode(last_force_op_resend_preluminous, bl);
1595 ::encode(min_read_recency_for_promote, bl);
1596 ::encode(expected_num_objects, bl);
1597 if (v >= 19) {
1598 ::encode(cache_target_dirty_high_ratio_micro, bl);
1599 }
1600 if (v >= 20) {
1601 ::encode(min_write_recency_for_promote, bl);
1602 }
1603 if (v >= 21) {
1604 ::encode(use_gmt_hitset, bl);
1605 }
1606 if (v >= 22) {
1607 ::encode(fast_read, bl);
1608 }
1609 if (v >= 23) {
1610 ::encode(hit_set_grade_decay_rate, bl);
1611 ::encode(hit_set_search_last_n, bl);
1612 }
1613 if (v >= 24) {
1614 ::encode(opts, bl);
1615 }
1616 if (v >= 25) {
1617 ::encode(last_force_op_resend, bl);
1618 }
1619 if (v >= 26) {
1620 ::encode(application_metadata, bl);
1621 }
1622 ENCODE_FINISH(bl);
1623 }
1624
// Decode a pool from bl.  Fields are read in the order encode() writes
// them.  Each group of fields is read only when struct_v is high enough;
// most groups that are absent get an explicit default in the matching else
// branch.  After decoding, the derived pg masks and the grade table are
// recomputed.
1625 void pg_pool_t::decode(bufferlist::iterator& bl)
1626 {
1627 DECODE_START_LEGACY_COMPAT_LEN(26, 5, 5, bl);
1628 ::decode(type, bl);
1629 ::decode(size, bl);
1630 ::decode(crush_rule, bl);
1631 ::decode(object_hash, bl);
1632 ::decode(pg_num, bl);
1633 ::decode(pgp_num, bl);
// the two localized-pg counts are read and then discarded
1634 {
1635 __u32 lpg_num, lpgp_num;
1636 ::decode(lpg_num, bl);
1637 ::decode(lpgp_num, bl);
1638 }
1639 ::decode(last_change, bl);
1640 ::decode(snap_seq, bl);
1641 ::decode(snap_epoch, bl);
1642
1643 if (struct_v >= 3) {
1644 ::decode(snaps, bl);
1645 ::decode(removed_snaps, bl);
1646 ::decode(auid, bl);
1647 } else {
// older layout: both element counts come first, then auid, then the two
// containers without length headers (mirrors the struct_v 2 path in encode)
1648 __u32 n, m;
1649 ::decode(n, bl);
1650 ::decode(m, bl);
1651 ::decode(auid, bl);
1652 ::decode_nohead(n, snaps, bl);
1653 ::decode_nohead(m, removed_snaps, bl);
1654 }
1655
1656 if (struct_v >= 4) {
1657 ::decode(flags, bl);
1658 ::decode(crash_replay_interval, bl);
1659 } else {
1660 flags = 0;
1661
1662 // if this looks like the 'data' pool, set the
1663 // crash_replay_interval appropriately. unfortunately, we can't
1664 // be precise here. this should be good enough to preserve replay
1665 // on the data pool for the majority of cluster upgrades, though.
1666 if (crush_rule == 0 && auid == 0)
1667 crash_replay_interval = 60;
1668 else
1669 crash_replay_interval = 0;
1670 }
1671 if (struct_v >= 7) {
1672 ::decode(min_size, bl);
1673 } else {
// no min_size on the wire: derive it from size (size minus half of size,
// using integer division)
1674 min_size = size - size/2;
1675 }
// no else branch here or for struct_v 9, 10, 14, 24 and 26 below: when the
// fields are absent the members keep whatever value they already had
1676 if (struct_v >= 8) {
1677 ::decode(quota_max_bytes, bl);
1678 ::decode(quota_max_objects, bl);
1679 }
1680 if (struct_v >= 9) {
1681 ::decode(tiers, bl);
1682 ::decode(tier_of, bl);
// cache_mode travels as a single byte
1683 __u8 v;
1684 ::decode(v, bl);
1685 cache_mode = (cache_mode_t)v;
1686 ::decode(read_tier, bl);
1687 ::decode(write_tier, bl);
1688 }
1689 if (struct_v >= 10) {
1690 ::decode(properties, bl);
1691 }
1692 if (struct_v >= 11) {
1693 ::decode(hit_set_params, bl);
1694 ::decode(hit_set_period, bl);
1695 ::decode(hit_set_count, bl);
1696 } else {
// take the hit-set period and count from a default-constructed pool
1697 pg_pool_t def;
1698 hit_set_period = def.hit_set_period;
1699 hit_set_count = def.hit_set_count;
1700 }
1701 if (struct_v >= 12) {
1702 ::decode(stripe_width, bl);
1703 } else {
1704 set_stripe_width(0);
1705 }
1706 if (struct_v >= 13) {
1707 ::decode(target_max_bytes, bl);
1708 ::decode(target_max_objects, bl);
1709 ::decode(cache_target_dirty_ratio_micro, bl);
1710 ::decode(cache_target_full_ratio_micro, bl);
1711 ::decode(cache_min_flush_age, bl);
1712 ::decode(cache_min_evict_age, bl);
1713 } else {
1714 target_max_bytes = 0;
1715 target_max_objects = 0;
1716 cache_target_dirty_ratio_micro = 0;
1717 cache_target_full_ratio_micro = 0;
1718 cache_min_flush_age = 0;
1719 cache_min_evict_age = 0;
1720 }
1721 if (struct_v >= 14) {
1722 ::decode(erasure_code_profile, bl);
1723 }
1724 if (struct_v >= 15) {
1725 ::decode(last_force_op_resend_preluminous, bl);
1726 } else {
1727 last_force_op_resend_preluminous = 0;
1728 }
1729 if (struct_v >= 16) {
1730 ::decode(min_read_recency_for_promote, bl);
1731 } else {
1732 min_read_recency_for_promote = 1;
1733 }
1734 if (struct_v >= 17) {
1735 ::decode(expected_num_objects, bl);
1736 } else {
1737 expected_num_objects = 0;
1738 }
// there is no struct_v 18 group in this function
1739 if (struct_v >= 19) {
1740 ::decode(cache_target_dirty_high_ratio_micro, bl);
1741 } else {
// fall back to the plain dirty ratio decoded (or zeroed) above
1742 cache_target_dirty_high_ratio_micro = cache_target_dirty_ratio_micro;
1743 }
1744 if (struct_v >= 20) {
1745 ::decode(min_write_recency_for_promote, bl);
1746 } else {
1747 min_write_recency_for_promote = 1;
1748 }
1749 if (struct_v >= 21) {
1750 ::decode(use_gmt_hitset, bl);
1751 } else {
1752 use_gmt_hitset = false;
1753 }
1754 if (struct_v >= 22) {
1755 ::decode(fast_read, bl);
1756 } else {
1757 fast_read = false;
1758 }
1759 if (struct_v >= 23) {
1760 ::decode(hit_set_grade_decay_rate, bl);
1761 ::decode(hit_set_search_last_n, bl);
1762 } else {
1763 hit_set_grade_decay_rate = 0;
1764 hit_set_search_last_n = 1;
1765 }
1766 if (struct_v >= 24) {
1767 ::decode(opts, bl);
1768 }
1769 if (struct_v >= 25) {
1770 ::decode(last_force_op_resend, bl);
1771 } else {
// older encodings only carry the preluminous value; reuse it
1772 last_force_op_resend = last_force_op_resend_preluminous;
1773 }
1774 if (struct_v >= 26) {
1775 ::decode(application_metadata, bl);
1776 }
1777 DECODE_FINISH(bl);
// refresh state derived from the decoded fields
1778 calc_pg_masks();
1779 calc_grade_table();
1780 }
1781
1782 void pg_pool_t::generate_test_instances(list<pg_pool_t*>& o)
1783 {
1784 pg_pool_t a;
1785 o.push_back(new pg_pool_t(a));
1786
1787 a.type = TYPE_REPLICATED;
1788 a.size = 2;
1789 a.crush_rule = 3;
1790 a.object_hash = 4;
1791 a.pg_num = 6;
1792 a.pgp_num = 5;
1793 a.last_change = 9;
1794 a.last_force_op_resend = 123823;
1795 a.last_force_op_resend_preluminous = 123824;
1796 a.snap_seq = 10;
1797 a.snap_epoch = 11;
1798 a.auid = 12;
1799 a.crash_replay_interval = 13;
1800 a.quota_max_bytes = 473;
1801 a.quota_max_objects = 474;
1802 o.push_back(new pg_pool_t(a));
1803
1804 a.snaps[3].name = "asdf";
1805 a.snaps[3].snapid = 3;
1806 a.snaps[3].stamp = utime_t(123, 4);
1807 a.snaps[6].name = "qwer";
1808 a.snaps[6].snapid = 6;
1809 a.snaps[6].stamp = utime_t(23423, 4);
1810 o.push_back(new pg_pool_t(a));
1811
1812 a.removed_snaps.insert(2); // not quite valid to combine with snaps!
1813 a.quota_max_bytes = 2473;
1814 a.quota_max_objects = 4374;
1815 a.tiers.insert(0);
1816 a.tiers.insert(1);
1817 a.tier_of = 2;
1818 a.cache_mode = CACHEMODE_WRITEBACK;
1819 a.read_tier = 1;
1820 a.write_tier = 1;
1821 a.hit_set_params = HitSet::Params(new BloomHitSet::Params);
1822 a.hit_set_period = 3600;
1823 a.hit_set_count = 8;
1824 a.min_read_recency_for_promote = 1;
1825 a.min_write_recency_for_promote = 1;
1826 a.hit_set_grade_decay_rate = 50;
1827 a.hit_set_search_last_n = 1;
1828 a.calc_grade_table();
1829 a.set_stripe_width(12345);
1830 a.target_max_bytes = 1238132132;
1831 a.target_max_objects = 1232132;
1832 a.cache_target_dirty_ratio_micro = 187232;
1833 a.cache_target_dirty_high_ratio_micro = 309856;
1834 a.cache_target_full_ratio_micro = 987222;
1835 a.cache_min_flush_age = 231;
1836 a.cache_min_evict_age = 2321;
1837 a.erasure_code_profile = "profile in osdmap";
1838 a.expected_num_objects = 123456;
1839 a.fast_read = false;
1840 a.application_metadata = {{"rbd", {{"key", "value"}}}};
1841 o.push_back(new pg_pool_t(a));
1842 }
1843
1844 ostream& operator<<(ostream& out, const pg_pool_t& p)
1845 {
1846 out << p.get_type_name()
1847 << " size " << p.get_size()
1848 << " min_size " << p.get_min_size()
1849 << " crush_rule " << p.get_crush_rule()
1850 << " object_hash " << p.get_object_hash_name()
1851 << " pg_num " << p.get_pg_num()
1852 << " pgp_num " << p.get_pgp_num()
1853 << " last_change " << p.get_last_change();
1854 if (p.get_last_force_op_resend() ||
1855 p.get_last_force_op_resend_preluminous())
1856 out << " lfor " << p.get_last_force_op_resend() << "/"
1857 << p.get_last_force_op_resend_preluminous();
1858 if (p.get_auid())
1859 out << " owner " << p.get_auid();
1860 if (p.flags)
1861 out << " flags " << p.get_flags_string();
1862 if (p.crash_replay_interval)
1863 out << " crash_replay_interval " << p.crash_replay_interval;
1864 if (p.quota_max_bytes)
1865 out << " max_bytes " << p.quota_max_bytes;
1866 if (p.quota_max_objects)
1867 out << " max_objects " << p.quota_max_objects;
1868 if (!p.tiers.empty())
1869 out << " tiers " << p.tiers;
1870 if (p.is_tier())
1871 out << " tier_of " << p.tier_of;
1872 if (p.has_read_tier())
1873 out << " read_tier " << p.read_tier;
1874 if (p.has_write_tier())
1875 out << " write_tier " << p.write_tier;
1876 if (p.cache_mode)
1877 out << " cache_mode " << p.get_cache_mode_name();
1878 if (p.target_max_bytes)
1879 out << " target_bytes " << p.target_max_bytes;
1880 if (p.target_max_objects)
1881 out << " target_objects " << p.target_max_objects;
1882 if (p.hit_set_params.get_type() != HitSet::TYPE_NONE) {
1883 out << " hit_set " << p.hit_set_params
1884 << " " << p.hit_set_period << "s"
1885 << " x" << p.hit_set_count << " decay_rate "
1886 << p.hit_set_grade_decay_rate
1887 << " search_last_n " << p.hit_set_search_last_n;
1888 }
1889 if (p.min_read_recency_for_promote)
1890 out << " min_read_recency_for_promote " << p.min_read_recency_for_promote;
1891 if (p.min_write_recency_for_promote)
1892 out << " min_write_recency_for_promote " << p.min_write_recency_for_promote;
1893 out << " stripe_width " << p.get_stripe_width();
1894 if (p.expected_num_objects)
1895 out << " expected_num_objects " << p.expected_num_objects;
1896 if (p.fast_read)
1897 out << " fast_read " << p.fast_read;
1898 out << p.opts;
1899 if (!p.application_metadata.empty()) {
1900 out << " application ";
1901 for (auto it = p.application_metadata.begin();
1902 it != p.application_metadata.end(); ++it) {
1903 if (it != p.application_metadata.begin())
1904 out << ",";
1905 out << it->first;
1906 }
1907 }
1908 return out;
1909 }
1910
1911
1912 // -- object_stat_sum_t --
1913
1914 void object_stat_sum_t::dump(Formatter *f) const
1915 {
1916 f->dump_int("num_bytes", num_bytes);
1917 f->dump_int("num_objects", num_objects);
1918 f->dump_int("num_object_clones", num_object_clones);
1919 f->dump_int("num_object_copies", num_object_copies);
1920 f->dump_int("num_objects_missing_on_primary", num_objects_missing_on_primary);
1921 f->dump_int("num_objects_missing", num_objects_missing);
1922 f->dump_int("num_objects_degraded", num_objects_degraded);
1923 f->dump_int("num_objects_misplaced", num_objects_misplaced);
1924 f->dump_int("num_objects_unfound", num_objects_unfound);
1925 f->dump_int("num_objects_dirty", num_objects_dirty);
1926 f->dump_int("num_whiteouts", num_whiteouts);
1927 f->dump_int("num_read", num_rd);
1928 f->dump_int("num_read_kb", num_rd_kb);
1929 f->dump_int("num_write", num_wr);
1930 f->dump_int("num_write_kb", num_wr_kb);
1931 f->dump_int("num_scrub_errors", num_scrub_errors);
1932 f->dump_int("num_shallow_scrub_errors", num_shallow_scrub_errors);
1933 f->dump_int("num_deep_scrub_errors", num_deep_scrub_errors);
1934 f->dump_int("num_objects_recovered", num_objects_recovered);
1935 f->dump_int("num_bytes_recovered", num_bytes_recovered);
1936 f->dump_int("num_keys_recovered", num_keys_recovered);
1937 f->dump_int("num_objects_omap", num_objects_omap);
1938 f->dump_int("num_objects_hit_set_archive", num_objects_hit_set_archive);
1939 f->dump_int("num_bytes_hit_set_archive", num_bytes_hit_set_archive);
1940 f->dump_int("num_flush", num_flush);
1941 f->dump_int("num_flush_kb", num_flush_kb);
1942 f->dump_int("num_evict", num_evict);
1943 f->dump_int("num_evict_kb", num_evict_kb);
1944 f->dump_int("num_promote", num_promote);
1945 f->dump_int("num_flush_mode_high", num_flush_mode_high);
1946 f->dump_int("num_flush_mode_low", num_flush_mode_low);
1947 f->dump_int("num_evict_mode_some", num_evict_mode_some);
1948 f->dump_int("num_evict_mode_full", num_evict_mode_full);
1949 f->dump_int("num_objects_pinned", num_objects_pinned);
1950 f->dump_int("num_legacy_snapsets", num_legacy_snapsets);
1951 f->dump_int("num_large_omap_objects", num_large_omap_objects);
1952 }
1953
// Encode all counters into bl as version 17 (compat 14).
// When CEPH_LITTLE_ENDIAN is defined the whole struct is appended as raw
// bytes in one call; otherwise each counter is encoded individually.  The
// order of the individual encode calls is the wire order and matches the
// field-by-field path in decode().
1954 void object_stat_sum_t::encode(bufferlist& bl) const
1955 {
1956 ENCODE_START(17, 14, bl);
1957 #if defined(CEPH_LITTLE_ENDIAN)
// Appends sizeof(object_stat_sum_t) bytes starting at the address of
// num_bytes.
// NOTE(review): this presumably relies on num_bytes being the first member
// and on the in-memory layout matching the field order below with no
// padding -- confirm against the struct definition.
1958 bl.append((char *)(&num_bytes), sizeof(object_stat_sum_t));
1959 #else
1960 ::encode(num_bytes, bl);
1961 ::encode(num_objects, bl);
1962 ::encode(num_object_clones, bl);
1963 ::encode(num_object_copies, bl);
1964 ::encode(num_objects_missing_on_primary, bl);
1965 ::encode(num_objects_degraded, bl);
1966 ::encode(num_objects_unfound, bl);
1967 ::encode(num_rd, bl);
1968 ::encode(num_rd_kb, bl);
1969 ::encode(num_wr, bl);
1970 ::encode(num_wr_kb, bl);
1971 ::encode(num_scrub_errors, bl);
1972 ::encode(num_objects_recovered, bl);
1973 ::encode(num_bytes_recovered, bl);
1974 ::encode(num_keys_recovered, bl);
1975 ::encode(num_shallow_scrub_errors, bl);
1976 ::encode(num_deep_scrub_errors, bl);
1977 ::encode(num_objects_dirty, bl);
1978 ::encode(num_whiteouts, bl);
1979 ::encode(num_objects_omap, bl);
1980 ::encode(num_objects_hit_set_archive, bl);
1981 ::encode(num_objects_misplaced, bl);
1982 ::encode(num_bytes_hit_set_archive, bl);
1983 ::encode(num_flush, bl);
1984 ::encode(num_flush_kb, bl);
1985 ::encode(num_evict, bl);
1986 ::encode(num_evict_kb, bl);
1987 ::encode(num_promote, bl);
1988 ::encode(num_flush_mode_high, bl);
1989 ::encode(num_flush_mode_low, bl);
1990 ::encode(num_evict_mode_some, bl);
1991 ::encode(num_evict_mode_full, bl);
1992 ::encode(num_objects_pinned, bl);
1993 ::encode(num_objects_missing, bl);
1994 ::encode(num_legacy_snapsets, bl);
1995 ::encode(num_large_omap_objects, bl);
1996 #endif
1997 ENCODE_FINISH(bl);
1998 }
1999
// Decode all counters from bl.
// When CEPH_LITTLE_ENDIAN is defined and the encoded version is at least
// 17, the struct is filled with a single raw copy.  In every other case the
// counters are decoded one by one, in the same order encode() writes them,
// with version checks for the two newest counters.
2000 void object_stat_sum_t::decode(bufferlist::iterator& bl)
2001 {
// set once the raw-copy fast path has filled the struct
2002 bool decode_finish = false;
2003 DECODE_START(17, bl); // make sure to also update fast decode below
2004 #if defined(CEPH_LITTLE_ENDIAN)
2005 if (struct_v >= 17) { // this must match newest decode version
// copies sizeof(object_stat_sum_t) bytes to the address of num_bytes
2006 bl.copy(sizeof(object_stat_sum_t), (char*)(&num_bytes));
2007 decode_finish = true;
2008 }
2009 #endif
2010 if (!decode_finish) {
2011 ::decode(num_bytes, bl);
2012 ::decode(num_objects, bl);
2013 ::decode(num_object_clones, bl);
2014 ::decode(num_object_copies, bl);
2015 ::decode(num_objects_missing_on_primary, bl);
2016 ::decode(num_objects_degraded, bl);
2017 ::decode(num_objects_unfound, bl);
2018 ::decode(num_rd, bl);
2019 ::decode(num_rd_kb, bl);
2020 ::decode(num_wr, bl);
2021 ::decode(num_wr_kb, bl);
2022 ::decode(num_scrub_errors, bl);
2023 ::decode(num_objects_recovered, bl);
2024 ::decode(num_bytes_recovered, bl);
2025 ::decode(num_keys_recovered, bl);
2026 ::decode(num_shallow_scrub_errors, bl);
2027 ::decode(num_deep_scrub_errors, bl);
2028 ::decode(num_objects_dirty, bl);
2029 ::decode(num_whiteouts, bl);
2030 ::decode(num_objects_omap, bl);
2031 ::decode(num_objects_hit_set_archive, bl);
2032 ::decode(num_objects_misplaced, bl);
2033 ::decode(num_bytes_hit_set_archive, bl);
2034 ::decode(num_flush, bl);
2035 ::decode(num_flush_kb, bl);
2036 ::decode(num_evict, bl);
2037 ::decode(num_evict_kb, bl);
2038 ::decode(num_promote, bl);
2039 ::decode(num_flush_mode_high, bl);
2040 ::decode(num_flush_mode_low, bl);
2041 ::decode(num_evict_mode_some, bl);
2042 ::decode(num_evict_mode_full, bl);
2043 ::decode(num_objects_pinned, bl);
2044 ::decode(num_objects_missing, bl);
2045 if (struct_v >= 16) {
2046 ::decode(num_legacy_snapsets, bl);
2047 } else {
2048 num_legacy_snapsets = num_object_clones; // upper bound
2049 }
// no else branch: for struct_v < 17 num_large_omap_objects keeps whatever
// value it already had
2050 if (struct_v >= 17) {
2051 ::decode(num_large_omap_objects, bl);
2052 }
2053 }
2054 DECODE_FINISH(bl);
2055 }
2056
2057 void object_stat_sum_t::generate_test_instances(list<object_stat_sum_t*>& o)
2058 {
2059 object_stat_sum_t a;
2060
2061 a.num_bytes = 1;
2062 a.num_objects = 3;
2063 a.num_object_clones = 4;
2064 a.num_object_copies = 5;
2065 a.num_objects_missing_on_primary = 6;
2066 a.num_objects_missing = 123;
2067 a.num_objects_degraded = 7;
2068 a.num_objects_unfound = 8;
2069 a.num_rd = 9; a.num_rd_kb = 10;
2070 a.num_wr = 11; a.num_wr_kb = 12;
2071 a.num_objects_recovered = 14;
2072 a.num_bytes_recovered = 15;
2073 a.num_keys_recovered = 16;
2074 a.num_deep_scrub_errors = 17;
2075 a.num_shallow_scrub_errors = 18;
2076 a.num_scrub_errors = a.num_deep_scrub_errors + a.num_shallow_scrub_errors;
2077 a.num_objects_dirty = 21;
2078 a.num_whiteouts = 22;
2079 a.num_objects_misplaced = 1232;
2080 a.num_objects_hit_set_archive = 2;
2081 a.num_bytes_hit_set_archive = 27;
2082 a.num_flush = 5;
2083 a.num_flush_kb = 6;
2084 a.num_evict = 7;
2085 a.num_evict_kb = 8;
2086 a.num_promote = 9;
2087 a.num_flush_mode_high = 0;
2088 a.num_flush_mode_low = 1;
2089 a.num_evict_mode_some = 1;
2090 a.num_evict_mode_full = 0;
2091 a.num_objects_pinned = 20;
2092 a.num_large_omap_objects = 5;
2093 o.push_back(new object_stat_sum_t(a));
2094 }
2095
2096 void object_stat_sum_t::add(const object_stat_sum_t& o)
2097 {
2098 num_bytes += o.num_bytes;
2099 num_objects += o.num_objects;
2100 num_object_clones += o.num_object_clones;
2101 num_object_copies += o.num_object_copies;
2102 num_objects_missing_on_primary += o.num_objects_missing_on_primary;
2103 num_objects_missing += o.num_objects_missing;
2104 num_objects_degraded += o.num_objects_degraded;
2105 num_objects_misplaced += o.num_objects_misplaced;
2106 num_rd += o.num_rd;
2107 num_rd_kb += o.num_rd_kb;
2108 num_wr += o.num_wr;
2109 num_wr_kb += o.num_wr_kb;
2110 num_objects_unfound += o.num_objects_unfound;
2111 num_scrub_errors += o.num_scrub_errors;
2112 num_shallow_scrub_errors += o.num_shallow_scrub_errors;
2113 num_deep_scrub_errors += o.num_deep_scrub_errors;
2114 num_objects_recovered += o.num_objects_recovered;
2115 num_bytes_recovered += o.num_bytes_recovered;
2116 num_keys_recovered += o.num_keys_recovered;
2117 num_objects_dirty += o.num_objects_dirty;
2118 num_whiteouts += o.num_whiteouts;
2119 num_objects_omap += o.num_objects_omap;
2120 num_objects_hit_set_archive += o.num_objects_hit_set_archive;
2121 num_bytes_hit_set_archive += o.num_bytes_hit_set_archive;
2122 num_flush += o.num_flush;
2123 num_flush_kb += o.num_flush_kb;
2124 num_evict += o.num_evict;
2125 num_evict_kb += o.num_evict_kb;
2126 num_promote += o.num_promote;
2127 num_flush_mode_high += o.num_flush_mode_high;
2128 num_flush_mode_low += o.num_flush_mode_low;
2129 num_evict_mode_some += o.num_evict_mode_some;
2130 num_evict_mode_full += o.num_evict_mode_full;
2131 num_objects_pinned += o.num_objects_pinned;
2132 num_legacy_snapsets += o.num_legacy_snapsets;
2133 num_large_omap_objects += o.num_large_omap_objects;
2134 }
2135
2136 void object_stat_sum_t::sub(const object_stat_sum_t& o)
2137 {
2138 num_bytes -= o.num_bytes;
2139 num_objects -= o.num_objects;
2140 num_object_clones -= o.num_object_clones;
2141 num_object_copies -= o.num_object_copies;
2142 num_objects_missing_on_primary -= o.num_objects_missing_on_primary;
2143 num_objects_missing -= o.num_objects_missing;
2144 num_objects_degraded -= o.num_objects_degraded;
2145 num_objects_misplaced -= o.num_objects_misplaced;
2146 num_rd -= o.num_rd;
2147 num_rd_kb -= o.num_rd_kb;
2148 num_wr -= o.num_wr;
2149 num_wr_kb -= o.num_wr_kb;
2150 num_objects_unfound -= o.num_objects_unfound;
2151 num_scrub_errors -= o.num_scrub_errors;
2152 num_shallow_scrub_errors -= o.num_shallow_scrub_errors;
2153 num_deep_scrub_errors -= o.num_deep_scrub_errors;
2154 num_objects_recovered -= o.num_objects_recovered;
2155 num_bytes_recovered -= o.num_bytes_recovered;
2156 num_keys_recovered -= o.num_keys_recovered;
2157 num_objects_dirty -= o.num_objects_dirty;
2158 num_whiteouts -= o.num_whiteouts;
2159 num_objects_omap -= o.num_objects_omap;
2160 num_objects_hit_set_archive -= o.num_objects_hit_set_archive;
2161 num_bytes_hit_set_archive -= o.num_bytes_hit_set_archive;
2162 num_flush -= o.num_flush;
2163 num_flush_kb -= o.num_flush_kb;
2164 num_evict -= o.num_evict;
2165 num_evict_kb -= o.num_evict_kb;
2166 num_promote -= o.num_promote;
2167 num_flush_mode_high -= o.num_flush_mode_high;
2168 num_flush_mode_low -= o.num_flush_mode_low;
2169 num_evict_mode_some -= o.num_evict_mode_some;
2170 num_evict_mode_full -= o.num_evict_mode_full;
2171 num_objects_pinned -= o.num_objects_pinned;
2172 num_legacy_snapsets -= o.num_legacy_snapsets;
2173 num_large_omap_objects -= o.num_large_omap_objects;
2174 }
2175
2176 bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r)
2177 {
2178 return
2179 l.num_bytes == r.num_bytes &&
2180 l.num_objects == r.num_objects &&
2181 l.num_object_clones == r.num_object_clones &&
2182 l.num_object_copies == r.num_object_copies &&
2183 l.num_objects_missing_on_primary == r.num_objects_missing_on_primary &&
2184 l.num_objects_missing == r.num_objects_missing &&
2185 l.num_objects_degraded == r.num_objects_degraded &&
2186 l.num_objects_misplaced == r.num_objects_misplaced &&
2187 l.num_objects_unfound == r.num_objects_unfound &&
2188 l.num_rd == r.num_rd &&
2189 l.num_rd_kb == r.num_rd_kb &&
2190 l.num_wr == r.num_wr &&
2191 l.num_wr_kb == r.num_wr_kb &&
2192 l.num_scrub_errors == r.num_scrub_errors &&
2193 l.num_shallow_scrub_errors == r.num_shallow_scrub_errors &&
2194 l.num_deep_scrub_errors == r.num_deep_scrub_errors &&
2195 l.num_objects_recovered == r.num_objects_recovered &&
2196 l.num_bytes_recovered == r.num_bytes_recovered &&
2197 l.num_keys_recovered == r.num_keys_recovered &&
2198 l.num_objects_dirty == r.num_objects_dirty &&
2199 l.num_whiteouts == r.num_whiteouts &&
2200 l.num_objects_omap == r.num_objects_omap &&
2201 l.num_objects_hit_set_archive == r.num_objects_hit_set_archive &&
2202 l.num_bytes_hit_set_archive == r.num_bytes_hit_set_archive &&
2203 l.num_flush == r.num_flush &&
2204 l.num_flush_kb == r.num_flush_kb &&
2205 l.num_evict == r.num_evict &&
2206 l.num_evict_kb == r.num_evict_kb &&
2207 l.num_promote == r.num_promote &&
2208 l.num_flush_mode_high == r.num_flush_mode_high &&
2209 l.num_flush_mode_low == r.num_flush_mode_low &&
2210 l.num_evict_mode_some == r.num_evict_mode_some &&
2211 l.num_evict_mode_full == r.num_evict_mode_full &&
2212 l.num_objects_pinned == r.num_objects_pinned &&
2213 l.num_legacy_snapsets == r.num_legacy_snapsets &&
2214 l.num_large_omap_objects == r.num_large_omap_objects;
2215 }
2216
2217 // -- object_stat_collection_t --
2218
/**
 * Dump this collection to a Formatter.
 *
 * The only content emitted is the aggregate `sum`, wrapped in an object
 * section named "stat_sum".
 *
 * @param f formatter that receives the output
 */
void object_stat_collection_t::dump(Formatter *f) const
{
  f->open_object_section("stat_sum");
  sum.dump(f);
  f->close_section();
}
2225
/**
 * Serialize this collection into @p bl.
 *
 * The payload is `sum` followed by a literal 32-bit zero. The matching
 * decode() reads a map<string,object_stat_sum_t> at that position and
 * discards it, so the zero is presumably the encoding of an empty map
 * (NOTE(review): confirm against the container encoders).
 */
void object_stat_collection_t::encode(bufferlist& bl) const
{
  ENCODE_START(2, 2, bl);
  ::encode(sum, bl);
  ::encode((__u32)0, bl);  // placeholder for the slot decode() reads as cat_sum
  ENCODE_FINISH(bl);
}
2233
/**
 * Deserialize this collection from @p bl.
 *
 * Reads `sum`, then reads a map<string,object_stat_sum_t> into a scoped
 * temporary that is dropped immediately: the map must be consumed to keep
 * the stream position correct, but its contents are not stored.
 */
void object_stat_collection_t::decode(bufferlist::iterator& bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
  ::decode(sum, bl);
  {
    // consumed and discarded; encode() writes a (__u32)0 in this slot
    map<string,object_stat_sum_t> cat_sum;
    ::decode(cat_sum, bl);
  }
  DECODE_FINISH(bl);
}
2244
2245 void object_stat_collection_t::generate_test_instances(list<object_stat_collection_t*>& o)
2246 {
2247 object_stat_collection_t a;
2248 o.push_back(new object_stat_collection_t(a));
2249 list<object_stat_sum_t*> l;
2250 object_stat_sum_t::generate_test_instances(l);
2251 for (list<object_stat_sum_t*>::iterator p = l.begin(); p != l.end(); ++p) {
2252 a.add(**p);
2253 o.push_back(new object_stat_collection_t(a));
2254 }
2255 }
2256
2257
2258 // -- pg_stat_t --
2259
2260 bool pg_stat_t::is_acting_osd(int32_t osd, bool primary) const
2261 {
2262 if (primary && osd == acting_primary) {
2263 return true;
2264 } else if (!primary) {
2265 for(vector<int32_t>::const_iterator it = acting.begin();
2266 it != acting.end(); ++it)
2267 {
2268 if (*it == osd)
2269 return true;
2270 }
2271 }
2272 return false;
2273 }
2274
2275 void pg_stat_t::dump(Formatter *f) const
2276 {
2277 f->dump_stream("version") << version;
2278 f->dump_stream("reported_seq") << reported_seq;
2279 f->dump_stream("reported_epoch") << reported_epoch;
2280 f->dump_string("state", pg_state_string(state));
2281 f->dump_stream("last_fresh") << last_fresh;
2282 f->dump_stream("last_change") << last_change;
2283 f->dump_stream("last_active") << last_active;
2284 f->dump_stream("last_peered") << last_peered;
2285 f->dump_stream("last_clean") << last_clean;
2286 f->dump_stream("last_became_active") << last_became_active;
2287 f->dump_stream("last_became_peered") << last_became_peered;
2288 f->dump_stream("last_unstale") << last_unstale;
2289 f->dump_stream("last_undegraded") << last_undegraded;
2290 f->dump_stream("last_fullsized") << last_fullsized;
2291 f->dump_unsigned("mapping_epoch", mapping_epoch);
2292 f->dump_stream("log_start") << log_start;
2293 f->dump_stream("ondisk_log_start") << ondisk_log_start;
2294 f->dump_unsigned("created", created);
2295 f->dump_unsigned("last_epoch_clean", last_epoch_clean);
2296 f->dump_stream("parent") << parent;
2297 f->dump_unsigned("parent_split_bits", parent_split_bits);
2298 f->dump_stream("last_scrub") << last_scrub;
2299 f->dump_stream("last_scrub_stamp") << last_scrub_stamp;
2300 f->dump_stream("last_deep_scrub") << last_deep_scrub;
2301 f->dump_stream("last_deep_scrub_stamp") << last_deep_scrub_stamp;
2302 f->dump_stream("last_clean_scrub_stamp") << last_clean_scrub_stamp;
2303 f->dump_int("log_size", log_size);
2304 f->dump_int("ondisk_log_size", ondisk_log_size);
2305 f->dump_bool("stats_invalid", stats_invalid);
2306 f->dump_bool("dirty_stats_invalid", dirty_stats_invalid);
2307 f->dump_bool("omap_stats_invalid", omap_stats_invalid);
2308 f->dump_bool("hitset_stats_invalid", hitset_stats_invalid);
2309 f->dump_bool("hitset_bytes_stats_invalid", hitset_bytes_stats_invalid);
2310 f->dump_bool("pin_stats_invalid", pin_stats_invalid);
2311 f->dump_unsigned("snaptrimq_len", snaptrimq_len);
2312 stats.dump(f);
2313 f->open_array_section("up");
2314 for (vector<int32_t>::const_iterator p = up.begin(); p != up.end(); ++p)
2315 f->dump_int("osd", *p);
2316 f->close_section();
2317 f->open_array_section("acting");
2318 for (vector<int32_t>::const_iterator p = acting.begin(); p != acting.end(); ++p)
2319 f->dump_int("osd", *p);
2320 f->close_section();
2321 f->open_array_section("blocked_by");
2322 for (vector<int32_t>::const_iterator p = blocked_by.begin();
2323 p != blocked_by.end(); ++p)
2324 f->dump_int("osd", *p);
2325 f->close_section();
2326 f->dump_int("up_primary", up_primary);
2327 f->dump_int("acting_primary", acting_primary);
2328 }
2329
2330 void pg_stat_t::dump_brief(Formatter *f) const
2331 {
2332 f->dump_string("state", pg_state_string(state));
2333 f->open_array_section("up");
2334 for (vector<int32_t>::const_iterator p = up.begin(); p != up.end(); ++p)
2335 f->dump_int("osd", *p);
2336 f->close_section();
2337 f->open_array_section("acting");
2338 for (vector<int32_t>::const_iterator p = acting.begin(); p != acting.end(); ++p)
2339 f->dump_int("osd", *p);
2340 f->close_section();
2341 f->dump_int("up_primary", up_primary);
2342 f->dump_int("acting_primary", acting_primary);
2343 }
2344
/**
 * Serialize this pg_stat_t into @p bl.
 *
 * The field order below is the wire format and must stay in step with
 * pg_stat_t::decode(). snaptrimq_len is the last field written; decode()
 * only reads it when struct_v >= 23.
 */
void pg_stat_t::encode(bufferlist &bl) const
{
  // NOTE(review): (23, 22) is presumably (current version, oldest compatible
  // version) -- confirm against the ENCODE_START macro definition.
  ENCODE_START(23, 22, bl);
  ::encode(version, bl);
  ::encode(reported_seq, bl);
  ::encode(reported_epoch, bl);
  ::encode(state, bl);
  ::encode(log_start, bl);
  ::encode(ondisk_log_start, bl);
  ::encode(created, bl);
  ::encode(last_epoch_clean, bl);
  ::encode(parent, bl);
  ::encode(parent_split_bits, bl);
  ::encode(last_scrub, bl);
  ::encode(last_scrub_stamp, bl);
  ::encode(stats, bl);
  ::encode(log_size, bl);
  ::encode(ondisk_log_size, bl);
  ::encode(up, bl);
  ::encode(acting, bl);
  ::encode(last_fresh, bl);
  ::encode(last_change, bl);
  ::encode(last_active, bl);
  ::encode(last_clean, bl);
  ::encode(last_unstale, bl);
  ::encode(mapping_epoch, bl);
  ::encode(last_deep_scrub, bl);
  ::encode(last_deep_scrub_stamp, bl);
  ::encode(stats_invalid, bl);
  ::encode(last_clean_scrub_stamp, bl);
  ::encode(last_became_active, bl);
  ::encode(dirty_stats_invalid, bl);
  ::encode(up_primary, bl);
  ::encode(acting_primary, bl);
  ::encode(omap_stats_invalid, bl);
  ::encode(hitset_stats_invalid, bl);
  ::encode(blocked_by, bl);
  ::encode(last_undegraded, bl);
  ::encode(last_fullsized, bl);
  ::encode(hitset_bytes_stats_invalid, bl);
  ::encode(last_peered, bl);
  ::encode(last_became_peered, bl);
  ::encode(pin_stats_invalid, bl);
  ::encode(snaptrimq_len, bl);  // read back only when struct_v >= 23
  ENCODE_FINISH(bl);
}
2391
/**
 * Deserialize this pg_stat_t from @p bl.
 *
 * Fields are read in exactly the order pg_stat_t::encode() writes them.
 * The six *_invalid flags are decoded into a temporary bool and then
 * assigned to the member (presumably because the members are bitfields
 * that cannot be passed by reference -- confirm in osd_types.h).
 *
 * NOTE(review): snaptrimq_len is only read for struct_v >= 23 and there is
 * no fallback assignment for older encodings, so in that case the member
 * keeps whatever value it had before decode() was called.
 */
void pg_stat_t::decode(bufferlist::iterator &bl)
{
  bool tmp;  // staging slot for the *_invalid flags
  DECODE_START(22, bl);
  ::decode(version, bl);
  ::decode(reported_seq, bl);
  ::decode(reported_epoch, bl);
  ::decode(state, bl);
  ::decode(log_start, bl);
  ::decode(ondisk_log_start, bl);
  ::decode(created, bl);
  ::decode(last_epoch_clean, bl);
  ::decode(parent, bl);
  ::decode(parent_split_bits, bl);
  ::decode(last_scrub, bl);
  ::decode(last_scrub_stamp, bl);
  ::decode(stats, bl);
  ::decode(log_size, bl);
  ::decode(ondisk_log_size, bl);
  ::decode(up, bl);
  ::decode(acting, bl);
  ::decode(last_fresh, bl);
  ::decode(last_change, bl);
  ::decode(last_active, bl);
  ::decode(last_clean, bl);
  ::decode(last_unstale, bl);
  ::decode(mapping_epoch, bl);
  ::decode(last_deep_scrub, bl);
  ::decode(last_deep_scrub_stamp, bl);
  ::decode(tmp, bl);
  stats_invalid = tmp;
  ::decode(last_clean_scrub_stamp, bl);
  ::decode(last_became_active, bl);
  ::decode(tmp, bl);
  dirty_stats_invalid = tmp;
  ::decode(up_primary, bl);
  ::decode(acting_primary, bl);
  ::decode(tmp, bl);
  omap_stats_invalid = tmp;
  ::decode(tmp, bl);
  hitset_stats_invalid = tmp;
  ::decode(blocked_by, bl);
  ::decode(last_undegraded, bl);
  ::decode(last_fullsized, bl);
  ::decode(tmp, bl);
  hitset_bytes_stats_invalid = tmp;
  ::decode(last_peered, bl);
  ::decode(last_became_peered, bl);
  ::decode(tmp, bl);
  pin_stats_invalid = tmp;
  // field appended in version 23; absent from older encodings
  if (struct_v >= 23) {
    ::decode(snaptrimq_len, bl);
  }
  DECODE_FINISH(bl);
}
2447
2448 void pg_stat_t::generate_test_instances(list<pg_stat_t*>& o)
2449 {
2450 pg_stat_t a;
2451 o.push_back(new pg_stat_t(a));
2452
2453 a.version = eversion_t(1, 3);
2454 a.reported_epoch = 1;
2455 a.reported_seq = 2;
2456 a.state = 123;
2457 a.mapping_epoch = 998;
2458 a.last_fresh = utime_t(1002, 1);
2459 a.last_change = utime_t(1002, 2);
2460 a.last_active = utime_t(1002, 3);
2461 a.last_clean = utime_t(1002, 4);
2462 a.last_unstale = utime_t(1002, 5);
2463 a.last_undegraded = utime_t(1002, 7);
2464 a.last_fullsized = utime_t(1002, 8);
2465 a.log_start = eversion_t(1, 4);
2466 a.ondisk_log_start = eversion_t(1, 5);
2467 a.created = 6;
2468 a.last_epoch_clean = 7;
2469 a.parent = pg_t(1, 2, 3);
2470 a.parent_split_bits = 12;
2471 a.last_scrub = eversion_t(9, 10);
2472 a.last_scrub_stamp = utime_t(11, 12);
2473 a.last_deep_scrub = eversion_t(13, 14);
2474 a.last_deep_scrub_stamp = utime_t(15, 16);
2475 a.last_clean_scrub_stamp = utime_t(17, 18);
2476 a.snaptrimq_len = 1048576;
2477 list<object_stat_collection_t*> l;
2478 object_stat_collection_t::generate_test_instances(l);
2479 a.stats = *l.back();
2480 a.log_size = 99;
2481 a.ondisk_log_size = 88;
2482 a.up.push_back(123);
2483 a.up_primary = 123;
2484 a.acting.push_back(456);
2485 a.acting_primary = 456;
2486 o.push_back(new pg_stat_t(a));
2487
2488 a.up.push_back(124);
2489 a.up_primary = 124;
2490 a.acting.push_back(124);
2491 a.acting_primary = 124;
2492 a.blocked_by.push_back(155);
2493 a.blocked_by.push_back(156);
2494 o.push_back(new pg_stat_t(a));
2495 }
2496
2497 bool operator==(const pg_stat_t& l, const pg_stat_t& r)
2498 {
2499 return
2500 l.version == r.version &&
2501 l.reported_seq == r.reported_seq &&
2502 l.reported_epoch == r.reported_epoch &&
2503 l.state == r.state &&
2504 l.last_fresh == r.last_fresh &&
2505 l.last_change == r.last_change &&
2506 l.last_active == r.last_active &&
2507 l.last_peered == r.last_peered &&
2508 l.last_clean == r.last_clean &&
2509 l.last_unstale == r.last_unstale &&
2510 l.last_undegraded == r.last_undegraded &&
2511 l.last_fullsized == r.last_fullsized &&
2512 l.log_start == r.log_start &&
2513 l.ondisk_log_start == r.ondisk_log_start &&
2514 l.created == r.created &&
2515 l.last_epoch_clean == r.last_epoch_clean &&
2516 l.parent == r.parent &&
2517 l.parent_split_bits == r.parent_split_bits &&
2518 l.last_scrub == r.last_scrub &&
2519 l.last_deep_scrub == r.last_deep_scrub &&
2520 l.last_scrub_stamp == r.last_scrub_stamp &&
2521 l.last_deep_scrub_stamp == r.last_deep_scrub_stamp &&
2522 l.last_clean_scrub_stamp == r.last_clean_scrub_stamp &&
2523 l.stats == r.stats &&
2524 l.stats_invalid == r.stats_invalid &&
2525 l.log_size == r.log_size &&
2526 l.ondisk_log_size == r.ondisk_log_size &&
2527 l.up == r.up &&
2528 l.acting == r.acting &&
2529 l.mapping_epoch == r.mapping_epoch &&
2530 l.blocked_by == r.blocked_by &&
2531 l.last_became_active == r.last_became_active &&
2532 l.last_became_peered == r.last_became_peered &&
2533 l.dirty_stats_invalid == r.dirty_stats_invalid &&
2534 l.omap_stats_invalid == r.omap_stats_invalid &&
2535 l.hitset_stats_invalid == r.hitset_stats_invalid &&
2536 l.hitset_bytes_stats_invalid == r.hitset_bytes_stats_invalid &&
2537 l.up_primary == r.up_primary &&
2538 l.acting_primary == r.acting_primary &&
2539 l.pin_stats_invalid == r.pin_stats_invalid &&
2540 l.snaptrimq_len == r.snaptrimq_len;
2541 }
2542
2543 // -- pool_stat_t --
2544
/**
 * Dump pool statistics to a Formatter: the nested `stats` first, followed
 * by log_size, ondisk_log_size, up and acting as integers.
 *
 * @param f formatter that receives the output
 */
void pool_stat_t::dump(Formatter *f) const
{
  stats.dump(f);
  f->dump_int("log_size", log_size);
  f->dump_int("ondisk_log_size", ondisk_log_size);
  f->dump_int("up", up);
  f->dump_int("acting", acting);
}
2553
/**
 * Serialize pool statistics into @p bl.
 *
 * Two layouts are produced depending on @p features:
 *  - without CEPH_FEATURE_OSDENC: a bare version byte (4) followed by
 *    stats, log_size and ondisk_log_size; up/acting are not written;
 *  - otherwise: the ENCODE_START/ENCODE_FINISH framed layout, which also
 *    carries up and acting.
 *
 * @param bl       destination buffer
 * @param features feature bits used to select the layout
 */
void pool_stat_t::encode(bufferlist &bl, uint64_t features) const
{
  if ((features & CEPH_FEATURE_OSDENC) == 0) {
    // legacy layout: single version byte, no up/acting
    __u8 v = 4;
    ::encode(v, bl);
    ::encode(stats, bl);
    ::encode(log_size, bl);
    ::encode(ondisk_log_size, bl);
    return;
  }

  ENCODE_START(6, 5, bl);
  ::encode(stats, bl);
  ::encode(log_size, bl);
  ::encode(ondisk_log_size, bl);
  ::encode(up, bl);
  ::encode(acting, bl);
  ENCODE_FINISH(bl);
}
2573
2574 void pool_stat_t::decode(bufferlist::iterator &bl)
2575 {
2576 DECODE_START_LEGACY_COMPAT_LEN(6, 5, 5, bl);
2577 if (struct_v >= 4) {
2578 ::decode(stats, bl);
2579 ::decode(log_size, bl);
2580 ::decode(ondisk_log_size, bl);
2581 if (struct_v >= 6) {
2582 ::decode(up, bl);
2583 ::decode(acting, bl);
2584 } else {
2585 up = 0;
2586 acting = 0;
2587 }
2588 } else {
2589 ::decode(stats.sum.num_bytes, bl);
2590 uint64_t num_kb;
2591 ::decode(num_kb, bl);
2592 ::decode(stats.sum.num_objects, bl);
2593 ::decode(stats.sum.num_object_clones, bl);
2594 ::decode(stats.sum.num_object_copies, bl);
2595 ::decode(stats.sum.num_objects_missing_on_primary, bl);
2596 ::decode(stats.sum.num_objects_degraded, bl);
2597 ::decode(log_size, bl);
2598 ::decode(ondisk_log_size, bl);
2599 if (struct_v >= 2) {
2600 ::decode(stats.sum.num_rd, bl);
2601 ::decode(stats.sum.num_rd_kb, bl);
2602 ::decode(stats.sum.num_wr, bl);
2603 ::decode(stats.sum.num_wr_kb, bl);
2604 }
2605 if (struct_v >= 3) {
2606 ::decode(stats.sum.num_objects_unfound, bl);
2607 }
2608 }
2609 DECODE_FINISH(bl);
2610 }
2611
2612 void pool_stat_t::generate_test_instances(list<pool_stat_t*>& o)
2613 {
2614 pool_stat_t a;
2615 o.push_back(new pool_stat_t(a));
2616
2617 list<object_stat_collection_t*> l;
2618 object_stat_collection_t::generate_test_instances(l);
2619 a.stats = *l.back();
2620 a.log_size = 123;
2621 a.ondisk_log_size = 456;
2622 a.acting = 3;
2623 a.up = 4;
2624 o.push_back(new pool_stat_t(a));
2625 }
2626
2627
2628 // -- pg_history_t --
2629
/**
 * Serialize this pg_history_t into @p bl.
 *
 * The field order below is the wire format and must stay in step with
 * pg_history_t::decode(), which gates the trailing fields on struct_v
 * (last_epoch_marked_full at 7, the last_interval_* pair at 8,
 * epoch_pool_created at 9).
 */
void pg_history_t::encode(bufferlist &bl) const
{
  ENCODE_START(9, 4, bl);
  ::encode(epoch_created, bl);
  ::encode(last_epoch_started, bl);
  ::encode(last_epoch_clean, bl);
  ::encode(last_epoch_split, bl);
  ::encode(same_interval_since, bl);
  ::encode(same_up_since, bl);
  ::encode(same_primary_since, bl);
  ::encode(last_scrub, bl);
  ::encode(last_scrub_stamp, bl);
  ::encode(last_deep_scrub, bl);
  ::encode(last_deep_scrub_stamp, bl);
  ::encode(last_clean_scrub_stamp, bl);
  ::encode(last_epoch_marked_full, bl);
  ::encode(last_interval_started, bl);
  ::encode(last_interval_clean, bl);
  ::encode(epoch_pool_created, bl);
  ENCODE_FINISH(bl);
}
2651
2652 void pg_history_t::decode(bufferlist::iterator &bl)
2653 {
2654 DECODE_START_LEGACY_COMPAT_LEN(9, 4, 4, bl);
2655 ::decode(epoch_created, bl);
2656 ::decode(last_epoch_started, bl);
2657 if (struct_v >= 3)
2658 ::decode(last_epoch_clean, bl);
2659 else
2660 last_epoch_clean = last_epoch_started; // careful, it's a lie!
2661 ::decode(last_epoch_split, bl);
2662 ::decode(same_interval_since, bl);
2663 ::decode(same_up_since, bl);
2664 ::decode(same_primary_since, bl);
2665 if (struct_v >= 2) {
2666 ::decode(last_scrub, bl);
2667 ::decode(last_scrub_stamp, bl);
2668 }
2669 if (struct_v >= 5) {
2670 ::decode(last_deep_scrub, bl);
2671 ::decode(last_deep_scrub_stamp, bl);
2672 }
2673 if (struct_v >= 6) {
2674 ::decode(last_clean_scrub_stamp, bl);
2675 }
2676 if (struct_v >= 7) {
2677 ::decode(last_epoch_marked_full, bl);
2678 }
2679 if (struct_v >= 8) {
2680 ::decode(last_interval_started, bl);
2681 ::decode(last_interval_clean, bl);
2682 } else {
2683 if (last_epoch_started >= same_interval_since) {
2684 last_interval_started = same_interval_since;
2685 } else {
2686 last_interval_started = last_epoch_started; // best guess
2687 }
2688 if (last_epoch_clean >= same_interval_since) {
2689 last_interval_clean = same_interval_since;
2690 } else {
2691 last_interval_clean = last_epoch_clean; // best guess
2692 }
2693 }
2694 if (struct_v >= 9) {
2695 ::decode(epoch_pool_created, bl);
2696 } else {
2697 epoch_pool_created = epoch_created;
2698 }
2699 DECODE_FINISH(bl);
2700 }
2701
/**
 * Dump this pg_history_t to a Formatter.
 *
 * Epoch-valued members are written with dump_int; the scrub versions and
 * their timestamps are written through dump_stream.
 *
 * @param f formatter that receives the output
 */
void pg_history_t::dump(Formatter *f) const
{
  f->dump_int("epoch_created", epoch_created);
  f->dump_int("epoch_pool_created", epoch_pool_created);
  f->dump_int("last_epoch_started", last_epoch_started);
  f->dump_int("last_interval_started", last_interval_started);
  f->dump_int("last_epoch_clean", last_epoch_clean);
  f->dump_int("last_interval_clean", last_interval_clean);
  f->dump_int("last_epoch_split", last_epoch_split);
  f->dump_int("last_epoch_marked_full", last_epoch_marked_full);
  f->dump_int("same_up_since", same_up_since);
  f->dump_int("same_interval_since", same_interval_since);
  f->dump_int("same_primary_since", same_primary_since);
  f->dump_stream("last_scrub") << last_scrub;
  f->dump_stream("last_scrub_stamp") << last_scrub_stamp;
  f->dump_stream("last_deep_scrub") << last_deep_scrub;
  f->dump_stream("last_deep_scrub_stamp") << last_deep_scrub_stamp;
  f->dump_stream("last_clean_scrub_stamp") << last_clean_scrub_stamp;
}
2721
2722 void pg_history_t::generate_test_instances(list<pg_history_t*>& o)
2723 {
2724 o.push_back(new pg_history_t);
2725 o.push_back(new pg_history_t);
2726 o.back()->epoch_created = 1;
2727 o.back()->epoch_pool_created = 1;
2728 o.back()->last_epoch_started = 2;
2729 o.back()->last_interval_started = 2;
2730 o.back()->last_epoch_clean = 3;
2731 o.back()->last_interval_clean = 2;
2732 o.back()->last_epoch_split = 4;
2733 o.back()->same_up_since = 5;
2734 o.back()->same_interval_since = 6;
2735 o.back()->same_primary_since = 7;
2736 o.back()->last_scrub = eversion_t(8, 9);
2737 o.back()->last_scrub_stamp = utime_t(10, 11);
2738 o.back()->last_deep_scrub = eversion_t(12, 13);
2739 o.back()->last_deep_scrub_stamp = utime_t(14, 15);
2740 o.back()->last_clean_scrub_stamp = utime_t(16, 17);
2741 o.back()->last_epoch_marked_full = 18;
2742 }
2743
2744
2745 // -- pg_info_t --
2746
/**
 * Serialize this pg_info_t into @p bl.
 *
 * last_backfill appears twice in the layout. The first slot receives an
 * empty hobject_t when last_backfill_bitwise is set and last_backfill is
 * not max, and last_backfill itself otherwise; pg_info_t::decode() reads
 * that slot into a throwaway variable. The authoritative last_backfill is
 * written again further down, next to last_backfill_bitwise.
 */
void pg_info_t::encode(bufferlist &bl) const
{
  ENCODE_START(32, 26, bl);
  ::encode(pgid.pgid, bl);
  ::encode(last_update, bl);
  ::encode(last_complete, bl);
  ::encode(log_tail, bl);
  // first (legacy) last_backfill slot -- see function comment
  if (last_backfill_bitwise && !last_backfill.is_max()) {
    ::encode(hobject_t(), bl);
  } else {
    ::encode(last_backfill, bl);
  }
  ::encode(stats, bl);
  history.encode(bl);
  ::encode(purged_snaps, bl);
  ::encode(last_epoch_started, bl);
  ::encode(last_user_version, bl);
  ::encode(hit_set, bl);
  ::encode(pgid.shard, bl);
  ::encode(last_backfill, bl);
  ::encode(last_backfill_bitwise, bl);
  ::encode(last_interval_started, bl);  // read back only when struct_v >= 32
  ENCODE_FINISH(bl);
}
2771
/**
 * Deserialize this pg_info_t from @p bl.
 *
 * Mirrors pg_info_t::encode(). The first last_backfill slot is consumed
 * into a scoped temporary and dropped; the member is filled from the second
 * slot. last_interval_started is read when struct_v >= 32 and otherwise set
 * to last_epoch_started.
 */
void pg_info_t::decode(bufferlist::iterator &bl)
{
  DECODE_START(32, bl);
  ::decode(pgid.pgid, bl);
  ::decode(last_update, bl);
  ::decode(last_complete, bl);
  ::decode(log_tail, bl);
  {
    // legacy slot: must be consumed, value is not used
    hobject_t old_last_backfill;
    ::decode(old_last_backfill, bl);
  }
  ::decode(stats, bl);
  history.decode(bl);
  ::decode(purged_snaps, bl);
  ::decode(last_epoch_started, bl);
  ::decode(last_user_version, bl);
  ::decode(hit_set, bl);
  ::decode(pgid.shard, bl);
  ::decode(last_backfill, bl);
  ::decode(last_backfill_bitwise, bl);
  if (struct_v >= 32) {
    ::decode(last_interval_started, bl);
  } else {
    last_interval_started = last_epoch_started;
  }
  DECODE_FINISH(bl);
}
2799
2800 // -- pg_info_t --
2801
/**
 * Dump this pg_info_t to a Formatter.
 *
 * Output, in order: identifying/version fields, the purged_snaps intervals
 * as an array of {start, length} objects, the nested "history" and "stats"
 * objects, the results of is_empty() / dne() / is_incomplete(),
 * last_epoch_started, and the nested "hit_set_history" object.
 *
 * NOTE(review): last_interval_started is encoded/decoded by this type but
 * is not part of this dump -- confirm whether that is intentional.
 *
 * @param f formatter that receives the output
 */
void pg_info_t::dump(Formatter *f) const
{
  f->dump_stream("pgid") << pgid;
  f->dump_stream("last_update") << last_update;
  f->dump_stream("last_complete") << last_complete;
  f->dump_stream("log_tail") << log_tail;
  f->dump_int("last_user_version", last_user_version);
  f->dump_stream("last_backfill") << last_backfill;
  f->dump_int("last_backfill_bitwise", (int)last_backfill_bitwise);
  f->open_array_section("purged_snaps");
  // one object per interval; start/length come from the iterator itself
  for (interval_set<snapid_t>::const_iterator i=purged_snaps.begin();
       i != purged_snaps.end();
       ++i) {
    f->open_object_section("purged_snap_interval");
    f->dump_stream("start") << i.get_start();
    f->dump_stream("length") << i.get_len();
    f->close_section();
  }
  f->close_section();
  f->open_object_section("history");
  history.dump(f);
  f->close_section();
  f->open_object_section("stats");
  stats.dump(f);
  f->close_section();

  f->dump_int("empty", is_empty());
  f->dump_int("dne", dne());
  f->dump_int("incomplete", is_incomplete());
  f->dump_int("last_epoch_started", last_epoch_started);

  f->open_object_section("hit_set_history");
  hit_set.dump(f);
  f->close_section();
}
2837
2838 void pg_info_t::generate_test_instances(list<pg_info_t*>& o)
2839 {
2840 o.push_back(new pg_info_t);
2841 o.push_back(new pg_info_t);
2842 list<pg_history_t*> h;
2843 pg_history_t::generate_test_instances(h);
2844 o.back()->history = *h.back();
2845 o.back()->pgid = spg_t(pg_t(1, 2, -1), shard_id_t::NO_SHARD);
2846 o.back()->last_update = eversion_t(3, 4);
2847 o.back()->last_complete = eversion_t(5, 6);
2848 o.back()->last_user_version = 2;
2849 o.back()->log_tail = eversion_t(7, 8);
2850 o.back()->last_backfill = hobject_t(object_t("objname"), "key", 123, 456, -1, "");
2851 o.back()->last_backfill_bitwise = true;
2852 {
2853 list<pg_stat_t*> s;
2854 pg_stat_t::generate_test_instances(s);
2855 o.back()->stats = *s.back();
2856 }
2857 {
2858 list<pg_hit_set_history_t*> s;
2859 pg_hit_set_history_t::generate_test_instances(s);
2860 o.back()->hit_set = *s.back();
2861 }
2862 }
2863
2864 // -- pg_notify_t --
/**
 * Serialize this pg_notify_t into @p bl. The field order is the wire
 * format and matches pg_notify_t::decode(); note that `to` is written
 * before `from`.
 */
void pg_notify_t::encode(bufferlist &bl) const
{
  ENCODE_START(2, 2, bl);
  ::encode(query_epoch, bl);
  ::encode(epoch_sent, bl);
  ::encode(info, bl);
  ::encode(to, bl);
  ::encode(from, bl);
  ENCODE_FINISH(bl);
}
2875
/**
 * Deserialize this pg_notify_t from @p bl, reading the fields in the same
 * order pg_notify_t::encode() writes them (`to` before `from`).
 */
void pg_notify_t::decode(bufferlist::iterator &bl)
{
  DECODE_START(2, bl);
  ::decode(query_epoch, bl);
  ::decode(epoch_sent, bl);
  ::decode(info, bl);
  ::decode(to, bl);
  ::decode(from, bl);
  DECODE_FINISH(bl);
}
2886
/**
 * Dump this pg_notify_t to a Formatter: from, to, the two epochs, and the
 * embedded pg_info_t inside an object section named "info".
 *
 * @param f formatter that receives the output
 */
void pg_notify_t::dump(Formatter *f) const
{
  f->dump_int("from", from);
  f->dump_int("to", to);
  f->dump_unsigned("query_epoch", query_epoch);
  f->dump_unsigned("epoch_sent", epoch_sent);
  {
    f->open_object_section("info");
    info.dump(f);
    f->close_section();
  }
}
2899
/**
 * Append two sample pg_notify_t instances to @p o, both built around a
 * default pg_info_t: one whose second shard argument is
 * shard_id_t::NO_SHARD, and one with both shard arguments set to 0.
 * Ownership of both objects passes to the caller.
 */
void pg_notify_t::generate_test_instances(list<pg_notify_t*>& o)
{
  // NOTE(review): the meaning of the positional arguments depends on the
  // pg_notify_t constructor, which is declared outside this file section.
  o.push_back(new pg_notify_t(shard_id_t(3), shard_id_t::NO_SHARD, 1, 1, pg_info_t()));
  o.push_back(new pg_notify_t(shard_id_t(0), shard_id_t(0), 3, 10, pg_info_t()));
}
2905
2906 ostream &operator<<(ostream &lhs, const pg_notify_t &notify)
2907 {
2908 lhs << "(query:" << notify.query_epoch
2909 << " sent:" << notify.epoch_sent
2910 << " " << notify.info;
2911 if (notify.from != shard_id_t::NO_SHARD ||
2912 notify.to != shard_id_t::NO_SHARD)
2913 lhs << " " << (unsigned)notify.from
2914 << "->" << (unsigned)notify.to;
2915 return lhs << ")";
2916 }
2917
2918 // -- pg_interval_t --
2919
/**
 * Serialize this interval into @p bl. The field order is the wire format
 * and matches pg_interval_t::decode(), which reads `primary` only when
 * struct_v >= 3 and `up_primary` only when struct_v >= 4.
 */
void PastIntervals::pg_interval_t::encode(bufferlist& bl) const
{
  ENCODE_START(4, 2, bl);
  ::encode(first, bl);
  ::encode(last, bl);
  ::encode(up, bl);
  ::encode(acting, bl);
  ::encode(maybe_went_rw, bl);
  ::encode(primary, bl);
  ::encode(up_primary, bl);
  ENCODE_FINISH(bl);
}
2932
2933 void PastIntervals::pg_interval_t::decode(bufferlist::iterator& bl)
2934 {
2935 DECODE_START_LEGACY_COMPAT_LEN(4, 2, 2, bl);
2936 ::decode(first, bl);
2937 ::decode(last, bl);
2938 ::decode(up, bl);
2939 ::decode(acting, bl);
2940 ::decode(maybe_went_rw, bl);
2941 if (struct_v >= 3) {
2942 ::decode(primary, bl);
2943 } else {
2944 if (acting.size())
2945 primary = acting[0];
2946 }
2947 if (struct_v >= 4) {
2948 ::decode(up_primary, bl);
2949 } else {
2950 if (up.size())
2951 up_primary = up[0];
2952 }
2953 DECODE_FINISH(bl);
2954 }
2955
2956 void PastIntervals::pg_interval_t::dump(Formatter *f) const
2957 {
2958 f->dump_unsigned("first", first);
2959 f->dump_unsigned("last", last);
2960 f->dump_int("maybe_went_rw", maybe_went_rw ? 1 : 0);
2961 f->open_array_section("up");
2962 for (vector<int>::const_iterator p = up.begin(); p != up.end(); ++p)
2963 f->dump_int("osd", *p);
2964 f->close_section();
2965 f->open_array_section("acting");
2966 for (vector<int>::const_iterator p = acting.begin(); p != acting.end(); ++p)
2967 f->dump_int("osd", *p);
2968 f->close_section();
2969 f->dump_int("primary", primary);
2970 f->dump_int("up_primary", up_primary);
2971 }
2972
2973 void PastIntervals::pg_interval_t::generate_test_instances(list<pg_interval_t*>& o)
2974 {
2975 o.push_back(new pg_interval_t);
2976 o.push_back(new pg_interval_t);
2977 o.back()->up.push_back(1);
2978 o.back()->acting.push_back(2);
2979 o.back()->acting.push_back(3);
2980 o.back()->first = 4;
2981 o.back()->last = 5;
2982 o.back()->maybe_went_rw = true;
2983 }
2984
2985 WRITE_CLASS_ENCODER(PastIntervals::pg_interval_t)
2986
2987 class pi_simple_rep : public PastIntervals::interval_rep {
2988 map<epoch_t, PastIntervals::pg_interval_t> interval_map;
2989
2990 pi_simple_rep(
2991 bool ec_pool,
2992 std::list<PastIntervals::pg_interval_t> &&intervals) {
2993 for (auto &&i: intervals)
2994 add_interval(ec_pool, i);
2995 }
2996
2997 public:
2998 pi_simple_rep() = default;
2999 pi_simple_rep(const pi_simple_rep &) = default;
3000 pi_simple_rep(pi_simple_rep &&) = default;
3001 pi_simple_rep &operator=(pi_simple_rep &&) = default;
3002 pi_simple_rep &operator=(const pi_simple_rep &) = default;
3003
3004 size_t size() const override { return interval_map.size(); }
3005 bool empty() const override { return interval_map.empty(); }
3006 void clear() override { interval_map.clear(); }
3007 pair<epoch_t, epoch_t> get_bounds() const override {
3008 auto iter = interval_map.begin();
3009 if (iter != interval_map.end()) {
3010 auto riter = interval_map.rbegin();
3011 return make_pair(
3012 iter->second.first,
3013 riter->second.last + 1);
3014 } else {
3015 return make_pair(0, 0);
3016 }
3017 }
3018 set<pg_shard_t> get_all_participants(
3019 bool ec_pool) const override {
3020 set<pg_shard_t> all_participants;
3021
3022 // We need to decide who might have unfound objects that we need
3023 auto p = interval_map.rbegin();
3024 auto end = interval_map.rend();
3025 for (; p != end; ++p) {
3026 const PastIntervals::pg_interval_t &interval(p->second);
3027 // If nothing changed, we don't care about this interval.
3028 if (!interval.maybe_went_rw)
3029 continue;
3030
3031 int i = 0;
3032 std::vector<int>::const_iterator a = interval.acting.begin();
3033 std::vector<int>::const_iterator a_end = interval.acting.end();
3034 for (; a != a_end; ++a, ++i) {
3035 pg_shard_t shard(*a, ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD);
3036 if (*a != CRUSH_ITEM_NONE)
3037 all_participants.insert(shard);
3038 }
3039 }
3040 return all_participants;
3041 }
3042 void add_interval(
3043 bool ec_pool,
3044 const PastIntervals::pg_interval_t &interval) override {
3045 interval_map[interval.first] = interval;
3046 }
3047 unique_ptr<PastIntervals::interval_rep> clone() const override {
3048 return unique_ptr<PastIntervals::interval_rep>(new pi_simple_rep(*this));
3049 }
3050 ostream &print(ostream &out) const override {
3051 return out << interval_map;
3052 }
3053 void encode(bufferlist &bl) const override {
3054 ::encode(interval_map, bl);
3055 }
3056 void decode(bufferlist::iterator &bl) override {
3057 ::decode(interval_map, bl);
3058 }
3059 void dump(Formatter *f) const override {
3060 f->open_array_section("PastIntervals::compat_rep");
3061 for (auto &&i: interval_map) {
3062 f->open_object_section("pg_interval_t");
3063 f->dump_int("epoch", i.first);
3064 f->open_object_section("interval");
3065 i.second.dump(f);
3066 f->close_section();
3067 f->close_section();
3068 }
3069 f->close_section();
3070 }
3071 bool is_classic() const override {
3072 return true;
3073 }
3074 static void generate_test_instances(list<pi_simple_rep*> &o) {
3075 using ival = PastIntervals::pg_interval_t;
3076 using ivallst = std::list<ival>;
3077 o.push_back(
3078 new pi_simple_rep(
3079 true, ivallst
3080 { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
3081 , ival{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1}
3082 , ival{{ 2}, { 2}, 31, 35, false, 2, 2}
3083 , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
3084 }));
3085 o.push_back(
3086 new pi_simple_rep(
3087 false, ivallst
3088 { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
3089 , ival{{ 1, 2}, { 1, 2}, 20, 30, true, 1, 1}
3090 , ival{{ 2}, { 2}, 31, 35, false, 2, 2}
3091 , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
3092 }));
3093 o.push_back(
3094 new pi_simple_rep(
3095 true, ivallst
3096 { ival{{2, 1, 0}, {2, 1, 0}, 10, 20, true, 1, 1}
3097 , ival{{ 0, 2}, { 0, 2}, 21, 30, true, 0, 0}
3098 , ival{{ 0, 2}, {2, 0}, 31, 35, true, 2, 2}
3099 , ival{{ 0, 2}, { 0, 2}, 36, 50, true, 0, 0}
3100 }));
3101 return;
3102 }
3103 void iterate_mayberw_back_to(
3104 bool ec_pool,
3105 epoch_t les,
3106 std::function<void(epoch_t, const set<pg_shard_t> &)> &&f) const override {
3107 for (auto i = interval_map.rbegin(); i != interval_map.rend(); ++i) {
3108 if (!i->second.maybe_went_rw)
3109 continue;
3110 if (i->second.last < les)
3111 break;
3112 set<pg_shard_t> actingset;
3113 for (unsigned j = 0; j < i->second.acting.size(); ++j) {
3114 if (i->second.acting[j] == CRUSH_ITEM_NONE)
3115 continue;
3116 actingset.insert(
3117 pg_shard_t(
3118 i->second.acting[j],
3119 ec_pool ? shard_id_t(j) : shard_id_t::NO_SHARD));
3120 }
3121 f(i->second.first, actingset);
3122 }
3123 }
3124
3125 bool has_full_intervals() const override { return true; }
3126 void iterate_all_intervals(
3127 std::function<void(const PastIntervals::pg_interval_t &)> &&f
3128 ) const override {
3129 for (auto &&i: interval_map) {
3130 f(i.second);
3131 }
3132 }
3133 virtual ~pi_simple_rep() override {}
3134 };
3135
/**
 * pi_compact_rep
 *
 * PastIntervals only needs to be able to answer two questions:
 * 1) Where should the primary look for unfound objects?
 * 2) List a set of subsets of the OSDs such that contacting at least
 *    one from each subset guarantees we speak to at least one witness
 *    of any completed write.
 *
 * Crucially, 2) does not require keeping *all* past intervals.  Certainly,
 * we don't need to keep any where maybe_went_rw would be false.  We also
 * needn't keep two intervals where the acting set in one is a subset
 * of the other (only need to keep the smaller of the two sets).  In order
 * to accurately trim the set of intervals as last_epoch_started changes
 * without rebuilding the set from scratch, we'll retain the larger set
 * if it is in an older interval.
 */
3153 struct compact_interval_t {
3154 epoch_t first;
3155 epoch_t last;
3156 set<pg_shard_t> acting;
3157 bool supersedes(const compact_interval_t &other) {
3158 for (auto &&i: acting) {
3159 if (!other.acting.count(i))
3160 return false;
3161 }
3162 return true;
3163 }
3164 void dump(Formatter *f) const {
3165 f->open_object_section("compact_interval_t");
3166 f->dump_stream("first") << first;
3167 f->dump_stream("last") << last;
3168 f->dump_stream("acting") << acting;
3169 f->close_section();
3170 }
3171 void encode(bufferlist &bl) const {
3172 ENCODE_START(1, 1, bl);
3173 ::encode(first, bl);
3174 ::encode(last, bl);
3175 ::encode(acting, bl);
3176 ENCODE_FINISH(bl);
3177 }
3178 void decode(bufferlist::iterator &bl) {
3179 DECODE_START(1, bl);
3180 ::decode(first, bl);
3181 ::decode(last, bl);
3182 ::decode(acting, bl);
3183 DECODE_FINISH(bl);
3184 }
3185 static void generate_test_instances(list<compact_interval_t*> & o) {
3186 /* Not going to be used, we'll generate pi_compact_rep directly */
3187 }
3188 };
3189 ostream &operator<<(ostream &o, const compact_interval_t &rhs)
3190 {
3191 return o << "([" << rhs.first << "," << rhs.last
3192 << "] acting " << rhs.acting << ")";
3193 }
3194 WRITE_CLASS_ENCODER(compact_interval_t)
3195
3196 class pi_compact_rep : public PastIntervals::interval_rep {
3197 epoch_t first = 0;
3198 epoch_t last = 0; // inclusive
3199 set<pg_shard_t> all_participants;
3200 list<compact_interval_t> intervals;
3201 pi_compact_rep(
3202 bool ec_pool,
3203 std::list<PastIntervals::pg_interval_t> &&intervals) {
3204 for (auto &&i: intervals)
3205 add_interval(ec_pool, i);
3206 }
3207 public:
3208 pi_compact_rep() = default;
3209 pi_compact_rep(const pi_compact_rep &) = default;
3210 pi_compact_rep(pi_compact_rep &&) = default;
3211 pi_compact_rep &operator=(const pi_compact_rep &) = default;
3212 pi_compact_rep &operator=(pi_compact_rep &&) = default;
3213
3214 size_t size() const override { return intervals.size(); }
3215 bool empty() const override {
3216 return first > last || (first == 0 && last == 0);
3217 }
3218 void clear() override {
3219 *this = pi_compact_rep();
3220 }
3221 pair<epoch_t, epoch_t> get_bounds() const override {
3222 return make_pair(first, last + 1);
3223 }
3224 set<pg_shard_t> get_all_participants(
3225 bool ec_pool) const override {
3226 return all_participants;
3227 }
3228 void add_interval(
3229 bool ec_pool, const PastIntervals::pg_interval_t &interval) override {
3230 if (first == 0)
3231 first = interval.first;
3232 assert(interval.last > last);
3233 last = interval.last;
3234 set<pg_shard_t> acting;
3235 for (unsigned i = 0; i < interval.acting.size(); ++i) {
3236 if (interval.acting[i] == CRUSH_ITEM_NONE)
3237 continue;
3238 acting.insert(
3239 pg_shard_t(
3240 interval.acting[i],
3241 ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
3242 }
3243 all_participants.insert(acting.begin(), acting.end());
3244 if (!interval.maybe_went_rw)
3245 return;
3246 intervals.push_back(
3247 compact_interval_t{interval.first, interval.last, acting});
3248 auto plast = intervals.end();
3249 --plast;
3250 for (auto cur = intervals.begin(); cur != plast; ) {
3251 if (plast->supersedes(*cur)) {
3252 intervals.erase(cur++);
3253 } else {
3254 ++cur;
3255 }
3256 }
3257 }
3258 unique_ptr<PastIntervals::interval_rep> clone() const override {
3259 return unique_ptr<PastIntervals::interval_rep>(new pi_compact_rep(*this));
3260 }
3261 ostream &print(ostream &out) const override {
3262 return out << "([" << first << "," << last
3263 << "] intervals=" << intervals << ")";
3264 }
3265 void encode(bufferlist &bl) const override {
3266 ENCODE_START(1, 1, bl);
3267 ::encode(first, bl);
3268 ::encode(last, bl);
3269 ::encode(all_participants, bl);
3270 ::encode(intervals, bl);
3271 ENCODE_FINISH(bl);
3272 }
3273 void decode(bufferlist::iterator &bl) override {
3274 DECODE_START(1, bl);
3275 ::decode(first, bl);
3276 ::decode(last, bl);
3277 ::decode(all_participants, bl);
3278 ::decode(intervals, bl);
3279 DECODE_FINISH(bl);
3280 }
3281 void dump(Formatter *f) const override {
3282 f->open_object_section("PastIntervals::compact_rep");
3283 f->dump_stream("first") << first;
3284 f->dump_stream("last") << last;
3285 f->open_array_section("all_participants");
3286 for (auto& i : all_participants) {
3287 f->dump_object("pg_shard", i);
3288 }
3289 f->close_section();
3290 f->open_array_section("intervals");
3291 for (auto &&i: intervals) {
3292 i.dump(f);
3293 }
3294 f->close_section();
3295 f->close_section();
3296 }
3297 bool is_classic() const override {
3298 return false;
3299 }
3300 static void generate_test_instances(list<pi_compact_rep*> &o) {
3301 using ival = PastIntervals::pg_interval_t;
3302 using ivallst = std::list<ival>;
3303 o.push_back(
3304 new pi_compact_rep(
3305 true, ivallst
3306 { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
3307 , ival{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1}
3308 , ival{{ 2}, { 2}, 31, 35, false, 2, 2}
3309 , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
3310 }));
3311 o.push_back(
3312 new pi_compact_rep(
3313 false, ivallst
3314 { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
3315 , ival{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1}
3316 , ival{{ 2}, { 2}, 31, 35, false, 2, 2}
3317 , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
3318 }));
3319 o.push_back(
3320 new pi_compact_rep(
3321 true, ivallst
3322 { ival{{2, 1, 0}, {2, 1, 0}, 10, 20, true, 1, 1}
3323 , ival{{ 0, 2}, { 0, 2}, 21, 30, true, 0, 0}
3324 , ival{{ 0, 2}, {2, 0}, 31, 35, true, 2, 2}
3325 , ival{{ 0, 2}, { 0, 2}, 36, 50, true, 0, 0}
3326 }));
3327 }
3328 void iterate_mayberw_back_to(
3329 bool ec_pool,
3330 epoch_t les,
3331 std::function<void(epoch_t, const set<pg_shard_t> &)> &&f) const override {
3332 for (auto i = intervals.rbegin(); i != intervals.rend(); ++i) {
3333 if (i->last < les)
3334 break;
3335 f(i->first, i->acting);
3336 }
3337 }
3338 virtual ~pi_compact_rep() override {}
3339 };
3340 WRITE_CLASS_ENCODER(pi_compact_rep)
3341
3342 PastIntervals::PastIntervals(const PastIntervals &rhs)
3343 : past_intervals(rhs.past_intervals ?
3344 rhs.past_intervals->clone() :
3345 nullptr) {}
3346
3347 PastIntervals &PastIntervals::operator=(const PastIntervals &rhs)
3348 {
3349 PastIntervals other(rhs);
3350 swap(other);
3351 return *this;
3352 }
3353
3354 ostream& operator<<(ostream& out, const PastIntervals &i)
3355 {
3356 if (i.past_intervals) {
3357 return i.past_intervals->print(out);
3358 } else {
3359 return out << "(empty)";
3360 }
3361 }
3362
3363 ostream& operator<<(ostream& out, const PastIntervals::PriorSet &i)
3364 {
3365 return out << "PriorSet("
3366 << "ec_pool: " << i.ec_pool
3367 << ", probe: " << i.probe
3368 << ", down: " << i.down
3369 << ", blocked_by: " << i.blocked_by
3370 << ", pg_down: " << i.pg_down
3371 << ")";
3372 }
3373
3374 void PastIntervals::decode(bufferlist::iterator &bl)
3375 {
3376 DECODE_START(1, bl);
3377 __u8 type = 0;
3378 ::decode(type, bl);
3379 switch (type) {
3380 case 0:
3381 break;
3382 case 1:
3383 past_intervals.reset(new pi_simple_rep);
3384 past_intervals->decode(bl);
3385 break;
3386 case 2:
3387 past_intervals.reset(new pi_compact_rep);
3388 past_intervals->decode(bl);
3389 break;
3390 }
3391 DECODE_FINISH(bl);
3392 }
3393
3394 void PastIntervals::decode_classic(bufferlist::iterator &bl)
3395 {
3396 past_intervals.reset(new pi_simple_rep);
3397 past_intervals->decode(bl);
3398 }
3399
3400 void PastIntervals::generate_test_instances(list<PastIntervals*> &o)
3401 {
3402 {
3403 list<pi_simple_rep *> simple;
3404 pi_simple_rep::generate_test_instances(simple);
3405 for (auto &&i: simple) {
3406 // takes ownership of contents
3407 o.push_back(new PastIntervals(i));
3408 }
3409 }
3410 {
3411 list<pi_compact_rep *> compact;
3412 pi_compact_rep::generate_test_instances(compact);
3413 for (auto &&i: compact) {
3414 // takes ownership of contents
3415 o.push_back(new PastIntervals(i));
3416 }
3417 }
3418 return;
3419 }
3420
3421 void PastIntervals::update_type(bool ec_pool, bool compact)
3422 {
3423 if (!compact) {
3424 if (!past_intervals) {
3425 past_intervals.reset(new pi_simple_rep);
3426 } else {
3427 // we never convert from compact back to classic
3428 assert(is_classic());
3429 }
3430 } else {
3431 if (!past_intervals) {
3432 past_intervals.reset(new pi_compact_rep);
3433 } else if (is_classic()) {
3434 auto old = std::move(past_intervals);
3435 past_intervals.reset(new pi_compact_rep);
3436 assert(old->has_full_intervals());
3437 old->iterate_all_intervals([&](const pg_interval_t &i) {
3438 past_intervals->add_interval(ec_pool, i);
3439 });
3440 }
3441 }
3442 }
3443
3444 void PastIntervals::update_type_from_map(bool ec_pool, const OSDMap &osdmap)
3445 {
3446 update_type(ec_pool, osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS);
3447 }
3448
3449 bool PastIntervals::is_new_interval(
3450 int old_acting_primary,
3451 int new_acting_primary,
3452 const vector<int> &old_acting,
3453 const vector<int> &new_acting,
3454 int old_up_primary,
3455 int new_up_primary,
3456 const vector<int> &old_up,
3457 const vector<int> &new_up,
3458 int old_size,
3459 int new_size,
3460 int old_min_size,
3461 int new_min_size,
3462 unsigned old_pg_num,
3463 unsigned new_pg_num,
3464 bool old_sort_bitwise,
3465 bool new_sort_bitwise,
3466 bool old_recovery_deletes,
3467 bool new_recovery_deletes,
3468 pg_t pgid) {
3469 return old_acting_primary != new_acting_primary ||
3470 new_acting != old_acting ||
3471 old_up_primary != new_up_primary ||
3472 new_up != old_up ||
3473 old_min_size != new_min_size ||
3474 old_size != new_size ||
3475 pgid.is_split(old_pg_num, new_pg_num, 0) ||
3476 old_sort_bitwise != new_sort_bitwise ||
3477 old_recovery_deletes != new_recovery_deletes;
3478 }
3479
3480 bool PastIntervals::is_new_interval(
3481 int old_acting_primary,
3482 int new_acting_primary,
3483 const vector<int> &old_acting,
3484 const vector<int> &new_acting,
3485 int old_up_primary,
3486 int new_up_primary,
3487 const vector<int> &old_up,
3488 const vector<int> &new_up,
3489 OSDMapRef osdmap,
3490 OSDMapRef lastmap,
3491 pg_t pgid) {
3492 return !(lastmap->get_pools().count(pgid.pool())) ||
3493 is_new_interval(old_acting_primary,
3494 new_acting_primary,
3495 old_acting,
3496 new_acting,
3497 old_up_primary,
3498 new_up_primary,
3499 old_up,
3500 new_up,
3501 lastmap->get_pools().find(pgid.pool())->second.size,
3502 osdmap->get_pools().find(pgid.pool())->second.size,
3503 lastmap->get_pools().find(pgid.pool())->second.min_size,
3504 osdmap->get_pools().find(pgid.pool())->second.min_size,
3505 lastmap->get_pg_num(pgid.pool()),
3506 osdmap->get_pg_num(pgid.pool()),
3507 lastmap->test_flag(CEPH_OSDMAP_SORTBITWISE),
3508 osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE),
3509 lastmap->test_flag(CEPH_OSDMAP_RECOVERY_DELETES),
3510 osdmap->test_flag(CEPH_OSDMAP_RECOVERY_DELETES),
3511 pgid);
3512 }
3513
3514 bool PastIntervals::check_new_interval(
3515 int old_acting_primary,
3516 int new_acting_primary,
3517 const vector<int> &old_acting,
3518 const vector<int> &new_acting,
3519 int old_up_primary,
3520 int new_up_primary,
3521 const vector<int> &old_up,
3522 const vector<int> &new_up,
3523 epoch_t same_interval_since,
3524 epoch_t last_epoch_clean,
3525 OSDMapRef osdmap,
3526 OSDMapRef lastmap,
3527 pg_t pgid,
3528 IsPGRecoverablePredicate *could_have_gone_active,
3529 PastIntervals *past_intervals,
3530 std::ostream *out)
3531 {
3532 /*
3533 * We have to be careful to gracefully deal with situations like
3534 * so. Say we have a power outage or something that takes out both
3535 * OSDs, but the monitor doesn't mark them down in the same epoch.
3536 * The history may look like
3537 *
3538 * 1: A B
3539 * 2: B
3540 * 3: let's say B dies for good, too (say, from the power spike)
3541 * 4: A
3542 *
3543 * which makes it look like B may have applied updates to the PG
3544 * that we need in order to proceed. This sucks...
3545 *
3546 * To minimize the risk of this happening, we CANNOT go active if
3547 * _any_ OSDs in the prior set are down until we send an MOSDAlive
3548 * to the monitor such that the OSDMap sets osd_up_thru to an epoch.
3549 * Then, we have something like
3550 *
3551 * 1: A B
3552 * 2: B up_thru[B]=0
3553 * 3:
3554 * 4: A
3555 *
3556 * -> we can ignore B, bc it couldn't have gone active (up_thru still 0).
3557 *
3558 * or,
3559 *
3560 * 1: A B
3561 * 2: B up_thru[B]=0
3562 * 3: B up_thru[B]=2
3563 * 4:
3564 * 5: A
3565 *
3566 * -> we must wait for B, bc it was alive through 2, and could have
3567 * written to the pg.
3568 *
3569 * If B is really dead, then an administrator will need to manually
3570 * intervene by marking the OSD as "lost."
3571 */
3572
3573 // remember past interval
3574 // NOTE: a change in the up set primary triggers an interval
3575 // change, even though the interval members in the pg_interval_t
3576 // do not change.
3577 assert(past_intervals);
3578 assert(past_intervals->past_intervals);
3579 if (is_new_interval(
3580 old_acting_primary,
3581 new_acting_primary,
3582 old_acting,
3583 new_acting,
3584 old_up_primary,
3585 new_up_primary,
3586 old_up,
3587 new_up,
3588 osdmap,
3589 lastmap,
3590 pgid)) {
3591 pg_interval_t i;
3592 i.first = same_interval_since;
3593 i.last = osdmap->get_epoch() - 1;
3594 assert(i.first <= i.last);
3595 i.acting = old_acting;
3596 i.up = old_up;
3597 i.primary = old_acting_primary;
3598 i.up_primary = old_up_primary;
3599
3600 unsigned num_acting = 0;
3601 for (vector<int>::const_iterator p = i.acting.begin(); p != i.acting.end();
3602 ++p)
3603 if (*p != CRUSH_ITEM_NONE)
3604 ++num_acting;
3605
3606 assert(lastmap->get_pools().count(pgid.pool()));
3607 const pg_pool_t& old_pg_pool = lastmap->get_pools().find(pgid.pool())->second;
3608 set<pg_shard_t> old_acting_shards;
3609 old_pg_pool.convert_to_pg_shards(old_acting, &old_acting_shards);
3610
3611 if (num_acting &&
3612 i.primary != -1 &&
3613 num_acting >= old_pg_pool.min_size &&
3614 (*could_have_gone_active)(old_acting_shards)) {
3615 if (out)
3616 *out << __func__ << " " << i
3617 << ": not rw,"
3618 << " up_thru " << lastmap->get_up_thru(i.primary)
3619 << " up_from " << lastmap->get_up_from(i.primary)
3620 << " last_epoch_clean " << last_epoch_clean
3621 << std::endl;
3622 if (lastmap->get_up_thru(i.primary) >= i.first &&
3623 lastmap->get_up_from(i.primary) <= i.first) {
3624 i.maybe_went_rw = true;
3625 if (out)
3626 *out << __func__ << " " << i
3627 << " : primary up " << lastmap->get_up_from(i.primary)
3628 << "-" << lastmap->get_up_thru(i.primary)
3629 << " includes interval"
3630 << std::endl;
3631 } else if (last_epoch_clean >= i.first &&
3632 last_epoch_clean <= i.last) {
3633 // If the last_epoch_clean is included in this interval, then
3634 // the pg must have been rw (for recovery to have completed).
3635 // This is important because we won't know the _real_
3636 // first_epoch because we stop at last_epoch_clean, and we
3637 // don't want the oldest interval to randomly have
3638 // maybe_went_rw false depending on the relative up_thru vs
3639 // last_epoch_clean timing.
3640 i.maybe_went_rw = true;
3641 if (out)
3642 *out << __func__ << " " << i
3643 << " : includes last_epoch_clean " << last_epoch_clean
3644 << " and presumed to have been rw"
3645 << std::endl;
3646 } else {
3647 i.maybe_went_rw = false;
3648 if (out)
3649 *out << __func__ << " " << i
3650 << " : primary up " << lastmap->get_up_from(i.primary)
3651 << "-" << lastmap->get_up_thru(i.primary)
3652 << " does not include interval"
3653 << std::endl;
3654 }
3655 } else {
3656 i.maybe_went_rw = false;
3657 if (out)
3658 *out << __func__ << " " << i << " : acting set is too small" << std::endl;
3659 }
3660 past_intervals->past_intervals->add_interval(old_pg_pool.ec_pool(), i);
3661 return true;
3662 } else {
3663 return false;
3664 }
3665 }
3666
3667
3668 // true if the given map affects the prior set
3669 bool PastIntervals::PriorSet::affected_by_map(
3670 const OSDMap &osdmap,
3671 const DoutPrefixProvider *dpp) const
3672 {
3673 for (set<pg_shard_t>::iterator p = probe.begin();
3674 p != probe.end();
3675 ++p) {
3676 int o = p->osd;
3677
3678 // did someone in the prior set go down?
3679 if (osdmap.is_down(o) && down.count(o) == 0) {
3680 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " now down" << dendl;
3681 return true;
3682 }
3683
3684 // did a down osd in cur get (re)marked as lost?
3685 map<int, epoch_t>::const_iterator r = blocked_by.find(o);
3686 if (r != blocked_by.end()) {
3687 if (!osdmap.exists(o)) {
3688 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " no longer exists" << dendl;
3689 return true;
3690 }
3691 if (osdmap.get_info(o).lost_at != r->second) {
3692 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " (re)marked as lost" << dendl;
3693 return true;
3694 }
3695 }
3696 }
3697
3698 // did someone in the prior down set go up?
3699 for (set<int>::const_iterator p = down.begin();
3700 p != down.end();
3701 ++p) {
3702 int o = *p;
3703
3704 if (osdmap.is_up(o)) {
3705 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " now up" << dendl;
3706 return true;
3707 }
3708
3709 // did someone in the prior set get lost or destroyed?
3710 if (!osdmap.exists(o)) {
3711 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " no longer exists" << dendl;
3712 return true;
3713 }
3714 // did a down osd in down get (re)marked as lost?
3715 map<int, epoch_t>::const_iterator r = blocked_by.find(o);
3716 if (r != blocked_by.end()) {
3717 if (osdmap.get_info(o).lost_at != r->second) {
3718 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " (re)marked as lost" << dendl;
3719 return true;
3720 }
3721 }
3722 }
3723
3724 return false;
3725 }
3726
3727 ostream& operator<<(ostream& out, const PastIntervals::pg_interval_t& i)
3728 {
3729 out << "interval(" << i.first << "-" << i.last
3730 << " up " << i.up << "(" << i.up_primary << ")"
3731 << " acting " << i.acting << "(" << i.primary << ")";
3732 if (i.maybe_went_rw)
3733 out << " maybe_went_rw";
3734 out << ")";
3735 return out;
3736 }
3737
3738
3739
3740 // -- pg_query_t --
3741
3742 void pg_query_t::encode(bufferlist &bl, uint64_t features) const {
3743 ENCODE_START(3, 3, bl);
3744 ::encode(type, bl);
3745 ::encode(since, bl);
3746 history.encode(bl);
3747 ::encode(epoch_sent, bl);
3748 ::encode(to, bl);
3749 ::encode(from, bl);
3750 ENCODE_FINISH(bl);
3751 }
3752
3753 void pg_query_t::decode(bufferlist::iterator &bl) {
3754 DECODE_START(3, bl);
3755 ::decode(type, bl);
3756 ::decode(since, bl);
3757 history.decode(bl);
3758 ::decode(epoch_sent, bl);
3759 ::decode(to, bl);
3760 ::decode(from, bl);
3761 DECODE_FINISH(bl);
3762 }
3763
3764 void pg_query_t::dump(Formatter *f) const
3765 {
3766 f->dump_int("from", from);
3767 f->dump_int("to", to);
3768 f->dump_string("type", get_type_name());
3769 f->dump_stream("since") << since;
3770 f->dump_stream("epoch_sent") << epoch_sent;
3771 f->open_object_section("history");
3772 history.dump(f);
3773 f->close_section();
3774 }
3775 void pg_query_t::generate_test_instances(list<pg_query_t*>& o)
3776 {
3777 o.push_back(new pg_query_t());
3778 list<pg_history_t*> h;
3779 pg_history_t::generate_test_instances(h);
3780 o.push_back(new pg_query_t(pg_query_t::INFO, shard_id_t(1), shard_id_t(2), *h.back(), 4));
3781 o.push_back(new pg_query_t(pg_query_t::MISSING, shard_id_t(2), shard_id_t(3), *h.back(), 4));
3782 o.push_back(new pg_query_t(pg_query_t::LOG, shard_id_t(0), shard_id_t(0),
3783 eversion_t(4, 5), *h.back(), 4));
3784 o.push_back(new pg_query_t(pg_query_t::FULLLOG,
3785 shard_id_t::NO_SHARD, shard_id_t::NO_SHARD,
3786 *h.back(), 5));
3787 }
3788
3789 // -- ObjectModDesc --
3790 void ObjectModDesc::visit(Visitor *visitor) const
3791 {
3792 bufferlist::iterator bp = bl.begin();
3793 try {
3794 while (!bp.end()) {
3795 DECODE_START(max_required_version, bp);
3796 uint8_t code;
3797 ::decode(code, bp);
3798 switch (code) {
3799 case APPEND: {
3800 uint64_t size;
3801 ::decode(size, bp);
3802 visitor->append(size);
3803 break;
3804 }
3805 case SETATTRS: {
3806 map<string, boost::optional<bufferlist> > attrs;
3807 ::decode(attrs, bp);
3808 visitor->setattrs(attrs);
3809 break;
3810 }
3811 case DELETE: {
3812 version_t old_version;
3813 ::decode(old_version, bp);
3814 visitor->rmobject(old_version);
3815 break;
3816 }
3817 case CREATE: {
3818 visitor->create();
3819 break;
3820 }
3821 case UPDATE_SNAPS: {
3822 set<snapid_t> snaps;
3823 ::decode(snaps, bp);
3824 visitor->update_snaps(snaps);
3825 break;
3826 }
3827 case TRY_DELETE: {
3828 version_t old_version;
3829 ::decode(old_version, bp);
3830 visitor->try_rmobject(old_version);
3831 break;
3832 }
3833 case ROLLBACK_EXTENTS: {
3834 vector<pair<uint64_t, uint64_t> > extents;
3835 version_t gen;
3836 ::decode(gen, bp);
3837 ::decode(extents, bp);
3838 visitor->rollback_extents(gen,extents);
3839 break;
3840 }
3841 default:
3842 assert(0 == "Invalid rollback code");
3843 }
3844 DECODE_FINISH(bp);
3845 }
3846 } catch (...) {
3847 assert(0 == "Invalid encoding");
3848 }
3849 }
3850
3851 struct DumpVisitor : public ObjectModDesc::Visitor {
3852 Formatter *f;
3853 explicit DumpVisitor(Formatter *f) : f(f) {}
3854 void append(uint64_t old_size) override {
3855 f->open_object_section("op");
3856 f->dump_string("code", "APPEND");
3857 f->dump_unsigned("old_size", old_size);
3858 f->close_section();
3859 }
3860 void setattrs(map<string, boost::optional<bufferlist> > &attrs) override {
3861 f->open_object_section("op");
3862 f->dump_string("code", "SETATTRS");
3863 f->open_array_section("attrs");
3864 for (map<string, boost::optional<bufferlist> >::iterator i = attrs.begin();
3865 i != attrs.end();
3866 ++i) {
3867 f->dump_string("attr_name", i->first);
3868 }
3869 f->close_section();
3870 f->close_section();
3871 }
3872 void rmobject(version_t old_version) override {
3873 f->open_object_section("op");
3874 f->dump_string("code", "RMOBJECT");
3875 f->dump_unsigned("old_version", old_version);
3876 f->close_section();
3877 }
3878 void try_rmobject(version_t old_version) override {
3879 f->open_object_section("op");
3880 f->dump_string("code", "TRY_RMOBJECT");
3881 f->dump_unsigned("old_version", old_version);
3882 f->close_section();
3883 }
3884 void create() override {
3885 f->open_object_section("op");
3886 f->dump_string("code", "CREATE");
3887 f->close_section();
3888 }
3889 void update_snaps(const set<snapid_t> &snaps) override {
3890 f->open_object_section("op");
3891 f->dump_string("code", "UPDATE_SNAPS");
3892 f->dump_stream("snaps") << snaps;
3893 f->close_section();
3894 }
3895 void rollback_extents(
3896 version_t gen,
3897 const vector<pair<uint64_t, uint64_t> > &extents) override {
3898 f->open_object_section("op");
3899 f->dump_string("code", "ROLLBACK_EXTENTS");
3900 f->dump_unsigned("gen", gen);
3901 f->dump_stream("snaps") << extents;
3902 f->close_section();
3903 }
3904 };
3905
3906 void ObjectModDesc::dump(Formatter *f) const
3907 {
3908 f->open_object_section("object_mod_desc");
3909 f->dump_bool("can_local_rollback", can_local_rollback);
3910 f->dump_bool("rollback_info_completed", rollback_info_completed);
3911 {
3912 f->open_array_section("ops");
3913 DumpVisitor vis(f);
3914 visit(&vis);
3915 f->close_section();
3916 }
3917 f->close_section();
3918 }
3919
3920 void ObjectModDesc::generate_test_instances(list<ObjectModDesc*>& o)
3921 {
3922 map<string, boost::optional<bufferlist> > attrs;
3923 attrs[OI_ATTR];
3924 attrs[SS_ATTR];
3925 attrs["asdf"];
3926 o.push_back(new ObjectModDesc());
3927 o.back()->append(100);
3928 o.back()->setattrs(attrs);
3929 o.push_back(new ObjectModDesc());
3930 o.back()->rmobject(1001);
3931 o.push_back(new ObjectModDesc());
3932 o.back()->create();
3933 o.back()->setattrs(attrs);
3934 o.push_back(new ObjectModDesc());
3935 o.back()->create();
3936 o.back()->setattrs(attrs);
3937 o.back()->mark_unrollbackable();
3938 o.back()->append(1000);
3939 }
3940
3941 void ObjectModDesc::encode(bufferlist &_bl) const
3942 {
3943 ENCODE_START(max_required_version, max_required_version, _bl);
3944 ::encode(can_local_rollback, _bl);
3945 ::encode(rollback_info_completed, _bl);
3946 ::encode(bl, _bl);
3947 ENCODE_FINISH(_bl);
3948 }
3949 void ObjectModDesc::decode(bufferlist::iterator &_bl)
3950 {
3951 DECODE_START(2, _bl);
3952 max_required_version = struct_v;
3953 ::decode(can_local_rollback, _bl);
3954 ::decode(rollback_info_completed, _bl);
3955 ::decode(bl, _bl);
3956 // ensure bl does not pin a larger buffer in memory
3957 bl.rebuild();
3958 bl.reassign_to_mempool(mempool::mempool_osd_pglog);
3959 DECODE_FINISH(_bl);
3960 }
3961
3962 // -- pg_log_entry_t --
3963
3964 string pg_log_entry_t::get_key_name() const
3965 {
3966 return version.get_key_name();
3967 }
3968
3969 void pg_log_entry_t::encode_with_checksum(bufferlist& bl) const
3970 {
3971 bufferlist ebl(sizeof(*this)*2);
3972 encode(ebl);
3973 __u32 crc = ebl.crc32c(0);
3974 ::encode(ebl, bl);
3975 ::encode(crc, bl);
3976 }
3977
3978 void pg_log_entry_t::decode_with_checksum(bufferlist::iterator& p)
3979 {
3980 bufferlist bl;
3981 ::decode(bl, p);
3982 __u32 crc;
3983 ::decode(crc, p);
3984 if (crc != bl.crc32c(0))
3985 throw buffer::malformed_input("bad checksum on pg_log_entry_t");
3986 bufferlist::iterator q = bl.begin();
3987 decode(q);
3988 }
3989
3990 void pg_log_entry_t::encode(bufferlist &bl) const
3991 {
3992 ENCODE_START(11, 4, bl);
3993 ::encode(op, bl);
3994 ::encode(soid, bl);
3995 ::encode(version, bl);
3996
3997 /**
3998 * Added with reverting_to:
3999 * Previous code used prior_version to encode
4000 * what we now call reverting_to. This will
4001 * allow older code to decode reverting_to
4002 * into prior_version as expected.
4003 */
4004 if (op == LOST_REVERT)
4005 ::encode(reverting_to, bl);
4006 else
4007 ::encode(prior_version, bl);
4008
4009 ::encode(reqid, bl);
4010 ::encode(mtime, bl);
4011 if (op == LOST_REVERT)
4012 ::encode(prior_version, bl);
4013 ::encode(snaps, bl);
4014 ::encode(user_version, bl);
4015 ::encode(mod_desc, bl);
4016 ::encode(extra_reqids, bl);
4017 if (op == ERROR)
4018 ::encode(return_code, bl);
4019 ENCODE_FINISH(bl);
4020 }
4021
4022 void pg_log_entry_t::decode(bufferlist::iterator &bl)
4023 {
4024 DECODE_START_LEGACY_COMPAT_LEN(11, 4, 4, bl);
4025 ::decode(op, bl);
4026 if (struct_v < 2) {
4027 sobject_t old_soid;
4028 ::decode(old_soid, bl);
4029 soid.oid = old_soid.oid;
4030 soid.snap = old_soid.snap;
4031 invalid_hash = true;
4032 } else {
4033 ::decode(soid, bl);
4034 }
4035 if (struct_v < 3)
4036 invalid_hash = true;
4037 ::decode(version, bl);
4038
4039 if (struct_v >= 6 && op == LOST_REVERT)
4040 ::decode(reverting_to, bl);
4041 else
4042 ::decode(prior_version, bl);
4043
4044 ::decode(reqid, bl);
4045
4046 ::decode(mtime, bl);
4047 if (struct_v < 5)
4048 invalid_pool = true;
4049
4050 if (op == LOST_REVERT) {
4051 if (struct_v >= 6) {
4052 ::decode(prior_version, bl);
4053 } else {
4054 reverting_to = prior_version;
4055 }
4056 }
4057 if (struct_v >= 7 || // for v >= 7, this is for all ops.
4058 op == CLONE) { // for v < 7, it's only present for CLONE.
4059 ::decode(snaps, bl);
4060 // ensure snaps does not pin a larger buffer in memory
4061 snaps.rebuild();
4062 snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
4063 }
4064
4065 if (struct_v >= 8)
4066 ::decode(user_version, bl);
4067 else
4068 user_version = version.version;
4069
4070 if (struct_v >= 9)
4071 ::decode(mod_desc, bl);
4072 else
4073 mod_desc.mark_unrollbackable();
4074 if (struct_v >= 10)
4075 ::decode(extra_reqids, bl);
4076 if (struct_v >= 11 && op == ERROR)
4077 ::decode(return_code, bl);
4078 DECODE_FINISH(bl);
4079 }
4080
4081 void pg_log_entry_t::dump(Formatter *f) const
4082 {
4083 f->dump_string("op", get_op_name());
4084 f->dump_stream("object") << soid;
4085 f->dump_stream("version") << version;
4086 f->dump_stream("prior_version") << prior_version;
4087 f->dump_stream("reqid") << reqid;
4088 f->open_array_section("extra_reqids");
4089 for (auto p = extra_reqids.begin();
4090 p != extra_reqids.end();
4091 ++p) {
4092 f->open_object_section("extra_reqid");
4093 f->dump_stream("reqid") << p->first;
4094 f->dump_stream("user_version") << p->second;
4095 f->close_section();
4096 }
4097 f->close_section();
4098 f->dump_stream("mtime") << mtime;
4099 f->dump_int("return_code", return_code);
4100 if (snaps.length() > 0) {
4101 vector<snapid_t> v;
4102 bufferlist c = snaps;
4103 bufferlist::iterator p = c.begin();
4104 try {
4105 ::decode(v, p);
4106 } catch (...) {
4107 v.clear();
4108 }
4109 f->open_object_section("snaps");
4110 for (vector<snapid_t>::iterator p = v.begin(); p != v.end(); ++p)
4111 f->dump_unsigned("snap", *p);
4112 f->close_section();
4113 }
4114 {
4115 f->open_object_section("mod_desc");
4116 mod_desc.dump(f);
4117 f->close_section();
4118 }
4119 }
4120
4121 void pg_log_entry_t::generate_test_instances(list<pg_log_entry_t*>& o)
4122 {
4123 o.push_back(new pg_log_entry_t());
4124 hobject_t oid(object_t("objname"), "key", 123, 456, 0, "");
4125 o.push_back(new pg_log_entry_t(MODIFY, oid, eversion_t(1,2), eversion_t(3,4),
4126 1, osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
4127 utime_t(8,9), 0));
4128 o.push_back(new pg_log_entry_t(ERROR, oid, eversion_t(1,2), eversion_t(3,4),
4129 1, osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
4130 utime_t(8,9), -ENOENT));
4131 }
4132
4133 ostream& operator<<(ostream& out, const pg_log_entry_t& e)
4134 {
4135 out << e.version << " (" << e.prior_version << ") "
4136 << std::left << std::setw(8) << e.get_op_name() << ' '
4137 << e.soid << " by " << e.reqid << " " << e.mtime
4138 << " " << e.return_code;
4139 if (e.snaps.length()) {
4140 vector<snapid_t> snaps;
4141 bufferlist c = e.snaps;
4142 bufferlist::iterator p = c.begin();
4143 try {
4144 ::decode(snaps, p);
4145 } catch (...) {
4146 snaps.clear();
4147 }
4148 out << " snaps " << snaps;
4149 }
4150 return out;
4151 }
4152
4153 // -- pg_log_dup_t --
4154
4155 string pg_log_dup_t::get_key_name() const
4156 {
4157 return "dup_" + version.get_key_name();
4158 }
4159
4160 void pg_log_dup_t::encode(bufferlist &bl) const
4161 {
4162 ENCODE_START(1, 1, bl);
4163 ::encode(reqid, bl);
4164 ::encode(version, bl);
4165 ::encode(user_version, bl);
4166 ::encode(return_code, bl);
4167 ENCODE_FINISH(bl);
4168 }
4169
4170 void pg_log_dup_t::decode(bufferlist::iterator &bl)
4171 {
4172 DECODE_START(1, bl);
4173 ::decode(reqid, bl);
4174 ::decode(version, bl);
4175 ::decode(user_version, bl);
4176 ::decode(return_code, bl);
4177 DECODE_FINISH(bl);
4178 }
4179
4180 void pg_log_dup_t::dump(Formatter *f) const
4181 {
4182 f->dump_stream("reqid") << reqid;
4183 f->dump_stream("version") << version;
4184 f->dump_stream("user_version") << user_version;
4185 f->dump_stream("return_code") << return_code;
4186 }
4187
4188 void pg_log_dup_t::generate_test_instances(list<pg_log_dup_t*>& o)
4189 {
4190 o.push_back(new pg_log_dup_t());
4191 o.push_back(new pg_log_dup_t(eversion_t(1,2),
4192 1,
4193 osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
4194 0));
4195 o.push_back(new pg_log_dup_t(eversion_t(1,2),
4196 2,
4197 osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
4198 -ENOENT));
4199 }
4200
4201
4202 std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e) {
4203 return out << "log_dup(reqid=" << e.reqid <<
4204 " v=" << e.version << " uv=" << e.user_version <<
4205 " rc=" << e.return_code << ")";
4206 }
4207
4208
4209 // -- pg_log_t --
4210
4211 // out: pg_log_t that only has entries that apply to import_pgid using curmap
4212 // reject: Entries rejected from "in" are in the reject.log. Other fields not set.
4213 void pg_log_t::filter_log(spg_t import_pgid, const OSDMap &curmap,
4214 const string &hit_set_namespace, const pg_log_t &in,
4215 pg_log_t &out, pg_log_t &reject)
4216 {
4217 out = in;
4218 out.log.clear();
4219 reject.log.clear();
4220
4221 for (list<pg_log_entry_t>::const_iterator i = in.log.begin();
4222 i != in.log.end(); ++i) {
4223
4224 // Reject pg log entries for temporary objects
4225 if (i->soid.is_temp()) {
4226 reject.log.push_back(*i);
4227 continue;
4228 }
4229
4230 if (i->soid.nspace != hit_set_namespace) {
4231 object_t oid = i->soid.oid;
4232 object_locator_t loc(i->soid);
4233 pg_t raw_pgid = curmap.object_locator_to_pg(oid, loc);
4234 pg_t pgid = curmap.raw_pg_to_pg(raw_pgid);
4235
4236 if (import_pgid.pgid == pgid) {
4237 out.log.push_back(*i);
4238 } else {
4239 reject.log.push_back(*i);
4240 }
4241 } else {
4242 out.log.push_back(*i);
4243 }
4244 }
4245 }
4246
4247 void pg_log_t::encode(bufferlist& bl) const
4248 {
4249 ENCODE_START(7, 3, bl);
4250 ::encode(head, bl);
4251 ::encode(tail, bl);
4252 ::encode(log, bl);
4253 ::encode(can_rollback_to, bl);
4254 ::encode(rollback_info_trimmed_to, bl);
4255 ::encode(dups, bl);
4256 ENCODE_FINISH(bl);
4257 }
4258
4259 void pg_log_t::decode(bufferlist::iterator &bl, int64_t pool)
4260 {
4261 DECODE_START_LEGACY_COMPAT_LEN(7, 3, 3, bl);
4262 ::decode(head, bl);
4263 ::decode(tail, bl);
4264 if (struct_v < 2) {
4265 bool backlog;
4266 ::decode(backlog, bl);
4267 }
4268 ::decode(log, bl);
4269 if (struct_v >= 5)
4270 ::decode(can_rollback_to, bl);
4271
4272 if (struct_v >= 6)
4273 ::decode(rollback_info_trimmed_to, bl);
4274 else
4275 rollback_info_trimmed_to = tail;
4276
4277 if (struct_v >= 7)
4278 ::decode(dups, bl);
4279
4280 DECODE_FINISH(bl);
4281
4282 // handle hobject_t format change
4283 if (struct_v < 4) {
4284 for (list<pg_log_entry_t>::iterator i = log.begin();
4285 i != log.end();
4286 ++i) {
4287 if (!i->soid.is_max() && i->soid.pool == -1)
4288 i->soid.pool = pool;
4289 }
4290 }
4291 }
4292
4293 void pg_log_t::dump(Formatter *f) const
4294 {
4295 f->dump_stream("head") << head;
4296 f->dump_stream("tail") << tail;
4297 f->open_array_section("log");
4298 for (list<pg_log_entry_t>::const_iterator p = log.begin(); p != log.end(); ++p) {
4299 f->open_object_section("entry");
4300 p->dump(f);
4301 f->close_section();
4302 }
4303 f->close_section();
4304 f->open_array_section("dups");
4305 for (const auto& entry : dups) {
4306 f->open_object_section("entry");
4307 entry.dump(f);
4308 f->close_section();
4309 }
4310 f->close_section();
4311 }
4312
4313 void pg_log_t::generate_test_instances(list<pg_log_t*>& o)
4314 {
4315 o.push_back(new pg_log_t);
4316
4317 // this is nonsensical:
4318 o.push_back(new pg_log_t);
4319 o.back()->head = eversion_t(1,2);
4320 o.back()->tail = eversion_t(3,4);
4321 list<pg_log_entry_t*> e;
4322 pg_log_entry_t::generate_test_instances(e);
4323 for (list<pg_log_entry_t*>::iterator p = e.begin(); p != e.end(); ++p)
4324 o.back()->log.push_back(**p);
4325 }
4326
4327 void pg_log_t::copy_after(const pg_log_t &other, eversion_t v)
4328 {
4329 can_rollback_to = other.can_rollback_to;
4330 head = other.head;
4331 tail = other.tail;
4332 for (list<pg_log_entry_t>::const_reverse_iterator i = other.log.rbegin();
4333 i != other.log.rend();
4334 ++i) {
4335 assert(i->version > other.tail);
4336 if (i->version <= v) {
4337 // make tail accurate.
4338 tail = i->version;
4339 break;
4340 }
4341 log.push_front(*i);
4342 }
4343 }
4344
4345 void pg_log_t::copy_range(const pg_log_t &other, eversion_t from, eversion_t to)
4346 {
4347 can_rollback_to = other.can_rollback_to;
4348 list<pg_log_entry_t>::const_reverse_iterator i = other.log.rbegin();
4349 assert(i != other.log.rend());
4350 while (i->version > to) {
4351 ++i;
4352 assert(i != other.log.rend());
4353 }
4354 assert(i->version == to);
4355 head = to;
4356 for ( ; i != other.log.rend(); ++i) {
4357 if (i->version <= from) {
4358 tail = i->version;
4359 break;
4360 }
4361 log.push_front(*i);
4362 }
4363 }
4364
4365 void pg_log_t::copy_up_to(const pg_log_t &other, int max)
4366 {
4367 can_rollback_to = other.can_rollback_to;
4368 int n = 0;
4369 head = other.head;
4370 tail = other.tail;
4371 for (list<pg_log_entry_t>::const_reverse_iterator i = other.log.rbegin();
4372 i != other.log.rend();
4373 ++i) {
4374 if (n++ >= max) {
4375 tail = i->version;
4376 break;
4377 }
4378 log.push_front(*i);
4379 }
4380 }
4381
4382 ostream& pg_log_t::print(ostream& out) const
4383 {
4384 out << *this << std::endl;
4385 for (list<pg_log_entry_t>::const_iterator p = log.begin();
4386 p != log.end();
4387 ++p)
4388 out << *p << std::endl;
4389 for (const auto& entry : dups) {
4390 out << " dup entry: " << entry << std::endl;
4391 }
4392 return out;
4393 }
4394
4395 // -- pg_missing_t --
4396
4397 ostream& operator<<(ostream& out, const pg_missing_item& i)
4398 {
4399 out << i.need;
4400 if (i.have != eversion_t())
4401 out << "(" << i.have << ")";
4402 out << " flags = " << i.flag_str();
4403 return out;
4404 }
4405
4406 // -- object_copy_cursor_t --
4407
4408 void object_copy_cursor_t::encode(bufferlist& bl) const
4409 {
4410 ENCODE_START(1, 1, bl);
4411 ::encode(attr_complete, bl);
4412 ::encode(data_offset, bl);
4413 ::encode(data_complete, bl);
4414 ::encode(omap_offset, bl);
4415 ::encode(omap_complete, bl);
4416 ENCODE_FINISH(bl);
4417 }
4418
4419 void object_copy_cursor_t::decode(bufferlist::iterator &bl)
4420 {
4421 DECODE_START(1, bl);
4422 ::decode(attr_complete, bl);
4423 ::decode(data_offset, bl);
4424 ::decode(data_complete, bl);
4425 ::decode(omap_offset, bl);
4426 ::decode(omap_complete, bl);
4427 DECODE_FINISH(bl);
4428 }
4429
4430 void object_copy_cursor_t::dump(Formatter *f) const
4431 {
4432 f->dump_unsigned("attr_complete", (int)attr_complete);
4433 f->dump_unsigned("data_offset", data_offset);
4434 f->dump_unsigned("data_complete", (int)data_complete);
4435 f->dump_string("omap_offset", omap_offset);
4436 f->dump_unsigned("omap_complete", (int)omap_complete);
4437 }
4438
4439 void object_copy_cursor_t::generate_test_instances(list<object_copy_cursor_t*>& o)
4440 {
4441 o.push_back(new object_copy_cursor_t);
4442 o.push_back(new object_copy_cursor_t);
4443 o.back()->attr_complete = true;
4444 o.back()->data_offset = 123;
4445 o.push_back(new object_copy_cursor_t);
4446 o.back()->attr_complete = true;
4447 o.back()->data_complete = true;
4448 o.back()->omap_offset = "foo";
4449 o.push_back(new object_copy_cursor_t);
4450 o.back()->attr_complete = true;
4451 o.back()->data_complete = true;
4452 o.back()->omap_complete = true;
4453 }
4454
4455 // -- object_copy_data_t --
4456
4457 void object_copy_data_t::encode(bufferlist& bl, uint64_t features) const
4458 {
4459 ENCODE_START(7, 5, bl);
4460 ::encode(size, bl);
4461 ::encode(mtime, bl);
4462 ::encode(attrs, bl);
4463 ::encode(data, bl);
4464 ::encode(omap_data, bl);
4465 ::encode(cursor, bl);
4466 ::encode(omap_header, bl);
4467 ::encode(snaps, bl);
4468 ::encode(snap_seq, bl);
4469 ::encode(flags, bl);
4470 ::encode(data_digest, bl);
4471 ::encode(omap_digest, bl);
4472 ::encode(reqids, bl);
4473 ::encode(truncate_seq, bl);
4474 ::encode(truncate_size, bl);
4475 ENCODE_FINISH(bl);
4476 }
4477
4478 void object_copy_data_t::decode(bufferlist::iterator& bl)
4479 {
4480 DECODE_START(7, bl);
4481 if (struct_v < 5) {
4482 // old
4483 ::decode(size, bl);
4484 ::decode(mtime, bl);
4485 {
4486 string category;
4487 ::decode(category, bl); // no longer used
4488 }
4489 ::decode(attrs, bl);
4490 ::decode(data, bl);
4491 {
4492 map<string,bufferlist> omap;
4493 ::decode(omap, bl);
4494 omap_data.clear();
4495 if (!omap.empty())
4496 ::encode(omap, omap_data);
4497 }
4498 ::decode(cursor, bl);
4499 if (struct_v >= 2)
4500 ::decode(omap_header, bl);
4501 if (struct_v >= 3) {
4502 ::decode(snaps, bl);
4503 ::decode(snap_seq, bl);
4504 } else {
4505 snaps.clear();
4506 snap_seq = 0;
4507 }
4508 if (struct_v >= 4) {
4509 ::decode(flags, bl);
4510 ::decode(data_digest, bl);
4511 ::decode(omap_digest, bl);
4512 }
4513 } else {
4514 // current
4515 ::decode(size, bl);
4516 ::decode(mtime, bl);
4517 ::decode(attrs, bl);
4518 ::decode(data, bl);
4519 ::decode(omap_data, bl);
4520 ::decode(cursor, bl);
4521 ::decode(omap_header, bl);
4522 ::decode(snaps, bl);
4523 ::decode(snap_seq, bl);
4524 if (struct_v >= 4) {
4525 ::decode(flags, bl);
4526 ::decode(data_digest, bl);
4527 ::decode(omap_digest, bl);
4528 }
4529 if (struct_v >= 6) {
4530 ::decode(reqids, bl);
4531 }
4532 if (struct_v >= 7) {
4533 ::decode(truncate_seq, bl);
4534 ::decode(truncate_size, bl);
4535 }
4536 }
4537 DECODE_FINISH(bl);
4538 }
4539
4540 void object_copy_data_t::generate_test_instances(list<object_copy_data_t*>& o)
4541 {
4542 o.push_back(new object_copy_data_t());
4543
4544 list<object_copy_cursor_t*> cursors;
4545 object_copy_cursor_t::generate_test_instances(cursors);
4546 list<object_copy_cursor_t*>::iterator ci = cursors.begin();
4547 o.back()->cursor = **(ci++);
4548
4549 o.push_back(new object_copy_data_t());
4550 o.back()->cursor = **(ci++);
4551
4552 o.push_back(new object_copy_data_t());
4553 o.back()->size = 1234;
4554 o.back()->mtime.set_from_double(1234);
4555 bufferptr bp("there", 5);
4556 bufferlist bl;
4557 bl.push_back(bp);
4558 o.back()->attrs["hello"] = bl;
4559 bufferptr bp2("not", 3);
4560 bufferlist bl2;
4561 bl2.push_back(bp2);
4562 map<string,bufferlist> omap;
4563 omap["why"] = bl2;
4564 ::encode(omap, o.back()->omap_data);
4565 bufferptr databp("iamsomedatatocontain", 20);
4566 o.back()->data.push_back(databp);
4567 o.back()->omap_header.append("this is an omap header");
4568 o.back()->snaps.push_back(123);
4569 o.back()->reqids.push_back(make_pair(osd_reqid_t(), version_t()));
4570 }
4571
4572 void object_copy_data_t::dump(Formatter *f) const
4573 {
4574 f->open_object_section("cursor");
4575 cursor.dump(f);
4576 f->close_section(); // cursor
4577 f->dump_int("size", size);
4578 f->dump_stream("mtime") << mtime;
4579 /* we should really print out the attrs here, but bufferlist
4580 const-correctness prevents that */
4581 f->dump_int("attrs_size", attrs.size());
4582 f->dump_int("flags", flags);
4583 f->dump_unsigned("data_digest", data_digest);
4584 f->dump_unsigned("omap_digest", omap_digest);
4585 f->dump_int("omap_data_length", omap_data.length());
4586 f->dump_int("omap_header_length", omap_header.length());
4587 f->dump_int("data_length", data.length());
4588 f->open_array_section("snaps");
4589 for (vector<snapid_t>::const_iterator p = snaps.begin();
4590 p != snaps.end(); ++p)
4591 f->dump_unsigned("snap", *p);
4592 f->close_section();
4593 f->open_array_section("reqids");
4594 for (auto p = reqids.begin();
4595 p != reqids.end();
4596 ++p) {
4597 f->open_object_section("extra_reqid");
4598 f->dump_stream("reqid") << p->first;
4599 f->dump_stream("user_version") << p->second;
4600 f->close_section();
4601 }
4602 f->close_section();
4603 }
4604
4605 // -- pg_create_t --
4606
4607 void pg_create_t::encode(bufferlist &bl) const
4608 {
4609 ENCODE_START(1, 1, bl);
4610 ::encode(created, bl);
4611 ::encode(parent, bl);
4612 ::encode(split_bits, bl);
4613 ENCODE_FINISH(bl);
4614 }
4615
4616 void pg_create_t::decode(bufferlist::iterator &bl)
4617 {
4618 DECODE_START(1, bl);
4619 ::decode(created, bl);
4620 ::decode(parent, bl);
4621 ::decode(split_bits, bl);
4622 DECODE_FINISH(bl);
4623 }
4624
4625 void pg_create_t::dump(Formatter *f) const
4626 {
4627 f->dump_unsigned("created", created);
4628 f->dump_stream("parent") << parent;
4629 f->dump_int("split_bits", split_bits);
4630 }
4631
4632 void pg_create_t::generate_test_instances(list<pg_create_t*>& o)
4633 {
4634 o.push_back(new pg_create_t);
4635 o.push_back(new pg_create_t(1, pg_t(3, 4, -1), 2));
4636 }
4637
4638
4639 // -- pg_hit_set_info_t --
4640
4641 void pg_hit_set_info_t::encode(bufferlist& bl) const
4642 {
4643 ENCODE_START(2, 1, bl);
4644 ::encode(begin, bl);
4645 ::encode(end, bl);
4646 ::encode(version, bl);
4647 ::encode(using_gmt, bl);
4648 ENCODE_FINISH(bl);
4649 }
4650
4651 void pg_hit_set_info_t::decode(bufferlist::iterator& p)
4652 {
4653 DECODE_START(2, p);
4654 ::decode(begin, p);
4655 ::decode(end, p);
4656 ::decode(version, p);
4657 if (struct_v >= 2) {
4658 ::decode(using_gmt, p);
4659 } else {
4660 using_gmt = false;
4661 }
4662 DECODE_FINISH(p);
4663 }
4664
4665 void pg_hit_set_info_t::dump(Formatter *f) const
4666 {
4667 f->dump_stream("begin") << begin;
4668 f->dump_stream("end") << end;
4669 f->dump_stream("version") << version;
4670 f->dump_stream("using_gmt") << using_gmt;
4671 }
4672
4673 void pg_hit_set_info_t::generate_test_instances(list<pg_hit_set_info_t*>& ls)
4674 {
4675 ls.push_back(new pg_hit_set_info_t);
4676 ls.push_back(new pg_hit_set_info_t);
4677 ls.back()->begin = utime_t(1, 2);
4678 ls.back()->end = utime_t(3, 4);
4679 }
4680
4681
4682 // -- pg_hit_set_history_t --
4683
4684 void pg_hit_set_history_t::encode(bufferlist& bl) const
4685 {
4686 ENCODE_START(1, 1, bl);
4687 ::encode(current_last_update, bl);
4688 {
4689 utime_t dummy_stamp;
4690 ::encode(dummy_stamp, bl);
4691 }
4692 {
4693 pg_hit_set_info_t dummy_info;
4694 ::encode(dummy_info, bl);
4695 }
4696 ::encode(history, bl);
4697 ENCODE_FINISH(bl);
4698 }
4699
4700 void pg_hit_set_history_t::decode(bufferlist::iterator& p)
4701 {
4702 DECODE_START(1, p);
4703 ::decode(current_last_update, p);
4704 {
4705 utime_t dummy_stamp;
4706 ::decode(dummy_stamp, p);
4707 }
4708 {
4709 pg_hit_set_info_t dummy_info;
4710 ::decode(dummy_info, p);
4711 }
4712 ::decode(history, p);
4713 DECODE_FINISH(p);
4714 }
4715
4716 void pg_hit_set_history_t::dump(Formatter *f) const
4717 {
4718 f->dump_stream("current_last_update") << current_last_update;
4719 f->open_array_section("history");
4720 for (list<pg_hit_set_info_t>::const_iterator p = history.begin();
4721 p != history.end(); ++p) {
4722 f->open_object_section("info");
4723 p->dump(f);
4724 f->close_section();
4725 }
4726 f->close_section();
4727 }
4728
4729 void pg_hit_set_history_t::generate_test_instances(list<pg_hit_set_history_t*>& ls)
4730 {
4731 ls.push_back(new pg_hit_set_history_t);
4732 ls.push_back(new pg_hit_set_history_t);
4733 ls.back()->current_last_update = eversion_t(1, 2);
4734 ls.back()->history.push_back(pg_hit_set_info_t());
4735 }
4736
4737 // -- osd_peer_stat_t --
4738
4739 void osd_peer_stat_t::encode(bufferlist& bl) const
4740 {
4741 ENCODE_START(1, 1, bl);
4742 ::encode(stamp, bl);
4743 ENCODE_FINISH(bl);
4744 }
4745
4746 void osd_peer_stat_t::decode(bufferlist::iterator& bl)
4747 {
4748 DECODE_START(1, bl);
4749 ::decode(stamp, bl);
4750 DECODE_FINISH(bl);
4751 }
4752
4753 void osd_peer_stat_t::dump(Formatter *f) const
4754 {
4755 f->dump_stream("stamp") << stamp;
4756 }
4757
4758 void osd_peer_stat_t::generate_test_instances(list<osd_peer_stat_t*>& o)
4759 {
4760 o.push_back(new osd_peer_stat_t);
4761 o.push_back(new osd_peer_stat_t);
4762 o.back()->stamp = utime_t(1, 2);
4763 }
4764
4765 ostream& operator<<(ostream& out, const osd_peer_stat_t &stat)
4766 {
4767 return out << "stat(" << stat.stamp << ")";
4768 }
4769
4770
4771 // -- OSDSuperblock --
4772
4773 void OSDSuperblock::encode(bufferlist &bl) const
4774 {
4775 ENCODE_START(8, 5, bl);
4776 ::encode(cluster_fsid, bl);
4777 ::encode(whoami, bl);
4778 ::encode(current_epoch, bl);
4779 ::encode(oldest_map, bl);
4780 ::encode(newest_map, bl);
4781 ::encode(weight, bl);
4782 compat_features.encode(bl);
4783 ::encode(clean_thru, bl);
4784 ::encode(mounted, bl);
4785 ::encode(osd_fsid, bl);
4786 ::encode((epoch_t)0, bl); // epoch_t last_epoch_marked_full
4787 ::encode((uint32_t)0, bl); // map<int64_t,epoch_t> pool_last_epoch_marked_full
4788 ENCODE_FINISH(bl);
4789 }
4790
4791 void OSDSuperblock::decode(bufferlist::iterator &bl)
4792 {
4793 DECODE_START_LEGACY_COMPAT_LEN(8, 5, 5, bl);
4794 if (struct_v < 3) {
4795 string magic;
4796 ::decode(magic, bl);
4797 }
4798 ::decode(cluster_fsid, bl);
4799 ::decode(whoami, bl);
4800 ::decode(current_epoch, bl);
4801 ::decode(oldest_map, bl);
4802 ::decode(newest_map, bl);
4803 ::decode(weight, bl);
4804 if (struct_v >= 2) {
4805 compat_features.decode(bl);
4806 } else { //upgrade it!
4807 compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
4808 }
4809 ::decode(clean_thru, bl);
4810 ::decode(mounted, bl);
4811 if (struct_v >= 4)
4812 ::decode(osd_fsid, bl);
4813 if (struct_v >= 6) {
4814 epoch_t last_map_marked_full;
4815 ::decode(last_map_marked_full, bl);
4816 }
4817 if (struct_v >= 7) {
4818 map<int64_t,epoch_t> pool_last_map_marked_full;
4819 ::decode(pool_last_map_marked_full, bl);
4820 }
4821 DECODE_FINISH(bl);
4822 }
4823
4824 void OSDSuperblock::dump(Formatter *f) const
4825 {
4826 f->dump_stream("cluster_fsid") << cluster_fsid;
4827 f->dump_stream("osd_fsid") << osd_fsid;
4828 f->dump_int("whoami", whoami);
4829 f->dump_int("current_epoch", current_epoch);
4830 f->dump_int("oldest_map", oldest_map);
4831 f->dump_int("newest_map", newest_map);
4832 f->dump_float("weight", weight);
4833 f->open_object_section("compat");
4834 compat_features.dump(f);
4835 f->close_section();
4836 f->dump_int("clean_thru", clean_thru);
4837 f->dump_int("last_epoch_mounted", mounted);
4838 }
4839
4840 void OSDSuperblock::generate_test_instances(list<OSDSuperblock*>& o)
4841 {
4842 OSDSuperblock z;
4843 o.push_back(new OSDSuperblock(z));
4844 memset(&z.cluster_fsid, 1, sizeof(z.cluster_fsid));
4845 memset(&z.osd_fsid, 2, sizeof(z.osd_fsid));
4846 z.whoami = 3;
4847 z.current_epoch = 4;
4848 z.oldest_map = 5;
4849 z.newest_map = 9;
4850 z.mounted = 8;
4851 z.clean_thru = 7;
4852 o.push_back(new OSDSuperblock(z));
4853 o.push_back(new OSDSuperblock(z));
4854 }
4855
4856 // -- SnapSet --
4857
4858 void SnapSet::encode(bufferlist& bl) const
4859 {
4860 ENCODE_START(3, 2, bl);
4861 ::encode(seq, bl);
4862 ::encode(head_exists, bl);
4863 ::encode(snaps, bl);
4864 ::encode(clones, bl);
4865 ::encode(clone_overlap, bl);
4866 ::encode(clone_size, bl);
4867 ::encode(clone_snaps, bl);
4868 ENCODE_FINISH(bl);
4869 }
4870
4871 void SnapSet::decode(bufferlist::iterator& bl)
4872 {
4873 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
4874 ::decode(seq, bl);
4875 ::decode(head_exists, bl);
4876 ::decode(snaps, bl);
4877 ::decode(clones, bl);
4878 ::decode(clone_overlap, bl);
4879 ::decode(clone_size, bl);
4880 if (struct_v >= 3) {
4881 ::decode(clone_snaps, bl);
4882 } else {
4883 clone_snaps.clear();
4884 }
4885 DECODE_FINISH(bl);
4886 }
4887
4888 void SnapSet::dump(Formatter *f) const
4889 {
4890 SnapContext sc(seq, snaps);
4891 f->open_object_section("snap_context");
4892 sc.dump(f);
4893 f->close_section();
4894 f->dump_int("head_exists", head_exists);
4895 f->open_array_section("clones");
4896 for (vector<snapid_t>::const_iterator p = clones.begin(); p != clones.end(); ++p) {
4897 f->open_object_section("clone");
4898 f->dump_unsigned("snap", *p);
4899 auto cs = clone_size.find(*p);
4900 if (cs != clone_size.end())
4901 f->dump_unsigned("size", cs->second);
4902 else
4903 f->dump_string("size", "????");
4904 auto co = clone_overlap.find(*p);
4905 if (co != clone_overlap.end())
4906 f->dump_stream("overlap") << co->second;
4907 else
4908 f->dump_stream("overlap") << "????";
4909 auto q = clone_snaps.find(*p);
4910 if (q != clone_snaps.end()) {
4911 f->open_array_section("snaps");
4912 for (auto s : q->second) {
4913 f->dump_unsigned("snap", s);
4914 }
4915 f->close_section();
4916 }
4917 f->close_section();
4918 }
4919 f->close_section();
4920 }
4921
4922 void SnapSet::generate_test_instances(list<SnapSet*>& o)
4923 {
4924 o.push_back(new SnapSet);
4925 o.push_back(new SnapSet);
4926 o.back()->head_exists = true;
4927 o.back()->seq = 123;
4928 o.back()->snaps.push_back(123);
4929 o.back()->snaps.push_back(12);
4930 o.push_back(new SnapSet);
4931 o.back()->head_exists = true;
4932 o.back()->seq = 123;
4933 o.back()->snaps.push_back(123);
4934 o.back()->snaps.push_back(12);
4935 o.back()->clones.push_back(12);
4936 o.back()->clone_size[12] = 12345;
4937 o.back()->clone_overlap[12];
4938 o.back()->clone_snaps[12] = {12, 10, 8};
4939 }
4940
4941 ostream& operator<<(ostream& out, const SnapSet& cs)
4942 {
4943 if (cs.is_legacy()) {
4944 out << cs.seq << "=" << cs.snaps << ":"
4945 << cs.clones
4946 << (cs.head_exists ? "+head":"");
4947 if (!cs.clone_snaps.empty()) {
4948 out << "+stray_clone_snaps=" << cs.clone_snaps;
4949 }
4950 return out;
4951 } else {
4952 return out << cs.seq << "=" << cs.snaps << ":"
4953 << cs.clone_snaps;
4954 }
4955 }
4956
4957 void SnapSet::from_snap_set(const librados::snap_set_t& ss, bool legacy)
4958 {
4959 // NOTE: our reconstruction of snaps (and the snapc) is not strictly
4960 // correct: it will not include snaps that still logically exist
4961 // but for which there was no clone that is defined. For all
4962 // practical purposes this doesn't matter, since we only use that
4963 // information to clone on the OSD, and we have already moved
4964 // forward past that part of the object history.
4965
4966 seq = ss.seq;
4967 set<snapid_t> _snaps;
4968 set<snapid_t> _clones;
4969 head_exists = false;
4970 for (vector<librados::clone_info_t>::const_iterator p = ss.clones.begin();
4971 p != ss.clones.end();
4972 ++p) {
4973 if (p->cloneid == librados::SNAP_HEAD) {
4974 head_exists = true;
4975 } else {
4976 _clones.insert(p->cloneid);
4977 _snaps.insert(p->snaps.begin(), p->snaps.end());
4978 clone_size[p->cloneid] = p->size;
4979 clone_overlap[p->cloneid]; // the entry must exist, even if it's empty.
4980 for (vector<pair<uint64_t, uint64_t> >::const_iterator q =
4981 p->overlap.begin(); q != p->overlap.end(); ++q)
4982 clone_overlap[p->cloneid].insert(q->first, q->second);
4983 if (!legacy) {
4984 // p->snaps is ascending; clone_snaps is descending
4985 vector<snapid_t>& v = clone_snaps[p->cloneid];
4986 for (auto q = p->snaps.rbegin(); q != p->snaps.rend(); ++q) {
4987 v.push_back(*q);
4988 }
4989 }
4990 }
4991 }
4992
4993 // ascending
4994 clones.clear();
4995 clones.reserve(_clones.size());
4996 for (set<snapid_t>::iterator p = _clones.begin(); p != _clones.end(); ++p)
4997 clones.push_back(*p);
4998
4999 // descending
5000 snaps.clear();
5001 snaps.reserve(_snaps.size());
5002 for (set<snapid_t>::reverse_iterator p = _snaps.rbegin();
5003 p != _snaps.rend(); ++p)
5004 snaps.push_back(*p);
5005 }
5006
5007 uint64_t SnapSet::get_clone_bytes(snapid_t clone) const
5008 {
5009 assert(clone_size.count(clone));
5010 uint64_t size = clone_size.find(clone)->second;
5011 assert(clone_overlap.count(clone));
5012 const interval_set<uint64_t> &overlap = clone_overlap.find(clone)->second;
5013 for (interval_set<uint64_t>::const_iterator i = overlap.begin();
5014 i != overlap.end();
5015 ++i) {
5016 assert(size >= i.get_len());
5017 size -= i.get_len();
5018 }
5019 return size;
5020 }
5021
5022 void SnapSet::filter(const pg_pool_t &pinfo)
5023 {
5024 vector<snapid_t> oldsnaps;
5025 oldsnaps.swap(snaps);
5026 for (vector<snapid_t>::const_iterator i = oldsnaps.begin();
5027 i != oldsnaps.end();
5028 ++i) {
5029 if (!pinfo.is_removed_snap(*i))
5030 snaps.push_back(*i);
5031 }
5032 }
5033
5034 SnapSet SnapSet::get_filtered(const pg_pool_t &pinfo) const
5035 {
5036 SnapSet ss = *this;
5037 ss.filter(pinfo);
5038 return ss;
5039 }
5040
5041 // -- watch_info_t --
5042
5043 void watch_info_t::encode(bufferlist& bl, uint64_t features) const
5044 {
5045 ENCODE_START(4, 3, bl);
5046 ::encode(cookie, bl);
5047 ::encode(timeout_seconds, bl);
5048 ::encode(addr, bl, features);
5049 ENCODE_FINISH(bl);
5050 }
5051
5052 void watch_info_t::decode(bufferlist::iterator& bl)
5053 {
5054 DECODE_START_LEGACY_COMPAT_LEN(4, 3, 3, bl);
5055 ::decode(cookie, bl);
5056 if (struct_v < 2) {
5057 uint64_t ver;
5058 ::decode(ver, bl);
5059 }
5060 ::decode(timeout_seconds, bl);
5061 if (struct_v >= 4) {
5062 ::decode(addr, bl);
5063 }
5064 DECODE_FINISH(bl);
5065 }
5066
5067 void watch_info_t::dump(Formatter *f) const
5068 {
5069 f->dump_unsigned("cookie", cookie);
5070 f->dump_unsigned("timeout_seconds", timeout_seconds);
5071 f->open_object_section("addr");
5072 addr.dump(f);
5073 f->close_section();
5074 }
5075
5076 void watch_info_t::generate_test_instances(list<watch_info_t*>& o)
5077 {
5078 o.push_back(new watch_info_t);
5079 o.push_back(new watch_info_t);
5080 o.back()->cookie = 123;
5081 o.back()->timeout_seconds = 99;
5082 entity_addr_t ea;
5083 ea.set_type(entity_addr_t::TYPE_LEGACY);
5084 ea.set_nonce(1);
5085 ea.set_family(AF_INET);
5086 ea.set_in4_quad(0, 127);
5087 ea.set_in4_quad(1, 0);
5088 ea.set_in4_quad(2, 1);
5089 ea.set_in4_quad(3, 2);
5090 ea.set_port(2);
5091 o.back()->addr = ea;
5092 }
5093
5094 // -- object_manifest_t --
5095
5096 void object_manifest_t::encode(bufferlist& bl) const
5097 {
5098 ENCODE_START(1, 1, bl);
5099 ::encode(type, bl);
5100 switch (type) {
5101 case TYPE_NONE: break;
5102 case TYPE_REDIRECT:
5103 ::encode(redirect_target, bl);
5104 break;
5105 default:
5106 ceph_abort();
5107 }
5108 ENCODE_FINISH(bl);
5109 }
5110
5111 void object_manifest_t::decode(bufferlist::iterator& bl)
5112 {
5113 DECODE_START(1, bl);
5114 ::decode(type, bl);
5115 switch (type) {
5116 case TYPE_NONE: break;
5117 case TYPE_REDIRECT:
5118 ::decode(redirect_target, bl);
5119 break;
5120 default:
5121 ceph_abort();
5122 }
5123 DECODE_FINISH(bl);
5124 }
5125
5126 void object_manifest_t::dump(Formatter *f) const
5127 {
5128 f->dump_unsigned("type", type);
5129 f->open_object_section("redirect_target");
5130 redirect_target.dump(f);
5131 f->close_section();
5132 }
5133
5134 void object_manifest_t::generate_test_instances(list<object_manifest_t*>& o)
5135 {
5136 o.push_back(new object_manifest_t());
5137 o.back()->type = TYPE_REDIRECT;
5138 }
5139
5140 ostream& operator<<(ostream& out, const object_manifest_t& om)
5141 {
5142 return out << "type:" << om.type << " redirect_target:" << om.redirect_target;
5143 }
5144
5145 // -- object_info_t --
5146
5147 void object_info_t::copy_user_bits(const object_info_t& other)
5148 {
5149 // these bits are copied from head->clone.
5150 size = other.size;
5151 mtime = other.mtime;
5152 local_mtime = other.local_mtime;
5153 last_reqid = other.last_reqid;
5154 truncate_seq = other.truncate_seq;
5155 truncate_size = other.truncate_size;
5156 flags = other.flags;
5157 user_version = other.user_version;
5158 data_digest = other.data_digest;
5159 omap_digest = other.omap_digest;
5160 }
5161
5162 ps_t object_info_t::legacy_object_locator_to_ps(const object_t &oid,
5163 const object_locator_t &loc) {
5164 ps_t ps;
5165 if (loc.key.length())
5166 // Hack, we don't have the osd map, so we don't really know the hash...
5167 ps = ceph_str_hash(CEPH_STR_HASH_RJENKINS, loc.key.c_str(),
5168 loc.key.length());
5169 else
5170 ps = ceph_str_hash(CEPH_STR_HASH_RJENKINS, oid.name.c_str(),
5171 oid.name.length());
5172 return ps;
5173 }
5174
5175 void object_info_t::encode(bufferlist& bl, uint64_t features) const
5176 {
5177 object_locator_t myoloc(soid);
5178 map<entity_name_t, watch_info_t> old_watchers;
5179 for (map<pair<uint64_t, entity_name_t>, watch_info_t>::const_iterator i =
5180 watchers.begin();
5181 i != watchers.end();
5182 ++i) {
5183 old_watchers.insert(make_pair(i->first.second, i->second));
5184 }
5185 ENCODE_START(17, 8, bl);
5186 ::encode(soid, bl);
5187 ::encode(myoloc, bl); //Retained for compatibility
5188 ::encode((__u32)0, bl); // was category, no longer used
5189 ::encode(version, bl);
5190 ::encode(prior_version, bl);
5191 ::encode(last_reqid, bl);
5192 ::encode(size, bl);
5193 ::encode(mtime, bl);
5194 if (soid.snap == CEPH_NOSNAP)
5195 ::encode(osd_reqid_t(), bl); // used to be wrlock_by
5196 else
5197 ::encode(legacy_snaps, bl);
5198 ::encode(truncate_seq, bl);
5199 ::encode(truncate_size, bl);
5200 ::encode(is_lost(), bl);
5201 ::encode(old_watchers, bl, features);
5202 /* shenanigans to avoid breaking backwards compatibility in the disk format.
5203 * When we can, switch this out for simply putting the version_t on disk. */
5204 eversion_t user_eversion(0, user_version);
5205 ::encode(user_eversion, bl);
5206 ::encode(test_flag(FLAG_USES_TMAP), bl);
5207 ::encode(watchers, bl, features);
5208 __u32 _flags = flags;
5209 ::encode(_flags, bl);
5210 ::encode(local_mtime, bl);
5211 ::encode(data_digest, bl);
5212 ::encode(omap_digest, bl);
5213 ::encode(expected_object_size, bl);
5214 ::encode(expected_write_size, bl);
5215 ::encode(alloc_hint_flags, bl);
5216 if (has_manifest()) {
5217 ::encode(manifest, bl);
5218 }
5219 ENCODE_FINISH(bl);
5220 }
5221
5222 void object_info_t::decode(bufferlist::iterator& bl)
5223 {
5224 object_locator_t myoloc;
5225 DECODE_START_LEGACY_COMPAT_LEN(17, 8, 8, bl);
5226 map<entity_name_t, watch_info_t> old_watchers;
5227 ::decode(soid, bl);
5228 ::decode(myoloc, bl);
5229 {
5230 string category;
5231 ::decode(category, bl); // no longer used
5232 }
5233 ::decode(version, bl);
5234 ::decode(prior_version, bl);
5235 ::decode(last_reqid, bl);
5236 ::decode(size, bl);
5237 ::decode(mtime, bl);
5238 if (soid.snap == CEPH_NOSNAP) {
5239 osd_reqid_t wrlock_by;
5240 ::decode(wrlock_by, bl);
5241 } else {
5242 ::decode(legacy_snaps, bl);
5243 }
5244 ::decode(truncate_seq, bl);
5245 ::decode(truncate_size, bl);
5246
5247 // if this is struct_v >= 13, we will overwrite this
5248 // below since this field is just here for backwards
5249 // compatibility
5250 __u8 lo;
5251 ::decode(lo, bl);
5252 flags = (flag_t)lo;
5253
5254 ::decode(old_watchers, bl);
5255 eversion_t user_eversion;
5256 ::decode(user_eversion, bl);
5257 user_version = user_eversion.version;
5258
5259 if (struct_v >= 9) {
5260 bool uses_tmap = false;
5261 ::decode(uses_tmap, bl);
5262 if (uses_tmap)
5263 set_flag(FLAG_USES_TMAP);
5264 } else {
5265 set_flag(FLAG_USES_TMAP);
5266 }
5267 if (struct_v < 10)
5268 soid.pool = myoloc.pool;
5269 if (struct_v >= 11) {
5270 ::decode(watchers, bl);
5271 } else {
5272 for (map<entity_name_t, watch_info_t>::iterator i = old_watchers.begin();
5273 i != old_watchers.end();
5274 ++i) {
5275 watchers.insert(
5276 make_pair(
5277 make_pair(i->second.cookie, i->first), i->second));
5278 }
5279 }
5280 if (struct_v >= 13) {
5281 __u32 _flags;
5282 ::decode(_flags, bl);
5283 flags = (flag_t)_flags;
5284 }
5285 if (struct_v >= 14) {
5286 ::decode(local_mtime, bl);
5287 } else {
5288 local_mtime = utime_t();
5289 }
5290 if (struct_v >= 15) {
5291 ::decode(data_digest, bl);
5292 ::decode(omap_digest, bl);
5293 } else {
5294 data_digest = omap_digest = -1;
5295 clear_flag(FLAG_DATA_DIGEST);
5296 clear_flag(FLAG_OMAP_DIGEST);
5297 }
5298 if (struct_v >= 16) {
5299 ::decode(expected_object_size, bl);
5300 ::decode(expected_write_size, bl);
5301 ::decode(alloc_hint_flags, bl);
5302 } else {
5303 expected_object_size = 0;
5304 expected_write_size = 0;
5305 alloc_hint_flags = 0;
5306 }
5307 if (struct_v >= 17) {
5308 if (has_manifest()) {
5309 ::decode(manifest, bl);
5310 }
5311 }
5312 DECODE_FINISH(bl);
5313 }
5314
5315 void object_info_t::dump(Formatter *f) const
5316 {
5317 f->open_object_section("oid");
5318 soid.dump(f);
5319 f->close_section();
5320 f->dump_stream("version") << version;
5321 f->dump_stream("prior_version") << prior_version;
5322 f->dump_stream("last_reqid") << last_reqid;
5323 f->dump_unsigned("user_version", user_version);
5324 f->dump_unsigned("size", size);
5325 f->dump_stream("mtime") << mtime;
5326 f->dump_stream("local_mtime") << local_mtime;
5327 f->dump_unsigned("lost", (int)is_lost());
5328 vector<string> sv = get_flag_vector(flags);
5329 f->open_array_section("flags");
5330 for (auto str: sv)
5331 f->dump_string("flags", str);
5332 f->close_section();
5333 f->open_array_section("legacy_snaps");
5334 for (auto s : legacy_snaps) {
5335 f->dump_unsigned("snap", s);
5336 }
5337 f->close_section();
5338 f->dump_unsigned("truncate_seq", truncate_seq);
5339 f->dump_unsigned("truncate_size", truncate_size);
5340 f->dump_format("data_digest", "0x%08x", data_digest);
5341 f->dump_format("omap_digest", "0x%08x", omap_digest);
5342 f->dump_unsigned("expected_object_size", expected_object_size);
5343 f->dump_unsigned("expected_write_size", expected_write_size);
5344 f->dump_unsigned("alloc_hint_flags", alloc_hint_flags);
5345 f->dump_object("manifest", manifest);
5346 f->open_object_section("watchers");
5347 for (map<pair<uint64_t, entity_name_t>,watch_info_t>::const_iterator p =
5348 watchers.begin(); p != watchers.end(); ++p) {
5349 stringstream ss;
5350 ss << p->first.second;
5351 f->open_object_section(ss.str().c_str());
5352 p->second.dump(f);
5353 f->close_section();
5354 }
5355 f->close_section();
5356 }
5357
5358 void object_info_t::generate_test_instances(list<object_info_t*>& o)
5359 {
5360 o.push_back(new object_info_t());
5361
5362 // fixme
5363 }
5364
5365
5366 ostream& operator<<(ostream& out, const object_info_t& oi)
5367 {
5368 out << oi.soid << "(" << oi.version
5369 << " " << oi.last_reqid;
5370 if (oi.soid.snap != CEPH_NOSNAP && !oi.legacy_snaps.empty())
5371 out << " " << oi.legacy_snaps;
5372 if (oi.flags)
5373 out << " " << oi.get_flag_string();
5374 out << " s " << oi.size;
5375 out << " uv " << oi.user_version;
5376 if (oi.is_data_digest())
5377 out << " dd " << std::hex << oi.data_digest << std::dec;
5378 if (oi.is_omap_digest())
5379 out << " od " << std::hex << oi.omap_digest << std::dec;
5380 out << " alloc_hint [" << oi.expected_object_size
5381 << " " << oi.expected_write_size
5382 << " " << oi.alloc_hint_flags << "]";
5383 if (oi.has_manifest())
5384 out << " " << oi.manifest;
5385
5386 out << ")";
5387 return out;
5388 }
5389
5390 // -- ObjectRecovery --
5391 void ObjectRecoveryProgress::encode(bufferlist &bl) const
5392 {
5393 ENCODE_START(1, 1, bl);
5394 ::encode(first, bl);
5395 ::encode(data_complete, bl);
5396 ::encode(data_recovered_to, bl);
5397 ::encode(omap_recovered_to, bl);
5398 ::encode(omap_complete, bl);
5399 ENCODE_FINISH(bl);
5400 }
5401
5402 void ObjectRecoveryProgress::decode(bufferlist::iterator &bl)
5403 {
5404 DECODE_START(1, bl);
5405 ::decode(first, bl);
5406 ::decode(data_complete, bl);
5407 ::decode(data_recovered_to, bl);
5408 ::decode(omap_recovered_to, bl);
5409 ::decode(omap_complete, bl);
5410 DECODE_FINISH(bl);
5411 }
5412
5413 ostream &operator<<(ostream &out, const ObjectRecoveryProgress &prog)
5414 {
5415 return prog.print(out);
5416 }
5417
5418 void ObjectRecoveryProgress::generate_test_instances(
5419 list<ObjectRecoveryProgress*>& o)
5420 {
5421 o.push_back(new ObjectRecoveryProgress);
5422 o.back()->first = false;
5423 o.back()->data_complete = true;
5424 o.back()->omap_complete = true;
5425 o.back()->data_recovered_to = 100;
5426
5427 o.push_back(new ObjectRecoveryProgress);
5428 o.back()->first = true;
5429 o.back()->data_complete = false;
5430 o.back()->omap_complete = false;
5431 o.back()->data_recovered_to = 0;
5432 }
5433
5434 ostream &ObjectRecoveryProgress::print(ostream &out) const
5435 {
5436 return out << "ObjectRecoveryProgress("
5437 << ( first ? "" : "!" ) << "first, "
5438 << "data_recovered_to:" << data_recovered_to
5439 << ", data_complete:" << ( data_complete ? "true" : "false" )
5440 << ", omap_recovered_to:" << omap_recovered_to
5441 << ", omap_complete:" << ( omap_complete ? "true" : "false" )
5442 << ", error:" << ( error ? "true" : "false" )
5443 << ")";
5444 }
5445
5446 void ObjectRecoveryProgress::dump(Formatter *f) const
5447 {
5448 f->dump_int("first?", first);
5449 f->dump_int("data_complete?", data_complete);
5450 f->dump_unsigned("data_recovered_to", data_recovered_to);
5451 f->dump_int("omap_complete?", omap_complete);
5452 f->dump_string("omap_recovered_to", omap_recovered_to);
5453 }
5454
5455 void ObjectRecoveryInfo::encode(bufferlist &bl, uint64_t features) const
5456 {
5457 ENCODE_START(2, 1, bl);
5458 ::encode(soid, bl);
5459 ::encode(version, bl);
5460 ::encode(size, bl);
5461 ::encode(oi, bl, features);
5462 ::encode(ss, bl);
5463 ::encode(copy_subset, bl);
5464 ::encode(clone_subset, bl);
5465 ENCODE_FINISH(bl);
5466 }
5467
5468 void ObjectRecoveryInfo::decode(bufferlist::iterator &bl,
5469 int64_t pool)
5470 {
5471 DECODE_START(2, bl);
5472 ::decode(soid, bl);
5473 ::decode(version, bl);
5474 ::decode(size, bl);
5475 ::decode(oi, bl);
5476 ::decode(ss, bl);
5477 ::decode(copy_subset, bl);
5478 ::decode(clone_subset, bl);
5479 DECODE_FINISH(bl);
5480
5481 if (struct_v < 2) {
5482 if (!soid.is_max() && soid.pool == -1)
5483 soid.pool = pool;
5484 map<hobject_t, interval_set<uint64_t>> tmp;
5485 tmp.swap(clone_subset);
5486 for (map<hobject_t, interval_set<uint64_t>>::iterator i = tmp.begin();
5487 i != tmp.end();
5488 ++i) {
5489 hobject_t first(i->first);
5490 if (!first.is_max() && first.pool == -1)
5491 first.pool = pool;
5492 clone_subset[first].swap(i->second);
5493 }
5494 }
5495 }
5496
5497 void ObjectRecoveryInfo::generate_test_instances(
5498 list<ObjectRecoveryInfo*>& o)
5499 {
5500 o.push_back(new ObjectRecoveryInfo);
5501 o.back()->soid = hobject_t(sobject_t("key", CEPH_NOSNAP));
5502 o.back()->version = eversion_t(0,0);
5503 o.back()->size = 100;
5504 }
5505
5506
5507 void ObjectRecoveryInfo::dump(Formatter *f) const
5508 {
5509 f->dump_stream("object") << soid;
5510 f->dump_stream("at_version") << version;
5511 f->dump_stream("size") << size;
5512 {
5513 f->open_object_section("object_info");
5514 oi.dump(f);
5515 f->close_section();
5516 }
5517 {
5518 f->open_object_section("snapset");
5519 ss.dump(f);
5520 f->close_section();
5521 }
5522 f->dump_stream("copy_subset") << copy_subset;
5523 f->dump_stream("clone_subset") << clone_subset;
5524 }
5525
5526 ostream& operator<<(ostream& out, const ObjectRecoveryInfo &inf)
5527 {
5528 return inf.print(out);
5529 }
5530
5531 ostream &ObjectRecoveryInfo::print(ostream &out) const
5532 {
5533 return out << "ObjectRecoveryInfo("
5534 << soid << "@" << version
5535 << ", size: " << size
5536 << ", copy_subset: " << copy_subset
5537 << ", clone_subset: " << clone_subset
5538 << ", snapset: " << ss
5539 << ")";
5540 }
5541
5542 // -- PushReplyOp --
5543 void PushReplyOp::generate_test_instances(list<PushReplyOp*> &o)
5544 {
5545 o.push_back(new PushReplyOp);
5546 o.push_back(new PushReplyOp);
5547 o.back()->soid = hobject_t(sobject_t("asdf", 2));
5548 o.push_back(new PushReplyOp);
5549 o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP));
5550 }
5551
5552 void PushReplyOp::encode(bufferlist &bl) const
5553 {
5554 ENCODE_START(1, 1, bl);
5555 ::encode(soid, bl);
5556 ENCODE_FINISH(bl);
5557 }
5558
5559 void PushReplyOp::decode(bufferlist::iterator &bl)
5560 {
5561 DECODE_START(1, bl);
5562 ::decode(soid, bl);
5563 DECODE_FINISH(bl);
5564 }
5565
5566 void PushReplyOp::dump(Formatter *f) const
5567 {
5568 f->dump_stream("soid") << soid;
5569 }
5570
5571 ostream &PushReplyOp::print(ostream &out) const
5572 {
5573 return out
5574 << "PushReplyOp(" << soid
5575 << ")";
5576 }
5577
5578 ostream& operator<<(ostream& out, const PushReplyOp &op)
5579 {
5580 return op.print(out);
5581 }
5582
5583 uint64_t PushReplyOp::cost(CephContext *cct) const
5584 {
5585
5586 return cct->_conf->osd_push_per_object_cost +
5587 cct->_conf->osd_recovery_max_chunk;
5588 }
5589
5590 // -- PullOp --
5591 void PullOp::generate_test_instances(list<PullOp*> &o)
5592 {
5593 o.push_back(new PullOp);
5594 o.push_back(new PullOp);
5595 o.back()->soid = hobject_t(sobject_t("asdf", 2));
5596 o.back()->recovery_info.version = eversion_t(3, 10);
5597 o.push_back(new PullOp);
5598 o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP));
5599 o.back()->recovery_info.version = eversion_t(0, 0);
5600 }
5601
5602 void PullOp::encode(bufferlist &bl, uint64_t features) const
5603 {
5604 ENCODE_START(1, 1, bl);
5605 ::encode(soid, bl);
5606 ::encode(recovery_info, bl, features);
5607 ::encode(recovery_progress, bl);
5608 ENCODE_FINISH(bl);
5609 }
5610
5611 void PullOp::decode(bufferlist::iterator &bl)
5612 {
5613 DECODE_START(1, bl);
5614 ::decode(soid, bl);
5615 ::decode(recovery_info, bl);
5616 ::decode(recovery_progress, bl);
5617 DECODE_FINISH(bl);
5618 }
5619
5620 void PullOp::dump(Formatter *f) const
5621 {
5622 f->dump_stream("soid") << soid;
5623 {
5624 f->open_object_section("recovery_info");
5625 recovery_info.dump(f);
5626 f->close_section();
5627 }
5628 {
5629 f->open_object_section("recovery_progress");
5630 recovery_progress.dump(f);
5631 f->close_section();
5632 }
5633 }
5634
5635 ostream &PullOp::print(ostream &out) const
5636 {
5637 return out
5638 << "PullOp(" << soid
5639 << ", recovery_info: " << recovery_info
5640 << ", recovery_progress: " << recovery_progress
5641 << ")";
5642 }
5643
5644 ostream& operator<<(ostream& out, const PullOp &op)
5645 {
5646 return op.print(out);
5647 }
5648
5649 uint64_t PullOp::cost(CephContext *cct) const
5650 {
5651 return cct->_conf->osd_push_per_object_cost +
5652 cct->_conf->osd_recovery_max_chunk;
5653 }
5654
5655 // -- PushOp --
5656 void PushOp::generate_test_instances(list<PushOp*> &o)
5657 {
5658 o.push_back(new PushOp);
5659 o.push_back(new PushOp);
5660 o.back()->soid = hobject_t(sobject_t("asdf", 2));
5661 o.back()->version = eversion_t(3, 10);
5662 o.push_back(new PushOp);
5663 o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP));
5664 o.back()->version = eversion_t(0, 0);
5665 }
5666
5667 void PushOp::encode(bufferlist &bl, uint64_t features) const
5668 {
5669 ENCODE_START(1, 1, bl);
5670 ::encode(soid, bl);
5671 ::encode(version, bl);
5672 ::encode(data, bl);
5673 ::encode(data_included, bl);
5674 ::encode(omap_header, bl);
5675 ::encode(omap_entries, bl);
5676 ::encode(attrset, bl);
5677 ::encode(recovery_info, bl, features);
5678 ::encode(after_progress, bl);
5679 ::encode(before_progress, bl);
5680 ENCODE_FINISH(bl);
5681 }
5682
5683 void PushOp::decode(bufferlist::iterator &bl)
5684 {
5685 DECODE_START(1, bl);
5686 ::decode(soid, bl);
5687 ::decode(version, bl);
5688 ::decode(data, bl);
5689 ::decode(data_included, bl);
5690 ::decode(omap_header, bl);
5691 ::decode(omap_entries, bl);
5692 ::decode(attrset, bl);
5693 ::decode(recovery_info, bl);
5694 ::decode(after_progress, bl);
5695 ::decode(before_progress, bl);
5696 DECODE_FINISH(bl);
5697 }
5698
5699 void PushOp::dump(Formatter *f) const
5700 {
5701 f->dump_stream("soid") << soid;
5702 f->dump_stream("version") << version;
5703 f->dump_int("data_len", data.length());
5704 f->dump_stream("data_included") << data_included;
5705 f->dump_int("omap_header_len", omap_header.length());
5706 f->dump_int("omap_entries_len", omap_entries.size());
5707 f->dump_int("attrset_len", attrset.size());
5708 {
5709 f->open_object_section("recovery_info");
5710 recovery_info.dump(f);
5711 f->close_section();
5712 }
5713 {
5714 f->open_object_section("after_progress");
5715 after_progress.dump(f);
5716 f->close_section();
5717 }
5718 {
5719 f->open_object_section("before_progress");
5720 before_progress.dump(f);
5721 f->close_section();
5722 }
5723 }
5724
5725 ostream &PushOp::print(ostream &out) const
5726 {
5727 return out
5728 << "PushOp(" << soid
5729 << ", version: " << version
5730 << ", data_included: " << data_included
5731 << ", data_size: " << data.length()
5732 << ", omap_header_size: " << omap_header.length()
5733 << ", omap_entries_size: " << omap_entries.size()
5734 << ", attrset_size: " << attrset.size()
5735 << ", recovery_info: " << recovery_info
5736 << ", after_progress: " << after_progress
5737 << ", before_progress: " << before_progress
5738 << ")";
5739 }
5740
5741 ostream& operator<<(ostream& out, const PushOp &op)
5742 {
5743 return op.print(out);
5744 }
5745
5746 uint64_t PushOp::cost(CephContext *cct) const
5747 {
5748 uint64_t cost = data_included.size();
5749 for (map<string, bufferlist>::const_iterator i =
5750 omap_entries.begin();
5751 i != omap_entries.end();
5752 ++i) {
5753 cost += i->second.length();
5754 }
5755 cost += cct->_conf->osd_push_per_object_cost;
5756 return cost;
5757 }
5758
5759 // -- ScrubMap --
5760
5761 void ScrubMap::merge_incr(const ScrubMap &l)
5762 {
5763 assert(valid_through == l.incr_since);
5764 valid_through = l.valid_through;
5765
5766 for (map<hobject_t,object>::const_iterator p = l.objects.begin();
5767 p != l.objects.end();
5768 ++p){
5769 if (p->second.negative) {
5770 map<hobject_t,object>::iterator q = objects.find(p->first);
5771 if (q != objects.end()) {
5772 objects.erase(q);
5773 }
5774 } else {
5775 objects[p->first] = p->second;
5776 }
5777 }
5778 }
5779
5780 void ScrubMap::encode(bufferlist& bl) const
5781 {
5782 ENCODE_START(3, 2, bl);
5783 ::encode(objects, bl);
5784 ::encode((__u32)0, bl); // used to be attrs; now deprecated
5785 bufferlist old_logbl; // not used
5786 ::encode(old_logbl, bl);
5787 ::encode(valid_through, bl);
5788 ::encode(incr_since, bl);
5789 ENCODE_FINISH(bl);
5790 }
5791
5792 void ScrubMap::decode(bufferlist::iterator& bl, int64_t pool)
5793 {
5794 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
5795 ::decode(objects, bl);
5796 {
5797 map<string,string> attrs; // deprecated
5798 ::decode(attrs, bl);
5799 }
5800 bufferlist old_logbl; // not used
5801 ::decode(old_logbl, bl);
5802 ::decode(valid_through, bl);
5803 ::decode(incr_since, bl);
5804 DECODE_FINISH(bl);
5805
5806 // handle hobject_t upgrade
5807 if (struct_v < 3) {
5808 map<hobject_t, object> tmp;
5809 tmp.swap(objects);
5810 for (map<hobject_t, object>::iterator i = tmp.begin();
5811 i != tmp.end();
5812 ++i) {
5813 hobject_t first(i->first);
5814 if (!first.is_max() && first.pool == -1)
5815 first.pool = pool;
5816 objects[first] = i->second;
5817 }
5818 }
5819 }
5820
5821 void ScrubMap::dump(Formatter *f) const
5822 {
5823 f->dump_stream("valid_through") << valid_through;
5824 f->dump_stream("incremental_since") << incr_since;
5825 f->open_array_section("objects");
5826 for (map<hobject_t,object>::const_iterator p = objects.begin(); p != objects.end(); ++p) {
5827 f->open_object_section("object");
5828 f->dump_string("name", p->first.oid.name);
5829 f->dump_unsigned("hash", p->first.get_hash());
5830 f->dump_string("key", p->first.get_key());
5831 f->dump_int("snapid", p->first.snap);
5832 p->second.dump(f);
5833 f->close_section();
5834 }
5835 f->close_section();
5836 }
5837
5838 void ScrubMap::generate_test_instances(list<ScrubMap*>& o)
5839 {
5840 o.push_back(new ScrubMap);
5841 o.push_back(new ScrubMap);
5842 o.back()->valid_through = eversion_t(1, 2);
5843 o.back()->incr_since = eversion_t(3, 4);
5844 list<object*> obj;
5845 object::generate_test_instances(obj);
5846 o.back()->objects[hobject_t(object_t("foo"), "fookey", 123, 456, 0, "")] = *obj.back();
5847 obj.pop_back();
5848 o.back()->objects[hobject_t(object_t("bar"), string(), 123, 456, 0, "")] = *obj.back();
5849 }
5850
5851 // -- ScrubMap::object --
5852
5853 void ScrubMap::object::encode(bufferlist& bl) const
5854 {
5855 bool compat_read_error = read_error || ec_hash_mismatch || ec_size_mismatch;
5856 ENCODE_START(9, 7, bl);
5857 ::encode(size, bl);
5858 ::encode(negative, bl);
5859 ::encode(attrs, bl);
5860 ::encode(digest, bl);
5861 ::encode(digest_present, bl);
5862 ::encode((uint32_t)0, bl); // obsolete nlinks
5863 ::encode((uint32_t)0, bl); // snapcolls
5864 ::encode(omap_digest, bl);
5865 ::encode(omap_digest_present, bl);
5866 ::encode(compat_read_error, bl);
5867 ::encode(stat_error, bl);
5868 ::encode(read_error, bl);
5869 ::encode(ec_hash_mismatch, bl);
5870 ::encode(ec_size_mismatch, bl);
5871 ::encode(large_omap_object_found, bl);
5872 ::encode(large_omap_object_key_count, bl);
5873 ::encode(large_omap_object_value_size, bl);
5874 ENCODE_FINISH(bl);
5875 }
5876
5877 void ScrubMap::object::decode(bufferlist::iterator& bl)
5878 {
5879 DECODE_START(9, bl);
5880 ::decode(size, bl);
5881 bool tmp, compat_read_error = false;
5882 ::decode(tmp, bl);
5883 negative = tmp;
5884 ::decode(attrs, bl);
5885 ::decode(digest, bl);
5886 ::decode(tmp, bl);
5887 digest_present = tmp;
5888 {
5889 uint32_t nlinks;
5890 ::decode(nlinks, bl);
5891 set<snapid_t> snapcolls;
5892 ::decode(snapcolls, bl);
5893 }
5894 ::decode(omap_digest, bl);
5895 ::decode(tmp, bl);
5896 omap_digest_present = tmp;
5897 ::decode(compat_read_error, bl);
5898 ::decode(tmp, bl);
5899 stat_error = tmp;
5900 if (struct_v >= 8) {
5901 ::decode(tmp, bl);
5902 read_error = tmp;
5903 ::decode(tmp, bl);
5904 ec_hash_mismatch = tmp;
5905 ::decode(tmp, bl);
5906 ec_size_mismatch = tmp;
5907 }
5908 // If older encoder found a read_error, set read_error
5909 if (compat_read_error && !read_error && !ec_hash_mismatch && !ec_size_mismatch)
5910 read_error = true;
5911 if (struct_v >= 9) {
5912 ::decode(tmp, bl);
5913 large_omap_object_found = tmp;
5914 ::decode(large_omap_object_key_count, bl);
5915 ::decode(large_omap_object_value_size, bl);
5916 }
5917 DECODE_FINISH(bl);
5918 }
5919
5920 void ScrubMap::object::dump(Formatter *f) const
5921 {
5922 f->dump_int("size", size);
5923 f->dump_int("negative", negative);
5924 f->open_array_section("attrs");
5925 for (map<string,bufferptr>::const_iterator p = attrs.begin(); p != attrs.end(); ++p) {
5926 f->open_object_section("attr");
5927 f->dump_string("name", p->first);
5928 f->dump_int("length", p->second.length());
5929 f->close_section();
5930 }
5931 f->close_section();
5932 }
5933
5934 void ScrubMap::object::generate_test_instances(list<object*>& o)
5935 {
5936 o.push_back(new object);
5937 o.push_back(new object);
5938 o.back()->negative = true;
5939 o.push_back(new object);
5940 o.back()->size = 123;
5941 o.back()->attrs["foo"] = buffer::copy("foo", 3);
5942 o.back()->attrs["bar"] = buffer::copy("barval", 6);
5943 }
5944
5945 // -- OSDOp --
5946
5947 ostream& operator<<(ostream& out, const OSDOp& op)
5948 {
5949 out << ceph_osd_op_name(op.op.op);
5950 if (ceph_osd_op_type_data(op.op.op)) {
5951 // data extent
5952 switch (op.op.op) {
5953 case CEPH_OSD_OP_ASSERT_VER:
5954 out << " v" << op.op.assert_ver.ver;
5955 break;
5956 case CEPH_OSD_OP_TRUNCATE:
5957 out << " " << op.op.extent.offset;
5958 break;
5959 case CEPH_OSD_OP_MASKTRUNC:
5960 case CEPH_OSD_OP_TRIMTRUNC:
5961 out << " " << op.op.extent.truncate_seq << "@"
5962 << (int64_t)op.op.extent.truncate_size;
5963 break;
5964 case CEPH_OSD_OP_ROLLBACK:
5965 out << " " << snapid_t(op.op.snap.snapid);
5966 break;
5967 case CEPH_OSD_OP_WATCH:
5968 out << " " << ceph_osd_watch_op_name(op.op.watch.op)
5969 << " cookie " << op.op.watch.cookie;
5970 if (op.op.watch.gen)
5971 out << " gen " << op.op.watch.gen;
5972 break;
5973 case CEPH_OSD_OP_NOTIFY:
5974 case CEPH_OSD_OP_NOTIFY_ACK:
5975 out << " cookie " << op.op.notify.cookie;
5976 break;
5977 case CEPH_OSD_OP_COPY_GET:
5978 out << " max " << op.op.copy_get.max;
5979 break;
5980 case CEPH_OSD_OP_COPY_FROM:
5981 out << " ver " << op.op.copy_from.src_version;
5982 break;
5983 case CEPH_OSD_OP_SETALLOCHINT:
5984 out << " object_size " << op.op.alloc_hint.expected_object_size
5985 << " write_size " << op.op.alloc_hint.expected_write_size;
5986 break;
5987 case CEPH_OSD_OP_READ:
5988 case CEPH_OSD_OP_SPARSE_READ:
5989 case CEPH_OSD_OP_SYNC_READ:
5990 case CEPH_OSD_OP_WRITE:
5991 case CEPH_OSD_OP_WRITEFULL:
5992 case CEPH_OSD_OP_ZERO:
5993 case CEPH_OSD_OP_APPEND:
5994 case CEPH_OSD_OP_MAPEXT:
5995 out << " " << op.op.extent.offset << "~" << op.op.extent.length;
5996 if (op.op.extent.truncate_seq)
5997 out << " [" << op.op.extent.truncate_seq << "@"
5998 << (int64_t)op.op.extent.truncate_size << "]";
5999 if (op.op.flags)
6000 out << " [" << ceph_osd_op_flag_string(op.op.flags) << "]";
6001 default:
6002 // don't show any arg info
6003 break;
6004 }
6005 } else if (ceph_osd_op_type_attr(op.op.op)) {
6006 // xattr name
6007 if (op.op.xattr.name_len && op.indata.length()) {
6008 out << " ";
6009 op.indata.write(0, op.op.xattr.name_len, out);
6010 }
6011 if (op.op.xattr.value_len)
6012 out << " (" << op.op.xattr.value_len << ")";
6013 if (op.op.op == CEPH_OSD_OP_CMPXATTR)
6014 out << " op " << (int)op.op.xattr.cmp_op
6015 << " mode " << (int)op.op.xattr.cmp_mode;
6016 } else if (ceph_osd_op_type_exec(op.op.op)) {
6017 // class.method
6018 if (op.op.cls.class_len && op.indata.length()) {
6019 out << " ";
6020 op.indata.write(0, op.op.cls.class_len, out);
6021 out << ".";
6022 op.indata.write(op.op.cls.class_len, op.op.cls.method_len, out);
6023 }
6024 } else if (ceph_osd_op_type_pg(op.op.op)) {
6025 switch (op.op.op) {
6026 case CEPH_OSD_OP_PGLS:
6027 case CEPH_OSD_OP_PGLS_FILTER:
6028 case CEPH_OSD_OP_PGNLS:
6029 case CEPH_OSD_OP_PGNLS_FILTER:
6030 out << " start_epoch " << op.op.pgls.start_epoch;
6031 break;
6032 case CEPH_OSD_OP_PG_HITSET_LS:
6033 break;
6034 case CEPH_OSD_OP_PG_HITSET_GET:
6035 out << " " << utime_t(op.op.hit_set_get.stamp);
6036 break;
6037 case CEPH_OSD_OP_SCRUBLS:
6038 break;
6039 }
6040 }
6041 return out;
6042 }
6043
6044
6045 void OSDOp::split_osd_op_vector_in_data(vector<OSDOp>& ops, bufferlist& in)
6046 {
6047 bufferlist::iterator datap = in.begin();
6048 for (unsigned i = 0; i < ops.size(); i++) {
6049 if (ops[i].op.payload_len) {
6050 datap.copy(ops[i].op.payload_len, ops[i].indata);
6051 }
6052 }
6053 }
6054
6055 void OSDOp::merge_osd_op_vector_in_data(vector<OSDOp>& ops, bufferlist& out)
6056 {
6057 for (unsigned i = 0; i < ops.size(); i++) {
6058 if (ops[i].indata.length()) {
6059 ops[i].op.payload_len = ops[i].indata.length();
6060 out.append(ops[i].indata);
6061 }
6062 }
6063 }
6064
6065 void OSDOp::split_osd_op_vector_out_data(vector<OSDOp>& ops, bufferlist& in)
6066 {
6067 bufferlist::iterator datap = in.begin();
6068 for (unsigned i = 0; i < ops.size(); i++) {
6069 if (ops[i].op.payload_len) {
6070 datap.copy(ops[i].op.payload_len, ops[i].outdata);
6071 }
6072 }
6073 }
6074
6075 void OSDOp::merge_osd_op_vector_out_data(vector<OSDOp>& ops, bufferlist& out)
6076 {
6077 for (unsigned i = 0; i < ops.size(); i++) {
6078 if (ops[i].outdata.length()) {
6079 ops[i].op.payload_len = ops[i].outdata.length();
6080 out.append(ops[i].outdata);
6081 }
6082 }
6083 }
6084
6085 bool store_statfs_t::operator==(const store_statfs_t& other) const
6086 {
6087 return total == other.total
6088 && available == other.available
6089 && allocated == other.allocated
6090 && stored == other.stored
6091 && compressed == other.compressed
6092 && compressed_allocated == other.compressed_allocated
6093 && compressed_original == other.compressed_original;
6094 }
6095
6096 void store_statfs_t::dump(Formatter *f) const
6097 {
6098 f->dump_int("total", total);
6099 f->dump_int("available", available);
6100 f->dump_int("allocated", allocated);
6101 f->dump_int("stored", stored);
6102 f->dump_int("compressed", compressed);
6103 f->dump_int("compressed_allocated", compressed_allocated);
6104 f->dump_int("compressed_original", compressed_original);
6105 }
6106
6107 ostream& operator<<(ostream& out, const store_statfs_t &s)
6108 {
6109 out << std::hex
6110 << "store_statfs(0x" << s.available
6111 << "/0x" << s.total
6112 << ", stored 0x" << s.stored
6113 << "/0x" << s.allocated
6114 << ", compress 0x" << s.compressed
6115 << "/0x" << s.compressed_allocated
6116 << "/0x" << s.compressed_original
6117 << std::dec
6118 << ")";
6119 return out;
6120 }
6121
6122 void OSDOp::clear_data(vector<OSDOp>& ops)
6123 {
6124 for (unsigned i = 0; i < ops.size(); i++) {
6125 OSDOp& op = ops[i];
6126 op.outdata.clear();
6127 if (ceph_osd_op_type_attr(op.op.op) &&
6128 op.op.xattr.name_len &&
6129 op.indata.length() >= op.op.xattr.name_len) {
6130 bufferptr bp(op.op.xattr.name_len);
6131 bufferlist bl;
6132 bl.append(bp);
6133 bl.copy_in(0, op.op.xattr.name_len, op.indata);
6134 op.indata.claim(bl);
6135 } else if (ceph_osd_op_type_exec(op.op.op) &&
6136 op.op.cls.class_len &&
6137 op.indata.length() >
6138 (op.op.cls.class_len + op.op.cls.method_len)) {
6139 __u8 len = op.op.cls.class_len + op.op.cls.method_len;
6140 bufferptr bp(len);
6141 bufferlist bl;
6142 bl.append(bp);
6143 bl.copy_in(0, len, op.indata);
6144 op.indata.claim(bl);
6145 } else {
6146 op.indata.clear();
6147 }
6148 }
6149 }