]> git.proxmox.com Git - ceph.git/blob - ceph/src/osd/osd_types.cc
update sources to 12.2.10
[ceph.git] / ceph / src / osd / osd_types.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2011 New Dream Network
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
18 #include <boost/assign/list_of.hpp>
19
20 #include "osd_types.h"
21 #include "include/ceph_features.h"
22 extern "C" {
23 #include "crush/hash.h"
24 }
25 #include "PG.h"
26 #include "OSDMap.h"
27 #include "PGBackend.h"
28
29 const char *ceph_osd_flag_name(unsigned flag)
30 {
31 switch (flag) {
32 case CEPH_OSD_FLAG_ACK: return "ack";
33 case CEPH_OSD_FLAG_ONNVRAM: return "onnvram";
34 case CEPH_OSD_FLAG_ONDISK: return "ondisk";
35 case CEPH_OSD_FLAG_RETRY: return "retry";
36 case CEPH_OSD_FLAG_READ: return "read";
37 case CEPH_OSD_FLAG_WRITE: return "write";
38 case CEPH_OSD_FLAG_ORDERSNAP: return "ordersnap";
39 case CEPH_OSD_FLAG_PEERSTAT_OLD: return "peerstat_old";
40 case CEPH_OSD_FLAG_BALANCE_READS: return "balance_reads";
41 case CEPH_OSD_FLAG_PARALLELEXEC: return "parallelexec";
42 case CEPH_OSD_FLAG_PGOP: return "pgop";
43 case CEPH_OSD_FLAG_EXEC: return "exec";
44 case CEPH_OSD_FLAG_EXEC_PUBLIC: return "exec_public";
45 case CEPH_OSD_FLAG_LOCALIZE_READS: return "localize_reads";
46 case CEPH_OSD_FLAG_RWORDERED: return "rwordered";
47 case CEPH_OSD_FLAG_IGNORE_CACHE: return "ignore_cache";
48 case CEPH_OSD_FLAG_SKIPRWLOCKS: return "skiprwlocks";
49 case CEPH_OSD_FLAG_IGNORE_OVERLAY: return "ignore_overlay";
50 case CEPH_OSD_FLAG_FLUSH: return "flush";
51 case CEPH_OSD_FLAG_MAP_SNAP_CLONE: return "map_snap_clone";
52 case CEPH_OSD_FLAG_ENFORCE_SNAPC: return "enforce_snapc";
53 case CEPH_OSD_FLAG_REDIRECTED: return "redirected";
54 case CEPH_OSD_FLAG_KNOWN_REDIR: return "known_if_redirected";
55 case CEPH_OSD_FLAG_FULL_TRY: return "full_try";
56 case CEPH_OSD_FLAG_FULL_FORCE: return "full_force";
57 case CEPH_OSD_FLAG_IGNORE_REDIRECT: return "ignore_redirect";
58 default: return "???";
59 }
60 }
61
62 string ceph_osd_flag_string(unsigned flags)
63 {
64 string s;
65 for (unsigned i=0; i<32; ++i) {
66 if (flags & (1u<<i)) {
67 if (s.length())
68 s += "+";
69 s += ceph_osd_flag_name(1u << i);
70 }
71 }
72 if (s.length())
73 return s;
74 return string("-");
75 }
76
77 const char * ceph_osd_op_flag_name(unsigned flag)
78 {
79 const char *name;
80
81 switch(flag) {
82 case CEPH_OSD_OP_FLAG_EXCL:
83 name = "excl";
84 break;
85 case CEPH_OSD_OP_FLAG_FAILOK:
86 name = "failok";
87 break;
88 case CEPH_OSD_OP_FLAG_FADVISE_RANDOM:
89 name = "fadvise_random";
90 break;
91 case CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL:
92 name = "fadvise_sequential";
93 break;
94 case CEPH_OSD_OP_FLAG_FADVISE_WILLNEED:
95 name = "favise_willneed";
96 break;
97 case CEPH_OSD_OP_FLAG_FADVISE_DONTNEED:
98 name = "fadvise_dontneed";
99 break;
100 case CEPH_OSD_OP_FLAG_FADVISE_NOCACHE:
101 name = "fadvise_nocache";
102 break;
103 case CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE:
104 name = "bypass_clean_cache";
105 break;
106 default:
107 name = "???";
108 };
109
110 return name;
111 }
112
113 string ceph_osd_op_flag_string(unsigned flags)
114 {
115 string s;
116 for (unsigned i=0; i<32; ++i) {
117 if (flags & (1u<<i)) {
118 if (s.length())
119 s += "+";
120 s += ceph_osd_op_flag_name(1u << i);
121 }
122 }
123 if (s.length())
124 return s;
125 return string("-");
126 }
127
128 string ceph_osd_alloc_hint_flag_string(unsigned flags)
129 {
130 string s;
131 for (unsigned i=0; i<32; ++i) {
132 if (flags & (1u<<i)) {
133 if (s.length())
134 s += "+";
135 s += ceph_osd_alloc_hint_flag_name(1u << i);
136 }
137 }
138 if (s.length())
139 return s;
140 return string("-");
141 }
142
143 void pg_shard_t::encode(bufferlist &bl) const
144 {
145 ENCODE_START(1, 1, bl);
146 ::encode(osd, bl);
147 ::encode(shard, bl);
148 ENCODE_FINISH(bl);
149 }
150 void pg_shard_t::decode(bufferlist::iterator &bl)
151 {
152 DECODE_START(1, bl);
153 ::decode(osd, bl);
154 ::decode(shard, bl);
155 DECODE_FINISH(bl);
156 }
157
158 ostream &operator<<(ostream &lhs, const pg_shard_t &rhs)
159 {
160 if (rhs.is_undefined())
161 return lhs << "?";
162 if (rhs.shard == shard_id_t::NO_SHARD)
163 return lhs << rhs.get_osd();
164 return lhs << rhs.get_osd() << '(' << (unsigned)(rhs.shard) << ')';
165 }
166
167 // -- osd_reqid_t --
168 void osd_reqid_t::dump(Formatter *f) const
169 {
170 f->dump_stream("name") << name;
171 f->dump_int("inc", inc);
172 f->dump_unsigned("tid", tid);
173 }
174
175 void osd_reqid_t::generate_test_instances(list<osd_reqid_t*>& o)
176 {
177 o.push_back(new osd_reqid_t);
178 o.push_back(new osd_reqid_t(entity_name_t::CLIENT(123), 1, 45678));
179 }
180
181 // -- object_locator_t --
182
183 void object_locator_t::encode(bufferlist& bl) const
184 {
185 // verify that nobody's corrupted the locator
186 assert(hash == -1 || key.empty());
187 __u8 encode_compat = 3;
188 ENCODE_START(6, encode_compat, bl);
189 ::encode(pool, bl);
190 int32_t preferred = -1; // tell old code there is no preferred osd (-1).
191 ::encode(preferred, bl);
192 ::encode(key, bl);
193 ::encode(nspace, bl);
194 ::encode(hash, bl);
195 if (hash != -1)
196 encode_compat = MAX(encode_compat, 6); // need to interpret the hash
197 ENCODE_FINISH_NEW_COMPAT(bl, encode_compat);
198 }
199
200 void object_locator_t::decode(bufferlist::iterator& p)
201 {
202 DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, p);
203 if (struct_v < 2) {
204 int32_t op;
205 ::decode(op, p);
206 pool = op;
207 int16_t pref;
208 ::decode(pref, p);
209 } else {
210 ::decode(pool, p);
211 int32_t preferred;
212 ::decode(preferred, p);
213 }
214 ::decode(key, p);
215 if (struct_v >= 5)
216 ::decode(nspace, p);
217 if (struct_v >= 6)
218 ::decode(hash, p);
219 else
220 hash = -1;
221 DECODE_FINISH(p);
222 // verify that nobody's corrupted the locator
223 assert(hash == -1 || key.empty());
224 }
225
226 void object_locator_t::dump(Formatter *f) const
227 {
228 f->dump_int("pool", pool);
229 f->dump_string("key", key);
230 f->dump_string("namespace", nspace);
231 f->dump_int("hash", hash);
232 }
233
234 void object_locator_t::generate_test_instances(list<object_locator_t*>& o)
235 {
236 o.push_back(new object_locator_t);
237 o.push_back(new object_locator_t(123));
238 o.push_back(new object_locator_t(123, 876));
239 o.push_back(new object_locator_t(1, "n2"));
240 o.push_back(new object_locator_t(1234, "", "key"));
241 o.push_back(new object_locator_t(12, "n1", "key2"));
242 }
243
244 // -- request_redirect_t --
245 void request_redirect_t::encode(bufferlist& bl) const
246 {
247 ENCODE_START(1, 1, bl);
248 ::encode(redirect_locator, bl);
249 ::encode(redirect_object, bl);
250 ::encode(osd_instructions, bl);
251 ENCODE_FINISH(bl);
252 }
253
254 void request_redirect_t::decode(bufferlist::iterator& bl)
255 {
256 DECODE_START(1, bl);
257 ::decode(redirect_locator, bl);
258 ::decode(redirect_object, bl);
259 ::decode(osd_instructions, bl);
260 DECODE_FINISH(bl);
261 }
262
263 void request_redirect_t::dump(Formatter *f) const
264 {
265 f->dump_string("object", redirect_object);
266 f->open_object_section("locator");
267 redirect_locator.dump(f);
268 f->close_section(); // locator
269 }
270
271 void request_redirect_t::generate_test_instances(list<request_redirect_t*>& o)
272 {
273 object_locator_t loc(1, "redir_obj");
274 o.push_back(new request_redirect_t());
275 o.push_back(new request_redirect_t(loc, 0));
276 o.push_back(new request_redirect_t(loc, "redir_obj"));
277 o.push_back(new request_redirect_t(loc));
278 }
279
280 void objectstore_perf_stat_t::dump(Formatter *f) const
281 {
282 f->dump_unsigned("commit_latency_ms", os_commit_latency);
283 f->dump_unsigned("apply_latency_ms", os_apply_latency);
284 }
285
286 void objectstore_perf_stat_t::encode(bufferlist &bl) const
287 {
288 ENCODE_START(1, 1, bl);
289 ::encode(os_commit_latency, bl);
290 ::encode(os_apply_latency, bl);
291 ENCODE_FINISH(bl);
292 }
293
294 void objectstore_perf_stat_t::decode(bufferlist::iterator &bl)
295 {
296 DECODE_START(1, bl);
297 ::decode(os_commit_latency, bl);
298 ::decode(os_apply_latency, bl);
299 DECODE_FINISH(bl);
300 }
301
302 void objectstore_perf_stat_t::generate_test_instances(std::list<objectstore_perf_stat_t*>& o)
303 {
304 o.push_back(new objectstore_perf_stat_t());
305 o.push_back(new objectstore_perf_stat_t());
306 o.back()->os_commit_latency = 20;
307 o.back()->os_apply_latency = 30;
308 }
309
310 // -- osd_stat_t --
311 void osd_stat_t::dump(Formatter *f) const
312 {
313 f->dump_unsigned("up_from", up_from);
314 f->dump_unsigned("seq", seq);
315 f->dump_unsigned("num_pgs", num_pgs);
316 f->dump_unsigned("kb", kb);
317 f->dump_unsigned("kb_used", kb_used);
318 f->dump_unsigned("kb_avail", kb_avail);
319 f->open_array_section("hb_peers");
320 for (auto p : hb_peers)
321 f->dump_int("osd", p);
322 f->close_section();
323 f->dump_int("snap_trim_queue_len", snap_trim_queue_len);
324 f->dump_int("num_snap_trimming", num_snap_trimming);
325 f->open_object_section("op_queue_age_hist");
326 op_queue_age_hist.dump(f);
327 f->close_section();
328 f->open_object_section("perf_stat");
329 os_perf_stat.dump(f);
330 f->close_section();
331 }
332
333 void osd_stat_t::encode(bufferlist &bl) const
334 {
335 ENCODE_START(7, 2, bl);
336 ::encode(kb, bl);
337 ::encode(kb_used, bl);
338 ::encode(kb_avail, bl);
339 ::encode(snap_trim_queue_len, bl);
340 ::encode(num_snap_trimming, bl);
341 ::encode(hb_peers, bl);
342 ::encode((uint32_t)0, bl);
343 ::encode(op_queue_age_hist, bl);
344 ::encode(os_perf_stat, bl);
345 ::encode(up_from, bl);
346 ::encode(seq, bl);
347 ::encode(num_pgs, bl);
348 ENCODE_FINISH(bl);
349 }
350
351 void osd_stat_t::decode(bufferlist::iterator &bl)
352 {
353 DECODE_START_LEGACY_COMPAT_LEN(6, 2, 2, bl);
354 ::decode(kb, bl);
355 ::decode(kb_used, bl);
356 ::decode(kb_avail, bl);
357 ::decode(snap_trim_queue_len, bl);
358 ::decode(num_snap_trimming, bl);
359 ::decode(hb_peers, bl);
360 vector<int> num_hb_out;
361 ::decode(num_hb_out, bl);
362 if (struct_v >= 3)
363 ::decode(op_queue_age_hist, bl);
364 if (struct_v >= 4)
365 ::decode(os_perf_stat, bl);
366 if (struct_v >= 6) {
367 ::decode(up_from, bl);
368 ::decode(seq, bl);
369 }
370 if (struct_v >= 7) {
371 ::decode(num_pgs, bl);
372 }
373 DECODE_FINISH(bl);
374 }
375
376 void osd_stat_t::generate_test_instances(std::list<osd_stat_t*>& o)
377 {
378 o.push_back(new osd_stat_t);
379
380 o.push_back(new osd_stat_t);
381 o.back()->kb = 1;
382 o.back()->kb_used = 2;
383 o.back()->kb_avail = 3;
384 o.back()->hb_peers.push_back(7);
385 o.back()->snap_trim_queue_len = 8;
386 o.back()->num_snap_trimming = 99;
387 }
388
389 // -- pg_t --
390
391 int pg_t::print(char *o, int maxlen) const
392 {
393 if (preferred() >= 0)
394 return snprintf(o, maxlen, "%llu.%xp%d", (unsigned long long)pool(), ps(), preferred());
395 else
396 return snprintf(o, maxlen, "%llu.%x", (unsigned long long)pool(), ps());
397 }
398
399 bool pg_t::parse(const char *s)
400 {
401 uint64_t ppool;
402 uint32_t pseed;
403 int32_t pref;
404 int r = sscanf(s, "%llu.%xp%d", (long long unsigned *)&ppool, &pseed, &pref);
405 if (r < 2)
406 return false;
407 m_pool = ppool;
408 m_seed = pseed;
409 if (r == 3)
410 m_preferred = pref;
411 else
412 m_preferred = -1;
413 return true;
414 }
415
416 bool spg_t::parse(const char *s)
417 {
418 pgid.set_preferred(-1);
419 shard = shard_id_t::NO_SHARD;
420 uint64_t ppool;
421 uint32_t pseed;
422 int32_t pref;
423 uint32_t pshard;
424 int r = sscanf(s, "%llu.%x", (long long unsigned *)&ppool, &pseed);
425 if (r < 2)
426 return false;
427 pgid.set_pool(ppool);
428 pgid.set_ps(pseed);
429
430 const char *p = strchr(s, 'p');
431 if (p) {
432 r = sscanf(p, "p%d", &pref);
433 if (r == 1) {
434 pgid.set_preferred(pref);
435 } else {
436 return false;
437 }
438 }
439
440 p = strchr(s, 's');
441 if (p) {
442 r = sscanf(p, "s%d", &pshard);
443 if (r == 1) {
444 shard = shard_id_t(pshard);
445 } else {
446 return false;
447 }
448 }
449 return true;
450 }
451
452 char *spg_t::calc_name(char *buf, const char *suffix_backwords) const
453 {
454 while (*suffix_backwords)
455 *--buf = *suffix_backwords++;
456
457 if (!is_no_shard()) {
458 buf = ritoa<uint8_t, 10>((uint8_t)shard.id, buf);
459 *--buf = 's';
460 }
461
462 return pgid.calc_name(buf, "");
463 }
464
465 ostream& operator<<(ostream& out, const spg_t &pg)
466 {
467 char buf[spg_t::calc_name_buf_size];
468 buf[spg_t::calc_name_buf_size - 1] = '\0';
469 out << pg.calc_name(buf + spg_t::calc_name_buf_size - 1, "");
470 return out;
471 }
472
473 pg_t pg_t::get_ancestor(unsigned old_pg_num) const
474 {
475 int old_bits = cbits(old_pg_num);
476 int old_mask = (1 << old_bits) - 1;
477 pg_t ret = *this;
478 ret.m_seed = ceph_stable_mod(m_seed, old_pg_num, old_mask);
479 return ret;
480 }
481
482 bool pg_t::is_split(unsigned old_pg_num, unsigned new_pg_num, set<pg_t> *children) const
483 {
484 assert(m_seed < old_pg_num);
485 if (new_pg_num <= old_pg_num)
486 return false;
487
488 bool split = false;
489 if (true) {
490 unsigned old_bits = cbits(old_pg_num);
491 unsigned old_mask = (1 << old_bits) - 1;
492 for (unsigned n = 1; ; n++) {
493 unsigned next_bit = (n << (old_bits-1));
494 unsigned s = next_bit | m_seed;
495
496 if (s < old_pg_num || s == m_seed)
497 continue;
498 if (s >= new_pg_num)
499 break;
500 if ((unsigned)ceph_stable_mod(s, old_pg_num, old_mask) == m_seed) {
501 split = true;
502 if (children)
503 children->insert(pg_t(s, m_pool, m_preferred));
504 }
505 }
506 }
507 if (false) {
508 // brute force
509 int old_bits = cbits(old_pg_num);
510 int old_mask = (1 << old_bits) - 1;
511 for (unsigned x = old_pg_num; x < new_pg_num; ++x) {
512 unsigned o = ceph_stable_mod(x, old_pg_num, old_mask);
513 if (o == m_seed) {
514 split = true;
515 children->insert(pg_t(x, m_pool, m_preferred));
516 }
517 }
518 }
519 return split;
520 }
521
522 unsigned pg_t::get_split_bits(unsigned pg_num) const {
523 if (pg_num == 1)
524 return 0;
525 assert(pg_num > 1);
526
527 // Find unique p such that pg_num \in [2^(p-1), 2^p)
528 unsigned p = cbits(pg_num);
529 assert(p); // silence coverity #751330
530
531 if ((m_seed % (1<<(p-1))) < (pg_num % (1<<(p-1))))
532 return p;
533 else
534 return p - 1;
535 }
536
537 pg_t pg_t::get_parent() const
538 {
539 unsigned bits = cbits(m_seed);
540 assert(bits);
541 pg_t retval = *this;
542 retval.m_seed &= ~((~0)<<(bits - 1));
543 return retval;
544 }
545
546 hobject_t pg_t::get_hobj_start() const
547 {
548 return hobject_t(object_t(), string(), CEPH_NOSNAP, m_seed, m_pool,
549 string());
550 }
551
552 hobject_t pg_t::get_hobj_end(unsigned pg_num) const
553 {
554 // note: this assumes a bitwise sort; with the legacy nibblewise
555 // sort a PG did not always cover a single contiguous range of the
556 // (bit-reversed) hash range.
557 unsigned bits = get_split_bits(pg_num);
558 uint64_t rev_start = hobject_t::_reverse_bits(m_seed);
559 uint64_t rev_end = (rev_start | (0xffffffff >> bits)) + 1;
560 if (rev_end >= 0x100000000) {
561 assert(rev_end == 0x100000000);
562 return hobject_t::get_max();
563 } else {
564 return hobject_t(object_t(), string(), CEPH_NOSNAP,
565 hobject_t::_reverse_bits(rev_end), m_pool,
566 string());
567 }
568 }
569
570 void pg_t::dump(Formatter *f) const
571 {
572 f->dump_unsigned("pool", m_pool);
573 f->dump_unsigned("seed", m_seed);
574 f->dump_int("preferred_osd", m_preferred);
575 }
576
577 void pg_t::generate_test_instances(list<pg_t*>& o)
578 {
579 o.push_back(new pg_t);
580 o.push_back(new pg_t(1, 2, -1));
581 o.push_back(new pg_t(13123, 3, -1));
582 o.push_back(new pg_t(131223, 4, 23));
583 }
584
585 char *pg_t::calc_name(char *buf, const char *suffix_backwords) const
586 {
587 while (*suffix_backwords)
588 *--buf = *suffix_backwords++;
589
590 if (m_preferred >= 0)
591 *--buf ='p';
592
593 buf = ritoa<uint32_t, 16>(m_seed, buf);
594
595 *--buf = '.';
596
597 return ritoa<uint64_t, 10>(m_pool, buf);
598 }
599
600 ostream& operator<<(ostream& out, const pg_t &pg)
601 {
602 char buf[pg_t::calc_name_buf_size];
603 buf[pg_t::calc_name_buf_size - 1] = '\0';
604 out << pg.calc_name(buf + pg_t::calc_name_buf_size - 1, "");
605 return out;
606 }
607
608
609 // -- coll_t --
610
611 void coll_t::calc_str()
612 {
613 switch (type) {
614 case TYPE_META:
615 strcpy(_str_buff, "meta");
616 _str = _str_buff;
617 break;
618 case TYPE_PG:
619 _str_buff[spg_t::calc_name_buf_size - 1] = '\0';
620 _str = pgid.calc_name(_str_buff + spg_t::calc_name_buf_size - 1, "daeh_");
621 break;
622 case TYPE_PG_TEMP:
623 _str_buff[spg_t::calc_name_buf_size - 1] = '\0';
624 _str = pgid.calc_name(_str_buff + spg_t::calc_name_buf_size - 1, "PMET_");
625 break;
626 default:
627 assert(0 == "unknown collection type");
628 }
629 }
630
631 bool coll_t::parse(const std::string& s)
632 {
633 if (s == "meta") {
634 type = TYPE_META;
635 pgid = spg_t();
636 removal_seq = 0;
637 calc_str();
638 assert(s == _str);
639 return true;
640 }
641 if (s.find("_head") == s.length() - 5 &&
642 pgid.parse(s.substr(0, s.length() - 5))) {
643 type = TYPE_PG;
644 removal_seq = 0;
645 calc_str();
646 assert(s == _str);
647 return true;
648 }
649 if (s.find("_TEMP") == s.length() - 5 &&
650 pgid.parse(s.substr(0, s.length() - 5))) {
651 type = TYPE_PG_TEMP;
652 removal_seq = 0;
653 calc_str();
654 assert(s == _str);
655 return true;
656 }
657 return false;
658 }
659
660 void coll_t::encode(bufferlist& bl) const
661 {
662 // when changing this, remember to update encoded_size() too.
663 if (is_temp()) {
664 // can't express this as v2...
665 __u8 struct_v = 3;
666 ::encode(struct_v, bl);
667 ::encode(to_str(), bl);
668 } else {
669 __u8 struct_v = 2;
670 ::encode(struct_v, bl);
671 ::encode((__u8)type, bl);
672 ::encode(pgid, bl);
673 snapid_t snap = CEPH_NOSNAP;
674 ::encode(snap, bl);
675 }
676 }
677
678 size_t coll_t::encoded_size() const
679 {
680 size_t r = sizeof(__u8);
681 if (is_temp()) {
682 // v3
683 r += sizeof(__u32);
684 if (_str) {
685 r += strlen(_str);
686 }
687 } else {
688 // v2
689 // 1. type
690 r += sizeof(__u8);
691 // 2. pgid
692 // - encoding header
693 r += sizeof(ceph_le32) + 2 * sizeof(__u8);
694 // - pg_t
695 r += sizeof(__u8) + sizeof(uint64_t) + 2 * sizeof(uint32_t);
696 // - shard_id_t
697 r += sizeof(int8_t);
698 // 3. snapid_t
699 r += sizeof(uint64_t);
700 }
701
702 return r;
703 }
704
705 void coll_t::decode(bufferlist::iterator& bl)
706 {
707 __u8 struct_v;
708 ::decode(struct_v, bl);
709 switch (struct_v) {
710 case 1:
711 {
712 snapid_t snap;
713 ::decode(pgid, bl);
714 ::decode(snap, bl);
715
716 // infer the type
717 if (pgid == spg_t() && snap == 0) {
718 type = TYPE_META;
719 } else {
720 type = TYPE_PG;
721 }
722 removal_seq = 0;
723 }
724 break;
725
726 case 2:
727 {
728 __u8 _type;
729 snapid_t snap;
730 ::decode(_type, bl);
731 ::decode(pgid, bl);
732 ::decode(snap, bl);
733 type = (type_t)_type;
734 removal_seq = 0;
735 }
736 break;
737
738 case 3:
739 {
740 string str;
741 ::decode(str, bl);
742 bool ok = parse(str);
743 if (!ok)
744 throw std::domain_error(std::string("unable to parse pg ") + str);
745 }
746 break;
747
748 default:
749 {
750 ostringstream oss;
751 oss << "coll_t::decode(): don't know how to decode version "
752 << struct_v;
753 throw std::domain_error(oss.str());
754 }
755 }
756 }
757
758 void coll_t::dump(Formatter *f) const
759 {
760 f->dump_unsigned("type_id", (unsigned)type);
761 if (type != TYPE_META)
762 f->dump_stream("pgid") << pgid;
763 f->dump_string("name", to_str());
764 }
765
766 void coll_t::generate_test_instances(list<coll_t*>& o)
767 {
768 o.push_back(new coll_t());
769 o.push_back(new coll_t(spg_t(pg_t(1, 0), shard_id_t::NO_SHARD)));
770 o.push_back(new coll_t(o.back()->get_temp()));
771 o.push_back(new coll_t(spg_t(pg_t(3, 2), shard_id_t(12))));
772 o.push_back(new coll_t(o.back()->get_temp()));
773 o.push_back(new coll_t());
774 }
775
776 // ---
777
778 std::string pg_vector_string(const vector<int32_t> &a)
779 {
780 ostringstream oss;
781 oss << "[";
782 for (vector<int32_t>::const_iterator i = a.begin(); i != a.end(); ++i) {
783 if (i != a.begin())
784 oss << ",";
785 if (*i != CRUSH_ITEM_NONE)
786 oss << *i;
787 else
788 oss << "NONE";
789 }
790 oss << "]";
791 return oss.str();
792 }
793
794 std::string pg_state_string(int state)
795 {
796 ostringstream oss;
797 if (state & PG_STATE_STALE)
798 oss << "stale+";
799 if (state & PG_STATE_CREATING)
800 oss << "creating+";
801 if (state & PG_STATE_ACTIVE)
802 oss << "active+";
803 if (state & PG_STATE_ACTIVATING)
804 oss << "activating+";
805 if (state & PG_STATE_CLEAN)
806 oss << "clean+";
807 if (state & PG_STATE_RECOVERY_WAIT)
808 oss << "recovery_wait+";
809 if (state & PG_STATE_RECOVERY_TOOFULL)
810 oss << "recovery_toofull+";
811 if (state & PG_STATE_RECOVERING)
812 oss << "recovering+";
813 if (state & PG_STATE_FORCED_RECOVERY)
814 oss << "forced_recovery+";
815 if (state & PG_STATE_DOWN)
816 oss << "down+";
817 if (state & PG_STATE_RECOVERY_UNFOUND)
818 oss << "recovery_unfound+";
819 if (state & PG_STATE_BACKFILL_UNFOUND)
820 oss << "backfill_unfound+";
821 if (state & PG_STATE_UNDERSIZED)
822 oss << "undersized+";
823 if (state & PG_STATE_DEGRADED)
824 oss << "degraded+";
825 if (state & PG_STATE_REMAPPED)
826 oss << "remapped+";
827 if (state & PG_STATE_SCRUBBING)
828 oss << "scrubbing+";
829 if (state & PG_STATE_DEEP_SCRUB)
830 oss << "deep+";
831 if (state & PG_STATE_INCONSISTENT)
832 oss << "inconsistent+";
833 if (state & PG_STATE_PEERING)
834 oss << "peering+";
835 if (state & PG_STATE_REPAIR)
836 oss << "repair+";
837 if (state & PG_STATE_BACKFILL_WAIT)
838 oss << "backfill_wait+";
839 if (state & PG_STATE_BACKFILLING)
840 oss << "backfilling+";
841 if (state & PG_STATE_FORCED_BACKFILL)
842 oss << "forced_backfill+";
843 if (state & PG_STATE_BACKFILL_TOOFULL)
844 oss << "backfill_toofull+";
845 if (state & PG_STATE_INCOMPLETE)
846 oss << "incomplete+";
847 if (state & PG_STATE_PEERED)
848 oss << "peered+";
849 if (state & PG_STATE_SNAPTRIM)
850 oss << "snaptrim+";
851 if (state & PG_STATE_SNAPTRIM_WAIT)
852 oss << "snaptrim_wait+";
853 if (state & PG_STATE_SNAPTRIM_ERROR)
854 oss << "snaptrim_error+";
855 string ret(oss.str());
856 if (ret.length() > 0)
857 ret.resize(ret.length() - 1);
858 else
859 ret = "unknown";
860 return ret;
861 }
862
863 boost::optional<uint64_t> pg_string_state(const std::string& state)
864 {
865 boost::optional<uint64_t> type;
866 if (state == "active")
867 type = PG_STATE_ACTIVE;
868 else if (state == "clean")
869 type = PG_STATE_CLEAN;
870 else if (state == "down")
871 type = PG_STATE_DOWN;
872 else if (state == "recovery_unfound")
873 type = PG_STATE_RECOVERY_UNFOUND;
874 else if (state == "backfill_unfound")
875 type = PG_STATE_BACKFILL_UNFOUND;
876 else if (state == "scrubbing")
877 type = PG_STATE_SCRUBBING;
878 else if (state == "degraded")
879 type = PG_STATE_DEGRADED;
880 else if (state == "inconsistent")
881 type = PG_STATE_INCONSISTENT;
882 else if (state == "peering")
883 type = PG_STATE_PEERING;
884 else if (state == "repair")
885 type = PG_STATE_REPAIR;
886 else if (state == "recovering")
887 type = PG_STATE_RECOVERING;
888 else if (state == "forced_recovery")
889 type = PG_STATE_FORCED_RECOVERY;
890 else if (state == "backfill_wait")
891 type = PG_STATE_BACKFILL_WAIT;
892 else if (state == "incomplete")
893 type = PG_STATE_INCOMPLETE;
894 else if (state == "stale")
895 type = PG_STATE_STALE;
896 else if (state == "remapped")
897 type = PG_STATE_REMAPPED;
898 else if (state == "deep")
899 type = PG_STATE_DEEP_SCRUB;
900 else if (state == "backfilling")
901 type = PG_STATE_BACKFILLING;
902 else if (state == "forced_backfill")
903 type = PG_STATE_FORCED_BACKFILL;
904 else if (state == "backfill_toofull")
905 type = PG_STATE_BACKFILL_TOOFULL;
906 else if (state == "recovery_wait")
907 type = PG_STATE_RECOVERY_WAIT;
908 else if (state == "recovery_toofull")
909 type = PG_STATE_RECOVERY_TOOFULL;
910 else if (state == "undersized")
911 type = PG_STATE_UNDERSIZED;
912 else if (state == "activating")
913 type = PG_STATE_ACTIVATING;
914 else if (state == "peered")
915 type = PG_STATE_PEERED;
916 else if (state == "snaptrim")
917 type = PG_STATE_SNAPTRIM;
918 else if (state == "snaptrim_wait")
919 type = PG_STATE_SNAPTRIM_WAIT;
920 else if (state == "snaptrim_error")
921 type = PG_STATE_SNAPTRIM_ERROR;
922 else if (state == "creating")
923 type = PG_STATE_CREATING;
924 else
925 type = boost::none;
926 return type;
927 }
928
929 // -- eversion_t --
930 string eversion_t::get_key_name() const
931 {
932 char key[32];
933 // Below is equivalent of sprintf("%010u.%020llu");
934 key[31] = 0;
935 ritoa<uint64_t, 10, 20>(version, key + 31);
936 key[10] = '.';
937 ritoa<uint32_t, 10, 10>(epoch, key + 10);
938 return string(key);
939 }
940
941
942 // -- pool_snap_info_t --
943 void pool_snap_info_t::dump(Formatter *f) const
944 {
945 f->dump_unsigned("snapid", snapid);
946 f->dump_stream("stamp") << stamp;
947 f->dump_string("name", name);
948 }
949
950 void pool_snap_info_t::encode(bufferlist& bl, uint64_t features) const
951 {
952 if ((features & CEPH_FEATURE_PGPOOL3) == 0) {
953 __u8 struct_v = 1;
954 ::encode(struct_v, bl);
955 ::encode(snapid, bl);
956 ::encode(stamp, bl);
957 ::encode(name, bl);
958 return;
959 }
960 ENCODE_START(2, 2, bl);
961 ::encode(snapid, bl);
962 ::encode(stamp, bl);
963 ::encode(name, bl);
964 ENCODE_FINISH(bl);
965 }
966
967 void pool_snap_info_t::decode(bufferlist::iterator& bl)
968 {
969 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
970 ::decode(snapid, bl);
971 ::decode(stamp, bl);
972 ::decode(name, bl);
973 DECODE_FINISH(bl);
974 }
975
976 void pool_snap_info_t::generate_test_instances(list<pool_snap_info_t*>& o)
977 {
978 o.push_back(new pool_snap_info_t);
979 o.push_back(new pool_snap_info_t);
980 o.back()->snapid = 1;
981 o.back()->stamp = utime_t(1, 2);
982 o.back()->name = "foo";
983 }
984
985 // -- pool_opts_t --
986
987 typedef std::map<std::string, pool_opts_t::opt_desc_t> opt_mapping_t;
988 static opt_mapping_t opt_mapping = boost::assign::map_list_of
989 ("scrub_min_interval", pool_opts_t::opt_desc_t(
990 pool_opts_t::SCRUB_MIN_INTERVAL, pool_opts_t::DOUBLE))
991 ("scrub_max_interval", pool_opts_t::opt_desc_t(
992 pool_opts_t::SCRUB_MAX_INTERVAL, pool_opts_t::DOUBLE))
993 ("deep_scrub_interval", pool_opts_t::opt_desc_t(
994 pool_opts_t::DEEP_SCRUB_INTERVAL, pool_opts_t::DOUBLE))
995 ("recovery_priority", pool_opts_t::opt_desc_t(
996 pool_opts_t::RECOVERY_PRIORITY, pool_opts_t::INT))
997 ("recovery_op_priority", pool_opts_t::opt_desc_t(
998 pool_opts_t::RECOVERY_OP_PRIORITY, pool_opts_t::INT))
999 ("scrub_priority", pool_opts_t::opt_desc_t(
1000 pool_opts_t::SCRUB_PRIORITY, pool_opts_t::INT))
1001 ("compression_mode", pool_opts_t::opt_desc_t(
1002 pool_opts_t::COMPRESSION_MODE, pool_opts_t::STR))
1003 ("compression_algorithm", pool_opts_t::opt_desc_t(
1004 pool_opts_t::COMPRESSION_ALGORITHM, pool_opts_t::STR))
1005 ("compression_required_ratio", pool_opts_t::opt_desc_t(
1006 pool_opts_t::COMPRESSION_REQUIRED_RATIO, pool_opts_t::DOUBLE))
1007 ("compression_max_blob_size", pool_opts_t::opt_desc_t(
1008 pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, pool_opts_t::INT))
1009 ("compression_min_blob_size", pool_opts_t::opt_desc_t(
1010 pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, pool_opts_t::INT))
1011 ("csum_type", pool_opts_t::opt_desc_t(
1012 pool_opts_t::CSUM_TYPE, pool_opts_t::INT))
1013 ("csum_max_block", pool_opts_t::opt_desc_t(
1014 pool_opts_t::CSUM_MAX_BLOCK, pool_opts_t::INT))
1015 ("csum_min_block", pool_opts_t::opt_desc_t(
1016 pool_opts_t::CSUM_MIN_BLOCK, pool_opts_t::INT));
1017
1018 bool pool_opts_t::is_opt_name(const std::string& name) {
1019 return opt_mapping.count(name);
1020 }
1021
1022 pool_opts_t::opt_desc_t pool_opts_t::get_opt_desc(const std::string& name) {
1023 opt_mapping_t::iterator i = opt_mapping.find(name);
1024 assert(i != opt_mapping.end());
1025 return i->second;
1026 }
1027
1028 bool pool_opts_t::is_set(pool_opts_t::key_t key) const {
1029 return opts.count(key);
1030 }
1031
1032 const pool_opts_t::value_t& pool_opts_t::get(pool_opts_t::key_t key) const {
1033 opts_t::const_iterator i = opts.find(key);
1034 assert(i != opts.end());
1035 return i->second;
1036 }
1037
1038 bool pool_opts_t::unset(pool_opts_t::key_t key) {
1039 return opts.erase(key) > 0;
1040 }
1041
1042 class pool_opts_dumper_t : public boost::static_visitor<>
1043 {
1044 public:
1045 pool_opts_dumper_t(const std::string& name_, Formatter* f_) :
1046 name(name_.c_str()), f(f_) {}
1047
1048 void operator()(std::string s) const {
1049 f->dump_string(name, s);
1050 }
1051 void operator()(int i) const {
1052 f->dump_int(name, i);
1053 }
1054 void operator()(double d) const {
1055 f->dump_float(name, d);
1056 }
1057
1058 private:
1059 const char* name;
1060 Formatter* f;
1061 };
1062
1063 void pool_opts_t::dump(const std::string& name, Formatter* f) const
1064 {
1065 const opt_desc_t& desc = get_opt_desc(name);
1066 opts_t::const_iterator i = opts.find(desc.key);
1067 if (i == opts.end()) {
1068 return;
1069 }
1070 boost::apply_visitor(pool_opts_dumper_t(name, f), i->second);
1071 }
1072
1073 void pool_opts_t::dump(Formatter* f) const
1074 {
1075 for (opt_mapping_t::iterator i = opt_mapping.begin(); i != opt_mapping.end();
1076 ++i) {
1077 const std::string& name = i->first;
1078 const opt_desc_t& desc = i->second;
1079 opts_t::const_iterator j = opts.find(desc.key);
1080 if (j == opts.end()) {
1081 continue;
1082 }
1083 boost::apply_visitor(pool_opts_dumper_t(name, f), j->second);
1084 }
1085 }
1086
1087 class pool_opts_encoder_t : public boost::static_visitor<>
1088 {
1089 public:
1090 explicit pool_opts_encoder_t(bufferlist& bl_) : bl(bl_) {}
1091
1092 void operator()(std::string s) const {
1093 ::encode(static_cast<int32_t>(pool_opts_t::STR), bl);
1094 ::encode(s, bl);
1095 }
1096 void operator()(int i) const {
1097 ::encode(static_cast<int32_t>(pool_opts_t::INT), bl);
1098 ::encode(i, bl);
1099 }
1100 void operator()(double d) const {
1101 ::encode(static_cast<int32_t>(pool_opts_t::DOUBLE), bl);
1102 ::encode(d, bl);
1103 }
1104
1105 private:
1106 bufferlist& bl;
1107 };
1108
1109 void pool_opts_t::encode(bufferlist& bl) const {
1110 ENCODE_START(1, 1, bl);
1111 uint32_t n = static_cast<uint32_t>(opts.size());
1112 ::encode(n, bl);
1113 for (opts_t::const_iterator i = opts.begin(); i != opts.end(); ++i) {
1114 ::encode(static_cast<int32_t>(i->first), bl);
1115 boost::apply_visitor(pool_opts_encoder_t(bl), i->second);
1116 }
1117 ENCODE_FINISH(bl);
1118 }
1119
1120 void pool_opts_t::decode(bufferlist::iterator& bl) {
1121 DECODE_START(1, bl);
1122 __u32 n;
1123 ::decode(n, bl);
1124 opts.clear();
1125 while (n--) {
1126 int32_t k, t;
1127 ::decode(k, bl);
1128 ::decode(t, bl);
1129 if (t == STR) {
1130 std::string s;
1131 ::decode(s, bl);
1132 opts[static_cast<key_t>(k)] = s;
1133 } else if (t == INT) {
1134 int i;
1135 ::decode(i, bl);
1136 opts[static_cast<key_t>(k)] = i;
1137 } else if (t == DOUBLE) {
1138 double d;
1139 ::decode(d, bl);
1140 opts[static_cast<key_t>(k)] = d;
1141 } else {
1142 assert(!"invalid type");
1143 }
1144 }
1145 DECODE_FINISH(bl);
1146 }
1147
1148 ostream& operator<<(ostream& out, const pool_opts_t& opts)
1149 {
1150 for (opt_mapping_t::iterator i = opt_mapping.begin(); i != opt_mapping.end();
1151 ++i) {
1152 const std::string& name = i->first;
1153 const pool_opts_t::opt_desc_t& desc = i->second;
1154 pool_opts_t::opts_t::const_iterator j = opts.opts.find(desc.key);
1155 if (j == opts.opts.end()) {
1156 continue;
1157 }
1158 out << " " << name << " " << j->second;
1159 }
1160 return out;
1161 }
1162
1163 // -- pg_pool_t --
1164
// Out-of-class definitions of the pg_pool_t application name strings
// ("cephfs", "rbd", "rgw").
const char *pg_pool_t::APPLICATION_NAME_CEPHFS("cephfs");
const char *pg_pool_t::APPLICATION_NAME_RBD("rbd");
const char *pg_pool_t::APPLICATION_NAME_RGW("rgw");
1168
1169 void pg_pool_t::dump(Formatter *f) const
1170 {
1171 f->dump_unsigned("flags", get_flags());
1172 f->dump_string("flags_names", get_flags_string());
1173 f->dump_int("type", get_type());
1174 f->dump_int("size", get_size());
1175 f->dump_int("min_size", get_min_size());
1176 f->dump_int("crush_rule", get_crush_rule());
1177 f->dump_int("object_hash", get_object_hash());
1178 f->dump_unsigned("pg_num", get_pg_num());
1179 f->dump_unsigned("pg_placement_num", get_pgp_num());
1180 f->dump_unsigned("crash_replay_interval", get_crash_replay_interval());
1181 f->dump_stream("last_change") << get_last_change();
1182 f->dump_stream("last_force_op_resend") << get_last_force_op_resend();
1183 f->dump_stream("last_force_op_resend_preluminous")
1184 << get_last_force_op_resend_preluminous();
1185 f->dump_unsigned("auid", get_auid());
1186 f->dump_string("snap_mode", is_pool_snaps_mode() ? "pool" : "selfmanaged");
1187 f->dump_unsigned("snap_seq", get_snap_seq());
1188 f->dump_unsigned("snap_epoch", get_snap_epoch());
1189 f->open_array_section("pool_snaps");
1190 for (map<snapid_t, pool_snap_info_t>::const_iterator p = snaps.begin(); p != snaps.end(); ++p) {
1191 f->open_object_section("pool_snap_info");
1192 p->second.dump(f);
1193 f->close_section();
1194 }
1195 f->close_section();
1196 f->dump_stream("removed_snaps") << removed_snaps;
1197 f->dump_unsigned("quota_max_bytes", quota_max_bytes);
1198 f->dump_unsigned("quota_max_objects", quota_max_objects);
1199 f->open_array_section("tiers");
1200 for (set<uint64_t>::const_iterator p = tiers.begin(); p != tiers.end(); ++p)
1201 f->dump_unsigned("pool_id", *p);
1202 f->close_section();
1203 f->dump_int("tier_of", tier_of);
1204 f->dump_int("read_tier", read_tier);
1205 f->dump_int("write_tier", write_tier);
1206 f->dump_string("cache_mode", get_cache_mode_name());
1207 f->dump_unsigned("target_max_bytes", target_max_bytes);
1208 f->dump_unsigned("target_max_objects", target_max_objects);
1209 f->dump_unsigned("cache_target_dirty_ratio_micro",
1210 cache_target_dirty_ratio_micro);
1211 f->dump_unsigned("cache_target_dirty_high_ratio_micro",
1212 cache_target_dirty_high_ratio_micro);
1213 f->dump_unsigned("cache_target_full_ratio_micro",
1214 cache_target_full_ratio_micro);
1215 f->dump_unsigned("cache_min_flush_age", cache_min_flush_age);
1216 f->dump_unsigned("cache_min_evict_age", cache_min_evict_age);
1217 f->dump_string("erasure_code_profile", erasure_code_profile);
1218 f->open_object_section("hit_set_params");
1219 hit_set_params.dump(f);
1220 f->close_section(); // hit_set_params
1221 f->dump_unsigned("hit_set_period", hit_set_period);
1222 f->dump_unsigned("hit_set_count", hit_set_count);
1223 f->dump_bool("use_gmt_hitset", use_gmt_hitset);
1224 f->dump_unsigned("min_read_recency_for_promote", min_read_recency_for_promote);
1225 f->dump_unsigned("min_write_recency_for_promote", min_write_recency_for_promote);
1226 f->dump_unsigned("hit_set_grade_decay_rate", hit_set_grade_decay_rate);
1227 f->dump_unsigned("hit_set_search_last_n", hit_set_search_last_n);
1228 f->open_array_section("grade_table");
1229 for (unsigned i = 0; i < hit_set_count; ++i)
1230 f->dump_unsigned("value", get_grade(i));
1231 f->close_section();
1232 f->dump_unsigned("stripe_width", get_stripe_width());
1233 f->dump_unsigned("expected_num_objects", expected_num_objects);
1234 f->dump_bool("fast_read", fast_read);
1235 f->open_object_section("options");
1236 opts.dump(f);
1237 f->close_section(); // options
1238 f->open_object_section("application_metadata");
1239 for (auto &app_pair : application_metadata) {
1240 f->open_object_section(app_pair.first.c_str());
1241 for (auto &kv_pair : app_pair.second) {
1242 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
1243 }
1244 f->close_section(); // application
1245 }
1246 f->close_section(); // application_metadata
1247 }
1248
1249 void pg_pool_t::convert_to_pg_shards(const vector<int> &from, set<pg_shard_t>* to) const {
1250 for (size_t i = 0; i < from.size(); ++i) {
1251 if (from[i] != CRUSH_ITEM_NONE) {
1252 to->insert(
1253 pg_shard_t(
1254 from[i],
1255 ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
1256 }
1257 }
1258 }
1259
1260 void pg_pool_t::calc_pg_masks()
1261 {
1262 pg_num_mask = (1 << cbits(pg_num-1)) - 1;
1263 pgp_num_mask = (1 << cbits(pgp_num-1)) - 1;
1264 }
1265
1266 unsigned pg_pool_t::get_pg_num_divisor(pg_t pgid) const
1267 {
1268 if (pg_num == pg_num_mask + 1)
1269 return pg_num; // power-of-2 split
1270 unsigned mask = pg_num_mask >> 1;
1271 if ((pgid.ps() & mask) < (pg_num & mask))
1272 return pg_num_mask + 1; // smaller bin size (already split)
1273 else
1274 return (pg_num_mask + 1) >> 1; // bigger bin (not yet split)
1275 }
1276
1277 /*
1278 * we have two snap modes:
1279 * - pool global snaps
1280 * - snap existence/non-existence defined by snaps[] and snap_seq
1281 * - user managed snaps
1282 * - removal governed by removed_snaps
1283 *
1284 * we know which mode we're using based on whether removed_snaps is empty.
1285 */
1286 bool pg_pool_t::is_pool_snaps_mode() const
1287 {
1288 return removed_snaps.empty() && get_snap_seq() > 0;
1289 }
1290
1291 bool pg_pool_t::is_unmanaged_snaps_mode() const
1292 {
1293 return removed_snaps.size() && get_snap_seq() > 0;
1294 }
1295
1296 bool pg_pool_t::is_removed_snap(snapid_t s) const
1297 {
1298 if (is_pool_snaps_mode())
1299 return s <= get_snap_seq() && snaps.count(s) == 0;
1300 else
1301 return removed_snaps.contains(s);
1302 }
1303
1304 /*
1305 * build set of known-removed sets from either pool snaps or
1306 * explicit removed_snaps set.
1307 */
1308 void pg_pool_t::build_removed_snaps(interval_set<snapid_t>& rs) const
1309 {
1310 if (is_pool_snaps_mode()) {
1311 rs.clear();
1312 for (snapid_t s = 1; s <= get_snap_seq(); s = s + 1)
1313 if (snaps.count(s) == 0)
1314 rs.insert(s);
1315 } else {
1316 rs = removed_snaps;
1317 }
1318 }
1319
1320 bool pg_pool_t::maybe_updated_removed_snaps(const interval_set<snapid_t>& cached) const
1321 {
1322 if (is_unmanaged_snaps_mode()) { // remove_unmanaged_snap increments range_end
1323 if (removed_snaps.empty() || cached.empty()) // range_end is undefined
1324 return removed_snaps.empty() != cached.empty();
1325 return removed_snaps.range_end() != cached.range_end();
1326 }
1327 return true;
1328 }
1329
1330 snapid_t pg_pool_t::snap_exists(const char *s) const
1331 {
1332 for (map<snapid_t,pool_snap_info_t>::const_iterator p = snaps.begin();
1333 p != snaps.end();
1334 ++p)
1335 if (p->second.name == s)
1336 return p->second.snapid;
1337 return 0;
1338 }
1339
1340 void pg_pool_t::add_snap(const char *n, utime_t stamp)
1341 {
1342 assert(!is_unmanaged_snaps_mode());
1343 snapid_t s = get_snap_seq() + 1;
1344 snap_seq = s;
1345 snaps[s].snapid = s;
1346 snaps[s].name = n;
1347 snaps[s].stamp = stamp;
1348 }
1349
1350 void pg_pool_t::add_unmanaged_snap(uint64_t& snapid)
1351 {
1352 if (removed_snaps.empty()) {
1353 assert(!is_pool_snaps_mode());
1354 removed_snaps.insert(snapid_t(1));
1355 snap_seq = 1;
1356 }
1357 snapid = snap_seq = snap_seq + 1;
1358 }
1359
1360 void pg_pool_t::remove_snap(snapid_t s)
1361 {
1362 assert(snaps.count(s));
1363 snaps.erase(s);
1364 snap_seq = snap_seq + 1;
1365 }
1366
1367 void pg_pool_t::remove_unmanaged_snap(snapid_t s)
1368 {
1369 assert(is_unmanaged_snaps_mode());
1370 removed_snaps.insert(s);
1371 snap_seq = snap_seq + 1;
1372 // try to add in the new seq, just to try to keep the interval_set contiguous
1373 if (!removed_snaps.contains(get_snap_seq())) {
1374 removed_snaps.insert(get_snap_seq());
1375 }
1376 }
1377
1378 SnapContext pg_pool_t::get_snap_context() const
1379 {
1380 vector<snapid_t> s(snaps.size());
1381 unsigned i = 0;
1382 for (map<snapid_t, pool_snap_info_t>::const_reverse_iterator p = snaps.rbegin();
1383 p != snaps.rend();
1384 ++p)
1385 s[i++] = p->first;
1386 return SnapContext(get_snap_seq(), s);
1387 }
1388
1389 uint32_t pg_pool_t::hash_key(const string& key, const string& ns) const
1390 {
1391 if (ns.empty())
1392 return ceph_str_hash(object_hash, key.data(), key.length());
1393 int nsl = ns.length();
1394 int len = key.length() + nsl + 1;
1395 char buf[len];
1396 memcpy(&buf[0], ns.data(), nsl);
1397 buf[nsl] = '\037';
1398 memcpy(&buf[nsl+1], key.data(), key.length());
1399 return ceph_str_hash(object_hash, &buf[0], len);
1400 }
1401
1402 uint32_t pg_pool_t::raw_hash_to_pg(uint32_t v) const
1403 {
1404 return ceph_stable_mod(v, pg_num, pg_num_mask);
1405 }
1406
1407 /*
1408 * map a raw pg (with full precision ps) into an actual pg, for storage
1409 */
1410 pg_t pg_pool_t::raw_pg_to_pg(pg_t pg) const
1411 {
1412 pg.set_ps(ceph_stable_mod(pg.ps(), pg_num, pg_num_mask));
1413 return pg;
1414 }
1415
1416 /*
1417 * map raw pg (full precision ps) into a placement seed. include
1418 * pool id in that value so that different pools don't use the same
1419 * seeds.
1420 */
1421 ps_t pg_pool_t::raw_pg_to_pps(pg_t pg) const
1422 {
1423 if (flags & FLAG_HASHPSPOOL) {
1424 // Hash the pool id so that pool PGs do not overlap.
1425 return
1426 crush_hash32_2(CRUSH_HASH_RJENKINS1,
1427 ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask),
1428 pg.pool());
1429 } else {
1430 // Legacy behavior; add ps and pool together. This is not a great
1431 // idea because the PGs from each pool will essentially overlap on
1432 // top of each other: 0.5 == 1.4 == 2.3 == ...
1433 return
1434 ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask) +
1435 pg.pool();
1436 }
1437 }
1438
1439 uint32_t pg_pool_t::get_random_pg_position(pg_t pg, uint32_t seed) const
1440 {
1441 uint32_t r = crush_hash32_2(CRUSH_HASH_RJENKINS1, seed, 123);
1442 if (pg_num == pg_num_mask + 1) {
1443 r &= ~pg_num_mask;
1444 } else {
1445 unsigned smaller_mask = pg_num_mask >> 1;
1446 if ((pg.ps() & smaller_mask) < (pg_num & smaller_mask)) {
1447 r &= ~pg_num_mask;
1448 } else {
1449 r &= ~smaller_mask;
1450 }
1451 }
1452 r |= pg.ps();
1453 return r;
1454 }
1455
/**
 * Serialize this pool into @p bl, picking the wire format from the
 * peer's @p features.
 *
 * Four formats are produced, tried from oldest to newest:
 *  - no CEPH_FEATURE_PGPOOL3:       bare struct_v 2 layout
 *  - no CEPH_FEATURE_OSDENC:        bare struct_v 4 layout
 *  - no CEPH_FEATURE_OSD_POOLRESEND: versioned envelope, v14
 *  - otherwise: versioned envelope at v21, v24 or v26 depending on
 *    CEPH_FEATURE_NEW_OSDOP_ENCODING and SERVER_LUMINOUS
 *
 * The order of the ::encode calls is the wire format and must match
 * pg_pool_t::decode.
 */
void pg_pool_t::encode(bufferlist& bl, uint64_t features) const
{
  if ((features & CEPH_FEATURE_PGPOOL3) == 0) {
    // this encoding matches the old struct ceph_pg_pool
    __u8 struct_v = 2;
    ::encode(struct_v, bl);
    ::encode(type, bl);
    ::encode(size, bl);
    ::encode(crush_rule, bl);
    ::encode(object_hash, bl);
    ::encode(pg_num, bl);
    ::encode(pgp_num, bl);
    __u32 lpg_num = 0, lpgp_num = 0;  // tell old code that there are no localized pgs.
    ::encode(lpg_num, bl);
    ::encode(lpgp_num, bl);
    ::encode(last_change, bl);
    ::encode(snap_seq, bl);
    ::encode(snap_epoch, bl);

    // in this layout the two element counts precede auid, and the
    // containers follow without their own length headers
    __u32 n = snaps.size();
    ::encode(n, bl);
    n = removed_snaps.num_intervals();
    ::encode(n, bl);

    ::encode(auid, bl);

    ::encode_nohead(snaps, bl, features);
    ::encode_nohead(removed_snaps, bl);
    return;
  }

  if ((features & CEPH_FEATURE_OSDENC) == 0) {
    // struct_v 4: still no versioned envelope, but containers carry
    // their own headers and flags/crash_replay_interval are appended
    __u8 struct_v = 4;
    ::encode(struct_v, bl);
    ::encode(type, bl);
    ::encode(size, bl);
    ::encode(crush_rule, bl);
    ::encode(object_hash, bl);
    ::encode(pg_num, bl);
    ::encode(pgp_num, bl);
    __u32 lpg_num = 0, lpgp_num = 0;  // tell old code that there are no localized pgs.
    ::encode(lpg_num, bl);
    ::encode(lpgp_num, bl);
    ::encode(last_change, bl);
    ::encode(snap_seq, bl);
    ::encode(snap_epoch, bl);
    ::encode(snaps, bl, features);
    ::encode(removed_snaps, bl);
    ::encode(auid, bl);
    ::encode(flags, bl);
    ::encode(crash_replay_interval, bl);
    return;
  }

  if ((features & CEPH_FEATURE_OSD_POOLRESEND) == 0) {
    // we simply added last_force_op_resend here, which is a fully
    // backward compatible change.  however, encoding the same map
    // differently between monitors triggers scrub noise (even though
    // they are decodable without the feature), so let's be pedantic
    // about it.
    ENCODE_START(14, 5, bl);
    ::encode(type, bl);
    ::encode(size, bl);
    ::encode(crush_rule, bl);
    ::encode(object_hash, bl);
    ::encode(pg_num, bl);
    ::encode(pgp_num, bl);
    __u32 lpg_num = 0, lpgp_num = 0;  // tell old code that there are no localized pgs.
    ::encode(lpg_num, bl);
    ::encode(lpgp_num, bl);
    ::encode(last_change, bl);
    ::encode(snap_seq, bl);
    ::encode(snap_epoch, bl);
    ::encode(snaps, bl, features);
    ::encode(removed_snaps, bl);
    ::encode(auid, bl);
    ::encode(flags, bl);
    ::encode(crash_replay_interval, bl);
    ::encode(min_size, bl);
    ::encode(quota_max_bytes, bl);
    ::encode(quota_max_objects, bl);
    ::encode(tiers, bl);
    ::encode(tier_of, bl);
    __u8 c = cache_mode;  // cache_mode travels as a single byte
    ::encode(c, bl);
    ::encode(read_tier, bl);
    ::encode(write_tier, bl);
    ::encode(properties, bl);
    ::encode(hit_set_params, bl);
    ::encode(hit_set_period, bl);
    ::encode(hit_set_count, bl);
    ::encode(stripe_width, bl);
    ::encode(target_max_bytes, bl);
    ::encode(target_max_objects, bl);
    ::encode(cache_target_dirty_ratio_micro, bl);
    ::encode(cache_target_full_ratio_micro, bl);
    ::encode(cache_min_flush_age, bl);
    ::encode(cache_min_evict_age, bl);
    ::encode(erasure_code_profile, bl);
    ENCODE_FINISH(bl);
    return;
  }

  // newest format; v is lowered below when the peer lacks newer features
  uint8_t v = 26;
  // NOTE: any new encoding dependencies must be reflected by
  // SIGNIFICANT_FEATURES
  if (!(features & CEPH_FEATURE_NEW_OSDOP_ENCODING)) {
    // this was the first post-hammer thing we added; if it's missing, encode
    // like hammer.
    v = 21;
  } else if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
    v = 24;
  }

  ENCODE_START(v, 5, bl);
  ::encode(type, bl);
  ::encode(size, bl);
  ::encode(crush_rule, bl);
  ::encode(object_hash, bl);
  ::encode(pg_num, bl);
  ::encode(pgp_num, bl);
  __u32 lpg_num = 0, lpgp_num = 0;  // tell old code that there are no localized pgs.
  ::encode(lpg_num, bl);
  ::encode(lpgp_num, bl);
  ::encode(last_change, bl);
  ::encode(snap_seq, bl);
  ::encode(snap_epoch, bl);
  ::encode(snaps, bl, features);
  ::encode(removed_snaps, bl);
  ::encode(auid, bl);
  ::encode(flags, bl);
  ::encode(crash_replay_interval, bl);
  ::encode(min_size, bl);
  ::encode(quota_max_bytes, bl);
  ::encode(quota_max_objects, bl);
  ::encode(tiers, bl);
  ::encode(tier_of, bl);
  __u8 c = cache_mode;  // cache_mode travels as a single byte
  ::encode(c, bl);
  ::encode(read_tier, bl);
  ::encode(write_tier, bl);
  ::encode(properties, bl);
  ::encode(hit_set_params, bl);
  ::encode(hit_set_period, bl);
  ::encode(hit_set_count, bl);
  ::encode(stripe_width, bl);
  ::encode(target_max_bytes, bl);
  ::encode(target_max_objects, bl);
  ::encode(cache_target_dirty_ratio_micro, bl);
  ::encode(cache_target_full_ratio_micro, bl);
  ::encode(cache_min_flush_age, bl);
  ::encode(cache_min_evict_age, bl);
  ::encode(erasure_code_profile, bl);
  ::encode(last_force_op_resend_preluminous, bl);
  ::encode(min_read_recency_for_promote, bl);
  ::encode(expected_num_objects, bl);
  // fields below are appended only when the chosen version includes them
  if (v >= 19) {
    ::encode(cache_target_dirty_high_ratio_micro, bl);
  }
  if (v >= 20) {
    ::encode(min_write_recency_for_promote, bl);
  }
  if (v >= 21) {
    ::encode(use_gmt_hitset, bl);
  }
  if (v >= 22) {
    ::encode(fast_read, bl);
  }
  if (v >= 23) {
    ::encode(hit_set_grade_decay_rate, bl);
    ::encode(hit_set_search_last_n, bl);
  }
  if (v >= 24) {
    ::encode(opts, bl);
  }
  if (v >= 25) {
    ::encode(last_force_op_resend, bl);
  }
  if (v >= 26) {
    ::encode(application_metadata, bl);
  }
  ENCODE_FINISH(bl);
}
1639
/**
 * Deserialize a pool from @p bl, accepting every struct_v that
 * pg_pool_t::encode can produce (up to 26).
 *
 * Fields are read in wire order; each group is gated on the struct_v
 * that introduced it, and most groups fall back to a fixed default when
 * the encoding is older.  After decoding, the derived pg masks and the
 * grade table are recomputed.
 */
void pg_pool_t::decode(bufferlist::iterator& bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(26, 5, 5, bl);
  ::decode(type, bl);
  ::decode(size, bl);
  ::decode(crush_rule, bl);
  ::decode(object_hash, bl);
  ::decode(pg_num, bl);
  ::decode(pgp_num, bl);
  {
    // the two localized-pg counts are read and discarded
    __u32 lpg_num, lpgp_num;
    ::decode(lpg_num, bl);
    ::decode(lpgp_num, bl);
  }
  ::decode(last_change, bl);
  ::decode(snap_seq, bl);
  ::decode(snap_epoch, bl);

  if (struct_v >= 3) {
    ::decode(snaps, bl);
    ::decode(removed_snaps, bl);
    ::decode(auid, bl);
  } else {
    // older layout: both element counts come first, then auid, then the
    // containers without their own length headers
    __u32 n, m;
    ::decode(n, bl);
    ::decode(m, bl);
    ::decode(auid, bl);
    ::decode_nohead(n, snaps, bl);
    ::decode_nohead(m, removed_snaps, bl);
  }

  if (struct_v >= 4) {
    ::decode(flags, bl);
    ::decode(crash_replay_interval, bl);
  } else {
    flags = 0;

    // if this looks like the 'data' pool, set the
    // crash_replay_interval appropriately.  unfortunately, we can't
    // be precise here.  this should be good enough to preserve replay
    // on the data pool for the majority of cluster upgrades, though.
    if (crush_rule == 0 && auid == 0)
      crash_replay_interval = 60;
    else
      crash_replay_interval = 0;
  }
  if (struct_v >= 7) {
    ::decode(min_size, bl);
  } else {
    // no min_size on the wire: derive it from size
    min_size = size - size/2;
  }
  if (struct_v >= 8) {
    ::decode(quota_max_bytes, bl);
    ::decode(quota_max_objects, bl);
  }
  if (struct_v >= 9) {
    ::decode(tiers, bl);
    ::decode(tier_of, bl);
    __u8 v;  // cache_mode travels as a single byte
    ::decode(v, bl);
    cache_mode = (cache_mode_t)v;
    ::decode(read_tier, bl);
    ::decode(write_tier, bl);
  }
  if (struct_v >= 10) {
    ::decode(properties, bl);
  }
  if (struct_v >= 11) {
    ::decode(hit_set_params, bl);
    ::decode(hit_set_period, bl);
    ::decode(hit_set_count, bl);
  } else {
    // take the hit set period/count of a default-constructed pool
    pg_pool_t def;
    hit_set_period = def.hit_set_period;
    hit_set_count = def.hit_set_count;
  }
  if (struct_v >= 12) {
    ::decode(stripe_width, bl);
  } else {
    set_stripe_width(0);
  }
  if (struct_v >= 13) {
    ::decode(target_max_bytes, bl);
    ::decode(target_max_objects, bl);
    ::decode(cache_target_dirty_ratio_micro, bl);
    ::decode(cache_target_full_ratio_micro, bl);
    ::decode(cache_min_flush_age, bl);
    ::decode(cache_min_evict_age, bl);
  } else {
    target_max_bytes = 0;
    target_max_objects = 0;
    cache_target_dirty_ratio_micro = 0;
    cache_target_full_ratio_micro = 0;
    cache_min_flush_age = 0;
    cache_min_evict_age = 0;
  }
  if (struct_v >= 14) {
    ::decode(erasure_code_profile, bl);
  }
  if (struct_v >= 15) {
    ::decode(last_force_op_resend_preluminous, bl);
  } else {
    last_force_op_resend_preluminous = 0;
  }
  if (struct_v >= 16) {
    ::decode(min_read_recency_for_promote, bl);
  } else {
    min_read_recency_for_promote = 1;
  }
  if (struct_v >= 17) {
    ::decode(expected_num_objects, bl);
  } else {
    expected_num_objects = 0;
  }
  if (struct_v >= 19) {
    ::decode(cache_target_dirty_high_ratio_micro, bl);
  } else {
    // older encodings have no separate high ratio; reuse the dirty ratio
    cache_target_dirty_high_ratio_micro = cache_target_dirty_ratio_micro;
  }
  if (struct_v >= 20) {
    ::decode(min_write_recency_for_promote, bl);
  } else {
    min_write_recency_for_promote = 1;
  }
  if (struct_v >= 21) {
    ::decode(use_gmt_hitset, bl);
  } else {
    use_gmt_hitset = false;
  }
  if (struct_v >= 22) {
    ::decode(fast_read, bl);
  } else {
    fast_read = false;
  }
  if (struct_v >= 23) {
    ::decode(hit_set_grade_decay_rate, bl);
    ::decode(hit_set_search_last_n, bl);
  } else {
    hit_set_grade_decay_rate = 0;
    hit_set_search_last_n = 1;
  }
  if (struct_v >= 24) {
    ::decode(opts, bl);
  }
  if (struct_v >= 25) {
    ::decode(last_force_op_resend, bl);
  } else {
    // only the pre-luminous value was encoded; use it for both
    last_force_op_resend = last_force_op_resend_preluminous;
  }
  if (struct_v >= 26) {
    ::decode(application_metadata, bl);
  }
  DECODE_FINISH(bl);
  // refresh state derived from the decoded fields
  calc_pg_masks();
  calc_grade_table();
}
1796
1797 void pg_pool_t::generate_test_instances(list<pg_pool_t*>& o)
1798 {
1799 pg_pool_t a;
1800 o.push_back(new pg_pool_t(a));
1801
1802 a.type = TYPE_REPLICATED;
1803 a.size = 2;
1804 a.crush_rule = 3;
1805 a.object_hash = 4;
1806 a.pg_num = 6;
1807 a.pgp_num = 5;
1808 a.last_change = 9;
1809 a.last_force_op_resend = 123823;
1810 a.last_force_op_resend_preluminous = 123824;
1811 a.snap_seq = 10;
1812 a.snap_epoch = 11;
1813 a.auid = 12;
1814 a.crash_replay_interval = 13;
1815 a.quota_max_bytes = 473;
1816 a.quota_max_objects = 474;
1817 o.push_back(new pg_pool_t(a));
1818
1819 a.snaps[3].name = "asdf";
1820 a.snaps[3].snapid = 3;
1821 a.snaps[3].stamp = utime_t(123, 4);
1822 a.snaps[6].name = "qwer";
1823 a.snaps[6].snapid = 6;
1824 a.snaps[6].stamp = utime_t(23423, 4);
1825 o.push_back(new pg_pool_t(a));
1826
1827 a.removed_snaps.insert(2); // not quite valid to combine with snaps!
1828 a.quota_max_bytes = 2473;
1829 a.quota_max_objects = 4374;
1830 a.tiers.insert(0);
1831 a.tiers.insert(1);
1832 a.tier_of = 2;
1833 a.cache_mode = CACHEMODE_WRITEBACK;
1834 a.read_tier = 1;
1835 a.write_tier = 1;
1836 a.hit_set_params = HitSet::Params(new BloomHitSet::Params);
1837 a.hit_set_period = 3600;
1838 a.hit_set_count = 8;
1839 a.min_read_recency_for_promote = 1;
1840 a.min_write_recency_for_promote = 1;
1841 a.hit_set_grade_decay_rate = 50;
1842 a.hit_set_search_last_n = 1;
1843 a.calc_grade_table();
1844 a.set_stripe_width(12345);
1845 a.target_max_bytes = 1238132132;
1846 a.target_max_objects = 1232132;
1847 a.cache_target_dirty_ratio_micro = 187232;
1848 a.cache_target_dirty_high_ratio_micro = 309856;
1849 a.cache_target_full_ratio_micro = 987222;
1850 a.cache_min_flush_age = 231;
1851 a.cache_min_evict_age = 2321;
1852 a.erasure_code_profile = "profile in osdmap";
1853 a.expected_num_objects = 123456;
1854 a.fast_read = false;
1855 a.application_metadata = {{"rbd", {{"key", "value"}}}};
1856 o.push_back(new pg_pool_t(a));
1857 }
1858
1859 ostream& operator<<(ostream& out, const pg_pool_t& p)
1860 {
1861 out << p.get_type_name()
1862 << " size " << p.get_size()
1863 << " min_size " << p.get_min_size()
1864 << " crush_rule " << p.get_crush_rule()
1865 << " object_hash " << p.get_object_hash_name()
1866 << " pg_num " << p.get_pg_num()
1867 << " pgp_num " << p.get_pgp_num()
1868 << " last_change " << p.get_last_change();
1869 if (p.get_last_force_op_resend() ||
1870 p.get_last_force_op_resend_preluminous())
1871 out << " lfor " << p.get_last_force_op_resend() << "/"
1872 << p.get_last_force_op_resend_preluminous();
1873 if (p.get_auid())
1874 out << " owner " << p.get_auid();
1875 if (p.flags)
1876 out << " flags " << p.get_flags_string();
1877 if (p.crash_replay_interval)
1878 out << " crash_replay_interval " << p.crash_replay_interval;
1879 if (p.quota_max_bytes)
1880 out << " max_bytes " << p.quota_max_bytes;
1881 if (p.quota_max_objects)
1882 out << " max_objects " << p.quota_max_objects;
1883 if (!p.tiers.empty())
1884 out << " tiers " << p.tiers;
1885 if (p.is_tier())
1886 out << " tier_of " << p.tier_of;
1887 if (p.has_read_tier())
1888 out << " read_tier " << p.read_tier;
1889 if (p.has_write_tier())
1890 out << " write_tier " << p.write_tier;
1891 if (p.cache_mode)
1892 out << " cache_mode " << p.get_cache_mode_name();
1893 if (p.target_max_bytes)
1894 out << " target_bytes " << p.target_max_bytes;
1895 if (p.target_max_objects)
1896 out << " target_objects " << p.target_max_objects;
1897 if (p.hit_set_params.get_type() != HitSet::TYPE_NONE) {
1898 out << " hit_set " << p.hit_set_params
1899 << " " << p.hit_set_period << "s"
1900 << " x" << p.hit_set_count << " decay_rate "
1901 << p.hit_set_grade_decay_rate
1902 << " search_last_n " << p.hit_set_search_last_n;
1903 }
1904 if (p.min_read_recency_for_promote)
1905 out << " min_read_recency_for_promote " << p.min_read_recency_for_promote;
1906 if (p.min_write_recency_for_promote)
1907 out << " min_write_recency_for_promote " << p.min_write_recency_for_promote;
1908 out << " stripe_width " << p.get_stripe_width();
1909 if (p.expected_num_objects)
1910 out << " expected_num_objects " << p.expected_num_objects;
1911 if (p.fast_read)
1912 out << " fast_read " << p.fast_read;
1913 out << p.opts;
1914 if (!p.application_metadata.empty()) {
1915 out << " application ";
1916 for (auto it = p.application_metadata.begin();
1917 it != p.application_metadata.end(); ++it) {
1918 if (it != p.application_metadata.begin())
1919 out << ",";
1920 out << it->first;
1921 }
1922 }
1923 return out;
1924 }
1925
1926
1927 // -- object_stat_sum_t --
1928
/**
 * Emit every counter of this stat sum into the formatter @p f, one
 * dump_int() per field.
 *
 * Note that a few output keys differ from the member names: num_rd /
 * num_rd_kb / num_wr / num_wr_kb are dumped as "num_read" /
 * "num_read_kb" / "num_write" / "num_write_kb".
 */
void object_stat_sum_t::dump(Formatter *f) const
{
  f->dump_int("num_bytes", num_bytes);
  f->dump_int("num_objects", num_objects);
  f->dump_int("num_object_clones", num_object_clones);
  f->dump_int("num_object_copies", num_object_copies);
  f->dump_int("num_objects_missing_on_primary", num_objects_missing_on_primary);
  f->dump_int("num_objects_missing", num_objects_missing);
  f->dump_int("num_objects_degraded", num_objects_degraded);
  f->dump_int("num_objects_misplaced", num_objects_misplaced);
  f->dump_int("num_objects_unfound", num_objects_unfound);
  f->dump_int("num_objects_dirty", num_objects_dirty);
  f->dump_int("num_whiteouts", num_whiteouts);
  f->dump_int("num_read", num_rd);
  f->dump_int("num_read_kb", num_rd_kb);
  f->dump_int("num_write", num_wr);
  f->dump_int("num_write_kb", num_wr_kb);
  f->dump_int("num_scrub_errors", num_scrub_errors);
  f->dump_int("num_shallow_scrub_errors", num_shallow_scrub_errors);
  f->dump_int("num_deep_scrub_errors", num_deep_scrub_errors);
  f->dump_int("num_objects_recovered", num_objects_recovered);
  f->dump_int("num_bytes_recovered", num_bytes_recovered);
  f->dump_int("num_keys_recovered", num_keys_recovered);
  f->dump_int("num_objects_omap", num_objects_omap);
  f->dump_int("num_objects_hit_set_archive", num_objects_hit_set_archive);
  f->dump_int("num_bytes_hit_set_archive", num_bytes_hit_set_archive);
  f->dump_int("num_flush", num_flush);
  f->dump_int("num_flush_kb", num_flush_kb);
  f->dump_int("num_evict", num_evict);
  f->dump_int("num_evict_kb", num_evict_kb);
  f->dump_int("num_promote", num_promote);
  f->dump_int("num_flush_mode_high", num_flush_mode_high);
  f->dump_int("num_flush_mode_low", num_flush_mode_low);
  f->dump_int("num_evict_mode_some", num_evict_mode_some);
  f->dump_int("num_evict_mode_full", num_evict_mode_full);
  f->dump_int("num_objects_pinned", num_objects_pinned);
  f->dump_int("num_legacy_snapsets", num_legacy_snapsets);
  f->dump_int("num_large_omap_objects", num_large_omap_objects);
}
1968
/**
 * Serialize this stat sum into @p bl as struct_v 17 (compat 14).
 *
 * When CEPH_LITTLE_ENDIAN is defined, sizeof(object_stat_sum_t) raw
 * bytes starting at num_bytes are appended in one go; otherwise every
 * counter is encoded individually in the order listed below.
 *
 * NOTE(review): the raw path presumably relies on num_bytes being the
 * first member and on the in-memory member order/packing matching the
 * field-by-field order -- confirm against the struct declaration.
 */
void object_stat_sum_t::encode(bufferlist& bl) const
{
  ENCODE_START(17, 14, bl);
#if defined(CEPH_LITTLE_ENDIAN)
  // fast path: copy the whole struct as-is
  bl.append((char *)(&num_bytes), sizeof(object_stat_sum_t));
#else
  ::encode(num_bytes, bl);
  ::encode(num_objects, bl);
  ::encode(num_object_clones, bl);
  ::encode(num_object_copies, bl);
  ::encode(num_objects_missing_on_primary, bl);
  ::encode(num_objects_degraded, bl);
  ::encode(num_objects_unfound, bl);
  ::encode(num_rd, bl);
  ::encode(num_rd_kb, bl);
  ::encode(num_wr, bl);
  ::encode(num_wr_kb, bl);
  ::encode(num_scrub_errors, bl);
  ::encode(num_objects_recovered, bl);
  ::encode(num_bytes_recovered, bl);
  ::encode(num_keys_recovered, bl);
  ::encode(num_shallow_scrub_errors, bl);
  ::encode(num_deep_scrub_errors, bl);
  ::encode(num_objects_dirty, bl);
  ::encode(num_whiteouts, bl);
  ::encode(num_objects_omap, bl);
  ::encode(num_objects_hit_set_archive, bl);
  ::encode(num_objects_misplaced, bl);
  ::encode(num_bytes_hit_set_archive, bl);
  ::encode(num_flush, bl);
  ::encode(num_flush_kb, bl);
  ::encode(num_evict, bl);
  ::encode(num_evict_kb, bl);
  ::encode(num_promote, bl);
  ::encode(num_flush_mode_high, bl);
  ::encode(num_flush_mode_low, bl);
  ::encode(num_evict_mode_some, bl);
  ::encode(num_evict_mode_full, bl);
  ::encode(num_objects_pinned, bl);
  ::encode(num_objects_missing, bl);
  ::encode(num_legacy_snapsets, bl);
  ::encode(num_large_omap_objects, bl);
#endif
  ENCODE_FINISH(bl);
}
2014
/**
 * Deserialize a stat sum from @p bl.
 *
 * When CEPH_LITTLE_ENDIAN is defined and the encoding is at the newest
 * version (17), sizeof(object_stat_sum_t) raw bytes are copied straight
 * over the struct starting at num_bytes.  In every other case the
 * counters are decoded one by one; num_legacy_snapsets falls back to
 * num_object_clones before v16, and num_large_omap_objects is only read
 * from v17 on (it is left untouched for older encodings).
 */
void object_stat_sum_t::decode(bufferlist::iterator& bl)
{
  // set once the raw fast path has consumed the payload
  bool decode_finish = false;
  DECODE_START(17, bl);  // make sure to also update fast decode below
#if defined(CEPH_LITTLE_ENDIAN)
  if (struct_v >= 17) {  // this must match newest decode version
    bl.copy(sizeof(object_stat_sum_t), (char*)(&num_bytes));
    decode_finish = true;
  }
#endif
  if (!decode_finish) {
    ::decode(num_bytes, bl);
    ::decode(num_objects, bl);
    ::decode(num_object_clones, bl);
    ::decode(num_object_copies, bl);
    ::decode(num_objects_missing_on_primary, bl);
    ::decode(num_objects_degraded, bl);
    ::decode(num_objects_unfound, bl);
    ::decode(num_rd, bl);
    ::decode(num_rd_kb, bl);
    ::decode(num_wr, bl);
    ::decode(num_wr_kb, bl);
    ::decode(num_scrub_errors, bl);
    ::decode(num_objects_recovered, bl);
    ::decode(num_bytes_recovered, bl);
    ::decode(num_keys_recovered, bl);
    ::decode(num_shallow_scrub_errors, bl);
    ::decode(num_deep_scrub_errors, bl);
    ::decode(num_objects_dirty, bl);
    ::decode(num_whiteouts, bl);
    ::decode(num_objects_omap, bl);
    ::decode(num_objects_hit_set_archive, bl);
    ::decode(num_objects_misplaced, bl);
    ::decode(num_bytes_hit_set_archive, bl);
    ::decode(num_flush, bl);
    ::decode(num_flush_kb, bl);
    ::decode(num_evict, bl);
    ::decode(num_evict_kb, bl);
    ::decode(num_promote, bl);
    ::decode(num_flush_mode_high, bl);
    ::decode(num_flush_mode_low, bl);
    ::decode(num_evict_mode_some, bl);
    ::decode(num_evict_mode_full, bl);
    ::decode(num_objects_pinned, bl);
    ::decode(num_objects_missing, bl);
    if (struct_v >= 16) {
      ::decode(num_legacy_snapsets, bl);
    } else {
      num_legacy_snapsets = num_object_clones;  // upper bound
    }
    if (struct_v >= 17) {
      ::decode(num_large_omap_objects, bl);
    }
  }
  DECODE_FINISH(bl);
}
2071
2072 void object_stat_sum_t::generate_test_instances(list<object_stat_sum_t*>& o)
2073 {
2074 object_stat_sum_t a;
2075
2076 a.num_bytes = 1;
2077 a.num_objects = 3;
2078 a.num_object_clones = 4;
2079 a.num_object_copies = 5;
2080 a.num_objects_missing_on_primary = 6;
2081 a.num_objects_missing = 123;
2082 a.num_objects_degraded = 7;
2083 a.num_objects_unfound = 8;
2084 a.num_rd = 9; a.num_rd_kb = 10;
2085 a.num_wr = 11; a.num_wr_kb = 12;
2086 a.num_objects_recovered = 14;
2087 a.num_bytes_recovered = 15;
2088 a.num_keys_recovered = 16;
2089 a.num_deep_scrub_errors = 17;
2090 a.num_shallow_scrub_errors = 18;
2091 a.num_scrub_errors = a.num_deep_scrub_errors + a.num_shallow_scrub_errors;
2092 a.num_objects_dirty = 21;
2093 a.num_whiteouts = 22;
2094 a.num_objects_misplaced = 1232;
2095 a.num_objects_hit_set_archive = 2;
2096 a.num_bytes_hit_set_archive = 27;
2097 a.num_flush = 5;
2098 a.num_flush_kb = 6;
2099 a.num_evict = 7;
2100 a.num_evict_kb = 8;
2101 a.num_promote = 9;
2102 a.num_flush_mode_high = 0;
2103 a.num_flush_mode_low = 1;
2104 a.num_evict_mode_some = 1;
2105 a.num_evict_mode_full = 0;
2106 a.num_objects_pinned = 20;
2107 a.num_large_omap_objects = 5;
2108 o.push_back(new object_stat_sum_t(a));
2109 }
2110
2111 void object_stat_sum_t::add(const object_stat_sum_t& o)
2112 {
2113 num_bytes += o.num_bytes;
2114 num_objects += o.num_objects;
2115 num_object_clones += o.num_object_clones;
2116 num_object_copies += o.num_object_copies;
2117 num_objects_missing_on_primary += o.num_objects_missing_on_primary;
2118 num_objects_missing += o.num_objects_missing;
2119 num_objects_degraded += o.num_objects_degraded;
2120 num_objects_misplaced += o.num_objects_misplaced;
2121 num_rd += o.num_rd;
2122 num_rd_kb += o.num_rd_kb;
2123 num_wr += o.num_wr;
2124 num_wr_kb += o.num_wr_kb;
2125 num_objects_unfound += o.num_objects_unfound;
2126 num_scrub_errors += o.num_scrub_errors;
2127 num_shallow_scrub_errors += o.num_shallow_scrub_errors;
2128 num_deep_scrub_errors += o.num_deep_scrub_errors;
2129 num_objects_recovered += o.num_objects_recovered;
2130 num_bytes_recovered += o.num_bytes_recovered;
2131 num_keys_recovered += o.num_keys_recovered;
2132 num_objects_dirty += o.num_objects_dirty;
2133 num_whiteouts += o.num_whiteouts;
2134 num_objects_omap += o.num_objects_omap;
2135 num_objects_hit_set_archive += o.num_objects_hit_set_archive;
2136 num_bytes_hit_set_archive += o.num_bytes_hit_set_archive;
2137 num_flush += o.num_flush;
2138 num_flush_kb += o.num_flush_kb;
2139 num_evict += o.num_evict;
2140 num_evict_kb += o.num_evict_kb;
2141 num_promote += o.num_promote;
2142 num_flush_mode_high += o.num_flush_mode_high;
2143 num_flush_mode_low += o.num_flush_mode_low;
2144 num_evict_mode_some += o.num_evict_mode_some;
2145 num_evict_mode_full += o.num_evict_mode_full;
2146 num_objects_pinned += o.num_objects_pinned;
2147 num_legacy_snapsets += o.num_legacy_snapsets;
2148 num_large_omap_objects += o.num_large_omap_objects;
2149 }
2150
2151 void object_stat_sum_t::sub(const object_stat_sum_t& o)
2152 {
2153 num_bytes -= o.num_bytes;
2154 num_objects -= o.num_objects;
2155 num_object_clones -= o.num_object_clones;
2156 num_object_copies -= o.num_object_copies;
2157 num_objects_missing_on_primary -= o.num_objects_missing_on_primary;
2158 num_objects_missing -= o.num_objects_missing;
2159 num_objects_degraded -= o.num_objects_degraded;
2160 num_objects_misplaced -= o.num_objects_misplaced;
2161 num_rd -= o.num_rd;
2162 num_rd_kb -= o.num_rd_kb;
2163 num_wr -= o.num_wr;
2164 num_wr_kb -= o.num_wr_kb;
2165 num_objects_unfound -= o.num_objects_unfound;
2166 num_scrub_errors -= o.num_scrub_errors;
2167 num_shallow_scrub_errors -= o.num_shallow_scrub_errors;
2168 num_deep_scrub_errors -= o.num_deep_scrub_errors;
2169 num_objects_recovered -= o.num_objects_recovered;
2170 num_bytes_recovered -= o.num_bytes_recovered;
2171 num_keys_recovered -= o.num_keys_recovered;
2172 num_objects_dirty -= o.num_objects_dirty;
2173 num_whiteouts -= o.num_whiteouts;
2174 num_objects_omap -= o.num_objects_omap;
2175 num_objects_hit_set_archive -= o.num_objects_hit_set_archive;
2176 num_bytes_hit_set_archive -= o.num_bytes_hit_set_archive;
2177 num_flush -= o.num_flush;
2178 num_flush_kb -= o.num_flush_kb;
2179 num_evict -= o.num_evict;
2180 num_evict_kb -= o.num_evict_kb;
2181 num_promote -= o.num_promote;
2182 num_flush_mode_high -= o.num_flush_mode_high;
2183 num_flush_mode_low -= o.num_flush_mode_low;
2184 num_evict_mode_some -= o.num_evict_mode_some;
2185 num_evict_mode_full -= o.num_evict_mode_full;
2186 num_objects_pinned -= o.num_objects_pinned;
2187 num_legacy_snapsets -= o.num_legacy_snapsets;
2188 num_large_omap_objects -= o.num_large_omap_objects;
2189 }
2190
2191 bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r)
2192 {
2193 return
2194 l.num_bytes == r.num_bytes &&
2195 l.num_objects == r.num_objects &&
2196 l.num_object_clones == r.num_object_clones &&
2197 l.num_object_copies == r.num_object_copies &&
2198 l.num_objects_missing_on_primary == r.num_objects_missing_on_primary &&
2199 l.num_objects_missing == r.num_objects_missing &&
2200 l.num_objects_degraded == r.num_objects_degraded &&
2201 l.num_objects_misplaced == r.num_objects_misplaced &&
2202 l.num_objects_unfound == r.num_objects_unfound &&
2203 l.num_rd == r.num_rd &&
2204 l.num_rd_kb == r.num_rd_kb &&
2205 l.num_wr == r.num_wr &&
2206 l.num_wr_kb == r.num_wr_kb &&
2207 l.num_scrub_errors == r.num_scrub_errors &&
2208 l.num_shallow_scrub_errors == r.num_shallow_scrub_errors &&
2209 l.num_deep_scrub_errors == r.num_deep_scrub_errors &&
2210 l.num_objects_recovered == r.num_objects_recovered &&
2211 l.num_bytes_recovered == r.num_bytes_recovered &&
2212 l.num_keys_recovered == r.num_keys_recovered &&
2213 l.num_objects_dirty == r.num_objects_dirty &&
2214 l.num_whiteouts == r.num_whiteouts &&
2215 l.num_objects_omap == r.num_objects_omap &&
2216 l.num_objects_hit_set_archive == r.num_objects_hit_set_archive &&
2217 l.num_bytes_hit_set_archive == r.num_bytes_hit_set_archive &&
2218 l.num_flush == r.num_flush &&
2219 l.num_flush_kb == r.num_flush_kb &&
2220 l.num_evict == r.num_evict &&
2221 l.num_evict_kb == r.num_evict_kb &&
2222 l.num_promote == r.num_promote &&
2223 l.num_flush_mode_high == r.num_flush_mode_high &&
2224 l.num_flush_mode_low == r.num_flush_mode_low &&
2225 l.num_evict_mode_some == r.num_evict_mode_some &&
2226 l.num_evict_mode_full == r.num_evict_mode_full &&
2227 l.num_objects_pinned == r.num_objects_pinned &&
2228 l.num_legacy_snapsets == r.num_legacy_snapsets &&
2229 l.num_large_omap_objects == r.num_large_omap_objects;
2230 }
2231
2232 // -- object_stat_collection_t --
2233
2234 void object_stat_collection_t::dump(Formatter *f) const
2235 {
2236 f->open_object_section("stat_sum");
2237 sum.dump(f);
2238 f->close_section();
2239 }
2240
2241 void object_stat_collection_t::encode(bufferlist& bl) const
2242 {
2243 ENCODE_START(2, 2, bl);
2244 ::encode(sum, bl);
2245 ::encode((__u32)0, bl);
2246 ENCODE_FINISH(bl);
2247 }
2248
2249 void object_stat_collection_t::decode(bufferlist::iterator& bl)
2250 {
2251 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
2252 ::decode(sum, bl);
2253 {
2254 map<string,object_stat_sum_t> cat_sum;
2255 ::decode(cat_sum, bl);
2256 }
2257 DECODE_FINISH(bl);
2258 }
2259
2260 void object_stat_collection_t::generate_test_instances(list<object_stat_collection_t*>& o)
2261 {
2262 object_stat_collection_t a;
2263 o.push_back(new object_stat_collection_t(a));
2264 list<object_stat_sum_t*> l;
2265 object_stat_sum_t::generate_test_instances(l);
2266 for (list<object_stat_sum_t*>::iterator p = l.begin(); p != l.end(); ++p) {
2267 a.add(**p);
2268 o.push_back(new object_stat_collection_t(a));
2269 }
2270 }
2271
2272
2273 // -- pg_stat_t --
2274
2275 bool pg_stat_t::is_acting_osd(int32_t osd, bool primary) const
2276 {
2277 if (primary && osd == acting_primary) {
2278 return true;
2279 } else if (!primary) {
2280 for(vector<int32_t>::const_iterator it = acting.begin();
2281 it != acting.end(); ++it)
2282 {
2283 if (*it == osd)
2284 return true;
2285 }
2286 }
2287 return false;
2288 }
2289
2290 void pg_stat_t::dump(Formatter *f) const
2291 {
2292 f->dump_stream("version") << version;
2293 f->dump_stream("reported_seq") << reported_seq;
2294 f->dump_stream("reported_epoch") << reported_epoch;
2295 f->dump_string("state", pg_state_string(state));
2296 f->dump_stream("last_fresh") << last_fresh;
2297 f->dump_stream("last_change") << last_change;
2298 f->dump_stream("last_active") << last_active;
2299 f->dump_stream("last_peered") << last_peered;
2300 f->dump_stream("last_clean") << last_clean;
2301 f->dump_stream("last_became_active") << last_became_active;
2302 f->dump_stream("last_became_peered") << last_became_peered;
2303 f->dump_stream("last_unstale") << last_unstale;
2304 f->dump_stream("last_undegraded") << last_undegraded;
2305 f->dump_stream("last_fullsized") << last_fullsized;
2306 f->dump_unsigned("mapping_epoch", mapping_epoch);
2307 f->dump_stream("log_start") << log_start;
2308 f->dump_stream("ondisk_log_start") << ondisk_log_start;
2309 f->dump_unsigned("created", created);
2310 f->dump_unsigned("last_epoch_clean", last_epoch_clean);
2311 f->dump_stream("parent") << parent;
2312 f->dump_unsigned("parent_split_bits", parent_split_bits);
2313 f->dump_stream("last_scrub") << last_scrub;
2314 f->dump_stream("last_scrub_stamp") << last_scrub_stamp;
2315 f->dump_stream("last_deep_scrub") << last_deep_scrub;
2316 f->dump_stream("last_deep_scrub_stamp") << last_deep_scrub_stamp;
2317 f->dump_stream("last_clean_scrub_stamp") << last_clean_scrub_stamp;
2318 f->dump_int("log_size", log_size);
2319 f->dump_int("ondisk_log_size", ondisk_log_size);
2320 f->dump_bool("stats_invalid", stats_invalid);
2321 f->dump_bool("dirty_stats_invalid", dirty_stats_invalid);
2322 f->dump_bool("omap_stats_invalid", omap_stats_invalid);
2323 f->dump_bool("hitset_stats_invalid", hitset_stats_invalid);
2324 f->dump_bool("hitset_bytes_stats_invalid", hitset_bytes_stats_invalid);
2325 f->dump_bool("pin_stats_invalid", pin_stats_invalid);
2326 f->dump_unsigned("snaptrimq_len", snaptrimq_len);
2327 stats.dump(f);
2328 f->open_array_section("up");
2329 for (vector<int32_t>::const_iterator p = up.begin(); p != up.end(); ++p)
2330 f->dump_int("osd", *p);
2331 f->close_section();
2332 f->open_array_section("acting");
2333 for (vector<int32_t>::const_iterator p = acting.begin(); p != acting.end(); ++p)
2334 f->dump_int("osd", *p);
2335 f->close_section();
2336 f->open_array_section("blocked_by");
2337 for (vector<int32_t>::const_iterator p = blocked_by.begin();
2338 p != blocked_by.end(); ++p)
2339 f->dump_int("osd", *p);
2340 f->close_section();
2341 f->dump_int("up_primary", up_primary);
2342 f->dump_int("acting_primary", acting_primary);
2343 }
2344
2345 void pg_stat_t::dump_brief(Formatter *f) const
2346 {
2347 f->dump_string("state", pg_state_string(state));
2348 f->open_array_section("up");
2349 for (vector<int32_t>::const_iterator p = up.begin(); p != up.end(); ++p)
2350 f->dump_int("osd", *p);
2351 f->close_section();
2352 f->open_array_section("acting");
2353 for (vector<int32_t>::const_iterator p = acting.begin(); p != acting.end(); ++p)
2354 f->dump_int("osd", *p);
2355 f->close_section();
2356 f->dump_int("up_primary", up_primary);
2357 f->dump_int("acting_primary", acting_primary);
2358 }
2359
2360 void pg_stat_t::encode(bufferlist &bl) const
2361 {
2362 ENCODE_START(23, 22, bl);
2363 ::encode(version, bl);
2364 ::encode(reported_seq, bl);
2365 ::encode(reported_epoch, bl);
2366 ::encode(state, bl);
2367 ::encode(log_start, bl);
2368 ::encode(ondisk_log_start, bl);
2369 ::encode(created, bl);
2370 ::encode(last_epoch_clean, bl);
2371 ::encode(parent, bl);
2372 ::encode(parent_split_bits, bl);
2373 ::encode(last_scrub, bl);
2374 ::encode(last_scrub_stamp, bl);
2375 ::encode(stats, bl);
2376 ::encode(log_size, bl);
2377 ::encode(ondisk_log_size, bl);
2378 ::encode(up, bl);
2379 ::encode(acting, bl);
2380 ::encode(last_fresh, bl);
2381 ::encode(last_change, bl);
2382 ::encode(last_active, bl);
2383 ::encode(last_clean, bl);
2384 ::encode(last_unstale, bl);
2385 ::encode(mapping_epoch, bl);
2386 ::encode(last_deep_scrub, bl);
2387 ::encode(last_deep_scrub_stamp, bl);
2388 ::encode(stats_invalid, bl);
2389 ::encode(last_clean_scrub_stamp, bl);
2390 ::encode(last_became_active, bl);
2391 ::encode(dirty_stats_invalid, bl);
2392 ::encode(up_primary, bl);
2393 ::encode(acting_primary, bl);
2394 ::encode(omap_stats_invalid, bl);
2395 ::encode(hitset_stats_invalid, bl);
2396 ::encode(blocked_by, bl);
2397 ::encode(last_undegraded, bl);
2398 ::encode(last_fullsized, bl);
2399 ::encode(hitset_bytes_stats_invalid, bl);
2400 ::encode(last_peered, bl);
2401 ::encode(last_became_peered, bl);
2402 ::encode(pin_stats_invalid, bl);
2403 ::encode(snaptrimq_len, bl);
2404 ENCODE_FINISH(bl);
2405 }
2406
2407 void pg_stat_t::decode(bufferlist::iterator &bl)
2408 {
2409 bool tmp;
2410 DECODE_START(22, bl);
2411 ::decode(version, bl);
2412 ::decode(reported_seq, bl);
2413 ::decode(reported_epoch, bl);
2414 ::decode(state, bl);
2415 ::decode(log_start, bl);
2416 ::decode(ondisk_log_start, bl);
2417 ::decode(created, bl);
2418 ::decode(last_epoch_clean, bl);
2419 ::decode(parent, bl);
2420 ::decode(parent_split_bits, bl);
2421 ::decode(last_scrub, bl);
2422 ::decode(last_scrub_stamp, bl);
2423 ::decode(stats, bl);
2424 ::decode(log_size, bl);
2425 ::decode(ondisk_log_size, bl);
2426 ::decode(up, bl);
2427 ::decode(acting, bl);
2428 ::decode(last_fresh, bl);
2429 ::decode(last_change, bl);
2430 ::decode(last_active, bl);
2431 ::decode(last_clean, bl);
2432 ::decode(last_unstale, bl);
2433 ::decode(mapping_epoch, bl);
2434 ::decode(last_deep_scrub, bl);
2435 ::decode(last_deep_scrub_stamp, bl);
2436 ::decode(tmp, bl);
2437 stats_invalid = tmp;
2438 ::decode(last_clean_scrub_stamp, bl);
2439 ::decode(last_became_active, bl);
2440 ::decode(tmp, bl);
2441 dirty_stats_invalid = tmp;
2442 ::decode(up_primary, bl);
2443 ::decode(acting_primary, bl);
2444 ::decode(tmp, bl);
2445 omap_stats_invalid = tmp;
2446 ::decode(tmp, bl);
2447 hitset_stats_invalid = tmp;
2448 ::decode(blocked_by, bl);
2449 ::decode(last_undegraded, bl);
2450 ::decode(last_fullsized, bl);
2451 ::decode(tmp, bl);
2452 hitset_bytes_stats_invalid = tmp;
2453 ::decode(last_peered, bl);
2454 ::decode(last_became_peered, bl);
2455 ::decode(tmp, bl);
2456 pin_stats_invalid = tmp;
2457 if (struct_v >= 23) {
2458 ::decode(snaptrimq_len, bl);
2459 }
2460 DECODE_FINISH(bl);
2461 }
2462
2463 void pg_stat_t::generate_test_instances(list<pg_stat_t*>& o)
2464 {
2465 pg_stat_t a;
2466 o.push_back(new pg_stat_t(a));
2467
2468 a.version = eversion_t(1, 3);
2469 a.reported_epoch = 1;
2470 a.reported_seq = 2;
2471 a.state = 123;
2472 a.mapping_epoch = 998;
2473 a.last_fresh = utime_t(1002, 1);
2474 a.last_change = utime_t(1002, 2);
2475 a.last_active = utime_t(1002, 3);
2476 a.last_clean = utime_t(1002, 4);
2477 a.last_unstale = utime_t(1002, 5);
2478 a.last_undegraded = utime_t(1002, 7);
2479 a.last_fullsized = utime_t(1002, 8);
2480 a.log_start = eversion_t(1, 4);
2481 a.ondisk_log_start = eversion_t(1, 5);
2482 a.created = 6;
2483 a.last_epoch_clean = 7;
2484 a.parent = pg_t(1, 2, 3);
2485 a.parent_split_bits = 12;
2486 a.last_scrub = eversion_t(9, 10);
2487 a.last_scrub_stamp = utime_t(11, 12);
2488 a.last_deep_scrub = eversion_t(13, 14);
2489 a.last_deep_scrub_stamp = utime_t(15, 16);
2490 a.last_clean_scrub_stamp = utime_t(17, 18);
2491 a.snaptrimq_len = 1048576;
2492 list<object_stat_collection_t*> l;
2493 object_stat_collection_t::generate_test_instances(l);
2494 a.stats = *l.back();
2495 a.log_size = 99;
2496 a.ondisk_log_size = 88;
2497 a.up.push_back(123);
2498 a.up_primary = 123;
2499 a.acting.push_back(456);
2500 a.acting_primary = 456;
2501 o.push_back(new pg_stat_t(a));
2502
2503 a.up.push_back(124);
2504 a.up_primary = 124;
2505 a.acting.push_back(124);
2506 a.acting_primary = 124;
2507 a.blocked_by.push_back(155);
2508 a.blocked_by.push_back(156);
2509 o.push_back(new pg_stat_t(a));
2510 }
2511
2512 bool operator==(const pg_stat_t& l, const pg_stat_t& r)
2513 {
2514 return
2515 l.version == r.version &&
2516 l.reported_seq == r.reported_seq &&
2517 l.reported_epoch == r.reported_epoch &&
2518 l.state == r.state &&
2519 l.last_fresh == r.last_fresh &&
2520 l.last_change == r.last_change &&
2521 l.last_active == r.last_active &&
2522 l.last_peered == r.last_peered &&
2523 l.last_clean == r.last_clean &&
2524 l.last_unstale == r.last_unstale &&
2525 l.last_undegraded == r.last_undegraded &&
2526 l.last_fullsized == r.last_fullsized &&
2527 l.log_start == r.log_start &&
2528 l.ondisk_log_start == r.ondisk_log_start &&
2529 l.created == r.created &&
2530 l.last_epoch_clean == r.last_epoch_clean &&
2531 l.parent == r.parent &&
2532 l.parent_split_bits == r.parent_split_bits &&
2533 l.last_scrub == r.last_scrub &&
2534 l.last_deep_scrub == r.last_deep_scrub &&
2535 l.last_scrub_stamp == r.last_scrub_stamp &&
2536 l.last_deep_scrub_stamp == r.last_deep_scrub_stamp &&
2537 l.last_clean_scrub_stamp == r.last_clean_scrub_stamp &&
2538 l.stats == r.stats &&
2539 l.stats_invalid == r.stats_invalid &&
2540 l.log_size == r.log_size &&
2541 l.ondisk_log_size == r.ondisk_log_size &&
2542 l.up == r.up &&
2543 l.acting == r.acting &&
2544 l.mapping_epoch == r.mapping_epoch &&
2545 l.blocked_by == r.blocked_by &&
2546 l.last_became_active == r.last_became_active &&
2547 l.last_became_peered == r.last_became_peered &&
2548 l.dirty_stats_invalid == r.dirty_stats_invalid &&
2549 l.omap_stats_invalid == r.omap_stats_invalid &&
2550 l.hitset_stats_invalid == r.hitset_stats_invalid &&
2551 l.hitset_bytes_stats_invalid == r.hitset_bytes_stats_invalid &&
2552 l.up_primary == r.up_primary &&
2553 l.acting_primary == r.acting_primary &&
2554 l.pin_stats_invalid == r.pin_stats_invalid &&
2555 l.snaptrimq_len == r.snaptrimq_len;
2556 }
2557
2558 // -- pool_stat_t --
2559
2560 void pool_stat_t::dump(Formatter *f) const
2561 {
2562 stats.dump(f);
2563 f->dump_int("log_size", log_size);
2564 f->dump_int("ondisk_log_size", ondisk_log_size);
2565 f->dump_int("up", up);
2566 f->dump_int("acting", acting);
2567 }
2568
2569 void pool_stat_t::encode(bufferlist &bl, uint64_t features) const
2570 {
2571 if ((features & CEPH_FEATURE_OSDENC) == 0) {
2572 __u8 v = 4;
2573 ::encode(v, bl);
2574 ::encode(stats, bl);
2575 ::encode(log_size, bl);
2576 ::encode(ondisk_log_size, bl);
2577 return;
2578 }
2579
2580 ENCODE_START(6, 5, bl);
2581 ::encode(stats, bl);
2582 ::encode(log_size, bl);
2583 ::encode(ondisk_log_size, bl);
2584 ::encode(up, bl);
2585 ::encode(acting, bl);
2586 ENCODE_FINISH(bl);
2587 }
2588
2589 void pool_stat_t::decode(bufferlist::iterator &bl)
2590 {
2591 DECODE_START_LEGACY_COMPAT_LEN(6, 5, 5, bl);
2592 if (struct_v >= 4) {
2593 ::decode(stats, bl);
2594 ::decode(log_size, bl);
2595 ::decode(ondisk_log_size, bl);
2596 if (struct_v >= 6) {
2597 ::decode(up, bl);
2598 ::decode(acting, bl);
2599 } else {
2600 up = 0;
2601 acting = 0;
2602 }
2603 } else {
2604 ::decode(stats.sum.num_bytes, bl);
2605 uint64_t num_kb;
2606 ::decode(num_kb, bl);
2607 ::decode(stats.sum.num_objects, bl);
2608 ::decode(stats.sum.num_object_clones, bl);
2609 ::decode(stats.sum.num_object_copies, bl);
2610 ::decode(stats.sum.num_objects_missing_on_primary, bl);
2611 ::decode(stats.sum.num_objects_degraded, bl);
2612 ::decode(log_size, bl);
2613 ::decode(ondisk_log_size, bl);
2614 if (struct_v >= 2) {
2615 ::decode(stats.sum.num_rd, bl);
2616 ::decode(stats.sum.num_rd_kb, bl);
2617 ::decode(stats.sum.num_wr, bl);
2618 ::decode(stats.sum.num_wr_kb, bl);
2619 }
2620 if (struct_v >= 3) {
2621 ::decode(stats.sum.num_objects_unfound, bl);
2622 }
2623 }
2624 DECODE_FINISH(bl);
2625 }
2626
2627 void pool_stat_t::generate_test_instances(list<pool_stat_t*>& o)
2628 {
2629 pool_stat_t a;
2630 o.push_back(new pool_stat_t(a));
2631
2632 list<object_stat_collection_t*> l;
2633 object_stat_collection_t::generate_test_instances(l);
2634 a.stats = *l.back();
2635 a.log_size = 123;
2636 a.ondisk_log_size = 456;
2637 a.acting = 3;
2638 a.up = 4;
2639 o.push_back(new pool_stat_t(a));
2640 }
2641
2642
2643 // -- pg_history_t --
2644
2645 void pg_history_t::encode(bufferlist &bl) const
2646 {
2647 ENCODE_START(9, 4, bl);
2648 ::encode(epoch_created, bl);
2649 ::encode(last_epoch_started, bl);
2650 ::encode(last_epoch_clean, bl);
2651 ::encode(last_epoch_split, bl);
2652 ::encode(same_interval_since, bl);
2653 ::encode(same_up_since, bl);
2654 ::encode(same_primary_since, bl);
2655 ::encode(last_scrub, bl);
2656 ::encode(last_scrub_stamp, bl);
2657 ::encode(last_deep_scrub, bl);
2658 ::encode(last_deep_scrub_stamp, bl);
2659 ::encode(last_clean_scrub_stamp, bl);
2660 ::encode(last_epoch_marked_full, bl);
2661 ::encode(last_interval_started, bl);
2662 ::encode(last_interval_clean, bl);
2663 ::encode(epoch_pool_created, bl);
2664 ENCODE_FINISH(bl);
2665 }
2666
2667 void pg_history_t::decode(bufferlist::iterator &bl)
2668 {
2669 DECODE_START_LEGACY_COMPAT_LEN(9, 4, 4, bl);
2670 ::decode(epoch_created, bl);
2671 ::decode(last_epoch_started, bl);
2672 if (struct_v >= 3)
2673 ::decode(last_epoch_clean, bl);
2674 else
2675 last_epoch_clean = last_epoch_started; // careful, it's a lie!
2676 ::decode(last_epoch_split, bl);
2677 ::decode(same_interval_since, bl);
2678 ::decode(same_up_since, bl);
2679 ::decode(same_primary_since, bl);
2680 if (struct_v >= 2) {
2681 ::decode(last_scrub, bl);
2682 ::decode(last_scrub_stamp, bl);
2683 }
2684 if (struct_v >= 5) {
2685 ::decode(last_deep_scrub, bl);
2686 ::decode(last_deep_scrub_stamp, bl);
2687 }
2688 if (struct_v >= 6) {
2689 ::decode(last_clean_scrub_stamp, bl);
2690 }
2691 if (struct_v >= 7) {
2692 ::decode(last_epoch_marked_full, bl);
2693 }
2694 if (struct_v >= 8) {
2695 ::decode(last_interval_started, bl);
2696 ::decode(last_interval_clean, bl);
2697 } else {
2698 if (last_epoch_started >= same_interval_since) {
2699 last_interval_started = same_interval_since;
2700 } else {
2701 last_interval_started = last_epoch_started; // best guess
2702 }
2703 if (last_epoch_clean >= same_interval_since) {
2704 last_interval_clean = same_interval_since;
2705 } else {
2706 last_interval_clean = last_epoch_clean; // best guess
2707 }
2708 }
2709 if (struct_v >= 9) {
2710 ::decode(epoch_pool_created, bl);
2711 } else {
2712 epoch_pool_created = epoch_created;
2713 }
2714 DECODE_FINISH(bl);
2715 }
2716
2717 void pg_history_t::dump(Formatter *f) const
2718 {
2719 f->dump_int("epoch_created", epoch_created);
2720 f->dump_int("epoch_pool_created", epoch_pool_created);
2721 f->dump_int("last_epoch_started", last_epoch_started);
2722 f->dump_int("last_interval_started", last_interval_started);
2723 f->dump_int("last_epoch_clean", last_epoch_clean);
2724 f->dump_int("last_interval_clean", last_interval_clean);
2725 f->dump_int("last_epoch_split", last_epoch_split);
2726 f->dump_int("last_epoch_marked_full", last_epoch_marked_full);
2727 f->dump_int("same_up_since", same_up_since);
2728 f->dump_int("same_interval_since", same_interval_since);
2729 f->dump_int("same_primary_since", same_primary_since);
2730 f->dump_stream("last_scrub") << last_scrub;
2731 f->dump_stream("last_scrub_stamp") << last_scrub_stamp;
2732 f->dump_stream("last_deep_scrub") << last_deep_scrub;
2733 f->dump_stream("last_deep_scrub_stamp") << last_deep_scrub_stamp;
2734 f->dump_stream("last_clean_scrub_stamp") << last_clean_scrub_stamp;
2735 }
2736
2737 void pg_history_t::generate_test_instances(list<pg_history_t*>& o)
2738 {
2739 o.push_back(new pg_history_t);
2740 o.push_back(new pg_history_t);
2741 o.back()->epoch_created = 1;
2742 o.back()->epoch_pool_created = 1;
2743 o.back()->last_epoch_started = 2;
2744 o.back()->last_interval_started = 2;
2745 o.back()->last_epoch_clean = 3;
2746 o.back()->last_interval_clean = 2;
2747 o.back()->last_epoch_split = 4;
2748 o.back()->same_up_since = 5;
2749 o.back()->same_interval_since = 6;
2750 o.back()->same_primary_since = 7;
2751 o.back()->last_scrub = eversion_t(8, 9);
2752 o.back()->last_scrub_stamp = utime_t(10, 11);
2753 o.back()->last_deep_scrub = eversion_t(12, 13);
2754 o.back()->last_deep_scrub_stamp = utime_t(14, 15);
2755 o.back()->last_clean_scrub_stamp = utime_t(16, 17);
2756 o.back()->last_epoch_marked_full = 18;
2757 }
2758
2759
2760 // -- pg_info_t --
2761
2762 void pg_info_t::encode(bufferlist &bl) const
2763 {
2764 ENCODE_START(32, 26, bl);
2765 ::encode(pgid.pgid, bl);
2766 ::encode(last_update, bl);
2767 ::encode(last_complete, bl);
2768 ::encode(log_tail, bl);
2769 if (last_backfill_bitwise && !last_backfill.is_max()) {
2770 ::encode(hobject_t(), bl);
2771 } else {
2772 ::encode(last_backfill, bl);
2773 }
2774 ::encode(stats, bl);
2775 history.encode(bl);
2776 ::encode(purged_snaps, bl);
2777 ::encode(last_epoch_started, bl);
2778 ::encode(last_user_version, bl);
2779 ::encode(hit_set, bl);
2780 ::encode(pgid.shard, bl);
2781 ::encode(last_backfill, bl);
2782 ::encode(last_backfill_bitwise, bl);
2783 ::encode(last_interval_started, bl);
2784 ENCODE_FINISH(bl);
2785 }
2786
2787 void pg_info_t::decode(bufferlist::iterator &bl)
2788 {
2789 DECODE_START(32, bl);
2790 ::decode(pgid.pgid, bl);
2791 ::decode(last_update, bl);
2792 ::decode(last_complete, bl);
2793 ::decode(log_tail, bl);
2794 {
2795 hobject_t old_last_backfill;
2796 ::decode(old_last_backfill, bl);
2797 }
2798 ::decode(stats, bl);
2799 history.decode(bl);
2800 ::decode(purged_snaps, bl);
2801 ::decode(last_epoch_started, bl);
2802 ::decode(last_user_version, bl);
2803 ::decode(hit_set, bl);
2804 ::decode(pgid.shard, bl);
2805 ::decode(last_backfill, bl);
2806 ::decode(last_backfill_bitwise, bl);
2807 if (struct_v >= 32) {
2808 ::decode(last_interval_started, bl);
2809 } else {
2810 last_interval_started = last_epoch_started;
2811 }
2812 DECODE_FINISH(bl);
2813 }
2814
2815 // -- pg_info_t --
2816
2817 void pg_info_t::dump(Formatter *f) const
2818 {
2819 f->dump_stream("pgid") << pgid;
2820 f->dump_stream("last_update") << last_update;
2821 f->dump_stream("last_complete") << last_complete;
2822 f->dump_stream("log_tail") << log_tail;
2823 f->dump_int("last_user_version", last_user_version);
2824 f->dump_stream("last_backfill") << last_backfill;
2825 f->dump_int("last_backfill_bitwise", (int)last_backfill_bitwise);
2826 f->open_array_section("purged_snaps");
2827 for (interval_set<snapid_t>::const_iterator i=purged_snaps.begin();
2828 i != purged_snaps.end();
2829 ++i) {
2830 f->open_object_section("purged_snap_interval");
2831 f->dump_stream("start") << i.get_start();
2832 f->dump_stream("length") << i.get_len();
2833 f->close_section();
2834 }
2835 f->close_section();
2836 f->open_object_section("history");
2837 history.dump(f);
2838 f->close_section();
2839 f->open_object_section("stats");
2840 stats.dump(f);
2841 f->close_section();
2842
2843 f->dump_int("empty", is_empty());
2844 f->dump_int("dne", dne());
2845 f->dump_int("incomplete", is_incomplete());
2846 f->dump_int("last_epoch_started", last_epoch_started);
2847
2848 f->open_object_section("hit_set_history");
2849 hit_set.dump(f);
2850 f->close_section();
2851 }
2852
2853 void pg_info_t::generate_test_instances(list<pg_info_t*>& o)
2854 {
2855 o.push_back(new pg_info_t);
2856 o.push_back(new pg_info_t);
2857 list<pg_history_t*> h;
2858 pg_history_t::generate_test_instances(h);
2859 o.back()->history = *h.back();
2860 o.back()->pgid = spg_t(pg_t(1, 2, -1), shard_id_t::NO_SHARD);
2861 o.back()->last_update = eversion_t(3, 4);
2862 o.back()->last_complete = eversion_t(5, 6);
2863 o.back()->last_user_version = 2;
2864 o.back()->log_tail = eversion_t(7, 8);
2865 o.back()->last_backfill = hobject_t(object_t("objname"), "key", 123, 456, -1, "");
2866 o.back()->last_backfill_bitwise = true;
2867 {
2868 list<pg_stat_t*> s;
2869 pg_stat_t::generate_test_instances(s);
2870 o.back()->stats = *s.back();
2871 }
2872 {
2873 list<pg_hit_set_history_t*> s;
2874 pg_hit_set_history_t::generate_test_instances(s);
2875 o.back()->hit_set = *s.back();
2876 }
2877 }
2878
2879 // -- pg_notify_t --
2880 void pg_notify_t::encode(bufferlist &bl) const
2881 {
2882 ENCODE_START(2, 2, bl);
2883 ::encode(query_epoch, bl);
2884 ::encode(epoch_sent, bl);
2885 ::encode(info, bl);
2886 ::encode(to, bl);
2887 ::encode(from, bl);
2888 ENCODE_FINISH(bl);
2889 }
2890
2891 void pg_notify_t::decode(bufferlist::iterator &bl)
2892 {
2893 DECODE_START(2, bl);
2894 ::decode(query_epoch, bl);
2895 ::decode(epoch_sent, bl);
2896 ::decode(info, bl);
2897 ::decode(to, bl);
2898 ::decode(from, bl);
2899 DECODE_FINISH(bl);
2900 }
2901
2902 void pg_notify_t::dump(Formatter *f) const
2903 {
2904 f->dump_int("from", from);
2905 f->dump_int("to", to);
2906 f->dump_unsigned("query_epoch", query_epoch);
2907 f->dump_unsigned("epoch_sent", epoch_sent);
2908 {
2909 f->open_object_section("info");
2910 info.dump(f);
2911 f->close_section();
2912 }
2913 }
2914
2915 void pg_notify_t::generate_test_instances(list<pg_notify_t*>& o)
2916 {
2917 o.push_back(new pg_notify_t(shard_id_t(3), shard_id_t::NO_SHARD, 1, 1, pg_info_t()));
2918 o.push_back(new pg_notify_t(shard_id_t(0), shard_id_t(0), 3, 10, pg_info_t()));
2919 }
2920
2921 ostream &operator<<(ostream &lhs, const pg_notify_t &notify)
2922 {
2923 lhs << "(query:" << notify.query_epoch
2924 << " sent:" << notify.epoch_sent
2925 << " " << notify.info;
2926 if (notify.from != shard_id_t::NO_SHARD ||
2927 notify.to != shard_id_t::NO_SHARD)
2928 lhs << " " << (unsigned)notify.from
2929 << "->" << (unsigned)notify.to;
2930 return lhs << ")";
2931 }
2932
2933 // -- pg_interval_t --
2934
2935 void PastIntervals::pg_interval_t::encode(bufferlist& bl) const
2936 {
2937 ENCODE_START(4, 2, bl);
2938 ::encode(first, bl);
2939 ::encode(last, bl);
2940 ::encode(up, bl);
2941 ::encode(acting, bl);
2942 ::encode(maybe_went_rw, bl);
2943 ::encode(primary, bl);
2944 ::encode(up_primary, bl);
2945 ENCODE_FINISH(bl);
2946 }
2947
2948 void PastIntervals::pg_interval_t::decode(bufferlist::iterator& bl)
2949 {
2950 DECODE_START_LEGACY_COMPAT_LEN(4, 2, 2, bl);
2951 ::decode(first, bl);
2952 ::decode(last, bl);
2953 ::decode(up, bl);
2954 ::decode(acting, bl);
2955 ::decode(maybe_went_rw, bl);
2956 if (struct_v >= 3) {
2957 ::decode(primary, bl);
2958 } else {
2959 if (acting.size())
2960 primary = acting[0];
2961 }
2962 if (struct_v >= 4) {
2963 ::decode(up_primary, bl);
2964 } else {
2965 if (up.size())
2966 up_primary = up[0];
2967 }
2968 DECODE_FINISH(bl);
2969 }
2970
2971 void PastIntervals::pg_interval_t::dump(Formatter *f) const
2972 {
2973 f->dump_unsigned("first", first);
2974 f->dump_unsigned("last", last);
2975 f->dump_int("maybe_went_rw", maybe_went_rw ? 1 : 0);
2976 f->open_array_section("up");
2977 for (vector<int>::const_iterator p = up.begin(); p != up.end(); ++p)
2978 f->dump_int("osd", *p);
2979 f->close_section();
2980 f->open_array_section("acting");
2981 for (vector<int>::const_iterator p = acting.begin(); p != acting.end(); ++p)
2982 f->dump_int("osd", *p);
2983 f->close_section();
2984 f->dump_int("primary", primary);
2985 f->dump_int("up_primary", up_primary);
2986 }
2987
2988 void PastIntervals::pg_interval_t::generate_test_instances(list<pg_interval_t*>& o)
2989 {
2990 o.push_back(new pg_interval_t);
2991 o.push_back(new pg_interval_t);
2992 o.back()->up.push_back(1);
2993 o.back()->acting.push_back(2);
2994 o.back()->acting.push_back(3);
2995 o.back()->first = 4;
2996 o.back()->last = 5;
2997 o.back()->maybe_went_rw = true;
2998 }
2999
3000 WRITE_CLASS_ENCODER(PastIntervals::pg_interval_t)
3001
3002 class pi_simple_rep : public PastIntervals::interval_rep {
3003 map<epoch_t, PastIntervals::pg_interval_t> interval_map;
3004
3005 pi_simple_rep(
3006 bool ec_pool,
3007 std::list<PastIntervals::pg_interval_t> &&intervals) {
3008 for (auto &&i: intervals)
3009 add_interval(ec_pool, i);
3010 }
3011
3012 public:
3013 pi_simple_rep() = default;
3014 pi_simple_rep(const pi_simple_rep &) = default;
3015 pi_simple_rep(pi_simple_rep &&) = default;
3016 pi_simple_rep &operator=(pi_simple_rep &&) = default;
3017 pi_simple_rep &operator=(const pi_simple_rep &) = default;
3018
3019 size_t size() const override { return interval_map.size(); }
3020 bool empty() const override { return interval_map.empty(); }
3021 void clear() override { interval_map.clear(); }
3022 pair<epoch_t, epoch_t> get_bounds() const override {
3023 auto iter = interval_map.begin();
3024 if (iter != interval_map.end()) {
3025 auto riter = interval_map.rbegin();
3026 return make_pair(
3027 iter->second.first,
3028 riter->second.last + 1);
3029 } else {
3030 return make_pair(0, 0);
3031 }
3032 }
3033 set<pg_shard_t> get_all_participants(
3034 bool ec_pool) const override {
3035 set<pg_shard_t> all_participants;
3036
3037 // We need to decide who might have unfound objects that we need
3038 auto p = interval_map.rbegin();
3039 auto end = interval_map.rend();
3040 for (; p != end; ++p) {
3041 const PastIntervals::pg_interval_t &interval(p->second);
3042 // If nothing changed, we don't care about this interval.
3043 if (!interval.maybe_went_rw)
3044 continue;
3045
3046 int i = 0;
3047 std::vector<int>::const_iterator a = interval.acting.begin();
3048 std::vector<int>::const_iterator a_end = interval.acting.end();
3049 for (; a != a_end; ++a, ++i) {
3050 pg_shard_t shard(*a, ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD);
3051 if (*a != CRUSH_ITEM_NONE)
3052 all_participants.insert(shard);
3053 }
3054 }
3055 return all_participants;
3056 }
3057 void add_interval(
3058 bool ec_pool,
3059 const PastIntervals::pg_interval_t &interval) override {
3060 interval_map[interval.first] = interval;
3061 }
3062 unique_ptr<PastIntervals::interval_rep> clone() const override {
3063 return unique_ptr<PastIntervals::interval_rep>(new pi_simple_rep(*this));
3064 }
3065 ostream &print(ostream &out) const override {
3066 return out << interval_map;
3067 }
3068 void encode(bufferlist &bl) const override {
3069 ::encode(interval_map, bl);
3070 }
3071 void decode(bufferlist::iterator &bl) override {
3072 ::decode(interval_map, bl);
3073 }
3074 void dump(Formatter *f) const override {
3075 f->open_array_section("PastIntervals::compat_rep");
3076 for (auto &&i: interval_map) {
3077 f->open_object_section("pg_interval_t");
3078 f->dump_int("epoch", i.first);
3079 f->open_object_section("interval");
3080 i.second.dump(f);
3081 f->close_section();
3082 f->close_section();
3083 }
3084 f->close_section();
3085 }
3086 bool is_classic() const override {
3087 return true;
3088 }
3089 static void generate_test_instances(list<pi_simple_rep*> &o) {
3090 using ival = PastIntervals::pg_interval_t;
3091 using ivallst = std::list<ival>;
3092 o.push_back(
3093 new pi_simple_rep(
3094 true, ivallst
3095 { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
3096 , ival{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1}
3097 , ival{{ 2}, { 2}, 31, 35, false, 2, 2}
3098 , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
3099 }));
3100 o.push_back(
3101 new pi_simple_rep(
3102 false, ivallst
3103 { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
3104 , ival{{ 1, 2}, { 1, 2}, 20, 30, true, 1, 1}
3105 , ival{{ 2}, { 2}, 31, 35, false, 2, 2}
3106 , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
3107 }));
3108 o.push_back(
3109 new pi_simple_rep(
3110 true, ivallst
3111 { ival{{2, 1, 0}, {2, 1, 0}, 10, 20, true, 1, 1}
3112 , ival{{ 0, 2}, { 0, 2}, 21, 30, true, 0, 0}
3113 , ival{{ 0, 2}, {2, 0}, 31, 35, true, 2, 2}
3114 , ival{{ 0, 2}, { 0, 2}, 36, 50, true, 0, 0}
3115 }));
3116 return;
3117 }
3118 void iterate_mayberw_back_to(
3119 bool ec_pool,
3120 epoch_t les,
3121 std::function<void(epoch_t, const set<pg_shard_t> &)> &&f) const override {
3122 for (auto i = interval_map.rbegin(); i != interval_map.rend(); ++i) {
3123 if (!i->second.maybe_went_rw)
3124 continue;
3125 if (i->second.last < les)
3126 break;
3127 set<pg_shard_t> actingset;
3128 for (unsigned j = 0; j < i->second.acting.size(); ++j) {
3129 if (i->second.acting[j] == CRUSH_ITEM_NONE)
3130 continue;
3131 actingset.insert(
3132 pg_shard_t(
3133 i->second.acting[j],
3134 ec_pool ? shard_id_t(j) : shard_id_t::NO_SHARD));
3135 }
3136 f(i->second.first, actingset);
3137 }
3138 }
3139
3140 bool has_full_intervals() const override { return true; }
3141 void iterate_all_intervals(
3142 std::function<void(const PastIntervals::pg_interval_t &)> &&f
3143 ) const override {
3144 for (auto &&i: interval_map) {
3145 f(i.second);
3146 }
3147 }
3148 virtual ~pi_simple_rep() override {}
3149 };
3150
/**
 * pi_compact_rep
 *
 * PastIntervals only needs to be able to answer two questions:
 * 1) Where should the primary look for unfound objects?
 * 2) List a set of subsets of the OSDs such that contacting at least
 *    one from each subset guarantees we speak to at least one witness
 *    of any completed write.
 *
 * Crucially, 2) does not require keeping *all* past intervals. Certainly,
 * we don't need to keep any where maybe_went_rw would be false. We also
 * needn't keep two intervals where the actingset in one is a subset
 * of the other (only need to keep the smaller of the two sets). In order
 * to accurately trim the set of intervals as last_epoch_started changes
 * without rebuilding the set from scratch, we'll retain the larger set
 * if it is in an older interval.
 */
3168 struct compact_interval_t {
3169 epoch_t first;
3170 epoch_t last;
3171 set<pg_shard_t> acting;
3172 bool supersedes(const compact_interval_t &other) {
3173 for (auto &&i: acting) {
3174 if (!other.acting.count(i))
3175 return false;
3176 }
3177 return true;
3178 }
3179 void dump(Formatter *f) const {
3180 f->open_object_section("compact_interval_t");
3181 f->dump_stream("first") << first;
3182 f->dump_stream("last") << last;
3183 f->dump_stream("acting") << acting;
3184 f->close_section();
3185 }
3186 void encode(bufferlist &bl) const {
3187 ENCODE_START(1, 1, bl);
3188 ::encode(first, bl);
3189 ::encode(last, bl);
3190 ::encode(acting, bl);
3191 ENCODE_FINISH(bl);
3192 }
3193 void decode(bufferlist::iterator &bl) {
3194 DECODE_START(1, bl);
3195 ::decode(first, bl);
3196 ::decode(last, bl);
3197 ::decode(acting, bl);
3198 DECODE_FINISH(bl);
3199 }
3200 static void generate_test_instances(list<compact_interval_t*> & o) {
3201 /* Not going to be used, we'll generate pi_compact_rep directly */
3202 }
3203 };
3204 ostream &operator<<(ostream &o, const compact_interval_t &rhs)
3205 {
3206 return o << "([" << rhs.first << "," << rhs.last
3207 << "] acting " << rhs.acting << ")";
3208 }
3209 WRITE_CLASS_ENCODER(compact_interval_t)
3210
3211 class pi_compact_rep : public PastIntervals::interval_rep {
3212 epoch_t first = 0;
3213 epoch_t last = 0; // inclusive
3214 set<pg_shard_t> all_participants;
3215 list<compact_interval_t> intervals;
3216 pi_compact_rep(
3217 bool ec_pool,
3218 std::list<PastIntervals::pg_interval_t> &&intervals) {
3219 for (auto &&i: intervals)
3220 add_interval(ec_pool, i);
3221 }
3222 public:
3223 pi_compact_rep() = default;
3224 pi_compact_rep(const pi_compact_rep &) = default;
3225 pi_compact_rep(pi_compact_rep &&) = default;
3226 pi_compact_rep &operator=(const pi_compact_rep &) = default;
3227 pi_compact_rep &operator=(pi_compact_rep &&) = default;
3228
3229 size_t size() const override { return intervals.size(); }
3230 bool empty() const override {
3231 return first > last || (first == 0 && last == 0);
3232 }
3233 void clear() override {
3234 *this = pi_compact_rep();
3235 }
3236 pair<epoch_t, epoch_t> get_bounds() const override {
3237 return make_pair(first, last + 1);
3238 }
3239 set<pg_shard_t> get_all_participants(
3240 bool ec_pool) const override {
3241 return all_participants;
3242 }
3243 void add_interval(
3244 bool ec_pool, const PastIntervals::pg_interval_t &interval) override {
3245 if (first == 0)
3246 first = interval.first;
3247 assert(interval.last > last);
3248 last = interval.last;
3249 set<pg_shard_t> acting;
3250 for (unsigned i = 0; i < interval.acting.size(); ++i) {
3251 if (interval.acting[i] == CRUSH_ITEM_NONE)
3252 continue;
3253 acting.insert(
3254 pg_shard_t(
3255 interval.acting[i],
3256 ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
3257 }
3258 all_participants.insert(acting.begin(), acting.end());
3259 if (!interval.maybe_went_rw)
3260 return;
3261 intervals.push_back(
3262 compact_interval_t{interval.first, interval.last, acting});
3263 auto plast = intervals.end();
3264 --plast;
3265 for (auto cur = intervals.begin(); cur != plast; ) {
3266 if (plast->supersedes(*cur)) {
3267 intervals.erase(cur++);
3268 } else {
3269 ++cur;
3270 }
3271 }
3272 }
3273 unique_ptr<PastIntervals::interval_rep> clone() const override {
3274 return unique_ptr<PastIntervals::interval_rep>(new pi_compact_rep(*this));
3275 }
3276 ostream &print(ostream &out) const override {
3277 return out << "([" << first << "," << last
3278 << "] intervals=" << intervals << ")";
3279 }
3280 void encode(bufferlist &bl) const override {
3281 ENCODE_START(1, 1, bl);
3282 ::encode(first, bl);
3283 ::encode(last, bl);
3284 ::encode(all_participants, bl);
3285 ::encode(intervals, bl);
3286 ENCODE_FINISH(bl);
3287 }
3288 void decode(bufferlist::iterator &bl) override {
3289 DECODE_START(1, bl);
3290 ::decode(first, bl);
3291 ::decode(last, bl);
3292 ::decode(all_participants, bl);
3293 ::decode(intervals, bl);
3294 DECODE_FINISH(bl);
3295 }
3296 void dump(Formatter *f) const override {
3297 f->open_object_section("PastIntervals::compact_rep");
3298 f->dump_stream("first") << first;
3299 f->dump_stream("last") << last;
3300 f->open_array_section("all_participants");
3301 for (auto& i : all_participants) {
3302 f->dump_object("pg_shard", i);
3303 }
3304 f->close_section();
3305 f->open_array_section("intervals");
3306 for (auto &&i: intervals) {
3307 i.dump(f);
3308 }
3309 f->close_section();
3310 f->close_section();
3311 }
3312 bool is_classic() const override {
3313 return false;
3314 }
3315 static void generate_test_instances(list<pi_compact_rep*> &o) {
3316 using ival = PastIntervals::pg_interval_t;
3317 using ivallst = std::list<ival>;
3318 o.push_back(
3319 new pi_compact_rep(
3320 true, ivallst
3321 { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
3322 , ival{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1}
3323 , ival{{ 2}, { 2}, 31, 35, false, 2, 2}
3324 , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
3325 }));
3326 o.push_back(
3327 new pi_compact_rep(
3328 false, ivallst
3329 { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
3330 , ival{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1}
3331 , ival{{ 2}, { 2}, 31, 35, false, 2, 2}
3332 , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
3333 }));
3334 o.push_back(
3335 new pi_compact_rep(
3336 true, ivallst
3337 { ival{{2, 1, 0}, {2, 1, 0}, 10, 20, true, 1, 1}
3338 , ival{{ 0, 2}, { 0, 2}, 21, 30, true, 0, 0}
3339 , ival{{ 0, 2}, {2, 0}, 31, 35, true, 2, 2}
3340 , ival{{ 0, 2}, { 0, 2}, 36, 50, true, 0, 0}
3341 }));
3342 }
3343 void iterate_mayberw_back_to(
3344 bool ec_pool,
3345 epoch_t les,
3346 std::function<void(epoch_t, const set<pg_shard_t> &)> &&f) const override {
3347 for (auto i = intervals.rbegin(); i != intervals.rend(); ++i) {
3348 if (i->last < les)
3349 break;
3350 f(i->first, i->acting);
3351 }
3352 }
3353 virtual ~pi_compact_rep() override {}
3354 };
3355 WRITE_CLASS_ENCODER(pi_compact_rep)
3356
3357 PastIntervals::PastIntervals(const PastIntervals &rhs)
3358 : past_intervals(rhs.past_intervals ?
3359 rhs.past_intervals->clone() :
3360 nullptr) {}
3361
3362 PastIntervals &PastIntervals::operator=(const PastIntervals &rhs)
3363 {
3364 PastIntervals other(rhs);
3365 swap(other);
3366 return *this;
3367 }
3368
3369 ostream& operator<<(ostream& out, const PastIntervals &i)
3370 {
3371 if (i.past_intervals) {
3372 return i.past_intervals->print(out);
3373 } else {
3374 return out << "(empty)";
3375 }
3376 }
3377
3378 ostream& operator<<(ostream& out, const PastIntervals::PriorSet &i)
3379 {
3380 return out << "PriorSet("
3381 << "ec_pool: " << i.ec_pool
3382 << ", probe: " << i.probe
3383 << ", down: " << i.down
3384 << ", blocked_by: " << i.blocked_by
3385 << ", pg_down: " << i.pg_down
3386 << ")";
3387 }
3388
3389 void PastIntervals::decode(bufferlist::iterator &bl)
3390 {
3391 DECODE_START(1, bl);
3392 __u8 type = 0;
3393 ::decode(type, bl);
3394 switch (type) {
3395 case 0:
3396 break;
3397 case 1:
3398 past_intervals.reset(new pi_simple_rep);
3399 past_intervals->decode(bl);
3400 break;
3401 case 2:
3402 past_intervals.reset(new pi_compact_rep);
3403 past_intervals->decode(bl);
3404 break;
3405 }
3406 DECODE_FINISH(bl);
3407 }
3408
3409 void PastIntervals::decode_classic(bufferlist::iterator &bl)
3410 {
3411 past_intervals.reset(new pi_simple_rep);
3412 past_intervals->decode(bl);
3413 }
3414
3415 void PastIntervals::generate_test_instances(list<PastIntervals*> &o)
3416 {
3417 {
3418 list<pi_simple_rep *> simple;
3419 pi_simple_rep::generate_test_instances(simple);
3420 for (auto &&i: simple) {
3421 // takes ownership of contents
3422 o.push_back(new PastIntervals(i));
3423 }
3424 }
3425 {
3426 list<pi_compact_rep *> compact;
3427 pi_compact_rep::generate_test_instances(compact);
3428 for (auto &&i: compact) {
3429 // takes ownership of contents
3430 o.push_back(new PastIntervals(i));
3431 }
3432 }
3433 return;
3434 }
3435
3436 void PastIntervals::update_type(bool ec_pool, bool compact)
3437 {
3438 if (!compact) {
3439 if (!past_intervals) {
3440 past_intervals.reset(new pi_simple_rep);
3441 } else {
3442 // we never convert from compact back to classic
3443 assert(is_classic());
3444 }
3445 } else {
3446 if (!past_intervals) {
3447 past_intervals.reset(new pi_compact_rep);
3448 } else if (is_classic()) {
3449 auto old = std::move(past_intervals);
3450 past_intervals.reset(new pi_compact_rep);
3451 assert(old->has_full_intervals());
3452 old->iterate_all_intervals([&](const pg_interval_t &i) {
3453 past_intervals->add_interval(ec_pool, i);
3454 });
3455 }
3456 }
3457 }
3458
3459 void PastIntervals::update_type_from_map(bool ec_pool, const OSDMap &osdmap)
3460 {
3461 update_type(ec_pool, osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS);
3462 }
3463
3464 bool PastIntervals::is_new_interval(
3465 int old_acting_primary,
3466 int new_acting_primary,
3467 const vector<int> &old_acting,
3468 const vector<int> &new_acting,
3469 int old_up_primary,
3470 int new_up_primary,
3471 const vector<int> &old_up,
3472 const vector<int> &new_up,
3473 int old_size,
3474 int new_size,
3475 int old_min_size,
3476 int new_min_size,
3477 unsigned old_pg_num,
3478 unsigned new_pg_num,
3479 bool old_sort_bitwise,
3480 bool new_sort_bitwise,
3481 bool old_recovery_deletes,
3482 bool new_recovery_deletes,
3483 pg_t pgid) {
3484 return old_acting_primary != new_acting_primary ||
3485 new_acting != old_acting ||
3486 old_up_primary != new_up_primary ||
3487 new_up != old_up ||
3488 old_min_size != new_min_size ||
3489 old_size != new_size ||
3490 pgid.is_split(old_pg_num, new_pg_num, 0) ||
3491 old_sort_bitwise != new_sort_bitwise ||
3492 old_recovery_deletes != new_recovery_deletes;
3493 }
3494
3495 bool PastIntervals::is_new_interval(
3496 int old_acting_primary,
3497 int new_acting_primary,
3498 const vector<int> &old_acting,
3499 const vector<int> &new_acting,
3500 int old_up_primary,
3501 int new_up_primary,
3502 const vector<int> &old_up,
3503 const vector<int> &new_up,
3504 OSDMapRef osdmap,
3505 OSDMapRef lastmap,
3506 pg_t pgid) {
3507 return !(lastmap->get_pools().count(pgid.pool())) ||
3508 is_new_interval(old_acting_primary,
3509 new_acting_primary,
3510 old_acting,
3511 new_acting,
3512 old_up_primary,
3513 new_up_primary,
3514 old_up,
3515 new_up,
3516 lastmap->get_pools().find(pgid.pool())->second.size,
3517 osdmap->get_pools().find(pgid.pool())->second.size,
3518 lastmap->get_pools().find(pgid.pool())->second.min_size,
3519 osdmap->get_pools().find(pgid.pool())->second.min_size,
3520 lastmap->get_pg_num(pgid.pool()),
3521 osdmap->get_pg_num(pgid.pool()),
3522 lastmap->test_flag(CEPH_OSDMAP_SORTBITWISE),
3523 osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE),
3524 lastmap->test_flag(CEPH_OSDMAP_RECOVERY_DELETES),
3525 osdmap->test_flag(CEPH_OSDMAP_RECOVERY_DELETES),
3526 pgid);
3527 }
3528
3529 bool PastIntervals::check_new_interval(
3530 int old_acting_primary,
3531 int new_acting_primary,
3532 const vector<int> &old_acting,
3533 const vector<int> &new_acting,
3534 int old_up_primary,
3535 int new_up_primary,
3536 const vector<int> &old_up,
3537 const vector<int> &new_up,
3538 epoch_t same_interval_since,
3539 epoch_t last_epoch_clean,
3540 OSDMapRef osdmap,
3541 OSDMapRef lastmap,
3542 pg_t pgid,
3543 IsPGRecoverablePredicate *could_have_gone_active,
3544 PastIntervals *past_intervals,
3545 std::ostream *out)
3546 {
3547 /*
3548 * We have to be careful to gracefully deal with situations like
3549 * so. Say we have a power outage or something that takes out both
3550 * OSDs, but the monitor doesn't mark them down in the same epoch.
3551 * The history may look like
3552 *
3553 * 1: A B
3554 * 2: B
3555 * 3: let's say B dies for good, too (say, from the power spike)
3556 * 4: A
3557 *
3558 * which makes it look like B may have applied updates to the PG
3559 * that we need in order to proceed. This sucks...
3560 *
3561 * To minimize the risk of this happening, we CANNOT go active if
3562 * _any_ OSDs in the prior set are down until we send an MOSDAlive
3563 * to the monitor such that the OSDMap sets osd_up_thru to an epoch.
3564 * Then, we have something like
3565 *
3566 * 1: A B
3567 * 2: B up_thru[B]=0
3568 * 3:
3569 * 4: A
3570 *
3571 * -> we can ignore B, bc it couldn't have gone active (up_thru still 0).
3572 *
3573 * or,
3574 *
3575 * 1: A B
3576 * 2: B up_thru[B]=0
3577 * 3: B up_thru[B]=2
3578 * 4:
3579 * 5: A
3580 *
3581 * -> we must wait for B, bc it was alive through 2, and could have
3582 * written to the pg.
3583 *
3584 * If B is really dead, then an administrator will need to manually
3585 * intervene by marking the OSD as "lost."
3586 */
3587
3588 // remember past interval
3589 // NOTE: a change in the up set primary triggers an interval
3590 // change, even though the interval members in the pg_interval_t
3591 // do not change.
3592 assert(past_intervals);
3593 assert(past_intervals->past_intervals);
3594 if (is_new_interval(
3595 old_acting_primary,
3596 new_acting_primary,
3597 old_acting,
3598 new_acting,
3599 old_up_primary,
3600 new_up_primary,
3601 old_up,
3602 new_up,
3603 osdmap,
3604 lastmap,
3605 pgid)) {
3606 pg_interval_t i;
3607 i.first = same_interval_since;
3608 i.last = osdmap->get_epoch() - 1;
3609 assert(i.first <= i.last);
3610 i.acting = old_acting;
3611 i.up = old_up;
3612 i.primary = old_acting_primary;
3613 i.up_primary = old_up_primary;
3614
3615 unsigned num_acting = 0;
3616 for (vector<int>::const_iterator p = i.acting.begin(); p != i.acting.end();
3617 ++p)
3618 if (*p != CRUSH_ITEM_NONE)
3619 ++num_acting;
3620
3621 assert(lastmap->get_pools().count(pgid.pool()));
3622 const pg_pool_t& old_pg_pool = lastmap->get_pools().find(pgid.pool())->second;
3623 set<pg_shard_t> old_acting_shards;
3624 old_pg_pool.convert_to_pg_shards(old_acting, &old_acting_shards);
3625
3626 if (num_acting &&
3627 i.primary != -1 &&
3628 num_acting >= old_pg_pool.min_size &&
3629 (*could_have_gone_active)(old_acting_shards)) {
3630 if (out)
3631 *out << __func__ << " " << i
3632 << ": not rw,"
3633 << " up_thru " << lastmap->get_up_thru(i.primary)
3634 << " up_from " << lastmap->get_up_from(i.primary)
3635 << " last_epoch_clean " << last_epoch_clean
3636 << std::endl;
3637 if (lastmap->get_up_thru(i.primary) >= i.first &&
3638 lastmap->get_up_from(i.primary) <= i.first) {
3639 i.maybe_went_rw = true;
3640 if (out)
3641 *out << __func__ << " " << i
3642 << " : primary up " << lastmap->get_up_from(i.primary)
3643 << "-" << lastmap->get_up_thru(i.primary)
3644 << " includes interval"
3645 << std::endl;
3646 } else if (last_epoch_clean >= i.first &&
3647 last_epoch_clean <= i.last) {
3648 // If the last_epoch_clean is included in this interval, then
3649 // the pg must have been rw (for recovery to have completed).
3650 // This is important because we won't know the _real_
3651 // first_epoch because we stop at last_epoch_clean, and we
3652 // don't want the oldest interval to randomly have
3653 // maybe_went_rw false depending on the relative up_thru vs
3654 // last_epoch_clean timing.
3655 i.maybe_went_rw = true;
3656 if (out)
3657 *out << __func__ << " " << i
3658 << " : includes last_epoch_clean " << last_epoch_clean
3659 << " and presumed to have been rw"
3660 << std::endl;
3661 } else {
3662 i.maybe_went_rw = false;
3663 if (out)
3664 *out << __func__ << " " << i
3665 << " : primary up " << lastmap->get_up_from(i.primary)
3666 << "-" << lastmap->get_up_thru(i.primary)
3667 << " does not include interval"
3668 << std::endl;
3669 }
3670 } else {
3671 i.maybe_went_rw = false;
3672 if (out)
3673 *out << __func__ << " " << i << " : acting set is too small" << std::endl;
3674 }
3675 past_intervals->past_intervals->add_interval(old_pg_pool.ec_pool(), i);
3676 return true;
3677 } else {
3678 return false;
3679 }
3680 }
3681
3682
3683 // true if the given map affects the prior set
3684 bool PastIntervals::PriorSet::affected_by_map(
3685 const OSDMap &osdmap,
3686 const DoutPrefixProvider *dpp) const
3687 {
3688 for (set<pg_shard_t>::iterator p = probe.begin();
3689 p != probe.end();
3690 ++p) {
3691 int o = p->osd;
3692
3693 // did someone in the prior set go down?
3694 if (osdmap.is_down(o) && down.count(o) == 0) {
3695 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " now down" << dendl;
3696 return true;
3697 }
3698
3699 // did a down osd in cur get (re)marked as lost?
3700 map<int, epoch_t>::const_iterator r = blocked_by.find(o);
3701 if (r != blocked_by.end()) {
3702 if (!osdmap.exists(o)) {
3703 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " no longer exists" << dendl;
3704 return true;
3705 }
3706 if (osdmap.get_info(o).lost_at != r->second) {
3707 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " (re)marked as lost" << dendl;
3708 return true;
3709 }
3710 }
3711 }
3712
3713 // did someone in the prior down set go up?
3714 for (set<int>::const_iterator p = down.begin();
3715 p != down.end();
3716 ++p) {
3717 int o = *p;
3718
3719 if (osdmap.is_up(o)) {
3720 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " now up" << dendl;
3721 return true;
3722 }
3723
3724 // did someone in the prior set get lost or destroyed?
3725 if (!osdmap.exists(o)) {
3726 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " no longer exists" << dendl;
3727 return true;
3728 }
3729 // did a down osd in down get (re)marked as lost?
3730 map<int, epoch_t>::const_iterator r = blocked_by.find(o);
3731 if (r != blocked_by.end()) {
3732 if (osdmap.get_info(o).lost_at != r->second) {
3733 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " (re)marked as lost" << dendl;
3734 return true;
3735 }
3736 }
3737 }
3738
3739 return false;
3740 }
3741
3742 ostream& operator<<(ostream& out, const PastIntervals::pg_interval_t& i)
3743 {
3744 out << "interval(" << i.first << "-" << i.last
3745 << " up " << i.up << "(" << i.up_primary << ")"
3746 << " acting " << i.acting << "(" << i.primary << ")";
3747 if (i.maybe_went_rw)
3748 out << " maybe_went_rw";
3749 out << ")";
3750 return out;
3751 }
3752
3753
3754
3755 // -- pg_query_t --
3756
3757 void pg_query_t::encode(bufferlist &bl, uint64_t features) const {
3758 ENCODE_START(3, 3, bl);
3759 ::encode(type, bl);
3760 ::encode(since, bl);
3761 history.encode(bl);
3762 ::encode(epoch_sent, bl);
3763 ::encode(to, bl);
3764 ::encode(from, bl);
3765 ENCODE_FINISH(bl);
3766 }
3767
3768 void pg_query_t::decode(bufferlist::iterator &bl) {
3769 DECODE_START(3, bl);
3770 ::decode(type, bl);
3771 ::decode(since, bl);
3772 history.decode(bl);
3773 ::decode(epoch_sent, bl);
3774 ::decode(to, bl);
3775 ::decode(from, bl);
3776 DECODE_FINISH(bl);
3777 }
3778
3779 void pg_query_t::dump(Formatter *f) const
3780 {
3781 f->dump_int("from", from);
3782 f->dump_int("to", to);
3783 f->dump_string("type", get_type_name());
3784 f->dump_stream("since") << since;
3785 f->dump_stream("epoch_sent") << epoch_sent;
3786 f->open_object_section("history");
3787 history.dump(f);
3788 f->close_section();
3789 }
3790 void pg_query_t::generate_test_instances(list<pg_query_t*>& o)
3791 {
3792 o.push_back(new pg_query_t());
3793 list<pg_history_t*> h;
3794 pg_history_t::generate_test_instances(h);
3795 o.push_back(new pg_query_t(pg_query_t::INFO, shard_id_t(1), shard_id_t(2), *h.back(), 4));
3796 o.push_back(new pg_query_t(pg_query_t::MISSING, shard_id_t(2), shard_id_t(3), *h.back(), 4));
3797 o.push_back(new pg_query_t(pg_query_t::LOG, shard_id_t(0), shard_id_t(0),
3798 eversion_t(4, 5), *h.back(), 4));
3799 o.push_back(new pg_query_t(pg_query_t::FULLLOG,
3800 shard_id_t::NO_SHARD, shard_id_t::NO_SHARD,
3801 *h.back(), 5));
3802 }
3803
3804 // -- ObjectModDesc --
3805 void ObjectModDesc::visit(Visitor *visitor) const
3806 {
3807 bufferlist::iterator bp = bl.begin();
3808 try {
3809 while (!bp.end()) {
3810 DECODE_START(max_required_version, bp);
3811 uint8_t code;
3812 ::decode(code, bp);
3813 switch (code) {
3814 case APPEND: {
3815 uint64_t size;
3816 ::decode(size, bp);
3817 visitor->append(size);
3818 break;
3819 }
3820 case SETATTRS: {
3821 map<string, boost::optional<bufferlist> > attrs;
3822 ::decode(attrs, bp);
3823 visitor->setattrs(attrs);
3824 break;
3825 }
3826 case DELETE: {
3827 version_t old_version;
3828 ::decode(old_version, bp);
3829 visitor->rmobject(old_version);
3830 break;
3831 }
3832 case CREATE: {
3833 visitor->create();
3834 break;
3835 }
3836 case UPDATE_SNAPS: {
3837 set<snapid_t> snaps;
3838 ::decode(snaps, bp);
3839 visitor->update_snaps(snaps);
3840 break;
3841 }
3842 case TRY_DELETE: {
3843 version_t old_version;
3844 ::decode(old_version, bp);
3845 visitor->try_rmobject(old_version);
3846 break;
3847 }
3848 case ROLLBACK_EXTENTS: {
3849 vector<pair<uint64_t, uint64_t> > extents;
3850 version_t gen;
3851 ::decode(gen, bp);
3852 ::decode(extents, bp);
3853 visitor->rollback_extents(gen,extents);
3854 break;
3855 }
3856 default:
3857 assert(0 == "Invalid rollback code");
3858 }
3859 DECODE_FINISH(bp);
3860 }
3861 } catch (...) {
3862 assert(0 == "Invalid encoding");
3863 }
3864 }
3865
3866 struct DumpVisitor : public ObjectModDesc::Visitor {
3867 Formatter *f;
3868 explicit DumpVisitor(Formatter *f) : f(f) {}
3869 void append(uint64_t old_size) override {
3870 f->open_object_section("op");
3871 f->dump_string("code", "APPEND");
3872 f->dump_unsigned("old_size", old_size);
3873 f->close_section();
3874 }
3875 void setattrs(map<string, boost::optional<bufferlist> > &attrs) override {
3876 f->open_object_section("op");
3877 f->dump_string("code", "SETATTRS");
3878 f->open_array_section("attrs");
3879 for (map<string, boost::optional<bufferlist> >::iterator i = attrs.begin();
3880 i != attrs.end();
3881 ++i) {
3882 f->dump_string("attr_name", i->first);
3883 }
3884 f->close_section();
3885 f->close_section();
3886 }
3887 void rmobject(version_t old_version) override {
3888 f->open_object_section("op");
3889 f->dump_string("code", "RMOBJECT");
3890 f->dump_unsigned("old_version", old_version);
3891 f->close_section();
3892 }
3893 void try_rmobject(version_t old_version) override {
3894 f->open_object_section("op");
3895 f->dump_string("code", "TRY_RMOBJECT");
3896 f->dump_unsigned("old_version", old_version);
3897 f->close_section();
3898 }
3899 void create() override {
3900 f->open_object_section("op");
3901 f->dump_string("code", "CREATE");
3902 f->close_section();
3903 }
3904 void update_snaps(const set<snapid_t> &snaps) override {
3905 f->open_object_section("op");
3906 f->dump_string("code", "UPDATE_SNAPS");
3907 f->dump_stream("snaps") << snaps;
3908 f->close_section();
3909 }
3910 void rollback_extents(
3911 version_t gen,
3912 const vector<pair<uint64_t, uint64_t> > &extents) override {
3913 f->open_object_section("op");
3914 f->dump_string("code", "ROLLBACK_EXTENTS");
3915 f->dump_unsigned("gen", gen);
3916 f->dump_stream("snaps") << extents;
3917 f->close_section();
3918 }
3919 };
3920
3921 void ObjectModDesc::dump(Formatter *f) const
3922 {
3923 f->open_object_section("object_mod_desc");
3924 f->dump_bool("can_local_rollback", can_local_rollback);
3925 f->dump_bool("rollback_info_completed", rollback_info_completed);
3926 {
3927 f->open_array_section("ops");
3928 DumpVisitor vis(f);
3929 visit(&vis);
3930 f->close_section();
3931 }
3932 f->close_section();
3933 }
3934
3935 void ObjectModDesc::generate_test_instances(list<ObjectModDesc*>& o)
3936 {
3937 map<string, boost::optional<bufferlist> > attrs;
3938 attrs[OI_ATTR];
3939 attrs[SS_ATTR];
3940 attrs["asdf"];
3941 o.push_back(new ObjectModDesc());
3942 o.back()->append(100);
3943 o.back()->setattrs(attrs);
3944 o.push_back(new ObjectModDesc());
3945 o.back()->rmobject(1001);
3946 o.push_back(new ObjectModDesc());
3947 o.back()->create();
3948 o.back()->setattrs(attrs);
3949 o.push_back(new ObjectModDesc());
3950 o.back()->create();
3951 o.back()->setattrs(attrs);
3952 o.back()->mark_unrollbackable();
3953 o.back()->append(1000);
3954 }
3955
3956 void ObjectModDesc::encode(bufferlist &_bl) const
3957 {
3958 ENCODE_START(max_required_version, max_required_version, _bl);
3959 ::encode(can_local_rollback, _bl);
3960 ::encode(rollback_info_completed, _bl);
3961 ::encode(bl, _bl);
3962 ENCODE_FINISH(_bl);
3963 }
3964 void ObjectModDesc::decode(bufferlist::iterator &_bl)
3965 {
3966 DECODE_START(2, _bl);
3967 max_required_version = struct_v;
3968 ::decode(can_local_rollback, _bl);
3969 ::decode(rollback_info_completed, _bl);
3970 ::decode(bl, _bl);
3971 // ensure bl does not pin a larger buffer in memory
3972 bl.rebuild();
3973 bl.reassign_to_mempool(mempool::mempool_osd_pglog);
3974 DECODE_FINISH(_bl);
3975 }
3976
3977 // -- pg_log_entry_t --
3978
3979 string pg_log_entry_t::get_key_name() const
3980 {
3981 return version.get_key_name();
3982 }
3983
3984 void pg_log_entry_t::encode_with_checksum(bufferlist& bl) const
3985 {
3986 bufferlist ebl(sizeof(*this)*2);
3987 encode(ebl);
3988 __u32 crc = ebl.crc32c(0);
3989 ::encode(ebl, bl);
3990 ::encode(crc, bl);
3991 }
3992
3993 void pg_log_entry_t::decode_with_checksum(bufferlist::iterator& p)
3994 {
3995 bufferlist bl;
3996 ::decode(bl, p);
3997 __u32 crc;
3998 ::decode(crc, p);
3999 if (crc != bl.crc32c(0))
4000 throw buffer::malformed_input("bad checksum on pg_log_entry_t");
4001 bufferlist::iterator q = bl.begin();
4002 decode(q);
4003 }
4004
4005 void pg_log_entry_t::encode(bufferlist &bl) const
4006 {
4007 ENCODE_START(11, 4, bl);
4008 ::encode(op, bl);
4009 ::encode(soid, bl);
4010 ::encode(version, bl);
4011
4012 /**
4013 * Added with reverting_to:
4014 * Previous code used prior_version to encode
4015 * what we now call reverting_to. This will
4016 * allow older code to decode reverting_to
4017 * into prior_version as expected.
4018 */
4019 if (op == LOST_REVERT)
4020 ::encode(reverting_to, bl);
4021 else
4022 ::encode(prior_version, bl);
4023
4024 ::encode(reqid, bl);
4025 ::encode(mtime, bl);
4026 if (op == LOST_REVERT)
4027 ::encode(prior_version, bl);
4028 ::encode(snaps, bl);
4029 ::encode(user_version, bl);
4030 ::encode(mod_desc, bl);
4031 ::encode(extra_reqids, bl);
4032 if (op == ERROR)
4033 ::encode(return_code, bl);
4034 ENCODE_FINISH(bl);
4035 }
4036
4037 void pg_log_entry_t::decode(bufferlist::iterator &bl)
4038 {
4039 DECODE_START_LEGACY_COMPAT_LEN(11, 4, 4, bl);
4040 ::decode(op, bl);
4041 if (struct_v < 2) {
4042 sobject_t old_soid;
4043 ::decode(old_soid, bl);
4044 soid.oid = old_soid.oid;
4045 soid.snap = old_soid.snap;
4046 invalid_hash = true;
4047 } else {
4048 ::decode(soid, bl);
4049 }
4050 if (struct_v < 3)
4051 invalid_hash = true;
4052 ::decode(version, bl);
4053
4054 if (struct_v >= 6 && op == LOST_REVERT)
4055 ::decode(reverting_to, bl);
4056 else
4057 ::decode(prior_version, bl);
4058
4059 ::decode(reqid, bl);
4060
4061 ::decode(mtime, bl);
4062 if (struct_v < 5)
4063 invalid_pool = true;
4064
4065 if (op == LOST_REVERT) {
4066 if (struct_v >= 6) {
4067 ::decode(prior_version, bl);
4068 } else {
4069 reverting_to = prior_version;
4070 }
4071 }
4072 if (struct_v >= 7 || // for v >= 7, this is for all ops.
4073 op == CLONE) { // for v < 7, it's only present for CLONE.
4074 ::decode(snaps, bl);
4075 // ensure snaps does not pin a larger buffer in memory
4076 snaps.rebuild();
4077 snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
4078 }
4079
4080 if (struct_v >= 8)
4081 ::decode(user_version, bl);
4082 else
4083 user_version = version.version;
4084
4085 if (struct_v >= 9)
4086 ::decode(mod_desc, bl);
4087 else
4088 mod_desc.mark_unrollbackable();
4089 if (struct_v >= 10)
4090 ::decode(extra_reqids, bl);
4091 if (struct_v >= 11 && op == ERROR)
4092 ::decode(return_code, bl);
4093 DECODE_FINISH(bl);
4094 }
4095
4096 void pg_log_entry_t::dump(Formatter *f) const
4097 {
4098 f->dump_string("op", get_op_name());
4099 f->dump_stream("object") << soid;
4100 f->dump_stream("version") << version;
4101 f->dump_stream("prior_version") << prior_version;
4102 f->dump_stream("reqid") << reqid;
4103 f->open_array_section("extra_reqids");
4104 for (auto p = extra_reqids.begin();
4105 p != extra_reqids.end();
4106 ++p) {
4107 f->open_object_section("extra_reqid");
4108 f->dump_stream("reqid") << p->first;
4109 f->dump_stream("user_version") << p->second;
4110 f->close_section();
4111 }
4112 f->close_section();
4113 f->dump_stream("mtime") << mtime;
4114 f->dump_int("return_code", return_code);
4115 if (snaps.length() > 0) {
4116 vector<snapid_t> v;
4117 bufferlist c = snaps;
4118 bufferlist::iterator p = c.begin();
4119 try {
4120 ::decode(v, p);
4121 } catch (...) {
4122 v.clear();
4123 }
4124 f->open_object_section("snaps");
4125 for (vector<snapid_t>::iterator p = v.begin(); p != v.end(); ++p)
4126 f->dump_unsigned("snap", *p);
4127 f->close_section();
4128 }
4129 {
4130 f->open_object_section("mod_desc");
4131 mod_desc.dump(f);
4132 f->close_section();
4133 }
4134 }
4135
4136 void pg_log_entry_t::generate_test_instances(list<pg_log_entry_t*>& o)
4137 {
4138 o.push_back(new pg_log_entry_t());
4139 hobject_t oid(object_t("objname"), "key", 123, 456, 0, "");
4140 o.push_back(new pg_log_entry_t(MODIFY, oid, eversion_t(1,2), eversion_t(3,4),
4141 1, osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
4142 utime_t(8,9), 0));
4143 o.push_back(new pg_log_entry_t(ERROR, oid, eversion_t(1,2), eversion_t(3,4),
4144 1, osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
4145 utime_t(8,9), -ENOENT));
4146 }
4147
4148 ostream& operator<<(ostream& out, const pg_log_entry_t& e)
4149 {
4150 out << e.version << " (" << e.prior_version << ") "
4151 << std::left << std::setw(8) << e.get_op_name() << ' '
4152 << e.soid << " by " << e.reqid << " " << e.mtime
4153 << " " << e.return_code;
4154 if (e.snaps.length()) {
4155 vector<snapid_t> snaps;
4156 bufferlist c = e.snaps;
4157 bufferlist::iterator p = c.begin();
4158 try {
4159 ::decode(snaps, p);
4160 } catch (...) {
4161 snaps.clear();
4162 }
4163 out << " snaps " << snaps;
4164 }
4165 return out;
4166 }
4167
4168 // -- pg_log_dup_t --
4169
4170 string pg_log_dup_t::get_key_name() const
4171 {
4172 return "dup_" + version.get_key_name();
4173 }
4174
4175 void pg_log_dup_t::encode(bufferlist &bl) const
4176 {
4177 ENCODE_START(1, 1, bl);
4178 ::encode(reqid, bl);
4179 ::encode(version, bl);
4180 ::encode(user_version, bl);
4181 ::encode(return_code, bl);
4182 ENCODE_FINISH(bl);
4183 }
4184
4185 void pg_log_dup_t::decode(bufferlist::iterator &bl)
4186 {
4187 DECODE_START(1, bl);
4188 ::decode(reqid, bl);
4189 ::decode(version, bl);
4190 ::decode(user_version, bl);
4191 ::decode(return_code, bl);
4192 DECODE_FINISH(bl);
4193 }
4194
4195 void pg_log_dup_t::dump(Formatter *f) const
4196 {
4197 f->dump_stream("reqid") << reqid;
4198 f->dump_stream("version") << version;
4199 f->dump_stream("user_version") << user_version;
4200 f->dump_stream("return_code") << return_code;
4201 }
4202
4203 void pg_log_dup_t::generate_test_instances(list<pg_log_dup_t*>& o)
4204 {
4205 o.push_back(new pg_log_dup_t());
4206 o.push_back(new pg_log_dup_t(eversion_t(1,2),
4207 1,
4208 osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
4209 0));
4210 o.push_back(new pg_log_dup_t(eversion_t(1,2),
4211 2,
4212 osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
4213 -ENOENT));
4214 }
4215
4216
4217 std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e) {
4218 return out << "log_dup(reqid=" << e.reqid <<
4219 " v=" << e.version << " uv=" << e.user_version <<
4220 " rc=" << e.return_code << ")";
4221 }
4222
4223
4224 // -- pg_log_t --
4225
4226 // out: pg_log_t that only has entries that apply to import_pgid using curmap
4227 // reject: Entries rejected from "in" are in the reject.log. Other fields not set.
4228 void pg_log_t::filter_log(spg_t import_pgid, const OSDMap &curmap,
4229 const string &hit_set_namespace, const pg_log_t &in,
4230 pg_log_t &out, pg_log_t &reject)
4231 {
4232 out = in;
4233 out.log.clear();
4234 reject.log.clear();
4235
4236 for (list<pg_log_entry_t>::const_iterator i = in.log.begin();
4237 i != in.log.end(); ++i) {
4238
4239 // Reject pg log entries for temporary objects
4240 if (i->soid.is_temp()) {
4241 reject.log.push_back(*i);
4242 continue;
4243 }
4244
4245 if (i->soid.nspace != hit_set_namespace) {
4246 object_t oid = i->soid.oid;
4247 object_locator_t loc(i->soid);
4248 pg_t raw_pgid = curmap.object_locator_to_pg(oid, loc);
4249 pg_t pgid = curmap.raw_pg_to_pg(raw_pgid);
4250
4251 if (import_pgid.pgid == pgid) {
4252 out.log.push_back(*i);
4253 } else {
4254 reject.log.push_back(*i);
4255 }
4256 } else {
4257 out.log.push_back(*i);
4258 }
4259 }
4260 }
4261
4262 void pg_log_t::encode(bufferlist& bl) const
4263 {
4264 ENCODE_START(7, 3, bl);
4265 ::encode(head, bl);
4266 ::encode(tail, bl);
4267 ::encode(log, bl);
4268 ::encode(can_rollback_to, bl);
4269 ::encode(rollback_info_trimmed_to, bl);
4270 ::encode(dups, bl);
4271 ENCODE_FINISH(bl);
4272 }
4273
4274 void pg_log_t::decode(bufferlist::iterator &bl, int64_t pool)
4275 {
4276 DECODE_START_LEGACY_COMPAT_LEN(7, 3, 3, bl);
4277 ::decode(head, bl);
4278 ::decode(tail, bl);
4279 if (struct_v < 2) {
4280 bool backlog;
4281 ::decode(backlog, bl);
4282 }
4283 ::decode(log, bl);
4284 if (struct_v >= 5)
4285 ::decode(can_rollback_to, bl);
4286
4287 if (struct_v >= 6)
4288 ::decode(rollback_info_trimmed_to, bl);
4289 else
4290 rollback_info_trimmed_to = tail;
4291
4292 if (struct_v >= 7)
4293 ::decode(dups, bl);
4294
4295 DECODE_FINISH(bl);
4296
4297 // handle hobject_t format change
4298 if (struct_v < 4) {
4299 for (list<pg_log_entry_t>::iterator i = log.begin();
4300 i != log.end();
4301 ++i) {
4302 if (!i->soid.is_max() && i->soid.pool == -1)
4303 i->soid.pool = pool;
4304 }
4305 }
4306 }
4307
4308 void pg_log_t::dump(Formatter *f) const
4309 {
4310 f->dump_stream("head") << head;
4311 f->dump_stream("tail") << tail;
4312 f->open_array_section("log");
4313 for (list<pg_log_entry_t>::const_iterator p = log.begin(); p != log.end(); ++p) {
4314 f->open_object_section("entry");
4315 p->dump(f);
4316 f->close_section();
4317 }
4318 f->close_section();
4319 f->open_array_section("dups");
4320 for (const auto& entry : dups) {
4321 f->open_object_section("entry");
4322 entry.dump(f);
4323 f->close_section();
4324 }
4325 f->close_section();
4326 }
4327
4328 void pg_log_t::generate_test_instances(list<pg_log_t*>& o)
4329 {
4330 o.push_back(new pg_log_t);
4331
4332 // this is nonsensical:
4333 o.push_back(new pg_log_t);
4334 o.back()->head = eversion_t(1,2);
4335 o.back()->tail = eversion_t(3,4);
4336 list<pg_log_entry_t*> e;
4337 pg_log_entry_t::generate_test_instances(e);
4338 for (list<pg_log_entry_t*>::iterator p = e.begin(); p != e.end(); ++p)
4339 o.back()->log.push_back(**p);
4340 }
4341
4342 void pg_log_t::copy_after(const pg_log_t &other, eversion_t v)
4343 {
4344 can_rollback_to = other.can_rollback_to;
4345 head = other.head;
4346 tail = other.tail;
4347 for (list<pg_log_entry_t>::const_reverse_iterator i = other.log.rbegin();
4348 i != other.log.rend();
4349 ++i) {
4350 assert(i->version > other.tail);
4351 if (i->version <= v) {
4352 // make tail accurate.
4353 tail = i->version;
4354 break;
4355 }
4356 log.push_front(*i);
4357 }
4358 }
4359
4360 void pg_log_t::copy_range(const pg_log_t &other, eversion_t from, eversion_t to)
4361 {
4362 can_rollback_to = other.can_rollback_to;
4363 list<pg_log_entry_t>::const_reverse_iterator i = other.log.rbegin();
4364 assert(i != other.log.rend());
4365 while (i->version > to) {
4366 ++i;
4367 assert(i != other.log.rend());
4368 }
4369 assert(i->version == to);
4370 head = to;
4371 for ( ; i != other.log.rend(); ++i) {
4372 if (i->version <= from) {
4373 tail = i->version;
4374 break;
4375 }
4376 log.push_front(*i);
4377 }
4378 }
4379
4380 void pg_log_t::copy_up_to(const pg_log_t &other, int max)
4381 {
4382 can_rollback_to = other.can_rollback_to;
4383 int n = 0;
4384 head = other.head;
4385 tail = other.tail;
4386 for (list<pg_log_entry_t>::const_reverse_iterator i = other.log.rbegin();
4387 i != other.log.rend();
4388 ++i) {
4389 if (n++ >= max) {
4390 tail = i->version;
4391 break;
4392 }
4393 log.push_front(*i);
4394 }
4395 }
4396
4397 ostream& pg_log_t::print(ostream& out) const
4398 {
4399 out << *this << std::endl;
4400 for (list<pg_log_entry_t>::const_iterator p = log.begin();
4401 p != log.end();
4402 ++p)
4403 out << *p << std::endl;
4404 for (const auto& entry : dups) {
4405 out << " dup entry: " << entry << std::endl;
4406 }
4407 return out;
4408 }
4409
4410 // -- pg_missing_t --
4411
4412 ostream& operator<<(ostream& out, const pg_missing_item& i)
4413 {
4414 out << i.need;
4415 if (i.have != eversion_t())
4416 out << "(" << i.have << ")";
4417 out << " flags = " << i.flag_str();
4418 return out;
4419 }
4420
4421 // -- object_copy_cursor_t --
4422
4423 void object_copy_cursor_t::encode(bufferlist& bl) const
4424 {
4425 ENCODE_START(1, 1, bl);
4426 ::encode(attr_complete, bl);
4427 ::encode(data_offset, bl);
4428 ::encode(data_complete, bl);
4429 ::encode(omap_offset, bl);
4430 ::encode(omap_complete, bl);
4431 ENCODE_FINISH(bl);
4432 }
4433
4434 void object_copy_cursor_t::decode(bufferlist::iterator &bl)
4435 {
4436 DECODE_START(1, bl);
4437 ::decode(attr_complete, bl);
4438 ::decode(data_offset, bl);
4439 ::decode(data_complete, bl);
4440 ::decode(omap_offset, bl);
4441 ::decode(omap_complete, bl);
4442 DECODE_FINISH(bl);
4443 }
4444
4445 void object_copy_cursor_t::dump(Formatter *f) const
4446 {
4447 f->dump_unsigned("attr_complete", (int)attr_complete);
4448 f->dump_unsigned("data_offset", data_offset);
4449 f->dump_unsigned("data_complete", (int)data_complete);
4450 f->dump_string("omap_offset", omap_offset);
4451 f->dump_unsigned("omap_complete", (int)omap_complete);
4452 }
4453
4454 void object_copy_cursor_t::generate_test_instances(list<object_copy_cursor_t*>& o)
4455 {
4456 o.push_back(new object_copy_cursor_t);
4457 o.push_back(new object_copy_cursor_t);
4458 o.back()->attr_complete = true;
4459 o.back()->data_offset = 123;
4460 o.push_back(new object_copy_cursor_t);
4461 o.back()->attr_complete = true;
4462 o.back()->data_complete = true;
4463 o.back()->omap_offset = "foo";
4464 o.push_back(new object_copy_cursor_t);
4465 o.back()->attr_complete = true;
4466 o.back()->data_complete = true;
4467 o.back()->omap_complete = true;
4468 }
4469
4470 // -- object_copy_data_t --
4471
4472 void object_copy_data_t::encode(bufferlist& bl, uint64_t features) const
4473 {
4474 ENCODE_START(7, 5, bl);
4475 ::encode(size, bl);
4476 ::encode(mtime, bl);
4477 ::encode(attrs, bl);
4478 ::encode(data, bl);
4479 ::encode(omap_data, bl);
4480 ::encode(cursor, bl);
4481 ::encode(omap_header, bl);
4482 ::encode(snaps, bl);
4483 ::encode(snap_seq, bl);
4484 ::encode(flags, bl);
4485 ::encode(data_digest, bl);
4486 ::encode(omap_digest, bl);
4487 ::encode(reqids, bl);
4488 ::encode(truncate_seq, bl);
4489 ::encode(truncate_size, bl);
4490 ENCODE_FINISH(bl);
4491 }
4492
4493 void object_copy_data_t::decode(bufferlist::iterator& bl)
4494 {
4495 DECODE_START(7, bl);
4496 if (struct_v < 5) {
4497 // old
4498 ::decode(size, bl);
4499 ::decode(mtime, bl);
4500 {
4501 string category;
4502 ::decode(category, bl); // no longer used
4503 }
4504 ::decode(attrs, bl);
4505 ::decode(data, bl);
4506 {
4507 map<string,bufferlist> omap;
4508 ::decode(omap, bl);
4509 omap_data.clear();
4510 if (!omap.empty())
4511 ::encode(omap, omap_data);
4512 }
4513 ::decode(cursor, bl);
4514 if (struct_v >= 2)
4515 ::decode(omap_header, bl);
4516 if (struct_v >= 3) {
4517 ::decode(snaps, bl);
4518 ::decode(snap_seq, bl);
4519 } else {
4520 snaps.clear();
4521 snap_seq = 0;
4522 }
4523 if (struct_v >= 4) {
4524 ::decode(flags, bl);
4525 ::decode(data_digest, bl);
4526 ::decode(omap_digest, bl);
4527 }
4528 } else {
4529 // current
4530 ::decode(size, bl);
4531 ::decode(mtime, bl);
4532 ::decode(attrs, bl);
4533 ::decode(data, bl);
4534 ::decode(omap_data, bl);
4535 ::decode(cursor, bl);
4536 ::decode(omap_header, bl);
4537 ::decode(snaps, bl);
4538 ::decode(snap_seq, bl);
4539 if (struct_v >= 4) {
4540 ::decode(flags, bl);
4541 ::decode(data_digest, bl);
4542 ::decode(omap_digest, bl);
4543 }
4544 if (struct_v >= 6) {
4545 ::decode(reqids, bl);
4546 }
4547 if (struct_v >= 7) {
4548 ::decode(truncate_seq, bl);
4549 ::decode(truncate_size, bl);
4550 }
4551 }
4552 DECODE_FINISH(bl);
4553 }
4554
4555 void object_copy_data_t::generate_test_instances(list<object_copy_data_t*>& o)
4556 {
4557 o.push_back(new object_copy_data_t());
4558
4559 list<object_copy_cursor_t*> cursors;
4560 object_copy_cursor_t::generate_test_instances(cursors);
4561 list<object_copy_cursor_t*>::iterator ci = cursors.begin();
4562 o.back()->cursor = **(ci++);
4563
4564 o.push_back(new object_copy_data_t());
4565 o.back()->cursor = **(ci++);
4566
4567 o.push_back(new object_copy_data_t());
4568 o.back()->size = 1234;
4569 o.back()->mtime.set_from_double(1234);
4570 bufferptr bp("there", 5);
4571 bufferlist bl;
4572 bl.push_back(bp);
4573 o.back()->attrs["hello"] = bl;
4574 bufferptr bp2("not", 3);
4575 bufferlist bl2;
4576 bl2.push_back(bp2);
4577 map<string,bufferlist> omap;
4578 omap["why"] = bl2;
4579 ::encode(omap, o.back()->omap_data);
4580 bufferptr databp("iamsomedatatocontain", 20);
4581 o.back()->data.push_back(databp);
4582 o.back()->omap_header.append("this is an omap header");
4583 o.back()->snaps.push_back(123);
4584 o.back()->reqids.push_back(make_pair(osd_reqid_t(), version_t()));
4585 }
4586
4587 void object_copy_data_t::dump(Formatter *f) const
4588 {
4589 f->open_object_section("cursor");
4590 cursor.dump(f);
4591 f->close_section(); // cursor
4592 f->dump_int("size", size);
4593 f->dump_stream("mtime") << mtime;
4594 /* we should really print out the attrs here, but bufferlist
4595 const-correctness prevents that */
4596 f->dump_int("attrs_size", attrs.size());
4597 f->dump_int("flags", flags);
4598 f->dump_unsigned("data_digest", data_digest);
4599 f->dump_unsigned("omap_digest", omap_digest);
4600 f->dump_int("omap_data_length", omap_data.length());
4601 f->dump_int("omap_header_length", omap_header.length());
4602 f->dump_int("data_length", data.length());
4603 f->open_array_section("snaps");
4604 for (vector<snapid_t>::const_iterator p = snaps.begin();
4605 p != snaps.end(); ++p)
4606 f->dump_unsigned("snap", *p);
4607 f->close_section();
4608 f->open_array_section("reqids");
4609 for (auto p = reqids.begin();
4610 p != reqids.end();
4611 ++p) {
4612 f->open_object_section("extra_reqid");
4613 f->dump_stream("reqid") << p->first;
4614 f->dump_stream("user_version") << p->second;
4615 f->close_section();
4616 }
4617 f->close_section();
4618 }
4619
4620 // -- pg_create_t --
4621
4622 void pg_create_t::encode(bufferlist &bl) const
4623 {
4624 ENCODE_START(1, 1, bl);
4625 ::encode(created, bl);
4626 ::encode(parent, bl);
4627 ::encode(split_bits, bl);
4628 ENCODE_FINISH(bl);
4629 }
4630
4631 void pg_create_t::decode(bufferlist::iterator &bl)
4632 {
4633 DECODE_START(1, bl);
4634 ::decode(created, bl);
4635 ::decode(parent, bl);
4636 ::decode(split_bits, bl);
4637 DECODE_FINISH(bl);
4638 }
4639
4640 void pg_create_t::dump(Formatter *f) const
4641 {
4642 f->dump_unsigned("created", created);
4643 f->dump_stream("parent") << parent;
4644 f->dump_int("split_bits", split_bits);
4645 }
4646
4647 void pg_create_t::generate_test_instances(list<pg_create_t*>& o)
4648 {
4649 o.push_back(new pg_create_t);
4650 o.push_back(new pg_create_t(1, pg_t(3, 4, -1), 2));
4651 }
4652
4653
4654 // -- pg_hit_set_info_t --
4655
4656 void pg_hit_set_info_t::encode(bufferlist& bl) const
4657 {
4658 ENCODE_START(2, 1, bl);
4659 ::encode(begin, bl);
4660 ::encode(end, bl);
4661 ::encode(version, bl);
4662 ::encode(using_gmt, bl);
4663 ENCODE_FINISH(bl);
4664 }
4665
4666 void pg_hit_set_info_t::decode(bufferlist::iterator& p)
4667 {
4668 DECODE_START(2, p);
4669 ::decode(begin, p);
4670 ::decode(end, p);
4671 ::decode(version, p);
4672 if (struct_v >= 2) {
4673 ::decode(using_gmt, p);
4674 } else {
4675 using_gmt = false;
4676 }
4677 DECODE_FINISH(p);
4678 }
4679
4680 void pg_hit_set_info_t::dump(Formatter *f) const
4681 {
4682 f->dump_stream("begin") << begin;
4683 f->dump_stream("end") << end;
4684 f->dump_stream("version") << version;
4685 f->dump_stream("using_gmt") << using_gmt;
4686 }
4687
4688 void pg_hit_set_info_t::generate_test_instances(list<pg_hit_set_info_t*>& ls)
4689 {
4690 ls.push_back(new pg_hit_set_info_t);
4691 ls.push_back(new pg_hit_set_info_t);
4692 ls.back()->begin = utime_t(1, 2);
4693 ls.back()->end = utime_t(3, 4);
4694 }
4695
4696
4697 // -- pg_hit_set_history_t --
4698
4699 void pg_hit_set_history_t::encode(bufferlist& bl) const
4700 {
4701 ENCODE_START(1, 1, bl);
4702 ::encode(current_last_update, bl);
4703 {
4704 utime_t dummy_stamp;
4705 ::encode(dummy_stamp, bl);
4706 }
4707 {
4708 pg_hit_set_info_t dummy_info;
4709 ::encode(dummy_info, bl);
4710 }
4711 ::encode(history, bl);
4712 ENCODE_FINISH(bl);
4713 }
4714
4715 void pg_hit_set_history_t::decode(bufferlist::iterator& p)
4716 {
4717 DECODE_START(1, p);
4718 ::decode(current_last_update, p);
4719 {
4720 utime_t dummy_stamp;
4721 ::decode(dummy_stamp, p);
4722 }
4723 {
4724 pg_hit_set_info_t dummy_info;
4725 ::decode(dummy_info, p);
4726 }
4727 ::decode(history, p);
4728 DECODE_FINISH(p);
4729 }
4730
4731 void pg_hit_set_history_t::dump(Formatter *f) const
4732 {
4733 f->dump_stream("current_last_update") << current_last_update;
4734 f->open_array_section("history");
4735 for (list<pg_hit_set_info_t>::const_iterator p = history.begin();
4736 p != history.end(); ++p) {
4737 f->open_object_section("info");
4738 p->dump(f);
4739 f->close_section();
4740 }
4741 f->close_section();
4742 }
4743
4744 void pg_hit_set_history_t::generate_test_instances(list<pg_hit_set_history_t*>& ls)
4745 {
4746 ls.push_back(new pg_hit_set_history_t);
4747 ls.push_back(new pg_hit_set_history_t);
4748 ls.back()->current_last_update = eversion_t(1, 2);
4749 ls.back()->history.push_back(pg_hit_set_info_t());
4750 }
4751
4752 // -- osd_peer_stat_t --
4753
4754 void osd_peer_stat_t::encode(bufferlist& bl) const
4755 {
4756 ENCODE_START(1, 1, bl);
4757 ::encode(stamp, bl);
4758 ENCODE_FINISH(bl);
4759 }
4760
4761 void osd_peer_stat_t::decode(bufferlist::iterator& bl)
4762 {
4763 DECODE_START(1, bl);
4764 ::decode(stamp, bl);
4765 DECODE_FINISH(bl);
4766 }
4767
4768 void osd_peer_stat_t::dump(Formatter *f) const
4769 {
4770 f->dump_stream("stamp") << stamp;
4771 }
4772
4773 void osd_peer_stat_t::generate_test_instances(list<osd_peer_stat_t*>& o)
4774 {
4775 o.push_back(new osd_peer_stat_t);
4776 o.push_back(new osd_peer_stat_t);
4777 o.back()->stamp = utime_t(1, 2);
4778 }
4779
4780 ostream& operator<<(ostream& out, const osd_peer_stat_t &stat)
4781 {
4782 return out << "stat(" << stat.stamp << ")";
4783 }
4784
4785
4786 // -- OSDSuperblock --
4787
4788 void OSDSuperblock::encode(bufferlist &bl) const
4789 {
4790 ENCODE_START(8, 5, bl);
4791 ::encode(cluster_fsid, bl);
4792 ::encode(whoami, bl);
4793 ::encode(current_epoch, bl);
4794 ::encode(oldest_map, bl);
4795 ::encode(newest_map, bl);
4796 ::encode(weight, bl);
4797 compat_features.encode(bl);
4798 ::encode(clean_thru, bl);
4799 ::encode(mounted, bl);
4800 ::encode(osd_fsid, bl);
4801 ::encode((epoch_t)0, bl); // epoch_t last_epoch_marked_full
4802 ::encode((uint32_t)0, bl); // map<int64_t,epoch_t> pool_last_epoch_marked_full
4803 ENCODE_FINISH(bl);
4804 }
4805
4806 void OSDSuperblock::decode(bufferlist::iterator &bl)
4807 {
4808 DECODE_START_LEGACY_COMPAT_LEN(8, 5, 5, bl);
4809 if (struct_v < 3) {
4810 string magic;
4811 ::decode(magic, bl);
4812 }
4813 ::decode(cluster_fsid, bl);
4814 ::decode(whoami, bl);
4815 ::decode(current_epoch, bl);
4816 ::decode(oldest_map, bl);
4817 ::decode(newest_map, bl);
4818 ::decode(weight, bl);
4819 if (struct_v >= 2) {
4820 compat_features.decode(bl);
4821 } else { //upgrade it!
4822 compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
4823 }
4824 ::decode(clean_thru, bl);
4825 ::decode(mounted, bl);
4826 if (struct_v >= 4)
4827 ::decode(osd_fsid, bl);
4828 if (struct_v >= 6) {
4829 epoch_t last_map_marked_full;
4830 ::decode(last_map_marked_full, bl);
4831 }
4832 if (struct_v >= 7) {
4833 map<int64_t,epoch_t> pool_last_map_marked_full;
4834 ::decode(pool_last_map_marked_full, bl);
4835 }
4836 DECODE_FINISH(bl);
4837 }
4838
4839 void OSDSuperblock::dump(Formatter *f) const
4840 {
4841 f->dump_stream("cluster_fsid") << cluster_fsid;
4842 f->dump_stream("osd_fsid") << osd_fsid;
4843 f->dump_int("whoami", whoami);
4844 f->dump_int("current_epoch", current_epoch);
4845 f->dump_int("oldest_map", oldest_map);
4846 f->dump_int("newest_map", newest_map);
4847 f->dump_float("weight", weight);
4848 f->open_object_section("compat");
4849 compat_features.dump(f);
4850 f->close_section();
4851 f->dump_int("clean_thru", clean_thru);
4852 f->dump_int("last_epoch_mounted", mounted);
4853 }
4854
4855 void OSDSuperblock::generate_test_instances(list<OSDSuperblock*>& o)
4856 {
4857 OSDSuperblock z;
4858 o.push_back(new OSDSuperblock(z));
4859 memset(&z.cluster_fsid, 1, sizeof(z.cluster_fsid));
4860 memset(&z.osd_fsid, 2, sizeof(z.osd_fsid));
4861 z.whoami = 3;
4862 z.current_epoch = 4;
4863 z.oldest_map = 5;
4864 z.newest_map = 9;
4865 z.mounted = 8;
4866 z.clean_thru = 7;
4867 o.push_back(new OSDSuperblock(z));
4868 o.push_back(new OSDSuperblock(z));
4869 }
4870
4871 // -- SnapSet --
4872
4873 void SnapSet::encode(bufferlist& bl) const
4874 {
4875 ENCODE_START(3, 2, bl);
4876 ::encode(seq, bl);
4877 ::encode(head_exists, bl);
4878 ::encode(snaps, bl);
4879 ::encode(clones, bl);
4880 ::encode(clone_overlap, bl);
4881 ::encode(clone_size, bl);
4882 ::encode(clone_snaps, bl);
4883 ENCODE_FINISH(bl);
4884 }
4885
4886 void SnapSet::decode(bufferlist::iterator& bl)
4887 {
4888 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
4889 ::decode(seq, bl);
4890 ::decode(head_exists, bl);
4891 ::decode(snaps, bl);
4892 ::decode(clones, bl);
4893 ::decode(clone_overlap, bl);
4894 ::decode(clone_size, bl);
4895 if (struct_v >= 3) {
4896 ::decode(clone_snaps, bl);
4897 } else {
4898 clone_snaps.clear();
4899 }
4900 DECODE_FINISH(bl);
4901 }
4902
4903 void SnapSet::dump(Formatter *f) const
4904 {
4905 SnapContext sc(seq, snaps);
4906 f->open_object_section("snap_context");
4907 sc.dump(f);
4908 f->close_section();
4909 f->dump_int("head_exists", head_exists);
4910 f->open_array_section("clones");
4911 for (vector<snapid_t>::const_iterator p = clones.begin(); p != clones.end(); ++p) {
4912 f->open_object_section("clone");
4913 f->dump_unsigned("snap", *p);
4914 auto cs = clone_size.find(*p);
4915 if (cs != clone_size.end())
4916 f->dump_unsigned("size", cs->second);
4917 else
4918 f->dump_string("size", "????");
4919 auto co = clone_overlap.find(*p);
4920 if (co != clone_overlap.end())
4921 f->dump_stream("overlap") << co->second;
4922 else
4923 f->dump_stream("overlap") << "????";
4924 auto q = clone_snaps.find(*p);
4925 if (q != clone_snaps.end()) {
4926 f->open_array_section("snaps");
4927 for (auto s : q->second) {
4928 f->dump_unsigned("snap", s);
4929 }
4930 f->close_section();
4931 }
4932 f->close_section();
4933 }
4934 f->close_section();
4935 }
4936
4937 void SnapSet::generate_test_instances(list<SnapSet*>& o)
4938 {
4939 o.push_back(new SnapSet);
4940 o.push_back(new SnapSet);
4941 o.back()->head_exists = true;
4942 o.back()->seq = 123;
4943 o.back()->snaps.push_back(123);
4944 o.back()->snaps.push_back(12);
4945 o.push_back(new SnapSet);
4946 o.back()->head_exists = true;
4947 o.back()->seq = 123;
4948 o.back()->snaps.push_back(123);
4949 o.back()->snaps.push_back(12);
4950 o.back()->clones.push_back(12);
4951 o.back()->clone_size[12] = 12345;
4952 o.back()->clone_overlap[12];
4953 o.back()->clone_snaps[12] = {12, 10, 8};
4954 }
4955
4956 ostream& operator<<(ostream& out, const SnapSet& cs)
4957 {
4958 if (cs.is_legacy()) {
4959 out << cs.seq << "=" << cs.snaps << ":"
4960 << cs.clones
4961 << (cs.head_exists ? "+head":"");
4962 if (!cs.clone_snaps.empty()) {
4963 out << "+stray_clone_snaps=" << cs.clone_snaps;
4964 }
4965 return out;
4966 } else {
4967 return out << cs.seq << "=" << cs.snaps << ":"
4968 << cs.clone_snaps;
4969 }
4970 }
4971
4972 void SnapSet::from_snap_set(const librados::snap_set_t& ss, bool legacy)
4973 {
4974 // NOTE: our reconstruction of snaps (and the snapc) is not strictly
4975 // correct: it will not include snaps that still logically exist
4976 // but for which there was no clone that is defined. For all
4977 // practical purposes this doesn't matter, since we only use that
4978 // information to clone on the OSD, and we have already moved
4979 // forward past that part of the object history.
4980
4981 seq = ss.seq;
4982 set<snapid_t> _snaps;
4983 set<snapid_t> _clones;
4984 head_exists = false;
4985 for (vector<librados::clone_info_t>::const_iterator p = ss.clones.begin();
4986 p != ss.clones.end();
4987 ++p) {
4988 if (p->cloneid == librados::SNAP_HEAD) {
4989 head_exists = true;
4990 } else {
4991 _clones.insert(p->cloneid);
4992 _snaps.insert(p->snaps.begin(), p->snaps.end());
4993 clone_size[p->cloneid] = p->size;
4994 clone_overlap[p->cloneid]; // the entry must exist, even if it's empty.
4995 for (vector<pair<uint64_t, uint64_t> >::const_iterator q =
4996 p->overlap.begin(); q != p->overlap.end(); ++q)
4997 clone_overlap[p->cloneid].insert(q->first, q->second);
4998 if (!legacy) {
4999 // p->snaps is ascending; clone_snaps is descending
5000 vector<snapid_t>& v = clone_snaps[p->cloneid];
5001 for (auto q = p->snaps.rbegin(); q != p->snaps.rend(); ++q) {
5002 v.push_back(*q);
5003 }
5004 }
5005 }
5006 }
5007
5008 // ascending
5009 clones.clear();
5010 clones.reserve(_clones.size());
5011 for (set<snapid_t>::iterator p = _clones.begin(); p != _clones.end(); ++p)
5012 clones.push_back(*p);
5013
5014 // descending
5015 snaps.clear();
5016 snaps.reserve(_snaps.size());
5017 for (set<snapid_t>::reverse_iterator p = _snaps.rbegin();
5018 p != _snaps.rend(); ++p)
5019 snaps.push_back(*p);
5020 }
5021
5022 uint64_t SnapSet::get_clone_bytes(snapid_t clone) const
5023 {
5024 assert(clone_size.count(clone));
5025 uint64_t size = clone_size.find(clone)->second;
5026 assert(clone_overlap.count(clone));
5027 const interval_set<uint64_t> &overlap = clone_overlap.find(clone)->second;
5028 for (interval_set<uint64_t>::const_iterator i = overlap.begin();
5029 i != overlap.end();
5030 ++i) {
5031 assert(size >= i.get_len());
5032 size -= i.get_len();
5033 }
5034 return size;
5035 }
5036
5037 void SnapSet::filter(const pg_pool_t &pinfo)
5038 {
5039 vector<snapid_t> oldsnaps;
5040 oldsnaps.swap(snaps);
5041 for (vector<snapid_t>::const_iterator i = oldsnaps.begin();
5042 i != oldsnaps.end();
5043 ++i) {
5044 if (!pinfo.is_removed_snap(*i))
5045 snaps.push_back(*i);
5046 }
5047 }
5048
5049 SnapSet SnapSet::get_filtered(const pg_pool_t &pinfo) const
5050 {
5051 SnapSet ss = *this;
5052 ss.filter(pinfo);
5053 return ss;
5054 }
5055
5056 // -- watch_info_t --
5057
5058 void watch_info_t::encode(bufferlist& bl, uint64_t features) const
5059 {
5060 ENCODE_START(4, 3, bl);
5061 ::encode(cookie, bl);
5062 ::encode(timeout_seconds, bl);
5063 ::encode(addr, bl, features);
5064 ENCODE_FINISH(bl);
5065 }
5066
5067 void watch_info_t::decode(bufferlist::iterator& bl)
5068 {
5069 DECODE_START_LEGACY_COMPAT_LEN(4, 3, 3, bl);
5070 ::decode(cookie, bl);
5071 if (struct_v < 2) {
5072 uint64_t ver;
5073 ::decode(ver, bl);
5074 }
5075 ::decode(timeout_seconds, bl);
5076 if (struct_v >= 4) {
5077 ::decode(addr, bl);
5078 }
5079 DECODE_FINISH(bl);
5080 }
5081
5082 void watch_info_t::dump(Formatter *f) const
5083 {
5084 f->dump_unsigned("cookie", cookie);
5085 f->dump_unsigned("timeout_seconds", timeout_seconds);
5086 f->open_object_section("addr");
5087 addr.dump(f);
5088 f->close_section();
5089 }
5090
5091 void watch_info_t::generate_test_instances(list<watch_info_t*>& o)
5092 {
5093 o.push_back(new watch_info_t);
5094 o.push_back(new watch_info_t);
5095 o.back()->cookie = 123;
5096 o.back()->timeout_seconds = 99;
5097 entity_addr_t ea;
5098 ea.set_type(entity_addr_t::TYPE_LEGACY);
5099 ea.set_nonce(1);
5100 ea.set_family(AF_INET);
5101 ea.set_in4_quad(0, 127);
5102 ea.set_in4_quad(1, 0);
5103 ea.set_in4_quad(2, 1);
5104 ea.set_in4_quad(3, 2);
5105 ea.set_port(2);
5106 o.back()->addr = ea;
5107 }
5108
5109 // -- object_manifest_t --
5110
5111 void object_manifest_t::encode(bufferlist& bl) const
5112 {
5113 ENCODE_START(1, 1, bl);
5114 ::encode(type, bl);
5115 switch (type) {
5116 case TYPE_NONE: break;
5117 case TYPE_REDIRECT:
5118 ::encode(redirect_target, bl);
5119 break;
5120 default:
5121 ceph_abort();
5122 }
5123 ENCODE_FINISH(bl);
5124 }
5125
5126 void object_manifest_t::decode(bufferlist::iterator& bl)
5127 {
5128 DECODE_START(1, bl);
5129 ::decode(type, bl);
5130 switch (type) {
5131 case TYPE_NONE: break;
5132 case TYPE_REDIRECT:
5133 ::decode(redirect_target, bl);
5134 break;
5135 default:
5136 ceph_abort();
5137 }
5138 DECODE_FINISH(bl);
5139 }
5140
5141 void object_manifest_t::dump(Formatter *f) const
5142 {
5143 f->dump_unsigned("type", type);
5144 f->open_object_section("redirect_target");
5145 redirect_target.dump(f);
5146 f->close_section();
5147 }
5148
5149 void object_manifest_t::generate_test_instances(list<object_manifest_t*>& o)
5150 {
5151 o.push_back(new object_manifest_t());
5152 o.back()->type = TYPE_REDIRECT;
5153 }
5154
5155 ostream& operator<<(ostream& out, const object_manifest_t& om)
5156 {
5157 return out << "type:" << om.type << " redirect_target:" << om.redirect_target;
5158 }
5159
5160 // -- object_info_t --
5161
5162 void object_info_t::copy_user_bits(const object_info_t& other)
5163 {
5164 // these bits are copied from head->clone.
5165 size = other.size;
5166 mtime = other.mtime;
5167 local_mtime = other.local_mtime;
5168 last_reqid = other.last_reqid;
5169 truncate_seq = other.truncate_seq;
5170 truncate_size = other.truncate_size;
5171 flags = other.flags;
5172 user_version = other.user_version;
5173 data_digest = other.data_digest;
5174 omap_digest = other.omap_digest;
5175 }
5176
5177 ps_t object_info_t::legacy_object_locator_to_ps(const object_t &oid,
5178 const object_locator_t &loc) {
5179 ps_t ps;
5180 if (loc.key.length())
5181 // Hack, we don't have the osd map, so we don't really know the hash...
5182 ps = ceph_str_hash(CEPH_STR_HASH_RJENKINS, loc.key.c_str(),
5183 loc.key.length());
5184 else
5185 ps = ceph_str_hash(CEPH_STR_HASH_RJENKINS, oid.name.c_str(),
5186 oid.name.length());
5187 return ps;
5188 }
5189
5190 void object_info_t::encode(bufferlist& bl, uint64_t features) const
5191 {
5192 object_locator_t myoloc(soid);
5193 map<entity_name_t, watch_info_t> old_watchers;
5194 for (map<pair<uint64_t, entity_name_t>, watch_info_t>::const_iterator i =
5195 watchers.begin();
5196 i != watchers.end();
5197 ++i) {
5198 old_watchers.insert(make_pair(i->first.second, i->second));
5199 }
5200 ENCODE_START(17, 8, bl);
5201 ::encode(soid, bl);
5202 ::encode(myoloc, bl); //Retained for compatibility
5203 ::encode((__u32)0, bl); // was category, no longer used
5204 ::encode(version, bl);
5205 ::encode(prior_version, bl);
5206 ::encode(last_reqid, bl);
5207 ::encode(size, bl);
5208 ::encode(mtime, bl);
5209 if (soid.snap == CEPH_NOSNAP)
5210 ::encode(osd_reqid_t(), bl); // used to be wrlock_by
5211 else
5212 ::encode(legacy_snaps, bl);
5213 ::encode(truncate_seq, bl);
5214 ::encode(truncate_size, bl);
5215 ::encode(is_lost(), bl);
5216 ::encode(old_watchers, bl, features);
5217 /* shenanigans to avoid breaking backwards compatibility in the disk format.
5218 * When we can, switch this out for simply putting the version_t on disk. */
5219 eversion_t user_eversion(0, user_version);
5220 ::encode(user_eversion, bl);
5221 ::encode(test_flag(FLAG_USES_TMAP), bl);
5222 ::encode(watchers, bl, features);
5223 __u32 _flags = flags;
5224 ::encode(_flags, bl);
5225 ::encode(local_mtime, bl);
5226 ::encode(data_digest, bl);
5227 ::encode(omap_digest, bl);
5228 ::encode(expected_object_size, bl);
5229 ::encode(expected_write_size, bl);
5230 ::encode(alloc_hint_flags, bl);
5231 if (has_manifest()) {
5232 ::encode(manifest, bl);
5233 }
5234 ENCODE_FINISH(bl);
5235 }
5236
5237 void object_info_t::decode(bufferlist::iterator& bl)
5238 {
5239 object_locator_t myoloc;
5240 DECODE_START_LEGACY_COMPAT_LEN(17, 8, 8, bl);
5241 map<entity_name_t, watch_info_t> old_watchers;
5242 ::decode(soid, bl);
5243 ::decode(myoloc, bl);
5244 {
5245 string category;
5246 ::decode(category, bl); // no longer used
5247 }
5248 ::decode(version, bl);
5249 ::decode(prior_version, bl);
5250 ::decode(last_reqid, bl);
5251 ::decode(size, bl);
5252 ::decode(mtime, bl);
5253 if (soid.snap == CEPH_NOSNAP) {
5254 osd_reqid_t wrlock_by;
5255 ::decode(wrlock_by, bl);
5256 } else {
5257 ::decode(legacy_snaps, bl);
5258 }
5259 ::decode(truncate_seq, bl);
5260 ::decode(truncate_size, bl);
5261
5262 // if this is struct_v >= 13, we will overwrite this
5263 // below since this field is just here for backwards
5264 // compatibility
5265 __u8 lo;
5266 ::decode(lo, bl);
5267 flags = (flag_t)lo;
5268
5269 ::decode(old_watchers, bl);
5270 eversion_t user_eversion;
5271 ::decode(user_eversion, bl);
5272 user_version = user_eversion.version;
5273
5274 if (struct_v >= 9) {
5275 bool uses_tmap = false;
5276 ::decode(uses_tmap, bl);
5277 if (uses_tmap)
5278 set_flag(FLAG_USES_TMAP);
5279 } else {
5280 set_flag(FLAG_USES_TMAP);
5281 }
5282 if (struct_v < 10)
5283 soid.pool = myoloc.pool;
5284 if (struct_v >= 11) {
5285 ::decode(watchers, bl);
5286 } else {
5287 for (map<entity_name_t, watch_info_t>::iterator i = old_watchers.begin();
5288 i != old_watchers.end();
5289 ++i) {
5290 watchers.insert(
5291 make_pair(
5292 make_pair(i->second.cookie, i->first), i->second));
5293 }
5294 }
5295 if (struct_v >= 13) {
5296 __u32 _flags;
5297 ::decode(_flags, bl);
5298 flags = (flag_t)_flags;
5299 }
5300 if (struct_v >= 14) {
5301 ::decode(local_mtime, bl);
5302 } else {
5303 local_mtime = utime_t();
5304 }
5305 if (struct_v >= 15) {
5306 ::decode(data_digest, bl);
5307 ::decode(omap_digest, bl);
5308 } else {
5309 data_digest = omap_digest = -1;
5310 clear_flag(FLAG_DATA_DIGEST);
5311 clear_flag(FLAG_OMAP_DIGEST);
5312 }
5313 if (struct_v >= 16) {
5314 ::decode(expected_object_size, bl);
5315 ::decode(expected_write_size, bl);
5316 ::decode(alloc_hint_flags, bl);
5317 } else {
5318 expected_object_size = 0;
5319 expected_write_size = 0;
5320 alloc_hint_flags = 0;
5321 }
5322 if (struct_v >= 17) {
5323 if (has_manifest()) {
5324 ::decode(manifest, bl);
5325 }
5326 }
5327 DECODE_FINISH(bl);
5328 }
5329
5330 void object_info_t::dump(Formatter *f) const
5331 {
5332 f->open_object_section("oid");
5333 soid.dump(f);
5334 f->close_section();
5335 f->dump_stream("version") << version;
5336 f->dump_stream("prior_version") << prior_version;
5337 f->dump_stream("last_reqid") << last_reqid;
5338 f->dump_unsigned("user_version", user_version);
5339 f->dump_unsigned("size", size);
5340 f->dump_stream("mtime") << mtime;
5341 f->dump_stream("local_mtime") << local_mtime;
5342 f->dump_unsigned("lost", (int)is_lost());
5343 vector<string> sv = get_flag_vector(flags);
5344 f->open_array_section("flags");
5345 for (auto str: sv)
5346 f->dump_string("flags", str);
5347 f->close_section();
5348 f->open_array_section("legacy_snaps");
5349 for (auto s : legacy_snaps) {
5350 f->dump_unsigned("snap", s);
5351 }
5352 f->close_section();
5353 f->dump_unsigned("truncate_seq", truncate_seq);
5354 f->dump_unsigned("truncate_size", truncate_size);
5355 f->dump_format("data_digest", "0x%08x", data_digest);
5356 f->dump_format("omap_digest", "0x%08x", omap_digest);
5357 f->dump_unsigned("expected_object_size", expected_object_size);
5358 f->dump_unsigned("expected_write_size", expected_write_size);
5359 f->dump_unsigned("alloc_hint_flags", alloc_hint_flags);
5360 f->dump_object("manifest", manifest);
5361 f->open_object_section("watchers");
5362 for (map<pair<uint64_t, entity_name_t>,watch_info_t>::const_iterator p =
5363 watchers.begin(); p != watchers.end(); ++p) {
5364 stringstream ss;
5365 ss << p->first.second;
5366 f->open_object_section(ss.str().c_str());
5367 p->second.dump(f);
5368 f->close_section();
5369 }
5370 f->close_section();
5371 }
5372
5373 void object_info_t::generate_test_instances(list<object_info_t*>& o)
5374 {
5375 o.push_back(new object_info_t());
5376
5377 // fixme
5378 }
5379
5380
5381 ostream& operator<<(ostream& out, const object_info_t& oi)
5382 {
5383 out << oi.soid << "(" << oi.version
5384 << " " << oi.last_reqid;
5385 if (oi.soid.snap != CEPH_NOSNAP && !oi.legacy_snaps.empty())
5386 out << " " << oi.legacy_snaps;
5387 if (oi.flags)
5388 out << " " << oi.get_flag_string();
5389 out << " s " << oi.size;
5390 out << " uv " << oi.user_version;
5391 if (oi.is_data_digest())
5392 out << " dd " << std::hex << oi.data_digest << std::dec;
5393 if (oi.is_omap_digest())
5394 out << " od " << std::hex << oi.omap_digest << std::dec;
5395 out << " alloc_hint [" << oi.expected_object_size
5396 << " " << oi.expected_write_size
5397 << " " << oi.alloc_hint_flags << "]";
5398 if (oi.has_manifest())
5399 out << " " << oi.manifest;
5400
5401 out << ")";
5402 return out;
5403 }
5404
5405 // -- ObjectRecovery --
5406 void ObjectRecoveryProgress::encode(bufferlist &bl) const
5407 {
5408 ENCODE_START(1, 1, bl);
5409 ::encode(first, bl);
5410 ::encode(data_complete, bl);
5411 ::encode(data_recovered_to, bl);
5412 ::encode(omap_recovered_to, bl);
5413 ::encode(omap_complete, bl);
5414 ENCODE_FINISH(bl);
5415 }
5416
5417 void ObjectRecoveryProgress::decode(bufferlist::iterator &bl)
5418 {
5419 DECODE_START(1, bl);
5420 ::decode(first, bl);
5421 ::decode(data_complete, bl);
5422 ::decode(data_recovered_to, bl);
5423 ::decode(omap_recovered_to, bl);
5424 ::decode(omap_complete, bl);
5425 DECODE_FINISH(bl);
5426 }
5427
5428 ostream &operator<<(ostream &out, const ObjectRecoveryProgress &prog)
5429 {
5430 return prog.print(out);
5431 }
5432
5433 void ObjectRecoveryProgress::generate_test_instances(
5434 list<ObjectRecoveryProgress*>& o)
5435 {
5436 o.push_back(new ObjectRecoveryProgress);
5437 o.back()->first = false;
5438 o.back()->data_complete = true;
5439 o.back()->omap_complete = true;
5440 o.back()->data_recovered_to = 100;
5441
5442 o.push_back(new ObjectRecoveryProgress);
5443 o.back()->first = true;
5444 o.back()->data_complete = false;
5445 o.back()->omap_complete = false;
5446 o.back()->data_recovered_to = 0;
5447 }
5448
5449 ostream &ObjectRecoveryProgress::print(ostream &out) const
5450 {
5451 return out << "ObjectRecoveryProgress("
5452 << ( first ? "" : "!" ) << "first, "
5453 << "data_recovered_to:" << data_recovered_to
5454 << ", data_complete:" << ( data_complete ? "true" : "false" )
5455 << ", omap_recovered_to:" << omap_recovered_to
5456 << ", omap_complete:" << ( omap_complete ? "true" : "false" )
5457 << ", error:" << ( error ? "true" : "false" )
5458 << ")";
5459 }
5460
5461 void ObjectRecoveryProgress::dump(Formatter *f) const
5462 {
5463 f->dump_int("first?", first);
5464 f->dump_int("data_complete?", data_complete);
5465 f->dump_unsigned("data_recovered_to", data_recovered_to);
5466 f->dump_int("omap_complete?", omap_complete);
5467 f->dump_string("omap_recovered_to", omap_recovered_to);
5468 }
5469
5470 void ObjectRecoveryInfo::encode(bufferlist &bl, uint64_t features) const
5471 {
5472 ENCODE_START(2, 1, bl);
5473 ::encode(soid, bl);
5474 ::encode(version, bl);
5475 ::encode(size, bl);
5476 ::encode(oi, bl, features);
5477 ::encode(ss, bl);
5478 ::encode(copy_subset, bl);
5479 ::encode(clone_subset, bl);
5480 ENCODE_FINISH(bl);
5481 }
5482
5483 void ObjectRecoveryInfo::decode(bufferlist::iterator &bl,
5484 int64_t pool)
5485 {
5486 DECODE_START(2, bl);
5487 ::decode(soid, bl);
5488 ::decode(version, bl);
5489 ::decode(size, bl);
5490 ::decode(oi, bl);
5491 ::decode(ss, bl);
5492 ::decode(copy_subset, bl);
5493 ::decode(clone_subset, bl);
5494 DECODE_FINISH(bl);
5495
5496 if (struct_v < 2) {
5497 if (!soid.is_max() && soid.pool == -1)
5498 soid.pool = pool;
5499 map<hobject_t, interval_set<uint64_t>> tmp;
5500 tmp.swap(clone_subset);
5501 for (map<hobject_t, interval_set<uint64_t>>::iterator i = tmp.begin();
5502 i != tmp.end();
5503 ++i) {
5504 hobject_t first(i->first);
5505 if (!first.is_max() && first.pool == -1)
5506 first.pool = pool;
5507 clone_subset[first].swap(i->second);
5508 }
5509 }
5510 }
5511
5512 void ObjectRecoveryInfo::generate_test_instances(
5513 list<ObjectRecoveryInfo*>& o)
5514 {
5515 o.push_back(new ObjectRecoveryInfo);
5516 o.back()->soid = hobject_t(sobject_t("key", CEPH_NOSNAP));
5517 o.back()->version = eversion_t(0,0);
5518 o.back()->size = 100;
5519 }
5520
5521
5522 void ObjectRecoveryInfo::dump(Formatter *f) const
5523 {
5524 f->dump_stream("object") << soid;
5525 f->dump_stream("at_version") << version;
5526 f->dump_stream("size") << size;
5527 {
5528 f->open_object_section("object_info");
5529 oi.dump(f);
5530 f->close_section();
5531 }
5532 {
5533 f->open_object_section("snapset");
5534 ss.dump(f);
5535 f->close_section();
5536 }
5537 f->dump_stream("copy_subset") << copy_subset;
5538 f->dump_stream("clone_subset") << clone_subset;
5539 }
5540
5541 ostream& operator<<(ostream& out, const ObjectRecoveryInfo &inf)
5542 {
5543 return inf.print(out);
5544 }
5545
5546 ostream &ObjectRecoveryInfo::print(ostream &out) const
5547 {
5548 return out << "ObjectRecoveryInfo("
5549 << soid << "@" << version
5550 << ", size: " << size
5551 << ", copy_subset: " << copy_subset
5552 << ", clone_subset: " << clone_subset
5553 << ", snapset: " << ss
5554 << ")";
5555 }
5556
5557 // -- PushReplyOp --
5558 void PushReplyOp::generate_test_instances(list<PushReplyOp*> &o)
5559 {
5560 o.push_back(new PushReplyOp);
5561 o.push_back(new PushReplyOp);
5562 o.back()->soid = hobject_t(sobject_t("asdf", 2));
5563 o.push_back(new PushReplyOp);
5564 o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP));
5565 }
5566
5567 void PushReplyOp::encode(bufferlist &bl) const
5568 {
5569 ENCODE_START(1, 1, bl);
5570 ::encode(soid, bl);
5571 ENCODE_FINISH(bl);
5572 }
5573
5574 void PushReplyOp::decode(bufferlist::iterator &bl)
5575 {
5576 DECODE_START(1, bl);
5577 ::decode(soid, bl);
5578 DECODE_FINISH(bl);
5579 }
5580
5581 void PushReplyOp::dump(Formatter *f) const
5582 {
5583 f->dump_stream("soid") << soid;
5584 }
5585
5586 ostream &PushReplyOp::print(ostream &out) const
5587 {
5588 return out
5589 << "PushReplyOp(" << soid
5590 << ")";
5591 }
5592
5593 ostream& operator<<(ostream& out, const PushReplyOp &op)
5594 {
5595 return op.print(out);
5596 }
5597
5598 uint64_t PushReplyOp::cost(CephContext *cct) const
5599 {
5600
5601 return cct->_conf->osd_push_per_object_cost +
5602 cct->_conf->osd_recovery_max_chunk;
5603 }
5604
5605 // -- PullOp --
5606 void PullOp::generate_test_instances(list<PullOp*> &o)
5607 {
5608 o.push_back(new PullOp);
5609 o.push_back(new PullOp);
5610 o.back()->soid = hobject_t(sobject_t("asdf", 2));
5611 o.back()->recovery_info.version = eversion_t(3, 10);
5612 o.push_back(new PullOp);
5613 o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP));
5614 o.back()->recovery_info.version = eversion_t(0, 0);
5615 }
5616
5617 void PullOp::encode(bufferlist &bl, uint64_t features) const
5618 {
5619 ENCODE_START(1, 1, bl);
5620 ::encode(soid, bl);
5621 ::encode(recovery_info, bl, features);
5622 ::encode(recovery_progress, bl);
5623 ENCODE_FINISH(bl);
5624 }
5625
5626 void PullOp::decode(bufferlist::iterator &bl)
5627 {
5628 DECODE_START(1, bl);
5629 ::decode(soid, bl);
5630 ::decode(recovery_info, bl);
5631 ::decode(recovery_progress, bl);
5632 DECODE_FINISH(bl);
5633 }
5634
5635 void PullOp::dump(Formatter *f) const
5636 {
5637 f->dump_stream("soid") << soid;
5638 {
5639 f->open_object_section("recovery_info");
5640 recovery_info.dump(f);
5641 f->close_section();
5642 }
5643 {
5644 f->open_object_section("recovery_progress");
5645 recovery_progress.dump(f);
5646 f->close_section();
5647 }
5648 }
5649
5650 ostream &PullOp::print(ostream &out) const
5651 {
5652 return out
5653 << "PullOp(" << soid
5654 << ", recovery_info: " << recovery_info
5655 << ", recovery_progress: " << recovery_progress
5656 << ")";
5657 }
5658
5659 ostream& operator<<(ostream& out, const PullOp &op)
5660 {
5661 return op.print(out);
5662 }
5663
5664 uint64_t PullOp::cost(CephContext *cct) const
5665 {
5666 return cct->_conf->osd_push_per_object_cost +
5667 cct->_conf->osd_recovery_max_chunk;
5668 }
5669
5670 // -- PushOp --
5671 void PushOp::generate_test_instances(list<PushOp*> &o)
5672 {
5673 o.push_back(new PushOp);
5674 o.push_back(new PushOp);
5675 o.back()->soid = hobject_t(sobject_t("asdf", 2));
5676 o.back()->version = eversion_t(3, 10);
5677 o.push_back(new PushOp);
5678 o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP));
5679 o.back()->version = eversion_t(0, 0);
5680 }
5681
5682 void PushOp::encode(bufferlist &bl, uint64_t features) const
5683 {
5684 ENCODE_START(1, 1, bl);
5685 ::encode(soid, bl);
5686 ::encode(version, bl);
5687 ::encode(data, bl);
5688 ::encode(data_included, bl);
5689 ::encode(omap_header, bl);
5690 ::encode(omap_entries, bl);
5691 ::encode(attrset, bl);
5692 ::encode(recovery_info, bl, features);
5693 ::encode(after_progress, bl);
5694 ::encode(before_progress, bl);
5695 ENCODE_FINISH(bl);
5696 }
5697
5698 void PushOp::decode(bufferlist::iterator &bl)
5699 {
5700 DECODE_START(1, bl);
5701 ::decode(soid, bl);
5702 ::decode(version, bl);
5703 ::decode(data, bl);
5704 ::decode(data_included, bl);
5705 ::decode(omap_header, bl);
5706 ::decode(omap_entries, bl);
5707 ::decode(attrset, bl);
5708 ::decode(recovery_info, bl);
5709 ::decode(after_progress, bl);
5710 ::decode(before_progress, bl);
5711 DECODE_FINISH(bl);
5712 }
5713
5714 void PushOp::dump(Formatter *f) const
5715 {
5716 f->dump_stream("soid") << soid;
5717 f->dump_stream("version") << version;
5718 f->dump_int("data_len", data.length());
5719 f->dump_stream("data_included") << data_included;
5720 f->dump_int("omap_header_len", omap_header.length());
5721 f->dump_int("omap_entries_len", omap_entries.size());
5722 f->dump_int("attrset_len", attrset.size());
5723 {
5724 f->open_object_section("recovery_info");
5725 recovery_info.dump(f);
5726 f->close_section();
5727 }
5728 {
5729 f->open_object_section("after_progress");
5730 after_progress.dump(f);
5731 f->close_section();
5732 }
5733 {
5734 f->open_object_section("before_progress");
5735 before_progress.dump(f);
5736 f->close_section();
5737 }
5738 }
5739
5740 ostream &PushOp::print(ostream &out) const
5741 {
5742 return out
5743 << "PushOp(" << soid
5744 << ", version: " << version
5745 << ", data_included: " << data_included
5746 << ", data_size: " << data.length()
5747 << ", omap_header_size: " << omap_header.length()
5748 << ", omap_entries_size: " << omap_entries.size()
5749 << ", attrset_size: " << attrset.size()
5750 << ", recovery_info: " << recovery_info
5751 << ", after_progress: " << after_progress
5752 << ", before_progress: " << before_progress
5753 << ")";
5754 }
5755
5756 ostream& operator<<(ostream& out, const PushOp &op)
5757 {
5758 return op.print(out);
5759 }
5760
5761 uint64_t PushOp::cost(CephContext *cct) const
5762 {
5763 uint64_t cost = data_included.size();
5764 for (map<string, bufferlist>::const_iterator i =
5765 omap_entries.begin();
5766 i != omap_entries.end();
5767 ++i) {
5768 cost += i->second.length();
5769 }
5770 cost += cct->_conf->osd_push_per_object_cost;
5771 return cost;
5772 }
5773
5774 // -- ScrubMap --
5775
5776 void ScrubMap::merge_incr(const ScrubMap &l)
5777 {
5778 assert(valid_through == l.incr_since);
5779 valid_through = l.valid_through;
5780
5781 for (map<hobject_t,object>::const_iterator p = l.objects.begin();
5782 p != l.objects.end();
5783 ++p){
5784 if (p->second.negative) {
5785 map<hobject_t,object>::iterator q = objects.find(p->first);
5786 if (q != objects.end()) {
5787 objects.erase(q);
5788 }
5789 } else {
5790 objects[p->first] = p->second;
5791 }
5792 }
5793 }
5794
5795 void ScrubMap::encode(bufferlist& bl) const
5796 {
5797 ENCODE_START(3, 2, bl);
5798 ::encode(objects, bl);
5799 ::encode((__u32)0, bl); // used to be attrs; now deprecated
5800 bufferlist old_logbl; // not used
5801 ::encode(old_logbl, bl);
5802 ::encode(valid_through, bl);
5803 ::encode(incr_since, bl);
5804 ENCODE_FINISH(bl);
5805 }
5806
5807 void ScrubMap::decode(bufferlist::iterator& bl, int64_t pool)
5808 {
5809 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
5810 ::decode(objects, bl);
5811 {
5812 map<string,string> attrs; // deprecated
5813 ::decode(attrs, bl);
5814 }
5815 bufferlist old_logbl; // not used
5816 ::decode(old_logbl, bl);
5817 ::decode(valid_through, bl);
5818 ::decode(incr_since, bl);
5819 DECODE_FINISH(bl);
5820
5821 // handle hobject_t upgrade
5822 if (struct_v < 3) {
5823 map<hobject_t, object> tmp;
5824 tmp.swap(objects);
5825 for (map<hobject_t, object>::iterator i = tmp.begin();
5826 i != tmp.end();
5827 ++i) {
5828 hobject_t first(i->first);
5829 if (!first.is_max() && first.pool == -1)
5830 first.pool = pool;
5831 objects[first] = i->second;
5832 }
5833 }
5834 }
5835
5836 void ScrubMap::dump(Formatter *f) const
5837 {
5838 f->dump_stream("valid_through") << valid_through;
5839 f->dump_stream("incremental_since") << incr_since;
5840 f->open_array_section("objects");
5841 for (map<hobject_t,object>::const_iterator p = objects.begin(); p != objects.end(); ++p) {
5842 f->open_object_section("object");
5843 f->dump_string("name", p->first.oid.name);
5844 f->dump_unsigned("hash", p->first.get_hash());
5845 f->dump_string("key", p->first.get_key());
5846 f->dump_int("snapid", p->first.snap);
5847 p->second.dump(f);
5848 f->close_section();
5849 }
5850 f->close_section();
5851 }
5852
5853 void ScrubMap::generate_test_instances(list<ScrubMap*>& o)
5854 {
5855 o.push_back(new ScrubMap);
5856 o.push_back(new ScrubMap);
5857 o.back()->valid_through = eversion_t(1, 2);
5858 o.back()->incr_since = eversion_t(3, 4);
5859 list<object*> obj;
5860 object::generate_test_instances(obj);
5861 o.back()->objects[hobject_t(object_t("foo"), "fookey", 123, 456, 0, "")] = *obj.back();
5862 obj.pop_back();
5863 o.back()->objects[hobject_t(object_t("bar"), string(), 123, 456, 0, "")] = *obj.back();
5864 }
5865
5866 // -- ScrubMap::object --
5867
5868 void ScrubMap::object::encode(bufferlist& bl) const
5869 {
5870 bool compat_read_error = read_error || ec_hash_mismatch || ec_size_mismatch;
5871 ENCODE_START(9, 7, bl);
5872 ::encode(size, bl);
5873 ::encode(negative, bl);
5874 ::encode(attrs, bl);
5875 ::encode(digest, bl);
5876 ::encode(digest_present, bl);
5877 ::encode((uint32_t)0, bl); // obsolete nlinks
5878 ::encode((uint32_t)0, bl); // snapcolls
5879 ::encode(omap_digest, bl);
5880 ::encode(omap_digest_present, bl);
5881 ::encode(compat_read_error, bl);
5882 ::encode(stat_error, bl);
5883 ::encode(read_error, bl);
5884 ::encode(ec_hash_mismatch, bl);
5885 ::encode(ec_size_mismatch, bl);
5886 ::encode(large_omap_object_found, bl);
5887 ::encode(large_omap_object_key_count, bl);
5888 ::encode(large_omap_object_value_size, bl);
5889 ENCODE_FINISH(bl);
5890 }
5891
5892 void ScrubMap::object::decode(bufferlist::iterator& bl)
5893 {
5894 DECODE_START(9, bl);
5895 ::decode(size, bl);
5896 bool tmp, compat_read_error = false;
5897 ::decode(tmp, bl);
5898 negative = tmp;
5899 ::decode(attrs, bl);
5900 ::decode(digest, bl);
5901 ::decode(tmp, bl);
5902 digest_present = tmp;
5903 {
5904 uint32_t nlinks;
5905 ::decode(nlinks, bl);
5906 set<snapid_t> snapcolls;
5907 ::decode(snapcolls, bl);
5908 }
5909 ::decode(omap_digest, bl);
5910 ::decode(tmp, bl);
5911 omap_digest_present = tmp;
5912 ::decode(compat_read_error, bl);
5913 ::decode(tmp, bl);
5914 stat_error = tmp;
5915 if (struct_v >= 8) {
5916 ::decode(tmp, bl);
5917 read_error = tmp;
5918 ::decode(tmp, bl);
5919 ec_hash_mismatch = tmp;
5920 ::decode(tmp, bl);
5921 ec_size_mismatch = tmp;
5922 }
5923 // If older encoder found a read_error, set read_error
5924 if (compat_read_error && !read_error && !ec_hash_mismatch && !ec_size_mismatch)
5925 read_error = true;
5926 if (struct_v >= 9) {
5927 ::decode(tmp, bl);
5928 large_omap_object_found = tmp;
5929 ::decode(large_omap_object_key_count, bl);
5930 ::decode(large_omap_object_value_size, bl);
5931 }
5932 DECODE_FINISH(bl);
5933 }
5934
5935 void ScrubMap::object::dump(Formatter *f) const
5936 {
5937 f->dump_int("size", size);
5938 f->dump_int("negative", negative);
5939 f->open_array_section("attrs");
5940 for (map<string,bufferptr>::const_iterator p = attrs.begin(); p != attrs.end(); ++p) {
5941 f->open_object_section("attr");
5942 f->dump_string("name", p->first);
5943 f->dump_int("length", p->second.length());
5944 f->close_section();
5945 }
5946 f->close_section();
5947 }
5948
5949 void ScrubMap::object::generate_test_instances(list<object*>& o)
5950 {
5951 o.push_back(new object);
5952 o.push_back(new object);
5953 o.back()->negative = true;
5954 o.push_back(new object);
5955 o.back()->size = 123;
5956 o.back()->attrs["foo"] = buffer::copy("foo", 3);
5957 o.back()->attrs["bar"] = buffer::copy("barval", 6);
5958 }
5959
5960 // -- OSDOp --
5961
5962 ostream& operator<<(ostream& out, const OSDOp& op)
5963 {
5964 out << ceph_osd_op_name(op.op.op);
5965 if (ceph_osd_op_type_data(op.op.op)) {
5966 // data extent
5967 switch (op.op.op) {
5968 case CEPH_OSD_OP_ASSERT_VER:
5969 out << " v" << op.op.assert_ver.ver;
5970 break;
5971 case CEPH_OSD_OP_TRUNCATE:
5972 out << " " << op.op.extent.offset;
5973 break;
5974 case CEPH_OSD_OP_MASKTRUNC:
5975 case CEPH_OSD_OP_TRIMTRUNC:
5976 out << " " << op.op.extent.truncate_seq << "@"
5977 << (int64_t)op.op.extent.truncate_size;
5978 break;
5979 case CEPH_OSD_OP_ROLLBACK:
5980 out << " " << snapid_t(op.op.snap.snapid);
5981 break;
5982 case CEPH_OSD_OP_WATCH:
5983 out << " " << ceph_osd_watch_op_name(op.op.watch.op)
5984 << " cookie " << op.op.watch.cookie;
5985 if (op.op.watch.gen)
5986 out << " gen " << op.op.watch.gen;
5987 break;
5988 case CEPH_OSD_OP_NOTIFY:
5989 case CEPH_OSD_OP_NOTIFY_ACK:
5990 out << " cookie " << op.op.notify.cookie;
5991 break;
5992 case CEPH_OSD_OP_COPY_GET:
5993 out << " max " << op.op.copy_get.max;
5994 break;
5995 case CEPH_OSD_OP_COPY_FROM:
5996 out << " ver " << op.op.copy_from.src_version;
5997 break;
5998 case CEPH_OSD_OP_SETALLOCHINT:
5999 out << " object_size " << op.op.alloc_hint.expected_object_size
6000 << " write_size " << op.op.alloc_hint.expected_write_size;
6001 break;
6002 case CEPH_OSD_OP_READ:
6003 case CEPH_OSD_OP_SPARSE_READ:
6004 case CEPH_OSD_OP_SYNC_READ:
6005 case CEPH_OSD_OP_WRITE:
6006 case CEPH_OSD_OP_WRITEFULL:
6007 case CEPH_OSD_OP_ZERO:
6008 case CEPH_OSD_OP_APPEND:
6009 case CEPH_OSD_OP_MAPEXT:
6010 out << " " << op.op.extent.offset << "~" << op.op.extent.length;
6011 if (op.op.extent.truncate_seq)
6012 out << " [" << op.op.extent.truncate_seq << "@"
6013 << (int64_t)op.op.extent.truncate_size << "]";
6014 if (op.op.flags)
6015 out << " [" << ceph_osd_op_flag_string(op.op.flags) << "]";
6016 default:
6017 // don't show any arg info
6018 break;
6019 }
6020 } else if (ceph_osd_op_type_attr(op.op.op)) {
6021 // xattr name
6022 if (op.op.xattr.name_len && op.indata.length()) {
6023 out << " ";
6024 op.indata.write(0, op.op.xattr.name_len, out);
6025 }
6026 if (op.op.xattr.value_len)
6027 out << " (" << op.op.xattr.value_len << ")";
6028 if (op.op.op == CEPH_OSD_OP_CMPXATTR)
6029 out << " op " << (int)op.op.xattr.cmp_op
6030 << " mode " << (int)op.op.xattr.cmp_mode;
6031 } else if (ceph_osd_op_type_exec(op.op.op)) {
6032 // class.method
6033 if (op.op.cls.class_len && op.indata.length()) {
6034 out << " ";
6035 op.indata.write(0, op.op.cls.class_len, out);
6036 out << ".";
6037 op.indata.write(op.op.cls.class_len, op.op.cls.method_len, out);
6038 }
6039 } else if (ceph_osd_op_type_pg(op.op.op)) {
6040 switch (op.op.op) {
6041 case CEPH_OSD_OP_PGLS:
6042 case CEPH_OSD_OP_PGLS_FILTER:
6043 case CEPH_OSD_OP_PGNLS:
6044 case CEPH_OSD_OP_PGNLS_FILTER:
6045 out << " start_epoch " << op.op.pgls.start_epoch;
6046 break;
6047 case CEPH_OSD_OP_PG_HITSET_LS:
6048 break;
6049 case CEPH_OSD_OP_PG_HITSET_GET:
6050 out << " " << utime_t(op.op.hit_set_get.stamp);
6051 break;
6052 case CEPH_OSD_OP_SCRUBLS:
6053 break;
6054 }
6055 }
6056 return out;
6057 }
6058
6059
6060 void OSDOp::split_osd_op_vector_in_data(vector<OSDOp>& ops, bufferlist& in)
6061 {
6062 bufferlist::iterator datap = in.begin();
6063 for (unsigned i = 0; i < ops.size(); i++) {
6064 if (ops[i].op.payload_len) {
6065 datap.copy(ops[i].op.payload_len, ops[i].indata);
6066 }
6067 }
6068 }
6069
6070 void OSDOp::merge_osd_op_vector_in_data(vector<OSDOp>& ops, bufferlist& out)
6071 {
6072 for (unsigned i = 0; i < ops.size(); i++) {
6073 if (ops[i].indata.length()) {
6074 ops[i].op.payload_len = ops[i].indata.length();
6075 out.append(ops[i].indata);
6076 }
6077 }
6078 }
6079
6080 void OSDOp::split_osd_op_vector_out_data(vector<OSDOp>& ops, bufferlist& in)
6081 {
6082 bufferlist::iterator datap = in.begin();
6083 for (unsigned i = 0; i < ops.size(); i++) {
6084 if (ops[i].op.payload_len) {
6085 datap.copy(ops[i].op.payload_len, ops[i].outdata);
6086 }
6087 }
6088 }
6089
6090 void OSDOp::merge_osd_op_vector_out_data(vector<OSDOp>& ops, bufferlist& out)
6091 {
6092 for (unsigned i = 0; i < ops.size(); i++) {
6093 if (ops[i].outdata.length()) {
6094 ops[i].op.payload_len = ops[i].outdata.length();
6095 out.append(ops[i].outdata);
6096 }
6097 }
6098 }
6099
6100 bool store_statfs_t::operator==(const store_statfs_t& other) const
6101 {
6102 return total == other.total
6103 && available == other.available
6104 && allocated == other.allocated
6105 && stored == other.stored
6106 && compressed == other.compressed
6107 && compressed_allocated == other.compressed_allocated
6108 && compressed_original == other.compressed_original;
6109 }
6110
6111 void store_statfs_t::dump(Formatter *f) const
6112 {
6113 f->dump_int("total", total);
6114 f->dump_int("available", available);
6115 f->dump_int("allocated", allocated);
6116 f->dump_int("stored", stored);
6117 f->dump_int("compressed", compressed);
6118 f->dump_int("compressed_allocated", compressed_allocated);
6119 f->dump_int("compressed_original", compressed_original);
6120 }
6121
6122 ostream& operator<<(ostream& out, const store_statfs_t &s)
6123 {
6124 out << std::hex
6125 << "store_statfs(0x" << s.available
6126 << "/0x" << s.total
6127 << ", stored 0x" << s.stored
6128 << "/0x" << s.allocated
6129 << ", compress 0x" << s.compressed
6130 << "/0x" << s.compressed_allocated
6131 << "/0x" << s.compressed_original
6132 << std::dec
6133 << ")";
6134 return out;
6135 }
6136
6137 void OSDOp::clear_data(vector<OSDOp>& ops)
6138 {
6139 for (unsigned i = 0; i < ops.size(); i++) {
6140 OSDOp& op = ops[i];
6141 op.outdata.clear();
6142 if (ceph_osd_op_type_attr(op.op.op) &&
6143 op.op.xattr.name_len &&
6144 op.indata.length() >= op.op.xattr.name_len) {
6145 bufferptr bp(op.op.xattr.name_len);
6146 bufferlist bl;
6147 bl.append(bp);
6148 bl.copy_in(0, op.op.xattr.name_len, op.indata);
6149 op.indata.claim(bl);
6150 } else if (ceph_osd_op_type_exec(op.op.op) &&
6151 op.op.cls.class_len &&
6152 op.indata.length() >
6153 (op.op.cls.class_len + op.op.cls.method_len)) {
6154 __u8 len = op.op.cls.class_len + op.op.cls.method_len;
6155 bufferptr bp(len);
6156 bufferlist bl;
6157 bl.append(bp);
6158 bl.copy_in(0, len, op.indata);
6159 op.indata.claim(bl);
6160 } else {
6161 op.indata.clear();
6162 }
6163 }
6164 }