]> git.proxmox.com Git - ceph.git/blob - ceph/src/osd/osd_types.cc
add subtree-ish sources for 12.0.3
[ceph.git] / ceph / src / osd / osd_types.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2011 New Dream Network
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
18 #include <boost/assign/list_of.hpp>
19
20 #include "osd_types.h"
21 #include "include/ceph_features.h"
22 extern "C" {
23 #include "crush/hash.h"
24 }
25 #include "PG.h"
26 #include "OSDMap.h"
27 #include "PGBackend.h"
28
29 const char *ceph_osd_flag_name(unsigned flag)
30 {
31 switch (flag) {
32 case CEPH_OSD_FLAG_ACK: return "ack";
33 case CEPH_OSD_FLAG_ONNVRAM: return "onnvram";
34 case CEPH_OSD_FLAG_ONDISK: return "ondisk";
35 case CEPH_OSD_FLAG_RETRY: return "retry";
36 case CEPH_OSD_FLAG_READ: return "read";
37 case CEPH_OSD_FLAG_WRITE: return "write";
38 case CEPH_OSD_FLAG_ORDERSNAP: return "ordersnap";
39 case CEPH_OSD_FLAG_PEERSTAT_OLD: return "peerstat_old";
40 case CEPH_OSD_FLAG_BALANCE_READS: return "balance_reads";
41 case CEPH_OSD_FLAG_PARALLELEXEC: return "parallelexec";
42 case CEPH_OSD_FLAG_PGOP: return "pgop";
43 case CEPH_OSD_FLAG_EXEC: return "exec";
44 case CEPH_OSD_FLAG_EXEC_PUBLIC: return "exec_public";
45 case CEPH_OSD_FLAG_LOCALIZE_READS: return "localize_reads";
46 case CEPH_OSD_FLAG_RWORDERED: return "rwordered";
47 case CEPH_OSD_FLAG_IGNORE_CACHE: return "ignore_cache";
48 case CEPH_OSD_FLAG_SKIPRWLOCKS: return "skiprwlocks";
49 case CEPH_OSD_FLAG_IGNORE_OVERLAY: return "ignore_overlay";
50 case CEPH_OSD_FLAG_FLUSH: return "flush";
51 case CEPH_OSD_FLAG_MAP_SNAP_CLONE: return "map_snap_clone";
52 case CEPH_OSD_FLAG_ENFORCE_SNAPC: return "enforce_snapc";
53 case CEPH_OSD_FLAG_REDIRECTED: return "redirected";
54 case CEPH_OSD_FLAG_KNOWN_REDIR: return "known_if_redirected";
55 case CEPH_OSD_FLAG_FULL_TRY: return "full_try";
56 case CEPH_OSD_FLAG_FULL_FORCE: return "full_force";
57 default: return "???";
58 }
59 }
60
61 string ceph_osd_flag_string(unsigned flags)
62 {
63 string s;
64 for (unsigned i=0; i<32; ++i) {
65 if (flags & (1u<<i)) {
66 if (s.length())
67 s += "+";
68 s += ceph_osd_flag_name(1u << i);
69 }
70 }
71 if (s.length())
72 return s;
73 return string("-");
74 }
75
76 const char * ceph_osd_op_flag_name(unsigned flag)
77 {
78 const char *name;
79
80 switch(flag) {
81 case CEPH_OSD_OP_FLAG_EXCL:
82 name = "excl";
83 break;
84 case CEPH_OSD_OP_FLAG_FAILOK:
85 name = "failok";
86 break;
87 case CEPH_OSD_OP_FLAG_FADVISE_RANDOM:
88 name = "fadvise_random";
89 break;
90 case CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL:
91 name = "fadvise_sequential";
92 break;
93 case CEPH_OSD_OP_FLAG_FADVISE_WILLNEED:
94 name = "favise_willneed";
95 break;
96 case CEPH_OSD_OP_FLAG_FADVISE_DONTNEED:
97 name = "fadvise_dontneed";
98 break;
99 case CEPH_OSD_OP_FLAG_FADVISE_NOCACHE:
100 name = "fadvise_nocache";
101 break;
102 default:
103 name = "???";
104 };
105
106 return name;
107 }
108
109 string ceph_osd_op_flag_string(unsigned flags)
110 {
111 string s;
112 for (unsigned i=0; i<32; ++i) {
113 if (flags & (1u<<i)) {
114 if (s.length())
115 s += "+";
116 s += ceph_osd_op_flag_name(1u << i);
117 }
118 }
119 if (s.length())
120 return s;
121 return string("-");
122 }
123
124 string ceph_osd_alloc_hint_flag_string(unsigned flags)
125 {
126 string s;
127 for (unsigned i=0; i<32; ++i) {
128 if (flags & (1u<<i)) {
129 if (s.length())
130 s += "+";
131 s += ceph_osd_alloc_hint_flag_name(1u << i);
132 }
133 }
134 if (s.length())
135 return s;
136 return string("-");
137 }
138
// Append this shard identifier to bl: osd first, then shard, bracketed by
// ENCODE_START(1, 1, ...) / ENCODE_FINISH. decode() reads the two fields back
// in the same order.
139 void pg_shard_t::encode(bufferlist &bl) const
140 {
141 ENCODE_START(1, 1, bl);
142 ::encode(osd, bl);
143 ::encode(shard, bl);
144 ENCODE_FINISH(bl);
145 }
// Read a shard identifier from bl: osd first, then shard, mirroring the order
// written by encode().
146 void pg_shard_t::decode(bufferlist::iterator &bl)
147 {
148 DECODE_START(1, bl);
149 ::decode(osd, bl);
150 ::decode(shard, bl);
151 DECODE_FINISH(bl);
152 }
153
154 ostream &operator<<(ostream &lhs, const pg_shard_t &rhs)
155 {
156 if (rhs.is_undefined())
157 return lhs << "?";
158 if (rhs.shard == shard_id_t::NO_SHARD)
159 return lhs << rhs.osd;
160 return lhs << rhs.osd << '(' << (unsigned)(rhs.shard) << ')';
161 }
162
163 // -- osd_reqid_t --
// Emit this request id through the formatter as three fields, in order:
// "name" (streamed), "inc" (signed integer) and "tid" (unsigned).
164 void osd_reqid_t::dump(Formatter *f) const
165 {
166 f->dump_stream("name") << name;
167 f->dump_int("inc", inc);
168 f->dump_unsigned("tid", tid);
169 }
170
171 void osd_reqid_t::generate_test_instances(list<osd_reqid_t*>& o)
172 {
173 o.push_back(new osd_reqid_t);
174 o.push_back(new osd_reqid_t(entity_name_t::CLIENT(123), 1, 45678));
175 }
176
177 // -- object_locator_t --
178
// Append this locator to bl. Fields are written in the order pool, a
// placeholder "preferred" value of -1, key, nspace, hash.
//
// The compat value starts at 3 and is raised to 6 when an explicit hash is
// set; the final value is passed to ENCODE_FINISH_NEW_COMPAT.
179 void object_locator_t::encode(bufferlist& bl) const
180 {
181 // verify that nobody's corrupted the locator
// (an explicit hash and a non-empty key must not both be present)
182 assert(hash == -1 || key.empty());
183 __u8 encode_compat = 3;
184 ENCODE_START(6, encode_compat, bl);
185 ::encode(pool, bl);
186 int32_t preferred = -1; // tell old code there is no preferred osd (-1).
187 ::encode(preferred, bl);
188 ::encode(key, bl);
189 ::encode(nspace, bl);
190 ::encode(hash, bl);
191 if (hash != -1)
192 encode_compat = MAX(encode_compat, 6); // need to interpret the hash
193 ENCODE_FINISH_NEW_COMPAT(bl, encode_compat);
194 }
195
// Read a locator from p, handling the older layouts selected by struct_v.
// struct_v is not declared in this function; presumably the DECODE_START
// macro introduces it.
196 void object_locator_t::decode(bufferlist::iterator& p)
197 {
198 DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, p);
199 if (struct_v < 2) {
// oldest layout: 32-bit pool followed by a 16-bit preferred value
200 int32_t op;
201 ::decode(op, p);
202 pool = op;
203 int16_t pref;
// read only to advance the iterator; the value is discarded
204 ::decode(pref, p);
205 } else {
206 ::decode(pool, p);
207 int32_t preferred;
// read only to advance the iterator; the value is discarded
208 ::decode(preferred, p);
209 }
210 ::decode(key, p);
// nspace is only present from version 5 on
211 if (struct_v >= 5)
212 ::decode(nspace, p);
// hash is only present from version 6 on; older encodings get -1
213 if (struct_v >= 6)
214 ::decode(hash, p);
215 else
216 hash = -1;
217 DECODE_FINISH(p);
218 // verify that nobody's corrupted the locator
219 assert(hash == -1 || key.empty());
220 }
221
// Emit this locator through the formatter as "pool", "key", "namespace" and
// "hash", in that order.
222 void object_locator_t::dump(Formatter *f) const
223 {
224 f->dump_int("pool", pool);
225 f->dump_string("key", key);
226 f->dump_string("namespace", nspace);
227 f->dump_int("hash", hash);
228 }
229
230 void object_locator_t::generate_test_instances(list<object_locator_t*>& o)
231 {
232 o.push_back(new object_locator_t);
233 o.push_back(new object_locator_t(123));
234 o.push_back(new object_locator_t(123, 876));
235 o.push_back(new object_locator_t(1, "n2"));
236 o.push_back(new object_locator_t(1234, "", "key"));
237 o.push_back(new object_locator_t(12, "n1", "key2"));
238 }
239
240 // -- request_redirect_t --
// Append this redirect to bl: redirect_locator, redirect_object, then
// osd_instructions, bracketed by ENCODE_START(1, 1, ...) / ENCODE_FINISH.
241 void request_redirect_t::encode(bufferlist& bl) const
242 {
243 ENCODE_START(1, 1, bl);
244 ::encode(redirect_locator, bl);
245 ::encode(redirect_object, bl);
246 ::encode(osd_instructions, bl);
247 ENCODE_FINISH(bl);
248 }
249
// Read a redirect from bl in the order written by encode():
// redirect_locator, redirect_object, osd_instructions.
250 void request_redirect_t::decode(bufferlist::iterator& bl)
251 {
252 DECODE_START(1, bl);
253 ::decode(redirect_locator, bl);
254 ::decode(redirect_object, bl);
255 ::decode(osd_instructions, bl);
256 DECODE_FINISH(bl);
257 }
258
// Emit this redirect through the formatter: the target object name as
// "object", then the locator inside a nested "locator" section.
// osd_instructions is not dumped.
259 void request_redirect_t::dump(Formatter *f) const
260 {
261 f->dump_string("object", redirect_object);
262 f->open_object_section("locator");
263 redirect_locator.dump(f);
264 f->close_section(); // locator
265 }
266
267 void request_redirect_t::generate_test_instances(list<request_redirect_t*>& o)
268 {
269 object_locator_t loc(1, "redir_obj");
270 o.push_back(new request_redirect_t());
271 o.push_back(new request_redirect_t(loc, 0));
272 o.push_back(new request_redirect_t(loc, "redir_obj"));
273 o.push_back(new request_redirect_t(loc));
274 }
275
// Emit the two latency counters through the formatter under the names
// "commit_latency_ms" and "apply_latency_ms".
276 void objectstore_perf_stat_t::dump(Formatter *f) const
277 {
278 f->dump_unsigned("commit_latency_ms", os_commit_latency);
279 f->dump_unsigned("apply_latency_ms", os_apply_latency);
280 }
281
// Append the counters to bl: os_commit_latency first, then os_apply_latency,
// bracketed by ENCODE_START(1, 1, ...) / ENCODE_FINISH.
282 void objectstore_perf_stat_t::encode(bufferlist &bl) const
283 {
284 ENCODE_START(1, 1, bl);
285 ::encode(os_commit_latency, bl);
286 ::encode(os_apply_latency, bl);
287 ENCODE_FINISH(bl);
288 }
289
// Read the counters from bl in the order written by encode():
// os_commit_latency, then os_apply_latency.
290 void objectstore_perf_stat_t::decode(bufferlist::iterator &bl)
291 {
292 DECODE_START(1, bl);
293 ::decode(os_commit_latency, bl);
294 ::decode(os_apply_latency, bl);
295 DECODE_FINISH(bl);
296 }
297
298 void objectstore_perf_stat_t::generate_test_instances(std::list<objectstore_perf_stat_t*>& o)
299 {
300 o.push_back(new objectstore_perf_stat_t());
301 o.push_back(new objectstore_perf_stat_t());
302 o.back()->os_commit_latency = 20;
303 o.back()->os_apply_latency = 30;
304 }
305
306 // -- osd_stat_t --
// Emit these OSD statistics through the formatter: the three kb counters, the
// heartbeat peers as an "hb_peers" array, the snap-trim counters, and nested
// "op_queue_age_hist" and "perf_stat" sections.
307 void osd_stat_t::dump(Formatter *f) const
308 {
309 f->dump_unsigned("kb", kb);
310 f->dump_unsigned("kb_used", kb_used);
311 f->dump_unsigned("kb_avail", kb_avail);
312 f->open_array_section("hb_peers");
// one "osd" entry per heartbeat peer
313 for (auto p : hb_peers)
314 f->dump_int("osd", p);
315 f->close_section();
316 f->dump_int("snap_trim_queue_len", snap_trim_queue_len);
317 f->dump_int("num_snap_trimming", num_snap_trimming);
318 f->open_object_section("op_queue_age_hist");
319 op_queue_age_hist.dump(f);
320 f->close_section();
321 f->open_object_section("perf_stat");
322 os_perf_stat.dump(f);
323 f->close_section();
324 }
325
// Append these OSD statistics to bl. The field order here must match the
// order in which decode() reads them.
326 void osd_stat_t::encode(bufferlist &bl) const
327 {
328 ENCODE_START(5, 2, bl);
329 ::encode(kb, bl);
330 ::encode(kb_used, bl);
331 ::encode(kb_avail, bl);
332 ::encode(snap_trim_queue_len, bl);
333 ::encode(num_snap_trimming, bl);
334 ::encode(hb_peers, bl);
// placeholder: a bare 32-bit zero occupies the slot that decode() reads as a
// vector<int> (num_hb_out) and then discards
335 ::encode((uint32_t)0, bl);
336 ::encode(op_queue_age_hist, bl);
337 ::encode(os_perf_stat, bl);
338 ENCODE_FINISH(bl);
339 }
340
// Read OSD statistics from bl in the order written by encode(). struct_v is
// not declared in this function; presumably the DECODE_START macro
// introduces it.
341 void osd_stat_t::decode(bufferlist::iterator &bl)
342 {
343 DECODE_START_LEGACY_COMPAT_LEN(5, 2, 2, bl);
344 ::decode(kb, bl);
345 ::decode(kb_used, bl);
346 ::decode(kb_avail, bl);
347 ::decode(snap_trim_queue_len, bl);
348 ::decode(num_snap_trimming, bl);
349 ::decode(hb_peers, bl);
// read only to advance the iterator; the vector is never stored
350 vector<int> num_hb_out;
351 ::decode(num_hb_out, bl);
// the histogram is only present from version 3 on
352 if (struct_v >= 3)
353 ::decode(op_queue_age_hist, bl);
// the perf stats are only present from version 4 on
354 if (struct_v >= 4)
355 ::decode(os_perf_stat, bl);
356 DECODE_FINISH(bl);
357 }
358
359 void osd_stat_t::generate_test_instances(std::list<osd_stat_t*>& o)
360 {
361 o.push_back(new osd_stat_t);
362
363 o.push_back(new osd_stat_t);
364 o.back()->kb = 1;
365 o.back()->kb_used = 2;
366 o.back()->kb_avail = 3;
367 o.back()->hb_peers.push_back(7);
368 o.back()->snap_trim_queue_len = 8;
369 o.back()->num_snap_trimming = 99;
370 }
371
372 // -- pg_t --
373
374 int pg_t::print(char *o, int maxlen) const
375 {
376 if (preferred() >= 0)
377 return snprintf(o, maxlen, "%llu.%xp%d", (unsigned long long)pool(), ps(), preferred());
378 else
379 return snprintf(o, maxlen, "%llu.%x", (unsigned long long)pool(), ps());
380 }
381
382 bool pg_t::parse(const char *s)
383 {
384 uint64_t ppool;
385 uint32_t pseed;
386 int32_t pref;
387 int r = sscanf(s, "%llu.%xp%d", (long long unsigned *)&ppool, &pseed, &pref);
388 if (r < 2)
389 return false;
390 m_pool = ppool;
391 m_seed = pseed;
392 if (r == 3)
393 m_preferred = pref;
394 else
395 m_preferred = -1;
396 return true;
397 }
398
399 bool spg_t::parse(const char *s)
400 {
401 pgid.set_preferred(-1);
402 shard = shard_id_t::NO_SHARD;
403 uint64_t ppool;
404 uint32_t pseed;
405 int32_t pref;
406 uint32_t pshard;
407 int r = sscanf(s, "%llu.%x", (long long unsigned *)&ppool, &pseed);
408 if (r < 2)
409 return false;
410 pgid.set_pool(ppool);
411 pgid.set_ps(pseed);
412
413 const char *p = strchr(s, 'p');
414 if (p) {
415 r = sscanf(p, "p%d", &pref);
416 if (r == 1) {
417 pgid.set_preferred(pref);
418 } else {
419 return false;
420 }
421 }
422
423 p = strchr(s, 's');
424 if (p) {
425 r = sscanf(p, "s%d", &pshard);
426 if (r == 1) {
427 shard = shard_id_t(pshard);
428 } else {
429 return false;
430 }
431 }
432 return true;
433 }
434
435 char *spg_t::calc_name(char *buf, const char *suffix_backwords) const
436 {
437 while (*suffix_backwords)
438 *--buf = *suffix_backwords++;
439
440 if (!is_no_shard()) {
441 buf = ritoa<uint8_t, 10>((uint8_t)shard.id, buf);
442 *--buf = 's';
443 }
444
445 return pgid.calc_name(buf, "");
446 }
447
448 ostream& operator<<(ostream& out, const spg_t &pg)
449 {
450 char buf[spg_t::calc_name_buf_size];
451 buf[spg_t::calc_name_buf_size - 1] = '\0';
452 out << pg.calc_name(buf + spg_t::calc_name_buf_size - 1, "");
453 return out;
454 }
455
456 pg_t pg_t::get_ancestor(unsigned old_pg_num) const
457 {
458 int old_bits = cbits(old_pg_num);
459 int old_mask = (1 << old_bits) - 1;
460 pg_t ret = *this;
461 ret.m_seed = ceph_stable_mod(m_seed, old_pg_num, old_mask);
462 return ret;
463 }
464
465 bool pg_t::is_split(unsigned old_pg_num, unsigned new_pg_num, set<pg_t> *children) const
466 {
467 assert(m_seed < old_pg_num);
468 if (new_pg_num <= old_pg_num)
469 return false;
470
471 bool split = false;
472 if (true) {
473 unsigned old_bits = cbits(old_pg_num);
474 unsigned old_mask = (1 << old_bits) - 1;
475 for (unsigned n = 1; ; n++) {
476 unsigned next_bit = (n << (old_bits-1));
477 unsigned s = next_bit | m_seed;
478
479 if (s < old_pg_num || s == m_seed)
480 continue;
481 if (s >= new_pg_num)
482 break;
483 if ((unsigned)ceph_stable_mod(s, old_pg_num, old_mask) == m_seed) {
484 split = true;
485 if (children)
486 children->insert(pg_t(s, m_pool, m_preferred));
487 }
488 }
489 }
490 if (false) {
491 // brute force
492 int old_bits = cbits(old_pg_num);
493 int old_mask = (1 << old_bits) - 1;
494 for (unsigned x = old_pg_num; x < new_pg_num; ++x) {
495 unsigned o = ceph_stable_mod(x, old_pg_num, old_mask);
496 if (o == m_seed) {
497 split = true;
498 children->insert(pg_t(x, m_pool, m_preferred));
499 }
500 }
501 }
502 return split;
503 }
504
505 unsigned pg_t::get_split_bits(unsigned pg_num) const {
506 if (pg_num == 1)
507 return 0;
508 assert(pg_num > 1);
509
510 // Find unique p such that pg_num \in [2^(p-1), 2^p)
511 unsigned p = cbits(pg_num);
512 assert(p); // silence coverity #751330
513
514 if ((m_seed % (1<<(p-1))) < (pg_num % (1<<(p-1))))
515 return p;
516 else
517 return p - 1;
518 }
519
520 pg_t pg_t::get_parent() const
521 {
522 unsigned bits = cbits(m_seed);
523 assert(bits);
524 pg_t retval = *this;
525 retval.m_seed &= ~((~0)<<(bits - 1));
526 return retval;
527 }
528
529 hobject_t pg_t::get_hobj_start() const
530 {
531 return hobject_t(object_t(), string(), CEPH_NOSNAP, m_seed, m_pool,
532 string());
533 }
534
535 hobject_t pg_t::get_hobj_end(unsigned pg_num) const
536 {
537 // note: this assumes a bitwise sort; with the legacy nibblewise
538 // sort a PG did not always cover a single contiguous range of the
539 // (bit-reversed) hash range.
540 unsigned bits = get_split_bits(pg_num);
541 uint64_t rev_start = hobject_t::_reverse_bits(m_seed);
542 uint64_t rev_end = (rev_start | (0xffffffff >> bits)) + 1;
543 if (rev_end >= 0x100000000) {
544 assert(rev_end == 0x100000000);
545 return hobject_t::get_max();
546 } else {
547 return hobject_t(object_t(), string(), CEPH_NOSNAP,
548 hobject_t::_reverse_bits(rev_end), m_pool,
549 string());
550 }
551 }
552
// Emit this pg id through the formatter as "pool", "seed" and
// "preferred_osd", in that order.
553 void pg_t::dump(Formatter *f) const
554 {
555 f->dump_unsigned("pool", m_pool);
556 f->dump_unsigned("seed", m_seed);
557 f->dump_int("preferred_osd", m_preferred);
558 }
559
560 void pg_t::generate_test_instances(list<pg_t*>& o)
561 {
562 o.push_back(new pg_t);
563 o.push_back(new pg_t(1, 2, -1));
564 o.push_back(new pg_t(13123, 3, -1));
565 o.push_back(new pg_t(131223, 4, 23));
566 }
567
568 char *pg_t::calc_name(char *buf, const char *suffix_backwords) const
569 {
570 while (*suffix_backwords)
571 *--buf = *suffix_backwords++;
572
573 if (m_preferred >= 0)
574 *--buf ='p';
575
576 buf = ritoa<uint32_t, 16>(m_seed, buf);
577
578 *--buf = '.';
579
580 return ritoa<uint64_t, 10>(m_pool, buf);
581 }
582
583 ostream& operator<<(ostream& out, const pg_t &pg)
584 {
585 char buf[pg_t::calc_name_buf_size];
586 buf[pg_t::calc_name_buf_size - 1] = '\0';
587 out << pg.calc_name(buf + pg_t::calc_name_buf_size - 1, "");
588 return out;
589 }
590
591
592 // -- coll_t --
593
594 void coll_t::calc_str()
595 {
596 switch (type) {
597 case TYPE_META:
598 strcpy(_str_buff, "meta");
599 _str = _str_buff;
600 break;
601 case TYPE_PG:
602 _str_buff[spg_t::calc_name_buf_size - 1] = '\0';
603 _str = pgid.calc_name(_str_buff + spg_t::calc_name_buf_size - 1, "daeh_");
604 break;
605 case TYPE_PG_TEMP:
606 _str_buff[spg_t::calc_name_buf_size - 1] = '\0';
607 _str = pgid.calc_name(_str_buff + spg_t::calc_name_buf_size - 1, "PMET_");
608 break;
609 default:
610 assert(0 == "unknown collection type");
611 }
612 }
613
614 bool coll_t::parse(const std::string& s)
615 {
616 if (s == "meta") {
617 type = TYPE_META;
618 pgid = spg_t();
619 removal_seq = 0;
620 calc_str();
621 assert(s == _str);
622 return true;
623 }
624 if (s.find("_head") == s.length() - 5 &&
625 pgid.parse(s.substr(0, s.length() - 5))) {
626 type = TYPE_PG;
627 removal_seq = 0;
628 calc_str();
629 assert(s == _str);
630 return true;
631 }
632 if (s.find("_TEMP") == s.length() - 5 &&
633 pgid.parse(s.substr(0, s.length() - 5))) {
634 type = TYPE_PG_TEMP;
635 removal_seq = 0;
636 calc_str();
637 assert(s == _str);
638 return true;
639 }
640 return false;
641 }
642
// Append this collection id to bl. Temp collections are written as version 3
// (the name string from to_str()); everything else is written as version 2
// (type byte, pgid, and a snapid fixed at CEPH_NOSNAP).
643 void coll_t::encode(bufferlist& bl) const
644 {
645 // when changing this, remember to update encoded_size() too.
646 if (is_temp()) {
647 // can't express this as v2...
648 __u8 struct_v = 3;
649 ::encode(struct_v, bl);
650 ::encode(to_str(), bl);
651 } else {
652 __u8 struct_v = 2;
653 ::encode(struct_v, bl);
654 ::encode((__u8)type, bl);
655 ::encode(pgid, bl);
// the snap slot is always written as CEPH_NOSNAP; decode() reads it into a
// local and does not store it
656 snapid_t snap = CEPH_NOSNAP;
657 ::encode(snap, bl);
658 }
659 }
660
661 size_t coll_t::encoded_size() const
662 {
663 size_t r = sizeof(__u8);
664 if (is_temp()) {
665 // v3
666 r += sizeof(__u32);
667 if (_str) {
668 r += strlen(_str);
669 }
670 } else {
671 // v2
672 // 1. type
673 r += sizeof(__u8);
674 // 2. pgid
675 // - encoding header
676 r += sizeof(ceph_le32) + 2 * sizeof(__u8);
677 // - pg_t
678 r += sizeof(__u8) + sizeof(uint64_t) + 2 * sizeof(uint32_t);
679 // - shard_id_t
680 r += sizeof(int8_t);
681 // 3. snapid_t
682 r += sizeof(uint64_t);
683 }
684
685 return r;
686 }
687
688 void coll_t::decode(bufferlist::iterator& bl)
689 {
690 __u8 struct_v;
691 ::decode(struct_v, bl);
692 switch (struct_v) {
693 case 1:
694 {
695 snapid_t snap;
696 ::decode(pgid, bl);
697 ::decode(snap, bl);
698
699 // infer the type
700 if (pgid == spg_t() && snap == 0) {
701 type = TYPE_META;
702 } else {
703 type = TYPE_PG;
704 }
705 removal_seq = 0;
706 }
707 break;
708
709 case 2:
710 {
711 __u8 _type;
712 snapid_t snap;
713 ::decode(_type, bl);
714 ::decode(pgid, bl);
715 ::decode(snap, bl);
716 type = (type_t)_type;
717 removal_seq = 0;
718 }
719 break;
720
721 case 3:
722 {
723 string str;
724 ::decode(str, bl);
725 bool ok = parse(str);
726 if (!ok)
727 throw std::domain_error(std::string("unable to parse pg ") + str);
728 }
729 break;
730
731 default:
732 {
733 ostringstream oss;
734 oss << "coll_t::decode(): don't know how to decode version "
735 << struct_v;
736 throw std::domain_error(oss.str());
737 }
738 }
739 }
740
// Emit this collection id through the formatter: the numeric type as
// "type_id", the pgid (skipped for the meta collection), and the name from
// to_str() as "name".
741 void coll_t::dump(Formatter *f) const
742 {
743 f->dump_unsigned("type_id", (unsigned)type);
744 if (type != TYPE_META)
745 f->dump_stream("pgid") << pgid;
746 f->dump_string("name", to_str());
747 }
748
749 void coll_t::generate_test_instances(list<coll_t*>& o)
750 {
751 o.push_back(new coll_t());
752 o.push_back(new coll_t(spg_t(pg_t(1, 0), shard_id_t::NO_SHARD)));
753 o.push_back(new coll_t(o.back()->get_temp()));
754 o.push_back(new coll_t(spg_t(pg_t(3, 2), shard_id_t(12))));
755 o.push_back(new coll_t(o.back()->get_temp()));
756 o.push_back(new coll_t());
757 }
758
759 // ---
760
761 std::string pg_vector_string(const vector<int32_t> &a)
762 {
763 ostringstream oss;
764 oss << "[";
765 for (vector<int32_t>::const_iterator i = a.begin(); i != a.end(); ++i) {
766 if (i != a.begin())
767 oss << ",";
768 if (*i != CRUSH_ITEM_NONE)
769 oss << *i;
770 else
771 oss << "NONE";
772 }
773 oss << "]";
774 return oss.str();
775 }
776
777 std::string pg_state_string(int state)
778 {
779 ostringstream oss;
780 if (state & PG_STATE_STALE)
781 oss << "stale+";
782 if (state & PG_STATE_CREATING)
783 oss << "creating+";
784 if (state & PG_STATE_ACTIVE)
785 oss << "active+";
786 if (state & PG_STATE_ACTIVATING)
787 oss << "activating+";
788 if (state & PG_STATE_CLEAN)
789 oss << "clean+";
790 if (state & PG_STATE_RECOVERY_WAIT)
791 oss << "recovery_wait+";
792 if (state & PG_STATE_RECOVERY_TOOFULL)
793 oss << "recovery_toofull+";
794 if (state & PG_STATE_RECOVERING)
795 oss << "recovering+";
796 if (state & PG_STATE_DOWN)
797 oss << "down+";
798 if (state & PG_STATE_UNDERSIZED)
799 oss << "undersized+";
800 if (state & PG_STATE_DEGRADED)
801 oss << "degraded+";
802 if (state & PG_STATE_REMAPPED)
803 oss << "remapped+";
804 if (state & PG_STATE_SCRUBBING)
805 oss << "scrubbing+";
806 if (state & PG_STATE_DEEP_SCRUB)
807 oss << "deep+";
808 if (state & PG_STATE_INCONSISTENT)
809 oss << "inconsistent+";
810 if (state & PG_STATE_PEERING)
811 oss << "peering+";
812 if (state & PG_STATE_REPAIR)
813 oss << "repair+";
814 if ((state & PG_STATE_BACKFILL_WAIT) &&
815 !(state &PG_STATE_BACKFILL))
816 oss << "backfill_wait+";
817 if (state & PG_STATE_BACKFILL)
818 oss << "backfilling+";
819 if (state & PG_STATE_BACKFILL_TOOFULL)
820 oss << "backfill_toofull+";
821 if (state & PG_STATE_INCOMPLETE)
822 oss << "incomplete+";
823 if (state & PG_STATE_PEERED)
824 oss << "peered+";
825 if (state & PG_STATE_SNAPTRIM)
826 oss << "snaptrim+";
827 if (state & PG_STATE_SNAPTRIM_WAIT)
828 oss << "snaptrim_wait+";
829 string ret(oss.str());
830 if (ret.length() > 0)
831 ret.resize(ret.length() - 1);
832 else
833 ret = "inactive";
834 return ret;
835 }
836
837 int pg_string_state(const std::string& state)
838 {
839 int type;
840 if (state == "active")
841 type = PG_STATE_ACTIVE;
842 else if (state == "clean")
843 type = PG_STATE_CLEAN;
844 else if (state == "down")
845 type = PG_STATE_DOWN;
846 else if (state == "scrubbing")
847 type = PG_STATE_SCRUBBING;
848 else if (state == "degraded")
849 type = PG_STATE_DEGRADED;
850 else if (state == "inconsistent")
851 type = PG_STATE_INCONSISTENT;
852 else if (state == "peering")
853 type = PG_STATE_PEERING;
854 else if (state == "repair")
855 type = PG_STATE_REPAIR;
856 else if (state == "recovering")
857 type = PG_STATE_RECOVERING;
858 else if (state == "backfill_wait")
859 type = PG_STATE_BACKFILL_WAIT;
860 else if (state == "incomplete")
861 type = PG_STATE_INCOMPLETE;
862 else if (state == "stale")
863 type = PG_STATE_STALE;
864 else if (state == "remapped")
865 type = PG_STATE_REMAPPED;
866 else if (state == "deep_scrub")
867 type = PG_STATE_DEEP_SCRUB;
868 else if (state == "backfill")
869 type = PG_STATE_BACKFILL;
870 else if (state == "backfill_toofull")
871 type = PG_STATE_BACKFILL_TOOFULL;
872 else if (state == "recovery_wait")
873 type = PG_STATE_RECOVERY_WAIT;
874 else if (state == "recovery_toofull")
875 type = PG_STATE_RECOVERY_TOOFULL;
876 else if (state == "undersized")
877 type = PG_STATE_UNDERSIZED;
878 else if (state == "activating")
879 type = PG_STATE_ACTIVATING;
880 else if (state == "peered")
881 type = PG_STATE_PEERED;
882 else if (state == "snaptrim")
883 type = PG_STATE_SNAPTRIM;
884 else if (state == "snaptrim_wait")
885 type = PG_STATE_SNAPTRIM_WAIT;
886 else
887 type = -1;
888 return type;
889 }
890
891 // -- eversion_t --
892 string eversion_t::get_key_name() const
893 {
894 char key[32];
895 // Below is equivalent of sprintf("%010u.%020llu");
896 key[31] = 0;
897 ritoa<uint64_t, 10, 20>(version, key + 31);
898 key[10] = '.';
899 ritoa<uint32_t, 10, 10>(epoch, key + 10);
900 return string(key);
901 }
902
903
904 // -- pool_snap_info_t --
// Emit this snapshot record through the formatter as "snapid", "stamp"
// (streamed) and "name", in that order.
905 void pool_snap_info_t::dump(Formatter *f) const
906 {
907 f->dump_unsigned("snapid", snapid);
908 f->dump_stream("stamp") << stamp;
909 f->dump_string("name", name);
910 }
911
// Append this snapshot record to bl. The payload is always snapid, stamp,
// name; only the framing depends on the peer's feature bits.
912 void pool_snap_info_t::encode(bufferlist& bl, uint64_t features) const
913 {
// without CEPH_FEATURE_PGPOOL3: a bare version byte of 1 followed by the
// fields, with no ENCODE_START/ENCODE_FINISH bracket
914 if ((features & CEPH_FEATURE_PGPOOL3) == 0) {
915 __u8 struct_v = 1;
916 ::encode(struct_v, bl);
917 ::encode(snapid, bl);
918 ::encode(stamp, bl);
919 ::encode(name, bl);
920 return;
921 }
// with the feature: the same fields inside ENCODE_START(2, 2, ...)
922 ENCODE_START(2, 2, bl);
923 ::encode(snapid, bl);
924 ::encode(stamp, bl);
925 ::encode(name, bl);
926 ENCODE_FINISH(bl);
927 }
928
// Read a snapshot record from bl: snapid, stamp, then name, in the order
// written by encode().
929 void pool_snap_info_t::decode(bufferlist::iterator& bl)
930 {
931 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
932 ::decode(snapid, bl);
933 ::decode(stamp, bl);
934 ::decode(name, bl);
935 DECODE_FINISH(bl);
936 }
937
938 void pool_snap_info_t::generate_test_instances(list<pool_snap_info_t*>& o)
939 {
940 o.push_back(new pool_snap_info_t);
941 o.push_back(new pool_snap_info_t);
942 o.back()->snapid = 1;
943 o.back()->stamp = utime_t(1, 2);
944 o.back()->name = "foo";
945 }
946
947 // -- pool_opts_t --
948
949 typedef std::map<std::string, pool_opts_t::opt_desc_t> opt_mapping_t;
950 static opt_mapping_t opt_mapping = boost::assign::map_list_of
951 ("scrub_min_interval", pool_opts_t::opt_desc_t(
952 pool_opts_t::SCRUB_MIN_INTERVAL, pool_opts_t::DOUBLE))
953 ("scrub_max_interval", pool_opts_t::opt_desc_t(
954 pool_opts_t::SCRUB_MAX_INTERVAL, pool_opts_t::DOUBLE))
955 ("deep_scrub_interval", pool_opts_t::opt_desc_t(
956 pool_opts_t::DEEP_SCRUB_INTERVAL, pool_opts_t::DOUBLE))
957 ("recovery_priority", pool_opts_t::opt_desc_t(
958 pool_opts_t::RECOVERY_PRIORITY, pool_opts_t::INT))
959 ("recovery_op_priority", pool_opts_t::opt_desc_t(
960 pool_opts_t::RECOVERY_OP_PRIORITY, pool_opts_t::INT))
961 ("scrub_priority", pool_opts_t::opt_desc_t(
962 pool_opts_t::SCRUB_PRIORITY, pool_opts_t::INT))
963 ("compression_mode", pool_opts_t::opt_desc_t(
964 pool_opts_t::COMPRESSION_MODE, pool_opts_t::STR))
965 ("compression_algorithm", pool_opts_t::opt_desc_t(
966 pool_opts_t::COMPRESSION_ALGORITHM, pool_opts_t::STR))
967 ("compression_required_ratio", pool_opts_t::opt_desc_t(
968 pool_opts_t::COMPRESSION_REQUIRED_RATIO, pool_opts_t::DOUBLE))
969 ("compression_max_blob_size", pool_opts_t::opt_desc_t(
970 pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, pool_opts_t::INT))
971 ("compression_min_blob_size", pool_opts_t::opt_desc_t(
972 pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, pool_opts_t::INT))
973 ("csum_type", pool_opts_t::opt_desc_t(
974 pool_opts_t::CSUM_TYPE, pool_opts_t::INT))
975 ("csum_max_block", pool_opts_t::opt_desc_t(
976 pool_opts_t::CSUM_MAX_BLOCK, pool_opts_t::INT))
977 ("csum_min_block", pool_opts_t::opt_desc_t(
978 pool_opts_t::CSUM_MIN_BLOCK, pool_opts_t::INT));
979
980 bool pool_opts_t::is_opt_name(const std::string& name) {
981 return opt_mapping.find(name) != opt_mapping.end();
982 }
983
984 pool_opts_t::opt_desc_t pool_opts_t::get_opt_desc(const std::string& name) {
985 opt_mapping_t::iterator i = opt_mapping.find(name);
986 assert(i != opt_mapping.end());
987 return i->second;
988 }
989
990 bool pool_opts_t::is_set(pool_opts_t::key_t key) const {
991 return opts.find(key) != opts.end();
992 }
993
994 const pool_opts_t::value_t& pool_opts_t::get(pool_opts_t::key_t key) const {
995 opts_t::const_iterator i = opts.find(key);
996 assert(i != opts.end());
997 return i->second;
998 }
999
1000 bool pool_opts_t::unset(pool_opts_t::key_t key) {
1001 return opts.erase(key) > 0;
1002 }
1003
1004 class pool_opts_dumper_t : public boost::static_visitor<>
1005 {
1006 public:
1007 pool_opts_dumper_t(const std::string& name_, Formatter* f_) :
1008 name(name_.c_str()), f(f_) {}
1009
1010 void operator()(std::string s) const {
1011 f->dump_string(name, s);
1012 }
1013 void operator()(int i) const {
1014 f->dump_int(name, i);
1015 }
1016 void operator()(double d) const {
1017 f->dump_float(name, d);
1018 }
1019
1020 private:
1021 const char* name;
1022 Formatter* f;
1023 };
1024
1025 void pool_opts_t::dump(const std::string& name, Formatter* f) const
1026 {
1027 const opt_desc_t& desc = get_opt_desc(name);
1028 opts_t::const_iterator i = opts.find(desc.key);
1029 if (i == opts.end()) {
1030 return;
1031 }
1032 boost::apply_visitor(pool_opts_dumper_t(name, f), i->second);
1033 }
1034
1035 void pool_opts_t::dump(Formatter* f) const
1036 {
1037 for (opt_mapping_t::iterator i = opt_mapping.begin(); i != opt_mapping.end();
1038 ++i) {
1039 const std::string& name = i->first;
1040 const opt_desc_t& desc = i->second;
1041 opts_t::const_iterator j = opts.find(desc.key);
1042 if (j == opts.end()) {
1043 continue;
1044 }
1045 boost::apply_visitor(pool_opts_dumper_t(name, f), j->second);
1046 }
1047 }
1048
1049 class pool_opts_encoder_t : public boost::static_visitor<>
1050 {
1051 public:
1052 explicit pool_opts_encoder_t(bufferlist& bl_) : bl(bl_) {}
1053
1054 void operator()(std::string s) const {
1055 ::encode(static_cast<int32_t>(pool_opts_t::STR), bl);
1056 ::encode(s, bl);
1057 }
1058 void operator()(int i) const {
1059 ::encode(static_cast<int32_t>(pool_opts_t::INT), bl);
1060 ::encode(i, bl);
1061 }
1062 void operator()(double d) const {
1063 ::encode(static_cast<int32_t>(pool_opts_t::DOUBLE), bl);
1064 ::encode(d, bl);
1065 }
1066
1067 private:
1068 bufferlist& bl;
1069 };
1070
// Append the stored options to bl: a 32-bit count, then for each option its
// key as a 32-bit integer followed by the type tag and value written by
// pool_opts_encoder_t.
1071 void pool_opts_t::encode(bufferlist& bl) const {
1072 ENCODE_START(1, 1, bl);
1073 uint32_t n = static_cast<uint32_t>(opts.size());
1074 ::encode(n, bl);
1075 for (opts_t::const_iterator i = opts.begin(); i != opts.end(); ++i) {
1076 ::encode(static_cast<int32_t>(i->first), bl);
1077 boost::apply_visitor(pool_opts_encoder_t(bl), i->second);
1078 }
1079 ENCODE_FINISH(bl);
1080 }
1081
// Replace the stored options with those read from bl: a 32-bit count, then
// for each option a 32-bit key, a 32-bit type tag, and a value of the tagged
// type. An unrecognised tag trips an assert.
1082 void pool_opts_t::decode(bufferlist::iterator& bl) {
1083 DECODE_START(1, bl);
1084 __u32 n;
1085 ::decode(n, bl);
// any previously stored options are discarded
1086 opts.clear();
1087 while (n--) {
1088 int32_t k, t;
1089 ::decode(k, bl);
1090 ::decode(t, bl);
// the tag selects which alternative to decode and store
1091 if (t == STR) {
1092 std::string s;
1093 ::decode(s, bl);
1094 opts[static_cast<key_t>(k)] = s;
1095 } else if (t == INT) {
1096 int i;
1097 ::decode(i, bl);
1098 opts[static_cast<key_t>(k)] = i;
1099 } else if (t == DOUBLE) {
1100 double d;
1101 ::decode(d, bl);
1102 opts[static_cast<key_t>(k)] = d;
1103 } else {
1104 assert(!"invalid type");
1105 }
1106 }
1107 DECODE_FINISH(bl);
1108 }
1109
1110 ostream& operator<<(ostream& out, const pool_opts_t& opts)
1111 {
1112 for (opt_mapping_t::iterator i = opt_mapping.begin(); i != opt_mapping.end();
1113 ++i) {
1114 const std::string& name = i->first;
1115 const pool_opts_t::opt_desc_t& desc = i->second;
1116 pool_opts_t::opts_t::const_iterator j = opts.opts.find(desc.key);
1117 if (j == opts.opts.end()) {
1118 continue;
1119 }
1120 out << " " << name << " " << j->second;
1121 }
1122 return out;
1123 }
1124
1125 // -- pg_pool_t --
1126
1127 void pg_pool_t::dump(Formatter *f) const
1128 {
1129 f->dump_unsigned("flags", get_flags());
1130 f->dump_string("flags_names", get_flags_string());
1131 f->dump_int("type", get_type());
1132 f->dump_int("size", get_size());
1133 f->dump_int("min_size", get_min_size());
1134 f->dump_int("crush_ruleset", get_crush_ruleset());
1135 f->dump_int("object_hash", get_object_hash());
1136 f->dump_unsigned("pg_num", get_pg_num());
1137 f->dump_unsigned("pg_placement_num", get_pgp_num());
1138 f->dump_unsigned("crash_replay_interval", get_crash_replay_interval());
1139 f->dump_stream("last_change") << get_last_change();
1140 f->dump_stream("last_force_op_resend") << get_last_force_op_resend();
1141 f->dump_stream("last_force_op_resend_preluminous")
1142 << get_last_force_op_resend_preluminous();
1143 f->dump_unsigned("auid", get_auid());
1144 f->dump_string("snap_mode", is_pool_snaps_mode() ? "pool" : "selfmanaged");
1145 f->dump_unsigned("snap_seq", get_snap_seq());
1146 f->dump_unsigned("snap_epoch", get_snap_epoch());
1147 f->open_array_section("pool_snaps");
1148 for (map<snapid_t, pool_snap_info_t>::const_iterator p = snaps.begin(); p != snaps.end(); ++p) {
1149 f->open_object_section("pool_snap_info");
1150 p->second.dump(f);
1151 f->close_section();
1152 }
1153 f->close_section();
1154 f->dump_stream("removed_snaps") << removed_snaps;
1155 f->dump_unsigned("quota_max_bytes", quota_max_bytes);
1156 f->dump_unsigned("quota_max_objects", quota_max_objects);
1157 f->open_array_section("tiers");
1158 for (set<uint64_t>::const_iterator p = tiers.begin(); p != tiers.end(); ++p)
1159 f->dump_unsigned("pool_id", *p);
1160 f->close_section();
1161 f->dump_int("tier_of", tier_of);
1162 f->dump_int("read_tier", read_tier);
1163 f->dump_int("write_tier", write_tier);
1164 f->dump_string("cache_mode", get_cache_mode_name());
1165 f->dump_unsigned("target_max_bytes", target_max_bytes);
1166 f->dump_unsigned("target_max_objects", target_max_objects);
1167 f->dump_unsigned("cache_target_dirty_ratio_micro",
1168 cache_target_dirty_ratio_micro);
1169 f->dump_unsigned("cache_target_dirty_high_ratio_micro",
1170 cache_target_dirty_high_ratio_micro);
1171 f->dump_unsigned("cache_target_full_ratio_micro",
1172 cache_target_full_ratio_micro);
1173 f->dump_unsigned("cache_min_flush_age", cache_min_flush_age);
1174 f->dump_unsigned("cache_min_evict_age", cache_min_evict_age);
1175 f->dump_string("erasure_code_profile", erasure_code_profile);
1176 f->open_object_section("hit_set_params");
1177 hit_set_params.dump(f);
1178 f->close_section(); // hit_set_params
1179 f->dump_unsigned("hit_set_period", hit_set_period);
1180 f->dump_unsigned("hit_set_count", hit_set_count);
1181 f->dump_bool("use_gmt_hitset", use_gmt_hitset);
1182 f->dump_unsigned("min_read_recency_for_promote", min_read_recency_for_promote);
1183 f->dump_unsigned("min_write_recency_for_promote", min_write_recency_for_promote);
1184 f->dump_unsigned("hit_set_grade_decay_rate", hit_set_grade_decay_rate);
1185 f->dump_unsigned("hit_set_search_last_n", hit_set_search_last_n);
1186 f->open_array_section("grade_table");
1187 for (unsigned i = 0; i < hit_set_count; ++i)
1188 f->dump_unsigned("value", get_grade(i));
1189 f->close_section();
1190 f->dump_unsigned("stripe_width", get_stripe_width());
1191 f->dump_unsigned("expected_num_objects", expected_num_objects);
1192 f->dump_bool("fast_read", fast_read);
1193 f->open_object_section("options");
1194 opts.dump(f);
1195 f->close_section(); // options
1196 }
1197
1198 void pg_pool_t::convert_to_pg_shards(const vector<int> &from, set<pg_shard_t>* to) const {
1199 for (size_t i = 0; i < from.size(); ++i) {
1200 if (from[i] != CRUSH_ITEM_NONE) {
1201 to->insert(
1202 pg_shard_t(
1203 from[i],
1204 ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
1205 }
1206 }
1207 }
1208
1209 void pg_pool_t::calc_pg_masks()
1210 {
1211 pg_num_mask = (1 << cbits(pg_num-1)) - 1;
1212 pgp_num_mask = (1 << cbits(pgp_num-1)) - 1;
1213 }
1214
1215 unsigned pg_pool_t::get_pg_num_divisor(pg_t pgid) const
1216 {
1217 if (pg_num == pg_num_mask + 1)
1218 return pg_num; // power-of-2 split
1219 unsigned mask = pg_num_mask >> 1;
1220 if ((pgid.ps() & mask) < (pg_num & mask))
1221 return pg_num_mask + 1; // smaller bin size (already split)
1222 else
1223 return (pg_num_mask + 1) >> 1; // bigger bin (not yet split)
1224 }
1225
1226 /*
1227 * we have two snap modes:
1228 * - pool global snaps
1229 * - snap existence/non-existence defined by snaps[] and snap_seq
1230 * - user managed snaps
1231 * - removal governed by removed_snaps
1232 *
1233 * we know which mode we're using based on whether removed_snaps is empty.
1234 */
1235 bool pg_pool_t::is_pool_snaps_mode() const
1236 {
1237 return removed_snaps.empty() && get_snap_seq() > 0;
1238 }
1239
1240 bool pg_pool_t::is_unmanaged_snaps_mode() const
1241 {
1242 return removed_snaps.size() && get_snap_seq() > 0;
1243 }
1244
1245 bool pg_pool_t::is_removed_snap(snapid_t s) const
1246 {
1247 if (is_pool_snaps_mode())
1248 return s <= get_snap_seq() && snaps.count(s) == 0;
1249 else
1250 return removed_snaps.contains(s);
1251 }
1252
1253 /*
1254 * build set of known-removed sets from either pool snaps or
1255 * explicit removed_snaps set.
1256 */
1257 void pg_pool_t::build_removed_snaps(interval_set<snapid_t>& rs) const
1258 {
1259 if (is_pool_snaps_mode()) {
1260 rs.clear();
1261 for (snapid_t s = 1; s <= get_snap_seq(); s = s + 1)
1262 if (snaps.count(s) == 0)
1263 rs.insert(s);
1264 } else {
1265 rs = removed_snaps;
1266 }
1267 }
1268
1269 snapid_t pg_pool_t::snap_exists(const char *s) const
1270 {
1271 for (map<snapid_t,pool_snap_info_t>::const_iterator p = snaps.begin();
1272 p != snaps.end();
1273 ++p)
1274 if (p->second.name == s)
1275 return p->second.snapid;
1276 return 0;
1277 }
1278
1279 void pg_pool_t::add_snap(const char *n, utime_t stamp)
1280 {
1281 assert(!is_unmanaged_snaps_mode());
1282 snapid_t s = get_snap_seq() + 1;
1283 snap_seq = s;
1284 snaps[s].snapid = s;
1285 snaps[s].name = n;
1286 snaps[s].stamp = stamp;
1287 }
1288
1289 void pg_pool_t::add_unmanaged_snap(uint64_t& snapid)
1290 {
1291 if (removed_snaps.empty()) {
1292 assert(!is_pool_snaps_mode());
1293 removed_snaps.insert(snapid_t(1));
1294 snap_seq = 1;
1295 }
1296 snapid = snap_seq = snap_seq + 1;
1297 }
1298
1299 void pg_pool_t::remove_snap(snapid_t s)
1300 {
1301 assert(snaps.count(s));
1302 snaps.erase(s);
1303 snap_seq = snap_seq + 1;
1304 }
1305
1306 void pg_pool_t::remove_unmanaged_snap(snapid_t s)
1307 {
1308 assert(is_unmanaged_snaps_mode());
1309 removed_snaps.insert(s);
1310 snap_seq = snap_seq + 1;
1311 removed_snaps.insert(get_snap_seq());
1312 }
1313
1314 SnapContext pg_pool_t::get_snap_context() const
1315 {
1316 vector<snapid_t> s(snaps.size());
1317 unsigned i = 0;
1318 for (map<snapid_t, pool_snap_info_t>::const_reverse_iterator p = snaps.rbegin();
1319 p != snaps.rend();
1320 ++p)
1321 s[i++] = p->first;
1322 return SnapContext(get_snap_seq(), s);
1323 }
1324
1325 uint32_t pg_pool_t::hash_key(const string& key, const string& ns) const
1326 {
1327 if (ns.empty())
1328 return ceph_str_hash(object_hash, key.data(), key.length());
1329 int nsl = ns.length();
1330 int len = key.length() + nsl + 1;
1331 char buf[len];
1332 memcpy(&buf[0], ns.data(), nsl);
1333 buf[nsl] = '\037';
1334 memcpy(&buf[nsl+1], key.data(), key.length());
1335 return ceph_str_hash(object_hash, &buf[0], len);
1336 }
1337
1338 uint32_t pg_pool_t::raw_hash_to_pg(uint32_t v) const
1339 {
1340 return ceph_stable_mod(v, pg_num, pg_num_mask);
1341 }
1342
1343 /*
1344 * map a raw pg (with full precision ps) into an actual pg, for storage
1345 */
1346 pg_t pg_pool_t::raw_pg_to_pg(pg_t pg) const
1347 {
1348 pg.set_ps(ceph_stable_mod(pg.ps(), pg_num, pg_num_mask));
1349 return pg;
1350 }
1351
1352 /*
1353 * map raw pg (full precision ps) into a placement seed. include
1354 * pool id in that value so that different pools don't use the same
1355 * seeds.
1356 */
1357 ps_t pg_pool_t::raw_pg_to_pps(pg_t pg) const
1358 {
1359 if (flags & FLAG_HASHPSPOOL) {
1360 // Hash the pool id so that pool PGs do not overlap.
1361 return
1362 crush_hash32_2(CRUSH_HASH_RJENKINS1,
1363 ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask),
1364 pg.pool());
1365 } else {
1366 // Legacy behavior; add ps and pool together. This is not a great
1367 // idea because the PGs from each pool will essentially overlap on
1368 // top of each other: 0.5 == 1.4 == 2.3 == ...
1369 return
1370 ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask) +
1371 pg.pool();
1372 }
1373 }
1374
1375 uint32_t pg_pool_t::get_random_pg_position(pg_t pg, uint32_t seed) const
1376 {
1377 uint32_t r = crush_hash32_2(CRUSH_HASH_RJENKINS1, seed, 123);
1378 if (pg_num == pg_num_mask + 1) {
1379 r &= ~pg_num_mask;
1380 } else {
1381 unsigned smaller_mask = pg_num_mask >> 1;
1382 if ((pg.ps() & smaller_mask) < (pg_num & smaller_mask)) {
1383 r &= ~pg_num_mask;
1384 } else {
1385 r &= ~smaller_mask;
1386 }
1387 }
1388 r |= pg.ps();
1389 return r;
1390 }
1391
1392 void pg_pool_t::encode(bufferlist& bl, uint64_t features) const
1393 {
1394 if ((features & CEPH_FEATURE_PGPOOL3) == 0) {
1395 // this encoding matches the old struct ceph_pg_pool
1396 __u8 struct_v = 2;
1397 ::encode(struct_v, bl);
1398 ::encode(type, bl);
1399 ::encode(size, bl);
1400 ::encode(crush_ruleset, bl);
1401 ::encode(object_hash, bl);
1402 ::encode(pg_num, bl);
1403 ::encode(pgp_num, bl);
1404 __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs.
1405 ::encode(lpg_num, bl);
1406 ::encode(lpgp_num, bl);
1407 ::encode(last_change, bl);
1408 ::encode(snap_seq, bl);
1409 ::encode(snap_epoch, bl);
1410
1411 __u32 n = snaps.size();
1412 ::encode(n, bl);
1413 n = removed_snaps.num_intervals();
1414 ::encode(n, bl);
1415
1416 ::encode(auid, bl);
1417
1418 ::encode_nohead(snaps, bl, features);
1419 ::encode_nohead(removed_snaps, bl);
1420 return;
1421 }
1422
1423 if ((features & CEPH_FEATURE_OSDENC) == 0) {
1424 __u8 struct_v = 4;
1425 ::encode(struct_v, bl);
1426 ::encode(type, bl);
1427 ::encode(size, bl);
1428 ::encode(crush_ruleset, bl);
1429 ::encode(object_hash, bl);
1430 ::encode(pg_num, bl);
1431 ::encode(pgp_num, bl);
1432 __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs.
1433 ::encode(lpg_num, bl);
1434 ::encode(lpgp_num, bl);
1435 ::encode(last_change, bl);
1436 ::encode(snap_seq, bl);
1437 ::encode(snap_epoch, bl);
1438 ::encode(snaps, bl, features);
1439 ::encode(removed_snaps, bl);
1440 ::encode(auid, bl);
1441 ::encode(flags, bl);
1442 ::encode(crash_replay_interval, bl);
1443 return;
1444 }
1445
1446 if ((features & CEPH_FEATURE_OSD_POOLRESEND) == 0) {
1447 // we simply added last_force_op_resend here, which is a fully
1448 // backward compatible change. however, encoding the same map
1449 // differently between monitors triggers scrub noise (even though
1450 // they are decodable without the feature), so let's be pendantic
1451 // about it.
1452 ENCODE_START(14, 5, bl);
1453 ::encode(type, bl);
1454 ::encode(size, bl);
1455 ::encode(crush_ruleset, bl);
1456 ::encode(object_hash, bl);
1457 ::encode(pg_num, bl);
1458 ::encode(pgp_num, bl);
1459 __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs.
1460 ::encode(lpg_num, bl);
1461 ::encode(lpgp_num, bl);
1462 ::encode(last_change, bl);
1463 ::encode(snap_seq, bl);
1464 ::encode(snap_epoch, bl);
1465 ::encode(snaps, bl, features);
1466 ::encode(removed_snaps, bl);
1467 ::encode(auid, bl);
1468 ::encode(flags, bl);
1469 ::encode(crash_replay_interval, bl);
1470 ::encode(min_size, bl);
1471 ::encode(quota_max_bytes, bl);
1472 ::encode(quota_max_objects, bl);
1473 ::encode(tiers, bl);
1474 ::encode(tier_of, bl);
1475 __u8 c = cache_mode;
1476 ::encode(c, bl);
1477 ::encode(read_tier, bl);
1478 ::encode(write_tier, bl);
1479 ::encode(properties, bl);
1480 ::encode(hit_set_params, bl);
1481 ::encode(hit_set_period, bl);
1482 ::encode(hit_set_count, bl);
1483 ::encode(stripe_width, bl);
1484 ::encode(target_max_bytes, bl);
1485 ::encode(target_max_objects, bl);
1486 ::encode(cache_target_dirty_ratio_micro, bl);
1487 ::encode(cache_target_full_ratio_micro, bl);
1488 ::encode(cache_min_flush_age, bl);
1489 ::encode(cache_min_evict_age, bl);
1490 ::encode(erasure_code_profile, bl);
1491 ENCODE_FINISH(bl);
1492 return;
1493 }
1494
1495 uint8_t v = 25;
1496 if (!(features & CEPH_FEATURE_NEW_OSDOP_ENCODING)) {
1497 // this was the first post-hammer thing we added; if it's missing, encode
1498 // like hammer.
1499 v = 21;
1500 }
1501 if ((features &
1502 (CEPH_FEATURE_RESEND_ON_SPLIT|CEPH_FEATURE_SERVER_JEWEL)) !=
1503 (CEPH_FEATURE_RESEND_ON_SPLIT|CEPH_FEATURE_SERVER_JEWEL)) {
1504 v = 24;
1505 }
1506
1507 ENCODE_START(v, 5, bl);
1508 ::encode(type, bl);
1509 ::encode(size, bl);
1510 ::encode(crush_ruleset, bl);
1511 ::encode(object_hash, bl);
1512 ::encode(pg_num, bl);
1513 ::encode(pgp_num, bl);
1514 __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs.
1515 ::encode(lpg_num, bl);
1516 ::encode(lpgp_num, bl);
1517 ::encode(last_change, bl);
1518 ::encode(snap_seq, bl);
1519 ::encode(snap_epoch, bl);
1520 ::encode(snaps, bl, features);
1521 ::encode(removed_snaps, bl);
1522 ::encode(auid, bl);
1523 ::encode(flags, bl);
1524 ::encode(crash_replay_interval, bl);
1525 ::encode(min_size, bl);
1526 ::encode(quota_max_bytes, bl);
1527 ::encode(quota_max_objects, bl);
1528 ::encode(tiers, bl);
1529 ::encode(tier_of, bl);
1530 __u8 c = cache_mode;
1531 ::encode(c, bl);
1532 ::encode(read_tier, bl);
1533 ::encode(write_tier, bl);
1534 ::encode(properties, bl);
1535 ::encode(hit_set_params, bl);
1536 ::encode(hit_set_period, bl);
1537 ::encode(hit_set_count, bl);
1538 ::encode(stripe_width, bl);
1539 ::encode(target_max_bytes, bl);
1540 ::encode(target_max_objects, bl);
1541 ::encode(cache_target_dirty_ratio_micro, bl);
1542 ::encode(cache_target_full_ratio_micro, bl);
1543 ::encode(cache_min_flush_age, bl);
1544 ::encode(cache_min_evict_age, bl);
1545 ::encode(erasure_code_profile, bl);
1546 ::encode(last_force_op_resend_preluminous, bl);
1547 ::encode(min_read_recency_for_promote, bl);
1548 ::encode(expected_num_objects, bl);
1549 if (v >= 19) {
1550 ::encode(cache_target_dirty_high_ratio_micro, bl);
1551 }
1552 if (v >= 20) {
1553 ::encode(min_write_recency_for_promote, bl);
1554 }
1555 if (v >= 21) {
1556 ::encode(use_gmt_hitset, bl);
1557 }
1558 if (v >= 22) {
1559 ::encode(fast_read, bl);
1560 }
1561 if (v >= 23) {
1562 ::encode(hit_set_grade_decay_rate, bl);
1563 ::encode(hit_set_search_last_n, bl);
1564 }
1565 if (v >= 24) {
1566 ::encode(opts, bl);
1567 }
1568 if (v >= 25) {
1569 ::encode(last_force_op_resend, bl);
1570 }
1571 ENCODE_FINISH(bl);
1572 }
1573
1574 void pg_pool_t::decode(bufferlist::iterator& bl)
1575 {
1576 DECODE_START_LEGACY_COMPAT_LEN(24, 5, 5, bl);
1577 ::decode(type, bl);
1578 ::decode(size, bl);
1579 ::decode(crush_ruleset, bl);
1580 ::decode(object_hash, bl);
1581 ::decode(pg_num, bl);
1582 ::decode(pgp_num, bl);
1583 {
1584 __u32 lpg_num, lpgp_num;
1585 ::decode(lpg_num, bl);
1586 ::decode(lpgp_num, bl);
1587 }
1588 ::decode(last_change, bl);
1589 ::decode(snap_seq, bl);
1590 ::decode(snap_epoch, bl);
1591
1592 if (struct_v >= 3) {
1593 ::decode(snaps, bl);
1594 ::decode(removed_snaps, bl);
1595 ::decode(auid, bl);
1596 } else {
1597 __u32 n, m;
1598 ::decode(n, bl);
1599 ::decode(m, bl);
1600 ::decode(auid, bl);
1601 ::decode_nohead(n, snaps, bl);
1602 ::decode_nohead(m, removed_snaps, bl);
1603 }
1604
1605 if (struct_v >= 4) {
1606 ::decode(flags, bl);
1607 ::decode(crash_replay_interval, bl);
1608 } else {
1609 flags = 0;
1610
1611 // if this looks like the 'data' pool, set the
1612 // crash_replay_interval appropriately. unfortunately, we can't
1613 // be precise here. this should be good enough to preserve replay
1614 // on the data pool for the majority of cluster upgrades, though.
1615 if (crush_ruleset == 0 && auid == 0)
1616 crash_replay_interval = 60;
1617 else
1618 crash_replay_interval = 0;
1619 }
1620 if (struct_v >= 7) {
1621 ::decode(min_size, bl);
1622 } else {
1623 min_size = size - size/2;
1624 }
1625 if (struct_v >= 8) {
1626 ::decode(quota_max_bytes, bl);
1627 ::decode(quota_max_objects, bl);
1628 }
1629 if (struct_v >= 9) {
1630 ::decode(tiers, bl);
1631 ::decode(tier_of, bl);
1632 __u8 v;
1633 ::decode(v, bl);
1634 cache_mode = (cache_mode_t)v;
1635 ::decode(read_tier, bl);
1636 ::decode(write_tier, bl);
1637 }
1638 if (struct_v >= 10) {
1639 ::decode(properties, bl);
1640 }
1641 if (struct_v >= 11) {
1642 ::decode(hit_set_params, bl);
1643 ::decode(hit_set_period, bl);
1644 ::decode(hit_set_count, bl);
1645 } else {
1646 pg_pool_t def;
1647 hit_set_period = def.hit_set_period;
1648 hit_set_count = def.hit_set_count;
1649 }
1650 if (struct_v >= 12) {
1651 ::decode(stripe_width, bl);
1652 } else {
1653 set_stripe_width(0);
1654 }
1655 if (struct_v >= 13) {
1656 ::decode(target_max_bytes, bl);
1657 ::decode(target_max_objects, bl);
1658 ::decode(cache_target_dirty_ratio_micro, bl);
1659 ::decode(cache_target_full_ratio_micro, bl);
1660 ::decode(cache_min_flush_age, bl);
1661 ::decode(cache_min_evict_age, bl);
1662 } else {
1663 target_max_bytes = 0;
1664 target_max_objects = 0;
1665 cache_target_dirty_ratio_micro = 0;
1666 cache_target_full_ratio_micro = 0;
1667 cache_min_flush_age = 0;
1668 cache_min_evict_age = 0;
1669 }
1670 if (struct_v >= 14) {
1671 ::decode(erasure_code_profile, bl);
1672 }
1673 if (struct_v >= 15) {
1674 ::decode(last_force_op_resend_preluminous, bl);
1675 } else {
1676 last_force_op_resend_preluminous = 0;
1677 }
1678 if (struct_v >= 16) {
1679 ::decode(min_read_recency_for_promote, bl);
1680 } else {
1681 min_read_recency_for_promote = 1;
1682 }
1683 if (struct_v >= 17) {
1684 ::decode(expected_num_objects, bl);
1685 } else {
1686 expected_num_objects = 0;
1687 }
1688 if (struct_v >= 19) {
1689 ::decode(cache_target_dirty_high_ratio_micro, bl);
1690 } else {
1691 cache_target_dirty_high_ratio_micro = cache_target_dirty_ratio_micro;
1692 }
1693 if (struct_v >= 20) {
1694 ::decode(min_write_recency_for_promote, bl);
1695 } else {
1696 min_write_recency_for_promote = 1;
1697 }
1698 if (struct_v >= 21) {
1699 ::decode(use_gmt_hitset, bl);
1700 } else {
1701 use_gmt_hitset = false;
1702 }
1703 if (struct_v >= 22) {
1704 ::decode(fast_read, bl);
1705 } else {
1706 fast_read = false;
1707 }
1708 if (struct_v >= 23) {
1709 ::decode(hit_set_grade_decay_rate, bl);
1710 ::decode(hit_set_search_last_n, bl);
1711 } else {
1712 hit_set_grade_decay_rate = 0;
1713 hit_set_search_last_n = 1;
1714 }
1715 if (struct_v >= 24) {
1716 ::decode(opts, bl);
1717 }
1718 if (struct_v >= 25) {
1719 ::decode(last_force_op_resend, bl);
1720 } else {
1721 last_force_op_resend = last_force_op_resend_preluminous;
1722 }
1723 DECODE_FINISH(bl);
1724 calc_pg_masks();
1725 calc_grade_table();
1726 }
1727
1728 void pg_pool_t::generate_test_instances(list<pg_pool_t*>& o)
1729 {
1730 pg_pool_t a;
1731 o.push_back(new pg_pool_t(a));
1732
1733 a.type = TYPE_REPLICATED;
1734 a.size = 2;
1735 a.crush_ruleset = 3;
1736 a.object_hash = 4;
1737 a.pg_num = 6;
1738 a.pgp_num = 5;
1739 a.last_change = 9;
1740 a.last_force_op_resend = 123823;
1741 a.last_force_op_resend_preluminous = 123824;
1742 a.snap_seq = 10;
1743 a.snap_epoch = 11;
1744 a.auid = 12;
1745 a.crash_replay_interval = 13;
1746 a.quota_max_bytes = 473;
1747 a.quota_max_objects = 474;
1748 o.push_back(new pg_pool_t(a));
1749
1750 a.snaps[3].name = "asdf";
1751 a.snaps[3].snapid = 3;
1752 a.snaps[3].stamp = utime_t(123, 4);
1753 a.snaps[6].name = "qwer";
1754 a.snaps[6].snapid = 6;
1755 a.snaps[6].stamp = utime_t(23423, 4);
1756 o.push_back(new pg_pool_t(a));
1757
1758 a.removed_snaps.insert(2); // not quite valid to combine with snaps!
1759 a.quota_max_bytes = 2473;
1760 a.quota_max_objects = 4374;
1761 a.tiers.insert(0);
1762 a.tiers.insert(1);
1763 a.tier_of = 2;
1764 a.cache_mode = CACHEMODE_WRITEBACK;
1765 a.read_tier = 1;
1766 a.write_tier = 1;
1767 a.hit_set_params = HitSet::Params(new BloomHitSet::Params);
1768 a.hit_set_period = 3600;
1769 a.hit_set_count = 8;
1770 a.min_read_recency_for_promote = 1;
1771 a.min_write_recency_for_promote = 1;
1772 a.hit_set_grade_decay_rate = 50;
1773 a.hit_set_search_last_n = 1;
1774 a.calc_grade_table();
1775 a.set_stripe_width(12345);
1776 a.target_max_bytes = 1238132132;
1777 a.target_max_objects = 1232132;
1778 a.cache_target_dirty_ratio_micro = 187232;
1779 a.cache_target_dirty_high_ratio_micro = 309856;
1780 a.cache_target_full_ratio_micro = 987222;
1781 a.cache_min_flush_age = 231;
1782 a.cache_min_evict_age = 2321;
1783 a.erasure_code_profile = "profile in osdmap";
1784 a.expected_num_objects = 123456;
1785 a.fast_read = false;
1786 o.push_back(new pg_pool_t(a));
1787 }
1788
1789 ostream& operator<<(ostream& out, const pg_pool_t& p)
1790 {
1791 out << p.get_type_name()
1792 << " size " << p.get_size()
1793 << " min_size " << p.get_min_size()
1794 << " crush_ruleset " << p.get_crush_ruleset()
1795 << " object_hash " << p.get_object_hash_name()
1796 << " pg_num " << p.get_pg_num()
1797 << " pgp_num " << p.get_pgp_num()
1798 << " last_change " << p.get_last_change();
1799 if (p.get_last_force_op_resend() ||
1800 p.get_last_force_op_resend_preluminous())
1801 out << " lfor " << p.get_last_force_op_resend() << "/"
1802 << p.get_last_force_op_resend_preluminous();
1803 if (p.get_auid())
1804 out << " owner " << p.get_auid();
1805 if (p.flags)
1806 out << " flags " << p.get_flags_string();
1807 if (p.crash_replay_interval)
1808 out << " crash_replay_interval " << p.crash_replay_interval;
1809 if (p.quota_max_bytes)
1810 out << " max_bytes " << p.quota_max_bytes;
1811 if (p.quota_max_objects)
1812 out << " max_objects " << p.quota_max_objects;
1813 if (!p.tiers.empty())
1814 out << " tiers " << p.tiers;
1815 if (p.is_tier())
1816 out << " tier_of " << p.tier_of;
1817 if (p.has_read_tier())
1818 out << " read_tier " << p.read_tier;
1819 if (p.has_write_tier())
1820 out << " write_tier " << p.write_tier;
1821 if (p.cache_mode)
1822 out << " cache_mode " << p.get_cache_mode_name();
1823 if (p.target_max_bytes)
1824 out << " target_bytes " << p.target_max_bytes;
1825 if (p.target_max_objects)
1826 out << " target_objects " << p.target_max_objects;
1827 if (p.hit_set_params.get_type() != HitSet::TYPE_NONE) {
1828 out << " hit_set " << p.hit_set_params
1829 << " " << p.hit_set_period << "s"
1830 << " x" << p.hit_set_count << " decay_rate "
1831 << p.hit_set_grade_decay_rate
1832 << " search_last_n " << p.hit_set_search_last_n;
1833 }
1834 if (p.min_read_recency_for_promote)
1835 out << " min_read_recency_for_promote " << p.min_read_recency_for_promote;
1836 if (p.min_write_recency_for_promote)
1837 out << " min_write_recency_for_promote " << p.min_write_recency_for_promote;
1838 out << " stripe_width " << p.get_stripe_width();
1839 if (p.expected_num_objects)
1840 out << " expected_num_objects " << p.expected_num_objects;
1841 if (p.fast_read)
1842 out << " fast_read " << p.fast_read;
1843 out << p.opts;
1844 return out;
1845 }
1846
1847
1848 // -- object_stat_sum_t --
1849
1850 void object_stat_sum_t::dump(Formatter *f) const
1851 {
1852 f->dump_int("num_bytes", num_bytes);
1853 f->dump_int("num_objects", num_objects);
1854 f->dump_int("num_object_clones", num_object_clones);
1855 f->dump_int("num_object_copies", num_object_copies);
1856 f->dump_int("num_objects_missing_on_primary", num_objects_missing_on_primary);
1857 f->dump_int("num_objects_missing", num_objects_missing);
1858 f->dump_int("num_objects_degraded", num_objects_degraded);
1859 f->dump_int("num_objects_misplaced", num_objects_misplaced);
1860 f->dump_int("num_objects_unfound", num_objects_unfound);
1861 f->dump_int("num_objects_dirty", num_objects_dirty);
1862 f->dump_int("num_whiteouts", num_whiteouts);
1863 f->dump_int("num_read", num_rd);
1864 f->dump_int("num_read_kb", num_rd_kb);
1865 f->dump_int("num_write", num_wr);
1866 f->dump_int("num_write_kb", num_wr_kb);
1867 f->dump_int("num_scrub_errors", num_scrub_errors);
1868 f->dump_int("num_shallow_scrub_errors", num_shallow_scrub_errors);
1869 f->dump_int("num_deep_scrub_errors", num_deep_scrub_errors);
1870 f->dump_int("num_objects_recovered", num_objects_recovered);
1871 f->dump_int("num_bytes_recovered", num_bytes_recovered);
1872 f->dump_int("num_keys_recovered", num_keys_recovered);
1873 f->dump_int("num_objects_omap", num_objects_omap);
1874 f->dump_int("num_objects_hit_set_archive", num_objects_hit_set_archive);
1875 f->dump_int("num_bytes_hit_set_archive", num_bytes_hit_set_archive);
1876 f->dump_int("num_flush", num_flush);
1877 f->dump_int("num_flush_kb", num_flush_kb);
1878 f->dump_int("num_evict", num_evict);
1879 f->dump_int("num_evict_kb", num_evict_kb);
1880 f->dump_int("num_promote", num_promote);
1881 f->dump_int("num_flush_mode_high", num_flush_mode_high);
1882 f->dump_int("num_flush_mode_low", num_flush_mode_low);
1883 f->dump_int("num_evict_mode_some", num_evict_mode_some);
1884 f->dump_int("num_evict_mode_full", num_evict_mode_full);
1885 f->dump_int("num_objects_pinned", num_objects_pinned);
1886 f->dump_int("num_legacy_snapsets", num_legacy_snapsets);
1887 }
1888
1889 void object_stat_sum_t::encode(bufferlist& bl) const
1890 {
1891 ENCODE_START(16, 14, bl);
1892 #if defined(CEPH_LITTLE_ENDIAN)
1893 bl.append((char *)(&num_bytes), sizeof(object_stat_sum_t));
1894 #else
1895 ::encode(num_bytes, bl);
1896 ::encode(num_objects, bl);
1897 ::encode(num_object_clones, bl);
1898 ::encode(num_object_copies, bl);
1899 ::encode(num_objects_missing_on_primary, bl);
1900 ::encode(num_objects_degraded, bl);
1901 ::encode(num_objects_unfound, bl);
1902 ::encode(num_rd, bl);
1903 ::encode(num_rd_kb, bl);
1904 ::encode(num_wr, bl);
1905 ::encode(num_wr_kb, bl);
1906 ::encode(num_scrub_errors, bl);
1907 ::encode(num_objects_recovered, bl);
1908 ::encode(num_bytes_recovered, bl);
1909 ::encode(num_keys_recovered, bl);
1910 ::encode(num_shallow_scrub_errors, bl);
1911 ::encode(num_deep_scrub_errors, bl);
1912 ::encode(num_objects_dirty, bl);
1913 ::encode(num_whiteouts, bl);
1914 ::encode(num_objects_omap, bl);
1915 ::encode(num_objects_hit_set_archive, bl);
1916 ::encode(num_objects_misplaced, bl);
1917 ::encode(num_bytes_hit_set_archive, bl);
1918 ::encode(num_flush, bl);
1919 ::encode(num_flush_kb, bl);
1920 ::encode(num_evict, bl);
1921 ::encode(num_evict_kb, bl);
1922 ::encode(num_promote, bl);
1923 ::encode(num_flush_mode_high, bl);
1924 ::encode(num_flush_mode_low, bl);
1925 ::encode(num_evict_mode_some, bl);
1926 ::encode(num_evict_mode_full, bl);
1927 ::encode(num_objects_pinned, bl);
1928 ::encode(num_objects_missing, bl);
1929 ::encode(num_legacy_snapsets, bl);
1930 #endif
1931 ENCODE_FINISH(bl);
1932 }
1933
1934 void object_stat_sum_t::decode(bufferlist::iterator& bl)
1935 {
1936 bool decode_finish = false;
1937 DECODE_START(16, bl);
1938 #if defined(CEPH_LITTLE_ENDIAN)
1939 if (struct_v >= 16) {
1940 bl.copy(sizeof(object_stat_sum_t), (char*)(&num_bytes));
1941 decode_finish = true;
1942 }
1943 #endif
1944 if (!decode_finish) {
1945 ::decode(num_bytes, bl);
1946 ::decode(num_objects, bl);
1947 ::decode(num_object_clones, bl);
1948 ::decode(num_object_copies, bl);
1949 ::decode(num_objects_missing_on_primary, bl);
1950 ::decode(num_objects_degraded, bl);
1951 ::decode(num_objects_unfound, bl);
1952 ::decode(num_rd, bl);
1953 ::decode(num_rd_kb, bl);
1954 ::decode(num_wr, bl);
1955 ::decode(num_wr_kb, bl);
1956 ::decode(num_scrub_errors, bl);
1957 ::decode(num_objects_recovered, bl);
1958 ::decode(num_bytes_recovered, bl);
1959 ::decode(num_keys_recovered, bl);
1960 ::decode(num_shallow_scrub_errors, bl);
1961 ::decode(num_deep_scrub_errors, bl);
1962 ::decode(num_objects_dirty, bl);
1963 ::decode(num_whiteouts, bl);
1964 ::decode(num_objects_omap, bl);
1965 ::decode(num_objects_hit_set_archive, bl);
1966 ::decode(num_objects_misplaced, bl);
1967 ::decode(num_bytes_hit_set_archive, bl);
1968 ::decode(num_flush, bl);
1969 ::decode(num_flush_kb, bl);
1970 ::decode(num_evict, bl);
1971 ::decode(num_evict_kb, bl);
1972 ::decode(num_promote, bl);
1973 ::decode(num_flush_mode_high, bl);
1974 ::decode(num_flush_mode_low, bl);
1975 ::decode(num_evict_mode_some, bl);
1976 ::decode(num_evict_mode_full, bl);
1977 ::decode(num_objects_pinned, bl);
1978 ::decode(num_objects_missing, bl);
1979 if (struct_v >= 16) {
1980 ::decode(num_legacy_snapsets, bl);
1981 } else {
1982 num_legacy_snapsets = num_object_clones; // upper bound
1983 }
1984 }
1985 DECODE_FINISH(bl);
1986 }
1987
1988 void object_stat_sum_t::generate_test_instances(list<object_stat_sum_t*>& o)
1989 {
1990 object_stat_sum_t a;
1991
1992 a.num_bytes = 1;
1993 a.num_objects = 3;
1994 a.num_object_clones = 4;
1995 a.num_object_copies = 5;
1996 a.num_objects_missing_on_primary = 6;
1997 a.num_objects_missing = 123;
1998 a.num_objects_degraded = 7;
1999 a.num_objects_unfound = 8;
2000 a.num_rd = 9; a.num_rd_kb = 10;
2001 a.num_wr = 11; a.num_wr_kb = 12;
2002 a.num_objects_recovered = 14;
2003 a.num_bytes_recovered = 15;
2004 a.num_keys_recovered = 16;
2005 a.num_deep_scrub_errors = 17;
2006 a.num_shallow_scrub_errors = 18;
2007 a.num_scrub_errors = a.num_deep_scrub_errors + a.num_shallow_scrub_errors;
2008 a.num_objects_dirty = 21;
2009 a.num_whiteouts = 22;
2010 a.num_objects_misplaced = 1232;
2011 a.num_objects_hit_set_archive = 2;
2012 a.num_bytes_hit_set_archive = 27;
2013 a.num_flush = 5;
2014 a.num_flush_kb = 6;
2015 a.num_evict = 7;
2016 a.num_evict_kb = 8;
2017 a.num_promote = 9;
2018 a.num_flush_mode_high = 0;
2019 a.num_flush_mode_low = 1;
2020 a.num_evict_mode_some = 1;
2021 a.num_evict_mode_full = 0;
2022 a.num_objects_pinned = 20;
2023 o.push_back(new object_stat_sum_t(a));
2024 }
2025
2026 void object_stat_sum_t::add(const object_stat_sum_t& o)
2027 {
2028 num_bytes += o.num_bytes;
2029 num_objects += o.num_objects;
2030 num_object_clones += o.num_object_clones;
2031 num_object_copies += o.num_object_copies;
2032 num_objects_missing_on_primary += o.num_objects_missing_on_primary;
2033 num_objects_missing += o.num_objects_missing;
2034 num_objects_degraded += o.num_objects_degraded;
2035 num_objects_misplaced += o.num_objects_misplaced;
2036 num_rd += o.num_rd;
2037 num_rd_kb += o.num_rd_kb;
2038 num_wr += o.num_wr;
2039 num_wr_kb += o.num_wr_kb;
2040 num_objects_unfound += o.num_objects_unfound;
2041 num_scrub_errors += o.num_scrub_errors;
2042 num_shallow_scrub_errors += o.num_shallow_scrub_errors;
2043 num_deep_scrub_errors += o.num_deep_scrub_errors;
2044 num_objects_recovered += o.num_objects_recovered;
2045 num_bytes_recovered += o.num_bytes_recovered;
2046 num_keys_recovered += o.num_keys_recovered;
2047 num_objects_dirty += o.num_objects_dirty;
2048 num_whiteouts += o.num_whiteouts;
2049 num_objects_omap += o.num_objects_omap;
2050 num_objects_hit_set_archive += o.num_objects_hit_set_archive;
2051 num_bytes_hit_set_archive += o.num_bytes_hit_set_archive;
2052 num_flush += o.num_flush;
2053 num_flush_kb += o.num_flush_kb;
2054 num_evict += o.num_evict;
2055 num_evict_kb += o.num_evict_kb;
2056 num_promote += o.num_promote;
2057 num_flush_mode_high += o.num_flush_mode_high;
2058 num_flush_mode_low += o.num_flush_mode_low;
2059 num_evict_mode_some += o.num_evict_mode_some;
2060 num_evict_mode_full += o.num_evict_mode_full;
2061 num_objects_pinned += o.num_objects_pinned;
2062 num_legacy_snapsets += o.num_legacy_snapsets;
2063 }
2064
2065 void object_stat_sum_t::sub(const object_stat_sum_t& o)
2066 {
2067 num_bytes -= o.num_bytes;
2068 num_objects -= o.num_objects;
2069 num_object_clones -= o.num_object_clones;
2070 num_object_copies -= o.num_object_copies;
2071 num_objects_missing_on_primary -= o.num_objects_missing_on_primary;
2072 num_objects_missing -= o.num_objects_missing;
2073 num_objects_degraded -= o.num_objects_degraded;
2074 num_objects_misplaced -= o.num_objects_misplaced;
2075 num_rd -= o.num_rd;
2076 num_rd_kb -= o.num_rd_kb;
2077 num_wr -= o.num_wr;
2078 num_wr_kb -= o.num_wr_kb;
2079 num_objects_unfound -= o.num_objects_unfound;
2080 num_scrub_errors -= o.num_scrub_errors;
2081 num_shallow_scrub_errors -= o.num_shallow_scrub_errors;
2082 num_deep_scrub_errors -= o.num_deep_scrub_errors;
2083 num_objects_recovered -= o.num_objects_recovered;
2084 num_bytes_recovered -= o.num_bytes_recovered;
2085 num_keys_recovered -= o.num_keys_recovered;
2086 num_objects_dirty -= o.num_objects_dirty;
2087 num_whiteouts -= o.num_whiteouts;
2088 num_objects_omap -= o.num_objects_omap;
2089 num_objects_hit_set_archive -= o.num_objects_hit_set_archive;
2090 num_bytes_hit_set_archive -= o.num_bytes_hit_set_archive;
2091 num_flush -= o.num_flush;
2092 num_flush_kb -= o.num_flush_kb;
2093 num_evict -= o.num_evict;
2094 num_evict_kb -= o.num_evict_kb;
2095 num_promote -= o.num_promote;
2096 num_flush_mode_high -= o.num_flush_mode_high;
2097 num_flush_mode_low -= o.num_flush_mode_low;
2098 num_evict_mode_some -= o.num_evict_mode_some;
2099 num_evict_mode_full -= o.num_evict_mode_full;
2100 num_objects_pinned -= o.num_objects_pinned;
2101 num_legacy_snapsets -= o.num_legacy_snapsets;
2102 }
2103
2104 bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r)
2105 {
2106 return
2107 l.num_bytes == r.num_bytes &&
2108 l.num_objects == r.num_objects &&
2109 l.num_object_clones == r.num_object_clones &&
2110 l.num_object_copies == r.num_object_copies &&
2111 l.num_objects_missing_on_primary == r.num_objects_missing_on_primary &&
2112 l.num_objects_missing == r.num_objects_missing &&
2113 l.num_objects_degraded == r.num_objects_degraded &&
2114 l.num_objects_misplaced == r.num_objects_misplaced &&
2115 l.num_objects_unfound == r.num_objects_unfound &&
2116 l.num_rd == r.num_rd &&
2117 l.num_rd_kb == r.num_rd_kb &&
2118 l.num_wr == r.num_wr &&
2119 l.num_wr_kb == r.num_wr_kb &&
2120 l.num_scrub_errors == r.num_scrub_errors &&
2121 l.num_shallow_scrub_errors == r.num_shallow_scrub_errors &&
2122 l.num_deep_scrub_errors == r.num_deep_scrub_errors &&
2123 l.num_objects_recovered == r.num_objects_recovered &&
2124 l.num_bytes_recovered == r.num_bytes_recovered &&
2125 l.num_keys_recovered == r.num_keys_recovered &&
2126 l.num_objects_dirty == r.num_objects_dirty &&
2127 l.num_whiteouts == r.num_whiteouts &&
2128 l.num_objects_omap == r.num_objects_omap &&
2129 l.num_objects_hit_set_archive == r.num_objects_hit_set_archive &&
2130 l.num_bytes_hit_set_archive == r.num_bytes_hit_set_archive &&
2131 l.num_flush == r.num_flush &&
2132 l.num_flush_kb == r.num_flush_kb &&
2133 l.num_evict == r.num_evict &&
2134 l.num_evict_kb == r.num_evict_kb &&
2135 l.num_promote == r.num_promote &&
2136 l.num_flush_mode_high == r.num_flush_mode_high &&
2137 l.num_flush_mode_low == r.num_flush_mode_low &&
2138 l.num_evict_mode_some == r.num_evict_mode_some &&
2139 l.num_evict_mode_full == r.num_evict_mode_full &&
2140 l.num_objects_pinned == r.num_objects_pinned &&
2141 l.num_legacy_snapsets == r.num_legacy_snapsets;
2142 }
2143
2144 // -- object_stat_collection_t --
2145
2146 void object_stat_collection_t::dump(Formatter *f) const
2147 {
2148 f->open_object_section("stat_sum");
2149 sum.dump(f);
2150 f->close_section();
2151 }
2152
2153 void object_stat_collection_t::encode(bufferlist& bl) const
2154 {
2155 ENCODE_START(2, 2, bl);
2156 ::encode(sum, bl);
2157 ::encode((__u32)0, bl);
2158 ENCODE_FINISH(bl);
2159 }
2160
2161 void object_stat_collection_t::decode(bufferlist::iterator& bl)
2162 {
2163 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
2164 ::decode(sum, bl);
2165 {
2166 map<string,object_stat_sum_t> cat_sum;
2167 ::decode(cat_sum, bl);
2168 }
2169 DECODE_FINISH(bl);
2170 }
2171
2172 void object_stat_collection_t::generate_test_instances(list<object_stat_collection_t*>& o)
2173 {
2174 object_stat_collection_t a;
2175 o.push_back(new object_stat_collection_t(a));
2176 list<object_stat_sum_t*> l;
2177 object_stat_sum_t::generate_test_instances(l);
2178 for (list<object_stat_sum_t*>::iterator p = l.begin(); p != l.end(); ++p) {
2179 a.add(**p);
2180 o.push_back(new object_stat_collection_t(a));
2181 }
2182 }
2183
2184
2185 // -- pg_stat_t --
2186
2187 bool pg_stat_t::is_acting_osd(int32_t osd, bool primary) const
2188 {
2189 if (primary && osd == acting_primary) {
2190 return true;
2191 } else if (!primary) {
2192 for(vector<int32_t>::const_iterator it = acting.begin();
2193 it != acting.end(); ++it)
2194 {
2195 if (*it == osd)
2196 return true;
2197 }
2198 }
2199 return false;
2200 }
2201
2202 void pg_stat_t::dump(Formatter *f) const
2203 {
2204 f->dump_stream("version") << version;
2205 f->dump_stream("reported_seq") << reported_seq;
2206 f->dump_stream("reported_epoch") << reported_epoch;
2207 f->dump_string("state", pg_state_string(state));
2208 f->dump_stream("last_fresh") << last_fresh;
2209 f->dump_stream("last_change") << last_change;
2210 f->dump_stream("last_active") << last_active;
2211 f->dump_stream("last_peered") << last_peered;
2212 f->dump_stream("last_clean") << last_clean;
2213 f->dump_stream("last_became_active") << last_became_active;
2214 f->dump_stream("last_became_peered") << last_became_peered;
2215 f->dump_stream("last_unstale") << last_unstale;
2216 f->dump_stream("last_undegraded") << last_undegraded;
2217 f->dump_stream("last_fullsized") << last_fullsized;
2218 f->dump_unsigned("mapping_epoch", mapping_epoch);
2219 f->dump_stream("log_start") << log_start;
2220 f->dump_stream("ondisk_log_start") << ondisk_log_start;
2221 f->dump_unsigned("created", created);
2222 f->dump_unsigned("last_epoch_clean", last_epoch_clean);
2223 f->dump_stream("parent") << parent;
2224 f->dump_unsigned("parent_split_bits", parent_split_bits);
2225 f->dump_stream("last_scrub") << last_scrub;
2226 f->dump_stream("last_scrub_stamp") << last_scrub_stamp;
2227 f->dump_stream("last_deep_scrub") << last_deep_scrub;
2228 f->dump_stream("last_deep_scrub_stamp") << last_deep_scrub_stamp;
2229 f->dump_stream("last_clean_scrub_stamp") << last_clean_scrub_stamp;
2230 f->dump_int("log_size", log_size);
2231 f->dump_int("ondisk_log_size", ondisk_log_size);
2232 f->dump_bool("stats_invalid", stats_invalid);
2233 f->dump_bool("dirty_stats_invalid", dirty_stats_invalid);
2234 f->dump_bool("omap_stats_invalid", omap_stats_invalid);
2235 f->dump_bool("hitset_stats_invalid", hitset_stats_invalid);
2236 f->dump_bool("hitset_bytes_stats_invalid", hitset_bytes_stats_invalid);
2237 f->dump_bool("pin_stats_invalid", pin_stats_invalid);
2238 stats.dump(f);
2239 f->open_array_section("up");
2240 for (vector<int32_t>::const_iterator p = up.begin(); p != up.end(); ++p)
2241 f->dump_int("osd", *p);
2242 f->close_section();
2243 f->open_array_section("acting");
2244 for (vector<int32_t>::const_iterator p = acting.begin(); p != acting.end(); ++p)
2245 f->dump_int("osd", *p);
2246 f->close_section();
2247 f->open_array_section("blocked_by");
2248 for (vector<int32_t>::const_iterator p = blocked_by.begin();
2249 p != blocked_by.end(); ++p)
2250 f->dump_int("osd", *p);
2251 f->close_section();
2252 f->dump_int("up_primary", up_primary);
2253 f->dump_int("acting_primary", acting_primary);
2254 }
2255
2256 void pg_stat_t::dump_brief(Formatter *f) const
2257 {
2258 f->dump_string("state", pg_state_string(state));
2259 f->open_array_section("up");
2260 for (vector<int32_t>::const_iterator p = up.begin(); p != up.end(); ++p)
2261 f->dump_int("osd", *p);
2262 f->close_section();
2263 f->open_array_section("acting");
2264 for (vector<int32_t>::const_iterator p = acting.begin(); p != acting.end(); ++p)
2265 f->dump_int("osd", *p);
2266 f->close_section();
2267 f->dump_int("up_primary", up_primary);
2268 f->dump_int("acting_primary", acting_primary);
2269 }
2270
2271 void pg_stat_t::encode(bufferlist &bl) const
2272 {
2273 ENCODE_START(22, 22, bl);
2274 ::encode(version, bl);
2275 ::encode(reported_seq, bl);
2276 ::encode(reported_epoch, bl);
2277 ::encode(state, bl);
2278 ::encode(log_start, bl);
2279 ::encode(ondisk_log_start, bl);
2280 ::encode(created, bl);
2281 ::encode(last_epoch_clean, bl);
2282 ::encode(parent, bl);
2283 ::encode(parent_split_bits, bl);
2284 ::encode(last_scrub, bl);
2285 ::encode(last_scrub_stamp, bl);
2286 ::encode(stats, bl);
2287 ::encode(log_size, bl);
2288 ::encode(ondisk_log_size, bl);
2289 ::encode(up, bl);
2290 ::encode(acting, bl);
2291 ::encode(last_fresh, bl);
2292 ::encode(last_change, bl);
2293 ::encode(last_active, bl);
2294 ::encode(last_clean, bl);
2295 ::encode(last_unstale, bl);
2296 ::encode(mapping_epoch, bl);
2297 ::encode(last_deep_scrub, bl);
2298 ::encode(last_deep_scrub_stamp, bl);
2299 ::encode(stats_invalid, bl);
2300 ::encode(last_clean_scrub_stamp, bl);
2301 ::encode(last_became_active, bl);
2302 ::encode(dirty_stats_invalid, bl);
2303 ::encode(up_primary, bl);
2304 ::encode(acting_primary, bl);
2305 ::encode(omap_stats_invalid, bl);
2306 ::encode(hitset_stats_invalid, bl);
2307 ::encode(blocked_by, bl);
2308 ::encode(last_undegraded, bl);
2309 ::encode(last_fullsized, bl);
2310 ::encode(hitset_bytes_stats_invalid, bl);
2311 ::encode(last_peered, bl);
2312 ::encode(last_became_peered, bl);
2313 ::encode(pin_stats_invalid, bl);
2314 ENCODE_FINISH(bl);
2315 }
2316
2317 void pg_stat_t::decode(bufferlist::iterator &bl)
2318 {
2319 bool tmp;
2320 DECODE_START(22, bl);
2321 ::decode(version, bl);
2322 ::decode(reported_seq, bl);
2323 ::decode(reported_epoch, bl);
2324 ::decode(state, bl);
2325 ::decode(log_start, bl);
2326 ::decode(ondisk_log_start, bl);
2327 ::decode(created, bl);
2328 ::decode(last_epoch_clean, bl);
2329 ::decode(parent, bl);
2330 ::decode(parent_split_bits, bl);
2331 ::decode(last_scrub, bl);
2332 ::decode(last_scrub_stamp, bl);
2333 ::decode(stats, bl);
2334 ::decode(log_size, bl);
2335 ::decode(ondisk_log_size, bl);
2336 ::decode(up, bl);
2337 ::decode(acting, bl);
2338 ::decode(last_fresh, bl);
2339 ::decode(last_change, bl);
2340 ::decode(last_active, bl);
2341 ::decode(last_clean, bl);
2342 ::decode(last_unstale, bl);
2343 ::decode(mapping_epoch, bl);
2344 ::decode(last_deep_scrub, bl);
2345 ::decode(last_deep_scrub_stamp, bl);
2346 ::decode(tmp, bl);
2347 stats_invalid = tmp;
2348 ::decode(last_clean_scrub_stamp, bl);
2349 ::decode(last_became_active, bl);
2350 ::decode(tmp, bl);
2351 dirty_stats_invalid = tmp;
2352 ::decode(up_primary, bl);
2353 ::decode(acting_primary, bl);
2354 ::decode(tmp, bl);
2355 omap_stats_invalid = tmp;
2356 ::decode(tmp, bl);
2357 hitset_stats_invalid = tmp;
2358 ::decode(blocked_by, bl);
2359 ::decode(last_undegraded, bl);
2360 ::decode(last_fullsized, bl);
2361 ::decode(tmp, bl);
2362 hitset_bytes_stats_invalid = tmp;
2363 ::decode(last_peered, bl);
2364 ::decode(last_became_peered, bl);
2365 ::decode(tmp, bl);
2366 pin_stats_invalid = tmp;
2367 DECODE_FINISH(bl);
2368 }
2369
2370 void pg_stat_t::generate_test_instances(list<pg_stat_t*>& o)
2371 {
2372 pg_stat_t a;
2373 o.push_back(new pg_stat_t(a));
2374
2375 a.version = eversion_t(1, 3);
2376 a.reported_epoch = 1;
2377 a.reported_seq = 2;
2378 a.state = 123;
2379 a.mapping_epoch = 998;
2380 a.last_fresh = utime_t(1002, 1);
2381 a.last_change = utime_t(1002, 2);
2382 a.last_active = utime_t(1002, 3);
2383 a.last_clean = utime_t(1002, 4);
2384 a.last_unstale = utime_t(1002, 5);
2385 a.last_undegraded = utime_t(1002, 7);
2386 a.last_fullsized = utime_t(1002, 8);
2387 a.log_start = eversion_t(1, 4);
2388 a.ondisk_log_start = eversion_t(1, 5);
2389 a.created = 6;
2390 a.last_epoch_clean = 7;
2391 a.parent = pg_t(1, 2, 3);
2392 a.parent_split_bits = 12;
2393 a.last_scrub = eversion_t(9, 10);
2394 a.last_scrub_stamp = utime_t(11, 12);
2395 a.last_deep_scrub = eversion_t(13, 14);
2396 a.last_deep_scrub_stamp = utime_t(15, 16);
2397 a.last_clean_scrub_stamp = utime_t(17, 18);
2398 list<object_stat_collection_t*> l;
2399 object_stat_collection_t::generate_test_instances(l);
2400 a.stats = *l.back();
2401 a.log_size = 99;
2402 a.ondisk_log_size = 88;
2403 a.up.push_back(123);
2404 a.up_primary = 123;
2405 a.acting.push_back(456);
2406 a.acting_primary = 456;
2407 o.push_back(new pg_stat_t(a));
2408
2409 a.up.push_back(124);
2410 a.up_primary = 124;
2411 a.acting.push_back(124);
2412 a.acting_primary = 124;
2413 a.blocked_by.push_back(155);
2414 a.blocked_by.push_back(156);
2415 o.push_back(new pg_stat_t(a));
2416 }
2417
2418 bool operator==(const pg_stat_t& l, const pg_stat_t& r)
2419 {
2420 return
2421 l.version == r.version &&
2422 l.reported_seq == r.reported_seq &&
2423 l.reported_epoch == r.reported_epoch &&
2424 l.state == r.state &&
2425 l.last_fresh == r.last_fresh &&
2426 l.last_change == r.last_change &&
2427 l.last_active == r.last_active &&
2428 l.last_peered == r.last_peered &&
2429 l.last_clean == r.last_clean &&
2430 l.last_unstale == r.last_unstale &&
2431 l.last_undegraded == r.last_undegraded &&
2432 l.last_fullsized == r.last_fullsized &&
2433 l.log_start == r.log_start &&
2434 l.ondisk_log_start == r.ondisk_log_start &&
2435 l.created == r.created &&
2436 l.last_epoch_clean == r.last_epoch_clean &&
2437 l.parent == r.parent &&
2438 l.parent_split_bits == r.parent_split_bits &&
2439 l.last_scrub == r.last_scrub &&
2440 l.last_deep_scrub == r.last_deep_scrub &&
2441 l.last_scrub_stamp == r.last_scrub_stamp &&
2442 l.last_deep_scrub_stamp == r.last_deep_scrub_stamp &&
2443 l.last_clean_scrub_stamp == r.last_clean_scrub_stamp &&
2444 l.stats == r.stats &&
2445 l.stats_invalid == r.stats_invalid &&
2446 l.log_size == r.log_size &&
2447 l.ondisk_log_size == r.ondisk_log_size &&
2448 l.up == r.up &&
2449 l.acting == r.acting &&
2450 l.mapping_epoch == r.mapping_epoch &&
2451 l.blocked_by == r.blocked_by &&
2452 l.last_became_active == r.last_became_active &&
2453 l.last_became_peered == r.last_became_peered &&
2454 l.dirty_stats_invalid == r.dirty_stats_invalid &&
2455 l.omap_stats_invalid == r.omap_stats_invalid &&
2456 l.hitset_stats_invalid == r.hitset_stats_invalid &&
2457 l.hitset_bytes_stats_invalid == r.hitset_bytes_stats_invalid &&
2458 l.up_primary == r.up_primary &&
2459 l.acting_primary == r.acting_primary &&
2460 l.pin_stats_invalid == r.pin_stats_invalid;
2461 }
2462
2463 // -- pool_stat_t --
2464
2465 void pool_stat_t::dump(Formatter *f) const
2466 {
2467 stats.dump(f);
2468 f->dump_int("log_size", log_size);
2469 f->dump_int("ondisk_log_size", ondisk_log_size);
2470 f->dump_int("up", up);
2471 f->dump_int("acting", acting);
2472 }
2473
2474 void pool_stat_t::encode(bufferlist &bl, uint64_t features) const
2475 {
2476 if ((features & CEPH_FEATURE_OSDENC) == 0) {
2477 __u8 v = 4;
2478 ::encode(v, bl);
2479 ::encode(stats, bl);
2480 ::encode(log_size, bl);
2481 ::encode(ondisk_log_size, bl);
2482 return;
2483 }
2484
2485 ENCODE_START(6, 5, bl);
2486 ::encode(stats, bl);
2487 ::encode(log_size, bl);
2488 ::encode(ondisk_log_size, bl);
2489 ::encode(up, bl);
2490 ::encode(acting, bl);
2491 ENCODE_FINISH(bl);
2492 }
2493
2494 void pool_stat_t::decode(bufferlist::iterator &bl)
2495 {
2496 DECODE_START_LEGACY_COMPAT_LEN(6, 5, 5, bl);
2497 if (struct_v >= 4) {
2498 ::decode(stats, bl);
2499 ::decode(log_size, bl);
2500 ::decode(ondisk_log_size, bl);
2501 if (struct_v >= 6) {
2502 ::decode(up, bl);
2503 ::decode(acting, bl);
2504 } else {
2505 up = 0;
2506 acting = 0;
2507 }
2508 } else {
2509 ::decode(stats.sum.num_bytes, bl);
2510 uint64_t num_kb;
2511 ::decode(num_kb, bl);
2512 ::decode(stats.sum.num_objects, bl);
2513 ::decode(stats.sum.num_object_clones, bl);
2514 ::decode(stats.sum.num_object_copies, bl);
2515 ::decode(stats.sum.num_objects_missing_on_primary, bl);
2516 ::decode(stats.sum.num_objects_degraded, bl);
2517 ::decode(log_size, bl);
2518 ::decode(ondisk_log_size, bl);
2519 if (struct_v >= 2) {
2520 ::decode(stats.sum.num_rd, bl);
2521 ::decode(stats.sum.num_rd_kb, bl);
2522 ::decode(stats.sum.num_wr, bl);
2523 ::decode(stats.sum.num_wr_kb, bl);
2524 }
2525 if (struct_v >= 3) {
2526 ::decode(stats.sum.num_objects_unfound, bl);
2527 }
2528 }
2529 DECODE_FINISH(bl);
2530 }
2531
2532 void pool_stat_t::generate_test_instances(list<pool_stat_t*>& o)
2533 {
2534 pool_stat_t a;
2535 o.push_back(new pool_stat_t(a));
2536
2537 list<object_stat_collection_t*> l;
2538 object_stat_collection_t::generate_test_instances(l);
2539 a.stats = *l.back();
2540 a.log_size = 123;
2541 a.ondisk_log_size = 456;
2542 a.acting = 3;
2543 a.up = 4;
2544 o.push_back(new pool_stat_t(a));
2545 }
2546
2547
2548 // -- pg_history_t --
2549
2550 void pg_history_t::encode(bufferlist &bl) const
2551 {
2552 ENCODE_START(8, 4, bl);
2553 ::encode(epoch_created, bl);
2554 ::encode(last_epoch_started, bl);
2555 ::encode(last_epoch_clean, bl);
2556 ::encode(last_epoch_split, bl);
2557 ::encode(same_interval_since, bl);
2558 ::encode(same_up_since, bl);
2559 ::encode(same_primary_since, bl);
2560 ::encode(last_scrub, bl);
2561 ::encode(last_scrub_stamp, bl);
2562 ::encode(last_deep_scrub, bl);
2563 ::encode(last_deep_scrub_stamp, bl);
2564 ::encode(last_clean_scrub_stamp, bl);
2565 ::encode(last_epoch_marked_full, bl);
2566 ::encode(last_interval_started, bl);
2567 ::encode(last_interval_clean, bl);
2568 ENCODE_FINISH(bl);
2569 }
2570
2571 void pg_history_t::decode(bufferlist::iterator &bl)
2572 {
2573 DECODE_START_LEGACY_COMPAT_LEN(8, 4, 4, bl);
2574 ::decode(epoch_created, bl);
2575 ::decode(last_epoch_started, bl);
2576 if (struct_v >= 3)
2577 ::decode(last_epoch_clean, bl);
2578 else
2579 last_epoch_clean = last_epoch_started; // careful, it's a lie!
2580 ::decode(last_epoch_split, bl);
2581 ::decode(same_interval_since, bl);
2582 ::decode(same_up_since, bl);
2583 ::decode(same_primary_since, bl);
2584 if (struct_v >= 2) {
2585 ::decode(last_scrub, bl);
2586 ::decode(last_scrub_stamp, bl);
2587 }
2588 if (struct_v >= 5) {
2589 ::decode(last_deep_scrub, bl);
2590 ::decode(last_deep_scrub_stamp, bl);
2591 }
2592 if (struct_v >= 6) {
2593 ::decode(last_clean_scrub_stamp, bl);
2594 }
2595 if (struct_v >= 7) {
2596 ::decode(last_epoch_marked_full, bl);
2597 }
2598 if (struct_v >= 8) {
2599 ::decode(last_interval_started, bl);
2600 ::decode(last_interval_clean, bl);
2601 } else {
2602 if (last_epoch_started >= same_interval_since) {
2603 last_interval_started = same_interval_since;
2604 } else {
2605 last_interval_started = last_epoch_started; // best guess
2606 }
2607 if (last_epoch_clean >= same_interval_since) {
2608 last_interval_clean = same_interval_since;
2609 } else {
2610 last_interval_clean = last_epoch_clean; // best guess
2611 }
2612 }
2613 DECODE_FINISH(bl);
2614 }
2615
2616 void pg_history_t::dump(Formatter *f) const
2617 {
2618 f->dump_int("epoch_created", epoch_created);
2619 f->dump_int("last_epoch_started", last_epoch_started);
2620 f->dump_int("last_interval_started", last_interval_started);
2621 f->dump_int("last_epoch_clean", last_epoch_clean);
2622 f->dump_int("last_interval_clean", last_interval_clean);
2623 f->dump_int("last_epoch_split", last_epoch_split);
2624 f->dump_int("last_epoch_marked_full", last_epoch_marked_full);
2625 f->dump_int("same_up_since", same_up_since);
2626 f->dump_int("same_interval_since", same_interval_since);
2627 f->dump_int("same_primary_since", same_primary_since);
2628 f->dump_stream("last_scrub") << last_scrub;
2629 f->dump_stream("last_scrub_stamp") << last_scrub_stamp;
2630 f->dump_stream("last_deep_scrub") << last_deep_scrub;
2631 f->dump_stream("last_deep_scrub_stamp") << last_deep_scrub_stamp;
2632 f->dump_stream("last_clean_scrub_stamp") << last_clean_scrub_stamp;
2633 }
2634
2635 void pg_history_t::generate_test_instances(list<pg_history_t*>& o)
2636 {
2637 o.push_back(new pg_history_t);
2638 o.push_back(new pg_history_t);
2639 o.back()->epoch_created = 1;
2640 o.back()->last_epoch_started = 2;
2641 o.back()->last_interval_started = 2;
2642 o.back()->last_epoch_clean = 3;
2643 o.back()->last_interval_clean = 2;
2644 o.back()->last_epoch_split = 4;
2645 o.back()->same_up_since = 5;
2646 o.back()->same_interval_since = 6;
2647 o.back()->same_primary_since = 7;
2648 o.back()->last_scrub = eversion_t(8, 9);
2649 o.back()->last_scrub_stamp = utime_t(10, 11);
2650 o.back()->last_deep_scrub = eversion_t(12, 13);
2651 o.back()->last_deep_scrub_stamp = utime_t(14, 15);
2652 o.back()->last_clean_scrub_stamp = utime_t(16, 17);
2653 o.back()->last_epoch_marked_full = 18;
2654 }
2655
2656
2657 // -- pg_info_t --
2658
2659 void pg_info_t::encode(bufferlist &bl) const
2660 {
2661 ENCODE_START(32, 26, bl);
2662 ::encode(pgid.pgid, bl);
2663 ::encode(last_update, bl);
2664 ::encode(last_complete, bl);
2665 ::encode(log_tail, bl);
2666 if (last_backfill_bitwise && !last_backfill.is_max()) {
2667 ::encode(hobject_t(), bl);
2668 } else {
2669 ::encode(last_backfill, bl);
2670 }
2671 ::encode(stats, bl);
2672 history.encode(bl);
2673 ::encode(purged_snaps, bl);
2674 ::encode(last_epoch_started, bl);
2675 ::encode(last_user_version, bl);
2676 ::encode(hit_set, bl);
2677 ::encode(pgid.shard, bl);
2678 ::encode(last_backfill, bl);
2679 ::encode(last_backfill_bitwise, bl);
2680 ::encode(last_interval_started, bl);
2681 ENCODE_FINISH(bl);
2682 }
2683
2684 void pg_info_t::decode(bufferlist::iterator &bl)
2685 {
2686 DECODE_START(32, bl);
2687 ::decode(pgid.pgid, bl);
2688 ::decode(last_update, bl);
2689 ::decode(last_complete, bl);
2690 ::decode(log_tail, bl);
2691 {
2692 hobject_t old_last_backfill;
2693 ::decode(old_last_backfill, bl);
2694 }
2695 ::decode(stats, bl);
2696 history.decode(bl);
2697 ::decode(purged_snaps, bl);
2698 ::decode(last_epoch_started, bl);
2699 ::decode(last_user_version, bl);
2700 ::decode(hit_set, bl);
2701 ::decode(pgid.shard, bl);
2702 ::decode(last_backfill, bl);
2703 ::decode(last_backfill_bitwise, bl);
2704 if (struct_v >= 32) {
2705 ::decode(last_interval_started, bl);
2706 } else {
2707 last_interval_started = last_epoch_started;
2708 }
2709 DECODE_FINISH(bl);
2710 }
2711
2712 // -- pg_info_t --
2713
2714 void pg_info_t::dump(Formatter *f) const
2715 {
2716 f->dump_stream("pgid") << pgid;
2717 f->dump_stream("last_update") << last_update;
2718 f->dump_stream("last_complete") << last_complete;
2719 f->dump_stream("log_tail") << log_tail;
2720 f->dump_int("last_user_version", last_user_version);
2721 f->dump_stream("last_backfill") << last_backfill;
2722 f->dump_int("last_backfill_bitwise", (int)last_backfill_bitwise);
2723 f->open_array_section("purged_snaps");
2724 for (interval_set<snapid_t>::const_iterator i=purged_snaps.begin();
2725 i != purged_snaps.end();
2726 ++i) {
2727 f->open_object_section("purged_snap_interval");
2728 f->dump_stream("start") << i.get_start();
2729 f->dump_stream("length") << i.get_len();
2730 f->close_section();
2731 }
2732 f->close_section();
2733 f->open_object_section("history");
2734 history.dump(f);
2735 f->close_section();
2736 f->open_object_section("stats");
2737 stats.dump(f);
2738 f->close_section();
2739
2740 f->dump_int("empty", is_empty());
2741 f->dump_int("dne", dne());
2742 f->dump_int("incomplete", is_incomplete());
2743 f->dump_int("last_epoch_started", last_epoch_started);
2744
2745 f->open_object_section("hit_set_history");
2746 hit_set.dump(f);
2747 f->close_section();
2748 }
2749
2750 void pg_info_t::generate_test_instances(list<pg_info_t*>& o)
2751 {
2752 o.push_back(new pg_info_t);
2753 o.push_back(new pg_info_t);
2754 list<pg_history_t*> h;
2755 pg_history_t::generate_test_instances(h);
2756 o.back()->history = *h.back();
2757 o.back()->pgid = spg_t(pg_t(1, 2, -1), shard_id_t::NO_SHARD);
2758 o.back()->last_update = eversion_t(3, 4);
2759 o.back()->last_complete = eversion_t(5, 6);
2760 o.back()->last_user_version = 2;
2761 o.back()->log_tail = eversion_t(7, 8);
2762 o.back()->last_backfill = hobject_t(object_t("objname"), "key", 123, 456, -1, "");
2763 o.back()->last_backfill_bitwise = true;
2764 {
2765 list<pg_stat_t*> s;
2766 pg_stat_t::generate_test_instances(s);
2767 o.back()->stats = *s.back();
2768 }
2769 {
2770 list<pg_hit_set_history_t*> s;
2771 pg_hit_set_history_t::generate_test_instances(s);
2772 o.back()->hit_set = *s.back();
2773 }
2774 }
2775
2776 // -- pg_notify_t --
2777 void pg_notify_t::encode(bufferlist &bl) const
2778 {
2779 ENCODE_START(2, 2, bl);
2780 ::encode(query_epoch, bl);
2781 ::encode(epoch_sent, bl);
2782 ::encode(info, bl);
2783 ::encode(to, bl);
2784 ::encode(from, bl);
2785 ENCODE_FINISH(bl);
2786 }
2787
2788 void pg_notify_t::decode(bufferlist::iterator &bl)
2789 {
2790 DECODE_START(2, bl);
2791 ::decode(query_epoch, bl);
2792 ::decode(epoch_sent, bl);
2793 ::decode(info, bl);
2794 ::decode(to, bl);
2795 ::decode(from, bl);
2796 DECODE_FINISH(bl);
2797 }
2798
2799 void pg_notify_t::dump(Formatter *f) const
2800 {
2801 f->dump_int("from", from);
2802 f->dump_int("to", to);
2803 f->dump_unsigned("query_epoch", query_epoch);
2804 f->dump_unsigned("epoch_sent", epoch_sent);
2805 {
2806 f->open_object_section("info");
2807 info.dump(f);
2808 f->close_section();
2809 }
2810 }
2811
2812 void pg_notify_t::generate_test_instances(list<pg_notify_t*>& o)
2813 {
2814 o.push_back(new pg_notify_t(shard_id_t(3), shard_id_t::NO_SHARD, 1, 1, pg_info_t()));
2815 o.push_back(new pg_notify_t(shard_id_t(0), shard_id_t(0), 3, 10, pg_info_t()));
2816 }
2817
2818 ostream &operator<<(ostream &lhs, const pg_notify_t &notify)
2819 {
2820 lhs << "(query:" << notify.query_epoch
2821 << " sent:" << notify.epoch_sent
2822 << " " << notify.info;
2823 if (notify.from != shard_id_t::NO_SHARD ||
2824 notify.to != shard_id_t::NO_SHARD)
2825 lhs << " " << (unsigned)notify.from
2826 << "->" << (unsigned)notify.to;
2827 return lhs << ")";
2828 }
2829
2830 // -- pg_interval_t --
2831
2832 void PastIntervals::pg_interval_t::encode(bufferlist& bl) const
2833 {
2834 ENCODE_START(4, 2, bl);
2835 ::encode(first, bl);
2836 ::encode(last, bl);
2837 ::encode(up, bl);
2838 ::encode(acting, bl);
2839 ::encode(maybe_went_rw, bl);
2840 ::encode(primary, bl);
2841 ::encode(up_primary, bl);
2842 ENCODE_FINISH(bl);
2843 }
2844
2845 void PastIntervals::pg_interval_t::decode(bufferlist::iterator& bl)
2846 {
2847 DECODE_START_LEGACY_COMPAT_LEN(4, 2, 2, bl);
2848 ::decode(first, bl);
2849 ::decode(last, bl);
2850 ::decode(up, bl);
2851 ::decode(acting, bl);
2852 ::decode(maybe_went_rw, bl);
2853 if (struct_v >= 3) {
2854 ::decode(primary, bl);
2855 } else {
2856 if (acting.size())
2857 primary = acting[0];
2858 }
2859 if (struct_v >= 4) {
2860 ::decode(up_primary, bl);
2861 } else {
2862 if (up.size())
2863 up_primary = up[0];
2864 }
2865 DECODE_FINISH(bl);
2866 }
2867
2868 void PastIntervals::pg_interval_t::dump(Formatter *f) const
2869 {
2870 f->dump_unsigned("first", first);
2871 f->dump_unsigned("last", last);
2872 f->dump_int("maybe_went_rw", maybe_went_rw ? 1 : 0);
2873 f->open_array_section("up");
2874 for (vector<int>::const_iterator p = up.begin(); p != up.end(); ++p)
2875 f->dump_int("osd", *p);
2876 f->close_section();
2877 f->open_array_section("acting");
2878 for (vector<int>::const_iterator p = acting.begin(); p != acting.end(); ++p)
2879 f->dump_int("osd", *p);
2880 f->close_section();
2881 f->dump_int("primary", primary);
2882 f->dump_int("up_primary", up_primary);
2883 }
2884
2885 void PastIntervals::pg_interval_t::generate_test_instances(list<pg_interval_t*>& o)
2886 {
2887 o.push_back(new pg_interval_t);
2888 o.push_back(new pg_interval_t);
2889 o.back()->up.push_back(1);
2890 o.back()->acting.push_back(2);
2891 o.back()->acting.push_back(3);
2892 o.back()->first = 4;
2893 o.back()->last = 5;
2894 o.back()->maybe_went_rw = true;
2895 }
2896
2897 WRITE_CLASS_ENCODER(PastIntervals::pg_interval_t)
2898
2899 class pi_simple_rep : public PastIntervals::interval_rep {
2900 map<epoch_t, PastIntervals::pg_interval_t> interval_map;
2901
2902 pi_simple_rep(
2903 bool ec_pool,
2904 std::list<PastIntervals::pg_interval_t> &&intervals) {
2905 for (auto &&i: intervals)
2906 add_interval(ec_pool, i);
2907 }
2908
2909 public:
2910 pi_simple_rep() = default;
2911 pi_simple_rep(const pi_simple_rep &) = default;
2912 pi_simple_rep(pi_simple_rep &&) = default;
2913 pi_simple_rep &operator=(pi_simple_rep &&) = default;
2914 pi_simple_rep &operator=(const pi_simple_rep &) = default;
2915
2916 size_t size() const override { return interval_map.size(); }
2917 bool empty() const override { return interval_map.empty(); }
2918 void clear() override { interval_map.clear(); }
2919 pair<epoch_t, epoch_t> get_bounds() const override {
2920 auto iter = interval_map.begin();
2921 if (iter != interval_map.end()) {
2922 auto riter = interval_map.rbegin();
2923 return make_pair(
2924 iter->second.first,
2925 riter->second.last + 1);
2926 } else {
2927 return make_pair(0, 0);
2928 }
2929 }
2930 set<pg_shard_t> get_all_participants(
2931 bool ec_pool) const override {
2932 set<pg_shard_t> all_participants;
2933
2934 // We need to decide who might have unfound objects that we need
2935 auto p = interval_map.rbegin();
2936 auto end = interval_map.rend();
2937 for (; p != end; ++p) {
2938 const PastIntervals::pg_interval_t &interval(p->second);
2939 // If nothing changed, we don't care about this interval.
2940 if (!interval.maybe_went_rw)
2941 continue;
2942
2943 int i = 0;
2944 std::vector<int>::const_iterator a = interval.acting.begin();
2945 std::vector<int>::const_iterator a_end = interval.acting.end();
2946 for (; a != a_end; ++a, ++i) {
2947 pg_shard_t shard(*a, ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD);
2948 if (*a != CRUSH_ITEM_NONE)
2949 all_participants.insert(shard);
2950 }
2951 }
2952 return all_participants;
2953 }
2954 void add_interval(
2955 bool ec_pool,
2956 const PastIntervals::pg_interval_t &interval) override {
2957 interval_map[interval.first] = interval;
2958 }
2959 unique_ptr<PastIntervals::interval_rep> clone() const override {
2960 return unique_ptr<PastIntervals::interval_rep>(new pi_simple_rep(*this));
2961 }
2962 ostream &print(ostream &out) const override {
2963 return out << interval_map;
2964 }
2965 void encode(bufferlist &bl) const override {
2966 ::encode(interval_map, bl);
2967 }
2968 void decode(bufferlist::iterator &bl) override {
2969 ::decode(interval_map, bl);
2970 }
2971 void dump(Formatter *f) const override {
2972 f->open_array_section("PastIntervals::compat_rep");
2973 for (auto &&i: interval_map) {
2974 f->open_object_section("pg_interval_t");
2975 f->dump_int("epoch", i.first);
2976 f->open_object_section("interval");
2977 i.second.dump(f);
2978 f->close_section();
2979 f->close_section();
2980 }
2981 f->close_section();
2982 }
2983 bool is_classic() const override {
2984 return true;
2985 }
2986 static void generate_test_instances(list<pi_simple_rep*> &o) {
2987 using ival = PastIntervals::pg_interval_t;
2988 using ivallst = std::list<ival>;
2989 o.push_back(
2990 new pi_simple_rep(
2991 true, ivallst
2992 { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
2993 , ival{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1}
2994 , ival{{ 2}, { 2}, 31, 35, false, 2, 2}
2995 , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
2996 }));
2997 o.push_back(
2998 new pi_simple_rep(
2999 false, ivallst
3000 { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
3001 , ival{{ 1, 2}, { 1, 2}, 20, 30, true, 1, 1}
3002 , ival{{ 2}, { 2}, 31, 35, false, 2, 2}
3003 , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
3004 }));
3005 o.push_back(
3006 new pi_simple_rep(
3007 true, ivallst
3008 { ival{{2, 1, 0}, {2, 1, 0}, 10, 20, true, 1, 1}
3009 , ival{{ 0, 2}, { 0, 2}, 21, 30, true, 0, 0}
3010 , ival{{ 0, 2}, {2, 0}, 31, 35, true, 2, 2}
3011 , ival{{ 0, 2}, { 0, 2}, 36, 50, true, 0, 0}
3012 }));
3013 return;
3014 }
3015 void iterate_mayberw_back_to(
3016 bool ec_pool,
3017 epoch_t les,
3018 std::function<void(epoch_t, const set<pg_shard_t> &)> &&f) const override {
3019 for (auto i = interval_map.rbegin(); i != interval_map.rend(); ++i) {
3020 if (!i->second.maybe_went_rw)
3021 continue;
3022 if (i->second.last < les)
3023 break;
3024 set<pg_shard_t> actingset;
3025 for (unsigned j = 0; j < i->second.acting.size(); ++j) {
3026 if (i->second.acting[j] == CRUSH_ITEM_NONE)
3027 continue;
3028 actingset.insert(
3029 pg_shard_t(
3030 i->second.acting[j],
3031 ec_pool ? shard_id_t(j) : shard_id_t::NO_SHARD));
3032 }
3033 f(i->second.first, actingset);
3034 }
3035 }
3036
3037 bool has_full_intervals() const override { return true; }
3038 void iterate_all_intervals(
3039 std::function<void(const PastIntervals::pg_interval_t &)> &&f
3040 ) const override {
3041 for (auto &&i: interval_map) {
3042 f(i.second);
3043 }
3044 }
3045 virtual ~pi_simple_rep() override {}
3046 };
3047
/**
 * pi_compact_rep
 *
 * PastIntervals only needs to be able to answer two questions:
 * 1) Where should the primary look for unfound objects?
 * 2) List a set of subsets of the OSDs such that contacting at least
 *    one from each subset guarantees we speak to at least one witness
 *    of any completed write.
 *
 * Crucially, 2) does not require keeping *all* past intervals.  Certainly,
 * we don't need to keep any where maybe_went_rw would be false.  We also
 * needn't keep two intervals where the actingset in one is a subset
 * of the other (only need to keep the smaller of the two sets).  In order
 * to accurately trim the set of intervals as last_epoch_started changes
 * without rebuilding the set from scratch, we'll retain the larger set
 * if it is in an older interval.
 */
3065 struct compact_interval_t {
3066 epoch_t first;
3067 epoch_t last;
3068 set<pg_shard_t> acting;
3069 bool supersedes(const compact_interval_t &other) {
3070 for (auto &&i: acting) {
3071 if (!other.acting.count(i))
3072 return false;
3073 }
3074 return true;
3075 }
3076 void dump(Formatter *f) const {
3077 f->open_object_section("compact_interval_t");
3078 f->dump_stream("first") << first;
3079 f->dump_stream("last") << last;
3080 f->dump_stream("acting") << acting;
3081 f->close_section();
3082 }
3083 void encode(bufferlist &bl) const {
3084 ENCODE_START(1, 1, bl);
3085 ::encode(first, bl);
3086 ::encode(last, bl);
3087 ::encode(acting, bl);
3088 ENCODE_FINISH(bl);
3089 }
3090 void decode(bufferlist::iterator &bl) {
3091 DECODE_START(1, bl);
3092 ::decode(first, bl);
3093 ::decode(last, bl);
3094 ::decode(acting, bl);
3095 DECODE_FINISH(bl);
3096 }
3097 static void generate_test_instances(list<compact_interval_t*> & o) {
3098 /* Not going to be used, we'll generate pi_compact_rep directly */
3099 }
3100 };
3101 ostream &operator<<(ostream &o, const compact_interval_t &rhs)
3102 {
3103 return o << "([" << rhs.first << "," << rhs.last
3104 << "] acting " << rhs.acting << ")";
3105 }
3106 WRITE_CLASS_ENCODER(compact_interval_t)
3107
3108 class pi_compact_rep : public PastIntervals::interval_rep {
3109 epoch_t first = 0;
3110 epoch_t last = 0; // inclusive
3111 set<pg_shard_t> all_participants;
3112 list<compact_interval_t> intervals;
3113 pi_compact_rep(
3114 bool ec_pool,
3115 std::list<PastIntervals::pg_interval_t> &&intervals) {
3116 for (auto &&i: intervals)
3117 add_interval(ec_pool, i);
3118 }
3119 public:
3120 pi_compact_rep() = default;
3121 pi_compact_rep(const pi_compact_rep &) = default;
3122 pi_compact_rep(pi_compact_rep &&) = default;
3123 pi_compact_rep &operator=(const pi_compact_rep &) = default;
3124 pi_compact_rep &operator=(pi_compact_rep &&) = default;
3125
3126 size_t size() const override { return intervals.size(); }
3127 bool empty() const override {
3128 return first > last || (first == 0 && last == 0);
3129 }
3130 void clear() override {
3131 *this = pi_compact_rep();
3132 }
3133 pair<epoch_t, epoch_t> get_bounds() const override {
3134 return make_pair(first, last + 1);
3135 }
3136 set<pg_shard_t> get_all_participants(
3137 bool ec_pool) const override {
3138 return all_participants;
3139 }
3140 void add_interval(
3141 bool ec_pool, const PastIntervals::pg_interval_t &interval) override {
3142 if (first == 0)
3143 first = interval.first;
3144 assert(interval.last > last);
3145 last = interval.last;
3146 set<pg_shard_t> acting;
3147 for (unsigned i = 0; i < interval.acting.size(); ++i) {
3148 if (interval.acting[i] == CRUSH_ITEM_NONE)
3149 continue;
3150 acting.insert(
3151 pg_shard_t(
3152 interval.acting[i],
3153 ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
3154 }
3155 all_participants.insert(acting.begin(), acting.end());
3156 if (!interval.maybe_went_rw)
3157 return;
3158 intervals.push_back(
3159 compact_interval_t{interval.first, interval.last, acting});
3160 auto plast = intervals.end();
3161 --plast;
3162 for (auto cur = intervals.begin(); cur != plast; ) {
3163 if (plast->supersedes(*cur)) {
3164 intervals.erase(cur++);
3165 } else {
3166 ++cur;
3167 }
3168 }
3169 }
3170 unique_ptr<PastIntervals::interval_rep> clone() const override {
3171 return unique_ptr<PastIntervals::interval_rep>(new pi_compact_rep(*this));
3172 }
3173 ostream &print(ostream &out) const override {
3174 return out << "([" << first << "," << last
3175 << "] intervals=" << intervals << ")";
3176 }
3177 void encode(bufferlist &bl) const override {
3178 ENCODE_START(1, 1, bl);
3179 ::encode(first, bl);
3180 ::encode(last, bl);
3181 ::encode(all_participants, bl);
3182 ::encode(intervals, bl);
3183 ENCODE_FINISH(bl);
3184 }
3185 void decode(bufferlist::iterator &bl) override {
3186 DECODE_START(1, bl);
3187 ::decode(first, bl);
3188 ::decode(last, bl);
3189 ::decode(all_participants, bl);
3190 ::decode(intervals, bl);
3191 DECODE_FINISH(bl);
3192 }
3193 void dump(Formatter *f) const override {
3194 f->open_object_section("PastIntervals::compact_rep");
3195 f->dump_stream("first") << first;
3196 f->dump_stream("last") << last;
3197 f->open_array_section("all_participants");
3198 for (auto& i : all_participants) {
3199 f->dump_object("pg_shard", i);
3200 }
3201 f->close_section();
3202 f->open_array_section("intervals");
3203 for (auto &&i: intervals) {
3204 i.dump(f);
3205 }
3206 f->close_section();
3207 f->close_section();
3208 }
3209 bool is_classic() const override {
3210 return false;
3211 }
3212 static void generate_test_instances(list<pi_compact_rep*> &o) {
3213 using ival = PastIntervals::pg_interval_t;
3214 using ivallst = std::list<ival>;
3215 o.push_back(
3216 new pi_compact_rep(
3217 true, ivallst
3218 { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
3219 , ival{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1}
3220 , ival{{ 2}, { 2}, 31, 35, false, 2, 2}
3221 , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
3222 }));
3223 o.push_back(
3224 new pi_compact_rep(
3225 false, ivallst
3226 { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
3227 , ival{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1}
3228 , ival{{ 2}, { 2}, 31, 35, false, 2, 2}
3229 , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
3230 }));
3231 o.push_back(
3232 new pi_compact_rep(
3233 true, ivallst
3234 { ival{{2, 1, 0}, {2, 1, 0}, 10, 20, true, 1, 1}
3235 , ival{{ 0, 2}, { 0, 2}, 21, 30, true, 0, 0}
3236 , ival{{ 0, 2}, {2, 0}, 31, 35, true, 2, 2}
3237 , ival{{ 0, 2}, { 0, 2}, 36, 50, true, 0, 0}
3238 }));
3239 }
3240 void iterate_mayberw_back_to(
3241 bool ec_pool,
3242 epoch_t les,
3243 std::function<void(epoch_t, const set<pg_shard_t> &)> &&f) const override {
3244 for (auto i = intervals.rbegin(); i != intervals.rend(); ++i) {
3245 if (i->last < les)
3246 break;
3247 f(i->first, i->acting);
3248 }
3249 }
3250 virtual ~pi_compact_rep() override {}
3251 };
3252 WRITE_CLASS_ENCODER(pi_compact_rep)
3253
3254 PastIntervals::PastIntervals(const PastIntervals &rhs)
3255 : past_intervals(rhs.past_intervals ?
3256 rhs.past_intervals->clone() :
3257 nullptr) {}
3258
3259 PastIntervals &PastIntervals::operator=(const PastIntervals &rhs)
3260 {
3261 PastIntervals other(rhs);
3262 ::swap(other, *this);
3263 return *this;
3264 }
3265
3266 ostream& operator<<(ostream& out, const PastIntervals &i)
3267 {
3268 if (i.past_intervals) {
3269 return i.past_intervals->print(out);
3270 } else {
3271 return out << "(empty)";
3272 }
3273 }
3274
3275 ostream& operator<<(ostream& out, const PastIntervals::PriorSet &i)
3276 {
3277 return out << "PriorSet("
3278 << "ec_pool: " << i.ec_pool
3279 << ", probe: " << i.probe
3280 << ", down: " << i.down
3281 << ", blocked_by: " << i.blocked_by
3282 << ", pg_down: " << i.pg_down
3283 << ")";
3284 }
3285
3286 void PastIntervals::decode(bufferlist::iterator &bl)
3287 {
3288 DECODE_START(1, bl);
3289 __u8 type = 0;
3290 ::decode(type, bl);
3291 switch (type) {
3292 case 0:
3293 break;
3294 case 1:
3295 past_intervals.reset(new pi_simple_rep);
3296 past_intervals->decode(bl);
3297 break;
3298 case 2:
3299 past_intervals.reset(new pi_compact_rep);
3300 past_intervals->decode(bl);
3301 break;
3302 }
3303 DECODE_FINISH(bl);
3304 }
3305
3306 void PastIntervals::decode_classic(bufferlist::iterator &bl)
3307 {
3308 past_intervals.reset(new pi_simple_rep);
3309 past_intervals->decode(bl);
3310 }
3311
3312 void PastIntervals::generate_test_instances(list<PastIntervals*> &o)
3313 {
3314 {
3315 list<pi_simple_rep *> simple;
3316 pi_simple_rep::generate_test_instances(simple);
3317 for (auto &&i: simple) {
3318 // takes ownership of contents
3319 o.push_back(new PastIntervals(i));
3320 }
3321 }
3322 {
3323 list<pi_compact_rep *> compact;
3324 pi_compact_rep::generate_test_instances(compact);
3325 for (auto &&i: compact) {
3326 // takes ownership of contents
3327 o.push_back(new PastIntervals(i));
3328 }
3329 }
3330 return;
3331 }
3332
3333 void PastIntervals::update_type(bool ec_pool, bool compact)
3334 {
3335 if (!compact) {
3336 if (!past_intervals) {
3337 past_intervals.reset(new pi_simple_rep);
3338 } else {
3339 // we never convert from compact back to classic
3340 assert(is_classic());
3341 }
3342 } else {
3343 if (!past_intervals) {
3344 past_intervals.reset(new pi_compact_rep);
3345 } else if (is_classic()) {
3346 auto old = std::move(past_intervals);
3347 past_intervals.reset(new pi_compact_rep);
3348 assert(old->has_full_intervals());
3349 old->iterate_all_intervals([&](const pg_interval_t &i) {
3350 past_intervals->add_interval(ec_pool, i);
3351 });
3352 }
3353 }
3354 }
3355
3356 void PastIntervals::update_type_from_map(bool ec_pool, const OSDMap &osdmap)
3357 {
3358 update_type(ec_pool, osdmap.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS));
3359 }
3360
3361 bool PastIntervals::is_new_interval(
3362 int old_acting_primary,
3363 int new_acting_primary,
3364 const vector<int> &old_acting,
3365 const vector<int> &new_acting,
3366 int old_up_primary,
3367 int new_up_primary,
3368 const vector<int> &old_up,
3369 const vector<int> &new_up,
3370 int old_size,
3371 int new_size,
3372 int old_min_size,
3373 int new_min_size,
3374 unsigned old_pg_num,
3375 unsigned new_pg_num,
3376 bool old_sort_bitwise,
3377 bool new_sort_bitwise,
3378 pg_t pgid) {
3379 return old_acting_primary != new_acting_primary ||
3380 new_acting != old_acting ||
3381 old_up_primary != new_up_primary ||
3382 new_up != old_up ||
3383 old_min_size != new_min_size ||
3384 old_size != new_size ||
3385 pgid.is_split(old_pg_num, new_pg_num, 0) ||
3386 old_sort_bitwise != new_sort_bitwise;
3387 }
3388
3389 bool PastIntervals::is_new_interval(
3390 int old_acting_primary,
3391 int new_acting_primary,
3392 const vector<int> &old_acting,
3393 const vector<int> &new_acting,
3394 int old_up_primary,
3395 int new_up_primary,
3396 const vector<int> &old_up,
3397 const vector<int> &new_up,
3398 OSDMapRef osdmap,
3399 OSDMapRef lastmap,
3400 pg_t pgid) {
3401 return !(lastmap->get_pools().count(pgid.pool())) ||
3402 is_new_interval(old_acting_primary,
3403 new_acting_primary,
3404 old_acting,
3405 new_acting,
3406 old_up_primary,
3407 new_up_primary,
3408 old_up,
3409 new_up,
3410 lastmap->get_pools().find(pgid.pool())->second.size,
3411 osdmap->get_pools().find(pgid.pool())->second.size,
3412 lastmap->get_pools().find(pgid.pool())->second.min_size,
3413 osdmap->get_pools().find(pgid.pool())->second.min_size,
3414 lastmap->get_pg_num(pgid.pool()),
3415 osdmap->get_pg_num(pgid.pool()),
3416 lastmap->test_flag(CEPH_OSDMAP_SORTBITWISE),
3417 osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE),
3418 pgid);
3419 }
3420
3421 bool PastIntervals::check_new_interval(
3422 int old_acting_primary,
3423 int new_acting_primary,
3424 const vector<int> &old_acting,
3425 const vector<int> &new_acting,
3426 int old_up_primary,
3427 int new_up_primary,
3428 const vector<int> &old_up,
3429 const vector<int> &new_up,
3430 epoch_t same_interval_since,
3431 epoch_t last_epoch_clean,
3432 OSDMapRef osdmap,
3433 OSDMapRef lastmap,
3434 pg_t pgid,
3435 IsPGRecoverablePredicate *could_have_gone_active,
3436 PastIntervals *past_intervals,
3437 std::ostream *out)
3438 {
3439 /*
3440 * We have to be careful to gracefully deal with situations like
3441 * so. Say we have a power outage or something that takes out both
3442 * OSDs, but the monitor doesn't mark them down in the same epoch.
3443 * The history may look like
3444 *
3445 * 1: A B
3446 * 2: B
3447 * 3: let's say B dies for good, too (say, from the power spike)
3448 * 4: A
3449 *
3450 * which makes it look like B may have applied updates to the PG
3451 * that we need in order to proceed. This sucks...
3452 *
3453 * To minimize the risk of this happening, we CANNOT go active if
3454 * _any_ OSDs in the prior set are down until we send an MOSDAlive
3455 * to the monitor such that the OSDMap sets osd_up_thru to an epoch.
3456 * Then, we have something like
3457 *
3458 * 1: A B
3459 * 2: B up_thru[B]=0
3460 * 3:
3461 * 4: A
3462 *
3463 * -> we can ignore B, bc it couldn't have gone active (up_thru still 0).
3464 *
3465 * or,
3466 *
3467 * 1: A B
3468 * 2: B up_thru[B]=0
3469 * 3: B up_thru[B]=2
3470 * 4:
3471 * 5: A
3472 *
3473 * -> we must wait for B, bc it was alive through 2, and could have
3474 * written to the pg.
3475 *
3476 * If B is really dead, then an administrator will need to manually
3477 * intervene by marking the OSD as "lost."
3478 */
3479
3480 // remember past interval
3481 // NOTE: a change in the up set primary triggers an interval
3482 // change, even though the interval members in the pg_interval_t
3483 // do not change.
3484 assert(past_intervals);
3485 assert(past_intervals->past_intervals);
3486 if (is_new_interval(
3487 old_acting_primary,
3488 new_acting_primary,
3489 old_acting,
3490 new_acting,
3491 old_up_primary,
3492 new_up_primary,
3493 old_up,
3494 new_up,
3495 osdmap,
3496 lastmap,
3497 pgid)) {
3498 pg_interval_t i;
3499 i.first = same_interval_since;
3500 i.last = osdmap->get_epoch() - 1;
3501 assert(i.first <= i.last);
3502 i.acting = old_acting;
3503 i.up = old_up;
3504 i.primary = old_acting_primary;
3505 i.up_primary = old_up_primary;
3506
3507 unsigned num_acting = 0;
3508 for (vector<int>::const_iterator p = i.acting.begin(); p != i.acting.end();
3509 ++p)
3510 if (*p != CRUSH_ITEM_NONE)
3511 ++num_acting;
3512
3513 assert(lastmap->get_pools().count(pgid.pool()));
3514 const pg_pool_t& old_pg_pool = lastmap->get_pools().find(pgid.pool())->second;
3515 set<pg_shard_t> old_acting_shards;
3516 old_pg_pool.convert_to_pg_shards(old_acting, &old_acting_shards);
3517
3518 if (num_acting &&
3519 i.primary != -1 &&
3520 num_acting >= old_pg_pool.min_size &&
3521 (*could_have_gone_active)(old_acting_shards)) {
3522 if (out)
3523 *out << __func__ << " " << i
3524 << ": not rw,"
3525 << " up_thru " << lastmap->get_up_thru(i.primary)
3526 << " up_from " << lastmap->get_up_from(i.primary)
3527 << " last_epoch_clean " << last_epoch_clean
3528 << std::endl;
3529 if (lastmap->get_up_thru(i.primary) >= i.first &&
3530 lastmap->get_up_from(i.primary) <= i.first) {
3531 i.maybe_went_rw = true;
3532 if (out)
3533 *out << __func__ << " " << i
3534 << " : primary up " << lastmap->get_up_from(i.primary)
3535 << "-" << lastmap->get_up_thru(i.primary)
3536 << " includes interval"
3537 << std::endl;
3538 } else if (last_epoch_clean >= i.first &&
3539 last_epoch_clean <= i.last) {
3540 // If the last_epoch_clean is included in this interval, then
3541 // the pg must have been rw (for recovery to have completed).
3542 // This is important because we won't know the _real_
3543 // first_epoch because we stop at last_epoch_clean, and we
3544 // don't want the oldest interval to randomly have
3545 // maybe_went_rw false depending on the relative up_thru vs
3546 // last_epoch_clean timing.
3547 i.maybe_went_rw = true;
3548 if (out)
3549 *out << __func__ << " " << i
3550 << " : includes last_epoch_clean " << last_epoch_clean
3551 << " and presumed to have been rw"
3552 << std::endl;
3553 } else {
3554 i.maybe_went_rw = false;
3555 if (out)
3556 *out << __func__ << " " << i
3557 << " : primary up " << lastmap->get_up_from(i.primary)
3558 << "-" << lastmap->get_up_thru(i.primary)
3559 << " does not include interval"
3560 << std::endl;
3561 }
3562 } else {
3563 i.maybe_went_rw = false;
3564 if (out)
3565 *out << __func__ << " " << i << " : acting set is too small" << std::endl;
3566 }
3567 past_intervals->past_intervals->add_interval(old_pg_pool.ec_pool(), i);
3568 return true;
3569 } else {
3570 return false;
3571 }
3572 }
3573
3574
3575 // true if the given map affects the prior set
3576 bool PastIntervals::PriorSet::affected_by_map(
3577 const OSDMap &osdmap,
3578 const DoutPrefixProvider *dpp) const
3579 {
3580 for (set<pg_shard_t>::iterator p = probe.begin();
3581 p != probe.end();
3582 ++p) {
3583 int o = p->osd;
3584
3585 // did someone in the prior set go down?
3586 if (osdmap.is_down(o) && down.count(o) == 0) {
3587 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " now down" << dendl;
3588 return true;
3589 }
3590
3591 // did a down osd in cur get (re)marked as lost?
3592 map<int, epoch_t>::const_iterator r = blocked_by.find(o);
3593 if (r != blocked_by.end()) {
3594 if (!osdmap.exists(o)) {
3595 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " no longer exists" << dendl;
3596 return true;
3597 }
3598 if (osdmap.get_info(o).lost_at != r->second) {
3599 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " (re)marked as lost" << dendl;
3600 return true;
3601 }
3602 }
3603 }
3604
3605 // did someone in the prior down set go up?
3606 for (set<int>::const_iterator p = down.begin();
3607 p != down.end();
3608 ++p) {
3609 int o = *p;
3610
3611 if (osdmap.is_up(o)) {
3612 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " now up" << dendl;
3613 return true;
3614 }
3615
3616 // did someone in the prior set get lost or destroyed?
3617 if (!osdmap.exists(o)) {
3618 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " no longer exists" << dendl;
3619 return true;
3620 }
3621 // did a down osd in down get (re)marked as lost?
3622 map<int, epoch_t>::const_iterator r = blocked_by.find(o);
3623 if (r != blocked_by.end()) {
3624 if (osdmap.get_info(o).lost_at != r->second) {
3625 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " (re)marked as lost" << dendl;
3626 return true;
3627 }
3628 }
3629 }
3630
3631 return false;
3632 }
3633
3634 ostream& operator<<(ostream& out, const PastIntervals::pg_interval_t& i)
3635 {
3636 out << "interval(" << i.first << "-" << i.last
3637 << " up " << i.up << "(" << i.up_primary << ")"
3638 << " acting " << i.acting << "(" << i.primary << ")";
3639 if (i.maybe_went_rw)
3640 out << " maybe_went_rw";
3641 out << ")";
3642 return out;
3643 }
3644
3645
3646
3647 // -- pg_query_t --
3648
3649 void pg_query_t::encode(bufferlist &bl, uint64_t features) const {
3650 ENCODE_START(3, 3, bl);
3651 ::encode(type, bl);
3652 ::encode(since, bl);
3653 history.encode(bl);
3654 ::encode(epoch_sent, bl);
3655 ::encode(to, bl);
3656 ::encode(from, bl);
3657 ENCODE_FINISH(bl);
3658 }
3659
3660 void pg_query_t::decode(bufferlist::iterator &bl) {
3661 DECODE_START(3, bl);
3662 ::decode(type, bl);
3663 ::decode(since, bl);
3664 history.decode(bl);
3665 ::decode(epoch_sent, bl);
3666 ::decode(to, bl);
3667 ::decode(from, bl);
3668 DECODE_FINISH(bl);
3669 }
3670
3671 void pg_query_t::dump(Formatter *f) const
3672 {
3673 f->dump_int("from", from);
3674 f->dump_int("to", to);
3675 f->dump_string("type", get_type_name());
3676 f->dump_stream("since") << since;
3677 f->dump_stream("epoch_sent") << epoch_sent;
3678 f->open_object_section("history");
3679 history.dump(f);
3680 f->close_section();
3681 }
3682 void pg_query_t::generate_test_instances(list<pg_query_t*>& o)
3683 {
3684 o.push_back(new pg_query_t());
3685 list<pg_history_t*> h;
3686 pg_history_t::generate_test_instances(h);
3687 o.push_back(new pg_query_t(pg_query_t::INFO, shard_id_t(1), shard_id_t(2), *h.back(), 4));
3688 o.push_back(new pg_query_t(pg_query_t::MISSING, shard_id_t(2), shard_id_t(3), *h.back(), 4));
3689 o.push_back(new pg_query_t(pg_query_t::LOG, shard_id_t(0), shard_id_t(0),
3690 eversion_t(4, 5), *h.back(), 4));
3691 o.push_back(new pg_query_t(pg_query_t::FULLLOG,
3692 shard_id_t::NO_SHARD, shard_id_t::NO_SHARD,
3693 *h.back(), 5));
3694 }
3695
3696 // -- ObjectModDesc --
3697 void ObjectModDesc::visit(Visitor *visitor) const
3698 {
3699 bufferlist::iterator bp = bl.begin();
3700 try {
3701 while (!bp.end()) {
3702 DECODE_START(max_required_version, bp);
3703 uint8_t code;
3704 ::decode(code, bp);
3705 switch (code) {
3706 case APPEND: {
3707 uint64_t size;
3708 ::decode(size, bp);
3709 visitor->append(size);
3710 break;
3711 }
3712 case SETATTRS: {
3713 map<string, boost::optional<bufferlist> > attrs;
3714 ::decode(attrs, bp);
3715 visitor->setattrs(attrs);
3716 break;
3717 }
3718 case DELETE: {
3719 version_t old_version;
3720 ::decode(old_version, bp);
3721 visitor->rmobject(old_version);
3722 break;
3723 }
3724 case CREATE: {
3725 visitor->create();
3726 break;
3727 }
3728 case UPDATE_SNAPS: {
3729 set<snapid_t> snaps;
3730 ::decode(snaps, bp);
3731 visitor->update_snaps(snaps);
3732 break;
3733 }
3734 case TRY_DELETE: {
3735 version_t old_version;
3736 ::decode(old_version, bp);
3737 visitor->try_rmobject(old_version);
3738 break;
3739 }
3740 case ROLLBACK_EXTENTS: {
3741 vector<pair<uint64_t, uint64_t> > extents;
3742 version_t gen;
3743 ::decode(gen, bp);
3744 ::decode(extents, bp);
3745 visitor->rollback_extents(gen,extents);
3746 break;
3747 }
3748 default:
3749 assert(0 == "Invalid rollback code");
3750 }
3751 DECODE_FINISH(bp);
3752 }
3753 } catch (...) {
3754 assert(0 == "Invalid encoding");
3755 }
3756 }
3757
3758 struct DumpVisitor : public ObjectModDesc::Visitor {
3759 Formatter *f;
3760 explicit DumpVisitor(Formatter *f) : f(f) {}
3761 void append(uint64_t old_size) override {
3762 f->open_object_section("op");
3763 f->dump_string("code", "APPEND");
3764 f->dump_unsigned("old_size", old_size);
3765 f->close_section();
3766 }
3767 void setattrs(map<string, boost::optional<bufferlist> > &attrs) override {
3768 f->open_object_section("op");
3769 f->dump_string("code", "SETATTRS");
3770 f->open_array_section("attrs");
3771 for (map<string, boost::optional<bufferlist> >::iterator i = attrs.begin();
3772 i != attrs.end();
3773 ++i) {
3774 f->dump_string("attr_name", i->first);
3775 }
3776 f->close_section();
3777 f->close_section();
3778 }
3779 void rmobject(version_t old_version) override {
3780 f->open_object_section("op");
3781 f->dump_string("code", "RMOBJECT");
3782 f->dump_unsigned("old_version", old_version);
3783 f->close_section();
3784 }
3785 void try_rmobject(version_t old_version) override {
3786 f->open_object_section("op");
3787 f->dump_string("code", "TRY_RMOBJECT");
3788 f->dump_unsigned("old_version", old_version);
3789 f->close_section();
3790 }
3791 void create() override {
3792 f->open_object_section("op");
3793 f->dump_string("code", "CREATE");
3794 f->close_section();
3795 }
3796 void update_snaps(const set<snapid_t> &snaps) override {
3797 f->open_object_section("op");
3798 f->dump_string("code", "UPDATE_SNAPS");
3799 f->dump_stream("snaps") << snaps;
3800 f->close_section();
3801 }
3802 void rollback_extents(
3803 version_t gen,
3804 const vector<pair<uint64_t, uint64_t> > &extents) override {
3805 f->open_object_section("op");
3806 f->dump_string("code", "ROLLBACK_EXTENTS");
3807 f->dump_unsigned("gen", gen);
3808 f->dump_stream("snaps") << extents;
3809 f->close_section();
3810 }
3811 };
3812
3813 void ObjectModDesc::dump(Formatter *f) const
3814 {
3815 f->open_object_section("object_mod_desc");
3816 f->dump_bool("can_local_rollback", can_local_rollback);
3817 f->dump_bool("rollback_info_completed", rollback_info_completed);
3818 {
3819 f->open_array_section("ops");
3820 DumpVisitor vis(f);
3821 visit(&vis);
3822 f->close_section();
3823 }
3824 f->close_section();
3825 }
3826
3827 void ObjectModDesc::generate_test_instances(list<ObjectModDesc*>& o)
3828 {
3829 map<string, boost::optional<bufferlist> > attrs;
3830 attrs[OI_ATTR];
3831 attrs[SS_ATTR];
3832 attrs["asdf"];
3833 o.push_back(new ObjectModDesc());
3834 o.back()->append(100);
3835 o.back()->setattrs(attrs);
3836 o.push_back(new ObjectModDesc());
3837 o.back()->rmobject(1001);
3838 o.push_back(new ObjectModDesc());
3839 o.back()->create();
3840 o.back()->setattrs(attrs);
3841 o.push_back(new ObjectModDesc());
3842 o.back()->create();
3843 o.back()->setattrs(attrs);
3844 o.back()->mark_unrollbackable();
3845 o.back()->append(1000);
3846 }
3847
3848 void ObjectModDesc::encode(bufferlist &_bl) const
3849 {
3850 ENCODE_START(max_required_version, max_required_version, _bl);
3851 ::encode(can_local_rollback, _bl);
3852 ::encode(rollback_info_completed, _bl);
3853 ::encode(bl, _bl);
3854 ENCODE_FINISH(_bl);
3855 }
3856 void ObjectModDesc::decode(bufferlist::iterator &_bl)
3857 {
3858 DECODE_START(2, _bl);
3859 max_required_version = struct_v;
3860 ::decode(can_local_rollback, _bl);
3861 ::decode(rollback_info_completed, _bl);
3862 ::decode(bl, _bl);
3863 // ensure bl does not pin a larger buffer in memory
3864 bl.rebuild();
3865 DECODE_FINISH(_bl);
3866 }
3867
3868 // -- pg_log_entry_t --
3869
3870 string pg_log_entry_t::get_key_name() const
3871 {
3872 return version.get_key_name();
3873 }
3874
3875 void pg_log_entry_t::encode_with_checksum(bufferlist& bl) const
3876 {
3877 bufferlist ebl(sizeof(*this)*2);
3878 encode(ebl);
3879 __u32 crc = ebl.crc32c(0);
3880 ::encode(ebl, bl);
3881 ::encode(crc, bl);
3882 }
3883
3884 void pg_log_entry_t::decode_with_checksum(bufferlist::iterator& p)
3885 {
3886 bufferlist bl;
3887 ::decode(bl, p);
3888 __u32 crc;
3889 ::decode(crc, p);
3890 if (crc != bl.crc32c(0))
3891 throw buffer::malformed_input("bad checksum on pg_log_entry_t");
3892 bufferlist::iterator q = bl.begin();
3893 decode(q);
3894 }
3895
3896 void pg_log_entry_t::encode(bufferlist &bl) const
3897 {
3898 ENCODE_START(11, 4, bl);
3899 ::encode(op, bl);
3900 ::encode(soid, bl);
3901 ::encode(version, bl);
3902
3903 /**
3904 * Added with reverting_to:
3905 * Previous code used prior_version to encode
3906 * what we now call reverting_to. This will
3907 * allow older code to decode reverting_to
3908 * into prior_version as expected.
3909 */
3910 if (op == LOST_REVERT)
3911 ::encode(reverting_to, bl);
3912 else
3913 ::encode(prior_version, bl);
3914
3915 ::encode(reqid, bl);
3916 ::encode(mtime, bl);
3917 if (op == LOST_REVERT)
3918 ::encode(prior_version, bl);
3919 ::encode(snaps, bl);
3920 ::encode(user_version, bl);
3921 ::encode(mod_desc, bl);
3922 ::encode(extra_reqids, bl);
3923 if (op == ERROR)
3924 ::encode(return_code, bl);
3925 ENCODE_FINISH(bl);
3926 }
3927
3928 void pg_log_entry_t::decode(bufferlist::iterator &bl)
3929 {
3930 DECODE_START_LEGACY_COMPAT_LEN(11, 4, 4, bl);
3931 ::decode(op, bl);
3932 if (struct_v < 2) {
3933 sobject_t old_soid;
3934 ::decode(old_soid, bl);
3935 soid.oid = old_soid.oid;
3936 soid.snap = old_soid.snap;
3937 invalid_hash = true;
3938 } else {
3939 ::decode(soid, bl);
3940 }
3941 if (struct_v < 3)
3942 invalid_hash = true;
3943 ::decode(version, bl);
3944
3945 if (struct_v >= 6 && op == LOST_REVERT)
3946 ::decode(reverting_to, bl);
3947 else
3948 ::decode(prior_version, bl);
3949
3950 ::decode(reqid, bl);
3951
3952 ::decode(mtime, bl);
3953 if (struct_v < 5)
3954 invalid_pool = true;
3955
3956 if (op == LOST_REVERT) {
3957 if (struct_v >= 6) {
3958 ::decode(prior_version, bl);
3959 } else {
3960 reverting_to = prior_version;
3961 }
3962 }
3963 if (struct_v >= 7 || // for v >= 7, this is for all ops.
3964 op == CLONE) { // for v < 7, it's only present for CLONE.
3965 ::decode(snaps, bl);
3966 // ensure snaps does not pin a larger buffer in memory
3967 snaps.rebuild();
3968 }
3969
3970 if (struct_v >= 8)
3971 ::decode(user_version, bl);
3972 else
3973 user_version = version.version;
3974
3975 if (struct_v >= 9)
3976 ::decode(mod_desc, bl);
3977 else
3978 mod_desc.mark_unrollbackable();
3979 if (struct_v >= 10)
3980 ::decode(extra_reqids, bl);
3981 if (struct_v >= 11 && op == ERROR)
3982 ::decode(return_code, bl);
3983 DECODE_FINISH(bl);
3984 }
3985
3986 void pg_log_entry_t::dump(Formatter *f) const
3987 {
3988 f->dump_string("op", get_op_name());
3989 f->dump_stream("object") << soid;
3990 f->dump_stream("version") << version;
3991 f->dump_stream("prior_version") << prior_version;
3992 f->dump_stream("reqid") << reqid;
3993 f->open_array_section("extra_reqids");
3994 for (vector<pair<osd_reqid_t, version_t> >::const_iterator p =
3995 extra_reqids.begin();
3996 p != extra_reqids.end();
3997 ++p) {
3998 f->open_object_section("extra_reqid");
3999 f->dump_stream("reqid") << p->first;
4000 f->dump_stream("user_version") << p->second;
4001 f->close_section();
4002 }
4003 f->close_section();
4004 f->dump_stream("mtime") << mtime;
4005 f->dump_int("return_code", return_code);
4006 if (snaps.length() > 0) {
4007 vector<snapid_t> v;
4008 bufferlist c = snaps;
4009 bufferlist::iterator p = c.begin();
4010 try {
4011 ::decode(v, p);
4012 } catch (...) {
4013 v.clear();
4014 }
4015 f->open_object_section("snaps");
4016 for (vector<snapid_t>::iterator p = v.begin(); p != v.end(); ++p)
4017 f->dump_unsigned("snap", *p);
4018 f->close_section();
4019 }
4020 {
4021 f->open_object_section("mod_desc");
4022 mod_desc.dump(f);
4023 f->close_section();
4024 }
4025 }
4026
4027 void pg_log_entry_t::generate_test_instances(list<pg_log_entry_t*>& o)
4028 {
4029 o.push_back(new pg_log_entry_t());
4030 hobject_t oid(object_t("objname"), "key", 123, 456, 0, "");
4031 o.push_back(new pg_log_entry_t(MODIFY, oid, eversion_t(1,2), eversion_t(3,4),
4032 1, osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
4033 utime_t(8,9), 0));
4034 o.push_back(new pg_log_entry_t(ERROR, oid, eversion_t(1,2), eversion_t(3,4),
4035 1, osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
4036 utime_t(8,9), -ENOENT));
4037 }
4038
4039 ostream& operator<<(ostream& out, const pg_log_entry_t& e)
4040 {
4041 out << e.version << " (" << e.prior_version << ") "
4042 << std::left << std::setw(8) << e.get_op_name() << ' '
4043 << e.soid << " by " << e.reqid << " " << e.mtime
4044 << " " << e.return_code;
4045 if (e.snaps.length()) {
4046 vector<snapid_t> snaps;
4047 bufferlist c = e.snaps;
4048 bufferlist::iterator p = c.begin();
4049 try {
4050 ::decode(snaps, p);
4051 } catch (...) {
4052 snaps.clear();
4053 }
4054 out << " snaps " << snaps;
4055 }
4056 return out;
4057 }
4058
4059
4060 // -- pg_log_t --
4061
4062 // out: pg_log_t that only has entries that apply to import_pgid using curmap
4063 // reject: Entries rejected from "in" are in the reject.log. Other fields not set.
4064 void pg_log_t::filter_log(spg_t import_pgid, const OSDMap &curmap,
4065 const string &hit_set_namespace, const pg_log_t &in,
4066 pg_log_t &out, pg_log_t &reject)
4067 {
4068 out = in;
4069 out.log.clear();
4070 reject.log.clear();
4071
4072 for (list<pg_log_entry_t>::const_iterator i = in.log.begin();
4073 i != in.log.end(); ++i) {
4074
4075 // Reject pg log entries for temporary objects
4076 if (i->soid.is_temp()) {
4077 reject.log.push_back(*i);
4078 continue;
4079 }
4080
4081 if (i->soid.nspace != hit_set_namespace) {
4082 object_t oid = i->soid.oid;
4083 object_locator_t loc(i->soid);
4084 pg_t raw_pgid = curmap.object_locator_to_pg(oid, loc);
4085 pg_t pgid = curmap.raw_pg_to_pg(raw_pgid);
4086
4087 if (import_pgid.pgid == pgid) {
4088 out.log.push_back(*i);
4089 } else {
4090 reject.log.push_back(*i);
4091 }
4092 } else {
4093 out.log.push_back(*i);
4094 }
4095 }
4096 }
4097
4098 void pg_log_t::encode(bufferlist& bl) const
4099 {
4100 ENCODE_START(6, 3, bl);
4101 ::encode(head, bl);
4102 ::encode(tail, bl);
4103 ::encode(log, bl);
4104 ::encode(can_rollback_to, bl);
4105 ::encode(rollback_info_trimmed_to, bl);
4106 ENCODE_FINISH(bl);
4107 }
4108
4109 void pg_log_t::decode(bufferlist::iterator &bl, int64_t pool)
4110 {
4111 DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, bl);
4112 ::decode(head, bl);
4113 ::decode(tail, bl);
4114 if (struct_v < 2) {
4115 bool backlog;
4116 ::decode(backlog, bl);
4117 }
4118 ::decode(log, bl);
4119 if (struct_v >= 5)
4120 ::decode(can_rollback_to, bl);
4121
4122 if (struct_v >= 6)
4123 ::decode(rollback_info_trimmed_to, bl);
4124 else
4125 rollback_info_trimmed_to = tail;
4126 DECODE_FINISH(bl);
4127
4128 // handle hobject_t format change
4129 if (struct_v < 4) {
4130 for (list<pg_log_entry_t>::iterator i = log.begin();
4131 i != log.end();
4132 ++i) {
4133 if (!i->soid.is_max() && i->soid.pool == -1)
4134 i->soid.pool = pool;
4135 }
4136 }
4137 }
4138
4139 void pg_log_t::dump(Formatter *f) const
4140 {
4141 f->dump_stream("head") << head;
4142 f->dump_stream("tail") << tail;
4143 f->open_array_section("log");
4144 for (list<pg_log_entry_t>::const_iterator p = log.begin(); p != log.end(); ++p) {
4145 f->open_object_section("entry");
4146 p->dump(f);
4147 f->close_section();
4148 }
4149 f->close_section();
4150 }
4151
4152 void pg_log_t::generate_test_instances(list<pg_log_t*>& o)
4153 {
4154 o.push_back(new pg_log_t);
4155
4156 // this is nonsensical:
4157 o.push_back(new pg_log_t);
4158 o.back()->head = eversion_t(1,2);
4159 o.back()->tail = eversion_t(3,4);
4160 list<pg_log_entry_t*> e;
4161 pg_log_entry_t::generate_test_instances(e);
4162 for (list<pg_log_entry_t*>::iterator p = e.begin(); p != e.end(); ++p)
4163 o.back()->log.push_back(**p);
4164 }
4165
4166 void pg_log_t::copy_after(const pg_log_t &other, eversion_t v)
4167 {
4168 can_rollback_to = other.can_rollback_to;
4169 head = other.head;
4170 tail = other.tail;
4171 for (list<pg_log_entry_t>::const_reverse_iterator i = other.log.rbegin();
4172 i != other.log.rend();
4173 ++i) {
4174 assert(i->version > other.tail);
4175 if (i->version <= v) {
4176 // make tail accurate.
4177 tail = i->version;
4178 break;
4179 }
4180 log.push_front(*i);
4181 }
4182 }
4183
4184 void pg_log_t::copy_range(const pg_log_t &other, eversion_t from, eversion_t to)
4185 {
4186 can_rollback_to = other.can_rollback_to;
4187 list<pg_log_entry_t>::const_reverse_iterator i = other.log.rbegin();
4188 assert(i != other.log.rend());
4189 while (i->version > to) {
4190 ++i;
4191 assert(i != other.log.rend());
4192 }
4193 assert(i->version == to);
4194 head = to;
4195 for ( ; i != other.log.rend(); ++i) {
4196 if (i->version <= from) {
4197 tail = i->version;
4198 break;
4199 }
4200 log.push_front(*i);
4201 }
4202 }
4203
4204 void pg_log_t::copy_up_to(const pg_log_t &other, int max)
4205 {
4206 can_rollback_to = other.can_rollback_to;
4207 int n = 0;
4208 head = other.head;
4209 tail = other.tail;
4210 for (list<pg_log_entry_t>::const_reverse_iterator i = other.log.rbegin();
4211 i != other.log.rend();
4212 ++i) {
4213 if (n++ >= max) {
4214 tail = i->version;
4215 break;
4216 }
4217 log.push_front(*i);
4218 }
4219 }
4220
4221 ostream& pg_log_t::print(ostream& out) const
4222 {
4223 out << *this << std::endl;
4224 for (list<pg_log_entry_t>::const_iterator p = log.begin();
4225 p != log.end();
4226 ++p)
4227 out << *p << std::endl;
4228 return out;
4229 }
4230
4231 // -- pg_missing_t --
4232
4233 ostream& operator<<(ostream& out, const pg_missing_item& i)
4234 {
4235 out << i.need;
4236 if (i.have != eversion_t())
4237 out << "(" << i.have << ")";
4238 return out;
4239 }
4240
4241 // -- object_copy_cursor_t --
4242
// Serialize the copy cursor into bl (ENCODE_START(1, 1)).  The five fields
// are written in the same order object_copy_cursor_t::decode() reads them.
void object_copy_cursor_t::encode(bufferlist& bl) const
{
  ENCODE_START(1, 1, bl);
  ::encode(attr_complete, bl);
  ::encode(data_offset, bl);
  ::encode(data_complete, bl);
  ::encode(omap_offset, bl);
  ::encode(omap_complete, bl);
  ENCODE_FINISH(bl);
}
4253
// Restore the copy cursor from bl; reads the fields in the order
// object_copy_cursor_t::encode() wrote them.
void object_copy_cursor_t::decode(bufferlist::iterator &bl)
{
  DECODE_START(1, bl);
  ::decode(attr_complete, bl);
  ::decode(data_offset, bl);
  ::decode(data_complete, bl);
  ::decode(omap_offset, bl);
  ::decode(omap_complete, bl);
  DECODE_FINISH(bl);
}
4264
4265 void object_copy_cursor_t::dump(Formatter *f) const
4266 {
4267 f->dump_unsigned("attr_complete", (int)attr_complete);
4268 f->dump_unsigned("data_offset", data_offset);
4269 f->dump_unsigned("data_complete", (int)data_complete);
4270 f->dump_string("omap_offset", omap_offset);
4271 f->dump_unsigned("omap_complete", (int)omap_complete);
4272 }
4273
4274 void object_copy_cursor_t::generate_test_instances(list<object_copy_cursor_t*>& o)
4275 {
4276 o.push_back(new object_copy_cursor_t);
4277 o.push_back(new object_copy_cursor_t);
4278 o.back()->attr_complete = true;
4279 o.back()->data_offset = 123;
4280 o.push_back(new object_copy_cursor_t);
4281 o.back()->attr_complete = true;
4282 o.back()->data_complete = true;
4283 o.back()->omap_offset = "foo";
4284 o.push_back(new object_copy_cursor_t);
4285 o.back()->attr_complete = true;
4286 o.back()->data_complete = true;
4287 o.back()->omap_complete = true;
4288 }
4289
4290 // -- object_copy_data_t --
4291
// Serialize the copy payload into bl (ENCODE_START(7, 5)).  The write order
// matches the "current" branch of object_copy_data_t::decode().
// The features argument is not consulted by this function.
void object_copy_data_t::encode(bufferlist& bl, uint64_t features) const
{
  ENCODE_START(7, 5, bl);
  ::encode(size, bl);
  ::encode(mtime, bl);
  ::encode(attrs, bl);
  ::encode(data, bl);
  ::encode(omap_data, bl);
  ::encode(cursor, bl);
  ::encode(omap_header, bl);
  ::encode(snaps, bl);
  ::encode(snap_seq, bl);
  ::encode(flags, bl);
  ::encode(data_digest, bl);
  ::encode(omap_digest, bl);
  ::encode(reqids, bl);
  ::encode(truncate_seq, bl);
  ::encode(truncate_size, bl);
  ENCODE_FINISH(bl);
}
4312
4313 void object_copy_data_t::decode(bufferlist::iterator& bl)
4314 {
4315 DECODE_START(7, bl);
4316 if (struct_v < 5) {
4317 // old
4318 ::decode(size, bl);
4319 ::decode(mtime, bl);
4320 {
4321 string category;
4322 ::decode(category, bl); // no longer used
4323 }
4324 ::decode(attrs, bl);
4325 ::decode(data, bl);
4326 {
4327 map<string,bufferlist> omap;
4328 ::decode(omap, bl);
4329 omap_data.clear();
4330 if (!omap.empty())
4331 ::encode(omap, omap_data);
4332 }
4333 ::decode(cursor, bl);
4334 if (struct_v >= 2)
4335 ::decode(omap_header, bl);
4336 if (struct_v >= 3) {
4337 ::decode(snaps, bl);
4338 ::decode(snap_seq, bl);
4339 } else {
4340 snaps.clear();
4341 snap_seq = 0;
4342 }
4343 if (struct_v >= 4) {
4344 ::decode(flags, bl);
4345 ::decode(data_digest, bl);
4346 ::decode(omap_digest, bl);
4347 }
4348 } else {
4349 // current
4350 ::decode(size, bl);
4351 ::decode(mtime, bl);
4352 ::decode(attrs, bl);
4353 ::decode(data, bl);
4354 ::decode(omap_data, bl);
4355 ::decode(cursor, bl);
4356 ::decode(omap_header, bl);
4357 ::decode(snaps, bl);
4358 ::decode(snap_seq, bl);
4359 if (struct_v >= 4) {
4360 ::decode(flags, bl);
4361 ::decode(data_digest, bl);
4362 ::decode(omap_digest, bl);
4363 }
4364 if (struct_v >= 6) {
4365 ::decode(reqids, bl);
4366 }
4367 if (struct_v >= 7) {
4368 ::decode(truncate_seq, bl);
4369 ::decode(truncate_size, bl);
4370 }
4371 }
4372 DECODE_FINISH(bl);
4373 }
4374
4375 void object_copy_data_t::generate_test_instances(list<object_copy_data_t*>& o)
4376 {
4377 o.push_back(new object_copy_data_t());
4378
4379 list<object_copy_cursor_t*> cursors;
4380 object_copy_cursor_t::generate_test_instances(cursors);
4381 list<object_copy_cursor_t*>::iterator ci = cursors.begin();
4382 o.back()->cursor = **(ci++);
4383
4384 o.push_back(new object_copy_data_t());
4385 o.back()->cursor = **(ci++);
4386
4387 o.push_back(new object_copy_data_t());
4388 o.back()->size = 1234;
4389 o.back()->mtime.set_from_double(1234);
4390 bufferptr bp("there", 5);
4391 bufferlist bl;
4392 bl.push_back(bp);
4393 o.back()->attrs["hello"] = bl;
4394 bufferptr bp2("not", 3);
4395 bufferlist bl2;
4396 bl2.push_back(bp2);
4397 map<string,bufferlist> omap;
4398 omap["why"] = bl2;
4399 ::encode(omap, o.back()->omap_data);
4400 bufferptr databp("iamsomedatatocontain", 20);
4401 o.back()->data.push_back(databp);
4402 o.back()->omap_header.append("this is an omap header");
4403 o.back()->snaps.push_back(123);
4404 o.back()->reqids.push_back(make_pair(osd_reqid_t(), version_t()));
4405 }
4406
4407 void object_copy_data_t::dump(Formatter *f) const
4408 {
4409 f->open_object_section("cursor");
4410 cursor.dump(f);
4411 f->close_section(); // cursor
4412 f->dump_int("size", size);
4413 f->dump_stream("mtime") << mtime;
4414 /* we should really print out the attrs here, but bufferlist
4415 const-correctness prevents that */
4416 f->dump_int("attrs_size", attrs.size());
4417 f->dump_int("flags", flags);
4418 f->dump_unsigned("data_digest", data_digest);
4419 f->dump_unsigned("omap_digest", omap_digest);
4420 f->dump_int("omap_data_length", omap_data.length());
4421 f->dump_int("omap_header_length", omap_header.length());
4422 f->dump_int("data_length", data.length());
4423 f->open_array_section("snaps");
4424 for (vector<snapid_t>::const_iterator p = snaps.begin();
4425 p != snaps.end(); ++p)
4426 f->dump_unsigned("snap", *p);
4427 f->close_section();
4428 f->open_array_section("reqids");
4429 for (vector<pair<osd_reqid_t, version_t> >::const_iterator p = reqids.begin();
4430 p != reqids.end();
4431 ++p) {
4432 f->open_object_section("extra_reqid");
4433 f->dump_stream("reqid") << p->first;
4434 f->dump_stream("user_version") << p->second;
4435 f->close_section();
4436 }
4437 f->close_section();
4438 }
4439
4440 // -- pg_create_t --
4441
// Serialize created, parent and split_bits into bl (ENCODE_START(1, 1)),
// in the order pg_create_t::decode() reads them.
void pg_create_t::encode(bufferlist &bl) const
{
  ENCODE_START(1, 1, bl);
  ::encode(created, bl);
  ::encode(parent, bl);
  ::encode(split_bits, bl);
  ENCODE_FINISH(bl);
}
4450
// Restore created, parent and split_bits from bl, in the order
// pg_create_t::encode() wrote them.
void pg_create_t::decode(bufferlist::iterator &bl)
{
  DECODE_START(1, bl);
  ::decode(created, bl);
  ::decode(parent, bl);
  ::decode(split_bits, bl);
  DECODE_FINISH(bl);
}
4459
// Emit the three fields through f; parent is rendered via its stream
// operator.
void pg_create_t::dump(Formatter *f) const
{
  f->dump_unsigned("created", created);
  f->dump_stream("parent") << parent;
  f->dump_int("split_bits", split_bits);
}
4466
// Append two sample instances to o: a default-constructed one and one
// built from explicit constructor arguments.  The caller owns the pushed
// pointers.
void pg_create_t::generate_test_instances(list<pg_create_t*>& o)
{
  o.push_back(new pg_create_t);
  o.push_back(new pg_create_t(1, pg_t(3, 4, -1), 2));
}
4472
4473
4474 // -- pg_hit_set_info_t --
4475
// Serialize begin, end, version and using_gmt into bl (ENCODE_START(2, 1)).
// using_gmt is the field that decode() only expects from struct_v 2 on.
void pg_hit_set_info_t::encode(bufferlist& bl) const
{
  ENCODE_START(2, 1, bl);
  ::encode(begin, bl);
  ::encode(end, bl);
  ::encode(version, bl);
  ::encode(using_gmt, bl);
  ENCODE_FINISH(bl);
}
4485
4486 void pg_hit_set_info_t::decode(bufferlist::iterator& p)
4487 {
4488 DECODE_START(2, p);
4489 ::decode(begin, p);
4490 ::decode(end, p);
4491 ::decode(version, p);
4492 if (struct_v >= 2) {
4493 ::decode(using_gmt, p);
4494 } else {
4495 using_gmt = false;
4496 }
4497 DECODE_FINISH(p);
4498 }
4499
// Emit all four fields through f.  Each one, including the using_gmt
// flag, goes through dump_stream and therefore the value's stream operator.
void pg_hit_set_info_t::dump(Formatter *f) const
{
  f->dump_stream("begin") << begin;
  f->dump_stream("end") << end;
  f->dump_stream("version") << version;
  f->dump_stream("using_gmt") << using_gmt;
}
4507
4508 void pg_hit_set_info_t::generate_test_instances(list<pg_hit_set_info_t*>& ls)
4509 {
4510 ls.push_back(new pg_hit_set_info_t);
4511 ls.push_back(new pg_hit_set_info_t);
4512 ls.back()->begin = utime_t(1, 2);
4513 ls.back()->end = utime_t(3, 4);
4514 }
4515
4516
4517 // -- pg_hit_set_history_t --
4518
// Serialize the hit set history into bl (ENCODE_START(1, 1)).
// Between current_last_update and history, a default-constructed utime_t
// and a default-constructed pg_hit_set_info_t are written as placeholders;
// decode() reads and discards values in those same two positions.
void pg_hit_set_history_t::encode(bufferlist& bl) const
{
  ENCODE_START(1, 1, bl);
  ::encode(current_last_update, bl);
  {
    utime_t dummy_stamp;
    ::encode(dummy_stamp, bl);
  }
  {
    pg_hit_set_info_t dummy_info;
    ::encode(dummy_info, bl);
  }
  ::encode(history, bl);
  ENCODE_FINISH(bl);
}
4534
// Restore the hit set history from p.  The stamp and info values that sit
// between current_last_update and history are decoded into locals and
// dropped; only current_last_update and history are kept.
void pg_hit_set_history_t::decode(bufferlist::iterator& p)
{
  DECODE_START(1, p);
  ::decode(current_last_update, p);
  {
    utime_t dummy_stamp;
    ::decode(dummy_stamp, p);
  }
  {
    pg_hit_set_info_t dummy_info;
    ::decode(dummy_info, p);
  }
  ::decode(history, p);
  DECODE_FINISH(p);
}
4550
4551 void pg_hit_set_history_t::dump(Formatter *f) const
4552 {
4553 f->dump_stream("current_last_update") << current_last_update;
4554 f->open_array_section("history");
4555 for (list<pg_hit_set_info_t>::const_iterator p = history.begin();
4556 p != history.end(); ++p) {
4557 f->open_object_section("info");
4558 p->dump(f);
4559 f->close_section();
4560 }
4561 f->close_section();
4562 }
4563
4564 void pg_hit_set_history_t::generate_test_instances(list<pg_hit_set_history_t*>& ls)
4565 {
4566 ls.push_back(new pg_hit_set_history_t);
4567 ls.push_back(new pg_hit_set_history_t);
4568 ls.back()->current_last_update = eversion_t(1, 2);
4569 ls.back()->history.push_back(pg_hit_set_info_t());
4570 }
4571
4572 // -- osd_peer_stat_t --
4573
// Serialize the single stamp field into bl (ENCODE_START(1, 1)).
void osd_peer_stat_t::encode(bufferlist& bl) const
{
  ENCODE_START(1, 1, bl);
  ::encode(stamp, bl);
  ENCODE_FINISH(bl);
}
4580
// Restore the single stamp field from bl.
void osd_peer_stat_t::decode(bufferlist::iterator& bl)
{
  DECODE_START(1, bl);
  ::decode(stamp, bl);
  DECODE_FINISH(bl);
}
4587
// Emit the stamp through f using its stream operator.
void osd_peer_stat_t::dump(Formatter *f) const
{
  f->dump_stream("stamp") << stamp;
}
4592
4593 void osd_peer_stat_t::generate_test_instances(list<osd_peer_stat_t*>& o)
4594 {
4595 o.push_back(new osd_peer_stat_t);
4596 o.push_back(new osd_peer_stat_t);
4597 o.back()->stamp = utime_t(1, 2);
4598 }
4599
4600 ostream& operator<<(ostream& out, const osd_peer_stat_t &stat)
4601 {
4602 return out << "stat(" << stat.stamp << ")";
4603 }
4604
4605
4606 // -- OSDSuperblock --
4607
// Serialize the superblock into bl (ENCODE_START(8, 5)).
// The last two values are constant zero placeholders for fields this
// struct no longer keeps (see the trailing comments on those lines);
// decode() reads and discards the corresponding values.
void OSDSuperblock::encode(bufferlist &bl) const
{
  ENCODE_START(8, 5, bl);
  ::encode(cluster_fsid, bl);
  ::encode(whoami, bl);
  ::encode(current_epoch, bl);
  ::encode(oldest_map, bl);
  ::encode(newest_map, bl);
  ::encode(weight, bl);
  compat_features.encode(bl);
  ::encode(clean_thru, bl);
  ::encode(mounted, bl);
  ::encode(osd_fsid, bl);
  ::encode((epoch_t)0, bl);  // epoch_t last_epoch_marked_full
  ::encode((uint32_t)0, bl);  // map<int64_t,epoch_t> pool_last_epoch_marked_full
  ENCODE_FINISH(bl);
}
4625
4626 void OSDSuperblock::decode(bufferlist::iterator &bl)
4627 {
4628 DECODE_START_LEGACY_COMPAT_LEN(8, 5, 5, bl);
4629 if (struct_v < 3) {
4630 string magic;
4631 ::decode(magic, bl);
4632 }
4633 ::decode(cluster_fsid, bl);
4634 ::decode(whoami, bl);
4635 ::decode(current_epoch, bl);
4636 ::decode(oldest_map, bl);
4637 ::decode(newest_map, bl);
4638 ::decode(weight, bl);
4639 if (struct_v >= 2) {
4640 compat_features.decode(bl);
4641 } else { //upgrade it!
4642 compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
4643 }
4644 ::decode(clean_thru, bl);
4645 ::decode(mounted, bl);
4646 if (struct_v >= 4)
4647 ::decode(osd_fsid, bl);
4648 if (struct_v >= 6) {
4649 epoch_t last_map_marked_full;
4650 ::decode(last_map_marked_full, bl);
4651 }
4652 if (struct_v >= 7) {
4653 map<int64_t,epoch_t> pool_last_map_marked_full;
4654 ::decode(pool_last_map_marked_full, bl);
4655 }
4656 DECODE_FINISH(bl);
4657 }
4658
// Emit the superblock fields through f.  The compat feature set is nested
// in its own "compat" object; note that the `mounted` member is reported
// under the key "last_epoch_mounted".
void OSDSuperblock::dump(Formatter *f) const
{
  f->dump_stream("cluster_fsid") << cluster_fsid;
  f->dump_stream("osd_fsid") << osd_fsid;
  f->dump_int("whoami", whoami);
  f->dump_int("current_epoch", current_epoch);
  f->dump_int("oldest_map", oldest_map);
  f->dump_int("newest_map", newest_map);
  f->dump_float("weight", weight);
  f->open_object_section("compat");
  compat_features.dump(f);
  f->close_section();
  f->dump_int("clean_thru", clean_thru);
  f->dump_int("last_epoch_mounted", mounted);
}
4674
4675 void OSDSuperblock::generate_test_instances(list<OSDSuperblock*>& o)
4676 {
4677 OSDSuperblock z;
4678 o.push_back(new OSDSuperblock(z));
4679 memset(&z.cluster_fsid, 1, sizeof(z.cluster_fsid));
4680 memset(&z.osd_fsid, 2, sizeof(z.osd_fsid));
4681 z.whoami = 3;
4682 z.current_epoch = 4;
4683 z.oldest_map = 5;
4684 z.newest_map = 9;
4685 z.mounted = 8;
4686 z.clean_thru = 7;
4687 o.push_back(new OSDSuperblock(z));
4688 o.push_back(new OSDSuperblock(z));
4689 }
4690
4691 // -- SnapSet --
4692
// Serialize the snap set into bl (ENCODE_START(3, 2)).  clone_snaps is the
// field that decode() only expects from struct_v 3 on.
void SnapSet::encode(bufferlist& bl) const
{
  ENCODE_START(3, 2, bl);
  ::encode(seq, bl);
  ::encode(head_exists, bl);
  ::encode(snaps, bl);
  ::encode(clones, bl);
  ::encode(clone_overlap, bl);
  ::encode(clone_size, bl);
  ::encode(clone_snaps, bl);
  ENCODE_FINISH(bl);
}
4705
4706 void SnapSet::decode(bufferlist::iterator& bl)
4707 {
4708 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
4709 ::decode(seq, bl);
4710 ::decode(head_exists, bl);
4711 ::decode(snaps, bl);
4712 ::decode(clones, bl);
4713 ::decode(clone_overlap, bl);
4714 ::decode(clone_size, bl);
4715 if (struct_v >= 3) {
4716 ::decode(clone_snaps, bl);
4717 } else {
4718 clone_snaps.clear();
4719 }
4720 DECODE_FINISH(bl);
4721 }
4722
4723 void SnapSet::dump(Formatter *f) const
4724 {
4725 SnapContext sc(seq, snaps);
4726 f->open_object_section("snap_context");
4727 sc.dump(f);
4728 f->close_section();
4729 f->dump_int("head_exists", head_exists);
4730 f->open_array_section("clones");
4731 for (vector<snapid_t>::const_iterator p = clones.begin(); p != clones.end(); ++p) {
4732 f->open_object_section("clone");
4733 f->dump_unsigned("snap", *p);
4734 f->dump_unsigned("size", clone_size.find(*p)->second);
4735 f->dump_stream("overlap") << clone_overlap.find(*p)->second;
4736 auto q = clone_snaps.find(*p);
4737 if (q != clone_snaps.end()) {
4738 f->open_array_section("snaps");
4739 for (auto s : q->second) {
4740 f->dump_unsigned("snap", s);
4741 }
4742 f->close_section();
4743 }
4744 f->close_section();
4745 }
4746 f->close_section();
4747 }
4748
4749 void SnapSet::generate_test_instances(list<SnapSet*>& o)
4750 {
4751 o.push_back(new SnapSet);
4752 o.push_back(new SnapSet);
4753 o.back()->head_exists = true;
4754 o.back()->seq = 123;
4755 o.back()->snaps.push_back(123);
4756 o.back()->snaps.push_back(12);
4757 o.push_back(new SnapSet);
4758 o.back()->head_exists = true;
4759 o.back()->seq = 123;
4760 o.back()->snaps.push_back(123);
4761 o.back()->snaps.push_back(12);
4762 o.back()->clones.push_back(12);
4763 o.back()->clone_size[12] = 12345;
4764 o.back()->clone_overlap[12];
4765 o.back()->clone_snaps[12] = {12, 10, 8};
4766 }
4767
4768 ostream& operator<<(ostream& out, const SnapSet& cs)
4769 {
4770 if (cs.is_legacy()) {
4771 out << cs.seq << "=" << cs.snaps << ":"
4772 << cs.clones
4773 << (cs.head_exists ? "+head":"");
4774 if (!cs.clone_snaps.empty()) {
4775 out << "+stray_clone_snaps=" << cs.clone_snaps;
4776 }
4777 return out;
4778 } else {
4779 return out << cs.seq << "=" << cs.snaps << ":"
4780 << cs.clone_snaps;
4781 }
4782 }
4783
4784 void SnapSet::from_snap_set(const librados::snap_set_t& ss, bool legacy)
4785 {
4786 // NOTE: our reconstruction of snaps (and the snapc) is not strictly
4787 // correct: it will not include snaps that still logically exist
4788 // but for which there was no clone that is defined. For all
4789 // practical purposes this doesn't matter, since we only use that
4790 // information to clone on the OSD, and we have already moved
4791 // forward past that part of the object history.
4792
4793 seq = ss.seq;
4794 set<snapid_t> _snaps;
4795 set<snapid_t> _clones;
4796 head_exists = false;
4797 for (vector<librados::clone_info_t>::const_iterator p = ss.clones.begin();
4798 p != ss.clones.end();
4799 ++p) {
4800 if (p->cloneid == librados::SNAP_HEAD) {
4801 head_exists = true;
4802 } else {
4803 _clones.insert(p->cloneid);
4804 _snaps.insert(p->snaps.begin(), p->snaps.end());
4805 clone_size[p->cloneid] = p->size;
4806 clone_overlap[p->cloneid]; // the entry must exist, even if it's empty.
4807 for (vector<pair<uint64_t, uint64_t> >::const_iterator q =
4808 p->overlap.begin(); q != p->overlap.end(); ++q)
4809 clone_overlap[p->cloneid].insert(q->first, q->second);
4810 if (!legacy) {
4811 // p->snaps is ascending; clone_snaps is descending
4812 vector<snapid_t>& v = clone_snaps[p->cloneid];
4813 for (auto q = p->snaps.rbegin(); q != p->snaps.rend(); ++q) {
4814 v.push_back(*q);
4815 }
4816 }
4817 }
4818 }
4819
4820 // ascending
4821 clones.clear();
4822 clones.reserve(_clones.size());
4823 for (set<snapid_t>::iterator p = _clones.begin(); p != _clones.end(); ++p)
4824 clones.push_back(*p);
4825
4826 // descending
4827 snaps.clear();
4828 snaps.reserve(_snaps.size());
4829 for (set<snapid_t>::reverse_iterator p = _snaps.rbegin();
4830 p != _snaps.rend(); ++p)
4831 snaps.push_back(*p);
4832 }
4833
4834 uint64_t SnapSet::get_clone_bytes(snapid_t clone) const
4835 {
4836 assert(clone_size.count(clone));
4837 uint64_t size = clone_size.find(clone)->second;
4838 assert(clone_overlap.count(clone));
4839 const interval_set<uint64_t> &overlap = clone_overlap.find(clone)->second;
4840 for (interval_set<uint64_t>::const_iterator i = overlap.begin();
4841 i != overlap.end();
4842 ++i) {
4843 assert(size >= i.get_len());
4844 size -= i.get_len();
4845 }
4846 return size;
4847 }
4848
4849 void SnapSet::filter(const pg_pool_t &pinfo)
4850 {
4851 vector<snapid_t> oldsnaps;
4852 oldsnaps.swap(snaps);
4853 for (vector<snapid_t>::const_iterator i = oldsnaps.begin();
4854 i != oldsnaps.end();
4855 ++i) {
4856 if (!pinfo.is_removed_snap(*i))
4857 snaps.push_back(*i);
4858 }
4859 }
4860
4861 SnapSet SnapSet::get_filtered(const pg_pool_t &pinfo) const
4862 {
4863 SnapSet ss = *this;
4864 ss.filter(pinfo);
4865 return ss;
4866 }
4867
4868 // -- watch_info_t --
4869
// Serialize cookie, timeout_seconds and addr into bl (ENCODE_START(4, 3)).
// features is forwarded only to the addr encoder.
void watch_info_t::encode(bufferlist& bl, uint64_t features) const
{
  ENCODE_START(4, 3, bl);
  ::encode(cookie, bl);
  ::encode(timeout_seconds, bl);
  ::encode(addr, bl, features);
  ENCODE_FINISH(bl);
}
4878
4879 void watch_info_t::decode(bufferlist::iterator& bl)
4880 {
4881 DECODE_START_LEGACY_COMPAT_LEN(4, 3, 3, bl);
4882 ::decode(cookie, bl);
4883 if (struct_v < 2) {
4884 uint64_t ver;
4885 ::decode(ver, bl);
4886 }
4887 ::decode(timeout_seconds, bl);
4888 if (struct_v >= 4) {
4889 ::decode(addr, bl);
4890 }
4891 DECODE_FINISH(bl);
4892 }
4893
// Emit cookie and timeout_seconds, then addr nested in its own "addr"
// object.
void watch_info_t::dump(Formatter *f) const
{
  f->dump_unsigned("cookie", cookie);
  f->dump_unsigned("timeout_seconds", timeout_seconds);
  f->open_object_section("addr");
  addr.dump(f);
  f->close_section();
}
4902
4903 void watch_info_t::generate_test_instances(list<watch_info_t*>& o)
4904 {
4905 o.push_back(new watch_info_t);
4906 o.push_back(new watch_info_t);
4907 o.back()->cookie = 123;
4908 o.back()->timeout_seconds = 99;
4909 entity_addr_t ea;
4910 ea.set_type(entity_addr_t::TYPE_LEGACY);
4911 ea.set_nonce(1);
4912 ea.set_family(AF_INET);
4913 ea.set_in4_quad(0, 127);
4914 ea.set_in4_quad(1, 0);
4915 ea.set_in4_quad(2, 1);
4916 ea.set_in4_quad(3, 2);
4917 ea.set_port(2);
4918 o.back()->addr = ea;
4919 }
4920
4921
4922 // -- object_info_t --
4923
// Copy the user-visible attributes of `other` into this object info:
// size, mtimes, last request id, truncate state, flags, user version and
// both digests.  Members not listed below (for example soid, version and
// watchers) are left untouched.
void object_info_t::copy_user_bits(const object_info_t& other)
{
  // these bits are copied from head->clone.
  size = other.size;
  mtime = other.mtime;
  local_mtime = other.local_mtime;
  last_reqid = other.last_reqid;
  truncate_seq = other.truncate_seq;
  truncate_size = other.truncate_size;
  flags = other.flags;
  user_version = other.user_version;
  data_digest = other.data_digest;
  omap_digest = other.omap_digest;
}
4938
4939 ps_t object_info_t::legacy_object_locator_to_ps(const object_t &oid,
4940 const object_locator_t &loc) {
4941 ps_t ps;
4942 if (loc.key.length())
4943 // Hack, we don't have the osd map, so we don't really know the hash...
4944 ps = ceph_str_hash(CEPH_STR_HASH_RJENKINS, loc.key.c_str(),
4945 loc.key.length());
4946 else
4947 ps = ceph_str_hash(CEPH_STR_HASH_RJENKINS, oid.name.c_str(),
4948 oid.name.length());
4949 return ps;
4950 }
4951
4952 void object_info_t::encode(bufferlist& bl, uint64_t features) const
4953 {
4954 object_locator_t myoloc(soid);
4955 map<entity_name_t, watch_info_t> old_watchers;
4956 for (map<pair<uint64_t, entity_name_t>, watch_info_t>::const_iterator i =
4957 watchers.begin();
4958 i != watchers.end();
4959 ++i) {
4960 old_watchers.insert(make_pair(i->first.second, i->second));
4961 }
4962 ENCODE_START(16, 8, bl);
4963 ::encode(soid, bl);
4964 ::encode(myoloc, bl); //Retained for compatibility
4965 ::encode((__u32)0, bl); // was category, no longer used
4966 ::encode(version, bl);
4967 ::encode(prior_version, bl);
4968 ::encode(last_reqid, bl);
4969 ::encode(size, bl);
4970 ::encode(mtime, bl);
4971 if (soid.snap == CEPH_NOSNAP)
4972 ::encode(osd_reqid_t(), bl); // used to be wrlock_by
4973 else
4974 ::encode(legacy_snaps, bl);
4975 ::encode(truncate_seq, bl);
4976 ::encode(truncate_size, bl);
4977 ::encode(is_lost(), bl);
4978 ::encode(old_watchers, bl, features);
4979 /* shenanigans to avoid breaking backwards compatibility in the disk format.
4980 * When we can, switch this out for simply putting the version_t on disk. */
4981 eversion_t user_eversion(0, user_version);
4982 ::encode(user_eversion, bl);
4983 ::encode(test_flag(FLAG_USES_TMAP), bl);
4984 ::encode(watchers, bl, features);
4985 __u32 _flags = flags;
4986 ::encode(_flags, bl);
4987 ::encode(local_mtime, bl);
4988 ::encode(data_digest, bl);
4989 ::encode(omap_digest, bl);
4990 ::encode(expected_object_size, bl);
4991 ::encode(expected_write_size, bl);
4992 ::encode(alloc_hint_flags, bl);
4993 ENCODE_FINISH(bl);
4994 }
4995
4996 void object_info_t::decode(bufferlist::iterator& bl)
4997 {
4998 object_locator_t myoloc;
4999 DECODE_START_LEGACY_COMPAT_LEN(16, 8, 8, bl);
5000 map<entity_name_t, watch_info_t> old_watchers;
5001 ::decode(soid, bl);
5002 ::decode(myoloc, bl);
5003 {
5004 string category;
5005 ::decode(category, bl); // no longer used
5006 }
5007 ::decode(version, bl);
5008 ::decode(prior_version, bl);
5009 ::decode(last_reqid, bl);
5010 ::decode(size, bl);
5011 ::decode(mtime, bl);
5012 if (soid.snap == CEPH_NOSNAP) {
5013 osd_reqid_t wrlock_by;
5014 ::decode(wrlock_by, bl);
5015 } else {
5016 ::decode(legacy_snaps, bl);
5017 }
5018 ::decode(truncate_seq, bl);
5019 ::decode(truncate_size, bl);
5020
5021 // if this is struct_v >= 13, we will overwrite this
5022 // below since this field is just here for backwards
5023 // compatibility
5024 __u8 lo;
5025 ::decode(lo, bl);
5026 flags = (flag_t)lo;
5027
5028 ::decode(old_watchers, bl);
5029 eversion_t user_eversion;
5030 ::decode(user_eversion, bl);
5031 user_version = user_eversion.version;
5032
5033 if (struct_v >= 9) {
5034 bool uses_tmap = false;
5035 ::decode(uses_tmap, bl);
5036 if (uses_tmap)
5037 set_flag(FLAG_USES_TMAP);
5038 } else {
5039 set_flag(FLAG_USES_TMAP);
5040 }
5041 if (struct_v < 10)
5042 soid.pool = myoloc.pool;
5043 if (struct_v >= 11) {
5044 ::decode(watchers, bl);
5045 } else {
5046 for (map<entity_name_t, watch_info_t>::iterator i = old_watchers.begin();
5047 i != old_watchers.end();
5048 ++i) {
5049 watchers.insert(
5050 make_pair(
5051 make_pair(i->second.cookie, i->first), i->second));
5052 }
5053 }
5054 if (struct_v >= 13) {
5055 __u32 _flags;
5056 ::decode(_flags, bl);
5057 flags = (flag_t)_flags;
5058 }
5059 if (struct_v >= 14) {
5060 ::decode(local_mtime, bl);
5061 } else {
5062 local_mtime = utime_t();
5063 }
5064 if (struct_v >= 15) {
5065 ::decode(data_digest, bl);
5066 ::decode(omap_digest, bl);
5067 } else {
5068 data_digest = omap_digest = -1;
5069 clear_flag(FLAG_DATA_DIGEST);
5070 clear_flag(FLAG_OMAP_DIGEST);
5071 }
5072 if (struct_v >= 16) {
5073 ::decode(expected_object_size, bl);
5074 ::decode(expected_write_size, bl);
5075 ::decode(alloc_hint_flags, bl);
5076 } else {
5077 expected_object_size = 0;
5078 expected_write_size = 0;
5079 alloc_hint_flags = 0;
5080 }
5081 DECODE_FINISH(bl);
5082 }
5083
5084 void object_info_t::dump(Formatter *f) const
5085 {
5086 f->open_object_section("oid");
5087 soid.dump(f);
5088 f->close_section();
5089 f->dump_stream("version") << version;
5090 f->dump_stream("prior_version") << prior_version;
5091 f->dump_stream("last_reqid") << last_reqid;
5092 f->dump_unsigned("user_version", user_version);
5093 f->dump_unsigned("size", size);
5094 f->dump_stream("mtime") << mtime;
5095 f->dump_stream("local_mtime") << local_mtime;
5096 f->dump_unsigned("lost", (int)is_lost());
5097 f->dump_unsigned("flags", (int)flags);
5098 f->open_array_section("legacy_snaps");
5099 for (auto s : legacy_snaps) {
5100 f->dump_unsigned("snap", s);
5101 }
5102 f->close_section();
5103 f->dump_unsigned("truncate_seq", truncate_seq);
5104 f->dump_unsigned("truncate_size", truncate_size);
5105 f->dump_unsigned("data_digest", data_digest);
5106 f->dump_unsigned("omap_digest", omap_digest);
5107 f->dump_unsigned("expected_object_size", expected_object_size);
5108 f->dump_unsigned("expected_write_size", expected_write_size);
5109 f->dump_unsigned("alloc_hint_flags", alloc_hint_flags);
5110 f->open_object_section("watchers");
5111 for (map<pair<uint64_t, entity_name_t>,watch_info_t>::const_iterator p =
5112 watchers.begin(); p != watchers.end(); ++p) {
5113 stringstream ss;
5114 ss << p->first.second;
5115 f->open_object_section(ss.str().c_str());
5116 p->second.dump(f);
5117 f->close_section();
5118 }
5119 f->close_section();
5120 }
5121
// Append a single default-constructed object_info_t to o; the caller owns
// the pushed pointer.  No populated sample is generated yet (see the
// fixme below).
void object_info_t::generate_test_instances(list<object_info_t*>& o)
{
  o.push_back(new object_info_t());

  // fixme
}
5128
5129
5130 ostream& operator<<(ostream& out, const object_info_t& oi)
5131 {
5132 out << oi.soid << "(" << oi.version
5133 << " " << oi.last_reqid;
5134 if (oi.soid.snap != CEPH_NOSNAP && !oi.legacy_snaps.empty())
5135 out << " " << oi.legacy_snaps;
5136 if (oi.flags)
5137 out << " " << oi.get_flag_string();
5138 out << " s " << oi.size;
5139 out << " uv " << oi.user_version;
5140 if (oi.is_data_digest())
5141 out << " dd " << std::hex << oi.data_digest << std::dec;
5142 if (oi.is_omap_digest())
5143 out << " od " << std::hex << oi.omap_digest << std::dec;
5144 out << " alloc_hint [" << oi.expected_object_size
5145 << " " << oi.expected_write_size
5146 << " " << oi.alloc_hint_flags << "]";
5147
5148 out << ")";
5149 return out;
5150 }
5151
5152 // -- ObjectRecovery --
// Serialize the recovery progress into bl (ENCODE_START(1, 1)), in the
// order ObjectRecoveryProgress::decode() reads the fields.
void ObjectRecoveryProgress::encode(bufferlist &bl) const
{
  ENCODE_START(1, 1, bl);
  ::encode(first, bl);
  ::encode(data_complete, bl);
  ::encode(data_recovered_to, bl);
  ::encode(omap_recovered_to, bl);
  ::encode(omap_complete, bl);
  ENCODE_FINISH(bl);
}
5163
// Restore the recovery progress from bl, in the order
// ObjectRecoveryProgress::encode() wrote the fields.
void ObjectRecoveryProgress::decode(bufferlist::iterator &bl)
{
  DECODE_START(1, bl);
  ::decode(first, bl);
  ::decode(data_complete, bl);
  ::decode(data_recovered_to, bl);
  ::decode(omap_recovered_to, bl);
  ::decode(omap_complete, bl);
  DECODE_FINISH(bl);
}
5174
// Stream operator for ObjectRecoveryProgress; delegates to prog.print().
ostream &operator<<(ostream &out, const ObjectRecoveryProgress &prog)
{
  return prog.print(out);
}
5179
5180 void ObjectRecoveryProgress::generate_test_instances(
5181 list<ObjectRecoveryProgress*>& o)
5182 {
5183 o.push_back(new ObjectRecoveryProgress);
5184 o.back()->first = false;
5185 o.back()->data_complete = true;
5186 o.back()->omap_complete = true;
5187 o.back()->data_recovered_to = 100;
5188
5189 o.push_back(new ObjectRecoveryProgress);
5190 o.back()->first = true;
5191 o.back()->data_complete = false;
5192 o.back()->omap_complete = false;
5193 o.back()->data_recovered_to = 0;
5194 }
5195
5196 ostream &ObjectRecoveryProgress::print(ostream &out) const
5197 {
5198 return out << "ObjectRecoveryProgress("
5199 << ( first ? "" : "!" ) << "first, "
5200 << "data_recovered_to:" << data_recovered_to
5201 << ", data_complete:" << ( data_complete ? "true" : "false" )
5202 << ", omap_recovered_to:" << omap_recovered_to
5203 << ", omap_complete:" << ( omap_complete ? "true" : "false" )
5204 << ")";
5205 }
5206
// Emit the progress fields through f.  The boolean members are emitted
// with dump_int, under keys that end in '?'.
void ObjectRecoveryProgress::dump(Formatter *f) const
{
  f->dump_int("first?", first);
  f->dump_int("data_complete?", data_complete);
  f->dump_unsigned("data_recovered_to", data_recovered_to);
  f->dump_int("omap_complete?", omap_complete);
  f->dump_string("omap_recovered_to", omap_recovered_to);
}
5215
// Serialize the recovery info into bl (ENCODE_START(2, 1)).  features is
// forwarded only to the embedded object_info_t encoder.
void ObjectRecoveryInfo::encode(bufferlist &bl, uint64_t features) const
{
  ENCODE_START(2, 1, bl);
  ::encode(soid, bl);
  ::encode(version, bl);
  ::encode(size, bl);
  ::encode(oi, bl, features);
  ::encode(ss, bl);
  ::encode(copy_subset, bl);
  ::encode(clone_subset, bl);
  ENCODE_FINISH(bl);
}
5228
5229 void ObjectRecoveryInfo::decode(bufferlist::iterator &bl,
5230 int64_t pool)
5231 {
5232 DECODE_START(2, bl);
5233 ::decode(soid, bl);
5234 ::decode(version, bl);
5235 ::decode(size, bl);
5236 ::decode(oi, bl);
5237 ::decode(ss, bl);
5238 ::decode(copy_subset, bl);
5239 ::decode(clone_subset, bl);
5240 DECODE_FINISH(bl);
5241
5242 if (struct_v < 2) {
5243 if (!soid.is_max() && soid.pool == -1)
5244 soid.pool = pool;
5245 map<hobject_t, interval_set<uint64_t>> tmp;
5246 tmp.swap(clone_subset);
5247 for (map<hobject_t, interval_set<uint64_t>>::iterator i = tmp.begin();
5248 i != tmp.end();
5249 ++i) {
5250 hobject_t first(i->first);
5251 if (!first.is_max() && first.pool == -1)
5252 first.pool = pool;
5253 clone_subset[first].swap(i->second);
5254 }
5255 }
5256 }
5257
5258 void ObjectRecoveryInfo::generate_test_instances(
5259 list<ObjectRecoveryInfo*>& o)
5260 {
5261 o.push_back(new ObjectRecoveryInfo);
5262 o.back()->soid = hobject_t(sobject_t("key", CEPH_NOSNAP));
5263 o.back()->version = eversion_t(0,0);
5264 o.back()->size = 100;
5265 }
5266
5267
5268 void ObjectRecoveryInfo::dump(Formatter *f) const
5269 {
5270 f->dump_stream("object") << soid;
5271 f->dump_stream("at_version") << version;
5272 f->dump_stream("size") << size;
5273 {
5274 f->open_object_section("object_info");
5275 oi.dump(f);
5276 f->close_section();
5277 }
5278 {
5279 f->open_object_section("snapset");
5280 ss.dump(f);
5281 f->close_section();
5282 }
5283 f->dump_stream("copy_subset") << copy_subset;
5284 f->dump_stream("clone_subset") << clone_subset;
5285 }
5286
5287 ostream& operator<<(ostream& out, const ObjectRecoveryInfo &inf)
5288 {
5289 return inf.print(out);
5290 }
5291
5292 ostream &ObjectRecoveryInfo::print(ostream &out) const
5293 {
5294 return out << "ObjectRecoveryInfo("
5295 << soid << "@" << version
5296 << ", size: " << size
5297 << ", copy_subset: " << copy_subset
5298 << ", clone_subset: " << clone_subset
5299 << ", snapset: " << ss
5300 << ")";
5301 }
5302
5303 // -- PushReplyOp --
5304 void PushReplyOp::generate_test_instances(list<PushReplyOp*> &o)
5305 {
5306 o.push_back(new PushReplyOp);
5307 o.push_back(new PushReplyOp);
5308 o.back()->soid = hobject_t(sobject_t("asdf", 2));
5309 o.push_back(new PushReplyOp);
5310 o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP));
5311 }
5312
5313 void PushReplyOp::encode(bufferlist &bl) const
5314 {
5315 ENCODE_START(1, 1, bl);
5316 ::encode(soid, bl);
5317 ENCODE_FINISH(bl);
5318 }
5319
5320 void PushReplyOp::decode(bufferlist::iterator &bl)
5321 {
5322 DECODE_START(1, bl);
5323 ::decode(soid, bl);
5324 DECODE_FINISH(bl);
5325 }
5326
5327 void PushReplyOp::dump(Formatter *f) const
5328 {
5329 f->dump_stream("soid") << soid;
5330 }
5331
5332 ostream &PushReplyOp::print(ostream &out) const
5333 {
5334 return out
5335 << "PushReplyOp(" << soid
5336 << ")";
5337 }
5338
5339 ostream& operator<<(ostream& out, const PushReplyOp &op)
5340 {
5341 return op.print(out);
5342 }
5343
5344 uint64_t PushReplyOp::cost(CephContext *cct) const
5345 {
5346
5347 return cct->_conf->osd_push_per_object_cost +
5348 cct->_conf->osd_recovery_max_chunk;
5349 }
5350
5351 // -- PullOp --
5352 void PullOp::generate_test_instances(list<PullOp*> &o)
5353 {
5354 o.push_back(new PullOp);
5355 o.push_back(new PullOp);
5356 o.back()->soid = hobject_t(sobject_t("asdf", 2));
5357 o.back()->recovery_info.version = eversion_t(3, 10);
5358 o.push_back(new PullOp);
5359 o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP));
5360 o.back()->recovery_info.version = eversion_t(0, 0);
5361 }
5362
5363 void PullOp::encode(bufferlist &bl, uint64_t features) const
5364 {
5365 ENCODE_START(1, 1, bl);
5366 ::encode(soid, bl);
5367 ::encode(recovery_info, bl, features);
5368 ::encode(recovery_progress, bl);
5369 ENCODE_FINISH(bl);
5370 }
5371
5372 void PullOp::decode(bufferlist::iterator &bl)
5373 {
5374 DECODE_START(1, bl);
5375 ::decode(soid, bl);
5376 ::decode(recovery_info, bl);
5377 ::decode(recovery_progress, bl);
5378 DECODE_FINISH(bl);
5379 }
5380
5381 void PullOp::dump(Formatter *f) const
5382 {
5383 f->dump_stream("soid") << soid;
5384 {
5385 f->open_object_section("recovery_info");
5386 recovery_info.dump(f);
5387 f->close_section();
5388 }
5389 {
5390 f->open_object_section("recovery_progress");
5391 recovery_progress.dump(f);
5392 f->close_section();
5393 }
5394 }
5395
5396 ostream &PullOp::print(ostream &out) const
5397 {
5398 return out
5399 << "PullOp(" << soid
5400 << ", recovery_info: " << recovery_info
5401 << ", recovery_progress: " << recovery_progress
5402 << ")";
5403 }
5404
5405 ostream& operator<<(ostream& out, const PullOp &op)
5406 {
5407 return op.print(out);
5408 }
5409
5410 uint64_t PullOp::cost(CephContext *cct) const
5411 {
5412 return cct->_conf->osd_push_per_object_cost +
5413 cct->_conf->osd_recovery_max_chunk;
5414 }
5415
5416 // -- PushOp --
5417 void PushOp::generate_test_instances(list<PushOp*> &o)
5418 {
5419 o.push_back(new PushOp);
5420 o.push_back(new PushOp);
5421 o.back()->soid = hobject_t(sobject_t("asdf", 2));
5422 o.back()->version = eversion_t(3, 10);
5423 o.push_back(new PushOp);
5424 o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP));
5425 o.back()->version = eversion_t(0, 0);
5426 }
5427
5428 void PushOp::encode(bufferlist &bl, uint64_t features) const
5429 {
5430 ENCODE_START(1, 1, bl);
5431 ::encode(soid, bl);
5432 ::encode(version, bl);
5433 ::encode(data, bl);
5434 ::encode(data_included, bl);
5435 ::encode(omap_header, bl);
5436 ::encode(omap_entries, bl);
5437 ::encode(attrset, bl);
5438 ::encode(recovery_info, bl, features);
5439 ::encode(after_progress, bl);
5440 ::encode(before_progress, bl);
5441 ENCODE_FINISH(bl);
5442 }
5443
5444 void PushOp::decode(bufferlist::iterator &bl)
5445 {
5446 DECODE_START(1, bl);
5447 ::decode(soid, bl);
5448 ::decode(version, bl);
5449 ::decode(data, bl);
5450 ::decode(data_included, bl);
5451 ::decode(omap_header, bl);
5452 ::decode(omap_entries, bl);
5453 ::decode(attrset, bl);
5454 ::decode(recovery_info, bl);
5455 ::decode(after_progress, bl);
5456 ::decode(before_progress, bl);
5457 DECODE_FINISH(bl);
5458 }
5459
5460 void PushOp::dump(Formatter *f) const
5461 {
5462 f->dump_stream("soid") << soid;
5463 f->dump_stream("version") << version;
5464 f->dump_int("data_len", data.length());
5465 f->dump_stream("data_included") << data_included;
5466 f->dump_int("omap_header_len", omap_header.length());
5467 f->dump_int("omap_entries_len", omap_entries.size());
5468 f->dump_int("attrset_len", attrset.size());
5469 {
5470 f->open_object_section("recovery_info");
5471 recovery_info.dump(f);
5472 f->close_section();
5473 }
5474 {
5475 f->open_object_section("after_progress");
5476 after_progress.dump(f);
5477 f->close_section();
5478 }
5479 {
5480 f->open_object_section("before_progress");
5481 before_progress.dump(f);
5482 f->close_section();
5483 }
5484 }
5485
5486 ostream &PushOp::print(ostream &out) const
5487 {
5488 return out
5489 << "PushOp(" << soid
5490 << ", version: " << version
5491 << ", data_included: " << data_included
5492 << ", data_size: " << data.length()
5493 << ", omap_header_size: " << omap_header.length()
5494 << ", omap_entries_size: " << omap_entries.size()
5495 << ", attrset_size: " << attrset.size()
5496 << ", recovery_info: " << recovery_info
5497 << ", after_progress: " << after_progress
5498 << ", before_progress: " << before_progress
5499 << ")";
5500 }
5501
5502 ostream& operator<<(ostream& out, const PushOp &op)
5503 {
5504 return op.print(out);
5505 }
5506
5507 uint64_t PushOp::cost(CephContext *cct) const
5508 {
5509 uint64_t cost = data_included.size();
5510 for (map<string, bufferlist>::const_iterator i =
5511 omap_entries.begin();
5512 i != omap_entries.end();
5513 ++i) {
5514 cost += i->second.length();
5515 }
5516 cost += cct->_conf->osd_push_per_object_cost;
5517 return cost;
5518 }
5519
5520 // -- ScrubMap --
5521
5522 void ScrubMap::merge_incr(const ScrubMap &l)
5523 {
5524 assert(valid_through == l.incr_since);
5525 valid_through = l.valid_through;
5526
5527 for (map<hobject_t,object>::const_iterator p = l.objects.begin();
5528 p != l.objects.end();
5529 ++p){
5530 if (p->second.negative) {
5531 map<hobject_t,object>::iterator q = objects.find(p->first);
5532 if (q != objects.end()) {
5533 objects.erase(q);
5534 }
5535 } else {
5536 objects[p->first] = p->second;
5537 }
5538 }
5539 }
5540
5541 void ScrubMap::encode(bufferlist& bl) const
5542 {
5543 ENCODE_START(3, 2, bl);
5544 ::encode(objects, bl);
5545 ::encode((__u32)0, bl); // used to be attrs; now deprecated
5546 bufferlist old_logbl; // not used
5547 ::encode(old_logbl, bl);
5548 ::encode(valid_through, bl);
5549 ::encode(incr_since, bl);
5550 ENCODE_FINISH(bl);
5551 }
5552
5553 void ScrubMap::decode(bufferlist::iterator& bl, int64_t pool)
5554 {
5555 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
5556 ::decode(objects, bl);
5557 {
5558 map<string,string> attrs; // deprecated
5559 ::decode(attrs, bl);
5560 }
5561 bufferlist old_logbl; // not used
5562 ::decode(old_logbl, bl);
5563 ::decode(valid_through, bl);
5564 ::decode(incr_since, bl);
5565 DECODE_FINISH(bl);
5566
5567 // handle hobject_t upgrade
5568 if (struct_v < 3) {
5569 map<hobject_t, object> tmp;
5570 tmp.swap(objects);
5571 for (map<hobject_t, object>::iterator i = tmp.begin();
5572 i != tmp.end();
5573 ++i) {
5574 hobject_t first(i->first);
5575 if (!first.is_max() && first.pool == -1)
5576 first.pool = pool;
5577 objects[first] = i->second;
5578 }
5579 }
5580 }
5581
5582 void ScrubMap::dump(Formatter *f) const
5583 {
5584 f->dump_stream("valid_through") << valid_through;
5585 f->dump_stream("incremental_since") << incr_since;
5586 f->open_array_section("objects");
5587 for (map<hobject_t,object>::const_iterator p = objects.begin(); p != objects.end(); ++p) {
5588 f->open_object_section("object");
5589 f->dump_string("name", p->first.oid.name);
5590 f->dump_unsigned("hash", p->first.get_hash());
5591 f->dump_string("key", p->first.get_key());
5592 f->dump_int("snapid", p->first.snap);
5593 p->second.dump(f);
5594 f->close_section();
5595 }
5596 f->close_section();
5597 }
5598
5599 void ScrubMap::generate_test_instances(list<ScrubMap*>& o)
5600 {
5601 o.push_back(new ScrubMap);
5602 o.push_back(new ScrubMap);
5603 o.back()->valid_through = eversion_t(1, 2);
5604 o.back()->incr_since = eversion_t(3, 4);
5605 list<object*> obj;
5606 object::generate_test_instances(obj);
5607 o.back()->objects[hobject_t(object_t("foo"), "fookey", 123, 456, 0, "")] = *obj.back();
5608 obj.pop_back();
5609 o.back()->objects[hobject_t(object_t("bar"), string(), 123, 456, 0, "")] = *obj.back();
5610 }
5611
5612 // -- ScrubMap::object --
5613
5614 void ScrubMap::object::encode(bufferlist& bl) const
5615 {
5616 bool compat_read_error = read_error || ec_hash_mismatch || ec_size_mismatch;
5617 ENCODE_START(8, 7, bl);
5618 ::encode(size, bl);
5619 ::encode(negative, bl);
5620 ::encode(attrs, bl);
5621 ::encode(digest, bl);
5622 ::encode(digest_present, bl);
5623 ::encode((uint32_t)0, bl); // obsolete nlinks
5624 ::encode((uint32_t)0, bl); // snapcolls
5625 ::encode(omap_digest, bl);
5626 ::encode(omap_digest_present, bl);
5627 ::encode(compat_read_error, bl);
5628 ::encode(stat_error, bl);
5629 ::encode(read_error, bl);
5630 ::encode(ec_hash_mismatch, bl);
5631 ::encode(ec_size_mismatch, bl);
5632 ENCODE_FINISH(bl);
5633 }
5634
5635 void ScrubMap::object::decode(bufferlist::iterator& bl)
5636 {
5637 DECODE_START(8, bl);
5638 ::decode(size, bl);
5639 bool tmp, compat_read_error = false;
5640 ::decode(tmp, bl);
5641 negative = tmp;
5642 ::decode(attrs, bl);
5643 ::decode(digest, bl);
5644 ::decode(tmp, bl);
5645 digest_present = tmp;
5646 {
5647 uint32_t nlinks;
5648 ::decode(nlinks, bl);
5649 set<snapid_t> snapcolls;
5650 ::decode(snapcolls, bl);
5651 }
5652 ::decode(omap_digest, bl);
5653 ::decode(tmp, bl);
5654 omap_digest_present = tmp;
5655 ::decode(compat_read_error, bl);
5656 ::decode(tmp, bl);
5657 stat_error = tmp;
5658 if (struct_v >= 8) {
5659 ::decode(tmp, bl);
5660 read_error = tmp;
5661 ::decode(tmp, bl);
5662 ec_hash_mismatch = tmp;
5663 ::decode(tmp, bl);
5664 ec_size_mismatch = tmp;
5665 }
5666 // If older encoder found a read_error, set read_error
5667 if (compat_read_error && !read_error && !ec_hash_mismatch && !ec_size_mismatch)
5668 read_error = true;
5669 DECODE_FINISH(bl);
5670 }
5671
5672 void ScrubMap::object::dump(Formatter *f) const
5673 {
5674 f->dump_int("size", size);
5675 f->dump_int("negative", negative);
5676 f->open_array_section("attrs");
5677 for (map<string,bufferptr>::const_iterator p = attrs.begin(); p != attrs.end(); ++p) {
5678 f->open_object_section("attr");
5679 f->dump_string("name", p->first);
5680 f->dump_int("length", p->second.length());
5681 f->close_section();
5682 }
5683 f->close_section();
5684 }
5685
5686 void ScrubMap::object::generate_test_instances(list<object*>& o)
5687 {
5688 o.push_back(new object);
5689 o.push_back(new object);
5690 o.back()->negative = true;
5691 o.push_back(new object);
5692 o.back()->size = 123;
5693 o.back()->attrs["foo"] = buffer::copy("foo", 3);
5694 o.back()->attrs["bar"] = buffer::copy("barval", 6);
5695 }
5696
5697 // -- OSDOp --
5698
5699 ostream& operator<<(ostream& out, const OSDOp& op)
5700 {
5701 out << ceph_osd_op_name(op.op.op);
5702 if (ceph_osd_op_type_data(op.op.op)) {
5703 // data extent
5704 switch (op.op.op) {
5705 case CEPH_OSD_OP_ASSERT_VER:
5706 out << " v" << op.op.assert_ver.ver;
5707 break;
5708 case CEPH_OSD_OP_TRUNCATE:
5709 out << " " << op.op.extent.offset;
5710 break;
5711 case CEPH_OSD_OP_MASKTRUNC:
5712 case CEPH_OSD_OP_TRIMTRUNC:
5713 out << " " << op.op.extent.truncate_seq << "@"
5714 << (int64_t)op.op.extent.truncate_size;
5715 break;
5716 case CEPH_OSD_OP_ROLLBACK:
5717 out << " " << snapid_t(op.op.snap.snapid);
5718 break;
5719 case CEPH_OSD_OP_WATCH:
5720 out << " " << ceph_osd_watch_op_name(op.op.watch.op)
5721 << " cookie " << op.op.watch.cookie;
5722 if (op.op.watch.gen)
5723 out << " gen " << op.op.watch.gen;
5724 break;
5725 case CEPH_OSD_OP_NOTIFY:
5726 case CEPH_OSD_OP_NOTIFY_ACK:
5727 out << " cookie " << op.op.notify.cookie;
5728 break;
5729 case CEPH_OSD_OP_COPY_GET:
5730 out << " max " << op.op.copy_get.max;
5731 break;
5732 case CEPH_OSD_OP_COPY_FROM:
5733 out << " ver " << op.op.copy_from.src_version;
5734 break;
5735 case CEPH_OSD_OP_SETALLOCHINT:
5736 out << " object_size " << op.op.alloc_hint.expected_object_size
5737 << " write_size " << op.op.alloc_hint.expected_write_size;
5738 break;
5739 case CEPH_OSD_OP_READ:
5740 case CEPH_OSD_OP_SPARSE_READ:
5741 case CEPH_OSD_OP_SYNC_READ:
5742 case CEPH_OSD_OP_WRITE:
5743 case CEPH_OSD_OP_WRITEFULL:
5744 case CEPH_OSD_OP_ZERO:
5745 case CEPH_OSD_OP_APPEND:
5746 case CEPH_OSD_OP_MAPEXT:
5747 out << " " << op.op.extent.offset << "~" << op.op.extent.length;
5748 if (op.op.extent.truncate_seq)
5749 out << " [" << op.op.extent.truncate_seq << "@"
5750 << (int64_t)op.op.extent.truncate_size << "]";
5751 if (op.op.flags)
5752 out << " [" << ceph_osd_op_flag_string(op.op.flags) << "]";
5753 default:
5754 // don't show any arg info
5755 break;
5756 }
5757 } else if (ceph_osd_op_type_attr(op.op.op)) {
5758 // xattr name
5759 if (op.op.xattr.name_len && op.indata.length()) {
5760 out << " ";
5761 op.indata.write(0, op.op.xattr.name_len, out);
5762 }
5763 if (op.op.xattr.value_len)
5764 out << " (" << op.op.xattr.value_len << ")";
5765 if (op.op.op == CEPH_OSD_OP_CMPXATTR)
5766 out << " op " << (int)op.op.xattr.cmp_op
5767 << " mode " << (int)op.op.xattr.cmp_mode;
5768 } else if (ceph_osd_op_type_exec(op.op.op)) {
5769 // class.method
5770 if (op.op.cls.class_len && op.indata.length()) {
5771 out << " ";
5772 op.indata.write(0, op.op.cls.class_len, out);
5773 out << ".";
5774 op.indata.write(op.op.cls.class_len, op.op.cls.method_len, out);
5775 }
5776 } else if (ceph_osd_op_type_pg(op.op.op)) {
5777 switch (op.op.op) {
5778 case CEPH_OSD_OP_PGLS:
5779 case CEPH_OSD_OP_PGLS_FILTER:
5780 case CEPH_OSD_OP_PGNLS:
5781 case CEPH_OSD_OP_PGNLS_FILTER:
5782 out << " start_epoch " << op.op.pgls.start_epoch;
5783 break;
5784 case CEPH_OSD_OP_PG_HITSET_LS:
5785 break;
5786 case CEPH_OSD_OP_PG_HITSET_GET:
5787 out << " " << utime_t(op.op.hit_set_get.stamp);
5788 break;
5789 case CEPH_OSD_OP_SCRUBLS:
5790 break;
5791 }
5792 }
5793 return out;
5794 }
5795
5796
5797 void OSDOp::split_osd_op_vector_in_data(vector<OSDOp>& ops, bufferlist& in)
5798 {
5799 bufferlist::iterator datap = in.begin();
5800 for (unsigned i = 0; i < ops.size(); i++) {
5801 if (ops[i].op.payload_len) {
5802 datap.copy(ops[i].op.payload_len, ops[i].indata);
5803 }
5804 }
5805 }
5806
5807 void OSDOp::merge_osd_op_vector_in_data(vector<OSDOp>& ops, bufferlist& out)
5808 {
5809 for (unsigned i = 0; i < ops.size(); i++) {
5810 if (ops[i].indata.length()) {
5811 ops[i].op.payload_len = ops[i].indata.length();
5812 out.append(ops[i].indata);
5813 }
5814 }
5815 }
5816
5817 void OSDOp::split_osd_op_vector_out_data(vector<OSDOp>& ops, bufferlist& in)
5818 {
5819 bufferlist::iterator datap = in.begin();
5820 for (unsigned i = 0; i < ops.size(); i++) {
5821 if (ops[i].op.payload_len) {
5822 datap.copy(ops[i].op.payload_len, ops[i].outdata);
5823 }
5824 }
5825 }
5826
5827 void OSDOp::merge_osd_op_vector_out_data(vector<OSDOp>& ops, bufferlist& out)
5828 {
5829 for (unsigned i = 0; i < ops.size(); i++) {
5830 if (ops[i].outdata.length()) {
5831 ops[i].op.payload_len = ops[i].outdata.length();
5832 out.append(ops[i].outdata);
5833 }
5834 }
5835 }
5836
5837 bool store_statfs_t::operator==(const store_statfs_t& other) const
5838 {
5839 return total == other.total
5840 && available == other.available
5841 && allocated == other.allocated
5842 && stored == other.stored
5843 && compressed == other.compressed
5844 && compressed_allocated == other.compressed_allocated
5845 && compressed_original == other.compressed_original;
5846 }
5847
5848 void store_statfs_t::dump(Formatter *f) const
5849 {
5850 f->dump_int("total", total);
5851 f->dump_int("available", available);
5852 f->dump_int("allocated", allocated);
5853 f->dump_int("stored", stored);
5854 f->dump_int("compressed", compressed);
5855 f->dump_int("compressed_allocated", compressed_allocated);
5856 f->dump_int("compressed_original", compressed_original);
5857 }
5858
5859 ostream& operator<<(ostream& out, const store_statfs_t &s)
5860 {
5861 out << std::hex
5862 << "store_statfs(0x" << s.available
5863 << "/0x" << s.total
5864 << ", stored 0x" << s.stored
5865 << "/0x" << s.allocated
5866 << ", compress 0x" << s.compressed
5867 << "/0x" << s.compressed_allocated
5868 << "/0x" << s.compressed_original
5869 << std::dec
5870 << ")";
5871 return out;
5872 }